From 798fd756952c4b6cb7dfe6f6437e9f02da79a5bc Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 26 Jul 2016 15:22:30 -0700 Subject: mm: zap ZONE_OOM_LOCKED Not used since oom_lock was instroduced. Link: http://lkml.kernel.org/r/1464358093-22663-1-git-send-email-vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ddf74487f848..398c245a484a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -988,8 +988,8 @@ bool out_of_memory(struct oom_control *oc) /* * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a - * parallel oom killing is already in progress so do nothing. + * memory-hogging task. If oom_lock is held by somebody else, a parallel oom + * killing is already in progress so do nothing. */ void pagefault_out_of_memory(void) { -- cgit v1.2.3 From 2a966b77ae3ede207e787e7538b87d1011c4364e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 26 Jul 2016 15:22:33 -0700 Subject: mm: oom: add memcg to oom_control It's a part of oom context just like allocation order and nodemask, so let's move it to oom_control instead of passing it in the argument list. Link: http://lkml.kernel.org/r/40e03fd7aaf1f55c75d787128d6d17c5a71226c2.1464358556.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/tty/sysrq.c | 1 + include/linux/oom.h | 8 +++++--- mm/memcontrol.c | 5 +++-- mm/oom_kill.c | 32 +++++++++++++++----------------- mm/page_alloc.c | 1 + 5 files changed, 25 insertions(+), 22 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index e5139402e7f8..52bbd27e93ae 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -363,6 +363,7 @@ static void moom_callback(struct work_struct *ignored) struct oom_control oc = { .zonelist = node_zonelist(first_memory_node, gfp_mask), .nodemask = NULL, + .memcg = NULL, .gfp_mask = gfp_mask, .order = -1, }; diff --git a/include/linux/oom.h b/include/linux/oom.h index 83469522690a..cbc24a5fe28d 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -23,6 +23,9 @@ struct oom_control { /* Used to determine mempolicy */ nodemask_t *nodemask; + /* Memory cgroup in which oom is invoked, or NULL for global oom */ + struct mem_cgroup *memcg; + /* Used to determine cpuset and node locality requirement */ const gfp_t gfp_mask; @@ -83,11 +86,10 @@ extern unsigned long oom_badness(struct task_struct *p, extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, const char *message); + const char *message); extern void check_panic_on_oom(struct oom_control *oc, - enum oom_constraint constraint, - struct mem_cgroup *memcg); + enum oom_constraint constraint); extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, struct task_struct *task, unsigned long totalpages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3a755212448e..caea25a21c70 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1259,6 +1259,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct oom_control oc = { .zonelist = NULL, .nodemask = NULL, + .memcg = memcg, .gfp_mask = gfp_mask, .order = order, }; @@ -1281,7 +1282,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, goto unlock; } - check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); + check_panic_on_oom(&oc, CONSTRAINT_MEMCG); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1329,7 +1330,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, if (chosen) { points = chosen_points * 1000 / totalpages; - oom_kill_process(&oc, chosen, points, totalpages, memcg, + oom_kill_process(&oc, chosen, points, totalpages, "Memory cgroup out of memory"); } unlock: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 398c245a484a..a376f1ebdad5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -383,8 +383,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) rcu_read_unlock(); } -static void dump_header(struct oom_control *oc, struct task_struct *p, - struct mem_cgroup *memcg) +static void dump_header(struct oom_control *oc, struct task_struct *p) { pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, @@ -392,12 +391,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p, cpuset_print_current_mems_allowed(); dump_stack(); - if (memcg) - mem_cgroup_print_oom_info(memcg, p); + if (oc->memcg) + mem_cgroup_print_oom_info(oc->memcg, p); else show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) - dump_tasks(memcg, oc->nodemask); + dump_tasks(oc->memcg, oc->nodemask); } /* @@ -739,7 +738,7 @@ void oom_killer_enable(void) */ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, const char *message) + const char *message) { struct task_struct *victim = p; struct task_struct *child; @@ -765,7 +764,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, task_unlock(p); if (__ratelimit(&oom_rs)) - dump_header(oc, p, memcg); + dump_header(oc, p); pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", message, task_pid_nr(p), p->comm, points); @@ -786,8 +785,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* * oom_badness() returns 0 if the thread is unkillable */ - child_points = oom_badness(child, memcg, oc->nodemask, - totalpages); + child_points = oom_badness(child, + oc->memcg, oc->nodemask, totalpages); if (child_points > victim_points) { put_task_struct(victim); victim = child; @@ -865,8 +864,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ -void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, - struct mem_cgroup *memcg) +void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint) { if (likely(!sysctl_panic_on_oom)) return; @@ -882,7 +880,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, /* Do not panic for oom kills triggered by sysrq */ if (is_sysrq_oom(oc)) return; - dump_header(oc, NULL, memcg); + dump_header(oc, NULL); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -957,13 +955,13 @@ bool out_of_memory(struct oom_control *oc) constraint = constrained_alloc(oc, &totalpages); if (constraint != CONSTRAINT_MEMORY_POLICY) oc->nodemask = NULL; - check_panic_on_oom(oc, constraint, NULL); + check_panic_on_oom(oc, constraint); if (sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oom_kill_process(oc, current, 0, totalpages, NULL, + oom_kill_process(oc, current, 0, totalpages, "Out of memory (oom_kill_allocating_task)"); return true; } @@ -971,12 +969,11 @@ bool out_of_memory(struct oom_control *oc) p = select_bad_process(oc, &points, totalpages); /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p && !is_sysrq_oom(oc)) { - dump_header(oc, NULL, NULL); + dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } if (p && p != (void *)-1UL) { - oom_kill_process(oc, p, points, totalpages, NULL, - "Out of memory"); + oom_kill_process(oc, p, points, totalpages, "Out of memory"); /* * Give the killed process a good chance to exit before trying * to allocate memory again. @@ -996,6 +993,7 @@ void pagefault_out_of_memory(void) struct oom_control oc = { .zonelist = NULL, .nodemask = NULL, + .memcg = NULL, .gfp_mask = 0, .order = 0, }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8129922a1504..f7bb1aef54f2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3105,6 +3105,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct oom_control oc = { .zonelist = ac->zonelist, .nodemask = ac->nodemask, + .memcg = NULL, .gfp_mask = gfp_mask, .order = order, }; -- cgit v1.2.3 From fbe84a09da746f781553051bb3dbc63f7b0a5162 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 26 Jul 2016 15:24:39 -0700 Subject: mm,oom: remove unused argument from oom_scan_process_thread(). oom_scan_process_thread() does not use totalpages argument. oom_badness() uses it. Link: http://lkml.kernel.org/r/1463796041-7889-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- mm/memcontrol.c | 2 +- mm/oom_kill.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/include/linux/oom.h b/include/linux/oom.h index cbc24a5fe28d..606137b3b778 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -92,7 +92,7 @@ extern void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint); extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, - struct task_struct *task, unsigned long totalpages); + struct task_struct *task); extern bool out_of_memory(struct oom_control *oc); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ff53e348c4bb..1a1a3093a5c9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1290,7 +1290,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_start(&iter->css, &it); while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(&oc, task, totalpages)) { + switch (oom_scan_process_thread(&oc, task)) { case OOM_SCAN_SELECT: if (chosen) put_task_struct(chosen); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a376f1ebdad5..c11f8bdd0c12 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -274,7 +274,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, #endif enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, - struct task_struct *task, unsigned long totalpages) + struct task_struct *task) { if (oom_unkillable_task(task, NULL, oc->nodemask)) return OOM_SCAN_CONTINUE; @@ -311,7 +311,7 @@ static struct task_struct *select_bad_process(struct oom_control *oc, for_each_process(p) { unsigned int points; - switch (oom_scan_process_thread(oc, p, totalpages)) { + switch (oom_scan_process_thread(oc, p)) { case OOM_SCAN_SELECT: chosen = p; chosen_points = ULONG_MAX; -- cgit v1.2.3 From e5e3f4c4f0e95ecbad2f8d2f4f6a29bb8a90226b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2016 15:24:50 -0700 Subject: mm, oom_reaper: make sure that mmput_async is called only when memory was reaped Tetsuo is worried that mmput_async might still lead to a premature new oom victim selection due to the following race: __oom_reap_task exit_mm find_lock_task_mm atomic_inc(mm->mm_users) # = 2 task_unlock task_lock task->mm = NULL up_read(&mm->mmap_sem) < somebody write locks mmap_sem > task_unlock mmput atomic_dec_and_test # = 1 exit_oom_victim down_read_trylock # failed - no reclaim mmput_async # Takes unpredictable amount of time < new OOM situation > the final __mmput will be executed in the delayed context which might happen far in the future. Such a race is highly unlikely because the write holder of mmap_sem would have to be an external task (all direct holders are already killed or exiting) and it usually have to pin mm_users in order to do anything reasonable. We can, however, make sure that the mmput_async is only called when we do not back off and reap some memory. That would reduce the impact of the delayed __mmput because the real content would be already freed. Pin mm_count to keep it alive after we drop task_lock and before we try to get mmap_sem. If the mmap_sem succeeds we can try to grab mm_users reference and then go on with unmapping the address space. It is not clear whether this race is possible at all but it is better to be more robust and do not pin mm_users unless we are sure we are actually doing some real work during __oom_reap_task. Link: http://lkml.kernel.org/r/1465306987-30297-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'mm/oom_kill.c') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c11f8bdd0c12..d4a929d79470 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -452,7 +452,7 @@ static bool __oom_reap_task(struct task_struct *tsk) * We have to make sure to not race with the victim exit path * and cause premature new oom victim selection: * __oom_reap_task exit_mm - * atomic_inc_not_zero + * mmget_not_zero * mmput * atomic_dec_and_test * exit_oom_victim @@ -474,12 +474,22 @@ static bool __oom_reap_task(struct task_struct *tsk) if (!p) goto unlock_oom; mm = p->mm; - atomic_inc(&mm->mm_users); + atomic_inc(&mm->mm_count); task_unlock(p); if (!down_read_trylock(&mm->mmap_sem)) { ret = false; - goto unlock_oom; + goto mm_drop; + } + + /* + * increase mm_users only after we know we will reap something so + * that the mmput_async is called only when we have reaped something + * and delayed __mmput doesn't matter that much + */ + if (!mmget_not_zero(mm)) { + up_read(&mm->mmap_sem); + goto mm_drop; } tlb_gather_mmu(&tlb, mm, 0, -1); @@ -521,15 +531,16 @@ static bool __oom_reap_task(struct task_struct *tsk) * to release its memory. */ set_bit(MMF_OOM_REAPED, &mm->flags); -unlock_oom: - mutex_unlock(&oom_lock); /* * Drop our reference but make sure the mmput slow path is called from a * different context because we shouldn't risk we get stuck there and * put the oom_reaper out of the way. */ - if (mm) - mmput_async(mm); + mmput_async(mm); +mm_drop: + mmdrop(mm); +unlock_oom: + mutex_unlock(&oom_lock); return ret; } -- cgit v1.2.3