Diffstat (limited to 'mm')
 -rw-r--r--  mm/Kconfig.debug      |  9
 -rw-r--r--  mm/filemap.c          | 29
 -rw-r--r--  mm/gup.c              |  4
 -rw-r--r--  mm/ksm.c              |  2
 -rw-r--r--  mm/memcontrol.c       | 21
 -rw-r--r--  mm/memory.c           | 45
 -rw-r--r--  mm/mmap.c             | 13
 -rw-r--r--  mm/page-writeback.c   | 43
 -rw-r--r--  mm/page_alloc.c       | 82
 -rw-r--r--  mm/rmap.c             | 42
 -rw-r--r--  mm/vmscan.c           | 26
 11 files changed, 158 insertions(+), 158 deletions(-)
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 56badfc4810a..957d3da53ddd 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC
depends on !KMEMCHECK
select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
- select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
@@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
-config WANT_PAGE_DEBUG_FLAGS
- bool
-
config PAGE_POISONING
bool
- select WANT_PAGE_DEBUG_FLAGS
-
-config PAGE_GUARD
- bool
- select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/filemap.c b/mm/filemap.c
index bd8543c6508f..673e4581a2e5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1046,8 +1046,7 @@ EXPORT_SYMBOL(find_lock_entry);
* @mapping: the address_space to search
* @offset: the page index
* @fgp_flags: PCG flags
- * @cache_gfp_mask: gfp mask to use for the page cache data page allocation
- * @radix_gfp_mask: gfp mask to use for radix tree node allocation
+ * @gfp_mask: gfp mask to use for the page cache data page allocation
*
* Looks up the page cache slot at @mapping & @offset.
*
@@ -1056,11 +1055,9 @@ EXPORT_SYMBOL(find_lock_entry);
* FGP_ACCESSED: the page will be marked accessed
* FGP_LOCK: Page is return locked
* FGP_CREAT: If page is not present then a new page is allocated using
- * @cache_gfp_mask and added to the page cache and the VM's LRU
- * list. If radix tree nodes are allocated during page cache
- * insertion then @radix_gfp_mask is used. The page is returned
- * locked and with an increased refcount. Otherwise, %NULL is
- * returned.
+ * @gfp_mask and added to the page cache and the VM's LRU
+ * list. The page is returned locked and with an increased
+ * refcount. Otherwise, %NULL is returned.
*
* If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
* if the GFP flags specified for FGP_CREAT are atomic.
@@ -1068,7 +1065,7 @@ EXPORT_SYMBOL(find_lock_entry);
* If there is a page cache page, it is returned with an increased refcount.
*/
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
- int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
+ int fgp_flags, gfp_t gfp_mask)
{
struct page *page;
@@ -1105,13 +1102,11 @@ no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
int err;
if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
- cache_gfp_mask |= __GFP_WRITE;
- if (fgp_flags & FGP_NOFS) {
- cache_gfp_mask &= ~__GFP_FS;
- radix_gfp_mask &= ~__GFP_FS;
- }
+ gfp_mask |= __GFP_WRITE;
+ if (fgp_flags & FGP_NOFS)
+ gfp_mask &= ~__GFP_FS;
- page = __page_cache_alloc(cache_gfp_mask);
+ page = __page_cache_alloc(gfp_mask);
if (!page)
return NULL;
@@ -1122,7 +1117,8 @@ no_page:
if (fgp_flags & FGP_ACCESSED)
__SetPageReferenced(page);
- err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
+ err = add_to_page_cache_lru(page, mapping, offset,
+ gfp_mask & GFP_RECLAIM_MASK);
if (unlikely(err)) {
page_cache_release(page);
page = NULL;
@@ -2443,8 +2439,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
fgp_flags |= FGP_NOFS;
page = pagecache_get_page(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping),
- GFP_KERNEL);
+ mapping_gfp_mask(mapping));
if (page)
wait_for_stable_page(page);
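
Note on the new interface: callers now pass a single gfp mask, and pagecache_get_page() derives the radix tree allocation flags from it via GFP_RECLAIM_MASK. A minimal caller sketch, assuming a hypothetical helper name (not part of this patch):

/*
 * Hypothetical caller (illustration only): look up or create a locked,
 * referenced page cache page at @index. One @gfp_mask now covers the
 * data page and, masked with GFP_RECLAIM_MASK inside
 * pagecache_get_page(), the radix tree node allocation.
 */
static struct page *find_or_create_locked(struct address_space *mapping,
                                          pgoff_t index)
{
        return pagecache_get_page(mapping, index,
                                  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                  mapping_gfp_mask(mapping));
}
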
diff --git a/mm/gup.c b/mm/gup.c
index a900759cc807..8dd50ce6326f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -296,7 +296,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
return -ENOMEM;
if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
- if (ret & VM_FAULT_SIGBUS)
+ if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
return -EFAULT;
BUG();
}
@@ -571,7 +571,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
return -ENOMEM;
if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
return -EHWPOISON;
- if (ret & VM_FAULT_SIGBUS)
+ if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
return -EFAULT;
BUG();
}
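
Both gup.c hunks follow the same pattern. A sketch of the resulting mapping from handle_mm_fault() bits to an errno, with a hypothetical helper name (not code from this patch):

/*
 * Sketch only: fault_to_errno() is hypothetical. After this change,
 * SIGSEGV-style faults (e.g. a refused stack guard page expansion) are
 * reported to gup/fixup_user_fault callers as -EFAULT, just like SIGBUS.
 */
static int fault_to_errno(unsigned int ret, unsigned int foll_flags)
{
        if (ret & VM_FAULT_OOM)
                return -ENOMEM;
        if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
                return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT;
        if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
                return -EFAULT;
        return 0;
}
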
diff --git a/mm/ksm.c b/mm/ksm.c
index d247efab5073..15647fb0394f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -376,7 +376,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
else
ret = VM_FAULT_WRITE;
put_page(page);
- } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
/*
* We must loop because handle_mm_fault() may back out if there's
* any difficulty e.g. if pte accessed bit gets updated concurrently.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ef91e856c7e4..683b4782019b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1477,9 +1477,9 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_info("Task in ");
pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_info(" killed as a result of limit of ");
+ pr_cont(" killed as a result of limit of ");
pr_cont_cgroup_path(memcg->css.cgroup);
- pr_info("\n");
+ pr_cont("\n");
rcu_read_unlock();
@@ -3043,18 +3043,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
mem_cgroup_swap_statistics(to, true);
- /*
- * This function is only called from task migration context now.
- * It postpones page_counter and refcount handling till the end
- * of task migration(mem_cgroup_clear_mc()) for performance
- * improvement. But we cannot postpone css_get(to) because if
- * the process that has been moved to @to does swap-in, the
- * refcount of @to might be decreased to 0.
- *
- * We are in attach() phase, so the cgroup is guaranteed to be
- * alive, so we can just call css_get().
- */
- css_get(&to->css);
return 0;
}
return -EINVAL;
@@ -4679,6 +4667,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent_css == NULL) {
root_mem_cgroup = memcg;
page_counter_init(&memcg->memory, NULL);
+ memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
}
@@ -4724,6 +4713,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent->use_hierarchy) {
page_counter_init(&memcg->memory, &parent->memory);
+ memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4733,6 +4723,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
*/
} else {
page_counter_init(&memcg->memory, NULL);
+ memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
/*
@@ -4807,7 +4798,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
- memcg->soft_limit = 0;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
}
#ifdef CONFIG_MMU
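
Each page_counter_init() site above gains the same companion assignment. A condensed sketch of the invariant, assuming a hypothetical helper that stands in for the three call sites:

/*
 * Sketch (memcg_init_counters() is hypothetical): with page_counter,
 * "no soft limit" is PAGE_COUNTER_MAX rather than 0, so it must be set
 * explicitly at alloc/online time and again in mem_cgroup_css_reset().
 */
static void memcg_init_counters(struct mem_cgroup *memcg,
                                struct mem_cgroup *parent)
{
        page_counter_init(&memcg->memory, parent ? &parent->memory : NULL);
        memcg->soft_limit = PAGE_COUNTER_MAX;
        page_counter_init(&memcg->memsw, parent ? &parent->memsw : NULL);
        page_counter_init(&memcg->kmem, parent ? &parent->kmem : NULL);
}
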
diff --git a/mm/memory.c b/mm/memory.c
index 649e7d440bd7..2c3536cc6c63 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -235,6 +235,9 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
+ if (!tlb->end)
+ return;
+
tlb_flush(tlb);
mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
@@ -247,7 +250,7 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
- for (batch = &tlb->local; batch; batch = batch->next) {
+ for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
batch->nr = 0;
}
@@ -256,9 +259,6 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
void tlb_flush_mmu(struct mmu_gather *tlb)
{
- if (!tlb->end)
- return;
-
tlb_flush_mmu_tlbonly(tlb);
tlb_flush_mmu_free(tlb);
}
@@ -2137,17 +2137,24 @@ reuse:
if (!dirty_page)
return ret;
- /*
- * Yes, Virginia, this is actually required to prevent a race
- * with clear_page_dirty_for_io() from clearing the page dirty
- * bit after it clear all dirty ptes, but before a racing
- * do_wp_page installs a dirty pte.
- *
- * do_shared_fault is protected similarly.
- */
if (!page_mkwrite) {
- wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page);
+ struct address_space *mapping;
+ int dirtied;
+
+ lock_page(dirty_page);
+ dirtied = set_page_dirty(dirty_page);
+ VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
+ mapping = dirty_page->mapping;
+ unlock_page(dirty_page);
+
+ if (dirtied && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
@@ -2378,12 +2385,12 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
- i_mmap_lock_read(mapping);
+ i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
- i_mmap_unlock_read(mapping);
+ i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -2593,7 +2600,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
if (prev && prev->vm_end == address)
return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
- expand_downwards(vma, address - PAGE_SIZE);
+ return expand_downwards(vma, address - PAGE_SIZE);
}
if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
struct vm_area_struct *next = vma->vm_next;
@@ -2602,7 +2609,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
if (next && next->vm_start == address + PAGE_SIZE)
return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
- expand_upwards(vma, address + PAGE_SIZE);
+ return expand_upwards(vma, address + PAGE_SIZE);
}
return 0;
}
@@ -2625,7 +2632,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
- return VM_FAULT_SIGBUS;
+ return VM_FAULT_SIGSEGV;
/* Use the zero-page for reads */
if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
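
The do_wp_page hunk above replaces set_page_dirty_balance() (removed from page-writeback.c below) with open-coded dirtying under the page lock. A sketch of the resulting protocol, with a hypothetical helper name:

/*
 * Sketch (fault_dirty_page() is hypothetical, condensing the hunk
 * above): the page is dirtied while holding the page lock, which is the
 * exclusion clear_page_dirty_for_io() relies on; writeback throttling
 * runs afterwards, outside the lock.
 */
static void fault_dirty_page(struct vm_area_struct *vma, struct page *page)
{
        struct address_space *mapping;
        int dirtied;

        lock_page(page);
        dirtied = set_page_dirty(page);
        mapping = page->mapping;
        unlock_page(page);

        /* some drivers dirty pages without setting page->mapping */
        if (dirtied && mapping)
                balance_dirty_pages_ratelimited(mapping);

        /* file_update_time outside page_lock */
        if (vma->vm_file)
                file_update_time(vma->vm_file);
}
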
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b36aa7cc89a..7f684d5a8087 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -778,10 +778,12 @@ again: remove_next = 1 + (end > next->vm_end);
if (exporter && exporter->anon_vma && !importer->anon_vma) {
int error;
+ importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
- if (error)
+ if (error) {
+ importer->anon_vma = NULL;
return error;
- importer->anon_vma = exporter->anon_vma;
+ }
}
}
@@ -2099,14 +2101,17 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
- unsigned long new_start;
+ unsigned long new_start, actual_size;
/* address space limit tests */
if (!may_expand_vm(mm, grow))
return -ENOMEM;
/* Stack limit test */
- if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ actual_size = size;
+ if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
+ actual_size -= PAGE_SIZE;
+ if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
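
The stack limit check now excludes the guard page: with an 8 MiB RLIMIT_STACK and 4 KiB pages, a stack whose usable area is exactly 8 MiB occupies an 8 MiB + 4 KiB VMA including the guard page; the old comparison rejected that growth, the adjusted one allows it. A sketch of the arithmetic, assuming a hypothetical predicate that takes the limit as a parameter:

/*
 * Sketch only (stack_limit_ok() is hypothetical): a growing stack VMA
 * includes one guard page that can never hold data, so it is subtracted
 * before comparing against RLIMIT_STACK.
 */
static int stack_limit_ok(struct vm_area_struct *vma, unsigned long size,
                          unsigned long stack_rlim_cur)
{
        unsigned long actual_size = size;

        if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
                actual_size -= PAGE_SIZE;
        return actual_size <= stack_rlim_cur;
}
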
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d5d81f5384d1..6f4335238e33 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1541,16 +1541,6 @@ pause:
bdi_start_background_writeback(bdi);
}
-void set_page_dirty_balance(struct page *page)
-{
- if (set_page_dirty(page)) {
- struct address_space *mapping = page_mapping(page);
-
- if (mapping)
- balance_dirty_pages_ratelimited(mapping);
- }
-}
-
static DEFINE_PER_CPU(int, bdp_ratelimits);
/*
@@ -2123,32 +2113,25 @@ EXPORT_SYMBOL(account_page_dirtied);
* page dirty in that case, but not all the buffers. This is a "bottom-up"
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
*
- * Most callers have locked the page, which pins the address_space in memory.
- * But zap_pte_range() does not lock the page, however in that case the
- * mapping is pinned by the vma's ->vm_file reference.
- *
- * We take care to handle the case where the page was truncated from the
- * mapping by re-checking page_mapping() inside tree_lock.
+ * The caller must ensure this doesn't race with truncation. Most will simply
+ * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
+ * the pte lock held, which also locks out truncation.
*/
int __set_page_dirty_nobuffers(struct page *page)
{
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
- struct address_space *mapping2;
unsigned long flags;
if (!mapping)
return 1;
spin_lock_irqsave(&mapping->tree_lock, flags);
- mapping2 = page_mapping(page);
- if (mapping2) { /* Race with truncate? */
- BUG_ON(mapping2 != mapping);
- WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
- }
+ BUG_ON(page_mapping(page) != mapping);
+ WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ radix_tree_tag_set(&mapping->page_tree, page_index(page),
+ PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
if (mapping->host) {
/* !PageAnon && !swapper_space */
@@ -2305,12 +2288,10 @@ int clear_page_dirty_for_io(struct page *page)
/*
* We carefully synchronise fault handlers against
* installing a dirty pte and marking the page dirty
- * at this point. We do this by having them hold the
- * page lock at some point after installing their
- * pte, but before marking the page dirty.
- * Pages are always locked coming in here, so we get
- * the desired exclusion. See mm/memory.c:do_wp_page()
- * for more comments.
+ * at this point. We do this by having them hold the
+ * page lock while dirtying the page, and pages are
+ * always locked coming in here, so we get the desired
+ * exclusion.
*/
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7633c503a116..8e20f9c2fa5a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2332,12 +2332,21 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype)
+ int classzone_idx, int migratetype, unsigned long *did_some_progress)
{
struct page *page;
- /* Acquire the per-zone oom lock for each zone */
+ *did_some_progress = 0;
+
+ if (oom_killer_disabled)
+ return NULL;
+
+ /*
+ * Acquire the per-zone oom lock for each zone. If that
+ * fails, somebody else is making progress for us.
+ */
if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
+ *did_some_progress = 1;
schedule_timeout_uninterruptible(1);
return NULL;
}
@@ -2363,12 +2372,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
if (!(gfp_mask & __GFP_NOFAIL)) {
+ /* Coredumps can quickly deplete all memory reserves */
+ if (current->flags & PF_DUMPCORE)
+ goto out;
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (high_zoneidx < ZONE_NORMAL)
goto out;
+ /* The OOM killer does not compensate for light reclaim */
+ if (!(gfp_mask & __GFP_FS))
+ goto out;
/*
* GFP_THISNODE contains __GFP_NORETRY and we never hit this.
* Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -2381,7 +2396,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
}
/* Exhausted what can be done so it's blamo time */
out_of_memory(zonelist, gfp_mask, order, nodemask, false);
-
+ *did_some_progress = 1;
out:
oom_zonelist_unlock(zonelist, gfp_mask);
return page;
@@ -2658,7 +2673,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
-restart:
+retry:
if (!(gfp_mask & __GFP_NO_KSWAPD))
wake_all_kswapds(order, zonelist, high_zoneidx,
preferred_zone, nodemask);
@@ -2681,7 +2696,6 @@ restart:
classzone_idx = zonelist_zone_idx(preferred_zoneref);
}
-rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2788,54 +2802,28 @@ rebalance:
if (page)
goto got_pg;
- /*
- * If we failed to make any progress reclaiming, then we are
- * running out of options and have to consider going OOM
- */
- if (!did_some_progress) {
- if (oom_gfp_allowed(gfp_mask)) {
- if (oom_killer_disabled)
- goto nopage;
- /* Coredumps can quickly deplete all memory reserves */
- if ((current->flags & PF_DUMPCORE) &&
- !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
- page = __alloc_pages_may_oom(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask, preferred_zone,
- classzone_idx, migratetype);
- if (page)
- goto got_pg;
-
- if (!(gfp_mask & __GFP_NOFAIL)) {
- /*
- * The oom killer is not called for high-order
- * allocations that may fail, so if no progress
- * is being made, there are no other options and
- * retrying is unlikely to help.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- goto nopage;
- /*
- * The oom killer is not called for lowmem
- * allocations to prevent needlessly killing
- * innocent tasks.
- */
- if (high_zoneidx < ZONE_NORMAL)
- goto nopage;
- }
-
- goto restart;
- }
- }
-
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, did_some_progress,
pages_reclaimed)) {
+ /*
+ * If we fail to make progress by freeing individual
+ * pages, but the allocation wants us to keep going,
+ * start OOM killing tasks.
+ */
+ if (!did_some_progress) {
+ page = __alloc_pages_may_oom(gfp_mask, order, zonelist,
+ high_zoneidx, nodemask,
+ preferred_zone, classzone_idx,
+ migratetype, &did_some_progress);
+ if (page)
+ goto got_pg;
+ if (!did_some_progress)
+ goto nopage;
+ }
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
- goto rebalance;
+ goto retry;
} else {
/*
* High-order allocations do not necessarily loop after
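
For reference, the conditions under which the relocated OOM path still refuses to kill can be condensed into a hypothetical predicate (not code from this patch):

/*
 * Sketch (oom_should_kill() is hypothetical, summarising the checks in
 * __alloc_pages_may_oom above): unless __GFP_NOFAIL forces the issue,
 * the OOM killer is skipped for coredumping tasks, costly orders,
 * lowmem allocations, and allocations that could not run full reclaim
 * (!__GFP_FS); in those cases *did_some_progress stays 0 and the
 * allocation fails instead.
 */
static bool oom_should_kill(gfp_t gfp_mask, unsigned int order,
                            enum zone_type high_zoneidx)
{
        if (gfp_mask & __GFP_NOFAIL)
                return true;
        if (current->flags & PF_DUMPCORE)
                return false;
        if (order > PAGE_ALLOC_COSTLY_ORDER)
                return false;
        if (high_zoneidx < ZONE_NORMAL)
                return false;
        if (!(gfp_mask & __GFP_FS))
                return false;
        return true;
}
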
diff --git a/mm/rmap.c b/mm/rmap.c
index c5bc241127b2..71cd5bd0c17d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
+ anon_vma->degree = 1; /* Reference for first vma */
+ anon_vma->parent = anon_vma;
/*
* Initialise the anon_vma root to point to itself. If called
* from fork, the root will be reset to the parents anon_vma.
@@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
+ /* vma reference or self-parent link for new root */
+ anon_vma->degree++;
allocated = NULL;
avc = NULL;
}
@@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
/*
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
+ *
+ * If dst->anon_vma is NULL this function tries to find and reuse an existing
+ * anon_vma which has no vmas and only one child anon_vma. This prevents the
+ * anon_vma hierarchy from degrading into an endless linear chain in the case
+ * of a constantly forking task. On the other hand, an anon_vma with more than
+ * one child is not reused even if there is no live vma, so the rmap walker
+ * has a good chance of avoiding a scan of the whole hierarchy when it
+ * searches for where a page is mapped.
*/
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
@@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
anon_vma = pavc->anon_vma;
root = lock_anon_vma_root(root, anon_vma);
anon_vma_chain_link(dst, avc, anon_vma);
+
+ /*
+ * Reuse an existing anon_vma if its degree is lower than two,
+ * which means it has no vma and only one anon_vma child.
+ *
+ * Do not choose the parent anon_vma, otherwise the first child
+ * will always reuse it. The root anon_vma is never reused:
+ * it has a self-parent reference and at least one child.
+ */
+ if (!dst->anon_vma && anon_vma != src->anon_vma &&
+ anon_vma->degree < 2)
+ dst->anon_vma = anon_vma;
}
+ if (dst->anon_vma)
+ dst->anon_vma->degree++;
unlock_anon_vma_root(root);
return 0;
@@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
if (!pvma->anon_vma)
return 0;
+ /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
+ vma->anon_vma = NULL;
+
/*
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
@@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
if (error)
return error;
+ /* An existing anon_vma has been reused, all done then. */
+ if (vma->anon_vma)
+ return 0;
+
/* Then add our own anon_vma. */
anon_vma = anon_vma_alloc();
if (!anon_vma)
@@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
* lock any of the anon_vmas in this anon_vma tree.
*/
anon_vma->root = pvma->anon_vma->root;
+ anon_vma->parent = pvma->anon_vma;
/*
* With refcounts, an anon_vma can stay around longer than the
* process it belongs to. The root anon_vma needs to be pinned until
@@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
vma->anon_vma = anon_vma;
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma->parent->degree++;
anon_vma_unlock_write(anon_vma);
return 0;
@@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
* Leave empty anon_vmas on the list - we'll need
* to free them outside the lock.
*/
- if (RB_EMPTY_ROOT(&anon_vma->rb_root))
+ if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
+ anon_vma->parent->degree--;
continue;
+ }
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
+ if (vma->anon_vma)
+ vma->anon_vma->degree--;
unlock_anon_vma_root(root);
/*
@@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
+ BUG_ON(anon_vma->degree);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
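
A sketch of the degree bookkeeping these hunks maintain, with a hypothetical helper name: degree counts the VMAs and child anon_vmas referencing an anon_vma (the root also counts its self-parent link), and anon_vma_clone() only reuses one whose degree is below two.

/*
 * Sketch only (anon_vma_reusable() is hypothetical): reuse is allowed
 * when the candidate is not the parent VMA's own anon_vma and has no
 * vma plus at most one child, which keeps a constantly forking task
 * from growing an endless linear anon_vma chain while still letting
 * the rmap walker skip unrelated subtrees.
 */
static bool anon_vma_reusable(struct anon_vma *anon_vma,
                              struct anon_vma *src_anon_vma)
{
        return anon_vma != src_anon_vma && anon_vma->degree < 2;
}
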
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd9a72bc4a1b..dcd90c891d8e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2656,7 +2656,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
* should make reasonable progress.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
- gfp_mask, nodemask) {
+ gfp_zone(gfp_mask), nodemask) {
if (zone_idx(zone) > ZONE_NORMAL)
continue;
@@ -2921,18 +2921,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
return false;
/*
- * There is a potential race between when kswapd checks its watermarks
- * and a process gets throttled. There is also a potential race if
- * processes get throttled, kswapd wakes, a large process exits therby
- * balancing the zones that causes kswapd to miss a wakeup. If kswapd
- * is going to sleep, no process should be sleeping on pfmemalloc_wait
- * so wake them now if necessary. If necessary, processes will wake
- * kswapd and get throttled again
+ * The throttled processes are normally woken up in balance_pgdat() as
+ * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+ * race between when kswapd checks the watermarks and a process gets
+ * throttled. There is also a potential race if processes get
+ * throttled, kswapd wakes, a large process exits thereby balancing the
+ * zones, which causes kswapd to exit balance_pgdat() before reaching
+ * the wake up checks. If kswapd is going to sleep, no process should
+ * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
+ * the wake up is premature, processes will wake kswapd and get
+ * throttled again. The difference from wake ups in balance_pgdat() is
+ * that here we are under prepare_to_wait().
*/
- if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
- wake_up(&pgdat->pfmemalloc_wait);
- return false;
- }
+ if (waitqueue_active(&pgdat->pfmemalloc_wait))
+ wake_up_all(&pgdat->pfmemalloc_wait);
return pgdat_balanced(pgdat, order, classzone_idx);
}