summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c328
1 files changed, 250 insertions, 78 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7990ca154d1b..d2186ecb36f7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
#include <linux/ftrace_event.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory. This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~GFP_IOFS;
}
+
+bool pm_suspended_storage(void)
+{
+ if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+ return false;
+ return true;
+}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -381,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
clear_highpage(page + i);
}
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+ unsigned long res;
+
+ if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+ printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+ return 0;
+ }
+ _debug_guardpage_minorder = res;
+ printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+ return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+ __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+ __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
static inline void set_page_order(struct page *page, int order)
{
set_page_private(page, order);
@@ -438,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
+ if (page_is_guard(buddy) && page_order(buddy) == order) {
+ VM_BUG_ON(page_count(buddy) != 0);
+ return 1;
+ }
+
if (PageBuddy(buddy) && page_order(buddy) == order) {
VM_BUG_ON(page_count(buddy) != 0);
return 1;
@@ -494,11 +546,19 @@ static inline void __free_one_page(struct page *page,
buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
break;
-
- /* Our buddy is free, merge with it and move up one order. */
- list_del(&buddy->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(buddy);
+ /*
+ * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
+ * merge with it and move up one order.
+ */
+ if (page_is_guard(buddy)) {
+ clear_page_guard_flag(buddy);
+ set_page_private(page, 0);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+ } else {
+ list_del(&buddy->lru);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(buddy);
+ }
combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
@@ -632,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
int i;
int bad = 0;
- trace_mm_page_free_direct(page, order);
+ trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
if (PageAnon(page))
@@ -670,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-/*
- * permit the bootmem allocator to evade page validation on high-order frees
- */
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
- if (order == 0) {
- __ClearPageReserved(page);
- set_page_count(page, 0);
- set_page_refcounted(page);
- __free_page(page);
- } else {
- int loop;
+ unsigned int nr_pages = 1 << order;
+ unsigned int loop;
- prefetchw(page);
- for (loop = 0; loop < (1 << order); loop++) {
- struct page *p = &page[loop];
+ prefetchw(page);
+ for (loop = 0; loop < nr_pages; loop++) {
+ struct page *p = &page[loop];
- if (loop + 1 < (1 << order))
- prefetchw(p + 1);
- __ClearPageReserved(p);
- set_page_count(p, 0);
- }
-
- set_page_refcounted(page);
- __free_pages(page, order);
+ if (loop + 1 < nr_pages)
+ prefetchw(p + 1);
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
}
+
+ set_page_refcounted(page);
+ __free_pages(page, order);
}
@@ -724,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page,
high--;
size >>= 1;
VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (high < debug_guardpage_minorder()) {
+ /*
+ * Mark as guard pages (or page), that will allow to
+ * merge back to allocator when buddy will be freed.
+ * Corresponding page table entries will not be touched,
+ * pages will stay not present in virtual address space
+ */
+ INIT_LIST_HEAD(&page[size].lru);
+ set_page_guard_flag(&page[size]);
+ set_page_private(&page[size], high);
+ /* Guard pages are not available for any usage */
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+ continue;
+ }
+#endif
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
@@ -1189,6 +1257,19 @@ out:
}
/*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ trace_mm_page_free_batched(page, cold);
+ free_hot_cold_page(page, cold);
+ }
+}
+
+/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
* Each sub-page must be freed individually.
@@ -1435,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
long min = mark;
int o;
- free_pages -= (1 << order) + 1;
+ free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
@@ -1645,6 +1726,35 @@ zonelist_scan:
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
+ /*
+ * When allocating a page cache page for writing, we
+ * want to get it from a zone that is within its dirty
+ * limit, such that no single zone holds more than its
+ * proportional share of globally allowed dirty pages.
+ * The dirty limits take into account the zone's
+ * lowmem reserves and high watermark so that kswapd
+ * should be able to balance it without having to
+ * write pages from its LRU list.
+ *
+ * This may look like it could increase pressure on
+ * lower zones by failing allocations in higher zones
+ * before they are full. But the pages that do spill
+ * over are limited as the lower zones are protected
+ * by this very same mechanism. It should not become
+ * a practical burden to them.
+ *
+ * XXX: For now, allow allocations to potentially
+ * exceed the per-zone dirty limit in the slowpath
+ * (ALLOC_WMARK_LOW unset) before going into reclaim,
+ * which is important when on a NUMA setup the allowed
+ * zones are together not big enough to reach the
+ * global limit. The proper fix for these situations
+ * will require awareness of zones in the
+ * dirty-throttling and the flusher threads.
+ */
+ if ((alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+ goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1734,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+ debug_guardpage_minorder() > 0)
return;
/*
@@ -1773,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
static inline int
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+ unsigned long did_some_progress,
unsigned long pages_reclaimed)
{
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
return 0;
+ /* Always retry if specifically requested */
+ if (gfp_mask & __GFP_NOFAIL)
+ return 1;
+
+ /*
+ * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+ * making forward progress without invoking OOM. Suspend also disables
+ * storage devices so kswapd will not help. Bail if we are suspending.
+ */
+ if (!did_some_progress && pm_suspended_storage())
+ return 0;
+
/*
* In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
* means __GFP_NOFAIL, but that may not be true in other
@@ -1797,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
return 1;
- /*
- * Don't let big-order allocations loop unless the caller
- * explicitly requests that.
- */
- if (gfp_mask & __GFP_NOFAIL)
- return 1;
-
return 0;
}
@@ -1864,14 +1981,20 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
struct page *page;
- if (!order || compaction_deferred(preferred_zone))
+ if (!order)
return NULL;
+ if (compaction_deferred(preferred_zone)) {
+ *deferred_compaction = true;
+ return NULL;
+ }
+
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration);
@@ -1899,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
- defer_compaction(preferred_zone);
+
+ /*
+ * As async compaction considers a subset of pageblocks, only
+ * defer if the failure was a sync compaction failure.
+ */
+ if (sync_migration)
+ defer_compaction(preferred_zone);
cond_resched();
}
@@ -1911,8 +2040,9 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
return NULL;
}
@@ -2062,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
bool sync_migration = false;
+ bool deferred_compaction = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -2142,12 +2273,22 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
sync_migration = true;
+ /*
+ * If compaction is deferred for high-order allocations, it is because
+ * sync compaction recently failed. In this is the case and the caller
+ * has requested the system not be heavily disrupted, fail the
+ * allocation now instead of entering direct reclaim
+ */
+ if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+ goto nopage;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
@@ -2196,7 +2337,8 @@ rebalance:
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
- if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+ if (should_alloc_retry(gfp_mask, order, did_some_progress,
+ pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
@@ -2210,8 +2352,9 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
}
@@ -2306,16 +2449,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page);
-void __pagevec_free(struct pagevec *pvec)
-{
- int i = pagevec_count(pvec);
-
- while (--i >= 0) {
- trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
- free_hot_cold_page(pvec->pages[i], pvec->cold);
- }
-}
-
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
@@ -3385,25 +3518,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
if (page_to_nid(page) != zone_to_nid(zone))
continue;
- /* Blocks with reserved pages will never free, skip them. */
- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
- if (pageblock_is_reserved(pfn, block_end_pfn))
- continue;
-
block_migratetype = get_pageblock_migratetype(page);
- /* If this block is reserved, account for it */
- if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
- reserve--;
- continue;
- }
+ /* Only test what is necessary when the reserves are not met */
+ if (reserve > 0) {
+ /*
+ * Blocks with reserved pages will never free, skip
+ * them.
+ */
+ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+ if (pageblock_is_reserved(pfn, block_end_pfn))
+ continue;
- /* Suitable for reserving if this block is movable */
- if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
- set_pageblock_migratetype(page, MIGRATE_RESERVE);
- move_freepages_block(zone, page, MIGRATE_RESERVE);
- reserve--;
- continue;
+ /* If this block is reserved, account for it */
+ if (block_migratetype == MIGRATE_RESERVE) {
+ reserve--;
+ continue;
+ }
+
+ /* Suitable for reserving if this block is movable */
+ if (block_migratetype == MIGRATE_MOVABLE) {
+ set_pageblock_migratetype(page,
+ MIGRATE_RESERVE);
+ move_freepages_block(zone, page,
+ MIGRATE_RESERVE);
+ reserve--;
+ continue;
+ }
}
/*
@@ -4121,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
- enum lru_list l;
+ enum lru_list lru;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4171,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- for_each_lru(l)
- INIT_LIST_HEAD(&zone->lru[l].list);
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4526,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat)
for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
- if (zone->present_pages)
+ if (zone->present_pages) {
node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+ break;
+ }
}
#endif
}
@@ -4734,8 +4877,19 @@ static void calculate_totalreserve_pages(void)
if (max > zone->present_pages)
max = zone->present_pages;
reserve_pages += max;
+ /*
+ * Lowmem reserves are not available to
+ * GFP_HIGHUSER page cache allocations and
+ * kswapd tries to balance zones to their high
+ * watermark. As a result, neither should be
+ * regarded as dirtyable memory, to prevent a
+ * situation where reclaim has to clean pages
+ * in order to balance the zones.
+ */
+ zone->dirty_balance_reserve = max;
}
}
+ dirty_balance_reserve = reserve_pages;
totalreserve_pages = reserve_pages;
}
@@ -5259,7 +5413,25 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
bool is_pageblock_removable_nolock(struct page *page)
{
- struct zone *zone = page_zone(page);
+ struct zone *zone;
+ unsigned long pfn;
+
+ /*
+ * We have to be careful here because we are iterating over memory
+ * sections which are not zone aware so we might end up outside of
+ * the zone but still within the section.
+ * We have to take care about the node as well. If the node is offline
+ * its NODE_DATA will be NULL - see page_zone.
+ */
+ if (!node_online(page_to_nid(page)))
+ return false;
+
+ zone = page_zone(page);
+ pfn = page_to_pfn(page);
+ if (zone->zone_start_pfn > pfn ||
+ zone->zone_start_pfn + zone->spanned_pages <= pfn)
+ return false;
+
return __count_immobile_pages(zone, page, 0);
}