Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  353
1 file changed, 212 insertions(+), 141 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fab5e97dc9ca..23f5066bd4a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/padata.h>
+#include <linux/khugepaged.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -77,6 +78,34 @@
#include "shuffle.h"
#include "page_reporting.h"
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
+typedef int __bitwise fpi_t;
+
+/* No special request */
+#define FPI_NONE ((__force fpi_t)0)
+
+/*
+ * Skip free page reporting notification for the (possibly merged) page.
+ * This does not hinder free page reporting from grabbing the page,
+ * reporting it and marking it "reported" - it only skips notifying
+ * the free page reporting infrastructure about a newly freed page. For
+ * example, used when temporarily pulling a page from a freelist and
+ * putting it back unmodified.
+ */
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
+
+/*
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
+ * shuffle the whole zone).
+ *
+ * Note: No code should rely on this flag for correctness - it's purely
+ * to allow for optimizations when handing back either fresh pages
+ * (memory onlining) or untouched pages (page isolation, free page
+ * reporting).
+ */
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
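As an aside on how these fpi_t flags are meant to be used: they are plain bit flags that callers OR together and the free path tests (__free_one_page() below checks FPI_TO_TAIL and FPI_SKIP_REPORT_NOTIFY). A minimal userspace sketch of that pattern, with ordinary macros standing in for the kernel's BIT() and sparse's __bitwise/__force annotations, and a made-up helper name free_one():

/*
 * Illustrative only, not kernel code: shows how fpi_t flags combine
 * and how the free path tests them.
 */
#include <stdio.h>

typedef unsigned int fpi_t;
#define BIT(nr)                ((unsigned int)1 << (nr))
#define FPI_NONE               ((fpi_t)0)
#define FPI_SKIP_REPORT_NOTIFY ((fpi_t)BIT(0))
#define FPI_TO_TAIL            ((fpi_t)BIT(1))

static void free_one(fpi_t fpi_flags)
{
    int to_tail = !!(fpi_flags & FPI_TO_TAIL);            /* tail placement wins */
    int notify  = !(fpi_flags & FPI_SKIP_REPORT_NOTIFY);  /* free page reporting */

    printf("to_tail=%d notify=%d\n", to_tail, notify);
}

int main(void)
{
    free_one(FPI_NONE);                             /* to_tail=0 notify=1 */
    free_one(FPI_TO_TAIL | FPI_SKIP_REPORT_NOTIFY); /* to_tail=1 notify=0 */
    return 0;
}

In the real code the __bitwise annotation additionally lets sparse warn when an fpi_t value is mixed with unrelated flag types.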
@@ -155,16 +184,16 @@ static int __init early_init_on_alloc(char *buf)
int ret;
bool bool_result;
- if (!buf)
- return -EINVAL;
ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
if (bool_result && page_poisoning_enabled())
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
if (bool_result)
static_branch_enable(&init_on_alloc);
else
static_branch_disable(&init_on_alloc);
- return ret;
+ return 0;
}
early_param("init_on_alloc", early_init_on_alloc);
@@ -173,16 +202,16 @@ static int __init early_init_on_free(char *buf)
int ret;
bool bool_result;
- if (!buf)
- return -EINVAL;
ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
if (bool_result && page_poisoning_enabled())
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
if (bool_result)
static_branch_enable(&init_on_free);
else
static_branch_disable(&init_on_free);
- return ret;
+ return 0;
}
early_param("init_on_free", early_init_on_free);
@@ -246,7 +275,8 @@ bool pm_suspended_storage(void)
unsigned int pageblock_order __read_mostly;
#endif
-static void __free_pages_ok(struct page *page, unsigned int order);
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
@@ -658,7 +688,7 @@ out:
void free_compound_page(struct page *page)
{
mem_cgroup_uncharge(page);
- __free_pages_ok(page, compound_order(page));
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -762,7 +792,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
-static inline void set_page_order(struct page *page, unsigned int order)
+static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
@@ -787,7 +817,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy,
if (!page_is_guard(buddy) && !PageBuddy(buddy))
return false;
- if (page_order(buddy) != order)
+ if (buddy_order(buddy) != order)
return false;
/*
@@ -872,13 +902,17 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
area->nr_free++;
}
-/* Used for pages which are on another list */
+/*
+ * Used for pages which are on another list. Move the pages to the tail
+ * of the list - so the moved pages won't immediately be considered for
+ * allocation again (e.g., optimization for memory onlining).
+ */
static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
- list_move(&page->lru, &area->free_list[migratetype]);
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -951,7 +985,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
- int migratetype, bool report)
+ int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
unsigned long buddy_pfn;
@@ -1025,9 +1059,11 @@ continue_merging:
}
done_merging:
- set_page_order(page, order);
+ set_buddy_order(page, order);
- if (is_shuffle_order(order))
+ if (fpi_flags & FPI_TO_TAIL)
+ to_tail = true;
+ else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
@@ -1038,7 +1074,7 @@ done_merging:
add_to_free_list(page, zone, order, migratetype);
/* Notify page reporting subsystem of freed page */
- if (report)
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
@@ -1173,6 +1209,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
trace_mm_page_free(page, order);
+ if (unlikely(PageHWPoison(page)) && !order) {
+ /*
+ * Do not let hwpoison pages hit pcplists/buddy
+ * Untie memcg state and reset page's owner
+ */
+ if (memcg_kmem_enabled() && PageKmemcg(page))
+ __memcg_kmem_uncharge_page(page, order);
+ reset_page_owner(page, order);
+ return false;
+ }
+
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
@@ -1368,7 +1415,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
- __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
@@ -1377,14 +1424,14 @@ static void free_pcppages_bulk(struct zone *zone, int count,
static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
- int migratetype)
+ int migratetype, fpi_t fpi_flags)
{
spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
- __free_one_page(page, pfn, zone, order, migratetype, true);
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock(&zone->lock);
}
@@ -1462,7 +1509,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
}
}
-static void __free_pages_ok(struct page *page, unsigned int order)
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long flags;
int migratetype;
@@ -1474,7 +1522,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, pfn, order, migratetype);
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
+ fpi_flags);
local_irq_restore(flags);
}
@@ -1484,6 +1533,11 @@ void __free_pages_core(struct page *page, unsigned int order)
struct page *p = page;
unsigned int loop;
+ /*
+ * When initializing the memmap, __init_single_page() sets the refcount
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
+ * refcount of all involved pages to 0.
+ */
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
@@ -1494,8 +1548,12 @@ void __free_pages_core(struct page *page, unsigned int order)
set_page_count(p, 0);
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
- set_page_refcounted(page);
- __free_pages(page, order);
+
+ /*
+ * Bypass PCP and place fresh pages right to the tail, primarily
+ * relevant for memory onlining.
+ */
+ __free_pages_ok(page, order, FPI_TO_TAIL);
}
#ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -2120,7 +2178,7 @@ static inline void expand(struct zone *zone, struct page *page,
continue;
add_to_free_list(&page[size], zone, high, migratetype);
- set_page_order(&page[size], high);
+ set_buddy_order(&page[size], high);
}
}
@@ -2298,7 +2356,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
#endif
/*
- * Move the free pages in a range to the free lists of the requested type.
+ * Move the free pages in a range to the freelist tail of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
@@ -2334,7 +2392,7 @@ static int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
- order = page_order(page);
+ order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
page += 1 << order;
pages_moved += 1 << order;
@@ -2458,7 +2516,7 @@ static inline void boost_watermark(struct zone *zone)
static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
- unsigned int current_order = page_order(page);
+ unsigned int current_order = buddy_order(page);
int free_pages, movable_pages, alike_pages;
int old_block_type;
@@ -3122,7 +3180,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, 0, migratetype);
+ free_one_page(zone, page, pfn, 0, migratetype,
+ FPI_NONE);
return;
}
migratetype = MIGRATE_MOVABLE;
@@ -3208,7 +3267,7 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- split_page_owner(page, order);
+ split_page_owner(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -3277,7 +3336,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
lockdep_assert_held(&zone->lock);
/* Return isolated page to tail of freelist. */
- __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
}
/*
@@ -3367,9 +3427,16 @@ struct page *rmqueue(struct zone *preferred_zone,
struct page *page;
if (likely(order == 0)) {
- page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
+ /*
+ * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
+ * we need to skip it when CMA area isn't allowed.
+ */
+ if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
+ migratetype != MIGRATE_MOVABLE) {
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
migratetype, alloc_flags);
- goto out;
+ goto out;
+ }
}
/*
@@ -3381,7 +3448,13 @@ struct page *rmqueue(struct zone *preferred_zone,
do {
page = NULL;
- if (alloc_flags & ALLOC_HARDER) {
+ /*
+ * order-0 request can reach here when the pcplist is skipped
+ * due to non-CMA allocation context. HIGHATOMIC area is
+ * reserved for high-order atomic allocation, so order-0
+ * request should skip it.
+ */
+ if (order > 0 && alloc_flags & ALLOC_HARDER) {
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
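For illustration, the pcplist gate added to rmqueue() above reduces to a small predicate: an order-0 request may use the pcplist unless it is a MIGRATE_MOVABLE request that is not allowed to take CMA pages, because the MOVABLE pcplist may hold CMA pages. A userspace sketch of just that condition, with assumed constant values and a made-up helper name can_use_pcplist():

/* Illustrative only; ALLOC_CMA's value and CONFIG_CMA=1 are assumed here. */
#include <stdbool.h>
#include <stdio.h>

#define CONFIG_CMA        1
#define ALLOC_CMA         0x80
#define MIGRATE_UNMOVABLE 0
#define MIGRATE_MOVABLE   1

static bool can_use_pcplist(unsigned int alloc_flags, int migratetype)
{
    return !CONFIG_CMA || (alloc_flags & ALLOC_CMA) ||
           migratetype != MIGRATE_MOVABLE;
}

int main(void)
{
    printf("%d\n", can_use_pcplist(0, MIGRATE_MOVABLE));          /* 0: skip pcplist */
    printf("%d\n", can_use_pcplist(ALLOC_CMA, MIGRATE_MOVABLE));  /* 1 */
    printf("%d\n", can_use_pcplist(0, MIGRATE_UNMOVABLE));        /* 1 */
    return 0;
}

When the pcplist is skipped this way, the request falls through to the buddy lists, which is why the order > 0 check on the HIGHATOMIC fallback above becomes necessary.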
@@ -3482,7 +3555,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
-static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
}
@@ -3727,8 +3800,8 @@ retry:
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
- ac->highest_zoneidx, ac->nodemask) {
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
+ ac->nodemask) {
struct page *page;
unsigned long mark;
@@ -3972,8 +4045,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* success so it is time to admit defeat. We will skip the OOM killer
* because it is very likely that the caller has a more reasonable
* fallback than shooting a random task.
+ *
+ * The OOM killer may not free memory on a specific node.
*/
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->highest_zoneidx < ZONE_NORMAL)
@@ -3990,10 +4065,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* failures more gracefully we should just bail out here.
*/
- /* The OOM killer may not free memory on a specific node */
- if (gfp_mask & __GFP_THISNODE)
- goto out;
-
/* Exhausted what can be done so it's blame time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
@@ -4241,13 +4312,12 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
/* Perform direct synchronous page reclaim */
-static int
+static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
- int progress;
unsigned int noreclaim_flag;
- unsigned long pflags;
+ unsigned long pflags, progress;
cond_resched();
@@ -4826,12 +4896,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
- return true;
-}
-
-/* Determine whether to spread dirty pages and what the first usable zone */
-static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
-{
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -4842,6 +4906,8 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
+
+ return true;
}
/*
@@ -4870,8 +4936,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
- finalise_ac(gfp_mask, &ac);
-
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
@@ -4940,13 +5004,16 @@ static inline void free_the_page(struct page *page, unsigned int order)
if (order == 0) /* Via pcp? */
free_unref_page(page);
else
- __free_pages_ok(page, order);
+ __free_pages_ok(page, order, FPI_NONE);
}
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page))
free_the_page(page, order);
+ else if (!PageHead(page))
+ while (order-- > 0)
+ free_the_page(page + (1 << order), order);
}
EXPORT_SYMBOL(__free_pages);
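As a concrete illustration of the new else-branch in __free_pages(): for a non-compound, order-3 allocation where another reference to the first page is still outstanding, the loop frees every sub-page except page 0, which stays with the remaining reference holder. A userspace sketch (illustrative only) of the offsets it walks:

/* Not kernel code: prints the sub-ranges the while-loop above releases. */
#include <stdio.h>

int main(void)
{
    unsigned int order = 3; /* an 8-page, non-compound allocation */

    while (order-- > 0)
        printf("free pages [%u..%u] as order %u\n",
               1u << order, (1u << (order + 1)) - 1, order);
    /*
     * Prints [4..7], [2..3], [1..1]; page 0 is freed later by the
     * holder of the remaining reference.
     */
    return 0;
}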
@@ -5637,7 +5704,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
int min_val = INT_MAX;
int best_node = NUMA_NO_NODE;
- const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
@@ -5658,8 +5724,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
val += (n < node);
/* Give preference to headless and unused nodes */
- tmp = cpumask_of_node(n);
- if (!cpumask_empty(tmp))
+ if (!cpumask_empty(cpumask_of_node(n)))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
@@ -5955,7 +6020,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (*pfn < memblock_region_memory_end_pfn(r))
break;
}
@@ -5973,10 +6038,15 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
* done. Non-atomic initialization, single-pass.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum memmap_context context,
- struct vmem_altmap *altmap)
+ unsigned long start_pfn,
+ enum meminit_context context,
+ struct vmem_altmap *altmap, int migratetype)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
@@ -6007,7 +6077,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context == MEMMAP_EARLY) {
+ if (context == MEMINIT_EARLY) {
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, end_pfn))
@@ -6016,23 +6086,16 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
- if (context == MEMMAP_HOTPLUG)
+ if (context == MEMINIT_HOTPLUG)
__SetPageReserved(page);
/*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
+ * such that unmovable allocations won't be scattered all
+ * over the place during system boot.
*/
- if (!(pfn & (pageblock_nr_pages - 1))) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ set_pageblock_migratetype(page, migratetype);
cond_resched();
}
pfn++;
@@ -6094,15 +6157,10 @@ void __ref memmap_init_zone_device(struct zone *zone,
* the address space during boot when many long-lived
* kernel allocations are made.
*
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- *
- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
- if (!(pfn & (pageblock_nr_pages - 1))) {
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
@@ -6137,7 +6195,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid,
if (end_pfn > start_pfn) {
size = end_pfn - start_pfn;
memmap_init_zone(size, nid, zone, start_pfn,
- MEMMAP_EARLY, NULL);
+ MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
}
}
}
@@ -6540,7 +6598,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long start_pfn, end_pfn;
struct memblock_region *r;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
start_pfn = clamp(memblock_region_memory_base_pfn(r),
zone_start_pfn, zone_end_pfn);
end_pfn = clamp(memblock_region_memory_end_pfn(r),
@@ -6984,8 +7042,7 @@ static void __init init_unavailable_mem(void)
* Loop through unavailable ranges not covered by memblock.memory.
*/
pgcnt = 0;
- for_each_mem_range(i, &memblock.memory, NULL,
- NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_mem_range(i, &start, &end) {
if (next < start)
pgcnt += init_unavailable_range(PFN_DOWN(next),
PFN_UP(start));
@@ -7135,7 +7192,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
* options.
*/
if (movable_node_is_enabled()) {
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (!memblock_is_hotpluggable(r))
continue;
@@ -7156,7 +7213,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (mirrored_kernelcore) {
bool mem_below_4gb_not_mirrored = false;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (memblock_is_mirror(r))
continue;
@@ -7891,6 +7948,8 @@ int __meminit init_per_zone_wmark_min(void)
setup_min_slab_ratio();
#endif
+ khugepaged_min_free_kbytes_update();
+
return 0;
}
postcore_initcall(init_per_zone_wmark_min)
@@ -8218,14 +8277,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
{
unsigned long iter = 0;
unsigned long pfn = page_to_pfn(page);
-
- /*
- * TODO we could make this much more efficient by not checking every
- * page in the range if we know all of them are in MOVABLE_ZONE and
- * that the movable zone guarantees that pages are migratable but
- * the later is not the case right now unfortunatelly. E.g. movablecore
- * can still lead to having bootmem allocations in zone_movable.
- */
+ unsigned long offset = pfn % pageblock_nr_pages;
if (is_migrate_cma_page(page)) {
/*
@@ -8239,12 +8291,18 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
return page;
}
- for (; iter < pageblock_nr_pages; iter++) {
+ for (; iter < pageblock_nr_pages - offset; iter++) {
if (!pfn_valid_within(pfn + iter))
continue;
page = pfn_to_page(pfn + iter);
+ /*
+ * Both, bootmem allocations and memory holes are marked
+ * PG_reserved and are unmovable. We can even have unmovable
+ * allocations inside ZONE_MOVABLE, for example when
+ * specifying "movablecore".
+ */
if (PageReserved(page))
return page;
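A quick worked example (with an assumed pageblock_nr_pages of 512, i.e. 4K pages and 2M pageblocks) of what the new offset bound changes: starting from a pfn in the middle of a pageblock, the scan now stops at the pageblock boundary instead of running pageblock_nr_pages iterations past the starting pfn.

/* Illustrative only, not kernel code. */
#include <stdio.h>

int main(void)
{
    unsigned long pageblock_nr_pages = 512;  /* assumed */
    unsigned long pfn = 1000;                /* inside pageblock [512..1023] */
    unsigned long offset = pfn % pageblock_nr_pages;

    printf("scan pfns [%lu..%lu], %lu pages\n",
           pfn, pfn + (pageblock_nr_pages - offset) - 1,
           pageblock_nr_pages - offset);
    /* Prints [1000..1023], 24 pages; the old bound would have gone to 1511. */
    return 0;
}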
@@ -8286,7 +8344,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
*/
if (!page_ref_count(page)) {
if (PageBuddy(page))
- iter += (1 << page_order(page)) - 1;
+ iter += (1 << buddy_order(page)) - 1;
continue;
}
@@ -8318,14 +8376,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
* it. But now, memory offline itself doesn't call
* shrink_node_slabs() and it still to be fixed.
*/
- /*
- * If the page is not RAM, page_count()should be 0.
- * we don't need more check. This is an _used_ not-movable page.
- *
- * The problematic thing here is PG_reserved pages. PG_reserved
- * is set to both of a memory hole page and a _used_ kernel
- * page at boot.
- */
return page;
}
return NULL;
@@ -8459,7 +8509,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = start_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype, 0);
- if (ret < 0)
+ if (ret)
return ret;
/*
@@ -8507,7 +8557,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
if (outer_start != start) {
- order = page_order(pfn_to_page(outer_start));
+ order = buddy_order(pfn_to_page(outer_start));
/*
* outer_start page could be small order buddy page and
@@ -8695,35 +8745,21 @@ void zone_pcp_reset(struct zone *zone)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * All pages in the range must be in a single zone and isolated
- * before calling this.
+ * All pages in the range must be in a single zone, must not contain holes,
+ * must span full sections, and must be isolated before calling this function.
*/
-unsigned long
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
+ unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
unsigned int order;
- unsigned long pfn;
unsigned long flags;
- unsigned long offlined_pages = 0;
-
- /* find the first valid pfn */
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
- if (pfn_valid(pfn))
- break;
- if (pfn == end_pfn)
- return offlined_pages;
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
- pfn = start_pfn;
while (pfn < end_pfn) {
- if (!pfn_valid(pfn)) {
- pfn++;
- continue;
- }
page = pfn_to_page(pfn);
/*
* The HWPoisoned page may be not in buddy system, and
@@ -8731,7 +8767,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
*/
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
- offlined_pages++;
continue;
}
/*
@@ -8742,20 +8777,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(page_count(page));
BUG_ON(PageBuddy(page));
pfn++;
- offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
- order = page_order(page);
- offlined_pages += 1 << order;
+ order = buddy_order(page);
del_page_from_free_list(page, zone, order);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return offlined_pages;
}
#endif
@@ -8770,7 +8801,7 @@ bool is_free_buddy_page(struct page *page)
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
- if (PageBuddy(page_head) && page_order(page_head) >= order)
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
break;
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -8780,30 +8811,70 @@ bool is_free_buddy_page(struct page *page)
#ifdef CONFIG_MEMORY_FAILURE
/*
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
- * test is performed under the zone lock to prevent a race against page
- * allocation.
+ * Break down a higher-order page in sub-pages, and keep our target out of
+ * buddy allocator.
*/
-bool set_hwpoison_free_buddy_page(struct page *page)
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
+ struct page *target, int low, int high,
+ int migratetype)
+{
+ unsigned long size = 1 << high;
+ struct page *current_buddy, *next_page;
+
+ while (high > low) {
+ high--;
+ size >>= 1;
+
+ if (target >= &page[size]) {
+ next_page = page + size;
+ current_buddy = page;
+ } else {
+ next_page = page;
+ current_buddy = page + size;
+ }
+
+ if (set_page_guard(zone, current_buddy, high, migratetype))
+ continue;
+
+ if (current_buddy != target) {
+ add_to_free_list(current_buddy, zone, high, migratetype);
+ set_buddy_order(current_buddy, high);
+ page = next_page;
+ }
+ }
+}
+
+/*
+ * Take a page that will be marked as poisoned off the buddy allocator.
+ */
+bool take_page_off_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
unsigned int order;
- bool hwpoisoned = false;
+ bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
+ int page_order = buddy_order(page_head);
+
+ if (PageBuddy(page_head) && page_order >= order) {
+ unsigned long pfn_head = page_to_pfn(page_head);
+ int migratetype = get_pfnblock_migratetype(page_head,
+ pfn_head);
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
- if (!TestSetPageHWPoison(page))
- hwpoisoned = true;
+ del_page_from_free_list(page_head, zone, page_order);
+ break_down_buddy_pages(zone, page_head, page, 0,
+ page_order, migratetype);
+ ret = true;
break;
}
+ if (page_count(page_head) > 0)
+ break;
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return hwpoisoned;
+ return ret;
}
#endif
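To make the splitting strategy in break_down_buddy_pages() easier to follow, here is a standalone userspace sketch (not kernel code; the pfn offsets and order are made up) that halves an order-3 block repeatedly and "frees" the buddy half that does not contain the target page, leaving the target as an isolated order-0 page:

#include <stdio.h>

int main(void)
{
    unsigned long base = 0, target = 5;  /* offsets within the block */
    unsigned int high = 3, low = 0;      /* split an order-3 block down to order 0 */
    unsigned long size = 1UL << high;

    while (high > low) {
        high--;
        size >>= 1;
        if (target >= base + size) {
            /* target in upper half: free the lower half, descend into the upper */
            printf("free [%lu..%lu] at order %u\n",
                   base, base + size - 1, high);
            base += size;
        } else {
            /* target in lower half: free the upper half, stay in the lower */
            printf("free [%lu..%lu] at order %u\n",
                   base + size, base + 2 * size - 1, high);
        }
    }
    printf("page %lu is kept off the free lists\n", target);
    return 0;
}

This is the same idea take_page_off_buddy() relies on to pull a single hwpoisoned page out of a larger free block without losing the rest of the block.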