summaryrefslogtreecommitdiff
path: root/mm/mm_init.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/mm_init.c')
-rw-r--r--mm/mm_init.c216
1 files changed, 90 insertions, 126 deletions
diff --git a/mm/mm_init.c b/mm/mm_init.c
index b6a1fcf6e13a..f72b852bd5b8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -24,6 +24,7 @@
#include <linux/page_ext.h>
#include <linux/pti.h>
#include <linux/pgtable.h>
+#include <linux/stackdepot.h>
#include <linux/swap.h>
#include <linux/cma.h>
#include <linux/crash_dump.h>
@@ -227,7 +228,6 @@ static unsigned long required_movablecore_percent __initdata;
static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
-static unsigned long dma_reserve __initdata;
static bool deferred_struct_pages __meminitdata;
@@ -1145,7 +1145,7 @@ static void __init adjust_zone_range_for_zone_movable(int nid,
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
* then all holes in the requested range will be accounted for.
*/
-unsigned long __init __absent_pages_in_range(int nid,
+static unsigned long __init __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
@@ -1266,6 +1266,30 @@ static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
}
+static void __init calc_nr_kernel_pages(void)
+{
+ unsigned long start_pfn, end_pfn;
+ phys_addr_t start_addr, end_addr;
+ u64 u;
+#ifdef CONFIG_HIGHMEM
+ unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
+#endif
+
+ for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
+ start_pfn = PFN_UP(start_addr);
+ end_pfn = PFN_DOWN(end_addr);
+
+ if (start_pfn < end_pfn) {
+ nr_all_pages += end_pfn - start_pfn;
+#ifdef CONFIG_HIGHMEM
+ start_pfn = clamp(start_pfn, 0, high_zone_low);
+ end_pfn = clamp(end_pfn, 0, high_zone_low);
+#endif
+ nr_kernel_pages += end_pfn - start_pfn;
+ }
+ }
+}
+
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
unsigned long node_end_pfn)
@@ -1309,26 +1333,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}
-static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
- unsigned long present_pages)
-{
- unsigned long pages = spanned_pages;
-
- /*
- * Provide a more accurate estimation if there are holes within
- * the zone and SPARSEMEM is in use. If there are holes within the
- * zone, each populated memory region may cost us one or two extra
- * memmap pages due to alignment because memmap pages for each
- * populated regions may not be naturally aligned on page boundary.
- * So the (present_pages >> 4) heuristic is a tradeoff for that.
- */
- if (spanned_pages > present_pages + (present_pages >> 4) &&
- IS_ENABLED(CONFIG_SPARSEMEM))
- pages = present_pages;
-
- return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
@@ -1543,15 +1547,6 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
}
#endif
-/*
- * Set up the zone data structures:
- * - mark all pages reserved
- * - mark all memory queues empty
- * - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- * NOTE: this function is only called during early init.
- */
static void __init free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
@@ -1562,47 +1557,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long size, freesize, memmap_pages;
-
- size = zone->spanned_pages;
- freesize = zone->present_pages;
-
- /*
- * Adjust freesize so that it accounts for how much memory
- * is used by this zone for memmap. This affects the watermark
- * and per-cpu initialisations
- */
- memmap_pages = calc_memmap_size(size, freesize);
- if (!is_highmem_idx(j)) {
- if (freesize >= memmap_pages) {
- freesize -= memmap_pages;
- if (memmap_pages)
- pr_debug(" %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
- } else
- pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
- zone_names[j], memmap_pages, freesize);
- }
-
- /* Account for reserved pages */
- if (j == 0 && freesize > dma_reserve) {
- freesize -= dma_reserve;
- pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
- }
-
- if (!is_highmem_idx(j))
- nr_kernel_pages += freesize;
- /* Charge for highmem memmap if there are enough kernel pages */
- else if (nr_kernel_pages > memmap_pages * 2)
- nr_kernel_pages -= memmap_pages;
- nr_all_pages += freesize;
+ unsigned long size = zone->spanned_pages;
/*
- * Set an approximate value for lowmem here, it will be adjusted
- * when the bootmem allocator frees pages into the buddy system.
- * And all highmem pages will be managed by the buddy system.
+ * Initialize zone->managed_pages as 0 , it will be reset
+ * when memblock allocator frees pages into buddy system.
*/
- zone_init_internals(zone, j, nid, freesize);
+ zone_init_internals(zone, j, nid, zone->present_pages);
if (!size)
continue;
@@ -1875,30 +1836,26 @@ void __init free_area_init(unsigned long *max_zone_pfn)
panic("Cannot allocate %zuB for node %d.\n",
sizeof(*pgdat), nid);
arch_refresh_nodedata(nid, pgdat);
- free_area_init_node(nid);
-
- /*
- * We do not want to confuse userspace by sysfs
- * files/directories for node without any memory
- * attached to it, so this node is not marked as
- * N_MEMORY and not marked online so that no sysfs
- * hierarchy will be created via register_one_node for
- * it. The pgdat will get fully initialized by
- * hotadd_init_pgdat() when memory is hotplugged into
- * this node.
- */
- continue;
}
pgdat = NODE_DATA(nid);
free_area_init_node(nid);
- /* Any memory on that node */
- if (pgdat->node_present_pages)
+ /*
+ * No sysfs hierarcy will be created via register_one_node()
+ *for memory-less node because here it's not marked as N_MEMORY
+ *and won't be set online later. The benefit is userspace
+ *program won't be confused by sysfs files/directories of
+ *memory-less node. The pgdat will get fully initialized by
+ *hotadd_init_pgdat() when memory is hotplugged into this node.
+ */
+ if (pgdat->node_present_pages) {
node_set_state(nid, N_MEMORY);
- check_for_memory(pgdat);
+ check_for_memory(pgdat);
+ }
}
+ calc_nr_kernel_pages();
memmap_init();
/* disable hash distribution for systems with a single node */
@@ -2058,7 +2015,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
__init_single_page(page, pfn, zid, nid);
nr_pages++;
}
- return (nr_pages);
+ return nr_pages;
}
/*
@@ -2260,10 +2217,6 @@ zone_empty:
* Return true when zone was grown, otherwise return false. We return true even
* when we grow less than requested, to let the caller decide if there are
* enough pages to satisfy the allocation.
- *
- * Note: We use noinline because this function is needed only during boot, and
- * it is called from a __ref function _deferred_grow_zone. This way we are
- * making sure that it is not inlined into permanent text section.
*/
bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
{
@@ -2413,17 +2366,6 @@ void __init page_alloc_init_late(void)
page_alloc_sysctl_init();
}
-#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-/*
- * Returns the number of pages that arch has reserved but
- * is not known to alloc_large_system_hash().
- */
-static unsigned long __init arch_reserved_kernel_pages(void)
-{
- return 0;
-}
-#endif
-
/*
* Adaptive scale is meant to reduce sizes of hash tables on large memory
* machines. As memory size is increased the scale is also increased but at
@@ -2466,7 +2408,6 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
numentries = nr_kernel_pages;
- numentries -= arch_reserved_kernel_pages();
/* It isn't necessary when PAGE_SIZE >= 1MB */
if (PAGE_SIZE < SZ_1M)
@@ -2548,26 +2489,9 @@ void *__init alloc_large_system_hash(const char *tablename,
return table;
}
-/**
- * set_dma_reserve - set the specified number of pages reserved in the first zone
- * @new_dma_reserve: The number of pages to mark reserved
- *
- * The per-cpu batchsize and zone watermarks are determined by managed_pages.
- * In the DMA zone, a significant percentage may be consumed by kernel image
- * and other unfreeable allocations which can skew the watermarks badly. This
- * function may optionally be used to account for unfreeable pages in the
- * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
- * smaller per-cpu batchsize.
- */
-void __init set_dma_reserve(unsigned long new_dma_reserve)
-{
- dma_reserve = new_dma_reserve;
-}
-
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
-
if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
int nid = early_pfn_to_nid(pfn);
@@ -2579,6 +2503,17 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
/* KMSAN will take care of these pages. */
return;
}
+
+ /* pages were reserved and not allocated */
+ if (mem_alloc_profiling_enabled()) {
+ union codetag_ref *ref = get_page_tag_ref(page);
+
+ if (ref) {
+ set_codetag_empty(ref);
+ put_page_tag_ref(ref);
+ }
+ }
+
__free_pages_core(page, order);
}
@@ -2588,6 +2523,9 @@ EXPORT_SYMBOL(init_on_alloc);
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free);
+EXPORT_SYMBOL(init_mlocked_on_free);
+
static bool _init_on_alloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
@@ -2605,6 +2543,14 @@ static int __init early_init_on_free(char *buf)
}
early_param("init_on_free", early_init_on_free);
+static bool _init_mlocked_on_free_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON);
+static int __init early_init_mlocked_on_free(char *buf)
+{
+ return kstrtobool(buf, &_init_mlocked_on_free_enabled_early);
+}
+early_param("init_mlocked_on_free", early_init_mlocked_on_free);
+
DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
/*
@@ -2632,12 +2578,21 @@ static void __init mem_debugging_and_hardening_init(void)
}
#endif
- if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
+ if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early ||
+ _init_mlocked_on_free_enabled_early) &&
page_poisoning_requested) {
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
- "will take precedence over init_on_alloc and init_on_free\n");
+ "will take precedence over init_on_alloc, init_on_free "
+ "and init_mlocked_on_free\n");
_init_on_alloc_enabled_early = false;
_init_on_free_enabled_early = false;
+ _init_mlocked_on_free_enabled_early = false;
+ }
+
+ if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) {
+ pr_info("mem auto-init: init_on_free is on, "
+ "will take precedence over init_mlocked_on_free\n");
+ _init_mlocked_on_free_enabled_early = false;
}
if (_init_on_alloc_enabled_early) {
@@ -2654,9 +2609,17 @@ static void __init mem_debugging_and_hardening_init(void)
static_branch_disable(&init_on_free);
}
- if (IS_ENABLED(CONFIG_KMSAN) &&
- (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
- pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
+ if (_init_mlocked_on_free_enabled_early) {
+ want_check_pages = true;
+ static_branch_enable(&init_mlocked_on_free);
+ } else {
+ static_branch_disable(&init_mlocked_on_free);
+ }
+
+ if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early ||
+ _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early))
+ pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and "
+ "init_mlocked_on_free are disabled when running KMSAN\n");
#ifdef CONFIG_DEBUG_PAGEALLOC
if (debug_pagealloc_enabled()) {
@@ -2695,9 +2658,10 @@ static void __init report_meminit(void)
else
stack = "off";
- pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
+ pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n",
stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
- want_init_on_free() ? "on" : "off");
+ want_init_on_free() ? "on" : "off",
+ want_init_mlocked_on_free() ? "on" : "off");
if (want_init_on_free())
pr_info("mem auto-init: clearing system memory may take some time...\n");
}