From dcfe378c81f72f146890ce1dcfdcc742d3b66924 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 21 Mar 2024 09:36:36 -0700 Subject: lib: introduce support for page allocation tagging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce helper functions to easily instrument page allocators by storing a pointer to the allocation tag associated with the code that allocated the page in a page_ext field. Link: https://lkml.kernel.org/r/20240321163705.3067592-15-surenb@google.com Signed-off-by: Suren Baghdasaryan Co-developed-by: Kent Overstreet Signed-off-by: Kent Overstreet Reviewed-by: Vlastimil Babka Tested-by: Kees Cook Cc: Alexander Viro Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Boqun Feng Cc: Christoph Lameter Cc: Dennis Zhou Cc: Gary Guo Cc: Miguel Ojeda Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Tejun Heo Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- mm/mm_init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index 549e76af8f82..2fd9bf044a79 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From d224eb0287fbd84f4f13eca042c7f08f87138f3b Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 21 Mar 2024 09:36:56 -0700 Subject: codetag: debug: mark codetags for reserved pages as empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid debug warnings while freeing reserved pages which were not allocated with usual allocators, mark their codetags as empty before freeing. Link: https://lkml.kernel.org/r/20240321163705.3067592-35-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Tested-by: Kees Cook Cc: Alexander Viro Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Boqun Feng Cc: Christoph Lameter Cc: Dennis Zhou Cc: Gary Guo Cc: Kent Overstreet Cc: Miguel Ojeda Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 1 + include/linux/mm.h | 9 +++++++++ include/linux/pgalloc_tag.h | 2 ++ mm/mm_init.c | 12 +++++++++++- 4 files changed, 23 insertions(+), 1 deletion(-) (limited to 'mm/mm_init.c') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 3af5d6a5ced4..afc9e259a2d3 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -47,6 +47,7 @@ static inline void set_codetag_empty(union codetag_ref *ref) #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } +static inline void set_codetag_empty(union codetag_ref *ref) {} #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ diff --git a/include/linux/mm.h b/include/linux/mm.h index b6bdaa18b9e9..0dbd737cfabd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -3134,6 +3135,14 @@ extern void reserve_bootmem_region(phys_addr_t start, /* Free the reserved page into the buddy system, so it gets managed. 
*/ static inline void free_reserved_page(struct page *page) { + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } ClearPageReserved(page); init_page_count(page); __free_page(page); diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 50d212330bbb..62d8dad74b37 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -120,6 +120,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) #else /* CONFIG_MEM_ALLOC_PROFILING */ +static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } +static inline void put_page_tag_ref(union codetag_ref *ref) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} diff --git a/mm/mm_init.c b/mm/mm_init.c index 2fd9bf044a79..f45c2b32ba82 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2567,7 +2567,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { int nid = early_pfn_to_nid(pfn); @@ -2579,6 +2578,17 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* KMSAN will take care of these pages. */ return; } + + /* pages were reserved and not allocated */ + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } + __free_pages_core(page, order); } -- cgit v1.2.3 From 6600a6b10c3210280e7623b8ba7feac9619cc2cd Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 25 Mar 2024 22:56:42 +0800 Subject: mm/mm_init.c: remove the useless dma_reserve Now nobody calls set_dma_reserve() to set value for dma_reserve, remove set_dma_reserve(), global variable dma_reserve and the codes using it. 
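For illustration of the pattern above: the reserved-page handling added by the codetag patch repeats the same clear-the-tag sequence in both free_reserved_page() and memblock_free_pages(). A small helper along these lines could capture it (a sketch only; the helper name is hypothetical and not part of this series):

/*
 * Hypothetical helper (sketch): mark a page's allocation tag reference
 * as empty so the CONFIG_MEM_ALLOC_PROFILING_DEBUG checks do not warn
 * when a page that never went through the page allocator is fed into
 * the buddy system.
 */
static inline void pgalloc_tag_mark_empty(struct page *page)
{
	if (mem_alloc_profiling_enabled()) {
		union codetag_ref *ref = get_page_tag_ref(page);

		if (ref) {
			set_codetag_empty(ref);
			put_page_tag_ref(ref);
		}
	}
}

Both call sites shown above could then be reduced to a single pgalloc_tag_mark_empty(page) call.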
Link: https://lkml.kernel.org/r/20240325145646.1044760-3-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/mm_init.c | 23 ----------------------- 2 files changed, 24 deletions(-) (limited to 'mm/mm_init.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index fe66b515aaab..22ae6ab621da 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3250,7 +3250,6 @@ static inline int early_pfn_to_nid(unsigned long pfn) extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif -extern void set_dma_reserve(unsigned long new_dma_reserve); extern void mem_init(void); extern void __init mmap_init(void); diff --git a/mm/mm_init.c b/mm/mm_init.c index f45c2b32ba82..91bb8600c0fa 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -227,7 +227,6 @@ static unsigned long required_movablecore_percent __initdata; static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; -static unsigned long dma_reserve __initdata; static bool deferred_struct_pages __meminitdata; @@ -1584,12 +1583,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat) zone_names[j], memmap_pages, freesize); } - /* Account for reserved pages */ - if (j == 0 && freesize > dma_reserve) { - freesize -= dma_reserve; - pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); - } - if (!is_highmem_idx(j)) nr_kernel_pages += freesize; /* Charge for highmem memmap if there are enough kernel pages */ @@ -2548,22 +2541,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/** - * set_dma_reserve - set the specified number of pages reserved in the first zone - * @new_dma_reserve: The number of pages to mark reserved - * - * The per-cpu batchsize and zone watermarks are determined by managed_pages. - * In the DMA zone, a significant percentage may be consumed by kernel image - * and other unfreeable allocations which can skew the watermarks badly. This - * function may optionally be used to account for unfreeable pages in the - * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and - * smaller per-cpu batchsize. - */ -void __init set_dma_reserve(unsigned long new_dma_reserve) -{ - dma_reserve = new_dma_reserve; -} - void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { -- cgit v1.2.3 From 8ad4184985e844ab5848dbf8b7b02f4e14b5d010 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 25 Mar 2024 22:56:43 +0800 Subject: mm/mm_init.c: add new function calc_nr_all_pages() This is a preparation to calculate nr_kernel_pages and nr_all_pages, both of which will be used later in alloc_large_system_hash(). nr_all_pages counts up all free but not reserved memory in memblock allocator, including HIGHMEM memory. While nr_kernel_pages counts up all free but not reserved low memory in memblock allocator, excluding HIGHMEM memory. 
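To make the counting rule above concrete, here is a minimal standalone C sketch of the clamp-to-lowmem idea (made-up PFN ranges and lowmem boundary; it mirrors only the arithmetic, not the kernel code):

#include <stdio.h>

static unsigned long clamp_pfn(unsigned long pfn, unsigned long hi)
{
	return pfn > hi ? hi : pfn;
}

int main(void)
{
	/* made-up free memblock ranges as [start_pfn, end_pfn) pairs */
	unsigned long ranges[][2] = { { 16, 1024 }, { 2048, 300000 } };
	/* stand-in for arch_zone_lowest_possible_pfn[ZONE_HIGHMEM] */
	unsigned long high_zone_low = 225280;	/* ~880 MiB of lowmem */
	unsigned long nr_all_pages = 0, nr_kernel_pages = 0;

	for (int i = 0; i < 2; i++) {
		unsigned long start = ranges[i][0], end = ranges[i][1];

		nr_all_pages += end - start;		/* lowmem + highmem */
		start = clamp_pfn(start, high_zone_low);
		end = clamp_pfn(end, high_zone_low);
		nr_kernel_pages += end - start;		/* lowmem only */
	}
	printf("nr_all_pages=%lu nr_kernel_pages=%lu\n",
	       nr_all_pages, nr_kernel_pages);
	return 0;
}

With these numbers, nr_all_pages counts 298960 pages while nr_kernel_pages stops at the lowmem boundary and counts 224240.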
Link: https://lkml.kernel.org/r/20240325145646.1044760-4-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton
--- mm/mm_init.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'mm/mm_init.c')
diff --git a/mm/mm_init.c b/mm/mm_init.c index 91bb8600c0fa..ae9149956497 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1265,6 +1265,30 @@ static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat) pr_debug("On node %d totalpages: 0\n", pgdat->node_id); } +static void __init calc_nr_kernel_pages(void) +{ + unsigned long start_pfn, end_pfn; + phys_addr_t start_addr, end_addr; + u64 u; +#ifdef CONFIG_HIGHMEM + unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; +#endif + + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) { + start_pfn = PFN_UP(start_addr); + end_pfn = PFN_DOWN(end_addr); + + if (start_pfn < end_pfn) { + nr_all_pages += end_pfn - start_pfn; +#ifdef CONFIG_HIGHMEM + start_pfn = clamp(start_pfn, 0, high_zone_low); + end_pfn = clamp(end_pfn, 0, high_zone_low); +#endif + nr_kernel_pages += end_pfn - start_pfn; + } + } +} + static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn) -- cgit v1.2.3
From 0ac5e785dcb797a3f0af1b205ce48134e186e88f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 25 Mar 2024 22:56:44 +0800 Subject: mm/mm_init.c: remove meaningless calculation of zone->managed_pages in free_area_init_core()
Currently, in free_area_init_core(), when initializing a zone's fields, a rough value is set to zone->managed_pages. That value is calculated as (zone->present_pages - memmap_pages). In the meantime, the value is added to nr_all_pages and nr_kernel_pages, which represent all free pages of the system (including HIGHMEM memory, and low memory only, respectively). Both of them are going to be used in alloc_large_system_hash().
However, the rough calculation and setting of zone->managed_pages is meaningless because a) memmap pages are allocated in units of node in sparse_init() or alloc_node_mem_map(pgdat); the simple (zone->present_pages - memmap_pages) is too rough to make sense per zone; b) the zone->managed_pages set here will be zeroed out and reset with the actual value in mem_init() via memblock_free_all(). Before that resetting, no buddy allocation request is issued.
Here, remove the meaningless and complicated calculation of (zone->present_pages - memmap_pages) and directly set zone->managed_pages to zone->present_pages for now. It will be adjusted in mem_init(). Also remove the assignment of nr_all_pages and nr_kernel_pages in free_area_init_core(). Instead, call the newly added calc_nr_kernel_pages() to count up all free but not reserved memory in memblock and assign it to nr_all_pages and nr_kernel_pages. The counting excludes memmap_pages and other kernel-used data, which is more accurate than the old way, is simpler, and also covers the arch_reserved_kernel_pages() case required by ppc.
Also clean up the outdated code comment above free_area_init_core(); the function is now easy to understand and needs no extra explanation.
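The reset that the description above relies on can be sketched roughly as follows (a simplified sketch, not the verbatim upstream code): when memblock hands memory over to the buddy allocator, every zone's managed_pages is zeroed and then re-accumulated for each page range actually released, so the early value set in free_area_init_core() is only a placeholder.

void __init memblock_free_all(void)		/* simplified sketch */
{
	reset_all_zones_managed_pages();	/* zone->managed_pages = 0 */
	free_low_memory_core_early();		/* feeds __free_pages_core() */
}

void __free_pages_core(struct page *page, unsigned int order)	/* excerpt */
{
	unsigned int nr_pages = 1 << order;

	/* every page released to the buddy allocator is re-accounted */
	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
	/* ... */
}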
[bhe@redhat.com: initialize zone->managed_pages as zone->present_pages for now] Link: https://lkml.kernel.org/r/ZgU0bsJ2FEjykvju@MiWiFi-R3L-srv Link: https://lkml.kernel.org/r/20240325145646.1044760-5-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mm_init.c | 46 +++++----------------------------------------- 1 file changed, 5 insertions(+), 41 deletions(-) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index ae9149956497..b211b30231cb 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1566,15 +1566,6 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) } #endif -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - * - * NOTE: pgdat should get zeroed by caller. - * NOTE: this function is only called during early init. - */ static void __init free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; @@ -1585,41 +1576,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, freesize, memmap_pages; - - size = zone->spanned_pages; - freesize = zone->present_pages; - - /* - * Adjust freesize so that it accounts for how much memory - * is used by this zone for memmap. This affects the watermark - * and per-cpu initialisations - */ - memmap_pages = calc_memmap_size(size, freesize); - if (!is_highmem_idx(j)) { - if (freesize >= memmap_pages) { - freesize -= memmap_pages; - if (memmap_pages) - pr_debug(" %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); - } else - pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n", - zone_names[j], memmap_pages, freesize); - } - - if (!is_highmem_idx(j)) - nr_kernel_pages += freesize; - /* Charge for highmem memmap if there are enough kernel pages */ - else if (nr_kernel_pages > memmap_pages * 2) - nr_kernel_pages -= memmap_pages; - nr_all_pages += freesize; + unsigned long size = zone->spanned_pages; /* - * Set an approximate value for lowmem here, it will be adjusted - * when the bootmem allocator frees pages into the buddy system. - * And all highmem pages will be managed by the buddy system. + * Initialize zone->managed_pages as 0 , it will be reset + * when memblock allocator frees pages into buddy system. */ - zone_init_internals(zone, j, nid, freesize); + zone_init_internals(zone, j, nid, zone->present_pages); if (!size) continue; @@ -1916,6 +1879,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) check_for_memory(pgdat); } + calc_nr_kernel_pages(); memmap_init(); /* disable hash distribution for systems with a single node */ -- cgit v1.2.3 From 90e796e22e35af0d19874c36fa4a22709aec1659 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 25 Mar 2024 22:56:45 +0800 Subject: mm/mm_init.c: remove unneeded calc_memmap_size() Nobody calls calc_memmap_size() now. 
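For scale, the estimate dropped below amounts to roughly sizeof(struct page)/PAGE_SIZE of a zone. A quick back-of-the-envelope sketch, assuming a 64-byte struct page and 4 KiB pages (both configuration dependent):

#include <stdio.h>

int main(void)
{
	unsigned long spanned_pages = 262144;	/* a 1 GiB zone */
	unsigned long struct_page_size = 64;	/* typical, config dependent */
	unsigned long page_size = 4096;

	/* same arithmetic as the calc_memmap_size() removed below */
	unsigned long memmap_bytes = spanned_pages * struct_page_size;
	unsigned long memmap_pages = (memmap_bytes + page_size - 1) / page_size;

	printf("%lu memmap pages for %lu spanned pages (%.2f%%)\n",
	       memmap_pages, spanned_pages,
	       100.0 * memmap_pages / spanned_pages);
	return 0;
}

For the 1 GiB example this is 4096 memmap pages, about 1.6% of the zone.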
Link: https://lkml.kernel.org/r/20240325145646.1044760-6-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mm_init.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index b211b30231cb..8c261572ca6e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1332,26 +1332,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } -static unsigned long __init calc_memmap_size(unsigned long spanned_pages, - unsigned long present_pages) -{ - unsigned long pages = spanned_pages; - - /* - * Provide a more accurate estimation if there are holes within - * the zone and SPARSEMEM is in use. If there are holes within the - * zone, each populated memory region may cost us one or two extra - * memmap pages due to alignment because memmap pages for each - * populated regions may not be naturally aligned on page boundary. - * So the (present_pages >> 4) heuristic is a tradeoff for that. - */ - if (spanned_pages > present_pages + (present_pages >> 4) && - IS_ENABLED(CONFIG_SPARSEMEM)) - pages = present_pages; - - return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void pgdat_init_split_queue(struct pglist_data *pgdat) { -- cgit v1.2.3 From 0b52663f7547520fbf54e2c07f61f8bd1b5cb6eb Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 25 Mar 2024 22:56:46 +0800 Subject: mm/mm_init.c: remove arch_reserved_kernel_pages() Since the current calculation of calc_nr_kernel_pages() has taken into consideration of kernel reserved memory, no need to have arch_reserved_kernel_pages() any more. Link: https://lkml.kernel.org/r/20240325145646.1044760-7-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/mmu.h | 4 ---- arch/powerpc/kernel/fadump.c | 5 ----- include/linux/mm.h | 3 --- mm/mm_init.c | 12 ------------ 4 files changed, 24 deletions(-) (limited to 'mm/mm_init.c') diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 3b72c7ed24cf..aa5c0fd5edb1 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -406,9 +406,5 @@ extern void *abatron_pteptrs[2]; #include #endif -#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) -#define __HAVE_ARCH_RESERVED_KERNEL_PAGES -#endif - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MMU_H_ */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index d14eda1e8589..ae8c7619e597 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1735,8 +1735,3 @@ static void __init fadump_reserve_crash_area(u64 base) memblock_reserve(mstart, msize); } } - -unsigned long __init arch_reserved_kernel_pages(void) -{ - return memblock_reserved_size() / PAGE_SIZE; -} diff --git a/include/linux/mm.h b/include/linux/mm.h index 22ae6ab621da..dd47ba7a1c04 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3261,9 +3261,6 @@ static inline void show_mem(void) extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES -extern unsigned long arch_reserved_kernel_pages(void); -#endif extern __printf(3, 4) void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); diff --git a/mm/mm_init.c 
b/mm/mm_init.c index 8c261572ca6e..0d32bcc301e2 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2374,17 +2374,6 @@ void __init page_alloc_init_late(void) page_alloc_sysctl_init(); } -#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES -/* - * Returns the number of pages that arch has reserved but - * is not known to alloc_large_system_hash(). - */ -static unsigned long __init arch_reserved_kernel_pages(void) -{ - return 0; -} -#endif - /* * Adaptive scale is meant to reduce sizes of hash tables on large memory * machines. As memory size is increased the scale is also increased but at @@ -2427,7 +2416,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; - numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SIZE < SZ_1M) -- cgit v1.2.3
From c091dd963f99a4ae1a5c31bff68e5c2ebd786b07 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 26 Mar 2024 14:11:28 +0800 Subject: mm/init: remove the unnecessary special treatment for memory-less node
Because a memory-less node's ->node_present_pages and its zone's ->present_pages are all 0, the judgement before calling node_set_state() to set N_MEMORY, N_HIGH_MEMORY, N_NORMAL_MEMORY for a node is enough to skip memory-less nodes. The 'continue;' statement inside the for_each_node() loop of free_area_init() is gilding the lily.
Here, remove the special handling so that memory-less nodes share the same code flow as normal nodes. Also rephrase the code comments above the 'continue' statement and move them above the line 'if (pgdat->node_present_pages)'.
[bhe@redhat.com: redo code comments, per Mike] Link: https://lkml.kernel.org/r/ZhYJAVQRYJSTKZng@MiWiFi-R3L-srv Link: https://lkml.kernel.org/r/20240326061134.1055295-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: Mel Gorman Cc: "Mike Rapoport (IBM)" Signed-off-by: Andrew Morton
--- mm/mm_init.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'mm/mm_init.c')
diff --git a/mm/mm_init.c b/mm/mm_init.c index 0d32bcc301e2..0fe466abde43 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1835,28 +1835,23 @@ void __init free_area_init(unsigned long *max_zone_pfn) panic("Cannot allocate %zuB for node %d.\n", sizeof(*pgdat), nid); arch_refresh_nodedata(nid, pgdat); - free_area_init_node(nid); - - /* - * We do not want to confuse userspace by sysfs - * files/directories for node without any memory - * attached to it, so this node is not marked as - * N_MEMORY and not marked online so that no sysfs - * hierarchy will be created via register_one_node for - * it. The pgdat will get fully initialized by - * hotadd_init_pgdat() when memory is hotplugged into - * this node. - */ - continue; } pgdat = NODE_DATA(nid); free_area_init_node(nid); - /* Any memory on that node */ - if (pgdat->node_present_pages) + /* + * No sysfs hierarchy will be created via register_one_node() + * for memory-less node because here it's not marked as N_MEMORY + * and won't be set online later. The benefit is userspace + * program won't be confused by sysfs files/directories of + * memory-less node. The pgdat will get fully initialized by + * hotadd_init_pgdat() when memory is hotplugged into this node.
+ */ + if (pgdat->node_present_pages) { node_set_state(nid, N_MEMORY); - check_for_memory(pgdat); + check_for_memory(pgdat); + } } calc_nr_kernel_pages(); -- cgit v1.2.3 From b6dd94596f7fafd162da236b438e31b12d352187 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 26 Mar 2024 14:11:29 +0800 Subject: mm: make __absent_pages_in_range() as static It's only called in mm/mm_init.c now. Link: https://lkml.kernel.org/r/20240326061134.1055295-4-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/mm_init.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'mm/mm_init.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c0e441664d4..00a46a5d5c9d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3229,8 +3229,6 @@ static inline unsigned long get_num_physpages(void) */ void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); -unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, - unsigned long end_pfn); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, diff --git a/mm/mm_init.c b/mm/mm_init.c index 0fe466abde43..19c78b54fc8a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1144,7 +1144,7 @@ static void __init adjust_zone_range_for_zone_movable(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -unsigned long __init __absent_pages_in_range(int nid, +static unsigned long __init __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { -- cgit v1.2.3 From f55d3471b78a15e8ec1db88fe12456aad70600a4 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 26 Mar 2024 14:11:31 +0800 Subject: mm/mm_init.c: remove the outdated code comment above deferred_grow_zone() The noinline attribute has been taken off in commit 9420f89db2dd ("mm: move most of core MM initialization to mm/mm_init.c"). So remove the unneeded code comment above deferred_grow_zone(). And also remove the unneeded bracket in deferred_init_pages(). Link: https://lkml.kernel.org/r/20240326061134.1055295-6-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/mm_init.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm/mm_init.c') diff --git a/mm/mm_init.c b/mm/mm_init.c index 19c78b54fc8a..d01912b8a597 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2014,7 +2014,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, __init_single_page(page, pfn, zid, nid); nr_pages++; } - return (nr_pages); + return nr_pages; } /* @@ -2216,10 +2216,6 @@ zone_empty: * Return true when zone was grown, otherwise return false. We return true even * when we grow less than requested, to let the caller decide if there are * enough pages to satisfy the allocation. - * - * Note: We use noinline because this function is needed only during boot, and - * it is called from a __ref function _deferred_grow_zone. This way we are - * making sure that it is not inlined into permanent text section. 
*/ bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { -- cgit v1.2.3 From ba42b524a0408b5f92bd41edaee1ea84309ab9ae Mon Sep 17 00:00:00 2001 From: York Jasper Niebuhr Date: Fri, 29 Mar 2024 15:56:05 +0100 Subject: mm: init_mlocked_on_free_v3 Implements the "init_mlocked_on_free" boot option. When this boot option is enabled, any mlock'ed pages are zeroed on free. If the pages are munlock'ed beforehand, no initialization takes place. This boot option is meant to combat the performance hit of "init_on_free" as reported in commit 6471384af2a6 ("mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options"). With "init_mlocked_on_free=1" only relevant data is freed while everything else is left untouched by the kernel. Correspondingly, this patch introduces no performance hit for unmapping non-mlock'ed memory. The unmapping overhead for purely mlocked memory was measured to be approximately 13%. Realistically, most systems mlock only a fraction of the total memory so the real-world system overhead should be close to zero. Optimally, userspace programs clear any key material or other confidential memory before exit and munlock the according memory regions. If a program crashes, userspace key managers fail to do this job. Accordingly, no munlock operations are performed so the data is caught and zeroed by the kernel. Should the program not crash, all memory will ideally be munlocked so no overhead is caused. CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON can be set to enable "init_mlocked_on_free" by default. Link: https://lkml.kernel.org/r/20240329145605.149917-1-yjnworkstation@gmail.com Signed-off-by: York Jasper Niebuhr Cc: Matthew Wilcox (Oracle) Cc: York Jasper Niebuhr Cc: Kees Cook Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++ include/linux/mm.h | 9 +++++- mm/internal.h | 1 + mm/memory.c | 6 ++++ mm/mm_init.c | 43 +++++++++++++++++++++---- mm/page_alloc.c | 2 +- security/Kconfig.hardening | 15 +++++++++ 7 files changed, 73 insertions(+), 9 deletions(-) (limited to 'mm/mm_init.c') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 902ecd92a29f..3ff97de349da 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2148,6 +2148,12 @@ Format: 0 | 1 Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. + init_mlocked_on_free= [MM] Fill freed userspace memory with zeroes if + it was mlock'ed and not explicitly munlock'ed + afterwards. + Format: 0 | 1 + Default set by CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON + init_pkru= [X86] Specify the default memory protection keys rights register contents for all processes. 0x55555554 by default (disallow access to all but pkey 0). 
Can diff --git a/include/linux/mm.h b/include/linux/mm.h index 2d5e492ef57f..4f4e460d7853 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3762,7 +3762,14 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, - &init_on_free); + &init_on_free); +} + +DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); +static inline bool want_init_mlocked_on_free(void) +{ + return static_branch_maybe(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, + &init_mlocked_on_free); } extern bool _debug_pagealloc_enabled_early; diff --git a/mm/internal.h b/mm/internal.h index 6614ba4ca9de..cf7799e29391 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -506,6 +506,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); +extern void kernel_init_pages(struct page *page, int numpages); /* * This will have no effect, other than possibly generating a warning, if the diff --git a/mm/memory.c b/mm/memory.c index 0b92336bcebd..80944acb5b4e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1506,6 +1506,12 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); } + + if (want_init_mlocked_on_free() && folio_test_mlocked(folio) && + !delay_rmap && folio_test_anon(folio)) { + kernel_init_pages(page, folio_nr_pages(folio)); + } + if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; diff --git a/mm/mm_init.c b/mm/mm_init.c index d01912b8a597..2c8f3af4430f 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2522,6 +2522,9 @@ EXPORT_SYMBOL(init_on_alloc); DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); EXPORT_SYMBOL(init_on_free); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); +EXPORT_SYMBOL(init_mlocked_on_free); + static bool _init_on_alloc_enabled_early __read_mostly = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) @@ -2539,6 +2542,14 @@ static int __init early_init_on_free(char *buf) } early_param("init_on_free", early_init_on_free); +static bool _init_mlocked_on_free_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON); +static int __init early_init_mlocked_on_free(char *buf) +{ + return kstrtobool(buf, &_init_mlocked_on_free_enabled_early); +} +early_param("init_mlocked_on_free", early_init_mlocked_on_free); + DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); /* @@ -2566,12 +2577,21 @@ static void __init mem_debugging_and_hardening_init(void) } #endif - if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) && + if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early || + _init_mlocked_on_free_enabled_early) && page_poisoning_requested) { pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " - "will take precedence over init_on_alloc and init_on_free\n"); + "will take precedence over init_on_alloc, init_on_free " + "and init_mlocked_on_free\n"); _init_on_alloc_enabled_early = false; _init_on_free_enabled_early = false; + _init_mlocked_on_free_enabled_early = false; + } + + if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) { + pr_info("mem 
auto-init: init_on_free is on, " + "will take precedence over init_mlocked_on_free\n"); + _init_mlocked_on_free_enabled_early = false; } if (_init_on_alloc_enabled_early) { @@ -2588,9 +2608,17 @@ static void __init mem_debugging_and_hardening_init(void) static_branch_disable(&init_on_free); } - if (IS_ENABLED(CONFIG_KMSAN) && - (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) - pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); + if (_init_mlocked_on_free_enabled_early) { + want_check_pages = true; + static_branch_enable(&init_mlocked_on_free); + } else { + static_branch_disable(&init_mlocked_on_free); + } + + if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early || + _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and " + "init_mlocked_on_free are disabled when running KMSAN\n"); #ifdef CONFIG_DEBUG_PAGEALLOC if (debug_pagealloc_enabled()) { @@ -2629,9 +2657,10 @@ static void __init report_meminit(void) else stack = "off"; - pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n", + pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n", stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off", - want_init_on_free() ? "on" : "off"); + want_init_on_free() ? "on" : "off", + want_init_mlocked_on_free() ? "on" : "off"); if (want_init_on_free()) pr_info("mem auto-init: clearing system memory may take some time...\n"); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 47ab0297838a..e030ccf9d5bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1032,7 +1032,7 @@ static inline bool should_skip_kasan_poison(struct page *page) return page_kasan_tag(page) == KASAN_TAG_KERNEL; } -static void kernel_init_pages(struct page *page, int numpages) +void kernel_init_pages(struct page *page, int numpages) { int i; diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index 2cff851ebfd7..effbf5982be1 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -255,6 +255,21 @@ config INIT_ON_FREE_DEFAULT_ON touching "cold" memory areas. Most cases see 3-5% impact. Some synthetic workloads have measured as high as 8%. +config INIT_MLOCKED_ON_FREE_DEFAULT_ON + bool "Enable mlocked memory zeroing on free" + depends on !KMSAN + help + This config has the effect of setting "init_mlocked_on_free=1" + on the kernel command line. If it is enabled, all mlocked process + memory is zeroed when freed. This restriction to mlocked memory + improves performance over "init_on_free" but can still be used to + protect confidential data like key material from content exposures + to other processes, as well as live forensics and cold boot attacks. + Any non-mlocked memory is not cleared before it is reassigned. This + configuration can be overwritten by setting "init_mlocked_on_free=0" + on the command line. The "init_on_free" boot option takes + precedence over "init_mlocked_on_free". + config CC_HAS_ZERO_CALL_USED_REGS def_bool $(cc-option,-fzero-call-used-regs=used-gpr) # https://github.com/ClangBuiltLinux/linux/issues/1766 -- cgit v1.2.3
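As a usage illustration of the behaviour described in the commit message, a userspace program that pins key material with mlock() can rely on init_mlocked_on_free=1 to have the kernel zero those pages if the process exits or crashes before scrubbing them itself. The following is a hedged sketch, not part of the patch:

/* build: cc -O2 -o keyholder keyholder.c */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = (size_t)sysconf(_SC_PAGESIZE);
	unsigned char *key = aligned_alloc(len, len);

	if (!key || mlock(key, len)) {	/* pinned: no swap, and zero-on-free with the new option */
		perror("mlock");
		return 1;
	}
	memset(key, 0xA5, len);		/* stand-in for real key material */

	/*
	 * ... use the key; if the process dies here without munlock(),
	 * the still-mlocked anonymous pages are zeroed by the kernel
	 * when the mapping is torn down ...
	 */

	explicit_bzero(key, len);	/* normal path: scrub it ourselves */
	munlock(key, len);		/* munlock'ed memory is not re-zeroed */
	free(key);
	return 0;
}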