From ff6c3d81f2e86b63a3a530683f89ef393882782a Mon Sep 17 00:00:00 2001
From: Liam Ni
Date: Thu, 26 Oct 2023 10:03:29 +0800
Subject: NUMA: optimize detection of memory with no node id assigned by firmware

The sanity check that makes sure the nodes cover all memory loops over
numa_meminfo to count the pages that have a node id assigned by the
firmware, then loops again over memblock.memory to find the total amount
of memory, and in the end checks that the difference between the total
memory and the memory covered by nodes is less than some threshold.
Worse, the loop over numa_meminfo calls __absent_pages_in_range(), which
also partially traverses memblock.memory.

It's much simpler and more efficient to have a single traversal of
memblock.memory that verifies that the amount of memory not covered by
nodes is less than a threshold.

Introduce memblock_validate_numa_coverage() that does exactly that and
use it instead of numa_meminfo_cover_memory().

Link: https://lkml.kernel.org/r/20231026020329.327329-1-zhiguangni01@gmail.com
Signed-off-by: Liam Ni
Reviewed-by: Mike Rapoport (IBM)
Cc: Andy Lutomirski
Cc: Bibo Mao
Cc: Binbin Zhou
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: Feiyang Chen
Cc: "H. Peter Anvin"
Cc: Huacai Chen
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: WANG Xuerui
Signed-off-by: Andrew Morton
---
 arch/x86/mm/numa.c | 34 ++--------------------------------
 1 file changed, 2 insertions(+), 32 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index b29ceb19e46e..adc497b93f03 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -449,37 +449,6 @@ int __node_distance(int from, int to)
 }
 EXPORT_SYMBOL(__node_distance);
 
-/*
- * Sanity check to catch more bad NUMA configurations (they are amazingly
- * common). Make sure the nodes cover all memory.
- */
-static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
-{
-	u64 numaram, e820ram;
-	int i;
-
-	numaram = 0;
-	for (i = 0; i < mi->nr_blks; i++) {
-		u64 s = mi->blk[i].start >> PAGE_SHIFT;
-		u64 e = mi->blk[i].end >> PAGE_SHIFT;
-		numaram += e - s;
-		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
-		if ((s64)numaram < 0)
-			numaram = 0;
-	}
-
-	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
-
-	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
-	if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
-		printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
-		       (numaram << PAGE_SHIFT) >> 20,
-		       (e820ram << PAGE_SHIFT) >> 20);
-		return false;
-	}
-	return true;
-}
-
 /*
  * Mark all currently memblock-reserved physical memory (which covers the
  * kernel's own memory ranges) as hot-unswappable.
@@ -585,7 +554,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 			return -EINVAL;
 		}
 	}
-	if (!numa_meminfo_cover_memory(mi))
+
+	if (!memblock_validate_numa_coverage(SZ_1M))
 		return -EINVAL;
 
 	/* Finally register nodes. */
--
cgit v1.2.3
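
The single-traversal check this commit introduces is easy to model outside
the kernel. The following standalone C program is a minimal sketch of the
idea, not the kernel implementation: struct region, NUMA_NO_NODE and
validate_numa_coverage() are mock-ups standing in for memblock's region
list and memblock_validate_numa_coverage().

/*
 * Illustrative sketch only: walk one list of memory regions, sum the
 * bytes that have no node id assigned, and fail only if that total
 * exceeds a caller-supplied threshold.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUMA_NO_NODE (-1)

struct region {
	uint64_t start;	/* physical address, bytes */
	uint64_t size;	/* bytes */
	int nid;	/* node id, or NUMA_NO_NODE if unassigned */
};

static bool validate_numa_coverage(const struct region *regions, int nr,
				   uint64_t threshold_bytes)
{
	uint64_t uncovered = 0;

	for (int i = 0; i < nr; i++)
		if (regions[i].nid == NUMA_NO_NODE)
			uncovered += regions[i].size;

	if (uncovered > threshold_bytes) {
		fprintf(stderr, "%llu MiB of RAM not covered by NUMA nodes\n",
			(unsigned long long)(uncovered >> 20));
		return false;
	}
	return true;
}

int main(void)
{
	struct region mem[] = {
		{ 0x00000000, 1ULL << 30, 0 },		/* 1 GiB on node 0 */
		{ 0x40000000, 1ULL << 30, 1 },		/* 1 GiB on node 1 */
		{ 0x80000000, 4096, NUMA_NO_NODE },	/* one stray page */
	};

	/* Allow 1 MiB of slack, mirroring the SZ_1M threshold above. */
	printf("coverage ok: %d\n", validate_numa_coverage(mem, 3, 1 << 20));
	return 0;
}

Compiled with any C99 compiler, this prints "coverage ok: 1": the single
stray page falls well inside the 1 MiB slack, just as in the patched
kernel check.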
From 71ce1ab54a505736786d9c5921e6c2718c7ec535 Mon Sep 17 00:00:00 2001
From: Kinsey Ho
Date: Wed, 27 Dec 2023 14:12:01 +0000
Subject: mm/mglru: add CONFIG_ARCH_HAS_HW_PTE_YOUNG

Patch series "mm/mglru: Kconfig cleanup", v4.

This series is the result of the following discussion:
https://lore.kernel.org/47066176-bd93-55dd-c2fa-002299d9e034@linux.ibm.com/

It mainly avoids building the code that walks page tables on CPUs that
don't use it, i.e., those that don't support the hardware accessed bit.

Specifically, it introduces a new Kconfig option to guard some of the
functions added by commit bd74fdaea146 ("mm: multi-gen LRU: support page
table walks") on CPUs like POWER9, on which the series was tested.

This patch (of 5):

Some architectures are able to set the accessed bit in PTEs when PTEs
are used as part of linear address translations.  Add
CONFIG_ARCH_HAS_HW_PTE_YOUNG so that such architectures can override
arch_has_hw_pte_young().

Link: https://lkml.kernel.org/r/20231227141205.2200125-1-kinseyho@google.com
Link: https://lkml.kernel.org/r/20231227141205.2200125-2-kinseyho@google.com
Signed-off-by: Kinsey Ho
Co-developed-by: Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V
Tested-by: Donet Tom
Acked-by: Yu Zhao
Cc: kernel test robot
Signed-off-by: Andrew Morton
---
 arch/Kconfig                   | 8 ++++++++
 arch/arm64/Kconfig             | 1 +
 arch/x86/Kconfig               | 1 +
 arch/x86/include/asm/pgtable.h | 6 ------
 include/linux/pgtable.h        | 2 +-
 5 files changed, 11 insertions(+), 7 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index f4b210ab0612..8c8901f80586 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1470,6 +1470,14 @@ config DYNAMIC_SIGFRAME
 config HAVE_ARCH_NODE_DEV_GROUP
 	bool
 
+config ARCH_HAS_HW_PTE_YOUNG
+	bool
+	help
+	  Architectures that select this option are capable of setting the
+	  accessed bit in PTE entries when using them as part of linear
+	  address translations. Architectures that require a runtime check
+	  should select this option and override arch_has_hw_pte_young().
+
 config ARCH_HAS_NONLEAF_PMD_YOUNG
 	bool
 	help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b071a00425d..12d611f3da5d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -36,6 +36,7 @@ config ARM64
 	select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	select ARCH_HAS_PTE_DEVMAP
 	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_HW_PTE_YOUNG
 	select ARCH_HAS_SETUP_DMA_OPS
 	select ARCH_HAS_SET_DIRECT_MAP
 	select ARCH_HAS_SET_MEMORY
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1566748f16c4..04941a1ffc0a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,6 +88,7 @@ config X86
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_HW_PTE_YOUNG
 	select ARCH_HAS_NONLEAF_PMD_YOUNG	if PGTABLE_LEVELS > 2
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_COPY_MC			if X86_64
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 57bab91bbf50..08b5cb22d9a6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1679,12 +1679,6 @@ static inline bool arch_has_pfn_modify_check(void)
 	return boot_cpu_has_bug(X86_BUG_L1TF);
 }
 
-#define arch_has_hw_pte_young arch_has_hw_pte_young
-static inline bool arch_has_hw_pte_young(void)
-{
-	return true;
-}
-
 #define arch_check_zapped_pte arch_check_zapped_pte
 void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index af7639c3b0a3..9ecc20fa6269 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -375,7 +375,7 @@ static inline bool arch_has_hw_nonleaf_pmd_young(void)
  */
 static inline bool arch_has_hw_pte_young(void)
 {
-	return false;
+	return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG);
 }
 #endif
 
--
cgit v1.2.3
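
The pattern this patch lands on — a generic default keyed off the new
Kconfig symbol, overridable by an architecture that needs a runtime
check — can be shown in a self-contained way. The C sketch below is a
mock-up under stated assumptions, not kernel code: ARCH_X86 and
CONFIG_ARCH_HAS_HW_PTE_YOUNG are plain -D defines here, and the #ifdef
stands in for the kernel's IS_ENABLED() helper.

/*
 * "arch header": compile with -DARCH_X86 to mimic an architecture that
 * overrides the generic default with its own runtime check.
 */
#include <stdbool.h>
#include <stdio.h>

#ifdef ARCH_X86
#define arch_has_hw_pte_young arch_has_hw_pte_young
static inline bool arch_has_hw_pte_young(void)
{
	return true;	/* hardware always sets the accessed bit */
}
#endif

/* "generic header": used only when no arch override was defined. */
#ifndef arch_has_hw_pte_young
static inline bool arch_has_hw_pte_young(void)
{
	/* stands in for IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG) */
#ifdef CONFIG_ARCH_HAS_HW_PTE_YOUNG
	return true;
#else
	return false;
#endif
}
#endif

int main(void)
{
	printf("hw pte young: %d\n", arch_has_hw_pte_young());
	return 0;
}

Building with -DARCH_X86 or -DCONFIG_ARCH_HAS_HW_PTE_YOUNG prints 1;
with neither, the generic default reports 0, mirroring how the patched
fallback in include/linux/pgtable.h behaves for architectures that
neither select the option nor override the function.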
From 533c67e6358406727145efae32882c4dc355d6c5 Mon Sep 17 00:00:00 2001
From: Kinsey Ho
Date: Wed, 27 Dec 2023 14:12:04 +0000
Subject: mm/mglru: add dummy pmd_dirty()

Add dummy pmd_dirty() for architectures that don't provide it.

This is similar to commit 6617da8fb565 ("mm: add dummy pmd_young() for
architectures not having it").

Link: https://lkml.kernel.org/r/20231227141205.2200125-5-kinseyho@google.com
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202312210606.1Etqz3M4-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202312210042.xQEiqlEh-lkp@intel.com/
Signed-off-by: Kinsey Ho
Suggested-by: Yu Zhao
Cc: Aneesh Kumar K.V
Cc: Donet Tom
Signed-off-by: Andrew Morton
---
 arch/loongarch/include/asm/pgtable.h | 1 +
 arch/mips/include/asm/pgtable.h      | 1 +
 arch/riscv/include/asm/pgtable.h     | 1 +
 arch/s390/include/asm/pgtable.h      | 1 +
 arch/sparc/include/asm/pgtable_64.h  | 1 +
 arch/x86/include/asm/pgtable.h       | 1 +
 include/linux/pgtable.h              | 7 +++++++
 7 files changed, 13 insertions(+)
(limited to 'arch/x86')

diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 29d9b12298bc..8b5df1bbf9e9 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -523,6 +523,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
 	return pmd;
 }
 
+#define pmd_dirty pmd_dirty
 static inline int pmd_dirty(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & (_PAGE_DIRTY | _PAGE_MODIFIED));
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 430b208c0130..e27a4c83c548 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -655,6 +655,7 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
 	return pmd;
 }
 
+#define pmd_dirty pmd_dirty
 static inline int pmd_dirty(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_MODIFIED);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index ab00235b018f..7b4287f36054 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -673,6 +673,7 @@ static inline int pmd_write(pmd_t pmd)
 	return pte_write(pmd_pte(pmd));
 }
 
+#define pmd_dirty pmd_dirty
 static inline int pmd_dirty(pmd_t pmd)
 {
 	return pte_dirty(pmd_pte(pmd));
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 601e87fa8a9a..1299b56e43f6 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -770,6 +770,7 @@ static inline int pud_write(pud_t pud)
 	return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0;
 }
 
+#define pmd_dirty pmd_dirty
 static inline int pmd_dirty(pmd_t pmd)
 {
 	return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 5e41033bf4ca..a8c871b7d786 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -706,6 +706,7 @@ static inline unsigned long pmd_write(pmd_t pmd)
 #define pud_write(pud)	pte_write(__pte(pud_val(pud)))
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define pmd_dirty pmd_dirty
 static inline unsigned long pmd_dirty(pmd_t pmd)
 {
 	pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 08b5cb22d9a6..9d077bca6a10 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -141,6 +141,7 @@ static inline int pte_young(pte_t pte)
 	return pte_flags(pte) & _PAGE_ACCESSED;
 }
 
+#define pmd_dirty pmd_dirty
 static inline bool pmd_dirty(pmd_t pmd)
 {
 	return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 9ecc20fa6269..466cf477551a 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -184,6 +184,13 @@ static inline int pmd_young(pmd_t pmd)
 }
 #endif
 
+#ifndef pmd_dirty
+static inline int pmd_dirty(pmd_t pmd)
+{
+	return 0;
+}
+#endif
+
 /*
  * A facility to provide lazy MMU batching. This allows PTE updates and
  * page invalidations to be delayed until a call to leave lazy MMU mode
--
cgit v1.2.3
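
The "#define pmd_dirty pmd_dirty" lines this patch adds exist so the
generic #ifndef fallback compiles out wherever an architecture provides
the real function. A minimal standalone demo of that convention follows;
pmd_t and MOCK_PAGE_DIRTY are mock types for illustration, not the kernel
definitions.

#include <stdio.h>

typedef struct { unsigned long val; } pmd_t;
#define MOCK_PAGE_DIRTY 0x1UL

/* "arch header": comment this block out to fall back to the dummy. */
#define pmd_dirty pmd_dirty
static inline int pmd_dirty(pmd_t pmd)
{
	return !!(pmd.val & MOCK_PAGE_DIRTY);
}

/* "generic header" (include/linux/pgtable.h in the patch above) */
#ifndef pmd_dirty
static inline int pmd_dirty(pmd_t pmd)
{
	return 0;	/* architectures without a PMD dirty bit */
}
#endif

int main(void)
{
	pmd_t pmd = { .val = MOCK_PAGE_DIRTY };
	printf("pmd dirty: %d\n", pmd_dirty(pmd));
	return 0;
}

With the arch block present this prints "pmd dirty: 1"; removing it
leaves the dummy, which always reports 0 — the behavior architectures
without their own pmd_dirty() get from this patch.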