From 1f7a44f63e6c782c9c2aa9f18f40c23914e6b46a Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 13 Oct 2020 16:47:33 -0700 Subject: compiler-clang: add build check for clang 10.0.1 Patch series "set clang minimum version to 10.0.1", v3. Adds a compile time #error to compiler-clang.h setting the effective minimum supported version to clang 10.0.1. A separate patch has already been picked up into the Documentation/ tree also confirming the version. Next are a series of reverts. One for 32b arm is a partial revert. Then Marco suggested fixes to KASAN docs. Finally, improve the warning for GCC too as per Kees. This patch (of 7): During Plumbers 2020, we voted to just support the latest release of Clang for now. Add a compile time check for this. We plan to remove workarounds for older versions now, which will break in subtle and not so subtle ways. Suggested-by: Sedat Dilek Suggested-by: Nathan Chancellor Suggested-by: Kees Cook Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Tested-by: Sedat Dilek Reviewed-by: Kees Cook Reviewed-by: Miguel Ojeda Reviewed-by: Sedat Dilek Acked-by: Marco Elver Acked-by: Nathan Chancellor Acked-by: Sedat Dilek Cc: Andrey Konovalov Cc: Fangrui Song Cc: Masahiro Yamada Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Will Deacon Cc: Vincenzo Frascino Link: https://lkml.kernel.org/r/20200902225911.209899-1-ndesaulniers@google.com Link: https://lkml.kernel.org/r/20200902225911.209899-2-ndesaulniers@google.com Link: https://github.com/ClangBuiltLinux/linux/issues/9 Link: https://github.com/ClangBuiltLinux/linux/issues/941 Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index cee0c728d39a..230604e7f057 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -3,6 +3,14 @@ #error "Please don't include directly, include instead." #endif +#define CLANG_VERSION (__clang_major__ * 10000 \ + + __clang_minor__ * 100 \ + + __clang_patchlevel__) + +#if CLANG_VERSION < 100001 +# error Sorry, your version of Clang is too old - please use 10.0.1 or newer. +#endif + /* Compiler specific definitions for Clang compiler */ /* same as gcc, this was present in clang-2.6 so we can assume it works -- cgit v1.2.3 From c8db3b0a7ba7614f761f309d6aa7499127b18a0b Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 13 Oct 2020 16:47:55 -0700 Subject: compiler-gcc: improve version error As Kees suggests, doing so provides developers with two useful pieces of information: - The kernel build was attempting to use GCC. (Maybe they accidentally poked the wrong configs in a CI.) - They need 4.9 or better. ("Upgrade to what version?" doesn't need to be dug out of documentation, headers, etc.) Suggested-by: Kees Cook Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Tested-by: Sedat Dilek Reviewed-by: Kees Cook Reviewed-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Reviewed-by: Sedat Dilek Cc: Andrey Konovalov Cc: Fangrui Song Cc: Marco Elver Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Masahiro Yamada Cc: Vincenzo Frascino Cc: Will Deacon Link: https://lkml.kernel.org/r/20200902225911.209899-8-ndesaulniers@google.com Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7a3769040d7d..d1e3c6896b71 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -12,7 +12,7 @@ /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */ #if GCC_VERSION < 40900 -# error Sorry, your compiler is too old - please upgrade it. +# error Sorry, your version of GCC is too old - please use 4.9 or newer. #endif /* Optimization barrier */ -- cgit v1.2.3 From a25c13b3aa1bdbf100e8770902c30908728f8410 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 13 Oct 2020 16:47:58 -0700 Subject: compiler.h: avoid escaped section names The stringification operator, `#`, in the preprocessor escapes strings. For example, `# "foo"` becomes `"\"foo\""`. GCC and Clang differ in how they treat section names that contain \". The portable solution is to not use a string literal with the preprocessor stringification operator. In this case, since __section unconditionally uses the stringification operator, we actually want the more verbose __attribute__((__section__())). Fixes: commit e04462fb82f8 ("Compiler Attributes: remove uses of __attribute__ from compiler.h") Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Cc: Miguel Ojeda Cc: Luc Van Oostenryck Cc: Nathan Chancellor Cc: Arvind Sankar Link: https://bugs.llvm.org/show_bug.cgi?id=42950 Link: https://lkml.kernel.org/r/20200929194318.548707-1-ndesaulniers@google.com Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 92ef163a7479..ac45f6d40d39 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -155,7 +155,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, extern typeof(sym) sym; \ static const unsigned long __kentry_##sym \ __used \ - __section("___kentry" "+" #sym ) \ + __attribute__((__section__("___kentry+" #sym))) \ = (unsigned long)&sym; #endif -- cgit v1.2.3 From 4d6fb34acb5d0bfc579ccd29df9cc6f653e51ab2 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 13 Oct 2020 16:48:01 -0700 Subject: export.h: fix section name for CONFIG_TRIM_UNUSED_KSYMS for Clang When enabling CONFIG_TRIM_UNUSED_KSYMS, the linker will warn about the orphan sections: (".discard.ksym") is being placed in '".discard.ksym"' repeatedly when linking vmlinux. This is because the stringification operator, `#`, in the preprocessor escapes strings. GCC and Clang differ in how they treat section names that contain \". The portable solution is to not use a string literal with the preprocessor stringification operator. Fixes: commit bbda5ec671d3 ("kbuild: simplify dependency generation for CONFIG_TRIM_UNUSED_KSYMS") Reported-by: kbuild test robot Suggested-by: Kees Cook Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Nathan Chancellor Cc: Masahiro Yamada Cc: Matthias Maennich Cc: Jessica Yu Cc: Greg Kroah-Hartman Cc: Will Deacon Link: https://bugs.llvm.org/show_bug.cgi?id=42950 Link: https://github.com/ClangBuiltLinux/linux/issues/1166 Link: https://lkml.kernel.org/r/20200929190701.398762-1-ndesaulniers@google.com Signed-off-by: Linus Torvalds --- include/linux/export.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/export.h b/include/linux/export.h index fceb5e855717..8933ff6ad23a 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -130,7 +130,7 @@ struct kernel_symbol { * discarded in the final link stage. */ #define __ksym_marker(sym) \ - static int __ksym_marker_##sym[0] __section(".discard.ksym") __used + static int __ksym_marker_##sym[0] __section(.discard.ksym) __used #define __EXPORT_SYMBOL(sym, sec, ns) \ __ksym_marker(sym); \ -- cgit v1.2.3 From d7cff4ded857ff7cc3e49eb39cc14df9345b9662 Mon Sep 17 00:00:00 2001 From: tangjianqiang Date: Tue, 13 Oct 2020 16:48:37 -0700 Subject: include/linux/slab.h: fix a typo error in comment fix a typo error in slab.h "allocagtor" -> "allocator" Signed-off-by: tangjianqiang Signed-off-by: Andrew Morton Acked-by: Souptick Joarder Link: https://lkml.kernel.org/r/1600230053-24303-1-git-send-email-tangjianqiang@xiaomi.com Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 24df2393ec03..9e155cc83b8a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -279,7 +279,7 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) /* Maximum size for which we actually use a slab cache */ #define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH) -/* Maximum order allocatable via the slab allocagtor */ +/* Maximum order allocatable via the slab allocator */ #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT) /* -- cgit v1.2.3 From 2dd57d3415f8623a5e9494c88978a202886041aa Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2020 16:48:57 -0700 Subject: x86/numa: cleanup configuration dependent command-line options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "device-dax: Support sub-dividing soft-reserved ranges", v5. The device-dax facility allows an address range to be directly mapped through a chardev, or optionally hotplugged to the core kernel page allocator as System-RAM. It is the mechanism for converting persistent memory (pmem) to be used as another volatile memory pool i.e. the current Memory Tiering hot topic on linux-mm. In the case of pmem the nvdimm-namespace-label mechanism can sub-divide it, but that labeling mechanism is not available / applicable to soft-reserved ("EFI specific purpose") memory [3]. This series provides a sysfs-mechanism for the daxctl utility to enable provisioning of volatile-soft-reserved memory ranges. The motivations for this facility are: 1/ Allow performance differentiated memory ranges to be split between kernel-managed and directly-accessed use cases. 2/ Allow physical memory to be provisioned along performance relevant address boundaries. For example, divide a memory-side cache [4] along cache-color boundaries. 3/ Parcel out soft-reserved memory to VMs using device-dax as a security / permissions boundary [5]. Specifically I have seen people (ab)using memmap=nn!ss (mark System-RAM as Persistent Memory) just to get the device-dax interface on custom address ranges. A follow-on for the VM use case is to teach device-dax to dynamically allocate 'struct page' at runtime to reduce the duplication of 'struct page' space in both the guest and the host kernel for the same physical pages. [2]: http://lore.kernel.org/r/20200713160837.13774-11-joao.m.martins@oracle.com [3]: http://lore.kernel.org/r/157309097008.1579826.12818463304589384434.stgit@dwillia2-desk3.amr.corp.intel.com [4]: http://lore.kernel.org/r/154899811738.3165233.12325692939590944259.stgit@dwillia2-desk3.amr.corp.intel.com [5]: http://lore.kernel.org/r/20200110190313.17144-1-joao.m.martins@oracle.com This patch (of 23): In preparation for adding a new numa= option clean up the existing ones to avoid ifdefs in numa_setup(), and provide feedback when the option is numa=fake= option is invalid due to kernel config. The same does not need to be done for numa=noacpi, since the capability is already hard disabled at compile-time. Suggested-by: Rafael J. Wysocki Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Ben Skeggs Cc: Borislav Petkov Cc: Brice Goglin Cc: Catalin Marinas Cc: Daniel Vetter Cc: Dave Hansen Cc: Dave Jiang Cc: David Airlie Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Jeff Moyer Cc: Jia He Cc: Joao Martins Cc: Jonathan Cameron Cc: Michael Ellerman Cc: Mike Rapoport Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Tom Lendacky Cc: Vishal Verma Cc: Wei Yang Cc: Will Deacon Cc: Ard Biesheuvel Cc: Bjorn Helgaas Cc: Boris Ostrovsky Cc: Hulk Robot Cc: Jason Yan Cc: "Jérôme Glisse" Cc: Juergen Gross Cc: kernel test robot Cc: Randy Dunlap Cc: Stefano Stabellini Cc: Vivek Goyal Link: https://lkml.kernel.org/r/160106109960.30709.7379926726669669398.stgit@dwillia2-desk3.amr.corp.intel.com Link: https://lkml.kernel.org/r/159643094279.4062302.17779410714418721328.stgit@dwillia2-desk3.amr.corp.intel.com Link: https://lkml.kernel.org/r/159643094925.4062302.14979872973043772305.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- arch/x86/include/asm/numa.h | 8 +++++++- arch/x86/mm/numa.c | 8 ++------ arch/x86/mm/numa_emulation.c | 3 ++- arch/x86/xen/enlighten_pv.c | 2 +- drivers/acpi/numa/srat.c | 9 +++++++-- include/acpi/acpi_numa.h | 6 +++++- 6 files changed, 24 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index bbfde3d2662f..0aecc0b629e0 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -3,6 +3,7 @@ #define _ASM_X86_NUMA_H #include +#include #include #include @@ -77,7 +78,12 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable); #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -void numa_emu_cmdline(char *); +int numa_emu_cmdline(char *str); +#else /* CONFIG_NUMA_EMU */ +static inline int numa_emu_cmdline(char *str) +{ + return -EINVAL; +} #endif /* CONFIG_NUMA_EMU */ #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index aa76ec2d359b..87c52822cc44 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -37,14 +37,10 @@ static __init int numa_setup(char *opt) return -EINVAL; if (!strncmp(opt, "off", 3)) numa_off = 1; -#ifdef CONFIG_NUMA_EMU if (!strncmp(opt, "fake=", 5)) - numa_emu_cmdline(opt + 5); -#endif -#ifdef CONFIG_ACPI_NUMA + return numa_emu_cmdline(opt + 5); if (!strncmp(opt, "noacpi", 6)) - acpi_numa = -1; -#endif + disable_srat(); return 0; } early_param("numa", numa_setup); diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 683cd12f4793..87d77cc52f86 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -13,9 +13,10 @@ static int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; -void __init numa_emu_cmdline(char *str) +int __init numa_emu_cmdline(char *str) { emu_cmdline = str; + return 0; } static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 41485a8a6dcf..b1418a6c0e90 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1300,7 +1300,7 @@ asmlinkage __visible void __init xen_start_kernel(void) * any NUMA information the kernel tries to get from ACPI will * be meaningless. Prevent it from trying. */ - acpi_numa = -1; + disable_srat(); #endif WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 15bbaab8500b..1b0ae0a1959b 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -27,7 +27,12 @@ static int node_to_pxm_map[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = PXM_INVAL }; unsigned char acpi_srat_revision __initdata; -int acpi_numa __initdata; +static int acpi_numa __initdata; + +void __init disable_srat(void) +{ + acpi_numa = -1; +} int pxm_to_node(int pxm) { @@ -163,7 +168,7 @@ static int __init slit_valid(struct acpi_table_slit *slit) void __init bad_srat(void) { pr_err("SRAT: SRAT not used.\n"); - acpi_numa = -1; + disable_srat(); } int __init srat_disabled(void) diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index fdebcfc6c8df..8784183b2204 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -17,10 +17,14 @@ extern int pxm_to_node(int); extern int node_to_pxm(int); extern int acpi_map_pxm_to_node(int); extern unsigned char acpi_srat_revision; -extern int acpi_numa __initdata; +extern void disable_srat(void); extern void bad_srat(void); extern int srat_disabled(void); +#else /* CONFIG_ACPI_NUMA */ +static inline void disable_srat(void) +{ +} #endif /* CONFIG_ACPI_NUMA */ #endif /* __ACP_NUMA_H */ -- cgit v1.2.3 From 3b0d31011d39759e3ba7214f75f77bb31983b5a4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2020 16:49:02 -0700 Subject: x86/numa: add 'nohmat' option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Disable parsing of the HMAT for debug, to workaround broken platform instances, or cases where it is otherwise not wanted. [rdunlap@infradead.org: fix build when CONFIG_ACPI is not set] Link: https://lkml.kernel.org/r/70e5ee34-9809-a997-7b49-499e4be61307@infradead.org Signed-off-by: Dan Williams Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ard Biesheuvel Cc: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Ben Skeggs Cc: Brice Goglin Cc: Catalin Marinas Cc: Daniel Vetter Cc: Dave Jiang Cc: David Airlie Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Jeff Moyer Cc: Jia He Cc: Joao Martins Cc: Jonathan Cameron Cc: Michael Ellerman Cc: Mike Rapoport Cc: Paul Mackerras Cc: Pavel Tatashin Cc: "Rafael J. Wysocki" Cc: Tom Lendacky Cc: Vishal Verma Cc: Wei Yang Cc: Will Deacon Cc: Bjorn Helgaas Cc: Boris Ostrovsky Cc: Hulk Robot Cc: Jason Yan Cc: "Jérôme Glisse" Cc: Juergen Gross Cc: kernel test robot Cc: Randy Dunlap Cc: Stefano Stabellini Cc: Vivek Goyal Link: https://lkml.kernel.org/r/159643095540.4062302.732962081968036212.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- Documentation/x86/x86_64/boot-options.rst | 4 ++++ arch/x86/mm/numa.c | 2 ++ drivers/acpi/numa/hmat.c | 8 +++++++- include/acpi/acpi_numa.h | 8 ++++++++ include/linux/acpi.h | 2 ++ 5 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst index 2b98efb5ba7f..324cefff92e7 100644 --- a/Documentation/x86/x86_64/boot-options.rst +++ b/Documentation/x86/x86_64/boot-options.rst @@ -173,6 +173,10 @@ NUMA numa=noacpi Don't parse the SRAT table for NUMA setup + numa=nohmat + Don't parse the HMAT table for NUMA setup, or soft-reserved memory + partitioning. + numa=fake=[MG] If given as a memory unit, fills all system RAM with nodes of size interleaved over physical nodes. diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 87c52822cc44..f3805bbaa784 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -41,6 +41,8 @@ static __init int numa_setup(char *opt) return numa_emu_cmdline(opt + 5); if (!strncmp(opt, "noacpi", 6)) disable_srat(); + if (!strncmp(opt, "nohmat", 6)) + disable_hmat(); return 0; } early_param("numa", numa_setup); diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 2c32cfb72370..a12e36a12618 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -26,6 +26,12 @@ #include static u8 hmat_revision; +static int hmat_disable __initdata; + +void __init disable_hmat(void) +{ + hmat_disable = 1; +} static LIST_HEAD(targets); static LIST_HEAD(initiators); @@ -814,7 +820,7 @@ static __init int hmat_init(void) enum acpi_hmat_type i; acpi_status status; - if (srat_disabled()) + if (srat_disabled() || hmat_disable) return 0; status = acpi_get_table(ACPI_SIG_SRAT, 0, &tbl); diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index 8784183b2204..0e9302285f14 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -27,4 +27,12 @@ static inline void disable_srat(void) { } #endif /* CONFIG_ACPI_NUMA */ + +#ifdef CONFIG_ACPI_HMAT +extern void disable_hmat(void); +#else /* CONFIG_ACPI_HMAT */ +static inline void disable_hmat(void) +{ +} +#endif /* CONFIG_ACPI_HMAT */ #endif /* __ACP_NUMA_H */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 64ae25c59d55..cfa8c0015863 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -709,6 +709,8 @@ static inline u64 acpi_arch_get_root_pointer(void) #define ACPI_HANDLE_FWNODE(fwnode) (NULL) #define ACPI_DEVICE_CLASS(_cls, _msk) .cls = (0), .cls_msk = (0), +#include + struct fwnode_handle; static inline bool acpi_dev_found(const char *hid) -- cgit v1.2.3 From c01044cc819160323f3ca4acd44fca487c4432e6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2020 16:49:13 -0700 Subject: ACPI: HMAT: refactor hmat_register_target_device to hmem_register_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for exposing "Soft Reserved" memory ranges without an HMAT, move the hmem device registration to its own compilation unit and make the implementation generic. The generic implementation drops usage acpi_map_pxm_to_online_node() that was translating ACPI proximity domain values and instead relies on numa_map_to_online_node() to determine the numa node for the device. [joao.m.martins@oracle.com: CONFIG_DEV_DAX_HMEM_DEVICES should depend on CONFIG_DAX=y] Link: https://lkml.kernel.org/r/8f34727f-ec2d-9395-cb18-969ec8a5d0d4@oracle.com Signed-off-by: Dan Williams Signed-off-by: Joao Martins Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Ben Skeggs Cc: Borislav Petkov Cc: Brice Goglin Cc: Catalin Marinas Cc: Daniel Vetter Cc: Dave Hansen Cc: Dave Jiang Cc: David Airlie Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Jeff Moyer Cc: Jia He Cc: Joao Martins Cc: Jonathan Cameron Cc: Michael Ellerman Cc: Mike Rapoport Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Tom Lendacky Cc: Vishal Verma Cc: Wei Yang Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ard Biesheuvel Cc: Bjorn Helgaas Cc: Boris Ostrovsky Cc: Hulk Robot Cc: Jason Yan Cc: "Jérôme Glisse" Cc: Juergen Gross Cc: kernel test robot Cc: Randy Dunlap Cc: Stefano Stabellini Cc: Vivek Goyal Link: https://lkml.kernel.org/r/159643096584.4062302.5035370788475153738.stgit@dwillia2-desk3.amr.corp.intel.com Link: https://lore.kernel.org/r/158318761484.2216124.2049322072599482736.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- drivers/acpi/numa/hmat.c | 68 +++++------------------------------------------ drivers/dax/Kconfig | 4 +++ drivers/dax/Makefile | 3 +-- drivers/dax/hmem.c | 56 -------------------------------------- drivers/dax/hmem/Makefile | 5 ++++ drivers/dax/hmem/device.c | 65 ++++++++++++++++++++++++++++++++++++++++++++ drivers/dax/hmem/hmem.c | 56 ++++++++++++++++++++++++++++++++++++++ include/linux/dax.h | 8 ++++++ 8 files changed, 145 insertions(+), 120 deletions(-) delete mode 100644 drivers/dax/hmem.c create mode 100644 drivers/dax/hmem/Makefile create mode 100644 drivers/dax/hmem/device.c create mode 100644 drivers/dax/hmem/hmem.c (limited to 'include') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index a12e36a12618..134bcb40b2af 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -24,6 +24,7 @@ #include #include #include +#include static u8 hmat_revision; static int hmat_disable __initdata; @@ -640,66 +641,6 @@ static void hmat_register_target_perf(struct memory_target *target) node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0); } -static void hmat_register_target_device(struct memory_target *target, - struct resource *r) -{ - /* define a clean / non-busy resource for the platform device */ - struct resource res = { - .start = r->start, - .end = r->end, - .flags = IORESOURCE_MEM, - }; - struct platform_device *pdev; - struct memregion_info info; - int rc, id; - - rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, - IORES_DESC_SOFT_RESERVED); - if (rc != REGION_INTERSECTS) - return; - - id = memregion_alloc(GFP_KERNEL); - if (id < 0) { - pr_err("memregion allocation failure for %pr\n", &res); - return; - } - - pdev = platform_device_alloc("hmem", id); - if (!pdev) { - pr_err("hmem device allocation failure for %pr\n", &res); - goto out_pdev; - } - - pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm); - info = (struct memregion_info) { - .target_node = acpi_map_pxm_to_node(target->memory_pxm), - }; - rc = platform_device_add_data(pdev, &info, sizeof(info)); - if (rc < 0) { - pr_err("hmem memregion_info allocation failure for %pr\n", &res); - goto out_pdev; - } - - rc = platform_device_add_resources(pdev, &res, 1); - if (rc < 0) { - pr_err("hmem resource allocation failure for %pr\n", &res); - goto out_resource; - } - - rc = platform_device_add(pdev); - if (rc < 0) { - dev_err(&pdev->dev, "device add failed for %pr\n", &res); - goto out_resource; - } - - return; - -out_resource: - put_device(&pdev->dev); -out_pdev: - memregion_free(id); -} - static void hmat_register_target_devices(struct memory_target *target) { struct resource *res; @@ -711,8 +652,11 @@ static void hmat_register_target_devices(struct memory_target *target) if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM)) return; - for (res = target->memregions.child; res; res = res->sibling) - hmat_register_target_device(target, res); + for (res = target->memregions.child; res; res = res->sibling) { + int target_nid = acpi_map_pxm_to_node(target->memory_pxm); + + hmem_register_device(target_nid, res); + } } static void hmat_register_target(struct memory_target *target) diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 3b6c06f07326..a66dff78f298 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -48,6 +48,10 @@ config DEV_DAX_HMEM Say M if unsure. +config DEV_DAX_HMEM_DEVICES + depends on DEV_DAX_HMEM && DAX=y + def_bool y + config DEV_DAX_KMEM tristate "KMEM DAX: volatile-use of persistent memory" default DEV_DAX diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 80065b38b3c4..9d4ba672d305 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -2,11 +2,10 @@ obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o -obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o dax-y := super.o dax-y += bus.o device_dax-y := device.o -dax_hmem-y := hmem.o obj-y += pmem/ +obj-y += hmem/ diff --git a/drivers/dax/hmem.c b/drivers/dax/hmem.c deleted file mode 100644 index fe7214daf62e..000000000000 --- a/drivers/dax/hmem.c +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include "bus.h" - -static int dax_hmem_probe(struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - struct dev_pagemap pgmap = { }; - struct dax_region *dax_region; - struct memregion_info *mri; - struct dev_dax *dev_dax; - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENOMEM; - - mri = dev->platform_data; - memcpy(&pgmap.res, res, sizeof(*res)); - - dax_region = alloc_dax_region(dev, pdev->id, res, mri->target_node, - PMD_SIZE, PFN_DEV|PFN_MAP); - if (!dax_region) - return -ENOMEM; - - dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap); - if (IS_ERR(dev_dax)) - return PTR_ERR(dev_dax); - - /* child dev_dax instances now own the lifetime of the dax_region */ - dax_region_put(dax_region); - return 0; -} - -static int dax_hmem_remove(struct platform_device *pdev) -{ - /* devm handles teardown */ - return 0; -} - -static struct platform_driver dax_hmem_driver = { - .probe = dax_hmem_probe, - .remove = dax_hmem_remove, - .driver = { - .name = "hmem", - }, -}; - -module_platform_driver(dax_hmem_driver); - -MODULE_ALIAS("platform:hmem*"); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/dax/hmem/Makefile b/drivers/dax/hmem/Makefile new file mode 100644 index 000000000000..a9d353d0c9ed --- /dev/null +++ b/drivers/dax/hmem/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o +obj-$(CONFIG_DEV_DAX_HMEM_DEVICES) += device.o + +dax_hmem-y := hmem.o diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c new file mode 100644 index 000000000000..b9dd6b27745c --- /dev/null +++ b/drivers/dax/hmem/device.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +void hmem_register_device(int target_nid, struct resource *r) +{ + /* define a clean / non-busy resource for the platform device */ + struct resource res = { + .start = r->start, + .end = r->end, + .flags = IORESOURCE_MEM, + }; + struct platform_device *pdev; + struct memregion_info info; + int rc, id; + + rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + if (rc != REGION_INTERSECTS) + return; + + id = memregion_alloc(GFP_KERNEL); + if (id < 0) { + pr_err("memregion allocation failure for %pr\n", &res); + return; + } + + pdev = platform_device_alloc("hmem", id); + if (!pdev) { + pr_err("hmem device allocation failure for %pr\n", &res); + goto out_pdev; + } + + pdev->dev.numa_node = numa_map_to_online_node(target_nid); + info = (struct memregion_info) { + .target_node = target_nid, + }; + rc = platform_device_add_data(pdev, &info, sizeof(info)); + if (rc < 0) { + pr_err("hmem memregion_info allocation failure for %pr\n", &res); + goto out_pdev; + } + + rc = platform_device_add_resources(pdev, &res, 1); + if (rc < 0) { + pr_err("hmem resource allocation failure for %pr\n", &res); + goto out_resource; + } + + rc = platform_device_add(pdev); + if (rc < 0) { + dev_err(&pdev->dev, "device add failed for %pr\n", &res); + goto out_resource; + } + + return; + +out_resource: + put_device(&pdev->dev); +out_pdev: + memregion_free(id); +} diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c new file mode 100644 index 000000000000..29ceb5795297 --- /dev/null +++ b/drivers/dax/hmem/hmem.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "../bus.h" + +static int dax_hmem_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct dev_pagemap pgmap = { }; + struct dax_region *dax_region; + struct memregion_info *mri; + struct dev_dax *dev_dax; + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENOMEM; + + mri = dev->platform_data; + memcpy(&pgmap.res, res, sizeof(*res)); + + dax_region = alloc_dax_region(dev, pdev->id, res, mri->target_node, + PMD_SIZE, PFN_DEV|PFN_MAP); + if (!dax_region) + return -ENOMEM; + + dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap); + if (IS_ERR(dev_dax)) + return PTR_ERR(dev_dax); + + /* child dev_dax instances now own the lifetime of the dax_region */ + dax_region_put(dax_region); + return 0; +} + +static int dax_hmem_remove(struct platform_device *pdev) +{ + /* devm handles teardown */ + return 0; +} + +static struct platform_driver dax_hmem_driver = { + .probe = dax_hmem_probe, + .remove = dax_hmem_remove, + .driver = { + .name = "hmem", + }, +}; + +module_platform_driver(dax_hmem_driver); + +MODULE_ALIAS("platform:hmem*"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/include/linux/dax.h b/include/linux/dax.h index 43b39ab9de1a..4ec0bbf86205 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -238,4 +238,12 @@ static inline bool dax_mapping(struct address_space *mapping) return mapping->host && IS_DAX(mapping->host); } +#ifdef CONFIG_DEV_DAX_HMEM_DEVICES +void hmem_register_device(int target_nid, struct resource *r); +#else +static inline void hmem_register_device(int target_nid, struct resource *r) +{ +} +#endif + #endif -- cgit v1.2.3 From a035b6bf863e5c42c2746de2a8ed6600140307e7 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2020 16:49:23 -0700 Subject: mm/memory_hotplug: introduce default phys_to_target_node() implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to set a fallback value for dev_dax->target_node, introduce generic fallback helpers for phys_to_target_node() A generic implementation based on node-data or memblock was proposed, but as noted by Mike: "Here again, I would prefer to add a weak default for phys_to_target_node() because the "generic" implementation is not really generic. The fallback to reserved ranges is x86 specfic because on x86 most of the reserved areas is not in memblock.memory. AFAIK, no other architecture does this." The info message in the generic memory_add_physaddr_to_nid() implementation is fixed up to properly reflect that memory_add_physaddr_to_nid() communicates "online" node info and phys_to_target_node() indicates "target / to-be-onlined" node info. [akpm@linux-foundation.org: fix CONFIG_MEMORY_HOTPLUG=n build] Link: https://lkml.kernel.org/r/202008252130.7YrHIyMI%25lkp@intel.com Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Cc: David Hildenbrand Cc: Mike Rapoport Cc: Jia He Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Ben Skeggs Cc: Borislav Petkov Cc: Brice Goglin Cc: Catalin Marinas Cc: Daniel Vetter Cc: Dave Hansen Cc: Dave Jiang Cc: David Airlie Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Jeff Moyer Cc: Joao Martins Cc: Jonathan Cameron Cc: Michael Ellerman Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Tom Lendacky Cc: Vishal Verma Cc: Wei Yang Cc: Will Deacon Cc: Ard Biesheuvel Cc: Bjorn Helgaas Cc: Boris Ostrovsky Cc: Hulk Robot Cc: Jason Yan Cc: "Jérôme Glisse" Cc: Juergen Gross Cc: kernel test robot Cc: Randy Dunlap Cc: Stefano Stabellini Cc: Vivek Goyal Link: https://lkml.kernel.org/r/159643097768.4062302.3135192588966888630.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 1 - include/linux/memory_hotplug.h | 23 ++++++++++++++--------- include/linux/numa.h | 11 ----------- mm/memory_hotplug.c | 10 +++++++++- 4 files changed, 23 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f3805bbaa784..c62e274d52d0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -917,7 +917,6 @@ int phys_to_target_node(phys_addr_t start) return meminfo_to_nid(&numa_reserved_meminfo, start); } -EXPORT_SYMBOL_GPL(phys_to_target_node); int memory_add_physaddr_to_nid(u64 start) { diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 375515803cd8..c0faa7a30c46 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -149,15 +149,6 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_params *params); #endif /* ARCH_HAS_ADD_PAGES */ -#ifdef CONFIG_NUMA -extern int memory_add_physaddr_to_nid(u64 start); -#else -static inline int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -#endif - #ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION /* * For supporting node-hotadd, we have to allocate a new pgdat. @@ -284,6 +275,20 @@ static inline bool movable_node_is_enabled(void) } #endif /* ! CONFIG_MEMORY_HOTPLUG */ +#ifdef CONFIG_NUMA +extern int memory_add_physaddr_to_nid(u64 start); +extern int phys_to_target_node(u64 start); +#else +static inline int memory_add_physaddr_to_nid(u64 start) +{ + return 0; +} +static inline int phys_to_target_node(u64 start) +{ + return 0; +} +#endif + #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * pgdat resizing functions diff --git a/include/linux/numa.h b/include/linux/numa.h index a42df804679e..8cb33ccfb671 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -23,22 +23,11 @@ #ifdef CONFIG_NUMA /* Generic implementation available */ int numa_map_to_online_node(int node); - -/* - * Optional architecture specific implementation, users need a "depends - * on $ARCH" - */ -int phys_to_target_node(phys_addr_t addr); #else static inline int numa_map_to_online_node(int node) { return NUMA_NO_NODE; } - -static inline int phys_to_target_node(phys_addr_t addr) -{ - return NUMA_NO_NODE; -} #endif #endif /* _LINUX_NUMA_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ce3e73e3a5c1..8e9e2d44cdad 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -353,11 +353,19 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, #ifdef CONFIG_NUMA int __weak memory_add_physaddr_to_nid(u64 start) { - pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", start); return 0; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); + +int __weak phys_to_target_node(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(phys_to_target_node); #endif /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ -- cgit v1.2.3 From a4574f63edc6f76fb46dcd65d3eb4d5a8e23ba38 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2020 16:50:29 -0700 Subject: mm/memremap_pages: convert to 'struct range' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'struct resource' in 'struct dev_pagemap' is only used for holding resource span information. The other fields, 'name', 'flags', 'desc', 'parent', 'sibling', and 'child' are all unused wasted space. This is in preparation for introducing a multi-range extension of devm_memremap_pages(). The bulk of this change is unwinding all the places internal to libnvdimm that used 'struct resource' unnecessarily, and replacing instances of 'struct dev_pagemap'.res with 'struct dev_pagemap'.range. P2PDMA had a minor usage of the resource flags field, but only to report failures with "%pR". That is replaced with an open coded print of the range. [dan.carpenter@oracle.com: mm/hmm/test: use after free in dmirror_allocate_chunk()] Link: https://lkml.kernel.org/r/20200926121402.GA7467@kadam Signed-off-by: Dan Williams Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Reviewed-by: Boris Ostrovsky [xen] Cc: Paul Mackerras Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Vishal Verma Cc: Vivek Goyal Cc: Dave Jiang Cc: Ben Skeggs Cc: David Airlie Cc: Daniel Vetter Cc: Ira Weiny Cc: Bjorn Helgaas Cc: Juergen Gross Cc: Stefano Stabellini Cc: "Jérôme Glisse" Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brice Goglin Cc: Catalin Marinas Cc: Dave Hansen Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Hulk Robot Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Jason Yan Cc: Jeff Moyer Cc: Jia He Cc: Joao Martins Cc: Jonathan Cameron Cc: kernel test robot Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Randy Dunlap Cc: Thomas Gleixner Cc: Tom Lendacky Cc: Wei Yang Cc: Will Deacon Link: https://lkml.kernel.org/r/159643103173.4062302.768998885691711532.stgit@dwillia2-desk3.amr.corp.intel.com Link: https://lkml.kernel.org/r/160106115761.30709.13539840236873663620.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_hv_uvmem.c | 13 +++--- drivers/dax/bus.c | 10 ++--- drivers/dax/bus.h | 2 +- drivers/dax/dax-private.h | 5 --- drivers/dax/device.c | 3 +- drivers/dax/hmem/hmem.c | 5 ++- drivers/dax/pmem/core.c | 12 +++--- drivers/gpu/drm/nouveau/nouveau_dmem.c | 14 +++---- drivers/nvdimm/badrange.c | 26 ++++++------ drivers/nvdimm/claim.c | 13 +++--- drivers/nvdimm/nd.h | 3 +- drivers/nvdimm/pfn_devs.c | 12 +++--- drivers/nvdimm/pmem.c | 26 ++++++------ drivers/nvdimm/region.c | 21 ++++++---- drivers/pci/p2pdma.c | 11 +++-- drivers/xen/unpopulated-alloc.c | 44 ++++++++++++------- include/linux/memremap.h | 5 ++- include/linux/range.h | 6 +++ lib/test_hmm.c | 50 +++++++++++----------- mm/memremap.c | 77 +++++++++++++++++----------------- tools/testing/nvdimm/test/iomap.c | 2 +- 21 files changed, 195 insertions(+), 165 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 7705d5557239..29ec555055c2 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -687,9 +687,9 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) struct kvmppc_uvmem_page_pvt *pvt; unsigned long pfn_last, pfn_first; - pfn_first = kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT; + pfn_first = kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT; pfn_last = pfn_first + - (resource_size(&kvmppc_uvmem_pgmap.res) >> PAGE_SHIFT); + (range_len(&kvmppc_uvmem_pgmap.range) >> PAGE_SHIFT); spin_lock(&kvmppc_uvmem_bitmap_lock); bit = find_first_zero_bit(kvmppc_uvmem_bitmap, @@ -1007,7 +1007,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) static void kvmppc_uvmem_page_free(struct page *page) { unsigned long pfn = page_to_pfn(page) - - (kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT); + (kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT); struct kvmppc_uvmem_page_pvt *pvt; spin_lock(&kvmppc_uvmem_bitmap_lock); @@ -1170,7 +1170,8 @@ int kvmppc_uvmem_init(void) } kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE; - kvmppc_uvmem_pgmap.res = *res; + kvmppc_uvmem_pgmap.range.start = res->start; + kvmppc_uvmem_pgmap.range.end = res->end; kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops; /* just one global instance: */ kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap; @@ -1205,7 +1206,7 @@ void kvmppc_uvmem_free(void) return; memunmap_pages(&kvmppc_uvmem_pgmap); - release_mem_region(kvmppc_uvmem_pgmap.res.start, - resource_size(&kvmppc_uvmem_pgmap.res)); + release_mem_region(kvmppc_uvmem_pgmap.range.start, + range_len(&kvmppc_uvmem_pgmap.range)); kfree(kvmppc_uvmem_bitmap); } diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 53d07f2f1285..00fa73a8dfb4 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -515,7 +515,7 @@ static void dax_region_unregister(void *region) } struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, int target_node, unsigned int align, + struct range *range, int target_node, unsigned int align, unsigned long flags) { struct dax_region *dax_region; @@ -530,8 +530,8 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, return NULL; } - if (!IS_ALIGNED(res->start, align) - || !IS_ALIGNED(resource_size(res), align)) + if (!IS_ALIGNED(range->start, align) + || !IS_ALIGNED(range_len(range), align)) return NULL; dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); @@ -546,8 +546,8 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, dax_region->target_node = target_node; ida_init(&dax_region->ida); dax_region->res = (struct resource) { - .start = res->start, - .end = res->end, + .start = range->start, + .end = range->end, .flags = IORESOURCE_MEM | flags, }; diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index da27ea70a19a..72b92f95509f 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -13,7 +13,7 @@ void dax_region_put(struct dax_region *dax_region); #define IORESOURCE_DAX_STATIC (1UL << 0) struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, int target_node, unsigned int align, + struct range *range, int target_node, unsigned int align, unsigned long flags); enum dev_dax_subsys { diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index b81a1494d82b..0cbb2ec81ca7 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -61,11 +61,6 @@ struct dev_dax { struct range range; }; -static inline u64 range_len(struct range *range) -{ - return range->end - range->start + 1; -} - static inline struct dev_dax *to_dev_dax(struct device *dev) { return container_of(dev, struct dev_dax, dev); diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 9833fa83b537..a14448bca83d 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -416,8 +416,7 @@ int dev_dax_probe(struct dev_dax *dev_dax) pgmap = devm_kzalloc(dev, sizeof(*pgmap), GFP_KERNEL); if (!pgmap) return -ENOMEM; - pgmap->res.start = range->start; - pgmap->res.end = range->end; + pgmap->range = *range; } pgmap->type = MEMORY_DEVICE_GENERIC; addr = devm_memremap_pages(dev, pgmap); diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index aa260009dfc7..1a3347bb6143 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -13,13 +13,16 @@ static int dax_hmem_probe(struct platform_device *pdev) struct dev_dax_data data; struct dev_dax *dev_dax; struct resource *res; + struct range range; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) return -ENOMEM; mri = dev->platform_data; - dax_region = alloc_dax_region(dev, pdev->id, res, mri->target_node, + range.start = res->start; + range.end = res->end; + dax_region = alloc_dax_region(dev, pdev->id, &range, mri->target_node, PMD_SIZE, 0); if (!dax_region) return -ENOMEM; diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c index 4fe700884338..62b26bfceab1 100644 --- a/drivers/dax/pmem/core.c +++ b/drivers/dax/pmem/core.c @@ -9,7 +9,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) { - struct resource res; + struct range range; int rc, id, region_id; resource_size_t offset; struct nd_pfn_sb *pfn_sb; @@ -50,10 +50,10 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) if (rc != 2) return ERR_PTR(-EINVAL); - /* adjust the dax_region resource to the start of data */ - memcpy(&res, &pgmap.res, sizeof(res)); - res.start += offset; - dax_region = alloc_dax_region(dev, region_id, &res, + /* adjust the dax_region range to the start of data */ + range = pgmap.range; + range.start += offset, + dax_region = alloc_dax_region(dev, region_id, &range, nd_region->target_node, le32_to_cpu(pfn_sb->align), IORESOURCE_DAX_STATIC); if (!dax_region) @@ -64,7 +64,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) .id = id, .pgmap = &pgmap, .subsys = subsys, - .size = resource_size(&res), + .size = range_len(&range), }; dev_dax = devm_create_dev_dax(&data); diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 4e8112fde3e6..25811ed7e274 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -101,7 +101,7 @@ unsigned long nouveau_dmem_page_addr(struct page *page) { struct nouveau_dmem_chunk *chunk = nouveau_page_to_chunk(page); unsigned long off = (page_to_pfn(page) << PAGE_SHIFT) - - chunk->pagemap.res.start; + chunk->pagemap.range.start; return chunk->bo->offset + off; } @@ -249,7 +249,8 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage) chunk->drm = drm; chunk->pagemap.type = MEMORY_DEVICE_PRIVATE; - chunk->pagemap.res = *res; + chunk->pagemap.range.start = res->start; + chunk->pagemap.range.end = res->end; chunk->pagemap.ops = &nouveau_dmem_pagemap_ops; chunk->pagemap.owner = drm->dev; @@ -273,7 +274,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage) list_add(&chunk->list, &drm->dmem->chunks); mutex_unlock(&drm->dmem->mutex); - pfn_first = chunk->pagemap.res.start >> PAGE_SHIFT; + pfn_first = chunk->pagemap.range.start >> PAGE_SHIFT; page = pfn_to_page(pfn_first); spin_lock(&drm->dmem->lock); for (i = 0; i < DMEM_CHUNK_NPAGES - 1; ++i, ++page) { @@ -294,8 +295,7 @@ out_bo_unpin: out_bo_free: nouveau_bo_ref(NULL, &chunk->bo); out_release: - release_mem_region(chunk->pagemap.res.start, - resource_size(&chunk->pagemap.res)); + release_mem_region(chunk->pagemap.range.start, range_len(&chunk->pagemap.range)); out_free: kfree(chunk); out: @@ -382,8 +382,8 @@ nouveau_dmem_fini(struct nouveau_drm *drm) nouveau_bo_ref(NULL, &chunk->bo); list_del(&chunk->list); memunmap_pages(&chunk->pagemap); - release_mem_region(chunk->pagemap.res.start, - resource_size(&chunk->pagemap.res)); + release_mem_region(chunk->pagemap.range.start, + range_len(&chunk->pagemap.range)); kfree(chunk); } diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c index b9eeefa27e3a..aaf6e215a8c6 100644 --- a/drivers/nvdimm/badrange.c +++ b/drivers/nvdimm/badrange.c @@ -211,7 +211,7 @@ static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) } static void badblocks_populate(struct badrange *badrange, - struct badblocks *bb, const struct resource *res) + struct badblocks *bb, const struct range *range) { struct badrange_entry *bre; @@ -222,34 +222,34 @@ static void badblocks_populate(struct badrange *badrange, u64 bre_end = bre->start + bre->length - 1; /* Discard intervals with no intersection */ - if (bre_end < res->start) + if (bre_end < range->start) continue; - if (bre->start > res->end) + if (bre->start > range->end) continue; /* Deal with any overlap after start of the namespace */ - if (bre->start >= res->start) { + if (bre->start >= range->start) { u64 start = bre->start; u64 len; - if (bre_end <= res->end) + if (bre_end <= range->end) len = bre->length; else - len = res->start + resource_size(res) + len = range->start + range_len(range) - bre->start; - __add_badblock_range(bb, start - res->start, len); + __add_badblock_range(bb, start - range->start, len); continue; } /* * Deal with overlap for badrange starting before * the namespace. */ - if (bre->start < res->start) { + if (bre->start < range->start) { u64 len; - if (bre_end < res->end) - len = bre->start + bre->length - res->start; + if (bre_end < range->end) + len = bre->start + bre->length - range->start; else - len = resource_size(res); + len = range_len(range); __add_badblock_range(bb, 0, len); } } @@ -267,7 +267,7 @@ static void badblocks_populate(struct badrange *badrange, * and add badblocks entries for all matching sub-ranges */ void nvdimm_badblocks_populate(struct nd_region *nd_region, - struct badblocks *bb, const struct resource *res) + struct badblocks *bb, const struct range *range) { struct nvdimm_bus *nvdimm_bus; @@ -279,7 +279,7 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region, nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); nvdimm_bus_lock(&nvdimm_bus->dev); - badblocks_populate(&nvdimm_bus->badrange, bb, res); + badblocks_populate(&nvdimm_bus->badrange, bb, range); nvdimm_bus_unlock(&nvdimm_bus->dev); } EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 22d865ba6353..5a7c80053c62 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -303,13 +303,16 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio, resource_size_t size) { - struct resource *res = &nsio->res; struct nd_namespace_common *ndns = &nsio->common; + struct range range = { + .start = nsio->res.start, + .end = nsio->res.end, + }; nsio->size = size; - if (!devm_request_mem_region(dev, res->start, size, + if (!devm_request_mem_region(dev, range.start, size, dev_name(&ndns->dev))) { - dev_warn(dev, "could not reserve region %pR\n", res); + dev_warn(dev, "could not reserve region %pR\n", &nsio->res); return -EBUSY; } @@ -317,9 +320,9 @@ int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio, if (devm_init_badblocks(dev, &nsio->bb)) return -ENOMEM; nvdimm_badblocks_populate(to_nd_region(ndns->dev.parent), &nsio->bb, - &nsio->res); + &range); - nsio->addr = devm_memremap(dev, res->start, size, ARCH_MEMREMAP_PMEM); + nsio->addr = devm_memremap(dev, range.start, size, ARCH_MEMREMAP_PMEM); return PTR_ERR_OR_ZERO(nsio->addr); } diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 72740108ba42..696b55556d4d 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -377,8 +377,9 @@ int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); unsigned int pmem_sector_size(struct nd_namespace_common *ndns); +struct range; void nvdimm_badblocks_populate(struct nd_region *nd_region, - struct badblocks *bb, const struct resource *res); + struct badblocks *bb, const struct range *range); int devm_namespace_enable(struct device *dev, struct nd_namespace_common *ndns, resource_size_t size); void devm_namespace_disable(struct device *dev, diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 3e11ef8d3f5b..3c4787b92a6a 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -672,7 +672,7 @@ static unsigned long init_altmap_reserve(resource_size_t base) static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) { - struct resource *res = &pgmap->res; + struct range *range = &pgmap->range; struct vmem_altmap *altmap = &pgmap->altmap; struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; u64 offset = le64_to_cpu(pfn_sb->dataoff); @@ -689,16 +689,16 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) .end_pfn = PHYS_PFN(end), }; - memcpy(res, &nsio->res, sizeof(*res)); - res->start += start_pad; - res->end -= end_trunc; - + *range = (struct range) { + .start = nsio->res.start + start_pad, + .end = nsio->res.end - end_trunc, + }; if (nd_pfn->mode == PFN_MODE_RAM) { if (offset < reserve) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); } else if (nd_pfn->mode == PFN_MODE_PMEM) { - nd_pfn->npfns = PHYS_PFN((resource_size(res) - offset)); + nd_pfn->npfns = PHYS_PFN((range_len(range) - offset)); if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) dev_info(&nd_pfn->dev, "number of pfns truncated from %lld to %ld\n", diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index c86a0ceaece6..1f394f44838f 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -375,7 +375,7 @@ static int pmem_attach_disk(struct device *dev, struct nd_region *nd_region = to_nd_region(dev->parent); int nid = dev_to_node(dev), fua; struct resource *res = &nsio->res; - struct resource bb_res; + struct range bb_range; struct nd_pfn *nd_pfn = NULL; struct dax_device *dax_dev; struct nd_pfn_sb *pfn_sb; @@ -434,24 +434,26 @@ static int pmem_attach_disk(struct device *dev, pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); pmem->pfn_pad = resource_size(res) - - resource_size(&pmem->pgmap.res); + range_len(&pmem->pgmap.range); pmem->pfn_flags |= PFN_MAP; - memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); - bb_res.start += pmem->data_offset; + bb_range = pmem->pgmap.range; + bb_range.start += pmem->data_offset; } else if (pmem_should_map_pages(dev)) { - memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); + pmem->pgmap.range.start = res->start; + pmem->pgmap.range.end = res->end; pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; - memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); + bb_range = pmem->pgmap.range; } else { if (devm_add_action_or_reset(dev, pmem_release_queue, &pmem->pgmap)) return -ENOMEM; addr = devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); - memcpy(&bb_res, &nsio->res, sizeof(bb_res)); + bb_range.start = res->start; + bb_range.end = res->end; } if (IS_ERR(addr)) @@ -480,7 +482,7 @@ static int pmem_attach_disk(struct device *dev, / 512); if (devm_init_badblocks(dev, &pmem->bb)) return -ENOMEM; - nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res); + nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range); disk->bb = &pmem->bb; if (is_nvdimm_sync(nd_region)) @@ -591,8 +593,8 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) resource_size_t offset = 0, end_trunc = 0; struct nd_namespace_common *ndns; struct nd_namespace_io *nsio; - struct resource res; struct badblocks *bb; + struct range range; struct kernfs_node *bb_state; if (event != NVDIMM_REVALIDATE_POISON) @@ -628,9 +630,9 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) nsio = to_nd_namespace_io(&ndns->dev); } - res.start = nsio->res.start + offset; - res.end = nsio->res.end - end_trunc; - nvdimm_badblocks_populate(nd_region, bb, &res); + range.start = nsio->res.start + offset; + range.end = nsio->res.end - end_trunc; + nvdimm_badblocks_populate(nd_region, bb, &range); if (bb_state) sysfs_notify_dirent(bb_state); } diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 0f6978e72e7c..bfce87ed72ab 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -35,7 +35,10 @@ static int nd_region_probe(struct device *dev) return rc; if (is_memory(&nd_region->dev)) { - struct resource ndr_res; + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + nd_region->ndr_size - 1, + }; if (devm_init_badblocks(dev, &nd_region->bb)) return -ENODEV; @@ -44,9 +47,7 @@ static int nd_region_probe(struct device *dev) if (!nd_region->bb_state) dev_warn(&nd_region->dev, "'badblocks' notification disabled\n"); - ndr_res.start = nd_region->ndr_start; - ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1; - nvdimm_badblocks_populate(nd_region, &nd_region->bb, &ndr_res); + nvdimm_badblocks_populate(nd_region, &nd_region->bb, &range); } rc = nd_region_register_namespaces(nd_region, &err); @@ -121,14 +122,16 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event) { if (event == NVDIMM_REVALIDATE_POISON) { struct nd_region *nd_region = to_nd_region(dev); - struct resource res; if (is_memory(&nd_region->dev)) { - res.start = nd_region->ndr_start; - res.end = nd_region->ndr_start + - nd_region->ndr_size - 1; + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + + nd_region->ndr_size - 1, + }; + nvdimm_badblocks_populate(nd_region, - &nd_region->bb, &res); + &nd_region->bb, &range); if (nd_region->bb_state) sysfs_notify_dirent(nd_region->bb_state); } diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index f357f9a32b3a..256850513813 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -185,9 +185,8 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, return -ENOMEM; pgmap = &p2p_pgmap->pgmap; - pgmap->res.start = pci_resource_start(pdev, bar) + offset; - pgmap->res.end = pgmap->res.start + size - 1; - pgmap->res.flags = pci_resource_flags(pdev, bar); + pgmap->range.start = pci_resource_start(pdev, bar) + offset; + pgmap->range.end = pgmap->range.start + size - 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; p2p_pgmap->provider = pdev; @@ -202,13 +201,13 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr, pci_bus_address(pdev, bar) + offset, - resource_size(&pgmap->res), dev_to_node(&pdev->dev), + range_len(&pgmap->range), dev_to_node(&pdev->dev), pgmap->ref); if (error) goto pages_free; - pci_info(pdev, "added peer-to-peer DMA memory %pR\n", - &pgmap->res); + pci_info(pdev, "added peer-to-peer DMA memory %#llx-%#llx\n", + pgmap->range.start, pgmap->range.end); return 0; diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c index 3b98dc921426..091b8669eca3 100644 --- a/drivers/xen/unpopulated-alloc.c +++ b/drivers/xen/unpopulated-alloc.c @@ -18,27 +18,37 @@ static unsigned int list_count; static int fill_list(unsigned int nr_pages) { struct dev_pagemap *pgmap; + struct resource *res; void *vaddr; unsigned int i, alloc_pages = round_up(nr_pages, PAGES_PER_SECTION); - int ret; + int ret = -ENOMEM; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); if (!pgmap) - return -ENOMEM; + goto err_pgmap; pgmap->type = MEMORY_DEVICE_GENERIC; - pgmap->res.name = "Xen scratch"; - pgmap->res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->name = "Xen scratch"; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - ret = allocate_resource(&iomem_resource, &pgmap->res, + ret = allocate_resource(&iomem_resource, res, alloc_pages * PAGE_SIZE, 0, -1, PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); if (ret < 0) { pr_err("Cannot allocate new IOMEM resource\n"); - kfree(pgmap); - return ret; + goto err_resource; } + pgmap->range = (struct range) { + .start = res->start, + .end = res->end, + }; + pgmap->owner = res; + #ifdef CONFIG_XEN_HAVE_PVMMU /* * memremap will build page tables for the new memory so @@ -50,14 +60,13 @@ static int fill_list(unsigned int nr_pages) * conflict with any devices. */ if (!xen_feature(XENFEAT_auto_translated_physmap)) { - xen_pfn_t pfn = PFN_DOWN(pgmap->res.start); + xen_pfn_t pfn = PFN_DOWN(res->start); for (i = 0; i < alloc_pages; i++) { if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) { pr_warn("set_phys_to_machine() failed, no memory added\n"); - release_resource(&pgmap->res); - kfree(pgmap); - return -ENOMEM; + ret = -ENOMEM; + goto err_memremap; } } } @@ -66,9 +75,8 @@ static int fill_list(unsigned int nr_pages) vaddr = memremap_pages(pgmap, NUMA_NO_NODE); if (IS_ERR(vaddr)) { pr_err("Cannot remap memory range\n"); - release_resource(&pgmap->res); - kfree(pgmap); - return PTR_ERR(vaddr); + ret = PTR_ERR(vaddr); + goto err_memremap; } for (i = 0; i < alloc_pages; i++) { @@ -80,6 +88,14 @@ static int fill_list(unsigned int nr_pages) } return 0; + +err_memremap: + release_resource(res); +err_resource: + kfree(pgmap); +err_pgmap: + kfree(res); + return ret; } /** diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e5862746751b..d0dd261d87c0 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ +#include #include #include @@ -93,7 +94,7 @@ struct dev_pagemap_ops { /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations - * @res: physical address range covered by @ref + * @range: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping * @internal_ref: internal reference if @ref is not provided by the caller * @done: completion for @internal_ref @@ -106,7 +107,7 @@ struct dev_pagemap_ops { */ struct dev_pagemap { struct vmem_altmap altmap; - struct resource res; + struct range range; struct percpu_ref *ref; struct percpu_ref internal_ref; struct completion done; diff --git a/include/linux/range.h b/include/linux/range.h index d1fbeb664012..274681cc3154 100644 --- a/include/linux/range.h +++ b/include/linux/range.h @@ -1,12 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RANGE_H #define _LINUX_RANGE_H +#include struct range { u64 start; u64 end; }; +static inline u64 range_len(const struct range *range) +{ + return range->end - range->start + 1; +} + int add_range(struct range *range, int az, int nr_range, u64 start, u64 end); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e7dc3de355b7..e97ca8ec0bce 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -460,6 +460,21 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, unsigned long pfn_last; void *ptr; + devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); + if (!devmem) + return -ENOMEM; + + res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, + "hmm_dmirror"); + if (IS_ERR(res)) + goto err_devmem; + + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + devmem->pagemap.range.start = res->start; + devmem->pagemap.range.end = res->end; + devmem->pagemap.ops = &dmirror_devmem_ops; + devmem->pagemap.owner = mdevice; + mutex_lock(&mdevice->devmem_lock); if (mdevice->devmem_count == mdevice->devmem_capacity) { @@ -472,33 +487,18 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, sizeof(new_chunks[0]) * new_capacity, GFP_KERNEL); if (!new_chunks) - goto err; + goto err_release; mdevice->devmem_capacity = new_capacity; mdevice->devmem_chunks = new_chunks; } - res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, - "hmm_dmirror"); - if (IS_ERR(res)) - goto err; - - devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); - if (!devmem) - goto err_release; - - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - devmem->pagemap.res = *res; - devmem->pagemap.ops = &dmirror_devmem_ops; - devmem->pagemap.owner = mdevice; - ptr = memremap_pages(&devmem->pagemap, numa_node_id()); if (IS_ERR(ptr)) - goto err_free; + goto err_release; devmem->mdevice = mdevice; - pfn_first = devmem->pagemap.res.start >> PAGE_SHIFT; - pfn_last = pfn_first + - (resource_size(&devmem->pagemap.res) >> PAGE_SHIFT); + pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT; + pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT); mdevice->devmem_chunks[mdevice->devmem_count++] = devmem; mutex_unlock(&mdevice->devmem_lock); @@ -525,12 +525,12 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, return true; -err_free: - kfree(devmem); err_release: - release_mem_region(res->start, resource_size(res)); -err: mutex_unlock(&mdevice->devmem_lock); + release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range)); +err_devmem: + kfree(devmem); + return false; } @@ -1100,8 +1100,8 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) mdevice->devmem_chunks[i]; memunmap_pages(&devmem->pagemap); - release_mem_region(devmem->pagemap.res.start, - resource_size(&devmem->pagemap.res)); + release_mem_region(devmem->pagemap.range.start, + range_len(&devmem->pagemap.range)); kfree(devmem); } kfree(mdevice->devmem_chunks); diff --git a/mm/memremap.c b/mm/memremap.c index 006dace60b1a..d958d348b3ca 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -70,24 +70,24 @@ static void devmap_managed_enable_put(void) } #endif /* CONFIG_DEV_PAGEMAP_OPS */ -static void pgmap_array_delete(struct resource *res) +static void pgmap_array_delete(struct range *range) { - xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), + xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end), NULL, GFP_KERNEL); synchronize_rcu(); } static unsigned long pfn_first(struct dev_pagemap *pgmap) { - return PHYS_PFN(pgmap->res.start) + + return PHYS_PFN(pgmap->range.start) + vmem_altmap_offset(pgmap_altmap(pgmap)); } static unsigned long pfn_end(struct dev_pagemap *pgmap) { - const struct resource *res = &pgmap->res; + const struct range *range = &pgmap->range; - return (res->start + resource_size(res)) >> PAGE_SHIFT; + return (range->start + range_len(range)) >> PAGE_SHIFT; } static unsigned long pfn_next(unsigned long pfn) @@ -126,7 +126,7 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) void memunmap_pages(struct dev_pagemap *pgmap) { - struct resource *res = &pgmap->res; + struct range *range = &pgmap->range; struct page *first_page; unsigned long pfn; int nid; @@ -143,20 +143,20 @@ void memunmap_pages(struct dev_pagemap *pgmap) nid = page_to_nid(first_page); mem_hotplug_begin(); - remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start), - PHYS_PFN(resource_size(res))); + remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start), + PHYS_PFN(range_len(range))); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - __remove_pages(PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), NULL); + __remove_pages(PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), NULL); } else { - arch_remove_memory(nid, res->start, resource_size(res), + arch_remove_memory(nid, range->start, range_len(range), pgmap_altmap(pgmap)); - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); + kasan_remove_zero_shadow(__va(range->start), range_len(range)); } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); - pgmap_array_delete(res); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + pgmap_array_delete(range); WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); devmap_managed_enable_put(); } @@ -182,7 +182,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) */ void *memremap_pages(struct dev_pagemap *pgmap, int nid) { - struct resource *res = &pgmap->res; + struct range *range = &pgmap->range; struct dev_pagemap *conflict_pgmap; struct mhp_params params = { /* @@ -251,7 +251,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(error); } - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); @@ -259,7 +259,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) goto err_array; } - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); @@ -267,26 +267,27 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) goto err_array; } - is_ram = region_intersects(res->start, resource_size(res), + is_ram = region_intersects(range->start, range_len(range), IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); if (is_ram != REGION_DISJOINT) { - WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__, - is_ram == REGION_MIXED ? "mixed" : "ram", res); + WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n", + is_ram == REGION_MIXED ? "mixed" : "ram", + range->start, range->end); error = -ENXIO; goto err_array; } - error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), - PHYS_PFN(res->end), pgmap, GFP_KERNEL)); + error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start), + PHYS_PFN(range->end), pgmap, GFP_KERNEL)); if (error) goto err_array; if (nid < 0) nid = numa_mem_id(); - error = track_pfn_remap(NULL, ¶ms.pgprot, PHYS_PFN(res->start), - 0, resource_size(res)); + error = track_pfn_remap(NULL, ¶ms.pgprot, PHYS_PFN(range->start), 0, + range_len(range)); if (error) goto err_pfn_remap; @@ -304,16 +305,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) * arch_add_memory(). */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - error = add_pages(nid, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), ¶ms); + error = add_pages(nid, PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), ¶ms); } else { - error = kasan_add_zero_shadow(__va(res->start), resource_size(res)); + error = kasan_add_zero_shadow(__va(range->start), range_len(range)); if (error) { mem_hotplug_done(); goto err_kasan; } - error = arch_add_memory(nid, res->start, resource_size(res), + error = arch_add_memory(nid, range->start, range_len(range), ¶ms); } @@ -321,8 +322,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) struct zone *zone; zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; - move_pfn_range_to_zone(zone, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), params.altmap); + move_pfn_range_to_zone(zone, PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), params.altmap); } mem_hotplug_done(); @@ -334,17 +335,17 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) * to allow us to do the work while not holding the hotplug lock. */ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), pgmap); + PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), pgmap); percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); - return __va(res->start); + return __va(range->start); err_add_memory: - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); + kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); err_pfn_remap: - pgmap_array_delete(res); + pgmap_array_delete(range); err_array: dev_pagemap_kill(pgmap); dev_pagemap_cleanup(pgmap); @@ -369,7 +370,7 @@ EXPORT_SYMBOL_GPL(memremap_pages); * 'live' on entry and will be killed and reaped at * devm_memremap_pages_release() time, or if this routine fails. * - * 4/ res is expected to be a host memory range that could feasibly be + * 4/ range is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but * this is not enforced. */ @@ -426,7 +427,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, * In the cached case we're already holding a live reference. */ if (pgmap) { - if (phys >= pgmap->res.start && phys <= pgmap->res.end) + if (phys >= pgmap->range.start && phys <= pgmap->range.end) return pgmap; put_dev_pagemap(pgmap); } diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 03e40b3b0106..c62d372d426f 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -126,7 +126,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { int error; - resource_size_t offset = pgmap->res.start; + resource_size_t offset = pgmap->range.start; struct nfit_test_resource *nfit_res = get_nfit_res(offset); if (!nfit_res) -- cgit v1.2.3 From b7b3c01b191596d27a6980d1a42504f5b607f802 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2020 16:50:34 -0700 Subject: mm/memremap_pages: support multiple ranges per invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In support of device-dax growing the ability to front physically dis-contiguous ranges of memory, update devm_memremap_pages() to track multiple ranges with a single reference counter and devm instance. Convert all [devm_]memremap_pages() users to specify the number of ranges they are mapping in their 'struct dev_pagemap' instance. Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Cc: Paul Mackerras Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Vishal Verma Cc: Vivek Goyal Cc: Dave Jiang Cc: Ben Skeggs Cc: David Airlie Cc: Daniel Vetter Cc: Ira Weiny Cc: Bjorn Helgaas Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: "Jérôme Glisse" Cc: Ard Biesheuvel Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brice Goglin Cc: Catalin Marinas Cc: Dave Hansen Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Hulk Robot Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Jason Yan Cc: Jeff Moyer Cc: "Jérôme Glisse" Cc: Jia He Cc: Joao Martins Cc: Jonathan Cameron Cc: kernel test robot Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Randy Dunlap Cc: Thomas Gleixner Cc: Tom Lendacky Cc: Wei Yang Cc: Will Deacon Link: https://lkml.kernel.org/r/159643103789.4062302.18426128170217903785.stgit@dwillia2-desk3.amr.corp.intel.com Link: https://lkml.kernel.org/r/160106116293.30709.13350662794915396198.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 + drivers/dax/device.c | 1 + drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 + drivers/nvdimm/pfn_devs.c | 1 + drivers/nvdimm/pmem.c | 1 + drivers/pci/p2pdma.c | 1 + drivers/xen/unpopulated-alloc.c | 1 + include/linux/memremap.h | 10 +- lib/test_hmm.c | 1 + mm/memremap.c | 258 +++++++++++++++++++-------------- 10 files changed, 166 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 29ec555055c2..84e5a2dc8be5 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -1172,6 +1172,7 @@ int kvmppc_uvmem_init(void) kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE; kvmppc_uvmem_pgmap.range.start = res->start; kvmppc_uvmem_pgmap.range.end = res->end; + kvmppc_uvmem_pgmap.nr_range = 1; kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops; /* just one global instance: */ kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap; diff --git a/drivers/dax/device.c b/drivers/dax/device.c index a14448bca83d..5f808617672a 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -417,6 +417,7 @@ int dev_dax_probe(struct dev_dax *dev_dax) if (!pgmap) return -ENOMEM; pgmap->range = *range; + pgmap->nr_range = 1; } pgmap->type = MEMORY_DEVICE_GENERIC; addr = devm_memremap_pages(dev, pgmap); diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 25811ed7e274..a13c6215bba8 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -251,6 +251,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage) chunk->pagemap.type = MEMORY_DEVICE_PRIVATE; chunk->pagemap.range.start = res->start; chunk->pagemap.range.end = res->end; + chunk->pagemap.nr_range = 1; chunk->pagemap.ops = &nouveau_dmem_pagemap_ops; chunk->pagemap.owner = drm->dev; diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 3c4787b92a6a..b499df630d4d 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -693,6 +693,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) .start = nsio->res.start + start_pad, .end = nsio->res.end - end_trunc, }; + pgmap->nr_range = 1; if (nd_pfn->mode == PFN_MODE_RAM) { if (offset < reserve) return -EINVAL; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 1f394f44838f..875076b0ea6c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -441,6 +441,7 @@ static int pmem_attach_disk(struct device *dev, } else if (pmem_should_map_pages(dev)) { pmem->pgmap.range.start = res->start; pmem->pgmap.range.end = res->end; + pmem->pgmap.nr_range = 1; pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 256850513813..9d53c16b7329 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -187,6 +187,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap = &p2p_pgmap->pgmap; pgmap->range.start = pci_resource_start(pdev, bar) + offset; pgmap->range.end = pgmap->range.start + size - 1; + pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; p2p_pgmap->provider = pdev; diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c index 091b8669eca3..8c512ea550bb 100644 --- a/drivers/xen/unpopulated-alloc.c +++ b/drivers/xen/unpopulated-alloc.c @@ -47,6 +47,7 @@ static int fill_list(unsigned int nr_pages) .start = res->start, .end = res->end, }; + pgmap->nr_range = 1; pgmap->owner = res; #ifdef CONFIG_XEN_HAVE_PVMMU diff --git a/include/linux/memremap.h b/include/linux/memremap.h index d0dd261d87c0..79c49e7f5c30 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -94,7 +94,6 @@ struct dev_pagemap_ops { /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations - * @range: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping * @internal_ref: internal reference if @ref is not provided by the caller * @done: completion for @internal_ref @@ -104,10 +103,12 @@ struct dev_pagemap_ops { * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no * foreign ZONE_DEVICE memory is accessed. + * @nr_range: number of ranges to be mapped + * @range: range to be mapped when nr_range == 1 + * @ranges: array of ranges to be mapped when nr_range > 1 */ struct dev_pagemap { struct vmem_altmap altmap; - struct range range; struct percpu_ref *ref; struct percpu_ref internal_ref; struct completion done; @@ -115,6 +116,11 @@ struct dev_pagemap { unsigned int flags; const struct dev_pagemap_ops *ops; void *owner; + int nr_range; + union { + struct range range; + struct range ranges[0]; + }; }; static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e97ca8ec0bce..c710b4c5714d 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -472,6 +472,7 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.range.start = res->start; devmem->pagemap.range.end = res->end; + devmem->pagemap.nr_range = 1; devmem->pagemap.ops = &dmirror_devmem_ops; devmem->pagemap.owner = mdevice; diff --git a/mm/memremap.c b/mm/memremap.c index d958d348b3ca..532ec3d36ab4 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -77,15 +77,19 @@ static void pgmap_array_delete(struct range *range) synchronize_rcu(); } -static unsigned long pfn_first(struct dev_pagemap *pgmap) +static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id) { - return PHYS_PFN(pgmap->range.start) + - vmem_altmap_offset(pgmap_altmap(pgmap)); + struct range *range = &pgmap->ranges[range_id]; + unsigned long pfn = PHYS_PFN(range->start); + + if (range_id) + return pfn; + return pfn + vmem_altmap_offset(pgmap_altmap(pgmap)); } -static unsigned long pfn_end(struct dev_pagemap *pgmap) +static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) { - const struct range *range = &pgmap->range; + const struct range *range = &pgmap->ranges[range_id]; return (range->start + range_len(range)) >> PAGE_SHIFT; } @@ -97,8 +101,8 @@ static unsigned long pfn_next(unsigned long pfn) return pfn + 1; } -#define for_each_device_pfn(pfn, map) \ - for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +#define for_each_device_pfn(pfn, map, i) \ + for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) static void dev_pagemap_kill(struct dev_pagemap *pgmap) { @@ -124,20 +128,14 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) pgmap->ref = NULL; } -void memunmap_pages(struct dev_pagemap *pgmap) +static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { - struct range *range = &pgmap->range; + struct range *range = &pgmap->ranges[range_id]; struct page *first_page; - unsigned long pfn; int nid; - dev_pagemap_kill(pgmap); - for_each_device_pfn(pfn, pgmap) - put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); - /* make sure to access a memmap that was actually initialized */ - first_page = pfn_to_page(pfn_first(pgmap)); + first_page = pfn_to_page(pfn_first(pgmap, range_id)); /* pages are dead and unused, undo the arch mapping */ nid = page_to_nid(first_page); @@ -157,6 +155,22 @@ void memunmap_pages(struct dev_pagemap *pgmap) untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); pgmap_array_delete(range); +} + +void memunmap_pages(struct dev_pagemap *pgmap) +{ + unsigned long pfn; + int i; + + dev_pagemap_kill(pgmap); + for (i = 0; i < pgmap->nr_range; i++) + for_each_device_pfn(pfn, pgmap, i) + put_page(pfn_to_page(pfn)); + dev_pagemap_cleanup(pgmap); + + for (i = 0; i < pgmap->nr_range; i++) + pageunmap_range(pgmap, i); + WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); devmap_managed_enable_put(); } @@ -175,96 +189,29 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) complete(&pgmap->done); } -/* - * Not device managed version of dev_memremap_pages, undone by - * memunmap_pages(). Please use dev_memremap_pages if you have a struct - * device available. - */ -void *memremap_pages(struct dev_pagemap *pgmap, int nid) +static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, + int range_id, int nid) { - struct range *range = &pgmap->range; + struct range *range = &pgmap->ranges[range_id]; struct dev_pagemap *conflict_pgmap; - struct mhp_params params = { - /* - * We do not want any optional features only our own memmap - */ - .altmap = pgmap_altmap(pgmap), - .pgprot = PAGE_KERNEL, - }; int error, is_ram; - bool need_devmap_managed = true; - switch (pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { - WARN(1, "Device private memory not supported\n"); - return ERR_PTR(-EINVAL); - } - if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { - WARN(1, "Missing migrate_to_ram method\n"); - return ERR_PTR(-EINVAL); - } - if (!pgmap->owner) { - WARN(1, "Missing owner\n"); - return ERR_PTR(-EINVAL); - } - break; - case MEMORY_DEVICE_FS_DAX: - if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || - IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { - WARN(1, "File system DAX not supported\n"); - return ERR_PTR(-EINVAL); - } - break; - case MEMORY_DEVICE_GENERIC: - need_devmap_managed = false; - break; - case MEMORY_DEVICE_PCI_P2PDMA: - params.pgprot = pgprot_noncached(params.pgprot); - need_devmap_managed = false; - break; - default: - WARN(1, "Invalid pgmap type %d\n", pgmap->type); - break; - } - - if (!pgmap->ref) { - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) - return ERR_PTR(-EINVAL); - - init_completion(&pgmap->done); - error = percpu_ref_init(&pgmap->internal_ref, - dev_pagemap_percpu_release, 0, GFP_KERNEL); - if (error) - return ERR_PTR(error); - pgmap->ref = &pgmap->internal_ref; - } else { - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); - } - } - - if (need_devmap_managed) { - error = devmap_managed_enable_get(pgmap); - if (error) - return ERR_PTR(error); - } + if (WARN_ONCE(pgmap_altmap(pgmap) && range_id > 0, + "altmap not supported for multiple ranges\n")) + return -EINVAL; conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; + return -ENOMEM; } conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; + return -ENOMEM; } is_ram = region_intersects(range->start, range_len(range), @@ -274,19 +221,18 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n", is_ram == REGION_MIXED ? "mixed" : "ram", range->start, range->end); - error = -ENXIO; - goto err_array; + return -ENXIO; } error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end), pgmap, GFP_KERNEL)); if (error) - goto err_array; + return error; if (nid < 0) nid = numa_mem_id(); - error = track_pfn_remap(NULL, ¶ms.pgprot, PHYS_PFN(range->start), 0, + error = track_pfn_remap(NULL, ¶ms->pgprot, PHYS_PFN(range->start), 0, range_len(range)); if (error) goto err_pfn_remap; @@ -306,7 +252,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, PHYS_PFN(range->start), - PHYS_PFN(range_len(range)), ¶ms); + PHYS_PFN(range_len(range)), params); } else { error = kasan_add_zero_shadow(__va(range->start), range_len(range)); if (error) { @@ -315,7 +261,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } error = arch_add_memory(nid, range->start, range_len(range), - ¶ms); + params); } if (!error) { @@ -323,7 +269,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, PHYS_PFN(range->start), - PHYS_PFN(range_len(range)), params.altmap); + PHYS_PFN(range_len(range)), params->altmap); } mem_hotplug_done(); @@ -337,20 +283,116 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); - return __va(range->start); + percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id) + - pfn_first(pgmap, range_id)); + return 0; - err_add_memory: +err_add_memory: kasan_remove_zero_shadow(__va(range->start), range_len(range)); - err_kasan: +err_kasan: untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); - err_pfn_remap: +err_pfn_remap: pgmap_array_delete(range); - err_array: - dev_pagemap_kill(pgmap); - dev_pagemap_cleanup(pgmap); - devmap_managed_enable_put(); - return ERR_PTR(error); + return error; +} + + +/* + * Not device managed version of dev_memremap_pages, undone by + * memunmap_pages(). Please use dev_memremap_pages if you have a struct + * device available. + */ +void *memremap_pages(struct dev_pagemap *pgmap, int nid) +{ + struct mhp_params params = { + .altmap = pgmap_altmap(pgmap), + .pgprot = PAGE_KERNEL, + }; + const int nr_range = pgmap->nr_range; + bool need_devmap_managed = true; + int error, i; + + if (WARN_ONCE(!nr_range, "nr_range must be specified\n")) + return ERR_PTR(-EINVAL); + + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { + WARN(1, "Device private memory not supported\n"); + return ERR_PTR(-EINVAL); + } + if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { + WARN(1, "Missing migrate_to_ram method\n"); + return ERR_PTR(-EINVAL); + } + if (!pgmap->owner) { + WARN(1, "Missing owner\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_FS_DAX: + if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || + IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { + WARN(1, "File system DAX not supported\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_GENERIC: + need_devmap_managed = false; + break; + case MEMORY_DEVICE_PCI_P2PDMA: + params.pgprot = pgprot_noncached(params.pgprot); + need_devmap_managed = false; + break; + default: + WARN(1, "Invalid pgmap type %d\n", pgmap->type); + break; + } + + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } + } + + if (need_devmap_managed) { + error = devmap_managed_enable_get(pgmap); + if (error) + return ERR_PTR(error); + } + + /* + * Clear the pgmap nr_range as it will be incremented for each + * successfully processed range. This communicates how many + * regions to unwind in the abort case. + */ + pgmap->nr_range = 0; + error = 0; + for (i = 0; i < nr_range; i++) { + error = pagemap_range(pgmap, ¶ms, i, nid); + if (error) + break; + pgmap->nr_range++; + } + + if (i < nr_range) { + memunmap_pages(pgmap); + pgmap->nr_range = nr_range; + return ERR_PTR(error); + } + + return __va(pgmap->ranges[0].start); } EXPORT_SYMBOL_GPL(memremap_pages); -- cgit v1.2.3 From bac3cf4d01d43b587c873360dc8c84e3b570b344 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 13 Oct 2020 16:51:14 -0700 Subject: mm, dump_page: rename head_mapcount() --> head_compound_mapcount() Rename head_pincount() --> head_compound_pincount(). These names are more accurate (or less misleading) than the original ones. Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Cc: Qian Cai Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Kirill A. Shutemov Cc: Mike Rapoport Cc: William Kucharski Link: https://lkml.kernel.org/r/20200807183358.105097-1-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++---- mm/debug.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 13dc9b9ccf8e..9cc0894e7d61 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -791,7 +791,7 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); -static inline int head_mapcount(struct page *head) +static inline int head_compound_mapcount(struct page *head) { return atomic_read(compound_mapcount_ptr(head)) + 1; } @@ -805,7 +805,7 @@ static inline int compound_mapcount(struct page *page) { VM_BUG_ON_PAGE(!PageCompound(page), page); page = compound_head(page); - return head_mapcount(page); + return head_compound_mapcount(page); } /* @@ -918,7 +918,7 @@ static inline bool hpage_pincount_available(struct page *page) return PageCompound(page) && compound_order(page) > 1; } -static inline int head_pincount(struct page *head) +static inline int head_compound_pincount(struct page *head) { return atomic_read(compound_pincount_ptr(head)); } @@ -927,7 +927,7 @@ static inline int compound_pincount(struct page *page) { VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); page = compound_head(page); - return head_pincount(page); + return head_compound_pincount(page); } static inline void set_compound_order(struct page *page, unsigned int order) diff --git a/mm/debug.c b/mm/debug.c index 2a767865145c..ccca576b2899 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -102,12 +102,12 @@ void __dump_page(struct page *page, const char *reason) if (hpage_pincount_available(page)) { pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", head, compound_order(head), - head_mapcount(head), - head_pincount(head)); + head_compound_mapcount(head), + head_compound_pincount(head)); } else { pr_warn("head:%p order:%u compound_mapcount:%d\n", head, compound_order(head), - head_mapcount(head)); + head_compound_mapcount(head)); } } if (PageKsm(page)) -- cgit v1.2.3 From 61ef1865570452801f6e554a668e049c2e25c1fd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 13 Oct 2020 16:51:17 -0700 Subject: mm: factor find_get_incore_page out of mincore_page Patch series "Return head pages from find_*_entry", v2. This patch series started out as part of the THP patch set, but it has some nice effects along the way and it seems worth splitting it out and submitting separately. Currently find_get_entry() and find_lock_entry() return the page corresponding to the requested index, but the first thing most callers do is find the head page, which we just threw away. As part of auditing all the callers, I found some misuses of the APIs and some plain inefficiencies that I've fixed. The diffstat is unflattering, but I added more kernel-doc and a new wrapper. This patch (of 8); Provide this functionality from the swap cache. It's useful for more than just mincore(). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Hugh Dickins Cc: William Kucharski Cc: Jani Nikula Cc: Alexey Dobriyan Cc: Johannes Weiner Cc: Chris Wilson Cc: Matthew Auld Cc: Huang Ying Link: https://lkml.kernel.org/r/20200910183318.20139-1-willy@infradead.org Link: https://lkml.kernel.org/r/20200910183318.20139-2-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++++++ mm/mincore.c | 28 ++-------------------------- mm/swap_state.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4340a7b6e7a1..23c6e43a956d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -427,6 +427,7 @@ extern void free_pages_and_swap_cache(struct page **, int); extern struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr, bool do_poll); @@ -570,6 +571,12 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp, return NULL; } +static inline +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +{ + return find_get_page(mapping, index); +} + static inline int add_to_swap(struct page *page) { return 0; diff --git a/mm/mincore.c b/mm/mincore.c index 453ff112470f..02db1a834021 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -48,7 +48,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, * and is up to date; i.e. that no page-in operation would be required * at this time if an application were to map and access this page. */ -static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) +static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) { unsigned char present = 0; struct page *page; @@ -59,31 +59,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ -#ifdef CONFIG_SWAP - if (shmem_mapping(mapping)) { - page = find_get_entry(mapping, pgoff); - /* - * shmem/tmpfs may return swap: account for swapcache - * page too. - */ - if (xa_is_value(page)) { - swp_entry_t swp = radix_to_swp_entry(page); - struct swap_info_struct *si; - - /* Prevent swap device to being swapoff under us */ - si = get_swap_device(swp); - if (si) { - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); - put_swap_device(si); - } else - page = NULL; - } - } else - page = find_get_page(mapping, pgoff); -#else - page = find_get_page(mapping, pgoff); -#endif + page = find_get_incore_page(mapping, index); if (page) { present = PageUptodate(page); put_page(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index c16eebb81d8b..c79e2242dd04 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "internal.h" /* @@ -414,6 +415,37 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, return page; } +/** + * find_get_incore_page - Find and get a page from the page or swap caches. + * @mapping: The address_space to search. + * @index: The page cache index. + * + * This differs from find_get_page() in that it will also look for the + * page in the swap cache. + * + * Return: The found page or %NULL. + */ +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +{ + swp_entry_t swp; + struct swap_info_struct *si; + struct page *page = find_get_entry(mapping, index); + + if (!xa_is_value(page)) + return page; + if (!shmem_mapping(mapping)) + return NULL; + + swp = radix_to_swp_entry(page); + /* Prevent swapoff from happening to us */ + si = get_swap_device(swp); + if (!si) + return NULL; + page = find_get_page(swap_address_space(swp), swp_offset(swp)); + put_swap_device(si); + return page; +} + struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) -- cgit v1.2.3 From 9dfc8ff34b951f83632815a87e97a625a11360f0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 13 Oct 2020 16:51:31 -0700 Subject: i915: use find_lock_page instead of find_lock_entry i915 does not want to see value entries. Switch it to use find_lock_page() instead, and remove the export of find_lock_entry(). Move find_lock_entry() and find_get_entry() to mm/internal.h to discourage any future use. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Cc: Alexey Dobriyan Cc: Chris Wilson Cc: Huang Ying Cc: Hugh Dickins Cc: Jani Nikula Cc: Matthew Auld Cc: William Kucharski Link: https://lkml.kernel.org/r/20200910183318.20139-6-willy@infradead.org Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 4 ++-- include/linux/pagemap.h | 2 -- mm/filemap.c | 1 - mm/internal.h | 3 +++ 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 38113d3c0138..75e8b71c18b9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -258,8 +258,8 @@ shmem_writeback(struct drm_i915_gem_object *obj) for (i = 0; i < obj->base.size >> PAGE_SHIFT; i++) { struct page *page; - page = find_lock_entry(mapping, i); - if (!page || xa_is_value(page)) + page = find_lock_page(mapping, i); + if (!page) continue; if (!page_mapped(page) && clear_page_dirty_for_io(page)) { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 434c9c34aeb6..9d282fe6700d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -385,8 +385,6 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) return head + (index & (thp_nr_pages(head) - 1)); } -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices); diff --git a/mm/filemap.c b/mm/filemap.c index 748b7b1b4f6d..089a2f02067f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1726,7 +1726,6 @@ repeat: } return page; } -EXPORT_SYMBOL(find_lock_entry); /** * pagecache_get_page - Find and get a reference to a page. diff --git a/mm/internal.h b/mm/internal.h index 10c677655912..a801a4d51f26 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -65,6 +65,9 @@ static inline void ra_submit(struct file_ra_state *ra, ra->start, ra->size, ra->async_size); } +struct page *find_get_entry(struct address_space *mapping, pgoff_t index); +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index); + /** * page_evictable - test whether a page is evictable * @page: the page to test -- cgit v1.2.3 From 63ec1973ddf3eb70feb5728088ca190f1af449cb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 13 Oct 2020 16:51:38 -0700 Subject: mm/shmem: return head page from find_lock_entry Convert shmem_getpage_gfp() (the only remaining caller of find_lock_entry()) to cope with a head page being returned instead of the subpage for the index. [willy@infradead.org: fix BUG()s] Link https://lore.kernel.org/linux-mm/20200912032042.GA6583@casper.infradead.org/ Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Alexey Dobriyan Cc: Chris Wilson Cc: Huang Ying Cc: Hugh Dickins Cc: Jani Nikula Cc: Johannes Weiner Cc: Matthew Auld Cc: William Kucharski Link: https://lkml.kernel.org/r/20200910183318.20139-8-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 9 +++++++++ mm/filemap.c | 25 +++++++++++-------------- mm/shmem.c | 19 ++++++++++--------- 3 files changed, 30 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 9d282fe6700d..5176009e4ffa 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -372,6 +372,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } +/* Does this page contain this index? */ +static inline bool thp_contains(struct page *head, pgoff_t index) +{ + /* HugeTLBfs indexes the page cache in units of hpage_size */ + if (PageHuge(head)) + return head->index == index; + return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL)); +} + /* * Given the page we found in the page cache, return the page corresponding * to this index in the file diff --git a/mm/filemap.c b/mm/filemap.c index 9457d4f8b1c1..b44346a748fa 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1692,37 +1692,34 @@ out: } /** - * find_lock_entry - locate, pin and lock a page cache entry - * @mapping: the address_space to search - * @offset: the page cache index + * find_lock_entry - Locate and lock a page cache entry. + * @mapping: The address_space to search. + * @index: The page cache index. * - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned locked and with an increased - * refcount. + * Looks up the page at @mapping & @index. If there is a page in the + * cache, the head page is returned locked and with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * find_lock_entry() may sleep. - * - * Return: the found page or shadow entry, %NULL if nothing is found. + * Context: May sleep. + * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) { struct page *page; repeat: - page = find_get_entry(mapping, offset); + page = find_get_entry(mapping, index); if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? */ - if (unlikely(page_mapping(page) != mapping)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } - page = find_subpage(page, offset); - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); + VM_BUG_ON_PAGE(!thp_contains(page, index), page); } return page; } diff --git a/mm/shmem.c b/mm/shmem.c index d42c27e4769f..6d4ddef4a24f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1830,6 +1830,8 @@ repeat: return error; } + if (page) + hindex = page->index; if (page && sgp == SGP_WRITE) mark_page_accessed(page); @@ -1840,11 +1842,10 @@ repeat: unlock_page(page); put_page(page); page = NULL; + hindex = index; } - if (page || sgp == SGP_READ) { - *pagep = page; - return 0; - } + if (page || sgp == SGP_READ) + goto out; /* * Fast cache lookup did not find it: @@ -1969,14 +1970,13 @@ clear: * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !PageUptodate(page)) { - struct page *head = compound_head(page); int i; - for (i = 0; i < compound_nr(head); i++) { - clear_highpage(head + i); - flush_dcache_page(head + i); + for (i = 0; i < compound_nr(page); i++) { + clear_highpage(page + i); + flush_dcache_page(page + i); } - SetPageUptodate(head); + SetPageUptodate(page); } /* Perhaps the file has been truncated since we checked */ @@ -1992,6 +1992,7 @@ clear: error = -EINVAL; goto unlock; } +out: *pagep = page + index - hindex; return 0; -- cgit v1.2.3 From a8cf7f272b5a28a62ecfc39d6f7d75b4f486e350 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 13 Oct 2020 16:51:41 -0700 Subject: mm: add find_lock_head Add a new FGP_HEAD flag which avoids calling find_subpage() and add a convenience wrapper for it. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Alexey Dobriyan Cc: Chris Wilson Cc: Huang Ying Cc: Hugh Dickins Cc: Jani Nikula Cc: Johannes Weiner Cc: Matthew Auld Cc: William Kucharski Link: https://lkml.kernel.org/r/20200910183318.20139-9-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 32 ++++++++++++++++++++++++++------ mm/filemap.c | 9 ++++++--- 2 files changed, 32 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 5176009e4ffa..1a3554f5d992 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -279,6 +279,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 +#define FGP_HEAD 0x00000080 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t cache_gfp_mask); @@ -310,18 +311,37 @@ static inline struct page *find_get_page_flags(struct address_space *mapping, * @mapping: the address_space to search * @offset: the page index * - * Looks up the page cache slot at @mapping & @offset. If there is a + * Looks up the page cache entry at @mapping & @offset. If there is a * page cache page, it is returned locked and with an increased * refcount. * - * Otherwise, %NULL is returned. - * - * find_lock_page() may sleep. + * Context: May sleep. + * Return: A struct page or %NULL if there is no page in the cache for this + * index. */ static inline struct page *find_lock_page(struct address_space *mapping, - pgoff_t offset) + pgoff_t index) +{ + return pagecache_get_page(mapping, index, FGP_LOCK, 0); +} + +/** + * find_lock_head - Locate, pin and lock a pagecache page. + * @mapping: The address_space to search. + * @offset: The page index. + * + * Looks up the page cache entry at @mapping & @offset. If there is a + * page cache page, its head page is returned locked and with an increased + * refcount. + * + * Context: May sleep. + * Return: A struct page which is !PageTail, or %NULL if there is no page + * in the cache for this index. + */ +static inline struct page *find_lock_head(struct address_space *mapping, + pgoff_t index) { - return pagecache_get_page(mapping, offset, FGP_LOCK, 0); + return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0); } /** diff --git a/mm/filemap.c b/mm/filemap.c index b44346a748fa..63d2fed539d7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1737,6 +1737,8 @@ repeat: * * * %FGP_ACCESSED - The page will be marked accessed. * * %FGP_LOCK - The page is returned locked. + * * %FGP_HEAD - If the page is present and a THP, return the head page + * rather than the exact page specified by the index. * * %FGP_CREAT - If no page is present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. @@ -1765,7 +1767,6 @@ repeat: page = NULL; if (!page) goto no_page; - page = find_subpage(page, index); if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { @@ -1778,12 +1779,12 @@ repeat: } /* Has the page been truncated? */ - if (unlikely(compound_head(page)->mapping != mapping)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page->index != index, page); + VM_BUG_ON_PAGE(!thp_contains(page, index), page); } if (fgp_flags & FGP_ACCESSED) @@ -1793,6 +1794,8 @@ repeat: if (page_is_idle(page)) clear_page_idle(page); } + if (!(fgp_flags & FGP_HEAD)) + page = find_subpage(page, index); no_page: if (!page && (fgp_flags & FGP_CREAT)) { -- cgit v1.2.3 From eb1d7a65f08a52dfb828bf45b4ead7f617c64047 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 13 Oct 2020 16:51:47 -0700 Subject: mm, fadvise: improve the expensive remote LRU cache draining after FADV_DONTNEED Our users reported that there're some random latency spikes when their RT process is running. Finally we found that latency spike is caused by FADV_DONTNEED. Which may call lru_add_drain_all() to drain LRU cache on remote CPUs, and then waits the per-cpu work to complete. The wait time is uncertain, which may be tens millisecond. That behavior is unreasonable, because this process is bound to a specific CPU and the file is only accessed by itself, IOW, there should be no pagecache pages on a per-cpu pagevec of a remote CPU. That unreasonable behavior is partially caused by the wrong comparation of the number of invalidated pages and the number of the target. For example, if (count < (end_index - start_index + 1)) The count above is how many pages were invalidated in the local CPU, and (end_index - start_index + 1) is how many pages should be invalidated. The usage of (end_index - start_index + 1) is incorrect, because they are virtual addresses, which may not mapped to pages. Besides that, there may be holes between start and end. So we'd better check whether there are still pages on per-cpu pagevec after drain the local cpu, and then decide whether or not to call lru_add_drain_all(). After I applied it with a hotfix to our production environment, most of the lru_add_drain_all() can be avoided. Suggested-by: Mel Gorman Signed-off-by: Yafang Shao Signed-off-by: Andrew Morton Acked-by: Mel Gorman Cc: Johannes Weiner Link: https://lkml.kernel.org/r/20200923133318.14373-1-laoar.shao@gmail.com Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ++++ mm/fadvise.c | 9 +++++---- mm/truncate.c | 58 +++++++++++++++++++++++++++++++++++++----------------- 3 files changed, 49 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2e621d28cd65..5815f7d4dbf4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2581,6 +2581,10 @@ extern bool is_bad_inode(struct inode *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); +void invalidate_mapping_pagevec(struct address_space *mapping, + pgoff_t start, pgoff_t end, + unsigned long *nr_pagevec); + static inline void invalidate_remote_inode(struct inode *inode) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || diff --git a/mm/fadvise.c b/mm/fadvise.c index 0e66f2aaeea3..d6baa4f451c5 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -141,7 +141,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) } if (end_index >= start_index) { - unsigned long count; + unsigned long nr_pagevec = 0; /* * It's common to FADV_DONTNEED right after @@ -154,8 +154,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) */ lru_add_drain(); - count = invalidate_mapping_pages(mapping, - start_index, end_index); + invalidate_mapping_pagevec(mapping, + start_index, end_index, + &nr_pagevec); /* * If fewer pages were invalidated than expected then @@ -163,7 +164,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) * a per-cpu pagevec for a remote CPU. Drain all * pagevecs and try again. */ - if (count < (end_index - start_index + 1)) { + if (nr_pagevec) { lru_add_drain_all(); invalidate_mapping_pages(mapping, start_index, end_index); diff --git a/mm/truncate.c b/mm/truncate.c index dd9ebc1da356..6bbe0f0b3ce9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -528,23 +528,8 @@ void truncate_inode_pages_final(struct address_space *mapping) } EXPORT_SYMBOL(truncate_inode_pages_final); -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. - * - * Return: the number of the pages that were invalidated - */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) +unsigned long __invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; @@ -610,8 +595,13 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, * Invalidation is a hint that the page is no longer * of interest and try to speed up its reclaim. */ - if (!ret) + if (!ret) { deactivate_file_page(page); + /* It is likely on the pagevec of a remote CPU */ + if (nr_pagevec) + (*nr_pagevec)++; + } + if (PageTransHuge(page)) put_page(page); count += ret; @@ -623,8 +613,40 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } return count; } + +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + * + * Return: the number of the pages that were invalidated + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + return __invalidate_mapping_pages(mapping, start, end, NULL); +} EXPORT_SYMBOL(invalidate_mapping_pages); +/** + * This helper is similar with the above one, except that it accounts for pages + * that are likely on a pagevec and count them in @nr_pagevec, which will used by + * the caller. + */ +void invalidate_mapping_pagevec(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) +{ + __invalidate_mapping_pages(mapping, start, end, nr_pagevec); +} + /* * This is like invalidate_complete_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger -- cgit v1.2.3 From 3264631548b1f2bf89b71793d06bfd0f748f649d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 13 Oct 2020 16:52:04 -0700 Subject: swap: rename SWP_FS to SWAP_FS_OPS to avoid ambiguity SWP_FS is used to make swap_{read,write}page() go through the filesystem, and it's only used for swap files over NFS for now. Otherwise it will directly submit IO to blockdev according to swapfile extents reported by filesystems in advance. As Matthew pointed out [1], SWP_FS naming is somewhat confusing, so let's rename to SWP_FS_OPS. [1] https://lore.kernel.org/r/20200820113448.GM17456@casper.infradead.org Suggested-by: Matthew Wilcox Signed-off-by: Gao Xiang Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200822113019.11319-1-hsiangkao@redhat.com Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/page_io.c | 6 +++--- mm/swap_state.c | 2 +- mm/swapfile.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 23c6e43a956d..7bd5b4aac049 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -170,7 +170,7 @@ enum { SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ - SWP_FS = (1 << 8), /* swap file goes through fs */ + SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ diff --git a/mm/page_io.c b/mm/page_io.c index f9e9267f296f..2ffe4c4a6d97 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -312,7 +312,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (data_race(sis->flags & SWP_FS)) { + if (data_race(sis->flags & SWP_FS_OPS)) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -403,7 +403,7 @@ int swap_readpage(struct page *page, bool synchronous) goto out; } - if (data_race(sis->flags & SWP_FS)) { + if (data_race(sis->flags & SWP_FS_OPS)) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -467,7 +467,7 @@ int swap_set_page_dirty(struct page *page) { struct swap_info_struct *sis = page_swap_info(page); - if (data_race(sis->flags & SWP_FS)) { + if (data_race(sis->flags & SWP_FS_OPS)) { struct address_space *mapping = sis->swap_file->f_mapping; VM_BUG_ON_PAGE(!PageSwapCache(page), page); diff --git a/mm/swap_state.c b/mm/swap_state.c index f24f2cea4238..aa40e706604c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -665,7 +665,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, goto skip; /* Test swap type to make sure the dereference is safe */ - if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) { + if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) { struct inode *inode = si->swap_file->f_mapping->host; if (inode_read_congested(inode)) goto skip; diff --git a/mm/swapfile.c b/mm/swapfile.c index ced4635d924c..183b87bc87cc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2437,7 +2437,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (ret >= 0) sis->flags |= SWP_ACTIVATED; if (!ret) { - sis->flags |= SWP_FS; + sis->flags |= SWP_FS_OPS; ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; } -- cgit v1.2.3 From cc2828b21c764f901128ca2e7b9f056d0e72104f Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Tue, 13 Oct 2020 16:52:08 -0700 Subject: mm: remove activate_page() from unuse_pte() We don't initially add anon pages to active lruvec after commit b518154e59aa ("mm/vmscan: protect the workingset on anonymous LRU"). Remove activate_page() from unuse_pte(), which seems to be missed by the commit. And make the function static while we are at it. Before the commit, we called lru_cache_add_active_or_unevictable() to add new ksm pages to active lruvec. Therefore, activate_page() wasn't necessary for them in the first place. Signed-off-by: Yu Zhao Signed-off-by: Andrew Morton Reviewed-by: Yang Shi Cc: Alexander Duyck Cc: Huang Ying Cc: David Hildenbrand Cc: Michal Hocko Cc: Qian Cai Cc: Mel Gorman Cc: Nicholas Piggin Cc: Hugh Dickins Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200818184704.3625199-1-yuzhao@google.com Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - mm/swap.c | 4 ++-- mm/swapfile.c | 5 ----- 3 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7bd5b4aac049..667935c0dbd4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -340,7 +340,6 @@ extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); -extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); diff --git a/mm/swap.c b/mm/swap.c index 65ef7e3525bf..82ddefda4904 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -348,7 +348,7 @@ static bool need_activate_page_drain(int cpu) return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } -void activate_page(struct page *page) +static void activate_page(struct page *page) { page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -368,7 +368,7 @@ static inline void activate_page_drain(int cpu) { } -void activate_page(struct page *page) +static void activate_page(struct page *page) { pg_data_t *pgdat = page_pgdat(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 183b87bc87cc..d5c19d0baf06 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1929,11 +1929,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, lru_cache_add_inactive_or_unevictable(page, vma); } swap_free(entry); - /* - * Move the page to the active list so it is not - * immediately swapped out again after swapon. - */ - activate_page(page); out: pte_unmap_unlock(pte, ptl); if (page != swapcache) { -- cgit v1.2.3 From f3bc52cb04bcfccd12da3f03ca8bc50484898436 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 13 Oct 2020 16:52:18 -0700 Subject: mm/swap_slots.c: remove always zero and unused return value of enable_swap_slots_cache() enable_swap_slots_cache() always return zero and its return value is just ignored by the caller. So make enable_swap_slots_cache() void. Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200924113554.50614-1-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- include/linux/swap_slots.h | 2 +- mm/swap_slots.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index e36b200c2a77..347f1a304190 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -23,7 +23,7 @@ struct swap_slots_cache { void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); -int enable_swap_slots_cache(void); +void enable_swap_slots_cache(void); int free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 3e6453573a89..0357fbe70645 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -237,7 +237,7 @@ static int free_slot_cache(unsigned int cpu) return 0; } -int enable_swap_slots_cache(void) +void enable_swap_slots_cache(void) { mutex_lock(&swap_slots_cache_enable_mutex); if (!swap_slot_cache_initialized) { @@ -255,7 +255,6 @@ int enable_swap_slots_cache(void) __reenable_swap_slots_cache(); out_unlock: mutex_unlock(&swap_slots_cache_enable_mutex); - return 0; } /* called with swap slot cache's alloc lock held */ -- cgit v1.2.3 From bd0b230fe14554bfffbae54e19038716f96f5a41 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Oct 2020 16:52:56 -0700 Subject: mm/memcg: unify swap and memsw page counters The swap page counter is v2 only while memsw is v1 only. As v1 and v2 controllers cannot be active at the same time, there is no point to keep both swap and memsw page counters in mem_cgroup. The previous patch has made sure that memsw page counter is updated and accessed only when in v1 code paths. So it is now safe to alias the v1 memsw page counter to v2 swap page counter. This saves 14 long's in the size of mem_cgroup. This is a saving of 112 bytes for 64-bit archs. While at it, also document which page counters are used in v1 and/or v2. Signed-off-by: Waiman Long Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Chris Down Cc: Johannes Weiner Cc: Roman Gushchin Cc: Tejun Heo Cc: Vladimir Davydov Cc: Yafang Shao Link: https://lkml.kernel.org/r/20200914024452.19167-4-longman@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 13 ++++++++----- mm/memcontrol.c | 3 --- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d0b036123c6a..6ef4a552e09d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -215,13 +215,16 @@ struct mem_cgroup { struct mem_cgroup_id id; /* Accounted resources */ - struct page_counter memory; - struct page_counter swap; + struct page_counter memory; /* Both v1 & v2 */ + + union { + struct page_counter swap; /* v2 only */ + struct page_counter memsw; /* v1 only */ + }; /* Legacy consumer-oriented counters */ - struct page_counter memsw; - struct page_counter kmem; - struct page_counter tcpmem; + struct page_counter kmem; /* v1 only */ + struct page_counter tcpmem; /* v1 only */ /* Range enforcement for interrupt charges */ struct work_struct high_work; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 962f8d649b83..a0bfc92804b7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5295,13 +5295,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); - page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); - page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); /* @@ -5430,7 +5428,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); - page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); page_counter_set_min(&memcg->memory, 0); -- cgit v1.2.3 From b2b29d6d01194404dfef4eafa026959be301705b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 13 Oct 2020 16:53:22 -0700 Subject: mm: account PMD tables like PTE tables We account the PTE level of the page tables to the process in order to make smarter OOM decisions and help diagnose why memory is fragmented. For these same reasons, we should account pages allocated for PMDs. With larger process address spaces and ASLR, the number of PMDs in use is higher than it used to be so the inaccuracy is starting to matter. [rppt@linux.ibm.com: arm: __pmd_free_tlb(): call page table destructor] Link: https://lkml.kernel.org/r/20200825111303.GB69694@linux.ibm.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Mike Rapoport Cc: Abdul Haleem Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Joerg Roedel Cc: Max Filippov Cc: Peter Zijlstra Cc: Satheesh Rajendran Cc: Stafford Horne Cc: Naresh Kamboju Cc: Anders Roxell Link: http://lkml.kernel.org/r/20200627184642.GF25039@casper.infradead.org Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 1 + include/linux/mm.h | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 9415222b49ad..b8cbe03ad260 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -59,6 +59,7 @@ __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) #ifdef CONFIG_ARM_LPAE struct page *page = virt_to_page(pmdp); + pgtable_pmd_page_dtor(page); tlb_remove_table(tlb, page); #endif } diff --git a/include/linux/mm.h b/include/linux/mm.h index 9cc0894e7d61..5320e7ab843f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2254,7 +2254,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(pmd_to_page(pmd)); } -static inline bool pgtable_pmd_page_ctor(struct page *page) +static inline bool pmd_ptlock_init(struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE page->pmd_huge_pte = NULL; @@ -2262,7 +2262,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page) return ptlock_init(page); } -static inline void pgtable_pmd_page_dtor(struct page *page) +static inline void pmd_ptlock_free(struct page *page) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON_PAGE(page->pmd_huge_pte, page); @@ -2279,8 +2279,8 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } -static inline bool pgtable_pmd_page_ctor(struct page *page) { return true; } -static inline void pgtable_pmd_page_dtor(struct page *page) {} +static inline bool pmd_ptlock_init(struct page *page) { return true; } +static inline void pmd_ptlock_free(struct page *page) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -2293,6 +2293,22 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } +static inline bool pgtable_pmd_page_ctor(struct page *page) +{ + if (!pmd_ptlock_init(page)) + return false; + __SetPageTable(page); + inc_zone_page_state(page, NR_PAGETABLE); + return true; +} + +static inline void pgtable_pmd_page_dtor(struct page *page) +{ + pmd_ptlock_free(page); + __ClearPageTable(page); + dec_zone_page_state(page, NR_PAGETABLE); +} + /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be -- cgit v1.2.3 From 07e5bfe651f8595ca6a777d989016aa8d8217924 Mon Sep 17 00:00:00 2001 From: Chinwen Chang Date: Tue, 13 Oct 2020 16:53:39 -0700 Subject: mmap locking API: add mmap_lock_is_contended() Patch series "Try to release mmap_lock temporarily in smaps_rollup", v4. Recently, we have observed some janky issues caused by unpleasantly long contention on mmap_lock which is held by smaps_rollup when probing large processes. To address the problem, we let smaps_rollup detect if anyone wants to acquire mmap_lock for write attempts. If yes, just release the lock temporarily to ease the contention. smaps_rollup is a procfs interface which allows users to summarize the process's memory usage without the overhead of seq_* calls. Android uses it to sample the memory usage of various processes to balance its memory pool sizes. If no one wants to take the lock for write requests, smaps_rollup with this patch will behave like the original one. Although there are on-going mmap_lock optimizations like range-based locks, the lock applied to smaps_rollup would be the coarse one, which is hard to avoid the occurrence of aforementioned issues. So the detection and temporary release for write attempts on mmap_lock in smaps_rollup is still necessary. This patch (of 3): Add new API to query if someone wants to acquire mmap_lock for write attempts. Using this instead of rwsem_is_contended makes it more tolerant of future changes to the lock type. Signed-off-by: Chinwen Chang Signed-off-by: Andrew Morton Reviewed-by: Steven Price Acked-by: Michel Lespinasse Cc: Alexey Dobriyan Cc: Daniel Jordan Cc: Daniel Kiss Cc: Davidlohr Bueso Cc: Huang Ying Cc: Jason Gunthorpe Cc: Jimmy Assarsson Cc: Laurent Dufour Cc: "Matthew Wilcox (Oracle)" Cc: Matthias Brugger Cc: Song Liu Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/1597715898-3854-1-git-send-email-chinwen.chang@mediatek.com Link: http://lkml.kernel.org/r/1597715898-3854-2-git-send-email-chinwen.chang@mediatek.com Signed-off-by: Linus Torvalds --- include/linux/mmap_lock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 0707671851a8..18e7eae9b5ba 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -87,4 +87,9 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); } +static inline int mmap_lock_is_contended(struct mm_struct *mm) +{ + return rwsem_is_contended(&mm->mmap_lock); +} + #endif /* _LINUX_MMAP_LOCK_H */ -- cgit v1.2.3 From e18c45ffcfa347b13c2f300f290bacff55a4b41e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 13 Oct 2020 16:53:50 -0700 Subject: mm: move PageDoubleMap bit Patch series "Fix PageDoubleMap". This is a purely theoretical problem for now as none of the filesystems which use PG_private_2 (ie PG_fscache) are being converted at this time, but it's confusing to leave it like this. This patch (of 2): PG_private_2 is defined as being PF_ANY (applicable to tail pages as well as regular & head pages). That means that the first tail page of a double-map page will appear to have Private2 set. Use the Workingset bit instead which is defined as PF_HEAD so any attempt to access the Workingset bit on a tail page will redirect to the head page's Workingset bit. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Zi Yan Link: https://lkml.kernel.org/r/20200629151933.15671-1-willy@infradead.org Link: https://lkml.kernel.org/r/20200629151933.15671-2-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 276140c94f4a..76413b2ffef0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -167,7 +167,7 @@ enum pageflags { PG_slob_free = PG_private, /* Compound pages. Stored in first tail page's flags */ - PG_double_map = PG_private_2, + PG_double_map = PG_workingset, /* non-lru isolated movable page */ PG_isolated = PG_reclaim, -- cgit v1.2.3 From a08d93e5752a35a771054f6c463f789720f9a3e8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 13 Oct 2020 16:53:54 -0700 Subject: mm: simplify PageDoubleMap with PF_SECOND policy Introduce the new page policy of PF_SECOND which lets us use the normal pageflags generation machinery to create the various DoubleMap manipulation functions. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Zi Yan Link: https://lkml.kernel.org/r/20200629151933.15671-3-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 76413b2ffef0..38ded408bd4c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -235,6 +235,9 @@ static inline void page_init_poison(struct page *page, size_t size) * * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. + * + * PF_SECOND: + * the page flag is stored in the first tail page. */ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ @@ -250,6 +253,9 @@ static inline void page_init_poison(struct page *page, size_t size) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ PF_POISONED_CHECK(page); }) +#define PF_SECOND(page, enforce) ({ \ + VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ + PF_POISONED_CHECK(&page[1]); }) /* * Macros to create function definitions for page flags @@ -688,42 +694,15 @@ static inline int PageTransTail(struct page *page) * * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). */ -static inline int PageDoubleMap(struct page *page) -{ - return PageHead(page) && test_bit(PG_double_map, &page[1].flags); -} - -static inline void SetPageDoubleMap(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - set_bit(PG_double_map, &page[1].flags); -} - -static inline void ClearPageDoubleMap(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - clear_bit(PG_double_map, &page[1].flags); -} -static inline int TestSetPageDoubleMap(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - return test_and_set_bit(PG_double_map, &page[1].flags); -} - -static inline int TestClearPageDoubleMap(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - return test_and_clear_bit(PG_double_map, &page[1].flags); -} - +PAGEFLAG(DoubleMap, double_map, PF_SECOND) + TESTSCFLAG(DoubleMap, double_map, PF_SECOND) #else TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) TESTPAGEFLAG_FALSE(TransCompoundMap) TESTPAGEFLAG_FALSE(TransTail) PAGEFLAG_FALSE(DoubleMap) - TESTSETFLAG_FALSE(DoubleMap) - TESTCLEARFLAG_FALSE(DoubleMap) + TESTSCFLAG_FALSE(DoubleMap) #endif /* @@ -888,6 +867,7 @@ static inline int page_has_private(struct page *page) #undef PF_ONLY_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND +#undef PF_SECOND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ -- cgit v1.2.3 From c78f463649d60f4ed12df97a32def4d77b583853 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 13 Oct 2020 16:54:21 -0700 Subject: mm: remove src/dst mm parameter in copy_page_range() Both of the mm pointers are not needed after commit 7a4830c380f3 ("mm/fork: Pass new vma pointer into copy_page_range()"). Jason Gunthorpe also reported that the ordering of copy_page_range() is odd. Since working at it, reorder the parameters to be logical, by (1) always put the dst_* fields to be before src_* fields, and (2) keep the same type of parameters together. [peterx@redhat.com: further reorder some parameters and line format, per Jason] Link: https://lkml.kernel.org/r/20201002192647.7161-1-peterx@redhat.com [peterx@redhat.com: fix warnings] Link: https://lkml.kernel.org/r/20201006200138.GA6026@xz-x1 Reported-by: Kirill A. Shutemov Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jason Gunthorpe Acked-by: Kirill A. Shutemov Link: https://lkml.kernel.org/r/20200930204950.6668-1-peterx@redhat.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 +- kernel/fork.c | 2 +- mm/memory.c | 141 +++++++++++++++++++++++++++-------------------------- 3 files changed, 76 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5320e7ab843f..620961e4f32b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1653,8 +1653,8 @@ struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma, struct vm_area_struct *new); +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); diff --git a/kernel/fork.c b/kernel/fork.c index 2dcb19a30650..ede26e5a6097 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -590,7 +590,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt, tmp); + retval = copy_page_range(tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/memory.c b/mm/memory.c index 5a1c4c9759dc..f482af8bc828 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -794,15 +794,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * lock. */ static inline int -copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, - struct vm_area_struct *vma, struct vm_area_struct *new, - unsigned long addr, int *rss, struct page **prealloc, - pte_t pte, struct page *page) +copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, + struct page **prealloc, pte_t pte, struct page *page) { + struct mm_struct *src_mm = src_vma->vm_mm; struct page *new_page; - if (!is_cow_mapping(vma->vm_flags)) + if (!is_cow_mapping(src_vma->vm_flags)) return 1; /* @@ -832,16 +831,16 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, * over and copy the page & arm it. */ *prealloc = NULL; - copy_user_highpage(new_page, page, addr, vma); + copy_user_highpage(new_page, page, addr, src_vma); __SetPageUptodate(new_page); - page_add_new_anon_rmap(new_page, new, addr, false); - lru_cache_add_inactive_or_unevictable(new_page, new); + page_add_new_anon_rmap(new_page, dst_vma, addr, false); + lru_cache_add_inactive_or_unevictable(new_page, dst_vma); rss[mm_counter(new_page)]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(new_page, new->vm_page_prot); - pte = maybe_mkwrite(pte_mkdirty(pte), new); - set_pte_at(dst_mm, addr, dst_pte, pte); + pte = mk_pte(new_page, dst_vma->vm_page_prot); + pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -850,24 +849,21 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, * is required to copy this pte. */ static inline int -copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, int *rss, struct page **prealloc) +copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, + struct page **prealloc) { - unsigned long vm_flags = vma->vm_flags; + struct mm_struct *src_mm = src_vma->vm_mm; + unsigned long vm_flags = src_vma->vm_flags; pte_t pte = *src_pte; struct page *page; - page = vm_normal_page(vma, addr, pte); + page = vm_normal_page(src_vma, addr, pte); if (page) { int retval; - retval = copy_present_page(dst_mm, src_mm, - dst_pte, src_pte, - vma, new, - addr, rss, prealloc, - pte, page); + retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, pte, page); if (retval <= 0) return retval; @@ -901,7 +897,7 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!(vm_flags & VM_UFFD_WP)) pte = pte_clear_uffd_wp(pte); - set_pte_at(dst_mm, addr, dst_pte, pte); + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -924,11 +920,13 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, return new_page; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static int +copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; @@ -971,15 +969,15 @@ again: if (unlikely(!pte_present(*src_pte))) { entry.val = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, addr, rss); + src_vma, addr, rss); if (entry.val) break; progress += 8; continue; } /* copy_present_pte() will clear `*prealloc' if consumed */ - ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, new, addr, rss, &prealloc); + ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. @@ -1014,7 +1012,7 @@ again: entry.val = 0; } else if (ret) { WARN_ON_ONCE(ret != -EAGAIN); - prealloc = page_copy_prealloc(src_mm, vma, addr); + prealloc = page_copy_prealloc(src_mm, src_vma, addr); if (!prealloc) return -ENOMEM; /* We've captured and resolved the error. Reset, try again. */ @@ -1028,11 +1026,13 @@ out: return ret; } -static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static inline int +copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pmd_t *src_pmd, *dst_pmd; unsigned long next; @@ -1045,9 +1045,9 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; - VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, - dst_pmd, src_pmd, addr, vma); + dst_pmd, src_pmd, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) @@ -1056,18 +1056,20 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src } if (pmd_none_or_clear_bad(src_pmd)) continue; - if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, new, addr, next)) + if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, + addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } -static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static inline int +copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pud_t *src_pud, *dst_pud; unsigned long next; @@ -1080,9 +1082,9 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { int err; - VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma); + VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); err = copy_huge_pud(dst_mm, src_mm, - dst_pud, src_pud, addr, vma); + dst_pud, src_pud, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) @@ -1091,18 +1093,19 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src } if (pud_none_or_clear_bad(src_pud)) continue; - if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, new, addr, next)) + if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, + addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } -static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, - struct vm_area_struct *new, - unsigned long addr, unsigned long end) +static inline int +copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; p4d_t *src_p4d, *dst_p4d; unsigned long next; @@ -1114,20 +1117,22 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; - if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, - vma, new, addr, next)) + if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, + addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } -int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma, struct vm_area_struct *new) +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; unsigned long next; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; + unsigned long addr = src_vma->vm_start; + unsigned long end = src_vma->vm_end; + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; bool is_cow; int ret; @@ -1138,19 +1143,19 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && - !vma->anon_vma) + if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && + !src_vma->anon_vma) return 0; - if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst_mm, src_mm, vma); + if (is_vm_hugetlb_page(src_vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, src_vma); - if (unlikely(vma->vm_flags & VM_PFNMAP)) { + if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ - ret = track_pfn_copy(vma); + ret = track_pfn_copy(src_vma); if (ret) return ret; } @@ -1161,11 +1166,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ - is_cow = is_cow_mapping(vma->vm_flags); + is_cow = is_cow_mapping(src_vma->vm_flags); if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, vma, src_mm, addr, end); + 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -1176,8 +1181,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, new, addr, next))) { + if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, + addr, next))) { ret = -ENOMEM; break; } -- cgit v1.2.3 From f577e143d85aa7ea3d9c62c607ad00fc46a5730c Mon Sep 17 00:00:00 2001 From: yuleixzhang Date: Tue, 13 Oct 2020 16:54:25 -0700 Subject: include/linux/huge_mm.h: remove mincore_huge_pmd declaration As mincore_huge_pmd() was dropped, remove the declaration from the header file. Signed-off-by: Yulei Zhang Signed-off-by: Andrew Morton Reviewed-by: Zi Yan Link: https://lkml.kernel.org/r/20200922083423.15074-1-yuleixzhang@tencent.com Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8a8bc46a2432..0365aa97f8e7 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -38,9 +38,6 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); -extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - unsigned char *vec); extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); -- cgit v1.2.3 From 393824f650fabf6ea32bb09bea7acbc3a062dac8 Mon Sep 17 00:00:00 2001 From: Patricia Alfonso Date: Tue, 13 Oct 2020 16:54:58 -0700 Subject: kasan/kunit: add KUnit Struct to Current Task Patch series "KASAN-KUnit Integration", v14. This patchset contains everything needed to integrate KASAN and KUnit. KUnit will be able to: (1) Fail tests when an unexpected KASAN error occurs (2) Pass tests when an expected KASAN error occurs Convert KASAN tests to KUnit with the exception of copy_user_test because KUnit is unable to test those. Add documentation on how to run the KASAN tests with KUnit and what to expect when running these tests. This patch (of 5): In order to integrate debugging tools like KASAN into the KUnit framework, add KUnit struct to the current task to keep track of the current KUnit test. Signed-off-by: Patricia Alfonso Signed-off-by: David Gow Signed-off-by: Andrew Morton Tested-by: Andrey Konovalov Reviewed-by: Brendan Higgins Cc: Brendan Higgins Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Shuah Khan Link: https://lkml.kernel.org/r/20200915035828.570483-1-davidgow@google.com Link: https://lkml.kernel.org/r/20200915035828.570483-2-davidgow@google.com Link: https://lkml.kernel.org/r/20200910070331.3358048-1-davidgow@google.com Link: https://lkml.kernel.org/r/20200910070331.3358048-2-davidgow@google.com Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 829b0697d19c..9030f3abd969 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1208,6 +1208,10 @@ struct task_struct { #endif #endif +#if IS_ENABLED(CONFIG_KUNIT) + struct kunit *kunit_test; +#endif + #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; -- cgit v1.2.3 From 83c4e7a0363bdb8104f510370907161623e31086 Mon Sep 17 00:00:00 2001 From: Patricia Alfonso Date: Tue, 13 Oct 2020 16:55:02 -0700 Subject: KUnit: KASAN Integration Integrate KASAN into KUnit testing framework. - Fail tests when KASAN reports an error that is not expected - Use KUNIT_EXPECT_KASAN_FAIL to expect a KASAN error in KASAN tests - Expected KASAN reports pass tests and are still printed when run without kunit_tool (kunit_tool still bypasses the report due to the test passing) - KUnit struct in current task used to keep track of the current test from KASAN code Make use of "[PATCH v3 kunit-next 1/2] kunit: generalize kunit_resource API beyond allocated resources" and "[PATCH v3 kunit-next 2/2] kunit: add support for named resources" from Alan Maguire [1] - A named resource is added to a test when a KASAN report is expected - This resource contains a struct for kasan_data containing booleans representing if a KASAN report is expected and if a KASAN report is found [1] (https://lore.kernel.org/linux-kselftest/1583251361-12748-1-git-send-email-alan.maguire@oracle.com/T/#t) Signed-off-by: Patricia Alfonso Signed-off-by: David Gow Signed-off-by: Andrew Morton Tested-by: Andrey Konovalov Reviewed-by: Andrey Konovalov Reviewed-by: Dmitry Vyukov Acked-by: Brendan Higgins Cc: Andrey Ryabinin Cc: Ingo Molnar Cc: Juri Lelli Cc: Peter Zijlstra Cc: Shuah Khan Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20200915035828.570483-3-davidgow@google.com Link: https://lkml.kernel.org/r/20200910070331.3358048-3-davidgow@google.com Signed-off-by: Linus Torvalds --- include/kunit/test.h | 5 +++++ include/linux/kasan.h | 6 ++++++ lib/kunit/test.c | 13 ++++++++----- lib/test_kasan.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- mm/kasan/report.c | 32 ++++++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index 59f3144f009a..3391f38389f8 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -224,6 +224,11 @@ struct kunit { struct list_head resources; /* Protected by lock. */ }; +static inline void kunit_set_failure(struct kunit *test) +{ + WRITE_ONCE(test->success, false); +} + void kunit_init_test(struct kunit *test, const char *name, char *log); int kunit_run_tests(struct kunit_suite *suite); diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 087fba34b209..30d343b4a40a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -14,6 +14,12 @@ struct task_struct; #include #include +/* kasan_data struct is used in KUnit tests for KASAN expected failures */ +struct kunit_kasan_expectation { + bool report_expected; + bool report_found; +}; + extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE]; extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD]; diff --git a/lib/kunit/test.c b/lib/kunit/test.c index c36037200310..dcc35fd30d95 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -10,16 +10,12 @@ #include #include #include +#include #include "debugfs.h" #include "string-stream.h" #include "try-catch-impl.h" -static void kunit_set_failure(struct kunit *test) -{ - WRITE_ONCE(test->success, false); -} - static void kunit_print_tap_version(void) { static bool kunit_has_printed_tap_version; @@ -288,6 +284,10 @@ static void kunit_try_run_case(void *data) struct kunit_suite *suite = ctx->suite; struct kunit_case *test_case = ctx->test_case; +#if (IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT)) + current->kunit_test = test; +#endif /* IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT) */ + /* * kunit_run_case_internal may encounter a fatal error; if it does, * abort will be called, this thread will exit, and finally the parent @@ -602,6 +602,9 @@ void kunit_cleanup(struct kunit *test) spin_unlock(&test->lock); kunit_remove_resource(test, res); } +#if (IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT)) + current->kunit_test = NULL; +#endif /* IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_KUNIT)*/ } EXPORT_SYMBOL_GPL(kunit_cleanup); diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 53e953bb1d1d..58bffadd8367 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -23,6 +23,8 @@ #include +#include + #include "../mm/kasan/kasan.h" #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_SHADOW_SCALE_SIZE) @@ -32,14 +34,55 @@ * are not eliminated as dead code. */ -int kasan_int_result; void *kasan_ptr_result; +int kasan_int_result; + +static struct kunit_resource resource; +static struct kunit_kasan_expectation fail_data; +static bool multishot; + +static int kasan_test_init(struct kunit *test) +{ + /* + * Temporarily enable multi-shot mode and set panic_on_warn=0. + * Otherwise, we'd only get a report for the first case. + */ + multishot = kasan_save_enable_multi_shot(); + + return 0; +} + +static void kasan_test_exit(struct kunit *test) +{ + kasan_restore_multi_shot(multishot); +} + +/** + * KUNIT_EXPECT_KASAN_FAIL() - Causes a test failure when the expression does + * not cause a KASAN error. This uses a KUnit resource named "kasan_data." Do + * Do not use this name for a KUnit resource outside here. + * + */ +#define KUNIT_EXPECT_KASAN_FAIL(test, condition) do { \ + fail_data.report_expected = true; \ + fail_data.report_found = false; \ + kunit_add_named_resource(test, \ + NULL, \ + NULL, \ + &resource, \ + "kasan_data", &fail_data); \ + condition; \ + KUNIT_EXPECT_EQ(test, \ + fail_data.report_expected, \ + fail_data.report_found); \ +} while (0) + + /* * Note: test functions are marked noinline so that their names appear in * reports. */ - static noinline void __init kmalloc_oob_right(void) { char *ptr; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 4f49fa6cd1aa..e2c14b10bc81 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -33,6 +33,8 @@ #include +#include + #include "kasan.h" #include "../slab.h" @@ -464,12 +466,37 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } +#if IS_ENABLED(CONFIG_KUNIT) +static void kasan_update_kunit_status(struct kunit *cur_test) +{ + struct kunit_resource *resource; + struct kunit_kasan_expectation *kasan_data; + + resource = kunit_find_named_resource(cur_test, "kasan_data"); + + if (!resource) { + kunit_set_failure(cur_test); + return; + } + + kasan_data = (struct kunit_kasan_expectation *)resource->data; + kasan_data->report_found = true; + kunit_put_resource(resource); +} +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + void kasan_report_invalid_free(void *object, unsigned long ip) { unsigned long flags; u8 tag = get_tag(object); object = reset_tag(object); + +#if IS_ENABLED(CONFIG_KUNIT) + if (current->kunit_test) + kasan_update_kunit_status(current->kunit_test); +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); print_tags(tag, object); @@ -488,6 +515,11 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, void *untagged_addr; unsigned long flags; +#if IS_ENABLED(CONFIG_KUNIT) + if (current->kunit_test) + kasan_update_kunit_status(current->kunit_test); +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + disable_trace_on_warning(); tagged_addr = (void *)addr; -- cgit v1.2.3 From 9181a980625a45425085ccec0fc38074a16470a5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 13 Oct 2020 16:55:35 -0700 Subject: mm: document semantics of ZONE_MOVABLE Let's document what ZONE_MOVABLE means, how it's used, and which special cases we have regarding unmovable pages (memory offlining vs. migration / allocations). Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Acked-by: Mike Rapoport Cc: Michal Hocko Cc: Michael S. Tsirkin Cc: Mike Kravetz Cc: Pankaj Gupta Cc: Baoquan He Cc: Jason Wang Cc: Qian Cai Link: http://lkml.kernel.org/r/20200816125333.7434-7-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0f7a4ff4b059..927bd7e98a88 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -396,6 +396,41 @@ enum zone_type { */ ZONE_HIGHMEM, #endif + /* + * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains + * movable pages with few exceptional cases described below. Main use + * cases for ZONE_MOVABLE are to make memory offlining/unplug more + * likely to succeed, and to locally limit unmovable allocations - e.g., + * to increase the number of THP/huge pages. Notable special cases are: + * + * 1. Pinned pages: (long-term) pinning of movable pages might + * essentially turn such pages unmovable. Memory offlining might + * retry a long time. + * 2. memblock allocations: kernelcore/movablecore setups might create + * situations where ZONE_MOVABLE contains unmovable allocations + * after boot. Memory offlining and allocations fail early. + * 3. Memory holes: kernelcore/movablecore setups might create very rare + * situations where ZONE_MOVABLE contains memory holes after boot, + * for example, if we have sections that are only partially + * populated. Memory offlining and allocations fail early. + * 4. PG_hwpoison pages: while poisoned pages can be skipped during + * memory offlining, such pages cannot be allocated. + * 5. Unmovable PG_offline pages: in paravirtualized environments, + * hotplugged memory blocks might only partially be managed by the + * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The + * parts not manged by the buddy are unmovable PG_offline pages. In + * some cases (virtio-mem), such pages can be skipped during + * memory offlining, however, cannot be moved/allocated. These + * techniques might use alloc_contig_range() to hide previously + * exposed pages from the buddy again (e.g., to implement some sort + * of memory unplug in virtio-mem). + * + * In general, no unmovable allocations that degrade memory offlining + * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) + * have to expect that migrating pages in ZONE_MOVABLE can fail (even + * if has_unmovable_pages() states that there are no unmovable pages, + * there can be false negatives). + */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, -- cgit v1.2.3 From 30d8ec73e8772b32a7eae626d14004bd37d8f13c Mon Sep 17 00:00:00 2001 From: Mateusz Nosek Date: Tue, 13 Oct 2020 16:55:57 -0700 Subject: mmzone: clean code by removing unused macro parameter Previously 'for_next_zone_zonelist_nodemask' macro parameter 'zlist' was unused so this patch removes it. Signed-off-by: Mateusz Nosek Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200917211906.30059-1-mateusznosek0@gmail.com Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 927bd7e98a88..c27fb1faffe5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1116,7 +1116,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) -#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ +#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = z->zone; \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a105c657be37..6263a97e39c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3741,8 +3741,8 @@ retry: */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->highest_zoneidx, ac->nodemask) { + for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, + ac->nodemask) { struct page *page; unsigned long mark; -- cgit v1.2.3 From ab00db216c9c78cc0a68bc4e27889c1ee374598d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 13 Oct 2020 16:56:07 -0700 Subject: include/linux/gfp.h: clarify usage of GFP_ATOMIC in !preemptible contexts There is a general understanding that GFP_ATOMIC/GFP_NOWAIT are to be used from atomic contexts. E.g. from within a spin lock or from the IRQ context. This is correct but there are some atomic contexts where the above doesn't hold. One of them would be an NMI context. Page allocator has never supported that and the general fear of this context didn't let anybody to actually even try to use the allocator there. Good, but let's be more specific about that. Another such a context, and that is where people seem to be more daring, is raw_spin_lock. Mostly because it simply resembles regular spin lock which is supported by the allocator and there is not any implementation difference with !RT kernels in the first place. Be explicit that such a context is not supported by the allocator. The underlying reason is that zone->lock would have to become raw_spin_lock as well and that has turned out to be a problem for RT (http://lkml.kernel.org/r/87mu305c1w.fsf@nanos.tec.linutronix.de). Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Thomas Gleixner Reviewed-by: Uladzislau Rezki Cc: "Paul E. McKenney" Link: https://lkml.kernel.org/r/20200929123010.5137-1-mhocko@kernel.org Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 67a0774e080b..2e8370cf60c7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -238,7 +238,9 @@ struct vm_area_struct; * %__GFP_FOO flags as necessary. * * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower - * watermark is applied to allow access to "atomic reserves" + * watermark is applied to allow access to "atomic reserves". + * The current implementation doesn't support NMI and few other strict + * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT. * * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim. -- cgit v1.2.3 From 74c9da4e1dc0ecf70e7fa78568821e3ed8f77938 Mon Sep 17 00:00:00 2001 From: Mateusz Nosek Date: Tue, 13 Oct 2020 16:57:01 -0700 Subject: include/linux/compaction.h: clean code by removing unused enum value The enum value 'COMPACT_INACTIVE' is never used so can be removed. Signed-off-by: Mateusz Nosek Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/20200917110750.12015-1-mateusznosek0@gmail.com Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 25a521d299c1..1de5a1151ee7 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -29,9 +29,6 @@ enum compact_result { /* compaction didn't start as it was deferred due to past failures */ COMPACT_DEFERRED, - /* compaction not active last round */ - COMPACT_INACTIVE = COMPACT_DEFERRED, - /* For more detailed tracepoint output - internal to compaction */ COMPACT_NO_SUITABLE_PAGE, /* compaction should continue to another pageblock */ -- cgit v1.2.3 From f8fd52535c7326d72645c9878d7897aaf44db51c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 13 Oct 2020 16:57:11 -0700 Subject: mm: remove unused alloc_page_vma_node() No one use this macro anymore. Also fix code style of policy_node(). Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: https://lkml.kernel.org/r/20200921021401.84508-1-richard.weiyang@linux.alibaba.com Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 -- mm/mempolicy.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2e8370cf60c7..07e481993ef5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -562,8 +562,6 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) -#define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 69ce708c3a3a..3fde772ef5ef 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1882,8 +1882,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return the node id preferred by the given mempolicy, or the given id */ -static int policy_node(gfp_t gfp, struct mempolicy *policy, - int nd) +static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) nd = policy->v.preferred_node; -- cgit v1.2.3 From cd991db8ddc33d2d2c7af45627fc48352915001c Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 13 Oct 2020 16:57:49 -0700 Subject: memblock: make for_each_memblock_type() iterator private for_each_memblock_type() is not used outside mm/memblock.c, move it there from include/linux/memblock.h Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Daniel Axtens Cc: Dave Hansen Cc: Emil Renner Berthing Cc: Hari Bathini Cc: Ingo Molnar Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Marek Szyprowski Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Miguel Ojeda Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200818151634.14343-9-rppt@kernel.org Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 5 ----- mm/memblock.c | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 9d925db0d355..550faf69fc1c 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -552,11 +552,6 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ region++) -#define for_each_memblock_type(i, memblock_type, rgn) \ - for (i = 0, rgn = &memblock_type->regions[0]; \ - i < memblock_type->cnt; \ - i++, rgn = &memblock_type->regions[i]) - extern void *alloc_large_system_hash(const char *tablename, unsigned long bucketsize, unsigned long numentries, diff --git a/mm/memblock.c b/mm/memblock.c index 45f198750be9..59f3998ae5db 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -132,6 +132,11 @@ struct memblock_type physmem = { }; #endif +#define for_each_memblock_type(i, memblock_type, rgn) \ + for (i = 0, rgn = &memblock_type->regions[0]; \ + i < memblock_type->cnt; \ + i++, rgn = &memblock_type->regions[i]) + int memblock_debug __initdata_memblock; static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; -- cgit v1.2.3 From 87c55870f01266fe22f345a0767162f85f1cf8f1 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 13 Oct 2020 16:57:54 -0700 Subject: memblock: make memblock_debug and related functionality private The only user of memblock_dbg() outside memblock was s390 setup code and it is converted to use pr_debug() instead. This allows to stop exposing memblock_debug and memblock_dbg() to the rest of the kernel. [akpm@linux-foundation.org: make memblock_dbg() safer and neater] Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Daniel Axtens Cc: Dave Hansen Cc: Emil Renner Berthing Cc: Hari Bathini Cc: Ingo Molnar Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Marek Szyprowski Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Miguel Ojeda Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200818151634.14343-10-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/s390/kernel/setup.c | 4 ++-- include/linux/memblock.h | 12 +----------- mm/memblock.c | 16 ++++++++++++++-- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index c2c1b4e723ea..115c92839af5 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -776,8 +776,8 @@ static void __init memblock_add_mem_detect_info(void) unsigned long start, end; int i; - memblock_dbg("physmem info source: %s (%hhd)\n", - get_mem_info_source(), mem_detect.info_source); + pr_debug("physmem info source: %s (%hhd)\n", + get_mem_info_source(), mem_detect.info_source); /* keep memblock lists close to the kernel */ memblock_set_bottom_up(true); for_each_mem_detect_block(i, &start, &end) { diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 550faf69fc1c..47a76e237fca 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -86,7 +86,6 @@ struct memblock { }; extern struct memblock memblock; -extern int memblock_debug; #ifndef CONFIG_ARCH_KEEP_MEMBLOCK #define __init_memblock __meminit @@ -98,9 +97,6 @@ void memblock_discard(void); static inline void memblock_discard(void) {} #endif -#define memblock_dbg(fmt, ...) \ - if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) - phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); void memblock_allow_resize(void); @@ -476,13 +472,7 @@ bool memblock_is_region_memory(phys_addr_t base, phys_addr_t size); bool memblock_is_reserved(phys_addr_t addr); bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); -extern void __memblock_dump_all(void); - -static inline void memblock_dump_all(void) -{ - if (memblock_debug) - __memblock_dump_all(); -} +void memblock_dump_all(void); /** * memblock_set_current_limit - Set the current allocation limit to allow diff --git a/mm/memblock.c b/mm/memblock.c index 59f3998ae5db..e9b6efd74329 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -137,7 +137,13 @@ struct memblock_type physmem = { i < memblock_type->cnt; \ i++, rgn = &memblock_type->regions[i]) -int memblock_debug __initdata_memblock; +#define memblock_dbg(fmt, ...) \ + do { \ + if (memblock_debug) \ + pr_info(fmt, ##__VA_ARGS__); \ + } while (0) + +static int memblock_debug __initdata_memblock; static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; @@ -1920,7 +1926,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) } } -void __init_memblock __memblock_dump_all(void) +static void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); pr_info(" memory size = %pa reserved size = %pa\n", @@ -1934,6 +1940,12 @@ void __init_memblock __memblock_dump_all(void) #endif } +void __init_memblock memblock_dump_all(void) +{ + if (memblock_debug) + __memblock_dump_all(); +} + void __init memblock_allow_resize(void) { memblock_can_resize = 1; -- cgit v1.2.3 From 6e245ad4a17ab92dba63406d3f517520a86c0a80 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 13 Oct 2020 16:57:59 -0700 Subject: memblock: reduce number of parameters in for_each_mem_range() Currently for_each_mem_range() and for_each_mem_range_rev() iterators are the most generic way to traverse memblock regions. As such, they have 8 parameters and they are hardly convenient to users. Most users choose to utilize one of their wrappers and the only user that actually needs most of the parameters is memblock itself. To avoid yet another naming for memblock iterators, rename the existing for_each_mem_range[_rev]() to __for_each_mem_range[_rev]() and add a new for_each_mem_range[_rev]() wrappers with only index, start and end parameters. The new wrapper nicely fits into init_unavailable_mem() and will be used in upcoming changes to simplify memblock traversals. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Acked-by: Thomas Bogendoerfer [MIPS] Cc: Andy Lutomirski Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Daniel Axtens Cc: Dave Hansen Cc: Emil Renner Berthing Cc: Hari Bathini Cc: Ingo Molnar Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Marek Szyprowski Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Miguel Ojeda Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Russell King Cc: Stafford Horne Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200818151634.14343-11-rppt@kernel.org Signed-off-by: Linus Torvalds --- .clang-format | 2 ++ arch/arm64/kernel/machine_kexec_file.c | 6 ++--- arch/powerpc/kexec/file_load_64.c | 6 ++--- include/linux/memblock.h | 41 +++++++++++++++++++++++++--------- mm/page_alloc.c | 3 +-- 5 files changed, 38 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/.clang-format b/.clang-format index badfc1ba440a..0366a3d2e561 100644 --- a/.clang-format +++ b/.clang-format @@ -207,7 +207,9 @@ ForEachMacros: - 'for_each_memblock_type' - 'for_each_memcg_cache_index' - 'for_each_mem_pfn_range' + - '__for_each_mem_range' - 'for_each_mem_range' + - '__for_each_mem_range_rev' - 'for_each_mem_range_rev' - 'for_each_migratetype_order' - 'for_each_msi_entry' diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 361a1143e09e..5b0e67b93cdc 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -215,8 +215,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz) phys_addr_t start, end; nr_ranges = 1; /* for exclusion of crashkernel region */ - for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) + for_each_mem_range(i, &start, &end) nr_ranges++; cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); @@ -225,8 +224,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz) cmem->max_nr_ranges = nr_ranges; cmem->nr_ranges = 0; - for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) { + for_each_mem_range(i, &start, &end) { cmem->ranges[cmem->nr_ranges].start = start; cmem->ranges[cmem->nr_ranges].end = end - 1; cmem->nr_ranges++; diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 53bb71e3a2e1..2c9d908eab96 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -250,8 +250,7 @@ static int __locate_mem_hole_top_down(struct kexec_buf *kbuf, phys_addr_t start, end; u64 i; - for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) { + for_each_mem_range_rev(i, &start, &end) { /* * memblock uses [start, end) convention while it is * [start, end] here. Fix the off-by-one to have the @@ -350,8 +349,7 @@ static int __locate_mem_hole_bottom_up(struct kexec_buf *kbuf, phys_addr_t start, end; u64 i; - for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_NONE, &start, &end, NULL) { + for_each_mem_range(i, &start, &end) { /* * memblock uses [start, end) convention while it is * [start, end] here. Fix the off-by-one to have the diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 47a76e237fca..27c3b84d1615 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -162,7 +162,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, #endif /* CONFIG_HAVE_MEMBLOCK_PHYS_MAP */ /** - * for_each_mem_range - iterate through memblock areas from type_a and not + * __for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. * @i: u64 used as loop variable * @type_a: ptr to memblock_type to iterate @@ -173,7 +173,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ -#define for_each_mem_range(i, type_a, type_b, nid, flags, \ +#define __for_each_mem_range(i, type_a, type_b, nid, flags, \ p_start, p_end, p_nid) \ for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid); \ @@ -182,7 +182,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, p_start, p_end, p_nid)) /** - * for_each_mem_range_rev - reverse iterate through memblock areas from + * __for_each_mem_range_rev - reverse iterate through memblock areas from * type_a and not included in type_b. Or just type_a if type_b is NULL. * @i: u64 used as loop variable * @type_a: ptr to memblock_type to iterate @@ -193,15 +193,36 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ -#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \ - p_start, p_end, p_nid) \ +#define __for_each_mem_range_rev(i, type_a, type_b, nid, flags, \ + p_start, p_end, p_nid) \ for (i = (u64)ULLONG_MAX, \ - __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ + __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) +/** + * for_each_mem_range - iterate through memory areas. + * @i: u64 used as loop variable + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + */ +#define for_each_mem_range(i, p_start, p_end) \ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \ + MEMBLOCK_NONE, p_start, p_end, NULL) + +/** + * for_each_mem_range_rev - reverse iterate through memblock areas from + * type_a and not included in type_b. Or just type_a if type_b is NULL. + * @i: u64 used as loop variable + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + */ +#define for_each_mem_range_rev(i, p_start, p_end) \ + __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \ + MEMBLOCK_NONE, p_start, p_end, NULL) + /** * for_each_reserved_mem_region - iterate over all reserved memblock areas * @i: u64 used as loop variable @@ -307,8 +328,8 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); * soon as memblock is initialized. */ #define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \ - for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ - nid, flags, p_start, p_end, p_nid) + __for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ + nid, flags, p_start, p_end, p_nid) /** * for_each_free_mem_range_reverse - rev-iterate through free memblock areas @@ -324,8 +345,8 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); */ #define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \ p_nid) \ - for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ - nid, flags, p_start, p_end, p_nid) + __for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ + nid, flags, p_start, p_end, p_nid) int memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 73e33ab6d249..34ac7127d1e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6990,8 +6990,7 @@ static void __init init_unavailable_mem(void) * Loop through unavailable ranges not covered by memblock.memory. */ pgcnt = 0; - for_each_mem_range(i, &memblock.memory, NULL, - NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { + for_each_mem_range(i, &start, &end) { if (next < start) pgcnt += init_unavailable_range(PFN_DOWN(next), PFN_UP(start)); -- cgit v1.2.3 From 5bd0960b85d7e3e4a2dc5bbf1c87d0b505115d71 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 13 Oct 2020 16:58:20 -0700 Subject: memblock: remove unused memblock_mem_size() The only user of memblock_mem_size() was x86 setup code, it is gone now and memblock_mem_size() funciton can be removed. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Daniel Axtens Cc: Dave Hansen Cc: Emil Renner Berthing Cc: Hari Bathini Cc: Ingo Molnar Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Marek Szyprowski Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Miguel Ojeda Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200818151634.14343-16-rppt@kernel.org Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 - mm/memblock.c | 15 --------------- 2 files changed, 16 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 27c3b84d1615..15ed119701c1 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -481,7 +481,6 @@ static inline bool memblock_bottom_up(void) phys_addr_t memblock_phys_mem_size(void); phys_addr_t memblock_reserved_size(void); -phys_addr_t memblock_mem_size(unsigned long limit_pfn); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); diff --git a/mm/memblock.c b/mm/memblock.c index d66d805dec96..4de76cf48434 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1660,21 +1660,6 @@ phys_addr_t __init_memblock memblock_reserved_size(void) return memblock.reserved.total_size; } -phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) -{ - unsigned long pages = 0; - unsigned long start_pfn, end_pfn; - int i; - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min_t(unsigned long, start_pfn, limit_pfn); - end_pfn = min_t(unsigned long, end_pfn, limit_pfn); - pages += end_pfn - start_pfn; - } - - return PFN_PHYS(pages); -} - /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { -- cgit v1.2.3 From 9f3d5eaa3c60f95d9fff1ce4eea7553a3dc04906 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 13 Oct 2020 16:58:25 -0700 Subject: memblock: implement for_each_reserved_mem_region() using __next_mem_region() Iteration over memblock.reserved with for_each_reserved_mem_region() used __next_reserved_mem_region() that implemented a subset of __next_mem_region(). Use __for_each_mem_range() and, essentially, __next_mem_region() with appropriate parameters to reduce code duplication. While on it, rename for_each_reserved_mem_region() to for_each_reserved_mem_range() for consistency. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Acked-by: Miguel Ojeda [.clang-format] Cc: Andy Lutomirski Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Daniel Axtens Cc: Dave Hansen Cc: Emil Renner Berthing Cc: Hari Bathini Cc: Ingo Molnar Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Marek Szyprowski Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200818151634.14343-17-rppt@kernel.org Signed-off-by: Linus Torvalds --- .clang-format | 2 +- arch/arm64/kernel/setup.c | 2 +- drivers/irqchip/irq-gic-v3-its.c | 2 +- include/linux/memblock.h | 12 +++------ mm/memblock.c | 56 ++++++++++++++-------------------------- 5 files changed, 27 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/.clang-format b/.clang-format index 0366a3d2e561..8806bb21b6c2 100644 --- a/.clang-format +++ b/.clang-format @@ -273,7 +273,7 @@ ForEachMacros: - 'for_each_registered_fb' - 'for_each_requested_gpio' - 'for_each_requested_gpio_in_range' - - 'for_each_reserved_mem_region' + - 'for_each_reserved_mem_range' - 'for_each_rtd_codec_dais' - 'for_each_rtd_codec_dais_rollback' - 'for_each_rtd_components' diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 53acbeca4f57..16aae43badfd 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -257,7 +257,7 @@ static int __init reserve_memblock_reserved_regions(void) if (!memblock_is_region_reserved(mem->start, mem_size)) continue; - for_each_reserved_mem_region(j, &r_start, &r_end) { + for_each_reserved_mem_range(j, &r_start, &r_end) { resource_size_t start, end; start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start); diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0418071a3724..46d885575601 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2198,7 +2198,7 @@ static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size) addr_end = addr + size - 1; - for_each_reserved_mem_region(i, &start, &end) { + for_each_reserved_mem_range(i, &start, &end) { if (addr >= start && addr_end <= end) return true; } diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 15ed119701c1..354078713cd1 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -132,9 +132,6 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); -void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, - phys_addr_t *out_end); - void __memblock_free_late(phys_addr_t base, phys_addr_t size); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -224,7 +221,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, MEMBLOCK_NONE, p_start, p_end, NULL) /** - * for_each_reserved_mem_region - iterate over all reserved memblock areas + * for_each_reserved_mem_range - iterate over all reserved memblock areas * @i: u64 used as loop variable * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL @@ -232,10 +229,9 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, * Walks over reserved areas of memblock. Available as soon as memblock * is initialized. */ -#define for_each_reserved_mem_region(i, p_start, p_end) \ - for (i = 0UL, __next_reserved_mem_region(&i, p_start, p_end); \ - i != (u64)ULLONG_MAX; \ - __next_reserved_mem_region(&i, p_start, p_end)) +#define for_each_reserved_mem_range(i, p_start, p_end) \ + __for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE, \ + MEMBLOCK_NONE, p_start, p_end, NULL) static inline bool memblock_is_hotpluggable(struct memblock_region *m) { diff --git a/mm/memblock.c b/mm/memblock.c index 4de76cf48434..a09cc4f057f0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -132,6 +132,14 @@ struct memblock_type physmem = { }; #endif +/* + * keep a pointer to &memblock.memory in the text section to use it in + * __next_mem_range() and its helpers. + * For architectures that do not keep memblock data after init, this + * pointer will be reset to NULL at memblock_discard() + */ +static __refdata struct memblock_type *memblock_memory = &memblock.memory; + #define for_each_memblock_type(i, memblock_type, rgn) \ for (i = 0, rgn = &memblock_type->regions[0]; \ i < memblock_type->cnt; \ @@ -402,6 +410,8 @@ void __init memblock_discard(void) memblock.memory.max); __memblock_free_late(addr, size); } + + memblock_memory = NULL; } #endif @@ -952,42 +962,16 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); } -/** - * __next_reserved_mem_region - next function for for_each_reserved_region() - * @idx: pointer to u64 loop variable - * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL - * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL - * - * Iterate over all reserved memory regions. - */ -void __init_memblock __next_reserved_mem_region(u64 *idx, - phys_addr_t *out_start, - phys_addr_t *out_end) -{ - struct memblock_type *type = &memblock.reserved; - - if (*idx < type->cnt) { - struct memblock_region *r = &type->regions[*idx]; - phys_addr_t base = r->base; - phys_addr_t size = r->size; - - if (out_start) - *out_start = base; - if (out_end) - *out_end = base + size - 1; - - *idx += 1; - return; - } - - /* signal end of iteration */ - *idx = ULLONG_MAX; -} - -static bool should_skip_region(struct memblock_region *m, int nid, int flags) +static bool should_skip_region(struct memblock_type *type, + struct memblock_region *m, + int nid, int flags) { int m_nid = memblock_get_region_node(m); + /* we never skip regions when iterating memblock.reserved or physmem */ + if (type != memblock_memory) + return false; + /* only memory regions are associated with nodes, check it */ if (nid != NUMA_NO_NODE && nid != m_nid) return true; @@ -1052,7 +1036,7 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); - if (should_skip_region(m, nid, flags)) + if (should_skip_region(type_a, m, nid, flags)) continue; if (!type_b) { @@ -1156,7 +1140,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); - if (should_skip_region(m, nid, flags)) + if (should_skip_region(type_a, m, nid, flags)) continue; if (!type_b) { @@ -1981,7 +1965,7 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); - for_each_reserved_mem_region(i, &start, &end) + for_each_reserved_mem_range(i, &start, &end) reserve_bootmem_region(start, end); /* -- cgit v1.2.3 From cc6de1680538633e4ef9540b2313fa2481a7c641 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 13 Oct 2020 16:58:30 -0700 Subject: memblock: use separate iterators for memory and reserved regions for_each_memblock() is used to iterate over memblock.memory in a few places that use data from memblock_region rather than the memory ranges. Introduce separate for_each_mem_region() and for_each_reserved_mem_region() to improve encapsulation of memblock internals from its users. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Acked-by: Ingo Molnar [x86] Acked-by: Thomas Bogendoerfer [MIPS] Acked-by: Miguel Ojeda [.clang-format] Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Daniel Axtens Cc: Dave Hansen Cc: Emil Renner Berthing Cc: Hari Bathini Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Marek Szyprowski Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Russell King Cc: Stafford Horne Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Link: https://lkml.kernel.org/r/20200818151634.14343-18-rppt@kernel.org Signed-off-by: Linus Torvalds --- .clang-format | 3 ++- arch/arm64/kernel/setup.c | 2 +- arch/arm64/mm/numa.c | 2 +- arch/mips/netlogic/xlp/setup.c | 2 +- arch/riscv/mm/init.c | 2 +- arch/x86/mm/numa.c | 2 +- include/linux/memblock.h | 19 ++++++++++++++++--- mm/memblock.c | 4 ++-- mm/page_alloc.c | 8 ++++---- 9 files changed, 29 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/.clang-format b/.clang-format index 8806bb21b6c2..6cbd6ee51610 100644 --- a/.clang-format +++ b/.clang-format @@ -203,7 +203,7 @@ ForEachMacros: - 'for_each_matching_node' - 'for_each_matching_node_and_match' - 'for_each_member' - - 'for_each_memblock' + - 'for_each_mem_region' - 'for_each_memblock_type' - 'for_each_memcg_cache_index' - 'for_each_mem_pfn_range' @@ -274,6 +274,7 @@ ForEachMacros: - 'for_each_requested_gpio' - 'for_each_requested_gpio_in_range' - 'for_each_reserved_mem_range' + - 'for_each_reserved_mem_region' - 'for_each_rtd_codec_dais' - 'for_each_rtd_codec_dais_rollback' - 'for_each_rtd_components' diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 16aae43badfd..133257ffd859 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -217,7 +217,7 @@ static void __init request_standard_resources(void) if (!standard_resources) panic("%s: Failed to allocate %zu bytes\n", __func__, res_size); - for_each_memblock(memory, region) { + for_each_mem_region(region) { res = &standard_resources[i++]; if (memblock_is_nomap(region)) { res->name = "reserved"; diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index 2040b97e92d0..a8303bc6b62a 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -354,7 +354,7 @@ static int __init numa_register_nodes(void) struct memblock_region *mblk; /* Check that valid nid is set to memblks */ - for_each_memblock(memory, mblk) { + for_each_mem_region(mblk) { int mblk_nid = memblock_get_region_node(mblk); if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) { diff --git a/arch/mips/netlogic/xlp/setup.c b/arch/mips/netlogic/xlp/setup.c index 1a0fc5b62ba4..6e3102bcd2f1 100644 --- a/arch/mips/netlogic/xlp/setup.c +++ b/arch/mips/netlogic/xlp/setup.c @@ -70,7 +70,7 @@ static void nlm_fixup_mem(void) const int pref_backup = 512; struct memblock_region *mem; - for_each_memblock(memory, mem) { + for_each_mem_region(mem) { memblock_remove(mem->base + mem->size - pref_backup, pref_backup); } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 1e8c3e24e0c4..1dc89303b679 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -531,7 +531,7 @@ static void __init resource_init(void) { struct memblock_region *region; - for_each_memblock(memory, region) { + for_each_mem_region(region) { struct resource *res; res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c62e274d52d0..9df94e0aaee1 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -514,7 +514,7 @@ static void __init numa_clear_kernel_node_hotplug(void) * memory ranges, because quirks such as trim_snb_memory() * reserve specific pages for Sandy Bridge graphics. ] */ - for_each_memblock(reserved, mb_region) { + for_each_reserved_mem_region(mb_region) { int nid = memblock_get_region_node(mb_region); if (nid != MAX_NUMNODES) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 354078713cd1..ef131255cedc 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -553,9 +553,22 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo return PFN_UP(reg->base + reg->size); } -#define for_each_memblock(memblock_type, region) \ - for (region = memblock.memblock_type.regions; \ - region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ +/** + * for_each_mem_region - itereate over memory regions + * @region: loop variable + */ +#define for_each_mem_region(region) \ + for (region = memblock.memory.regions; \ + region < (memblock.memory.regions + memblock.memory.cnt); \ + region++) + +/** + * for_each_reserved_mem_region - itereate over reserved memory regions + * @region: loop variable + */ +#define for_each_reserved_mem_region(region) \ + for (region = memblock.reserved.regions; \ + region < (memblock.reserved.regions + memblock.reserved.cnt); \ region++) extern void *alloc_large_system_hash(const char *tablename, diff --git a/mm/memblock.c b/mm/memblock.c index a09cc4f057f0..165f40a8a254 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1667,7 +1667,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) * the memory memblock regions, if the @limit exceeds the total size * of those regions, max_addr will keep original value PHYS_ADDR_MAX */ - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (limit <= r->size) { max_addr = r->base + limit; break; @@ -1837,7 +1837,7 @@ void __init_memblock memblock_trim_memory(phys_addr_t align) phys_addr_t start, end, orig_start, orig_end; struct memblock_region *r; - for_each_memblock(memory, r) { + for_each_mem_region(r) { orig_start = r->base; orig_end = r->base + r->size; start = round_up(orig_start, align); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34ac7127d1e6..05fe1ddb033c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5961,7 +5961,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) if (mirrored_kernelcore && zone == ZONE_MOVABLE) { if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (*pfn < memblock_region_memory_end_pfn(r)) break; } @@ -6546,7 +6546,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long start_pfn, end_pfn; struct memblock_region *r; - for_each_memblock(memory, r) { + for_each_mem_region(r) { start_pfn = clamp(memblock_region_memory_base_pfn(r), zone_start_pfn, zone_end_pfn); end_pfn = clamp(memblock_region_memory_end_pfn(r), @@ -7140,7 +7140,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) * options. */ if (movable_node_is_enabled()) { - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (!memblock_is_hotpluggable(r)) continue; @@ -7161,7 +7161,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (memblock_is_mirror(r)) continue; -- cgit v1.2.3 From 67197a4f28d28d0b073ab0427b03cb2ee5382578 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 13 Oct 2020 16:58:35 -0700 Subject: mm, oom_adj: don't loop through tasks in __set_oom_adj when not necessary Currently __set_oom_adj loops through all processes in the system to keep oom_score_adj and oom_score_adj_min in sync between processes sharing their mm. This is done for any task with more that one mm_users, which includes processes with multiple threads (sharing mm and signals). However for such processes the loop is unnecessary because their signal structure is shared as well. Android updates oom_score_adj whenever a tasks changes its role (background/foreground/...) or binds to/unbinds from a service, making it more/less important. Such operation can happen frequently. We noticed that updates to oom_score_adj became more expensive and after further investigation found out that the patch mentioned in "Fixes" introduced a regression. Using Pixel 4 with a typical Android workload, write time to oom_score_adj increased from ~3.57us to ~362us. Moreover this regression linearly depends on the number of multi-threaded processes running on the system. Mark the mm with a new MMF_MULTIPROCESS flag bit when task is created with (CLONE_VM && !CLONE_THREAD && !CLONE_VFORK). Change __set_oom_adj to use MMF_MULTIPROCESS instead of mm_users to decide whether oom_score_adj update should be synchronized between multiple processes. To prevent races between clone() and __set_oom_adj(), when oom_score_adj of the process being cloned might be modified from userspace, we use oom_adj_mutex. Its scope is changed to global. The combination of (CLONE_VM && !CLONE_THREAD) is rarely used except for the case of vfork(). To prevent performance regressions of vfork(), we skip taking oom_adj_mutex and setting MMF_MULTIPROCESS when CLONE_VFORK is specified. Clearing the MMF_MULTIPROCESS flag (when the last process sharing the mm exits) is left out of this patch to keep it simple and because it is believed that this threading model is rare. Should there ever be a need for optimizing that case as well, it can be done by hooking into the exit path, likely following the mm_update_next_owner pattern. With the combination of (CLONE_VM && !CLONE_THREAD && !CLONE_VFORK) being quite rare, the regression is gone after the change is applied. [surenb@google.com: v3] Link: https://lkml.kernel.org/r/20200902012558.2335613-1-surenb@google.com Fixes: 44a70adec910 ("mm, oom_adj: make sure processes sharing mm have same view of oom_score_adj") Reported-by: Tim Murray Suggested-by: Michal Hocko Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton Acked-by: Christian Brauner Acked-by: Michal Hocko Acked-by: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Eugene Syromiatnikov Cc: Christian Kellner Cc: Adrian Reber Cc: Shakeel Butt Cc: Aleksa Sarai Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Cc: Alexey Gladkov Cc: Michel Lespinasse Cc: Daniel Jordan Cc: Andrei Vagin Cc: Bernd Edlinger Cc: John Johansen Cc: Yafang Shao Link: https://lkml.kernel.org/r/20200824153036.3201505-1-surenb@google.com Debugged-by: Minchan Kim Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +-- include/linux/oom.h | 1 + include/linux/sched/coredump.h | 1 + kernel/fork.c | 21 +++++++++++++++++++++ mm/oom_kill.c | 2 ++ 5 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/proc/base.c b/fs/proc/base.c index 617db4e0faa0..aa69c35d904c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1055,7 +1055,6 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { - static DEFINE_MUTEX(oom_adj_mutex); struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; @@ -1095,7 +1094,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) struct task_struct *p = find_lock_task_mm(task); if (p) { - if (atomic_read(&p->mm->mm_users) > 1) { + if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { mm = p->mm; mmgrab(mm); } diff --git a/include/linux/oom.h b/include/linux/oom.h index f022f581ac29..2db9a1432511 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -55,6 +55,7 @@ struct oom_control { }; extern struct mutex oom_lock; +extern struct mutex oom_adj_mutex; static inline void set_current_oom_origin(void) { diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index ecdc6542070f..dfd82eab2902 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -72,6 +72,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/kernel/fork.c b/kernel/fork.c index ede26e5a6097..50c90d368117 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1812,6 +1812,25 @@ static __always_inline void delayed_free_task(struct task_struct *tsk) free_task(tsk); } +static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) +{ + /* Skip if kernel thread */ + if (!tsk->mm) + return; + + /* Skip if spawning a thread or using vfork */ + if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) + return; + + /* We need to synchronize with __set_oom_adj */ + mutex_lock(&oom_adj_mutex); + set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + /* Update the values in case they were changed after copy_signal */ + tsk->signal->oom_score_adj = current->signal->oom_score_adj; + tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; + mutex_unlock(&oom_adj_mutex); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2288,6 +2307,8 @@ static __latent_entropy struct task_struct *copy_process( trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); + copy_oom_score_adj(clone_flags, p); + return p; bad_fork_cancel_cgroup: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e90f25d6385d..8b84661a6410 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -64,6 +64,8 @@ int sysctl_oom_dump_tasks = 1; * and mark_oom_victim */ DEFINE_MUTEX(oom_lock); +/* Serializes oom_score_adj and oom_score_adj_min updates */ +DEFINE_MUTEX(oom_adj_mutex); static inline bool is_memcg_oom(struct oom_control *oc) { -- cgit v1.2.3