From 8e7a7f8619f1f93736d9bb7e31caf4721bdc739d Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 30 Jun 2015 14:56:41 -0700 Subject: memblock: introduce a for_each_reserved_mem_region iterator Struct page initialisation had been identified as one of the reasons why large machines take a long time to boot. Patches were posted a long time ago to defer initialisation until the pages were first used. This was rejected on the grounds it should not be necessary to hurt the fast paths. This series reuses much of the work from that time but defers the initialisation of memory to kswapd so that one thread per node initialises memory local to that node. After applying the series and setting the appropriate Kconfig variable I see this in the boot log on a 64G machine [ 7.383764] kswapd 0 initialised deferred memory in 188ms [ 7.404253] kswapd 1 initialised deferred memory in 208ms [ 7.411044] kswapd 3 initialised deferred memory in 216ms [ 7.411551] kswapd 2 initialised deferred memory in 216ms On a 1TB machine, I see [ 8.406511] kswapd 3 initialised deferred memory in 1116ms [ 8.428518] kswapd 1 initialised deferred memory in 1140ms [ 8.435977] kswapd 0 initialised deferred memory in 1148ms [ 8.437416] kswapd 2 initialised deferred memory in 1148ms Once booted the machine appears to work as normal. Boot times were measured from the time shutdown was called until ssh was available again. In the 64G case, the boot time savings are negligible. On the 1TB machine, the savings were 16 seconds. Nate Zimmer said: : On an older 8 TB box with lots and lots of cpus, the boot time, as : measured from grub to login prompt, improved from 1484 seconds to : exactly 1000 seconds. Waiman Long said: : I ran a bootup timing test on a 12-TB 16-socket IvyBridge-EX system. From : grub menu to ssh login, the bootup time was 453s before the patch and 265s : after the patch - a saving of 188s (42%). Daniel Blueman said: : On a 7TB, 1728-core NumaConnect system with 108 NUMA nodes, we're seeing : stock 4.0 boot in 7136s. This drops to 2159s, or a 70% reduction with : this patchset. Non-temporal PMD init (https://lkml.org/lkml/2015/4/23/350) : drops this to 1045s. This patch (of 13): As part of initializing struct pages in 2MiB chunks, we noticed that at the end of free_all_bootmem(), there was nothing which had forced the reserved/allocated 4KiB pages to be initialized. This helper function will be used for that expansion. Signed-off-by: Robin Holt Signed-off-by: Nate Zimmer Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 0215ffd63069..cc4b01972060 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -101,6 +101,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); +void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, + phys_addr_t *out_end); + /** * for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. 
@@ -142,6 +145,21 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) +/** + * for_each_reserved_mem_region - iterate over all reserved memblock areas + * @i: u64 used as loop variable + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over reserved areas of memblock. Available as soon as memblock + * is initialized. + */ +#define for_each_reserved_mem_region(i, p_start, p_end) \ + for (i = 0UL, \ + __next_reserved_mem_region(&i, p_start, p_end); \ + i != (u64)ULLONG_MAX; \ + __next_reserved_mem_region(&i, p_start, p_end)) + #ifdef CONFIG_MOVABLE_NODE static inline bool memblock_is_hotpluggable(struct memblock_region *m) { -- cgit v1.2.3 From 92923ca3aacef63c92dc297a75ad0c6dfe4eab37 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Tue, 30 Jun 2015 14:56:48 -0700 Subject: mm: meminit: only set page reserved in the memblock region Currently each page struct is set as reserved upon initialization. This patch leaves the reserved bit clear and only sets the reserved bit when it is known the memory was allocated by the bootmem allocator. This makes it easier to distinguish between uninitialised struct pages and reserved struct pages in later patches. Signed-off-by: Robin Holt Signed-off-by: Nathan Zimmer Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/nobootmem.c | 3 +++ mm/page_alloc.c | 17 ++++++++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 99959a34f4f1..d662af2d0d01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1635,6 +1635,8 @@ extern void free_highmem_page(struct page *page); extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); +extern void reserve_bootmem_region(unsigned long start, unsigned long end); + /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 5258386fa1be..4af8f88c2bd1 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -130,6 +130,9 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); + for_each_reserved_mem_region(i, &start, &end) + reserve_bootmem_region(start, end); + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bc5da2cdfc84..39c8d56a4056 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -774,7 +774,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, init_page_count(page); page_mapcount_reset(page); page_cpupid_reset_last(page); - SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -809,6 +808,22 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); } +/* + * Initialised pages do not have PageReserved set. 
This function is + * called for each range allocated by the bootmem allocator and + * marks the pages PageReserved. The remaining valid pages are later + * sent to the buddy page allocator. + */ +void reserve_bootmem_region(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + for (; start_pfn < end_pfn; start_pfn++) + if (pfn_valid(start_pfn)) + SetPageReserved(pfn_to_page(start_pfn)); +} + static bool free_pages_prepare(struct page *page, unsigned int order) { bool compound = PageCompound(page); -- cgit v1.2.3 From 8a942fdea560d4ac0e9d9fabcd5201ad20e0c382 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:56:55 -0700 Subject: mm: meminit: make __early_pfn_to_nid SMP-safe and introduce meminit_pfn_in_nid __early_pfn_to_nid() uses static variables to cache recent lookups, as memblock lookups are very expensive, but it assumes that memory initialisation is single-threaded. Parallel initialisation of struct pages will break that assumption, so this patch makes __early_pfn_to_nid() SMP-safe by requiring the caller to cache recent search information. early_pfn_to_nid() keeps the same interface but is only safe to use early in boot due to the use of a global static variable. meminit_pfn_in_nid() is an SMP-safe version for which callers must maintain their own state. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/numa.c | 19 +++++++------------ include/linux/mm.h | 6 ++++-- include/linux/mmzone.h | 16 +++++++++++++++- mm/page_alloc.c | 40 +++++++++++++++++++++++++--------------- 4 files changed, 51 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index ea21d4cad540..aa19b7ac8222 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -58,27 +58,22 @@ paddr_to_nid(unsigned long paddr) * SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where * the section resides. */ -int __meminit __early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec; - /* - * NOTE: The following SMP-unsafe globals are only used early in boot - * when the kernel is running single-threaded. 
- */ - static int __meminitdata last_ssec, last_esec; - static int __meminitdata last_nid; - if (section >= last_ssec && section < last_esec) - return last_nid; + if (section >= state->last_start && section < state->last_end) + return state->last_nid; for (i = 0; i < num_node_memblks; i++) { ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT; esec = (node_memblk[i].start_paddr + node_memblk[i].size + ((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT; if (section >= ssec && section < esec) { - last_ssec = ssec; - last_esec = esec; - last_nid = node_memblk[i].nid; + state->last_start = ssec; + state->last_end = esec; + state->last_nid = node_memblk[i].nid; return node_memblk[i].nid; } } diff --git a/include/linux/mm.h b/include/linux/mm.h index d662af2d0d01..2e872f92dbac 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1726,7 +1726,8 @@ extern void sparse_memory_present_with_active_regions(int nid); #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn) +static inline int __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { return 0; } @@ -1734,7 +1735,8 @@ static inline int __early_pfn_to_nid(unsigned long pfn) /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); /* there is a per-arch backend function. */ -extern int __meminit __early_pfn_to_nid(unsigned long pfn); +extern int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 54d74f6eb233..b2473d822549 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1216,10 +1216,24 @@ void sparse_init(void); #define sparse_index_init(_sec, _nid) do {} while (0) #endif /* CONFIG_SPARSEMEM */ +/* + * During memory init memblocks map pfns to nids. The search is expensive and + * this caches recent lookups. The implementation of __early_pfn_to_nid + * may treat start/end as pfns or sections. + */ +struct mminit_pfnnid_cache { + unsigned long last_start; + unsigned long last_end; + int last_nid; +}; + #ifdef CONFIG_NODES_SPAN_OTHER_NODES bool early_pfn_in_nid(unsigned long pfn, int nid); +bool meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state); #else -#define early_pfn_in_nid(pfn, nid) (1) +#define early_pfn_in_nid(pfn, nid) (1) +#define meminit_pfn_in_nid(pfn, nid, state) (1) #endif #ifndef early_pfn_valid diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2ee4ecad083..ffdb2308848d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4551,39 +4551,41 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. */ -int __meminit __early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; int nid; - /* - * NOTE: The following SMP-unsafe globals are only used early in boot - * when the kernel is running single-threaded. 
- */ - static unsigned long __meminitdata last_start_pfn, last_end_pfn; - static int __meminitdata last_nid; - if (last_start_pfn <= pfn && pfn < last_end_pfn) - return last_nid; + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); if (nid != -1) { - last_start_pfn = start_pfn; - last_end_pfn = end_pfn; - last_nid = nid; + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; } return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +/* Only safe to use early in boot when initialisation is single-threaded */ int __meminit early_pfn_to_nid(unsigned long pfn) { int nid; - nid = __early_pfn_to_nid(pfn); + /* The system will behave unpredictably otherwise */ + BUG_ON(system_state != SYSTEM_BOOTING); + + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid >= 0) return nid; /* just returns 0 */ @@ -4591,15 +4593,23 @@ int __meminit early_pfn_to_nid(unsigned long pfn) } #ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { int nid; - nid = __early_pfn_to_nid(pfn); + nid = __early_pfn_to_nid(pfn, state); if (nid >= 0 && nid != node) return false; return true; } + +/* Only safe to use early in boot when initialisation is single-threaded */ +bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); +} + #endif /** -- cgit v1.2.3 From 75a592a47129dcfc1aec40e7d3cdf239a767d441 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:56:59 -0700 Subject: mm: meminit: inline some helper functions early_pfn_in_nid() and meminit_pfn_in_nid() are small functions that are unnecessarily visible outside memory initialisation. As well as unnecessary visibility, it's unnecessary function call overhead when initialising pages. This patch moves the helpers inline. [akpm@linux-foundation.org: fix build] [mhocko@suse.cz: fix build] Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 9 ----- mm/page_alloc.c | 89 +++++++++++++++++++++++++++++--------------------- 2 files changed, 52 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2473d822549..1e05dc7449cd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1227,15 +1227,6 @@ struct mminit_pfnnid_cache { int last_nid; }; -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool early_pfn_in_nid(unsigned long pfn, int nid); -bool meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state); -#else -#define early_pfn_in_nid(pfn, nid) (1) -#define meminit_pfn_in_nid(pfn, nid, state) (1) -#endif - #ifndef early_pfn_valid #define early_pfn_valid(pfn) (1) #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ffdb2308848d..12a81870815f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -899,6 +899,58 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, __free_pages(page, order); } +#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ + defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +/* Only safe to use early in boot when initialisation is single-threaded */ +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + int nid; + + /* The system will behave unpredictably otherwise */ + BUG_ON(system_state != SYSTEM_BOOTING); + + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); + if (nid >= 0) + return nid; + /* just returns 0 */ + return 0; +} +#endif + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + int nid; + + nid = __early_pfn_to_nid(pfn, state); + if (nid >= 0 && nid != node) + return false; + return true; +} + +/* Only safe to use early in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); +} + +#else + +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return true; +} +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + return true; +} +#endif + + #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. 
*/ void __init init_cma_reserved_pageblock(struct page *page) @@ -4575,43 +4627,6 @@ int __meminit __early_pfn_to_nid(unsigned long pfn, } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; - -/* Only safe to use early in boot when initialisation is single-threaded */ -int __meminit early_pfn_to_nid(unsigned long pfn) -{ - int nid; - - /* The system will behave unpredictably otherwise */ - BUG_ON(system_state != SYSTEM_BOOTING); - - nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; -} - -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) -{ - int nid; - - nid = __early_pfn_to_nid(pfn, state); - if (nid >= 0 && nid != node) - return false; - return true; -} - -/* Only safe to use early in boot when initialisation is single-threaded */ -bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); -} - -#endif - /** * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. -- cgit v1.2.3 From 3a80a7fa7989fbb6aa56bb6ad31811b62cf99e60 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:02 -0700 Subject: mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set This patch initialises all low memory struct pages and 2G of the highest zone on each node during memory initialisation if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set. That config option cannot be set yet; it becomes available in a later patch. Parallel initialisation of struct page depends on some features from memory hotplug and it is necessary to alter section annotations. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 6 +++- include/linux/mmzone.h | 8 ++++++ mm/Kconfig | 18 ++++++++++++ mm/internal.h | 18 ++++++++++++ mm/page_alloc.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 124 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/base/node.c b/drivers/base/node.c index a2aa65b4215d..31df474d72f4 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -359,12 +359,16 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #define page_initialized(page) (page->lru.next) -static int get_nid_for_pfn(unsigned long pfn) +static int __init_refok get_nid_for_pfn(unsigned long pfn) { struct page *page; if (!pfn_valid_within(pfn)) return -1; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + if (system_state == SYSTEM_BOOTING) + return early_pfn_to_nid(pfn); +#endif page = pfn_to_page(pfn); if (!page_initialized(page)) return -1; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1e05dc7449cd..754c25966a0a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -762,6 +762,14 @@ typedef struct pglist_data { /* Number of pages migrated during the rate limiting time interval */ unsigned long numabalancing_migrate_nr_pages; #endif + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * If memory initialisation on large machines is deferred then this + * is the first PFN that needs to be initialised. + */ + unsigned long first_deferred_pfn; +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/mm/Kconfig b/mm/Kconfig index c180af880ed5..e79de2bd12cd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -636,3 +636,21 @@ config MAX_STACK_SIZE_MB changed to a smaller value in which case that is used. A sane initial value is 80 MB. + +# For architectures that support deferred memory initialisation +config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + bool + +config DEFERRED_STRUCT_PAGE_INIT + bool "Defer initialisation of struct pages to kswapd" + default n + depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + depends on MEMORY_HOTPLUG + help + Ordinarily all struct pages are initialised during early boot in a + single thread. On very large machines this can take a considerable + amount of time. If this option is set, large machines will bring up + a subset of memmap at boot and then initialise the rest in parallel + when kswapd starts. This has a potential performance impact on + processes running early in the lifetime of the systemm until kswapd + finishes the initialisation. diff --git a/mm/internal.h b/mm/internal.h index 58e9022e3757..88ac7be741ca 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -387,6 +387,24 @@ static inline void mminit_verify_zonelist(void) } #endif /* CONFIG_DEBUG_MEMORY_INIT */ +/* + * Deferred struct page initialisation requires init functions that are freed + * before kswapd is available. Reuse the memory hotplug section annotation + * to mark the required code. + * + * __defermem_init is code that always exists but is annotated __meminit to + * avoid section warnings. + * __defer_init code gets marked __meminit when deferring struct page + * initialistion but is otherwise in the init section. 
+ */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +#define __defermem_init __meminit +#define __defer_init __meminit +#else +#define __defermem_init +#define __defer_init __init +#endif + /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ #if defined(CONFIG_SPARSEMEM) extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12a81870815f..7af45b2e8870 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -235,6 +235,64 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ + pgdat->first_deferred_pfn = ULONG_MAX; +} + +/* Returns true if the struct page for the pfn is uninitialised */ +static inline bool __defermem_init early_page_uninitialised(unsigned long pfn) +{ + int nid = early_pfn_to_nid(pfn); + + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + +/* + * Returns false when the remaining initialisation should be deferred until + * later in the boot cycle when it can be parallelised. + */ +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + /* Always populate low zones for address-contrained allocations */ + if (zone_end < pgdat_end_pfn(pgdat)) + return true; + + /* Initialise at least 2G of the highest zone */ + (*nr_initialised)++; + if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + pgdat->first_deferred_pfn = pfn; + return false; + } + + return true; +} +#else +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ +} + +static inline bool early_page_uninitialised(unsigned long pfn) +{ + return false; +} + +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + return true; +} +#endif + + void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled && @@ -878,8 +936,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -void __init __free_pages_bootmem(struct page *page, unsigned long pfn, - unsigned int order) +static void __defer_init __free_pages_boot_core(struct page *page, + unsigned long pfn, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -951,6 +1009,14 @@ static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, #endif +void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order) +{ + if (early_page_uninitialised(pfn)) + return; + return __free_pages_boot_core(page, pfn, order); +} + #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. 
*/ void __init init_cma_reserved_pageblock(struct page *page) @@ -4325,14 +4391,16 @@ static void setup_zone_migrate_reserve(struct zone *zone) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { + pg_data_t *pgdat = NODE_DATA(nid); unsigned long end_pfn = start_pfn + size; unsigned long pfn; struct zone *z; + unsigned long nr_initialised = 0; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; - z = &NODE_DATA(nid)->node_zones[zone]; + z = &pgdat->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s @@ -4344,6 +4412,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; if (!early_pfn_in_nid(pfn, nid)) continue; + if (!update_defer_init(pgdat, pfn, end_pfn, + &nr_initialised)) + break; } __init_single_pfn(pfn, zone, nid); } @@ -5144,6 +5215,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -- cgit v1.2.3 From 0e1cc95b4cc7293bb7b39175035e7f7e45c90977 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:27 -0700 Subject: mm: meminit: finish initialisation of struct pages before basic setup Waiman Long reported that 24TB machines hit OOM during basic setup when struct page initialisation was deferred. One approach is to initialise memory on demand but it interferes with page allocator paths. This patch creates dedicated threads to initialise memory before basic setup. It then blocks on a rw_semaphore until completion, as a wait_queue and counter would be overkill. This may be slower to boot but it's simpler overall and also gets rid of the section mangling that existed so kswapd could do the initialisation. [akpm@linux-foundation.org: include rwsem.h, use DECLARE_RWSEM, fix comment, remove unneeded cast] Signed-off-by: Mel Gorman Cc: Waiman Long Cc: Dave Hansen Cc: Scott Norton Tested-by: Daniel J Blueman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 8 ++++++++ init/main.c | 2 ++ mm/internal.h | 24 ------------------------ mm/page_alloc.c | 46 +++++++++++++++++++++++++++++++++++++--------- mm/vmscan.c | 6 ++---- 5 files changed, 49 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6ba7cf23748f..ad35f300b9a4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -384,6 +384,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +void page_alloc_init_late(void); +#else +static inline void page_alloc_init_late(void) +{ +} +#endif + /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. 
Once interrupts are diff --git a/init/main.c b/init/main.c index c599aea23bb1..c5d5626289ce 100644 --- a/init/main.c +++ b/init/main.c @@ -1004,6 +1004,8 @@ static noinline void __init kernel_init_freeable(void) smp_init(); sched_init_smp(); + page_alloc_init_late(); + do_basic_setup(); /* Open the /dev/console on the rootfs, this should never fail */ diff --git a/mm/internal.h b/mm/internal.h index a48cbefde8ca..36b23f1e2ca6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -379,30 +379,6 @@ static inline void mminit_verify_zonelist(void) } #endif /* CONFIG_DEBUG_MEMORY_INIT */ -/* - * Deferred struct page initialisation requires init functions that are freed - * before kswapd is available. Reuse the memory hotplug section annotation - * to mark the required code. - * - * __defermem_init is code that always exists but is annotated __meminit to - * avoid section warnings. - * __defer_init code gets marked __meminit when deferring struct page - * initialistion but is otherwise in the init section. - */ -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -#define __defermem_init __meminit -#define __defer_init __meminit - -void deferred_init_memmap(int nid); -#else -#define __defermem_init -#define __defer_init __init - -static inline void deferred_init_memmap(int nid) -{ -} -#endif - /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ #if defined(CONFIG_SPARSEMEM) extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a38e39b30d1..506eac8b38af 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -61,6 +62,7 @@ #include #include #include +#include #include #include @@ -242,7 +244,7 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) } /* Returns true if the struct page for the pfn is uninitialised */ -static inline bool __defermem_init early_page_uninitialised(unsigned long pfn) +static inline bool __meminit early_page_uninitialised(unsigned long pfn) { int nid = early_pfn_to_nid(pfn); @@ -958,7 +960,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __defer_init __free_pages_boot_core(struct page *page, +static void __init __free_pages_boot_core(struct page *page, unsigned long pfn, unsigned int order) { unsigned int nr_pages = 1 << order; @@ -1031,7 +1033,7 @@ static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, #endif -void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn, +void __init __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order) { if (early_page_uninitialised(pfn)) @@ -1040,7 +1042,7 @@ void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn, } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void __defermem_init deferred_free_range(struct page *page, +static void __init deferred_free_range(struct page *page, unsigned long pfn, int nr_pages) { int i; @@ -1060,20 +1062,30 @@ static void __defermem_init deferred_free_range(struct page *page, __free_pages_boot_core(page, pfn, 0); } +static __initdata DECLARE_RWSEM(pgdat_init_rwsem); + /* Initialise remaining memory on a node */ -void __defermem_init deferred_init_memmap(int nid) +static int __init deferred_init_memmap(void *data) { + pg_data_t *pgdat = data; + int nid = pgdat->node_id; struct mminit_pfnnid_cache nid_init_state = { }; unsigned long start = jiffies; unsigned long nr_pages = 0; unsigned long 
walk_start, walk_end; int i, zid; struct zone *zone; - pg_data_t *pgdat = NODE_DATA(nid); unsigned long first_init_pfn = pgdat->first_deferred_pfn; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - if (first_init_pfn == ULONG_MAX) - return; + if (first_init_pfn == ULONG_MAX) { + up_read(&pgdat_init_rwsem); + return 0; + } + + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); /* Sanity check boundaries */ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); @@ -1165,8 +1177,24 @@ free_range: /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("kswapd %d initialised %lu pages in %ums\n", nid, nr_pages, + pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, jiffies_to_msecs(jiffies - start)); + up_read(&pgdat_init_rwsem); + return 0; +} + +void __init page_alloc_init_late(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + down_read(&pgdat_init_rwsem); + kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); + } + + /* Block until all are initialised */ + down_write(&pgdat_init_rwsem); + up_write(&pgdat_init_rwsem); } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ diff --git a/mm/vmscan.c b/mm/vmscan.c index f4a487110764..e61445dce04e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3386,7 +3386,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ -static int __defermem_init kswapd(void *p) +static int kswapd(void *p) { unsigned long order, new_order; unsigned balanced_order; @@ -3421,8 +3421,6 @@ static int __defermem_init kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - deferred_init_memmap(pgdat->node_id); - order = new_order = 0; balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; @@ -3578,7 +3576,7 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action, * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ -int __defermem_init kswapd_run(int nid) +int kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); int ret = 0; -- cgit v1.2.3 From 5375b708f2547f70cd2bee2fd8663ab7035f9551 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Tue, 30 Jun 2015 14:57:46 -0700 Subject: kernel/panic/kexec: fix "crash_kexec_post_notifiers" option issue in oops path Commit f06e5153f4ae2e ("kernel/panic.c: add "crash_kexec_post_notifiers" option for kdump after panic_notifers") introduced the "crash_kexec_post_notifiers" kernel boot option, which toggles whether panic() calls crash_kexec() before or after the panic notifiers run and kmsg is dumped. The problem is that the commit overlooks the panic_on_oops kernel boot option. If it is enabled, crash_kexec() is called directly, without going through panic(), in the oops path. To fix this issue, this patch adds a check of "crash_kexec_post_notifiers" to the condition in kexec_should_crash(). Also, add a comment to kexec_should_crash() to explain the non-obvious aspects of this change. 
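For illustration, a simplified, hand-drawn sketch of the panic() ordering that this option toggles (not a verbatim copy of kernel/panic.c):

	void panic(const char *fmt, ...)
	{
		/* ... */
		/*
		 * By default, enter kdump immediately so the crash kernel
		 * captures state as close to the failure as possible.
		 */
		if (!crash_kexec_post_notifiers)
			crash_kexec(NULL);

		/* Otherwise, let the notifiers and the kmsg dump run first... */
		atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
		kmsg_dump(KMSG_DUMP_PANIC);

		/* ...and only then hand control to the crash kernel. */
		if (crash_kexec_post_notifiers)
			crash_kexec(NULL);
		/* ... */
	}

With kexec_should_crash() returning 0 when the option is set, the oops path now falls through to panic() and picks up this same ordering instead of calling crash_kexec() directly.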
Signed-off-by: HATAYAMA Daisuke Acked-by: Baoquan He Tested-by: Hidehiro Kawai Reviewed-by: Masami Hiramatsu Cc: Vivek Goyal Cc: Ingo Molnar Cc: Hidehiro Kawai Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 3 +++ kernel/kexec.c | 11 +++++++++++ kernel/panic.c | 2 +- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5acf5b70866d..0dfa4e31563d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -439,6 +439,9 @@ extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; extern int sysctl_panic_on_stackoverflow; + +extern bool crash_kexec_post_notifiers; + /* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. diff --git a/kernel/kexec.c b/kernel/kexec.c index 7a36fdcca5bf..a785c1015e25 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -84,6 +84,17 @@ struct resource crashk_low_res = { int kexec_should_crash(struct task_struct *p) { + /* + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). + */ + if (crash_kexec_post_notifiers) + return 0; + /* + * There are 4 panic() calls in do_exit() path, each of which + * corresponds to each of these 4 conditions. + */ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) return 1; return 0; diff --git a/kernel/panic.c b/kernel/panic.c index 774614f72cbd..04e91ff7560b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,7 +32,7 @@ static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); -static bool crash_kexec_post_notifiers; +bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; -- cgit v1.2.3 From 2a1bf8f93b33992bb0457512b28d046e279bbd7e Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Tue, 30 Jun 2015 14:58:54 -0700 Subject: lib/scatterlist: mark input buffer parameters as 'const' The 'buf' parameter of sg(p)copy_from_buffer() can and should be const-qualified, although because of the shared implementation of _to_buffer() and _from_buffer(), we have to cast this away internally. This means that callers who have a 'const' buffer containing the data to be copied to the sg-list no longer have to cast away the const-ness themselves. It also enables improved coverage by code analysis tools. Signed-off-by: Dave Gordon Cc: Akinobu Mita Cc: "Martin K. 
Petersen" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/scatterlist.h | 4 ++-- lib/scatterlist.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 50a8486c524b..505d0481df1e 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -266,12 +266,12 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, gfp_t gfp_mask); size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen); + const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen); size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen, off_t skip); + const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 965c36e7a5a4..317b62c6da3c 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -701,9 +701,9 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, * **/ size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen) + const void *buf, size_t buflen) { - return sg_copy_buffer(sgl, nents, buf, buflen, 0, false); + return sg_copy_buffer(sgl, nents, (void *)buf, buflen, 0, false); } EXPORT_SYMBOL(sg_copy_from_buffer); @@ -736,9 +736,9 @@ EXPORT_SYMBOL(sg_copy_to_buffer); * **/ size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen, off_t skip) + const void *buf, size_t buflen, off_t skip) { - return sg_copy_buffer(sgl, nents, buf, buflen, skip, false); + return sg_copy_buffer(sgl, nents, (void *)buf, buflen, skip, false); } EXPORT_SYMBOL(sg_pcopy_from_buffer); -- cgit v1.2.3 From 386ecb1216f9e38947ce6a2af22e5e1e47256a97 Mon Sep 17 00:00:00 2001 From: Dave Gordon Date: Tue, 30 Jun 2015 14:58:57 -0700 Subject: drivers/scsi/scsi_debug.c: resolve sg buffer const-ness issue do_device_access() takes a separate parameter to indicate the direction of data transfer, which it used to use to select the appropriate function out of sg_pcopy_{to,from}_buffer(). However these two functions now have So this patch makes it bypass these wrappers and call the underlying function sg_copy_buffer() directly; this has the same calling style as do_device_access() i.e. 
a separate direction-of-transfer parameter and no pointers-to-const, so skipping the wrappers not only eliminates the warning, it also makes the code simpler :) [akpm@linux-foundation.org: fix very broken build] Signed-off-by: Dave Gordon Acked-by: Arnd Bergmann Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/scsi/scsi_debug.c | 12 ++++-------- include/linux/scatterlist.h | 3 +++ lib/scatterlist.c | 6 +++--- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 1f8e2dc9c616..30268bb2ddb6 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -2363,17 +2363,13 @@ do_device_access(struct scsi_cmnd *scmd, u64 lba, u32 num, bool do_write) u64 block, rest = 0; struct scsi_data_buffer *sdb; enum dma_data_direction dir; - size_t (*func)(struct scatterlist *, unsigned int, void *, size_t, - off_t); if (do_write) { sdb = scsi_out(scmd); dir = DMA_TO_DEVICE; - func = sg_pcopy_to_buffer; } else { sdb = scsi_in(scmd); dir = DMA_FROM_DEVICE; - func = sg_pcopy_from_buffer; } if (!sdb->length) @@ -2385,16 +2381,16 @@ do_device_access(struct scsi_cmnd *scmd, u64 lba, u32 num, bool do_write) if (block + num > sdebug_store_sectors) rest = block + num - sdebug_store_sectors; - ret = func(sdb->table.sgl, sdb->table.nents, + ret = sg_copy_buffer(sdb->table.sgl, sdb->table.nents, fake_storep + (block * scsi_debug_sector_size), - (num - rest) * scsi_debug_sector_size, 0); + (num - rest) * scsi_debug_sector_size, 0, do_write); if (ret != (num - rest) * scsi_debug_sector_size) return ret; if (rest) { - ret += func(sdb->table.sgl, sdb->table.nents, + ret += sg_copy_buffer(sdb->table.sgl, sdb->table.nents, fake_storep, rest * scsi_debug_sector_size, - (num - rest) * scsi_debug_sector_size); + (num - rest) * scsi_debug_sector_size, do_write); } return ret; diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 505d0481df1e..9b1ef0c820a7 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -265,6 +265,9 @@ int sg_alloc_table_from_pages(struct sg_table *sgt, unsigned long offset, unsigned long size, gfp_t gfp_mask); +size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, + size_t buflen, off_t skip, bool to_buffer); + size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 317b62c6da3c..d105a9f56878 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -650,9 +650,8 @@ EXPORT_SYMBOL(sg_miter_stop); * Returns the number of copied bytes. * **/ -static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, - void *buf, size_t buflen, off_t skip, - bool to_buffer) +size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, + size_t buflen, off_t skip, bool to_buffer) { unsigned int offset = 0; struct sg_mapping_iter miter; @@ -689,6 +688,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, local_irq_restore(flags); return offset; } +EXPORT_SYMBOL(sg_copy_buffer); /** * sg_copy_from_buffer - Copy from a linear buffer to an SG list -- cgit v1.2.3 From 15e21cd1631e0eca16af545bbdf89e2ffa4cdaee Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 30 Jun 2015 14:59:15 -0700 Subject: drm: use kvfree() in drm_free_large() Use kvfree() instead of open-coding it. 
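The simplification works because kvfree() inspects the address and dispatches to vfree() or kfree() as appropriate, so a buffer obtained via the usual kmalloc-or-vmalloc fallback pattern can be freed with one call. A minimal sketch of the pairing (the allocator body below is an assumption based on that common pattern, with a hypothetical name; drm_malloc_ab()'s actual body is not part of this hunk):

	/* Allocate nmemb * size bytes, falling back to vmalloc for large buffers. */
	static inline void *alloc_large(size_t nmemb, size_t size)
	{
		if (size != 0 && nmemb > SIZE_MAX / size)
			return NULL;	/* multiplication would overflow */

		if (nmemb * size <= PAGE_SIZE)
			return kmalloc(nmemb * size, GFP_KERNEL);

		return vmalloc(nmemb * size);
	}

	/* One free path covers both allocation cases. */
	kvfree(ptr);
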
Signed-off-by: Pekka Enberg Cc: David Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/drm/drm_mem_util.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/drm/drm_mem_util.h b/include/drm/drm_mem_util.h index 19a240446fca..e42495ad8136 100644 --- a/include/drm/drm_mem_util.h +++ b/include/drm/drm_mem_util.h @@ -56,10 +56,7 @@ static __inline__ void *drm_malloc_ab(size_t nmemb, size_t size) static __inline void drm_free_large(void *ptr) { - if (!is_vmalloc_addr(ptr)) - return kfree(ptr); - - vfree(ptr); + kvfree(ptr); } #endif -- cgit v1.2.3 From 0030edf296db8a7afb13573eb12977b7d399cd40 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Tue, 30 Jun 2015 15:00:03 -0700 Subject: genalloc: rename dev_get_gen_pool() to gen_pool_get() To be consistent with other genalloc interface namings, rename dev_get_gen_pool() to gen_pool_get(). The "dev_" prefix of the original name is dropped, since it merely reflects the type of the function's argument and so does not convey any useful information. [akpm@linux-foundation.org: update arch/arm/mach-socfpga/pm.c] Signed-off-by: Vladimir Zapolskiy Acked-by: Nicolas Ferre Cc: Philipp Zabel Cc: Shawn Guo Cc: Sascha Hauer Cc: Alexandre Belloni Cc: Russell King Cc: Mauro Carvalho Chehab Cc: Vinod Koul Cc: Takashi Iwai Cc: Jaroslav Kysela Cc: Mark Brown Cc: Nicolas Ferre Cc: Alan Tull Cc: Dinh Nguyen Cc: Kevin Hilman Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-at91/pm.c | 2 +- arch/arm/mach-imx/pm-imx5.c | 2 +- arch/arm/mach-imx/pm-imx6.c | 2 +- arch/arm/mach-socfpga/pm.c | 2 +- drivers/media/platform/coda/coda-common.c | 2 +- include/linux/genalloc.h | 2 +- lib/genalloc.c | 8 ++++---- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 1e184767c3be..e24df77abd79 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void) return; } - sram_pool = dev_get_gen_pool(&pdev->dev); + sram_pool = gen_pool_get(&pdev->dev); if (!sram_pool) { pr_warn("%s: sram pool unavailable!\n", __func__); return; diff --git a/arch/arm/mach-imx/pm-imx5.c b/arch/arm/mach-imx/pm-imx5.c index 0309ccda36a9..1885676c23c0 100644 --- a/arch/arm/mach-imx/pm-imx5.c +++ b/arch/arm/mach-imx/pm-imx5.c @@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram( goto put_node; } - ocram_pool = dev_get_gen_pool(&pdev->dev); + ocram_pool = gen_pool_get(&pdev->dev); if (!ocram_pool) { pr_warn("%s: ocram pool unavailable!\n", __func__); ret = -ENODEV; diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c index b01650d94f91..93ecf559d06d 100644 --- a/arch/arm/mach-imx/pm-imx6.c +++ b/arch/arm/mach-imx/pm-imx6.c @@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata) goto put_node; } - ocram_pool = dev_get_gen_pool(&pdev->dev); + ocram_pool = gen_pool_get(&pdev->dev); if (!ocram_pool) { pr_warn("%s: ocram pool unavailable!\n", __func__); ret = -ENODEV; diff --git a/arch/arm/mach-socfpga/pm.c b/arch/arm/mach-socfpga/pm.c index 1ed89fc2b7a8..6a4199f2bffb 100644 --- a/arch/arm/mach-socfpga/pm.c +++ b/arch/arm/mach-socfpga/pm.c @@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void) goto put_node; } - ocram_pool = dev_get_gen_pool(&pdev->dev); + ocram_pool = gen_pool_get(&pdev->dev); if (!ocram_pool) { pr_warn("%s: ocram pool unavailable!\n", __func__); ret = -ENODEV;
diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c index 6d6e0ca91fb4..6e640c0f0b35 100644 --- a/drivers/media/platform/coda/coda-common.c +++ b/drivers/media/platform/coda/coda-common.c @@ -2157,7 +2157,7 @@ static int coda_probe(struct platform_device *pdev) /* Get IRAM pool from device tree or platform data */ pool = of_get_named_gen_pool(np, "iram", 0); if (!pool && pdata) - pool = dev_get_gen_pool(pdata->iram_dev); + pool = gen_pool_get(pdata->iram_dev); if (!pool) { dev_err(&pdev->dev, "iram pool not available\n"); return -ENOMEM; diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 1ccaab44abcc..015d17068615 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -119,7 +119,7 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, extern struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, int nid); -extern struct gen_pool *dev_get_gen_pool(struct device *dev); +extern struct gen_pool *gen_pool_get(struct device *dev); bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, size_t size); diff --git a/lib/genalloc.c b/lib/genalloc.c index d214866eeea2..948e92cd9794 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -602,12 +602,12 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, EXPORT_SYMBOL(devm_gen_pool_create); /** - * dev_get_gen_pool - Obtain the gen_pool (if any) for a device + * gen_pool_get - Obtain the gen_pool (if any) for a device * @dev: device to retrieve the gen_pool from * * Returns the gen_pool for the device if one is present, or NULL. */ -struct gen_pool *dev_get_gen_pool(struct device *dev) +struct gen_pool *gen_pool_get(struct device *dev) { struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, NULL); @@ -616,7 +616,7 @@ struct gen_pool *dev_get_gen_pool(struct device *dev) return NULL; return *p; } -EXPORT_SYMBOL_GPL(dev_get_gen_pool); +EXPORT_SYMBOL_GPL(gen_pool_get); #ifdef CONFIG_OF /** @@ -642,7 +642,7 @@ struct gen_pool *of_get_named_gen_pool(struct device_node *np, of_node_put(np_pool); if (!pdev) return NULL; - return dev_get_gen_pool(&pdev->dev); + return gen_pool_get(&pdev->dev); } EXPORT_SYMBOL_GPL(of_get_named_gen_pool); #endif /* CONFIG_OF */ -- cgit v1.2.3 From abdd4a7025282fbe3737e1bcb5f51afc8d8ea1b8 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Tue, 30 Jun 2015 15:00:07 -0700 Subject: genalloc: rename of_get_named_gen_pool() to of_gen_pool_get() To be consistent with other kernel interface namings, rename of_get_named_gen_pool() to of_gen_pool_get(). In the original function name, the "_named" suffix refers to a device tree property that contains a phandle to a device, whose driver is assumed to register a gen_pool object. Because this relation is weak, and to avoid confusion (e.g. in a possible future scenario where gen_pool objects themselves are named), the suffix is removed. 
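As a usage illustration of the renamed helpers, mirroring the coda and cesa call sites updated below (the device node np and the allocation size here are hypothetical):

	struct gen_pool *pool;
	unsigned long vaddr;

	/* Find the pool registered by the device behind the "iram" phandle. */
	pool = of_gen_pool_get(np, "iram", 0);	/* was of_get_named_gen_pool() */
	if (!pool)
		return -ENOMEM;

	/* Carve a buffer out of the pool; free it with the matching size. */
	vaddr = gen_pool_alloc(pool, SZ_4K);
	if (!vaddr)
		return -ENOMEM;
	/* ... use the buffer ... */
	gen_pool_free(pool, vaddr, SZ_4K);
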
[sfr@canb.auug.org.au: crypto/marvell/cesa - fix up for of_get_named_gen_pool() rename] Signed-off-by: Vladimir Zapolskiy Cc: Nicolas Ferre Cc: Philipp Zabel Cc: Shawn Guo Cc: Sascha Hauer Cc: Alexandre Belloni Cc: Russell King Cc: Mauro Carvalho Chehab Cc: Vinod Koul Cc: Takashi Iwai Cc: Jaroslav Kysela Signed-off-by: Stephen Rothwell Cc: Herbert Xu Cc: Boris BREZILLON Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/crypto/marvell/cesa.c | 5 ++--- drivers/dma/mmp_tdma.c | 2 +- drivers/media/platform/coda/coda-common.c | 2 +- include/linux/genalloc.h | 4 ++-- lib/genalloc.c | 6 +++--- sound/core/memalloc.c | 2 +- 6 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/crypto/marvell/cesa.c b/drivers/crypto/marvell/cesa.c index a432633bced4..1c6f98dd88f4 100644 --- a/drivers/crypto/marvell/cesa.c +++ b/drivers/crypto/marvell/cesa.c @@ -321,9 +321,8 @@ static int mv_cesa_get_sram(struct platform_device *pdev, int idx) const char *res_name = "sram"; struct resource *res; - engine->pool = of_get_named_gen_pool(cesa->dev->of_node, - "marvell,crypto-srams", - idx); + engine->pool = of_gen_pool_get(cesa->dev->of_node, + "marvell,crypto-srams", idx); if (engine->pool) { engine->sram = gen_pool_dma_alloc(engine->pool, cesa->sram_size, diff --git a/drivers/dma/mmp_tdma.c b/drivers/dma/mmp_tdma.c index 449e785def17..e683761e0f8f 100644 --- a/drivers/dma/mmp_tdma.c +++ b/drivers/dma/mmp_tdma.c @@ -657,7 +657,7 @@ static int mmp_tdma_probe(struct platform_device *pdev) INIT_LIST_HEAD(&tdev->device.channels); if (pdev->dev.of_node) - pool = of_get_named_gen_pool(pdev->dev.of_node, "asram", 0); + pool = of_gen_pool_get(pdev->dev.of_node, "asram", 0); else pool = sram_get_gpool("asram"); if (!pool) { diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c index 6e640c0f0b35..58f65486de33 100644 --- a/drivers/media/platform/coda/coda-common.c +++ b/drivers/media/platform/coda/coda-common.c @@ -2155,7 +2155,7 @@ static int coda_probe(struct platform_device *pdev) } /* Get IRAM pool from device tree or platform data */ - pool = of_get_named_gen_pool(np, "iram", 0); + pool = of_gen_pool_get(np, "iram", 0); if (!pool && pdata) pool = gen_pool_get(pdata->iram_dev); if (!pool) { diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 015d17068615..5383bb1394a1 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -125,10 +125,10 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, size_t size); #ifdef CONFIG_OF -extern struct gen_pool *of_get_named_gen_pool(struct device_node *np, +extern struct gen_pool *of_gen_pool_get(struct device_node *np, const char *propname, int index); #else -static inline struct gen_pool *of_get_named_gen_pool(struct device_node *np, +static inline struct gen_pool *of_gen_pool_get(struct device_node *np, const char *propname, int index) { return NULL; diff --git a/lib/genalloc.c b/lib/genalloc.c index 948e92cd9794..daf0afb6d979 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -620,7 +620,7 @@ EXPORT_SYMBOL_GPL(gen_pool_get); #ifdef CONFIG_OF /** - * of_get_named_gen_pool - find a pool by phandle property + * of_gen_pool_get - find a pool by phandle property * @np: device node * @propname: property name containing phandle(s) * @index: index into the phandle array @@ -629,7 +629,7 @@ EXPORT_SYMBOL_GPL(gen_pool_get); * address of the device tree node pointed at by the phandle property, * or NULL if not found. 
*/ -struct gen_pool *of_get_named_gen_pool(struct device_node *np, +struct gen_pool *of_gen_pool_get(struct device_node *np, const char *propname, int index) { struct platform_device *pdev; @@ -644,5 +644,5 @@ struct gen_pool *of_get_named_gen_pool(struct device_node *np, return NULL; return gen_pool_get(&pdev->dev); } -EXPORT_SYMBOL_GPL(of_get_named_gen_pool); +EXPORT_SYMBOL_GPL(of_gen_pool_get); #endif /* CONFIG_OF */ diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 082509eb805d..f05cb6a8cbe0 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -124,7 +124,7 @@ static void snd_malloc_dev_iram(struct snd_dma_buffer *dmab, size_t size) dmab->addr = 0; if (dev->of_node) - pool = of_get_named_gen_pool(dev->of_node, "iram", 0); + pool = of_gen_pool_get(dev->of_node, "iram", 0); if (!pool) return; -- cgit v1.2.3
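Returning to the iterator introduced at the top of this series: a minimal sketch of how a caller walks the reserved regions, modelled on the free_low_memory_core_early() hunk in the second patch (the pr_info() line is illustrative only):

	u64 i;
	phys_addr_t start, end;

	/* Walk every reserved memblock range; either out-pointer may be NULL. */
	for_each_reserved_mem_region(i, &start, &end)
		pr_info("reserved: %pa..%pa\n", &start, &end);

	/* The same walk is how reserve_bootmem_region() is applied at boot: */
	for_each_reserved_mem_region(i, &start, &end)
		reserve_bootmem_region(start, end);
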