From 83da7510058736c09a14b9c17ec7d851940a4332 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 18 Apr 2014 15:07:10 -0700 Subject: vmscan: reclaim_clean_pages_from_list() must use mod_zone_page_state() Seems to be called with preemption enabled. Therefore it must use mod_zone_page_state instead. Signed-off-by: Christoph Lameter Reported-by: Grygorii Strashko Tested-by: Grygorii Strashko Cc: Tejun Heo Cc: Santosh Shilimkar Cc: Ingo Molnar Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9b6497eda806..3f56c8deb3c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1158,7 +1158,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, TTU_UNMAP|TTU_IGNORE_ACCESS, &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; } -- cgit v1.2.3 From 82c04ff89eba09d0e46e3f3649c6d3aa18e764a0 Mon Sep 17 00:00:00 2001 From: Peter Foley Date: Fri, 18 Apr 2014 15:07:11 -0700 Subject: init/Kconfig: move the trusted keyring config option to general setup The SYSTEM_TRUSTED_KEYRING config option is not in any menu, causing it to show up in the toplevel of the kernel configuration. Fix this by moving it under the General Setup menu. Signed-off-by: Peter Foley Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 765018c24cf9..9d3585bb2a7a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1646,6 +1646,18 @@ config MMAP_ALLOW_UNINITIALIZED See Documentation/nommu-mmap.txt for more information. +config SYSTEM_TRUSTED_KEYRING + bool "Provide system-wide ring of trusted keys" + depends on KEYS + help + Provide a system keyring to which trusted keys can be added. Keys in + the keyring are considered to be trusted. Keys may be added at will + by the kernel from compiled-in data and from hardware key stores, but + userspace may only add extra keys if those keys can be verified by + keys already in the keyring. + + Keys in this keyring are used by module signature checking. + config PROFILING bool "Profiling support" help @@ -1681,18 +1693,6 @@ config BASE_SMALL default 0 if BASE_FULL default 1 if !BASE_FULL -config SYSTEM_TRUSTED_KEYRING - bool "Provide system-wide ring of trusted keys" - depends on KEYS - help - Provide a system keyring to which trusted keys can be added. Keys in - the keyring are considered to be trusted. Keys may be added at will - by the kernel from compiled-in data and from hardware key stores, but - userspace may only add extra keys if those keys can be verified by - keys already in the keyring. - - Keys in this keyring are used by module signature checking. - menuconfig MODULES bool "Enable loadable module support" option modules -- cgit v1.2.3 From 7861144b8cb217634d738e94a748deeae139a1e2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 18 Apr 2014 15:07:12 -0700 Subject: kernel/watchdog.c:touch_softlockup_watchdog(): use raw_cpu_write() Fix: BUG: using __this_cpu_write() in preemptible [00000000] code: systemd-udevd/497 caller is __this_cpu_preempt_check+0x13/0x20 CPU: 3 PID: 497 Comm: systemd-udevd Tainted: G W 3.15.0-rc1 #9 Hardware name: Hewlett-Packard HP EliteBook 8470p/179B, BIOS 68ICF Ver. F.02 04/27/2012 Call Trace: check_preemption_disabled+0xe1/0xf0 __this_cpu_preempt_check+0x13/0x20 touch_nmi_watchdog+0x28/0x40 Reported-by: Luis Henriques Tested-by: Luis Henriques Cc: Eric Piel Cc: Robert Moore Cc: Lv Zheng Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e90089fd78e0..516203e665fc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -138,7 +138,11 @@ static void __touch_watchdog(void) void touch_softlockup_watchdog(void) { - __this_cpu_write(watchdog_touch_ts, 0); + /* + * Preemption can be enabled. It doesn't matter which CPU's timestamp + * gets zeroed here, so use the raw_ operation. + */ + raw_cpu_write(watchdog_touch_ts, 0); } EXPORT_SYMBOL(touch_softlockup_watchdog); -- cgit v1.2.3 From 12c743eb2289bcaace32859d4919417ff5707768 Mon Sep 17 00:00:00 2001 From: Mike Qiu Date: Fri, 18 Apr 2014 15:07:14 -0700 Subject: powerpc/mm: fix ".__node_distance" undefined CHK include/config/kernel.release CHK include/generated/uapi/linux/version.h CHK include/generated/utsrelease.h ... Building modules, stage 2. WARNING: 1 bad relocations c0000000013d6a30 R_PPC64_ADDR64 uprobes_fetch_type_table WRAP arch/powerpc/boot/zImage.pseries WRAP arch/powerpc/boot/zImage.epapr MODPOST 1849 modules ERROR: ".__node_distance" [drivers/block/nvme.ko] undefined! make[1]: *** [__modpost] Error 1 make: *** [modules] Error 2 make: *** Waiting for unfinished jobs.... The reason is symbol "__node_distance" not been exported in powerpc. Signed-off-by: Mike Qiu Acked-by: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Nathan Fontenot Cc: Stephen Rothwell Cc: Srivatsa S. Bhat Cc: Jesse Larrew Cc: Robert Jennings Cc: Alistair Popple Cc: Mike Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/numa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 4ebbb9e99286..3b181b22cd46 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -232,6 +232,7 @@ int __node_distance(int a, int b) return distance; } +EXPORT_SYMBOL(__node_distance); static void initialize_distance_lookup_table(int nid, const __be32 *associativity) -- cgit v1.2.3 From 8f28ed92d9314b98dc2033df770f5e6b85c5ffb7 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 18 Apr 2014 15:07:15 -0700 Subject: Documentation/vm/numa_memory_policy.txt: fix wrong document in numa_memory_policy.txt In document numa_memory_policy.txt, the following examples for flag MPOL_F_RELATIVE_NODES are incorrect. For example, consider a task that is attached to a cpuset with mems 2-5 that sets an Interleave policy over the same set with MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the interleave now occurs over nodes 3,5-6. If the cpuset's mems then change to 0,2-3,5, then the interleave occurs over nodes 0,3,5. According to the comment of the patch adding flag MPOL_F_RELATIVE_NODES, the nodemasks the user specifies should be considered relative to the current task's mems_allowed. (https://lkml.org/lkml/2008/2/29/428) And according to numa_memory_policy.txt, if the user's nodemask includes nodes that are outside the range of the new set of allowed nodes, then the remap wraps around to the beginning of the nodemask and, if not already set, sets the node in the mempolicy nodemask. So in the example, if the user specifies 2-5, for a task whose mems_allowed is 3-7, the nodemasks should be remapped the third, fourth, fifth, sixth node in mems_allowed. like the following: mems_allowed: 3 4 5 6 7 relative index: 0 1 2 3 4 5 So the nodemasks should be remapped to 3,5-7, but not 3,5-6. And for a task whose mems_allowed is 0,2-3,5, the nodemasks should be remapped to 0,2-3,5, but not 0,3,5. mems_allowed: 0 2 3 5 relative index: 0 1 2 3 4 5 Signed-off-by: Tang Chen Cc: Randy Dunlap Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/numa_memory_policy.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt index 4e7da6543424..badb0507608f 100644 --- a/Documentation/vm/numa_memory_policy.txt +++ b/Documentation/vm/numa_memory_policy.txt @@ -174,7 +174,6 @@ Components of Memory Policies allocation fails, the kernel will search other nodes, in order of increasing distance from the preferred node based on information provided by the platform firmware. - containing the cpu where the allocation takes place. Internally, the Preferred policy uses a single node--the preferred_node member of struct mempolicy. When the internal @@ -275,9 +274,9 @@ Components of Memory Policies For example, consider a task that is attached to a cpuset with mems 2-5 that sets an Interleave policy over the same set with MPOL_F_RELATIVE_NODES. If the cpuset's mems change to 3-7, the - interleave now occurs over nodes 3,5-6. If the cpuset's mems + interleave now occurs over nodes 3,5-7. If the cpuset's mems then change to 0,2-3,5, then the interleave occurs over nodes - 0,3,5. + 0,2-3,5. Thanks to the consistent remapping, applications preparing nodemasks to specify memory policies using this flag should -- cgit v1.2.3 From 9cc236827fde5e254fd995a0023c05c5ee3a3ba6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 18 Apr 2014 15:07:16 -0700 Subject: Shiraz has moved shiraz.hashim@st.com email-id doesn't exist anymore as he has left the company. Replace ST's id with shiraz.linux.kernel@gmail.com. It also updates .mailmap file to fix address for 'git shortlog'. Signed-off-by: Viresh Kumar Cc: Shiraz Hashim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 1 + MAINTAINERS | 2 +- arch/arm/boot/dts/spear320-hmi.dts | 2 +- arch/arm/mach-spear/headsmp.S | 2 +- arch/arm/mach-spear/platsmp.c | 2 +- arch/arm/mach-spear/time.c | 2 +- drivers/gpio/gpio-spear-spics.c | 4 ++-- drivers/irqchip/spear-shirq.c | 2 +- drivers/mtd/devices/spear_smi.c | 4 ++-- drivers/pwm/pwm-spear.c | 4 ++-- include/linux/mtd/spear_smi.h | 2 +- 11 files changed, 14 insertions(+), 13 deletions(-) diff --git a/.mailmap b/.mailmap index 658003aa9446..df1baba43a64 100644 --- a/.mailmap +++ b/.mailmap @@ -99,6 +99,7 @@ Sachin P Sant Sam Ravnborg Sascha Hauer S.Çağlar Onur +Shiraz Hashim Simon Kelley Stéphane Witzmann Stephen Hemminger diff --git a/MAINTAINERS b/MAINTAINERS index 80399fff805d..e67ea2442041 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8315,7 +8315,7 @@ F: include/linux/compiler.h SPEAR PLATFORM SUPPORT M: Viresh Kumar -M: Shiraz Hashim +M: Shiraz Hashim L: spear-devel@list.st.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.st.com/spear diff --git a/arch/arm/boot/dts/spear320-hmi.dts b/arch/arm/boot/dts/spear320-hmi.dts index 3075d2d3a8be..0aa6fef5ce22 100644 --- a/arch/arm/boot/dts/spear320-hmi.dts +++ b/arch/arm/boot/dts/spear320-hmi.dts @@ -1,7 +1,7 @@ /* * DTS file for SPEAr320 Evaluation Baord * - * Copyright 2012 Shiraz Hashim + * Copyright 2012 Shiraz Hashim * * The code contained herein is licensed under the GNU General Public * License. You may obtain a copy of the GNU General Public License diff --git a/arch/arm/mach-spear/headsmp.S b/arch/arm/mach-spear/headsmp.S index ed85473a047f..c52192dc3d9f 100644 --- a/arch/arm/mach-spear/headsmp.S +++ b/arch/arm/mach-spear/headsmp.S @@ -3,7 +3,7 @@ * * Picked from realview * Copyright (c) 2012 ST Microelectronics Limited - * Shiraz Hashim + * Shiraz Hashim * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c index 5c4a19887b2b..c19751fff2c6 100644 --- a/arch/arm/mach-spear/platsmp.c +++ b/arch/arm/mach-spear/platsmp.c @@ -4,7 +4,7 @@ * based upon linux/arch/arm/mach-realview/platsmp.c * * Copyright (C) 2012 ST Microelectronics Ltd. - * Shiraz Hashim + * Shiraz Hashim * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/arch/arm/mach-spear/time.c b/arch/arm/mach-spear/time.c index 218ba5b67d92..64790353951f 100644 --- a/arch/arm/mach-spear/time.c +++ b/arch/arm/mach-spear/time.c @@ -2,7 +2,7 @@ * arch/arm/plat-spear/time.c * * Copyright (C) 2010 ST Microelectronics - * Shiraz Hashim + * Shiraz Hashim * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any diff --git a/drivers/gpio/gpio-spear-spics.c b/drivers/gpio/gpio-spear-spics.c index e9a0415834ea..30bcc539425d 100644 --- a/drivers/gpio/gpio-spear-spics.c +++ b/drivers/gpio/gpio-spear-spics.c @@ -2,7 +2,7 @@ * SPEAr platform SPI chipselect abstraction over gpiolib * * Copyright (C) 2012 ST Microelectronics - * Shiraz Hashim + * Shiraz Hashim * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any @@ -205,6 +205,6 @@ static int __init spics_gpio_init(void) } subsys_initcall(spics_gpio_init); -MODULE_AUTHOR("Shiraz Hashim "); +MODULE_AUTHOR("Shiraz Hashim "); MODULE_DESCRIPTION("ST Microlectronics SPEAr SPI Chip Select Abstraction"); MODULE_LICENSE("GPL"); diff --git a/drivers/irqchip/spear-shirq.c b/drivers/irqchip/spear-shirq.c index 8527743b5cef..3fdda3a40269 100644 --- a/drivers/irqchip/spear-shirq.c +++ b/drivers/irqchip/spear-shirq.c @@ -5,7 +5,7 @@ * Viresh Kumar * * Copyright (C) 2012 ST Microelectronics - * Shiraz Hashim + * Shiraz Hashim * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any diff --git a/drivers/mtd/devices/spear_smi.c b/drivers/mtd/devices/spear_smi.c index 363da96e6891..c4176b0f382d 100644 --- a/drivers/mtd/devices/spear_smi.c +++ b/drivers/mtd/devices/spear_smi.c @@ -6,7 +6,7 @@ * * Copyright © 2010 STMicroelectronics. * Ashish Priyadarshi - * Shiraz Hashim + * Shiraz Hashim * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any @@ -1089,5 +1089,5 @@ static struct platform_driver spear_smi_driver = { module_platform_driver(spear_smi_driver); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ashish Priyadarshi, Shiraz Hashim "); +MODULE_AUTHOR("Ashish Priyadarshi, Shiraz Hashim "); MODULE_DESCRIPTION("MTD SMI driver for serial nor flash chips"); diff --git a/drivers/pwm/pwm-spear.c b/drivers/pwm/pwm-spear.c index 8ad26b8bf418..cb2d4f0f9711 100644 --- a/drivers/pwm/pwm-spear.c +++ b/drivers/pwm/pwm-spear.c @@ -2,7 +2,7 @@ * ST Microelectronics SPEAr Pulse Width Modulator driver * * Copyright (C) 2012 ST Microelectronics - * Shiraz Hashim + * Shiraz Hashim * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any @@ -264,6 +264,6 @@ static struct platform_driver spear_pwm_driver = { module_platform_driver(spear_pwm_driver); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Shiraz Hashim "); +MODULE_AUTHOR("Shiraz Hashim "); MODULE_AUTHOR("Viresh Kumar "); MODULE_ALIAS("platform:spear-pwm"); diff --git a/include/linux/mtd/spear_smi.h b/include/linux/mtd/spear_smi.h index 8ae1726044c3..581603ac1277 100644 --- a/include/linux/mtd/spear_smi.h +++ b/include/linux/mtd/spear_smi.h @@ -1,6 +1,6 @@ /* * Copyright © 2010 ST Microelectronics - * Shiraz Hashim + * Shiraz Hashim * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any -- cgit v1.2.3 From 8b32201de1f87878ace971bfdc2846a4f3a5bb2b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Apr 2014 15:07:17 -0700 Subject: wait: explain the shadowing and type inconsistencies Stick in a comment before someone else tries to fix the sparse warning this generates. Suggested-by: Andrew Morton Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-o2ro6f3vkxklni0bc8f7m68s@git.kernel.org Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index e7d9d9ed14f5..bd68819f0815 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -191,11 +191,23 @@ wait_queue_head_t *bit_waitqueue(void *, int); (!__builtin_constant_p(state) || \ state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ +/* + * The below macro ___wait_event() has an explicit shadow of the __ret + * variable when used from the wait_event_*() macros. + * + * This is so that both can use the ___wait_cond_timeout() construct + * to wrap the condition. + * + * The type inconsistency of the wait_event_*() __ret variable is also + * on purpose; we use long where we can return timeout values and int + * otherwise. + */ + #define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ wait_queue_t __wait; \ - long __ret = ret; \ + long __ret = ret; /* explicit shadow */ \ \ INIT_LIST_HEAD(&__wait.task_list); \ if (exclusive) \ -- cgit v1.2.3 From 7848a4bf51b34f41fcc9bd77e837126d99ae84e3 Mon Sep 17 00:00:00 2001 From: "Mizuma, Masayoshi" Date: Fri, 18 Apr 2014 15:07:18 -0700 Subject: mm/hugetlb.c: add cond_resched_lock() in return_unused_surplus_pages() soft lockup in freeing gigantic hugepage fixed in commit 55f67141a892 "mm: hugetlb: fix softlockup when a large number of hugepages are freed." can happen in return_unused_surplus_pages(), so let's fix it. Signed-off-by: Masayoshi Mizuma Signed-off-by: Naoya Horiguchi Cc: Joonsoo Kim Cc: Michal Hocko Cc: Aneesh Kumar Cc: KOSAKI Motohiro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd30f22b35e0..246192929a2d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1172,6 +1172,7 @@ static void return_unused_surplus_pages(struct hstate *h, while (nr_pages--) { if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) break; + cond_resched_lock(&hugetlb_lock); } } -- cgit v1.2.3 From 8229f1a044894f84324292608c149f0b4563532b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 18 Apr 2014 15:07:19 -0700 Subject: mips: export flush_icache_range The lkdtm module performs tests against executable memory ranges, so it needs to flush the icache for proper behaviors. Other architectures already export this, so do the same for MIPS. [akpm@linux-foundation.org: relocate export sites] Signed-off-by: Kees Cook Cc: Paul Gortmaker Cc: Ralf Baechle Cc: Sanjay Lal Cc: John Crispin Cc: Sergei Shtylyov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index e422b38d3113..9e67cdea3c74 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -29,15 +29,15 @@ void (*flush_cache_range)(struct vm_area_struct *vma, unsigned long start, void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn); void (*flush_icache_range)(unsigned long start, unsigned long end); +EXPORT_SYMBOL_GPL(flush_icache_range); void (*local_flush_icache_range)(unsigned long start, unsigned long end); void (*__flush_cache_vmap)(void); void (*__flush_cache_vunmap)(void); void (*__flush_kernel_vmap_range)(unsigned long vaddr, int size); -void (*__invalidate_kernel_vmap_range)(unsigned long vaddr, int size); - EXPORT_SYMBOL_GPL(__flush_kernel_vmap_range); +void (*__invalidate_kernel_vmap_range)(unsigned long vaddr, int size); /* MIPS specific cache operations */ void (*flush_cache_sigtramp)(unsigned long addr); -- cgit v1.2.3 From 29c7787075c92ca8af353acd5301481e6f37082f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 18 Apr 2014 15:07:21 -0700 Subject: mm: use paravirt friendly ops for NUMA hinting ptes David Vrabel identified a regression when using automatic NUMA balancing under Xen whereby page table entries were getting corrupted due to the use of native PTE operations. Quoting him Xen PV guest page tables require that their entries use machine addresses if the preset bit (_PAGE_PRESENT) is set, and (for successful migration) non-present PTEs must use pseudo-physical addresses. This is because on migration MFNs in present PTEs are translated to PFNs (canonicalised) so they may be translated back to the new MFN in the destination domain (uncanonicalised). pte_mknonnuma(), pmd_mknonnuma(), pte_mknuma() and pmd_mknuma() set and clear the _PAGE_PRESENT bit using pte_set_flags(), pte_clear_flags(), etc. In a Xen PV guest, these functions must translate MFNs to PFNs when clearing _PAGE_PRESENT and translate PFNs to MFNs when setting _PAGE_PRESENT. His suggested fix converted p[te|md]_[set|clear]_flags to using paravirt-friendly ops but this is overkill. He suggested an alternative of using p[te|md]_modify in the NUMA page table operations but this is does more work than necessary and would require looking up a VMA for protections. This patch modifies the NUMA page table operations to use paravirt friendly operations to set/clear the flags of interest. Unfortunately this will take a performance hit when updating the PTEs on CONFIG_PARAVIRT but I do not see a way around it that does not break Xen. Signed-off-by: Mel Gorman Acked-by: David Vrabel Tested-by: David Vrabel Cc: Ingo Molnar Cc: Peter Anvin Cc: Fengguang Wu Cc: Linus Torvalds Cc: Steven Noonan Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Srikar Dronamraju Cc: Cyrill Gorcunov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 1ec08c198b66..a8015a7a55bb 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -693,24 +693,35 @@ static inline int pmd_numa(pmd_t pmd) #ifndef pte_mknonnuma static inline pte_t pte_mknonnuma(pte_t pte) { - pte = pte_clear_flags(pte, _PAGE_NUMA); - return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED); + pteval_t val = pte_val(pte); + + val &= ~_PAGE_NUMA; + val |= (_PAGE_PRESENT|_PAGE_ACCESSED); + return __pte(val); } #endif #ifndef pmd_mknonnuma static inline pmd_t pmd_mknonnuma(pmd_t pmd) { - pmd = pmd_clear_flags(pmd, _PAGE_NUMA); - return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED); + pmdval_t val = pmd_val(pmd); + + val &= ~_PAGE_NUMA; + val |= (_PAGE_PRESENT|_PAGE_ACCESSED); + + return __pmd(val); } #endif #ifndef pte_mknuma static inline pte_t pte_mknuma(pte_t pte) { - pte = pte_set_flags(pte, _PAGE_NUMA); - return pte_clear_flags(pte, _PAGE_PRESENT); + pteval_t val = pte_val(pte); + + val &= ~_PAGE_PRESENT; + val |= _PAGE_NUMA; + + return __pte(val); } #endif @@ -729,8 +740,12 @@ static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, #ifndef pmd_mknuma static inline pmd_t pmd_mknuma(pmd_t pmd) { - pmd = pmd_set_flags(pmd, _PAGE_NUMA); - return pmd_clear_flags(pmd, _PAGE_PRESENT); + pmdval_t val = pmd_val(pmd); + + val &= ~_PAGE_PRESENT; + val |= _PAGE_NUMA; + + return __pmd(val); } #endif -- cgit v1.2.3 From a663dad65f3f41a0f9a8d0f2f1bc352524c4ff65 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 18 Apr 2014 15:07:22 -0700 Subject: mm: fix CONFIG_DEBUG_VM_RB description This appears to be a copy/paste error. Update the description to reflect extra rbtree debug and checks for the config option instead of duplicating CONFIG_DEBUG_VM. Signed-off-by: Davidlohr Bueso Cc: Aswin Chandramouleeswaran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 140b66a874c1..819ac51202c0 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -505,8 +505,7 @@ config DEBUG_VM_RB bool "Debug VM red-black trees" depends on DEBUG_VM help - Enable this to turn on more extended checks in the virtual-memory - system that may impact performance. + Enable VM red-black tree debugging information and extra validations. If unsure, say N. -- cgit v1.2.3 From b59b8cbca6291a23928f1a103defba03d1cdab2c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Apr 2014 15:07:23 -0700 Subject: mm: fix new kernel-doc warning in filemap.c Fix new kernel-doc warning in mm/filemap.c: Warning(mm/filemap.c:2600): Excess function parameter 'ppos' description in '__generic_file_aio_write' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index a82fbe4c9e8e..5020b280a771 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2581,7 +2581,6 @@ EXPORT_SYMBOL(generic_perform_write); * @iocb: IO state structure (file, offset, etc.) * @iov: vector with data to write * @nr_segs: number of segments in the vector - * @ppos: position where to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates -- cgit v1.2.3 From b5a8cad376eebbd8598642697e92a27983aee802 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 18 Apr 2014 15:07:25 -0700 Subject: thp: close race between split and zap huge pages Sasha Levin has reported two THP BUGs[1][2]. I believe both of them have the same root cause. Let's look to them one by one. The first bug[1] is "kernel BUG at mm/huge_memory.c:1829!". It's BUG_ON(mapcount != page_mapcount(page)) in __split_huge_page(). From my testing I see that page_mapcount() is higher than mapcount here. I think it happens due to race between zap_huge_pmd() and page_check_address_pmd(). page_check_address_pmd() misses PMD which is under zap: CPU0 CPU1 zap_huge_pmd() pmdp_get_and_clear() __split_huge_page() anon_vma_interval_tree_foreach() __split_huge_page_splitting() page_check_address_pmd() mm_find_pmd() /* * We check if PMD present without taking ptl: no * serialization against zap_huge_pmd(). We miss this PMD, * it's not accounted to 'mapcount' in __split_huge_page(). */ pmd_present(pmd) == 0 BUG_ON(mapcount != page_mapcount(page)) // CRASH!!! page_remove_rmap(page) atomic_add_negative(-1, &page->_mapcount) The second bug[2] is "kernel BUG at mm/huge_memory.c:1371!". It's VM_BUG_ON_PAGE(!PageHead(page), page) in zap_huge_pmd(). This happens in similar way: CPU0 CPU1 zap_huge_pmd() pmdp_get_and_clear() page_remove_rmap(page) atomic_add_negative(-1, &page->_mapcount) __split_huge_page() anon_vma_interval_tree_foreach() __split_huge_page_splitting() page_check_address_pmd() mm_find_pmd() pmd_present(pmd) == 0 /* The same comment as above */ /* * No crash this time since we already decremented page->_mapcount in * zap_huge_pmd(). */ BUG_ON(mapcount != page_mapcount(page)) /* * We split the compound page here into small pages without * serialization against zap_huge_pmd() */ __split_huge_page_refcount() VM_BUG_ON_PAGE(!PageHead(page), page); // CRASH!!! So my understanding the problem is pmd_present() check in mm_find_pmd() without taking page table lock. The bug was introduced by me commit with commit 117b0791ac42. Sorry for that. :( Let's open code mm_find_pmd() in page_check_address_pmd() and do the check under page table lock. Note that __page_check_address() does the same for PTE entires if sync != 0. I've stress tested split and zap code paths for 36+ hours by now and don't see crashes with the patch applied. Before it took <20 min to trigger the first bug and few hours for second one (if we ignore first). [1] https://lkml.kernel.org/g/<53440991.9090001@oracle.com> [2] https://lkml.kernel.org/g/<5310C56C.60709@oracle.com> Signed-off-by: Kirill A. Shutemov Reported-by: Sasha Levin Tested-by: Sasha Levin Cc: Bob Liu Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Michel Lespinasse Cc: Dave Jones Cc: Vlastimil Babka Cc: [3.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 64635f5278ff..b4b1feba6472 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1536,16 +1536,23 @@ pmd_t *page_check_address_pmd(struct page *page, enum page_check_address_pmd_flag flag, spinlock_t **ptl) { + pgd_t *pgd; + pud_t *pud; pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) return NULL; - pmd = mm_find_pmd(mm, address); - if (!pmd) + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) return NULL; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + *ptl = pmd_lock(mm, pmd); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) goto unlock; if (pmd_page(*pmd) != page) goto unlock; -- cgit v1.2.3