From 8609d1b5daa36350e020e737946c40887af1743a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Nov 2015 17:07:55 -0800 Subject: x86/mm: Turn CONFIG_X86_PTDUMP into a module Being able to examine page tables is handy, so make this a module that can be loaded as needed. Signed-off-by: Kees Cook Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vladimir Murzin Cc: Will Deacon Link: http://lkml.kernel.org/r/20151120010755.GA9060@www.outflux.net Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 2 +- arch/x86/mm/Makefile | 1 + arch/x86/mm/debug_pagetables.c | 46 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/dump_pagetables.c | 34 ++----------------------------- 4 files changed, 50 insertions(+), 33 deletions(-) create mode 100644 arch/x86/mm/debug_pagetables.c (limited to 'arch') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 137dfa96aa14..110253ce83af 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -69,7 +69,7 @@ config X86_PTDUMP_CORE def_bool n config X86_PTDUMP - bool "Export kernel pagetable layout to userspace via debugfs" + tristate "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL select DEBUG_FS select X86_PTDUMP_CORE diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 65c47fda26fc..f9d38a48e3c8 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c new file mode 100644 index 000000000000..b35ee86a9316 --- /dev/null +++ b/arch/x86/mm/debug_pagetables.c @@ -0,0 +1,46 @@ +#include +#include +#include +#include + +static int ptdump_show(struct seq_file *m, void *v) +{ + ptdump_walk_pgd_level(m, NULL); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .owner = THIS_MODULE, + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *pe; + +static int __init pt_dump_debug_init(void) +{ + pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + &ptdump_fops); + if (!pe) + return -ENOMEM; + + return 0; +} + +static void __exit pt_dump_debug_exit(void) +{ + debugfs_remove_recursive(pe); +} + +module_init(pt_dump_debug_init); +module_exit(pt_dump_debug_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven "); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a035c2aa7801..90a1dc054873 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -426,38 +426,15 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) { ptdump_walk_pgd_level_core(m, pgd, false); } +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); void ptdump_walk_pgd_level_checkwx(void) { ptdump_walk_pgd_level_core(NULL, NULL, true); } -#ifdef CONFIG_X86_PTDUMP -static int ptdump_show(struct seq_file *m, void *v) +static int __init pt_dump_init(void) { - ptdump_walk_pgd_level(m, NULL); - return 0; -} - -static int ptdump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - -static int pt_dump_init(void) -{ -#ifdef CONFIG_X86_PTDUMP - struct dentry *pe; -#endif - #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; @@ -468,13 +445,6 @@ static int pt_dump_init(void) address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif -#ifdef CONFIG_X86_PTDUMP - pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, - &ptdump_fops); - if (!pe) - return -ENOMEM; -#endif - return 0; } -- cgit v1.2.3 From c332813b51cbe807d539bb059b81235abf1e3fdd Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 1 Dec 2015 21:44:50 +0100 Subject: x86/mm/mtrr: Mark the 'range_new' static variable in mtrr_calc_range_state() as __initdata 'range_new' doesn't seem to be used after init. It is only passed to memset(), sum_ranges(), memcmp() and x86_get_mtrr_mem_range(), the latter of which also only passes it on to various *range* library functions. So mark it __initdata to free up an extra page after init. Its contents are wiped at every call to mtrr_calc_range_state(), so it being static is not about preserving state between calls, but simply to avoid a 4k+ stack frame. While there, add a comment explaining this and why it's safe. We could also mark nr_range_new as __initdata, but since it's just a single int and also doesn't carry state between calls (it is unconditionally assigned to before it is read), we might as well make it an ordinary automatic variable. Signed-off-by: Rasmus Villemoes Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/1449002691-20783-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 70d7c93f4550..0d98503c2245 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -593,9 +593,16 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size, unsigned long x_remove_base, unsigned long x_remove_size, int i) { - static struct range range_new[RANGE_NUM]; + /* + * range_new should really be an automatic variable, but + * putting 4096 bytes on the stack is frowned upon, to put it + * mildly. It is safe to make it a static __initdata variable, + * since mtrr_calc_range_state is only called during init and + * there's no way it will call itself recursively. + */ + static struct range range_new[RANGE_NUM] __initdata; unsigned long range_sums_new; - static int nr_range_new; + int nr_range_new; int num_reg; /* Convert ranges to var ranges state: */ -- cgit v1.2.3 From 071ac0c4e8e90d5de05f0779b03ae69ce84820d5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 Nov 2015 13:12:59 +0100 Subject: x86/mm/ptdump: Make (debugfs)/kernel_page_tables read-only File should be created with S_IRUSR and not with S_IWUSR too because writing to it doesn't make any sense. I mean, we don't have a ->write method anyway but let's have the permissions correct too. Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448885579-32506-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/debug_pagetables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index b35ee86a9316..bfcffdf6c577 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -26,7 +26,7 @@ static struct dentry *pe; static int __init pt_dump_debug_init(void) { - pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, &ptdump_fops); if (!pe) return -ENOMEM; -- cgit v1.2.3 From 8dd3303001976aa8583bf20f6b93590c74114308 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 4 Dec 2015 14:07:05 +0100 Subject: x86/mm: Introduce max_possible_pfn max_possible_pfn will be used for tracking max possible PFN for memory that isn't present in E820 table and could be hotplugged later. By default max_possible_pfn is initialized with max_pfn, but later it could be updated with highest PFN of hotpluggable memory ranges declared in ACPI SRAT table if any present. Signed-off-by: Igor Mammedov Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: fujita.tomonori@lab.ntt.co.jp Cc: konrad.wilk@oracle.com Cc: pbonzini@redhat.com Cc: revers@redhat.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1449234426-273049-2-git-send-email-imammedo@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 ++ arch/x86/mm/srat.c | 2 ++ include/linux/bootmem.h | 4 ++++ mm/bootmem.c | 1 + mm/nobootmem.c | 1 + 5 files changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 29db25f9a745..16a846548ece 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1048,6 +1048,8 @@ void __init setup_arch(char **cmdline_p) if (mtrr_trim_uncached_memory(max_pfn)) max_pfn = e820_end_of_ram_pfn(); + max_possible_pfn = max_pfn; + #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index c2aea63bee20..b5f821881465 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -203,6 +203,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", (unsigned long long)start, (unsigned long long)end - 1); + max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1)); + return 0; out_err_bad_srat: bad_srat(); diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index f589222bfa87..35b22f94d2d2 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -19,6 +19,10 @@ extern unsigned long min_low_pfn; * highest page */ extern unsigned long max_pfn; +/* + * highest possible page + */ +extern unsigned long long max_possible_pfn; #ifndef CONFIG_NO_BOOTMEM /* diff --git a/mm/bootmem.c b/mm/bootmem.c index 3b6380784c28..91e32bc8517f 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -33,6 +33,7 @@ EXPORT_SYMBOL(contig_page_data); unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; +unsigned long long max_possible_pfn; bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e57cf24babd6..99feb2b07fc5 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(contig_page_data); unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; +unsigned long long max_possible_pfn; static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) -- cgit v1.2.3 From ec941c5ffede4d788b9fc008f9eeca75b9e964f5 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 4 Dec 2015 14:07:06 +0100 Subject: x86/mm/64: Enable SWIOTLB if system has SRAT memory regions above MAX_DMA32_PFN when memory hotplug enabled system is booted with less than 4GB of RAM and then later more RAM is hotplugged 32-bit devices stop functioning with following error: nommu_map_single: overflow 327b4f8c0+1522 of device mask ffffffff the reason for this is that if x86_64 system were booted with RAM less than 4GB, it doesn't enable SWIOTLB and when memory is hotplugged beyond MAX_DMA32_PFN, devices that expect 32-bit addresses can't handle 64-bit addresses. Fix it by tracking max possible PFN when parsing memory affinity structures from SRAT ACPI table and enable SWIOTLB if there is hotpluggable memory regions beyond MAX_DMA32_PFN. It fixes KVM guests when they use emulated devices (reproduces with ata_piix, e1000 and usb devices, RHBZ: 1275941, 1275977, 1271527) It also fixes the HyperV, VMWare with emulated devices which are affected by this issue as well. Signed-off-by: Igor Mammedov Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: fujita.tomonori@lab.ntt.co.jp Cc: konrad.wilk@oracle.com Cc: pbonzini@redhat.com Cc: revers@redhat.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1449234426-273049-3-git-send-email-imammedo@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index adf0392d549a..7c577a178859 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -88,7 +88,7 @@ int __init pci_swiotlb_detect_4gb(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if (!no_iommu && max_pfn > MAX_DMA32_PFN) + if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif return swiotlb; -- cgit v1.2.3 From 0d430e3fb3f7cdc13c0d22078b820f682821b45a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 22 Dec 2015 08:42:44 -0700 Subject: x86/LDT: Print the real LDT base address This was meant to print base address and entry count; make it do so again. Fixes: 37868fe113ff "x86/ldt: Make modify_ldt synchronous" Signed-off-by: Jan Beulich Acked-by: Andy Lutomirski Link: http://lkml.kernel.org/r/56797D8402000078000C24F0@prv-mh.provo.novell.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e835d263a33b..b9d99e0f82c4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -125,7 +125,7 @@ void release_thread(struct task_struct *dead_task) if (dead_task->mm->context.ldt) { pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, - dead_task->mm->context.ldt, + dead_task->mm->context.ldt->entries, dead_task->mm->context.ldt->size); BUG(); } -- cgit v1.2.3 From 9abb0ecdee69a2577560cc283368e490da974934 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Mon, 21 Dec 2015 12:01:14 -0800 Subject: x86/mm: Drop WARN from multi-BAR check ioremapping multiple BARs produces a warning with a message "Your kernel is fine". This message mostly serves to comfort kernel developers. Users do not read the message, they only see the big scary warning which means something must be horribly broken with their system. Less dramatically, the warn also sets the taint flag which makes it difficult to differentiate problems. If the kernel is actually fine as the warning claims it doesn't make sense for it to be tainted. Change the WARN_ONCE to a pr_warn with the caller of the ioremap. Signed-off-by: Laura Abbott Link: http://lkml.kernel.org/r/1450728074-31029-1-git-send-email-labbott@fedoraproject.org Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index b9c78f3bcd67..0d8d53d1f5cc 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -194,8 +194,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Check if the request spans more than any BAR in the iomem resource * tree. */ - WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), - KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + if (iomem_map_sanity_check(unaligned_phys_addr, unaligned_size)) + pr_warn("caller %pS mapping multiple BARs\n", caller); return ret_addr; err_free_area: -- cgit v1.2.3 From d9fe4fab11976e56b2e992980bf6ce948bdf02ac Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 22 Dec 2015 17:54:23 -0700 Subject: x86/mm/pat: Add untrack_pfn_moved for mremap mremap() with MREMAP_FIXED on a VM_PFNMAP range causes the following WARN_ON_ONCE() message in untrack_pfn(). WARNING: CPU: 1 PID: 3493 at arch/x86/mm/pat.c:985 untrack_pfn+0xbd/0xd0() Call Trace: [] dump_stack+0x45/0x57 [] warn_slowpath_common+0x86/0xc0 [] warn_slowpath_null+0x1a/0x20 [] untrack_pfn+0xbd/0xd0 [] unmap_single_vma+0x80e/0x860 [] unmap_vmas+0x55/0xb0 [] unmap_region+0xac/0x120 [] do_munmap+0x28a/0x460 [] move_vma+0x1b3/0x2e0 [] SyS_mremap+0x3b3/0x510 [] entry_SYSCALL_64_fastpath+0x12/0x71 MREMAP_FIXED moves a pfnmap from old vma to new vma. untrack_pfn() is called with the old vma after its pfnmap page table has been removed, which causes follow_phys() to fail. The new vma has a new pfnmap to the same pfn & cache type with VM_PAT set. Therefore, we only need to clear VM_PAT from the old vma in this case. Add untrack_pfn_moved(), which clears VM_PAT from a given old vma. move_vma() is changed to call this function with the old vma when VM_PFNMAP is set. move_vma() then calls do_munmap(), and untrack_pfn() is a no-op since VM_PAT is cleared. Reported-by: Stas Sergeev Signed-off-by: Toshi Kani Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Borislav Petkov Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1450832064-10093-2-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pat.c | 10 ++++++++++ include/asm-generic/pgtable.h | 10 +++++++++- mm/mremap.c | 4 ++++ 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 188e3e07eeeb..1aca073ba571 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -992,6 +992,16 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, vma->vm_flags &= ~VM_PAT; } +/* + * untrack_pfn_moved is called, while mremapping a pfnmap for a new region, + * with the old vma after its pfnmap page table has been removed. The new + * vma has a new pfnmap to the same pfn & cache type with VM_PAT set. + */ +void untrack_pfn_moved(struct vm_area_struct *vma) +{ + vma->vm_flags &= ~VM_PAT; +} + pgprot_t pgprot_writecombine(pgprot_t prot) { return __pgprot(pgprot_val(prot) | diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 14b0ff32fb9f..3a6803cb0ec9 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -569,7 +569,7 @@ static inline int track_pfn_copy(struct vm_area_struct *vma) } /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ @@ -577,6 +577,13 @@ static inline void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { } + +/* + * untrack_pfn_moved is called while mremapping a pfnmap for a new region. + */ +static inline void untrack_pfn_moved(struct vm_area_struct *vma) +{ +} #else extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, @@ -586,6 +593,7 @@ extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); +extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif #ifdef __HAVE_COLOR_ZERO_PAGE diff --git a/mm/mremap.c b/mm/mremap.c index c25bc6268e46..de824e72c3e8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -319,6 +319,10 @@ static unsigned long move_vma(struct vm_area_struct *vma, hiwater_vm = mm->hiwater_vm; vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + /* Tell pfnmap has moved from this vma */ + if (unlikely(vma->vm_flags & VM_PFNMAP)) + untrack_pfn_moved(vma); + if (do_munmap(mm, old_addr, old_len) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); -- cgit v1.2.3 From 2039e6acaf94d83ec6b6d9f3d0bce7ea1f099918 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 22 Dec 2015 17:54:24 -0700 Subject: x86/mm/pat: Change free_memtype() to support shrinking case Using mremap() to shrink the map size of a VM_PFNMAP range causes the following error message, and leaves the pfn range allocated. x86/PAT: test:3493 freeing invalid memtype [mem 0x483200000-0x4863fffff] This is because rbt_memtype_erase(), called from free_memtype() with spin_lock held, only supports to free a whole memtype node in memtype_rbroot. Therefore, this patch changes rbt_memtype_erase() to support a request that shrinks the size of a memtype node for mremap(). memtype_rb_exact_match() is renamed to memtype_rb_match(), and is enhanced to support EXACT_MATCH and END_MATCH in @match_type. Since the memtype_rbroot tree allows overlapping ranges, rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free a whole node for the munmap case. If no such entry is found, it then checks with END_MATCH, i.e. shrink the size of a node from the end for the mremap case. On the mremap case, rbt_memtype_erase() proceeds in two steps, 1) remove the node, and then 2) insert the updated node. This allows proper update of augmented values, subtree_max_end, in the tree. Signed-off-by: Toshi Kani Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Borislav Petkov Cc: stsp@list.ru Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1450832064-10093-3-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pat.c | 2 +- arch/x86/mm/pat_rbtree.c | 52 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1aca073ba571..031782e74231 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -586,7 +586,7 @@ int free_memtype(u64 start, u64 end) entry = rbt_memtype_erase(start, end); spin_unlock(&memtype_lock); - if (!entry) { + if (IS_ERR(entry)) { pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, start, end - 1); return -EINVAL; diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 63931080366a..2f7702253ccf 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -98,8 +98,13 @@ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, return last_lower; /* Returns NULL if there is no overlap */ } -static struct memtype *memtype_rb_exact_match(struct rb_root *root, - u64 start, u64 end) +enum { + MEMTYPE_EXACT_MATCH = 0, + MEMTYPE_END_MATCH = 1 +}; + +static struct memtype *memtype_rb_match(struct rb_root *root, + u64 start, u64 end, int match_type) { struct memtype *match; @@ -107,7 +112,12 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root, while (match != NULL && match->start < end) { struct rb_node *node; - if (match->start == start && match->end == end) + if ((match_type == MEMTYPE_EXACT_MATCH) && + (match->start == start) && (match->end == end)) + return match; + + if ((match_type == MEMTYPE_END_MATCH) && + (match->start < start) && (match->end == end)) return match; node = rb_next(&match->rb); @@ -117,7 +127,7 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root, match = NULL; } - return NULL; /* Returns NULL if there is no exact match */ + return NULL; /* Returns NULL if there is no match */ } static int memtype_rb_check_conflict(struct rb_root *root, @@ -210,12 +220,36 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end) { struct memtype *data; - data = memtype_rb_exact_match(&memtype_rbroot, start, end); - if (!data) - goto out; + /* + * Since the memtype_rbroot tree allows overlapping ranges, + * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free + * a whole node for the munmap case. If no such entry is found, + * it then checks with END_MATCH, i.e. shrink the size of a node + * from the end for the mremap case. + */ + data = memtype_rb_match(&memtype_rbroot, start, end, + MEMTYPE_EXACT_MATCH); + if (!data) { + data = memtype_rb_match(&memtype_rbroot, start, end, + MEMTYPE_END_MATCH); + if (!data) + return ERR_PTR(-EINVAL); + } + + if (data->start == start) { + /* munmap: erase this node */ + rb_erase_augmented(&data->rb, &memtype_rbroot, + &memtype_rb_augment_cb); + } else { + /* mremap: update the end value of this node */ + rb_erase_augmented(&data->rb, &memtype_rbroot, + &memtype_rb_augment_cb); + data->end = start; + data->subtree_max_end = data->end; + memtype_rb_insert(&memtype_rbroot, data); + return NULL; + } - rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); -out: return data; } -- cgit v1.2.3 From 1f1a89ac05f6e88aa341e86e57435fdbb1177c0c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 8 Jan 2016 09:55:33 +0000 Subject: x86/mm: Micro-optimise clflush_cache_range() Whilst inspecting the asm for clflush_cache_range() and some perf profiles that required extensive flushing of single cachelines (from part of the intel-gpu-tools GPU benchmarks), we noticed that gcc was reloading boot_cpu_data.x86_clflush_size on every iteration of the loop. We can manually hoist that read which perf regarded as taking ~25% of the function time for a single cacheline flush. Signed-off-by: Chris Wilson Reviewed-by: Ross Zwisler Acked-by: "H. Peter Anvin" Cc: Toshi Kani Cc: Borislav Petkov Cc: Luis R. Rodriguez Cc: Stephen Rothwell Cc: Sai Praneeth Link: http://lkml.kernel.org/r/1452246933-10890-1-git-send-email-chris@chris-wilson.co.uk Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3137a4feed1..6000ad7f560c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -129,14 +129,16 @@ within(unsigned long addr, unsigned long start, unsigned long end) */ void clflush_cache_range(void *vaddr, unsigned int size) { - unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; + void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); void *vend = vaddr + size; - void *p; + + if (p >= vend) + return; mb(); - for (p = (void *)((unsigned long)vaddr & ~clflush_mask); - p < vend; p += boot_cpu_data.x86_clflush_size) + for (; p < vend; p += clflush_size) clflushopt(p); mb(); -- cgit v1.2.3