From 8625147cafaa9ba74713d682f5185eb62cb2aedb Mon Sep 17 00:00:00 2001
From: James Houghton
Date: Tue, 18 Oct 2022 20:01:25 +0000
Subject: hugetlbfs: don't delete error page from pagecache

This change is very similar to the change that was made for shmem [1],
and it solves the same problem, but for HugeTLBFS instead.

Currently, when poison is found in a HugeTLB page, the page is removed
from the page cache.  That means that attempting to map or read that
hugepage in the future will result in a new hugepage being allocated
instead of notifying the user that the page was poisoned.  As [1]
states, this is effectively memory corruption.

The fix is to leave the page in the page cache.  If the user attempts
to use a poisoned HugeTLB page with a syscall, the syscall will fail
with EIO, the same error code that shmem uses.  For attempts to map
the page, the thread will get a BUS_MCEERR_AR SIGBUS.

[1]: commit a76054266661 ("mm: shmem: don't truncate page if memory
     failure happens")

Link: https://lkml.kernel.org/r/20221018200125.848471-1-jthoughton@google.com
Signed-off-by: James Houghton
Reviewed-by: Mike Kravetz
Reviewed-by: Naoya Horiguchi
Tested-by: Naoya Horiguchi
Reviewed-by: Yang Shi
Cc: Axel Rasmussen
Cc: James Houghton
Cc: Miaohe Lin
Cc: Muchun Song
Cc:
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c        | 4 ++++
 mm/memory-failure.c | 5 ++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..e48f8ef45b17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6111,6 +6111,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 
 	ptl = huge_pte_lock(h, dst_mm, dst_pte);
 
+	ret = -EIO;
+	if (PageHWPoison(page))
+		goto out_release_unlock;
+
 	/*
 	 * We allow to overwrite a pte marker: consider when both MISSING|WP
 	 * registered, we firstly wr-protect a none pte which has no page cache
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..bead6bccc7f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1080,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
 	int res;
 	struct page *hpage = compound_head(p);
 	struct address_space *mapping;
+	bool extra_pins = false;
 
 	if (!PageHuge(hpage))
 		return MF_DELAYED;
@@ -1087,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
 	mapping = page_mapping(hpage);
 	if (mapping) {
 		res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+		/* The page is kept in page cache. */
+		extra_pins = true;
 		unlock_page(hpage);
 	} else {
 		unlock_page(hpage);
@@ -1104,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
 		}
 	}
 
-	if (has_extra_refcount(ps, p, false))
+	if (has_extra_refcount(ps, p, extra_pins))
 		res = MF_FAILED;
 
 	return res;
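The user-visible effect is easiest to see from userspace.  Below is a
minimal sketch, not part of the commit: the /mnt/huge hugetlbfs mount,
the 2 MiB hugepage size, and software injection via MADV_HWPOISON are
all assumptions, and the injection needs CAP_SYS_ADMIN plus
CONFIG_MEMORY_FAILURE.

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static void on_sigbus(int sig, siginfo_t *info, void *ctx)
{
	/* BUS_MCEERR_AR: memory error consumed on access */
	if (info->si_code == BUS_MCEERR_AR)
		write(STDERR_FILENO, "got BUS_MCEERR_AR\n", 18);
	_exit(0);
}

int main(void)
{
	size_t sz = 2UL << 20;	/* assumed 2 MiB hugepage size */
	char buf[4096];
	struct sigaction sa = { .sa_sigaction = on_sigbus,
				.sa_flags = SA_SIGINFO };
	/* hypothetical hugetlbfs mount point */
	int fd = open("/mnt/huge/poison-test", O_CREAT | O_RDWR, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	sigaction(SIGBUS, &sa, NULL);
	ftruncate(fd, sz);

	char *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;			/* fault the hugepage in */

	/* software-inject poison; needs CAP_SYS_ADMIN */
	if (madvise(p, sz, MADV_HWPOISON)) {
		perror("madvise");
		return 1;
	}

	/*
	 * With the fix, the poisoned page stays in the page cache, so
	 * read() fails with EIO instead of returning fresh zeroed memory.
	 */
	if (read(fd, buf, sizeof(buf)) < 0)
		perror("read");		/* expected: EIO */

	p[0] = 2;	/* refault: expect SIGBUS with BUS_MCEERR_AR */
	return 1;
}

Before the patch, the read() after injection could quietly succeed
against a freshly allocated, zeroed hugepage, which is exactly the
silent corruption described above.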
From cc674ab3c0188002917c8a2c28e4424131f1fd7e Mon Sep 17 00:00:00 2001
From: Li Zetao
Date: Fri, 28 Oct 2022 15:37:17 +0800
Subject: mm/mmap: fix memory leak in mmap_region()

There is a memory leak reported by kmemleak:

  unreferenced object 0xffff88817231ce40 (size 224):
    comm "mount.cifs", pid 19308, jiffies 4295917571 (age 405.880s)
    hex dump (first 32 bytes):
      00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
      60 c0 b2 00 81 88 ff ff 98 83 01 42 81 88 ff ff  `..........B....
    backtrace:
      [] __alloc_file+0x21/0x250
      [] alloc_empty_file+0x41/0xf0
      [] alloc_file+0x59/0x710
      [] alloc_file_pseudo+0x154/0x210
      [] __shmem_file_setup+0xff/0x2a0
      [] shmem_zero_setup+0x8d/0x160
      [] mmap_region+0x1075/0x19d0
      [] do_mmap+0x727/0x1110
      [] vm_mmap_pgoff+0x112/0x1e0
      [] do_syscall_64+0x35/0x80
      [] entry_SYSCALL_64_after_hwframe+0x46/0xb0

The root cause was traced to an error handling path in mmap_region()
when arch_validate_flags() or mas_preallocate() fails.  For a shared
anonymous mapping, the vma is set up and mapped with a new shared
anonymous file via shmem_zero_setup(), so in that case the file
reference needs to be released as well.

Fix it by calling fput(vma->vm_file) and unmap_region() when
arch_validate_flags() or mas_preallocate() returns an error for a
shared anonymous mapping.

Link: https://lkml.kernel.org/r/20221028073717.1179380-1-lizetao1@huawei.com
Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree")
Fixes: c462ac288f2c ("mm: Introduce arch_validate_flags()")
Signed-off-by: Li Zetao
Reviewed-by: Liam R. Howlett
Signed-off-by: Andrew Morton
---
 mm/mmap.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 2def55555e05..c3c5c1d6103d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2674,6 +2674,8 @@ cannot_expand:
 		error = -EINVAL;
 		if (file)
 			goto close_and_free_vma;
+		else if (vma->vm_file)
+			goto unmap_and_free_vma;
 		else
 			goto free_vma;
 	}
@@ -2682,6 +2684,8 @@ cannot_expand:
 		error = -ENOMEM;
 		if (file)
 			goto close_and_free_vma;
+		else if (vma->vm_file)
+			goto unmap_and_free_vma;
 		else
 			goto free_vma;
 	}
@@ -2751,7 +2755,7 @@ unmap_and_free_vma:
 
 	/* Undo any partial mapping done by a device driver. */
 	unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end);
-	if (vm_flags & VM_SHARED)
+	if (file && (vm_flags & VM_SHARED))
 		mapping_unmap_writable(file->f_mapping);
 free_vma:
 	vm_area_free(vma);
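The fix follows the usual goto-unwind idiom: each later failure point
must jump to a label that releases everything acquired before it,
including the file reference that shmem_zero_setup() attached to the
vma.  A self-contained sketch with stand-in types (illustrative only,
not the kernel code):

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the kernel objects involved. */
struct file { const char *name; };
struct vma  { struct file *vm_file; };

/* shmem_zero_setup() stand-in: attaches a file the vma now owns. */
static struct file *shmem_file_stub(void)
{
	struct file *f = malloc(sizeof(*f));

	if (f)
		f->name = "shmem-zero";
	return f;
}

/* fput() stand-in: drops the file reference. */
static void fput_stub(struct file *f)
{
	free(f);
}

static int map_region_stub(int validate_ok)
{
	struct vma *vma = malloc(sizeof(*vma));

	if (!vma)
		return -1;
	vma->vm_file = shmem_file_stub();
	if (!vma->vm_file)
		goto free_vma;

	if (!validate_ok)		 /* arch_validate_flags() stand-in */
		goto unmap_and_free_vma; /* must drop the file, not just the vma */

	printf("mapped %s\n", vma->vm_file->name);
	fput_stub(vma->vm_file);
	free(vma);
	return 0;

unmap_and_free_vma:
	fput_stub(vma->vm_file);	/* the release the buggy path skipped */
free_vma:
	free(vma);
	return -1;
}

int main(void)
{
	/*
	 * Force the late failure: without the fput_stub() above, the
	 * struct file would leak exactly as in the kmemleak report.
	 */
	return map_region_stub(0) == -1 ? 0 : 1;
}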
Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/huge_memory.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 561a42567477..811d19b5c4f6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2206,9 +2206,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); - /* NOTE: this may set soft-dirty too on some archs */ - if (dirty) - entry = pte_mkdirty(entry); + /* + * NOTE: we don't do pte_mkdirty when dirty==true + * because it breaks sparc64 which can sigsegv + * random process. Need to revisit when we figure + * out what is special with sparc64. + */ if (soft_dirty) entry = pte_mksoft_dirty(entry); if (uffd_wp) -- cgit v1.2.3 From 867400af90f1f953ff9e10b1b87ecaf9369a7eb8 Mon Sep 17 00:00:00 2001 From: Pankaj Gupta Date: Wed, 2 Nov 2022 11:07:28 -0500 Subject: mm/memremap.c: map FS_DAX device memory as decrypted virtio_pmem use devm_memremap_pages() to map the device memory. By default this memory is mapped as encrypted with SEV. Guest reboot changes the current encryption key and guest no longer properly decrypts the FSDAX device meta data. Mark the corresponding device memory region for FSDAX devices (mapped with memremap_pages) as decrypted to retain the persistent memory property. Link: https://lkml.kernel.org/r/20221102160728.3184016-1-pankaj.gupta@amd.com Fixes: b7b3c01b19159 ("mm/memremap_pages: support multiple ranges per invocation") Signed-off-by: Pankaj Gupta Cc: Dan Williams Cc: Tom Lendacky Cc: Signed-off-by: Andrew Morton --- mm/memremap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memremap.c b/mm/memremap.c index 421bec3a29ee..08cbf54fe037 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -335,6 +335,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "File system DAX not supported\n"); return ERR_PTR(-EINVAL); } + params.pgprot = pgprot_decrypted(params.pgprot); break; case MEMORY_DEVICE_GENERIC: break; -- cgit v1.2.3 From 93b0d9178743a68723babe8448981f658aebc58e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 2 Nov 2022 14:41:52 -0400 Subject: mm/shmem: use page_mapping() to detect page cache for uffd continue mfill_atomic_install_pte() checks page->mapping to detect whether one page is used in the page cache. However as pointed out by Matthew, the page can logically be a tail page rather than always the head in the case of uffd minor mode with UFFDIO_CONTINUE. It means we could wrongly install one pte with shmem thp tail page assuming it's an anonymous page. It's not that clear even for anonymous page, since normally anonymous pages also have page->mapping being setup with the anon vma. It's safe here only because the only such caller to mfill_atomic_install_pte() is always passing in a newly allocated page (mcopy_atomic_pte()), whose page->mapping is not yet setup. However that's not extremely obvious either. For either of above, use page_mapping() instead. 
From 93b0d9178743a68723babe8448981f658aebc58e Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Wed, 2 Nov 2022 14:41:52 -0400
Subject: mm/shmem: use page_mapping() to detect page cache for uffd continue

mfill_atomic_install_pte() checks page->mapping to detect whether one
page is used in the page cache.  However, as pointed out by Matthew,
the page can logically be a tail page rather than always the head in
the case of uffd minor mode with UFFDIO_CONTINUE.  That means we could
wrongly install one pte with a shmem thp tail page, assuming it's an
anonymous page.

It's not that clear even for anonymous pages, since normally anonymous
pages also have page->mapping set up, pointing to the anon vma.  It's
safe here only because the only such caller of
mfill_atomic_install_pte() always passes in a newly allocated page
(mcopy_atomic_pte()), whose page->mapping is not yet set up.  However,
that's not extremely obvious either.

In either case, use page_mapping() instead.

Link: https://lkml.kernel.org/r/Y2K+y7wnhC4vbnP2@x1n
Fixes: 153132571f02 ("userfaultfd/shmem: support UFFDIO_CONTINUE for shmem")
Signed-off-by: Peter Xu
Reported-by: Matthew Wilcox
Cc: Andrea Arcangeli
Cc: Hugh Dickins
Cc: Axel Rasmussen
Cc:
Signed-off-by: Andrew Morton
---
 mm/userfaultfd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3d0fef3980b3..650ab6cfd5f4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -64,7 +64,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 	pte_t _dst_pte, *dst_pte;
 	bool writable = dst_vma->vm_flags & VM_WRITE;
 	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
-	bool page_in_cache = page->mapping;
+	bool page_in_cache = page_mapping(page);
 	spinlock_t *ptl;
 	struct inode *inode;
 	pgoff_t offset, max_off;
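A sketch of why the raw field test is unreliable (an annotated helper
written for this discussion, not code from the patch):

#include <linux/mm.h>
#include <linux/pagemap.h>

static bool page_is_pagecache(struct page *page)
{
	/*
	 * page->mapping is only meaningful on a head page: compound tail
	 * pages carry TAIL_MAPPING poison in that field, and anonymous
	 * head pages store an anon_vma pointer with PAGE_MAPPING_ANON
	 * set.  Either way the raw field can be non-NULL for a page that
	 * is not in the page cache.
	 *
	 * page_mapping() first resolves compound_head(), returns NULL
	 * for anonymous (and movable) pages, and handles the swap cache,
	 * so it answers the question correctly even for a THP tail page.
	 */
	return page_mapping(page) != NULL;
}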
From db5e8d84319bcdb51e1d3cfa42b410291d6d1cfa Mon Sep 17 00:00:00 2001
From: Vasily Gorbik
Date: Wed, 2 Nov 2022 19:09:17 +0100
Subject: mm: hugetlb_vmemmap: include missing linux/moduleparam.h

The kernel test robot reported build failures with a 'randconfig' on
s390:

  >> mm/hugetlb_vmemmap.c:421:11: error: a function declaration without
  a prototype is deprecated in all versions of C
  [-Werror,-Wstrict-prototypes]
  core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
            ^

Link: https://lore.kernel.org/linux-mm/202210300751.rG3UDsuc-lkp@intel.com/
Link: https://lkml.kernel.org/r/patch.git-296b83ca939b.your-ad-here.call-01667411912-ext-5073@work.hours
Fixes: 30152245c63b ("mm: hugetlb_vmemmap: replace early_param() with core_param()")
Signed-off-by: Vasily Gorbik
Reported-by: kernel test robot
Reviewed-by: Muchun Song
Cc: Gerald Schaefer
Cc: Mike Kravetz
Cc:
Signed-off-by: Andrew Morton
---
 mm/hugetlb_vmemmap.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index ba2a2596fb4e..4962dd1ba4a6 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -11,6 +11,7 @@
 #define pr_fmt(fmt) "HugeTLB: " fmt
 
 #include <linux/pgtable.h>
+#include <linux/moduleparam.h>
 #include <linux/bootmem_info.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>

From cbadaf71f7cf9e67c073eec673c6c050cecd0ec8 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko
Date: Wed, 2 Nov 2022 12:06:07 +0100
Subject: kmsan: core: kmsan_in_runtime() should return true in NMI context

Without that, every call to __msan_poison_alloca() in NMI may end up
allocating memory, which is NMI-unsafe.

Link: https://lkml.kernel.org/r/20221102110611.1085175-1-glider@google.com
Link: https://lore.kernel.org/lkml/20221025221755.3810809-1-glider@google.com/
Signed-off-by: Alexander Potapenko
Acked-by: Peter Zijlstra (Intel)
Cc: Dmitry Vyukov
Cc: Marco Elver
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: Ingo Molnar
Cc: Kees Cook
Cc: Masahiro Yamada
Cc: Nick Desaulniers
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
---
 mm/kmsan/kmsan.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
index 7019c46d33a7..a14744205435 100644
--- a/mm/kmsan/kmsan.h
+++ b/mm/kmsan/kmsan.h
@@ -124,6 +124,8 @@ static __always_inline bool kmsan_in_runtime(void)
 {
 	if ((hardirq_count() >> HARDIRQ_SHIFT) > 1)
 		return true;
+	if (in_nmi())
+		return true;
 	return kmsan_get_context()->kmsan_in_runtime;
 }

From 1de09a7281edecfdba19b3a07417f6d65243ab5f Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 7 Nov 2022 16:50:00 +0000
Subject: mm/damon/dbgfs: check if rm_contexts input is for a real context

A user could write the name of a file under the 'damon/' debugfs
directory that is not a user-created context to the 'rm_contexts'
file.  In that case, 'dbgfs_rm_context()' assumes the input is a valid
DAMON context directory as long as a file of that name exists.  As a
result, invalid memory access could happen.

Fix the bug by checking if the given input is for a directory.  This
check can filter out non-context inputs because directories under the
'damon/' debugfs directory can be created only via the 'mk_contexts'
file.

This bug was found by syzbot [1].

[1] https://lore.kernel.org/damon/000000000000ede3ac05ec4abf8e@google.com/

Link: https://lkml.kernel.org/r/20221107165001.5717-2-sj@kernel.org
Fixes: 75c1c2b53c78 ("mm/damon/dbgfs: support multiple contexts")
Signed-off-by: SeongJae Park
Reported-by: syzbot+6087eafb76a94c4ac9eb@syzkaller.appspotmail.com
Cc: [5.15.x]
Signed-off-by: Andrew Morton
---
 mm/damon/dbgfs.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 6f0ae7d3ae39..b3f454a5c682 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -890,6 +890,7 @@ out:
 static int dbgfs_rm_context(char *name)
 {
 	struct dentry *root, *dir, **new_dirs;
+	struct inode *inode;
 	struct damon_ctx **new_ctxs;
 	int i, j;
 	int ret = 0;
@@ -905,6 +906,12 @@ static int dbgfs_rm_context(char *name)
 	if (!dir)
 		return -ENOENT;
 
+	inode = d_inode(dir);
+	if (!S_ISDIR(inode->i_mode)) {
+		ret = -EINVAL;
+		goto out_dput;
+	}
+
 	new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
 			GFP_KERNEL);
 	if (!new_dirs) {
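A minimal trigger in the spirit of the syzbot report (assumptions:
debugfs mounted at /sys/kernel/debug, CONFIG_DAMON_DBGFS enabled, and
'monitor_on' standing in for any regular file under damon/ that was
not created via mk_contexts):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/damon/rm_contexts", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * "monitor_on" names a regular file, not a context directory.
	 * Before the fix, dbgfs_rm_context() treated any existing dentry
	 * of that name as a context directory; with the fix the write
	 * fails with EINVAL instead.
	 */
	if (write(fd, "monitor_on", strlen("monitor_on")) < 0)
		perror("write");	/* expected after the fix: EINVAL */
	close(fd);
	return 0;
}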