From efd4149342db2df41b1bbe68972ead853b30e444 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Fri, 5 Aug 2022 12:00:03 -0400
Subject: mm/smaps: don't access young/dirty bit if pte unpresent

These bits should only be valid when the ptes are present.  Introduce two
booleans for them and set them to false when !pte_present(), for both the
pte and pmd accountings.

The bug was found during code reading and no real-world issue has been
reported, but logically such an error can cause incorrect readings on
quite a few fields of the smaps or smaps_rollup output.  For example, it
could cause over-estimates of values like Shared_Dirty, Private_Dirty and
Referenced, or under-estimates of values like LazyFree, Shared_Clean and
Private_Clean.

Link: https://lkml.kernel.org/r/20220805160003.58929-1-peterx@redhat.com
Fixes: b1d4d9e0cbd0 ("proc/smaps: carefully handle migration entries")
Fixes: c94b6923fa0a ("/proc/PID/smaps: Add PMD migration entry parsing")
Signed-off-by: Peter Xu
Reviewed-by: Vlastimil Babka
Reviewed-by: David Hildenbrand
Reviewed-by: Yang Shi
Cc: Konstantin Khlebnikov
Cc: Huang Ying
Signed-off-by: Andrew Morton
---
 fs/proc/task_mmu.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a3398d0f1927..4e0023643f8b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -527,10 +527,12 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
-	bool migration = false;
+	bool migration = false, young = false, dirty = false;
 
 	if (pte_present(*pte)) {
 		page = vm_normal_page(vma, addr, *pte);
+		young = pte_young(*pte);
+		dirty = pte_dirty(*pte);
 	} else if (is_swap_pte(*pte)) {
 		swp_entry_t swpent = pte_to_swp_entry(*pte);
 
@@ -560,8 +562,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	if (!page)
 		return;
 
-	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
-		      locked, migration);
+	smaps_account(mss, page, false, young, dirty, locked, migration);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
--
cgit v1.2.3
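For illustration only (not part of the patch): the fields named above are
exported through /proc/<pid>/smaps and /proc/<pid>/smaps_rollup, so a
minimal userspace sketch that dumps the counters this bug could have
skewed might look like this:

#include <stdio.h>
#include <string.h>

int main(void)
{
	static const char * const fields[] = {
		"Shared_Clean", "Shared_Dirty", "Private_Clean",
		"Private_Dirty", "Referenced", "LazyFree",
	};
	char line[256];
	size_t i, n;
	FILE *f = fopen("/proc/self/smaps_rollup", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			n = strlen(fields[i]);
			/* smaps field names are terminated by ':' */
			if (!strncmp(line, fields[i], n) && line[n] == ':')
				fputs(line, stdout);
		}
	}
	fclose(f);
	return 0;
}

Before the fix, a mapping containing migration entries could show inflated
Private_Dirty or Referenced values here, since bits of the swap entry were
misread as the young/dirty flags.
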
From f369b07c861435bd812a9d14493f71b34132ed6f Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Thu, 11 Aug 2022 16:13:40 -0400
Subject: mm/uffd: reset write protection when unregister with wp-mode

The motivation for this patch comes from a recent report and fix from
David Hildenbrand on the hugetlb shared handling of wr-protected pages
[1].

With the reproducer provided in the commit message of [1], one can
leverage the uffd-wp lazy reset of ptes to trigger a hugetlb issue which
can affect not only the attacking process, but the whole system.

The lazy-reset mechanism of uffd-wp was used to make unregister faster,
under the assumption that any leftover pgtable entries should only affect
the process itself: the user should be aware of anything it does, and it
should not affect anything outside the process.  But it turns out this is
not true, and the leftovers can also be utilized to make some exploits
easier.

So far there is no sign that the lazy reset is important to any
userfaultfd user, because normally the unregister will only happen once
for a specific range of memory in the lifecycle of the process.

Considering all of the above, this patch does an explicit pte reset when
unregistering an uffd region with wr-protect mode enabled.  For the user
it is the same as calling ioctl(UFFDIO_WRITEPROTECT, wp=false) right
before ioctl(UFFDIO_UNREGISTER), so it can potentially make unregister
slower.  From that point of view it is a very slight ABI change, but
hopefully nothing should break with it either.

Regarding the change itself: the core of the uffd write-[un]protect
operation is moved into a separate function, uffd_wp_range(), which is
reused in the unregister code path.

Note that the new function does not check anything, e.g. ranges or memory
types, because those should have been checked during the previous
UFFDIO_REGISTER, or the register would have failed already.  It also does
not check mmap_changing because the mmap write lock is held here anyway.

The Fixes tag points at the commit introducing uffd-wp for
shmem+hugetlbfs, because that is the only issue reported so far and the
commit from which David's reproducer starts to work (v5.19+).  But the
whole idea applies not only to file-backed memory but to anonymous memory
as well; it is just that anonymous memory does not need the fix before
v5.19 because there is no known way to exploit it there.  IOW, this patch
can also fix the issue reported in [1], as patch 2 of that series does.

[1] https://lore.kernel.org/all/20220811103435.188481-3-david@redhat.com/

Link: https://lkml.kernel.org/r/20220811201340.39342-1-peterx@redhat.com
Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
Signed-off-by: Peter Xu
Cc: David Hildenbrand
Cc: Mike Rapoport
Cc: Mike Kravetz
Cc: Andrea Arcangeli
Cc: Nadav Amit
Cc: Axel Rasmussen
Cc:
Signed-off-by: Andrew Morton
---
 fs/userfaultfd.c              |  4 ++++
 include/linux/userfaultfd_k.h |  2 ++
 mm/userfaultfd.c              | 29 ++++++++++++++++++-----------
 3 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 1c44bf75f916..175de70e3adf 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1601,6 +1601,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
 		}
 
+		/* Reset ptes for the whole vma range if wr-protected */
+		if (userfaultfd_wp(vma))
+			uffd_wp_range(mm, vma, start, vma_end - start, false);
+
 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 732b522bacb7..e1b8a915e9e9 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -73,6 +73,8 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, atomic_t *mmap_changing);
+extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+			  unsigned long start, unsigned long len, bool enable_wp);
 
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 07d3befc80e4..7327b2573f7c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -703,14 +703,29 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
 			      mmap_changing, 0);
 }
 
+void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+		   unsigned long start, unsigned long len, bool enable_wp)
+{
+	struct mmu_gather tlb;
+	pgprot_t newprot;
+
+	if (enable_wp)
+		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+	else
+		newprot = vm_get_page_prot(dst_vma->vm_flags);
+
+	tlb_gather_mmu(&tlb, dst_mm);
+	change_protection(&tlb, dst_vma, start, start + len, newprot,
+			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+	tlb_finish_mmu(&tlb);
+}
+
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 			unsigned long len, bool enable_wp,
 			atomic_t *mmap_changing)
 {
 	struct vm_area_struct *dst_vma;
 	unsigned long page_mask;
-	struct mmu_gather tlb;
-	pgprot_t newprot;
 	int err;
 
 	/*
@@ -750,15 +765,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 		goto out_unlock;
 	}
 
-	if (enable_wp)
-		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
-	else
-		newprot = vm_get_page_prot(dst_vma->vm_flags);
-
-	tlb_gather_mmu(&tlb, dst_mm);
-	change_protection(&tlb, dst_vma, start, start + len, newprot,
-			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
-	tlb_finish_mmu(&tlb);
+	uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
 
 	err = 0;
 out_unlock:
--
cgit v1.2.3
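
For illustration only (not part of the patch): a minimal userspace sketch
of the register/wr-protect/unregister lifecycle this change affects.  The
ioctl names and structures are from the userfaultfd ABI; error handling is
abbreviated, and creating a userfaultfd may require privilege or
vm.unprivileged_userfaultfd=1, plus a kernel with uffd-wp support.

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4096;
	long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
	};
	char *addr;

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
		exit(1);

	addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (addr == MAP_FAILED)
		exit(1);
	memset(addr, 0, len);	/* populate the ptes first */

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		exit(1);

	struct uffdio_writeprotect wp = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_WRITEPROTECT_MODE_WP,	/* wr-protect range */
	};
	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp))
		exit(1);

	/*
	 * With this patch, unregister itself resets any remaining uffd-wp
	 * protection in the page tables, equivalent to issuing
	 * ioctl(UFFDIO_WRITEPROTECT, wp=false) on the range just before.
	 */
	struct uffdio_range range = {
		.start = (unsigned long)addr, .len = len,
	};
	if (ioctl(uffd, UFFDIO_UNREGISTER, &range))
		exit(1);

	*addr = 1;	/* no stale wr-protect state is left behind */
	return 0;
}

The point of the sketch is the last two steps: userspace no longer has to
issue the wp=false UFFDIO_WRITEPROTECT itself, because UFFDIO_UNREGISTER
now performs the equivalent of uffd_wp_range(..., false) on the range.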