summaryrefslogtreecommitdiff
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
authorDavid Hildenbrand <david@redhat.com>2022-05-10 04:20:45 +0300
committerakpm <akpm@linux-foundation.org>2022-05-10 04:20:45 +0300
commita7f226604170acd6b142b76472c1a49c12ebb83d (patch)
tree2f1c5aa6fb51a9c61c74527fc95b41f41ddf080e /mm/hugetlb.c
parentc89357e27f20dda3fff6791d27bb6c91eae99f4a (diff)
downloadlinux-a7f226604170acd6b142b76472c1a49c12ebb83d.tar.xz
mm/gup: trigger FAULT_FLAG_UNSHARE when R/O-pinning a possibly shared anonymous page
Whenever GUP currently ends up taking a R/O pin on an anonymous page that might be shared -- mapped R/O and !PageAnonExclusive() -- any write fault on the page table entry will end up replacing the mapped anonymous page due to COW, resulting in the GUP pin no longer being consistent with the page actually mapped into the page table. The possible ways to deal with this situation are: (1) Ignore and pin -- what we do right now. (2) Fail to pin -- which would be rather surprising to callers and could break user space. (3) Trigger unsharing and pin the now exclusive page -- reliable R/O pins. Let's implement 3) because it provides the clearest semantics and allows for checking in unpin_user_pages() and friends for possible BUGs: when trying to unpin a page that's no longer exclusive, clearly something went very wrong and might result in memory corruptions that might be hard to debug. So we better have a nice way to spot such issues. This change implies that whenever user space *wrote* to a private mapping (IOW, we have an anonymous page mapped), that GUP pins will always remain consistent: reliable R/O GUP pins of anonymous pages. As a side note, this commit fixes the COW security issue for hugetlb with FOLL_PIN as documented in: https://lore.kernel.org/r/3ae33b08-d9ef-f846-56fb-645e3b9b4c66@redhat.com The vmsplice reproducer still applies, because vmsplice uses FOLL_GET instead of FOLL_PIN. Note that follow_huge_pmd() doesn't apply because we cannot end up in there with FOLL_PIN. This commit is heavily based on prototype patches by Andrea. Link: https://lkml.kernel.org/r/20220428083441.37290-17-david@redhat.com Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: David Hildenbrand <david@redhat.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Co-developed-by: Andrea Arcangeli <aarcange@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: David Rientjes <rientjes@google.com> Cc: Don Dutile <ddutile@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jan Kara <jack@suse.cz> Cc: Jann Horn <jannh@google.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Khalid Aziz <khalid.aziz@oracle.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Liang Zhang <zhangliang5@huawei.com> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport <rppt@linux.ibm.com> Cc: Nadav Amit <namit@vmware.com> Cc: Oded Gabbay <oded.gabbay@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com> Cc: Peter Xu <peterx@redhat.com> Cc: Rik van Riel <riel@surriel.com> Cc: Roman Gushchin <guro@fb.com> Cc: Shakeel Butt <shakeelb@google.com> Cc: Yang Shi <shy828301@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c27
1 files changed, 24 insertions, 3 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9cefa3dd9af3..5f8d3a77a2ec 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5982,6 +5982,25 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
}
}
+static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
+ bool *unshare)
+{
+ pte_t pteval = huge_ptep_get(pte);
+
+ *unshare = false;
+ if (is_swap_pte(pteval))
+ return true;
+ if (huge_pte_write(pteval))
+ return false;
+ if (flags & FOLL_WRITE)
+ return true;
+ if (gup_must_unshare(flags, pte_page(pteval))) {
+ *unshare = true;
+ return true;
+ }
+ return false;
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -5996,6 +6015,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
spinlock_t *ptl = NULL;
+ bool unshare = false;
int absent;
struct page *page;
@@ -6046,9 +6066,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* both cases, and because we can't follow correct pages
* directly from any kind of swap entries.
*/
- if (absent || is_swap_pte(huge_ptep_get(pte)) ||
- ((flags & FOLL_WRITE) &&
- !huge_pte_write(huge_ptep_get(pte)))) {
+ if (absent ||
+ __follow_hugetlb_must_fault(flags, pte, &unshare)) {
vm_fault_t ret;
unsigned int fault_flags = 0;
@@ -6056,6 +6075,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
+ else if (unshare)
+ fault_flags |= FAULT_FLAG_UNSHARE;
if (locked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_KILLABLE;