summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/hugetlbfs/inode.c66
-rw-r--r--mm/hugetlb.c102
-rw-r--r--mm/memory.c2
-rw-r--r--mm/rmap.c100
-rw-r--r--mm/userfaultfd.c9
5 files changed, 233 insertions, 46 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3bb1772fce2f..009ae539b9b2 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -434,6 +434,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
struct folio *folio, pgoff_t index)
{
struct rb_root_cached *root = &mapping->i_mmap;
+ struct hugetlb_vma_lock *vma_lock;
struct page *page = &folio->page;
struct vm_area_struct *vma;
unsigned long v_start;
@@ -444,7 +445,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
end = (index + 1) * pages_per_huge_page(h);
i_mmap_lock_write(mapping);
-
+retry:
+ vma_lock = NULL;
vma_interval_tree_foreach(vma, root, start, end - 1) {
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
@@ -452,11 +454,63 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
continue;
+ if (!hugetlb_vma_trylock_write(vma)) {
+ vma_lock = vma->vm_private_data;
+ /*
+ * If we can not get vma lock, we need to drop
+ * immap_sema and take locks in order. First,
+ * take a ref on the vma_lock structure so that
+ * we can be guaranteed it will not go away when
+ * dropping immap_sema.
+ */
+ kref_get(&vma_lock->refs);
+ break;
+ }
+
unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
NULL, ZAP_FLAG_DROP_MARKER);
+ hugetlb_vma_unlock_write(vma);
}
i_mmap_unlock_write(mapping);
+
+ if (vma_lock) {
+ /*
+ * Wait on vma_lock. We know it is still valid as we have
+ * a reference. We must 'open code' vma locking as we do
+ * not know if vma_lock is still attached to vma.
+ */
+ down_write(&vma_lock->rw_sema);
+ i_mmap_lock_write(mapping);
+
+ vma = vma_lock->vma;
+ if (!vma) {
+ /*
+ * If lock is no longer attached to vma, then just
+ * unlock, drop our reference and retry looking for
+ * other vmas.
+ */
+ up_write(&vma_lock->rw_sema);
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+ goto retry;
+ }
+
+ /*
+ * vma_lock is still attached to vma. Check to see if vma
+ * still maps page and if so, unmap.
+ */
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+ if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+ unmap_hugepage_range(vma, vma->vm_start + v_start,
+ v_end, NULL,
+ ZAP_FLAG_DROP_MARKER);
+
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+ hugetlb_vma_unlock_write(vma);
+
+ goto retry;
+ }
}
static void
@@ -474,11 +528,21 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
unsigned long v_start;
unsigned long v_end;
+ if (!hugetlb_vma_trylock_write(vma))
+ continue;
+
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
NULL, zap_flags);
+
+ /*
+ * Note that vma lock only exists for shared/non-private
+ * vmas. Therefore, lock is not held when calling
+ * unmap_hugepage_range for private vmas.
+ */
+ hugetlb_vma_unlock_write(vma);
}
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f44b79998ac2..d78504959df7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4796,6 +4796,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
mmu_notifier_invalidate_range_start(&range);
mmap_assert_write_locked(src);
raw_write_seqcount_begin(&src->write_protect_seq);
+ } else {
+ /*
+ * For shared mappings the vma lock must be held before
+ * calling huge_pte_offset in the src vma. Otherwise, the
+ * returned ptep could go away if part of a shared pmd and
+ * another thread calls huge_pmd_unshare.
+ */
+ hugetlb_vma_lock_read(src_vma);
}
last_addr_mask = hugetlb_mask_last_page(h);
@@ -4942,6 +4950,8 @@ again:
if (cow) {
raw_write_seqcount_end(&src->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
+ } else {
+ hugetlb_vma_unlock_read(src_vma);
}
return ret;
@@ -5000,6 +5010,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
/* Prevent race with file truncation */
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -5031,6 +5042,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
flush_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
+ hugetlb_vma_unlock_write(vma);
return len + old_addr - old_end;
}
@@ -5350,8 +5362,29 @@ retry_avoidcopy:
* may get SIGKILLed if it later faults.
*/
if (outside_reserve) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx;
+ u32 hash;
+
put_page(old_page);
+ /*
+ * Drop hugetlb_fault_mutex and vma_lock before
+ * unmapping. unmapping needs to hold vma_lock
+ * in write mode. Dropping vma_lock in read mode
+ * here is OK as COW mappings do not interact with
+ * PMD sharing.
+ *
+ * Reacquire both after unmap operation.
+ */
+ idx = vma_hugecache_offset(h, vma, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
unmap_ref_private(mm, vma, old_page, haddr);
+
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(vma);
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
@@ -5500,14 +5533,16 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
};
/*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
+ * vma_lock and hugetlb_fault_mutex must be
* dropped before handling userfault. Reacquire
* after handling fault to make calling code simpler.
*/
+ hugetlb_vma_unlock_read(vma);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, reason);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(vma);
return ret;
}
@@ -5741,6 +5776,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
+ /*
+ * Since we hold no locks, ptep could be stale. That is
+ * OK as we are only making decisions based on content and
+ * not actually modifying content here.
+ */
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, ptep);
@@ -5748,23 +5788,35 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
- } else {
- ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
- if (!ptep)
- return VM_FAULT_OOM;
}
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, haddr);
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ /*
+ * Acquire vma lock before calling huge_pte_alloc and hold
+ * until finished with ptep. This prevents huge_pmd_unshare from
+ * being called elsewhere and making the ptep no longer valid.
+ *
+ * ptep could have already be assigned via huge_pte_offset. That
+ * is OK, as huge_pte_alloc will return the same value unless
+ * something has changed.
+ */
+ hugetlb_vma_lock_read(vma);
+ ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
+ if (!ptep) {
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ return VM_FAULT_OOM;
+ }
+
entry = huge_ptep_get(ptep);
/* PTE markers should be handled the same way as none pte */
if (huge_pte_none_mostly(entry)) {
@@ -5825,6 +5877,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(pagecache_page);
put_page(pagecache_page);
}
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return handle_userfault(&vmf, VM_UFFD_WP);
}
@@ -5868,6 +5921,7 @@ out_ptl:
put_page(pagecache_page);
}
out_mutex:
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
* Generally it's safe to hold refcount during waiting page lock. But
@@ -6330,8 +6384,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
- last_addr_mask = hugetlb_mask_last_page(h);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
+ last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address, psize);
@@ -6430,6 +6485,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* See Documentation/mm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
@@ -6931,6 +6987,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ hugetlb_vma_assert_locked(vma);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
@@ -6943,6 +7000,31 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ return 1;
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
void hugetlb_vma_lock_release(struct kref *kref)
{
}
@@ -7325,6 +7407,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
ptep = huge_pte_offset(mm, address, sz);
@@ -7336,6 +7419,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
/*
* No need to call mmu_notifier_invalidate_range(), see
* Documentation/mm/mmu_notifier.rst.
diff --git a/mm/memory.c b/mm/memory.c
index c01c12500169..b3ed17219d77 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1684,10 +1684,12 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file) {
zap_flags_t zap_flags = details ?
details->zap_flags : 0;
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
__unmap_hugepage_range_final(tlb, vma, start, end,
NULL, zap_flags);
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
}
} else
unmap_page_range(tlb, vma, start, end, details);
diff --git a/mm/rmap.c b/mm/rmap.c
index 0e179c823e0a..b6743c2b8b5f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1551,24 +1551,39 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
* do this outside rmap routines.
+ *
+ * We also must hold hugetlb vma_lock in write mode.
+ * Lock order dictates acquiring vma_lock BEFORE
+ * i_mmap_rwsem. We can only try lock here and fail
+ * if unsuccessful.
*/
- VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
- if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
+ if (!anon) {
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (!hugetlb_vma_trylock_write(vma)) {
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
+ }
+ if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ flush_tlb_range(vma,
+ range.start, range.end);
+ mmu_notifier_invalidate_range(mm,
+ range.start, range.end);
+ /*
+ * The ref count of the PMD page was
+ * dropped which is part of the way map
+ * counting is done for shared PMDs.
+ * Return 'true' here. When there is
+ * no other sharing, huge_pmd_unshare
+ * returns false and we will unmap the
+ * actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ hugetlb_vma_unlock_write(vma);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
@@ -1926,26 +1941,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
* do this outside rmap routines.
+ *
+ * We also must hold hugetlb vma_lock in write mode.
+ * Lock order dictates acquiring vma_lock BEFORE
+ * i_mmap_rwsem. We can only try lock here and
+ * fail if unsuccessful.
*/
- VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
- if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
+ if (!anon) {
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (!hugetlb_vma_trylock_write(vma)) {
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
+ }
+ if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ flush_tlb_range(vma,
+ range.start, range.end);
+ mmu_notifier_invalidate_range(mm,
+ range.start, range.end);
+
+ /*
+ * The ref count of the PMD page was
+ * dropped which is part of the way map
+ * counting is done for shared PMDs.
+ * Return 'true' here. When there is
+ * no other sharing, huge_pmd_unshare
+ * returns false and we will unmap the
+ * actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ hugetlb_vma_unlock_write(vma);
}
-
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0fdbd2c05587..e24e8a47ce8a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -379,16 +379,21 @@ retry:
BUG_ON(dst_addr >= dst_start + len);
/*
- * Serialize via hugetlb_fault_mutex.
+ * Serialize via vma_lock and hugetlb_fault_mutex.
+ * vma_lock ensures the dst_pte remains valid even
+ * in the case of shared pmds. fault mutex prevents
+ * races with other faulting threads.
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(dst_vma);
err = -ENOMEM;
dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
if (!dst_pte) {
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out_unlock;
}
@@ -396,6 +401,7 @@ retry:
if (mode != MCOPY_ATOMIC_CONTINUE &&
!huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
err = -EEXIST;
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out_unlock;
}
@@ -404,6 +410,7 @@ retry:
dst_addr, src_addr, mode, &page,
wp_copy);
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
cond_resched();