Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  |  102
1 file changed, 93 insertions, 9 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f44b79998ac2..d78504959df7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4796,6 +4796,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
mmu_notifier_invalidate_range_start(&range);
mmap_assert_write_locked(src);
raw_write_seqcount_begin(&src->write_protect_seq);
+ } else {
+ /*
+ * For shared mappings the vma lock must be held before
+ * calling huge_pte_offset in the src vma. Otherwise, the
+ * returned ptep could go away if part of a shared pmd and
+ * another thread calls huge_pmd_unshare.
+ */
+ hugetlb_vma_lock_read(src_vma);
}
last_addr_mask = hugetlb_mask_last_page(h);
@@ -4942,6 +4950,8 @@ again:
if (cow) {
raw_write_seqcount_end(&src->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
+ } else {
+ hugetlb_vma_unlock_read(src_vma);
}
return ret;
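
Taken together, the two hunks above bracket the non-CoW path of copy_hugetlb_page_range() with the vma lock held in read mode, so a ptep returned by huge_pte_offset() in the src vma cannot disappear under a concurrent huge_pmd_unshare(). A condensed sketch of the resulting shape of the function (details elided; not the literal patched code):

	if (cow) {
		mmu_notifier_invalidate_range_start(&range);
		mmap_assert_write_locked(src);
		raw_write_seqcount_begin(&src->write_protect_seq);
	} else {
		/* shared mapping: keep shared PMDs stable for the src walk */
		hugetlb_vma_lock_read(src_vma);
	}

	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
		src_pte = huge_pte_offset(src, addr, sz);
		if (!src_pte)
			continue;
		/* ... copy or share the entry into dst ... */
	}

	if (cow) {
		raw_write_seqcount_end(&src->write_protect_seq);
		mmu_notifier_invalidate_range_end(&range);
	} else {
		hugetlb_vma_unlock_read(src_vma);
	}
	return ret;
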
@@ -5000,6 +5010,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
/* Prevent race with file truncation */
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -5031,6 +5042,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
flush_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
+ hugetlb_vma_unlock_write(vma);
return len + old_addr - old_end;
}
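
Both hunks in move_hugetlb_page_tables() follow the lock ordering assumed throughout this series: the hugetlb vma lock is taken before i_mmap_rwsem, and in write mode here because the move path may unshare PMDs. A minimal sketch of the bracket (not the full function):

	hugetlb_vma_lock_write(vma);	/* vma lock first ... */
	i_mmap_lock_write(mapping);	/* ... then i_mmap_rwsem */

	/* walk the old range with huge_pte_offset(), move/unshare entries */

	i_mmap_unlock_write(mapping);
	hugetlb_vma_unlock_write(vma);
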
@@ -5350,8 +5362,29 @@ retry_avoidcopy:
* may get SIGKILLed if it later faults.
*/
if (outside_reserve) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx;
+ u32 hash;
+
put_page(old_page);
+ /*
+ * Drop hugetlb_fault_mutex and vma_lock before
+ * unmapping. Unmapping needs to hold vma_lock
+ * in write mode. Dropping vma_lock in read mode
+ * here is OK as COW mappings do not interact with
+ * PMD sharing.
+ *
+ * Reacquire both after unmap operation.
+ */
+ idx = vma_hugecache_offset(h, vma, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
unmap_ref_private(mm, vma, old_page, haddr);
+
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(vma);
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
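
Because both locks are dropped across the unmap, the PTE must be looked up and validated again once they are reacquired, which is what the trailing context above leads into. A condensed view of the drop/operate/reacquire/revalidate sequence (hash and idx as computed above; not the literal patched code):

	hugetlb_vma_unlock_read(vma);
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);

	unmap_ref_private(mm, vma, old_page, haddr);	/* unmap path needs vma_lock in write mode */

	mutex_lock(&hugetlb_fault_mutex_table[hash]);	/* reacquire: fault mutex before vma lock */
	hugetlb_vma_lock_read(vma);

	spin_lock(ptl);
	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte)))
		goto retry_avoidcopy;	/* nothing changed under us: retry the CoW */
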
@@ -5500,14 +5533,16 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
};
/*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
+ * vma_lock and hugetlb_fault_mutex must be
* dropped before handling userfault. Reacquire
* after handling fault to make calling code simpler.
*/
+ hugetlb_vma_unlock_read(vma);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, reason);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(vma);
return ret;
}
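
The reworded comment reflects the state the caller hands in: hugetlb_no_page() reaches this helper with both the per-index fault mutex and the vma read lock held, and relies on getting them back on return. A sketch of that caller-side invariant, condensed from the hugetlb_no_page() call site and assuming its current argument list:

	/* in hugetlb_no_page(): fault mutex and vma read lock already held */
	if (userfaultfd_missing(vma)) {
		/* helper drops and retakes both locks around handle_userfault() */
		return hugetlb_handle_userfault(vma, mapping, idx, flags,
						haddr, address, VM_UFFD_MISSING);
	}
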
@@ -5741,6 +5776,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
+ /*
+ * Since we hold no locks, ptep could be stale. That is
+ * OK as we are only making decisions based on content and
+ * not actually modifying content here.
+ */
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, ptep);
@@ -5748,23 +5788,35 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
- } else {
- ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
- if (!ptep)
- return VM_FAULT_OOM;
}
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, haddr);
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ /*
+ * Acquire vma lock before calling huge_pte_alloc and hold
+ * until finished with ptep. This prevents huge_pmd_unshare from
+ * being called elsewhere and making the ptep no longer valid.
+ *
+ * ptep could have already been assigned via huge_pte_offset. That
+ * is OK, as huge_pte_alloc will return the same value unless
+ * something has changed.
+ */
+ hugetlb_vma_lock_read(vma);
+ ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
+ if (!ptep) {
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ return VM_FAULT_OOM;
+ }
+
entry = huge_ptep_get(ptep);
/* PTE markers should be handled the same way as none pte */
if (huge_pte_none_mostly(entry)) {
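
The comment above pins down the lock hierarchy for the fault path: the per-index fault mutex is outermost, the vma read lock nests inside it, and only then is ptep allocated and dereferenced. A condensed sketch of the resulting order in hugetlb_fault() (error and unlock paths elided except for the OOM case):

	mapping = vma->vm_file->f_mapping;
	idx = vma_hugecache_offset(h, vma, haddr);
	hash = hugetlb_fault_mutex_hash(mapping, idx);
	mutex_lock(&hugetlb_fault_mutex_table[hash]);	/* 1: serialize faults on this index */

	hugetlb_vma_lock_read(vma);			/* 2: keep shared PMDs from going away */
	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
	if (!ptep) {
		hugetlb_vma_unlock_read(vma);		/* unlock in reverse order */
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		return VM_FAULT_OOM;
	}

	/* ... fault handling; the page table lock (ptl) nests innermost ... */

	hugetlb_vma_unlock_read(vma);
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
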
@@ -5825,6 +5877,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(pagecache_page);
put_page(pagecache_page);
}
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return handle_userfault(&vmf, VM_UFFD_WP);
}
@@ -5868,6 +5921,7 @@ out_ptl:
put_page(pagecache_page);
}
out_mutex:
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
* Generally it's safe to hold refcount during waiting page lock. But
@@ -6330,8 +6384,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
- last_addr_mask = hugetlb_mask_last_page(h);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
+ last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address, psize);
@@ -6430,6 +6485,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* See Documentation/mm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
@@ -6931,6 +6987,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ hugetlb_vma_assert_locked(vma);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
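
With this assertion, huge_pmd_unshare() states its caller requirements in code: i_mmap_rwsem held for write plus the hugetlb vma lock (taken in write mode at the unshare sites elsewhere in this patch). An illustrative sketch of the minimum calling convention, not a real call site:

	hugetlb_vma_lock_write(vma);			/* vma lock before i_mmap_rwsem */
	i_mmap_lock_write(vma->vm_file->f_mapping);

	ptl = huge_pte_lock(hstate_vma(vma), mm, ptep);	/* page table lock, as before this series */
	huge_pmd_unshare(mm, vma, addr, ptep);
	spin_unlock(ptl);

	i_mmap_unlock_write(vma->vm_file->f_mapping);
	hugetlb_vma_unlock_write(vma);
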
@@ -6943,6 +7000,31 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ return 1;
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
void hugetlb_vma_lock_release(struct kref *kref)
{
}
@@ -7325,6 +7407,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
ptep = huge_pte_offset(mm, address, sz);
@@ -7336,6 +7419,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
/*
* No need to call mmu_notifier_invalidate_range(), see
* Documentation/mm/mmu_notifier.rst.