Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 3
-rw-r--r-- | mm/Makefile | 1
-rw-r--r-- | mm/compaction.c | 3
-rw-r--r-- | mm/filemap.c | 183
-rw-r--r-- | mm/frame_vector.c | 240
-rw-r--r-- | mm/highmem.c | 7
-rw-r--r-- | mm/huge_memory.c | 37
-rw-r--r-- | mm/hugetlb.c | 71
-rw-r--r-- | mm/kasan/hw_tags.c | 73
-rw-r--r-- | mm/kasan/init.c | 23
-rw-r--r-- | mm/kasan/kasan.h | 2
-rw-r--r-- | mm/khugepaged.c | 37
-rw-r--r-- | mm/madvise.c | 12
-rw-r--r-- | mm/memblock.c | 57
-rw-r--r-- | mm/memcontrol.c | 9
-rw-r--r-- | mm/memory-failure.c | 20
-rw-r--r-- | mm/memory.c | 318
-rw-r--r-- | mm/migrate.c | 29
-rw-r--r-- | mm/mmap.c | 8
-rw-r--r-- | mm/mmu_gather.c | 31
-rw-r--r-- | mm/mremap.c | 8
-rw-r--r-- | mm/nommu.c | 3
-rw-r--r-- | mm/oom_kill.c | 6
-rw-r--r-- | mm/page_alloc.c | 10
-rw-r--r-- | mm/page_io.c | 47
-rw-r--r-- | mm/percpu.c | 36
-rw-r--r-- | mm/shmem.c | 6
-rw-r--r-- | mm/slab.c | 20
-rw-r--r-- | mm/slab.h | 12
-rw-r--r-- | mm/slab_common.c | 75
-rw-r--r-- | mm/slob.c | 6
-rw-r--r-- | mm/slub.c | 69
-rw-r--r-- | mm/swapfile.c | 47
-rw-r--r-- | mm/util.c | 31
-rw-r--r-- | mm/vmalloc.c | 13
35 files changed, 792 insertions, 761 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index f730605b8dcf..24c045b24b95 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -804,9 +804,6 @@ config DEVICE_PRIVATE config VMAP_PFN bool -config FRAME_VECTOR - bool - config ARCH_USES_HIGH_VMA_FLAGS bool config ARCH_HAS_PKEYS diff --git a/mm/Makefile b/mm/Makefile index b6cd2fffa492..135bbb65511a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -110,7 +110,6 @@ obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o -obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o diff --git a/mm/compaction.c b/mm/compaction.c index e5acb9714436..190ccdaa6c19 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1342,7 +1342,7 @@ fast_isolate_freepages(struct compact_control *cc) { unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1); unsigned int nr_scanned = 0; - unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0; + unsigned long low_pfn, min_pfn, highest = 0; unsigned long nr_isolated = 0; unsigned long distance; struct page *page = NULL; @@ -1387,6 +1387,7 @@ fast_isolate_freepages(struct compact_control *cc) struct page *freepage; unsigned long flags; unsigned int order_scanned = 0; + unsigned long high_pfn = 0; if (!area->nr_free) continue; diff --git a/mm/filemap.c b/mm/filemap.c index 5c9d564317a5..6ff2a3fb0dc7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,6 +42,8 @@ #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -835,6 +837,7 @@ noinline int __add_to_page_cache_locked(struct page *page, XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); int error; + bool charged = false; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); @@ -848,6 +851,7 @@ noinline int __add_to_page_cache_locked(struct page *page, error = mem_cgroup_charge(page, current->mm, gfp); if (error) goto error; + charged = true; } gfp &= GFP_RECLAIM_MASK; @@ -896,6 +900,8 @@ unlock: if (xas_error(&xas)) { error = xas_error(&xas); + if (charged) + mem_cgroup_uncharge(page); goto error; } @@ -2911,74 +2917,163 @@ out_retry: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_fault *vmf, - pgoff_t start_pgoff, pgoff_t end_pgoff) +static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) { - struct file *file = vmf->vma->vm_file; + struct mm_struct *mm = vmf->vma->vm_mm; + + /* Huge page is mapped? No need to proceed. */ + if (pmd_trans_huge(*vmf->pmd)) { + unlock_page(page); + put_page(page); + return true; + } + + if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { + vm_fault_t ret = do_set_pmd(vmf, page); + if (!ret) { + /* The page is mapped successfully, reference consumed. 
*/ + unlock_page(page); + return true; + } + } + + if (pmd_none(*vmf->pmd)) { + vmf->ptl = pmd_lock(mm, vmf->pmd); + if (likely(pmd_none(*vmf->pmd))) { + mm_inc_nr_ptes(mm); + pmd_populate(mm, vmf->pmd, vmf->prealloc_pte); + vmf->prealloc_pte = NULL; + } + spin_unlock(vmf->ptl); + } + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) { + unlock_page(page); + put_page(page); + return true; + } + + return false; +} + +static struct page *next_uptodate_page(struct page *page, + struct address_space *mapping, + struct xa_state *xas, pgoff_t end_pgoff) +{ + unsigned long max_idx; + + do { + if (!page) + return NULL; + if (xas_retry(xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (PageLocked(page)) + continue; + if (!page_cache_get_speculative(page)) + continue; + /* Has the page moved or been split? */ + if (unlikely(page != xas_reload(xas))) + goto skip; + if (!PageUptodate(page) || PageReadahead(page)) + goto skip; + if (PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + if (page->mapping != mapping) + goto unlock; + if (!PageUptodate(page)) + goto unlock; + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (xas->xa_index >= max_idx) + goto unlock; + return page; +unlock: + unlock_page(page); +skip: + put_page(page); + } while ((page = xas_next_entry(xas, end_pgoff)) != NULL); + + return NULL; +} + +static inline struct page *first_map_page(struct address_space *mapping, + struct xa_state *xas, + pgoff_t end_pgoff) +{ + return next_uptodate_page(xas_find(xas, end_pgoff), + mapping, xas, end_pgoff); +} + +static inline struct page *next_map_page(struct address_space *mapping, + struct xa_state *xas, + pgoff_t end_pgoff) +{ + return next_uptodate_page(xas_next_entry(xas, end_pgoff), + mapping, xas, end_pgoff); +} + +vm_fault_t filemap_map_pages(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff) +{ + struct vm_area_struct *vma = vmf->vma; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - unsigned long max_idx; + unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); + vm_fault_t ret = 0; rcu_read_lock(); - xas_for_each(&xas, head, end_pgoff) { - if (xas_retry(&xas, head)) - continue; - if (xa_is_value(head)) - goto next; + head = first_map_page(mapping, &xas, end_pgoff); + if (!head) + goto out; - /* - * Check for a locked page first, as a speculative - * reference may adversely influence page migration. - */ - if (PageLocked(head)) - goto next; - if (!page_cache_get_speculative(head)) - goto next; + if (filemap_map_pmd(vmf, head)) { + ret = VM_FAULT_NOPAGE; + goto out; + } - /* Has the page moved or been split? 
*/ - if (unlikely(head != xas_reload(&xas))) - goto skip; + addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); + do { page = find_subpage(head, xas.xa_index); - - if (!PageUptodate(head) || - PageReadahead(page) || - PageHWPoison(page)) - goto skip; - if (!trylock_page(head)) - goto skip; - - if (head->mapping != mapping || !PageUptodate(head)) - goto unlock; - - max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); - if (xas.xa_index >= max_idx) + if (PageHWPoison(page)) goto unlock; if (mmap_miss > 0) mmap_miss--; - vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; - if (vmf->pte) - vmf->pte += xas.xa_index - last_pgoff; + addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; + vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - if (alloc_set_pte(vmf, page)) + + if (!pte_none(*vmf->pte)) goto unlock; + + /* We're about to handle the fault */ + if (vmf->address == addr) + ret = VM_FAULT_NOPAGE; + + do_set_pte(vmf, page, addr); + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, addr, vmf->pte); unlock_page(head); - goto next; + continue; unlock: unlock_page(head); -skip: put_page(head); -next: - /* Huge page is mapped? No need to proceed. */ - if (pmd_trans_huge(*vmf->pmd)) - break; - } + } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); + pte_unmap_unlock(vmf->pte, vmf->ptl); +out: rcu_read_unlock(); WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); + return ret; } EXPORT_SYMBOL(filemap_map_pages); diff --git a/mm/frame_vector.c b/mm/frame_vector.c deleted file mode 100644 index 10f82d5643b6..000000000000 --- a/mm/frame_vector.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/err.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/pagemap.h> -#include <linux/sched.h> - -/** - * get_vaddr_frames() - map virtual addresses to pfns - * @start: starting user address - * @nr_frames: number of pages / pfns from start to map - * @gup_flags: flags modifying lookup behaviour - * @vec: structure which receives pages / pfns of the addresses mapped. - * It should have space for at least nr_frames entries. - * - * This function maps virtual addresses from @start and fills @vec structure - * with page frame numbers or page pointers to corresponding pages (choice - * depends on the type of the vma underlying the virtual address). If @start - * belongs to a normal vma, the function grabs reference to each of the pages - * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't - * touch page structures and the caller must make sure pfns aren't reused for - * anything else while he is using them. - * - * The function returns number of pages mapped which may be less than - * @nr_frames. In particular we stop mapping if there are more vmas of - * different type underlying the specified range of virtual addresses. - * When the function isn't able to map a single page, it returns error. - * - * This function takes care of grabbing mmap_lock as necessary. 
- */ -int get_vaddr_frames(unsigned long start, unsigned int nr_frames, - unsigned int gup_flags, struct frame_vector *vec) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int ret = 0; - int err; - int locked; - - if (nr_frames == 0) - return 0; - - if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) - nr_frames = vec->nr_allocated; - - start = untagged_addr(start); - - mmap_read_lock(mm); - locked = 1; - vma = find_vma_intersection(mm, start, start + 1); - if (!vma) { - ret = -EFAULT; - goto out; - } - - /* - * While get_vaddr_frames() could be used for transient (kernel - * controlled lifetime) pinning of memory pages all current - * users establish long term (userspace controlled lifetime) - * page pinning. Treat get_vaddr_frames() like - * get_user_pages_longterm() and disallow it for filesystem-dax - * mappings. - */ - if (vma_is_fsdax(vma)) { - ret = -EOPNOTSUPP; - goto out; - } - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { - vec->got_ref = true; - vec->is_pfns = false; - ret = pin_user_pages_locked(start, nr_frames, - gup_flags, (struct page **)(vec->ptrs), &locked); - goto out; - } - - vec->got_ref = false; - vec->is_pfns = true; - do { - unsigned long *nums = frame_vector_pfns(vec); - - while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) { - err = follow_pfn(vma, start, &nums[ret]); - if (err) { - if (ret == 0) - ret = err; - goto out; - } - start += PAGE_SIZE; - ret++; - } - /* - * We stop if we have enough pages or if VMA doesn't completely - * cover the tail page. - */ - if (ret >= nr_frames || start < vma->vm_end) - break; - vma = find_vma_intersection(mm, start, start + 1); - } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP)); -out: - if (locked) - mmap_read_unlock(mm); - if (!ret) - ret = -EFAULT; - if (ret > 0) - vec->nr_frames = ret; - return ret; -} -EXPORT_SYMBOL(get_vaddr_frames); - -/** - * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired - * them - * @vec: frame vector to put - * - * Drop references to pages if get_vaddr_frames() acquired them. We also - * invalidate the frame vector so that it is prepared for the next call into - * get_vaddr_frames(). - */ -void put_vaddr_frames(struct frame_vector *vec) -{ - struct page **pages; - - if (!vec->got_ref) - goto out; - pages = frame_vector_pages(vec); - /* - * frame_vector_pages() might needed to do a conversion when - * get_vaddr_frames() got pages but vec was later converted to pfns. - * But it shouldn't really fail to convert pfns back... - */ - if (WARN_ON(IS_ERR(pages))) - goto out; - - unpin_user_pages(pages, vec->nr_frames); - vec->got_ref = false; -out: - vec->nr_frames = 0; -} -EXPORT_SYMBOL(put_vaddr_frames); - -/** - * frame_vector_to_pages - convert frame vector to contain page pointers - * @vec: frame vector to convert - * - * Convert @vec to contain array of page pointers. If the conversion is - * successful, return 0. Otherwise return an error. Note that we do not grab - * page references for the page structures. 
- */ -int frame_vector_to_pages(struct frame_vector *vec) -{ - int i; - unsigned long *nums; - struct page **pages; - - if (!vec->is_pfns) - return 0; - nums = frame_vector_pfns(vec); - for (i = 0; i < vec->nr_frames; i++) - if (!pfn_valid(nums[i])) - return -EINVAL; - pages = (struct page **)nums; - for (i = 0; i < vec->nr_frames; i++) - pages[i] = pfn_to_page(nums[i]); - vec->is_pfns = false; - return 0; -} -EXPORT_SYMBOL(frame_vector_to_pages); - -/** - * frame_vector_to_pfns - convert frame vector to contain pfns - * @vec: frame vector to convert - * - * Convert @vec to contain array of pfns. - */ -void frame_vector_to_pfns(struct frame_vector *vec) -{ - int i; - unsigned long *nums; - struct page **pages; - - if (vec->is_pfns) - return; - pages = (struct page **)(vec->ptrs); - nums = (unsigned long *)pages; - for (i = 0; i < vec->nr_frames; i++) - nums[i] = page_to_pfn(pages[i]); - vec->is_pfns = true; -} -EXPORT_SYMBOL(frame_vector_to_pfns); - -/** - * frame_vector_create() - allocate & initialize structure for pinned pfns - * @nr_frames: number of pfns slots we should reserve - * - * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns - * pfns. - */ -struct frame_vector *frame_vector_create(unsigned int nr_frames) -{ - struct frame_vector *vec; - int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames; - - if (WARN_ON_ONCE(nr_frames == 0)) - return NULL; - /* - * This is absurdly high. It's here just to avoid strange effects when - * arithmetics overflows. - */ - if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2)) - return NULL; - /* - * Avoid higher order allocations, use vmalloc instead. It should - * be rare anyway. - */ - vec = kvmalloc(size, GFP_KERNEL); - if (!vec) - return NULL; - vec->nr_allocated = nr_frames; - vec->nr_frames = 0; - return vec; -} -EXPORT_SYMBOL(frame_vector_create); - -/** - * frame_vector_destroy() - free memory allocated to carry frame vector - * @vec: Frame vector to free - * - * Free structure allocated by frame_vector_create() to carry frames. - */ -void frame_vector_destroy(struct frame_vector *vec) -{ - /* Make sure put_vaddr_frames() got called properly... 
*/ - VM_BUG_ON(vec->nr_frames > 0); - kvfree(vec); -} -EXPORT_SYMBOL(frame_vector_destroy); diff --git a/mm/highmem.c b/mm/highmem.c index c3a9ea7875ef..874b732b120c 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -473,6 +473,11 @@ static inline void *arch_kmap_local_high_get(struct page *page) } #endif +#ifndef arch_kmap_local_set_pte +#define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) \ + set_pte_at(mm, vaddr, ptep, ptev) +#endif + /* Unmap a local mapping which was obtained by kmap_high_get() */ static inline bool kmap_high_unmap_local(unsigned long vaddr) { @@ -515,7 +520,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte - idx))); pteval = pfn_pte(pfn, prot); - set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); + arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte - idx, pteval); arch_kmap_local_post_map(vaddr, pteval); current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; preempt_enable(); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9237976abe72..91ca9b103ee5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2202,7 +2202,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mmu_notifier_range range; - bool was_locked = false; + bool do_unlock_page = false; pmd_t _pmd; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, @@ -2218,7 +2218,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON(freeze && !page); if (page) { VM_WARN_ON_ONCE(!PageLocked(page)); - was_locked = true; if (page != pmd_page(*pmd)) goto out; } @@ -2227,19 +2226,29 @@ repeat: if (pmd_trans_huge(*pmd)) { if (!page) { page = pmd_page(*pmd); - if (unlikely(!trylock_page(page))) { - get_page(page); - _pmd = *pmd; - spin_unlock(ptl); - lock_page(page); - spin_lock(ptl); - if (unlikely(!pmd_same(*pmd, _pmd))) { - unlock_page(page); + /* + * An anonymous page must be locked, to ensure that a + * concurrent reuse_swap_page() sees stable mapcount; + * but reuse_swap_page() is not used on shmem or file, + * and page lock must not be taken when zap_pmd_range() + * calls __split_huge_pmd() while i_mmap_lock is held. + */ + if (PageAnon(page)) { + if (unlikely(!trylock_page(page))) { + get_page(page); + _pmd = *pmd; + spin_unlock(ptl); + lock_page(page); + spin_lock(ptl); + if (unlikely(!pmd_same(*pmd, _pmd))) { + unlock_page(page); + put_page(page); + page = NULL; + goto repeat; + } put_page(page); - page = NULL; - goto repeat; } - put_page(page); + do_unlock_page = true; } } if (PageMlocked(page)) @@ -2249,7 +2258,7 @@ repeat: __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); - if (!was_locked && page) + if (do_unlock_page) unlock_page(page); /* * No need to double call mmu_notifier->invalidate_range() callback. 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 18f6ee317900..905a7d549b00 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -79,6 +79,21 @@ DEFINE_SPINLOCK(hugetlb_lock); static int num_fault_mutexes; struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; +static inline bool PageHugeFreed(struct page *head) +{ + return page_private(head + 4) == -1UL; +} + +static inline void SetPageHugeFreed(struct page *head) +{ + set_page_private(head + 4, -1UL); +} + +static inline void ClearPageHugeFreed(struct page *head) +{ + set_page_private(head + 4, 0); +} + /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -1028,6 +1043,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; + SetPageHugeFreed(page); } static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) @@ -1044,6 +1060,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); + ClearPageHugeFreed(page); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return page; @@ -1344,12 +1361,11 @@ struct hstate *size_to_hstate(unsigned long size) */ bool page_huge_active(struct page *page) { - VM_BUG_ON_PAGE(!PageHuge(page), page); - return PageHead(page) && PagePrivate(&page[1]); + return PageHeadHuge(page) && PagePrivate(&page[1]); } /* never called for tail page */ -static void set_page_huge_active(struct page *page) +void set_page_huge_active(struct page *page) { VM_BUG_ON_PAGE(!PageHeadHuge(page), page); SetPagePrivate(&page[1]); @@ -1505,6 +1521,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) spin_lock(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; + ClearPageHugeFreed(page); spin_unlock(&hugetlb_lock); } @@ -1755,6 +1772,7 @@ int dissolve_free_huge_page(struct page *page) { int rc = -EBUSY; +retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ if (!PageHuge(page)) return 0; @@ -1771,6 +1789,26 @@ int dissolve_free_huge_page(struct page *page) int nid = page_to_nid(head); if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; + + /* + * We should make sure that the page is already on the free list + * when it is dissolved. + */ + if (unlikely(!PageHugeFreed(head))) { + spin_unlock(&hugetlb_lock); + cond_resched(); + + /* + * Theoretically, we should return -EBUSY when we + * encounter this race. In fact, we have a chance + * to successfully dissolve the page if we do a + * retry. Because the race window is quite small. + * If we seize this opportunity, it is an optimization + * for increasing the success rate of dissolving page. + */ + goto retry; + } + /* * Move PageHWPoison flag from head page to the raw error page, * which makes any subpages rather than the error page reusable. @@ -2009,13 +2047,16 @@ retry: /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + int zeroed; + if ((--needed) < 0) break; /* * This page is now managed by the hugetlb allocator and has * no users -- drop the buddy allocator's reference. 
*/ - VM_BUG_ON_PAGE(!put_page_testzero(page), page); + zeroed = put_page_testzero(page); + VM_BUG_ON_PAGE(!zeroed, page); enqueue_huge_page(h, page); } free: @@ -3967,25 +4008,11 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - struct mm_struct *mm; struct mmu_gather tlb; - unsigned long tlb_start = start; - unsigned long tlb_end = end; - - /* - * If shared PMDs were possibly used within this vma range, adjust - * start/end for worst case tlb flushing. - * Note that we can not be sure if PMDs are shared until we try to - * unmap pages. However, we want to make sure TLB flushing covers - * the largest possible range. - */ - adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end); - - mm = vma->vm_mm; - tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end); + tlb_gather_mmu(&tlb, vma->vm_mm); __unmap_hugepage_range(&tlb, vma, start, end, ref_page); - tlb_finish_mmu(&tlb, tlb_start, tlb_end); + tlb_finish_mmu(&tlb); } /* @@ -5555,9 +5582,9 @@ bool isolate_huge_page(struct page *page, struct list_head *list) { bool ret = true; - VM_BUG_ON_PAGE(!PageHead(page), page); spin_lock(&hugetlb_lock); - if (!page_huge_active(page) || !get_page_unless_zero(page)) { + if (!PageHeadHuge(page) || !page_huge_active(page) || + !get_page_unless_zero(page)) { ret = false; goto unlock; } diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 55bd6f09c70f..d558799b25b3 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -19,11 +19,10 @@ #include "kasan.h" -enum kasan_arg_mode { - KASAN_ARG_MODE_DEFAULT, - KASAN_ARG_MODE_OFF, - KASAN_ARG_MODE_PROD, - KASAN_ARG_MODE_FULL, +enum kasan_arg { + KASAN_ARG_DEFAULT, + KASAN_ARG_OFF, + KASAN_ARG_ON, }; enum kasan_arg_stacktrace { @@ -38,7 +37,7 @@ enum kasan_arg_fault { KASAN_ARG_FAULT_PANIC, }; -static enum kasan_arg_mode kasan_arg_mode __ro_after_init; +static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; static enum kasan_arg_fault kasan_arg_fault __ro_after_init; @@ -52,26 +51,24 @@ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); /* Whether panic or disable tag checking on fault. */ bool kasan_flag_panic __ro_after_init; -/* kasan.mode=off/prod/full */ -static int __init early_kasan_mode(char *arg) +/* kasan=off/on */ +static int __init early_kasan_flag(char *arg) { if (!arg) return -EINVAL; if (!strcmp(arg, "off")) - kasan_arg_mode = KASAN_ARG_MODE_OFF; - else if (!strcmp(arg, "prod")) - kasan_arg_mode = KASAN_ARG_MODE_PROD; - else if (!strcmp(arg, "full")) - kasan_arg_mode = KASAN_ARG_MODE_FULL; + kasan_arg = KASAN_ARG_OFF; + else if (!strcmp(arg, "on")) + kasan_arg = KASAN_ARG_ON; else return -EINVAL; return 0; } -early_param("kasan.mode", early_kasan_mode); +early_param("kasan", early_kasan_flag); -/* kasan.stack=off/on */ +/* kasan.stacktrace=off/on */ static int __init early_kasan_flag_stacktrace(char *arg) { if (!arg) @@ -113,8 +110,8 @@ void kasan_init_hw_tags_cpu(void) * as this function is only called for MTE-capable hardware. */ - /* If KASAN is disabled, do nothing. */ - if (kasan_arg_mode == KASAN_ARG_MODE_OFF) + /* If KASAN is disabled via command line, don't initialize it. */ + if (kasan_arg == KASAN_ARG_OFF) return; hw_init_tags(KASAN_TAG_MAX); @@ -124,43 +121,24 @@ void kasan_init_hw_tags_cpu(void) /* kasan_init_hw_tags() is called once on boot CPU. */ void __init kasan_init_hw_tags(void) { - /* If hardware doesn't support MTE, do nothing. 
*/ + /* If hardware doesn't support MTE, don't initialize KASAN. */ if (!system_supports_mte()) return; - /* Choose KASAN mode if kasan boot parameter is not provided. */ - if (kasan_arg_mode == KASAN_ARG_MODE_DEFAULT) { - if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) - kasan_arg_mode = KASAN_ARG_MODE_FULL; - else - kasan_arg_mode = KASAN_ARG_MODE_PROD; - } - - /* Preset parameter values based on the mode. */ - switch (kasan_arg_mode) { - case KASAN_ARG_MODE_DEFAULT: - /* Shouldn't happen as per the check above. */ - WARN_ON(1); + /* If KASAN is disabled via command line, don't initialize it. */ + if (kasan_arg == KASAN_ARG_OFF) return; - case KASAN_ARG_MODE_OFF: - /* If KASAN is disabled, do nothing. */ - return; - case KASAN_ARG_MODE_PROD: - static_branch_enable(&kasan_flag_enabled); - break; - case KASAN_ARG_MODE_FULL: - static_branch_enable(&kasan_flag_enabled); - static_branch_enable(&kasan_flag_stacktrace); - break; - } - /* Now, optionally override the presets. */ + /* Enable KASAN. */ + static_branch_enable(&kasan_flag_enabled); switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: + /* Default to enabling stack trace collection. */ + static_branch_enable(&kasan_flag_stacktrace); break; case KASAN_ARG_STACKTRACE_OFF: - static_branch_disable(&kasan_flag_stacktrace); + /* Do nothing, kasan_flag_stacktrace keeps its default value. */ break; case KASAN_ARG_STACKTRACE_ON: static_branch_enable(&kasan_flag_stacktrace); @@ -169,11 +147,16 @@ void __init kasan_init_hw_tags(void) switch (kasan_arg_fault) { case KASAN_ARG_FAULT_DEFAULT: + /* + * Default to no panic on report. + * Do nothing, kasan_flag_panic keeps its default value. + */ break; case KASAN_ARG_FAULT_REPORT: - kasan_flag_panic = false; + /* Do nothing, kasan_flag_panic keeps its default value. */ break; case KASAN_ARG_FAULT_PANIC: + /* Enable panic on report. 
*/ kasan_flag_panic = true; break; } diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 7ca0b92d5886..c4605ac9837b 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -373,9 +373,10 @@ static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr, if (kasan_pte_table(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && - IS_ALIGNED(next, PMD_SIZE)) + IS_ALIGNED(next, PMD_SIZE)) { pmd_clear(pmd); - continue; + continue; + } } pte = pte_offset_kernel(pmd, addr); kasan_remove_pte_table(pte, addr, next); @@ -398,9 +399,10 @@ static void kasan_remove_pud_table(pud_t *pud, unsigned long addr, if (kasan_pmd_table(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && - IS_ALIGNED(next, PUD_SIZE)) + IS_ALIGNED(next, PUD_SIZE)) { pud_clear(pud); - continue; + continue; + } } pmd = pmd_offset(pud, addr); pmd_base = pmd_offset(pud, 0); @@ -424,9 +426,10 @@ static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr, if (kasan_pud_table(*p4d)) { if (IS_ALIGNED(addr, P4D_SIZE) && - IS_ALIGNED(next, P4D_SIZE)) + IS_ALIGNED(next, P4D_SIZE)) { p4d_clear(p4d); - continue; + continue; + } } pud = pud_offset(p4d, addr); kasan_remove_pud_table(pud, addr, next); @@ -457,9 +460,10 @@ void kasan_remove_zero_shadow(void *start, unsigned long size) if (kasan_p4d_table(*pgd)) { if (IS_ALIGNED(addr, PGDIR_SIZE) && - IS_ALIGNED(next, PGDIR_SIZE)) + IS_ALIGNED(next, PGDIR_SIZE)) { pgd_clear(pgd); - continue; + continue; + } } p4d = p4d_offset(pgd, addr); @@ -482,7 +486,6 @@ int kasan_add_zero_shadow(void *start, unsigned long size) ret = kasan_populate_early_shadow(shadow_start, shadow_end); if (ret) - kasan_remove_zero_shadow(shadow_start, - size >> KASAN_SHADOW_SCALE_SHIFT); + kasan_remove_zero_shadow(start, size); return ret; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cc4d9e1d49b1..8c706e7652f2 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -209,7 +209,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, static inline bool addr_has_metadata(const void *addr) { - return true; + return (is_vmalloc_addr(addr) || virt_addr_valid(addr)); } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 67ab391a5373..fb0fdaec34d5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -991,38 +991,41 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, static bool __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, + unsigned long haddr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; - struct vm_fault vmf = { - .vma = vma, - .address = address, - .flags = FAULT_FLAG_ALLOW_RETRY, - .pmd = pmd, - .pgoff = linear_page_index(vma, address), - }; - - vmf.pte = pte_offset_map(pmd, address); - for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; - vmf.pte++, vmf.address += PAGE_SIZE) { + unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); + + for (address = haddr; address < end; address += PAGE_SIZE) { + struct vm_fault vmf = { + .vma = vma, + .address = address, + .pgoff = linear_page_index(vma, haddr), + .flags = FAULT_FLAG_ALLOW_RETRY, + .pmd = pmd, + }; + + vmf.pte = pte_offset_map(pmd, address); vmf.orig_pte = *vmf.pte; - if (!is_swap_pte(vmf.orig_pte)) + if (!is_swap_pte(vmf.orig_pte)) { + pte_unmap(vmf.pte); continue; + } swapped_in++; ret = do_swap_page(&vmf); /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */ if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); - if (hugepage_vma_revalidate(mm, 
address, &vmf.vma)) { + if (hugepage_vma_revalidate(mm, haddr, &vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) { + if (mm_find_pmd(mm, haddr) != pmd) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } @@ -1031,11 +1034,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } - /* pte is unmapped now, we need to map it */ - vmf.pte = pte_offset_map(pmd, vmf.address); } - vmf.pte--; - pte_unmap(vmf.pte); /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ if (swapped_in) diff --git a/mm/madvise.c b/mm/madvise.c index d4f5eece9d56..df692d2e35d4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -506,9 +506,9 @@ static long madvise_cold(struct vm_area_struct *vma, return -EINVAL; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + tlb_gather_mmu(&tlb, mm); madvise_cold_page_range(&tlb, vma, start_addr, end_addr); - tlb_finish_mmu(&tlb, start_addr, end_addr); + tlb_finish_mmu(&tlb); return 0; } @@ -559,9 +559,9 @@ static long madvise_pageout(struct vm_area_struct *vma, return 0; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + tlb_gather_mmu(&tlb, mm); madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); - tlb_finish_mmu(&tlb, start_addr, end_addr); + tlb_finish_mmu(&tlb); return 0; } @@ -724,7 +724,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.start, range.end); lru_add_drain(); - tlb_gather_mmu(&tlb, mm, range.start, range.end); + tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); @@ -733,7 +733,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, &madvise_free_walk_ops, &tlb); tlb_end_vma(&tlb, vma); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb, range.start, range.end); + tlb_finish_mmu(&tlb); return 0; } diff --git a/mm/memblock.c b/mm/memblock.c index d24bcfa88d2f..afaefa8fc6ab 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -275,14 +275,6 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * * Find @size free area aligned to @align in the specified range and node. * - * When allocation direction is bottom-up, the @start should be greater - * than the end of the kernel image. Otherwise, it will be trimmed. The - * reason is that we want the bottom-up allocation just near the kernel - * image so it is highly likely that the allocated memory and the kernel - * will reside in the same node. - * - * If bottom-up allocation failed, will try to allocate memory top-down. - * * Return: * Found address on success, 0 on failure. */ @@ -291,8 +283,6 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t end, int nid, enum memblock_flags flags) { - phys_addr_t kernel_end, ret; - /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE || end == MEMBLOCK_ALLOC_KASAN) @@ -301,40 +291,13 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, /* avoid allocating the first page */ start = max_t(phys_addr_t, start, PAGE_SIZE); end = max(start, end); - kernel_end = __pa_symbol(_end); - - /* - * try bottom-up allocation only when bottom-up mode - * is set and @end is above the kernel image. 
- */ - if (memblock_bottom_up() && end > kernel_end) { - phys_addr_t bottom_up_start; - - /* make sure we will allocate above the kernel */ - bottom_up_start = max(start, kernel_end); - - /* ok, try bottom-up allocation first */ - ret = __memblock_find_range_bottom_up(bottom_up_start, end, - size, align, nid, flags); - if (ret) - return ret; - /* - * we always limit bottom-up allocation above the kernel, - * but top-down allocation doesn't have the limit, so - * retrying top-down allocation may succeed when bottom-up - * allocation failed. - * - * bottom-up allocation is expected to be fail very rarely, - * so we use WARN_ONCE() here to see the stack trace if - * fail happens. - */ - WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE), - "memblock: bottom-up allocation failed, memory hotremove may be affected\n"); - } - - return __memblock_find_range_top_down(start, end, size, align, nid, - flags); + if (memblock_bottom_up()) + return __memblock_find_range_bottom_up(start, end, size, align, + nid, flags); + else + return __memblock_find_range_top_down(start, end, size, align, + nid, flags); } /** @@ -1427,7 +1390,7 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, } /** - * memblock_phys_alloc_try_nid - allocate a memory block from specified MUMA node + * memblock_phys_alloc_try_nid - allocate a memory block from specified NUMA node * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @nid: nid of the free area to find, %NUMA_NO_NODE for any node @@ -2087,10 +2050,8 @@ void __init reset_all_zones_managed_pages(void) /** * memblock_free_all - release free pages to the buddy allocator - * - * Return: the number of pages actually released. */ -unsigned long __init memblock_free_all(void) +void __init memblock_free_all(void) { unsigned long pages; @@ -2099,8 +2060,6 @@ unsigned long __init memblock_free_all(void) pages = free_low_memory_core_early(); totalram_pages_add(pages); - - return pages; } #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cf9076f58582..0b9bd354e97e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3115,9 +3115,7 @@ void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->kmem, nr_pages); - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); + refill_stock(memcg, nr_pages); } /** @@ -6273,6 +6271,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (err) return err; + page_counter_set_high(&memcg->memory, high); + for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); unsigned long reclaimed; @@ -6296,10 +6296,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, break; } - page_counter_set_high(&memcg->memory, high); - memcg_wb_domain_size_changed(memcg); - return nbytes; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 04d9f154a130..e9481632fcd1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1885,6 +1885,12 @@ static int soft_offline_free_page(struct page *page) return rc; } +static void put_ref_page(struct page *page) +{ + if (page) + put_page(page); +} + /** * soft_offline_page - Soft offline a page. 
* @pfn: pfn to soft-offline @@ -1910,20 +1916,26 @@ static int soft_offline_free_page(struct page *page) int soft_offline_page(unsigned long pfn, int flags) { int ret; - struct page *page; bool try_again = true; + struct page *page, *ref_page = NULL; + + WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED)); if (!pfn_valid(pfn)) return -ENXIO; + if (flags & MF_COUNT_INCREASED) + ref_page = pfn_to_page(pfn); + /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ page = pfn_to_online_page(pfn); - if (!page) + if (!page) { + put_ref_page(ref_page); return -EIO; + } if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); - if (flags & MF_COUNT_INCREASED) - put_page(page); + put_ref_page(ref_page); return 0; } diff --git a/mm/memory.c b/mm/memory.c index feff48e1465a..5da964079678 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -134,6 +134,18 @@ static inline bool arch_faults_on_old_pte(void) } #endif +#ifndef arch_wants_old_prefaulted_pte +static inline bool arch_wants_old_prefaulted_pte(void) +{ + /* + * Transitioning a PTE from 'old' to 'young' can be expensive on + * some architectures, even if it's performed in hardware. By + * default, "false" means prefaulted entries will be 'young'. + */ + return false; +} +#endif + static int __init disable_randmaps(char *s) { randomize_va_space = 0; @@ -1534,13 +1546,13 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, start, start + size); - tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); + tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) unmap_single_vma(&tlb, vma, start, range.end, NULL); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb, start, range.end); + tlb_finish_mmu(&tlb); } /** @@ -1561,12 +1573,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, address + size); - tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); + tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); unmap_single_vma(&tlb, vma, address, range.end, details); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb, address, range.end); + tlb_finish_mmu(&tlb); } /** @@ -3503,7 +3515,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; - /* See the comment in pte_alloc_one_map() */ + /* See comment in handle_pte_fault() */ if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; @@ -3643,66 +3655,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) return ret; } -/* - * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. - * If we check pmd_trans_unstable() first we will trip the bad_pmd() check - * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly - * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. 
- */ -static int pmd_devmap_trans_unstable(pmd_t *pmd) -{ - return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); -} - -static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - - if (!pmd_none(*vmf->pmd)) - goto map_pte; - if (vmf->prealloc_pte) { - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - goto map_pte; - } - - mm_inc_nr_ptes(vma->vm_mm); - pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); - spin_unlock(vmf->ptl); - vmf->prealloc_pte = NULL; - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { - return VM_FAULT_OOM; - } -map_pte: - /* - * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of - * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge - * under us and then back to pmd_none, as a result of MADV_DONTNEED - * running immediately after a huge pmd fault in a different thread of - * this mm, in turn leading to a misleading pmd_trans_huge() retval. - * All we have to ensure is that it is a regular pmd that we can walk - * with pte_offset_map() and we can do that through an atomic read in - * C, which is what pmd_trans_unstable() provides. - */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return VM_FAULT_NOPAGE; - - /* - * At this point we know that our vmf->pmd points to a page of ptes - * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge() - * for the duration of the fault. If a racing MADV_DONTNEED runs and - * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still - * be valid and we will re-check to make sure the vmf->pte isn't - * pte_none() under vmf->ptl protection when we return to - * alloc_set_pte(). - */ - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); - return 0; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { @@ -3717,7 +3669,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -3775,76 +3727,41 @@ out: return ret; } #else -static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { - BUILD_BUG(); - return 0; + return VM_FAULT_FALLBACK; } #endif -/** - * alloc_set_pte - setup new PTE entry for given page and add reverse page - * mapping. If needed, the function allocates page table or use pre-allocated. - * - * @vmf: fault environment - * @page: page to map - * - * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on - * return. - * - * Target users are page handler itself and implementations of - * vm_ops->map_pages. - * - * Return: %0 on success, %VM_FAULT_ code in case of error. 
- */ -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool prefault = vmf->address != addr; pte_t entry; - vm_fault_t ret; - - if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); - if (ret != VM_FAULT_FALLBACK) - return ret; - } - - if (!vmf->pte) { - ret = pte_alloc_one_map(vmf); - if (ret) - return ret; - } - - /* Re-check under ptl */ - if (unlikely(!pte_none(*vmf->pte))) { - update_mmu_tlb(vma, vmf->address, vmf->pte); - return VM_FAULT_NOPAGE; - } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); + + if (prefault && arch_wants_old_prefaulted_pte()) + entry = pte_mkold(entry); + else + entry = pte_sw_mkyoung(entry); + if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, vmf->address, vmf->pte); - - return 0; + set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } - /** * finish_fault - finish page fault once we have prepared the page to fault * @@ -3862,12 +3779,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) */ vm_fault_t finish_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page; - vm_fault_t ret = 0; + vm_fault_t ret; /* Did we COW the page? 
*/ - if ((vmf->flags & FAULT_FLAG_WRITE) && - !(vmf->vma->vm_flags & VM_SHARED)) + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) page = vmf->cow_page; else page = vmf->page; @@ -3876,12 +3793,38 @@ vm_fault_t finish_fault(struct vm_fault *vmf) * check even for read faults because we might have lost our CoWed * page */ - if (!(vmf->vma->vm_flags & VM_SHARED)) - ret = check_stable_address_space(vmf->vma->vm_mm); - if (!ret) - ret = alloc_set_pte(vmf, page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (!(vma->vm_flags & VM_SHARED)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) + return ret; + } + + if (pmd_none(*vmf->pmd)) { + if (PageTransCompound(page)) { + ret = do_set_pmd(vmf, page); + if (ret != VM_FAULT_FALLBACK) + return ret; + } + + if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) + return VM_FAULT_OOM; + } + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) + return 0; + + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + ret = 0; + /* Re-check under ptl */ + if (likely(pte_none(*vmf->pte))) + do_set_pte(vmf, page, vmf->address); + else + ret = VM_FAULT_NOPAGE; + + update_mmu_tlb(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } @@ -3951,13 +3894,12 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off; - vm_fault_t ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - vmf->address = max(address & mask, vmf->vma->vm_start); - off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + address = max(address & mask, vmf->vma->vm_start); + off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3965,7 +3907,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) * the vma or nr_pages from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - - ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); @@ -3973,31 +3915,11 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) - goto out; + return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); - - /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*vmf->pmd)) { - ret = VM_FAULT_NOPAGE; - goto out; - } - - /* ->map_pages() haven't done anything useful. Cold page cache? */ - if (!vmf->pte) - goto out; - - /* check if the page fault is solved */ - vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*vmf->pte)) - ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(vmf->pte, vmf->ptl); -out: - vmf->address = address; - vmf->pte = NULL; - return ret; + return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); } static vm_fault_t do_read_fault(struct vm_fault *vmf) @@ -4353,7 +4275,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) */ vmf->pte = NULL; } else { - /* See comment in pte_alloc_one_map() */ + /* + * If a huge pmd materialized under us just retry later. 
Use + * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead + * of pmd_trans_huge() to ensure the pmd didn't become + * pmd_trans_huge under us and then back to pmd_none, as a + * result of MADV_DONTNEED running immediately after a huge pmd + * fault in a different thread of this mm, in turn leading to a + * misleading pmd_trans_huge() retval. All we have to ensure is + * that it is a regular pmd that we can walk with + * pte_offset_map() and we can do that through an atomic read + * in C, which is what pmd_trans_unstable() provides. + */ if (pmd_devmap_trans_unstable(vmf->pmd)) return 0; /* @@ -4709,9 +4642,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -int follow_pte(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, - spinlock_t **ptlp) +int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, + struct mmu_notifier_range *range, pte_t **ptepp, + pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4777,6 +4710,34 @@ out: } /** + * follow_pte - look up PTE at a user virtual address + * @mm: the mm_struct of the target address space + * @address: user virtual address + * @ptepp: location to store found PTE + * @ptlp: location to store the lock for the PTE + * + * On a successful return, the pointer to the PTE is stored in @ptepp; + * the corresponding lock is taken and its location is stored in @ptlp. + * The contents of the PTE are only stable until @ptlp is released; + * any further use, if any, must be protected against invalidation + * with MMU notifiers. + * + * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore + * should be taken for read. + * + * KVM uses this function. While it is arguably less bad than ``follow_pfn``, + * it is not a good general-purpose API. + * + * Return: zero on success, -ve otherwise. + */ +int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp); +} +EXPORT_SYMBOL_GPL(follow_pte); + +/** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping * @address: user virtual address @@ -4784,6 +4745,9 @@ out: * * Only IO mappings and raw PFN mappings are allowed. * + * This function does not allow the caller to read the permissions + * of the PTE. Do not use it. + * * Return: zero and the pfn at @pfn on success, -ve otherwise. 
*/ int follow_pfn(struct vm_area_struct *vma, unsigned long address, @@ -4796,7 +4760,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) return ret; - ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl); + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return ret; *pfn = pte_pfn(*ptep); @@ -4817,7 +4781,7 @@ int follow_phys(struct vm_area_struct *vma, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl)) + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; pte = *ptep; @@ -4834,28 +4798,68 @@ out: return ret; } +/** + * generic_access_phys - generic implementation for iomem mmap access + * @vma: the vma to access + * @addr: userspace addres, not relative offset within @vma + * @buf: buffer to read/write + * @len: length of transfer + * @write: set to FOLL_WRITE when writing, otherwise reading + * + * This is a generic implementation for &vm_operations_struct.access for an + * iomem mapping. This callback is used by access_process_vm() when the @vma is + * not page based. + */ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; unsigned long prot = 0; void __iomem *maddr; - int offset = addr & (PAGE_SIZE-1); + pte_t *ptep, pte; + spinlock_t *ptl; + int offset = offset_in_page(addr); + int ret = -EINVAL; - if (follow_phys(vma, addr, write, &prot, &phys_addr)) + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + +retry: + if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + return -EINVAL; + pte = *ptep; + pte_unmap_unlock(ptep, ptl); + + prot = pgprot_val(pte_pgprot(pte)); + phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + + if ((write & FOLL_WRITE) && !pte_write(pte)) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; + if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + goto out_unmap; + + if (!pte_same(pte, *ptep)) { + pte_unmap_unlock(ptep, ptl); + iounmap(maddr); + + goto retry; + } + if (write) memcpy_toio(maddr + offset, buf, len); else memcpy_fromio(buf, maddr + offset, len); + ret = len; + pte_unmap_unlock(ptep, ptl); +out_unmap: iounmap(maddr); - return len; + return ret; } EXPORT_SYMBOL_GPL(generic_access_phys); #endif diff --git a/mm/migrate.c b/mm/migrate.c index ee5e612b4cd8..20ca887ea769 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -402,6 +402,7 @@ int migrate_page_move_mapping(struct address_space *mapping, struct zone *oldzone, *newzone; int dirty; int expected_count = expected_page_refs(mapping, page) + extra_count; + int nr = thp_nr_pages(page); if (!mapping) { /* Anonymous page without mapping */ @@ -437,7 +438,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ newpage->index = page->index; newpage->mapping = page->mapping; - page_ref_add(newpage, thp_nr_pages(page)); /* add cache reference */ + page_ref_add(newpage, nr); /* add cache reference */ if (PageSwapBacked(page)) { __SetPageSwapBacked(newpage); if (PageSwapCache(page)) { @@ -459,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping, if (PageTransHuge(page)) { int i; - for (i = 1; i < HPAGE_PMD_NR; i++) { + for (i = 1; i < nr; i++) { xas_next(&xas); xas_store(&xas, newpage); } @@ -470,7 +471,7 @@ int migrate_page_move_mapping(struct address_space *mapping, * to one less reference. * We know this isn't the last reference. 
*/ - page_ref_unfreeze(page, expected_count - thp_nr_pages(page)); + page_ref_unfreeze(page, expected_count - nr); xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ @@ -493,17 +494,17 @@ int migrate_page_move_mapping(struct address_space *mapping, old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); - __dec_lruvec_state(old_lruvec, NR_FILE_PAGES); - __inc_lruvec_state(new_lruvec, NR_FILE_PAGES); + __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); + __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); if (PageSwapBacked(page) && !PageSwapCache(page)) { - __dec_lruvec_state(old_lruvec, NR_SHMEM); - __inc_lruvec_state(new_lruvec, NR_SHMEM); + __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); + __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); } if (dirty && mapping_can_writeback(mapping)) { - __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); - __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); - __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); - __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); + __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); + __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); + __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); + __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } } local_irq_enable(); @@ -1279,6 +1280,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOSYS; } + if (page_count(hpage) == 1) { + /* page was freed from under us. So we are done. */ + putback_active_hugepage(hpage); + return MIGRATEPAGE_SUCCESS; + } + new_hpage = get_new_page(hpage, private); if (!new_hpage) return -ENOMEM; diff --git a/mm/mmap.c b/mm/mmap.c index dc7206032387..90673febce6a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2671,12 +2671,12 @@ static void unmap_region(struct mm_struct *mm, struct mmu_gather tlb; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start, end); + tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); - tlb_finish_mmu(&tlb, start, end); + tlb_finish_mmu(&tlb); } /* @@ -3214,12 +3214,12 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb_gather_mmu(&tlb, mm, 0, -1); + tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); - tlb_finish_mmu(&tlb, 0, -1); + tlb_finish_mmu(&tlb); /* * Walk the list again, actually closing and freeing it, diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 03c33c93a582..0dc7149b0c61 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -253,21 +253,17 @@ void tlb_flush_mmu(struct mmu_gather *tlb) * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space - * @start: start of the region that will be removed from the page-table - * @end: end of the region that will be removed from the page-table + * @fullmm: @mm is without users and we're going to destroy the full address + * space (exit/execve) * * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. 
The @start and @end are set to 0 and -1 - * respectively when @mm is without users and we're going to destroy - * the full address space (exit/execve). + * tear-down from @mm. */ -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) +static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + bool fullmm) { tlb->mm = mm; - - /* Is it from 0 to ~0? */ - tlb->fullmm = !(start | (end+1)); + tlb->fullmm = fullmm; #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb->need_flush_all = 0; @@ -287,17 +283,24 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, inc_tlb_flush_pending(tlb->mm); } +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) +{ + __tlb_gather_mmu(tlb, mm, false); +} + +void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) +{ + __tlb_gather_mmu(tlb, mm, true); +} + /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish - * @start: start of the region that will be removed from the page-table - * @end: end of the region that will be removed from the page-table * * Called at the end of the shootdown operation to free up any resources that * were required. */ -void tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end) +void tlb_finish_mmu(struct mmu_gather *tlb) { /* * If there are parallel threads are doing PTE changes on same range diff --git a/mm/mremap.c b/mm/mremap.c index f554320281cc..47192691fe32 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,7 +22,6 @@ #include <linux/syscalls.h> #include <linux/mmu_notifier.h> #include <linux/uaccess.h> -#include <linux/mm-arch-hooks.h> #include <linux/userfaultfd_k.h> #include <asm/cacheflush.h> @@ -336,8 +335,9 @@ enum pgt_entry { * valid. Else returns a smaller extent bounded by the end of the source and * destination pgt_entry. 
*/ -static unsigned long get_extent(enum pgt_entry entry, unsigned long old_addr, - unsigned long old_end, unsigned long new_addr) +static __always_inline unsigned long get_extent(enum pgt_entry entry, + unsigned long old_addr, unsigned long old_end, + unsigned long new_addr) { unsigned long next, extent, mask, size; @@ -562,8 +562,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, new_addr = err; } else { mremap_userfaultfd_prep(new_vma, uf); - arch_remap(mm, old_addr, old_addr + old_len, - new_addr, new_addr + new_len); } /* Conceal VM_ACCOUNT so old reservation is not undone */ diff --git a/mm/nommu.c b/mm/nommu.c index 870fea12823e..5c9ab799c0e6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1668,10 +1668,11 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_fault *vmf, +vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); + return 0; } EXPORT_SYMBOL(filemap_map_pages); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 04b19b7b5435..c9a33ffe38b7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -546,15 +546,15 @@ bool __oom_reap_task_mm(struct mm_struct *mm) mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, vma->vm_start, vma->vm_end); - tlb_gather_mmu(&tlb, mm, range.start, range.end); + tlb_gather_mmu(&tlb, mm); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { - tlb_finish_mmu(&tlb, range.start, range.end); + tlb_finish_mmu(&tlb); ret = false; continue; } unmap_page_range(&tlb, vma, range.start, range.end, NULL); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb, range.start, range.end); + tlb_finish_mmu(&tlb); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 027f6481ba59..ef5070fed76b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1207,8 +1207,10 @@ static void kernel_init_free_pages(struct page *page, int numpages) /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); for (i = 0; i < numpages; i++) { + u8 tag = page_kasan_tag(page + i); page_kasan_tag_reset(page + i); clear_highpage(page + i); + page_kasan_tag_set(page + i, tag); } kasan_enable_current(); } @@ -5135,8 +5137,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) } EXPORT_SYMBOL(__page_frag_cache_drain); -void *page_frag_alloc(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) +void *page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -5188,11 +5191,12 @@ refill: } nc->pagecnt_bias--; + offset &= align_mask; nc->offset = offset; return nc->va + offset; } -EXPORT_SYMBOL(page_frag_alloc); +EXPORT_SYMBOL(page_frag_alloc_align); /* * Frees a page fragment allocated out of either a compound or order 0 page. 
diff --git a/mm/page_io.c b/mm/page_io.c index 9bca17ecc4df..92f7941c6d01 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -26,25 +26,6 @@ #include <linux/uio.h> #include <linux/sched/task.h> -static struct bio *get_swap_bio(gfp_t gfp_flags, - struct page *page, bio_end_io_t end_io) -{ - struct bio *bio; - - bio = bio_alloc(gfp_flags, 1); - if (bio) { - struct block_device *bdev; - - bio->bi_iter.bi_sector = map_swap_page(page, &bdev); - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; - bio->bi_end_io = end_io; - - bio_add_page(bio, page, thp_size(page), 0); - } - return bio; -} - void end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -361,13 +342,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } - bio = get_swap_bio(GFP_NOIO, page, end_write_func); - if (bio == NULL) { - set_page_dirty(page); - unlock_page(page); - return -ENOMEM; - } + bio = bio_alloc(GFP_NOIO, 1); + bio_set_dev(bio, sis->bdev); + bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); + bio->bi_end_io = end_write_func; + bio_add_page(bio, page, thp_size(page), 0); + bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); @@ -427,18 +408,18 @@ int swap_readpage(struct page *page, bool synchronous) } ret = 0; - bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); - if (bio == NULL) { - unlock_page(page); - ret = -ENOMEM; - goto out; - } - disk = bio->bi_disk; + bio = bio_alloc(GFP_KERNEL, 1); + bio_set_dev(bio, sis->bdev); + bio->bi_opf = REQ_OP_READ; + bio->bi_iter.bi_sector = swap_page_sector(page); + bio->bi_end_io = end_swap_bio_read; + bio_add_page(bio, page, thp_size(page), 0); + + disk = bio->bi_bdev->bd_disk; /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); if (synchronous) { bio->bi_opf |= REQ_HIPRI; get_task_struct(current); diff --git a/mm/percpu.c b/mm/percpu.c index ad7a37ee74ef..6596a0a4286e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -69,6 +69,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitmap.h> +#include <linux/cpumask.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/lcm.h> @@ -2662,13 +2663,14 @@ early_param("percpu_alloc", percpu_alloc_setup); * On success, pointer to the new allocation_info is returned. On * failure, ERR_PTR value is returned. 
*/ -static struct pcpu_alloc_info * __init pcpu_build_alloc_info( +static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; + static struct cpumask mask __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; @@ -2681,6 +2683,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( /* this function may be called multiple times */ memset(group_map, 0, sizeof(group_map)); memset(group_cnt, 0, sizeof(group_cnt)); + cpumask_clear(&mask); /* calculate size_sum and ensure dyn_size is enough for early alloc */ size_sum = PFN_ALIGN(static_size + reserved_size + @@ -2702,24 +2705,27 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( upa--; max_upa = upa; + cpumask_copy(&mask, cpu_possible_mask); + /* group cpus according to their proximity */ - for_each_possible_cpu(cpu) { - group = 0; - next_group: - for_each_possible_cpu(tcpu) { - if (cpu == tcpu) - break; - if (group_map[tcpu] == group && cpu_distance_fn && - (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || - cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { - group++; - nr_groups = max(nr_groups, group + 1); - goto next_group; - } - } + for (group = 0; !cpumask_empty(&mask); group++) { + /* pop the group's first cpu */ + cpu = cpumask_first(&mask); group_map[cpu] = group; group_cnt[group]++; + cpumask_clear_cpu(cpu, &mask); + + for_each_cpu(tcpu, &mask) { + if (!cpu_distance_fn || + (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE && + cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) { + group_map[tcpu] = group; + group_cnt[group]++; + cpumask_clear_cpu(tcpu, &mask); + } + } } + nr_groups = group; /* * Wasted space is caused by a ratio imbalance of upa to group_cnt. diff --git a/mm/shmem.c b/mm/shmem.c index facdd1a9c524..7924b3bf46fb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1522,11 +1522,11 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, { struct vm_area_struct pvma; struct page *page; - struct vm_fault vmf; + struct vm_fault vmf = { + .vma = &pvma, + }; shmem_pseudo_vma_init(&pvma, info, index); - vmf.vma = &pvma; - vmf.address = 0; page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); diff --git a/mm/slab.c b/mm/slab.c index d7c8da9319c7..dcc55e78f353 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3635,6 +3635,26 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif /* CONFIG_NUMA */ +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + struct kmem_cache *cachep; + unsigned int objnr; + void *objp; + + kpp->kp_ptr = object; + kpp->kp_page = page; + cachep = page->slab_cache; + kpp->kp_slab_cache = cachep; + objp = object - obj_offset(cachep); + kpp->kp_data_offset = obj_offset(cachep); + page = virt_to_head_page(objp); + objnr = obj_to_index(cachep, page, objp); + objp = index_to_obj(cachep, page, objnr); + kpp->kp_objp = objp; + if (DEBUG && cachep->flags & SLAB_STORE_USER) + kpp->kp_ret = *dbg_userword(cachep, objp); +} + /** * __do_kmalloc - allocate memory * @size: how many bytes of memory are required. 
diff --git a/mm/slab.h b/mm/slab.h index 1a756a359fa8..ecad9b57bc44 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -615,4 +615,16 @@ static inline bool slab_want_init_on_free(struct kmem_cache *c) return false; } +#define KS_ADDRS_COUNT 16 +struct kmem_obj_info { + void *kp_ptr; + struct page *kp_page; + void *kp_objp; + unsigned long kp_data_offset; + struct kmem_cache *kp_slab_cache; + void *kp_ret; + void *kp_stack[KS_ADDRS_COUNT]; +}; +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index e981c80d216c..adbace4256ef 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -537,6 +537,81 @@ bool slab_is_available(void) return slab_state >= UP; } +/** + * kmem_valid_obj - does the pointer reference a valid slab object? + * @object: pointer to query. + * + * Return: %true if the pointer is to a not-yet-freed object from + * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer + * is to an already-freed object, and %false otherwise. + */ +bool kmem_valid_obj(void *object) +{ + struct page *page; + + /* Some arches consider ZERO_SIZE_PTR to be a valid address. */ + if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) + return false; + page = virt_to_head_page(object); + return PageSlab(page); +} + +/** + * kmem_dump_obj - Print available slab provenance information + * @object: slab object for which to find provenance information. + * + * This function uses pr_cont(), so that the caller is expected to have + * printed out whatever preamble is appropriate. The provenance information + * depends on the type of object and on how much debugging is enabled. + * For a slab-cache object, the fact that it is a slab object is printed, + * and, if available, the slab name, return address, and stack trace from + * the allocation of that object. + * + * This function will splat if passed a pointer to a non-slab object. + * If you are not sure what type of object you have, you should instead + * use mem_dump_obj(). + */ +void kmem_dump_obj(void *object) +{ + char *cp = IS_ENABLED(CONFIG_MMU) ? 
"" : "/vmalloc"; + int i; + struct page *page; + unsigned long ptroffset; + struct kmem_obj_info kp = { }; + + if (WARN_ON_ONCE(!virt_addr_valid(object))) + return; + page = virt_to_head_page(object); + if (WARN_ON_ONCE(!PageSlab(page))) { + pr_cont(" non-slab memory.\n"); + return; + } + kmem_obj_info(&kp, object, page); + if (kp.kp_slab_cache) + pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); + else + pr_cont(" slab%s", cp); + if (kp.kp_objp) + pr_cont(" start %px", kp.kp_objp); + if (kp.kp_data_offset) + pr_cont(" data offset %lu", kp.kp_data_offset); + if (kp.kp_objp) { + ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset; + pr_cont(" pointer offset %lu", ptroffset); + } + if (kp.kp_slab_cache && kp.kp_slab_cache->usersize) + pr_cont(" size %u", kp.kp_slab_cache->usersize); + if (kp.kp_ret) + pr_cont(" allocated at %pS\n", kp.kp_ret); + else + pr_cont("\n"); + for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) { + if (!kp.kp_stack[i]) + break; + pr_info(" %pS\n", kp.kp_stack[i]); + } +} + #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ void __init create_boot_cache(struct kmem_cache *s, const char *name, diff --git a/mm/slob.c b/mm/slob.c index 8d4bfa46247f..ef87ada8705d 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -461,6 +461,12 @@ out: spin_unlock_irqrestore(&slob_lock, flags); } +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + kpp->kp_ptr = object; + kpp->kp_page = page; +} + /* * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. */ diff --git a/mm/slub.c b/mm/slub.c index d9e4e10683cc..f5baf429654f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2791,7 +2791,8 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) { if (unlikely(slab_want_init_on_free(s)) && obj) - memset((void *)((char *)obj + s->offset), 0, sizeof(void *)); + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); } /* @@ -2883,7 +2884,7 @@ redo: stat(s, ALLOC_FASTPATH); } - maybe_wipe_obj_freeptr(s, kasan_reset_tag(object)); + maybe_wipe_obj_freeptr(s, object); if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) memset(kasan_reset_tag(object), 0, s->object_size); @@ -3329,7 +3330,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, int j; for (j = 0; j < i; j++) - memset(p[j], 0, s->object_size); + memset(kasan_reset_tag(p[j]), 0, s->object_size); } /* memcg and kmem_cache debug support */ @@ -3422,6 +3423,7 @@ static inline int calculate_order(unsigned int size) unsigned int order; unsigned int min_objects; unsigned int max_objects; + unsigned int nr_cpus; /* * Attempt to find best configuration for a slab. This @@ -3432,8 +3434,21 @@ static inline int calculate_order(unsigned int size) * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; - if (!min_objects) - min_objects = 4 * (fls(num_online_cpus()) + 1); + if (!min_objects) { + /* + * Some architectures will only update present cpus when + * onlining them, so don't trust the number if it's just 1. But + * we also don't want to use nr_cpu_ids always, as on some other + * architectures, there can be many possible cpus, but never + * onlined. Here we compromise between trying to avoid too high + * order on systems that appear larger than they are, and too + * low order on systems that appear smaller than they are. 
+ */ + nr_cpus = num_present_cpus(); + if (nr_cpus <= 1) + nr_cpus = nr_cpu_ids; + min_objects = 4 * (fls(nr_cpus) + 1); + } max_objects = order_objects(slub_max_order, size); min_objects = min(min_objects, max_objects); @@ -3918,6 +3933,46 @@ int __kmem_cache_shutdown(struct kmem_cache *s) return 0; } +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + void *base; + int __maybe_unused i; + unsigned int objnr; + void *objp; + void *objp0; + struct kmem_cache *s = page->slab_cache; + struct track __maybe_unused *trackp; + + kpp->kp_ptr = object; + kpp->kp_page = page; + kpp->kp_slab_cache = s; + base = page_address(page); + objp0 = kasan_reset_tag(object); +#ifdef CONFIG_SLUB_DEBUG + objp = restore_red_left(s, objp0); +#else + objp = objp0; +#endif + objnr = obj_to_index(s, page, objp); + kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp); + objp = base + s->size * objnr; + kpp->kp_objp = objp; + if (WARN_ON_ONCE(objp < base || objp >= base + page->objects * s->size || (objp - base) % s->size) || + !(s->flags & SLAB_STORE_USER)) + return; +#ifdef CONFIG_SLUB_DEBUG + trackp = get_track(s, objp, TRACK_ALLOC); + kpp->kp_ret = (void *)trackp->addr; +#ifdef CONFIG_STACKTRACE + for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { + kpp->kp_stack[i] = (void *)trackp->addrs[i]; + if (!kpp->kp_stack[i]) + break; + } +#endif +#endif +} + /******************************************************************** * Kmalloc subsystem *******************************************************************/ @@ -5624,10 +5679,8 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = kset; err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); - if (err) { - kobject_put(&s->kobj); + if (err) goto out; - } err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) diff --git a/mm/swapfile.c b/mm/swapfile.c index 9fffc5af29d1..96799a2f6957 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,7 +47,6 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static sector_t map_swap_entry(swp_entry_t, struct block_device**); DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -1850,12 +1849,13 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { - struct block_device *bdev; struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; - return map_swap_entry(swp_entry(type, offset), &bdev); + se = offset_to_swap_extent(si, offset); + return se->start_block + (offset - se->start_page); } /* @@ -1951,8 +1951,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { - struct vm_fault vmf; - if (!is_swap_pte(*pte)) continue; @@ -1968,9 +1966,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, swap_map = &si->swap_map[offset]; page = lookup_swap_cache(entry, vma, addr); if (!page) { - vmf.vma = vma; - vmf.address = addr; - vmf.pmd = pmd; + struct vm_fault vmf = { + .vma = vma, + .address = addr, + .pmd = pmd, + }; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); } @@ -2282,36 +2283,6 @@ static void drain_mmlist(void) } /* - * Use this swapdev's extent info to locate the (PAGE_SIZE) block which - * corresponds to page offset for the specified swap entry. 
- * Note that the type of this function is sector_t, but it returns page offset - * into the bdev, not sector offset. - */ -static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) -{ - struct swap_info_struct *sis; - struct swap_extent *se; - pgoff_t offset; - - sis = swp_swap_info(entry); - *bdev = sis->bdev; - - offset = swp_offset(entry); - se = offset_to_swap_extent(sis, offset); - return se->start_block + (offset - se->start_page); -} - -/* - * Returns the page offset into bdev for the specified page's swap entry. - */ -sector_t map_swap_page(struct page *page, struct block_device **bdev) -{ - swp_entry_t entry; - entry.val = page_private(page); - return map_swap_entry(entry, bdev); -} - -/* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) diff --git a/mm/util.c b/mm/util.c index 8c9b7d1e7c49..54870226cea6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -982,3 +982,34 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) kunmap_atomic(addr1); return ret; } + +/** + * mem_dump_obj - Print available provenance information + * @object: object for which to find provenance information. + * + * This function uses pr_cont(), so that the caller is expected to have + * printed out whatever preamble is appropriate. The provenance information + * depends on the type of object and on how much debugging is enabled. + * For example, for a slab-cache object, the slab name is printed, and, + * if available, the return address and stack trace from the allocation + * of that object. + */ +void mem_dump_obj(void *object) +{ + if (kmem_valid_obj(object)) { + kmem_dump_obj(object); + return; + } + if (vmalloc_dump_obj(object)) + return; + if (!virt_addr_valid(object)) { + if (object == NULL) + pr_cont(" NULL pointer.\n"); + else if (object == ZERO_SIZE_PTR) + pr_cont(" zero-size pointer.\n"); + else + pr_cont(" non-paged memory.\n"); + return; + } + pr_cont(" non-slab/vmalloc memory.\n"); +} diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e6f352bf0498..4f5f8c907897 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3450,6 +3450,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) } #endif /* CONFIG_SMP */ +bool vmalloc_dump_obj(void *object) +{ + struct vm_struct *vm; + void *objp = (void *)PAGE_ALIGN((unsigned long)object); + + vm = find_vm_area(objp); + if (!vm) + return false; + pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", + vm->nr_pages, (unsigned long)vm->addr, vm->caller); + return true; +} + #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_purge_lock) |
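
As its kernel-doc above states, generic_access_phys() is the generic &vm_operations_struct.access implementation that access_process_vm() uses for VM_IO/VM_PFNMAP mappings. A minimal sketch of how a driver could wire it up on a tree with this change, assuming CONFIG_HAVE_IOREMAP_PROT; the ops structure name is hypothetical:

#include <linux/mm.h>

/* Hypothetical driver hook-up; generic_access_phys() needs CONFIG_HAVE_IOREMAP_PROT. */
static const struct vm_operations_struct example_iomem_vm_ops = {
        .access = generic_access_phys,
};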
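
The mmu_gather rework narrows tlb_gather_mmu() and tlb_finish_mmu() to take only the gather structure and the mm, with tlb_gather_mmu_fullmm() reserved for full address-space teardown (exit/execve). A minimal sketch of a range-unmap caller, following the unmap_region() sequence shown above; example_zap_range() and its bounds are hypothetical:

#include <linux/mm.h>
#include <asm/tlb.h>

/*
 * Hypothetical helper mirroring unmap_region(): the address range is no
 * longer passed to the gather/finish calls, only to unmap_vmas().
 */
static void example_zap_range(struct mm_struct *mm, struct vm_area_struct *vma,
                              unsigned long start, unsigned long end)
{
        struct mmu_gather tlb;

        tlb_gather_mmu(&tlb, mm);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, vma, start, end);
        tlb_finish_mmu(&tlb);
}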
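
page_frag_alloc_align() applies "offset &= align_mask" before returning, so a power-of-two alignment can be requested by passing ~(align - 1). A sketch of such a caller, assuming the matching declaration lands in <linux/gfp.h> as part of the same series; the cache variable and the 64-byte alignment are illustrative:

#include <linux/gfp.h>
#include <linux/mm_types.h>

static struct page_frag_cache example_frag_cache;

/* Request a fragment aligned to 64 bytes: mask = ~(64 - 1) clears the low bits. */
static void *example_alloc_frag(unsigned int size)
{
        return page_frag_alloc_align(&example_frag_cache, size,
                                     GFP_ATOMIC, ~(64U - 1));
}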
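
kmem_dump_obj(), vmalloc_dump_obj() and mem_dump_obj() print via pr_cont(), so the caller emits the preamble first and mem_dump_obj() finishes the line with whatever provenance it can find. A sketch of that calling pattern, assuming the declaration lands in <linux/mm.h> as part of this series; the helper name and message are hypothetical:

#include <linux/mm.h>
#include <linux/printk.h>

/* Hypothetical debug helper: report what a suspicious pointer refers to. */
static void example_report_ptr(void *ptr)
{
        /* No trailing newline: mem_dump_obj() continues this line via pr_cont(). */
        pr_err("suspicious pointer:");
        mem_dump_obj(ptr);
}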