summary refs log tree commit diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 3
-rw-r--r-- mm/Makefile 1
-rw-r--r-- mm/compaction.c 3
-rw-r--r-- mm/filemap.c 183
-rw-r--r-- mm/frame_vector.c 240
-rw-r--r-- mm/highmem.c 7
-rw-r--r-- mm/huge_memory.c 37
-rw-r--r-- mm/hugetlb.c 71
-rw-r--r-- mm/kasan/hw_tags.c 73
-rw-r--r-- mm/kasan/init.c 23
-rw-r--r-- mm/kasan/kasan.h 2
-rw-r--r-- mm/khugepaged.c 37
-rw-r--r-- mm/madvise.c 12
-rw-r--r-- mm/memblock.c 57
-rw-r--r-- mm/memcontrol.c 9
-rw-r--r-- mm/memory-failure.c 20
-rw-r--r-- mm/memory.c 318
-rw-r--r-- mm/migrate.c 29
-rw-r--r-- mm/mmap.c 8
-rw-r--r-- mm/mmu_gather.c 31
-rw-r--r-- mm/mremap.c 8
-rw-r--r-- mm/nommu.c 3
-rw-r--r-- mm/oom_kill.c 6
-rw-r--r-- mm/page_alloc.c 10
-rw-r--r-- mm/page_io.c 47
-rw-r--r-- mm/percpu.c 36
-rw-r--r-- mm/shmem.c 6
-rw-r--r-- mm/slab.c 20
-rw-r--r-- mm/slab.h 12
-rw-r--r-- mm/slab_common.c 75
-rw-r--r-- mm/slob.c 6
-rw-r--r-- mm/slub.c 69
-rw-r--r-- mm/swapfile.c 47
-rw-r--r-- mm/util.c 31
-rw-r--r-- mm/vmalloc.c 13
35 files changed, 792 insertions, 761 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f730605b8dcf..24c045b24b95 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -804,9 +804,6 @@ config DEVICE_PRIVATE
config VMAP_PFN
bool
-config FRAME_VECTOR
- bool
-
config ARCH_USES_HIGH_VMA_FLAGS
bool
config ARCH_HAS_PKEYS
diff --git a/mm/Makefile b/mm/Makefile
index b6cd2fffa492..135bbb65511a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -110,7 +110,6 @@ obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
-obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
diff --git a/mm/compaction.c b/mm/compaction.c
index e5acb9714436..190ccdaa6c19 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1342,7 +1342,7 @@ fast_isolate_freepages(struct compact_control *cc)
{
unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
unsigned int nr_scanned = 0;
- unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
+ unsigned long low_pfn, min_pfn, highest = 0;
unsigned long nr_isolated = 0;
unsigned long distance;
struct page *page = NULL;
@@ -1387,6 +1387,7 @@ fast_isolate_freepages(struct compact_control *cc)
struct page *freepage;
unsigned long flags;
unsigned int order_scanned = 0;
+ unsigned long high_pfn = 0;
if (!area->nr_free)
continue;
diff --git a/mm/filemap.c b/mm/filemap.c
index 5c9d564317a5..6ff2a3fb0dc7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,8 @@
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -835,6 +837,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
int error;
+ bool charged = false;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -848,6 +851,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
error = mem_cgroup_charge(page, current->mm, gfp);
if (error)
goto error;
+ charged = true;
}
gfp &= GFP_RECLAIM_MASK;
@@ -896,6 +900,8 @@ unlock:
if (xas_error(&xas)) {
error = xas_error(&xas);
+ if (charged)
+ mem_cgroup_uncharge(page);
goto error;
}
@@ -2911,74 +2917,163 @@ out_retry:
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_fault *vmf,
- pgoff_t start_pgoff, pgoff_t end_pgoff)
+static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
{
- struct file *file = vmf->vma->vm_file;
+ struct mm_struct *mm = vmf->vma->vm_mm;
+
+ /* Huge page is mapped? No need to proceed. */
+ if (pmd_trans_huge(*vmf->pmd)) {
+ unlock_page(page);
+ put_page(page);
+ return true;
+ }
+
+ if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
+ vm_fault_t ret = do_set_pmd(vmf, page);
+ if (!ret) {
+ /* The page is mapped successfully, reference consumed. */
+ unlock_page(page);
+ return true;
+ }
+ }
+
+ if (pmd_none(*vmf->pmd)) {
+ vmf->ptl = pmd_lock(mm, vmf->pmd);
+ if (likely(pmd_none(*vmf->pmd))) {
+ mm_inc_nr_ptes(mm);
+ pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
+ vmf->prealloc_pte = NULL;
+ }
+ spin_unlock(vmf->ptl);
+ }
+
+ /* See comment in handle_pte_fault() */
+ if (pmd_devmap_trans_unstable(vmf->pmd)) {
+ unlock_page(page);
+ put_page(page);
+ return true;
+ }
+
+ return false;
+}
+
+static struct page *next_uptodate_page(struct page *page,
+ struct address_space *mapping,
+ struct xa_state *xas, pgoff_t end_pgoff)
+{
+ unsigned long max_idx;
+
+ do {
+ if (!page)
+ return NULL;
+ if (xas_retry(xas, page))
+ continue;
+ if (xa_is_value(page))
+ continue;
+ if (PageLocked(page))
+ continue;
+ if (!page_cache_get_speculative(page))
+ continue;
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(xas)))
+ goto skip;
+ if (!PageUptodate(page) || PageReadahead(page))
+ goto skip;
+ if (PageHWPoison(page))
+ goto skip;
+ if (!trylock_page(page))
+ goto skip;
+ if (page->mapping != mapping)
+ goto unlock;
+ if (!PageUptodate(page))
+ goto unlock;
+ max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+ if (xas->xa_index >= max_idx)
+ goto unlock;
+ return page;
+unlock:
+ unlock_page(page);
+skip:
+ put_page(page);
+ } while ((page = xas_next_entry(xas, end_pgoff)) != NULL);
+
+ return NULL;
+}
+
+static inline struct page *first_map_page(struct address_space *mapping,
+ struct xa_state *xas,
+ pgoff_t end_pgoff)
+{
+ return next_uptodate_page(xas_find(xas, end_pgoff),
+ mapping, xas, end_pgoff);
+}
+
+static inline struct page *next_map_page(struct address_space *mapping,
+ struct xa_state *xas,
+ pgoff_t end_pgoff)
+{
+ return next_uptodate_page(xas_next_entry(xas, end_pgoff),
+ mapping, xas, end_pgoff);
+}
+
+vm_fault_t filemap_map_pages(struct vm_fault *vmf,
+ pgoff_t start_pgoff, pgoff_t end_pgoff)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
- unsigned long max_idx;
+ unsigned long addr;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct page *head, *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+ vm_fault_t ret = 0;
rcu_read_lock();
- xas_for_each(&xas, head, end_pgoff) {
- if (xas_retry(&xas, head))
- continue;
- if (xa_is_value(head))
- goto next;
+ head = first_map_page(mapping, &xas, end_pgoff);
+ if (!head)
+ goto out;
- /*
- * Check for a locked page first, as a speculative
- * reference may adversely influence page migration.
- */
- if (PageLocked(head))
- goto next;
- if (!page_cache_get_speculative(head))
- goto next;
+ if (filemap_map_pmd(vmf, head)) {
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
- /* Has the page moved or been split? */
- if (unlikely(head != xas_reload(&xas)))
- goto skip;
+ addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
+ do {
page = find_subpage(head, xas.xa_index);
-
- if (!PageUptodate(head) ||
- PageReadahead(page) ||
- PageHWPoison(page))
- goto skip;
- if (!trylock_page(head))
- goto skip;
-
- if (head->mapping != mapping || !PageUptodate(head))
- goto unlock;
-
- max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
- if (xas.xa_index >= max_idx)
+ if (PageHWPoison(page))
goto unlock;
if (mmap_miss > 0)
mmap_miss--;
- vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
- if (vmf->pte)
- vmf->pte += xas.xa_index - last_pgoff;
+ addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
+ vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
- if (alloc_set_pte(vmf, page))
+
+ if (!pte_none(*vmf->pte))
goto unlock;
+
+ /* We're about to handle the fault */
+ if (vmf->address == addr)
+ ret = VM_FAULT_NOPAGE;
+
+ do_set_pte(vmf, page, addr);
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, addr, vmf->pte);
unlock_page(head);
- goto next;
+ continue;
unlock:
unlock_page(head);
-skip:
put_page(head);
-next:
- /* Huge page is mapped? No need to proceed. */
- if (pmd_trans_huge(*vmf->pmd))
- break;
- }
+ } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+out:
rcu_read_unlock();
WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
+ return ret;
}
EXPORT_SYMBOL(filemap_map_pages);
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
deleted file mode 100644
index 10f82d5643b6..000000000000
--- a/mm/frame_vector.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/pagemap.h>
-#include <linux/sched.h>
-
-/**
- * get_vaddr_frames() - map virtual addresses to pfns
- * @start: starting user address
- * @nr_frames: number of pages / pfns from start to map
- * @gup_flags: flags modifying lookup behaviour
- * @vec: structure which receives pages / pfns of the addresses mapped.
- * It should have space for at least nr_frames entries.
- *
- * This function maps virtual addresses from @start and fills @vec structure
- * with page frame numbers or page pointers to corresponding pages (choice
- * depends on the type of the vma underlying the virtual address). If @start
- * belongs to a normal vma, the function grabs reference to each of the pages
- * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
- * touch page structures and the caller must make sure pfns aren't reused for
- * anything else while he is using them.
- *
- * The function returns number of pages mapped which may be less than
- * @nr_frames. In particular we stop mapping if there are more vmas of
- * different type underlying the specified range of virtual addresses.
- * When the function isn't able to map a single page, it returns error.
- *
- * This function takes care of grabbing mmap_lock as necessary.
- */
-int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
- unsigned int gup_flags, struct frame_vector *vec)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int ret = 0;
- int err;
- int locked;
-
- if (nr_frames == 0)
- return 0;
-
- if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
- nr_frames = vec->nr_allocated;
-
- start = untagged_addr(start);
-
- mmap_read_lock(mm);
- locked = 1;
- vma = find_vma_intersection(mm, start, start + 1);
- if (!vma) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * While get_vaddr_frames() could be used for transient (kernel
- * controlled lifetime) pinning of memory pages all current
- * users establish long term (userspace controlled lifetime)
- * page pinning. Treat get_vaddr_frames() like
- * get_user_pages_longterm() and disallow it for filesystem-dax
- * mappings.
- */
- if (vma_is_fsdax(vma)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
- vec->got_ref = true;
- vec->is_pfns = false;
- ret = pin_user_pages_locked(start, nr_frames,
- gup_flags, (struct page **)(vec->ptrs), &locked);
- goto out;
- }
-
- vec->got_ref = false;
- vec->is_pfns = true;
- do {
- unsigned long *nums = frame_vector_pfns(vec);
-
- while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
- err = follow_pfn(vma, start, &nums[ret]);
- if (err) {
- if (ret == 0)
- ret = err;
- goto out;
- }
- start += PAGE_SIZE;
- ret++;
- }
- /*
- * We stop if we have enough pages or if VMA doesn't completely
- * cover the tail page.
- */
- if (ret >= nr_frames || start < vma->vm_end)
- break;
- vma = find_vma_intersection(mm, start, start + 1);
- } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
-out:
- if (locked)
- mmap_read_unlock(mm);
- if (!ret)
- ret = -EFAULT;
- if (ret > 0)
- vec->nr_frames = ret;
- return ret;
-}
-EXPORT_SYMBOL(get_vaddr_frames);
-
-/**
- * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
- * them
- * @vec: frame vector to put
- *
- * Drop references to pages if get_vaddr_frames() acquired them. We also
- * invalidate the frame vector so that it is prepared for the next call into
- * get_vaddr_frames().
- */
-void put_vaddr_frames(struct frame_vector *vec)
-{
- struct page **pages;
-
- if (!vec->got_ref)
- goto out;
- pages = frame_vector_pages(vec);
- /*
- * frame_vector_pages() might needed to do a conversion when
- * get_vaddr_frames() got pages but vec was later converted to pfns.
- * But it shouldn't really fail to convert pfns back...
- */
- if (WARN_ON(IS_ERR(pages)))
- goto out;
-
- unpin_user_pages(pages, vec->nr_frames);
- vec->got_ref = false;
-out:
- vec->nr_frames = 0;
-}
-EXPORT_SYMBOL(put_vaddr_frames);
-
-/**
- * frame_vector_to_pages - convert frame vector to contain page pointers
- * @vec: frame vector to convert
- *
- * Convert @vec to contain array of page pointers. If the conversion is
- * successful, return 0. Otherwise return an error. Note that we do not grab
- * page references for the page structures.
- */
-int frame_vector_to_pages(struct frame_vector *vec)
-{
- int i;
- unsigned long *nums;
- struct page **pages;
-
- if (!vec->is_pfns)
- return 0;
- nums = frame_vector_pfns(vec);
- for (i = 0; i < vec->nr_frames; i++)
- if (!pfn_valid(nums[i]))
- return -EINVAL;
- pages = (struct page **)nums;
- for (i = 0; i < vec->nr_frames; i++)
- pages[i] = pfn_to_page(nums[i]);
- vec->is_pfns = false;
- return 0;
-}
-EXPORT_SYMBOL(frame_vector_to_pages);
-
-/**
- * frame_vector_to_pfns - convert frame vector to contain pfns
- * @vec: frame vector to convert
- *
- * Convert @vec to contain array of pfns.
- */
-void frame_vector_to_pfns(struct frame_vector *vec)
-{
- int i;
- unsigned long *nums;
- struct page **pages;
-
- if (vec->is_pfns)
- return;
- pages = (struct page **)(vec->ptrs);
- nums = (unsigned long *)pages;
- for (i = 0; i < vec->nr_frames; i++)
- nums[i] = page_to_pfn(pages[i]);
- vec->is_pfns = true;
-}
-EXPORT_SYMBOL(frame_vector_to_pfns);
-
-/**
- * frame_vector_create() - allocate & initialize structure for pinned pfns
- * @nr_frames: number of pfns slots we should reserve
- *
- * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns
- * pfns.
- */
-struct frame_vector *frame_vector_create(unsigned int nr_frames)
-{
- struct frame_vector *vec;
- int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
-
- if (WARN_ON_ONCE(nr_frames == 0))
- return NULL;
- /*
- * This is absurdly high. It's here just to avoid strange effects when
- * arithmetics overflows.
- */
- if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2))
- return NULL;
- /*
- * Avoid higher order allocations, use vmalloc instead. It should
- * be rare anyway.
- */
- vec = kvmalloc(size, GFP_KERNEL);
- if (!vec)
- return NULL;
- vec->nr_allocated = nr_frames;
- vec->nr_frames = 0;
- return vec;
-}
-EXPORT_SYMBOL(frame_vector_create);
-
-/**
- * frame_vector_destroy() - free memory allocated to carry frame vector
- * @vec: Frame vector to free
- *
- * Free structure allocated by frame_vector_create() to carry frames.
- */
-void frame_vector_destroy(struct frame_vector *vec)
-{
- /* Make sure put_vaddr_frames() got called properly... */
- VM_BUG_ON(vec->nr_frames > 0);
- kvfree(vec);
-}
-EXPORT_SYMBOL(frame_vector_destroy);
diff --git a/mm/highmem.c b/mm/highmem.c
index c3a9ea7875ef..874b732b120c 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -473,6 +473,11 @@ static inline void *arch_kmap_local_high_get(struct page *page)
}
#endif
+#ifndef arch_kmap_local_set_pte
+#define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) \
+ set_pte_at(mm, vaddr, ptep, ptev)
+#endif
+
/* Unmap a local mapping which was obtained by kmap_high_get() */
static inline bool kmap_high_unmap_local(unsigned long vaddr)
{
@@ -515,7 +520,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte - idx)));
pteval = pfn_pte(pfn, prot);
- set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval);
+ arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte - idx, pteval);
arch_kmap_local_post_map(vaddr, pteval);
current->kmap_ctrl.pteval[kmap_local_idx()] = pteval;
preempt_enable();
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9237976abe72..91ca9b103ee5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2202,7 +2202,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
spinlock_t *ptl;
struct mmu_notifier_range range;
- bool was_locked = false;
+ bool do_unlock_page = false;
pmd_t _pmd;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
@@ -2218,7 +2218,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
VM_BUG_ON(freeze && !page);
if (page) {
VM_WARN_ON_ONCE(!PageLocked(page));
- was_locked = true;
if (page != pmd_page(*pmd))
goto out;
}
@@ -2227,19 +2226,29 @@ repeat:
if (pmd_trans_huge(*pmd)) {
if (!page) {
page = pmd_page(*pmd);
- if (unlikely(!trylock_page(page))) {
- get_page(page);
- _pmd = *pmd;
- spin_unlock(ptl);
- lock_page(page);
- spin_lock(ptl);
- if (unlikely(!pmd_same(*pmd, _pmd))) {
- unlock_page(page);
+ /*
+ * An anonymous page must be locked, to ensure that a
+ * concurrent reuse_swap_page() sees stable mapcount;
+ * but reuse_swap_page() is not used on shmem or file,
+ * and page lock must not be taken when zap_pmd_range()
+ * calls __split_huge_pmd() while i_mmap_lock is held.
+ */
+ if (PageAnon(page)) {
+ if (unlikely(!trylock_page(page))) {
+ get_page(page);
+ _pmd = *pmd;
+ spin_unlock(ptl);
+ lock_page(page);
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(*pmd, _pmd))) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ goto repeat;
+ }
put_page(page);
- page = NULL;
- goto repeat;
}
- put_page(page);
+ do_unlock_page = true;
}
}
if (PageMlocked(page))
@@ -2249,7 +2258,7 @@ repeat:
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
- if (!was_locked && page)
+ if (do_unlock_page)
unlock_page(page);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 18f6ee317900..905a7d549b00 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -79,6 +79,21 @@ DEFINE_SPINLOCK(hugetlb_lock);
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
+static inline bool PageHugeFreed(struct page *head)
+{
+ return page_private(head + 4) == -1UL;
+}
+
+static inline void SetPageHugeFreed(struct page *head)
+{
+ set_page_private(head + 4, -1UL);
+}
+
+static inline void ClearPageHugeFreed(struct page *head)
+{
+ set_page_private(head + 4, 0);
+}
+
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -1028,6 +1043,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
+ SetPageHugeFreed(page);
}
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -1044,6 +1060,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
+ ClearPageHugeFreed(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
return page;
@@ -1344,12 +1361,11 @@ struct hstate *size_to_hstate(unsigned long size)
*/
bool page_huge_active(struct page *page)
{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
- return PageHead(page) && PagePrivate(&page[1]);
+ return PageHeadHuge(page) && PagePrivate(&page[1]);
}
/* never called for tail page */
-static void set_page_huge_active(struct page *page)
+void set_page_huge_active(struct page *page)
{
VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
SetPagePrivate(&page[1]);
@@ -1505,6 +1521,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
+ ClearPageHugeFreed(page);
spin_unlock(&hugetlb_lock);
}
@@ -1755,6 +1772,7 @@ int dissolve_free_huge_page(struct page *page)
{
int rc = -EBUSY;
+retry:
/* Not to disrupt normal path by vainly holding hugetlb_lock */
if (!PageHuge(page))
return 0;
@@ -1771,6 +1789,26 @@ int dissolve_free_huge_page(struct page *page)
int nid = page_to_nid(head);
if (h->free_huge_pages - h->resv_huge_pages == 0)
goto out;
+
+ /*
+ * We should make sure that the page is already on the free list
+ * when it is dissolved.
+ */
+ if (unlikely(!PageHugeFreed(head))) {
+ spin_unlock(&hugetlb_lock);
+ cond_resched();
+
+ /*
+ * Theoretically, we should return -EBUSY when we
+ * encounter this race. In practice, the race window
+ * is quite small, so a retry has a good chance of
+ * dissolving the page successfully. Retrying here
+ * is therefore an optimization that increases the
+ * success rate of dissolving the page.
+ */
+ goto retry;
+ }
+
/*
* Move PageHWPoison flag from head page to the raw error page,
* which makes any subpages rather than the error page reusable.
@@ -2009,13 +2047,16 @@ retry:
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ int zeroed;
+
if ((--needed) < 0)
break;
/*
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
*/
- VM_BUG_ON_PAGE(!put_page_testzero(page), page);
+ zeroed = put_page_testzero(page);
+ VM_BUG_ON_PAGE(!zeroed, page);
enqueue_huge_page(h, page);
}
free:
@@ -3967,25 +4008,11 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
- struct mm_struct *mm;
struct mmu_gather tlb;
- unsigned long tlb_start = start;
- unsigned long tlb_end = end;
-
- /*
- * If shared PMDs were possibly used within this vma range, adjust
- * start/end for worst case tlb flushing.
- * Note that we can not be sure if PMDs are shared until we try to
- * unmap pages. However, we want to make sure TLB flushing covers
- * the largest possible range.
- */
- adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
-
- mm = vma->vm_mm;
- tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
+ tlb_gather_mmu(&tlb, vma->vm_mm);
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
- tlb_finish_mmu(&tlb, tlb_start, tlb_end);
+ tlb_finish_mmu(&tlb);
}
/*
@@ -5555,9 +5582,9 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
{
bool ret = true;
- VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
- if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+ if (!PageHeadHuge(page) || !page_huge_active(page) ||
+ !get_page_unless_zero(page)) {
ret = false;
goto unlock;
}
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 55bd6f09c70f..d558799b25b3 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -19,11 +19,10 @@
#include "kasan.h"
-enum kasan_arg_mode {
- KASAN_ARG_MODE_DEFAULT,
- KASAN_ARG_MODE_OFF,
- KASAN_ARG_MODE_PROD,
- KASAN_ARG_MODE_FULL,
+enum kasan_arg {
+ KASAN_ARG_DEFAULT,
+ KASAN_ARG_OFF,
+ KASAN_ARG_ON,
};
enum kasan_arg_stacktrace {
@@ -38,7 +37,7 @@ enum kasan_arg_fault {
KASAN_ARG_FAULT_PANIC,
};
-static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
+static enum kasan_arg kasan_arg __ro_after_init;
static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
static enum kasan_arg_fault kasan_arg_fault __ro_after_init;
@@ -52,26 +51,24 @@ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
/* Whether panic or disable tag checking on fault. */
bool kasan_flag_panic __ro_after_init;
-/* kasan.mode=off/prod/full */
-static int __init early_kasan_mode(char *arg)
+/* kasan=off/on */
+static int __init early_kasan_flag(char *arg)
{
if (!arg)
return -EINVAL;
if (!strcmp(arg, "off"))
- kasan_arg_mode = KASAN_ARG_MODE_OFF;
- else if (!strcmp(arg, "prod"))
- kasan_arg_mode = KASAN_ARG_MODE_PROD;
- else if (!strcmp(arg, "full"))
- kasan_arg_mode = KASAN_ARG_MODE_FULL;
+ kasan_arg = KASAN_ARG_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg = KASAN_ARG_ON;
else
return -EINVAL;
return 0;
}
-early_param("kasan.mode", early_kasan_mode);
+early_param("kasan", early_kasan_flag);
-/* kasan.stack=off/on */
+/* kasan.stacktrace=off/on */
static int __init early_kasan_flag_stacktrace(char *arg)
{
if (!arg)
@@ -113,8 +110,8 @@ void kasan_init_hw_tags_cpu(void)
* as this function is only called for MTE-capable hardware.
*/
- /* If KASAN is disabled, do nothing. */
- if (kasan_arg_mode == KASAN_ARG_MODE_OFF)
+ /* If KASAN is disabled via command line, don't initialize it. */
+ if (kasan_arg == KASAN_ARG_OFF)
return;
hw_init_tags(KASAN_TAG_MAX);
@@ -124,43 +121,24 @@ void kasan_init_hw_tags_cpu(void)
/* kasan_init_hw_tags() is called once on boot CPU. */
void __init kasan_init_hw_tags(void)
{
- /* If hardware doesn't support MTE, do nothing. */
+ /* If hardware doesn't support MTE, don't initialize KASAN. */
if (!system_supports_mte())
return;
- /* Choose KASAN mode if kasan boot parameter is not provided. */
- if (kasan_arg_mode == KASAN_ARG_MODE_DEFAULT) {
- if (IS_ENABLED(CONFIG_DEBUG_KERNEL))
- kasan_arg_mode = KASAN_ARG_MODE_FULL;
- else
- kasan_arg_mode = KASAN_ARG_MODE_PROD;
- }
-
- /* Preset parameter values based on the mode. */
- switch (kasan_arg_mode) {
- case KASAN_ARG_MODE_DEFAULT:
- /* Shouldn't happen as per the check above. */
- WARN_ON(1);
+ /* If KASAN is disabled via command line, don't initialize it. */
+ if (kasan_arg == KASAN_ARG_OFF)
return;
- case KASAN_ARG_MODE_OFF:
- /* If KASAN is disabled, do nothing. */
- return;
- case KASAN_ARG_MODE_PROD:
- static_branch_enable(&kasan_flag_enabled);
- break;
- case KASAN_ARG_MODE_FULL:
- static_branch_enable(&kasan_flag_enabled);
- static_branch_enable(&kasan_flag_stacktrace);
- break;
- }
- /* Now, optionally override the presets. */
+ /* Enable KASAN. */
+ static_branch_enable(&kasan_flag_enabled);
switch (kasan_arg_stacktrace) {
case KASAN_ARG_STACKTRACE_DEFAULT:
+ /* Default to enabling stack trace collection. */
+ static_branch_enable(&kasan_flag_stacktrace);
break;
case KASAN_ARG_STACKTRACE_OFF:
- static_branch_disable(&kasan_flag_stacktrace);
+ /* Do nothing, kasan_flag_stacktrace keeps its default value. */
break;
case KASAN_ARG_STACKTRACE_ON:
static_branch_enable(&kasan_flag_stacktrace);
@@ -169,11 +147,16 @@ void __init kasan_init_hw_tags(void)
switch (kasan_arg_fault) {
case KASAN_ARG_FAULT_DEFAULT:
+ /*
+ * Default to no panic on report.
+ * Do nothing, kasan_flag_panic keeps its default value.
+ */
break;
case KASAN_ARG_FAULT_REPORT:
- kasan_flag_panic = false;
+ /* Do nothing, kasan_flag_panic keeps its default value. */
break;
case KASAN_ARG_FAULT_PANIC:
+ /* Enable panic on report. */
kasan_flag_panic = true;
break;
}
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index 7ca0b92d5886..c4605ac9837b 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -373,9 +373,10 @@ static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr,
if (kasan_pte_table(*pmd)) {
if (IS_ALIGNED(addr, PMD_SIZE) &&
- IS_ALIGNED(next, PMD_SIZE))
+ IS_ALIGNED(next, PMD_SIZE)) {
pmd_clear(pmd);
- continue;
+ continue;
+ }
}
pte = pte_offset_kernel(pmd, addr);
kasan_remove_pte_table(pte, addr, next);
@@ -398,9 +399,10 @@ static void kasan_remove_pud_table(pud_t *pud, unsigned long addr,
if (kasan_pmd_table(*pud)) {
if (IS_ALIGNED(addr, PUD_SIZE) &&
- IS_ALIGNED(next, PUD_SIZE))
+ IS_ALIGNED(next, PUD_SIZE)) {
pud_clear(pud);
- continue;
+ continue;
+ }
}
pmd = pmd_offset(pud, addr);
pmd_base = pmd_offset(pud, 0);
@@ -424,9 +426,10 @@ static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr,
if (kasan_pud_table(*p4d)) {
if (IS_ALIGNED(addr, P4D_SIZE) &&
- IS_ALIGNED(next, P4D_SIZE))
+ IS_ALIGNED(next, P4D_SIZE)) {
p4d_clear(p4d);
- continue;
+ continue;
+ }
}
pud = pud_offset(p4d, addr);
kasan_remove_pud_table(pud, addr, next);
@@ -457,9 +460,10 @@ void kasan_remove_zero_shadow(void *start, unsigned long size)
if (kasan_p4d_table(*pgd)) {
if (IS_ALIGNED(addr, PGDIR_SIZE) &&
- IS_ALIGNED(next, PGDIR_SIZE))
+ IS_ALIGNED(next, PGDIR_SIZE)) {
pgd_clear(pgd);
- continue;
+ continue;
+ }
}
p4d = p4d_offset(pgd, addr);
@@ -482,7 +486,6 @@ int kasan_add_zero_shadow(void *start, unsigned long size)
ret = kasan_populate_early_shadow(shadow_start, shadow_end);
if (ret)
- kasan_remove_zero_shadow(shadow_start,
- size >> KASAN_SHADOW_SCALE_SHIFT);
+ kasan_remove_zero_shadow(start, size);
return ret;
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index cc4d9e1d49b1..8c706e7652f2 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -209,7 +209,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
static inline bool addr_has_metadata(const void *addr)
{
- return true;
+ return (is_vmalloc_addr(addr) || virt_addr_valid(addr));
}
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 67ab391a5373..fb0fdaec34d5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -991,38 +991,41 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
static bool __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
+ unsigned long haddr, pmd_t *pmd,
int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
- struct vm_fault vmf = {
- .vma = vma,
- .address = address,
- .flags = FAULT_FLAG_ALLOW_RETRY,
- .pmd = pmd,
- .pgoff = linear_page_index(vma, address),
- };
-
- vmf.pte = pte_offset_map(pmd, address);
- for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
- vmf.pte++, vmf.address += PAGE_SIZE) {
+ unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
+
+ for (address = haddr; address < end; address += PAGE_SIZE) {
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = address,
+ .pgoff = linear_page_index(vma, haddr),
+ .flags = FAULT_FLAG_ALLOW_RETRY,
+ .pmd = pmd,
+ };
+
+ vmf.pte = pte_offset_map(pmd, address);
vmf.orig_pte = *vmf.pte;
- if (!is_swap_pte(vmf.orig_pte))
+ if (!is_swap_pte(vmf.orig_pte)) {
+ pte_unmap(vmf.pte);
continue;
+ }
swapped_in++;
ret = do_swap_page(&vmf);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
if (ret & VM_FAULT_RETRY) {
mmap_read_lock(mm);
- if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
+ if (hugepage_vma_revalidate(mm, haddr, &vma)) {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
/* check if the pmd is still valid */
- if (mm_find_pmd(mm, address) != pmd) {
+ if (mm_find_pmd(mm, haddr) != pmd) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
@@ -1031,11 +1034,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
- /* pte is unmapped now, we need to map it */
- vmf.pte = pte_offset_map(pmd, vmf.address);
}
- vmf.pte--;
- pte_unmap(vmf.pte);
/* Drain LRU add pagevec to remove extra pin on the swapped in pages */
if (swapped_in)
diff --git a/mm/madvise.c b/mm/madvise.c
index d4f5eece9d56..df692d2e35d4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -506,9 +506,9 @@ static long madvise_cold(struct vm_area_struct *vma,
return -EINVAL;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, mm);
madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
- tlb_finish_mmu(&tlb, start_addr, end_addr);
+ tlb_finish_mmu(&tlb);
return 0;
}
@@ -559,9 +559,9 @@ static long madvise_pageout(struct vm_area_struct *vma,
return 0;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, mm);
madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
- tlb_finish_mmu(&tlb, start_addr, end_addr);
+ tlb_finish_mmu(&tlb);
return 0;
}
@@ -724,7 +724,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
@@ -733,7 +733,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
&madvise_free_walk_ops, &tlb);
tlb_end_vma(&tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, range.start, range.end);
+ tlb_finish_mmu(&tlb);
return 0;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index d24bcfa88d2f..afaefa8fc6ab 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -275,14 +275,6 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
*
* Find @size free area aligned to @align in the specified range and node.
*
- * When allocation direction is bottom-up, the @start should be greater
- * than the end of the kernel image. Otherwise, it will be trimmed. The
- * reason is that we want the bottom-up allocation just near the kernel
- * image so it is highly likely that the allocated memory and the kernel
- * will reside in the same node.
- *
- * If bottom-up allocation failed, will try to allocate memory top-down.
- *
* Return:
* Found address on success, 0 on failure.
*/
@@ -291,8 +283,6 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t end, int nid,
enum memblock_flags flags)
{
- phys_addr_t kernel_end, ret;
-
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
end == MEMBLOCK_ALLOC_KASAN)
@@ -301,40 +291,13 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
end = max(start, end);
- kernel_end = __pa_symbol(_end);
-
- /*
- * try bottom-up allocation only when bottom-up mode
- * is set and @end is above the kernel image.
- */
- if (memblock_bottom_up() && end > kernel_end) {
- phys_addr_t bottom_up_start;
-
- /* make sure we will allocate above the kernel */
- bottom_up_start = max(start, kernel_end);
-
- /* ok, try bottom-up allocation first */
- ret = __memblock_find_range_bottom_up(bottom_up_start, end,
- size, align, nid, flags);
- if (ret)
- return ret;
- /*
- * we always limit bottom-up allocation above the kernel,
- * but top-down allocation doesn't have the limit, so
- * retrying top-down allocation may succeed when bottom-up
- * allocation failed.
- *
- * bottom-up allocation is expected to be fail very rarely,
- * so we use WARN_ONCE() here to see the stack trace if
- * fail happens.
- */
- WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE),
- "memblock: bottom-up allocation failed, memory hotremove may be affected\n");
- }
-
- return __memblock_find_range_top_down(start, end, size, align, nid,
- flags);
+ if (memblock_bottom_up())
+ return __memblock_find_range_bottom_up(start, end, size, align,
+ nid, flags);
+ else
+ return __memblock_find_range_top_down(start, end, size, align,
+ nid, flags);
}
/**
@@ -1427,7 +1390,7 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
}
/**
- * memblock_phys_alloc_try_nid - allocate a memory block from specified MUMA node
+ * memblock_phys_alloc_try_nid - allocate a memory block from specified NUMA node
* @size: size of memory block to be allocated in bytes
* @align: alignment of the region and block's size
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
@@ -2087,10 +2050,8 @@ void __init reset_all_zones_managed_pages(void)
/**
* memblock_free_all - release free pages to the buddy allocator
- *
- * Return: the number of pages actually released.
*/
-unsigned long __init memblock_free_all(void)
+void __init memblock_free_all(void)
{
unsigned long pages;
@@ -2099,8 +2060,6 @@ unsigned long __init memblock_free_all(void)
pages = free_low_memory_core_early();
totalram_pages_add(pages);
-
- return pages;
}
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf9076f58582..0b9bd354e97e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3115,9 +3115,7 @@ void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->kmem, nr_pages);
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ refill_stock(memcg, nr_pages);
}
/**
@@ -6273,6 +6271,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;
+ page_counter_set_high(&memcg->memory, high);
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -6296,10 +6296,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
break;
}
- page_counter_set_high(&memcg->memory, high);
-
memcg_wb_domain_size_changed(memcg);
-
return nbytes;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 04d9f154a130..e9481632fcd1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1885,6 +1885,12 @@ static int soft_offline_free_page(struct page *page)
return rc;
}
+static void put_ref_page(struct page *page)
+{
+ if (page)
+ put_page(page);
+}
+
/**
* soft_offline_page - Soft offline a page.
* @pfn: pfn to soft-offline
@@ -1910,20 +1916,26 @@ static int soft_offline_free_page(struct page *page)
int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
- struct page *page;
bool try_again = true;
+ struct page *page, *ref_page = NULL;
+
+ WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
if (!pfn_valid(pfn))
return -ENXIO;
+ if (flags & MF_COUNT_INCREASED)
+ ref_page = pfn_to_page(pfn);
+
/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
page = pfn_to_online_page(pfn);
- if (!page)
+ if (!page) {
+ put_ref_page(ref_page);
return -EIO;
+ }
if (PageHWPoison(page)) {
pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
- if (flags & MF_COUNT_INCREASED)
- put_page(page);
+ put_ref_page(ref_page);
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index feff48e1465a..5da964079678 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -134,6 +134,18 @@ static inline bool arch_faults_on_old_pte(void)
}
#endif
+#ifndef arch_wants_old_prefaulted_pte
+static inline bool arch_wants_old_prefaulted_pte(void)
+{
+ /*
+ * Transitioning a PTE from 'old' to 'young' can be expensive on
+ * some architectures, even if it's performed in hardware. By
+ * default, "false" means prefaulted entries will be 'young'.
+ */
+ return false;
+}
+#endif
+
static int __init disable_randmaps(char *s)
{
randomize_va_space = 0;
@@ -1534,13 +1546,13 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
start, start + size);
- tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
+ tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
unmap_single_vma(&tlb, vma, start, range.end, NULL);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, start, range.end);
+ tlb_finish_mmu(&tlb);
}
/**
@@ -1561,12 +1573,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address, address + size);
- tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
+ tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
unmap_single_vma(&tlb, vma, address, range.end, details);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, address, range.end);
+ tlb_finish_mmu(&tlb);
}
/**
@@ -3503,7 +3515,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
- /* See the comment in pte_alloc_one_map() */
+ /* See comment in handle_pte_fault() */
if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
@@ -3643,66 +3655,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
return ret;
}
-/*
- * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
- * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
- * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
- * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
- */
-static int pmd_devmap_trans_unstable(pmd_t *pmd)
-{
- return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
-}
-
-static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
-{
- struct vm_area_struct *vma = vmf->vma;
-
- if (!pmd_none(*vmf->pmd))
- goto map_pte;
- if (vmf->prealloc_pte) {
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_none(*vmf->pmd))) {
- spin_unlock(vmf->ptl);
- goto map_pte;
- }
-
- mm_inc_nr_ptes(vma->vm_mm);
- pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
- spin_unlock(vmf->ptl);
- vmf->prealloc_pte = NULL;
- } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
- return VM_FAULT_OOM;
- }
-map_pte:
- /*
- * If a huge pmd materialized under us just retry later. Use
- * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
- * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
- * under us and then back to pmd_none, as a result of MADV_DONTNEED
- * running immediately after a huge pmd fault in a different thread of
- * this mm, in turn leading to a misleading pmd_trans_huge() retval.
- * All we have to ensure is that it is a regular pmd that we can walk
- * with pte_offset_map() and we can do that through an atomic read in
- * C, which is what pmd_trans_unstable() provides.
- */
- if (pmd_devmap_trans_unstable(vmf->pmd))
- return VM_FAULT_NOPAGE;
-
- /*
- * At this point we know that our vmf->pmd points to a page of ptes
- * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
- * for the duration of the fault. If a racing MADV_DONTNEED runs and
- * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
- * be valid and we will re-check to make sure the vmf->pte isn't
- * pte_none() under vmf->ptl protection when we return to
- * alloc_set_pte().
- */
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
- &vmf->ptl);
- return 0;
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
@@ -3717,7 +3669,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -3775,76 +3727,41 @@ out:
return ret;
}
#else
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
- BUILD_BUG();
- return 0;
+ return VM_FAULT_FALLBACK;
}
#endif
-/**
- * alloc_set_pte - setup new PTE entry for given page and add reverse page
- * mapping. If needed, the function allocates page table or use pre-allocated.
- *
- * @vmf: fault environment
- * @page: page to map
- *
- * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
- * return.
- *
- * Target users are page handler itself and implementations of
- * vm_ops->map_pages.
- *
- * Return: %0 on success, %VM_FAULT_ code in case of error.
- */
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
+ bool prefault = vmf->address != addr;
pte_t entry;
- vm_fault_t ret;
-
- if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
- ret = do_set_pmd(vmf, page);
- if (ret != VM_FAULT_FALLBACK)
- return ret;
- }
-
- if (!vmf->pte) {
- ret = pte_alloc_one_map(vmf);
- if (ret)
- return ret;
- }
-
- /* Re-check under ptl */
- if (unlikely(!pte_none(*vmf->pte))) {
- update_mmu_tlb(vma, vmf->address, vmf->pte);
- return VM_FAULT_NOPAGE;
- }
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
- entry = pte_sw_mkyoung(entry);
+
+ if (prefault && arch_wants_old_prefaulted_pte())
+ entry = pte_mkold(entry);
+ else
+ entry = pte_sw_mkyoung(entry);
+
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, vmf->address, false);
+ page_add_new_anon_rmap(page, vma, addr, false);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
}
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
-
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, vmf->address, vmf->pte);
-
- return 0;
+ set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}
-
/**
* finish_fault - finish page fault once we have prepared the page to fault
*
@@ -3862,12 +3779,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
*/
vm_fault_t finish_fault(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
struct page *page;
- vm_fault_t ret = 0;
+ vm_fault_t ret;
/* Did we COW the page? */
- if ((vmf->flags & FAULT_FLAG_WRITE) &&
- !(vmf->vma->vm_flags & VM_SHARED))
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
page = vmf->cow_page;
else
page = vmf->page;
@@ -3876,12 +3793,38 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* check even for read faults because we might have lost our CoWed
* page
*/
- if (!(vmf->vma->vm_flags & VM_SHARED))
- ret = check_stable_address_space(vmf->vma->vm_mm);
- if (!ret)
- ret = alloc_set_pte(vmf, page);
- if (vmf->pte)
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (!(vma->vm_flags & VM_SHARED)) {
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ return ret;
+ }
+
+ if (pmd_none(*vmf->pmd)) {
+ if (PageTransCompound(page)) {
+ ret = do_set_pmd(vmf, page);
+ if (ret != VM_FAULT_FALLBACK)
+ return ret;
+ }
+
+ if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
+ return VM_FAULT_OOM;
+ }
+
+ /* See comment in handle_pte_fault() */
+ if (pmd_devmap_trans_unstable(vmf->pmd))
+ return 0;
+
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ ret = 0;
+ /* Re-check under ptl */
+ if (likely(pte_none(*vmf->pte)))
+ do_set_pte(vmf, page, vmf->address);
+ else
+ ret = VM_FAULT_NOPAGE;
+
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
@@ -3951,13 +3894,12 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
int off;
- vm_fault_t ret = 0;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
- vmf->address = max(address & mask, vmf->vma->vm_start);
- off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ address = max(address & mask, vmf->vma->vm_start);
+ off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
start_pgoff -= off;
/*
@@ -3965,7 +3907,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
* the vma or nr_pages from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
- ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
@@ -3973,31 +3915,11 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
if (pmd_none(*vmf->pmd)) {
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
- goto out;
+ return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
- vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
-
- /* Huge page is mapped? Page fault is solved */
- if (pmd_trans_huge(*vmf->pmd)) {
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- /* ->map_pages() haven't done anything useful. Cold page cache? */
- if (!vmf->pte)
- goto out;
-
- /* check if the page fault is solved */
- vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
- if (!pte_none(*vmf->pte))
- ret = VM_FAULT_NOPAGE;
- pte_unmap_unlock(vmf->pte, vmf->ptl);
-out:
- vmf->address = address;
- vmf->pte = NULL;
- return ret;
+ return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
}
static vm_fault_t do_read_fault(struct vm_fault *vmf)
@@ -4353,7 +4275,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
*/
vmf->pte = NULL;
} else {
- /* See comment in pte_alloc_one_map() */
+ /*
+ * If a huge pmd materialized under us just retry later. Use
+ * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
+ * of pmd_trans_huge() to ensure the pmd didn't become
+ * pmd_trans_huge under us and then back to pmd_none, as a
+ * result of MADV_DONTNEED running immediately after a huge pmd
+ * fault in a different thread of this mm, in turn leading to a
+ * misleading pmd_trans_huge() retval. All we have to ensure is
+ * that it is a regular pmd that we can walk with
+ * pte_offset_map() and we can do that through an atomic read
+ * in C, which is what pmd_trans_unstable() provides.
+ */
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
/*
@@ -4709,9 +4642,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-int follow_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
- spinlock_t **ptlp)
+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+ struct mmu_notifier_range *range, pte_t **ptepp,
+ pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4777,6 +4710,34 @@ out:
}
/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
+{
+ return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
+}
+EXPORT_SYMBOL_GPL(follow_pte);
+
+/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
* @address: user virtual address
@@ -4784,6 +4745,9 @@ out:
*
* Only IO mappings and raw PFN mappings are allowed.
*
+ * This function does not allow the caller to read the permissions
+ * of the PTE. Do not use it.
+ *
* Return: zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
@@ -4796,7 +4760,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
return ret;
- ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl);
+ ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
*pfn = pte_pfn(*ptep);
@@ -4817,7 +4781,7 @@ int follow_phys(struct vm_area_struct *vma,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
- if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl))
+ if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
goto out;
pte = *ptep;
@@ -4834,28 +4798,68 @@ out:
return ret;
}
+/**
+ * generic_access_phys - generic implementation for iomem mmap access
+ * @vma: the vma to access
+ * @addr: userspace address, not relative offset within @vma
+ * @buf: buffer to read/write
+ * @len: length of transfer
+ * @write: set to FOLL_WRITE when writing, otherwise reading
+ *
+ * This is a generic implementation for &vm_operations_struct.access for an
+ * iomem mapping. This callback is used by access_process_vm() when the @vma is
+ * not page based.
+ */
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
unsigned long prot = 0;
void __iomem *maddr;
- int offset = addr & (PAGE_SIZE-1);
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ int offset = offset_in_page(addr);
+ int ret = -EINVAL;
- if (follow_phys(vma, addr, write, &prot, &phys_addr))
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ return -EINVAL;
+
+retry:
+ if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+ return -EINVAL;
+ pte = *ptep;
+ pte_unmap_unlock(ptep, ptl);
+
+ prot = pgprot_val(pte_pgprot(pte));
+ phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+
+ if ((write & FOLL_WRITE) && !pte_write(pte))
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (!maddr)
return -ENOMEM;
+ if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+ goto out_unmap;
+
+ if (!pte_same(pte, *ptep)) {
+ pte_unmap_unlock(ptep, ptl);
+ iounmap(maddr);
+
+ goto retry;
+ }
+
if (write)
memcpy_toio(maddr + offset, buf, len);
else
memcpy_fromio(buf, maddr + offset, len);
+ ret = len;
+ pte_unmap_unlock(ptep, ptl);
+out_unmap:
iounmap(maddr);
- return len;
+ return ret;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
diff --git a/mm/migrate.c b/mm/migrate.c
index ee5e612b4cd8..20ca887ea769 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -402,6 +402,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
struct zone *oldzone, *newzone;
int dirty;
int expected_count = expected_page_refs(mapping, page) + extra_count;
+ int nr = thp_nr_pages(page);
if (!mapping) {
/* Anonymous page without mapping */
@@ -437,7 +438,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
newpage->index = page->index;
newpage->mapping = page->mapping;
- page_ref_add(newpage, thp_nr_pages(page)); /* add cache reference */
+ page_ref_add(newpage, nr); /* add cache reference */
if (PageSwapBacked(page)) {
__SetPageSwapBacked(newpage);
if (PageSwapCache(page)) {
@@ -459,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
if (PageTransHuge(page)) {
int i;
- for (i = 1; i < HPAGE_PMD_NR; i++) {
+ for (i = 1; i < nr; i++) {
xas_next(&xas);
xas_store(&xas, newpage);
}
@@ -470,7 +471,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
* to one less reference.
* We know this isn't the last reference.
*/
- page_ref_unfreeze(page, expected_count - thp_nr_pages(page));
+ page_ref_unfreeze(page, expected_count - nr);
xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
@@ -493,17 +494,17 @@ int migrate_page_move_mapping(struct address_space *mapping,
old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
- __dec_lruvec_state(old_lruvec, NR_FILE_PAGES);
- __inc_lruvec_state(new_lruvec, NR_FILE_PAGES);
+ __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
if (PageSwapBacked(page) && !PageSwapCache(page)) {
- __dec_lruvec_state(old_lruvec, NR_SHMEM);
- __inc_lruvec_state(new_lruvec, NR_SHMEM);
+ __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
}
if (dirty && mapping_can_writeback(mapping)) {
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
- __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
- __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
- __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
+ __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
+ __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
+ __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
}
}
local_irq_enable();
@@ -1279,6 +1280,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return -ENOSYS;
}
+ if (page_count(hpage) == 1) {
+ /* page was freed from under us. So we are done. */
+ putback_active_hugepage(hpage);
+ return MIGRATEPAGE_SUCCESS;
+ }
+
new_hpage = get_new_page(hpage, private);
if (!new_hpage)
return -ENOMEM;
diff --git a/mm/mmap.c b/mm/mmap.c
index dc7206032387..90673febce6a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2671,12 +2671,12 @@ static void unmap_region(struct mm_struct *mm,
struct mmu_gather tlb;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start, end);
+ tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
- tlb_finish_mmu(&tlb, start, end);
+ tlb_finish_mmu(&tlb);
}
/*
@@ -3214,12 +3214,12 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
- tlb_gather_mmu(&tlb, mm, 0, -1);
+ tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
- tlb_finish_mmu(&tlb, 0, -1);
+ tlb_finish_mmu(&tlb);
/*
* Walk the list again, actually closing and freeing it,
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 03c33c93a582..0dc7149b0c61 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -253,21 +253,17 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
* tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
* @tlb: the mmu_gather structure to initialize
* @mm: the mm_struct of the target address space
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
+ * @fullmm: @mm is without users and we're going to destroy the full address
+ * space (exit/execve)
*
* Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @start and @end are set to 0 and -1
- * respectively when @mm is without users and we're going to destroy
- * the full address space (exit/execve).
+ * tear-down from @mm.
*/
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
+static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ bool fullmm)
{
tlb->mm = mm;
-
- /* Is it from 0 to ~0? */
- tlb->fullmm = !(start | (end+1));
+ tlb->fullmm = fullmm;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb->need_flush_all = 0;
@@ -287,17 +283,24 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
inc_tlb_flush_pending(tlb->mm);
}
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
+{
+ __tlb_gather_mmu(tlb, mm, false);
+}
+
+void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
+{
+ __tlb_gather_mmu(tlb, mm, true);
+}
+
/**
* tlb_finish_mmu - finish an mmu_gather structure
* @tlb: the mmu_gather structure to finish
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
*
* Called at the end of the shootdown operation to free up any resources that
* were required.
*/
-void tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end)
+void tlb_finish_mmu(struct mmu_gather *tlb)
{
/*
* If there are parallel threads are doing PTE changes on same range
diff --git a/mm/mremap.c b/mm/mremap.c
index f554320281cc..47192691fe32 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,7 +22,6 @@
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
-#include <linux/mm-arch-hooks.h>
#include <linux/userfaultfd_k.h>
#include <asm/cacheflush.h>
@@ -336,8 +335,9 @@ enum pgt_entry {
* valid. Else returns a smaller extent bounded by the end of the source and
* destination pgt_entry.
*/
-static unsigned long get_extent(enum pgt_entry entry, unsigned long old_addr,
- unsigned long old_end, unsigned long new_addr)
+static __always_inline unsigned long get_extent(enum pgt_entry entry,
+ unsigned long old_addr, unsigned long old_end,
+ unsigned long new_addr)
{
unsigned long next, extent, mask, size;
@@ -562,8 +562,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
new_addr = err;
} else {
mremap_userfaultfd_prep(new_vma, uf);
- arch_remap(mm, old_addr, old_addr + old_len,
- new_addr, new_addr + new_len);
}
/* Conceal VM_ACCOUNT so old reservation is not undone */
diff --git a/mm/nommu.c b/mm/nommu.c
index 870fea12823e..5c9ab799c0e6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1668,10 +1668,11 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_fault *vmf,
+vm_fault_t filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
BUG();
+ return 0;
}
EXPORT_SYMBOL(filemap_map_pages);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 04b19b7b5435..c9a33ffe38b7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -546,15 +546,15 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
vma, mm, vma->vm_start,
vma->vm_end);
- tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
- tlb_finish_mmu(&tlb, range.start, range.end);
+ tlb_finish_mmu(&tlb);
ret = false;
continue;
}
unmap_page_range(&tlb, vma, range.start, range.end, NULL);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, range.start, range.end);
+ tlb_finish_mmu(&tlb);
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 027f6481ba59..ef5070fed76b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1207,8 +1207,10 @@ static void kernel_init_free_pages(struct page *page, int numpages)
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
for (i = 0; i < numpages; i++) {
+ u8 tag = page_kasan_tag(page + i);
page_kasan_tag_reset(page + i);
clear_highpage(page + i);
+ page_kasan_tag_set(page + i, tag);
}
kasan_enable_current();
}
@@ -5135,8 +5137,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
}
EXPORT_SYMBOL(__page_frag_cache_drain);
-void *page_frag_alloc(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask)
+void *page_frag_alloc_align(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask,
+ unsigned int align_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -5188,11 +5191,12 @@ refill:
}
nc->pagecnt_bias--;
+ offset &= align_mask;
nc->offset = offset;
return nc->va + offset;
}
-EXPORT_SYMBOL(page_frag_alloc);
+EXPORT_SYMBOL(page_frag_alloc_align);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
diff --git a/mm/page_io.c b/mm/page_io.c
index 9bca17ecc4df..92f7941c6d01 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -26,25 +26,6 @@
#include <linux/uio.h>
#include <linux/sched/task.h>
-static struct bio *get_swap_bio(gfp_t gfp_flags,
- struct page *page, bio_end_io_t end_io)
-{
- struct bio *bio;
-
- bio = bio_alloc(gfp_flags, 1);
- if (bio) {
- struct block_device *bdev;
-
- bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
- bio->bi_end_io = end_io;
-
- bio_add_page(bio, page, thp_size(page), 0);
- }
- return bio;
-}
-
void end_swap_bio_write(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
@@ -361,13 +342,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
- bio = get_swap_bio(GFP_NOIO, page, end_write_func);
- if (bio == NULL) {
- set_page_dirty(page);
- unlock_page(page);
- return -ENOMEM;
- }
+ bio = bio_alloc(GFP_NOIO, 1);
+ bio_set_dev(bio, sis->bdev);
+ bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
+ bio->bi_end_io = end_write_func;
+ bio_add_page(bio, page, thp_size(page), 0);
+
bio_associate_blkg_from_page(bio, page);
count_swpout_vm_event(page);
set_page_writeback(page);
@@ -427,18 +408,18 @@ int swap_readpage(struct page *page, bool synchronous)
}
ret = 0;
- bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
- if (bio == NULL) {
- unlock_page(page);
- ret = -ENOMEM;
- goto out;
- }
- disk = bio->bi_disk;
+ bio = bio_alloc(GFP_KERNEL, 1);
+ bio_set_dev(bio, sis->bdev);
+ bio->bi_opf = REQ_OP_READ;
+ bio->bi_iter.bi_sector = swap_page_sector(page);
+ bio->bi_end_io = end_swap_bio_read;
+ bio_add_page(bio, page, thp_size(page), 0);
+
+ disk = bio->bi_bdev->bd_disk;
/*
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
*/
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (synchronous) {
bio->bi_opf |= REQ_HIPRI;
get_task_struct(current);
diff --git a/mm/percpu.c b/mm/percpu.c
index ad7a37ee74ef..6596a0a4286e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -69,6 +69,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/bitmap.h>
+#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/lcm.h>
@@ -2662,13 +2663,14 @@ early_param("percpu_alloc", percpu_alloc_setup);
* On success, pointer to the new allocation_info is returned. On
* failure, ERR_PTR value is returned.
*/
-static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
+ static struct cpumask mask __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
@@ -2681,6 +2683,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
/* this function may be called multiple times */
memset(group_map, 0, sizeof(group_map));
memset(group_cnt, 0, sizeof(group_cnt));
+ cpumask_clear(&mask);
/* calculate size_sum and ensure dyn_size is enough for early alloc */
size_sum = PFN_ALIGN(static_size + reserved_size +
@@ -2702,24 +2705,27 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
upa--;
max_upa = upa;
+ cpumask_copy(&mask, cpu_possible_mask);
+
/* group cpus according to their proximity */
- for_each_possible_cpu(cpu) {
- group = 0;
- next_group:
- for_each_possible_cpu(tcpu) {
- if (cpu == tcpu)
- break;
- if (group_map[tcpu] == group && cpu_distance_fn &&
- (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
- cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
- group++;
- nr_groups = max(nr_groups, group + 1);
- goto next_group;
- }
- }
+ for (group = 0; !cpumask_empty(&mask); group++) {
+ /* pop the group's first cpu */
+ cpu = cpumask_first(&mask);
group_map[cpu] = group;
group_cnt[group]++;
+ cpumask_clear_cpu(cpu, &mask);
+
+ for_each_cpu(tcpu, &mask) {
+ if (!cpu_distance_fn ||
+ (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
+ cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
+ group_map[tcpu] = group;
+ group_cnt[group]++;
+ cpumask_clear_cpu(tcpu, &mask);
+ }
+ }
}
+ nr_groups = group;
/*
* Wasted space is caused by a ratio imbalance of upa to group_cnt.
diff --git a/mm/shmem.c b/mm/shmem.c
index facdd1a9c524..7924b3bf46fb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1522,11 +1522,11 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
{
struct vm_area_struct pvma;
struct page *page;
- struct vm_fault vmf;
+ struct vm_fault vmf = {
+ .vma = &pvma,
+ };
shmem_pseudo_vma_init(&pvma, info, index);
- vmf.vma = &pvma;
- vmf.address = 0;
page = swap_cluster_readahead(swap, gfp, &vmf);
shmem_pseudo_vma_destroy(&pvma);
diff --git a/mm/slab.c b/mm/slab.c
index d7c8da9319c7..dcc55e78f353 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3635,6 +3635,26 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif /* CONFIG_NUMA */
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+{
+ struct kmem_cache *cachep;
+ unsigned int objnr;
+ void *objp;
+
+ kpp->kp_ptr = object;
+ kpp->kp_page = page;
+ cachep = page->slab_cache;
+ kpp->kp_slab_cache = cachep;
+ objp = object - obj_offset(cachep);
+ kpp->kp_data_offset = obj_offset(cachep);
+ page = virt_to_head_page(objp);
+ objnr = obj_to_index(cachep, page, objp);
+ objp = index_to_obj(cachep, page, objnr);
+ kpp->kp_objp = objp;
+ if (DEBUG && cachep->flags & SLAB_STORE_USER)
+ kpp->kp_ret = *dbg_userword(cachep, objp);
+}
+
/**
* __do_kmalloc - allocate memory
* @size: how many bytes of memory are required.
diff --git a/mm/slab.h b/mm/slab.h
index 1a756a359fa8..ecad9b57bc44 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -615,4 +615,16 @@ static inline bool slab_want_init_on_free(struct kmem_cache *c)
return false;
}
+#define KS_ADDRS_COUNT 16
+struct kmem_obj_info {
+ void *kp_ptr;
+ struct page *kp_page;
+ void *kp_objp;
+ unsigned long kp_data_offset;
+ struct kmem_cache *kp_slab_cache;
+ void *kp_ret;
+ void *kp_stack[KS_ADDRS_COUNT];
+};
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
+
#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e981c80d216c..adbace4256ef 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -537,6 +537,81 @@ bool slab_is_available(void)
return slab_state >= UP;
}
+/**
+ * kmem_valid_obj - does the pointer reference a valid slab object?
+ * @object: pointer to query.
+ *
+ * Return: %true if the pointer is to a not-yet-freed object from
+ * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
+ * is to an already-freed object, and %false otherwise.
+ */
+bool kmem_valid_obj(void *object)
+{
+ struct page *page;
+
+ /* Some arches consider ZERO_SIZE_PTR to be a valid address. */
+ if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
+ return false;
+ page = virt_to_head_page(object);
+ return PageSlab(page);
+}
+
+/**
+ * kmem_dump_obj - Print available slab provenance information
+ * @object: slab object for which to find provenance information.
+ *
+ * This function uses pr_cont(), so that the caller is expected to have
+ * printed out whatever preamble is appropriate. The provenance information
+ * depends on the type of object and on how much debugging is enabled.
+ * For a slab-cache object, the fact that it is a slab object is printed,
+ * and, if available, the slab name, return address, and stack trace from
+ * the allocation of that object.
+ *
+ * This function will splat if passed a pointer to a non-slab object.
+ * If you are not sure what type of object you have, you should instead
+ * use mem_dump_obj().
+ */
+void kmem_dump_obj(void *object)
+{
+ char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
+ int i;
+ struct page *page;
+ unsigned long ptroffset;
+ struct kmem_obj_info kp = { };
+
+ if (WARN_ON_ONCE(!virt_addr_valid(object)))
+ return;
+ page = virt_to_head_page(object);
+ if (WARN_ON_ONCE(!PageSlab(page))) {
+ pr_cont(" non-slab memory.\n");
+ return;
+ }
+ kmem_obj_info(&kp, object, page);
+ if (kp.kp_slab_cache)
+ pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
+ else
+ pr_cont(" slab%s", cp);
+ if (kp.kp_objp)
+ pr_cont(" start %px", kp.kp_objp);
+ if (kp.kp_data_offset)
+ pr_cont(" data offset %lu", kp.kp_data_offset);
+ if (kp.kp_objp) {
+ ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
+ pr_cont(" pointer offset %lu", ptroffset);
+ }
+ if (kp.kp_slab_cache && kp.kp_slab_cache->usersize)
+ pr_cont(" size %u", kp.kp_slab_cache->usersize);
+ if (kp.kp_ret)
+ pr_cont(" allocated at %pS\n", kp.kp_ret);
+ else
+ pr_cont("\n");
+ for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
+ if (!kp.kp_stack[i])
+ break;
+ pr_info(" %pS\n", kp.kp_stack[i]);
+ }
+}
+
#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name,
diff --git a/mm/slob.c b/mm/slob.c
index 8d4bfa46247f..ef87ada8705d 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -461,6 +461,12 @@ out:
spin_unlock_irqrestore(&slob_lock, flags);
}
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+{
+ kpp->kp_ptr = object;
+ kpp->kp_page = page;
+}
+
/*
* End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
*/
diff --git a/mm/slub.c b/mm/slub.c
index d9e4e10683cc..f5baf429654f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2791,7 +2791,8 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
void *obj)
{
if (unlikely(slab_want_init_on_free(s)) && obj)
- memset((void *)((char *)obj + s->offset), 0, sizeof(void *));
+ memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+ 0, sizeof(void *));
}
/*
@@ -2883,7 +2884,7 @@ redo:
stat(s, ALLOC_FASTPATH);
}
- maybe_wipe_obj_freeptr(s, kasan_reset_tag(object));
+ maybe_wipe_obj_freeptr(s, object);
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(kasan_reset_tag(object), 0, s->object_size);
@@ -3329,7 +3330,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
int j;
for (j = 0; j < i; j++)
- memset(p[j], 0, s->object_size);
+ memset(kasan_reset_tag(p[j]), 0, s->object_size);
}
/* memcg and kmem_cache debug support */
@@ -3422,6 +3423,7 @@ static inline int calculate_order(unsigned int size)
unsigned int order;
unsigned int min_objects;
unsigned int max_objects;
+ unsigned int nr_cpus;
/*
* Attempt to find best configuration for a slab. This
@@ -3432,8 +3434,21 @@ static inline int calculate_order(unsigned int size)
* we reduce the minimum objects required in a slab.
*/
min_objects = slub_min_objects;
- if (!min_objects)
- min_objects = 4 * (fls(num_online_cpus()) + 1);
+ if (!min_objects) {
+ /*
+ * Some architectures will only update present cpus when
+ * onlining them, so don't trust the number if it's just 1. But
+ * we also don't want to use nr_cpu_ids always, as on some other
+ * architectures, there can be many possible cpus, but never
+ * onlined. Here we compromise between trying to avoid too high
+ * order on systems that appear larger than they are, and too
+ * low order on systems that appear smaller than they are.
+ */
+ nr_cpus = num_present_cpus();
+ if (nr_cpus <= 1)
+ nr_cpus = nr_cpu_ids;
+ min_objects = 4 * (fls(nr_cpus) + 1);
+ }
max_objects = order_objects(slub_max_order, size);
min_objects = min(min_objects, max_objects);
@@ -3918,6 +3933,46 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
return 0;
}
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+{
+ void *base;
+ int __maybe_unused i;
+ unsigned int objnr;
+ void *objp;
+ void *objp0;
+ struct kmem_cache *s = page->slab_cache;
+ struct track __maybe_unused *trackp;
+
+ kpp->kp_ptr = object;
+ kpp->kp_page = page;
+ kpp->kp_slab_cache = s;
+ base = page_address(page);
+ objp0 = kasan_reset_tag(object);
+#ifdef CONFIG_SLUB_DEBUG
+ objp = restore_red_left(s, objp0);
+#else
+ objp = objp0;
+#endif
+ objnr = obj_to_index(s, page, objp);
+ kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
+ objp = base + s->size * objnr;
+ kpp->kp_objp = objp;
+ if (WARN_ON_ONCE(objp < base || objp >= base + page->objects * s->size || (objp - base) % s->size) ||
+ !(s->flags & SLAB_STORE_USER))
+ return;
+#ifdef CONFIG_SLUB_DEBUG
+ trackp = get_track(s, objp, TRACK_ALLOC);
+ kpp->kp_ret = (void *)trackp->addr;
+#ifdef CONFIG_STACKTRACE
+ for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
+ kpp->kp_stack[i] = (void *)trackp->addrs[i];
+ if (!kpp->kp_stack[i])
+ break;
+ }
+#endif
+#endif
+}
+
/********************************************************************
* Kmalloc subsystem
*******************************************************************/
@@ -5624,10 +5679,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
s->kobj.kset = kset;
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
- if (err) {
- kobject_put(&s->kobj);
+ if (err)
goto out;
- }
err = sysfs_create_group(&s->kobj, &slab_attr_group);
if (err)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9fffc5af29d1..96799a2f6957 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,7 +47,6 @@
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static sector_t map_swap_entry(swp_entry_t, struct block_device**);
DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -1850,12 +1849,13 @@ int find_first_swap(dev_t *device)
*/
sector_t swapdev_block(int type, pgoff_t offset)
{
- struct block_device *bdev;
struct swap_info_struct *si = swap_type_to_swap_info(type);
+ struct swap_extent *se;
if (!si || !(si->flags & SWP_WRITEOK))
return 0;
- return map_swap_entry(swp_entry(type, offset), &bdev);
+ se = offset_to_swap_extent(si, offset);
+ return se->start_block + (offset - se->start_page);
}
/*
@@ -1951,8 +1951,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
si = swap_info[type];
pte = pte_offset_map(pmd, addr);
do {
- struct vm_fault vmf;
-
if (!is_swap_pte(*pte))
continue;
@@ -1968,9 +1966,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
swap_map = &si->swap_map[offset];
page = lookup_swap_cache(entry, vma, addr);
if (!page) {
- vmf.vma = vma;
- vmf.address = addr;
- vmf.pmd = pmd;
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = addr,
+ .pmd = pmd,
+ };
+
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
&vmf);
}
@@ -2282,36 +2283,6 @@ static void drain_mmlist(void)
}
/*
- * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
- * corresponds to page offset for the specified swap entry.
- * Note that the type of this function is sector_t, but it returns page offset
- * into the bdev, not sector offset.
- */
-static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
-{
- struct swap_info_struct *sis;
- struct swap_extent *se;
- pgoff_t offset;
-
- sis = swp_swap_info(entry);
- *bdev = sis->bdev;
-
- offset = swp_offset(entry);
- se = offset_to_swap_extent(sis, offset);
- return se->start_block + (offset - se->start_page);
-}
-
-/*
- * Returns the page offset into bdev for the specified page's swap entry.
- */
-sector_t map_swap_page(struct page *page, struct block_device **bdev)
-{
- swp_entry_t entry;
- entry.val = page_private(page);
- return map_swap_entry(entry, bdev);
-}
-
-/*
* Free all of a swapdev's extent information
*/
static void destroy_swap_extents(struct swap_info_struct *sis)
diff --git a/mm/util.c b/mm/util.c
index 8c9b7d1e7c49..54870226cea6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -982,3 +982,34 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
kunmap_atomic(addr1);
return ret;
}
+
+/**
+ * mem_dump_obj - Print available provenance information
+ * @object: object for which to find provenance information.
+ *
+ * This function uses pr_cont(), so that the caller is expected to have
+ * printed out whatever preamble is appropriate. The provenance information
+ * depends on the type of object and on how much debugging is enabled.
+ * For example, for a slab-cache object, the slab name is printed, and,
+ * if available, the return address and stack trace from the allocation
+ * of that object.
+ */
+void mem_dump_obj(void *object)
+{
+ if (kmem_valid_obj(object)) {
+ kmem_dump_obj(object);
+ return;
+ }
+ if (vmalloc_dump_obj(object))
+ return;
+ if (!virt_addr_valid(object)) {
+ if (object == NULL)
+ pr_cont(" NULL pointer.\n");
+ else if (object == ZERO_SIZE_PTR)
+ pr_cont(" zero-size pointer.\n");
+ else
+ pr_cont(" non-paged memory.\n");
+ return;
+ }
+ pr_cont(" non-slab/vmalloc memory.\n");
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e6f352bf0498..4f5f8c907897 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3450,6 +3450,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
}
#endif /* CONFIG_SMP */
+bool vmalloc_dump_obj(void *object)
+{
+ struct vm_struct *vm;
+ void *objp = (void *)PAGE_ALIGN((unsigned long)object);
+
+ vm = find_vm_area(objp);
+ if (!vm)
+ return false;
+ pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
+ vm->nr_pages, (unsigned long)vm->addr, vm->caller);
+ return true;
+}
+
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
__acquires(&vmap_purge_lock)