summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig55
-rw-r--r--mm/Makefile2
-rw-r--r--mm/cma.c15
-rw-r--r--mm/gup.c9
-rw-r--r--mm/hmm.c587
-rw-r--r--mm/huge_memory.c11
-rw-r--r--mm/maccess.c122
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memcontrol.c35
-rw-r--r--mm/memory-failure.c6
-rw-r--r--mm/memory.c62
-rw-r--r--mm/memory_hotplug.c337
-rw-r--r--mm/mempolicy.c1
-rw-r--r--mm/migrate.c35
-rw-r--r--mm/nommu.c4
-rw-r--r--mm/page_alloc.c33
-rw-r--r--mm/shmem.c11
-rw-r--r--mm/slab_common.c3
-rw-r--r--mm/sparse-vmemmap.c21
-rw-r--r--mm/sparse.c355
-rw-r--r--mm/swap.c15
-rw-r--r--mm/util.c75
-rw-r--r--mm/vmscan.c44
-rw-r--r--mm/z3fold.c43
-rw-r--r--mm/zsmalloc.c12
25 files changed, 867 insertions, 1028 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0b4352557dd5..56cec636a1fc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,8 +649,7 @@ config IDLE_PAGE_TRACKING
See Documentation/admin-guide/mm/idle_page_tracking.rst for
more details.
-# arch_add_memory() comprehends device memory
-config ARCH_HAS_ZONE_DEVICE
+config ARCH_HAS_PTE_DEVMAP
bool
config ZONE_DEVICE
@@ -658,7 +657,7 @@ config ZONE_DEVICE
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
- depends on ARCH_HAS_ZONE_DEVICE
+ depends on ARCH_HAS_PTE_DEVMAP
select XARRAY_MULTI
help
@@ -670,47 +669,17 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
-config ARCH_HAS_HMM_MIRROR
- bool
- default y
- depends on (X86_64 || PPC64)
- depends on MMU && 64BIT
-
-config ARCH_HAS_HMM_DEVICE
- bool
- default y
- depends on (X86_64 || PPC64)
- depends on MEMORY_HOTPLUG
- depends on MEMORY_HOTREMOVE
- depends on SPARSEMEM_VMEMMAP
- depends on ARCH_HAS_ZONE_DEVICE
- select XARRAY_MULTI
-
-config ARCH_HAS_HMM
- bool
- default y
- depends on (X86_64 || PPC64)
- depends on ZONE_DEVICE
- depends on MMU && 64BIT
- depends on MEMORY_HOTPLUG
- depends on MEMORY_HOTREMOVE
- depends on SPARSEMEM_VMEMMAP
-
config MIGRATE_VMA_HELPER
bool
config DEV_PAGEMAP_OPS
bool
-config HMM
- bool
- select MMU_NOTIFIER
- select MIGRATE_VMA_HELPER
-
config HMM_MIRROR
bool "HMM mirror CPU page table into a device page table"
- depends on ARCH_HAS_HMM
- select HMM
+ depends on (X86_64 || PPC64)
+ depends on MMU && 64BIT
+ select MMU_NOTIFIER
help
Select HMM_MIRROR if you want to mirror range of the CPU page table of a
process into a device page table. Here, mirror means "keep synchronized".
@@ -720,8 +689,7 @@ config HMM_MIRROR
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
- depends on ARCH_HAS_HMM
- select HMM
+ depends on ZONE_DEVICE
select DEV_PAGEMAP_OPS
help
@@ -729,17 +697,6 @@ config DEVICE_PRIVATE
memory; i.e., memory that is only accessible from the device (or
group of devices). You likely also want to select HMM_MIRROR.
-config DEVICE_PUBLIC
- bool "Addressable device memory (like GPU memory)"
- depends on ARCH_HAS_HMM
- select HMM
- select DEV_PAGEMAP_OPS
-
- help
- Allows creation of struct pages to represent addressable device
- memory; i.e., memory that is accessible from both the device and
- the CPU
-
config FRAME_VECTOR
bool
diff --git a/mm/Makefile b/mm/Makefile
index dc0746ca1109..338e528ad436 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -102,5 +102,5 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
-obj-$(CONFIG_HMM) += hmm.o
+obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
diff --git a/mm/cma.c b/mm/cma.c
index 3340ef34c154..7fe0b8356775 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -278,6 +278,12 @@ int __init cma_declare_contiguous(phys_addr_t base,
*/
alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+ if (fixed && base & (alignment - 1)) {
+ ret = -EINVAL;
+ pr_err("Region at %pa must be aligned to %pa bytes\n",
+ &base, &alignment);
+ goto err;
+ }
base = ALIGN(base, alignment);
size = ALIGN(size, alignment);
limit &= ~(alignment - 1);
@@ -308,6 +314,13 @@ int __init cma_declare_contiguous(phys_addr_t base,
if (limit == 0 || limit > memblock_end)
limit = memblock_end;
+ if (base + size > limit) {
+ ret = -EINVAL;
+ pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
+ &size, &base, &limit);
+ goto err;
+ }
+
/* Reserve memory */
if (fixed) {
if (memblock_is_region_reserved(base, size) ||
@@ -494,7 +507,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
* @pages: Allocated pages.
* @count: Number of allocated pages.
*
- * This function releases memory allocated by alloc_cma().
+ * This function releases memory allocated by cma_alloc().
* It returns false when provided pages do not belong to contiguous area and
* true otherwise.
*/
diff --git a/mm/gup.c b/mm/gup.c
index 43b7d875de37..98f13ab37bac 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -609,13 +609,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
goto unmap;
*page = pte_page(*pte);
-
- /*
- * This should never happen (a device public page in the gate
- * area).
- */
- if (is_device_public_page(*page))
- goto unmap;
}
if (unlikely(!try_get_page(*page))) {
ret = -ENOMEM;
@@ -1902,7 +1895,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
diff --git a/mm/hmm.c b/mm/hmm.c
index f702a3895d05..e1eedef129cf 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -20,26 +20,14 @@
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
+#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
-#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-
-#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
-static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
-{
- struct hmm *hmm = READ_ONCE(mm->hmm);
-
- if (hmm && kref_get_unless_zero(&hmm->kref))
- return hmm;
-
- return NULL;
-}
-
/**
* hmm_get_or_create - register HMM against an mm (HMM internal)
*
@@ -54,11 +42,16 @@ static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
*/
static struct hmm *hmm_get_or_create(struct mm_struct *mm)
{
- struct hmm *hmm = mm_get_hmm(mm);
- bool cleanup = false;
+ struct hmm *hmm;
- if (hmm)
- return hmm;
+ lockdep_assert_held_write(&mm->mmap_sem);
+
+ /* Abuse the page_table_lock to also protect mm->hmm. */
+ spin_lock(&mm->page_table_lock);
+ hmm = mm->hmm;
+ if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref))
+ goto out_unlock;
+ spin_unlock(&mm->page_table_lock);
hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
if (!hmm)
@@ -68,55 +61,50 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
init_rwsem(&hmm->mirrors_sem);
hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(&hmm->ranges);
- mutex_init(&hmm->lock);
+ spin_lock_init(&hmm->ranges_lock);
kref_init(&hmm->kref);
hmm->notifiers = 0;
- hmm->dead = false;
hmm->mm = mm;
- spin_lock(&mm->page_table_lock);
- if (!mm->hmm)
- mm->hmm = hmm;
- else
- cleanup = true;
- spin_unlock(&mm->page_table_lock);
+ hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
+ if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
+ kfree(hmm);
+ return NULL;
+ }
- if (cleanup)
- goto error;
+ mmgrab(hmm->mm);
/*
- * We should only get here if hold the mmap_sem in write mode ie on
- * registration of first mirror through hmm_mirror_register()
+ * We hold the exclusive mmap_sem here so we know that mm->hmm is
+ * still NULL or 0 kref, and is safe to update.
*/
- hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
- if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
- goto error_mm;
+ spin_lock(&mm->page_table_lock);
+ mm->hmm = hmm;
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
return hmm;
+}
-error_mm:
- spin_lock(&mm->page_table_lock);
- if (mm->hmm == hmm)
- mm->hmm = NULL;
- spin_unlock(&mm->page_table_lock);
-error:
+static void hmm_free_rcu(struct rcu_head *rcu)
+{
+ struct hmm *hmm = container_of(rcu, struct hmm, rcu);
+
+ mmdrop(hmm->mm);
kfree(hmm);
- return NULL;
}
static void hmm_free(struct kref *kref)
{
struct hmm *hmm = container_of(kref, struct hmm, kref);
- struct mm_struct *mm = hmm->mm;
- mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
+ spin_lock(&hmm->mm->page_table_lock);
+ if (hmm->mm->hmm == hmm)
+ hmm->mm->hmm = NULL;
+ spin_unlock(&hmm->mm->page_table_lock);
- spin_lock(&mm->page_table_lock);
- if (mm->hmm == hmm)
- mm->hmm = NULL;
- spin_unlock(&mm->page_table_lock);
-
- kfree(hmm);
+ mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
+ mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
}
static inline void hmm_put(struct hmm *hmm)
@@ -124,86 +112,73 @@ static inline void hmm_put(struct hmm *hmm)
kref_put(&hmm->kref, hmm_free);
}
-void hmm_mm_destroy(struct mm_struct *mm)
+static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
- struct hmm *hmm;
+ struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
+ struct hmm_mirror *mirror;
- spin_lock(&mm->page_table_lock);
- hmm = mm_get_hmm(mm);
- mm->hmm = NULL;
- if (hmm) {
- hmm->mm = NULL;
- hmm->dead = true;
- spin_unlock(&mm->page_table_lock);
- hmm_put(hmm);
+ /* Bail out if hmm is in the process of being freed */
+ if (!kref_get_unless_zero(&hmm->kref))
return;
+
+ /*
+ * Since hmm_range_register() holds the mmget() lock hmm_release() is
+ * prevented as long as a range exists.
+ */
+ WARN_ON(!list_empty_careful(&hmm->ranges));
+
+ down_read(&hmm->mirrors_sem);
+ list_for_each_entry(mirror, &hmm->mirrors, list) {
+ /*
+ * Note: The driver is not allowed to trigger
+ * hmm_mirror_unregister() from this thread.
+ */
+ if (mirror->ops->release)
+ mirror->ops->release(mirror);
}
+ up_read(&hmm->mirrors_sem);
- spin_unlock(&mm->page_table_lock);
+ hmm_put(hmm);
}
-static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
+static void notifiers_decrement(struct hmm *hmm)
{
- struct hmm *hmm = mm_get_hmm(mm);
- struct hmm_mirror *mirror;
- struct hmm_range *range;
-
- /* Report this HMM as dying. */
- hmm->dead = true;
+ unsigned long flags;
- /* Wake-up everyone waiting on any range. */
- mutex_lock(&hmm->lock);
- list_for_each_entry(range, &hmm->ranges, list) {
- range->valid = false;
- }
- wake_up_all(&hmm->wq);
- mutex_unlock(&hmm->lock);
+ spin_lock_irqsave(&hmm->ranges_lock, flags);
+ hmm->notifiers--;
+ if (!hmm->notifiers) {
+ struct hmm_range *range;
- down_write(&hmm->mirrors_sem);
- mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
- list);
- while (mirror) {
- list_del_init(&mirror->list);
- if (mirror->ops->release) {
- /*
- * Drop mirrors_sem so callback can wait on any pending
- * work that might itself trigger mmu_notifier callback
- * and thus would deadlock with us.
- */
- up_write(&hmm->mirrors_sem);
- mirror->ops->release(mirror);
- down_write(&hmm->mirrors_sem);
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (range->valid)
+ continue;
+ range->valid = true;
}
- mirror = list_first_entry_or_null(&hmm->mirrors,
- struct hmm_mirror, list);
+ wake_up_all(&hmm->wq);
}
- up_write(&hmm->mirrors_sem);
-
- hmm_put(hmm);
+ spin_unlock_irqrestore(&hmm->ranges_lock, flags);
}
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
const struct mmu_notifier_range *nrange)
{
- struct hmm *hmm = mm_get_hmm(nrange->mm);
+ struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
struct hmm_mirror *mirror;
struct hmm_update update;
struct hmm_range *range;
+ unsigned long flags;
int ret = 0;
- VM_BUG_ON(!hmm);
+ if (!kref_get_unless_zero(&hmm->kref))
+ return 0;
update.start = nrange->start;
update.end = nrange->end;
update.event = HMM_UPDATE_INVALIDATE;
update.blockable = mmu_notifier_range_blockable(nrange);
- if (mmu_notifier_range_blockable(nrange))
- mutex_lock(&hmm->lock);
- else if (!mutex_trylock(&hmm->lock)) {
- ret = -EAGAIN;
- goto out;
- }
+ spin_lock_irqsave(&hmm->ranges_lock, flags);
hmm->notifiers++;
list_for_each_entry(range, &hmm->ranges, list) {
if (update.end < range->start || update.start >= range->end)
@@ -211,7 +186,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
range->valid = false;
}
- mutex_unlock(&hmm->lock);
+ spin_unlock_irqrestore(&hmm->ranges_lock, flags);
if (mmu_notifier_range_blockable(nrange))
down_read(&hmm->mirrors_sem);
@@ -219,19 +194,23 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
ret = -EAGAIN;
goto out;
}
+
list_for_each_entry(mirror, &hmm->mirrors, list) {
- int ret;
+ int rc;
- ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
- if (!update.blockable && ret == -EAGAIN) {
- up_read(&hmm->mirrors_sem);
+ rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
+ if (rc) {
+ if (WARN_ON(update.blockable || rc != -EAGAIN))
+ continue;
ret = -EAGAIN;
- goto out;
+ break;
}
}
up_read(&hmm->mirrors_sem);
out:
+ if (ret)
+ notifiers_decrement(hmm);
hmm_put(hmm);
return ret;
}
@@ -239,24 +218,12 @@ out:
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
const struct mmu_notifier_range *nrange)
{
- struct hmm *hmm = mm_get_hmm(nrange->mm);
-
- VM_BUG_ON(!hmm);
+ struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
- mutex_lock(&hmm->lock);
- hmm->notifiers--;
- if (!hmm->notifiers) {
- struct hmm_range *range;
-
- list_for_each_entry(range, &hmm->ranges, list) {
- if (range->valid)
- continue;
- range->valid = true;
- }
- wake_up_all(&hmm->wq);
- }
- mutex_unlock(&hmm->lock);
+ if (!kref_get_unless_zero(&hmm->kref))
+ return;
+ notifiers_decrement(hmm);
hmm_put(hmm);
}
@@ -271,14 +238,15 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
*
* @mirror: new mirror struct to register
* @mm: mm to register against
+ * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments
*
* To start mirroring a process address space, the device driver must register
* an HMM mirror struct.
- *
- * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
*/
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
+ lockdep_assert_held_write(&mm->mmap_sem);
+
/* Sanity check */
if (!mm || !mirror || !mirror->ops)
return -EINVAL;
@@ -298,23 +266,17 @@ EXPORT_SYMBOL(hmm_mirror_register);
/*
* hmm_mirror_unregister() - unregister a mirror
*
- * @mirror: new mirror struct to register
+ * @mirror: mirror struct to unregister
*
* Stop mirroring a process address space, and cleanup.
*/
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
- struct hmm *hmm = READ_ONCE(mirror->hmm);
-
- if (hmm == NULL)
- return;
+ struct hmm *hmm = mirror->hmm;
down_write(&hmm->mirrors_sem);
- list_del_init(&mirror->list);
- /* To protect us against double unregister ... */
- mirror->hmm = NULL;
+ list_del(&mirror->list);
up_write(&hmm->mirrors_sem);
-
hmm_put(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
@@ -330,7 +292,7 @@ struct hmm_vma_walk {
static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
bool write_fault, uint64_t *pfn)
{
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
+ unsigned int flags = FAULT_FLAG_REMOTE;
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
@@ -372,7 +334,7 @@ static int hmm_pfns_bad(unsigned long addr,
* @fault: should we fault or not ?
* @write_fault: write fault ?
* @walk: mm_walk structure
- * Returns: 0 on success, -EBUSY after page fault, or page fault error
+ * Return: 0 on success, -EBUSY after page fault, or page fault error
*
* This function will be called whenever pmd_none() or pte_none() returns true,
* or whenever there is no page directory covering the virtual address range.
@@ -550,7 +512,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
- if (pte_none(pte) || !pte_present(pte))
+ if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
return 0;
return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
range->flags[HMM_PFN_WRITE] :
@@ -788,7 +750,6 @@ again:
return hmm_vma_walk_hole_(addr, end, fault,
write_fault, walk);
-#ifdef CONFIG_HUGETLB_PAGE
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
for (i = 0; i < npages; ++i, ++pfn) {
hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
@@ -804,9 +765,6 @@ again:
}
hmm_vma_walk->last = end;
return 0;
-#else
- return -EINVAL;
-#endif
}
split_huge_pud(walk->vma, pudp, addr);
@@ -909,12 +867,14 @@ static void hmm_pfns_clear(struct hmm_range *range,
* Track updates to the CPU page table see include/linux/hmm.h
*/
int hmm_range_register(struct hmm_range *range,
- struct mm_struct *mm,
+ struct hmm_mirror *mirror,
unsigned long start,
unsigned long end,
unsigned page_shift)
{
unsigned long mask = ((1UL << page_shift) - 1UL);
+ struct hmm *hmm = mirror->hmm;
+ unsigned long flags;
range->valid = false;
range->hmm = NULL;
@@ -928,28 +888,24 @@ int hmm_range_register(struct hmm_range *range,
range->start = start;
range->end = end;
- range->hmm = hmm_get_or_create(mm);
- if (!range->hmm)
- return -EFAULT;
-
- /* Check if hmm_mm_destroy() was call. */
- if (range->hmm->mm == NULL || range->hmm->dead) {
- hmm_put(range->hmm);
+ /* Prevent hmm_release() from running while the range is valid */
+ if (!mmget_not_zero(hmm->mm))
return -EFAULT;
- }
- /* Initialize range to track CPU page table update */
- mutex_lock(&range->hmm->lock);
+ /* Initialize range to track CPU page table updates. */
+ spin_lock_irqsave(&hmm->ranges_lock, flags);
- list_add_rcu(&range->list, &range->hmm->ranges);
+ range->hmm = hmm;
+ kref_get(&hmm->kref);
+ list_add(&range->list, &hmm->ranges);
/*
* If there are any concurrent notifiers we have to wait for them for
* the range to be valid (see hmm_range_wait_until_valid()).
*/
- if (!range->hmm->notifiers)
+ if (!hmm->notifiers)
range->valid = true;
- mutex_unlock(&range->hmm->lock);
+ spin_unlock_irqrestore(&hmm->ranges_lock, flags);
return 0;
}
@@ -964,25 +920,31 @@ EXPORT_SYMBOL(hmm_range_register);
*/
void hmm_range_unregister(struct hmm_range *range)
{
- /* Sanity check this really should not happen. */
- if (range->hmm == NULL || range->end <= range->start)
- return;
+ struct hmm *hmm = range->hmm;
+ unsigned long flags;
- mutex_lock(&range->hmm->lock);
- list_del_rcu(&range->list);
- mutex_unlock(&range->hmm->lock);
+ spin_lock_irqsave(&hmm->ranges_lock, flags);
+ list_del_init(&range->list);
+ spin_unlock_irqrestore(&hmm->ranges_lock, flags);
/* Drop reference taken by hmm_range_register() */
+ mmput(hmm->mm);
+ hmm_put(hmm);
+
+ /*
+ * The range is now invalid and the ref on the hmm is dropped, so
+ * poison the pointer. Leave other fields in place, for the caller's
+ * use.
+ */
range->valid = false;
- hmm_put(range->hmm);
- range->hmm = NULL;
+ memset(&range->hmm, POISON_INUSE, sizeof(range->hmm));
}
EXPORT_SYMBOL(hmm_range_unregister);
/*
* hmm_range_snapshot() - snapshot CPU page table for a range
* @range: range
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
+ * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
* permission (for instance asking for write and range is read only),
* -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid
* vma or it is illegal to access that range), number of valid pages
@@ -1001,10 +963,7 @@ long hmm_range_snapshot(struct hmm_range *range)
struct vm_area_struct *vma;
struct mm_walk mm_walk;
- /* Check if hmm_mm_destroy() was call. */
- if (hmm->mm == NULL || hmm->dead)
- return -EFAULT;
-
+ lockdep_assert_held(&hmm->mm->mmap_sem);
do {
/* If range is no longer valid force retry. */
if (!range->valid)
@@ -1015,9 +974,8 @@ long hmm_range_snapshot(struct hmm_range *range)
return -EFAULT;
if (is_vm_hugetlb_page(vma)) {
- struct hstate *h = hstate_vma(vma);
-
- if (huge_page_shift(h) != range->page_shift &&
+ if (huge_page_shift(hstate_vma(vma)) !=
+ range->page_shift &&
range->page_shift != PAGE_SHIFT)
return -EINVAL;
} else {
@@ -1066,7 +1024,7 @@ EXPORT_SYMBOL(hmm_range_snapshot);
* hmm_range_fault() - try to fault some address in a virtual address range
* @range: range being faulted
* @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: number of valid pages in range->pfns[] (from range start
+ * Return: number of valid pages in range->pfns[] (from range start
* address). This may be zero. If the return value is negative,
* then one of the following values may be returned:
*
@@ -1100,9 +1058,7 @@ long hmm_range_fault(struct hmm_range *range, bool block)
struct mm_walk mm_walk;
int ret;
- /* Check if hmm_mm_destroy() was call. */
- if (hmm->mm == NULL || hmm->dead)
- return -EFAULT;
+ lockdep_assert_held(&hmm->mm->mmap_sem);
do {
/* If range is no longer valid force retry. */
@@ -1184,7 +1140,7 @@ EXPORT_SYMBOL(hmm_range_fault);
* @device: device against to dma map page to
* @daddrs: dma address of mapped pages
* @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been
+ * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been
* drop and you need to try again, some other error value otherwise
*
* Note same usage pattern as hmm_range_fault().
@@ -1272,7 +1228,7 @@ EXPORT_SYMBOL(hmm_range_dma_map);
* @device: device against which dma map was done
* @daddrs: dma address of mapped pages
* @dirty: dirty page if it had the write flag set
- * Returns: number of page unmapped on success, -EINVAL otherwise
+ * Return: number of page unmapped on success, -EINVAL otherwise
*
* Note that caller MUST abide by mmu notifier or use HMM mirror and abide
* to the sync_cpu_device_pagetables() callback so that it is safe here to
@@ -1328,284 +1284,3 @@ long hmm_range_dma_unmap(struct hmm_range *range,
return cpages;
}
EXPORT_SYMBOL(hmm_range_dma_unmap);
-#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-
-
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
-struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
- unsigned long addr)
-{
- struct page *page;
-
- page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
- if (!page)
- return NULL;
- lock_page(page);
- return page;
-}
-EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
-
-
-static void hmm_devmem_ref_release(struct percpu_ref *ref)
-{
- struct hmm_devmem *devmem;
-
- devmem = container_of(ref, struct hmm_devmem, ref);
- complete(&devmem->completion);
-}
-
-static void hmm_devmem_ref_exit(struct percpu_ref *ref)
-{
- struct hmm_devmem *devmem;
-
- devmem = container_of(ref, struct hmm_devmem, ref);
- wait_for_completion(&devmem->completion);
- percpu_ref_exit(ref);
-}
-
-static void hmm_devmem_ref_kill(struct percpu_ref *ref)
-{
- percpu_ref_kill(ref);
-}
-
-static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
- unsigned long addr,
- const struct page *page,
- unsigned int flags,
- pmd_t *pmdp)
-{
- struct hmm_devmem *devmem = page->pgmap->data;
-
- return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
-}
-
-static void hmm_devmem_free(struct page *page, void *data)
-{
- struct hmm_devmem *devmem = data;
-
- page->mapping = NULL;
-
- devmem->ops->free(devmem, page);
-}
-
-/*
- * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
- *
- * @ops: memory event device driver callback (see struct hmm_devmem_ops)
- * @device: device struct to bind the resource too
- * @size: size in bytes of the device memory to add
- * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise
- *
- * This function first finds an empty range of physical address big enough to
- * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
- * in turn allocates struct pages. It does not do anything beyond that; all
- * events affecting the memory will go through the various callbacks provided
- * by hmm_devmem_ops struct.
- *
- * Device driver should call this function during device initialization and
- * is then responsible of memory management. HMM only provides helpers.
- */
-struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
- struct device *device,
- unsigned long size)
-{
- struct hmm_devmem *devmem;
- resource_size_t addr;
- void *result;
- int ret;
-
- dev_pagemap_get_ops();
-
- devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
- if (!devmem)
- return ERR_PTR(-ENOMEM);
-
- init_completion(&devmem->completion);
- devmem->pfn_first = -1UL;
- devmem->pfn_last = -1UL;
- devmem->resource = NULL;
- devmem->device = device;
- devmem->ops = ops;
-
- ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
- 0, GFP_KERNEL);
- if (ret)
- return ERR_PTR(ret);
-
- size = ALIGN(size, PA_SECTION_SIZE);
- addr = min((unsigned long)iomem_resource.end,
- (1UL << MAX_PHYSMEM_BITS) - 1);
- addr = addr - size + 1UL;
-
- /*
- * FIXME add a new helper to quickly walk resource tree and find free
- * range
- *
- * FIXME what about ioport_resource resource ?
- */
- for (; addr > size && addr >= iomem_resource.start; addr -= size) {
- ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
- if (ret != REGION_DISJOINT)
- continue;
-
- devmem->resource = devm_request_mem_region(device, addr, size,
- dev_name(device));
- if (!devmem->resource)
- return ERR_PTR(-ENOMEM);
- break;
- }
- if (!devmem->resource)
- return ERR_PTR(-ERANGE);
-
- devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
- devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
- devmem->pfn_last = devmem->pfn_first +
- (resource_size(devmem->resource) >> PAGE_SHIFT);
- devmem->page_fault = hmm_devmem_fault;
-
- devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
- devmem->pagemap.res = *devmem->resource;
- devmem->pagemap.page_free = hmm_devmem_free;
- devmem->pagemap.altmap_valid = false;
- devmem->pagemap.ref = &devmem->ref;
- devmem->pagemap.data = devmem;
- devmem->pagemap.kill = hmm_devmem_ref_kill;
- devmem->pagemap.cleanup = hmm_devmem_ref_exit;
-
- result = devm_memremap_pages(devmem->device, &devmem->pagemap);
- if (IS_ERR(result))
- return result;
- return devmem;
-}
-EXPORT_SYMBOL_GPL(hmm_devmem_add);
-
-struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
- struct device *device,
- struct resource *res)
-{
- struct hmm_devmem *devmem;
- void *result;
- int ret;
-
- if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
- return ERR_PTR(-EINVAL);
-
- dev_pagemap_get_ops();
-
- devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
- if (!devmem)
- return ERR_PTR(-ENOMEM);
-
- init_completion(&devmem->completion);
- devmem->pfn_first = -1UL;
- devmem->pfn_last = -1UL;
- devmem->resource = res;
- devmem->device = device;
- devmem->ops = ops;
-
- ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
- 0, GFP_KERNEL);
- if (ret)
- return ERR_PTR(ret);
-
- devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
- devmem->pfn_last = devmem->pfn_first +
- (resource_size(devmem->resource) >> PAGE_SHIFT);
- devmem->page_fault = hmm_devmem_fault;
-
- devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
- devmem->pagemap.res = *devmem->resource;
- devmem->pagemap.page_free = hmm_devmem_free;
- devmem->pagemap.altmap_valid = false;
- devmem->pagemap.ref = &devmem->ref;
- devmem->pagemap.data = devmem;
- devmem->pagemap.kill = hmm_devmem_ref_kill;
- devmem->pagemap.cleanup = hmm_devmem_ref_exit;
-
- result = devm_memremap_pages(devmem->device, &devmem->pagemap);
- if (IS_ERR(result))
- return result;
- return devmem;
-}
-EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
-
-/*
- * A device driver that wants to handle multiple devices memory through a
- * single fake device can use hmm_device to do so. This is purely a helper
- * and it is not needed to make use of any HMM functionality.
- */
-#define HMM_DEVICE_MAX 256
-
-static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
-static DEFINE_SPINLOCK(hmm_device_lock);
-static struct class *hmm_device_class;
-static dev_t hmm_device_devt;
-
-static void hmm_device_release(struct device *device)
-{
- struct hmm_device *hmm_device;
-
- hmm_device = container_of(device, struct hmm_device, device);
- spin_lock(&hmm_device_lock);
- clear_bit(hmm_device->minor, hmm_device_mask);
- spin_unlock(&hmm_device_lock);
-
- kfree(hmm_device);
-}
-
-struct hmm_device *hmm_device_new(void *drvdata)
-{
- struct hmm_device *hmm_device;
-
- hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
- if (!hmm_device)
- return ERR_PTR(-ENOMEM);
-
- spin_lock(&hmm_device_lock);
- hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
- if (hmm_device->minor >= HMM_DEVICE_MAX) {
- spin_unlock(&hmm_device_lock);
- kfree(hmm_device);
- return ERR_PTR(-EBUSY);
- }
- set_bit(hmm_device->minor, hmm_device_mask);
- spin_unlock(&hmm_device_lock);
-
- dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
- hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
- hmm_device->minor);
- hmm_device->device.release = hmm_device_release;
- dev_set_drvdata(&hmm_device->device, drvdata);
- hmm_device->device.class = hmm_device_class;
- device_initialize(&hmm_device->device);
-
- return hmm_device;
-}
-EXPORT_SYMBOL(hmm_device_new);
-
-void hmm_device_put(struct hmm_device *hmm_device)
-{
- put_device(&hmm_device->device);
-}
-EXPORT_SYMBOL(hmm_device_put);
-
-static int __init hmm_init(void)
-{
- int ret;
-
- ret = alloc_chrdev_region(&hmm_device_devt, 0,
- HMM_DEVICE_MAX,
- "hmm_device");
- if (ret)
- return ret;
-
- hmm_device_class = class_create(THIS_MODULE, "hmm_device");
- if (IS_ERR(hmm_device_class)) {
- unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
- return PTR_ERR(hmm_device_class);
- }
- return 0;
-}
-
-device_initcall(hmm_init);
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 885642c82aaa..1334ede667a8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -63,10 +63,15 @@ struct page *huge_zero_page __read_mostly;
bool transparent_hugepage_enabled(struct vm_area_struct *vma)
{
+ /* The addr is used to check if the vma size fits */
+ unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+
+ if (!transhuge_vma_suitable(vma, addr))
+ return false;
if (vma_is_anonymous(vma))
return __transparent_hugepage_enabled(vma);
- if (vma_is_shmem(vma) && shmem_huge_enabled(vma))
- return __transparent_hugepage_enabled(vma);
+ if (vma_is_shmem(vma))
+ return shmem_huge_enabled(vma);
return false;
}
@@ -689,7 +694,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ if (!transhuge_vma_suitable(vma, haddr))
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
diff --git a/mm/maccess.c b/mm/maccess.c
index 482d4d670f19..d065736f6b87 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -6,8 +6,20 @@
#include <linux/mm.h>
#include <linux/uaccess.h>
+static __always_inline long
+probe_read_common(void *dst, const void __user *src, size_t size)
+{
+ long ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
/**
- * probe_kernel_read(): safely attempt to read from a location
+ * probe_kernel_read(): safely attempt to read from a kernel-space location
* @dst: pointer to the buffer that shall take the data
* @src: address to read from
* @size: size of the data chunk
@@ -30,17 +42,41 @@ long __probe_kernel_read(void *dst, const void *src, size_t size)
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_from_user_inatomic(dst,
- (__force const void __user *)src, size);
- pagefault_enable();
+ ret = probe_read_common(dst, (__force const void __user *)src, size);
set_fs(old_fs);
- return ret ? -EFAULT : 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(probe_kernel_read);
/**
+ * probe_user_read(): safely attempt to read from a user-space location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from. This must be a user address.
+ * @size: size of the data chunk
+ *
+ * Safely read from user address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_read(void *dst, const void __user *src, size_t size)
+ __attribute__((alias("__probe_user_read")));
+
+long __probe_user_read(void *dst, const void __user *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(USER_DS);
+ if (access_ok(src, size))
+ ret = probe_read_common(dst, src, size);
+ set_fs(old_fs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_read);
+
+/**
* probe_kernel_write(): safely attempt to write to a location
* @dst: address to write to
* @src: pointer to the data that shall be written
@@ -67,6 +103,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
}
EXPORT_SYMBOL_GPL(probe_kernel_write);
+
/**
* strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
* @dst: Destination address, in kernel space. This buffer must be at
@@ -106,3 +143,76 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
return ret ? -EFAULT : src - unsafe_addr;
}
+
+/**
+ * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user
+ * address.
+ * @dst: Destination address, in kernel space. This buffer must be at
+ * least @count bytes long.
+ * @unsafe_addr: Unsafe user address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe user address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL.
+ *
+ * If access fails, returns -EFAULT (some data may have been copied
+ * and the trailing NUL added).
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
+long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
+ long count)
+{
+ mm_segment_t old_fs = get_fs();
+ long ret;
+
+ if (unlikely(count <= 0))
+ return 0;
+
+ set_fs(USER_DS);
+ pagefault_disable();
+ ret = strncpy_from_user(dst, unsafe_addr, count);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret >= count) {
+ ret = count;
+ dst[ret - 1] = '\0';
+ } else if (ret > 0) {
+ ret++;
+ }
+
+ return ret;
+}
+
+/**
+ * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL.
+ * @unsafe_addr: The string to measure.
+ * @count: Maximum count (including NUL)
+ *
+ * Get the size of a NUL-terminated string in user space without pagefault.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ *
+ * If the string is too long, returns a number larger than @count. User
+ * has to check the return value against "> count".
+ * On exception (or invalid count), returns 0.
+ *
+ * Unlike strnlen_user, this can be used from IRQ handler etc. because
+ * it disables pagefaults.
+ */
+long strnlen_unsafe_user(const void __user *unsafe_addr, long count)
+{
+ mm_segment_t old_fs = get_fs();
+ int ret;
+
+ set_fs(USER_DS);
+ pagefault_disable();
+ ret = strnlen_user(unsafe_addr, count);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret;
+}
diff --git a/mm/madvise.c b/mm/madvise.c
index 628022e674a7..968df3aa069f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -354,7 +354,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
}
- page = _vm_normal_page(vma, addr, ptent, true);
+ page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4f05735b02d3..cdbb7a84cb6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -695,12 +695,15 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
if (mem_cgroup_disabled())
return;
- __this_cpu_add(memcg->vmstats_local->stat[idx], val);
-
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup *mi;
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
atomic_long_add(x, &mi->vmstats[idx]);
x = 0;
@@ -749,13 +752,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg */
__mod_memcg_state(memcg, idx, val);
- /* Update lruvec */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup_per_node *pi;
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
atomic_long_add(x, &pi->lruvec_stat[idx]);
x = 0;
@@ -777,12 +782,15 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (mem_cgroup_disabled())
return;
- __this_cpu_add(memcg->vmstats_local->events[idx], count);
-
x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
if (unlikely(x > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup *mi;
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
atomic_long_add(x, &mi->vmevents[idx]);
x = 0;
@@ -4908,7 +4916,7 @@ enum mc_target_type {
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
+ struct page *page = vm_normal_page(vma, addr, ptent);
if (!page || !page_mapped(page))
return NULL;
@@ -5109,8 +5117,8 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
+ * (so ZONE_DEVICE page and thus not on the lru).
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
@@ -5144,8 +5152,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page) ||
- is_device_public_page(page))
+ if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
@@ -5216,8 +5223,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
if (ptl) {
/*
* Note their can not be MC_TARGET_DEVICE for now as we do not
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
- * MEMORY_DEVICE_PRIVATE but this might change.
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
+ * this might change.
*/
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7e08cbf3ba49..7ef849da8278 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1177,16 +1177,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
goto unlock;
}
- switch (pgmap->type) {
- case MEMORY_DEVICE_PRIVATE:
- case MEMORY_DEVICE_PUBLIC:
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
/*
* TODO: Handle HMM pages which may need coordination
* with device-side memory.
*/
goto unlock;
- default:
- break;
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 53bd59579861..e2bb51b6242e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -571,8 +571,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
* PFNMAP mappings in order to support COWable mappings.
*
*/
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte, bool with_public_device)
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
@@ -585,29 +585,6 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (is_zero_pfn(pfn))
return NULL;
-
- /*
- * Device public pages are special pages (they are ZONE_DEVICE
- * pages but different from persistent memory). They behave
- * allmost like normal pages. The difference is that they are
- * not on the lru and thus should never be involve with any-
- * thing that involve lru manipulation (mlock, numa balancing,
- * ...).
- *
- * This is why we still want to return NULL for such page from
- * vm_normal_page() so that we do not have to special case all
- * call site of vm_normal_page().
- */
- if (likely(pfn <= highest_memmap_pfn)) {
- struct page *page = pfn_to_page(pfn);
-
- if (is_device_public_page(page)) {
- if (with_public_device)
- return page;
- return NULL;
- }
- }
-
if (pte_devmap(pte))
return NULL;
@@ -797,17 +774,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
rss[mm_counter(page)]++;
} else if (pte_devmap(pte)) {
page = pte_page(pte);
-
- /*
- * Cache coherent device memory behave like regular page and
- * not like persistent memory page. For more informations see
- * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
- */
- if (is_device_public_page(page)) {
- get_page(page);
- page_dup_rmap(page, false);
- rss[mm_counter(page)]++;
- }
}
out_set_pte:
@@ -1063,7 +1029,7 @@ again:
if (pte_present(ptent)) {
struct page *page;
- page = _vm_normal_page(vma, addr, ptent, true);
+ page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
@@ -2777,13 +2743,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
} else if (is_device_private_entry(entry)) {
- /*
- * For un-addressable device memory we call the pgmap
- * fault handler callback. The callback must migrate
- * the page back to some CPU accessible page.
- */
- ret = device_private_entry_fault(vma, vmf->address, entry,
- vmf->flags, vmf->pmd);
+ vmf->page = device_private_entry_to_page(entry);
+ ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
@@ -3201,19 +3162,6 @@ map_pte:
}
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
-
-#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
- unsigned long haddr)
-{
- if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
- (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
- return false;
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
- return false;
- return true;
-}
-
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e096c987d261..2a9bbddb0e55 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -166,9 +166,10 @@ void put_page_bootmem(struct page *page)
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
- unsigned long *usemap, mapsize, section_nr, i;
+ unsigned long mapsize, section_nr, i;
struct mem_section *ms;
struct page *page, *memmap;
+ struct mem_section_usage *usage;
section_nr = pfn_to_section_nr(start_pfn);
ms = __nr_to_section(section_nr);
@@ -188,10 +189,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, SECTION_INFO);
- usemap = ms->pageblock_flags;
- page = virt_to_page(usemap);
+ usage = ms->usage;
+ page = virt_to_page(usage);
- mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -200,9 +201,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
- unsigned long *usemap, mapsize, section_nr, i;
+ unsigned long mapsize, section_nr, i;
struct mem_section *ms;
struct page *page, *memmap;
+ struct mem_section_usage *usage;
section_nr = pfn_to_section_nr(start_pfn);
ms = __nr_to_section(section_nr);
@@ -211,10 +213,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
- usemap = ms->pageblock_flags;
- page = virt_to_page(usemap);
+ usage = ms->usage;
+ page = virt_to_page(usage);
- mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -250,22 +252,31 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
-static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
- struct vmem_altmap *altmap, bool want_memblock)
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
+ const char *reason)
{
- int ret;
-
- if (pfn_valid(phys_start_pfn))
- return -EEXIST;
-
- ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
- if (ret < 0)
- return ret;
-
- if (!want_memblock)
- return 0;
-
- return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
+ /*
+ * Disallow all operations smaller than a sub-section and only
+ * allow operations smaller than a section for
+ * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
+ * enforces a larger memory_block_size_bytes() granularity for
+ * memory that will be marked online, so this check should only
+ * fire for direct arch_{add,remove}_memory() users outside of
+ * add_memory_resource().
+ */
+ unsigned long min_align;
+
+ if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+ min_align = PAGES_PER_SUBSECTION;
+ else
+ min_align = PAGES_PER_SECTION;
+ if (!IS_ALIGNED(pfn, min_align)
+ || !IS_ALIGNED(nr_pages, min_align)) {
+ WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
+ reason, pfn, pfn + nr_pages - 1);
+ return -EINVAL;
+ }
+ return 0;
}
/*
@@ -274,62 +285,54 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
* call this function after deciding the zone to which to
* add the new pages.
*/
-int __ref __add_pages(int nid, unsigned long phys_start_pfn,
- unsigned long nr_pages, struct mhp_restrictions *restrictions)
+int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+ struct mhp_restrictions *restrictions)
{
- unsigned long i;
- int err = 0;
- int start_sec, end_sec;
+ int err;
+ unsigned long nr, start_sec, end_sec;
struct vmem_altmap *altmap = restrictions->altmap;
- /* during initialize mem_map, align hot-added range to section */
- start_sec = pfn_to_section_nr(phys_start_pfn);
- end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
-
if (altmap) {
/*
* Validate altmap is within bounds of the total request
*/
- if (altmap->base_pfn != phys_start_pfn
+ if (altmap->base_pfn != pfn
|| vmem_altmap_offset(altmap) > nr_pages) {
pr_warn_once("memory add fail, invalid altmap\n");
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
altmap->alloc = 0;
}
- for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, section_nr_to_pfn(i), altmap,
- restrictions->flags & MHP_MEMBLOCK_API);
+ err = check_pfn_span(pfn, nr_pages, "add");
+ if (err)
+ return err;
- /*
- * EEXIST is finally dealt with by ioresource collision
- * check. see add_memory() => register_memory_resource()
- * Warning will be printed if there is collision.
- */
- if (err && (err != -EEXIST))
+ start_sec = pfn_to_section_nr(pfn);
+ end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+ for (nr = start_sec; nr <= end_sec; nr++) {
+ unsigned long pfns;
+
+ pfns = min(nr_pages, PAGES_PER_SECTION
+ - (pfn & ~PAGE_SECTION_MASK));
+ err = sparse_add_section(nid, pfn, pfns, altmap);
+ if (err)
break;
- err = 0;
+ pfn += pfns;
+ nr_pages -= pfns;
cond_resched();
}
vmemmap_populate_print_last();
-out:
return err;
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
-
- for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(start_pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_valid(start_pfn)))
continue;
if (unlikely(pfn_to_nid(start_pfn) != nid))
@@ -349,15 +352,12 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
unsigned long pfn;
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
- for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_valid(pfn)))
continue;
if (unlikely(pfn_to_nid(pfn) != nid))
@@ -379,7 +379,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
unsigned long zone_end_pfn = z;
unsigned long pfn;
- struct mem_section *ms;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
@@ -416,17 +415,15 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
* it check the zone has only hole or not.
*/
pfn = zone_start_pfn;
- for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_valid(pfn)))
continue;
if (page_zone(pfn_to_page(pfn)) != zone)
continue;
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
+ /* Skip range to be removed */
+ if (pfn >= start_pfn && pfn < end_pfn)
continue;
/* If we find valid section, we have nothing to do */
@@ -447,7 +444,6 @@ static void shrink_pgdat_span(struct pglist_data *pgdat,
unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
unsigned long pgdat_end_pfn = p;
unsigned long pfn;
- struct mem_section *ms;
int nid = pgdat->node_id;
if (pgdat_start_pfn == start_pfn) {
@@ -484,17 +480,15 @@ static void shrink_pgdat_span(struct pglist_data *pgdat,
* has only hole or not.
*/
pfn = pgdat_start_pfn;
- for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_valid(pfn)))
continue;
if (pfn_to_nid(pfn) != nid)
continue;
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
+ /* Skip range to be removed */
+ if (pfn >= start_pfn && pfn < end_pfn)
continue;
/* If we find valid section, we have nothing to do */
@@ -506,10 +500,10 @@ static void shrink_pgdat_span(struct pglist_data *pgdat,
pgdat->node_spanned_pages = 0;
}
-static void __remove_zone(struct zone *zone, unsigned long start_pfn)
+static void __remove_zone(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages)
{
struct pglist_data *pgdat = zone->zone_pgdat;
- int nr_pages = PAGES_PER_SECTION;
unsigned long flags;
pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -518,29 +512,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
-static void __remove_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset,
- struct vmem_altmap *altmap)
+static void __remove_section(struct zone *zone, unsigned long pfn,
+ unsigned long nr_pages, unsigned long map_offset,
+ struct vmem_altmap *altmap)
{
- unsigned long start_pfn;
- int scn_nr;
+ struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
if (WARN_ON_ONCE(!valid_section(ms)))
return;
- unregister_memory_section(ms);
-
- scn_nr = __section_nr(ms);
- start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
- __remove_zone(zone, start_pfn);
-
- sparse_remove_one_section(zone, ms, map_offset, altmap);
+ __remove_zone(zone, pfn, nr_pages);
+ sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}
/**
* __remove_pages() - remove sections of pages from a zone
* @zone: zone from which pages need to be removed
- * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
+ * @pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
* @altmap: alternative device page map or %NULL if default memmap is used
*
@@ -549,40 +537,35 @@ static void __remove_section(struct zone *zone, struct mem_section *ms,
* sure that pages are marked reserved and zones are adjust properly by
* calling offline_pages().
*/
-void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+void __remove_pages(struct zone *zone, unsigned long pfn,
unsigned long nr_pages, struct vmem_altmap *altmap)
{
- unsigned long i;
unsigned long map_offset = 0;
- int sections_to_remove;
+ unsigned long nr, start_sec, end_sec;
- /* In the ZONE_DEVICE case device driver owns the memory region */
- if (is_dev_zone(zone)) {
- if (altmap)
- map_offset = vmem_altmap_offset(altmap);
- }
+ map_offset = vmem_altmap_offset(altmap);
clear_zone_contiguous(zone);
- /*
- * We can only remove entire sections
- */
- BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
- BUG_ON(nr_pages % PAGES_PER_SECTION);
+ if (check_pfn_span(pfn, nr_pages, "remove"))
+ return;
- sections_to_remove = nr_pages / PAGES_PER_SECTION;
- for (i = 0; i < sections_to_remove; i++) {
- unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+ start_sec = pfn_to_section_nr(pfn);
+ end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+ for (nr = start_sec; nr <= end_sec; nr++) {
+ unsigned long pfns;
cond_resched();
- __remove_section(zone, __pfn_to_section(pfn), map_offset,
- altmap);
+ pfns = min(nr_pages, PAGES_PER_SECTION
+ - (pfn & ~PAGE_SECTION_MASK));
+ __remove_section(zone, pfn, pfns, map_offset, altmap);
+ pfn += pfns;
+ nr_pages -= pfns;
map_offset = 0;
}
set_zone_contiguous(zone);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
int set_online_page_callback(online_page_callback_t callback)
{
@@ -1051,16 +1034,11 @@ int try_online_node(int nid)
static int check_hotplug_memory_range(u64 start, u64 size)
{
- unsigned long block_sz = memory_block_size_bytes();
- u64 block_nr_pages = block_sz >> PAGE_SHIFT;
- u64 nr_pages = size >> PAGE_SHIFT;
- u64 start_pfn = PFN_DOWN(start);
-
/* memory range must be block size aligned */
- if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) ||
- !IS_ALIGNED(nr_pages, block_nr_pages)) {
+ if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
+ !IS_ALIGNED(size, memory_block_size_bytes())) {
pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
- block_sz, start, size);
+ memory_block_size_bytes(), start, size);
return -EINVAL;
}
@@ -1080,9 +1058,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*/
int __ref add_memory_resource(int nid, struct resource *res)
{
- struct mhp_restrictions restrictions = {
- .flags = MHP_MEMBLOCK_API,
- };
+ struct mhp_restrictions restrictions = {};
u64 start, size;
bool new_node = false;
int ret;
@@ -1114,6 +1090,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
if (ret < 0)
goto error;
+ /* create memory block devices after memory was added */
+ ret = create_memory_block_devices(start, size);
+ if (ret) {
+ arch_remove_memory(nid, start, size, NULL);
+ goto error;
+ }
+
if (new_node) {
/* If sysfs file of new node can't be created, cpu on the node
* can't be hot-added. There is no rollback way now.
@@ -1137,8 +1120,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* online pages if requested */
if (memhp_auto_online)
- walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
- NULL, online_memory_block);
+ walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
error:
@@ -1673,58 +1655,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
-
-/**
- * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
- * @start_pfn: start pfn of the memory range
- * @end_pfn: end pfn of the memory range
- * @arg: argument passed to func
- * @func: callback for each memory section walked
- *
- * This function walks through all present mem sections in range
- * [start_pfn, end_pfn) and call func on each mem section.
- *
- * Returns the return value of func.
- */
-int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
- void *arg, int (*func)(struct memory_block *, void *))
-{
- struct memory_block *mem = NULL;
- struct mem_section *section;
- unsigned long pfn, section_nr;
- int ret;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- section_nr = pfn_to_section_nr(pfn);
- if (!present_section_nr(section_nr))
- continue;
-
- section = __nr_to_section(section_nr);
- /* same memblock? */
- if (mem)
- if ((section_nr >= mem->start_section_nr) &&
- (section_nr <= mem->end_section_nr))
- continue;
-
- mem = find_memory_block_hinted(section, mem);
- if (!mem)
- continue;
-
- ret = func(mem, arg);
- if (ret) {
- kobject_put(&mem->dev.kobj);
- return ret;
- }
- }
-
- if (mem)
- kobject_put(&mem->dev.kobj);
-
- return 0;
-}
-#ifdef CONFIG_MEMORY_HOTREMOVE
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1736,9 +1667,10 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
- }
- return ret;
+ return -EBUSY;
+ }
+ return 0;
}
static int check_cpu_on_node(pg_data_t *pgdat)
@@ -1821,19 +1753,9 @@ static void __release_memory_resource(resource_size_t start,
}
}
-/**
- * remove_memory
- * @nid: the node ID
- * @start: physical address of the region to remove
- * @size: size of the region to remove
- *
- * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
- * and online/offline operations before this call, as required by
- * try_offline_node().
- */
-void __ref __remove_memory(int nid, u64 start, u64 size)
+static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
- int ret;
+ int rc = 0;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -1841,32 +1763,65 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
/*
* All memory blocks must be offlined before removing memory. Check
- * whether all memory blocks in question are offline and trigger a BUG()
+ * whether all memory blocks in question are offline and return error
* if this is not the case.
*/
- ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
- check_memblock_offlined_cb);
- if (ret)
- BUG();
+ rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
+ if (rc)
+ goto done;
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
memblock_free(start, size);
memblock_remove(start, size);
+ /* remove memory block devices before removing memory */
+ remove_memory_block_devices(start, size);
+
arch_remove_memory(nid, start, size, NULL);
__release_memory_resource(start, size);
try_offline_node(nid);
+done:
mem_hotplug_done();
+ return rc;
}
-void remove_memory(int nid, u64 start, u64 size)
+/**
+ * remove_memory
+ * @nid: the node ID
+ * @start: physical address of the region to remove
+ * @size: size of the region to remove
+ *
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations before this call, as required by
+ * try_offline_node().
+ */
+void __remove_memory(int nid, u64 start, u64 size)
{
+
+ /*
+ * trigger BUG() is some memory is not offlined prior to calling this
+ * function
+ */
+ if (try_remove_memory(nid, start, size))
+ BUG();
+}
+
+/*
+ * Remove memory if every memory block is offline, otherwise return -EBUSY is
+ * some memory is not offline
+ */
+int remove_memory(int nid, u64 start, u64 size)
+{
+ int rc;
+
lock_device_hotplug();
- __remove_memory(nid, start, size);
+ rc = try_remove_memory(nid, start, size);
unlock_device_hotplug();
+
+ return rc;
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index fdcb73536319..f48693f75b37 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2098,6 +2098,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
out:
return page;
}
+EXPORT_SYMBOL(alloc_pages_vma);
/**
* alloc_pages_current - Allocate pages.
diff --git a/mm/migrate.c b/mm/migrate.c
index e9594bc0d406..8992741f10aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -246,8 +246,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_device_private_page(new)) {
entry = make_device_private_entry(new, pte_write(pte));
pte = swp_entry_to_pte(entry);
- } else if (is_device_public_page(new)) {
- pte = pte_mkdevmap(pte);
}
}
@@ -381,7 +379,6 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
* ZONE_DEVICE pages.
*/
expected_count += is_device_private_page(page);
- expected_count += is_device_public_page(page);
if (mapping)
expected_count += hpage_nr_pages(page) + page_has_private(page);
@@ -397,8 +394,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode,
- int extra_count)
+ struct page *newpage, struct page *page, int extra_count)
{
XA_STATE(xas, &mapping->i_pages, page_index(page));
struct zone *oldzone, *newzone;
@@ -684,7 +680,7 @@ int migrate_page(struct address_space *mapping,
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -783,7 +779,7 @@ recheck_buffers:
}
}
- rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
if (rc != MIGRATEPAGE_SUCCESS)
goto unlock_buffers;
@@ -994,10 +990,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
if (!PageMappingFlags(page))
page->mapping = NULL;
- if (unlikely(is_zone_device_page(newpage))) {
- if (is_device_public_page(newpage))
- flush_dcache_page(newpage);
- } else
+ if (likely(!is_zone_device_page(newpage)))
flush_dcache_page(newpage);
}
@@ -2265,7 +2258,7 @@ again:
pfn = 0;
goto next;
}
- page = _vm_normal_page(migrate->vma, addr, pte, true);
+ page = vm_normal_page(migrate->vma, addr, pte);
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
@@ -2406,16 +2399,7 @@ static bool migrate_vma_check_page(struct page *page)
* FIXME proper solution is to rework migration_entry_wait() so
* it does not need to take a reference on page.
*/
- if (is_device_private_page(page))
- return true;
-
- /*
- * Only allow device public page to be migrated and account for
- * the extra reference count imply by ZONE_DEVICE pages.
- */
- if (!is_device_public_page(page))
- return false;
- extra++;
+ return is_device_private_page(page);
}
/* For file back page */
@@ -2665,11 +2649,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
entry = swp_entry_to_pte(swp_entry);
- } else if (is_device_public_page(page)) {
- entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
- if (vma->vm_flags & VM_WRITE)
- entry = pte_mkwrite(pte_mkdirty(entry));
- entry = pte_mkdevmap(entry);
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
@@ -2789,7 +2768,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
- } else if (!is_device_public_page(newpage)) {
+ } else {
/*
* Other types of ZONE_DEVICE page are not
* supported.
diff --git a/mm/nommu.c b/mm/nommu.c
index eb3e2e558da1..fed1b6e9c89b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1261,7 +1261,9 @@ unsigned long do_mmap(struct file *file,
add_nommu_region(region);
/* clear anonymous mappings that don't ask for uninitialized data */
- if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
+ if (!vma->vm_file &&
+ (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
+ !(flags & MAP_UNINITIALIZED)))
memset((void *)region->vm_start, 0,
region->vm_end - region->vm_start);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dbd0d5cbbcbb..272c6de1bf4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -450,7 +450,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
- return __pfn_to_section(pfn)->pageblock_flags;
+ return section_to_usemap(__pfn_to_section(pfn));
#else
return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
@@ -4102,7 +4102,6 @@ static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
- struct reclaim_state reclaim_state;
int progress;
unsigned int noreclaim_flag;
unsigned long pflags;
@@ -4114,13 +4113,10 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
- reclaim_state.reclaimed_slab = 0;
- current->reclaim_state = &reclaim_state;
progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
ac->nodemask);
- current->reclaim_state = NULL;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
psi_memstall_leave(&pflags);
@@ -5925,11 +5921,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
{
unsigned long pfn, end_pfn = start_pfn + size;
struct pglist_data *pgdat = zone->zone_pgdat;
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
unsigned long zone_idx = zone_idx(zone);
unsigned long start = jiffies;
int nid = pgdat->node_id;
- if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
+ if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
return;
/*
@@ -5937,9 +5934,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
* of the pages reserved for the memmap, so we can just jump to
* the end of that region and start processing the device pages.
*/
- if (pgmap->altmap_valid) {
- struct vmem_altmap *altmap = &pgmap->altmap;
-
+ if (altmap) {
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
size = end_pfn - start_pfn;
}
@@ -5959,12 +5954,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
__SetPageReserved(page);
/*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back
- * pointer and hmm_data. It is a bug if a ZONE_DEVICE
- * page is ever freed or placed on a driver-private list.
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
+ * ever freed or placed on a driver-private list.
*/
page->pgmap = pgmap;
- page->hmm_data = 0;
+ page->zone_device_data = NULL;
/*
* Mark the block movable so that blocks are reserved for
@@ -5979,7 +5974,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
* pfn out of zone.
*
* Please note that MEMMAP_HOTPLUG path doesn't clear memmap
- * because this is done early in sparse_add_one_section
+ * because this is done early in section_activate()
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
@@ -7356,12 +7351,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
(u64)zone_movable_pfn[i] << PAGE_SHIFT);
}
- /* Print out the early node map */
+ /*
+ * Print out the early node map, and initialize the
+ * subsection-map relative to active online memory ranges to
+ * enable future "sub-section" extensions of the memory map.
+ */
pr_info("Early memory node ranges\n");
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
((u64)end_pfn << PAGE_SHIFT) - 1);
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
+ }
/* Initialise every node */
mminit_verify_pageflags_layout();
diff --git a/mm/shmem.c b/mm/shmem.c
index f4dce9c8670d..626d8c74b973 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -400,7 +400,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
static int shmem_huge __read_mostly;
-#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
+#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
if (!strcmp(str, "never"))
@@ -417,7 +417,9 @@ static int shmem_parse_huge(const char *str)
return SHMEM_HUGE_FORCE;
return -EINVAL;
}
+#endif
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
{
switch (huge) {
@@ -3775,10 +3777,6 @@ int __init shmem_init(void)
{
int error;
- /* If rootfs called this, don't re-init */
- if (shmem_inode_cachep)
- return 0;
-
shmem_init_inodecache();
error = register_filesystem(&shmem_fs_type);
@@ -3872,6 +3870,9 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
loff_t i_size;
pgoff_t off;
+ if ((vma->vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ return false;
if (shmem_huge == SHMEM_HUGE_FORCE)
return true;
if (shmem_huge == SHMEM_HUGE_DENY)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6c49dbb3769e..807490fe217a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1028,7 +1028,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
}
struct kmem_cache *
-kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
+kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
+{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
EXPORT_SYMBOL(kmalloc_caches);
/*
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 7fec05796796..200aef686722 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -245,19 +245,26 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
return 0;
}
-struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
- struct vmem_altmap *altmap)
+struct page * __meminit __populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
unsigned long start;
unsigned long end;
- struct page *map;
- map = pfn_to_page(pnum * PAGES_PER_SECTION);
- start = (unsigned long)map;
- end = (unsigned long)(map + PAGES_PER_SECTION);
+ /*
+ * The minimum granularity of memmap extensions is
+ * PAGES_PER_SUBSECTION as allocations are tracked in the
+ * 'subsection_map' bitmap of the section.
+ */
+ end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION);
+ pfn &= PAGE_SUBSECTION_MASK;
+ nr_pages = end - pfn;
+
+ start = (unsigned long) pfn_to_page(pfn);
+ end = start + nr_pages * sizeof(struct page);
if (vmemmap_populate(start, end, nid, altmap))
return NULL;
- return map;
+ return pfn_to_page(pfn);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index fd13166949b5..72f010d9bff5 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -83,8 +83,15 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
unsigned long root = SECTION_NR_TO_ROOT(section_nr);
struct mem_section *section;
+ /*
+ * An existing section is possible in the sub-section hotplug
+ * case. First hot-add instantiates, follow-on hot-add reuses
+ * the existing section.
+ *
+ * The mem_hotplug_lock resolves the apparent race below.
+ */
if (mem_section[root])
- return -EEXIST;
+ return 0;
section = sparse_index_alloc(nid);
if (!section)
@@ -102,7 +109,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
#endif
#ifdef CONFIG_SPARSEMEM_EXTREME
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
{
unsigned long root_nr;
struct mem_section *root = NULL;
@@ -121,9 +128,9 @@ int __section_nr(struct mem_section* ms)
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
#else
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
{
- return (int)(ms - mem_section[0]);
+ return (unsigned long)(ms - mem_section[0]);
}
#endif
@@ -178,10 +185,10 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
* Keeping track of this gives us an easy way to break out of
* those loops early.
*/
-int __highest_present_section_nr;
+unsigned long __highest_present_section_nr;
static void section_mark_present(struct mem_section *ms)
{
- int section_nr = __section_nr(ms);
+ unsigned long section_nr = __section_nr(ms);
if (section_nr > __highest_present_section_nr)
__highest_present_section_nr = section_nr;
@@ -189,7 +196,7 @@ static void section_mark_present(struct mem_section *ms)
ms->section_mem_map |= SECTION_MARKED_PRESENT;
}
-static inline int next_present_section_nr(int section_nr)
+static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
do {
section_nr++;
@@ -210,6 +217,41 @@ static inline unsigned long first_present_section_nr(void)
return next_present_section_nr(-1);
}
+void subsection_mask_set(unsigned long *map, unsigned long pfn,
+ unsigned long nr_pages)
+{
+ int idx = subsection_map_index(pfn);
+ int end = subsection_map_index(pfn + nr_pages - 1);
+
+ bitmap_set(map, idx, end - idx + 1);
+}
+
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+ int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+ unsigned long nr, start_sec = pfn_to_section_nr(pfn);
+
+ if (!nr_pages)
+ return;
+
+ for (nr = start_sec; nr <= end_sec; nr++) {
+ struct mem_section *ms;
+ unsigned long pfns;
+
+ pfns = min(nr_pages, PAGES_PER_SECTION
+ - (pfn & ~PAGE_SECTION_MASK));
+ ms = __nr_to_section(nr);
+ subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
+
+ pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
+ pfns, subsection_map_index(pfn),
+ subsection_map_index(pfn + pfns - 1));
+
+ pfn += pfns;
+ nr_pages -= pfns;
+ }
+}
+
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -288,33 +330,31 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
- unsigned long *pageblock_bitmap)
+ struct mem_section_usage *usage, unsigned long flags)
{
ms->section_mem_map &= ~SECTION_MAP_MASK;
- ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
- SECTION_HAS_MEM_MAP;
- ms->pageblock_flags = pageblock_bitmap;
+ ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
+ | SECTION_HAS_MEM_MAP | flags;
+ ms->usage = usage;
}
-unsigned long usemap_size(void)
+static unsigned long usemap_size(void)
{
return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static unsigned long *__kmalloc_section_usemap(void)
+size_t mem_section_usage_size(void)
{
- return kmalloc(usemap_size(), GFP_KERNEL);
+ return sizeof(struct mem_section_usage) + usemap_size();
}
-#endif /* CONFIG_MEMORY_HOTPLUG */
#ifdef CONFIG_MEMORY_HOTREMOVE
-static unsigned long * __init
+static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
+ struct mem_section_usage *usage;
unsigned long goal, limit;
- unsigned long *p;
int nid;
/*
* A page may contain usemaps for other sections preventing the
@@ -330,15 +370,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
- p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
- if (!p && limit) {
+ usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
+ if (!usage && limit) {
limit = 0;
goto again;
}
- return p;
+ return usage;
}
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+ struct mem_section_usage *usage)
{
unsigned long usemap_snr, pgdat_snr;
static unsigned long old_usemap_snr;
@@ -352,7 +393,7 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
old_pgdat_snr = NR_MEM_SECTIONS;
}
- usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
+ usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
return;
@@ -380,14 +421,15 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
usemap_snr, pgdat_snr, nid);
}
#else
-static unsigned long * __init
+static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+ struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -404,8 +446,8 @@ static unsigned long __init section_map_size(void)
return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}
-struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
- struct vmem_altmap *altmap)
+struct page __init *__populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
unsigned long size = section_map_size();
struct page *map = sparse_buffer_alloc(size);
@@ -474,32 +516,35 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long map_count)
{
- unsigned long pnum, usemap_longs, *usemap;
+ struct mem_section_usage *usage;
+ unsigned long pnum;
struct page *map;
- usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
- usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
- usemap_size() *
- map_count);
- if (!usemap) {
+ usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+ mem_section_usage_size() * map_count);
+ if (!usage) {
pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
goto failed;
}
sparse_buffer_init(map_count * section_map_size(), nid);
for_each_present_section_nr(pnum_begin, pnum) {
+ unsigned long pfn = section_nr_to_pfn(pnum);
+
if (pnum >= pnum_end)
break;
- map = sparse_mem_map_populate(pnum, nid, NULL);
+ map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+ nid, NULL);
if (!map) {
pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
__func__, nid);
pnum_begin = pnum;
goto failed;
}
- check_usemap_section_nr(nid, usemap);
- sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
- usemap += usemap_longs;
+ check_usemap_section_nr(nid, usage);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
+ SECTION_IS_EARLY);
+ usage = (void *) usage + mem_section_usage_size();
}
sparse_buffer_fini();
return;
@@ -590,21 +635,20 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
- struct vmem_altmap *altmap)
+static struct page *populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
- /* This will make the necessary allocations eventually. */
- return sparse_mem_map_populate(pnum, nid, altmap);
+ return __populate_section_memmap(pfn, nr_pages, nid, altmap);
}
-static void __kfree_section_memmap(struct page *memmap,
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- unsigned long start = (unsigned long)memmap;
- unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+ unsigned long start = (unsigned long) pfn_to_page(pfn);
+ unsigned long end = start + nr_pages * sizeof(struct page);
vmemmap_free(start, end, altmap);
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap)
{
unsigned long start = (unsigned long)memmap;
@@ -612,9 +656,9 @@ static void free_map_bootmem(struct page *memmap)
vmemmap_free(start, end, NULL);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#else
-static struct page *__kmalloc_section_memmap(void)
+struct page *populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
struct page *page, *ret;
unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
@@ -635,15 +679,11 @@ got_map_ptr:
return ret;
}
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- return __kmalloc_section_memmap();
-}
+ struct page *memmap = pfn_to_page(pfn);
-static void __kfree_section_memmap(struct page *memmap,
- struct vmem_altmap *altmap)
-{
if (is_vmalloc_addr(memmap))
vfree(memmap);
else
@@ -651,7 +691,6 @@ static void __kfree_section_memmap(struct page *memmap,
get_order(sizeof(struct page) * PAGES_PER_SECTION));
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap)
{
unsigned long maps_section_nr, removing_section_nr, i;
@@ -681,13 +720,122 @@ static void free_map_bootmem(struct page *memmap)
put_page_bootmem(page);
}
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
+{
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+ struct mem_section *ms = __pfn_to_section(pfn);
+ bool section_is_early = early_section(ms);
+ struct page *memmap = NULL;
+ unsigned long *subsection_map = ms->usage
+ ? &ms->usage->subsection_map[0] : NULL;
+
+ subsection_mask_set(map, pfn, nr_pages);
+ if (subsection_map)
+ bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
+
+ if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+ "section already deactivated (%#lx + %ld)\n",
+ pfn, nr_pages))
+ return;
+
+ /*
+ * There are 3 cases to handle across two configurations
+ * (SPARSEMEM_VMEMMAP={y,n}):
+ *
+ * 1/ deactivation of a partial hot-added section (only possible
+ * in the SPARSEMEM_VMEMMAP=y case).
+ * a/ section was present at memory init
+ * b/ section was hot-added post memory init
+ * 2/ deactivation of a complete hot-added section
+ * 3/ deactivation of a complete section from memory init
+ *
+ * For 1/, when subsection_map does not empty we will not be
+ * freeing the usage map, but still need to free the vmemmap
+ * range.
+ *
+ * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
+ */
+ bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+ if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+ unsigned long section_nr = pfn_to_section_nr(pfn);
+
+ if (!section_is_early) {
+ kfree(ms->usage);
+ ms->usage = NULL;
+ }
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+ ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
+ }
+
+ if (section_is_early && memmap)
+ free_map_bootmem(memmap);
+ else
+ depopulate_section_memmap(pfn, nr_pages, altmap);
+}
+
+static struct page * __meminit section_activate(int nid, unsigned long pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ struct mem_section *ms = __pfn_to_section(pfn);
+ struct mem_section_usage *usage = NULL;
+ unsigned long *subsection_map;
+ struct page *memmap;
+ int rc = 0;
+
+ subsection_mask_set(map, pfn, nr_pages);
+
+ if (!ms->usage) {
+ usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+ if (!usage)
+ return ERR_PTR(-ENOMEM);
+ ms->usage = usage;
+ }
+ subsection_map = &ms->usage->subsection_map[0];
+
+ if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+ rc = -EINVAL;
+ else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+ rc = -EEXIST;
+ else
+ bitmap_or(subsection_map, map, subsection_map,
+ SUBSECTIONS_PER_SECTION);
+
+ if (rc) {
+ if (usage)
+ ms->usage = NULL;
+ kfree(usage);
+ return ERR_PTR(rc);
+ }
+
+ /*
+ * The early init code does not consider partially populated
+ * initial sections, it simply assumes that memory will never be
+ * referenced. If we hot-add memory into such a section then we
+ * do not need to populate the memmap and can simply reuse what
+ * is already there.
+ */
+ if (nr_pages < PAGES_PER_SECTION && early_section(ms))
+ return pfn_to_page(pfn);
+
+ memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
+ if (!memmap) {
+ section_deactivate(pfn, nr_pages, altmap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return memmap;
+}
+
/**
- * sparse_add_one_section - add a memory section
+ * sparse_add_section - add a memory section, or populate an existing one
* @nid: The node to add section on
* @start_pfn: start pfn of the memory range
+ * @nr_pages: number of pfns to add in the section
* @altmap: device page map
*
* This is only intended for hotplug.
@@ -697,56 +845,40 @@ static void free_map_bootmem(struct page *memmap)
* * -EEXIST - Section has been present.
* * -ENOMEM - Out of memory.
*/
-int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
- struct vmem_altmap *altmap)
+int __meminit sparse_add_section(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct mem_section *ms;
struct page *memmap;
- unsigned long *usemap;
int ret;
- /*
- * no locking for this, because it does its own
- * plus, it does a kmalloc
- */
ret = sparse_index_init(section_nr, nid);
- if (ret < 0 && ret != -EEXIST)
+ if (ret < 0)
return ret;
- ret = 0;
- memmap = kmalloc_section_memmap(section_nr, nid, altmap);
- if (!memmap)
- return -ENOMEM;
- usemap = __kmalloc_section_usemap();
- if (!usemap) {
- __kfree_section_memmap(memmap, altmap);
- return -ENOMEM;
- }
- ms = __pfn_to_section(start_pfn);
- if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
- ret = -EEXIST;
- goto out;
- }
+ memmap = section_activate(nid, start_pfn, nr_pages, altmap);
+ if (IS_ERR(memmap))
+ return PTR_ERR(memmap);
/*
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
+ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+ ms = __pfn_to_section(start_pfn);
+ set_section_nid(section_nr, nid);
section_mark_present(ms);
- sparse_init_one_section(ms, section_nr, memmap, usemap);
-out:
- if (ret < 0) {
- kfree(usemap);
- __kfree_section_memmap(memmap, altmap);
- }
- return ret;
+ /* Align memmap to section boundary in the subsection case */
+ if (section_nr_to_pfn(section_nr) != start_pfn)
+ memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr));
+ sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
+
+ return 0;
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
@@ -777,51 +909,12 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
}
#endif
-static void free_section_usemap(struct page *memmap, unsigned long *usemap,
+void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
+ unsigned long nr_pages, unsigned long map_offset,
struct vmem_altmap *altmap)
{
- struct page *usemap_page;
-
- if (!usemap)
- return;
-
- usemap_page = virt_to_page(usemap);
- /*
- * Check to see if allocation came from hot-plug-add
- */
- if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
- kfree(usemap);
- if (memmap)
- __kfree_section_memmap(memmap, altmap);
- return;
- }
-
- /*
- * The usemap came from bootmem. This is packed with other usemaps
- * on the section which has pgdat at boot time. Just keep it as is now.
- */
-
- if (memmap)
- free_map_bootmem(memmap);
-}
-
-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset, struct vmem_altmap *altmap)
-{
- struct page *memmap = NULL;
- unsigned long *usemap = NULL;
-
- if (ms->section_mem_map) {
- usemap = ms->pageblock_flags;
- memmap = sparse_decode_mem_map(ms->section_mem_map,
- __section_nr(ms));
- ms->section_mem_map = 0;
- ms->pageblock_flags = NULL;
- }
-
- clear_hwpoisoned_pages(memmap + map_offset,
- PAGES_PER_SECTION - map_offset);
- free_section_usemap(memmap, usemap, altmap);
+ clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
+ nr_pages - map_offset);
+ section_deactivate(pfn, nr_pages, altmap);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 7ede3eddc12a..ae300397dfda 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -8,7 +8,7 @@
/*
* This file contains the default values for the operation of the
* Linux VM subsystem. Fine-tuning documentation can be found in
- * Documentation/sysctl/vm.txt.
+ * Documentation/admin-guide/sysctl/vm.rst.
* Started 18.12.91
* Swap aging added 23.2.95, Stephen Tweedie.
* Buffermem limits added 12.3.98, Rik van Riel.
@@ -740,15 +740,20 @@ void release_pages(struct page **pages, int nr)
if (is_huge_zero_page(page))
continue;
- /* Device public page can not be huge page */
- if (is_device_public_page(page)) {
+ if (is_zone_device_page(page)) {
if (locked_pgdat) {
spin_unlock_irqrestore(&locked_pgdat->lru_lock,
flags);
locked_pgdat = NULL;
}
- put_devmap_managed_page(page);
- continue;
+ /*
+ * ZONE_DEVICE pages that return 'false' from
+ * put_devmap_managed_page() do not require special
+ * processing, and instead, expect a call to
+ * put_page_testzero().
+ */
+ if (put_devmap_managed_page(page))
+ continue;
}
page = compound_head(page);
diff --git a/mm/util.c b/mm/util.c
index 68575a315dc5..e6351a80f248 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,7 @@
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
@@ -300,6 +301,80 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
}
#endif
+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm: mm to account against
+ * @pages: number of pages to account
+ * @inc: %true if @pages should be considered positive, %false if not
+ * @task: task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_sem is held as writer.
+ *
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+ struct task_struct *task, bool bypass_rlim)
+{
+ unsigned long locked_vm, limit;
+ int ret = 0;
+
+ lockdep_assert_held_write(&mm->mmap_sem);
+
+ locked_vm = mm->locked_vm;
+ if (inc) {
+ if (!bypass_rlim) {
+ limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked_vm + pages > limit)
+ ret = -ENOMEM;
+ }
+ if (!ret)
+ mm->locked_vm = locked_vm + pages;
+ } else {
+ WARN_ON_ONCE(pages > locked_vm);
+ mm->locked_vm = locked_vm - pages;
+ }
+
+ pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+ (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+ locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+ ret ? " - exceeded" : "");
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__account_locked_vm);
+
+/**
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm: mm to account against, may be NULL
+ * @pages: number of pages to account
+ * @inc: %true if @pages should be considered positive, %false if not
+ *
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
+ *
+ * Return:
+ * * 0 on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
+{
+ int ret;
+
+ if (pages == 0 || !mm)
+ return 0;
+
+ down_write(&mm->mmap_sem);
+ ret = __account_locked_vm(mm, pages, inc, current,
+ capable(CAP_IPC_LOCK));
+ up_write(&mm->mmap_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(account_locked_vm);
+
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f8e3dcd527b8..44df66a98f2a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,9 @@ struct scan_control {
unsigned int file_taken;
unsigned int taken;
} nr;
+
+ /* for recording the reclaimed slab by now */
+ struct reclaim_state reclaim_state;
};
#ifdef ARCH_HAS_PREFETCH
@@ -238,6 +241,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
}
#endif /* CONFIG_MEMCG_KMEM */
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
@@ -3191,11 +3206,13 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
+ set_task_reclaim_state(current, &sc.reclaim_state);
trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
@@ -3218,6 +3235,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
};
unsigned long lru_pages;
+ set_task_reclaim_state(current, &sc.reclaim_state);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -3235,7 +3253,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
*nr_scanned = sc.nr_scanned;
+
return sc.nr_reclaimed;
}
@@ -3262,6 +3282,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_shrinkslab = 1,
};
+ set_task_reclaim_state(current, &sc.reclaim_state);
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
* take care of from where we get pages. So the node where we start the
@@ -3282,6 +3303,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
psi_memstall_leave(&pflags);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
@@ -3483,6 +3505,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
.may_unmap = 1,
};
+ set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire();
@@ -3664,6 +3687,8 @@ out:
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
psi_memstall_leave(&pflags);
+ set_task_reclaim_state(current, NULL);
+
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
@@ -3787,15 +3812,10 @@ static int kswapd(void *p)
unsigned int classzone_idx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
-
- struct reclaim_state reclaim_state = {
- .reclaimed_slab = 0,
- };
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
- current->reclaim_state = &reclaim_state;
/*
* Tell the memory management that we're a "memory allocator",
@@ -3857,7 +3877,6 @@ kswapd_try_sleep:
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
- current->reclaim_state = NULL;
return 0;
}
@@ -3922,7 +3941,6 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
*/
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
- struct reclaim_state reclaim_state;
struct scan_control sc = {
.nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3934,18 +3952,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.hibernation_mode = 1,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
- struct task_struct *p = current;
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
fs_reclaim_acquire(sc.gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ set_task_reclaim_state(current, &sc.reclaim_state);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
- p->reclaim_state = NULL;
+ set_task_reclaim_state(current, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
@@ -4110,7 +4126,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
- struct reclaim_state reclaim_state;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -4135,8 +4150,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ set_task_reclaim_state(p, &sc.reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
@@ -4148,7 +4162,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
- p->reclaim_state = NULL;
+ set_task_reclaim_state(p, NULL);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index dfcd69d08c1e..1a029a7432ee 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -26,7 +26,6 @@
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
-#include <linux/dcache.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
@@ -36,12 +35,14 @@
#include <linux/compaction.h>
#include <linux/percpu.h>
#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
+#include <linux/magic.h>
/*
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
@@ -101,6 +102,7 @@ struct z3fold_buddy_slots {
* @refcount: reference count for the z3fold page
* @work: work_struct for page layout optimization
* @slots: pointer to the structure holding buddy slots
+ * @pool: pointer to the containing pool
* @cpu: CPU which this page "belongs" to
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @middle_chunks: the size of the middle buddy in chunks, 0 if free
@@ -114,6 +116,7 @@ struct z3fold_header {
struct kref refcount;
struct work_struct work;
struct z3fold_buddy_slots *slots;
+ struct z3fold_pool *pool;
short cpu;
unsigned short first_chunks;
unsigned short middle_chunks;
@@ -193,8 +196,10 @@ static void compact_page_work(struct work_struct *w);
static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
gfp_t gfp)
{
- struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle,
- gfp);
+ struct z3fold_buddy_slots *slots;
+
+ slots = kmem_cache_alloc(pool->c_handle,
+ (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
if (slots) {
memset(slots->slot, 0, sizeof(slots->slot));
@@ -241,19 +246,14 @@ static inline void free_handle(unsigned long handle)
}
}
-static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int z3fold_init_fs_context(struct fs_context *fc)
{
- static const struct dentry_operations ops = {
- .d_dname = simple_dname,
- };
-
- return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33);
+ return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
}
static struct file_system_type z3fold_fs = {
.name = "z3fold",
- .mount = z3fold_do_mount,
+ .init_fs_context = z3fold_init_fs_context,
.kill_sb = kill_anon_super,
};
@@ -320,6 +320,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
zhdr->start_middle = 0;
zhdr->cpu = -1;
zhdr->slots = slots;
+ zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
INIT_WORK(&zhdr->work, compact_page_work);
return zhdr;
@@ -426,7 +427,7 @@ static enum buddy handle_to_buddy(unsigned long handle)
static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
{
- return slots_to_pool(zhdr->slots);
+ return zhdr->pool;
}
static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
@@ -850,7 +851,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
enum buddy bud;
bool can_sleep = gfpflags_allow_blocking(gfp);
- if (!size || (gfp & __GFP_HIGHMEM))
+ if (!size)
return -EINVAL;
if (size > PAGE_SIZE)
@@ -1345,24 +1346,29 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
zhdr = page_address(page);
pool = zhdr_to_pool(zhdr);
- if (!trylock_page(page))
- return -EAGAIN;
-
if (!z3fold_page_trylock(zhdr)) {
- unlock_page(page);
return -EAGAIN;
}
if (zhdr->mapped_count != 0) {
z3fold_page_unlock(zhdr);
- unlock_page(page);
return -EBUSY;
}
+ if (work_pending(&zhdr->work)) {
+ z3fold_page_unlock(zhdr);
+ return -EAGAIN;
+ }
new_zhdr = page_address(newpage);
memcpy(new_zhdr, zhdr, PAGE_SIZE);
newpage->private = page->private;
page->private = 0;
z3fold_page_unlock(zhdr);
spin_lock_init(&new_zhdr->page_lock);
+ INIT_WORK(&new_zhdr->work, compact_page_work);
+ /*
+ * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
+ * so we only have to reinitialize it.
+ */
+ INIT_LIST_HEAD(&new_zhdr->buddy);
new_mapping = page_mapping(page);
__ClearPageMovable(page);
ClearPagePrivate(page);
@@ -1386,7 +1392,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
page_mapcount_reset(page);
- unlock_page(page);
put_page(page);
return 0;
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index db09eb3669c5..57fbb7ced69f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -52,6 +52,7 @@
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
#include <linux/migrate.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
@@ -1798,19 +1799,14 @@ static void lock_zspage(struct zspage *zspage)
} while ((page = get_next_page(page)) != NULL);
}
-static struct dentry *zs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int zs_init_fs_context(struct fs_context *fc)
{
- static const struct dentry_operations ops = {
- .d_dname = simple_dname,
- };
-
- return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
+ return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
}
static struct file_system_type zsmalloc_fs = {
.name = "zsmalloc",
- .mount = zs_mount,
+ .init_fs_context = zs_init_fs_context,
.kill_sb = kill_anon_super,
};