Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 77
-rw-r--r--  mm/Makefile | 7
-rw-r--r--  mm/backing-dev.c | 66
-rw-r--r--  mm/balloon_compaction.c | 4
-rw-r--r--  mm/cma.c | 62
-rw-r--r--  mm/cma.h | 25
-rw-r--r--  mm/cma_debug.c | 8
-rw-r--r--  mm/cma_sysfs.c | 112
-rw-r--r--  mm/compaction.c | 115
-rw-r--r--  mm/debug.c | 25
-rw-r--r--  mm/debug_vm_pgtable.c | 71
-rw-r--r--  mm/dmapool.c | 7
-rw-r--r--  mm/filemap.c | 185
-rw-r--r--  mm/frontswap.c | 12
-rw-r--r--  mm/gup.c | 398
-rw-r--r--  mm/gup_test.c | 29
-rw-r--r--  mm/gup_test.h | 3
-rw-r--r--  mm/highmem.c | 13
-rw-r--r--  mm/huge_memory.c | 386
-rw-r--r--  mm/hugetlb.c | 942
-rw-r--r--  mm/hugetlb_cgroup.c | 9
-rw-r--r--  mm/internal.h | 126
-rw-r--r--  mm/interval_tree.c | 2
-rw-r--r--  mm/io-mapping.c | 29
-rw-r--r--  mm/ioremap.c | 225
-rw-r--r--  mm/kasan/Makefile | 4
-rw-r--r--  mm/kasan/common.c | 53
-rw-r--r--  mm/kasan/generic.c | 15
-rw-r--r--  mm/kasan/hw_tags.c | 104
-rw-r--r--  mm/kasan/init.c | 10
-rw-r--r--  mm/kasan/kasan.h | 84
-rw-r--r--  mm/kasan/quarantine.c | 4
-rw-r--r--  mm/kasan/report.c | 26
-rw-r--r--  mm/kasan/report_generic.c | 4
-rw-r--r--  mm/kasan/report_hw_tags.c | 5
-rw-r--r--  mm/kasan/report_sw_tags.c | 43
-rw-r--r--  mm/kasan/report_tags.c | 51
-rw-r--r--  mm/kasan/shadow.c | 20
-rw-r--r--  mm/kasan/sw_tags.c | 54
-rw-r--r--  mm/kasan/tags.c | 59
-rw-r--r--  mm/kfence/core.c | 59
-rw-r--r--  mm/kfence/kfence_test.c | 5
-rw-r--r--  mm/kfence/report.c | 2
-rw-r--r--  mm/khugepaged.c | 65
-rw-r--r--  mm/kmemleak.c | 20
-rw-r--r--  mm/ksm.c | 26
-rw-r--r--  mm/list_lru.c | 6
-rw-r--r--  mm/madvise.c | 4
-rw-r--r--  mm/mapping_dirty_helpers.c | 2
-rw-r--r--  mm/memblock.c | 8
-rw-r--r--  mm/memcontrol.c | 1168
-rw-r--r--  mm/memfd.c | 4
-rw-r--r--  mm/memory-failure.c | 457
-rw-r--r--  mm/memory.c | 276
-rw-r--r--  mm/memory_hotplug.c | 222
-rw-r--r--  mm/mempolicy.c | 98
-rw-r--r--  mm/mempool.c | 8
-rw-r--r--  mm/memremap.c | 2
-rw-r--r--  mm/migrate.c | 104
-rw-r--r--  mm/mlock.c | 26
-rw-r--r--  mm/mm_init.c | 4
-rw-r--r--  mm/mmap.c | 92
-rw-r--r--  mm/mmap_lock.c | 33
-rw-r--r--  mm/mmu_gather.c | 29
-rw-r--r--  mm/mprotect.c | 2
-rw-r--r--  mm/mremap.c | 13
-rw-r--r--  mm/msync.c | 6
-rw-r--r--  mm/nommu.c | 12
-rw-r--r--  mm/oom_kill.c | 6
-rw-r--r--  mm/page-writeback.c | 107
-rw-r--r--  mm/page_alloc.c | 1251
-rw-r--r--  mm/page_counter.c | 8
-rw-r--r--  mm/page_ext.c | 2
-rw-r--r--  mm/page_owner.c | 72
-rw-r--r--  mm/page_poison.c | 6
-rw-r--r--  mm/page_reporting.c | 19
-rw-r--r--  mm/page_reporting.h | 5
-rw-r--r--  mm/page_vma_mapped.c | 162
-rw-r--r--  mm/pagewalk.c | 58
-rw-r--r--  mm/percpu-internal.h | 2
-rw-r--r--  mm/percpu-vm.c | 7
-rw-r--r--  mm/percpu.c | 2
-rw-r--r--  mm/pgalloc-track.h | 6
-rw-r--r--  mm/pgtable-generic.c | 5
-rw-r--r--  mm/process_vm_access.c | 1
-rw-r--r--  mm/ptdump.c | 2
-rw-r--r--  mm/readahead.c | 101
-rw-r--r--  mm/rmap.c | 41
-rw-r--r--  mm/shmem.c | 67
-rw-r--r--  mm/shuffle.c | 4
-rw-r--r--  mm/shuffle.h | 4
-rw-r--r--  mm/slab.c | 53
-rw-r--r--  mm/slab.h | 49
-rw-r--r--  mm/slab_common.c | 85
-rw-r--r--  mm/slob.c | 2
-rw-r--r--  mm/slub.c | 551
-rw-r--r--  mm/sparse.c | 18
-rw-r--r--  mm/swap.c | 77
-rw-r--r--  mm/swap_slots.c | 4
-rw-r--r--  mm/swap_state.c | 39
-rw-r--r--  mm/swapfile.c | 183
-rw-r--r--  mm/truncate.c | 62
-rw-r--r--  mm/userfaultfd.c | 67
-rw-r--r--  mm/util.c | 39
-rw-r--r--  mm/vmalloc.c | 872
-rw-r--r--  mm/vmscan.c | 454
-rw-r--r--  mm/vmstat.c | 283
-rw-r--r--  mm/workingset.c | 3
-rw-r--r--  mm/z3fold.c | 2
-rw-r--r--  mm/zpool.c | 2
-rw-r--r--  mm/zsmalloc.c | 12
-rw-r--r--  mm/zswap.c | 2
112 files changed, 6969 insertions, 4055 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 24c045b24b95..ded98fb859ab 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -9,7 +9,6 @@ config SELECT_MEMORY_MODEL
choice
prompt "Memory model"
depends on SELECT_MEMORY_MODEL
- default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
help
@@ -20,7 +19,7 @@ choice
config FLATMEM_MANUAL
bool "Flat Memory"
- depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
+ depends on !ARCH_SPARSEMEM_ENABLE || ARCH_FLATMEM_ENABLE
help
This option is best suited for non-NUMA systems with
flat address space. The FLATMEM is the most efficient
@@ -33,21 +32,6 @@ config FLATMEM_MANUAL
If unsure, choose this option (Flat Memory) over any other.
-config DISCONTIGMEM_MANUAL
- bool "Discontiguous Memory"
- depends on ARCH_DISCONTIGMEM_ENABLE
- help
- This option provides enhanced support for discontiguous
- memory systems, over FLATMEM. These systems have holes
- in their physical address spaces, and this option provides
- more efficient handling of these holes.
-
- Although "Discontiguous Memory" is still used by several
- architectures, it is considered deprecated in favor of
- "Sparse Memory".
-
- If unsure, choose "Sparse Memory" over this option.
-
config SPARSEMEM_MANUAL
bool "Sparse Memory"
depends on ARCH_SPARSEMEM_ENABLE
@@ -63,30 +47,13 @@ config SPARSEMEM_MANUAL
endchoice
-config DISCONTIGMEM
- def_bool y
- depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
-
config SPARSEMEM
def_bool y
depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
config FLATMEM
def_bool y
- depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
-
-config FLAT_NODE_MEM_MAP
- def_bool y
- depends on !SPARSEMEM
-
-#
-# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
-# to represent different areas of memory. This variable allows
-# those dependencies to exist individually.
-#
-config NEED_MULTIPLE_NODES
- def_bool y
- depends on DISCONTIGMEM || NUMA
+ depends on !SPARSEMEM || FLATMEM_MANUAL
#
# SPARSEMEM_EXTREME (which is the default) does some bootmem
@@ -149,6 +116,9 @@ config MEMORY_ISOLATION
config HAVE_BOOTMEM_INFO_NODE
def_bool n
+config ARCH_ENABLE_MEMORY_HOTPLUG
+ bool
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
@@ -177,12 +147,20 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
Say N here if you want the default policy to keep all hot-plugged
memory blocks in 'offline' state.
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ bool
+
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
+config MHP_MEMMAP_ON_MEMORY
+ def_bool y
+ depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
+ depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
@@ -274,6 +252,13 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
config ARCH_ENABLE_THP_MIGRATION
bool
+config HUGETLB_PAGE_SIZE_VARIABLE
+ def_bool n
+ help
+ Allows the pageblock_order value to be dynamic instead of just standard
+ HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
+ on a platform.
+
config CONTIG_ALLOC
def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
@@ -283,12 +268,11 @@ config PHYS_ADDR_T_64BIT
config BOUNCE
bool "Enable bounce buffers"
default y
- depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+ depends on BLOCK && MMU && HIGHMEM
help
- Enable bounce buffers for devices that cannot access
- the full range of memory available to the CPU. Enabled
- by default when ZONE_DMA or HIGHMEM is selected, but you
- may say n to override this.
+ Enable bounce buffers for devices that cannot access the full range of
+ memory available to the CPU. Enabled by default when HIGHMEM is
+ selected, but you may say n to override this.
config VIRT_TO_BUS
bool
@@ -513,6 +497,13 @@ config CMA_DEBUGFS
help
Turns on the DebugFS interface for CMA.
+config CMA_SYSFS
+ bool "CMA information through sysfs interface"
+ depends on CMA && SYSFS
+ help
+ This option exposes some sysfs attributes to get information
+ from CMA.
+
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
@@ -760,6 +751,9 @@ config IDLE_PAGE_TRACKING
See Documentation/admin-guide/mm/idle_page_tracking.rst for
more details.
+config ARCH_HAS_CACHE_LINE_SIZE
+ bool
+
config ARCH_HAS_PTE_DEVMAP
bool
@@ -872,4 +866,7 @@ config MAPPING_DIRTY_HELPERS
config KMAP_LOCAL
bool
+# struct io_mapping based helper. Selected by drivers that need them
+config IO_MAPPING
+ bool
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 72227b24a616..bf71e295e9f6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -58,9 +58,13 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
page-alloc-y := page_alloc.o
page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
+# Give 'memory_hotplug' its own module-parameter namespace
+memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+
obj-y += page-alloc.o
obj-y += init-mm.o
obj-y += memblock.o
+obj-y += $(memory-hotplug-y)
ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
@@ -83,7 +87,6 @@ obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_KFENCE) += kfence/
obj-$(CONFIG_FAILSLAB) += failslab.o
-obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
@@ -109,6 +112,7 @@ obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
@@ -120,3 +124,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_IO_MAPPING) += io-mapping.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 576220acd686..271f2ca862c8 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -371,12 +371,16 @@ static void wb_exit(struct bdi_writeback *wb)
#include <linux/memcontrol.h>
/*
- * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
- * bdi->cgwb_tree is also RCU protected.
+ * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
+ * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;
+static LIST_HEAD(offline_cgwbs);
+static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
+static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
+
static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
@@ -395,7 +399,13 @@ static void cgwb_release_workfn(struct work_struct *work)
fprop_local_destroy_percpu(&wb->memcg_completions);
percpu_ref_exit(&wb->refcnt);
+
+ spin_lock_irq(&cgwb_lock);
+ list_del(&wb->offline_node);
+ spin_unlock_irq(&cgwb_lock);
+
wb_exit(wb);
+ WARN_ON_ONCE(!list_empty(&wb->b_attached));
kfree_rcu(wb, rcu);
}
@@ -413,6 +423,7 @@ static void cgwb_kill(struct bdi_writeback *wb)
WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
list_del(&wb->memcg_node);
list_del(&wb->blkcg_node);
+ list_add(&wb->offline_node, &offline_cgwbs);
percpu_ref_kill(&wb->refcnt);
}
@@ -472,6 +483,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
wb->memcg_css = memcg_css;
wb->blkcg_css = blkcg_css;
+ INIT_LIST_HEAD(&wb->b_attached);
INIT_WORK(&wb->release_work, cgwb_release_workfn);
set_bit(WB_registered, &wb->state);
@@ -633,6 +645,54 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
mutex_unlock(&bdi->cgwb_release_mutex);
}
+/*
+ * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
+ *
+ * Try to release dying cgwbs by switching attached inodes to the nearest
+ * living ancestor's writeback. Processed wbs are placed at the end
+ * of the list to guarantee the forward progress.
+ */
+static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
+{
+ struct bdi_writeback *wb;
+ LIST_HEAD(processed);
+
+ spin_lock_irq(&cgwb_lock);
+
+ while (!list_empty(&offline_cgwbs)) {
+ wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
+ offline_node);
+ list_move(&wb->offline_node, &processed);
+
+ /*
+ * If wb is dirty, cleaning up the writeback by switching
+ * attached inodes will result in an effective removal of any
+ * bandwidth restrictions, which isn't the goal. Instead,
+ * it can be postponed until the next time, when all io
+ * will be likely completed. If in the meantime some inodes
+ * will get re-dirtied, they should be eventually switched to
+ * a new cgwb.
+ */
+ if (wb_has_dirty_io(wb))
+ continue;
+
+ if (!wb_tryget(wb))
+ continue;
+
+ spin_unlock_irq(&cgwb_lock);
+ while (cleanup_offline_cgwb(wb))
+ cond_resched();
+ spin_lock_irq(&cgwb_lock);
+
+ wb_put(wb);
+ }
+
+ if (!list_empty(&processed))
+ list_splice_tail(&processed, &offline_cgwbs);
+
+ spin_unlock_irq(&cgwb_lock);
+}
+
/**
* wb_memcg_offline - kill all wb's associated with a memcg being offlined
* @memcg: memcg being offlined
@@ -649,6 +709,8 @@ void wb_memcg_offline(struct mem_cgroup *memcg)
cgwb_kill(wb);
memcg_cgwb_list->next = NULL; /* prevent new wb's */
spin_unlock_irq(&cgwb_lock);
+
+ queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}
/**
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 26de020aae7b..907fefde2572 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
/**
* balloon_page_list_dequeue() - removes pages from balloon's page list and
* returns a list of the pages.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
* @pages: pointer to the list of pages that would be returned to the caller.
* @n_req_pages: number of requested pages.
*
@@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue);
/*
* balloon_page_dequeue - removes a page from balloon's page list and returns
* its address to allow the driver to release the page.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
*
* Driver must call this function to properly dequeue a previously enqueued page
* before definitively releasing it back to the guest system.
diff --git a/mm/cma.c b/mm/cma.c
index 54eee2119822..995e15480937 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -24,7 +24,6 @@
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/mm.h>
-#include <linux/mutex.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/log2.h>
@@ -80,16 +79,17 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
}
static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
- unsigned int count)
+ unsigned long count)
{
unsigned long bitmap_no, bitmap_count;
+ unsigned long flags;
bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
- mutex_lock(&cma->lock);
+ spin_lock_irqsave(&cma->lock, flags);
bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
- mutex_unlock(&cma->lock);
+ spin_unlock_irqrestore(&cma->lock, flags);
}
static void __init cma_activate_area(struct cma *cma)
@@ -118,7 +118,7 @@ static void __init cma_activate_area(struct cma *cma)
pfn += pageblock_nr_pages)
init_cma_reserved_pageblock(pfn_to_page(pfn));
- mutex_init(&cma->lock);
+ spin_lock_init(&cma->lock);
#ifdef CONFIG_CMA_DEBUGFS
INIT_HLIST_HEAD(&cma->mem_head);
@@ -392,7 +392,7 @@ static void cma_debug_show_areas(struct cma *cma)
unsigned long nr_part, nr_total = 0;
unsigned long nbits = cma_bitmap_maxno(cma);
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
pr_info("number of available pages: ");
for (;;) {
next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
@@ -407,7 +407,7 @@ static void cma_debug_show_areas(struct cma *cma)
start = next_zero_bit + nr_zero;
}
pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
}
#else
static inline void cma_debug_show_areas(struct cma *cma) { }
@@ -423,25 +423,27 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
* This function allocates part of contiguous memory on specific
* contiguous memory area.
*/
-struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
- bool no_warn)
+struct page *cma_alloc(struct cma *cma, unsigned long count,
+ unsigned int align, bool no_warn)
{
unsigned long mask, offset;
unsigned long pfn = -1;
unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
- size_t i;
+ unsigned long i;
struct page *page = NULL;
int ret = -ENOMEM;
if (!cma || !cma->count || !cma->bitmap)
- return NULL;
+ goto out;
- pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
+ pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma,
count, align);
if (!count)
- return NULL;
+ goto out;
+
+ trace_cma_alloc_start(cma->name, count, align);
mask = cma_bitmap_aligned_mask(cma, align);
offset = cma_bitmap_aligned_offset(cma, align);
@@ -449,15 +451,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
if (bitmap_count > bitmap_maxno)
- return NULL;
+ goto out;
for (;;) {
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
bitmap_maxno, start, bitmap_count, mask,
offset);
if (bitmap_no >= bitmap_maxno) {
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
break;
}
bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
@@ -466,7 +468,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
* our exclusive use. If the migration fails we will take the
* lock again and unmark it.
*/
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
@@ -483,11 +485,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
pr_debug("%s(): memory range at %p is busy, retrying\n",
__func__, pfn_to_page(pfn));
+
+ trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
+ count, align);
/* try again with a bit different memory target */
start = bitmap_no + mask + 1;
}
- trace_cma_alloc(pfn, page, count, align);
+ trace_cma_alloc_finish(cma->name, pfn, page, count, align);
/*
* CMA can allocate multiple page blocks, which results in different
@@ -500,12 +505,22 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
}
if (ret && !no_warn) {
- pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n",
- __func__, cma->name, count, ret);
+ pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n",
+ __func__, cma->name, count, ret);
cma_debug_show_areas(cma);
}
pr_debug("%s(): returned %p\n", __func__, page);
+out:
+ if (page) {
+ count_vm_event(CMA_ALLOC_SUCCESS);
+ cma_sysfs_account_success_pages(cma, count);
+ } else {
+ count_vm_event(CMA_ALLOC_FAIL);
+ if (cma)
+ cma_sysfs_account_fail_pages(cma, count);
+ }
+
return page;
}
@@ -519,14 +534,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
* It returns false when provided pages do not belong to contiguous area and
* true otherwise.
*/
-bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
+bool cma_release(struct cma *cma, const struct page *pages,
+ unsigned long count)
{
unsigned long pfn;
if (!cma || !pages)
return false;
- pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count);
+ pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
pfn = page_to_pfn(pages);
@@ -537,7 +553,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
free_contig_range(pfn, count);
cma_clear_bitmap(cma, pfn, count);
- trace_cma_release(pfn, pages, count);
+ trace_cma_release(cma->name, pfn, pages, count);
return true;
}
diff --git a/mm/cma.h b/mm/cma.h
index 42ae082cb067..2c775877eae2 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -3,19 +3,33 @@
#define __MM_CMA_H__
#include <linux/debugfs.h>
+#include <linux/kobject.h>
+
+struct cma_kobject {
+ struct kobject kobj;
+ struct cma *cma;
+};
struct cma {
unsigned long base_pfn;
unsigned long count;
unsigned long *bitmap;
unsigned int order_per_bit; /* Order of pages represented by one bit */
- struct mutex lock;
+ spinlock_t lock;
#ifdef CONFIG_CMA_DEBUGFS
struct hlist_head mem_head;
spinlock_t mem_head_lock;
struct debugfs_u32_array dfs_bitmap;
#endif
char name[CMA_MAX_NAME];
+#ifdef CONFIG_CMA_SYSFS
+ /* the number of CMA page successful allocations */
+ atomic64_t nr_pages_succeeded;
+ /* the number of CMA page allocation failures */
+ atomic64_t nr_pages_failed;
+ /* kobject requires dynamic object */
+ struct cma_kobject *cma_kobj;
+#endif
};
extern struct cma cma_areas[MAX_CMA_AREAS];
@@ -26,4 +40,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma)
return cma->count >> cma->order_per_bit;
}
+#ifdef CONFIG_CMA_SYSFS
+void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages);
+void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages);
+#else
+static inline void cma_sysfs_account_success_pages(struct cma *cma,
+ unsigned long nr_pages) {};
+static inline void cma_sysfs_account_fail_pages(struct cma *cma,
+ unsigned long nr_pages) {};
+#endif
#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index d5bf8aa34fdc..2e7704955f4f 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -36,10 +36,10 @@ static int cma_used_get(void *data, u64 *val)
struct cma *cma = data;
unsigned long used;
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
/* pages counter is smaller than sizeof(int) */
used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
*val = (u64)used << cma->order_per_bit;
return 0;
@@ -53,7 +53,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
unsigned long start, end = 0;
unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
- mutex_lock(&cma->lock);
+ spin_lock_irq(&cma->lock);
for (;;) {
start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
if (start >= bitmap_maxno)
@@ -61,7 +61,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
end = find_next_bit(cma->bitmap, bitmap_maxno, start);
maxchunk = max(end - start, maxchunk);
}
- mutex_unlock(&cma->lock);
+ spin_unlock_irq(&cma->lock);
*val = (u64)maxchunk << cma->order_per_bit;
return 0;
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
new file mode 100644
index 000000000000..eb2f39caff59
--- /dev/null
+++ b/mm/cma_sysfs.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CMA SysFS Interface
+ *
+ * Copyright (c) 2021 Minchan Kim <minchan@kernel.org>
+ */
+
+#include <linux/cma.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "cma.h"
+
+#define CMA_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages)
+{
+ atomic64_add(nr_pages, &cma->nr_pages_succeeded);
+}
+
+void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages)
+{
+ atomic64_add(nr_pages, &cma->nr_pages_failed);
+}
+
+static inline struct cma *cma_from_kobj(struct kobject *kobj)
+{
+ return container_of(kobj, struct cma_kobject, kobj)->cma;
+}
+
+static ssize_t alloc_pages_success_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%llu\n",
+ atomic64_read(&cma->nr_pages_succeeded));
+}
+CMA_ATTR_RO(alloc_pages_success);
+
+static ssize_t alloc_pages_fail_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed));
+}
+CMA_ATTR_RO(alloc_pages_fail);
+
+static void cma_kobj_release(struct kobject *kobj)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+ struct cma_kobject *cma_kobj = cma->cma_kobj;
+
+ kfree(cma_kobj);
+ cma->cma_kobj = NULL;
+}
+
+static struct attribute *cma_attrs[] = {
+ &alloc_pages_success_attr.attr,
+ &alloc_pages_fail_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(cma);
+
+static struct kobj_type cma_ktype = {
+ .release = cma_kobj_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = cma_groups,
+};
+
+static int __init cma_sysfs_init(void)
+{
+ struct kobject *cma_kobj_root;
+ struct cma_kobject *cma_kobj;
+ struct cma *cma;
+ int i, err;
+
+ cma_kobj_root = kobject_create_and_add("cma", mm_kobj);
+ if (!cma_kobj_root)
+ return -ENOMEM;
+
+ for (i = 0; i < cma_area_count; i++) {
+ cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL);
+ if (!cma_kobj) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ cma = &cma_areas[i];
+ cma->cma_kobj = cma_kobj;
+ cma_kobj->cma = cma;
+ err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype,
+ cma_kobj_root, "%s", cma->name);
+ if (err) {
+ kobject_put(&cma_kobj->kobj);
+ goto out;
+ }
+ }
+
+ return 0;
+out:
+ while (--i >= 0) {
+ cma = &cma_areas[i];
+ kobject_put(&cma->cma_kobj->kobj);
+ }
+ kobject_put(cma_kobj_root);
+
+ return err;
+}
+subsys_initcall(cma_sysfs_init);
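
For reference, the new mm/cma_sysfs.c above registers a "cma" kobject under mm_kobj, so each CMA area exports alloc_pages_success and alloc_pages_fail under /sys/kernel/mm/cma/<area>/. A minimal user-space sketch of reading those counters follows; it is an illustration rather than part of the patch, and the area name "reserved" is only an assumption, since the actual names depend on the platform's CMA configuration.

/*
 * Illustrative reader for the per-CMA-area counters added above.
 * The area name "reserved" is an assumption; list /sys/kernel/mm/cma/
 * to see the names that exist on a given system.
 */
#include <stdio.h>

static long long read_counter(const char *path)
{
	FILE *f = fopen(path, "r");
	long long val = -1;

	if (!f)
		return -1;
	if (fscanf(f, "%lld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	const char *base = "/sys/kernel/mm/cma/reserved";
	char path[128];
	long long ok, fail;

	snprintf(path, sizeof(path), "%s/alloc_pages_success", base);
	ok = read_counter(path);
	snprintf(path, sizeof(path), "%s/alloc_pages_fail", base);
	fail = read_counter(path);

	printf("alloc_pages_success=%lld alloc_pages_fail=%lld\n", ok, fail);
	return 0;
}
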
diff --git a/mm/compaction.c b/mm/compaction.c
index e04f4476e68e..3a509fbf2bea 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -787,15 +787,14 @@ static bool too_many_isolated(pg_data_t *pgdat)
*
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). The range is expected to be within same pageblock.
- * Returns zero if there is a fatal signal pending, otherwise PFN of the
- * first page that was not scanned (which may be both less, equal to or more
- * than end_pfn).
+ * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion,
+ * -ENOMEM in case we could not allocate a page, or 0.
+ * cc->migrate_pfn will contain the next pfn to scan.
*
* The pages are isolated on cc->migratepages list (not required to be empty),
- * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
- * is neither read nor updated.
+ * and cc->nr_migratepages is updated accordingly.
*/
-static unsigned long
+static int
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long end_pfn, isolate_mode_t isolate_mode)
{
@@ -809,6 +808,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
bool skip_updated = false;
+ int ret = 0;
+
+ cc->migrate_pfn = low_pfn;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -818,16 +820,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
while (unlikely(too_many_isolated(pgdat))) {
/* stop isolation if there are still pages not migrated */
if (cc->nr_migratepages)
- return 0;
+ return -EAGAIN;
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
- return 0;
+ return -EAGAIN;
congestion_wait(BLK_RW_ASYNC, HZ/10);
if (fatal_signal_pending(current))
- return 0;
+ return -EINTR;
}
cond_resched();
@@ -875,8 +877,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (fatal_signal_pending(current)) {
cc->contended = true;
+ ret = -EINTR;
- low_pfn = 0;
goto fatal_pending;
}
@@ -904,6 +906,38 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
valid_page = page;
}
+ if (PageHuge(page) && cc->alloc_contig) {
+ ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
+
+ /*
+ * Fail isolation in case isolate_or_dissolve_huge_page()
+ * reports an error. In case of -ENOMEM, abort right away.
+ */
+ if (ret < 0) {
+ /* Do not report -EBUSY down the chain */
+ if (ret == -EBUSY)
+ ret = 0;
+ low_pfn += (1UL << compound_order(page)) - 1;
+ goto isolate_fail;
+ }
+
+ if (PageHuge(page)) {
+ /*
+ * Hugepage was successfully isolated and placed
+ * on the cc->migratepages list.
+ */
+ low_pfn += compound_nr(page) - 1;
+ goto isolate_success_no_list;
+ }
+
+ /*
+ * Ok, the hugepage was dissolved. Now these pages are
+ * Buddy and cannot be re-allocated because they are
+ * isolated. Fall-through as the check below handles
+ * Buddy pages.
+ */
+ }
+
/*
* Skip if free. We read page order here without zone lock
* which is generally unsafe, but the race window is small and
@@ -994,7 +1028,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!TestClearPageLRU(page))
goto isolate_fail_put;
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = mem_cgroup_page_lruvec(page);
/* If we already hold the lock, we can skip some rechecking */
if (lruvec != locked) {
@@ -1037,6 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
isolate_success:
list_add(&page->lru, &cc->migratepages);
+isolate_success_no_list:
cc->nr_migratepages += compound_nr(page);
nr_isolated += compound_nr(page);
@@ -1063,7 +1098,7 @@ isolate_fail_put:
put_page(page);
isolate_fail:
- if (!skip_on_failure)
+ if (!skip_on_failure && ret != -ENOMEM)
continue;
/*
@@ -1089,6 +1124,9 @@ isolate_fail:
*/
next_skip_pfn += 1UL << cc->order;
}
+
+ if (ret == -ENOMEM)
+ break;
}
/*
@@ -1130,7 +1168,9 @@ fatal_pending:
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
- return low_pfn;
+ cc->migrate_pfn = low_pfn;
+
+ return ret;
}
/**
@@ -1139,15 +1179,15 @@ fatal_pending:
* @start_pfn: The first PFN to start isolating.
* @end_pfn: The one-past-last PFN.
*
- * Returns zero if isolation fails fatally due to e.g. pending signal.
- * Otherwise, function returns one-past-the-last PFN of isolated page
- * (which may be greater than end_pfn if end fell in a middle of a THP page).
+ * Returns -EAGAIN when contented, -EINTR in case of a signal pending, -ENOMEM
+ * in case we could not allocate a page, or 0.
*/
-unsigned long
+int
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long pfn, block_start_pfn, block_end_pfn;
+ int ret = 0;
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
@@ -1166,17 +1206,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
block_end_pfn, cc->zone))
continue;
- pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
- ISOLATE_UNEVICTABLE);
+ ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
+ ISOLATE_UNEVICTABLE);
- if (!pfn)
+ if (ret)
break;
if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
break;
}
- return pfn;
+ return ret;
}
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
@@ -1847,7 +1887,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
*/
for (; block_end_pfn <= cc->free_pfn;
fast_find_block = false,
- low_pfn = block_end_pfn,
+ cc->migrate_pfn = low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
@@ -1889,10 +1929,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
}
/* Perform the isolation */
- low_pfn = isolate_migratepages_block(cc, low_pfn,
- block_end_pfn, isolate_mode);
-
- if (!low_pfn)
+ if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
+ isolate_mode))
return ISOLATE_ABORT;
/*
@@ -1903,9 +1941,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
break;
}
- /* Record where migration scanner will be restarted. */
- cc->migrate_pfn = low_pfn;
-
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
@@ -1920,7 +1955,7 @@ static inline bool is_via_compact_memory(int order)
static bool kswapd_is_running(pg_data_t *pgdat)
{
- return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+ return pgdat->kswapd && task_is_running(pgdat->kswapd);
}
/*
@@ -1977,8 +2012,8 @@ static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
unsigned int wmark_low;
/*
- * Cap the low watermak to avoid excessive compaction
- * activity in case a user sets the proactivess tunable
+ * Cap the low watermark to avoid excessive compaction
+ * activity in case a user sets the proactiveness tunable
* close to 100 (maximum).
*/
wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
@@ -2319,7 +2354,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
- migrate_prep_local();
+ /* lru_add_drain_all could be expensive with involving other CPUs */
+ lru_add_drain();
while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
int err;
@@ -2494,6 +2530,14 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
*/
WRITE_ONCE(current->capture_control, NULL);
*capture = READ_ONCE(capc.page);
+ /*
+ * Technically, it is also possible that compaction is skipped but
+ * the page is still captured out of luck(IRQ came and freed the page).
+ * Returning COMPACT_SUCCESS in such cases helps in properly accounting
+ * the COMPACT[STALL|FAIL] when compaction is skipped.
+ */
+ if (*capture)
+ ret = COMPACT_SUCCESS;
return ret;
}
@@ -2657,9 +2701,6 @@ static void compact_nodes(void)
compact_node(nid);
}
-/* The written value is actually unused, all memory is compacted */
-int sysctl_compact_memory;
-
/*
* Tunable for proactive compaction. It determines how
* aggressively the kernel should compact memory in the
@@ -2844,7 +2885,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
*/
static int kcompactd(void *p)
{
- pg_data_t *pgdat = (pg_data_t*)p;
+ pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
unsigned int proactive_defer = 0;
diff --git a/mm/debug.c b/mm/debug.c
index 0bdda8407f71..e73fe0a8ec3d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -42,11 +42,10 @@ const struct trace_print_flags vmaflag_names[] = {
{0, NULL}
};
-void __dump_page(struct page *page, const char *reason)
+static void __dump_page(struct page *page)
{
struct page *head = compound_head(page);
struct address_space *mapping;
- bool page_poisoned = PagePoisoned(page);
bool compound = PageCompound(page);
/*
* Accessing the pageblock without the zone lock. It could change to
@@ -58,16 +57,6 @@ void __dump_page(struct page *page, const char *reason)
int mapcount;
char *type = "";
- /*
- * If struct page is poisoned don't access Page*() functions as that
- * leads to recursive loop. Page*() check for poisoned pages, and calls
- * dump_page() when detected.
- */
- if (page_poisoned) {
- pr_warn("page:%px is uninitialized and poisoned", page);
- goto hex_only;
- }
-
if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
/*
* Corrupt page, so we cannot call page_mapping. Instead, do a
@@ -173,8 +162,6 @@ out_mapping:
pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags,
page_cma ? " CMA" : "");
-
-hex_only:
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
@@ -182,14 +169,16 @@ hex_only:
print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), head,
sizeof(struct page), false);
-
- if (reason)
- pr_warn("page dumped because: %s\n", reason);
}
void dump_page(struct page *page, const char *reason)
{
- __dump_page(page, reason);
+ if (PagePoisoned(page))
+ pr_warn("page:%p is uninitialized and poisoned", page);
+ else
+ __dump_page(page);
+ if (reason)
+ pr_warn("page dumped because: %s\n", reason);
dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index a9bd6ce1ba02..92bfc37300df 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -146,13 +146,14 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
static void __init pmd_basic_tests(unsigned long pfn, int idx)
{
pgprot_t prot = protection_map[idx];
- pmd_t pmd = pfn_pmd(pfn, prot);
unsigned long val = idx, *ptr = &val;
+ pmd_t pmd;
if (!has_transparent_hugepage())
return;
pr_debug("Validating PMD basic (%pGv)\n", ptr);
+ pmd = pfn_pmd(pfn, prot);
/*
* This test needs to be executed after the given page table entry
@@ -185,14 +186,14 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
unsigned long pfn, unsigned long vaddr,
pgprot_t prot, pgtable_t pgtable)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
if (!has_transparent_hugepage())
return;
pr_debug("Validating PMD advanced\n");
/* Align the address wrt HPAGE_PMD_SIZE */
- vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+ vaddr &= HPAGE_PMD_MASK;
pgtable_trans_huge_deposit(mm, pmdp, pgtable);
@@ -232,9 +233,14 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
pr_debug("Validating PMD leaf\n");
+ pmd = pfn_pmd(pfn, prot);
+
/*
* PMD based THP is a leaf entry.
*/
@@ -247,7 +253,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
{
pmd_t pmd;
- if (!arch_ioremap_pmd_supported())
+ if (!arch_vmap_pmd_supported(prot))
return;
pr_debug("Validating PMD huge\n");
@@ -267,12 +273,16 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
return;
+ if (!has_transparent_hugepage())
+ return;
+
pr_debug("Validating PMD saved write\n");
+ pmd = pfn_pmd(pfn, prot);
WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
}
@@ -281,13 +291,14 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx)
{
pgprot_t prot = protection_map[idx];
- pud_t pud = pfn_pud(pfn, prot);
unsigned long val = idx, *ptr = &val;
+ pud_t pud;
if (!has_transparent_hugepage())
return;
pr_debug("Validating PUD basic (%pGv)\n", ptr);
+ pud = pfn_pud(pfn, prot);
/*
* This test needs to be executed after the given page table entry
@@ -323,15 +334,16 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
unsigned long pfn, unsigned long vaddr,
pgprot_t prot)
{
- pud_t pud = pfn_pud(pfn, prot);
+ pud_t pud;
if (!has_transparent_hugepage())
return;
pr_debug("Validating PUD advanced\n");
/* Align the address wrt HPAGE_PUD_SIZE */
- vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
+ vaddr &= HPAGE_PUD_MASK;
+ pud = pfn_pud(pfn, prot);
set_pud_at(mm, vaddr, pudp, pud);
pudp_set_wrprotect(mm, vaddr, pudp);
pud = READ_ONCE(*pudp);
@@ -370,9 +382,13 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
{
- pud_t pud = pfn_pud(pfn, prot);
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
pr_debug("Validating PUD leaf\n");
+ pud = pfn_pud(pfn, prot);
/*
* PUD based THP is a leaf entry.
*/
@@ -385,7 +401,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
{
pud_t pud;
- if (!arch_ioremap_pud_supported())
+ if (!arch_vmap_pud_supported(prot))
return;
pr_debug("Validating PUD huge\n");
@@ -654,12 +670,16 @@ static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
{
- pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+ pmd_t pmd;
if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
return;
+ if (!has_transparent_hugepage())
+ return;
+
pr_debug("Validating PMD protnone\n");
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
WARN_ON(!pmd_protnone(pmd));
WARN_ON(!pmd_present(pmd));
}
@@ -679,18 +699,26 @@ static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
pr_debug("Validating PMD devmap\n");
+ pmd = pfn_pmd(pfn, prot);
WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
{
- pud_t pud = pfn_pud(pfn, prot);
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
pr_debug("Validating PUD devmap\n");
+ pud = pfn_pud(pfn, prot);
WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -733,25 +761,33 @@ static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
return;
+ if (!has_transparent_hugepage())
+ return;
+
pr_debug("Validating PMD soft dirty\n");
+ pmd = pfn_pmd(pfn, prot);
WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
}
static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
{
- pmd_t pmd = pfn_pmd(pfn, prot);
+ pmd_t pmd;
if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) ||
!IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
return;
+ if (!has_transparent_hugepage())
+ return;
+
pr_debug("Validating PMD swap soft dirty\n");
+ pmd = pfn_pmd(pfn, prot);
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
@@ -780,6 +816,9 @@ static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
swp_entry_t swp;
pmd_t pmd;
+ if (!has_transparent_hugepage())
+ return;
+
pr_debug("Validating PMD swap\n");
pmd = pfn_pmd(pfn, prot);
swp = __pmd_to_swp_entry(pmd);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f3791532fef2..64b537b3ccb0 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -62,8 +62,7 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
static DEFINE_MUTEX(pools_lock);
static DEFINE_MUTEX(pools_reg_lock);
-static ssize_t
-show_pools(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf)
{
unsigned temp;
unsigned size;
@@ -103,7 +102,7 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf)
return PAGE_SIZE - size;
}
-static DEVICE_ATTR(pools, 0444, show_pools, NULL);
+static DEVICE_ATTR_RO(pools);
/**
* dma_pool_create - Creates a pool of consistent memory blocks, for dma.
@@ -157,7 +156,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
if (!retval)
return retval;
- strlcpy(retval->name, name, sizeof(retval->name));
+ strscpy(retval->name, name, sizeof(retval->name));
retval->dev = dev;
diff --git a/mm/filemap.c b/mm/filemap.c
index 43700480d897..ac82a93d4f38 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -142,17 +142,6 @@ static void page_cache_delete(struct address_space *mapping,
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
-
- if (shadow) {
- mapping->nrexceptional += nr;
- /*
- * Make sure the nrexceptional update is committed before
- * the nrpages update so that final truncate racing
- * with reclaim does not see both counters 0 at the
- * same time and miss a shadow entry.
- */
- smp_wmb();
- }
mapping->nrpages -= nr;
}
@@ -629,13 +618,53 @@ EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
- if (dax_mapping(mapping))
- return mapping->nrexceptional;
-
return mapping->nrpages;
}
/**
+ * filemap_range_needs_writeback - check if range potentially needs writeback
+ * @mapping: address space within which to check
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback. Used by O_DIRECT
+ * read/write with IOCB_NOWAIT, to see if the caller needs to do
+ * filemap_write_and_wait_range() before proceeding.
+ *
+ * Return: %true if the caller should do filemap_write_and_wait_range() before
+ * doing O_DIRECT to a page in this range, %false otherwise.
+ */
+bool filemap_range_needs_writeback(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+ pgoff_t max = end_byte >> PAGE_SHIFT;
+ struct page *page;
+
+ if (!mapping_needs_writeback(mapping))
+ return false;
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+ return false;
+ if (end_byte < start_byte)
+ return false;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, max) {
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
+ continue;
+ if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
+ break;
+ }
+ rcu_read_unlock();
+ return page != NULL;
+}
+EXPORT_SYMBOL_GPL(filemap_range_needs_writeback);
+
+/**
* filemap_write_and_wait_range - write out & wait on a file range
* @mapping: the address_space for the pages
* @lstart: offset in bytes where the range starts
@@ -843,7 +872,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
page->index = offset;
if (!huge) {
- error = mem_cgroup_charge(page, current->mm, gfp);
+ error = mem_cgroup_charge(page, NULL, gfp);
if (error)
goto error;
charged = true;
@@ -882,8 +911,6 @@ noinline int __add_to_page_cache_locked(struct page *page,
if (xas_error(&xas))
goto unlock;
- if (old)
- mapping->nrexceptional--;
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
@@ -1433,6 +1460,67 @@ void unlock_page(struct page *page)
EXPORT_SYMBOL(unlock_page);
/**
+ * end_page_private_2 - Clear PG_private_2 and release any waiters
+ * @page: The page
+ *
+ * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for
+ * this. The page ref held for PG_private_2 being set is released.
+ *
+ * This is, for example, used when a netfs page is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same page to be
+ * serialised.
+ */
+void end_page_private_2(struct page *page)
+{
+ page = compound_head(page);
+ VM_BUG_ON_PAGE(!PagePrivate2(page), page);
+ clear_bit_unlock(PG_private_2, &page->flags);
+ wake_up_page_bit(page, PG_private_2);
+ put_page(page);
+}
+EXPORT_SYMBOL(end_page_private_2);
+
+/**
+ * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page
+ * @page: The page to wait on
+ *
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page.
+ */
+void wait_on_page_private_2(struct page *page)
+{
+ page = compound_head(page);
+ while (PagePrivate2(page))
+ wait_on_page_bit(page, PG_private_2);
+}
+EXPORT_SYMBOL(wait_on_page_private_2);
+
+/**
+ * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page
+ * @page: The page to wait on
+ *
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a
+ * fatal signal is received by the calling task.
+ *
+ * Return:
+ * - 0 if successful.
+ * - -EINTR if a fatal signal was encountered.
+ */
+int wait_on_page_private_2_killable(struct page *page)
+{
+ int ret = 0;
+
+ page = compound_head(page);
+ while (PagePrivate2(page)) {
+ ret = wait_on_page_bit_killable(page, PG_private_2);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_on_page_private_2_killable);
+
+/**
* end_page_writeback - end writeback against a page
* @page: the page
*/
@@ -1663,7 +1751,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
* @mapping: the address_space to search
* @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
+ * Looks up the page cache slot at @mapping & @index. If there is a
* page cache page, the head page is returned with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
@@ -1969,8 +2057,14 @@ unlock:
put:
put_page(page);
next:
- if (!xa_is_value(page) && PageTransHuge(page))
- xas_set(&xas, page->index + thp_nr_pages(page));
+ if (!xa_is_value(page) && PageTransHuge(page)) {
+ unsigned int nr_pages = thp_nr_pages(page);
+
+ /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */
+ xas_set(&xas, page->index + nr_pages);
+ if (xas.xa_index < nr_pages)
+ break;
+ }
}
rcu_read_unlock();
@@ -2238,8 +2332,6 @@ static int filemap_read_page(struct file *file, struct address_space *mapping,
return error;
if (PageUptodate(page))
return 0;
- if (!page->mapping) /* page truncated */
- return AOP_TRUNCATED_PAGE;
shrink_readahead_size_eio(&file->f_ra);
return -EIO;
}
@@ -2571,8 +2663,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
size = i_size_read(inode);
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_has_page(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1))
+ if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1))
return -EAGAIN;
} else {
retval = filemap_write_and_wait_range(mapping,
@@ -2663,7 +2755,7 @@ unsigned int seek_page_size(struct xa_state *xas, struct page *page)
* entirely memory-based such as tmpfs, and filesystems which support
* unwritten extents.
*
- * Return: The requested offset on successs, or -ENXIO if @whence specifies
+ * Return: The requested offset on success, or -ENXIO if @whence specifies
* SEEK_DATA and there is no data after @start. There is an implicit hole
* after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
* and @end contain data.
@@ -2672,7 +2764,7 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
loff_t end, int whence)
{
XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
- pgoff_t max = (end - 1) / PAGE_SIZE;
+ pgoff_t max = (end - 1) >> PAGE_SHIFT;
bool seek_data = (whence == SEEK_DATA);
struct page *page;
@@ -2681,7 +2773,8 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
rcu_read_lock();
while ((page = find_get_entry(&xas, max, XA_PRESENT))) {
- loff_t pos = xas.xa_index * PAGE_SIZE;
+ loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
+ unsigned int seek_size;
if (start < pos) {
if (!seek_data)
@@ -2689,25 +2782,25 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
start = pos;
}
- pos += seek_page_size(&xas, page);
+ seek_size = seek_page_size(&xas, page);
+ pos = round_up(pos + 1, seek_size);
start = page_seek_hole_data(&xas, mapping, page, start, pos,
seek_data);
if (start < pos)
goto unlock;
+ if (start >= end)
+ break;
+ if (seek_size > PAGE_SIZE)
+ xas_set(&xas, pos >> PAGE_SHIFT);
if (!xa_is_value(page))
put_page(page);
}
- rcu_read_unlock();
-
if (seek_data)
- return -ENXIO;
- goto out;
-
+ start = -ENXIO;
unlock:
rcu_read_unlock();
- if (!xa_is_value(page))
+ if (page && !xa_is_value(page))
put_page(page);
-out:
if (start > end)
return end;
return start;
@@ -2771,7 +2864,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
- DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
+ DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
struct file *fpin = NULL;
unsigned int mmap_miss;
@@ -2783,7 +2876,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_ra(&ractl, ra, ra->ra_pages);
+ page_cache_sync_ra(&ractl, ra->ra_pages);
return fpin;
}
@@ -2869,7 +2962,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
- struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
pgoff_t max_off;
@@ -2956,14 +3048,8 @@ page_not_uptodate:
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- ClearPageError(page);
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (!PageUptodate(page))
- error = -EIO;
- }
+ error = filemap_read_page(file, mapping, page);
if (fpin)
goto out_retry;
put_page(page);
@@ -2971,7 +3057,6 @@ page_not_uptodate:
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
- shrink_readahead_size_eio(ra);
return VM_FAULT_SIGBUS;
out_retry:
@@ -3182,7 +3267,7 @@ const struct vm_operations_struct generic_file_vm_ops = {
/* This is used for a general mmap of a disk file */
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct address_space *mapping = file->f_mapping;
@@ -3207,11 +3292,11 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
return VM_FAULT_SIGBUS;
}
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
-int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
@@ -3639,7 +3724,7 @@ EXPORT_SYMBOL(generic_perform_write);
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t written = 0;
ssize_t err;
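
The filemap_range_needs_writeback() helper added above is what lets an IOCB_NOWAIT direct read proceed when the range only holds clean cached pages, returning -EAGAIN only when writeback would actually be needed. A small user-space sketch that exercises this path follows; it is illustrative only, and the file path and 4 KiB alignment are assumptions.

/*
 * Illustrative only: issues a non-blocking O_DIRECT read, which maps to
 * IOCB_NOWAIT in the kernel. The path "/tmp/testfile" and the 4 KiB
 * buffer alignment are assumptions for the sake of the example.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/testfile";
	struct iovec iov;
	void *buf;
	ssize_t ret;
	int fd;

	fd = open(path, O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (posix_memalign(&buf, 4096, 4096)) {
		close(fd);
		return 1;
	}
	iov.iov_base = buf;
	iov.iov_len = 4096;

	ret = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
	if (ret < 0 && errno == EAGAIN)
		printf("read would block (range may still need writeback)\n");
	else if (ret < 0)
		perror("preadv2");
	else
		printf("read %zd bytes without blocking\n", ret);

	free(buf);
	close(fd);
	return 0;
}
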
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 2183a56c7874..130e301c5ac0 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -60,16 +60,20 @@ static u64 frontswap_succ_stores;
static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;
-static inline void inc_frontswap_loads(void) {
+static inline void inc_frontswap_loads(void)
+{
data_race(frontswap_loads++);
}
-static inline void inc_frontswap_succ_stores(void) {
+static inline void inc_frontswap_succ_stores(void)
+{
data_race(frontswap_succ_stores++);
}
-static inline void inc_frontswap_failed_stores(void) {
+static inline void inc_frontswap_failed_stores(void)
+{
data_race(frontswap_failed_stores++);
}
-static inline void inc_frontswap_invalidates(void) {
+static inline void inc_frontswap_invalidates(void)
+{
data_race(frontswap_invalidates++);
}
#else
diff --git a/mm/gup.c b/mm/gup.c
index ef7d2da9f03f..8651309f8ec3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -44,6 +44,23 @@ static void hpage_pincount_sub(struct page *page, int refs)
atomic_sub(refs, compound_pincount_ptr(page));
}
+/* Equivalent to calling put_page() @refs times. */
+static void put_page_refs(struct page *page, int refs)
+{
+#ifdef CONFIG_DEBUG_VM
+ if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
+ return;
+#endif
+
+ /*
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
+ * ref needs a put_page().
+ */
+ if (refs > 1)
+ page_ref_sub(page, refs - 1);
+ put_page(page);
+}
+
/*
* Return the compound head page with ref appropriately incremented,
* or NULL if that failed.
@@ -56,6 +73,21 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
return NULL;
if (unlikely(!page_cache_add_speculative(head, refs)))
return NULL;
+
+ /*
+ * At this point we have a stable reference to the head page; but it
+ * could be that between the compound_head() lookup and the refcount
+ * increment, the compound page was split, in which case we'd end up
+ * holding a reference on a page that has nothing to do with the page
+ * we were given anymore.
+ * So now that the head page is stable, recheck that the pages still
+ * belong together.
+ */
+ if (unlikely(compound_head(page) != head)) {
+ put_page_refs(head, refs);
+ return NULL;
+ }
+
return head;
}
@@ -87,11 +119,20 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
int orig_refs = refs;
/*
- * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
- * path, so fail and let the caller fall back to the slow path.
+ * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+ * right zone, so fail and let the caller fall back to the slow
+ * path.
+ */
+ if (unlikely((flags & FOLL_LONGTERM) &&
+ !is_pinnable_page(page)))
+ return NULL;
+
+ /*
+ * CAUTION: Don't use compound_head() on the page before this
+ * point, the result won't be stable.
*/
- if (unlikely(flags & FOLL_LONGTERM) &&
- is_migrate_cma_page(page))
+ page = try_get_compound_head(page, refs);
+ if (!page)
return NULL;
/*
@@ -102,15 +143,10 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
* However, be sure to *also* increment the normal page refcount
* field at least once, so that the page really is pinned.
*/
- if (!hpage_pincount_available(page))
- refs *= GUP_PIN_COUNTING_BIAS;
-
- page = try_get_compound_head(page, refs);
- if (!page)
- return NULL;
-
if (hpage_pincount_available(page))
hpage_pincount_add(page, refs);
+ else
+ page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
orig_refs);
@@ -134,14 +170,7 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
refs *= GUP_PIN_COUNTING_BIAS;
}
- VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
- /*
- * Calling put_page() for each ref is unnecessarily slow. Only the last
- * ref needs a put_page().
- */
- if (refs > 1)
- page_ref_sub(page, refs - 1);
- put_page(page);
+ put_page_refs(page, refs);
}
/**
@@ -213,6 +242,58 @@ void unpin_user_page(struct page *page)
}
EXPORT_SYMBOL(unpin_user_page);
+static inline void compound_range_next(unsigned long i, unsigned long npages,
+ struct page **list, struct page **head,
+ unsigned int *ntails)
+{
+ struct page *next, *page;
+ unsigned int nr = 1;
+
+ if (i >= npages)
+ return;
+
+ next = *list + i;
+ page = compound_head(next);
+ if (PageCompound(page) && compound_order(page) >= 1)
+ nr = min_t(unsigned int,
+ page + compound_nr(page) - next, npages - i);
+
+ *head = page;
+ *ntails = nr;
+}
+
+#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \
+ for (__i = 0, \
+ compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \
+ __i < __npages; __i += __ntails, \
+ compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
+
+static inline void compound_next(unsigned long i, unsigned long npages,
+ struct page **list, struct page **head,
+ unsigned int *ntails)
+{
+ struct page *page;
+ unsigned int nr;
+
+ if (i >= npages)
+ return;
+
+ page = compound_head(list[i]);
+ for (nr = i + 1; nr < npages; nr++) {
+ if (compound_head(list[nr]) != page)
+ break;
+ }
+
+ *head = page;
+ *ntails = nr - i;
+}
+
+#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
+ for (__i = 0, \
+ compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
+ __i < __npages; __i += __ntails, \
+ compound_next(__i, __npages, __list, &(__head), &(__ntails)))
+
/**
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.
@@ -239,20 +320,15 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty)
{
unsigned long index;
-
- /*
- * TODO: this can be optimized for huge pages: if a series of pages is
- * physically contiguous and part of the same compound page, then a
- * single operation to the head page should suffice.
- */
+ struct page *head;
+ unsigned int ntails;
if (!make_dirty) {
unpin_user_pages(pages, npages);
return;
}
- for (index = 0; index < npages; index++) {
- struct page *page = compound_head(pages[index]);
+ for_each_compound_head(index, pages, npages, head, ntails) {
/*
* Checking PageDirty at this point may race with
* clear_page_dirty_for_io(), but that's OK. Two key
@@ -273,14 +349,50 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
* written back, so it gets written back again in the
* next writeback cycle. This is harmless.
*/
- if (!PageDirty(page))
- set_page_dirty_lock(page);
- unpin_user_page(page);
+ if (!PageDirty(head))
+ set_page_dirty_lock(head);
+ put_compound_head(head, ntails, FOLL_PIN);
}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
/**
+ * unpin_user_page_range_dirty_lock() - release and optionally dirty
+ * gup-pinned page range
+ *
+ * @page: the starting page of a range that may be marked dirty, and is definitely released.
+ * @npages: number of consecutive pages to release.
+ * @make_dirty: whether to mark the pages dirty
+ *
+ * "gup-pinned page range" refers to a range of pages that has had one of the
+ * pin_user_pages() variants called on that range.
+ *
+ * For the page range defined by [page .. page+npages), make that range (or
+ * its head pages, if a compound page) dirty, if @make_dirty is true, and if
+ * the page range was previously listed as clean.
+ *
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty(), unpin_user_page().
+ *
+ */
+void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
+ bool make_dirty)
+{
+ unsigned long index;
+ struct page *head;
+ unsigned int ntails;
+
+ for_each_compound_range(index, &page, npages, head, ntails) {
+ if (make_dirty && !PageDirty(head))
+ set_page_dirty_lock(head);
+ put_compound_head(head, ntails, FOLL_PIN);
+ }
+}
+EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
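As an illustrative aside (not part of the patch), a caller that pinned a physically contiguous run of pages can now release the whole range with one call instead of looping over unpin_user_page(). A minimal sketch, assuming a hypothetical caller that tracked the first page and the length of the range it pinned:

	#include <linux/mm.h>

	static void example_release_pinned_range(struct page *first_page,
						 unsigned long nr_pinned,
						 bool dirtied)
	{
		/*
		 * Walks the range in compound-page-sized steps, marking the
		 * head pages dirty when requested, and drops the FOLL_PIN
		 * references taken by pin_user_pages*().
		 */
		unpin_user_page_range_dirty_lock(first_page, nr_pinned, dirtied);
	}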
+
+/**
* unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released.
* @npages: number of pages in the @pages array.
@@ -292,6 +404,8 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
void unpin_user_pages(struct page **pages, unsigned long npages)
{
unsigned long index;
+ struct page *head;
+ unsigned int ntails;
/*
* If this WARN_ON() fires, then the system *might* be leaking pages (by
@@ -300,16 +414,23 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
*/
if (WARN_ON(IS_ERR_VALUE(npages)))
return;
- /*
- * TODO: this can be optimized for huge pages: if a series of pages is
- * physically contiguous and part of the same compound page, then a
- * single operation to the head page should suffice.
- */
- for (index = 0; index < npages; index++)
- unpin_user_page(pages[index]);
+
+ for_each_compound_head(index, pages, npages, head, ntails)
+ put_compound_head(head, ntails, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_pages);
+/*
+ * Set MMF_HAS_PINNED if not set yet; once set, it stays for the rest of the
+ * mm's lifetime. Avoid setting the bit unless necessary, or it might cause
+ * write cache bouncing on large SMP machines for concurrent pinned gups.
+ */
+static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
+{
+ if (!test_bit(MMF_HAS_PINNED, mm_flags))
+ set_bit(MMF_HAS_PINNED, mm_flags);
+}
+
#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
@@ -435,18 +556,6 @@ retry:
}
}
- if (flags & FOLL_SPLIT && PageTransCompound(page)) {
- get_page(page);
- pte_unmap_unlock(ptep, ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (ret)
- return ERR_PTR(ret);
- goto retry;
- }
-
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
if (unlikely(!try_grab_page(page, flags))) {
page = ERR_PTR(-ENOMEM);
@@ -591,7 +700,7 @@ retry_locked:
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
+ if (flags & FOLL_SPLIT_PMD) {
int ret;
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
@@ -600,19 +709,7 @@ retry_locked:
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
- } else if (flags & FOLL_SPLIT) {
- if (unlikely(!try_get_page(page))) {
- spin_unlock(ptl);
- return ERR_PTR(-ENOMEM);
- }
- spin_unlock(ptl);
- lock_page(page);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (pmd_none(*pmd))
- return no_page_table(vma, flags);
- } else { /* flags & FOLL_SPLIT_PMD */
+ } else {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, address);
ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
@@ -1235,7 +1332,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
}
if (flags & FOLL_PIN)
- atomic_set(&mm->has_pinned, 1);
+ mm_set_has_pinned_flag(&mm->flags);
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
@@ -1470,7 +1567,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
{
struct vm_area_struct *vma;
unsigned long vm_flags;
- int i;
+ long i;
/* calculate required read or write permissions.
* If FOLL_FORCE is set, we only require the "MAY" flags.
@@ -1517,7 +1614,7 @@ finish_or_fault:
* Returns NULL on any kind of failure - a hole must then be inserted into
* the corefile, to preserve alignment with its headers; and also returns
* NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
+ * allowing a hole to be left in the corefile to save disk space.
*
* Called without mmap_lock (takes and releases the mmap_lock by itself).
*/
@@ -1535,120 +1632,96 @@ struct page *get_dump_page(unsigned long addr)
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
if (locked)
mmap_read_unlock(mm);
-
- if (ret == 1 && is_page_poisoned(page))
- return NULL;
-
return (ret == 1) ? page : NULL;
}
#endif /* CONFIG_ELF_CORE */
-#ifdef CONFIG_CMA
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
- unsigned long start,
- unsigned long nr_pages,
- struct page **pages,
- struct vm_area_struct **vmas,
- unsigned int gup_flags)
+#ifdef CONFIG_MIGRATION
+/*
+ * Check whether all pages are pinnable; if so, return the number of pages.
+ * If some pages are not pinnable, migrate them and unpin all pages. Return
+ * zero if pages were migrated, or if some pages were not successfully
+ * isolated. Return a negative error if migration fails.
+ */
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+ struct page **pages,
+ unsigned int gup_flags)
{
unsigned long i;
- unsigned long step;
+ unsigned long isolation_error_count = 0;
bool drain_allow = true;
- bool migrate_allow = true;
- LIST_HEAD(cma_page_list);
- long ret = nr_pages;
+ LIST_HEAD(movable_page_list);
+ long ret = 0;
+ struct page *prev_head = NULL;
+ struct page *head;
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
+ .gfp_mask = GFP_USER | __GFP_NOWARN,
};
-check_again:
- for (i = 0; i < nr_pages;) {
-
- struct page *head = compound_head(pages[i]);
-
- /*
- * gup may start from a tail page. Advance step by the left
- * part.
- */
- step = compound_nr(head) - (pages[i] - head);
+ for (i = 0; i < nr_pages; i++) {
+ head = compound_head(pages[i]);
+ if (head == prev_head)
+ continue;
+ prev_head = head;
/*
- * If we get a page from the CMA zone, since we are going to
- * be pinning these entries, we might as well move them out
- * of the CMA zone if possible.
+ * If we get a movable page, since we are going to be pinning
+ * these entries, try to move them out if possible.
*/
- if (is_migrate_cma_page(head)) {
- if (PageHuge(head))
- isolate_huge_page(head, &cma_page_list);
- else {
+ if (!is_pinnable_page(head)) {
+ if (PageHuge(head)) {
+ if (!isolate_huge_page(head, &movable_page_list))
+ isolation_error_count++;
+ } else {
if (!PageLRU(head) && drain_allow) {
lru_add_drain_all();
drain_allow = false;
}
- if (!isolate_lru_page(head)) {
- list_add_tail(&head->lru, &cma_page_list);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON +
- page_is_file_lru(head),
- thp_nr_pages(head));
+ if (isolate_lru_page(head)) {
+ isolation_error_count++;
+ continue;
}
+ list_add_tail(&head->lru, &movable_page_list);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON +
+ page_is_file_lru(head),
+ thp_nr_pages(head));
}
}
-
- i += step;
}
- if (!list_empty(&cma_page_list)) {
- /*
- * drop the above get_user_pages reference.
- */
- if (gup_flags & FOLL_PIN)
- unpin_user_pages(pages, nr_pages);
- else
- for (i = 0; i < nr_pages; i++)
- put_page(pages[i]);
-
- if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
- (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
- /*
- * some of the pages failed migration. Do get_user_pages
- * without migration.
- */
- migrate_allow = false;
+ /*
+ * If the list is empty and there were no isolation errors, all pages are
+ * in the correct zone.
+ */
+ if (list_empty(&movable_page_list) && !isolation_error_count)
+ return nr_pages;
- if (!list_empty(&cma_page_list))
- putback_movable_pages(&cma_page_list);
- }
- /*
- * We did migrate all the pages, Try to get the page references
- * again migrating any new CMA pages which we failed to isolate
- * earlier.
- */
- ret = __get_user_pages_locked(mm, start, nr_pages,
- pages, vmas, NULL,
- gup_flags);
-
- if ((ret > 0) && migrate_allow) {
- nr_pages = ret;
- drain_allow = true;
- goto check_again;
- }
+ if (gup_flags & FOLL_PIN) {
+ unpin_user_pages(pages, nr_pages);
+ } else {
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+ }
+ if (!list_empty(&movable_page_list)) {
+ ret = migrate_pages(&movable_page_list, alloc_migration_target,
+ NULL, (unsigned long)&mtc, MIGRATE_SYNC,
+ MR_LONGTERM_PIN);
+ if (ret && !list_empty(&movable_page_list))
+ putback_movable_pages(&movable_page_list);
}
- return ret;
+ return ret > 0 ? -ENOMEM : ret;
}
#else
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
- unsigned long start,
- unsigned long nr_pages,
- struct page **pages,
- struct vm_area_struct **vmas,
- unsigned int gup_flags)
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+ struct page **pages,
+ unsigned int gup_flags)
{
return nr_pages;
}
-#endif /* CONFIG_CMA */
+#endif /* CONFIG_MIGRATION */
/*
* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
@@ -1661,21 +1734,22 @@ static long __gup_longterm_locked(struct mm_struct *mm,
struct vm_area_struct **vmas,
unsigned int gup_flags)
{
- unsigned long flags = 0;
+ unsigned int flags;
long rc;
- if (gup_flags & FOLL_LONGTERM)
- flags = memalloc_nocma_save();
-
- rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
- gup_flags);
+ if (!(gup_flags & FOLL_LONGTERM))
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ NULL, gup_flags);
+ flags = memalloc_pin_save();
+ do {
+ rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ NULL, gup_flags);
+ if (rc <= 0)
+ break;
+ rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
+ } while (!rc);
+ memalloc_pin_restore(flags);
- if (gup_flags & FOLL_LONGTERM) {
- if (rc > 0)
- rc = check_and_migrate_cma_pages(mm, start, rc, pages,
- vmas, gup_flags);
- memalloc_nocma_restore(flags);
- }
return rc;
}
@@ -2579,7 +2653,7 @@ static int internal_get_user_pages_fast(unsigned long start,
return -EINVAL;
if (gup_flags & FOLL_PIN)
- atomic_set(&current->mm->has_pinned, 1);
+ mm_set_has_pinned_flag(&current->mm->flags);
if (!(gup_flags & FOLL_FAST_ONLY))
might_lock_read(&current->mm->mmap_lock);
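As an illustrative aside (not part of the patch), the caller-visible contract of the new FOLL_LONGTERM handling: pages that are not pinnable are migrated and the pin is retried internally, so a long-term user only has to inspect the return value. A minimal sketch with hypothetical names:

	#include <linux/mm.h>

	static int example_longterm_pin(unsigned long uaddr, int nr_pages,
					struct page **pages)
	{
		int pinned;

		/* Retries internally until every returned page is pinnable. */
		pinned = pin_user_pages_fast(uaddr, nr_pages,
					     FOLL_WRITE | FOLL_LONGTERM, pages);
		if (pinned < 0)
			return pinned;	/* e.g. -ENOMEM if migration failed */

		/* ... program long-lived DMA against the pinned pages ... */

		unpin_user_pages(pages, pinned);
		return 0;
	}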
diff --git a/mm/gup_test.c b/mm/gup_test.c
index e3cf78e5873e..d974dec19e1c 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -52,6 +52,12 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
dump_page(page, "gup_test failure");
break;
+ } else if (cmd == PIN_LONGTERM_BENCHMARK &&
+ WARN(!is_pinnable_page(page),
+ "pages[%lu] is NOT pinnable but pinned\n",
+ i)) {
+ dump_page(page, "gup_test failure");
+ break;
}
}
break;
@@ -94,7 +100,7 @@ static int __gup_test_ioctl(unsigned int cmd,
{
ktime_t start_time, end_time;
unsigned long i, nr_pages, addr, next;
- int nr;
+ long nr;
struct page **pages;
int ret = 0;
bool needs_mmap_lock =
@@ -126,37 +132,34 @@ static int __gup_test_ioctl(unsigned int cmd,
nr = (next - addr) / PAGE_SIZE;
}
- /* Filter out most gup flags: only allow a tiny subset here: */
- gup->flags &= FOLL_WRITE;
-
switch (cmd) {
case GUP_FAST_BENCHMARK:
- nr = get_user_pages_fast(addr, nr, gup->flags,
+ nr = get_user_pages_fast(addr, nr, gup->gup_flags,
pages + i);
break;
case GUP_BASIC_TEST:
- nr = get_user_pages(addr, nr, gup->flags, pages + i,
+ nr = get_user_pages(addr, nr, gup->gup_flags, pages + i,
NULL);
break;
case PIN_FAST_BENCHMARK:
- nr = pin_user_pages_fast(addr, nr, gup->flags,
+ nr = pin_user_pages_fast(addr, nr, gup->gup_flags,
pages + i);
break;
case PIN_BASIC_TEST:
- nr = pin_user_pages(addr, nr, gup->flags, pages + i,
+ nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i,
NULL);
break;
case PIN_LONGTERM_BENCHMARK:
nr = pin_user_pages(addr, nr,
- gup->flags | FOLL_LONGTERM,
+ gup->gup_flags | FOLL_LONGTERM,
pages + i, NULL);
break;
case DUMP_USER_PAGES_TEST:
- if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
- nr = pin_user_pages(addr, nr, gup->flags,
+ if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
+ nr = pin_user_pages(addr, nr, gup->gup_flags,
pages + i, NULL);
else
- nr = get_user_pages(addr, nr, gup->flags,
+ nr = get_user_pages(addr, nr, gup->gup_flags,
pages + i, NULL);
break;
default:
@@ -187,7 +190,7 @@ static int __gup_test_ioctl(unsigned int cmd,
start_time = ktime_get();
- put_back_pages(cmd, pages, nr_pages, gup->flags);
+ put_back_pages(cmd, pages, nr_pages, gup->test_flags);
end_time = ktime_get();
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
diff --git a/mm/gup_test.h b/mm/gup_test.h
index 90a6713d50eb..887ac1d5f5bc 100644
--- a/mm/gup_test.h
+++ b/mm/gup_test.h
@@ -21,7 +21,8 @@ struct gup_test {
__u64 addr;
__u64 size;
__u32 nr_pages_per_call;
- __u32 flags;
+ __u32 gup_flags;
+ __u32 test_flags;
/*
* Each non-zero entry is the number of the page (1-based: first page is
* page 1, so that zero entries mean "do nothing") from the .addr base.
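As an illustrative aside (not part of the patch), the gup_flags/test_flags split means a test caller now fills the two fields separately: gup_flags is forwarded to the get_user_pages()/pin_user_pages() call, while test_flags only steers the test harness. A rough userspace sketch, assuming the gup_test debugfs file sits at its usual /sys/kernel/debug/gup_test location:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include "gup_test.h"		/* this header, as copied by the selftest */

	#define FOLL_WRITE 0x01		/* mirrors the kernel flag, as the selftest does */

	static int example_pin_fast_benchmark(void *buf, unsigned long len)
	{
		struct gup_test gup = {
			.addr = (unsigned long)buf,
			.size = len,
			.nr_pages_per_call = 128,
			.gup_flags = FOLL_WRITE,	/* forwarded to the gup call */
			.test_flags = 0,		/* harness-only controls */
		};
		int ret, fd = open("/sys/kernel/debug/gup_test", O_RDWR);

		if (fd < 0)
			return -1;
		ret = ioctl(fd, PIN_FAST_BENCHMARK, &gup);
		close(fd);
		return ret;
	}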
diff --git a/mm/highmem.c b/mm/highmem.c
index 6ef8f5e05e7e..4fb51d735aa6 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
atomic_long_t _totalhigh_pages __read_mostly;
EXPORT_SYMBOL(_totalhigh_pages);
-unsigned int __nr_free_highpages (void)
+unsigned int __nr_free_highpages(void)
{
struct zone *zone;
unsigned int pages = 0;
@@ -120,7 +120,7 @@ unsigned int __nr_free_highpages (void)
static int pkmap_count[LAST_PKMAP];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
-pte_t * pkmap_page_table;
+pte_t *pkmap_page_table;
/*
* Most architectures have no use for kmap_high_get(), so let's abstract
@@ -147,6 +147,7 @@ struct page *__kmap_to_page(void *vaddr)
if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
int i = PKMAP_NR(addr);
+
return pte_page(pkmap_page_table[i]);
}
@@ -278,9 +279,8 @@ void *kmap_high(struct page *page)
pkmap_count[PKMAP_NR(vaddr)]++;
BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
unlock_kmap();
- return (void*) vaddr;
+ return (void *) vaddr;
}
-
EXPORT_SYMBOL(kmap_high);
#ifdef ARCH_NEEDS_KMAP_HIGH_GET
@@ -305,7 +305,7 @@ void *kmap_high_get(struct page *page)
pkmap_count[PKMAP_NR(vaddr)]++;
}
unlock_kmap_any(flags);
- return (void*) vaddr;
+ return (void *) vaddr;
}
#endif
@@ -519,7 +519,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
/*
* Disable migration so resulting virtual address is stable
- * accross preemption.
+ * across preemption.
*/
migrate_disable();
preempt_disable();
@@ -737,7 +737,6 @@ done:
spin_unlock_irqrestore(&pas->lock, flags);
return ret;
}
-
EXPORT_SYMBOL(page_address);
/**
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ae907a9c2050..6d2a0119fc58 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
@@ -61,6 +62,7 @@ static struct shrinker deferred_split_shrinker;
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
bool transparent_hugepage_enabled(struct vm_area_struct *vma)
{
@@ -77,18 +79,18 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
return false;
}
-static struct page *get_huge_zero_page(void)
+static bool get_huge_zero_page(void)
{
struct page *zero_page;
retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
- return READ_ONCE(huge_zero_page);
+ return true;
zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
if (!zero_page) {
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
- return NULL;
+ return false;
}
count_vm_event(THP_ZERO_PAGE_ALLOC);
preempt_disable();
@@ -97,11 +99,12 @@ retry:
__free_pages(zero_page, compound_order(zero_page));
goto retry;
}
+ WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
- return READ_ONCE(huge_zero_page);
+ return true;
}
static void put_huge_zero_page(void)
@@ -146,6 +149,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
+ WRITE_ONCE(huge_zero_pfn, ~0UL);
__free_pages(zero_page, compound_order(zero_page));
return HPAGE_PMD_NR;
}
@@ -624,14 +628,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
/* Deliver the page fault to userland */
if (userfaultfd_missing(vma)) {
- vm_fault_t ret2;
-
spin_unlock(vmf->ptl);
put_page(page);
pte_free(vma->vm_mm, pgtable);
- ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
- VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
- return ret2;
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ return ret;
}
entry = mk_huge_pmd(page, vma->vm_page_prot);
@@ -1293,7 +1295,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
}
page = pmd_page(orig_pmd);
- VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
/* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
@@ -1464,12 +1466,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
*/
page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
- if (target_nid == NUMA_NO_NODE) {
- /* If the page was locked, there are no parallel migrations */
- if (page_locked)
- goto clear_pmdnuma;
- }
-
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
page_nid = NUMA_NO_NODE;
@@ -1478,6 +1474,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
spin_unlock(vmf->ptl);
put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
goto out;
+ } else if (target_nid == NUMA_NO_NODE) {
+ /* There are no parallel migrations and the page is on the
+ * right node. Clear the numa hinting info in this pmd.
+ */
+ goto clear_pmdnuma;
}
/*
@@ -1696,7 +1697,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
- page = pfn_to_page(swp_offset(entry));
+ page = migration_entry_to_page(entry);
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1794,8 +1795,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
/*
* Returns
* - 0 if PMD could not be locked
- * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
- * - HPAGE_PMD_NR is protections changed and TLB flush necessary
+ * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ * - HPAGE_PMD_NR if protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
@@ -2046,7 +2047,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
count_vm_event(THP_SPLIT_PMD);
if (!vma_is_anonymous(vma)) {
- _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
@@ -2055,16 +2056,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
zap_deposited_table(mm, pmd);
if (vma_is_special_huge(vma))
return;
- page = pmd_page(_pmd);
- if (!PageDirty(page) && pmd_dirty(_pmd))
- set_page_dirty(page);
- if (!PageReferenced(page) && pmd_young(_pmd))
- SetPageReferenced(page);
- page_remove_rmap(page, true);
- put_page(page);
+ if (unlikely(is_pmd_migration_entry(old_pmd))) {
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(old_pmd);
+ page = migration_entry_to_page(entry);
+ } else {
+ page = pmd_page(old_pmd);
+ if (!PageDirty(page) && pmd_dirty(old_pmd))
+ set_page_dirty(page);
+ if (!PageReferenced(page) && pmd_young(old_pmd))
+ SetPageReferenced(page);
+ page_remove_rmap(page, true);
+ put_page(page);
+ }
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
- } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
+ }
+
+ if (is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
@@ -2104,7 +2114,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = pfn_to_page(swp_offset(entry));
+ page = migration_entry_to_page(entry);
write = is_write_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
@@ -2303,60 +2313,54 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
__split_huge_pmd(vma, pmd, address, freeze, page);
}
+static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
+{
+ /*
+ * If the new address isn't hpage aligned and it could previously
+ * contain a hugepage: check if we need to split a huge pmd.
+ */
+ if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
+ range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
+ ALIGN(address, HPAGE_PMD_SIZE)))
+ split_huge_pmd_address(vma, address, false, NULL);
+}
+
void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next)
{
- /*
- * If the new start address isn't hpage aligned and it could
- * previously contain an hugepage: check if we need to split
- * an huge pmd.
- */
- if (start & ~HPAGE_PMD_MASK &&
- (start & HPAGE_PMD_MASK) >= vma->vm_start &&
- (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, start, false, NULL);
+ /* Check if we need to split start first. */
+ split_huge_pmd_if_needed(vma, start);
- /*
- * If the new end address isn't hpage aligned and it could
- * previously contain an hugepage: check if we need to split
- * an huge pmd.
- */
- if (end & ~HPAGE_PMD_MASK &&
- (end & HPAGE_PMD_MASK) >= vma->vm_start &&
- (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, end, false, NULL);
+ /* Check if we need to split end next. */
+ split_huge_pmd_if_needed(vma, end);
/*
- * If we're also updating the vma->vm_next->vm_start, if the new
- * vm_next->vm_start isn't hpage aligned and it could previously
- * contain an hugepage: check if we need to split an huge pmd.
+ * If we're also updating the vma->vm_next->vm_start,
+ * check if we need to split it.
*/
if (adjust_next > 0) {
struct vm_area_struct *next = vma->vm_next;
unsigned long nstart = next->vm_start;
nstart += adjust_next;
- if (nstart & ~HPAGE_PMD_MASK &&
- (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
- (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_pmd_address(next, nstart, false, NULL);
+ split_huge_pmd_if_needed(next, nstart);
}
}
static void unmap_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
- bool unmap_success;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
ttu_flags |= TTU_SPLIT_FREEZE;
- unmap_success = try_to_unmap(page, ttu_flags);
- VM_BUG_ON_PAGE(!unmap_success, page);
+ try_to_unmap(page, ttu_flags);
+
+ VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
static void remap_page(struct page *page, unsigned int nr)
@@ -2477,7 +2481,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}
- /* lock lru list/PageCompound, ref freezed by page_ref_freeze */
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
lruvec = lock_page_lruvec(head);
for (i = nr - 1; i >= 1; i--) {
@@ -2667,7 +2671,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
- int count, mapcount, extra_pins, ret;
+ int extra_pins, ret;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
@@ -2726,7 +2730,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
unmap_page(head);
- VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
@@ -2744,9 +2747,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- count = page_count(head);
- mapcount = total_mapcount(head);
- if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
+ if (page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
@@ -2766,16 +2767,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
__split_huge_page(page, list, end);
ret = 0;
} else {
- if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- pr_alert("total_mapcount: %u, page_count(): %u\n",
- mapcount, count);
- if (PageTail(page))
- dump_page(head, NULL);
- dump_page(page, "total_mapcount(head) > 0");
- BUG();
- }
spin_unlock(&ds_queue->split_queue_lock);
-fail: if (mapping)
+fail:
+ if (mapping)
xa_unlock(&mapping->i_pages);
local_irq_enable();
remap_page(head, thp_nr_pages(head));
@@ -2838,8 +2832,8 @@ void deferred_split_huge_page(struct page *page)
ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
if (memcg)
- memcg_set_shrinker_bit(memcg, page_to_nid(page),
- deferred_split_shrinker.id);
+ set_shrinker_bit(memcg, page_to_nid(page),
+ deferred_split_shrinker.id);
#endif
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
@@ -2924,16 +2918,14 @@ static struct shrinker deferred_split_shrinker = {
};
#ifdef CONFIG_DEBUG_FS
-static int split_huge_pages_set(void *data, u64 val)
+static void split_huge_pages_all(void)
{
struct zone *zone;
struct page *page;
unsigned long pfn, max_zone_pfn;
unsigned long total = 0, split = 0;
- if (val != 1)
- return -EINVAL;
-
+ pr_debug("Split all THPs\n");
for_each_populated_zone(zone) {
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
@@ -2957,15 +2949,243 @@ static int split_huge_pages_set(void *data, u64 val)
unlock_page(page);
next:
put_page(page);
+ cond_resched();
}
}
- pr_info("%lu of %lu THP split\n", split, total);
+ pr_debug("%lu of %lu THP split\n", split, total);
+}
- return 0;
+static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
+{
+ return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
+ is_vm_hugetlb_page(vma);
}
-DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
- "%llu\n");
+
+static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
+ unsigned long vaddr_end)
+{
+ int ret = 0;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ unsigned long total = 0, split = 0;
+ unsigned long addr;
+
+ vaddr_start &= PAGE_MASK;
+ vaddr_end &= PAGE_MASK;
+
+ /* Find the task_struct from pid */
+ rcu_read_lock();
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ ret = -ESRCH;
+ goto out;
+ }
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ /* Find the mm_struct */
+ mm = get_task_mm(task);
+ put_task_struct(task);
+
+ if (!mm) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
+ pid, vaddr_start, vaddr_end);
+
+ mmap_read_lock(mm);
+ /*
+ * always increase addr by PAGE_SIZE, since we could have a PTE page
+ * table filled with PTE-mapped THPs, each of which is distinct.
+ */
+ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
+ struct vm_area_struct *vma = find_vma(mm, addr);
+ unsigned int follflags;
+ struct page *page;
+
+ if (!vma || addr < vma->vm_start)
+ break;
+
+ /* skip special VMA and hugetlb VMA */
+ if (vma_not_suitable_for_thp_split(vma)) {
+ addr = vma->vm_end;
+ continue;
+ }
+
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ follflags = FOLL_GET | FOLL_DUMP;
+ page = follow_page(vma, addr, follflags);
+
+ if (IS_ERR(page))
+ continue;
+ if (!page)
+ continue;
+
+ if (!is_transparent_hugepage(page))
+ goto next;
+
+ total++;
+ if (!can_split_huge_page(compound_head(page), NULL))
+ goto next;
+
+ if (!trylock_page(page))
+ goto next;
+
+ if (!split_huge_page(page))
+ split++;
+
+ unlock_page(page);
+next:
+ put_page(page);
+ cond_resched();
+ }
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ pr_debug("%lu of %lu THP split\n", split, total);
+
+out:
+ return ret;
+}
+
+static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
+ pgoff_t off_end)
+{
+ struct filename *file;
+ struct file *candidate;
+ struct address_space *mapping;
+ int ret = -EINVAL;
+ pgoff_t index;
+ int nr_pages = 1;
+ unsigned long total = 0, split = 0;
+
+ file = getname_kernel(file_path);
+ if (IS_ERR(file))
+ return ret;
+
+ candidate = file_open_name(file, O_RDONLY, 0);
+ if (IS_ERR(candidate))
+ goto out;
+
+ pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
+ file_path, off_start, off_end);
+
+ mapping = candidate->f_mapping;
+
+ for (index = off_start; index < off_end; index += nr_pages) {
+ struct page *fpage = pagecache_get_page(mapping, index,
+ FGP_ENTRY | FGP_HEAD, 0);
+
+ nr_pages = 1;
+ if (xa_is_value(fpage) || !fpage)
+ continue;
+
+ if (!is_transparent_hugepage(fpage))
+ goto next;
+
+ total++;
+ nr_pages = thp_nr_pages(fpage);
+
+ if (!trylock_page(fpage))
+ goto next;
+
+ if (!split_huge_page(fpage))
+ split++;
+
+ unlock_page(fpage);
+next:
+ put_page(fpage);
+ cond_resched();
+ }
+
+ filp_close(candidate, NULL);
+ ret = 0;
+
+ pr_debug("%lu of %lu file-backed THP split\n", split, total);
+out:
+ putname(file);
+ return ret;
+}
+
+#define MAX_INPUT_BUF_SZ 255
+
+static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ static DEFINE_MUTEX(split_debug_mutex);
+ ssize_t ret;
+ /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
+ char input_buf[MAX_INPUT_BUF_SZ];
+ int pid;
+ unsigned long vaddr_start, vaddr_end;
+
+ ret = mutex_lock_interruptible(&split_debug_mutex);
+ if (ret)
+ return ret;
+
+ ret = -EFAULT;
+
+ memset(input_buf, 0, MAX_INPUT_BUF_SZ);
+ if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
+ goto out;
+
+ input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
+
+ if (input_buf[0] == '/') {
+ char *tok;
+ char *buf = input_buf;
+ char file_path[MAX_INPUT_BUF_SZ];
+ pgoff_t off_start = 0, off_end = 0;
+ size_t input_len = strlen(input_buf);
+
+ tok = strsep(&buf, ",");
+ if (tok) {
+ strncpy(file_path, tok, MAX_INPUT_BUF_SZ);
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
+ if (ret != 2) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = split_huge_pages_in_file(file_path, off_start, off_end);
+ if (!ret)
+ ret = input_len;
+
+ goto out;
+ }
+
+ ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
+ if (ret == 1 && pid == 1) {
+ split_huge_pages_all();
+ ret = strlen(input_buf);
+ goto out;
+ } else if (ret != 3) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
+ if (!ret)
+ ret = strlen(input_buf);
+out:
+ mutex_unlock(&split_debug_mutex);
+ return ret;
+}
+
+static const struct file_operations split_huge_pages_fops = {
+ .owner = THIS_MODULE,
+ .write = split_huge_pages_write,
+ .llseek = no_llseek,
+};
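As an illustrative aside (not part of the patch), the new write interface accepts three forms: a bare "1" (split every THP, the old behaviour), "<pid>,<vaddr_start>,<vaddr_end>" with 0x-prefixed hex addresses (split THPs mapped by that process in the given range), and "<path>,<off_start>,<off_end>" (split file-backed THPs by page-cache offset). A small userspace sketch, assuming debugfs is mounted at /sys/kernel/debug:

	#include <stdio.h>

	/* Each command must be issued as a single write to the debugfs file. */
	static int example_split_request(const char *cmd)
	{
		FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");
		int ret;

		if (!f)
			return -1;
		ret = (fprintf(f, "%s", cmd) < 0) ? -1 : 0;
		fclose(f);
		return ret;
	}

	/*
	 * example_split_request("1");                                  split all THPs
	 * example_split_request("1234,0x7f0000000000,0x7f0000200000"); pid 1234, VA range
	 * example_split_request("/mnt/data/file,0x0,0x200");           file offsets 0-0x200
	 */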
static int __init split_huge_pages_debugfs(void)
{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a86a58ef132d..103f1187043f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -39,7 +39,6 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
-#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"
@@ -94,9 +93,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool)
return true;
}
-static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
+ unsigned long irq_flags)
{
- spin_unlock(&spool->lock);
+ spin_unlock_irqrestore(&spool->lock, irq_flags);
/* If no pages are used, and no other handles to the subpool
* remain, give up any reservations based on minimum size and
@@ -135,10 +135,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
- spin_lock(&spool->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&spool->lock, flags);
BUG_ON(!spool->count);
spool->count--;
- unlock_or_release_subpool(spool);
+ unlock_or_release_subpool(spool, flags);
}
/*
@@ -157,7 +159,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
if (!spool)
return ret;
- spin_lock(&spool->lock);
+ spin_lock_irq(&spool->lock);
if (spool->max_hpages != -1) { /* maximum size accounting */
if ((spool->used_hpages + delta) <= spool->max_hpages)
@@ -184,7 +186,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
}
unlock_ret:
- spin_unlock(&spool->lock);
+ spin_unlock_irq(&spool->lock);
return ret;
}
@@ -198,11 +200,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
long delta)
{
long ret = delta;
+ unsigned long flags;
if (!spool)
return delta;
- spin_lock(&spool->lock);
+ spin_lock_irqsave(&spool->lock, flags);
if (spool->max_hpages != -1) /* maximum size accounting */
spool->used_hpages -= delta;
@@ -223,7 +226,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
* If hugetlbfs_put_super couldn't free spool due to an outstanding
* quota reference, free it now.
*/
- unlock_or_release_subpool(spool);
+ unlock_or_release_subpool(spool, flags);
return ret;
}
@@ -463,7 +466,7 @@ static int allocate_file_region_entries(struct resv_map *resv,
resv->region_cache_count;
/* At this point, we should have enough entries in the cache
- * for all the existings adds_in_progress. We should only be
+ * for all the existing adds_in_progress. We should only be
* needing to allocate for regions_needed.
*/
VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
@@ -553,7 +556,6 @@ retry:
resv->adds_in_progress -= in_regions_needed;
spin_unlock(&resv->lock);
- VM_BUG_ON(add < 0);
return add;
}
@@ -743,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
{
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
+ bool reserved = false;
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
- if (rsv_adjust) {
+ if (rsv_adjust > 0) {
struct hstate *h = hstate_inode(inode);
- hugetlb_acct_memory(h, 1);
+ if (!hugetlb_acct_memory(h, 1))
+ reserved = true;
+ } else if (!rsv_adjust) {
+ reserved = true;
}
+
+ if (!reserved)
+ pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}
/*
@@ -1059,6 +1068,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
+
+ lockdep_assert_held(&hugetlb_lock);
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
@@ -1068,10 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
- bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+ bool pin = !!(current->flags & PF_MEMALLOC_PIN);
+ lockdep_assert_held(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
- if (nocma && is_migrate_cma_page(page))
+ if (pin && !is_pinnable_page(page))
continue;
if (PageHWPoison(page))
@@ -1205,7 +1217,7 @@ static int hstate_next_node_to_alloc(struct hstate *h,
}
/*
- * helper for free_pool_huge_page() - return the previously saved
+ * helper for remove_pool_huge_page() - return the previously saved
* node ["this node"] from which to free a huge page. Advance the
* next node id whether or not we find a free huge page to free so
* that the next attempt to free addresses the next node.
@@ -1273,7 +1285,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
- unsigned long nr_pages = 1UL << huge_page_order(h);
+ unsigned long nr_pages = pages_per_huge_page(h);
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
@@ -1327,6 +1339,42 @@ static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
#endif
+/*
+ * Remove hugetlb page from lists, and update dtor so that page appears
+ * as just a compound page. A reference is held on the page.
+ *
+ * Must be called with hugetlb lock held.
+ */
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ int nid = page_to_nid(page);
+
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+
+ lockdep_assert_held(&hugetlb_lock);
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ return;
+
+ list_del(&page->lru);
+
+ if (HPageFreed(page)) {
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ }
+ if (adjust_surplus) {
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[nid]--;
+ }
+
+ set_page_refcounted(page);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+
+ h->nr_huge_pages--;
+ h->nr_huge_pages_node[nid]--;
+}
+
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
@@ -1335,8 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page)
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
- h->nr_huge_pages--;
- h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1344,24 +1390,24 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
}
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
- set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
- set_page_refcounted(page);
if (hstate_is_gigantic(h)) {
- /*
- * Temporarily drop the hugetlb_lock, because
- * we might block in free_gigantic_page().
- */
- spin_unlock(&hugetlb_lock);
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
- spin_lock(&hugetlb_lock);
} else {
__free_pages(page, huge_page_order(h));
}
}
+static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+{
+ struct page *page, *t_page;
+
+ list_for_each_entry_safe(page, t_page, list, lru) {
+ update_and_free_page(h, page);
+ cond_resched();
+ }
+}
+
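As an illustrative aside (not part of the patch), the pattern these helpers establish inside hugetlb.c: unlink pages and fix the counters under hugetlb_lock with remove_hugetlb_page(), collect them on a local list, and only hand them back to the allocator after dropping the (now irq-safe) lock. A condensed sketch, assuming a non-gigantic hstate and an arbitrary choice of node 0:

	static void example_free_node0_free_list(struct hstate *h)
	{
		LIST_HEAD(page_list);
		struct page *page, *next;

		spin_lock_irq(&hugetlb_lock);
		list_for_each_entry_safe(page, next,
					 &h->hugepage_freelists[0], lru) {
			/* unlink from the hstate and adjust its counters */
			remove_hugetlb_page(h, page, false);
			list_add(&page->lru, &page_list);
		}
		spin_unlock_irq(&hugetlb_lock);

		/* The freeing itself (which may reschedule) runs unlocked. */
		update_and_free_pages_bulk(h, &page_list);
	}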
struct hstate *size_to_hstate(unsigned long size)
{
struct hstate *h;
@@ -1373,7 +1419,7 @@ struct hstate *size_to_hstate(unsigned long size)
return NULL;
}
-static void __free_huge_page(struct page *page)
+void free_huge_page(struct page *page)
{
/*
* Can't pass hstate in here because it is called from the
@@ -1383,6 +1429,7 @@ static void __free_huge_page(struct page *page)
int nid = page_to_nid(page);
struct hugepage_subpool *spool = hugetlb_page_subpool(page);
bool restore_reserve;
+ unsigned long flags;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
@@ -1411,7 +1458,7 @@ static void __free_huge_page(struct page *page)
restore_reserve = true;
}
- spin_lock(&hugetlb_lock);
+ spin_lock_irqsave(&hugetlb_lock, flags);
ClearHPageMigratable(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
@@ -1421,82 +1468,46 @@ static void __free_huge_page(struct page *page)
h->resv_huge_pages++;
if (HPageTemporary(page)) {
- list_del(&page->lru);
- ClearHPageTemporary(page);
+ remove_hugetlb_page(h, page, false);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
update_and_free_page(h, page);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
- list_del(&page->lru);
+ remove_hugetlb_page(h, page, true);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
update_and_free_page(h, page);
- h->surplus_huge_pages--;
- h->surplus_huge_pages_node[nid]--;
} else {
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
}
- spin_unlock(&hugetlb_lock);
}
/*
- * As free_huge_page() can be called from a non-task context, we have
- * to defer the actual freeing in a workqueue to prevent potential
- * hugetlb_lock deadlock.
- *
- * free_hpage_workfn() locklessly retrieves the linked list of pages to
- * be freed and frees them one-by-one. As the page->mapping pointer is
- * going to be cleared in __free_huge_page() anyway, it is reused as the
- * llist_node structure of a lockless linked list of huge pages to be freed.
+ * Must be called with the hugetlb lock held
*/
-static LLIST_HEAD(hpage_freelist);
-
-static void free_hpage_workfn(struct work_struct *work)
-{
- struct llist_node *node;
- struct page *page;
-
- node = llist_del_all(&hpage_freelist);
-
- while (node) {
- page = container_of((struct address_space **)node,
- struct page, mapping);
- node = node->next;
- __free_huge_page(page);
- }
-}
-static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
-
-void free_huge_page(struct page *page)
+static void __prep_account_new_huge_page(struct hstate *h, int nid)
{
- /*
- * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
- */
- if (!in_task()) {
- /*
- * Only call schedule_work() if hpage_freelist is previously
- * empty. Otherwise, schedule_work() had been called but the
- * workfn hasn't retrieved the list yet.
- */
- if (llist_add((struct llist_node *)&page->mapping,
- &hpage_freelist))
- schedule_work(&free_hpage_work);
- return;
- }
-
- __free_huge_page(page);
+ lockdep_assert_held(&hugetlb_lock);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+static void __prep_new_huge_page(struct page *page)
{
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
- spin_lock(&hugetlb_lock);
- h->nr_huge_pages++;
- h->nr_huge_pages_node[nid]++;
- ClearHPageFreed(page);
- spin_unlock(&hugetlb_lock);
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+ __prep_new_huge_page(page);
+ spin_lock_irq(&hugetlb_lock);
+ __prep_account_new_huge_page(h, nid);
+ spin_unlock_irq(&hugetlb_lock);
}
static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1577,15 +1588,12 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
return NULL;
}
-pgoff_t __basepage_index(struct page *page)
+pgoff_t hugetlb_basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
- if (!PageHuge(page_head))
- return page_index(page);
-
if (compound_order(page_head) >= MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
@@ -1616,7 +1624,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
- page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+ page = __alloc_pages(gfp_mask, order, nid, nmask);
if (page)
__count_vm_event(HTLB_BUDDY_PGALLOC);
else
@@ -1693,17 +1701,20 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
}
/*
- * Free huge page from pool from next node to free.
- * Attempt to keep persistent huge pages more or less
- * balanced over allowed nodes.
+ * Remove a huge page from the pool, taking it from the next node to free.
+ * Attempt to keep persistent huge pages more or less balanced over the
+ * allowed nodes. This routine only 'removes' the hugetlb page. The caller
+ * must make an additional call to free the page to the low level allocators.
* Called with hugetlb_lock locked.
*/
-static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
- bool acct_surplus)
+static struct page *remove_pool_huge_page(struct hstate *h,
+ nodemask_t *nodes_allowed,
+ bool acct_surplus)
{
int nr_nodes, node;
- int ret = 0;
+ struct page *page = NULL;
+ lockdep_assert_held(&hugetlb_lock);
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
/*
* If we're returning unused surplus pages, only examine
@@ -1711,23 +1722,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
*/
if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
!list_empty(&h->hugepage_freelists[node])) {
- struct page *page =
- list_entry(h->hugepage_freelists[node].next,
+ page = list_entry(h->hugepage_freelists[node].next,
struct page, lru);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[node]--;
- if (acct_surplus) {
- h->surplus_huge_pages--;
- h->surplus_huge_pages_node[node]--;
- }
- update_and_free_page(h, page);
- ret = 1;
+ remove_hugetlb_page(h, page, acct_surplus);
break;
}
}
- return ret;
+ return page;
}
/*
@@ -1749,7 +1751,7 @@ retry:
if (!PageHuge(page))
return 0;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (!PageHuge(page)) {
rc = 0;
goto out;
@@ -1758,7 +1760,6 @@ retry:
if (!page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
- int nid = page_to_nid(head);
if (h->free_huge_pages - h->resv_huge_pages == 0)
goto out;
@@ -1767,7 +1768,7 @@ retry:
* when it is dissolved.
*/
if (unlikely(!HPageFreed(head))) {
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
cond_resched();
/*
@@ -1789,15 +1790,14 @@ retry:
SetPageHWPoison(page);
ClearPageHWPoison(head);
}
- list_del(&head->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
+ remove_hugetlb_page(h, head, false);
h->max_huge_pages--;
+ spin_unlock_irq(&hugetlb_lock);
update_and_free_page(h, head);
- rc = 0;
+ return 0;
}
out:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return rc;
}
@@ -1839,16 +1839,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
if (hstate_is_gigantic(h))
return NULL;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
goto out_unlock;
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
/*
* We could have raced with the pool size change.
* Double check that and simply deallocate the new page
@@ -1858,7 +1858,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetHPageTemporary(page);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
put_page(page);
return NULL;
} else {
@@ -1867,7 +1867,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
}
out_unlock:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return page;
}
@@ -1917,17 +1917,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0) {
struct page *page;
page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
if (page) {
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return page;
}
}
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
}
@@ -1964,6 +1964,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
long needed, allocated;
bool alloc_ok = true;
+ lockdep_assert_held(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
h->resv_huge_pages += delta;
@@ -1975,7 +1976,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
ret = -ENOMEM;
retry:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL);
@@ -1992,7 +1993,7 @@ retry:
* After retaking hugetlb_lock, we need to recalculate 'needed'
* because either resv_huge_pages or free_huge_pages may have changed.
*/
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) -
(h->free_huge_pages + allocated);
if (needed > 0) {
@@ -2032,12 +2033,12 @@ retry:
enqueue_huge_page(h, page);
}
free:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
list_for_each_entry_safe(page, tmp, &surplus_list, lru)
put_page(page);
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
return ret;
}
@@ -2049,17 +2050,17 @@ free:
* to the associated reservation map.
* 2) Free any unused surplus pages that may have been allocated to satisfy
* the reservation. As many as unused_resv_pages may be freed.
- *
- * Called with hugetlb_lock held. However, the lock could be dropped (and
- * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
- * we must make sure nobody else can claim pages we are in the process of
- * freeing. Do this by ensuring resv_huge_page always is greater than the
- * number of huge pages we plan to free when dropping the lock.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
+ struct page *page;
+ LIST_HEAD(page_list);
+
+ lockdep_assert_held(&hugetlb_lock);
+ /* Uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
/* Cannot return gigantic pages currently */
if (hstate_is_gigantic(h))
@@ -2076,24 +2077,21 @@ static void return_unused_surplus_pages(struct hstate *h,
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
- * free_pool_huge_page() will balance the freed pages across the
+ * remove_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
- *
- * Note that we decrement resv_huge_pages as we free the pages. If
- * we drop the lock, resv_huge_pages will still be sufficiently large
- * to cover subsequent pages we may free.
*/
while (nr_pages--) {
- h->resv_huge_pages--;
- unused_resv_pages--;
- if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+ page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
+ if (!page)
goto out;
- cond_resched_lock(&hugetlb_lock);
+
+ list_add(&page->lru, &page_list);
}
out:
- /* Fully uncommit the reservation */
- h->resv_huge_pages -= unused_resv_pages;
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_pages_bulk(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
}
@@ -2120,12 +2118,18 @@ out:
* be restored when a newly allocated huge page must be freed. It is
* to be called after calling vma_needs_reservation to determine if a
* reservation exists.
+ *
+ * vma_del_reservation is used in error paths where an entry in the reserve
+ * map was created during huge page allocation and must be removed. It is to
+ * be called after calling vma_needs_reservation to determine if a reservation
+ * exists.
*/
enum vma_resv_mode {
VMA_NEEDS_RESV,
VMA_COMMIT_RESV,
VMA_END_RESV,
VMA_ADD_RESV,
+ VMA_DEL_RESV,
};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
@@ -2169,33 +2173,42 @@ static long __vma_reservation_common(struct hstate *h,
ret = region_del(resv, idx, idx + 1);
}
break;
+ case VMA_DEL_RESV:
+ if (vma->vm_flags & VM_MAYSHARE) {
+ region_abort(resv, idx, idx + 1, 1);
+ ret = region_del(resv, idx, idx + 1);
+ } else {
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
+ }
+ break;
default:
BUG();
}
- if (vma->vm_flags & VM_MAYSHARE)
+ if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
return ret;
- else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
- /*
- * In most cases, reserves always exist for private mappings.
- * However, a file associated with mapping could have been
- * hole punched or truncated after reserves were consumed.
- * As subsequent fault on such a range will not use reserves.
- * Subtle - The reserve map for private mappings has the
- * opposite meaning than that of shared mappings. If NO
- * entry is in the reserve map, it means a reservation exists.
- * If an entry exists in the reserve map, it means the
- * reservation has already been consumed. As a result, the
- * return value of this routine is the opposite of the
- * value returned from reserve map manipulation routines above.
- */
- if (ret)
- return 0;
- else
- return 1;
- }
- else
- return ret < 0 ? ret : 0;
+ /*
+ * We know a private mapping must have HPAGE_RESV_OWNER set.
+ *
+ * In most cases, reserves always exist for private mappings.
+ * However, the file associated with the mapping could have been
+ * hole punched or truncated after reserves were consumed, so a
+ * subsequent fault on such a range will not use reserves.
+ * Subtle - The reserve map for private mappings has the
+ * opposite meaning than that of shared mappings. If NO
+ * entry is in the reserve map, it means a reservation exists.
+ * If an entry exists in the reserve map, it means the
+ * reservation has already been consumed. As a result, the
+ * return value of this routine is the opposite of the
+ * value returned from reserve map manipulation routines above.
+ */
+ if (ret > 0)
+ return 0;
+ if (ret == 0)
+ return 1;
+ return ret;
}
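
The "Subtle" comment above is the heart of this hunk: for private mappings the reserve map has inverted meaning, so the positive/zero/negative result of the region_* call is remapped before being returned. A tiny standalone sketch of just that remapping follows; private_resv_result() is a hypothetical name and -12 merely stands in for -ENOMEM:

#include <assert.h>

/*
 * Hypothetical helper mirroring only the tail of __vma_reservation_common()
 * for a private mapping: a positive region count means the reservation was
 * already consumed (return 0), zero means a reservation still exists
 * (return 1), and errors pass through unchanged.
 */
static long private_resv_result(long region_ret)
{
	if (region_ret > 0)
		return 0;
	if (region_ret == 0)
		return 1;
	return region_ret;
}

int main(void)
{
	assert(private_resv_result(1) == 0);	/* entry present: reservation already used */
	assert(private_resv_result(0) == 1);	/* no entry: a reservation is available */
	assert(private_resv_result(-12) == -12);	/* -ENOMEM propagates */
	return 0;
}
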
static long vma_needs_reservation(struct hstate *h,
@@ -2222,25 +2235,39 @@ static long vma_add_reservation(struct hstate *h,
return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}
+static long vma_del_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
+}
+
/*
- * This routine is called to restore a reservation on error paths. In the
- * specific error paths, a huge page was allocated (via alloc_huge_page)
- * and is about to be freed. If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set
- * HPageRestoreReserve in the newly allocated page. When the page is freed
- * via free_huge_page, the global reservation count will be incremented if
- * HPageRestoreReserve is set. However, free_huge_page can not adjust the
- * reserve map. Adjust the reserve map here to be consistent with global
- * reserve count adjustments to be made by free_huge_page.
+ * This routine is called to restore reservation information on error paths.
+ * It should ONLY be called for pages allocated via alloc_huge_page(), and
+ * the hugetlb mutex should remain held when calling this routine.
+ *
+ * It handles two specific cases:
+ * 1) A reservation was in place and the page consumed the reservation.
+ * HPageRestoreReserve is set in the page.
+ * 2) No reservation was in place for the page, so HPageRestoreReserve is
+ * not set. However, alloc_huge_page always updates the reserve map.
+ *
+ * In case 1, free_huge_page later in the error path will increment the
+ * global reserve count. But, free_huge_page does not have enough context
+ * to adjust the reservation map. This case deals primarily with private
+ * mappings. Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page. Make sure the
+ * reserve map indicates there is a reservation present.
+ *
+ * In case 2, simply undo reserve map modifications done by alloc_huge_page.
*/
-static void restore_reserve_on_error(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address,
- struct page *page)
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+ unsigned long address, struct page *page)
{
- if (unlikely(HPageRestoreReserve(page))) {
- long rc = vma_needs_reservation(h, vma, address);
+ long rc = vma_needs_reservation(h, vma, address);
- if (unlikely(rc < 0)) {
+ if (HPageRestoreReserve(page)) {
+ if (unlikely(rc < 0))
/*
* Rare out of memory condition in reserve map
* manipulation. Clear HPageRestoreReserve so that
@@ -2253,17 +2280,186 @@ static void restore_reserve_on_error(struct hstate *h,
* accounting of reserve counts.
*/
ClearHPageRestoreReserve(page);
- } else if (rc) {
- rc = vma_add_reservation(h, vma, address);
- if (unlikely(rc < 0))
+ else if (rc)
+ (void)vma_add_reservation(h, vma, address);
+ else
+ vma_end_reservation(h, vma, address);
+ } else {
+ if (!rc) {
+ /*
+ * This indicates there is an entry in the reserve map
+ * added by alloc_huge_page. We know it was added
+ * before the alloc_huge_page call, otherwise
+ * HPageRestoreReserve would be set on the page.
+ * Remove the entry so that a subsequent allocation
+ * does not consume a reservation.
+ */
+ rc = vma_del_reservation(h, vma, address);
+ if (rc < 0)
+ /*
+ * VERY rare out of memory condition. Since
+ * we can not delete the entry, set
+ * HPageRestoreReserve so that the reserve
+ * count will be incremented when the page
+ * is freed. This reserve will be consumed
+ * on a subsequent allocation.
+ */
+ SetHPageRestoreReserve(page);
+ } else if (rc < 0) {
+ /*
+ * Rare out of memory condition from
+ * vma_needs_reservation call. Memory allocation is
+ * only attempted if a new entry is needed. Therefore,
+ * this implies there is not an entry in the
+ * reserve map.
+ *
+ * For shared mappings, no entry in the map indicates
+ * no reservation. We are done.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE))
/*
- * See above comment about rare out of
- * memory condition.
+ * For private mappings, no entry indicates
+ * a reservation is present. Since we can
+ * not add an entry, set HPageRestoreReserve
+ * on the page so reserve count will be
+ * incremented when freed. This reserve will
+ * be consumed on a subsequent allocation.
*/
- ClearHPageRestoreReserve(page);
+ SetHPageRestoreReserve(page);
} else
- vma_end_reservation(h, vma, address);
+ /*
+ * No reservation present, do nothing
+ */
+ vma_end_reservation(h, vma, address);
+ }
+}
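
restore_reserve_on_error() now distinguishes whether HPageRestoreReserve was set and crosses that with the outcome of vma_needs_reservation(). The resulting decision table can be summarised in a purely illustrative sketch; the enum and pick_action() are invented for this sketch, and the DEL_MAP_ENTRY fallback to SET_RESTORE_RESERVE on failure is noted only as a comment:

#include <assert.h>
#include <stdbool.h>

enum resv_action {
	CLEAR_RESTORE_RESERVE,	/* give up; reserve count stays consumed */
	ADD_MAP_ENTRY,		/* put the reservation back into the reserve map */
	END_PENDING_ADD,	/* abort the in-progress region add */
	DEL_MAP_ENTRY,		/* undo the entry alloc_huge_page() created */
	SET_RESTORE_RESERVE,	/* fix up the reserve count at free time instead */
	NO_ACTION,
};

static enum resv_action pick_action(bool restore_reserve_set, long needs_rc,
				    bool shared)
{
	if (restore_reserve_set) {
		if (needs_rc < 0)
			return CLEAR_RESTORE_RESERVE;
		return needs_rc ? ADD_MAP_ENTRY : END_PENDING_ADD;
	}
	if (needs_rc == 0)
		return DEL_MAP_ENTRY;	/* may fall back to SET_RESTORE_RESERVE on failure */
	if (needs_rc < 0)
		return shared ? NO_ACTION : SET_RESTORE_RESERVE;
	return END_PENDING_ADD;
}

int main(void)
{
	assert(pick_action(true, 1, false) == ADD_MAP_ENTRY);
	assert(pick_action(false, 0, true) == DEL_MAP_ENTRY);
	assert(pick_action(false, -12, true) == NO_ACTION);
	return 0;
}
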
+
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate old page belongs to
+ * @old_page: Old page to dissolve
+ * @list: List to isolate the page in case we need to
+ * Returns 0 on success, otherwise negated error.
+ */
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
+ struct list_head *list)
+{
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ int nid = page_to_nid(old_page);
+ struct page *new_page;
+ int ret = 0;
+
+ /*
+ * Before dissolving the page, we need to allocate a new one for the
+ * pool to remain stable. Using alloc_buddy_huge_page() allows us to
+ * avoid having to deal with prep_new_huge_page() and any counters. This
+ * simplifies things and lets us do the whole thing under the
+ * lock.
+ */
+ new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
+ if (!new_page)
+ return -ENOMEM;
+
+retry:
+ spin_lock_irq(&hugetlb_lock);
+ if (!PageHuge(old_page)) {
+ /*
+ * Freed from under us. Drop new_page too.
+ */
+ goto free_new;
+ } else if (page_count(old_page)) {
+ /*
+ * Someone has grabbed the page, try to isolate it here.
+ * Fail with -EBUSY if not possible.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ if (!isolate_huge_page(old_page, list))
+ ret = -EBUSY;
+ spin_lock_irq(&hugetlb_lock);
+ goto free_new;
+ } else if (!HPageFreed(old_page)) {
+ /*
+ * Page's refcount is 0 but it has not been enqueued in the
+ * freelist yet. Race window is small, so we can succeed here if
+ * we retry.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ cond_resched();
+ goto retry;
+ } else {
+ /*
+ * Ok, old_page is still a genuine free hugepage. Remove it from
+ * the freelist and decrease the counters. These will be
+ * incremented again when calling __prep_account_new_huge_page()
+ * and enqueue_huge_page() for new_page. The counters will remain
+ * stable since this happens under the lock.
+ */
+ remove_hugetlb_page(h, old_page, false);
+
+ /*
+ * new_page needs to be initialized with the standard hugetlb
+ * state. This is normally done by prep_new_huge_page() but
+ * that takes hugetlb_lock which is already held so we need to
+ * open code it here.
+ * A reference count trick is needed because the allocator gives us a
+ * referenced page but the pool requires pages with 0 refcount.
+ */
+ __prep_new_huge_page(new_page);
+ __prep_account_new_huge_page(h, nid);
+ page_ref_dec(new_page);
+ enqueue_huge_page(h, new_page);
+
+ /*
+ * Pages have been replaced, we can safely free the old one.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_page(h, old_page);
}
+
+ return ret;
+
+free_new:
+ spin_unlock_irq(&hugetlb_lock);
+ __free_pages(new_page, huge_page_order(h));
+
+ return ret;
+}
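
The retry branch above handles the narrow window where a huge page already has refcount 0 but has not yet reached the free list: drop the lock, yield, and look again. A generic user-space sketch of that unlock/yield/retry idiom, with an invented struct obj standing in for the PageHuge()/page_count()/HPageFreed() checks and sched_yield() as the analogue of cond_resched():

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	bool present;	/* still the kind of object we care about? */
	int refcount;	/* held by someone else? */
	bool on_list;	/* fully enqueued, i.e. safe to take? */
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int replace_when_settled(struct obj *o)
{
	int ret = 0;

retry:
	pthread_mutex_lock(&lock);
	if (!o->present) {
		ret = 0;			/* freed from under us: nothing left to do */
	} else if (o->refcount) {
		ret = -1;			/* in use elsewhere: bail out */
	} else if (!o->on_list) {
		pthread_mutex_unlock(&lock);	/* transient window: unlock, yield, retry */
		sched_yield();			/* analogous to cond_resched() */
		goto retry;
	} else {
		o->present = false;		/* stand-in for the actual replacement work */
	}
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	struct obj o = { .present = true, .refcount = 0, .on_list = true };

	printf("replace: %d\n", replace_when_settled(&o));
	return 0;
}
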
+
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+{
+ struct hstate *h;
+ struct page *head;
+ int ret = -EBUSY;
+
+ /*
+ * The page might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ * Return success when racing as if we dissolved the page ourselves.
+ */
+ spin_lock_irq(&hugetlb_lock);
+ if (PageHuge(page)) {
+ head = compound_head(page);
+ h = page_hstate(head);
+ } else {
+ spin_unlock_irq(&hugetlb_lock);
+ return 0;
+ }
+ spin_unlock_irq(&hugetlb_lock);
+
+ /*
+ * Fence off gigantic pages as there is a cyclic dependency between
+ * alloc_contig_range and them. Return -ENOMEM as this has the effect
+ * of bailing out right away without further retrying.
+ */
+ if (hstate_is_gigantic(h))
+ return -ENOMEM;
+
+ if (page_count(head) && isolate_huge_page(head, list))
+ ret = 0;
+ else if (!page_count(head))
+ ret = alloc_and_dissolve_huge_page(h, head, list);
+
+ return ret;
}
struct page *alloc_huge_page(struct vm_area_struct *vma,
@@ -2316,7 +2512,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
/* If this allocation is not consuming a reservation, charge it now.
*/
- deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+ deferred_reserve = map_chg || avoid_reserve;
if (deferred_reserve) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
idx, pages_per_huge_page(h), &h_cg);
@@ -2328,7 +2524,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
if (ret)
goto out_uncharge_cgroup_reservation;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
/*
* glb_chg is passed to indicate whether or not a page must be taken
* from the global free pool (global change). gbl_chg == 0 indicates
@@ -2336,7 +2532,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
*/
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
@@ -2344,7 +2540,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
@@ -2357,7 +2553,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
h_cg, page);
}
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
hugetlb_set_page_subpool(page, spool);
@@ -2547,24 +2743,32 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
nodemask_t *nodes_allowed)
{
int i;
+ LIST_HEAD(page_list);
+ lockdep_assert_held(&hugetlb_lock);
if (hstate_is_gigantic(h))
return;
+ /*
+ * Collect pages to be freed on a list, and free after dropping lock
+ */
for_each_node_mask(i, *nodes_allowed) {
struct page *page, *next;
struct list_head *freel = &h->hugepage_freelists[i];
list_for_each_entry_safe(page, next, freel, lru) {
if (count >= h->nr_huge_pages)
- return;
+ goto out;
if (PageHighMem(page))
continue;
- list_del(&page->lru);
- update_and_free_page(h, page);
- h->free_huge_pages--;
- h->free_huge_pages_node[page_to_nid(page)]--;
+ remove_hugetlb_page(h, page, false);
+ list_add(&page->lru, &page_list);
}
}
+
+out:
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_pages_bulk(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
}
#else
static inline void try_to_free_low(struct hstate *h, unsigned long count,
@@ -2583,6 +2787,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
{
int nr_nodes, node;
+ lockdep_assert_held(&hugetlb_lock);
VM_BUG_ON(delta != -1 && delta != 1);
if (delta < 0) {
@@ -2610,6 +2815,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
+ struct page *page;
+ LIST_HEAD(page_list);
NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
/*
@@ -2622,7 +2829,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
else
return -ENOMEM;
- spin_lock(&hugetlb_lock);
+ /*
+ * resize_lock mutex prevents concurrent adjustments to number of
+ * pages in hstate via the proc/sysfs interfaces.
+ */
+ mutex_lock(&h->resize_lock);
+ spin_lock_irq(&hugetlb_lock);
/*
* Check for a node specific request.
@@ -2653,7 +2865,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
*/
if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
if (count > persistent_huge_pages(h)) {
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
NODEMASK_FREE(node_alloc_noretry);
return -EINVAL;
}
@@ -2682,14 +2895,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* page, free_huge_page will handle it by freeing the page
* and reducing the surplus.
*/
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
/* yield cpu to avoid soft lockup */
cond_resched();
ret = alloc_pool_huge_page(h, nodes_allowed,
node_alloc_noretry);
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (!ret)
goto out;
@@ -2716,18 +2929,30 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
min_count = max(count, min_count);
try_to_free_low(h, min_count, nodes_allowed);
+
+ /*
+ * Collect pages to be removed on a list without dropping the lock
+ */
while (min_count < persistent_huge_pages(h)) {
- if (!free_pool_huge_page(h, nodes_allowed, 0))
+ page = remove_pool_huge_page(h, nodes_allowed, 0);
+ if (!page)
break;
- cond_resched_lock(&hugetlb_lock);
+
+ list_add(&page->lru, &page_list);
}
+ /* free the pages after dropping lock */
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_pages_bulk(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
+
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
break;
}
out:
h->max_huge_pages = persistent_huge_pages(h);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
NODEMASK_FREE(node_alloc_noretry);
@@ -2882,9 +3107,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
if (err)
return err;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
h->nr_overcommit_huge_pages = input;
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return count;
}
@@ -3215,6 +3440,7 @@ void __init hugetlb_add_hstate(unsigned int order)
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order == 0);
h = &hstates[hugetlb_max_hstate++];
+ mutex_init(&h->resize_lock);
h->order = order;
h->mask = ~(huge_page_size(h) - 1);
for (i = 0; i < MAX_NUMNODES; ++i)
@@ -3267,10 +3493,10 @@ static int __init hugepages_setup(char *s)
/*
* Global state is always initialized later in hugetlb_init.
- * But we need to allocate >= MAX_ORDER hstates here early to still
+ * But we need to allocate gigantic hstates here early to still
* use the bootmem allocator.
*/
- if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+ if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
hugetlb_hstate_alloc_pages(parsed_hstate);
last_mhp = mhp;
@@ -3470,9 +3696,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
goto out;
if (write) {
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
h->nr_overcommit_huge_pages = tmp;
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
}
out:
return ret;
@@ -3568,7 +3794,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
if (!delta)
return 0;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
/*
* When cpuset is configured, it breaks the strict hugetlb page
* reservation as the accounting is done on a global variable. Such
@@ -3607,7 +3833,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
return_unused_surplus_pages(h, (unsigned long) -delta);
out:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return ret;
}
@@ -3795,7 +4021,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
- dst_pte = huge_pte_alloc(dst, addr, sz);
+ dst_pte = huge_pte_alloc(dst, vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
break;
@@ -3879,6 +4105,8 @@ again:
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
+ restore_reserve_on_error(h, vma, addr,
+ new);
put_page(new);
/* dst_entry won't change as in child */
goto again;
@@ -3898,6 +4126,7 @@ again:
* See Documentation/vm/mmu_notifier.rst
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
+ entry = huge_pte_wrprotect(entry);
}
page_dup_rmap(ptepage, true);
@@ -4310,6 +4539,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
return 0;
}
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx,
+ unsigned int flags,
+ unsigned long haddr,
+ unsigned long reason)
+{
+ vm_fault_t ret;
+ u32 hash;
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .flags = flags,
+
+ /*
+ * Hard to debug if it ends up being
+ * used by a callee that assumes
+ * something about the other
+ * uninitialized fields... same as in
+ * memory.c
+ */
+ };
+
+ /*
+ * hugetlb_fault_mutex and i_mmap_rwsem must be
+ * dropped before handling userfault. Reacquire
+ * after handling fault to make calling code simpler.
+ */
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ ret = handle_userfault(&vmf, reason);
+ i_mmap_lock_read(mapping);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ return ret;
+}
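
The new helper exists purely to package the "drop both locks, call out, then retake them" dance so hugetlb_no_page() stays readable. A hedged user-space sketch of that shape using pthread primitives; call_unlocked() and noop() are invented names, not kernel interfaces:

#include <pthread.h>

static pthread_mutex_t fault_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t mapping_lock = PTHREAD_RWLOCK_INITIALIZER;

/*
 * Invented helper: release both locks, call out to code that may sleep or
 * take the same locks, then retake them so the caller's locking state is
 * unchanged on return.
 */
static int call_unlocked(int (*callback)(void *), void *arg)
{
	int ret;

	pthread_mutex_unlock(&fault_mutex);
	pthread_rwlock_unlock(&mapping_lock);

	ret = callback(arg);

	pthread_rwlock_rdlock(&mapping_lock);
	pthread_mutex_lock(&fault_mutex);

	return ret;
}

static int noop(void *arg)
{
	(void)arg;
	return 0;
}

int main(void)
{
	int ret;

	pthread_rwlock_rdlock(&mapping_lock);
	pthread_mutex_lock(&fault_mutex);
	ret = call_unlocked(noop, NULL);
	pthread_mutex_unlock(&fault_mutex);
	pthread_rwlock_unlock(&mapping_lock);
	return ret;
}
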
+
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
@@ -4348,35 +4615,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- /*
- * Check for page in userfault range
- */
+ /* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
- u32 hash;
- struct vm_fault vmf = {
- .vma = vma,
- .address = haddr,
- .flags = flags,
- /*
- * Hard to debug if it ends up being
- * used by a callee that assumes
- * something about the other
- * uninitialized fields... same as in
- * memory.c
- */
- };
-
- /*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
- */
- hash = hugetlb_fault_mutex_hash(mapping, idx);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
- ret = handle_userfault(&vmf, VM_UFFD_MISSING);
- i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ ret = hugetlb_handle_userfault(vma, mapping, idx,
+ flags, haddr,
+ VM_UFFD_MISSING);
goto out;
}
@@ -4395,13 +4638,10 @@ retry:
* sure there really is no pte entry.
*/
ptl = huge_pte_lock(h, mm, ptep);
- if (!huge_pte_none(huge_ptep_get(ptep))) {
- ret = 0;
- spin_unlock(ptl);
- goto out;
- }
+ ret = 0;
+ if (huge_pte_none(huge_ptep_get(ptep)))
+ ret = vmf_error(PTR_ERR(page));
spin_unlock(ptl);
- ret = vmf_error(PTR_ERR(page));
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -4435,6 +4675,16 @@ retry:
VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
+
+ /* Check for page in userfault range. */
+ if (userfaultfd_minor(vma)) {
+ unlock_page(page);
+ put_page(page);
+ ret = hugetlb_handle_userfault(vma, mapping, idx,
+ flags, haddr,
+ VM_UFFD_MINOR);
+ goto out;
+ }
}
/*
@@ -4563,7 +4813,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
mapping = vma->vm_file->f_mapping;
i_mmap_lock_read(mapping);
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
if (!ptep) {
i_mmap_unlock_read(mapping);
return VM_FAULT_OOM;
@@ -4675,6 +4925,7 @@ out_mutex:
return ret;
}
+#ifdef CONFIG_USERFAULTFD
/*
* Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
* modifications for huge pages.
@@ -4684,8 +4935,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
+ enum mcopy_atomic_mode mode,
struct page **pagep)
{
+ bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct address_space *mapping;
pgoff_t idx;
unsigned long size;
@@ -4695,12 +4948,31 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
spinlock_t *ptl;
int ret;
struct page *page;
+ int writable;
+
+ mapping = dst_vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+ if (is_continue) {
+ ret = -EFAULT;
+ page = find_lock_page(mapping, idx);
+ if (!page)
+ goto out;
+ } else if (!*pagep) {
+ /* If a page already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
- if (!*pagep) {
- ret = -ENOMEM;
page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
goto out;
+ }
ret = copy_huge_page_from_user(page,
(const void __user *) src_addr,
@@ -4725,13 +4997,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
*/
__SetPageUptodate(page);
- mapping = dst_vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
- /*
- * If shared, add to page cache
- */
- if (vm_shared) {
+ /* Add shared, newly allocated pages to the page cache. */
+ if (vm_shared && !is_continue) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
ret = -EFAULT;
if (idx >= size)
@@ -4776,8 +5043,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
}
- _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
- if (dst_vma->vm_flags & VM_WRITE)
+ /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
+ if (is_continue && !vm_shared)
+ writable = 0;
+ else
+ writable = dst_vma->vm_flags & VM_WRITE;
+
+ _dst_pte = make_huge_pte(dst_vma, page, writable);
+ if (writable)
_dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);
@@ -4791,20 +5064,23 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
- SetHPageMigratable(page);
- if (vm_shared)
+ if (!is_continue)
+ SetHPageMigratable(page);
+ if (vm_shared || is_continue)
unlock_page(page);
ret = 0;
out:
return ret;
out_release_unlock:
spin_unlock(ptl);
- if (vm_shared)
+ if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
+#endif /* CONFIG_USERFAULTFD */
static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
int refs, struct page **pages,
@@ -4996,14 +5272,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
return i ? i : err;
}
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
-/*
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
- * implement this.
- */
-#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
-#endif
-
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
@@ -5280,6 +5548,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
/*
* If the subpool has a minimum size, the number of global
* reservations to be released may be adjusted.
+ *
+ * Note that !resv_map implies freed == 0. So (chg - freed)
+ * won't go negative.
*/
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -gbl_reserve);
@@ -5326,6 +5597,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
return false;
}
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+#ifdef CONFIG_USERFAULTFD
+ if (uffd_disable_huge_pmd_share(vma))
+ return false;
+#endif
+ return vma_shareable(vma, addr);
+}
+
/*
* Determine if start,end range within vma could be mapped by shared pmd.
* If yes, adjust start and end to cover range associated with possible
@@ -5338,8 +5618,8 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
/*
- * vma need span at least one aligned PUD size and the start,end range
- * must at least partialy within it.
+ * vma needs to span at least one aligned PUD size, and the range
+ * must be at least partially within it.
*/
if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
(*end <= v_start) || (*start >= v_end))
@@ -5370,9 +5650,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
* only required for subsequent processing.
*/
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud)
{
- struct vm_area_struct *vma = find_vma(mm, addr);
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
@@ -5382,9 +5662,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
pte_t *pte;
spinlock_t *ptl;
- if (!vma_shareable(vma, addr))
- return (pte_t *)pmd_alloc(mm, pud, addr);
-
i_mmap_assert_locked(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
@@ -5448,9 +5725,10 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
return 1;
}
-#define want_pmd_share() (1)
+
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud)
{
return NULL;
}
@@ -5465,11 +5743,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
}
-#define want_pmd_share() (0)
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+ return false;
+}
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
@@ -5487,8 +5769,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte = (pte_t *)pud;
} else {
BUG_ON(sz != PMD_SIZE);
- if (want_pmd_share() && pud_none(*pud))
- pte = huge_pmd_share(mm, addr, pud);
+ if (want_pmd_share(vma, addr) && pud_none(*pud))
+ pte = huge_pmd_share(mm, vma, addr, pud);
else
pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
@@ -5632,7 +5914,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
{
bool ret = true;
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
if (!PageHeadHuge(page) ||
!HPageMigratable(page) ||
!get_page_unless_zero(page)) {
@@ -5642,16 +5924,33 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
ClearHPageMigratable(page);
list_move_tail(&page->lru, list);
unlock:
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
+ return ret;
+}
+
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+{
+ int ret = 0;
+
+ *hugetlb = false;
+ spin_lock_irq(&hugetlb_lock);
+ if (PageHeadHuge(page)) {
+ *hugetlb = true;
+ if (HPageFreed(page) || HPageMigratable(page))
+ ret = get_page_unless_zero(page);
+ else
+ ret = -EBUSY;
+ }
+ spin_unlock_irq(&hugetlb_lock);
return ret;
}
void putback_active_hugepage(struct page *page)
{
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
SetHPageMigratable(page);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
put_page(page);
}
@@ -5679,13 +5978,70 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
SetHPageTemporary(oldpage);
ClearHPageTemporary(newpage);
- spin_lock(&hugetlb_lock);
+ /*
+ * There is no need to transfer the per-node surplus state
+ * when we do not cross the node.
+ */
+ if (new_nid == old_nid)
+ return;
+ spin_lock_irq(&hugetlb_lock);
if (h->surplus_huge_pages_node[old_nid]) {
h->surplus_huge_pages_node[old_nid]--;
h->surplus_huge_pages_node[new_nid]++;
}
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
+ }
+}
+
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
+ unsigned long address, start, end;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ start = ALIGN(vma->vm_start, PUD_SIZE);
+ end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+ if (start >= end)
+ return;
+
+ /*
+ * No need to call adjust_range_if_pmd_sharing_possible(), because
+ * we have already done the PUD_SIZE alignment.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ start, end);
+ mmu_notifier_invalidate_range_start(&range);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ for (address = start; address < end; address += PUD_SIZE) {
+ unsigned long tmp = address;
+
+ ptep = huge_pte_offset(mm, address, sz);
+ if (!ptep)
+ continue;
+ ptl = huge_pte_lock(h, mm, ptep);
+ /* We don't want 'address' to be changed */
+ huge_pmd_unshare(mm, vma, &tmp, ptep);
+ spin_unlock(ptl);
}
+ flush_hugetlb_tlb_range(vma, start, end);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ /*
+ * No need to call mmu_notifier_invalidate_range(), see
+ * Documentation/vm/mmu_notifier.rst.
+ */
+ mmu_notifier_invalidate_range_end(&range);
}
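
hugetlb_unshare_all_pmds() only walks the PUD_SIZE-aligned middle of the VMA, which is the only part where PMD sharing can exist. A standalone sketch of that range computation, with ALIGN()/ALIGN_DOWN() defined locally and an illustrative 1 GiB PUD_SIZE (the real value is architecture dependent):

#include <stdio.h>

#define PUD_SIZE	(1UL << 30)	/* illustrative; the real value is per-architecture */
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

int main(void)
{
	unsigned long vm_start = 0x7f0012345000UL;
	unsigned long vm_end   = 0x7f00d2345000UL;	/* a 3 GiB mapping */

	/* Only the fully PUD-aligned middle of the vma can contain shared PMDs. */
	unsigned long start = ALIGN(vm_start, PUD_SIZE);
	unsigned long end   = ALIGN_DOWN(vm_end, PUD_SIZE);

	if (start >= end) {
		printf("no PUD-aligned region in this vma\n");
		return 0;
	}

	for (unsigned long addr = start; addr < end; addr += PUD_SIZE)
		printf("would visit PMD page table at %#lx\n", addr);

	return 0;
}
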
#ifdef CONFIG_CMA
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 603a131e262d..5383023d0cca 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -204,11 +204,11 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
do {
idx = 0;
for_each_hstate(h) {
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
hugetlb_cgroup_move_parent(idx, h_cg, page);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
idx++;
}
cond_resched();
@@ -784,8 +784,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
if (hugetlb_cgroup_disabled())
return;
- VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
- spin_lock(&hugetlb_lock);
+ spin_lock_irq(&hugetlb_lock);
h_cg = hugetlb_cgroup_from_page(oldhpage);
h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
set_hugetlb_cgroup(oldhpage, NULL);
@@ -795,7 +794,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
set_hugetlb_cgroup(newhpage, h_cg);
set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
list_move(&newhpage->lru, &h->hugepage_activelist);
- spin_unlock(&hugetlb_lock);
+ spin_unlock_irq(&hugetlb_lock);
return;
}
diff --git a/mm/internal.h b/mm/internal.h
index cb3c5e0a7799..6ec2cea9926b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -51,13 +51,12 @@ void unmap_page_range(struct mmu_gather *tlb,
void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
unsigned long lookahead_size);
-void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
- unsigned long nr);
+void force_page_cache_ra(struct readahead_control *, unsigned long nr);
static inline void force_page_cache_readahead(struct address_space *mapping,
struct file *file, pgoff_t index, unsigned long nr_to_read)
{
- DEFINE_READAHEAD(ractl, file, mapping, index);
- force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
+ DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
+ force_page_cache_ra(&ractl, nr_to_read);
}
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
@@ -97,26 +96,6 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
-/*
- * When kernel touch the user page, the user page may be have been marked
- * poison but still mapped in user space, if without this page, the kernel
- * can guarantee the data integrity and operation success, the kernel is
- * better to check the posion status and avoid touching it, be good not to
- * panic, coredump for process fatal signal is a sample case matching this
- * scenario. Or if kernel can't guarantee the data integrity, it's better
- * not to call this function, let kernel touch the poison page and get to
- * panic.
- */
-static inline bool is_page_poisoned(struct page *page)
-{
- if (PageHWPoison(page))
- return true;
- else if (PageHuge(page) && PageHWPoison(compound_head(page)))
- return true;
-
- return false;
-}
-
extern unsigned long highest_memmap_pfn;
/*
@@ -137,6 +116,11 @@ extern void putback_lru_page(struct page *page);
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
+ * in mm/memcontrol.c:
+ */
+extern bool cgroup_memory_nokmem;
+
+/*
* in mm/page_alloc.c
*/
@@ -146,10 +130,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
* family of functions.
*
* nodemask, migratetype and highest_zoneidx are initialized only once in
- * __alloc_pages_nodemask() and then never change.
+ * __alloc_pages() and then never change.
*
* zonelist, preferred_zone and highest_zoneidx are set first in
- * __alloc_pages_nodemask() for the fast path, and might be later changed
+ * __alloc_pages() for the fast path, and might be later changed
* in __alloc_pages_slowpath(). All other functions pass the whole structure
* by a const pointer.
*/
@@ -219,10 +203,10 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
-extern void free_unref_page(struct page *page);
+extern void free_unref_page(struct page *page, unsigned int order);
extern void free_unref_page_list(struct list_head *list);
-extern void zone_pcp_update(struct zone *zone);
+extern void zone_pcp_update(struct zone *zone, int cpu_online);
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
@@ -245,7 +229,13 @@ struct compact_control {
unsigned int nr_freepages; /* Number of isolated free pages */
unsigned int nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
- unsigned long migrate_pfn; /* isolate_migratepages search base */
+ /*
+ * Acts as an in/out parameter to page isolation for migration.
+ * isolate_migratepages uses it as a search base.
+ * isolate_migratepages_block will update the value to the next pfn
+ * after the last isolated one.
+ */
+ unsigned long migrate_pfn;
unsigned long fast_start_pfn; /* a pfn to start linear scan from */
struct zone *zone;
unsigned long total_migrate_scanned;
@@ -281,7 +271,7 @@ struct capture_control {
unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn);
-unsigned long
+int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
int find_suitable_fallback(struct free_area *area, unsigned int order,
@@ -329,7 +319,7 @@ static inline bool is_exec_mapping(vm_flags_t flags)
}
/*
- * Stack area - atomatically grows in one direction
+ * Stack area - automatically grows in one direction
*
* VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
* do_mmap() forbids all other combinations.
@@ -399,27 +389,52 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
- * At what user virtual address is page expected in @vma?
+ * At what user virtual address is page expected in vma?
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
*/
static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address(struct page *page, struct vm_area_struct *vma)
{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ pgoff_t pgoff;
+ unsigned long address;
+
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page);
+ if (pgoff >= vma->vm_pgoff) {
+ address = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address >= vma->vm_end)
+ address = -EFAULT;
+ } else if (PageHead(page) &&
+ pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+ /* Test above avoids possibility of wrap to 0 on 32-bit */
+ address = vma->vm_start;
+ } else {
+ address = -EFAULT;
+ }
+ return address;
}
+/*
+ * Then at what user virtual address will none of the page be found in vma?
+ * Assumes that vma_address() already returned a good starting address.
+ * If page is a compound head, the entire compound page is considered.
+ */
static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address_end(struct page *page, struct vm_area_struct *vma)
{
- unsigned long start, end;
-
- start = __vma_address(page, vma);
- end = start + thp_size(page) - PAGE_SIZE;
-
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
-
- return max(start, vma->vm_start);
+ pgoff_t pgoff;
+ unsigned long address;
+
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page) + compound_nr(page);
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address > vma->vm_end)
+ address = vma->vm_end;
+ return address;
}
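
vma_address() above turns a page's file offset into a user address and reports -EFAULT when the page lies outside the VMA. A user-space sketch of the same arithmetic; tiny_vma and tiny_vma_address() are invented, and the compound-page head case is deliberately omitted:

#include <stdio.h>

#define PAGE_SHIFT	12

/* Hypothetical, trimmed-down vma: only the fields the calculation needs. */
struct tiny_vma {
	unsigned long vm_start, vm_end;	/* user addresses, page aligned */
	unsigned long vm_pgoff;		/* file offset of vm_start, in pages */
};

/* Where does file page 'pgoff' land in this vma, or -14 (-EFAULT) if outside? */
static long tiny_vma_address(const struct tiny_vma *vma, unsigned long pgoff)
{
	unsigned long address;

	if (pgoff < vma->vm_pgoff)
		return -14;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	if (address < vma->vm_start || address >= vma->vm_end)
		return -14;

	return (long)address;
}

int main(void)
{
	struct tiny_vma vma = { 0x700000000000UL, 0x700000010000UL, 100 };

	printf("%#lx\n", (unsigned long)tiny_vma_address(&vma, 103));	/* vm_start + 3 pages */
	printf("%ld\n", tiny_vma_address(&vma, 99));			/* outside: -14 */
	return 0;
}
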
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
@@ -447,7 +462,9 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
-
+static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+}
#endif /* !CONFIG_MMU */
/*
@@ -638,4 +655,21 @@ struct migration_target_control {
gfp_t gfp_mask;
};
+/*
+ * mm/vmalloc.c
+ */
+#ifdef CONFIG_MMU
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift);
+#else
+static inline
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ return -EINVAL;
+}
+#endif
+
+void vunmap_range_noflush(unsigned long start, unsigned long end);
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 11c75fb07584..32e390c42c53 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -22,7 +22,7 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
unsigned long, shared.rb_subtree_last,
- vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+ vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree)
/* Insert node immediately after prev in the interval tree */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
new file mode 100644
index 000000000000..01b362799930
--- /dev/null
+++ b/mm/io-mapping.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/mm.h>
+#include <linux/io-mapping.h>
+
+/**
+ * io_mapping_map_user - remap an I/O mapping to userspace
+ * @iomap: the source io_mapping
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: physical address of kernel memory
+ * @size: size of map area
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ */
+int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, unsigned long size)
+{
+ vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+
+ if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
+ return -EINVAL;
+
+ /* We rely on prevalidation of the io-mapping to skip track_pfn(). */
+ return remap_pfn_range_notrack(vma, addr, pfn, size,
+ __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
+ (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
+}
+EXPORT_SYMBOL_GPL(io_mapping_map_user);
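
The WARN_ON_ONCE() above rejects any vma that is missing one of the required VM_* bits; the "(flags & expected) != expected" form tests that every required bit is present, not merely some of them. A trivial sketch of that subset test with made-up flag values:

#include <stdbool.h>
#include <stdio.h>

/* Made-up flag bits for illustration; the real VM_* values live in <linux/mm.h>. */
#define F_PFNMAP	0x1u
#define F_DONTEXPAND	0x2u
#define F_DONTDUMP	0x4u

/* True only when every bit of 'required' is also set in 'flags'. */
static bool has_all_flags(unsigned int flags, unsigned int required)
{
	return (flags & required) == required;
}

int main(void)
{
	unsigned int required = F_PFNMAP | F_DONTEXPAND | F_DONTDUMP;

	printf("%d\n", has_all_flags(required | 0x8u, required));	/* 1: extra bits are fine */
	printf("%d\n", has_all_flags(F_PFNMAP | F_DONTDUMP, required));	/* 0: F_DONTEXPAND missing */
	return 0;
}
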
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 5fa1ab41d152..8ee0136f8cb0 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -16,237 +16,22 @@
#include "pgalloc-track.h"
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static int __read_mostly ioremap_p4d_capable;
-static int __read_mostly ioremap_pud_capable;
-static int __read_mostly ioremap_pmd_capable;
-static int __read_mostly ioremap_huge_disabled;
+static unsigned int __ro_after_init iomap_max_page_shift = BITS_PER_LONG - 1;
static int __init set_nohugeiomap(char *str)
{
- ioremap_huge_disabled = 1;
+ iomap_max_page_shift = PAGE_SHIFT;
return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
-
-void __init ioremap_huge_init(void)
-{
- if (!ioremap_huge_disabled) {
- if (arch_ioremap_p4d_supported())
- ioremap_p4d_capable = 1;
- if (arch_ioremap_pud_supported())
- ioremap_pud_capable = 1;
- if (arch_ioremap_pmd_supported())
- ioremap_pmd_capable = 1;
- }
-}
-
-static inline int ioremap_p4d_enabled(void)
-{
- return ioremap_p4d_capable;
-}
-
-static inline int ioremap_pud_enabled(void)
-{
- return ioremap_pud_capable;
-}
-
-static inline int ioremap_pmd_enabled(void)
-{
- return ioremap_pmd_capable;
-}
-
-#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static inline int ioremap_p4d_enabled(void) { return 0; }
-static inline int ioremap_pud_enabled(void) { return 0; }
-static inline int ioremap_pmd_enabled(void) { return 0; }
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static const unsigned int iomap_max_page_shift = PAGE_SHIFT;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pte_t *pte;
- u64 pfn;
-
- pfn = phys_addr >> PAGE_SHIFT;
- pte = pte_alloc_kernel_track(pmd, addr, mask);
- if (!pte)
- return -ENOMEM;
- do {
- BUG_ON(!pte_none(*pte));
- set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
- pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- *mask |= PGTBL_PTE_MODIFIED;
- return 0;
-}
-
-static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_pmd_enabled())
- return 0;
-
- if ((end - addr) != PMD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PMD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PMD_SIZE))
- return 0;
-
- if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
- return 0;
-
- return pmd_set_huge(pmd, phys_addr, prot);
-}
-
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
- if (!pmd)
- return -ENOMEM;
- do {
- next = pmd_addr_end(addr, end);
-
- if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_PMD_MODIFIED;
- continue;
- }
-
- if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_pud_enabled())
- return 0;
-
- if ((end - addr) != PUD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PUD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PUD_SIZE))
- return 0;
-
- if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
- return 0;
-
- return pud_set_huge(pud, phys_addr, prot);
-}
-
-static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_alloc_track(&init_mm, p4d, addr, mask);
- if (!pud)
- return -ENOMEM;
- do {
- next = pud_addr_end(addr, end);
-
- if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_PUD_MODIFIED;
- continue;
- }
-
- if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_p4d_enabled())
- return 0;
-
- if ((end - addr) != P4D_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, P4D_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, P4D_SIZE))
- return 0;
-
- if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
- return 0;
-
- return p4d_set_huge(p4d, phys_addr, prot);
-}
-
-static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- p4d_t *p4d;
- unsigned long next;
-
- p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
- if (!p4d)
- return -ENOMEM;
- do {
- next = p4d_addr_end(addr, end);
-
- if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
- *mask |= PGTBL_P4D_MODIFIED;
- continue;
- }
-
- if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
int ioremap_page_range(unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
- pgd_t *pgd;
- unsigned long start;
- unsigned long next;
- int err;
- pgtbl_mod_mask mask = 0;
-
- might_sleep();
- BUG_ON(addr >= end);
-
- start = addr;
- pgd = pgd_offset_k(addr);
- do {
- next = pgd_addr_end(addr, end);
- err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot,
- &mask);
- if (err)
- break;
- } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
-
- flush_cache_vmap(start, end);
-
- if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
- arch_sync_kernel_mappings(start, end);
-
- return err;
+ return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift);
}
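
With this rewrite, ioremap_page_range() simply forwards to vmap_range() with iomap_max_page_shift as the upper bound on mapping granularity, and "nohugeiomap" clamps that bound to PAGE_SHIFT. A hypothetical sketch of how a mapper might pick the largest page shift under such a cap that also satisfies size and alignment; pick_page_shift() and the shift values are illustrative, not the kernel's implementation:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21	/* illustrative x86-64-style values */
#define PUD_SHIFT	30

static unsigned int pick_page_shift(unsigned long addr, unsigned long phys,
				    unsigned long size, unsigned int max_shift)
{
	static const unsigned int shifts[] = { PUD_SHIFT, PMD_SHIFT, PAGE_SHIFT };

	for (unsigned int i = 0; i < sizeof(shifts) / sizeof(shifts[0]); i++) {
		unsigned long span = 1UL << shifts[i];

		if (shifts[i] > max_shift)
			continue;			/* capped, e.g. by "nohugeiomap" */
		if ((addr | phys) & (span - 1))
			continue;			/* virtual or physical misalignment */
		if (size < span)
			continue;			/* region too small for this level */
		return shifts[i];
	}
	return PAGE_SHIFT;
}

int main(void)
{
	/* 4 MiB region, 2 MiB aligned: a PMD-sized mapping is possible... */
	printf("%u\n", pick_page_shift(0xffff800000200000UL, 0x200000UL,
				       4UL << 20, PUD_SHIFT));
	/* ...but not when the cap is PAGE_SHIFT. */
	printf("%u\n", pick_page_shift(0xffff800000200000UL, 0x200000UL,
				       4UL << 20, PAGE_SHIFT));
	return 0;
}
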
#ifdef CONFIG_GENERIC_IOREMAP
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 9fe39a66388a..adcd9acaef61 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -37,5 +37,5 @@ CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
obj-$(CONFIG_KASAN) := common.o report.o
obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
-obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o
-obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o
+obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o
+obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index b5e08d4cefec..2baf121fb8c5 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -51,25 +51,28 @@ void kasan_enable_current(void)
{
current->kasan_depth++;
}
+EXPORT_SYMBOL(kasan_enable_current);
void kasan_disable_current(void)
{
current->kasan_depth--;
}
+EXPORT_SYMBOL(kasan_disable_current);
+
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
void __kasan_unpoison_range(const void *address, size_t size)
{
- kasan_unpoison(address, size);
+ kasan_unpoison(address, size, false);
}
-#if CONFIG_KASAN_STACK
+#ifdef CONFIG_KASAN_STACK
/* Unpoison the entire stack for a task. */
void kasan_unpoison_task_stack(struct task_struct *task)
{
void *base = task_stack_page(task);
- kasan_unpoison(base, THREAD_SIZE);
+ kasan_unpoison(base, THREAD_SIZE, false);
}
/* Unpoison the stack for the current task beyond a watermark sp value. */
@@ -82,7 +85,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
*/
void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
- kasan_unpoison(base, watermark - base);
+ kasan_unpoison(base, watermark - base, false);
}
#endif /* CONFIG_KASAN_STACK */
@@ -97,7 +100,7 @@ slab_flags_t __kasan_never_merge(void)
return 0;
}
-void __kasan_alloc_pages(struct page *page, unsigned int order)
+void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
unsigned long i;
@@ -108,14 +111,14 @@ void __kasan_alloc_pages(struct page *page, unsigned int order)
tag = kasan_random_tag();
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
- kasan_unpoison(page_address(page), PAGE_SIZE << order);
+ kasan_unpoison(page_address(page), PAGE_SIZE << order, init);
}
-void __kasan_free_pages(struct page *page, unsigned int order)
+void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
{
if (likely(!PageHighMem(page)))
kasan_poison(page_address(page), PAGE_SIZE << order,
- KASAN_FREE_PAGE);
+ KASAN_FREE_PAGE, init);
}
/*
@@ -251,18 +254,18 @@ void __kasan_poison_slab(struct page *page)
for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
kasan_poison(page_address(page), page_size(page),
- KASAN_KMALLOC_REDZONE);
+ KASAN_KMALLOC_REDZONE, false);
}
void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
{
- kasan_unpoison(object, cache->object_size);
+ kasan_unpoison(object, cache->object_size, false);
}
void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
{
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
- KASAN_KMALLOC_REDZONE);
+ KASAN_KMALLOC_REDZONE, false);
}
/*
@@ -322,12 +325,15 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
return (void *)object;
}
-static inline bool ____kasan_slab_free(struct kmem_cache *cache,
- void *object, unsigned long ip, bool quarantine)
+static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
+ unsigned long ip, bool quarantine, bool init)
{
u8 tag;
void *tagged_object;
+ if (!kasan_arch_is_ready())
+ return false;
+
tag = get_tag(object);
tagged_object = object;
object = kasan_reset_tag(object);
@@ -351,7 +357,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache,
}
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
- KASAN_KMALLOC_FREE);
+ KASAN_KMALLOC_FREE, init);
if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine))
return false;
@@ -362,9 +368,10 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache,
return kasan_quarantine_put(cache, object);
}
-bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+bool __kasan_slab_free(struct kmem_cache *cache, void *object,
+ unsigned long ip, bool init)
{
- return ____kasan_slab_free(cache, object, ip, true);
+ return ____kasan_slab_free(cache, object, ip, true, init);
}
static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
@@ -407,9 +414,9 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
if (unlikely(!PageSlab(page))) {
if (____kasan_kfree_large(ptr, ip))
return;
- kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE);
+ kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false);
} else {
- ____kasan_slab_free(page->slab_cache, ptr, ip, false);
+ ____kasan_slab_free(page->slab_cache, ptr, ip, false, false);
}
}
@@ -428,7 +435,7 @@ static void set_alloc_info(struct kmem_cache *cache, void *object,
}
void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
- void *object, gfp_t flags)
+ void *object, gfp_t flags, bool init)
{
u8 tag;
void *tagged_object;
@@ -453,7 +460,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
* Unpoison the whole object.
* For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
*/
- kasan_unpoison(tagged_object, cache->object_size);
+ kasan_unpoison(tagged_object, cache->object_size, init);
/* Save alloc info (if possible) for non-kmalloc() allocations. */
if (kasan_stack_collection_enabled())
@@ -496,7 +503,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache,
redzone_end = round_up((unsigned long)(object + cache->object_size),
KASAN_GRANULE_SIZE);
kasan_poison((void *)redzone_start, redzone_end - redzone_start,
- KASAN_KMALLOC_REDZONE);
+ KASAN_KMALLOC_REDZONE, false);
/*
* Save alloc info (if possible) for kmalloc() allocations.
@@ -546,7 +553,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
KASAN_GRANULE_SIZE);
redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr));
kasan_poison((void *)redzone_start, redzone_end - redzone_start,
- KASAN_PAGE_REDZONE);
+ KASAN_PAGE_REDZONE, false);
return (void *)ptr;
}
@@ -563,7 +570,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
* Part of it might already have been unpoisoned, but it's unknown
* how big that part is.
*/
- kasan_unpoison(object, size);
+ kasan_unpoison(object, size, false);
page = virt_to_head_page(object);
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 2e55e0f82f39..c3f5ba7a294a 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -163,6 +163,9 @@ static __always_inline bool check_region_inline(unsigned long addr,
size_t size, bool write,
unsigned long ret_ip)
{
+ if (!kasan_arch_is_ready())
+ return true;
+
if (unlikely(size == 0))
return true;
@@ -208,11 +211,11 @@ static void register_global(struct kasan_global *global)
{
size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE);
- kasan_unpoison(global->beg, global->size);
+ kasan_unpoison(global->beg, global->size, false);
kasan_poison(global->beg + aligned_size,
global->size_with_redzone - aligned_size,
- KASAN_GLOBAL_REDZONE);
+ KASAN_GLOBAL_REDZONE, false);
}
void __asan_register_globals(struct kasan_global *globals, size_t size)
@@ -292,11 +295,11 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
kasan_unpoison((const void *)(addr + rounded_down_size),
- size - rounded_down_size);
+ size - rounded_down_size, false);
kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_LEFT);
+ KASAN_ALLOCA_LEFT, false);
kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_RIGHT);
+ KASAN_ALLOCA_RIGHT, false);
}
EXPORT_SYMBOL(__asan_alloca_poison);
@@ -306,7 +309,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
if (unlikely(!stack_top || stack_top > stack_bottom))
return;
- kasan_unpoison(stack_top, stack_bottom - stack_top);
+ kasan_unpoison(stack_top, stack_bottom - stack_top, false);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 2aad21fda156..4ea8c368b5b8 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -25,6 +25,12 @@ enum kasan_arg {
KASAN_ARG_ON,
};
+enum kasan_arg_mode {
+ KASAN_ARG_MODE_DEFAULT,
+ KASAN_ARG_MODE_SYNC,
+ KASAN_ARG_MODE_ASYNC,
+};
+
enum kasan_arg_stacktrace {
KASAN_ARG_STACKTRACE_DEFAULT,
KASAN_ARG_STACKTRACE_OFF,
@@ -38,6 +44,7 @@ enum kasan_arg_fault {
};
static enum kasan_arg kasan_arg __ro_after_init;
+static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
static enum kasan_arg_fault kasan_arg_fault __ro_after_init;
@@ -45,6 +52,10 @@ static enum kasan_arg_fault kasan_arg_fault __ro_after_init;
DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
EXPORT_SYMBOL(kasan_flag_enabled);
+/* Whether the asynchronous mode is enabled. */
+bool kasan_flag_async __ro_after_init;
+EXPORT_SYMBOL_GPL(kasan_flag_async);
+
/* Whether to collect alloc/free stack traces. */
DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
@@ -68,6 +79,23 @@ static int __init early_kasan_flag(char *arg)
}
early_param("kasan", early_kasan_flag);
+/* kasan.mode=sync/async */
+static int __init early_kasan_mode(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "sync"))
+ kasan_arg_mode = KASAN_ARG_MODE_SYNC;
+ else if (!strcmp(arg, "async"))
+ kasan_arg_mode = KASAN_ARG_MODE_ASYNC;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.mode", early_kasan_mode);
+
/* kasan.stacktrace=off/on */
static int __init early_kasan_flag_stacktrace(char *arg)
{
@@ -115,7 +143,15 @@ void kasan_init_hw_tags_cpu(void)
return;
hw_init_tags(KASAN_TAG_MAX);
- hw_enable_tagging();
+
+ /*
+ * Enable async mode only when explicitly requested through
+ * the command line.
+ */
+ if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
+ hw_enable_tagging_async();
+ else
+ hw_enable_tagging_sync();
}
/* kasan_init_hw_tags() is called once on boot CPU. */
@@ -132,6 +168,22 @@ void __init kasan_init_hw_tags(void)
/* Enable KASAN. */
static_branch_enable(&kasan_flag_enabled);
+ switch (kasan_arg_mode) {
+ case KASAN_ARG_MODE_DEFAULT:
+ /*
+ * Default to sync mode.
+ * Do nothing, kasan_flag_async keeps its default value.
+ */
+ break;
+ case KASAN_ARG_MODE_SYNC:
+ /* Do nothing, kasan_flag_async keeps its default value. */
+ break;
+ case KASAN_ARG_MODE_ASYNC:
+ /* Async mode enabled. */
+ kasan_flag_async = true;
+ break;
+ }
+
switch (kasan_arg_stacktrace) {
case KASAN_ARG_STACKTRACE_DEFAULT:
/* Default to enabling stack trace collection. */
@@ -164,26 +216,36 @@ void __init kasan_init_hw_tags(void)
pr_info("KernelAddressSanitizer initialized\n");
}
-void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
+void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
{
- struct kasan_alloc_meta *alloc_meta;
+ /*
+ * This condition should match the one in post_alloc_hook() in
+ * page_alloc.c.
+ */
+ bool init = !want_init_on_free() && want_init_on_alloc(flags);
+
+ if (flags & __GFP_SKIP_KASAN_POISON)
+ SetPageSkipKASanPoison(page);
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (alloc_meta)
- kasan_set_track(&alloc_meta->free_track[0], GFP_NOWAIT);
+ if (flags & __GFP_ZEROTAGS) {
+ int i;
+
+ for (i = 0; i != 1 << order; ++i)
+ tag_clear_highpage(page + i);
+ } else {
+ kasan_unpoison_pages(page, order, init);
+ }
}
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
+void kasan_free_pages(struct page *page, unsigned int order)
{
- struct kasan_alloc_meta *alloc_meta;
-
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (!alloc_meta)
- return NULL;
+ /*
+ * This condition should match the one in free_pages_prepare() in
+ * page_alloc.c.
+ */
+ bool init = want_init_on_free();
- return &alloc_meta->free_track[0];
+ kasan_poison_pages(page, order, init);
}
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
@@ -194,10 +256,16 @@ void kasan_set_tagging_report_once(bool state)
}
EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once);
-void kasan_enable_tagging(void)
+void kasan_enable_tagging_sync(void)
+{
+ hw_enable_tagging_sync();
+}
+EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync);
+
+void kasan_force_async_fault(void)
{
- hw_enable_tagging();
+ hw_force_async_tag_fault();
}
-EXPORT_SYMBOL_GPL(kasan_enable_tagging);
+EXPORT_SYMBOL_GPL(kasan_force_async_fault);
#endif
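
The kasan.mode handling above reduces to a three-way string compare on the early parameter plus a switch at init time; sync stays the default and async is only enabled when explicitly requested on the command line. A minimal userspace sketch of the same parsing logic follows (the names and the main() harness are illustrative, not kernel API):

    #include <stdio.h>
    #include <string.h>

    enum mode { MODE_DEFAULT, MODE_SYNC, MODE_ASYNC };

    /* Mirrors early_kasan_mode(): a missing or unknown value is rejected. */
    static int parse_mode(const char *arg, enum mode *out)
    {
        if (!arg)
            return -1;
        if (!strcmp(arg, "sync"))
            *out = MODE_SYNC;
        else if (!strcmp(arg, "async"))
            *out = MODE_ASYNC;
        else
            return -1;
        return 0;
    }

    int main(void)
    {
        enum mode m = MODE_DEFAULT;

        if (parse_mode("async", &m) == 0 && m == MODE_ASYNC)
            printf("asynchronous tag checking selected\n");
        return 0;
    }

Booting with kasan=on kasan.mode=async would take the KASAN_ARG_MODE_ASYNC branch and set kasan_flag_async; leaving kasan.mode off keeps synchronous tag checking.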
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index c4605ac9837b..cc64ed6858c6 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -41,7 +41,7 @@ static inline bool kasan_p4d_table(pgd_t pgd)
}
#endif
#if CONFIG_PGTABLE_LEVELS > 3
-pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
+pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss;
static inline bool kasan_pud_table(p4d_t p4d)
{
return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
@@ -53,7 +53,7 @@ static inline bool kasan_pud_table(p4d_t p4d)
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2
-pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss;
static inline bool kasan_pmd_table(pud_t pud)
{
return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
@@ -64,7 +64,7 @@ static inline bool kasan_pmd_table(pud_t pud)
return false;
}
#endif
-pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]
+pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]
__page_aligned_bss;
static inline bool kasan_pte_table(pmd_t pmd)
@@ -220,8 +220,8 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
/**
* kasan_populate_early_shadow - populate shadow memory region with
* kasan_early_shadow_page
- * @shadow_start - start of the memory range to populate
- * @shadow_end - end of the memory range to populate
+ * @shadow_start: start of the memory range to populate
+ * @shadow_end: end of the memory range to populate
*/
int __ref kasan_populate_early_shadow(const void *shadow_start,
const void *shadow_end)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8c55634d6edd..98e3059bfea4 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -7,20 +7,37 @@
#include <linux/stackdepot.h>
#ifdef CONFIG_KASAN_HW_TAGS
+
#include <linux/static_key.h>
+
DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+extern bool kasan_flag_async __ro_after_init;
+
static inline bool kasan_stack_collection_enabled(void)
{
return static_branch_unlikely(&kasan_flag_stacktrace);
}
+
+static inline bool kasan_async_mode_enabled(void)
+{
+ return kasan_flag_async;
+}
#else
+
static inline bool kasan_stack_collection_enabled(void)
{
return true;
}
+
+static inline bool kasan_async_mode_enabled(void)
+{
+ return false;
+}
+
#endif
extern bool kasan_flag_panic __ro_after_init;
+extern bool kasan_flag_async __ro_after_init;
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
@@ -38,9 +55,9 @@ extern bool kasan_flag_panic __ro_after_init;
#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
#ifdef CONFIG_KASAN_HW_TAGS
-#define KASAN_TAG_MIN 0xF0 /* mimimum value for random tags */
+#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */
#else
-#define KASAN_TAG_MIN 0x00 /* mimimum value for random tags */
+#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */
#endif
#ifdef CONFIG_KASAN_GENERIC
@@ -136,7 +153,7 @@ struct kasan_track {
depot_stack_handle_t stack;
};
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+#if defined(CONFIG_KASAN_TAGS_IDENTIFY) && defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_NR_FREE_STACKS 5
#else
#define KASAN_NR_FREE_STACKS 1
@@ -146,14 +163,14 @@ struct kasan_alloc_meta {
struct kasan_track alloc_track;
#ifdef CONFIG_KASAN_GENERIC
/*
- * call_rcu() call stack is stored into struct kasan_alloc_meta.
+ * The auxiliary stack is stored into struct kasan_alloc_meta.
* The free stack is stored into struct kasan_free_meta.
*/
depot_stack_handle_t aux_stack[2];
#else
struct kasan_track free_track[KASAN_NR_FREE_STACKS];
#endif
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+#ifdef CONFIG_KASAN_TAGS_IDENTIFY
u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
u8 free_track_idx;
#endif
@@ -231,7 +248,7 @@ void *kasan_find_first_bad_addr(void *addr, size_t size);
const char *kasan_get_bug_type(struct kasan_access_info *info);
void kasan_metadata_fetch_row(char *buffer, void *row);
-#if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK
+#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK)
void kasan_print_address_stack_frame(const void *addr);
#else
static inline void kasan_print_address_stack_frame(const void *addr) { }
@@ -275,8 +292,11 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#ifdef CONFIG_KASAN_HW_TAGS
-#ifndef arch_enable_tagging
-#define arch_enable_tagging()
+#ifndef arch_enable_tagging_sync
+#define arch_enable_tagging_sync()
+#endif
+#ifndef arch_enable_tagging_async
+#define arch_enable_tagging_async()
#endif
#ifndef arch_init_tags
#define arch_init_tags(max_tag)
@@ -284,6 +304,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#ifndef arch_set_tagging_report_once
#define arch_set_tagging_report_once(state)
#endif
+#ifndef arch_force_async_tag_fault
+#define arch_force_async_tag_fault()
+#endif
#ifndef arch_get_random_tag
#define arch_get_random_tag() (0xFF)
#endif
@@ -291,19 +314,23 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define arch_get_mem_tag(addr) (0xFF)
#endif
#ifndef arch_set_mem_tag_range
-#define arch_set_mem_tag_range(addr, size, tag) ((void *)(addr))
+#define arch_set_mem_tag_range(addr, size, tag, init) ((void *)(addr))
#endif
-#define hw_enable_tagging() arch_enable_tagging()
+#define hw_enable_tagging_sync() arch_enable_tagging_sync()
+#define hw_enable_tagging_async() arch_enable_tagging_async()
#define hw_init_tags(max_tag) arch_init_tags(max_tag)
#define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state)
+#define hw_force_async_tag_fault() arch_force_async_tag_fault()
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
-#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag))
+#define hw_set_mem_tag_range(addr, size, tag, init) \
+ arch_set_mem_tag_range((addr), (size), (tag), (init))
#else /* CONFIG_KASAN_HW_TAGS */
-#define hw_enable_tagging()
+#define hw_enable_tagging_sync()
+#define hw_enable_tagging_async()
#define hw_set_tagging_report_once(state)
#endif /* CONFIG_KASAN_HW_TAGS */
@@ -311,12 +338,14 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
void kasan_set_tagging_report_once(bool state);
-void kasan_enable_tagging(void);
+void kasan_enable_tagging_sync(void);
+void kasan_force_async_fault(void);
#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
static inline void kasan_set_tagging_report_once(bool state) { }
-static inline void kasan_enable_tagging(void) { }
+static inline void kasan_enable_tagging_sync(void) { }
+static inline void kasan_force_async_fault(void) { }
#endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
@@ -330,7 +359,7 @@ static inline u8 kasan_random_tag(void) { return 0; }
#ifdef CONFIG_KASAN_HW_TAGS
-static inline void kasan_poison(const void *addr, size_t size, u8 value)
+static inline void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
addr = kasan_reset_tag(addr);
@@ -343,10 +372,10 @@ static inline void kasan_poison(const void *addr, size_t size, u8 value)
if (WARN_ON(size & KASAN_GRANULE_MASK))
return;
- hw_set_mem_tag_range((void *)addr, size, value);
+ hw_set_mem_tag_range((void *)addr, size, value, init);
}
-static inline void kasan_unpoison(const void *addr, size_t size)
+static inline void kasan_unpoison(const void *addr, size_t size, bool init)
{
u8 tag = get_tag(addr);
@@ -360,7 +389,7 @@ static inline void kasan_unpoison(const void *addr, size_t size)
return;
size = round_up(size, KASAN_GRANULE_SIZE);
- hw_set_mem_tag_range((void *)addr, size, tag);
+ hw_set_mem_tag_range((void *)addr, size, tag, init);
}
static inline bool kasan_byte_accessible(const void *addr)
@@ -368,33 +397,34 @@ static inline bool kasan_byte_accessible(const void *addr)
u8 ptr_tag = get_tag(addr);
u8 mem_tag = hw_get_mem_tag((void *)addr);
- return (mem_tag != KASAN_TAG_INVALID) &&
- (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag);
+ return ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag;
}
#else /* CONFIG_KASAN_HW_TAGS */
/**
- * kasan_poison - mark the memory range as unaccessible
+ * kasan_poison - mark the memory range as inaccessible
* @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
* @size - range size, must be aligned to KASAN_GRANULE_SIZE
* @value - value that's written to metadata for the range
+ * @init - whether to initialize the memory range (only for hardware tag-based)
*
* The size gets aligned to KASAN_GRANULE_SIZE before marking the range.
*/
-void kasan_poison(const void *addr, size_t size, u8 value);
+void kasan_poison(const void *addr, size_t size, u8 value, bool init);
/**
* kasan_unpoison - mark the memory range as accessible
* @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
* @size - range size, can be unaligned
+ * @init - whether to initialize the memory range (only for hardware tag-based)
*
* For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before
* marking the range.
* For the generic mode, the last granule of the memory range gets partially
* unpoisoned based on the @size.
*/
-void kasan_unpoison(const void *addr, size_t size);
+void kasan_unpoison(const void *addr, size_t size, bool init);
bool kasan_byte_accessible(const void *addr);
@@ -404,7 +434,7 @@ bool kasan_byte_accessible(const void *addr);
/**
* kasan_poison_last_granule - mark the last granule of the memory range as
- * unaccessible
+ * inaccessible
* @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
* @size - range size
*
@@ -419,6 +449,12 @@ static inline void kasan_poison_last_granule(const void *address, size_t size) {
#endif /* CONFIG_KASAN_GENERIC */
+#ifndef kasan_arch_is_ready
+static inline bool kasan_arch_is_ready(void) { return true; }
+#elif !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE)
+#error kasan_arch_is_ready only works in KASAN generic outline mode!
+#endif
+
/*
* Exported functions for interfaces called from assembly or from generated
* code. Declarations here to avoid warning about missing declarations.
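
Both kasan_poison() and kasan_unpoison() gain an init argument here, but the alignment contract is unchanged: poisoning wants a granule-aligned address and size, while unpoisoning rounds the size up to the granule. A small sketch of that arithmetic, assuming the 16-byte granule used by the tag-based modes (constants are illustrative):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    #define GRANULE      16UL           /* KASAN_GRANULE_SIZE for tag-based modes */
    #define GRANULE_MASK (GRANULE - 1)

    static size_t round_up_granule(size_t size)
    {
        return (size + GRANULE_MASK) & ~GRANULE_MASK;
    }

    int main(void)
    {
        unsigned long addr = 0x1000;    /* granule-aligned start */
        size_t size = 40;               /* not a granule multiple */

        assert((addr & GRANULE_MASK) == 0);     /* kasan_poison() WARNs otherwise */
        printf("unpoison covers %zu bytes\n", round_up_granule(size));  /* 48 */
        return 0;
    }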
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 728fb24c5683..d8ccff4c1275 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -27,7 +27,7 @@
/* Data structure and operations for quarantine queues. */
/*
- * Each queue is a signle-linked list, which also stores the total size of
+ * Each queue is a single-linked list, which also stores the total size of
* objects inside of it.
*/
struct qlist_head {
@@ -138,7 +138,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
local_irq_save(flags);
/*
- * As the object now gets freed from the quaratine, assume that its
+ * As the object now gets freed from the quarantine, assume that its
* free track is no longer valid.
*/
*(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 87b271206163..8fff1825b22c 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -87,7 +87,8 @@ static void start_report(unsigned long *flags)
static void end_report(unsigned long *flags, unsigned long addr)
{
- trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
+ if (!kasan_async_mode_enabled())
+ trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
@@ -229,7 +230,7 @@ static void print_address_description(void *addr, u8 tag)
{
struct page *page = kasan_addr_to_page(addr);
- dump_stack();
+ dump_stack_lvl(KERN_ERR);
pr_err("\n");
if (page && PageSlab(page)) {
@@ -360,6 +361,25 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
end_report(&flags, (unsigned long)object);
}
+#ifdef CONFIG_KASAN_HW_TAGS
+void kasan_report_async(void)
+{
+ unsigned long flags;
+
+#if IS_ENABLED(CONFIG_KUNIT)
+ if (current->kunit_test)
+ kasan_update_kunit_status(current->kunit_test);
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
+ start_report(&flags);
+ pr_err("BUG: KASAN: invalid-access\n");
+ pr_err("Asynchronous mode enabled: no access details available\n");
+ pr_err("\n");
+ dump_stack_lvl(KERN_ERR);
+ end_report(&flags, 0);
+}
+#endif /* CONFIG_KASAN_HW_TAGS */
+
static void __kasan_report(unsigned long addr, size_t size, bool is_write,
unsigned long ip)
{
@@ -400,7 +420,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
pr_err("\n");
print_memory_metadata(info.first_bad_addr);
} else {
- dump_stack();
+ dump_stack_lvl(KERN_ERR);
}
end_report(&flags, addr);
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 41f374585144..139615ef326b 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -128,7 +128,7 @@ void kasan_metadata_fetch_row(char *buffer, void *row)
memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
}
-#if CONFIG_KASAN_STACK
+#ifdef CONFIG_KASAN_STACK
static bool __must_check tokenize_frame_descr(const char **frame_descr,
char *token, size_t max_tok_len,
unsigned long *value)
@@ -148,7 +148,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr,
}
/* Copy token (+ 1 byte for '\0'). */
- strlcpy(token, *frame_descr, tok_len + 1);
+ strscpy(token, *frame_descr, tok_len + 1);
}
/* Advance frame_descr past separator. */
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
index 42b2168755d6..5dbbbb930e7a 100644
--- a/mm/kasan/report_hw_tags.c
+++ b/mm/kasan/report_hw_tags.c
@@ -15,11 +15,6 @@
#include "kasan.h"
-const char *kasan_get_bug_type(struct kasan_access_info *info)
-{
- return "invalid-access";
-}
-
void *kasan_find_first_bad_addr(void *addr, size_t size)
{
return kasan_reset_tag(addr);
diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c
index 3d20d3451d9e..d2298c357834 100644
--- a/mm/kasan/report_sw_tags.c
+++ b/mm/kasan/report_sw_tags.c
@@ -29,49 +29,6 @@
#include "kasan.h"
#include "../slab.h"
-const char *kasan_get_bug_type(struct kasan_access_info *info)
-{
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- struct kasan_alloc_meta *alloc_meta;
- struct kmem_cache *cache;
- struct page *page;
- const void *addr;
- void *object;
- u8 tag;
- int i;
-
- tag = get_tag(info->access_addr);
- addr = kasan_reset_tag(info->access_addr);
- page = kasan_addr_to_page(addr);
- if (page && PageSlab(page)) {
- cache = page->slab_cache;
- object = nearest_obj(cache, page, (void *)addr);
- alloc_meta = kasan_get_alloc_meta(cache, object);
-
- if (alloc_meta) {
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
- if (alloc_meta->free_pointer_tag[i] == tag)
- return "use-after-free";
- }
- }
- return "out-of-bounds";
- }
-
-#endif
- /*
- * If access_size is a negative number, then it has reason to be
- * defined as out-of-bounds bug type.
- *
- * Casting negative numbers to size_t would indeed turn up as
- * a large size_t and its value will be larger than ULONG_MAX/2,
- * so that this can qualify as out-of-bounds.
- */
- if (info->access_addr + info->access_size < info->access_addr)
- return "out-of-bounds";
-
- return "invalid-access";
-}
-
void *kasan_find_first_bad_addr(void *addr, size_t size)
{
u8 tag = get_tag(addr);
diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c
new file mode 100644
index 000000000000..8a319fc16dab
--- /dev/null
+++ b/mm/kasan/report_tags.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Copyright (c) 2020 Google, Inc.
+ */
+
+#include "kasan.h"
+#include "../slab.h"
+
+const char *kasan_get_bug_type(struct kasan_access_info *info)
+{
+#ifdef CONFIG_KASAN_TAGS_IDENTIFY
+ struct kasan_alloc_meta *alloc_meta;
+ struct kmem_cache *cache;
+ struct page *page;
+ const void *addr;
+ void *object;
+ u8 tag;
+ int i;
+
+ tag = get_tag(info->access_addr);
+ addr = kasan_reset_tag(info->access_addr);
+ page = kasan_addr_to_page(addr);
+ if (page && PageSlab(page)) {
+ cache = page->slab_cache;
+ object = nearest_obj(cache, page, (void *)addr);
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+
+ if (alloc_meta) {
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ return "use-after-free";
+ }
+ }
+ return "out-of-bounds";
+ }
+#endif
+
+ /*
+ * If access_size is a negative number, then it has reason to be
+ * defined as out-of-bounds bug type.
+ *
+ * Casting negative numbers to size_t would indeed turn up as
+ * a large size_t and its value will be larger than ULONG_MAX/2,
+ * so that this can qualify as out-of-bounds.
+ */
+ if (info->access_addr + info->access_size < info->access_addr)
+ return "out-of-bounds";
+
+ return "invalid-access";
+}
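
The classification above first looks for the access tag among the recorded free-time tags (use-after-free), falls back to out-of-bounds for slab objects, and treats a wrapped end address as out-of-bounds as well. A hedged userspace sketch of that decision, with a made-up metadata struct standing in for kasan_alloc_meta:

    #include <stddef.h>
    #include <stdio.h>

    #define NR_FREE_STACKS 5

    /* Simplified stand-in for the per-object metadata used above. */
    struct alloc_meta {
        unsigned char free_pointer_tag[NR_FREE_STACKS];
    };

    static const char *classify(const struct alloc_meta *meta, unsigned char ptr_tag,
                                unsigned long addr, size_t access_size)
    {
        if (meta) {
            for (int i = 0; i < NR_FREE_STACKS; i++)
                if (meta->free_pointer_tag[i] == ptr_tag)
                    return "use-after-free";
            return "out-of-bounds";
        }
        /* A "negative" size cast to size_t wraps the end address around. */
        if (addr + access_size < addr)
            return "out-of-bounds";
        return "invalid-access";
    }

    int main(void)
    {
        struct alloc_meta meta = { .free_pointer_tag = { 0xa3 } };

        printf("%s\n", classify(&meta, 0xa3, 0, 8));    /* use-after-free */
        return 0;
    }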
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 63f43443f5d7..8d95ee52d019 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -69,10 +69,13 @@ void *memcpy(void *dest, const void *src, size_t len)
return __memcpy(dest, src, len);
}
-void kasan_poison(const void *addr, size_t size, u8 value)
+void kasan_poison(const void *addr, size_t size, u8 value, bool init)
{
void *shadow_start, *shadow_end;
+ if (!kasan_arch_is_ready())
+ return;
+
/*
* Perform shadow offset calculation based on untagged address, as
* some of the callers (e.g. kasan_poison_object_data) pass tagged
@@ -99,6 +102,9 @@ EXPORT_SYMBOL(kasan_poison);
#ifdef CONFIG_KASAN_GENERIC
void kasan_poison_last_granule(const void *addr, size_t size)
{
+ if (!kasan_arch_is_ready())
+ return;
+
if (size & KASAN_GRANULE_MASK) {
u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size);
*shadow = size & KASAN_GRANULE_MASK;
@@ -106,7 +112,7 @@ void kasan_poison_last_granule(const void *addr, size_t size)
}
#endif
-void kasan_unpoison(const void *addr, size_t size)
+void kasan_unpoison(const void *addr, size_t size, bool init)
{
u8 tag = get_tag(addr);
@@ -129,7 +135,7 @@ void kasan_unpoison(const void *addr, size_t size)
return;
/* Unpoison all granules that cover the object. */
- kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag);
+ kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag, false);
/* Partially poison the last granule for the generic mode. */
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
@@ -316,7 +322,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
* // rest of vmalloc process <data dependency>
* STORE p, a LOAD shadow(x+99)
*
- * If there is no barrier between the end of unpoisioning the shadow
+ * If there is no barrier between the end of unpoisoning the shadow
* and the store of the result to p, the stores could be committed
* in a different order by CPU#0, and CPU#1 could erroneously observe
* poison in the shadow.
@@ -344,7 +350,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size)
return;
size = round_up(size, KASAN_GRANULE_SIZE);
- kasan_poison(start, size, KASAN_VMALLOC_INVALID);
+ kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
}
void kasan_unpoison_vmalloc(const void *start, unsigned long size)
@@ -352,7 +358,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size)
if (!is_vmalloc_or_module_addr(start))
return;
- kasan_unpoison(start, size);
+ kasan_unpoison(start, size, false);
}
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
@@ -384,7 +390,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
* How does this work?
* -------------------
*
- * We have a region that is page aligned, labelled as A.
+ * We have a region that is page aligned, labeled as A.
* That might not map onto the shadow in a way that is page-aligned:
*
* start end
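
kasan_poison() converts a granule-aligned memory range into a byte range in the shadow, and kasan_poison_last_granule() stores how many bytes of a partial trailing granule are still valid. A sketch of that mapping, assuming the generic mode's 8-byte granule; the shadow offset below is only an illustrative constant:

    #include <stdio.h>

    #define SHADOW_SCALE_SHIFT 3                    /* 8-byte granules, generic mode */
    #define GRANULE            (1UL << SHADOW_SCALE_SHIFT)
    #define SHADOW_OFFSET      0xdffffc0000000000UL /* illustrative value */

    static unsigned long mem_to_shadow(unsigned long addr)
    {
        return (addr >> SHADOW_SCALE_SHIFT) + SHADOW_OFFSET;
    }

    int main(void)
    {
        unsigned long addr = 0xffff888000100000UL;
        unsigned long size = 30;    /* three full granules plus 6 valid bytes */

        printf("shadow start: %#lx\n", mem_to_shadow(addr));
        printf("last shadow byte encodes: %lu\n", size & (GRANULE - 1));    /* 6 */
        return 0;
    }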
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 94c2d33be333..bd3f540feb47 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -121,10 +121,14 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
bool kasan_byte_accessible(const void *addr)
{
u8 tag = get_tag(addr);
- u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr)));
+ void *untagged_addr = kasan_reset_tag(addr);
+ u8 shadow_byte;
- return (shadow_byte != KASAN_TAG_INVALID) &&
- (tag == KASAN_TAG_KERNEL || tag == shadow_byte);
+ if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))
+ return false;
+
+ shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr));
+ return tag == KASAN_TAG_KERNEL || tag == shadow_byte;
}
#define DEFINE_HWASAN_LOAD_STORE(size) \
@@ -159,47 +163,13 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort);
void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
{
- kasan_poison((void *)addr, size, tag);
+ kasan_poison((void *)addr, size, tag, false);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
-void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
+void kasan_tag_mismatch(unsigned long addr, unsigned long access_info,
+ unsigned long ret_ip)
{
- struct kasan_alloc_meta *alloc_meta;
- u8 idx = 0;
-
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (!alloc_meta)
- return;
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- idx = alloc_meta->free_track_idx;
- alloc_meta->free_pointer_tag[idx] = tag;
- alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
-#endif
-
- kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
-}
-
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- struct kasan_alloc_meta *alloc_meta;
- int i = 0;
-
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (!alloc_meta)
- return NULL;
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
- if (alloc_meta->free_pointer_tag[i] == tag)
- break;
- }
- if (i == KASAN_NR_FREE_STACKS)
- i = alloc_meta->free_track_idx;
-#endif
-
- return &alloc_meta->free_track[i];
+ kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10,
+ ret_ip);
}
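
kasan_tag_mismatch() decodes the access_info word handed over by the outlined tag-check stubs: the low nibble is log2 of the access size and bit 4 distinguishes writes from reads. A minimal sketch of that decoding (harness only, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    /* Same arithmetic as kasan_tag_mismatch() above. */
    static void decode(unsigned long access_info, unsigned long *size, bool *write)
    {
        *size  = 1UL << (access_info & 0xf);
        *write = access_info & 0x10;
    }

    int main(void)
    {
        unsigned long size;
        bool write;

        decode(0x13, &size, &write);    /* log2(size) = 3, write bit set */
        printf("size=%lu write=%d\n", size, write);    /* size=8 write=1 */
        return 0;
    }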
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
new file mode 100644
index 000000000000..8f48b9502a17
--- /dev/null
+++ b/mm/kasan/tags.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains common tag-based KASAN code.
+ *
+ * Copyright (c) 2018 Google, Inc.
+ * Copyright (c) 2020 Google, Inc.
+ */
+
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/static_key.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "kasan.h"
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ u8 idx = 0;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return;
+
+#ifdef CONFIG_KASAN_TAGS_IDENTIFY
+ idx = alloc_meta->free_track_idx;
+ alloc_meta->free_pointer_tag[idx] = tag;
+ alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+ kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ int i = 0;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return NULL;
+
+#ifdef CONFIG_KASAN_TAGS_IDENTIFY
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ break;
+ }
+ if (i == KASAN_NR_FREE_STACKS)
+ i = alloc_meta->free_track_idx;
+#endif
+
+ return &alloc_meta->free_track[i];
+}
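
With CONFIG_KASAN_TAGS_IDENTIFY, kasan_set_free_info() records the pointer tag seen at each free in a small ring of KASAN_NR_FREE_STACKS slots so that a later report can match the faulting tag against recent frees. A compact sketch of the ring update, using a stand-in struct:

    #include <stdio.h>

    #define NR_FREE_STACKS 5

    struct meta {
        unsigned char free_pointer_tag[NR_FREE_STACKS];
        unsigned char free_track_idx;
    };

    /* Mirrors the index rotation in kasan_set_free_info(). */
    static void record_free(struct meta *m, unsigned char tag)
    {
        unsigned char idx = m->free_track_idx;

        m->free_pointer_tag[idx] = tag;
        m->free_track_idx = (idx + 1) % NR_FREE_STACKS;
    }

    int main(void)
    {
        struct meta m = { { 0 }, 0 };

        for (int i = 0; i < 7; i++)    /* wraps after five entries */
            record_free(&m, 0xf0 + i);
        printf("next slot: %d\n", m.free_track_idx);    /* 2 */
        return 0;
    }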
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index d53c91f881a4..4d21ac44d5d3 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -10,6 +10,7 @@
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
+#include <linux/irq_work.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
@@ -19,6 +20,7 @@
#include <linux/moduleparam.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
+#include <linux/sched/sysctl.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -372,6 +374,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
/* Restore page protection if there was an OOB access. */
if (meta->unprotected_page) {
+ memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
kfence_protect(meta->unprotected_page);
meta->unprotected_page = 0;
}
@@ -586,6 +589,17 @@ late_initcall(kfence_debugfs_init);
/* === Allocation Gate Timer ================================================ */
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* Wait queue to wake up allocation-gate timer task. */
+static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
+
+static void wake_up_kfence_timer(struct irq_work *work)
+{
+ wake_up(&allocation_wait);
+}
+static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
+#endif
+
/*
* Set up delayed work, which will enable and disable the static key. We need to
* use a work queue (rather than a simple timer), since enabling and disabling a
@@ -603,29 +617,27 @@ static void toggle_allocation_gate(struct work_struct *work)
if (!READ_ONCE(kfence_enabled))
return;
- /* Enable static key, and await allocation to happen. */
atomic_set(&kfence_allocation_gate, 0);
#ifdef CONFIG_KFENCE_STATIC_KEYS
+ /* Enable static key, and await allocation to happen. */
static_branch_enable(&kfence_allocation_key);
- /*
- * Await an allocation. Timeout after 1 second, in case the kernel stops
- * doing allocations, to avoid stalling this worker task for too long.
- */
- {
- unsigned long end_wait = jiffies + HZ;
-
- do {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (atomic_read(&kfence_allocation_gate) != 0)
- break;
- schedule_timeout(1);
- } while (time_before(jiffies, end_wait));
- __set_current_state(TASK_RUNNING);
+
+ if (sysctl_hung_task_timeout_secs) {
+ /*
+ * During low activity with no allocations we might wait a
+ * while; let's avoid the hung task warning.
+ */
+ wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
+ sysctl_hung_task_timeout_secs * HZ / 2);
+ } else {
+ wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
}
+
/* Disable static key and reset timer. */
static_branch_disable(&kfence_allocation_key);
#endif
- schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval));
+ queue_delayed_work(system_power_efficient_wq, &kfence_timer,
+ msecs_to_jiffies(kfence_sample_interval));
}
static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
@@ -654,7 +666,7 @@ void __init kfence_init(void)
}
WRITE_ONCE(kfence_enabled, true);
- schedule_delayed_work(&kfence_timer, 0);
+ queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0);
pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
(void *)(__kfence_pool + KFENCE_POOL_SIZE));
@@ -728,6 +740,19 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
*/
if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
return NULL;
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+ /*
+ * waitqueue_active() is fully ordered after the update of
+ * kfence_allocation_gate per atomic_inc_return().
+ */
+ if (waitqueue_active(&allocation_wait)) {
+ /*
+ * Calling wake_up() here may deadlock when allocations happen
+ * from within timer code. Use an irq_work to defer it.
+ */
+ irq_work_queue(&wake_up_kfence_timer_work);
+ }
+#endif
if (!READ_ONCE(kfence_enabled))
return NULL;
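
The reworked allocation gate keeps the same fast path: only the first allocation after toggle_allocation_gate() resets the counter goes through KFENCE, and the waitqueue plus irq_work merely wake the timer task without calling wake_up() from allocation context. A userspace sketch of the gate check itself, using C11 atomics (the helper names are made up):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int gate;    /* 0 = open, >0 = already claimed */

    /* Mirrors the kfence_allocation_gate test in __kfence_alloc():
     * only the caller that moves the counter from 0 to 1 wins. */
    static bool try_take_gate(void)
    {
        if (atomic_load(&gate))
            return false;
        return atomic_fetch_add(&gate, 1) + 1 == 1;
    }

    int main(void)
    {
        atomic_store(&gate, 0);    /* toggle_allocation_gate() reopens it */
        printf("%d %d\n", try_take_gate(), try_take_gate());    /* 1 0 */
        return 0;
    }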
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 4acf4251ee04..7f24b9bcb2ec 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -197,7 +197,7 @@ static void test_cache_destroy(void)
static inline size_t kmalloc_cache_alignment(size_t size)
{
- return kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]->align;
+ return kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]->align;
}
/* Must always inline to match stack trace against caller. */
@@ -267,7 +267,8 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
if (is_kfence_address(alloc)) {
struct page *page = virt_to_head_page(alloc);
- struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)];
+ struct kmem_cache *s = test_cache ?:
+ kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)];
/*
* Verify that various helpers return the right values
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index e3f71451ad9e..2a319c21c939 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -263,6 +263,6 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
if (panic_on_warn)
panic("panic_on_warn set ...\n");
- /* We encountered a memory unsafety error, taint the kernel! */
+ /* We encountered a memory safety error, taint the kernel! */
add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a7d6cb912b05..6c0185fdd815 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -481,7 +481,7 @@ int __khugepaged_enter(struct mm_struct *mm)
return -ENOMEM;
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
+ VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
@@ -667,7 +667,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*
* The page table that maps the page has been already unlinked
* from the page table tree and this process cannot get
- * an additinal pin on the page.
+ * an additional pin on the page.
*
* New pins can come later if the page is shared across fork,
* but not from this process. The other process cannot write to
@@ -716,17 +716,17 @@ next:
if (pte_write(pteval))
writable = true;
}
- if (likely(writable)) {
- if (likely(referenced)) {
- result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
- referenced, writable, result);
- return 1;
- }
- } else {
+
+ if (unlikely(!writable)) {
result = SCAN_PAGE_RO;
+ } else if (unlikely(!referenced)) {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ } else {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return 1;
}
-
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
@@ -809,7 +809,7 @@ static bool khugepaged_scan_abort(int nid)
* If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally.
*/
- if (!node_reclaim_mode)
+ if (!node_reclaim_enabled())
return false;
/* If there is a count for this node already, it must be acceptable */
@@ -1128,10 +1128,10 @@ static void collapse_huge_page(struct mm_struct *mm,
mmap_write_lock(mm);
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
- goto out;
+ goto out_up_write;
/* check if the pmd is still valid */
if (mm_find_pmd(mm, address) != pmd)
- goto out;
+ goto out_up_write;
anon_vma_lock_write(vma->anon_vma);
@@ -1171,7 +1171,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
result = SCAN_FAIL;
- goto out;
+ goto out_up_write;
}
/*
@@ -1183,19 +1183,18 @@ static void collapse_huge_page(struct mm_struct *mm,
__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
&compound_pagelist);
pte_unmap(pte);
+ /*
+ * spin_lock() below is not the equivalent of smp_wmb(), but
+ * the smp_wmb() inside __SetPageUptodate() can be reused to
+ * avoid the copy_huge_page writes to become visible after
+ * the set_pmd_at() write.
+ */
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
- /*
- * spin_lock() below is not the equivalent of smp_wmb(), so
- * this is needed to avoid the copy_huge_page writes to become
- * visible after the set_pmd_at() write.
- */
- smp_wmb();
-
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
@@ -1216,8 +1215,6 @@ out_nolock:
mem_cgroup_uncharge(*hpage);
trace_mm_collapse_huge_page(mm, isolated, result);
return;
-out:
- goto out_up_write;
}
static int khugepaged_scan_pmd(struct mm_struct *mm,
@@ -1274,10 +1271,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
}
- if (!pte_present(pteval)) {
- result = SCAN_PTE_NON_PRESENT;
- goto out_unmap;
- }
if (pte_uffd_wp(pteval)) {
/*
* Don't collapse the page if any of the small
@@ -1447,7 +1440,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
int i;
if (!vma || !vma->vm_file ||
- vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+ !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
return;
/*
@@ -1533,16 +1526,16 @@ abort:
goto drop_hpage;
}
-static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
{
struct mm_struct *mm = mm_slot->mm;
int i;
if (likely(mm_slot->nr_pte_mapped_thp == 0))
- return 0;
+ return;
if (!mmap_write_trylock(mm))
- return -EBUSY;
+ return;
if (unlikely(khugepaged_test_exit(mm)))
goto out;
@@ -1553,7 +1546,6 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
out:
mm_slot->nr_pte_mapped_thp = 0;
mmap_write_unlock(mm);
- return 0;
}
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
@@ -2057,9 +2049,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
BUILD_BUG();
}
-static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
{
- return 0;
}
#endif
@@ -2205,11 +2196,9 @@ static void khugepaged_do_scan(void)
{
struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
- unsigned int pages = khugepaged_pages_to_scan;
+ unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
bool wait = true;
- barrier(); /* write khugepaged_pages_to_scan to local stack */
-
lru_add_drain_all();
while (progress < pages) {
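
The restructured tail of __collapse_huge_page_isolate() makes the failure reason explicit: read-only wins over lack of references, and only a writable, referenced range succeeds. A tiny sketch of that ordering (enum values here are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    enum scan_result { SCAN_SUCCEED, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE };

    /* Same precedence as the rewritten checks above. */
    static enum scan_result isolate_result(bool writable, bool referenced)
    {
        if (!writable)
            return SCAN_PAGE_RO;
        if (!referenced)
            return SCAN_LACK_REFERENCED_PAGE;
        return SCAN_SUCCEED;
    }

    int main(void)
    {
        printf("%d %d %d\n", isolate_result(true, true),
               isolate_result(false, true), isolate_result(true, false));    /* 0 1 2 */
        return 0;
    }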
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index fe6e3ae8e8c6..228a2fbe0657 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -219,7 +219,7 @@ static struct task_struct *scan_thread;
static unsigned long jiffies_min_age;
static unsigned long jiffies_last_scan;
/* delay between automatic memory scannings */
-static signed long jiffies_scan_wait;
+static unsigned long jiffies_scan_wait;
/* enables or disables the task stacks scanning */
static int kmemleak_stack_scan = 1;
/* protects the memory scanning, parameters and debug/kmemleak file access */
@@ -1203,7 +1203,7 @@ static void update_refs(struct kmemleak_object *object)
}
/*
- * Memory scanning is a long process and it needs to be interruptable. This
+ * Memory scanning is a long process and it needs to be interruptible. This
* function checks whether such interrupt condition occurred.
*/
static int scan_should_stop(void)
@@ -1567,7 +1567,7 @@ static int kmemleak_scan_thread(void *arg)
}
while (!kthread_should_stop()) {
- signed long timeout = jiffies_scan_wait;
+ signed long timeout = READ_ONCE(jiffies_scan_wait);
mutex_lock(&scan_mutex);
kmemleak_scan();
@@ -1807,14 +1807,20 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
else if (strncmp(buf, "scan=off", 8) == 0)
stop_scan_thread();
else if (strncmp(buf, "scan=", 5) == 0) {
- unsigned long secs;
+ unsigned secs;
+ unsigned long msecs;
- ret = kstrtoul(buf + 5, 0, &secs);
+ ret = kstrtouint(buf + 5, 0, &secs);
if (ret < 0)
goto out;
+
+ msecs = secs * MSEC_PER_SEC;
+ if (msecs > UINT_MAX)
+ msecs = UINT_MAX;
+
stop_scan_thread();
- if (secs) {
- jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
+ if (msecs) {
+ WRITE_ONCE(jiffies_scan_wait, msecs_to_jiffies(msecs));
start_scan_thread();
}
} else if (strncmp(buf, "scan", 4) == 0)
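
The scan= handler now parses the interval as an unsigned int and clamps the millisecond value before it reaches msecs_to_jiffies(), so an oversized input can no longer wrap. A sketch of the conversion and clamp (wider types are used here purely to keep the userspace example portable):

    #include <limits.h>
    #include <stdio.h>

    #define MSEC_PER_SEC 1000ULL

    /* Mirrors the clamp added to kmemleak_write() above. */
    static unsigned long long secs_to_clamped_msecs(unsigned int secs)
    {
        unsigned long long msecs = secs * MSEC_PER_SEC;

        if (msecs > UINT_MAX)
            msecs = UINT_MAX;
        return msecs;
    }

    int main(void)
    {
        printf("%llu\n", secs_to_clamped_msecs(600));        /* 600000 */
        printf("%llu\n", secs_to_clamped_msecs(5000000));    /* clamped to UINT_MAX */
        return 0;
    }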
diff --git a/mm/ksm.c b/mm/ksm.c
index 9694ee2c71de..3fa9bc8a67cf 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -215,8 +215,6 @@ struct rmap_item {
#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
#define STABLE_FLAG 0x200 /* is listed from the stable tree */
-#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
- /* to mask all the flags */
/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
@@ -461,7 +459,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
- * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ * mmap of /dev/mem, where we would not want to touch it.
*
* FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
* of the process that owns 'vma'. We also do not want to enforce
@@ -523,10 +521,8 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
struct vm_area_struct *vma;
if (ksm_test_exit(mm))
return NULL;
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
- return NULL;
- if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ vma = vma_lookup(mm, addr);
+ if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
return NULL;
return vma;
}
@@ -794,6 +790,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
+ rmap_item->head = NULL;
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -817,8 +814,7 @@ out:
cond_resched(); /* we're called from many long loops */
}
-static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
- struct rmap_item **rmap_list)
+static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
{
while (*rmap_list) {
struct rmap_item *rmap_item = *rmap_list;
@@ -989,7 +985,7 @@ static int unmerge_and_remove_all_rmap_items(void)
goto error;
}
- remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+ remove_trailing_rmap_items(&mm_slot->rmap_list);
mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
@@ -1068,7 +1064,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
/*
* Ok this is tricky, when get_user_pages_fast() run it doesn't
* take any lock, therefore the check that we are going to make
- * with the pagecount against the mapcount is racey and
+ * with the pagecount against the mapcount is racy and
* O_DIRECT can happen right after the check.
* So we clear the pte and flush the tlb before the check
* this assure us that no O_DIRECT can happen after the check
@@ -1438,7 +1434,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
*/
*_stable_node = found;
/*
- * Just for robustneess as stable_node is
+ * Just for robustness, as stable_node is
* otherwise left as a stable pointer, the
* compiler shall optimize it away at build
* time.
@@ -1771,7 +1767,6 @@ chain_append:
* stable_node_dup is the dup to replace.
*/
if (stable_node_dup == stable_node) {
- VM_BUG_ON(is_stable_node_chain(stable_node_dup));
VM_BUG_ON(is_stable_node_dup(stable_node_dup));
/* chain is missing so create it */
stable_node = alloc_stable_node_chain(stable_node_dup,
@@ -1785,7 +1780,6 @@ chain_append:
* of the current nid for this page
* content.
*/
- VM_BUG_ON(!is_stable_node_chain(stable_node));
VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
VM_BUG_ON(page_node->head != &migrate_nodes);
list_del(&page_node->list);
@@ -2337,7 +2331,7 @@ next_mm:
* Nuke all the rmap_items that are above this current rmap:
* because there were no VM_MERGEABLE vmas with such addresses.
*/
- remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
+ remove_trailing_rmap_items(ksm_scan.rmap_list);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(slot->mm_list.next,
@@ -2634,7 +2628,7 @@ again:
vma = vmac->vma;
/* Ignore the stable/unstable/sqnr flags */
- addr = rmap_item->address & ~KSM_FLAG_MASK;
+ addr = rmap_item->address & PAGE_MASK;
if (addr < vma->vm_start || addr >= vma->vm_end)
continue;
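
find_mergeable_vma() now relies on vma_lookup(), which only returns a VMA that actually contains the address, so the separate vm_start check disappears. A simplified sketch of the difference, with a single-VMA stand-in instead of the real VMA tree lookup:

    #include <stddef.h>
    #include <stdio.h>

    struct vma { unsigned long vm_start, vm_end; };

    /* find_vma(): first VMA ending above addr, which may lie entirely above it. */
    static struct vma *find_vma_stub(struct vma *v, unsigned long addr)
    {
        return addr < v->vm_end ? v : NULL;
    }

    /* vma_lookup(): only a VMA that contains addr, as the new code expects. */
    static struct vma *vma_lookup_stub(struct vma *v, unsigned long addr)
    {
        struct vma *r = find_vma_stub(v, addr);

        return (r && r->vm_start <= addr) ? r : NULL;
    }

    int main(void)
    {
        struct vma v = { 0x2000, 0x3000 };

        printf("%d %d\n", find_vma_stub(&v, 0x1000) != NULL,
               vma_lookup_stub(&v, 0x1000) != NULL);    /* 1 0 */
        return 0;
    }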
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 6f067b6b935f..cd58790d0fb3 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
list_add_tail(item, &l->list);
/* Set shrinker bit if the first element was added */
if (!l->nr_items++)
- memcg_set_shrinker_bit(memcg, nid,
- lru_shrinker_id(lru));
+ set_shrinker_bit(memcg, nid,
+ lru_shrinker_id(lru));
nlru->nr_items++;
spin_unlock(&nlru->lock);
return true;
@@ -540,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
if (src->nr_items) {
dst->nr_items += src->nr_items;
- memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
+ set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
src->nr_items = 0;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 01fef79ac761..63e489e5bfdb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -799,7 +799,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
- * vma was splitted while the mmap_lock was
+ * vma was split while the mmap_lock was
* released the effect of the concurrent
* operation may not cause madvise() to
* have an undefined result. There may be an
@@ -1039,7 +1039,7 @@ process_madvise_behavior_valid(int behavior)
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
* MADV_COLD - the application is not expected to use this memory soon,
* deactivate pages in this range so that they can be reclaimed
- * easily if memory pressure hanppens.
+ * easily if memory pressure happens.
* MADV_PAGEOUT - the application is not expected to use this memory soon,
* page out the pages in this range immediately.
*
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index b59054ef2e10..b890854ec761 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -165,10 +165,12 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
return 0;
}
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/* Huge pud */
walk->action = ACTION_CONTINUE;
if (pud_trans_huge(pudval) || pud_devmap(pudval))
WARN_ON(pud_write(pudval) || pud_dirty(pudval));
+#endif
return 0;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index afaefa8fc6ab..123feef5259d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -92,7 +92,7 @@
* system initialization completes.
*/
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
struct pglist_data __refdata contig_page_data;
EXPORT_SYMBOL(contig_page_data);
#endif
@@ -607,7 +607,7 @@ repeat:
* area, insert that portion.
*/
if (rbase > base) {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
WARN_ON(flags != rgn->flags);
@@ -1205,7 +1205,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
struct memblock_type *type, int nid)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
int start_rgn, end_rgn;
int i, ret;
@@ -1849,7 +1849,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
size = rgn->size;
end = base + size - 1;
flags = rgn->flags;
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1fa9b00ec71d..b80aae448a49 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -78,12 +78,13 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
-static bool cgroup_memory_nokmem __ro_after_init;
+bool cgroup_memory_nokmem __ro_after_init;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
@@ -215,7 +216,7 @@ enum res_type {
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
-/* Used for OOM nofiier */
+/* Used for OOM notifier */
#define OOM_CONTROL (0)
/*
@@ -260,15 +261,12 @@ bool mem_cgroup_kmem_disabled(void)
return cgroup_memory_nokmem;
}
-static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages);
-static void __memcg_kmem_uncharge(struct mem_cgroup *memcg,
- unsigned int nr_pages);
+static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+ unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
- struct mem_cgroup *memcg;
unsigned int nr_bytes;
unsigned int nr_pages;
unsigned long flags;
@@ -297,12 +295,11 @@ static void obj_cgroup_release(struct percpu_ref *ref)
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
- spin_lock_irqsave(&css_set_lock, flags);
- memcg = obj_cgroup_memcg(objcg);
if (nr_pages)
- __memcg_kmem_uncharge(memcg, nr_pages);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
+
+ spin_lock_irqsave(&css_set_lock, flags);
list_del(&objcg->list);
- mem_cgroup_put(memcg);
spin_unlock_irqrestore(&css_set_lock, flags);
percpu_ref_exit(ref);
@@ -337,17 +334,12 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
spin_lock_irq(&css_set_lock);
- /* Move active objcg to the parent's list */
- xchg(&objcg->memcg, parent);
- css_get(&parent->css);
- list_add(&objcg->list, &parent->objcg_list);
-
- /* Move already reparented objcgs to the parent's list */
- list_for_each_entry(iter, &memcg->objcg_list, list) {
- css_get(&parent->css);
- xchg(&iter->memcg, parent);
- css_put(&memcg->css);
- }
+ /* 1) Ready to reparent active objcg. */
+ list_add(&objcg->list, &memcg->objcg_list);
+ /* 2) Reparent active objcg and already reparented objcgs to parent. */
+ list_for_each_entry(iter, &memcg->objcg_list, list)
+ WRITE_ONCE(iter->memcg, parent);
+ /* 3) Move already reparented objcgs to the parent's list */
list_splice(&memcg->objcg_list, &parent->objcg_list);
spin_unlock_irq(&css_set_lock);
@@ -407,129 +399,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif
-static int memcg_shrinker_map_size;
-static DEFINE_MUTEX(memcg_shrinker_map_mutex);
-
-static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
-{
- kvfree(container_of(head, struct memcg_shrinker_map, rcu));
-}
-
-static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
- int size, int old_size)
-{
- struct memcg_shrinker_map *new, *old;
- int nid;
-
- lockdep_assert_held(&memcg_shrinker_map_mutex);
-
- for_each_node(nid) {
- old = rcu_dereference_protected(
- mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
- /* Not yet online memcg */
- if (!old)
- return 0;
-
- new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
- if (!new)
- return -ENOMEM;
-
- /* Set all old bits, clear all new bits */
- memset(new->map, (int)0xff, old_size);
- memset((void *)new->map + old_size, 0, size - old_size);
-
- rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
- call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
- }
-
- return 0;
-}
-
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
-{
- struct mem_cgroup_per_node *pn;
- struct memcg_shrinker_map *map;
- int nid;
-
- if (mem_cgroup_is_root(memcg))
- return;
-
- for_each_node(nid) {
- pn = mem_cgroup_nodeinfo(memcg, nid);
- map = rcu_dereference_protected(pn->shrinker_map, true);
- kvfree(map);
- rcu_assign_pointer(pn->shrinker_map, NULL);
- }
-}
-
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
- struct memcg_shrinker_map *map;
- int nid, size, ret = 0;
-
- if (mem_cgroup_is_root(memcg))
- return 0;
-
- mutex_lock(&memcg_shrinker_map_mutex);
- size = memcg_shrinker_map_size;
- for_each_node(nid) {
- map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
- if (!map) {
- memcg_free_shrinker_maps(memcg);
- ret = -ENOMEM;
- break;
- }
- rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
- }
- mutex_unlock(&memcg_shrinker_map_mutex);
-
- return ret;
-}
-
-int memcg_expand_shrinker_maps(int new_id)
-{
- int size, old_size, ret = 0;
- struct mem_cgroup *memcg;
-
- size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
- old_size = memcg_shrinker_map_size;
- if (size <= old_size)
- return 0;
-
- mutex_lock(&memcg_shrinker_map_mutex);
- if (!root_mem_cgroup)
- goto unlock;
-
- for_each_mem_cgroup(memcg) {
- if (mem_cgroup_is_root(memcg))
- continue;
- ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
- if (ret) {
- mem_cgroup_iter_break(NULL, memcg);
- goto unlock;
- }
- }
-unlock:
- if (!ret)
- memcg_shrinker_map_size = size;
- mutex_unlock(&memcg_shrinker_map_mutex);
- return ret;
-}
-
-void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
-{
- if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
- struct memcg_shrinker_map *map;
-
- rcu_read_lock();
- map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
- /* Pairs with smp mb in shrink_slab() */
- smp_mb__before_atomic();
- set_bit(shrinker_id, map->map);
- rcu_read_unlock();
- }
-}
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -718,7 +587,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
int nid;
for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(memcg, nid);
+ mz = memcg->nodeinfo[nid];
mctz = soft_limit_tree_node(nid);
if (mctz)
mem_cgroup_remove_exceeded(mz, mctz);
@@ -769,28 +638,37 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
*/
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- long x, threshold = MEMCG_CHARGE_BATCH;
-
if (mem_cgroup_disabled())
return;
- if (memcg_stat_item_in_bytes(idx))
- threshold <<= PAGE_SHIFT;
+ __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+ cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+}
- x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
- if (unlikely(abs(x) > threshold)) {
- struct mem_cgroup *mi;
+/* idx can be of type enum memcg_stat_item or node_stat_item. */
+static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = READ_ONCE(memcg->vmstats.state[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(memcg->vmstats_local->stat[idx], x);
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmstats[idx]);
+/* idx can be of type enum memcg_stat_item or node_stat_item. */
+static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
+{
+ long x = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
+#ifdef CONFIG_SMP
+ if (x < 0)
x = 0;
- }
- __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+#endif
+ return x;
}
static struct mem_cgroup_per_node *
@@ -801,7 +679,7 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
parent = parent_mem_cgroup(pn->memcg);
if (!parent)
return NULL;
- return mem_cgroup_nodeinfo(parent, nid);
+ return parent->nodeinfo[nid];
}
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
@@ -860,18 +738,22 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
int val)
{
struct page *head = compound_head(page); /* rmap on tail pages */
- struct mem_cgroup *memcg = page_memcg(head);
+ struct mem_cgroup *memcg;
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
+ rcu_read_lock();
+ memcg = page_memcg(head);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
+ rcu_read_unlock();
__mod_node_page_state(pgdat, idx, val);
return;
}
lruvec = mem_cgroup_lruvec(memcg, pgdat);
__mod_lruvec_state(lruvec, idx, val);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(__mod_lruvec_page_state);
@@ -899,39 +781,43 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
rcu_read_unlock();
}
+/*
+ * mod_objcg_mlstate() may be called with irq enabled, so
+ * mod_memcg_lruvec_state() should be used.
+ */
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ rcu_read_unlock();
+}
+
/**
* __count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
- * @count: the number of events that occured
+ * @count: the number of events that occurred
*/
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
- unsigned long x;
-
if (mem_cgroup_disabled())
return;
- x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
- if (unlikely(x > MEMCG_CHARGE_BATCH)) {
- struct mem_cgroup *mi;
-
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(memcg->vmstats_local->events[idx], x);
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmevents[idx]);
- x = 0;
- }
- __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+ __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+ cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
- return atomic_long_read(&memcg->vmevents[event]);
+ return READ_ONCE(memcg->vmstats.events[event]);
}
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
@@ -940,7 +826,7 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
int cpu;
for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_local->events[event], cpu);
+ x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
return x;
}
@@ -1017,13 +903,24 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
}
EXPORT_SYMBOL(mem_cgroup_from_task);
+static __always_inline struct mem_cgroup *active_memcg(void)
+{
+ if (in_interrupt())
+ return this_cpu_read(int_active_memcg);
+ else
+ return current->active_memcg;
+}
+
/**
* get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
* @mm: mm from which memcg should be extracted. It can be NULL.
*
- * Obtain a reference on mm->memcg and returns it if successful. Otherwise
- * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
- * returned.
+ * Obtain a reference on mm->memcg and returns it if successful. If mm
+ * is NULL, then the memcg is chosen as follows:
+ * 1) The active memcg, if set.
+ * 2) current->mm->memcg, if available
+ * 3) root memcg
+ * If mem_cgroup is disabled, NULL is returned.
*/
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
@@ -1032,48 +929,38 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
if (mem_cgroup_disabled())
return NULL;
+ /*
+ * Page cache insertions can happen without an
+ * actual mm context, e.g. during disk probing
+ * on boot, loopback IO, acct() writes etc.
+ *
+ * No need to css_get on root memcg as the reference
+ * counting is disabled on the root level in the
+ * cgroup core. See CSS_NO_REF.
+ */
+ if (unlikely(!mm)) {
+ memcg = active_memcg();
+ if (unlikely(memcg)) {
+ /* remote memcg must hold a ref */
+ css_get(&memcg->css);
+ return memcg;
+ }
+ mm = current->mm;
+ if (unlikely(!mm))
+ return root_mem_cgroup;
+ }
+
rcu_read_lock();
do {
- /*
- * Page cache insertions can happen withou an
- * actual mm context, e.g. during disk probing
- * on boot, loopback IO, acct() writes etc.
- */
- if (unlikely(!mm))
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
memcg = root_mem_cgroup;
- else {
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
- memcg = root_mem_cgroup;
- }
} while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
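
With the hunk above, the NULL-@mm case is resolved before taking the RCU-protected mm->owner path, in the order spelled out in the comment: remote active memcg, then current->mm, then the root memcg. A tiny user-space model of that selection order (names are illustrative, not kernel APIs):

#include <stdio.h>

struct memcg { const char *name; };

static struct memcg root_memcg_model = { "root" };

/* Model of the fallback order used when the caller passes mm == NULL. */
static struct memcg *pick_memcg(struct memcg *active, struct memcg *mm_memcg)
{
	if (active)			/* 1) remote active memcg, if set */
		return active;
	if (mm_memcg)			/* 2) current->mm's memcg, if available */
		return mm_memcg;
	return &root_memcg_model;	/* 3) otherwise the root memcg */
}

int main(void)
{
	struct memcg a = { "active" }, m = { "mm" };

	printf("%s %s %s\n", pick_memcg(&a, &m)->name,
	       pick_memcg(NULL, &m)->name, pick_memcg(NULL, NULL)->name);
	return 0;
}
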
-static __always_inline struct mem_cgroup *active_memcg(void)
-{
- if (in_interrupt())
- return this_cpu_read(int_active_memcg);
- else
- return current->active_memcg;
-}
-
-static __always_inline struct mem_cgroup *get_active_memcg(void)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = active_memcg();
- /* remote memcg must hold a ref. */
- if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
- rcu_read_unlock();
-
- return memcg;
-}
-
static __always_inline bool memcg_kmem_bypass(void)
{
/* Allow remote memcg charging from any context. */
@@ -1088,20 +975,6 @@ static __always_inline bool memcg_kmem_bypass(void)
}
/**
- * If active memcg is set, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
-{
- if (memcg_kmem_bypass())
- return NULL;
-
- if (unlikely(active_memcg()))
- return get_active_memcg();
-
- return get_mem_cgroup_from_mm(current->mm);
-}
-
-/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
@@ -1141,7 +1014,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (reclaim) {
struct mem_cgroup_per_node *mz;
- mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
+ mz = root->nodeinfo[reclaim->pgdat->node_id];
iter = &mz->iter;
if (prev && reclaim->generation != iter->generation)
@@ -1243,7 +1116,7 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
int nid;
for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(from, nid);
+ mz = from->nodeinfo[nid];
iter = &mz->iter;
cmpxchg(&iter->position, dead_memcg, NULL);
}
@@ -1337,9 +1210,8 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
struct lruvec *lock_page_lruvec(struct page *page)
{
struct lruvec *lruvec;
- struct pglist_data *pgdat = page_pgdat(page);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = mem_cgroup_page_lruvec(page);
spin_lock(&lruvec->lru_lock);
lruvec_memcg_debug(lruvec, page);
@@ -1350,9 +1222,8 @@ struct lruvec *lock_page_lruvec(struct page *page)
struct lruvec *lock_page_lruvec_irq(struct page *page)
{
struct lruvec *lruvec;
- struct pglist_data *pgdat = page_pgdat(page);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = mem_cgroup_page_lruvec(page);
spin_lock_irq(&lruvec->lru_lock);
lruvec_memcg_debug(lruvec, page);
@@ -1363,9 +1234,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page)
struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
{
struct lruvec *lruvec;
- struct pglist_data *pgdat = page_pgdat(page);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = mem_cgroup_page_lruvec(page);
spin_lock_irqsave(&lruvec->lru_lock, *flags);
lruvec_memcg_debug(lruvec, page);
@@ -1576,6 +1446,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
*
* Current memory state:
*/
+ cgroup_rstat_flush(memcg->css.cgroup);
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
@@ -1870,7 +1741,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
/*
- * Be careful about under_oom underflows becase a child memcg
+ * Be careful about under_oom underflows because a child memcg
* could have been added after mem_cgroup_mark_under_oom.
*/
spin_lock(&memcg_oom_lock);
@@ -2042,7 +1913,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
/*
* There is no guarantee that an OOM-lock contender
* sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitely.
+ * uncharges. Wake any sleepers explicitly.
*/
memcg_oom_recover(memcg);
}
@@ -2123,11 +1994,10 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
* This function protects unlocked LRU pages from being moved to
* another cgroup.
*
- * It ensures lifetime of the returned memcg. Caller is responsible
- * for the lifetime of the page; __unlock_page_memcg() is available
- * when @page might get freed inside the locked section.
+ * It ensures lifetime of the locked memcg. Caller is responsible
+ * for the lifetime of the page.
*/
-struct mem_cgroup *lock_page_memcg(struct page *page)
+void lock_page_memcg(struct page *page)
{
struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
@@ -2137,21 +2007,15 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when
- * the page state that is going to change is the only thing
- * preventing the page itself from being freed. E.g. writeback
- * doesn't hold a page reference and relies on PG_writeback to
- * keep off truncation, migration and so forth.
*/
rcu_read_lock();
if (mem_cgroup_disabled())
- return NULL;
+ return;
again:
memcg = page_memcg(head);
if (unlikely(!memcg))
- return NULL;
+ return;
#ifdef CONFIG_PROVE_LOCKING
local_irq_save(flags);
@@ -2160,7 +2024,7 @@ again:
#endif
if (atomic_read(&memcg->moving_account) <= 0)
- return memcg;
+ return;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page_memcg(head)) {
@@ -2169,24 +2033,17 @@ again:
}
/*
- * When charge migration first begins, we can have locked and
- * unlocked page stat updates happening concurrently. Track
- * the task who has the lock for unlock_page_memcg().
+ * When charge migration first begins, we can have multiple
+ * critical sections holding the fast-path RCU lock and one
+ * holding the slowpath move_lock. Track the task who has the
+ * move_lock for unlock_page_memcg().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
-
- return memcg;
}
EXPORT_SYMBOL(lock_page_memcg);
-/**
- * __unlock_page_memcg - unlock and unpin a memcg
- * @memcg: the memcg
- *
- * Unlock and unpin a memcg returned by lock_page_memcg().
- */
-void __unlock_page_memcg(struct mem_cgroup *memcg)
+static void __unlock_page_memcg(struct mem_cgroup *memcg)
{
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -2212,14 +2069,23 @@ void unlock_page_memcg(struct page *page)
}
EXPORT_SYMBOL(unlock_page_memcg);
-struct memcg_stock_pcp {
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
-
+struct obj_stock {
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg;
+ struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
+ int nr_slab_reclaimable_b;
+ int nr_slab_unreclaimable_b;
+#else
+ int dummy[0];
#endif
+};
+
+struct memcg_stock_pcp {
+ struct mem_cgroup *cached; /* this is never the root cgroup */
+ unsigned int nr_pages;
+ struct obj_stock task_obj;
+ struct obj_stock irq_obj;
struct work_struct work;
unsigned long flags;
@@ -2229,12 +2095,12 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
#ifdef CONFIG_MEMCG_KMEM
-static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static void drain_obj_stock(struct obj_stock *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg);
#else
-static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+static inline void drain_obj_stock(struct obj_stock *stock)
{
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
@@ -2244,6 +2110,41 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
}
#endif
+/*
+ * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
+ * sequence used in this case to access content from object stock is slow.
+ * To optimize for user context access, there are now two object stocks for
+ * task context and interrupt context access respectively.
+ *
+ * The task context object stock can be accessed by disabling preemption only
+ * which is cheap in a non-preempt kernel. The interrupt context object stock
+ * can only be accessed after disabling interrupt. User context code can
+ * access interrupt object stock, but not vice versa.
+ */
+static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
+{
+ struct memcg_stock_pcp *stock;
+
+ if (likely(in_task())) {
+ *pflags = 0UL;
+ preempt_disable();
+ stock = this_cpu_ptr(&memcg_stock);
+ return &stock->task_obj;
+ }
+
+ local_irq_save(*pflags);
+ stock = this_cpu_ptr(&memcg_stock);
+ return &stock->irq_obj;
+}
+
+static inline void put_obj_stock(unsigned long flags)
+{
+ if (likely(in_task()))
+ preempt_enable();
+ else
+ local_irq_restore(flags);
+}
+
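
The comment above is the key to the task_obj/irq_obj split: task-context callers pay only a preempt_disable()/preempt_enable() pair, while interrupt-context callers still need the irq save/restore. Every stock user in this patch follows the same bracket (a calling-pattern sketch only, not a standalone program):

	unsigned long flags;
	struct obj_stock *stock;

	stock = get_obj_stock(&flags);	/* preempt off if in_task(), irqs off otherwise */
	/* ... read or update the per-CPU object stock ... */
	put_obj_stock(flags);		/* matching preempt_enable() or irq restore */
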
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
@@ -2310,7 +2211,9 @@ static void drain_local_stock(struct work_struct *dummy)
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
- drain_obj_stock(stock);
+ drain_obj_stock(&stock->irq_obj);
+ if (in_task())
+ drain_obj_stock(&stock->task_obj);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
@@ -2386,50 +2289,39 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-static int memcg_hotplug_cpu_dead(unsigned int cpu)
+static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
{
- struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg, *mi;
-
- stock = &per_cpu(memcg_stock, cpu);
- drain_stock(stock);
+ int nid;
- for_each_mem_cgroup(memcg) {
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ unsigned long stat[NR_VM_NODE_STAT_ITEMS];
+ struct batched_lruvec_stat *lstatc;
int i;
- for (i = 0; i < MEMCG_NR_STAT; i++) {
- int nid;
- long x;
-
- x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
- if (x)
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &memcg->vmstats[i]);
-
- if (i >= NR_VM_NODE_STAT_ITEMS)
- continue;
+ lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ stat[i] = lstatc->count[i];
+ lstatc->count[i] = 0;
+ }
- for_each_node(nid) {
- struct mem_cgroup_per_node *pn;
+ do {
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ atomic_long_add(stat[i], &pn->lruvec_stat[i]);
+ } while ((pn = parent_nodeinfo(pn, nid)));
+ }
+}
- pn = mem_cgroup_nodeinfo(memcg, nid);
- x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
- if (x)
- do {
- atomic_long_add(x, &pn->lruvec_stat[i]);
- } while ((pn = parent_nodeinfo(pn, nid)));
- }
- }
+static int memcg_hotplug_cpu_dead(unsigned int cpu)
+{
+ struct memcg_stock_pcp *stock;
+ struct mem_cgroup *memcg;
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
- long x;
+ stock = &per_cpu(memcg_stock, cpu);
+ drain_stock(stock);
- x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
- if (x)
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &memcg->vmevents[i]);
- }
- }
+ for_each_mem_cgroup(memcg)
+ memcg_flush_lruvec_page_state(memcg, cpu);
return 0;
}
@@ -2687,8 +2579,8 @@ out:
css_put(&memcg->css);
}
-static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
+static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
@@ -2700,8 +2592,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
bool drained = false;
unsigned long pflags;
- if (mem_cgroup_is_root(memcg))
- return 0;
retry:
if (consume_stock(memcg, nr_pages))
return 0;
@@ -2798,9 +2688,6 @@ retry:
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
- if (gfp_mask & __GFP_NOFAIL)
- goto force;
-
if (fatal_signal_pending(current))
goto force;
@@ -2884,6 +2771,15 @@ done_restock:
return 0;
}
+static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
+{
+ if (mem_cgroup_is_root(memcg))
+ return 0;
+
+ return try_charge_memcg(memcg, gfp_mask, nr_pages);
+}
+
#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
@@ -2910,7 +2806,28 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
page->memcg_data = (unsigned long)memcg;
}
+static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+retry:
+ memcg = obj_cgroup_memcg(objcg);
+ if (unlikely(!css_tryget(&memcg->css)))
+ goto retry;
+ rcu_read_unlock();
+
+ return memcg;
+}
+
#ifdef CONFIG_MEMCG_KMEM
+/*
+ * The allocated objcg pointers array is not accounted directly.
+ * Moreover, it should not come from DMA buffer and is not readily
+ * reclaimable. So those GFP bits should be masked off.
+ */
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
+
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
gfp_t gfp, bool new_page)
{
@@ -2918,6 +2835,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
unsigned long memcg_data;
void *vec;
+ gfp &= ~OBJCGS_CLEAR_MASK;
vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
page_to_nid(page));
if (!vec)
@@ -3061,23 +2979,45 @@ static void memcg_free_cache_id(int id)
ida_simple_remove(&memcg_cache_ida, id);
}
-/**
- * __memcg_kmem_charge: charge a number of kernel pages to a memcg
- * @memcg: memory cgroup to charge
+/*
+ * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg
+ * @objcg: object cgroup to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+ unsigned int nr_pages)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+ refill_stock(memcg, nr_pages);
+
+ css_put(&memcg->css);
+}
+
+/*
+ * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg
+ * @objcg: object cgroup to charge
* @gfp: reclaim mode
* @nr_pages: number of pages to charge
*
* Returns 0 on success, an error code on failure.
*/
-static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages)
+static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
+ unsigned int nr_pages)
{
struct page_counter *counter;
+ struct mem_cgroup *memcg;
int ret;
- ret = try_charge(memcg, gfp, nr_pages);
+ memcg = get_mem_cgroup_from_objcg(objcg);
+
+ ret = try_charge_memcg(memcg, gfp, nr_pages);
if (ret)
- return ret;
+ goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
@@ -3089,25 +3029,15 @@ static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
*/
if (gfp & __GFP_NOFAIL) {
page_counter_charge(&memcg->kmem, nr_pages);
- return 0;
+ goto out;
}
cancel_charge(memcg, nr_pages);
- return -ENOMEM;
+ ret = -ENOMEM;
}
- return 0;
-}
-
-/**
- * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
- * @memcg: memcg to uncharge
- * @nr_pages: number of pages to uncharge
- */
-static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
+out:
+ css_put(&memcg->css);
- refill_stock(memcg, nr_pages);
+ return ret;
}
/**
@@ -3120,18 +3050,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
*/
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
int ret = 0;
- memcg = get_mem_cgroup_from_current();
- if (memcg && !mem_cgroup_is_root(memcg)) {
- ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+ objcg = get_obj_cgroup_from_current();
+ if (objcg) {
+ ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
- page->memcg_data = (unsigned long)memcg |
+ page->memcg_data = (unsigned long)objcg |
MEMCG_DATA_KMEM;
return 0;
}
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}
return ret;
}
@@ -3143,38 +3073,93 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct mem_cgroup *memcg = page_memcg(page);
+ struct obj_cgroup *objcg;
unsigned int nr_pages = 1 << order;
- if (!memcg)
+ if (!PageMemcgKmem(page))
return;
- VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- __memcg_kmem_uncharge(memcg, nr_pages);
+ objcg = __page_objcg(page);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
page->memcg_data = 0;
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
+}
+
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ unsigned long flags;
+ struct obj_stock *stock = get_obj_stock(&flags);
+ int *bytes;
+
+ /*
+ * Save vmstat data in stock and skip vmstat array update unless
+ * accumulating over a page of vmstat data or when pgdat or idx
+ * changes.
+ */
+ if (stock->cached_objcg != objcg) {
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ stock->cached_objcg = objcg;
+ stock->cached_pgdat = pgdat;
+ } else if (stock->cached_pgdat != pgdat) {
+ /* Flush the existing cached vmstat data */
+ if (stock->nr_slab_reclaimable_b) {
+ mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B,
+ stock->nr_slab_reclaimable_b);
+ stock->nr_slab_reclaimable_b = 0;
+ }
+ if (stock->nr_slab_unreclaimable_b) {
+ mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B,
+ stock->nr_slab_unreclaimable_b);
+ stock->nr_slab_unreclaimable_b = 0;
+ }
+ stock->cached_pgdat = pgdat;
+ }
+
+ bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
+ : &stock->nr_slab_unreclaimable_b;
+ /*
+ * Even for a large object (>= PAGE_SIZE), the vmstat data will still be
+ * cached locally at least once before pushing it out.
+ */
+ if (!*bytes) {
+ *bytes = nr;
+ nr = 0;
+ } else {
+ *bytes += nr;
+ if (abs(*bytes) > PAGE_SIZE) {
+ nr = *bytes;
+ *bytes = 0;
+ } else {
+ nr = 0;
+ }
+ }
+ if (nr)
+ mod_objcg_mlstate(objcg, pgdat, idx, nr);
+
+ put_obj_stock(flags);
}
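
mod_objcg_state() above batches byte-sized vmstat updates in the per-CPU stock and only pushes them to the shared counters once the accumulated magnitude exceeds PAGE_SIZE (or when the cached objcg or pgdat changes). A standalone model of just that batching arithmetic (PAGE_SIZE assumed to be 4096; names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define MODEL_PAGE_SIZE 4096

static int cached_bytes;	/* models stock->nr_slab_*_b for one (pgdat, idx) */

/* Returns the amount flushed to the shared counter, 0 if still cached. */
static int mod_state_batched(int nr)
{
	int flush = 0;

	if (!cached_bytes) {
		cached_bytes = nr;	/* always cached locally at least once */
	} else {
		cached_bytes += nr;
		if (abs(cached_bytes) > MODEL_PAGE_SIZE) {
			flush = cached_bytes;
			cached_bytes = 0;
		}
	}
	return flush;
}

int main(void)
{
	int updates[] = { 512, 1024, -256, 3072, 64 };
	unsigned int i;

	for (i = 0; i < sizeof(updates) / sizeof(updates[0]); i++)
		printf("add %5d -> flushed %d, cached %d\n",
		       updates[i], mod_state_batched(updates[i]), cached_bytes);
	return 0;
}
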
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
- struct memcg_stock_pcp *stock;
unsigned long flags;
+ struct obj_stock *stock = get_obj_stock(&flags);
bool ret = false;
- local_irq_save(flags);
-
- stock = this_cpu_ptr(&memcg_stock);
if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
- local_irq_restore(flags);
+ put_obj_stock(flags);
return ret;
}
-static void drain_obj_stock(struct memcg_stock_pcp *stock)
+static void drain_obj_stock(struct obj_stock *stock)
{
struct obj_cgroup *old = stock->cached_objcg;
@@ -3185,11 +3170,8 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
- if (nr_pages) {
- rcu_read_lock();
- __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
- rcu_read_unlock();
- }
+ if (nr_pages)
+ obj_cgroup_uncharge_pages(old, nr_pages);
/*
* The leftover is flushed to the centralized per-memcg value.
@@ -3205,6 +3187,25 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
stock->nr_bytes = 0;
}
+ /*
+ * Flush the vmstat data in current stock
+ */
+ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
+ if (stock->nr_slab_reclaimable_b) {
+ mod_objcg_mlstate(old, stock->cached_pgdat,
+ NR_SLAB_RECLAIMABLE_B,
+ stock->nr_slab_reclaimable_b);
+ stock->nr_slab_reclaimable_b = 0;
+ }
+ if (stock->nr_slab_unreclaimable_b) {
+ mod_objcg_mlstate(old, stock->cached_pgdat,
+ NR_SLAB_UNRECLAIMABLE_B,
+ stock->nr_slab_unreclaimable_b);
+ stock->nr_slab_unreclaimable_b = 0;
+ }
+ stock->cached_pgdat = NULL;
+ }
+
obj_cgroup_put(old);
stock->cached_objcg = NULL;
}
@@ -3214,8 +3215,13 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
{
struct mem_cgroup *memcg;
- if (stock->cached_objcg) {
- memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (in_task() && stock->task_obj.cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ return true;
+ }
+ if (stock->irq_obj.cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3223,31 +3229,36 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
return false;
}
-static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ bool allow_uncharge)
{
- struct memcg_stock_pcp *stock;
unsigned long flags;
+ struct obj_stock *stock = get_obj_stock(&flags);
+ unsigned int nr_pages = 0;
- local_irq_save(flags);
-
- stock = this_cpu_ptr(&memcg_stock);
if (stock->cached_objcg != objcg) { /* reset if necessary */
drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->cached_objcg = objcg;
- stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ allow_uncharge = true; /* Allow uncharge when objcg changes */
}
stock->nr_bytes += nr_bytes;
- if (stock->nr_bytes > PAGE_SIZE)
- drain_obj_stock(stock);
+ if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
+ nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ stock->nr_bytes &= (PAGE_SIZE - 1);
+ }
- local_irq_restore(flags);
+ put_obj_stock(flags);
+
+ if (nr_pages)
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
}
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
{
- struct mem_cgroup *memcg;
unsigned int nr_pages, nr_bytes;
int ret;
@@ -3255,39 +3266,44 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
return 0;
/*
- * In theory, memcg->nr_charged_bytes can have enough
+ * In theory, objcg->nr_charged_bytes can have enough
* pre-charged bytes to satisfy the allocation. However,
- * flushing memcg->nr_charged_bytes requires two atomic
- * operations, and memcg->nr_charged_bytes can't be big,
- * so it's better to ignore it and try grab some new pages.
- * memcg->nr_charged_bytes will be flushed in
- * refill_obj_stock(), called from this function or
- * independently later.
+ * flushing objcg->nr_charged_bytes requires two atomic
+ * operations, and objcg->nr_charged_bytes can't be big.
+ * The shared objcg->nr_charged_bytes can also become a
+ * performance bottleneck if all tasks of the same memcg are
+ * trying to update it. So it's better to ignore it and try to
+ * grab some new pages. The stock's nr_bytes will be flushed to
+ * objcg->nr_charged_bytes later on when objcg changes.
+ *
+ * The stock's nr_bytes may contain enough pre-charged bytes
+ * to allow one less page from being charged, but we can't rely
+ * on the pre-charged bytes not being changed outside of
+ * consume_obj_stock() or refill_obj_stock(). So ignore those
+ * pre-charged bytes as well when charging pages. To avoid a
+ * page uncharge right after a page charge, we set the
+ * allow_uncharge flag to false when calling refill_obj_stock()
+ * to temporarily allow the pre-charged bytes to exceed the page
+ * size limit. The maximum reachable value of the pre-charged
+ * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
+ * race.
*/
- rcu_read_lock();
-retry:
- memcg = obj_cgroup_memcg(objcg);
- if (unlikely(!css_tryget(&memcg->css)))
- goto retry;
- rcu_read_unlock();
-
nr_pages = size >> PAGE_SHIFT;
nr_bytes = size & (PAGE_SIZE - 1);
if (nr_bytes)
nr_pages += 1;
- ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+ ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
if (!ret && nr_bytes)
- refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
- css_put(&memcg->css);
return ret;
}
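
The charging math in obj_cgroup_charge() rounds the request up to whole pages, charges those, and hands the unused remainder (PAGE_SIZE - nr_bytes) back to the per-CPU stock with uncharging suppressed, so a charge is not immediately followed by an uncharge. The rounding, modeled standalone (PAGE_SIZE assumed to be 4096):

#include <stdio.h>

#define MODEL_PAGE_SIZE  4096UL
#define MODEL_PAGE_SHIFT 12

int main(void)
{
	unsigned long sizes[] = { 64, 700, 4096, 5000 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned long size = sizes[i];
		unsigned long nr_pages = size >> MODEL_PAGE_SHIFT;
		unsigned long nr_bytes = size & (MODEL_PAGE_SIZE - 1);

		if (nr_bytes)	/* partial page: charge a whole page anyway */
			nr_pages += 1;
		printf("size %5lu -> charge %lu page(s), refill stock with %lu bytes\n",
		       size, nr_pages, nr_bytes ? MODEL_PAGE_SIZE - nr_bytes : 0);
	}
	return 0;
}
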
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_obj_stock(objcg, size);
+ refill_obj_stock(objcg, size, true);
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -3305,7 +3321,11 @@ void split_page_memcg(struct page *head, unsigned int nr)
for (i = 1; i < nr; i++)
head[i].memcg_data = head->memcg_data;
- css_get_many(&memcg->css, nr - 1);
+
+ if (PageMemcgKmem(head))
+ obj_cgroup_get_many(__page_objcg(head), nr - 1);
+ else
+ css_get_many(&memcg->css, nr - 1);
}
#ifdef CONFIG_MEMCG_SWAP
@@ -3554,6 +3574,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
unsigned long val;
if (mem_cgroup_is_root(memcg)) {
+ cgroup_rstat_flush(memcg->css.cgroup);
val = memcg_page_state(memcg, NR_FILE_PAGES) +
memcg_page_state(memcg, NR_ANON_MAPPED);
if (swap)
@@ -3618,57 +3639,6 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
-{
- unsigned long stat[MEMCG_NR_STAT] = {0};
- struct mem_cgroup *mi;
- int node, cpu, i;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < MEMCG_NR_STAT; i++)
- stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
-
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = 0; i < MEMCG_NR_STAT; i++)
- atomic_long_add(stat[i], &mi->vmstats[i]);
-
- for_each_node(node) {
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
- struct mem_cgroup_per_node *pi;
-
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- stat[i] = 0;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- stat[i] += per_cpu(
- pn->lruvec_stat_cpu->count[i], cpu);
-
- for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- atomic_long_add(stat[i], &pi->lruvec_stat[i]);
- }
-}
-
-static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
-{
- unsigned long events[NR_VM_EVENT_ITEMS];
- struct mem_cgroup *mi;
- int cpu, i;
-
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] = 0;
-
- for_each_online_cpu(cpu)
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] += per_cpu(memcg->vmstats_percpu->events[i],
- cpu);
-
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- atomic_long_add(events[i], &mi->vmevents[i]);
-}
-
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -3985,6 +3955,8 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
int nid;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ cgroup_rstat_flush(memcg->css.cgroup);
+
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
seq_printf(m, "%s=%lu", stat->name,
mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
@@ -4055,6 +4027,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
+ cgroup_rstat_flush(memcg->css.cgroup);
+
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
@@ -4113,7 +4087,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned long file_cost = 0;
for_each_online_pgdat(pgdat) {
- mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ mz = memcg->nodeinfo[pgdat->node_id];
anon_cost += mz->lruvec.anon_cost;
file_cost += mz->lruvec.file_cost;
@@ -4142,7 +4116,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
if (val > 100)
return -EINVAL;
- if (css->parent)
+ if (!mem_cgroup_is_root(memcg))
memcg->swappiness = val;
else
vm_swappiness = val;
@@ -4492,7 +4466,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
/* cannot set to root cgroup and only 0 and 1 are allowed */
- if (!css->parent || !((val == 0) || (val == 1)))
+ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
return -EINVAL;
memcg->oom_kill_disable = val;
@@ -4531,22 +4505,6 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
return &memcg->cgwb_domain;
}
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page().
- */
-static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
-{
- long x = atomic_long_read(&memcg->vmstats[idx]);
- int cpu;
-
- for_each_online_cpu(cpu)
- x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
- if (x < 0)
- x = 0;
- return x;
-}
-
/**
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
* @wb: bdi_writeback in question
@@ -4572,13 +4530,14 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+ cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
- *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
- *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
- memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
- *pheadroom = PAGE_COUNTER_MAX;
+ *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+ *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_page_state(memcg, NR_ACTIVE_FILE);
+ *pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
READ_ONCE(memcg->memory.high));
@@ -4593,7 +4552,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
* Foreign dirty flushing
*
* There's an inherent mismatch between memcg and writeback. The former
- * trackes ownership per-page while the latter per-inode. This was a
+ * tracks ownership per-page while the latter per-inode. This was a
* deliberate design decision because honoring per-page ownership in the
* writeback path is complicated, may lead to higher CPU and IO overheads
* and deemed unnecessary given that write-sharing an inode across
@@ -4608,9 +4567,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
* triggering background writeback. A will be slowed down without a way to
* make writeback of the dirty pages happen.
*
- * Conditions like the above can lead to a cgroup getting repatedly and
+ * Conditions like the above can lead to a cgroup getting repeatedly and
* severely throttled after making some progress after each
- * dirty_expire_interval while the underyling IO device is almost
+ * dirty_expire_interval while the underlying IO device is almost
* completely idle.
*
* Solving this problem completely requires matching the ownership tracking
@@ -5210,19 +5169,20 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
- free_percpu(memcg->vmstats_local);
kfree(memcg);
}
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ int cpu;
+
memcg_wb_domain_exit(memcg);
/*
- * Flush percpu vmstats and vmevents to guarantee the value correctness
- * on parent's and all ancestor levels.
+ * Flush percpu lruvec stats to guarantee the value
+ * correctness on parent's and all ancestor levels.
*/
- memcg_flush_percpu_vmstats(memcg);
- memcg_flush_percpu_vmevents(memcg);
+ for_each_online_cpu(cpu)
+ memcg_flush_lruvec_page_state(memcg, cpu);
__mem_cgroup_free(memcg);
}
@@ -5249,11 +5209,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
}
- memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
- GFP_KERNEL_ACCOUNT);
- if (!memcg->vmstats_local)
- goto fail;
-
memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_percpu)
@@ -5351,11 +5306,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
/*
- * A memcg must be visible for memcg_expand_shrinker_maps()
+ * A memcg must be visible for expand_shrinker_info()
* by the time the maps are allocated. So, we allocate maps
* here, when for_each_mem_cgroup() can't skip it.
*/
- if (memcg_alloc_shrinker_maps(memcg)) {
+ if (alloc_shrinker_info(memcg)) {
mem_cgroup_id_remove(memcg);
return -ENOMEM;
}
@@ -5387,6 +5342,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
page_counter_set_low(&memcg->memory, 0);
memcg_offline_kmem(memcg);
+ reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
drain_all_stock(memcg);
@@ -5419,7 +5375,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
- memcg_free_shrinker_maps(memcg);
+ free_shrinker_info(memcg);
memcg_free_kmem(memcg);
mem_cgroup_free(memcg);
}
@@ -5453,6 +5409,62 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg_wb_domain_size_changed(memcg);
}
+static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct memcg_vmstats_percpu *statc;
+ long delta, v;
+ int i;
+
+ statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
+
+ for (i = 0; i < MEMCG_NR_STAT; i++) {
+ /*
+ * Collect the aggregated propagation counts of groups
+ * below us. We're in a per-cpu loop here and this is
+ * a global counter, so the first cycle will get them.
+ */
+ delta = memcg->vmstats.state_pending[i];
+ if (delta)
+ memcg->vmstats.state_pending[i] = 0;
+
+ /* Add CPU changes on this level since the last flush */
+ v = READ_ONCE(statc->state[i]);
+ if (v != statc->state_prev[i]) {
+ delta += v - statc->state_prev[i];
+ statc->state_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ /* Aggregate counts on this level and propagate upwards */
+ memcg->vmstats.state[i] += delta;
+ if (parent)
+ parent->vmstats.state_pending[i] += delta;
+ }
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+ delta = memcg->vmstats.events_pending[i];
+ if (delta)
+ memcg->vmstats.events_pending[i] = 0;
+
+ v = READ_ONCE(statc->events[i]);
+ if (v != statc->events_prev[i]) {
+ delta += v - statc->events_prev[i];
+ statc->events_prev[i] = v;
+ }
+
+ if (!delta)
+ continue;
+
+ memcg->vmstats.events[i] += delta;
+ if (parent)
+ parent->vmstats.events_pending[i] += delta;
+ }
+}
+
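
mem_cgroup_css_rstat_flush() above repeats the same steps per counter: pick up deltas already propagated from descendants (the *_pending arrays), add this CPU's change since the previous flush, then fold the total into this level and queue it as pending for the parent. A standalone model of one flush pass for a single counter, two CPUs and two levels deep (illustrative names only):

#include <stdio.h>

struct level {
	struct level *parent;
	long state;		/* aggregated value at this level */
	long pending;		/* deltas queued by descendants */
	long cpu_state[2];	/* per-CPU running counter */
	long cpu_prev[2];	/* per-CPU value seen at the last flush */
};

static void flush_cpu(struct level *l, int cpu)
{
	long delta = l->pending;

	l->pending = 0;
	if (l->cpu_state[cpu] != l->cpu_prev[cpu]) {
		delta += l->cpu_state[cpu] - l->cpu_prev[cpu];
		l->cpu_prev[cpu] = l->cpu_state[cpu];
	}
	if (!delta)
		return;
	l->state += delta;
	if (l->parent)
		l->parent->pending += delta;
}

int main(void)
{
	struct level parent = { 0 }, child = { .parent = &parent };
	int cpu;

	child.cpu_state[0] = 5;	/* updates recorded on CPU 0 */
	child.cpu_state[1] = 3;	/* updates recorded on CPU 1 */

	/* Flush the child first so its deltas reach the parent in this pass. */
	for (cpu = 0; cpu < 2; cpu++)
		flush_cpu(&child, cpu);
	for (cpu = 0; cpu < 2; cpu++)
		flush_cpu(&parent, cpu);

	printf("child %ld, parent %ld\n", child.state, parent.state);	/* 8, 8 */
	return 0;
}
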
#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
static int mem_cgroup_do_precharge(unsigned long count)
@@ -5950,7 +5962,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
return 0;
/*
- * We are now commited to this value whatever it is. Changes in this
+ * We are now committed to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
@@ -6506,6 +6518,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_released = mem_cgroup_css_released,
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
+ .css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
@@ -6688,6 +6701,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_low_usage)));
}
+static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
+ gfp_t gfp)
+{
+ unsigned int nr_pages = thp_nr_pages(page);
+ int ret;
+
+ ret = try_charge(memcg, gfp, nr_pages);
+ if (ret)
+ goto out;
+
+ css_get(&memcg->css);
+ commit_charge(page, memcg);
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ memcg_check_events(memcg, page);
+ local_irq_enable();
+out:
+ return ret;
+}
+
/**
* mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge
@@ -6695,57 +6729,74 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
* @gfp_mask: reclaim mode
*
* Try to charge @page to the memcg that @mm belongs to, reclaiming
- * pages according to @gfp_mask if necessary.
+ * pages according to @gfp_mask if necessary. If @mm is NULL, try to
+ * charge to the active memcg.
+ *
+ * Do not use this for pages allocated for swapin.
*
* Returns 0 on success. Otherwise, an error code is returned.
*/
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
- unsigned int nr_pages = thp_nr_pages(page);
- struct mem_cgroup *memcg = NULL;
- int ret = 0;
+ struct mem_cgroup *memcg;
+ int ret;
if (mem_cgroup_disabled())
- goto out;
+ return 0;
- if (PageSwapCache(page)) {
- swp_entry_t ent = { .val = page_private(page), };
- unsigned short id;
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = __mem_cgroup_charge(page, memcg, gfp_mask);
+ css_put(&memcg->css);
- /*
- * Every swap fault against a single page tries to charge the
- * page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. page and memcg binding is
- * protected by the page lock, which serializes swap cache
- * removal, which in turn serializes uncharging.
- */
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (page_memcg(compound_head(page)))
- goto out;
+ return ret;
+}
- id = lookup_swap_cgroup_id(ent);
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
+/**
+ * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp: reclaim mode
+ * @entry: swap entry for which the page is allocated
+ *
+ * This function charges a page allocated for swapin. Please call this before
+ * adding the page to the swapcache.
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+ gfp_t gfp, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ unsigned short id;
+ int ret;
- if (!memcg)
- memcg = get_mem_cgroup_from_mm(mm);
+ if (mem_cgroup_disabled())
+ return 0;
- ret = try_charge(memcg, gfp_mask, nr_pages);
- if (ret)
- goto out_put;
+ id = lookup_swap_cgroup_id(entry);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (!memcg || !css_tryget_online(&memcg->css))
+ memcg = get_mem_cgroup_from_mm(mm);
+ rcu_read_unlock();
- css_get(&memcg->css);
- commit_charge(page, memcg);
+ ret = __mem_cgroup_charge(page, memcg, gfp);
- local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, nr_pages);
- memcg_check_events(memcg, page);
- local_irq_enable();
+ css_put(&memcg->css);
+ return ret;
+}
+/*
+ * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
+ * @entry: swap entry for which the page is charged
+ *
+ * Call this function after successfully adding the charged page to swapcache.
+ *
+ * Note: This function assumes the page for which the swap slot is being
+ * uncharged is an order-0 page.
+ */
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
/*
* Cgroup1's unified memory+swap counter has been charged with the
* new swapcache page, finish the transfer by uncharging the swap
@@ -6758,25 +6809,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
* correspond 1:1 to page and swap slot lifetimes: we charge the
* page to memory here, and uncharge swap when the slot is freed.
*/
- if (do_memsw_account() && PageSwapCache(page)) {
- swp_entry_t entry = { .val = page_private(page) };
+ if (!mem_cgroup_disabled() && do_memsw_account()) {
/*
* The swap entry might not get freed for a long time,
* let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate.
*/
- mem_cgroup_uncharge_swap(entry, nr_pages);
+ mem_cgroup_uncharge_swap(entry, 1);
}
-
-out_put:
- css_put(&memcg->css);
-out:
- return ret;
}
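
Together, the two helpers above replace the old swapcache special case in mem_cgroup_charge() with an explicit ordering in the swapin path: charge the page first, insert it into the swapcache, then drop the duplicate swap-slot charge. A sketch of that ordering (error handling and the swapcache call elided; not a standalone program):

	if (mem_cgroup_swapin_charge_page(page, mm, gfp_mask, entry))
		goto out;			/* charge before swapcache insertion */

	/* ... add the charged page to the swapcache for 'entry' ... */

	mem_cgroup_swapin_uncharge_swap(entry);	/* release the swap-slot duplicate */
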
struct uncharge_gather {
struct mem_cgroup *memcg;
- unsigned long nr_pages;
+ unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
struct page *dummy_page;
@@ -6791,10 +6836,10 @@ static void uncharge_batch(const struct uncharge_gather *ug)
{
unsigned long flags;
- if (!mem_cgroup_is_root(ug->memcg)) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
+ if (ug->nr_memory) {
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
memcg_oom_recover(ug->memcg);
@@ -6802,7 +6847,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
local_irq_save(flags);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
@@ -6813,40 +6858,61 @@ static void uncharge_batch(const struct uncharge_gather *ug)
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
unsigned long nr_pages;
+ struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
+ bool use_objcg = PageMemcgKmem(page);
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (!page_memcg(page))
- return;
-
/*
* Nobody should be changing or seriously looking at
- * page_memcg(page) at this point, we have fully
+ * page memcg or objcg at this point, we have fully
* exclusive access to the page.
*/
+ if (use_objcg) {
+ objcg = __page_objcg(page);
+ /*
+ * This get matches the put at the end of the function and
+ * kmem pages do not hold memcg references anymore.
+ */
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ } else {
+ memcg = __page_memcg(page);
+ }
+
+ if (!memcg)
+ return;
- if (ug->memcg != page_memcg(page)) {
+ if (ug->memcg != memcg) {
if (ug->memcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = page_memcg(page);
+ ug->memcg = memcg;
+ ug->dummy_page = page;
/* pairs with css_put in uncharge_batch */
- css_get(&ug->memcg->css);
+ css_get(&memcg->css);
}
nr_pages = compound_nr(page);
- ug->nr_pages += nr_pages;
- if (PageMemcgKmem(page))
+ if (use_objcg) {
+ ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
- else
+
+ page->memcg_data = 0;
+ obj_cgroup_put(objcg);
+ } else {
+ /* LRU pages aren't accounted at the root level */
+ if (!mem_cgroup_is_root(memcg))
+ ug->nr_memory += nr_pages;
ug->pgpgout++;
- ug->dummy_page = page;
- page->memcg_data = 0;
- css_put(&ug->memcg->css);
+ page->memcg_data = 0;
+ }
+
+ css_put(&memcg->css);
}
/**
@@ -6930,9 +6996,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
/* Force-charge the new page. The old one will be freed soon */
nr_pages = thp_nr_pages(newpage);
- page_counter_charge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_charge(&memcg->memsw, nr_pages);
+ if (!mem_cgroup_is_root(memcg)) {
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_charge(&memcg->memsw, nr_pages);
+ }
css_get(&memcg->css);
commit_charge(newpage, memcg);
diff --git a/mm/memfd.c b/mm/memfd.c
index 2647c898990c..081dd33e6a61 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -297,9 +297,9 @@ SYSCALL_DEFINE2(memfd_create,
}
if (flags & MFD_HUGETLB) {
- struct user_struct *user = NULL;
+ struct ucounts *ucounts = NULL;
- file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 24210c9bd843..e5a1531f7f4e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -56,6 +56,7 @@
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
+#include <linux/pagewalk.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -75,7 +76,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
/*
* We could fail to take off the target page from buddy
- * for example due to racy page allocaiton, but that's
+ * for example due to racy page allocation, but that's
* acceptable because soft-offlined page is not broken
* and if someone really want to use it, they should
* take it.
@@ -554,6 +555,148 @@ static void collect_procs(struct page *page, struct list_head *tokill,
collect_procs_file(page, tokill, force_early);
}
+struct hwp_walk {
+ struct to_kill tk;
+ unsigned long pfn;
+ int flags;
+};
+
+static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
+{
+ tk->addr = addr;
+ tk->size_shift = shift;
+}
+
+static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
+ unsigned long poisoned_pfn, struct to_kill *tk)
+{
+ unsigned long pfn = 0;
+
+ if (pte_present(pte)) {
+ pfn = pte_pfn(pte);
+ } else {
+ swp_entry_t swp = pte_to_swp_entry(pte);
+
+ if (is_hwpoison_entry(swp))
+ pfn = hwpoison_entry_to_pfn(swp);
+ }
+
+ if (!pfn || pfn != poisoned_pfn)
+ return 0;
+
+ set_to_kill(tk, addr, shift);
+ return 1;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
+ struct hwp_walk *hwp)
+{
+ pmd_t pmd = *pmdp;
+ unsigned long pfn;
+ unsigned long hwpoison_vaddr;
+
+ if (!pmd_present(pmd))
+ return 0;
+ pfn = pmd_pfn(pmd);
+ if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
+ hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
+ set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
+ return 1;
+ }
+ return 0;
+}
+#else
+static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
+ struct hwp_walk *hwp)
+{
+ return 0;
+}
+#endif
+
+static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+ int ret = 0;
+ pte_t *ptep;
+ spinlock_t *ptl;
+
+ ptl = pmd_trans_huge_lock(pmdp, walk->vma);
+ if (ptl) {
+ ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
+ spin_unlock(ptl);
+ goto out;
+ }
+
+ if (pmd_trans_unstable(pmdp))
+ goto out;
+
+ ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl);
+ for (; addr != end; ptep++, addr += PAGE_SIZE) {
+ ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
+ hwp->pfn, &hwp->tk);
+ if (ret == 1)
+ break;
+ }
+ pte_unmap_unlock(ptep - 1, ptl);
+out:
+ cond_resched();
+ return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+ pte_t pte = huge_ptep_get(ptep);
+ struct hstate *h = hstate_vma(walk->vma);
+
+ return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+ hwp->pfn, &hwp->tk);
+}
+#else
+#define hwpoison_hugetlb_range NULL
+#endif
+
+static struct mm_walk_ops hwp_walk_ops = {
+ .pmd_entry = hwpoison_pte_range,
+ .hugetlb_entry = hwpoison_hugetlb_range,
+};
+
+/*
+ * Sends SIGBUS to the current process with error info.
+ *
+ * This function is intended to handle "Action Required" MCEs on already
+ * hardware poisoned pages. They could happen, for example, when
+ * memory_failure() failed to unmap the error page at the first call, or
+ * when multiple local machine checks happened on different CPUs.
+ *
+ * MCE handler currently has no easy access to the error virtual address,
+ * so this function walks the page tables to find it. The returned virtual address
+ * is proper in most cases, but it could be wrong when the application
+ * process has multiple entries mapping the error page.
+ */
+static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
+ int flags)
+{
+ int ret;
+ struct hwp_walk priv = {
+ .pfn = pfn,
+ };
+ priv.tk.tsk = p;
+
+ mmap_read_lock(p->mm);
+ ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
+ (void *)&priv);
+ if (ret == 1 && priv.tk.addr)
+ kill_proc(&priv.tk, pfn, flags);
+ mmap_read_unlock(p->mm);
+ return ret ? -EFAULT : -EHWPOISON;
+}
+
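
The core of the new walker is the address arithmetic in check_hwpoisoned_pmd_entry(): when a poisoned pfn falls inside a mapped huge page, the user virtual address is recovered as addr + ((poisoned_pfn - pfn) << PAGE_SHIFT). A standalone model of that calculation (PAGE_SHIFT 12 and a 512-page PMD assumed):

#include <stdio.h>

#define MODEL_PAGE_SHIFT   12
#define MODEL_HPAGE_PMD_NR 512

int main(void)
{
	unsigned long addr = 0x7f0000200000UL;	/* start of the PMD mapping */
	unsigned long pfn = 0x41a00;		/* first pfn of the huge page */
	unsigned long poisoned_pfn = 0x41a07;	/* pfn reported by the machine check */

	if (pfn <= poisoned_pfn && poisoned_pfn < pfn + MODEL_HPAGE_PMD_NR) {
		unsigned long vaddr = addr +
			((poisoned_pfn - pfn) << MODEL_PAGE_SHIFT);

		printf("poisoned vaddr: 0x%lx\n", vaddr);	/* 0x7f0000207000 */
	}
	return 0;
}
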
static const char *action_name[] = {
[MF_IGNORED] = "Ignored",
[MF_FAILED] = "Failed",
@@ -658,6 +801,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
+ unlock_page(p);
return MF_IGNORED;
}
@@ -667,6 +811,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ unlock_page(p);
return MF_FAILED;
}
@@ -675,6 +820,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -683,8 +829,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
*/
- if (PageAnon(p))
- return MF_RECOVERED;
+ if (PageAnon(p)) {
+ ret = MF_RECOVERED;
+ goto out;
+ }
/*
* Now truncate the page in the page cache. This is really
@@ -698,7 +846,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
- return MF_FAILED;
+ ret = MF_FAILED;
+ goto out;
}
/*
@@ -706,7 +855,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- return truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, pfn, mapping);
+out:
+ unlock_page(p);
+ return ret;
}
/*
@@ -782,24 +934,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
*/
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
+ int ret;
+
ClearPageDirty(p);
/* Trigger EIO in shmem: */
ClearPageUptodate(p);
- if (!delete_from_lru_cache(p))
- return MF_DELAYED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
+ unlock_page(p);
+ return ret;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
+
delete_from_swap_cache(p);
- if (!delete_from_lru_cache(p))
- return MF_RECOVERED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
+ unlock_page(p);
+ return ret;
}
/*
@@ -820,6 +974,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
+ unlock_page(hpage);
} else {
res = MF_FAILED;
unlock_page(hpage);
@@ -834,7 +989,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
page_ref_inc(p);
res = MF_RECOVERED;
}
- lock_page(hpage);
}
return res;
@@ -866,6 +1020,8 @@ static struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
@@ -929,6 +1085,7 @@ static int page_action(struct page_state *ps, struct page *p,
int result;
int count;
+ /* page p should be unlocked after returning from ps->action(). */
result = ps->action(p, pfn);
count = page_count(p) - 1;
@@ -949,18 +1106,36 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
-/**
- * __get_hwpoison_page() - Get refcount for memory error handling:
- * @page: raw error page (hit by memory error)
- *
- * Return: return 0 if failed to grab the refcount, otherwise true (some
- * non-zero value.)
+/*
+ * Return true if a page type of a given page is supported by hwpoison
+ * mechanism (while handling could fail), otherwise false. This function
+ * does not return true for hugetlb or device memory pages, so it's assumed
+ * to be called only in the context where we never have such pages.
*/
+static inline bool HWPoisonHandlable(struct page *page)
+{
+ return PageLRU(page) || __PageMovable(page);
+}
+
static int __get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
+ int ret = 0;
+ bool hugetlb = false;
+
+ ret = get_hwpoison_huge_page(head, &hugetlb);
+ if (hugetlb)
+ return ret;
- if (!PageHuge(head) && PageTransHuge(head)) {
+ /*
+ * This check prevents calling get_hwpoison_unless_zero()
+ * for any unsupported type of page in order to reduce the risk of
+ * unexpected races caused by taking a page refcount.
+ */
+ if (!HWPoisonHandlable(head))
+ return 0;
+
+ if (PageTransHuge(head)) {
/*
* Non anonymous thp exists only in allocation/free time. We
* can't handle such a case correctly, so let's give it up.
@@ -986,15 +1161,6 @@ static int __get_hwpoison_page(struct page *page)
return 0;
}
-/*
- * Safely get reference count of an arbitrary page.
- *
- * Returns 0 for a free page, 1 for an in-use page,
- * -EIO for a page-type we cannot handle and -EBUSY if we raced with an
- * allocation.
- * We only incremented refcount in case the page was already in-use and it
- * is a known type we can handle.
- */
static int get_any_page(struct page *p, unsigned long flags)
{
int ret = 0, pass = 0;
@@ -1004,50 +1170,77 @@ static int get_any_page(struct page *p, unsigned long flags)
count_increased = true;
try_again:
- if (!count_increased && !__get_hwpoison_page(p)) {
- if (page_count(p)) {
- /* We raced with an allocation, retry. */
- if (pass++ < 3)
- goto try_again;
- ret = -EBUSY;
- } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
- /* We raced with put_page, retry. */
+ if (!count_increased) {
+ ret = __get_hwpoison_page(p);
+ if (!ret) {
+ if (page_count(p)) {
+ /* We raced with an allocation, retry. */
+ if (pass++ < 3)
+ goto try_again;
+ ret = -EBUSY;
+ } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
+ /* We raced with put_page, retry. */
+ if (pass++ < 3)
+ goto try_again;
+ ret = -EIO;
+ }
+ goto out;
+ } else if (ret == -EBUSY) {
+ /* We raced with freeing huge page to buddy, retry. */
if (pass++ < 3)
goto try_again;
- ret = -EIO;
+ goto out;
}
+ }
+
+ if (PageHuge(p) || HWPoisonHandlable(p)) {
+ ret = 1;
} else {
- if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
- ret = 1;
- } else {
- /*
- * A page we cannot handle. Check whether we can turn
- * it into something we can handle.
- */
- if (pass++ < 3) {
- put_page(p);
- shake_page(p, 1);
- count_increased = false;
- goto try_again;
- }
+ /*
+ * A page we cannot handle. Check whether we can turn
+ * it into something we can handle.
+ */
+ if (pass++ < 3) {
put_page(p);
- ret = -EIO;
+ shake_page(p, 1);
+ count_increased = false;
+ goto try_again;
}
+ put_page(p);
+ ret = -EIO;
}
-
+out:
return ret;
}
-static int get_hwpoison_page(struct page *p, unsigned long flags,
- enum mf_flags ctxt)
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling
+ * @p: Raw error page (hit by memory error)
+ * @flags: Flags controlling behavior of error handling
+ *
+ * get_hwpoison_page() takes a page refcount of an error page to handle memory
+ * error on it, after checking that the error page is in a well-defined state
+ * (defined as a page type on which we can successfully handle the memory
+ * error, such as LRU pages and hugetlb pages).
+ *
+ * Memory error handling could be triggered at any time on any type of page,
+ * so it's prone to race with typical memory management lifecycle (like
+ * allocation and free). So to avoid such races, get_hwpoison_page() takes
+ * extra care for the error page's state (as done in __get_hwpoison_page()),
+ * and has some retry logic in get_any_page().
+ *
+ * Return: 0 on failure,
+ * 1 on success for in-use pages in a well-defined state,
+ * -EIO for pages on which we can not handle memory errors,
+ * -EBUSY when get_hwpoison_page() has raced with page lifecycle
+ * operations like allocation and free.
+ */
+static int get_hwpoison_page(struct page *p, unsigned long flags)
{
int ret;
zone_pcp_disable(page_zone(p));
- if (ctxt == MF_SOFT_OFFLINE)
- ret = get_any_page(p, flags);
- else
- ret = __get_hwpoison_page(p);
+ ret = get_any_page(p, flags);
zone_pcp_enable(page_zone(p));
return ret;
@@ -1228,32 +1421,41 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
if (TestSetPageHWPoison(head)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
- return 0;
+ res = -EHWPOISON;
+ if (flags & MF_ACTION_REQUIRED)
+ res = kill_accessing_process(current, page_to_pfn(head), flags);
+ return res;
}
num_poisoned_pages_inc();
- if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p, flags, 0)) {
- /*
- * Check "filter hit" and "race with other subpage."
- */
- lock_page(head);
- if (PageHWPoison(head)) {
- if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
- || (p != head && TestSetPageHWPoison(head))) {
- num_poisoned_pages_dec();
- unlock_page(head);
- return 0;
+ if (!(flags & MF_COUNT_INCREASED)) {
+ res = get_hwpoison_page(p, flags);
+ if (!res) {
+ /*
+ * Check "filter hit" and "race with other subpage."
+ */
+ lock_page(head);
+ if (PageHWPoison(head)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != head && TestSetPageHWPoison(head))) {
+ num_poisoned_pages_dec();
+ unlock_page(head);
+ return 0;
+ }
}
+ unlock_page(head);
+ res = MF_FAILED;
+ if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) {
+ page_ref_inc(p);
+ res = MF_RECOVERED;
+ }
+ action_result(pfn, MF_MSG_FREE_HUGE, res);
+ return res == MF_RECOVERED ? 0 : -EBUSY;
+ } else if (res < 0) {
+ action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+ return -EBUSY;
}
- unlock_page(head);
- res = MF_FAILED;
- if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) {
- page_ref_inc(p);
- res = MF_RECOVERED;
- }
- action_result(pfn, MF_MSG_FREE_HUGE, res);
- return res == MF_RECOVERED ? 0 : -EBUSY;
}
lock_page(head);
@@ -1288,7 +1490,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
goto out;
}
- res = identify_page_state(pfn, p, page_flags);
+ return identify_page_state(pfn, p, page_flags);
out:
unlock_page(head);
return res;
@@ -1368,7 +1570,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* communicated in siginfo, see kill_proc()
*/
start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, start + size, 0);
+ unmap_mapping_range(page->mapping, start, size, 0);
}
kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
@@ -1404,9 +1606,10 @@ int memory_failure(unsigned long pfn, int flags)
struct page *hpage;
struct page *orig_head;
struct dev_pagemap *pgmap;
- int res;
+ int res = 0;
unsigned long page_flags;
bool retry = true;
+ static DEFINE_MUTEX(mf_mutex);
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
@@ -1424,13 +1627,21 @@ int memory_failure(unsigned long pfn, int flags)
return -ENXIO;
}
+ mutex_lock(&mf_mutex);
+
try_again:
- if (PageHuge(p))
- return memory_failure_hugetlb(pfn, flags);
+ if (PageHuge(p)) {
+ res = memory_failure_hugetlb(pfn, flags);
+ goto unlock_mutex;
+ }
+
if (TestSetPageHWPoison(p)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
- return 0;
+ res = -EHWPOISON;
+ if (flags & MF_ACTION_REQUIRED)
+ res = kill_accessing_process(current, pfn, flags);
+ goto unlock_mutex;
}
orig_head = hpage = compound_head(p);
@@ -1447,33 +1658,42 @@ try_again:
* In fact it's dangerous to directly bump up page count from 0,
* that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
*/
- if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p, flags, 0)) {
- if (is_free_buddy_page(p)) {
- if (take_page_off_buddy(p)) {
- page_ref_inc(p);
- res = MF_RECOVERED;
- } else {
- /* We lost the race, try again */
- if (retry) {
- ClearPageHWPoison(p);
- num_poisoned_pages_dec();
- retry = false;
- goto try_again;
+ if (!(flags & MF_COUNT_INCREASED)) {
+ res = get_hwpoison_page(p, flags);
+ if (!res) {
+ if (is_free_buddy_page(p)) {
+ if (take_page_off_buddy(p)) {
+ page_ref_inc(p);
+ res = MF_RECOVERED;
+ } else {
+ /* We lost the race, try again */
+ if (retry) {
+ ClearPageHWPoison(p);
+ num_poisoned_pages_dec();
+ retry = false;
+ goto try_again;
+ }
+ res = MF_FAILED;
}
- res = MF_FAILED;
+ action_result(pfn, MF_MSG_BUDDY, res);
+ res = res == MF_RECOVERED ? 0 : -EBUSY;
+ } else {
+ action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
+ res = -EBUSY;
}
- action_result(pfn, MF_MSG_BUDDY, res);
- return res == MF_RECOVERED ? 0 : -EBUSY;
- } else {
- action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
- return -EBUSY;
+ goto unlock_mutex;
+ } else if (res < 0) {
+ action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+ res = -EBUSY;
+ goto unlock_mutex;
}
}
if (PageTransHuge(hpage)) {
if (try_to_split_thp_page(p, "Memory Failure") < 0) {
action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
- return -EBUSY;
+ res = -EBUSY;
+ goto unlock_mutex;
}
VM_BUG_ON_PAGE(!page_count(p), p);
}
@@ -1497,7 +1717,7 @@ try_again:
if (PageCompound(p) && compound_head(p) != orig_head) {
action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
res = -EBUSY;
- goto out;
+ goto unlock_page;
}
/*
@@ -1517,17 +1737,22 @@ try_again:
num_poisoned_pages_dec();
unlock_page(p);
put_page(p);
- return 0;
+ goto unlock_mutex;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unlock_page(p);
put_page(p);
- return 0;
+ goto unlock_mutex;
}
- if (!PageTransTail(p) && !PageLRU(p))
+ /*
+ * __munlock_pagevec may clear a writeback page's LRU flag without
+ * page_lock. We need to wait for writeback completion on this page or it
+ * may trigger a vfs BUG while evicting the inode.
+ */
+ if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
goto identify_page_state;
/*
@@ -1543,7 +1768,7 @@ try_again:
if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
- goto out;
+ goto unlock_page;
}
/*
@@ -1552,13 +1777,17 @@ try_again:
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
res = -EBUSY;
- goto out;
+ goto unlock_page;
}
identify_page_state:
res = identify_page_state(pfn, p, page_flags);
-out:
+ mutex_unlock(&mf_mutex);
+ return res;
+unlock_page:
unlock_page(p);
+unlock_mutex:
+ mutex_unlock(&mf_mutex);
return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
@@ -1735,7 +1964,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
- if (!get_hwpoison_page(p, flags, 0)) {
+ if (!get_hwpoison_page(p, flags)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
@@ -1951,7 +2180,7 @@ int soft_offline_page(unsigned long pfn, int flags)
retry:
get_online_mems();
- ret = get_hwpoison_page(page, flags, MF_SOFT_OFFLINE);
+ ret = get_hwpoison_page(page, flags);
put_online_mems();
if (ret > 0) {
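
For reference, a minimal sketch (not part of the patch) of how a caller inside memory-failure.c could interpret the unified get_hwpoison_page() return convention documented above; the helper name is hypothetical.

	/* Illustrative only: example_take_page_for_recovery() is hypothetical. */
	static int example_take_page_for_recovery(struct page *p, unsigned long flags)
	{
		int ret = get_hwpoison_page(p, flags);

		if (ret == 1)
			return 0;	/* refcount taken, page is in a handlable state */
		if (ret < 0)
			return ret;	/* -EIO: unhandlable type, -EBUSY: lost a race */
		return -EBUSY;		/* ret == 0: could not grab a refcount */
	}
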
diff --git a/mm/memory.c b/mm/memory.c
index 550405fc3b5e..48c4576df898 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -90,8 +90,7 @@
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-/* use the per-pgdat data instead for discontigmem - mbligh */
+#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);
@@ -1361,7 +1360,18 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
+ } else if (details && details->single_page &&
+ PageTransCompound(details->single_page) &&
+ next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
+ spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+ /*
+ * Take and drop THP pmd lock so that we cannot return
+ * prematurely, while zap_huge_pmd() has cleared *pmd,
+ * but not yet decremented compound_mapcount().
+ */
+ spin_unlock(ptl);
}
+
/*
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
@@ -2260,26 +2270,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap. The caller
+ * must have pre-validated the caching bits of the pgprot_t.
*/
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
pgd_t *pgd;
unsigned long next;
unsigned long end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
- unsigned long remap_pfn = pfn;
int err;
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
@@ -2309,10 +2310,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
vma->vm_pgoff = pfn;
}
- err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
- if (err)
- return -EINVAL;
-
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
BUG_ON(addr >= end);
@@ -2324,12 +2321,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
err = remap_p4d_range(mm, pgd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
- break;
+ return err;
} while (pgd++, addr = next, addr != end);
+ return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ int err;
+
+ err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
if (err)
- untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
+ return -EINVAL;
+ err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+ if (err)
+ untrack_pfn(vma, pfn, PAGE_ALIGN(size));
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -2446,13 +2467,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
}
do {
next = pmd_addr_end(addr, end);
- if (create || !pmd_none_or_clear_bad(pmd)) {
- err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (pmd_none(*pmd) && !create)
+ continue;
+ if (WARN_ON_ONCE(pmd_leaf(*pmd)))
+ return -EINVAL;
+ if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
+ if (!create)
+ continue;
+ pmd_clear_bad(pmd);
}
+ err = apply_to_pte_range(mm, pmd, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (pmd++, addr = next, addr != end);
+
return err;
}
@@ -2474,13 +2503,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
}
do {
next = pud_addr_end(addr, end);
- if (create || !pud_none_or_clear_bad(pud)) {
- err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (pud_none(*pud) && !create)
+ continue;
+ if (WARN_ON_ONCE(pud_leaf(*pud)))
+ return -EINVAL;
+ if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
+ if (!create)
+ continue;
+ pud_clear_bad(pud);
}
+ err = apply_to_pmd_range(mm, pud, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (pud++, addr = next, addr != end);
+
return err;
}
@@ -2502,13 +2539,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
}
do {
next = p4d_addr_end(addr, end);
- if (create || !p4d_none_or_clear_bad(p4d)) {
- err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
- create, mask);
- if (err)
- break;
+ if (p4d_none(*p4d) && !create)
+ continue;
+ if (WARN_ON_ONCE(p4d_leaf(*p4d)))
+ return -EINVAL;
+ if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
+ if (!create)
+ continue;
+ p4d_clear_bad(p4d);
}
+ err = apply_to_pud_range(mm, p4d, addr, next,
+ fn, data, create, mask);
+ if (err)
+ break;
} while (p4d++, addr = next, addr != end);
+
return err;
}
@@ -2528,9 +2573,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (!create && pgd_none_or_clear_bad(pgd))
+ if (pgd_none(*pgd) && !create)
continue;
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return -EINVAL;
+ if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
+ if (!create)
+ continue;
+ pgd_clear_bad(pgd);
+ }
+ err = apply_to_p4d_range(mm, pgd, addr, next,
+ fn, data, create, &mask);
if (err)
break;
} while (pgd++, addr = next, addr != end);
@@ -2896,6 +2949,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -2968,6 +3022,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
munlock_vma_page(old_page);
unlock_page(old_page);
}
+ if (page_copied)
+ free_swap_cache(old_page);
put_page(old_page);
}
return page_copied ? VM_FAULT_WRITE : 0;
@@ -2992,7 +3048,7 @@ oom:
* The function expects the page to be locked or other protection against
* concurrent faults / writeback (such as DAX radix tree locks).
*
- * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
+ * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
* we acquired PTE lock.
*/
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
@@ -3193,6 +3249,36 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
}
/**
+ * unmap_mapping_page() - Unmap single page from processes.
+ * @page: The locked page to be unmapped.
+ *
+ * Unmap this page from any userspace process which still has it mmaped.
+ * Typically, for efficiency, the range of nearby pages has already been
+ * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
+ * truncation or invalidation holds the lock on a page, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_page()
+ * to unmap it finally.
+ */
+void unmap_mapping_page(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct zap_details details = { };
+
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageTail(page));
+
+ details.check_mapping = mapping;
+ details.first_index = page->index;
+ details.last_index = page->index + thp_nr_pages(page) - 1;
+ details.single_page = page;
+
+ i_mmap_lock_write(mapping);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+ unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ i_mmap_unlock_write(mapping);
+}
+
+/**
* unmap_mapping_pages() - Unmap pages from processes.
* @mapping: The address space containing pages to be unmapped.
* @start: Index of first page to be unmapped.
@@ -3268,6 +3354,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
+ struct swap_info_struct *si = NULL;
swp_entry_t entry;
pte_t pte;
int locked;
@@ -3295,42 +3382,42 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out;
}
+ /* Prevent swapoff from happening to us. */
+ si = get_swap_device(entry);
+ if (unlikely(!si))
+ goto out;
- delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+ delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry, vma, vmf->address);
swapcache = page;
if (!page) {
- struct swap_info_struct *si = swp_swap_info(entry);
-
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
/* skip swapcache */
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
if (page) {
- int err;
-
__SetPageLocked(page);
__SetPageSwapBacked(page);
- set_page_private(page, entry.val);
- /* Tell memcg to use swap ownership records */
- SetPageSwapCache(page);
- err = mem_cgroup_charge(page, vma->vm_mm,
- GFP_KERNEL);
- ClearPageSwapCache(page);
- if (err) {
+ if (mem_cgroup_swapin_charge_page(page,
+ vma->vm_mm, GFP_KERNEL, entry)) {
ret = VM_FAULT_OOM;
goto out_page;
}
+ mem_cgroup_swapin_uncharge_swap(entry);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
workingset_refault(page, shadow);
lru_cache_add(page);
+
+ /* To provide entry to swap_readpage() */
+ set_page_private(page, entry.val);
swap_readpage(page, true);
+ set_page_private(page, 0);
}
} else {
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
@@ -3347,7 +3434,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
goto unlock;
}
@@ -3361,13 +3448,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* owner processes (which may be unknown at hwpoison time)
*/
ret = VM_FAULT_HWPOISON;
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
goto out_release;
}
locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
- delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
if (!locked) {
ret |= VM_FAULT_RETRY;
goto out_release;
@@ -3473,6 +3560,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ if (si)
+ put_swap_device(si);
return ret;
out_nomap:
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3484,6 +3573,8 @@ out_release:
unlock_page(swapcache);
put_page(swapcache);
}
+ if (si)
+ put_swap_device(si);
return ret;
}
@@ -3561,6 +3652,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
__SetPageUptodate(page);
entry = mk_pte(page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
@@ -3686,7 +3778,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
return ret;
/*
- * Archs like ppc64 need additonal space to store information
+ * Archs like ppc64 need additional space to store information
* related to pte entry. Use the preallocated table for that.
*/
if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
@@ -3745,6 +3837,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (prefault && arch_wants_old_prefaulted_pte())
entry = pte_mkold(entry);
+ else
+ entry = pte_sw_mkyoung(entry);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -4100,7 +4194,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
- bool migrated = false;
pte_t pte, old_pte;
bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
@@ -4117,29 +4210,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
goto out;
}
- /*
- * Make it present again, Depending on how arch implementes non
- * accessible ptes, some can allow access by kernel mode.
- */
- old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ /* Get the normal PTE */
+ old_pte = ptep_get(vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
- pte = pte_mkyoung(pte);
- if (was_writable)
- pte = pte_mkwrite(pte);
- ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
- update_mmu_cache(vma, vmf->address, vmf->pte);
page = vm_normal_page(vma, vmf->address, pte);
- if (!page) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
- }
+ if (!page)
+ goto out_map;
/* TODO: handle PTE-mapped THP */
- if (PageCompound(page)) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
- }
+ if (PageCompound(page))
+ goto out_map;
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
@@ -4149,7 +4230,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour.
*/
- if (!pte_write(pte))
+ if (!was_writable)
flags |= TNF_NO_GROUP;
/*
@@ -4163,24 +4244,45 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
page_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == NUMA_NO_NODE) {
put_page(page);
- goto out;
+ goto out_map;
}
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
/* Migrate to the requested node */
- migrated = migrate_misplaced_page(page, vma, target_nid);
- if (migrated) {
+ if (migrate_misplaced_page(page, vma, target_nid)) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
- } else
+ } else {
flags |= TNF_MIGRATE_FAIL;
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto out;
+ }
+ goto out_map;
+ }
out:
if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
+out_map:
+ /*
+ * Make it present again. Depending on how the arch implements
+ * non-accessible ptes, some can allow access by kernel mode.
+ */
+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
+ if (was_writable)
+ pte = pte_mkwrite(pte);
+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto out;
}
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
@@ -4454,7 +4556,7 @@ retry_pud:
}
/**
- * mm_account_fault - Do page fault accountings
+ * mm_account_fault - Do page fault accounting
*
* @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
* of perf event counters, but we'll still do the per-task accounting to
@@ -4463,9 +4565,9 @@ retry_pud:
* @flags: the fault flags.
* @ret: the fault retcode.
*
- * This will take care of most of the page fault accountings. Meanwhile, it
+ * This will take care of most of the page fault accounting. Meanwhile, it
* will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
- * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
* still be in per-arch page fault handlers at the entry of page fault.
*/
static inline void mm_account_fault(struct pt_regs *regs,
@@ -4799,7 +4901,7 @@ out:
/**
* generic_access_phys - generic implementation for iomem mmap access
* @vma: the vma to access
- * @addr: userspace addres, not relative offset within @vma
+ * @addr: userspace address, not relative offset within @vma
* @buf: buffer to read/write
* @len: length of transfer
* @write: set to FOLL_WRITE when writing, otherwise reading
@@ -4891,8 +4993,8 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
+ vma = vma_lookup(mm, addr);
+ if (!vma)
break;
if (vma->vm_ops && vma->vm_ops->access)
ret = vma->vm_ops->access(vma, addr, buf,
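
As a usage sketch for the refactored remap_pfn_range()/remap_pfn_range_notrack() split above (illustrative, not from this patch; the handler name and physical base are hypothetical):

	static phys_addr_t example_phys_base;	/* hypothetical MMIO region start */

	static int example_dev_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long pfn = example_phys_base >> PAGE_SHIFT;
		unsigned long size = vma->vm_end - vma->vm_start;

		/*
		 * mmap_lock is held by the mmap() path, as the remap_pfn_range()
		 * kernel-doc requires; PAT tracking is done internally via
		 * track_pfn_remap() before remap_pfn_range_notrack() runs.
		 */
		return remap_pfn_range(vma, vma->vm_start, pfn, size,
				       vma->vm_page_prot);
	}
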
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0cdbbfbc5757..974a565797d8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,6 +42,16 @@
#include "internal.h"
#include "shuffle.h"
+
+/*
+ * memory_hotplug.memmap_on_memory parameter
+ */
+static bool memmap_on_memory __ro_after_init;
+#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
+module_param(memmap_on_memory, bool, 0444);
+MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+#endif
+
/*
* online_page_callback contains pointer to current page onlining function.
* Initially it is generic_online_page(). If it is required it could be
@@ -648,9 +658,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
* decide to not expose all pages to the buddy (e.g., expose them
* later). We account all pages as being online and belonging to this
* zone ("present").
+ * When using memmap_on_memory, the range might not be aligned to
+ * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
+ * this and the first chunk to online will be pageblock_nr_pages.
*/
- for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
- (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
+ for (pfn = start_pfn; pfn < end_pfn;) {
+ int order = min(MAX_ORDER - 1UL, __ffs(pfn));
+
+ (*online_page_callback)(pfn_to_page(pfn), order);
+ pfn += (1UL << order);
+ }
/* mark all involved sections as online */
online_mem_sections(start_pfn, end_pfn);
@@ -817,7 +834,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn
return movable_node_enabled ? movable_zone : kernel_zone;
}
-struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
+struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
unsigned long nr_pages)
{
if (online_type == MMOP_ONLINE_KERNEL)
@@ -829,24 +846,86 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
- int online_type, int nid)
+/*
+ * This function should only be called by memory_block_{online,offline},
+ * and {online,offline}_pages.
+ */
+void adjust_present_page_count(struct zone *zone, long nr_pages)
+{
+ unsigned long flags;
+
+ zone->present_pages += nr_pages;
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ zone->zone_pgdat->node_present_pages += nr_pages;
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
+int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+ struct zone *zone)
+{
+ unsigned long end_pfn = pfn + nr_pages;
+ int ret;
+
+ ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+ if (ret)
+ return ret;
+
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+
+ /*
+ * It might be that the vmemmap_pages fully span sections. If that is
+ * the case, mark those sections online here as otherwise they will be
+ * left offline.
+ */
+ if (nr_pages >= PAGES_PER_SECTION)
+ online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+ return ret;
+}
+
+void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long end_pfn = pfn + nr_pages;
+
+ /*
+ * It might be that the vmemmap_pages fully span sections. If that is
+ * the case, mark those sections offline here as otherwise they will be
+ * left online.
+ */
+ if (nr_pages >= PAGES_PER_SECTION)
+ offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+ /*
+ * The pages associated with this vmemmap have been offlined, so
+ * we can reset its state here.
+ */
+ remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
+ kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+}
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
{
unsigned long flags;
- struct zone *zone;
int need_zonelists_rebuild = 0;
+ const int nid = zone_to_nid(zone);
int ret;
struct memory_notify arg;
- /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+ /*
+ * {on,off}lining is constrained to full memory sections (or more
+ * precisly to memory blocks from the user space POV).
+ * memmap_on_memory is an exception because it reserves initial part
+ * of the physical memory space for vmemmaps. That space is pageblock
+ * aligned.
+ */
if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+ !IS_ALIGNED(pfn, pageblock_nr_pages) ||
+ !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
return -EINVAL;
mem_hotplug_begin();
/* associate pfn range with the zone */
- zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
arg.start_pfn = pfn;
@@ -877,16 +956,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
}
online_pages_range(pfn, nr_pages);
- zone->present_pages += nr_pages;
-
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages += nr_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ adjust_present_page_count(zone, nr_pages);
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
- zone_pcp_update(zone);
/* Basic onlining is complete, allow allocation of onlined pages. */
undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
@@ -899,6 +973,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
*/
shuffle_zone(zone);
+ /* reinitialise watermarks and update pcp limits */
init_per_zone_wmark_min();
kswapd_run(nid);
@@ -1064,6 +1139,45 @@ static int online_memory_block(struct memory_block *mem, void *arg)
return device_online(&mem->dev);
}
+bool mhp_supports_memmap_on_memory(unsigned long size)
+{
+ unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
+ unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+ unsigned long remaining_size = size - vmemmap_size;
+
+ /*
+ * Besides having arch support and the feature enabled at runtime, we
+ * need a few more assumptions to hold true:
+ *
+ * a) We span a single memory block: memory onlining/offlining happens
+ * in memory block granularity. We don't want the vmemmap of online
+ * memory blocks to reside on offline memory blocks. In the future,
+ * we might want to support variable-sized memory blocks to make the
+ * feature more versatile.
+ *
+ * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+ * to populate memory from the altmap for unrelated parts (i.e.,
+ * other memory blocks)
+ *
+ * c) The vmemmap pages (and thereby the pages that will be exposed to
+ * the buddy) have to cover full pageblocks: memory onlining/offlining
+ * code requires applicable ranges to be page-aligned, for example, to
+ * set the migratetypes properly.
+ *
+ * TODO: Although we have a check here to make sure that vmemmap pages
+ * fully populate a PMD, it is not the right place to check for
+ * this. A much better solution involves improving vmemmap code
+ * to fallback to base pages when trying to populate vmemmap using
+ * altmap as an alternative source of memory, and we do not exactly
+ * populate a single PMD.
+ */
+ return memmap_on_memory &&
+ IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
+ size == memory_block_size_bytes() &&
+ IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
+ IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+}
+
/*
* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
* and online/offline operations (triggered e.g. by sysfs).
@@ -1073,6 +1187,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+ struct vmem_altmap mhp_altmap = {};
u64 start, size;
bool new_node = false;
int ret;
@@ -1099,13 +1214,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
goto error;
new_node = ret;
+ /*
+ * Self hosted memmap array
+ */
+ if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
+ if (!mhp_supports_memmap_on_memory(size)) {
+ ret = -EINVAL;
+ goto error;
+ }
+ mhp_altmap.free = PHYS_PFN(size);
+ mhp_altmap.base_pfn = PHYS_PFN(start);
+ params.altmap = &mhp_altmap;
+ }
+
/* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size, &params);
if (ret < 0)
goto error;
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(start, size);
+ ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
if (ret) {
arch_remove_memory(nid, start, size, NULL);
goto error;
@@ -1573,9 +1701,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
int ret, node;
char *reason;
- /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
+ /*
+ * {on,off}lining is constrained to full memory sections (or more
+ * precisly to memory blocks from the user space POV).
+ * memmap_on_memory is an exception because it reserves initial part
+ * of the physical memory space for vmemmaps. That space is pageblock
+ * aligned.
+ */
if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
+ !IS_ALIGNED(start_pfn, pageblock_nr_pages) ||
+ !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
return -EINVAL;
mem_hotplug_begin();
@@ -1611,6 +1746,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
* in a way that pages from isolated pageblock are left on pcplists.
*/
zone_pcp_disable(zone);
+ lru_cache_disable();
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
@@ -1642,7 +1778,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
}
cond_resched();
- lru_add_drain_all();
ret = scan_movable_pages(pfn, end_pfn, &pfn);
if (!ret) {
@@ -1687,23 +1822,20 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
+ lru_cache_enable();
zone_pcp_enable(zone);
/* removal success */
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
- zone->present_pages -= nr_pages;
-
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages -= nr_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ adjust_present_page_count(zone, -nr_pages);
+ /* reinitialise watermarks and update pcp limits */
init_per_zone_wmark_min();
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
- } else
- zone_pcp_update(zone);
+ }
node_states_clear_node(node, &arg);
if (arg.status_change_nid >= 0) {
@@ -1750,6 +1882,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
return 0;
}
+static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+{
+ /*
+ * If not set, continue with the next block.
+ */
+ return mem->nr_vmemmap_pages;
+}
+
static int check_cpu_on_node(pg_data_t *pgdat)
{
int cpu;
@@ -1824,6 +1964,9 @@ EXPORT_SYMBOL(try_offline_node);
static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
int rc = 0;
+ struct vmem_altmap mhp_altmap = {};
+ struct vmem_altmap *altmap = NULL;
+ unsigned long nr_vmemmap_pages;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -1836,6 +1979,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
if (rc)
return rc;
+ /*
+ * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
+ * the same granularity it was added - a single memory block.
+ */
+ if (memmap_on_memory) {
+ nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
+ get_nr_vmemmap_pages_cb);
+ if (nr_vmemmap_pages) {
+ if (size != memory_block_size_bytes()) {
+ pr_warn("Refuse to remove %#llx - %#llx, "
+ "wrong granularity\n",
+ start, start + size);
+ return -EINVAL;
+ }
+
+ /*
+ * Let remove_pmd_table->free_hugepage_table do the
+ * right thing if we used vmem_altmap when hot-adding
+ * the range.
+ */
+ mhp_altmap.alloc = nr_vmemmap_pages;
+ altmap = &mhp_altmap;
+ }
+ }
+
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
@@ -1847,7 +2015,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
mem_hotplug_begin();
- arch_remove_memory(nid, start, size, NULL);
+ arch_remove_memory(nid, start, size, altmap);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
memblock_free(start, size);
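
A worked sketch of the arithmetic behind mhp_supports_memmap_on_memory() above (illustrative; the helper name and the 128 MiB / 4 KiB / 64-byte figures are assumptions that vary by configuration):

	static bool example_memmap_on_memory_fits(unsigned long size)
	{
		unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
		unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);

		/*
		 * e.g. a 128 MiB block with 4 KiB pages and a 64-byte struct page
		 * needs 32768 * 64 bytes = 2 MiB of vmemmap (PMD aligned), leaving
		 * 126 MiB of pageblock-aligned memory to expose to the buddy.
		 */
		return IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
		       IS_ALIGNED(size - vmemmap_size,
				  pageblock_nr_pages << PAGE_SHIFT);
	}
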
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ab51132547b8..b5d95bf1025d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -330,7 +330,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
+ nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
*nodes);
pol->w.cpuset_mems_allowed = *nodes;
}
@@ -975,7 +975,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
* want to return MPOL_DEFAULT in this case.
*/
mmap_read_lock(mm);
- vma = find_vma_intersection(mm, addr, addr+1);
+ vma = vma_lookup(mm, addr);
if (!vma) {
mmap_read_unlock(mm);
return -EFAULT;
@@ -994,7 +994,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_ADDR) {
/*
* Take a refcount on the mpol, lookup_node()
- * wil drop the mmap_lock, so after calling
+ * will drop the mmap_lock, so after calling
* lookup_node() only "pol" remains valid, "vma"
* is stale.
*/
@@ -1124,7 +1124,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
int err = 0;
nodemask_t tmp;
- migrate_prep();
+ lru_cache_disable();
mmap_read_lock(mm);
@@ -1161,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
tmp = *from;
while (!nodes_empty(tmp)) {
- int s,d;
+ int s, d;
int source = NUMA_NO_NODE;
int dest = 0;
@@ -1208,6 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
break;
}
mmap_read_unlock(mm);
+
+ lru_cache_enable();
if (err < 0)
return err;
return busy;
@@ -1323,7 +1325,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- migrate_prep();
+ lru_cache_disable();
}
{
NODEMASK_SCRATCH(scratch);
@@ -1371,6 +1373,8 @@ up_out:
mmap_write_unlock(mm);
mpol_out:
mpol_put(new);
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_cache_enable();
return err;
}
@@ -1863,7 +1867,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
*
* policy->v.nodes is intersect with node_states[N_MEMORY].
- * so if the following test faile, it implies
+ * so if the following test fails, it implies
* policy->v.nodes has movable memory only.
*/
if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
@@ -2094,7 +2098,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
*
* If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
* policy. Otherwise, check for intersection between mask and the policy
- * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
+ * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
* policy, always return true since it may allocate elsewhere on fallback.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
@@ -2140,43 +2144,35 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
{
struct page *page;
- page = __alloc_pages(gfp, order, nid);
+ page = __alloc_pages(gfp, order, nid, NULL);
/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
if (!static_branch_likely(&vm_numa_stat_key))
return page;
if (page && page_to_nid(page) == nid) {
preempt_disable();
- __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
+ __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
preempt_enable();
}
return page;
}
/**
- * alloc_pages_vma - Allocate a page for a VMA.
- *
- * @gfp:
- * %GFP_USER user allocation.
- * %GFP_KERNEL kernel allocations,
- * %GFP_HIGHMEM highmem/user allocations,
- * %GFP_FS allocation should not call back into a file system.
- * %GFP_ATOMIC don't sleep.
+ * alloc_pages_vma - Allocate a page for a VMA.
+ * @gfp: GFP flags.
+ * @order: Order of the GFP allocation.
+ * @vma: Pointer to VMA or NULL if not available.
+ * @addr: Virtual address of the allocation. Must be inside @vma.
+ * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: For hugepages try only the preferred node if possible.
*
- * @order:Order of the GFP allocation.
- * @vma: Pointer to VMA or NULL if not available.
- * @addr: Virtual Address of the allocation. Must be inside the VMA.
- * @node: Which node to prefer for allocation (modulo policy).
- * @hugepage: for hugepages try only the preferred node if possible
+ * Allocate a page for a specific address in @vma, using the appropriate
+ * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
+ * of the mm_struct of the VMA to prevent it from going away. Should be
+ * used for all allocations for pages that will be mapped into user space.
*
- * This function allocates a page from the kernel page pool and applies
- * a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must read-lock the mmap_lock of the
- * mm_struct of the VMA to prevent it from going away. Should be used for
- * all allocations for pages that will be mapped into user space. Returns
- * NULL when no page can be allocated.
+ * Return: The page on success or NULL if allocation fails.
*/
-struct page *
-alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
@@ -2237,7 +2233,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
- page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
+ page = __alloc_pages(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
return page;
@@ -2245,21 +2241,20 @@ out:
EXPORT_SYMBOL(alloc_pages_vma);
/**
- * alloc_pages_current - Allocate pages.
+ * alloc_pages - Allocate pages.
+ * @gfp: GFP flags.
+ * @order: Power of two of number of pages to allocate.
*
- * @gfp:
- * %GFP_USER user allocation,
- * %GFP_KERNEL kernel allocation,
- * %GFP_HIGHMEM highmem allocation,
- * %GFP_FS don't call back into a file system.
- * %GFP_ATOMIC don't sleep.
- * @order: Power of two of allocation size in pages. 0 is a single page.
+ * Allocate 1 << @order contiguous pages. The physical address of the
+ * first page is naturally aligned (eg an order-3 allocation will be aligned
+ * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
+ * process is honoured when in process context.
*
- * Allocate a page from the kernel page pool. When not in
- * interrupt context and apply the current process NUMA policy.
- * Returns NULL when no page can be allocated.
+ * Context: Can be called from any context, providing the appropriate GFP
+ * flags are used.
+ * Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+struct page *alloc_pages(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = &default_policy;
struct page *page;
@@ -2274,13 +2269,13 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
- page = __alloc_pages_nodemask(gfp, order,
+ page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
return page;
}
-EXPORT_SYMBOL(alloc_pages_current);
+EXPORT_SYMBOL(alloc_pages);
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
@@ -2457,14 +2452,11 @@ static void sp_free(struct sp_node *n)
* @addr: virtual address where page mapped
*
* Lookup current policy node id for vma,addr and "compare to" page's
- * node id.
- *
- * Returns:
- * -1 - not misplaced, page is in the right node
- * node - node id where the page should be
- *
- * Policy determination "mimics" alloc_page_vma().
+ * node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
+ *
+ * Return: -1 if the page is in a node that is valid for this policy, or a
+ * suitable node ID to allocate a replacement page from.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
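
A small usage sketch for the renamed alloc_pages() interface documented above (illustrative; the helper name is hypothetical):

	static void *example_grab_buffer(void)
	{
		/* order-3: 8 contiguous, naturally aligned pages (32 KiB with 4 KiB pages) */
		struct page *page = alloc_pages(GFP_KERNEL, 3);

		return page ? page_address(page) : NULL;
	}
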
diff --git a/mm/mempool.c b/mm/mempool.c
index 79959fac27d7..0b8afbec3e35 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -106,7 +106,8 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_slab_free_mempool(element);
else if (pool->alloc == mempool_alloc_pages)
- kasan_free_pages(element, (unsigned long)pool->pool_data);
+ kasan_poison_pages(element, (unsigned long)pool->pool_data,
+ false);
}
static void kasan_unpoison_element(mempool_t *pool, void *element)
@@ -114,7 +115,8 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_range(element, __ksize(element));
else if (pool->alloc == mempool_alloc_pages)
- kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+ kasan_unpoison_pages(element, (unsigned long)pool->pool_data,
+ false);
}
static __always_inline void add_element(mempool_t *pool, void *element)
@@ -251,7 +253,7 @@ EXPORT_SYMBOL(mempool_init);
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
{
- return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
+ return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
GFP_KERNEL, NUMA_NO_NODE);
}
EXPORT_SYMBOL(mempool_create);
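
For context on the kasan_poison_pages()/kasan_unpoison_pages() hunk above, a sketch of a page-backed mempool whose pool_data carries the page order (illustrative; the pool name and sizes are hypothetical):

	static mempool_t *example_pool;

	static int __init example_pool_init(void)
	{
		/* four preallocated order-1 (two-page) elements */
		example_pool = mempool_create(4, mempool_alloc_pages,
					      mempool_free_pages, (void *)1);
		return example_pool ? 0 : -ENOMEM;
	}
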
diff --git a/mm/memremap.c b/mm/memremap.c
index 7aa7d6e80ee5..15a074ffb8d7 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
#include <linux/device.h>
#include <linux/io.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 62b81d5257aa..380ca57b9031 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -57,28 +57,6 @@
#include "internal.h"
-/*
- * migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
- * undesirable, use migrate_prep_local()
- */
-void migrate_prep(void)
-{
- /*
- * Clear the LRU lists so pages can be isolated.
- * Note that pages may be moved off the LRU after we have
- * drained them. Those pages will fail to migrate like other
- * pages that may be busy.
- */
- lru_add_drain_all();
-}
-
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
-void migrate_prep_local(void)
-{
- lru_add_drain();
-}
-
int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
struct address_space *mapping;
@@ -140,15 +118,10 @@ out:
return -EBUSY;
}
-/* It should be called on page which is PG_movable */
-void putback_movable_page(struct page *page)
+static void putback_movable_page(struct page *page)
{
struct address_space *mapping;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageMovable(page), page);
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
-
mapping = page_mapping(page);
mapping->a_ops->putback_page(page);
__ClearPageIsolated(page);
@@ -322,6 +295,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
goto out;
page = migration_entry_to_page(entry);
+ page = compound_head(page);
/*
* Once page cache replacement of page migration started, page_count
@@ -1375,7 +1349,7 @@ out_unlock:
out:
if (rc == MIGRATEPAGE_SUCCESS)
putback_active_hugepage(hpage);
- else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS)
+ else if (rc != -EAGAIN)
list_move_tail(&hpage->lru, ret);
/*
@@ -1445,6 +1419,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
int rc, nr_subpages;
LIST_HEAD(ret_pages);
+ trace_mm_migrate_pages_start(mode, reason);
+
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
@@ -1617,7 +1593,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
gfp_mask |= __GFP_HIGHMEM;
- new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+ new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
if (new_page && PageTransHuge(new_page))
prep_transhuge_page(new_page);
@@ -1769,7 +1745,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
int start, i;
int err = 0, err1;
- migrate_prep();
+ lru_cache_disable();
for (i = start = 0; i < nr_pages; i++) {
const void __user *p;
@@ -1838,6 +1814,7 @@ out_flush:
if (err >= 0)
err = err1;
out:
+ lru_cache_enable();
return err;
}
@@ -1857,8 +1834,8 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
struct page *page;
int err = -EFAULT;
- vma = find_vma(mm, addr);
- if (!vma || addr < vma->vm_start)
+ vma = vma_lookup(mm, addr);
+ if (!vma)
goto set_status;
/* FOLL_DUMP to ignore special (like zero) pages */
@@ -2110,17 +2087,6 @@ bool pmd_trans_migrating(pmd_t pmd)
return PageLocked(page);
}
-static inline bool is_shared_exec_page(struct vm_area_struct *vma,
- struct page *page)
-{
- if (page_mapcount(page) != 1 &&
- (page_is_file_lru(page) || vma_is_shmem(vma)) &&
- (vma->vm_flags & VM_EXEC))
- return true;
-
- return false;
-}
-
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -2138,7 +2104,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
* Don't migrate file pages that are mapped in multiple processes
* with execute permissions as they are probably shared libraries.
*/
- if (is_shared_exec_page(vma, page))
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+ (vma->vm_flags & VM_EXEC))
goto out;
/*
@@ -2193,9 +2160,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int page_lru = page_is_file_lru(page);
unsigned long start = address & HPAGE_PMD_MASK;
- if (is_shared_exec_page(vma, page))
- goto out;
-
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
HPAGE_PMD_ORDER);
@@ -2307,7 +2271,6 @@ out_fail:
out_unlock:
unlock_page(page);
-out:
put_page(page);
return 0;
}
@@ -2316,44 +2279,38 @@ out:
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DEVICE_PRIVATE
-static int migrate_vma_collect_hole(unsigned long start,
+static int migrate_vma_collect_skip(unsigned long start,
unsigned long end,
- __always_unused int depth,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
unsigned long addr;
- /* Only allow populating anonymous memory. */
- if (!vma_is_anonymous(walk->vma)) {
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- migrate->src[migrate->npages] = 0;
- migrate->dst[migrate->npages] = 0;
- migrate->npages++;
- }
- return 0;
- }
-
for (addr = start; addr < end; addr += PAGE_SIZE) {
- migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
- migrate->npages++;
- migrate->cpages++;
+ migrate->src[migrate->npages++] = 0;
}
return 0;
}
-static int migrate_vma_collect_skip(unsigned long start,
+static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
+ __always_unused int depth,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
unsigned long addr;
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma))
+ return migrate_vma_collect_skip(start, end, walk);
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
- migrate->src[migrate->npages++] = 0;
+ migrate->npages++;
+ migrate->cpages++;
}
return 0;
@@ -2823,11 +2780,11 @@ restore:
*
* For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
* do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
- * allowing the caller to allocate device memory for those unback virtual
- * address. For this the caller simply has to allocate device memory and
+ * allowing the caller to allocate device memory for those unbacked virtual
+ * addresses. For this the caller simply has to allocate device memory and
* properly set the destination entry like for regular migration. Note that
- * this can still fails and thus inside the device driver must check if the
- * migration was successful for those entries after calling migrate_vma_pages()
+ * this can still fail, and thus inside the device driver you must check if the
+ * migration was successful for those entries after calling migrate_vma_pages(),
* just like for regular migration.
*
* After that, the callers must call migrate_vma_pages() to go over each entry
@@ -2973,6 +2930,13 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
entry = swp_entry_to_pte(swp_entry);
+ } else {
+ /*
+ * For now we only support migrating to un-addressable
+ * device memory.
+ */
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+ goto abort;
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
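
The migrate_prep() removal above pairs with the lru_cache_disable()/lru_cache_enable() calls added to the mempolicy and move_pages() paths; a sketch of the new pattern (illustrative; the helper name and the migrate_pages() arguments are assumptions):

	static int example_migrate_list(struct list_head *pages,
					new_page_t get_new_page, void *private)
	{
		int ret;

		lru_cache_disable();	/* keep pages off per-CPU LRU caches while isolating */
		ret = migrate_pages(pages, get_new_page, NULL,
				    (unsigned long)private, MIGRATE_SYNC,
				    MR_MEMORY_HOTPLUG);
		lru_cache_enable();
		return ret;
	}
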
diff --git a/mm/mlock.c b/mm/mlock.c
index f8f8cc32d03d..e338ebc4ad29 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -559,7 +559,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
vm_flags_t flags)
{
unsigned long nstart, end, tmp;
- struct vm_area_struct * vma, * prev;
+ struct vm_area_struct *vma, *prev;
int error;
VM_BUG_ON(offset_in_page(start));
@@ -737,7 +737,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
*/
static int apply_mlockall_flags(int flags)
{
- struct vm_area_struct * vma, * prev = NULL;
+ struct vm_area_struct *vma, *prev = NULL;
vm_flags_t to_add = 0;
current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
@@ -817,9 +817,10 @@ SYSCALL_DEFINE0(munlockall)
*/
static DEFINE_SPINLOCK(shmlock_user_lock);
-int user_shm_lock(size_t size, struct user_struct *user)
+int user_shm_lock(size_t size, struct ucounts *ucounts)
{
unsigned long lock_limit, locked;
+ long memlock;
int allowed = 0;
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -828,21 +829,26 @@ int user_shm_lock(size_t size, struct user_struct *user)
allowed = 1;
lock_limit >>= PAGE_SHIFT;
spin_lock(&shmlock_user_lock);
- if (!allowed &&
- locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+ memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+
+ if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+ goto out;
+ }
+ if (!get_ucounts(ucounts)) {
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
goto out;
- get_uid(user);
- user->locked_shm += locked;
+ }
allowed = 1;
out:
spin_unlock(&shmlock_user_lock);
return allowed;
}
-void user_shm_unlock(size_t size, struct user_struct *user)
+void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
spin_lock(&shmlock_user_lock);
- user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
spin_unlock(&shmlock_user_lock);
- free_uid(user);
+ put_ucounts(ucounts);
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 8e02e865cc65..9ddaf0e1b0ab 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -19,10 +19,6 @@
#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;
-#ifndef SECTIONS_SHIFT
-#define SECTIONS_SHIFT 0
-#endif
-
/* The zonelists are simply reported, validation is manual. */
void __init mminit_verify_zonelist(void)
{
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f287599a7a3..aa9de981b659 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -93,6 +93,12 @@ static void unmap_region(struct mm_struct *mm,
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
+ *
+ * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
+ * MAP_PRIVATE (with Enhanced PAN supported):
+ * r: (no) no
+ * w: (no) no
+ * x: (yes) yes
*/
pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
@@ -606,7 +612,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
unsigned long nr_pages = 0;
struct vm_area_struct *vma;
- /* Find first overlaping mapping */
+ /* Find first overlapping mapping */
vma = find_vma_intersection(mm, addr, end);
if (!vma)
return 0;
@@ -1451,9 +1457,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
- struct vm_area_struct *vma = find_vma(mm, addr);
-
- if (vma && vma->vm_start < addr + len)
+ if (find_vma_intersection(mm, addr, addr + len))
return -EEXIST;
}
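
find_vma_intersection(mm, start, end) folds the explicit overlap check into the lookup; a short
sketch of the equivalence with the open-coded form it replaces (kernel-style sketch only, assuming
the usual find_vma() semantics of returning the first VMA with vm_end > addr):

/* Sketch only: both helpers report whether any VMA overlaps [start, end). */
static inline bool range_mapped_old(struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
{
        struct vm_area_struct *vma = find_vma(mm, start);

        return vma && vma->vm_start < end;      /* overlap check by hand */
}

static inline bool range_mapped_new(struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
{
        return find_vma_intersection(mm, start, end) != NULL;
}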
@@ -1605,7 +1609,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
goto out_fput;
}
} else if (flags & MAP_HUGETLB) {
- struct user_struct *user = NULL;
+ struct ucounts *ucounts = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1621,13 +1625,13 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE,
+ &ucounts, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+ flags &= ~MAP_DENYWRITE;
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
@@ -2796,6 +2800,22 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return __split_vma(mm, vma, addr, new_below);
}
+static inline void
+unlock_range(struct vm_area_struct *start, unsigned long limit)
+{
+ struct mm_struct *mm = start->vm_mm;
+ struct vm_area_struct *tmp = start;
+
+ while (tmp && tmp->vm_start < limit) {
+ if (tmp->vm_flags & VM_LOCKED) {
+ mm->locked_vm -= vma_pages(tmp);
+ munlock_vma_pages_all(tmp);
+ }
+
+ tmp = tmp->vm_next;
+ }
+}
+
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
@@ -2822,16 +2842,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
*/
arch_unmap(mm, start, end);
- /* Find the first overlapping VMA */
- vma = find_vma(mm, start);
+ /* Find the first overlapping VMA where start < vma->vm_end */
+ vma = find_vma_intersection(mm, start, end);
if (!vma)
return 0;
prev = vma->vm_prev;
- /* we have start < vma->vm_end */
-
- /* if it doesn't overlap, we have nothing.. */
- if (vma->vm_start >= end)
- return 0;
/*
* If we need to split any vma, do it now to save pain later.
@@ -2869,7 +2884,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
if (unlikely(uf)) {
/*
* If userfaultfd_unmap_prep returns an error the vmas
- * will remain splitted, but userland will get a
+ * will remain split, but userland will get a
* highly unexpected error anyway. This is no
* different than the case where the first of the two
* __split_vma fails, but we don't undo the first
@@ -2884,17 +2899,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
/*
* unlock any mlock()ed ranges before detaching vmas
*/
- if (mm->locked_vm) {
- struct vm_area_struct *tmp = vma;
- while (tmp && tmp->vm_start < end) {
- if (tmp->vm_flags & VM_LOCKED) {
- mm->locked_vm -= vma_pages(tmp);
- munlock_vma_pages_all(tmp);
- }
-
- tmp = tmp->vm_next;
- }
- }
+ if (mm->locked_vm)
+ unlock_range(vma, end);
/* Detach vmas from rbtree */
if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
@@ -3023,25 +3029,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
- if (vma->vm_flags & VM_LOCKED) {
- struct vm_area_struct *tmp;
+ if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
- /* drop PG_Mlocked flag for over-mapped range */
- for (tmp = vma; tmp->vm_start >= start + size;
- tmp = tmp->vm_next) {
- /*
- * Split pmd and munlock page on the border
- * of the range.
- */
- vma_adjust_trans_huge(tmp, start, start + size, 0);
-
- munlock_vma_pages_range(tmp,
- max(tmp->vm_start, start),
- min(tmp->vm_end, start + size));
- }
- }
-
file = get_file(vma->vm_file);
ret = do_mmap(vma->vm_file, start, size,
prot, flags, pgoff, &populate, NULL);
@@ -3195,14 +3185,8 @@ void exit_mmap(struct mm_struct *mm)
mmap_write_unlock(mm);
}
- if (mm->locked_vm) {
- vma = mm->mmap;
- while (vma) {
- if (vma->vm_flags & VM_LOCKED)
- munlock_vma_pages_all(vma);
- vma = vma->vm_next;
- }
- }
+ if (mm->locked_vm)
+ unlock_range(mm->mmap, ULONG_MAX);
arch_exit_mmap(mm);
@@ -3403,14 +3387,10 @@ static const char *special_mapping_name(struct vm_area_struct *vma)
return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}
-static int special_mapping_mremap(struct vm_area_struct *new_vma,
- unsigned long flags)
+static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
struct vm_special_mapping *sm = new_vma->vm_private_data;
- if (flags & MREMAP_DONTUNMAP)
- return -EINVAL;
-
if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
return -EFAULT;
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index dcdde4f722a4..2ae3f33b85b1 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -11,6 +11,7 @@
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
+#include <linux/local_lock.h>
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
@@ -39,21 +40,30 @@ static int reg_refcount; /* Protected by reg_lock. */
*/
#define CONTEXT_COUNT 4
-static DEFINE_PER_CPU(char __rcu *, memcg_path_buf);
+struct memcg_path {
+ local_lock_t lock;
+ char __rcu *buf;
+ local_t buf_idx;
+};
+static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+ .buf_idx = LOCAL_INIT(0),
+};
+
static char **tmp_bufs;
-static DEFINE_PER_CPU(int, memcg_path_buf_idx);
/* Called with reg_lock held. */
static void free_memcg_path_bufs(void)
{
+ struct memcg_path *memcg_path;
int cpu;
char **old = tmp_bufs;
for_each_possible_cpu(cpu) {
- *(old++) = rcu_dereference_protected(
- per_cpu(memcg_path_buf, cpu),
+ memcg_path = per_cpu_ptr(&memcg_paths, cpu);
+ *(old++) = rcu_dereference_protected(memcg_path->buf,
lockdep_is_held(&reg_lock));
- rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL);
+ rcu_assign_pointer(memcg_path->buf, NULL);
}
/* Wait for inflight memcg_path_buf users to finish. */
@@ -88,7 +98,7 @@ int trace_mmap_lock_reg(void)
new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
if (new == NULL)
goto out_fail_free;
- rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new);
+ rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
/* Don't need to wait for inflights, they'd have gotten NULL. */
}
@@ -122,23 +132,24 @@ out:
static inline char *get_memcg_path_buf(void)
{
+ struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
char *buf;
int idx;
rcu_read_lock();
- buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf));
+ buf = rcu_dereference(memcg_path->buf);
if (buf == NULL) {
rcu_read_unlock();
return NULL;
}
- idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) -
+ idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) -
MEMCG_PATH_BUF_SIZE;
return &buf[idx];
}
static inline void put_memcg_path_buf(void)
{
- this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE);
+ local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
rcu_read_unlock();
}
@@ -179,14 +190,14 @@ out:
#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
do { \
const char *memcg_path; \
- preempt_disable(); \
+ local_lock(&memcg_paths.lock); \
memcg_path = get_mm_memcg_path(mm); \
trace_mmap_lock_##type(mm, \
memcg_path != NULL ? memcg_path : "", \
##__VA_ARGS__); \
if (likely(memcg_path != NULL)) \
put_memcg_path_buf(); \
- preempt_enable(); \
+ local_unlock(&memcg_paths.lock); \
} while (0)
#else /* !CONFIG_MEMCG */
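
The switch from preempt_disable() to a local_lock_t above keeps the per-CPU buffers usable on
PREEMPT_RT, where the local lock becomes a per-CPU sleeping lock instead of relying on disabled
preemption. A stripped-down sketch of the same pattern with a hypothetical per-CPU scratch buffer
(not the tracing code itself):

struct pcpu_scratch {
        local_lock_t lock;
        char buf[64];
};

static DEFINE_PER_CPU(struct pcpu_scratch, pcpu_scratch) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

static void use_scratch(void)
{
        struct pcpu_scratch *s;

        /* Serialises other users on this CPU; works on PREEMPT_RT too */
        local_lock(&pcpu_scratch.lock);
        s = this_cpu_ptr(&pcpu_scratch);
        snprintf(s->buf, sizeof(s->buf), "scratch on this cpu");
        local_unlock(&pcpu_scratch.lock);
}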
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 0dc7149b0c61..1b9837419bf9 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -249,16 +249,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
tlb_flush_mmu_free(tlb);
}
-/**
- * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
- * @tlb: the mmu_gather structure to initialize
- * @mm: the mm_struct of the target address space
- * @fullmm: @mm is without users and we're going to destroy the full address
- * space (exit/execve)
- *
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm.
- */
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
bool fullmm)
{
@@ -283,11 +273,30 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
inc_tlb_flush_pending(tlb->mm);
}
+/**
+ * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm.
+ */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
__tlb_gather_mmu(tlb, mm, false);
}
+/**
+ * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ *
+ * In this case, @mm is without users and we're going to destroy the
+ * full address space (exit/execve).
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm.
+ */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
__tlb_gather_mmu(tlb, mm, true);
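
A minimal usage sketch of the two initialisers documented above (assuming the one-argument
tlb_finish_mmu() of this series; the actual unmap and page-table freeing work is elided):

static void teardown_whole_mm(struct mm_struct *mm)
{
        struct mmu_gather tlb;

        tlb_gather_mmu_fullmm(&tlb, mm);        /* exit/execve: no other users */
        /* ... unmap everything, free page tables ... */
        tlb_finish_mmu(&tlb);
}

static void teardown_one_range(struct mm_struct *mm)
{
        struct mmu_gather tlb;

        tlb_gather_mmu(&tlb, mm);               /* mm may still have users */
        /* ... zap just the range of interest ... */
        tlb_finish_mmu(&tlb);
}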
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94188df1ee55..e7a443157988 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -699,7 +699,7 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
mmap_write_unlock(current->mm);
/*
- * We could provie warnings or errors if any VMA still
+ * We could provide warnings or errors if any VMA still
* has the pkey set here.
*/
return ret;
diff --git a/mm/mremap.c b/mm/mremap.c
index ec8f840399ed..a369a6100698 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -545,7 +545,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (moved_len < old_len) {
err = -ENOMEM;
} else if (vma->vm_ops && vma->vm_ops->mremap) {
- err = vma->vm_ops->mremap(new_vma, flags);
+ err = vma->vm_ops->mremap(new_vma);
}
if (unlikely(err)) {
@@ -634,10 +634,11 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
unsigned long *p)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = find_vma(mm, addr);
+ struct vm_area_struct *vma;
unsigned long pgoff;
- if (!vma || vma->vm_start > addr)
+ vma = vma_lookup(mm, addr);
+ if (!vma)
return ERR_PTR(-EFAULT);
/*
@@ -653,8 +654,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
return ERR_PTR(-EINVAL);
}
- if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
- vma->vm_flags & VM_SHARED))
+ if ((flags & MREMAP_DONTUNMAP) &&
+ (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
return ERR_PTR(-EINVAL);
if (is_vm_hugetlb_page(vma))
@@ -730,7 +731,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
* So, to avoid such a scenario we can pre-compute whether the whole
* operation has a high chance of succeeding map-wise.
* Worst-scenario case is when both vma's (new_addr and old_addr) get
- * split in 3 before unmaping it.
+ * split in 3 before unmapping it.
* That means 2 more maps (1 for each) to the ones we already hold.
* Check whether current map count plus 2 still leads us to 4 maps below
* the threshold, otherwise return -ENOMEM here to be more safe.
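
Back in vma_to_resize(), vma_lookup() is the strict form of find_vma(): it only returns a VMA that
actually contains the address. A sketch of the equivalence with the open-coded check it replaces:

/* Sketch only: both return a VMA containing addr, or NULL. */
static inline struct vm_area_struct *lookup_old(struct mm_struct *mm,
                                                unsigned long addr)
{
        struct vm_area_struct *vma = find_vma(mm, addr);

        if (vma && vma->vm_start > addr)        /* next VMA, not ours */
                vma = NULL;
        return vma;
}

static inline struct vm_area_struct *lookup_new(struct mm_struct *mm,
                                                unsigned long addr)
{
        return vma_lookup(mm, addr);
}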
diff --git a/mm/msync.c b/mm/msync.c
index 69c6d2029531..137d1c104f3e 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -55,7 +55,9 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out;
/*
* If the interval [start,end) covers some unmapped address ranges,
- * just ignore them, but return -ENOMEM at the end.
+ * just ignore them, but return -ENOMEM at the end. Besides, if the
+ * flag is MS_ASYNC (w/o MS_INVALIDATE) the result would be -ENOMEM
+ * anyway and there is nothing left to do, so return immediately.
*/
mmap_read_lock(mm);
vma = find_vma(mm, start);
@@ -69,6 +71,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out_unlock;
/* Here start < vma->vm_end. */
if (start < vma->vm_start) {
+ if (flags == MS_ASYNC)
+ goto out_unlock;
start = vma->vm_start;
if (start >= end)
goto out_unlock;
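
From userspace the early return is only a fast path: msync(MS_ASYNC) over a range whose start is
unmapped still fails with ENOMEM, it just no longer walks the remaining VMAs first. A small
runnable illustration (hypothetical three-page mapping with a hole punched at its start):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t pg = (size_t)sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        munmap(p, pg);                  /* hole at the start of the range */

        if (msync(p, 3 * pg, MS_ASYNC) < 0)
                printf("msync: %s\n", strerror(errno)); /* ENOMEM */
        return 0;
}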
diff --git a/mm/nommu.c b/mm/nommu.c
index 5c9ab799c0e6..affda71641ca 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -210,16 +210,6 @@ long vread(char *buf, char *addr, unsigned long count)
return count;
}
-long vwrite(char *buf, char *addr, unsigned long count)
-{
- /* Don't allow overflow */
- if ((unsigned long) addr + count < count)
- count = -(unsigned long) addr;
-
- memcpy(addr, buf, count);
- return count;
-}
-
/*
* vmalloc - allocate virtually contiguous memory
*
@@ -1306,7 +1296,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
goto out;
}
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+ flags &= ~MAP_DENYWRITE;
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9efaf430cfd3..eefd3f5fde46 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -74,7 +74,7 @@ static inline bool is_memcg_oom(struct oom_control *oc)
#ifdef CONFIG_NUMA
/**
- * oom_cpuset_eligible() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligibility for kill
* @start: task struct of which task to consider
* @oc: pointer to struct oom_control
*
@@ -170,7 +170,7 @@ static bool oom_unkillable_task(struct task_struct *p)
return false;
}
-/**
+/*
* Check whether the unreclaimable slab amount is greater than
* all user memory (LRU pages).
* dump_unreclaimable_slab() could help in the case that
@@ -993,7 +993,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
if (oom_group) {
mem_cgroup_print_oom_group(oom_group);
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
- (void*)message);
+ (void *)message);
mem_cgroup_put(oom_group);
}
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9e35b636a393..9f63548f247c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -32,7 +32,6 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
-#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
@@ -109,11 +108,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval);
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
/*
- * Flag that makes the machine dump writes/reads and block dirtyings.
- */
-int block_dump;
-
-/*
* Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
* a full sync is triggered after this time elapses without any disk activity.
*/
@@ -845,7 +839,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
* ^ pos_ratio
* |
* | |<===== global dirty control scope ======>|
- * 2.0 .............*
+ * 2.0 * * * * * * *
* | .*
* | . *
* | . *
@@ -1806,7 +1800,7 @@ pause:
break;
/*
- * In the case of an unresponding NFS server and the NFS dirty
+ * In the case of an unresponsive NFS server and the NFS dirty
* pages exceeds dirty_thresh, give the other good wb's a pipe
* to go through, so that tasks on them still remain responsive.
*
@@ -1869,10 +1863,9 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
- * On really big machines, get_writeback_state is expensive, so try to avoid
- * calling it too often (ratelimiting). But once we're over the dirty memory
- * limit we decrease the ratelimiting by a lot, to prevent individual processes
- * from overshooting the limit by (ratelimit_pages) each.
+ * Once we're over the dirty memory limit we decrease the ratelimiting
+ * by a lot, to prevent individual processes from overshooting the limit
+ * by (ratelimit_pages) each.
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
@@ -1945,6 +1938,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
struct dirty_throttle_control * const gdtc = &gdtc_stor;
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
+ unsigned long reclaimable;
+ unsigned long thresh;
/*
* Similar to balance_dirty_pages() but ignores pages being written
@@ -1957,8 +1952,13 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
if (gdtc->dirty > gdtc->bg_thresh)
return true;
- if (wb_stat(wb, WB_RECLAIMABLE) >
- wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
+ thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh);
+ if (thresh < 2 * wb_stat_error())
+ reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ else
+ reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+
+ if (reclaimable > thresh)
return true;
if (mdtc) {
@@ -1972,8 +1972,13 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
if (mdtc->dirty > mdtc->bg_thresh)
return true;
- if (wb_stat(wb, WB_RECLAIMABLE) >
- wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
+ thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh);
+ if (thresh < 2 * wb_stat_error())
+ reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ else
+ reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+
+ if (reclaimable > thresh)
return true;
}
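
The reason for switching to wb_stat_sum() above is that the cheap wb_stat() read may be off by up
to the per-CPU counter error, which matters once the computed threshold is of the same order. A
standalone model of that decision (STAT_ERROR stands in for wb_stat_error(); names are
illustrative, not kernel API):

#include <stdio.h>

#define STAT_ERROR 32   /* models wb_stat_error(): max per-CPU drift */

static long approx_read(long exact, long drift) { return exact - drift; }

static int over_thresh(long exact, long drift, long thresh)
{
        long reclaimable;

        /* Small thresholds would drown in counter noise, so pay for the
         * exact sum there; otherwise the cheap approximate read is fine.
         */
        if (thresh < 2 * STAT_ERROR)
                reclaimable = exact;                     /* wb_stat_sum() */
        else
                reclaimable = approx_read(exact, drift); /* wb_stat() */

        return reclaimable > thresh;
}

int main(void)
{
        /* Small threshold: the approximate read (30 - 25 = 5) would miss,
         * the exact sum catches it.
         */
        printf("%d\n", over_thresh(30, 25, 20));        /* 1 */
        /* Large threshold: the approximate read is accurate enough */
        printf("%d\n", over_thresh(500, 25, 100));      /* 1 */
        return 0;
}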
@@ -2045,8 +2050,6 @@ void laptop_sync_completion(void)
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
* if a large number of processes all perform writes at the same time.
- * If it is too low then SMP machines will call the (expensive)
- * get_writeback_state too often.
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
@@ -2216,7 +2219,7 @@ int write_cache_pages(struct address_space *mapping,
* Page truncated or invalidated. We can freely skip it
* then, even for data integrity operations: the page
* has disappeared concurrently, so there could be no
- * real expectation of this data interity operation
+ * real expectation of this data integrity operation
* even if there is now a new, dirty page at the same
* pagecache address.
*/
@@ -2409,6 +2412,7 @@ int __set_page_dirty_no_writeback(struct page *page)
return !TestSetPageDirty(page);
return 0;
}
+EXPORT_SYMBOL(__set_page_dirty_no_writeback);
/*
* Helper function for set_page_dirty family.
@@ -2417,7 +2421,8 @@ int __set_page_dirty_no_writeback(struct page *page)
*
* NOTE: This relies on being atomic wrt interrupts.
*/
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+static void account_page_dirtied(struct page *page,
+ struct address_space *mapping)
{
struct inode *inode = mapping->host;
@@ -2436,7 +2441,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
- this_cpu_inc(bdp_ratelimits);
+ __this_cpu_inc(bdp_ratelimits);
mem_cgroup_track_foreign_dirty(page, wb);
}
@@ -2459,6 +2464,30 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
}
/*
+ * Mark the page dirty, and set it dirty in the page cache, and mark the inode
+ * dirty.
+ *
+ * If warn is true, then emit a warning if the page is not uptodate and has
+ * not been truncated.
+ *
+ * The caller must hold lock_page_memcg().
+ */
+void __set_page_dirty(struct page *page, struct address_space *mapping,
+ int warn)
+{
+ unsigned long flags;
+
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ if (page->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(warn && !PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ __xa_set_mark(&mapping->i_pages, page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ }
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+}
+
+/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* the xarray.
*
@@ -2475,20 +2504,12 @@ int __set_page_dirty_nobuffers(struct page *page)
lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
- unsigned long flags;
if (!mapping) {
unlock_page_memcg(page);
return 1;
}
-
- xa_lock_irqsave(&mapping->i_pages, flags);
- BUG_ON(page_mapping(page) != mapping);
- WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping);
- __xa_set_mark(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_DIRTY);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ __set_page_dirty(page, mapping, !PagePrivate(page));
unlock_page_memcg(page);
if (mapping->host) {
@@ -2546,13 +2567,9 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
/*
* Dirty a page.
*
- * For pages with a mapping this should be done under the page lock
- * for the benefit of asynchronous memory errors who prefer a consistent
- * dirty state. This rule can be broken in some special cases,
- * but should be better not to.
- *
- * If the mapping doesn't provide a set_page_dirty a_op, then
- * just fall through and assume that it wants buffer_heads.
+ * For pages with a mapping this should be done under the page lock for the
+ * benefit of asynchronous memory errors, which prefer a consistent dirty state.
+ * This rule can be broken in some special cases, but it is better not to.
*/
int set_page_dirty(struct page *page)
{
@@ -2560,7 +2577,6 @@ int set_page_dirty(struct page *page)
page = compound_head(page);
if (likely(mapping)) {
- int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
/*
* readahead/lru_deactivate_page could remain
* PG_readahead/PG_reclaim due to race with end_page_writeback
@@ -2573,11 +2589,7 @@ int set_page_dirty(struct page *page)
*/
if (PageReclaim(page))
ClearPageReclaim(page);
-#ifdef CONFIG_BLOCK
- if (!spd)
- spd = __set_page_dirty_buffers;
-#endif
- return (*spd)(page);
+ return mapping->a_ops->set_page_dirty(page);
}
if (!PageDirty(page)) {
if (!TestSetPageDirty(page))
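
With the CONFIG_BLOCK fallback gone, set_page_dirty() now calls the mapping's handler
unconditionally, so every address_space is expected to wire one up explicitly (buffer-head
filesystems keep passing __set_page_dirty_buffers themselves). A minimal sketch for a mapping
without buffer heads, using the helper exported earlier in this patch (illustrative aops table,
not a real filesystem):

static const struct address_space_operations example_aops = {
        .set_page_dirty = __set_page_dirty_nobuffers,
        /* or __set_page_dirty_no_writeback for mappings that never write back */
};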
@@ -2722,12 +2734,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
int ret;
- memcg = lock_page_memcg(page);
- lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2755,11 +2764,11 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- dec_lruvec_state(lruvec, NR_WRITEBACK);
+ dec_lruvec_page_state(page, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
}
- __unlock_page_memcg(memcg);
+ unlock_page_memcg(page);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cfc72873961d..0817d88383d5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,7 +72,6 @@
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <linux/buffer_head.h>
-
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -108,9 +107,38 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/*
+ * Don't poison memory with KASAN (only for the tag-based modes).
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
+ * Poisoning all that memory lengthens boot time, especially on systems with
+ * large amounts of RAM. This flag is used to skip that poisoning.
+ * This is only done for the tag-based KASAN modes, as those are able to
+ * detect memory corruptions with the memory tags assigned by default.
+ * All memory allocated normally after boot gets poisoned as usual.
+ */
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
-#define MIN_PERCPU_PAGELIST_FRACTION (8)
+#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
+
+struct pagesets {
+ local_lock_t lock;
+#if defined(CONFIG_DEBUG_INFO_BTF) && \
+ !defined(CONFIG_DEBUG_LOCK_ALLOC) && \
+ !defined(CONFIG_PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT)
+ /*
+ * pahole 1.21 and earlier gets confused by zero-sized per-CPU
+ * variables and produces invalid BTF. Ensure that
+ * sizeof(struct pagesets) != 0 for older versions of pahole.
+ */
+ char __pahole_hack;
+ #warning "pahole too old to support zero-sized struct pagesets"
+#endif
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -165,12 +193,12 @@ EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
-int percpu_pagelist_fraction;
+int percpu_pagelist_high_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-DEFINE_STATIC_KEY_FALSE(init_on_alloc);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);
-DEFINE_STATIC_KEY_FALSE(init_on_free);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);
static bool _init_on_alloc_enabled_early __read_mostly
@@ -321,20 +349,7 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
- * are not on separate NUMA nodes. Functionally this works but with
- * watermark_boost_factor, it can reclaim prematurely as the ranges can be
- * quite small. By default, do not boost watermarks on discontigmem as in
- * many cases very high-order allocations like THP are likely to be
- * unsupported and the premature reclaim offsets the advantage of long-term
- * fragmentation avoidance.
- */
-int watermark_boost_factor __read_mostly;
-#else
int watermark_boost_factor __read_mostly = 15000;
-#endif
int watermark_scale_factor = 10;
static unsigned long nr_kernel_pages __initdata;
@@ -372,7 +387,7 @@ int page_group_by_mobility_disabled __read_mostly;
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
/*
- * Calling kasan_free_pages() only after deferred memory initialization
+ * Calling kasan_poison_pages() only after deferred memory initialization
* has completed. Poisoning pages during deferred memory init will greatly
* lengthen the process and cause problems in large memory systems as the
* deferred pages initialization is done with interrupts disabled.
@@ -384,10 +399,12 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages);
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
- if (!static_branch_unlikely(&deferred_pages))
- kasan_free_pages(page, order);
+ return static_branch_unlikely(&deferred_pages) ||
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
}
/* Returns true if the struct page for the pfn is uninitialised */
@@ -438,7 +455,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+{
+ return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
+}
static inline bool early_page_uninitialised(unsigned long pfn)
{
@@ -452,7 +474,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
#endif
/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct page *page,
+static inline unsigned long *get_pageblock_bitmap(const struct page *page,
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
@@ -462,7 +484,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
#endif /* CONFIG_SPARSEMEM */
}
-static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
@@ -473,7 +495,7 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
}
static __always_inline
-unsigned long __get_pfnblock_flags_mask(struct page *page,
+unsigned long __get_pfnblock_flags_mask(const struct page *page,
unsigned long pfn,
unsigned long mask)
{
@@ -498,13 +520,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page,
*
* Return: pageblock_bits flags
*/
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
- unsigned long mask)
+unsigned long get_pfnblock_flags_mask(const struct page *page,
+ unsigned long pfn, unsigned long mask)
{
return __get_pfnblock_flags_mask(page, pfn, mask);
}
-static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+static __always_inline int get_pfnblock_migratetype(const struct page *page,
+ unsigned long pfn)
{
return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
@@ -636,8 +659,7 @@ static void bad_page(struct page *page, const char *reason)
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- __dump_page(page, reason);
- dump_page_owner(page);
+ dump_page(page, reason);
print_modules();
dump_stack();
@@ -647,6 +669,57 @@ out:
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
+static inline unsigned int order_to_pindex(int migratetype, int order)
+{
+ int base = order;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ VM_BUG_ON(order != pageblock_order);
+ base = PAGE_ALLOC_COSTLY_ORDER + 1;
+ }
+#else
+ VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+ return (MIGRATE_PCPTYPES * base) + migratetype;
+}
+
+static inline int pindex_to_order(unsigned int pindex)
+{
+ int order = pindex / MIGRATE_PCPTYPES;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ order = pageblock_order;
+ VM_BUG_ON(order != pageblock_order);
+ }
+#else
+ VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+ return order;
+}
+
+static inline bool pcp_allowed_order(unsigned int order)
+{
+ if (order <= PAGE_ALLOC_COSTLY_ORDER)
+ return true;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (order == pageblock_order)
+ return true;
+#endif
+ return false;
+}
+
+static inline void free_the_page(struct page *page, unsigned int order)
+{
+ if (pcp_allowed_order(order)) /* Via pcp? */
+ free_unref_page(page, order);
+ else
+ __free_pages_ok(page, order, FPI_NONE);
+}
+
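
The pindex encoding above interleaves the migratetype with the order, with every order above the
costly limit collapsing onto a single set of THP-sized lists. A standalone worked example of the
mapping (MIGRATE_PCPTYPES, PAGE_ALLOC_COSTLY_ORDER and the pageblock order are assumed typical
values, for illustration only):

#include <stdio.h>

#define MIGRATE_PCPTYPES        3       /* assumed value for illustration */
#define PAGE_ALLOC_COSTLY_ORDER 3       /* assumed value for illustration */
#define PAGEBLOCK_ORDER         9       /* assumed THP/pageblock order */

static unsigned int order_to_pindex(int migratetype, int order)
{
        int base = order;

        /* All orders above COSTLY share one set of lists (the THP order) */
        if (order > PAGE_ALLOC_COSTLY_ORDER)
                base = PAGE_ALLOC_COSTLY_ORDER + 1;

        return (MIGRATE_PCPTYPES * base) + migratetype;
}

static int pindex_to_order(unsigned int pindex)
{
        int order = pindex / MIGRATE_PCPTYPES;

        if (order > PAGE_ALLOC_COSTLY_ORDER)
                order = PAGEBLOCK_ORDER;

        return order;
}

int main(void)
{
        printf("%u\n", order_to_pindex(1, 0));                  /* 1  */
        printf("%u\n", order_to_pindex(2, 3));                  /* 11 */
        printf("%u\n", order_to_pindex(0, PAGEBLOCK_ORDER));    /* 12 */
        printf("%d\n", pindex_to_order(12));                    /* 9  */
        return 0;
}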
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
@@ -665,7 +738,7 @@ out:
void free_compound_page(struct page *page)
{
mem_cgroup_uncharge(page);
- __free_pages_ok(page, compound_order(page), FPI_NONE);
+ free_the_page(page, compound_order(page));
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -764,32 +837,36 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
*/
void init_mem_debugging_and_hardening(void)
{
+ bool page_poisoning_requested = false;
+
+#ifdef CONFIG_PAGE_POISONING
+ /*
+ * Page poisoning is debug page alloc for some arches. If
+ * either of those options are enabled, enable poisoning.
+ */
+ if (page_poisoning_enabled() ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled())) {
+ static_branch_enable(&_page_poisoning_enabled);
+ page_poisoning_requested = true;
+ }
+#endif
+
if (_init_on_alloc_enabled_early) {
- if (page_poisoning_enabled())
+ if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_alloc\n");
else
static_branch_enable(&init_on_alloc);
}
if (_init_on_free_enabled_early) {
- if (page_poisoning_enabled())
+ if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_free\n");
else
static_branch_enable(&init_on_free);
}
-#ifdef CONFIG_PAGE_POISONING
- /*
- * Page poisoning is debug page alloc for some arches. If
- * either of those options are enabled, enable poisoning.
- */
- if (page_poisoning_enabled() ||
- (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
- debug_pagealloc_enabled()))
- static_branch_enable(&_page_poisoning_enabled);
-#endif
-
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
@@ -867,7 +944,7 @@ compaction_capture(struct capture_control *capc, struct page *page,
return false;
/*
- * Do not let lower order allocations polluate a movable pageblock.
+ * Do not let lower order allocations pollute a movable pageblock.
* This might let an unmovable request use a reclaimable pageblock
* and vice-versa but no more than normal fallback logic which can
* have trouble finding a high-order free page.
@@ -1103,7 +1180,7 @@ static inline bool page_expected_state(struct page *page,
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
- (unsigned long)page_memcg(page) |
+ page->memcg_data |
#endif
(page->flags & check_flags)))
return false;
@@ -1128,7 +1205,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
- if (unlikely(page_memcg(page)))
+ if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
@@ -1200,10 +1277,16 @@ out:
return ret;
}
-static void kernel_init_free_pages(struct page *page, int numpages)
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
{
int i;
+ if (zero_tags) {
+ for (i = 0; i < numpages; i++)
+ tag_clear_highpage(page + i);
+ return;
+ }
+
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
for (i = 0; i < numpages; i++) {
@@ -1216,9 +1299,10 @@ static void kernel_init_free_pages(struct page *page, int numpages)
}
static __always_inline bool free_pages_prepare(struct page *page,
- unsigned int order, bool check_free)
+ unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1276,16 +1360,28 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
- if (want_init_on_free())
- kernel_init_free_pages(page, 1 << order);
kernel_poison_pages(page, 1 << order);
/*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_free_pages and kernel_init_free_pages must be
+ * kept together to avoid discrepancies in behavior.
+ *
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- kasan_free_nondeferred_pages(page, order);
+ if (kasan_has_integrated_init()) {
+ if (!skip_kasan_poison)
+ kasan_free_pages(page, order);
+ } else {
+ bool init = want_init_on_free();
+
+ if (init)
+ kernel_init_free_pages(page, 1 << order, false);
+ if (!skip_kasan_poison)
+ kasan_poison_pages(page, order, init);
+ }
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1305,9 +1401,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
* to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
* moved from pcp lists to free lists.
*/
-static bool free_pcp_prepare(struct page *page)
+static bool free_pcp_prepare(struct page *page, unsigned int order)
{
- return free_pages_prepare(page, 0, true);
+ return free_pages_prepare(page, order, true, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1324,12 +1420,12 @@ static bool bulkfree_pcp_prepare(struct page *page)
* debug_pagealloc enabled, they are checked also immediately when being freed
* to the pcp lists.
*/
-static bool free_pcp_prepare(struct page *page)
+static bool free_pcp_prepare(struct page *page, unsigned int order)
{
if (debug_pagealloc_enabled_static())
- return free_pages_prepare(page, 0, true);
+ return free_pages_prepare(page, order, true, FPI_NONE);
else
- return free_pages_prepare(page, 0, false);
+ return free_pages_prepare(page, order, false, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1361,8 +1457,10 @@ static inline void prefetch_buddy(struct page *page)
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp)
{
- int migratetype = 0;
+ int pindex = 0;
int batch_free = 0;
+ int nr_freed = 0;
+ unsigned int order;
int prefetch_nr = READ_ONCE(pcp->batch);
bool isolated_pageblocks;
struct page *page, *tmp;
@@ -1373,7 +1471,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
* below while (list_empty(list)) loop.
*/
count = min(pcp->count, count);
- while (count) {
+ while (count > 0) {
struct list_head *list;
/*
@@ -1385,24 +1483,31 @@ static void free_pcppages_bulk(struct zone *zone, int count,
*/
do {
batch_free++;
- if (++migratetype == MIGRATE_PCPTYPES)
- migratetype = 0;
- list = &pcp->lists[migratetype];
+ if (++pindex == NR_PCP_LISTS)
+ pindex = 0;
+ list = &pcp->lists[pindex];
} while (list_empty(list));
/* This is the only non-empty list. Free them all. */
- if (batch_free == MIGRATE_PCPTYPES)
+ if (batch_free == NR_PCP_LISTS)
batch_free = count;
+ order = pindex_to_order(pindex);
+ BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
do {
page = list_last_entry(list, struct page, lru);
/* must delete to avoid corrupting pcp list */
list_del(&page->lru);
- pcp->count--;
+ nr_freed += 1 << order;
+ count -= 1 << order;
if (bulkfree_pcp_prepare(page))
continue;
+ /* Encode order with the migratetype */
+ page->index <<= NR_PCP_ORDER_WIDTH;
+ page->index |= order;
+
list_add_tail(&page->lru, &head);
/*
@@ -1418,9 +1523,14 @@ static void free_pcppages_bulk(struct zone *zone, int count,
prefetch_buddy(page);
prefetch_nr--;
}
- } while (--count && --batch_free && !list_empty(list));
+ } while (count > 0 && --batch_free && !list_empty(list));
}
+ pcp->count -= nr_freed;
+ /*
+ * local_lock_irq held so equivalent to spin_lock_irqsave for
+ * both PREEMPT_RT and non-PREEMPT_RT configurations.
+ */
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
@@ -1430,14 +1540,19 @@ static void free_pcppages_bulk(struct zone *zone, int count,
*/
list_for_each_entry_safe(page, tmp, &head, lru) {
int mt = get_pcppage_migratetype(page);
+
+ /* mt has been encoded with the order (see above) */
+ order = mt & NR_PCP_ORDER_MASK;
+ mt >>= NR_PCP_ORDER_WIDTH;
+
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
- __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
- trace_mm_page_pcpu_drain(page, 0, mt);
+ __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
+ trace_mm_page_pcpu_drain(page, order, mt);
}
spin_unlock(&zone->lock);
}
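
While the pages sit on the temporary list above, the order is packed into the low bits of the
migratetype kept in page->index and unpacked again under the zone lock. A standalone sketch of
that packing (NR_PCP_ORDER_WIDTH taken as 4 bits purely for illustration):

#include <stdio.h>

#define NR_PCP_ORDER_WIDTH      4       /* assumed width, enough for MAX_ORDER */
#define NR_PCP_ORDER_MASK       ((1 << NR_PCP_ORDER_WIDTH) - 1)

int main(void)
{
        unsigned long index = 2;        /* migratetype, as kept in page->index */
        unsigned int order = 3;

        /* Encode: shift the migratetype up and stash the order below it */
        index <<= NR_PCP_ORDER_WIDTH;
        index |= order;

        /* Decode under the zone lock */
        unsigned long mt = index;
        unsigned int decoded_order = mt & NR_PCP_ORDER_MASK;

        mt >>= NR_PCP_ORDER_WIDTH;
        printf("migratetype=%lu order=%u\n", mt, decoded_order);        /* 2 and 3 */
        return 0;
}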
@@ -1447,13 +1562,15 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype, fpi_t fpi_flags)
{
- spin_lock(&zone->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
- spin_unlock(&zone->lock);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1536,16 +1653,22 @@ static void __free_pages_ok(struct page *page, unsigned int order,
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
+ struct zone *zone = page_zone(page);
- if (!free_pages_prepare(page, order, true))
+ if (!free_pages_prepare(page, order, true, fpi_flags))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- local_irq_save(flags);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(has_isolate_pageblock(zone) ||
+ is_migrate_isolate(migratetype))) {
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ }
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, pfn, order, migratetype,
- fpi_flags);
- local_irq_restore(flags);
}
void __free_pages_core(struct page *page, unsigned int order)
@@ -1574,10 +1697,10 @@ void __free_pages_core(struct page *page, unsigned int order)
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
*/
- __free_pages_ok(page, order, FPI_TO_TAIL);
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
}
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
/*
* During memory init memblocks map pfns to nids. The search is expensive and
@@ -1627,7 +1750,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
return nid;
}
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
+#endif /* CONFIG_NUMA */
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
@@ -2115,14 +2238,6 @@ void __init page_alloc_init_late(void)
wait_for_completion(&pgdat_init_all_done_comp);
/*
- * The number of managed pages has changed due to the initialisation
- * so the pcpu batch and high limits needs to be updated or the limits
- * will be artificially small.
- */
- for_each_populated_zone(zone)
- zone_pcp_update(zone);
-
- /*
* We initialized the rest of the deferred pages. Permanently disable
* on-demand struct page initialization.
*/
@@ -2297,12 +2412,31 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
- kasan_alloc_pages(page, order);
+
+ /*
+ * Page unpoisoning must happen before memory initialization.
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
+ * allocations and the page unpoisoning code will complain.
+ */
kernel_unpoison_pages(page, 1 << order);
- set_page_owner(page, order, gfp_flags);
- if (!want_init_on_free() && want_init_on_alloc(gfp_flags))
- kernel_init_free_pages(page, 1 << order);
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_alloc_pages and kernel_init_free_pages must be
+ * kept together to avoid discrepancies in behavior.
+ */
+ if (kasan_has_integrated_init()) {
+ kasan_alloc_pages(page, order, gfp_flags);
+ } else {
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+
+ kasan_unpoison_pages(page, order, init);
+ if (init)
+ kernel_init_free_pages(page, 1 << order,
+ gfp_flags & __GFP_ZEROTAGS);
+ }
+
+ set_page_owner(page, order, gfp_flags);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@@ -2386,19 +2520,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
* boundary. If alignment is required, use move_freepages_block()
*/
static int move_freepages(struct zone *zone,
- struct page *start_page, struct page *end_page,
+ unsigned long start_pfn, unsigned long end_pfn,
int migratetype, int *num_movable)
{
struct page *page;
+ unsigned long pfn;
unsigned int order;
int pages_moved = 0;
- for (page = start_page; page <= end_page;) {
- if (!pfn_valid_within(page_to_pfn(page))) {
- page++;
+ for (pfn = start_pfn; pfn <= end_pfn;) {
+ if (!pfn_valid_within(pfn)) {
+ pfn++;
continue;
}
+ page = pfn_to_page(pfn);
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
@@ -2408,8 +2544,7 @@ static int move_freepages(struct zone *zone,
if (num_movable &&
(PageLRU(page) || __PageMovable(page)))
(*num_movable)++;
-
- page++;
+ pfn++;
continue;
}
@@ -2419,7 +2554,7 @@ static int move_freepages(struct zone *zone,
order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
- page += 1 << order;
+ pfn += 1 << order;
pages_moved += 1 << order;
}
@@ -2429,25 +2564,22 @@ static int move_freepages(struct zone *zone,
int move_freepages_block(struct zone *zone, struct page *page,
int migratetype, int *num_movable)
{
- unsigned long start_pfn, end_pfn;
- struct page *start_page, *end_page;
+ unsigned long start_pfn, end_pfn, pfn;
if (num_movable)
*num_movable = 0;
- start_pfn = page_to_pfn(page);
- start_pfn = start_pfn & ~(pageblock_nr_pages-1);
- start_page = pfn_to_page(start_pfn);
- end_page = start_page + pageblock_nr_pages - 1;
+ pfn = page_to_pfn(page);
+ start_pfn = pfn & ~(pageblock_nr_pages - 1);
end_pfn = start_pfn + pageblock_nr_pages - 1;
/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
- start_page = page;
+ start_pfn = pfn;
if (!zone_spans_pfn(zone, end_pfn))
return 0;
- return move_freepages(zone, start_page, end_page, migratetype,
+ return move_freepages(zone, start_pfn, end_pfn, migratetype,
num_movable);
}
@@ -2731,7 +2863,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
/*
* In page freeing path, migratetype change is racy so
* we can counter several free pages in a pageblock
- * in this loop althoug we changed the pageblock type
+ * in this loop although we changed the pageblock type
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
@@ -2908,8 +3040,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
- int i, alloced = 0;
+ int i, allocated = 0;
+ /*
+ * local_lock_irq held so equivalent to spin_lock_irqsave for
+ * both PREEMPT_RT and non-PREEMPT_RT configurations.
+ */
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
@@ -2931,7 +3067,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* pages are ordered properly.
*/
list_add_tail(&page->lru, list);
- alloced++;
+ allocated++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
@@ -2940,12 +3076,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
- * on i. Do not confuse with 'alloced' which is the number of
+ * on i. Do not confuse with 'allocated' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
- return alloced;
+ return allocated;
}
#ifdef CONFIG_NUMA
@@ -2962,12 +3098,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
unsigned long flags;
int to_drain, batch;
- local_irq_save(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
#endif
@@ -2981,16 +3117,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
unsigned long flags;
- struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
- local_irq_save(flags);
- pset = per_cpu_ptr(zone->pageset, cpu);
+ local_lock_irqsave(&pagesets.lock, flags);
- pcp = &pset->pcp;
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
- local_irq_restore(flags);
+
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
/*
@@ -3035,7 +3170,7 @@ static void drain_local_pages_wq(struct work_struct *work)
* drain_all_pages doesn't use proper cpu hotplug protection so
* we can race with cpu offline when the WQ can move this from
* a cpu pinned worker to an unbound one. We can operate on a different
- * cpu which is allright but we also have to make sure to not move to
+ * cpu which is alright but we also have to make sure to not move to
* a different one.
*/
preempt_disable();
@@ -3088,7 +3223,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
* disables preemption as part of its processing
*/
for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pcp;
+ struct per_cpu_pages *pcp;
struct zone *z;
bool has_pcps = false;
@@ -3099,13 +3234,13 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
*/
has_pcps = true;
} else if (zone) {
- pcp = per_cpu_ptr(zone->pageset, cpu);
- if (pcp->pcp.count)
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ if (pcp->count)
has_pcps = true;
} else {
for_each_populated_zone(z) {
- pcp = per_cpu_ptr(z->pageset, cpu);
- if (pcp->pcp.count) {
+ pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
+ if (pcp->count) {
has_pcps = true;
break;
}
@@ -3198,11 +3333,12 @@ void mark_free_pages(struct zone *zone)
}
#endif /* CONFIG_PM */
-static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
+static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
+ unsigned int order)
{
int migratetype;
- if (!free_pcp_prepare(page))
+ if (!free_pcp_prepare(page, order))
return false;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -3210,52 +3346,99 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
return true;
}
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
+{
+ int min_nr_free, max_nr_free;
+
+ /* Check for PCP disabled or boot pageset */
+ if (unlikely(high < batch))
+ return 1;
+
+ /* Leave at least pcp->batch pages on the list */
+ min_nr_free = batch;
+ max_nr_free = high - batch;
+
+ /*
+ * Double the number of pages freed each time there is subsequent
+ * freeing of pages without any allocation.
+ */
+ batch <<= pcp->free_factor;
+ if (batch < max_nr_free)
+ pcp->free_factor++;
+ batch = clamp(batch, min_nr_free, max_nr_free);
+
+ return batch;
+}
+
+static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
+{
+ int high = READ_ONCE(pcp->high);
+
+ if (unlikely(!high))
+ return 0;
+
+ if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
+ return high;
+
+ /*
+ * If reclaim is active, limit the number of pages that can be
+ * stored on pcp lists
+ */
+ return min(READ_ONCE(pcp->batch) << 2, high);
+}
+
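
nr_pcp_free() doubles the amount drained on every consecutive free without an intervening
allocation via pcp->free_factor (rmqueue_pcplist() halves it again further down). A standalone
model of how the drained batch ramps up under a pure-free workload (high/batch values are
illustrative, not tuned kernel defaults):

#include <stdio.h>

static int clampl(int v, int lo, int hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        int high = 512, batch = 64;     /* illustrative pcp->high / pcp->batch */
        int free_factor = 0;

        for (int i = 0; i < 5; i++) {
                int min_nr_free = batch;
                int max_nr_free = high - batch;
                int nr = batch << free_factor;

                if (nr < max_nr_free)
                        free_factor++;          /* ramp up next time */
                nr = clampl(nr, min_nr_free, max_nr_free);
                printf("drain %d pages\n", nr); /* 64, 128, 256, 448, 448 */
        }
        return 0;
}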
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
+ int migratetype, unsigned int order)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
- int migratetype;
+ int high;
+ int pindex;
- migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
+ pcp = this_cpu_ptr(zone->per_cpu_pageset);
+ pindex = order_to_pindex(migratetype, order);
+ list_add(&page->lru, &pcp->lists[pindex]);
+ pcp->count += 1 << order;
+ high = nr_pcp_high(pcp, zone);
+ if (pcp->count >= high) {
+ int batch = READ_ONCE(pcp->batch);
+
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp);
+ }
+}
+
+/*
+ * Free a pcp page
+ */
+void free_unref_page(struct page *page, unsigned int order)
+{
+ unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
+ int migratetype;
+
+ if (!free_unref_page_prepare(page, pfn, order))
+ return;
/*
* We only track unmovable, reclaimable and movable on pcp lists.
- * Free ISOLATE pages back to the allocator because they are being
+ * Place ISOLATE pages on the isolated list because they are being
* offlined but treat HIGHATOMIC as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
- if (migratetype >= MIGRATE_PCPTYPES) {
+ migratetype = get_pcppage_migratetype(page);
+ if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, 0, migratetype,
- FPI_NONE);
+ free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
return;
}
migratetype = MIGRATE_MOVABLE;
}
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list_add(&page->lru, &pcp->lists[migratetype]);
- pcp->count++;
- if (pcp->count >= READ_ONCE(pcp->high))
- free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp);
-}
-
-/*
- * Free a 0-order page
- */
-void free_unref_page(struct page *page)
-{
- unsigned long flags;
- unsigned long pfn = page_to_pfn(page);
-
- if (!free_unref_page_prepare(page, pfn))
- return;
-
- local_irq_save(flags);
- free_unref_page_commit(page, pfn);
- local_irq_restore(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
+ free_unref_page_commit(page, pfn, migratetype, order);
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
/*
@@ -3266,34 +3449,56 @@ void free_unref_page_list(struct list_head *list)
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
+ int migratetype;
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
pfn = page_to_pfn(page);
- if (!free_unref_page_prepare(page, pfn))
+ if (!free_unref_page_prepare(page, pfn, 0))
list_del(&page->lru);
+
+ /*
+ * Free isolated pages directly to the allocator, see
+ * comment in free_unref_page.
+ */
+ migratetype = get_pcppage_migratetype(page);
+ if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
+ if (unlikely(is_migrate_isolate(migratetype))) {
+ list_del(&page->lru);
+ free_one_page(page_zone(page), page, pfn, 0,
+ migratetype, FPI_NONE);
+ continue;
+ }
+
+ /*
+ * Non-isolated types over MIGRATE_PCPTYPES get added
+ * to the MIGRATE_MOVABLE pcp list.
+ */
+ set_pcppage_migratetype(page, MIGRATE_MOVABLE);
+ }
+
set_page_private(page, pfn);
}
- local_irq_save(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
- unsigned long pfn = page_private(page);
-
+ pfn = page_private(page);
set_page_private(page, 0);
+ migratetype = get_pcppage_migratetype(page);
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, pfn);
+ free_unref_page_commit(page, pfn, migratetype, 0);
/*
* Guard against excessive IRQ disabled times when we get
* a large list of pages to free.
*/
if (++batch_count == SWAP_CLUSTER_MAX) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
batch_count = 0;
- local_irq_save(flags);
+ local_lock_irqsave(&pagesets.lock, flags);
}
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&pagesets.lock, flags);
}
/*
@@ -3392,7 +3597,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
*
* Must be called with interrupts disabled.
*/
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+ long nr_account)
{
#ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
@@ -3405,17 +3611,19 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
local_stat = NUMA_OTHER;
if (zone_to_nid(z) == zone_to_nid(preferred_zone))
- __inc_numa_state(z, NUMA_HIT);
+ __count_numa_events(z, NUMA_HIT, nr_account);
else {
- __inc_numa_state(z, NUMA_MISS);
- __inc_numa_state(preferred_zone, NUMA_FOREIGN);
+ __count_numa_events(z, NUMA_MISS, nr_account);
+ __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
}
- __inc_numa_state(z, local_stat);
+ __count_numa_events(z, local_stat, nr_account);
#endif
}
/* Remove page from the per-cpu list, caller must protect the list */
-static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+static inline
+struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
+ int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
@@ -3424,16 +3632,30 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
do {
if (list_empty(list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- READ_ONCE(pcp->batch), list,
+ int batch = READ_ONCE(pcp->batch);
+ int alloced;
+
+ /*
+ * Scale batch relative to order if batch implies
+ * free pages can be stored on the PCP. Batch can
+ * be 1 for small zones or for boot pagesets which
+ * should never store free pages as the pages may
+ * belong to arbitrary zones.
+ */
+ if (batch > 1)
+ batch = max(batch >> order, 2);
+ alloced = rmqueue_bulk(zone, order,
+ batch, list,
migratetype, alloc_flags);
+
+ pcp->count += alloced << order;
if (unlikely(list_empty(list)))
return NULL;
}
page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
- pcp->count--;
+ pcp->count -= 1 << order;
} while (check_new_pcp(page));
return page;
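When the pcp list is empty above, the refill batch is scaled down by the requested order so that roughly the same number of base pages is pulled from the buddy lists regardless of order, while a batch of 1 (boot pagesets and very small zones) is left untouched. A self-contained sketch of just that calculation; the sample values are arbitrary:

#include <stdio.h>

static int scale_batch(int batch, unsigned int order)
{
	/*
	 * A batch of 1 means the pcp must not hold free pages at all
	 * (boot pagesets, very small zones), so do not scale it.
	 */
	if (batch > 1) {
		batch >>= order;
		if (batch < 2)
			batch = 2;
	}
	return batch;
}

int main(void)
{
	printf("order 0: %d\n", scale_batch(63, 0));	/* 63 */
	printf("order 3: %d\n", scale_batch(63, 3));	/* 7 */
	printf("order 9: %d\n", scale_batch(63, 9));	/* clamped to 2 */
	printf("boot:    %d\n", scale_batch(1, 3));	/* stays 1 */
	return 0;
}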
@@ -3441,23 +3663,31 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
- struct zone *zone, gfp_t gfp_flags,
- int migratetype, unsigned int alloc_flags)
+ struct zone *zone, unsigned int order,
+ gfp_t gfp_flags, int migratetype,
+ unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
unsigned long flags;
- local_irq_save(flags);
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list = &pcp->lists[migratetype];
- page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
+ local_lock_irqsave(&pagesets.lock, flags);
+
+ /*
+ * On allocation, reduce the number of pages that are batch freed.
+ * See nr_pcp_free() where free_factor is increased for subsequent
+ * frees.
+ */
+ pcp = this_cpu_ptr(zone->per_cpu_pageset);
+ pcp->free_factor >>= 1;
+ list = &pcp->lists[order_to_pindex(migratetype, order)];
+ page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
+ local_unlock_irqrestore(&pagesets.lock, flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
- zone_statistics(preferred_zone, zone);
+ zone_statistics(preferred_zone, zone, 1);
}
- local_irq_restore(flags);
return page;
}
@@ -3473,15 +3703,15 @@ struct page *rmqueue(struct zone *preferred_zone,
unsigned long flags;
struct page *page;
- if (likely(order == 0)) {
+ if (likely(pcp_allowed_order(order))) {
/*
* MIGRATE_MOVABLE pcplist could have the pages on CMA area and
* we need to skip it when CMA area isn't allowed.
*/
if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
migratetype != MIGRATE_MOVABLE) {
- page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
- migratetype, alloc_flags);
+ page = rmqueue_pcplist(preferred_zone, zone, order,
+ gfp_flags, migratetype, alloc_flags);
goto out;
}
}
@@ -3509,15 +3739,15 @@ struct page *rmqueue(struct zone *preferred_zone,
if (!page)
page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
- spin_unlock(&zone->lock);
if (!page)
goto failed;
+
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
+ spin_unlock_irqrestore(&zone->lock, flags);
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
- zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);
+ zone_statistics(preferred_zone, zone, 1);
out:
/* Separate test+clear to avoid unnecessary atomics */
@@ -3530,7 +3760,7 @@ out:
return page;
failed:
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
}
@@ -3813,16 +4043,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
return alloc_flags;
}
-static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
- unsigned int alloc_flags)
+/* Must be called after current_gfp_context() which can change gfp_mask */
+static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
+ unsigned int alloc_flags)
{
#ifdef CONFIG_CMA
- unsigned int pflags = current->flags;
-
- if (!(pflags & PF_MEMALLOC_NOCMA) &&
- gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
-
#endif
return alloc_flags;
}
@@ -3922,7 +4149,7 @@ retry:
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (node_reclaim_mode == 0 ||
+ if (!node_reclaim_enabled() ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
@@ -4130,7 +4357,7 @@ out:
}
/*
- * Maximum number of compaction retries wit a progress before OOM
+ * Maximum number of compaction retries with a progress before OOM
* killer is consider as the only way to move forward.
*/
#define MAX_COMPACT_RETRIES 16
@@ -4158,6 +4385,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
+ if (*compact_result == COMPACT_SKIPPED)
+ return NULL;
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
* count a compaction stall
@@ -4207,6 +4436,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
if (!order)
return false;
+ if (fatal_signal_pending(current))
+ return false;
+
if (compaction_made_progress(compact_result))
(*compaction_retries)++;
@@ -4478,7 +4710,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+ alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
return alloc_flags;
}
@@ -4780,7 +5012,7 @@ retry:
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
- alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
+ alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags);
/*
* Reset the nodemask and zonelist iterators if memory policies can be
@@ -4921,7 +5153,7 @@ got_pg:
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
- struct alloc_context *ac, gfp_t *alloc_mask,
+ struct alloc_context *ac, gfp_t *alloc_gfp,
unsigned int *alloc_flags)
{
ac->highest_zoneidx = gfp_zone(gfp_mask);
@@ -4930,7 +5162,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
ac->migratetype = gfp_migratetype(gfp_mask);
if (cpusets_enabled()) {
- *alloc_mask |= __GFP_HARDWALL;
+ *alloc_gfp |= __GFP_HARDWALL;
/*
* When we are in the interrupt context, it is irrelevant
* to the current task context. It means that any node ok.
@@ -4949,7 +5181,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return false;
- *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
+ *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -4966,15 +5198,159 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/*
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
+ * @gfp: GFP flags for the allocation
+ * @preferred_nid: The preferred NUMA node ID to allocate from
+ * @nodemask: Set of nodes to allocate from, may be NULL
+ * @nr_pages: The number of pages desired on the list or array
+ * @page_list: Optional list to store the allocated pages
+ * @page_array: Optional array to store the pages
+ *
+ * This is a batched version of the page allocator that attempts to
+ * allocate nr_pages quickly. Pages are added to page_list if page_list
+ * is not NULL, otherwise it is assumed that the page_array is valid.
+ *
+ * For lists, nr_pages is the number of pages that should be allocated.
+ *
+ * For arrays, only NULL elements are populated with pages and nr_pages
+ * is the maximum number of pages that will be stored in the array.
+ *
+ * Returns the number of pages on the list or array.
+ */
+unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+ nodemask_t *nodemask, int nr_pages,
+ struct list_head *page_list,
+ struct page **page_array)
+{
+ struct page *page;
+ unsigned long flags;
+ struct zone *zone;
+ struct zoneref *z;
+ struct per_cpu_pages *pcp;
+ struct list_head *pcp_list;
+ struct alloc_context ac;
+ gfp_t alloc_gfp;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
+ int nr_populated = 0, nr_account = 0;
+
+ if (unlikely(nr_pages <= 0))
+ return 0;
+
+ /*
+ * Skip populated array elements to determine if any pages need
+ * to be allocated before disabling IRQs.
+ */
+ while (page_array && nr_populated < nr_pages && page_array[nr_populated])
+ nr_populated++;
+
+ /* Already populated array? */
+ if (unlikely(page_array && nr_pages - nr_populated == 0))
+ return nr_populated;
+
+ /* Use the single page allocator for one page. */
+ if (nr_pages - nr_populated == 1)
+ goto failed;
+
+ /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
+ gfp &= gfp_allowed_mask;
+ alloc_gfp = gfp;
+ if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
+ return 0;
+ gfp = alloc_gfp;
+
+ /* Find an allowed local zone that meets the low watermark. */
+ for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+ unsigned long mark;
+
+ if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
+ !__cpuset_zone_allowed(zone, gfp)) {
+ continue;
+ }
+
+ if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
+ zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
+ goto failed;
+ }
+
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
+ if (zone_watermark_fast(zone, 0, mark,
+ zonelist_zone_idx(ac.preferred_zoneref),
+ alloc_flags, gfp)) {
+ break;
+ }
+ }
+
+ /*
+	 * If there are no allowed local zones that meet the watermarks then
+ * try to allocate a single page and reclaim if necessary.
+ */
+ if (unlikely(!zone))
+ goto failed;
+
+ /* Attempt the batch allocation */
+ local_lock_irqsave(&pagesets.lock, flags);
+ pcp = this_cpu_ptr(zone->per_cpu_pageset);
+ pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
+
+ while (nr_populated < nr_pages) {
+
+ /* Skip existing pages */
+ if (page_array && page_array[nr_populated]) {
+ nr_populated++;
+ continue;
+ }
+
+ page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
+ pcp, pcp_list);
+ if (unlikely(!page)) {
+ /* Try and get at least one page */
+ if (!nr_populated)
+ goto failed_irq;
+ break;
+ }
+ nr_account++;
+
+ prep_new_page(page, 0, gfp, 0);
+ if (page_list)
+ list_add(&page->lru, page_list);
+ else
+ page_array[nr_populated] = page;
+ nr_populated++;
+ }
+
+ local_unlock_irqrestore(&pagesets.lock, flags);
+
+ __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
+ zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
+
+ return nr_populated;
+
+failed_irq:
+ local_unlock_irqrestore(&pagesets.lock, flags);
+
+failed:
+ page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
+ if (page) {
+ if (page_list)
+ list_add(&page->lru, page_list);
+ else
+ page_array[nr_populated] = page;
+ nr_populated++;
+ }
+
+ return nr_populated;
+}
+EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
+
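For the array form of __alloc_pages_bulk(), only NULL slots are filled and the return value counts every populated slot, so a caller can retry with the same array after a partial allocation. A userspace sketch of that contract, with fill_array() as a made-up stand-in and malloc() standing in for the page allocator:

#include <stdio.h>
#include <stdlib.h>

/*
 * Fill NULL slots in page_array up to nr_pages entries; return how many
 * slots are populated afterwards (pre-existing entries included).
 */
static int fill_array(void **page_array, int nr_pages)
{
	int nr_populated = 0;

	/* Skip slots that are already populated. */
	while (nr_populated < nr_pages && page_array[nr_populated])
		nr_populated++;

	while (nr_populated < nr_pages) {
		if (page_array[nr_populated]) {	/* skip existing entries */
			nr_populated++;
			continue;
		}
		void *page = malloc(4096);	/* one "page" */
		if (!page)
			break;			/* partial success is fine */
		page_array[nr_populated++] = page;
	}
	return nr_populated;
}

int main(void)
{
	void *pages[8] = { NULL };
	int got = fill_array(pages, 8);

	printf("first call populated %d slots\n", got);
	/* A retry with the same array only fills what is still missing. */
	got = fill_array(pages, 8);
	printf("after retry: %d slots\n", got);
	return 0;
}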
+/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
+struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
- gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+ gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
/*
@@ -4982,33 +5358,36 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
* so bail out early if the request is out of bound.
*/
if (unlikely(order >= MAX_ORDER)) {
- WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
+ WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
return NULL;
}
- gfp_mask &= gfp_allowed_mask;
- alloc_mask = gfp_mask;
- if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
+ gfp &= gfp_allowed_mask;
+ /*
+ * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+ * resp. GFP_NOIO which has to be inherited for all allocation requests
+ * from a particular context which has been marked by
+ * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
+ * movable zones are not used during allocation.
+ */
+ gfp = current_gfp_context(gfp);
+ alloc_gfp = gfp;
+ if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
+ &alloc_gfp, &alloc_flags))
return NULL;
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
- alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
/* First allocation attempt */
- page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
if (likely(page))
goto out;
- /*
- * Apply scoped allocation constraints. This is mainly about GFP_NOFS
- * resp. GFP_NOIO which has to be inherited for all allocation requests
- * from a particular context which has been marked by
- * memalloc_no{fs,io}_{save,restore}.
- */
- alloc_mask = current_gfp_context(gfp_mask);
+ alloc_gfp = gfp;
ac.spread_dirty_pages = false;
/*
@@ -5017,20 +5396,20 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
*/
ac.nodemask = nodemask;
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+ page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
out:
- if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
+ if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page &&
+ unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
- trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
return page;
}
-EXPORT_SYMBOL(__alloc_pages_nodemask);
+EXPORT_SYMBOL(__alloc_pages);
/*
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
@@ -5054,14 +5433,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page);
-static inline void free_the_page(struct page *page, unsigned int order)
-{
- if (order == 0) /* Via pcp? */
- free_unref_page(page);
- else
- __free_pages_ok(page, order, FPI_NONE);
-}
-
/**
* __free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
@@ -5520,7 +5891,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
continue;
for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
}
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
@@ -5612,7 +5983,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
free_pcp = 0;
for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
show_node(zone);
printk(KERN_CONT
@@ -5653,7 +6024,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
- K(this_cpu_read(zone->pageset->pcp.count)),
+ K(this_cpu_read(zone->per_cpu_pageset->count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
@@ -5736,7 +6107,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
static int __parse_numa_zonelist_order(char *s)
{
/*
- * We used to support different zonlists modes but they turned
+ * We used to support different zonelists modes but they turned
* out to be just not useful. Let's keep the warning in place
* if somebody still use the cmd line parameter so that we do
* not fail it silently
@@ -5980,11 +6351,12 @@ static void build_zonelists(pg_data_t *pgdat)
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
-static void pageset_init(struct per_cpu_pageset *p);
+static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
/* These effectively disable the pcplists in the boot pageset completely */
#define BOOT_PAGESET_HIGH 0
#define BOOT_PAGESET_BATCH 1
-static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void __build_all_zonelists(void *data)
@@ -6051,7 +6423,7 @@ build_all_zonelists_init(void)
* (a chicken-egg dilemma).
*/
for_each_possible_cpu(cpu)
- pageset_init(&per_cpu(boot_pageset, cpu));
+ per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
@@ -6203,7 +6575,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
return;
/*
- * The call to memmap_init_zone should have already taken care
+ * The call to memmap_init should have already taken care
* of the pages reserved for the memmap, so we can just jump to
* the end of that region and start processing the device pages.
*/
@@ -6264,11 +6636,11 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
-#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
+#if !defined(CONFIG_FLATMEM)
/*
* Only struct pages that correspond to ranges defined by memblock.memory
* are zeroed and initialized by going through __init_single_page() during
- * memmap_init_zone().
+ * memmap_init_zone_range().
*
* But, there could be struct pages that correspond to holes in
* memblock.memory. This can happen because of the following reasons:
@@ -6287,9 +6659,9 @@ static void __meminit zone_init_free_lists(struct zone *zone)
* zone/node above the hole except for the trailing pages in the last
* section that will be appended to the zone/node below.
*/
-static u64 __meminit init_unavailable_range(unsigned long spfn,
- unsigned long epfn,
- int zone, int node)
+static void __init init_unavailable_range(unsigned long spfn,
+ unsigned long epfn,
+ int zone, int node)
{
unsigned long pfn;
u64 pgcnt = 0;
@@ -6305,56 +6677,77 @@ static u64 __meminit init_unavailable_range(unsigned long spfn,
pgcnt++;
}
- return pgcnt;
+ if (pgcnt)
+ pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
+ node, zone_names[zone], pgcnt);
}
#else
-static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn,
- int zone, int node)
+static inline void init_unavailable_range(unsigned long spfn,
+ unsigned long epfn,
+ int zone, int node)
{
- return 0;
}
#endif
-void __meminit __weak memmap_init_zone(struct zone *zone)
+static void __init memmap_init_zone_range(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long *hole_pfn)
{
unsigned long zone_start_pfn = zone->zone_start_pfn;
unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
- int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
- static unsigned long hole_pfn;
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+
+ if (start_pfn >= end_pfn)
+ return;
+
+ memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+
+ if (*hole_pfn < start_pfn)
+ init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
+
+ *hole_pfn = end_pfn;
+}
+
+static void __init memmap_init(void)
+{
unsigned long start_pfn, end_pfn;
- u64 pgcnt = 0;
+ unsigned long hole_pfn = 0;
+ int i, j, zone_id, nid;
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
- end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ struct pglist_data *node = NODE_DATA(nid);
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = node->node_zones + j;
- if (end_pfn > start_pfn)
- memmap_init_range(end_pfn - start_pfn, nid,
- zone_id, start_pfn, zone_end_pfn,
- MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+ if (!populated_zone(zone))
+ continue;
- if (hole_pfn < start_pfn)
- pgcnt += init_unavailable_range(hole_pfn, start_pfn,
- zone_id, nid);
- hole_pfn = end_pfn;
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
+ &hole_pfn);
+ zone_id = j;
+ }
}
#ifdef CONFIG_SPARSEMEM
/*
- * Initialize the hole in the range [zone_end_pfn, section_end].
- * If zone boundary falls in the middle of a section, this hole
- * will be re-initialized during the call to this function for the
- * higher zone.
+	 * Initialize the memory map for the hole in the range [memory_end,
+ * section_end].
+ * Append the pages in this hole to the highest zone in the last
+ * node.
+ * The call to init_unavailable_range() is outside the ifdef to
+	 * silence the compiler warning about zone_id set but not used;
+ * for FLATMEM it is a nop anyway
*/
- end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION);
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
if (hole_pfn < end_pfn)
- pgcnt += init_unavailable_range(hole_pfn, end_pfn,
- zone_id, nid);
#endif
-
- if (pgcnt)
- pr_info(" %s zone: %llu pages in unavailable ranges\n",
- zone->name, pgcnt);
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
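memmap_init_zone_range() clamps each memblock range to the zone being initialised and uses *hole_pfn to remember where the previous range ended, so the gap in between can be handed to init_unavailable_range(). A standalone sketch of that clamp-and-track logic; the pfn values and the init_zone_range() helper are invented for the example:

#include <stdio.h>

struct range { unsigned long start, end; };	/* [start, end) in pfns */

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/*
 * Intersect a memory range with a zone and report the hole that precedes
 * it, mirroring memmap_init_zone_range()/init_unavailable_range() above.
 */
static void init_zone_range(struct range zone, struct range mem,
			    unsigned long *hole_pfn)
{
	unsigned long start = clamp_ul(mem.start, zone.start, zone.end);
	unsigned long end   = clamp_ul(mem.end,   zone.start, zone.end);

	if (start >= end)
		return;			/* range does not touch this zone */

	printf("init pfns [%lu, %lu)\n", start, end);
	if (*hole_pfn < start)
		printf("hole pfns [%lu, %lu)\n", *hole_pfn, start);
	*hole_pfn = end;
}

int main(void)
{
	struct range zone = { 0, 1024 };
	unsigned long hole = 0;

	init_zone_range(zone, (struct range){ 16, 256 }, &hole);
	init_zone_range(zone, (struct range){ 512, 2048 }, &hole);	/* clamped at 1024 */
	return 0;
}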
static int zone_batchsize(struct zone *zone)
@@ -6363,13 +6756,12 @@ static int zone_batchsize(struct zone *zone)
int batch;
/*
- * The per-cpu-pages pools are set to around 1000th of the
- * size of the zone.
+ * The number of pages to batch allocate is either ~0.1%
+ * of the zone or 1MB, whichever is smaller. The batch
+ * size is striking a balance between allocation latency
+ * and zone lock contention.
*/
- batch = zone_managed_pages(zone) / 1024;
- /* But no more than a meg. */
- if (batch * PAGE_SIZE > 1024 * 1024)
- batch = (1024 * 1024) / PAGE_SIZE;
+ batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE);
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
@@ -6406,6 +6798,54 @@ static int zone_batchsize(struct zone *zone)
#endif
}
+static int zone_highsize(struct zone *zone, int batch, int cpu_online)
+{
+#ifdef CONFIG_MMU
+ int high;
+ int nr_split_cpus;
+ unsigned long total_pages;
+
+ if (!percpu_pagelist_high_fraction) {
+ /*
+ * By default, the high value of the pcp is based on the zone
+ * low watermark so that if they are full then background
+ * reclaim will not be started prematurely.
+ */
+ total_pages = low_wmark_pages(zone);
+ } else {
+ /*
+ * If percpu_pagelist_high_fraction is configured, the high
+ * value is based on a fraction of the managed pages in the
+ * zone.
+ */
+ total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
+ }
+
+ /*
+ * Split the high value across all online CPUs local to the zone. Note
+	 * that early in boot CPUs may not be online yet and that during
+	 * CPU hotplug the cpumask is not yet updated when a CPU is being
+ * onlined. For memory nodes that have no CPUs, split pcp->high across
+ * all online CPUs to mitigate the risk that reclaim is triggered
+ * prematurely due to pages stored on pcp lists.
+ */
+ nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
+ if (!nr_split_cpus)
+ nr_split_cpus = num_online_cpus();
+ high = total_pages / nr_split_cpus;
+
+ /*
+ * Ensure high is at least batch*4. The multiple is based on the
+ * historical relationship between high and batch.
+ */
+ high = max(high, batch << 2);
+
+ return high;
+#else
+ return 0;
+#endif
+}
+
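zone_highsize() turns a per-zone budget (the low watermark by default, or managed pages divided by percpu_pagelist_high_fraction) into a per-CPU pcp->high by splitting it across the CPUs local to the zone, with a floor of four batches. A small sketch of that arithmetic with made-up inputs:

#include <stdio.h>

static long pcp_high(long budget_pages, int nr_local_cpus,
		     int nr_online_cpus, int batch)
{
	/* A CPU-less memory node falls back to all online CPUs. */
	int nr_split = nr_local_cpus ? nr_local_cpus : nr_online_cpus;
	long high = budget_pages / nr_split;
	long floor = (long)batch << 2;		/* keep high >= batch * 4 */

	return high > floor ? high : floor;
}

int main(void)
{
	/* e.g. a low watermark of 16384 pages split across 8 local CPUs */
	printf("high = %ld\n", pcp_high(16384, 8, 8, 63));	/* 2048 */
	/* memory-only node: split across all 32 online CPUs */
	printf("high = %ld\n", pcp_high(16384, 0, 32, 63));	/* 512 */
	/* tiny budget: clamped to batch * 4 */
	printf("high = %ld\n", pcp_high(100, 8, 8, 63));	/* 252 */
	return 0;
}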
/*
* pcp->high and pcp->batch values are related and generally batch is lower
* than high. They are also related to pcp->count such that count is lower
@@ -6429,16 +6869,15 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
WRITE_ONCE(pcp->high, high);
}
-static void pageset_init(struct per_cpu_pageset *p)
+static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
{
- struct per_cpu_pages *pcp;
- int migratetype;
+ int pindex;
- memset(p, 0, sizeof(*p));
+ memset(pcp, 0, sizeof(*pcp));
+ memset(pzstats, 0, sizeof(*pzstats));
- pcp = &p->pcp;
- for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
- INIT_LIST_HEAD(&pcp->lists[migratetype]);
+ for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
+ INIT_LIST_HEAD(&pcp->lists[pindex]);
/*
* Set batch and high values safe for a boot pageset. A true percpu
@@ -6448,38 +6887,31 @@ static void pageset_init(struct per_cpu_pageset *p)
*/
pcp->high = BOOT_PAGESET_HIGH;
pcp->batch = BOOT_PAGESET_BATCH;
+ pcp->free_factor = 0;
}
static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
unsigned long batch)
{
- struct per_cpu_pageset *p;
+ struct per_cpu_pages *pcp;
int cpu;
for_each_possible_cpu(cpu) {
- p = per_cpu_ptr(zone->pageset, cpu);
- pageset_update(&p->pcp, high, batch);
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ pageset_update(pcp, high, batch);
}
}
/*
* Calculate and set new high and batch values for all per-cpu pagesets of a
- * zone, based on the zone's size and the percpu_pagelist_fraction sysctl.
+ * zone based on the zone's size.
*/
-static void zone_set_pageset_high_and_batch(struct zone *zone)
+static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
{
- unsigned long new_high, new_batch;
+ int new_high, new_batch;
- if (percpu_pagelist_fraction) {
- new_high = zone_managed_pages(zone) / percpu_pagelist_fraction;
- new_batch = max(1UL, new_high / 4);
- if ((new_high / 4) > (PAGE_SHIFT * 8))
- new_batch = PAGE_SHIFT * 8;
- } else {
- new_batch = zone_batchsize(zone);
- new_high = 6 * new_batch;
- new_batch = max(1UL, 1 * new_batch);
- }
+ new_batch = max(1, zone_batchsize(zone));
+ new_high = zone_highsize(zone, new_batch, cpu_online);
if (zone->pageset_high == new_high &&
zone->pageset_batch == new_batch)
@@ -6493,16 +6925,23 @@ static void zone_set_pageset_high_and_batch(struct zone *zone)
void __meminit setup_zone_pageset(struct zone *zone)
{
- struct per_cpu_pageset *p;
int cpu;
- zone->pageset = alloc_percpu(struct per_cpu_pageset);
+ /* Size may be 0 on !SMP && !NUMA */
+ if (sizeof(struct per_cpu_zonestat) > 0)
+ zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
+
+ zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
for_each_possible_cpu(cpu) {
- p = per_cpu_ptr(zone->pageset, cpu);
- pageset_init(p);
+ struct per_cpu_pages *pcp;
+ struct per_cpu_zonestat *pzstats;
+
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ per_cpu_pages_init(pcp, pzstats);
}
- zone_set_pageset_high_and_batch(zone);
+ zone_set_pageset_high_and_batch(zone, 0);
}
/*
@@ -6526,9 +6965,9 @@ void __init setup_per_cpu_pageset(void)
* the nodes these zones are associated with.
*/
for_each_possible_cpu(cpu) {
- struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
- memset(pcp->vm_numa_stat_diff, 0,
- sizeof(pcp->vm_numa_stat_diff));
+ struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
+ memset(pzstats->vm_numa_event, 0,
+ sizeof(pzstats->vm_numa_event));
}
#endif
@@ -6544,14 +6983,14 @@ static __meminit void zone_pcp_init(struct zone *zone)
* relies on the ability of the linker to provide the
* offset of a (static) per cpu variable into the per cpu area.
*/
- zone->pageset = &boot_pageset;
+ zone->per_cpu_pageset = &boot_pageset;
+ zone->per_cpu_zonestats = &boot_zonestats;
zone->pageset_high = BOOT_PAGESET_HIGH;
zone->pageset_batch = BOOT_PAGESET_BATCH;
if (populated_zone(zone))
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
- zone->name, zone->present_pages,
- zone_batchsize(zone));
+ pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
+ zone->present_pages, zone_batchsize(zone));
}
void __meminit init_currently_empty_zone(struct zone *zone,
@@ -6821,8 +7260,7 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
pgdat->node_spanned_pages = totalpages;
pgdat->node_present_pages = realtotalpages;
- printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
- realtotalpages);
+ pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}
#ifndef CONFIG_SPARSEMEM
@@ -7022,19 +7460,17 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
if (memmap_pages)
- printk(KERN_DEBUG
- " %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
+ pr_debug(" %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
} else
- pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
+ pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
}
/* Account for reserved pages */
if (j == 0 && freesize > dma_reserve) {
freesize -= dma_reserve;
- printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
- zone_names[0], dma_reserve);
+ pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
@@ -7057,11 +7493,10 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
set_pageblock_order();
setup_usemap(zone);
init_currently_empty_zone(zone, zone->zone_start_pfn, size);
- memmap_init_zone(zone);
}
}
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
+#ifdef CONFIG_FLATMEM
static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
unsigned long __maybe_unused start = 0;
@@ -7096,7 +7531,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
__func__, pgdat->node_id, (unsigned long)pgdat,
(unsigned long)pgdat->node_mem_map);
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
@@ -7109,7 +7544,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
}
#else
static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
+#endif /* CONFIG_FLATMEM */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
@@ -7477,7 +7912,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
}
/*
- * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
* such cases we allow max_zone_pfn sorted in the descending order
*/
bool __weak arch_has_descending_max_zone_pfns(void)
@@ -7583,6 +8018,8 @@ void __init free_area_init(unsigned long *max_zone_pfn)
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
+
+ memmap_init();
}
static int __init cmdline_parse_core(char *p, unsigned long *core,
@@ -7689,7 +8126,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
-void __init mem_init_print_info(const char *str)
+void __init mem_init_print_info(void)
{
unsigned long physpages, codesize, datasize, rosize, bss_size;
unsigned long init_code_size, init_data_size;
@@ -7728,17 +8165,17 @@ void __init mem_init_print_info(const char *str)
#ifdef CONFIG_HIGHMEM
", %luK highmem"
#endif
- "%s%s)\n",
+ ")\n",
nr_free_pages() << (PAGE_SHIFT - 10),
physpages << (PAGE_SHIFT - 10),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
- totalcma_pages << (PAGE_SHIFT - 10),
+ totalcma_pages << (PAGE_SHIFT - 10)
#ifdef CONFIG_HIGHMEM
- totalhigh_pages() << (PAGE_SHIFT - 10),
+ , totalhigh_pages() << (PAGE_SHIFT - 10)
#endif
- str ? ", " : "", str ? str : "");
+ );
}
/**
@@ -7759,6 +8196,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
static int page_alloc_cpu_dead(unsigned int cpu)
{
+ struct zone *zone;
lru_add_drain_cpu(cpu);
drain_pages(cpu);
@@ -7779,6 +8217,19 @@ static int page_alloc_cpu_dead(unsigned int cpu)
* race with what we are doing.
*/
cpu_vm_stats_fold(cpu);
+
+ for_each_populated_zone(zone)
+ zone_pcp_update(zone, 0);
+
+ return 0;
+}
+
+static int page_alloc_cpu_online(unsigned int cpu)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ zone_pcp_update(zone, 1);
return 0;
}
@@ -7804,8 +8255,9 @@ void __init page_alloc_init(void)
hashdist = 0;
#endif
- ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
- "mm/page_alloc:dead", NULL,
+ ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
+ "mm/page_alloc:pcp",
+ page_alloc_cpu_online,
page_alloc_cpu_dead);
WARN_ON(ret < 0);
}
@@ -7868,14 +8320,14 @@ static void setup_per_zone_lowmem_reserve(void)
unsigned long managed_pages = 0;
for (j = i + 1; j < MAX_NR_ZONES; j++) {
- if (clear) {
- zone->lowmem_reserve[j] = 0;
- } else {
- struct zone *upper_zone = &pgdat->node_zones[j];
+ struct zone *upper_zone = &pgdat->node_zones[j];
+
+ managed_pages += zone_managed_pages(upper_zone);
- managed_pages += zone_managed_pages(upper_zone);
+ if (clear)
+ zone->lowmem_reserve[j] = 0;
+ else
zone->lowmem_reserve[j] = managed_pages / ratio;
- }
}
}
}
@@ -7955,11 +8407,19 @@ static void __setup_per_zone_wmarks(void)
*/
void setup_per_zone_wmarks(void)
{
+ struct zone *zone;
static DEFINE_SPINLOCK(lock);
spin_lock(&lock);
__setup_per_zone_wmarks();
spin_unlock(&lock);
+
+ /*
+ * The watermark size have changed so update the pcpu batch
+ * and high limits or the limits may be inappropriate.
+ */
+ for_each_zone(zone)
+ zone_pcp_update(zone, 0);
}
/*
@@ -8138,38 +8598,38 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
}
/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu. It is the fraction of total pages in each zone that a hot per cpu
+ * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu
* pagelist can have before it gets flushed back to buddy allocator.
*/
-int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
+int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
+ int write, void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
- int old_percpu_pagelist_fraction;
+ int old_percpu_pagelist_high_fraction;
int ret;
mutex_lock(&pcp_batch_high_lock);
- old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+ old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (!write || ret < 0)
goto out;
/* Sanity checking to avoid pcp imbalance */
- if (percpu_pagelist_fraction &&
- percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
- percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+ if (percpu_pagelist_high_fraction &&
+ percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
+ percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
ret = -EINVAL;
goto out;
}
/* No change? */
- if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+ if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
goto out;
for_each_populated_zone(zone)
- zone_set_pageset_high_and_batch(zone);
+ zone_set_pageset_high_and_batch(zone, 0);
out:
mutex_unlock(&pcp_batch_high_lock);
return ret;
@@ -8222,6 +8682,7 @@ void *__init alloc_large_system_hash(const char *tablename,
void *table = NULL;
gfp_t gfp_flags;
bool virt;
+ bool huge;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -8289,6 +8750,7 @@ void *__init alloc_large_system_hash(const char *tablename,
} else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags);
virt = true;
+ huge = is_vm_area_hugepages(table);
} else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -8305,7 +8767,7 @@ void *__init alloc_large_system_hash(const char *tablename,
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
- virt ? "vmalloc" : "linear");
+ virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
if (_hash_shift)
*_hash_shift = log2qty;
@@ -8450,6 +8912,27 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
pageblock_nr_pages));
}
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
+/* Usage: See admin-guide/dynamic-debug-howto.rst */
+static void alloc_contig_dump_pages(struct list_head *page_list)
+{
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
+
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
+ struct page *page;
+
+ dump_stack();
+ list_for_each_entry(page, page_list, lru)
+ dump_page(page, "migration failure");
+ }
+}
+#else
+static inline void alloc_contig_dump_pages(struct list_head *page_list)
+{
+}
+#endif
+
/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
@@ -8464,7 +8947,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
- migrate_prep();
+ lru_cache_disable();
while (pfn < end || !list_empty(&cc->migratepages)) {
if (fatal_signal_pending(current)) {
@@ -8474,14 +8957,13 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
if (list_empty(&cc->migratepages)) {
cc->nr_migratepages = 0;
- pfn = isolate_migratepages_range(cc, pfn, end);
- if (!pfn) {
- ret = -EINTR;
+ ret = isolate_migratepages_range(cc, pfn, end);
+ if (ret && ret != -EAGAIN)
break;
- }
+ pfn = cc->migrate_pfn;
tries = 0;
} else if (++tries == 5) {
- ret = ret < 0 ? ret : -EBUSY;
+ ret = -EBUSY;
break;
}
@@ -8491,8 +8973,19 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
+
+ /*
+ * On -ENOMEM, migrate_pages() bails out right away. It is pointless
+ * to retry again over this error, so do the same here.
+ */
+ if (ret == -ENOMEM)
+ break;
}
+
+ lru_cache_enable();
if (ret < 0) {
+ if (ret == -EBUSY)
+ alloc_contig_dump_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
return ret;
}
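The migration loop above retries transient failures a bounded number of times but gives up immediately on -ENOMEM, since migrate_pages() has already bailed out at that point. A heavily simplified sketch of that retry policy; do_migrate() and fake_migrate() are hypothetical stand-ins for one isolation-plus-migration pass:

#include <errno.h>
#include <stdio.h>

#define MAX_TRIES 5

/*
 * Retry an operation that can fail transiently, but give up immediately
 * on a hard failure such as -ENOMEM, and after MAX_TRIES otherwise.
 */
static int migrate_with_retries(int (*do_migrate)(void))
{
	int tries = 0;

	for (;;) {
		int ret = do_migrate();

		if (ret == 0)
			return 0;
		if (ret == -ENOMEM)	/* pointless to retry, bail out */
			return ret;
		if (++tries == MAX_TRIES)
			return -EBUSY;
	}
}

static int fake_migrate(void)
{
	static int calls;
	return ++calls < 3 ? -EAGAIN : 0;	/* succeeds on the third pass */
}

int main(void)
{
	printf("result: %d\n", migrate_with_retries(fake_migrate));
	return 0;
}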
@@ -8503,7 +8996,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
* @end: one-past-the-last PFN to allocate
- * @migratetype: migratetype of the underlaying pageblocks (either
+ * @migratetype: migratetype of the underlying pageblocks (either
* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
* in range must have the same migratetype and it must
* be either of the two.
@@ -8583,7 +9076,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret && ret != -EBUSY)
goto done;
- ret =0;
+ ret = 0;
/*
* Pages from [start, end) are within a MAX_ORDER_NR_PAGES
@@ -8602,8 +9095,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* isolated thus they won't get removed from buddy.
*/
- lru_add_drain_all();
-
order = 0;
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
@@ -8629,8 +9120,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, 0)) {
- pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
- __func__, outer_start, end);
ret = -EBUSY;
goto done;
}
@@ -8680,12 +9169,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
if (PageReserved(page))
return false;
-
- if (page_count(page) > 0)
- return false;
-
- if (PageHuge(page))
- return false;
}
return true;
}
@@ -8757,9 +9240,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
}
#endif /* CONFIG_CONTIG_ALLOC */
-void free_contig_range(unsigned long pfn, unsigned int nr_pages)
+void free_contig_range(unsigned long pfn, unsigned long nr_pages)
{
- unsigned int count = 0;
+ unsigned long count = 0;
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -8767,18 +9250,18 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
count += page_count(page) != 1;
__free_page(page);
}
- WARN(count != 0, "%d pages are still in use!\n", count);
+ WARN(count != 0, "%lu pages are still in use!\n", count);
}
EXPORT_SYMBOL(free_contig_range);
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
- * page high values need to be recalulated.
+ * page high values need to be recalculated.
*/
-void __meminit zone_pcp_update(struct zone *zone)
+void zone_pcp_update(struct zone *zone, int cpu_online)
{
mutex_lock(&pcp_batch_high_lock);
- zone_set_pageset_high_and_batch(zone);
+ zone_set_pageset_high_and_batch(zone, cpu_online);
mutex_unlock(&pcp_batch_high_lock);
}
@@ -8805,21 +9288,19 @@ void zone_pcp_enable(struct zone *zone)
void zone_pcp_reset(struct zone *zone)
{
- unsigned long flags;
int cpu;
- struct per_cpu_pageset *pset;
+ struct per_cpu_zonestat *pzstats;
- /* avoid races with drain_pages() */
- local_irq_save(flags);
- if (zone->pageset != &boot_pageset) {
+ if (zone->per_cpu_pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
- pset = per_cpu_ptr(zone->pageset, cpu);
- drain_zonestat(zone, pset);
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ drain_zonestat(zone, pzstats);
}
- free_percpu(zone->pageset);
- zone->pageset = &boot_pageset;
+ free_percpu(zone->per_cpu_pageset);
+ free_percpu(zone->per_cpu_zonestats);
+ zone->per_cpu_pageset = &boot_pageset;
+ zone->per_cpu_zonestats = &boot_zonestats;
}
- local_irq_restore(flags);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
@@ -8947,6 +9428,8 @@ bool take_page_off_buddy(struct page *page)
del_page_from_free_list(page_head, zone, page_order);
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -1, migratetype);
ret = true;
break;
}
diff --git a/mm/page_counter.c b/mm/page_counter.c
index c6860f51b6c6..7d83641eb86b 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -52,9 +52,13 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
long new;
new = atomic_long_sub_return(nr_pages, &counter->usage);
- propagate_protected_usage(counter, new);
/* More uncharges than charges? */
- WARN_ON_ONCE(new < 0);
+ if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
+ new, nr_pages)) {
+ new = 0;
+ atomic_long_set(&counter->usage, new);
+ }
+ propagate_protected_usage(counter, new);
}
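page_counter_cancel() now clamps the counter back to zero after warning once, so a single misaccounted uncharge cannot leave usage permanently negative and skew later charges. A simplified userspace sketch of the same clamp using C11 atomics, with the warning reduced to a fprintf():

#include <stdatomic.h>
#include <stdio.h>

static atomic_long usage;

/*
 * Subtract nr_pages; if the result goes negative, warn and reset to 0 so
 * later charges are not judged against a bogus negative baseline.
 */
static void counter_cancel(long nr_pages)
{
	long new = atomic_fetch_sub(&usage, nr_pages) - nr_pages;

	if (new < 0) {
		fprintf(stderr, "counter underflow: %ld nr_pages=%ld\n",
			new, nr_pages);
		atomic_store(&usage, 0);
	}
}

int main(void)
{
	atomic_store(&usage, 3);
	counter_cancel(5);			/* underflow: warns, resets to 0 */
	printf("usage = %ld\n", atomic_load(&usage));
	return 0;
}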
/**
diff --git a/mm/page_ext.c b/mm/page_ext.c
index df6f74aac8e1..293b2685fc48 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -191,7 +191,7 @@ fail:
panic("Out of memory");
}
-#else /* CONFIG_FLAT_NODE_MEM_MAP */
+#else /* CONFIG_FLATMEM */
struct page_ext *lookup_page_ext(const struct page *page)
{
diff --git a/mm/page_owner.c b/mm/page_owner.c
index d15c7c4994f5..f51a57e92aa3 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -27,6 +27,7 @@ struct page_owner {
depot_stack_handle_t handle;
depot_stack_handle_t free_handle;
u64 ts_nsec;
+ u64 free_ts_nsec;
pid_t pid;
};
@@ -41,13 +42,7 @@ static void init_early_allocated_pages(void);
static int __init early_page_owner_param(char *buf)
{
- if (!buf)
- return -EINVAL;
-
- if (strcmp(buf, "on") == 0)
- page_owner_enabled = true;
-
- return 0;
+ return kstrtobool(buf, &page_owner_enabled);
}
early_param("page_owner", early_page_owner_param);
@@ -103,42 +98,30 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
return (void *)page_ext + page_owner_ops.offset;
}
-static inline bool check_recursive_alloc(unsigned long *entries,
- unsigned int nr_entries,
- unsigned long ip)
-{
- unsigned int i;
-
- for (i = 0; i < nr_entries; i++) {
- if (entries[i] == ip)
- return true;
- }
- return false;
-}
-
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[PAGE_OWNER_STACK_DEPTH];
depot_stack_handle_t handle;
unsigned int nr_entries;
- nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
-
/*
- * We need to check recursion here because our request to
- * stackdepot could trigger memory allocation to save new
- * entry. New memory allocation would reach here and call
- * stack_depot_save_entries() again if we don't catch it. There is
- * still not enough memory in stackdepot so it would try to
- * allocate memory again and loop forever.
+ * Avoid recursion.
+ *
+ * Sometimes page metadata allocation tracking requires more
+ * memory to be allocated:
+	 * - when a new stack trace is saved to stack depot
+ * - when backtrace itself is calculated (ia64)
*/
- if (check_recursive_alloc(entries, nr_entries, _RET_IP_))
+ if (current->in_page_owner)
return dummy_handle;
+ current->in_page_owner = 1;
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
handle = stack_depot_save(entries, nr_entries, flags);
if (!handle)
handle = failure_handle;
+ current->in_page_owner = 0;
return handle;
}
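save_stack() uses the new current->in_page_owner flag as a recursion guard: recording a stack trace can itself allocate memory, which would re-enter the tracking code. A minimal sketch of the same pattern with a thread-local flag; track_alloc() is a made-up hook, not the kernel interface:

#include <stdio.h>

static _Thread_local int in_tracker;

/*
 * Hypothetical allocation hook: the per-thread flag keeps the hook from
 * recursing when recording the event itself triggers another allocation.
 */
static void track_alloc(size_t size)
{
	if (in_tracker)
		return;			/* already inside the tracker, bail */
	in_tracker = 1;
	printf("tracked allocation of %zu bytes\n", size);	/* may allocate */
	in_tracker = 0;
}

int main(void)
{
	track_alloc(4096);
	return 0;
}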
@@ -146,25 +129,27 @@ void __reset_page_owner(struct page *page, unsigned int order)
{
int i;
struct page_ext *page_ext;
- depot_stack_handle_t handle = 0;
+ depot_stack_handle_t handle;
struct page_owner *page_owner;
-
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+ u64 free_ts_nsec = local_clock();
page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
return;
+
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
for (i = 0; i < (1 << order); i++) {
__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
page_owner = get_page_owner(page_ext);
page_owner->free_handle = handle;
+ page_owner->free_ts_nsec = free_ts_nsec;
page_ext = page_ext_next(page_ext);
}
}
-static inline void __set_page_owner_handle(struct page *page,
- struct page_ext *page_ext, depot_stack_handle_t handle,
- unsigned int order, gfp_t gfp_mask)
+static inline void __set_page_owner_handle(struct page_ext *page_ext,
+ depot_stack_handle_t handle,
+ unsigned int order, gfp_t gfp_mask)
{
struct page_owner *page_owner;
int i;
@@ -194,7 +179,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
return;
handle = save_stack(gfp_mask);
- __set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
+ __set_page_owner_handle(page_ext, handle, order, gfp_mask);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -243,11 +228,12 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
new_page_owner->handle = old_page_owner->handle;
new_page_owner->pid = old_page_owner->pid;
new_page_owner->ts_nsec = old_page_owner->ts_nsec;
+ new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
/*
* We don't clear the bit on the oldpage as it's going to be freed
* after migration. Until then, the info can be useful in case of
- * a bug, and the overal stats will be off a bit only temporarily.
+ * a bug, and the overall stats will be off a bit only temporarily.
* Also, migrate_misplaced_transhuge_page() can still fail the
* migration and then we want the oldpage to retain the info. But
* in that case we also don't need to explicitly clear the info from
@@ -356,10 +342,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
return -ENOMEM;
ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n",
+ "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n",
page_owner->order, page_owner->gfp_mask,
&page_owner->gfp_mask, page_owner->pid,
- page_owner->ts_nsec);
+ page_owner->ts_nsec, page_owner->free_ts_nsec);
if (ret >= count)
goto err;
@@ -406,7 +392,7 @@ err:
return -ENOMEM;
}
-void __dump_page_owner(struct page *page)
+void __dump_page_owner(const struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;
@@ -435,9 +421,9 @@ void __dump_page_owner(struct page *page)
else
pr_alert("page_owner tracks the page as freed\n");
- pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n",
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n",
page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
- page_owner->pid, page_owner->ts_nsec);
+ page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
handle = READ_ONCE(page_owner->handle);
if (!handle) {
@@ -612,7 +598,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
continue;
/* Found early allocated page */
- __set_page_owner_handle(page, page_ext, early_handle,
+ __set_page_owner_handle(page_ext, early_handle,
0, 0);
count++;
}
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 655dc5895604..98438985e1ed 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -2,6 +2,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/mmdebug.h>
#include <linux/highmem.h>
#include <linux/page_ext.h>
#include <linux/poison.h>
@@ -45,7 +46,7 @@ static bool single_bit_flip(unsigned char a, unsigned char b)
return error && !(error & (error - 1));
}
-static void check_poison_mem(unsigned char *mem, size_t bytes)
+static void check_poison_mem(struct page *page, unsigned char *mem, size_t bytes)
{
static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
unsigned char *start;
@@ -70,6 +71,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
end - start + 1, 1);
dump_stack();
+ dump_page(page, "pagealloc: corrupted page details");
}
static void unpoison_page(struct page *page)
@@ -83,7 +85,7 @@ static void unpoison_page(struct page *page)
* that is freed to buddy. Thus no extra check is done to
* see if a page was poisoned.
*/
- check_poison_mem(kasan_reset_tag(addr), PAGE_SIZE);
+ check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE);
kasan_enable_current();
kunmap_atomic(addr);
}
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index c50d93ffa252..382958eef8a9 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -4,12 +4,17 @@
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
+#include <linux/module.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>
#include "page_reporting.h"
#include "internal.h"
+unsigned int page_reporting_order = MAX_ORDER;
+module_param(page_reporting_order, uint, 0644);
+MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");
+
#define PAGE_REPORTING_DELAY (2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
@@ -31,8 +36,8 @@ __page_reporting_request(struct page_reporting_dev_info *prdev)
return;
/*
- * If reporting is already active there is nothing we need to do.
- * Test against 0 as that represents PAGE_REPORTING_IDLE.
+ * If reporting is already active there is nothing we need to do.
+ * Test against 0 as that represents PAGE_REPORTING_IDLE.
*/
state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
if (state != PAGE_REPORTING_IDLE)
@@ -229,7 +234,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
/* Generate minimum watermark to be able to guarantee progress */
watermark = low_wmark_pages(zone) +
- (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
+ (PAGE_REPORTING_CAPACITY << page_reporting_order);
/*
* Cancel request if insufficient free memory or if we failed
@@ -239,7 +244,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
return err;
/* Process each free list starting from lowest order/mt */
- for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
+ for (order = page_reporting_order; order < MAX_ORDER; order++) {
for (mt = 0; mt < MIGRATE_TYPES; mt++) {
/* We do not pull pages from the isolate free list */
if (is_migrate_isolate(mt))
@@ -324,6 +329,12 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
goto err_out;
}
+ /*
+	 * Update the page reporting order if it's specified by the driver.
+ * Otherwise, it falls back to @pageblock_order.
+ */
+ page_reporting_order = prdev->order ? : pageblock_order;
+
/* initialize state and work structures */
atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
index 2c385dd4ddbd..c51dbc228b94 100644
--- a/mm/page_reporting.h
+++ b/mm/page_reporting.h
@@ -10,10 +10,9 @@
#include <linux/pgtable.h>
#include <linux/scatterlist.h>
-#define PAGE_REPORTING_MIN_ORDER pageblock_order
-
#ifdef CONFIG_PAGE_REPORTING
DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
+extern unsigned int page_reporting_order;
void __page_reporting_notify(void);
static inline bool page_reported(struct page *page)
@@ -38,7 +37,7 @@ static inline void page_reporting_notify_free(unsigned int order)
return;
/* Determine if we have crossed reporting threshold */
- if (order < PAGE_REPORTING_MIN_ORDER)
+ if (order < page_reporting_order)
return;
/* This will add a few cycles, but should be called infrequently */
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 86e3a3688d59..a4435311754b 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -116,6 +116,13 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
return pfn_is_match(pvmw->page, pfn);
}
+static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
+{
+ pvmw->address = (pvmw->address + size) & ~(size - 1);
+ if (!pvmw->address)
+ pvmw->address = ULONG_MAX;
+}
+
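step_forward() rounds the walk address up to the next boundary of the table size that turned out to be absent, and pins the address to ULONG_MAX if the rounding wraps, so the enclosing while (pvmw->address < end) loop still terminates. A standalone sketch of the rounding, assuming a 64-bit unsigned long and a 2MB PMD:

#include <limits.h>
#include <stdio.h>

static unsigned long step_forward(unsigned long addr, unsigned long size)
{
	/* Round up to the next multiple of size (size is a power of two). */
	addr = (addr + size) & ~(size - 1);
	if (!addr)			/* wrapped past the top of the space */
		addr = ULONG_MAX;
	return addr;
}

int main(void)
{
	unsigned long pmd_size = 1UL << 21;	/* 2MB, a typical PMD_SIZE */

	printf("%#lx\n", step_forward(0x1234567, pmd_size));		/* 0x1400000 */
	printf("%#lx\n", step_forward(ULONG_MAX - 4096, pmd_size));	/* ULONG_MAX */
	return 0;
}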
/**
* page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
* @pvmw->address
@@ -134,7 +141,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
* regardless of which page table level the page is mapped at. @pvmw->pmd is
* NULL.
*
- * Retruns false if there are no more page table entries for the page in
+ * Returns false if there are no more page table entries for the page in
* the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped.
*
* If you need to stop the walk before page_vma_mapped_walk() returned false,
@@ -144,6 +151,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
{
struct mm_struct *mm = pvmw->vma->vm_mm;
struct page *page = pvmw->page;
+ unsigned long end;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -153,10 +161,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (pvmw->pmd && !pvmw->pte)
return not_found(pvmw);
- if (pvmw->pte)
- goto next_pte;
+ if (unlikely(PageHuge(page))) {
+ /* The only possible mapping was handled on last iteration */
+ if (pvmw->pte)
+ return not_found(pvmw);
- if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
if (!pvmw->pte)
@@ -168,78 +177,108 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
return not_found(pvmw);
return true;
}
-restart:
- pgd = pgd_offset(mm, pvmw->address);
- if (!pgd_present(*pgd))
- return false;
- p4d = p4d_offset(pgd, pvmw->address);
- if (!p4d_present(*p4d))
- return false;
- pud = pud_offset(p4d, pvmw->address);
- if (!pud_present(*pud))
- return false;
- pvmw->pmd = pmd_offset(pud, pvmw->address);
+
/*
- * Make sure the pmd value isn't cached in a register by the
- * compiler and used as a stale value after we've observed a
- * subsequent update.
+ * Seek to next pte only makes sense for THP.
+ * But more important than that optimization, is to filter out
+ * any PageKsm page: whose page->index misleads vma_address()
+ * and vma_address_end() to disaster.
*/
- pmde = READ_ONCE(*pvmw->pmd);
- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
- pvmw->ptl = pmd_lock(mm, pvmw->pmd);
- if (likely(pmd_trans_huge(*pvmw->pmd))) {
- if (pvmw->flags & PVMW_MIGRATION)
- return not_found(pvmw);
- if (pmd_page(*pvmw->pmd) != page)
- return not_found(pvmw);
- return true;
- } else if (!pmd_present(*pvmw->pmd)) {
- if (thp_migration_supported()) {
- if (!(pvmw->flags & PVMW_MIGRATION))
+ end = PageTransCompound(page) ?
+ vma_address_end(page, pvmw->vma) :
+ pvmw->address + PAGE_SIZE;
+ if (pvmw->pte)
+ goto next_pte;
+restart:
+ do {
+ pgd = pgd_offset(mm, pvmw->address);
+ if (!pgd_present(*pgd)) {
+ step_forward(pvmw, PGDIR_SIZE);
+ continue;
+ }
+ p4d = p4d_offset(pgd, pvmw->address);
+ if (!p4d_present(*p4d)) {
+ step_forward(pvmw, P4D_SIZE);
+ continue;
+ }
+ pud = pud_offset(p4d, pvmw->address);
+ if (!pud_present(*pud)) {
+ step_forward(pvmw, PUD_SIZE);
+ continue;
+ }
+
+ pvmw->pmd = pmd_offset(pud, pvmw->address);
+ /*
+ * Make sure the pmd value isn't cached in a register by the
+ * compiler and used as a stale value after we've observed a
+ * subsequent update.
+ */
+ pmde = READ_ONCE(*pvmw->pmd);
+
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+ pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ pmde = *pvmw->pmd;
+ if (likely(pmd_trans_huge(pmde))) {
+ if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
- if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) {
- swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd);
+ if (pmd_page(pmde) != page)
+ return not_found(pvmw);
+ return true;
+ }
+ if (!pmd_present(pmde)) {
+ swp_entry_t entry;
- if (migration_entry_to_page(entry) != page)
- return not_found(pvmw);
- return true;
- }
+ if (!thp_migration_supported() ||
+ !(pvmw->flags & PVMW_MIGRATION))
+ return not_found(pvmw);
+ entry = pmd_to_swp_entry(pmde);
+ if (!is_migration_entry(entry) ||
+ migration_entry_to_page(entry) != page)
+ return not_found(pvmw);
+ return true;
}
- return not_found(pvmw);
- } else {
/* THP pmd was split under us: handle on pte level */
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
+ } else if (!pmd_present(pmde)) {
+ /*
+ * If PVMW_SYNC, take and drop THP pmd lock so that we
+ * cannot return prematurely, while zap_huge_pmd() has
+ * cleared *pmd but not decremented compound_mapcount().
+ */
+ if ((pvmw->flags & PVMW_SYNC) &&
+ PageTransCompound(page)) {
+ spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+
+ spin_unlock(ptl);
+ }
+ step_forward(pvmw, PMD_SIZE);
+ continue;
}
- } else if (!pmd_present(pmde)) {
- return false;
- }
- if (!map_pte(pvmw))
- goto next_pte;
- while (1) {
+ if (!map_pte(pvmw))
+ goto next_pte;
+this_pte:
if (check_pte(pvmw))
return true;
next_pte:
- /* Seek to next pte only makes sense for THP */
- if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
- return not_found(pvmw);
do {
pvmw->address += PAGE_SIZE;
- if (pvmw->address >= pvmw->vma->vm_end ||
- pvmw->address >=
- __vma_address(pvmw->page, pvmw->vma) +
- thp_size(pvmw->page))
+ if (pvmw->address >= end)
return not_found(pvmw);
/* Did we cross page table boundary? */
- if (pvmw->address % PMD_SIZE == 0) {
- pte_unmap(pvmw->pte);
+ if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
if (pvmw->ptl) {
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
}
+ pte_unmap(pvmw->pte);
+ pvmw->pte = NULL;
goto restart;
- } else {
- pvmw->pte++;
+ }
+ pvmw->pte++;
+ if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) {
+ pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+ spin_lock(pvmw->ptl);
}
} while (pte_none(*pvmw->pte));
@@ -247,7 +286,10 @@ next_pte:
pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
spin_lock(pvmw->ptl);
}
- }
+ goto this_pte;
+ } while (pvmw->address < end);
+
+ return false;
}
/**
@@ -266,14 +308,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
.vma = vma,
.flags = PVMW_SYNC,
};
- unsigned long start, end;
-
- start = __vma_address(page, vma);
- end = start + thp_size(page) - PAGE_SIZE;
- if (unlikely(end < vma->vm_start || start >= vma->vm_end))
+ pvmw.address = vma_address(page, vma);
+ if (pvmw.address == -EFAULT)
return 0;
- pvmw.address = max(start, vma->vm_start);
if (!page_vma_mapped_walk(&pvmw))
return 0;
page_vma_mapped_walk_done(&pvmw);
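
Editor's note: the step_forward() helper added at the top of this file's hunks advances pvmw->address to the next page-table boundary when an upper-level entry is absent. A minimal user-space model of that arithmetic, assuming the usual 2 MiB PMD_SIZE of x86_64 with 4 KiB pages (the constant is an assumption, not taken from the patch):

#include <stdio.h>
#include <limits.h>

#define PMD_SIZE (2UL << 20)    /* assumed: 2 MiB, x86_64 with 4 KiB pages */

/* Same rounding as step_forward(): next size-aligned boundary, saturating. */
static unsigned long step_forward(unsigned long address, unsigned long size)
{
        address = (address + size) & ~(size - 1);
        return address ? address : ULONG_MAX;
}

int main(void)
{
        printf("%#lx -> %#lx\n", 0x7f1234561000UL,
               step_forward(0x7f1234561000UL, PMD_SIZE));
        /* Near the top of the address space the sum wraps to 0, which the
         * helper turns into ULONG_MAX so the do/while loop terminates. */
        printf("%#lx -> %#lx\n", ULONG_MAX - 0xfffUL,
               step_forward(ULONG_MAX - 0xfffUL, PMD_SIZE));
        return 0;
}
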
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..9b3db11a4d1d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,6 +58,45 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return err;
}
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk, int pdshift)
+{
+ int err = 0;
+ const struct mm_walk_ops *ops = walk->ops;
+ int shift = hugepd_shift(*phpd);
+ int page_size = 1 << shift;
+
+ if (!ops->pte_entry)
+ return 0;
+
+ if (addr & (page_size - 1))
+ return 0;
+
+ for (;;) {
+ pte_t *pte;
+
+ spin_lock(&walk->mm->page_table_lock);
+ pte = hugepte_offset(*phpd, addr, pdshift);
+ err = ops->pte_entry(pte, addr, addr + page_size, walk);
+ spin_unlock(&walk->mm->page_table_lock);
+
+ if (err)
+ break;
+ if (addr >= end - page_size)
+ break;
+ addr += page_size;
+ }
+ return err;
+}
+#else
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk, int pdshift)
+{
+ return 0;
+}
+#endif
+
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@@ -108,7 +147,10 @@ again:
goto again;
}
- err = walk_pte_range(pmd, addr, next, walk);
+ if (is_hugepd(__hugepd(pmd_val(*pmd))))
+ err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
+ else
+ err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
} while (pmd++, addr = next, addr != end);
@@ -157,7 +199,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (pud_none(*pud))
goto again;
- err = walk_pmd_range(pud, addr, next, walk);
+ if (is_hugepd(__hugepd(pud_val(*pud))))
+ err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
+ else
+ err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
@@ -189,7 +234,9 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
if (err)
break;
}
- if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+ if (is_hugepd(__hugepd(p4d_val(*p4d))))
+ err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
+ else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -224,8 +271,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
if (err)
break;
}
- if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
- ops->pte_entry)
+ if (is_hugepd(__hugepd(pgd_val(*pgd))))
+ err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
+ else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
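
Editor's note: with walk_hugepd_range() wired into every level above, a walker that provides only a ->pte_entry callback now also visits hugepd-mapped entries instead of skipping them. A minimal sketch of such a walker; the counter and function names are illustrative, and the caller is assumed to hold mmap_read_lock():

#include <linux/mm.h>
#include <linux/pagewalk.h>

struct pte_counter {
        unsigned long present;          /* illustrative private state */
};

static int count_pte(pte_t *pte, unsigned long addr,
                     unsigned long next, struct mm_walk *walk)
{
        struct pte_counter *c = walk->private;

        if (pte_present(*pte))
                c->present++;
        return 0;
}

static const struct mm_walk_ops count_ops = {
        .pte_entry      = count_pte,
};

static unsigned long count_present_ptes(struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
{
        struct pte_counter c = { 0 };

        walk_page_range(mm, start, end, &count_ops, &c);
        return c.present;
}
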
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index b6dc22904088..639662c20c82 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -124,7 +124,7 @@ struct percpu_stats {
u64 nr_max_alloc; /* max # of live allocations */
u32 nr_chunks; /* current # of live chunks */
u32 nr_max_chunks; /* max # of live chunks */
- size_t min_alloc_size; /* min allocaiton size */
+ size_t min_alloc_size; /* min allocation size */
size_t max_alloc_size; /* max allocation size */
};
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 057546f5555e..ee5d89fcd66f 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -8,6 +8,7 @@
* Chunks are mapped into vmalloc areas and populated page by page.
* This is the default chunk allocator.
*/
+#include "internal.h"
static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
@@ -133,7 +134,7 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
- unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+ vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT));
}
/**
@@ -192,8 +193,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
int nr_pages)
{
- return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
- PAGE_KERNEL, pages);
+ return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
+ PAGE_KERNEL, pages, PAGE_SHIFT);
}
/**
diff --git a/mm/percpu.c b/mm/percpu.c
index f4c83217f217..b4cebeca4c0c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1906,7 +1906,7 @@ fail:
pr_info("limit reached, disable warning\n");
}
if (is_atomic) {
- /* see the flag handling in pcpu_blance_workfn() */
+ /* see the flag handling in pcpu_balance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
} else {
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
index 1dcc865029a2..e9e879de8649 100644
--- a/mm/pgalloc-track.h
+++ b/mm/pgalloc-track.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_PGALLLC_TRACK_H
-#define _LINUX_PGALLLC_TRACK_H
+#ifndef _LINUX_PGALLOC_TRACK_H
+#define _LINUX_PGALLOC_TRACK_H
#if defined(CONFIG_MMU)
static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
@@ -48,4 +48,4 @@ static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
(__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
NULL: pte_offset_kernel(pmd, address))
-#endif /* _LINUX_PGALLLC_TRACK_H */
+#endif /* _LINUX_PGALLOC_TRACK_H */
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c2210e1cdb51..4e640baf9794 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -135,9 +135,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(!pmd_present(*pmdp));
- /* Below assumes pmd_present() is true */
- VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index f5fee9cf90f8..4bcc11958089 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -9,7 +9,6 @@
#include <linux/mm.h>
#include <linux/uio.h>
#include <linux/sched.h>
-#include <linux/compat.h>
#include <linux/sched/mm.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 4354c1422d57..da751448d0e4 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -111,7 +111,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- pte_t val = READ_ONCE(*pte);
+ pte_t val = ptep_get(pte);
if (st->effective_prot)
st->effective_prot(st, 4, pte_val(val));
diff --git a/mm/readahead.c b/mm/readahead.c
index c5b0457415be..d589f147f4c2 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -198,8 +198,6 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
for (i = 0; i < nr_to_read; i++) {
struct page *page = xa_load(&mapping->i_pages, index + i);
- BUG_ON(index + i != ractl->_index + ractl->_nr_pages);
-
if (page && !xa_is_value(page)) {
/*
* Page already present? Kick off the current batch
@@ -210,6 +208,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* not worth getting one just for that.
*/
read_pages(ractl, &page_pool, true);
+ i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
@@ -223,6 +222,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
gfp_mask) < 0) {
put_page(page);
read_pages(ractl, &page_pool, true);
+ i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
if (i == nr_to_read - lookahead_size)
@@ -272,9 +272,10 @@ void do_page_cache_ra(struct readahead_control *ractl,
* memory at once.
*/
void force_page_cache_ra(struct readahead_control *ractl,
- struct file_ra_state *ra, unsigned long nr_to_read)
+ unsigned long nr_to_read)
{
struct address_space *mapping = ractl->mapping;
+ struct file_ra_state *ra = ractl->ra;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages, index;
@@ -433,10 +434,10 @@ static int try_context_readahead(struct address_space *mapping,
* A minimal readahead algorithm for trivial sequential/random reads.
*/
static void ondemand_readahead(struct readahead_control *ractl,
- struct file_ra_state *ra, bool hit_readahead_marker,
- unsigned long req_size)
+ bool hit_readahead_marker, unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
+ struct file_ra_state *ra = ractl->ra;
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
unsigned long index = readahead_index(ractl);
@@ -550,7 +551,7 @@ readit:
}
void page_cache_sync_ra(struct readahead_control *ractl,
- struct file_ra_state *ra, unsigned long req_count)
+ unsigned long req_count)
{
bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
@@ -560,7 +561,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
* read-ahead will do the right thing and limit the read to just the
* requested range, which we'll set to 1 page for this case.
*/
- if (!ra->ra_pages || blk_cgroup_congested()) {
+ if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
if (!ractl->file)
return;
req_count = 1;
@@ -569,21 +570,20 @@ void page_cache_sync_ra(struct readahead_control *ractl,
/* be dumb */
if (do_forced_ra) {
- force_page_cache_ra(ractl, ra, req_count);
+ force_page_cache_ra(ractl, req_count);
return;
}
/* do read-ahead */
- ondemand_readahead(ractl, ra, false, req_count);
+ ondemand_readahead(ractl, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
void page_cache_async_ra(struct readahead_control *ractl,
- struct file_ra_state *ra, struct page *page,
- unsigned long req_count)
+ struct page *page, unsigned long req_count)
{
/* no read-ahead */
- if (!ra->ra_pages)
+ if (!ractl->ra->ra_pages)
return;
/*
@@ -604,7 +604,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
return;
/* do read-ahead */
- ondemand_readahead(ractl, ra, true, req_count);
+ ondemand_readahead(ractl, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
@@ -638,3 +638,78 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
return ksys_readahead(fd, offset, count);
}
+
+/**
+ * readahead_expand - Expand a readahead request
+ * @ractl: The request to be expanded
+ * @new_start: The revised start
+ * @new_len: The revised size of the request
+ *
+ * Attempt to expand a readahead request outwards from the current size to the
+ * specified size by inserting locked pages before and after the current window
+ * to increase the size to the new window. This may involve the insertion of
+ * THPs, in which case the window may get expanded even beyond what was
+ * requested.
+ *
+ * The algorithm will stop if it encounters a conflicting page already in the
+ * pagecache and leave a smaller expansion than requested.
+ *
+ * The caller must check for this by examining the revised @ractl object for a
+ * different expansion than was requested.
+ */
+void readahead_expand(struct readahead_control *ractl,
+ loff_t new_start, size_t new_len)
+{
+ struct address_space *mapping = ractl->mapping;
+ struct file_ra_state *ra = ractl->ra;
+ pgoff_t new_index, new_nr_pages;
+ gfp_t gfp_mask = readahead_gfp_mask(mapping);
+
+ new_index = new_start / PAGE_SIZE;
+
+ /* Expand the leading edge downwards */
+ while (ractl->_index > new_index) {
+ unsigned long index = ractl->_index - 1;
+ struct page *page = xa_load(&mapping->i_pages, index);
+
+ if (page && !xa_is_value(page))
+ return; /* Page apparently present */
+
+ page = __page_cache_alloc(gfp_mask);
+ if (!page)
+ return;
+ if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
+ put_page(page);
+ return;
+ }
+
+ ractl->_nr_pages++;
+ ractl->_index = page->index;
+ }
+
+ new_len += new_start - readahead_pos(ractl);
+ new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);
+
+ /* Expand the trailing edge upwards */
+ while (ractl->_nr_pages < new_nr_pages) {
+ unsigned long index = ractl->_index + ractl->_nr_pages;
+ struct page *page = xa_load(&mapping->i_pages, index);
+
+ if (page && !xa_is_value(page))
+ return; /* Page apparently present */
+
+ page = __page_cache_alloc(gfp_mask);
+ if (!page)
+ return;
+ if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
+ put_page(page);
+ return;
+ }
+ ractl->_nr_pages++;
+ if (ra) {
+ ra->size++;
+ ra->async_size++;
+ }
+ }
+}
+EXPORT_SYMBOL(readahead_expand);
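
Editor's note: a hedged sketch of how a filesystem ->readahead() implementation might use the new readahead_expand(), rounding the window out to an assumed 64 KiB block size before reading. The hook name and block size are illustrative; as the kernel-doc above says, expansion is best-effort, so the revised window must be re-read afterwards:

#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/printk.h>

#define EXAMPLE_BLOCK_SIZE      (64 * 1024)     /* assumed block granularity */

static void example_readahead(struct readahead_control *ractl)
{
        loff_t start = round_down(readahead_pos(ractl), EXAMPLE_BLOCK_SIZE);
        size_t len = round_up(readahead_pos(ractl) + readahead_length(ractl),
                              EXAMPLE_BLOCK_SIZE) - start;

        readahead_expand(ractl, start, len);

        /* Expansion may have stopped early; use the revised window. */
        pr_debug("readahead window now %lld+%zu\n",
                 readahead_pos(ractl), readahead_length(ractl));
}
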
diff --git a/mm/rmap.c b/mm/rmap.c
index b0fc27e77d6d..e05c300048e6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -257,7 +257,7 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
*
- * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
* anon_vma_fork(). The first three want an exact copy of src, while the last
* one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
* endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
@@ -707,7 +707,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
- unsigned long address;
if (PageAnon(page)) {
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
@@ -717,15 +716,13 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
- } else if (page->mapping) {
- if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
- return -EFAULT;
- } else
+ } else if (!vma->vm_file) {
return -EFAULT;
- address = __vma_address(page, vma);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
return -EFAULT;
- return address;
+ }
+
+ return vma_address(page, vma);
}
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
@@ -919,7 +916,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- min(vma->vm_end, address + page_size(page)));
+ vma_address_end(page, vma));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
@@ -1405,6 +1402,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ if (flags & TTU_SYNC)
+ pvmw.flags = PVMW_SYNC;
+
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
return true;
@@ -1426,9 +1432,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
+ range.end = PageKsm(page) ?
+ address + PAGE_SIZE : vma_address_end(page, vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address,
- min(vma->vm_end, address + page_size(page)));
+ address, range.end);
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
@@ -1777,7 +1784,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
else
rmap_walk(page, &rwc);
- return !page_mapcount(page) ? true : false;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ return !page_mapcount(page);
}
/**
@@ -1874,6 +1887,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
@@ -1928,6 +1942,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
pgoff_start, pgoff_end) {
unsigned long address = vma_address(page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
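
Editor's note: a minimal sketch of a caller that wants the new TTU_SYNC behaviour. When a false return must reliably mean "still mapped" rather than a race with a concurrent zap, the sync flag makes the walk serialize on the page table lock before giving up. The wrapper and flag combination are illustrative, not taken from this patch:

#include <linux/mm.h>
#include <linux/rmap.h>

/* Returns true only once all rmaps are really gone; expects a locked page. */
static bool example_unmap_sync(struct page *page)
{
        VM_BUG_ON_PAGE(!PageLocked(page), page);

        return try_to_unmap(page, TTU_SYNC | TTU_IGNORE_MLOCK);
}
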
diff --git a/mm/shmem.c b/mm/shmem.c
index b2db4ed0fbc7..6268b9b4e41a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1695,8 +1695,9 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
- struct page *page;
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
+ struct swap_info_struct *si;
+ struct page *page = NULL;
swp_entry_t swap;
int error;
@@ -1704,6 +1705,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
swap = radix_to_swp_entry(*pagep);
*pagep = NULL;
+ /* Prevent swapoff from happening to us. */
+ si = get_swap_device(swap);
+ if (!si) {
+ error = -EINVAL;
+ goto failed;
+ }
/* Look it up and read it in.. */
page = lookup_swap_cache(swap, NULL, 0);
if (!page) {
@@ -1765,6 +1772,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
swap_free(swap);
*pagep = page;
+ if (si)
+ put_swap_device(si);
return 0;
failed:
if (!shmem_confirm_swap(mapping, index, swap))
@@ -1775,6 +1784,9 @@ unlock:
put_page(page);
}
+ if (si)
+ put_swap_device(si);
+
return error;
}
@@ -1816,7 +1828,7 @@ repeat:
}
sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = vma ? vma->vm_mm : current->mm;
+ charge_mm = vma ? vma->vm_mm : NULL;
page = pagecache_get_page(mapping, index,
FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
@@ -2227,7 +2239,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
}
#endif
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
struct inode *inode = file_inode(file);
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2239,13 +2251,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
* no serialization needed when called from shm_destroy().
*/
if (lock && !(info->flags & VM_LOCKED)) {
- if (!user_shm_lock(inode->i_size, user))
+ if (!user_shm_lock(inode->i_size, ucounts))
goto out_nomem;
info->flags |= VM_LOCKED;
mapping_set_unevictable(file->f_mapping);
}
- if (!lock && (info->flags & VM_LOCKED) && user) {
- user_shm_unlock(inode->i_size, user);
+ if (!lock && (info->flags & VM_LOCKED) && ucounts) {
+ user_shm_unlock(inode->i_size, ucounts);
info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping);
}
@@ -2258,25 +2270,11 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+ int ret;
- if (info->seals & F_SEAL_FUTURE_WRITE) {
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * "future write" seal active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
-
- /*
- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
- * MAP_SHARED and read-only, take care to not allow mprotect to
- * revert protections on such mappings. Do this only for shared
- * mappings. For private mappings, don't need to mask
- * VM_MAYWRITE as we still want them to be COW-writable.
- */
- if (vma->vm_flags & VM_SHARED)
- vma->vm_flags &= ~(VM_MAYWRITE);
- }
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
/* arm64 - allow memory tagging on RAM-based files */
vma->vm_flags |= VM_MTE_ALLOWED;
@@ -2375,8 +2373,18 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
pgoff_t offset, max_off;
ret = -ENOMEM;
- if (!shmem_inode_acct_block(inode, 1))
+ if (!shmem_inode_acct_block(inode, 1)) {
+ /*
+ * We may have got a page, returned -ENOENT triggering a retry,
+ * and now we find ourselves with -ENOMEM. Release the page, to
+ * avoid a BUG_ON in our caller.
+ */
+ if (unlikely(*pagep)) {
+ put_page(*pagep);
+ *pagep = NULL;
+ }
goto out;
+ }
if (!*pagep) {
page = shmem_alloc_page(gfp, info, pgoff);
@@ -2846,6 +2854,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_ffree = sbinfo->free_inodes;
}
/* else leave those fields 0 like simple_statfs */
+
+ buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
+
return 0;
}
@@ -3505,7 +3516,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
}
}
if (*this_char) {
- char *value = strchr(this_char,'=');
+ char *value = strchr(this_char, '=');
size_t len = 0;
int err;
@@ -4093,7 +4104,7 @@ int shmem_unuse(unsigned int type, bool frontswap,
return 0;
}
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
return 0;
}
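
Editor's note: the shmem change above follows the usual swapoff-pinning pattern, which holds a reference on the swap_info_struct across the swap-cache lookup so the device cannot be torn down underneath it. A stripped-down sketch of that pattern (the wrapper name is illustrative):

#include <linux/swap.h>

static struct page *example_lookup_pinned(swp_entry_t swap)
{
        struct swap_info_struct *si;
        struct page *page;

        si = get_swap_device(swap);     /* NULL if swapoff already won */
        if (!si)
                return NULL;

        page = lookup_swap_cache(swap, NULL, 0);

        put_swap_device(si);
        return page;
}
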
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 9c2e145a747a..c13c33b247e8 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -147,8 +147,8 @@ void __meminit __shuffle_zone(struct zone *z)
spin_unlock_irqrestore(&z->lock, flags);
}
-/**
- * shuffle_free_memory - reduce the predictability of the page allocator
+/*
+ * __shuffle_free_memory - reduce the predictability of the page allocator
* @pgdat: node page data
*/
void __meminit __shuffle_free_memory(pg_data_t *pgdat)
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 71b784f0b7c3..cec62984f7d3 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -10,7 +10,7 @@
DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
extern void __shuffle_free_memory(pg_data_t *pgdat);
extern bool shuffle_pick_tail(void);
-static inline void shuffle_free_memory(pg_data_t *pgdat)
+static inline void __meminit shuffle_free_memory(pg_data_t *pgdat)
{
if (!static_branch_unlikely(&page_alloc_shuffle_key))
return;
@@ -18,7 +18,7 @@ static inline void shuffle_free_memory(pg_data_t *pgdat)
}
extern void __shuffle_zone(struct zone *z);
-static inline void shuffle_zone(struct zone *z)
+static inline void __meminit shuffle_zone(struct zone *z)
{
if (!static_branch_unlikely(&page_alloc_shuffle_key))
return;
diff --git a/mm/slab.c b/mm/slab.c
index ae651bf540b7..d0f725637663 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -259,7 +259,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define BATCHREFILL_LIMIT 16
/*
- * Optimization question: fewer reaps means less probability for unnessary
+ * Optimization question: fewer reaps means less probability for unnecessary
* cpucache drain/refill cycles.
*
* OTOH the cpuarrays can contain lots of objects,
@@ -2284,7 +2284,7 @@ void __kmem_cache_release(struct kmem_cache *cachep)
* Because if it is the case, that means we defer the creation of
* the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
* And we eventually call down to __kmem_cache_create(), which
- * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
+ * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
* This is a "chicken-and-egg" problem.
*
* So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
@@ -2381,8 +2381,8 @@ union freelist_init_state {
};
/*
- * Initialize the state based on the randomization methode available.
- * return true if the pre-computed list is available, false otherwize.
+ * Initialize the state based on the randomization method available.
+ * return true if the pre-computed list is available, false otherwise.
*/
static bool freelist_state_initialize(union freelist_init_state *state,
struct kmem_cache *cachep,
@@ -3216,6 +3216,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
void *ptr;
int slab_node = numa_mem_id();
struct obj_cgroup *objcg = NULL;
+ bool init = false;
flags &= gfp_allowed_mask;
cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
@@ -3254,12 +3255,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
-
- if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
- memset(ptr, 0, cachep->object_size);
+ init = slab_want_init_on_alloc(flags, cachep);
out_hooks:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
return ptr;
}
@@ -3301,6 +3300,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
unsigned long save_flags;
void *objp;
struct obj_cgroup *objcg = NULL;
+ bool init = false;
flags &= gfp_allowed_mask;
cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
@@ -3317,12 +3317,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
-
- if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
- memset(objp, 0, cachep->object_size);
+ init = slab_want_init_on_alloc(flags, cachep);
out:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init);
return objp;
}
@@ -3427,17 +3425,24 @@ free_done:
static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
+ bool init;
+
if (is_kfence_address(objp)) {
kmemleak_free_recursive(objp, cachep->flags);
__kfence_free(objp);
return;
}
- if (unlikely(slab_want_init_on_free(cachep)))
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_free and initialization memset must be
+ * kept together to avoid discrepancies in behavior.
+ */
+ init = slab_want_init_on_free(cachep);
+ if (init && !kasan_has_integrated_init())
memset(objp, 0, cachep->object_size);
-
- /* Put the object into the quarantine, don't touch it for now. */
- if (kasan_slab_free(cachep, objp))
+ /* KASAN might put objp into memory quarantine, delaying its reuse. */
+ if (kasan_slab_free(cachep, objp, init))
return;
/* Use KCSAN to help debug racy use-after-free. */
@@ -3542,18 +3547,18 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
- /* Clear memory outside IRQ disabled section */
- if (unlikely(slab_want_init_on_alloc(flags, s)))
- for (i = 0; i < size; i++)
- memset(p[i], 0, s->object_size);
-
- slab_post_alloc_hook(s, objcg, flags, size, p);
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled section.
+ */
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s));
/* FIXME: Trace call missing. Christoph would like a bulk variant */
return size;
error:
local_irq_enable();
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
- slab_post_alloc_hook(s, objcg, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3651,6 +3656,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif /* CONFIG_NUMA */
+#ifdef CONFIG_PRINTK
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
{
struct kmem_cache *cachep;
@@ -3670,6 +3676,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
if (DEBUG && cachep->flags & SLAB_STORE_USER)
kpp->kp_ret = *dbg_userword(cachep, objp);
}
+#endif
/**
* __do_kmalloc - allocate memory
diff --git a/mm/slab.h b/mm/slab.h
index 076582f58f68..7b60ef2f32c3 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -215,6 +215,7 @@ DECLARE_STATIC_KEY_TRUE(slub_debug_enabled);
DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
extern void print_tracking(struct kmem_cache *s, void *object);
+long validate_slab_cache(struct kmem_cache *s);
#else
static inline void print_tracking(struct kmem_cache *s, void *object)
{
@@ -239,6 +240,8 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
gfp_t gfp, bool new_page);
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr);
static inline void memcg_free_page_obj_cgroups(struct page *page)
{
@@ -283,20 +286,6 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
return true;
}
-static inline void mod_objcg_state(struct obj_cgroup *objcg,
- struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
-{
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- mod_memcg_lruvec_state(lruvec, idx, nr);
- rcu_read_unlock();
-}
-
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
struct obj_cgroup *objcg,
gfp_t flags, size_t size,
@@ -309,7 +298,6 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
if (!memcg_kmem_enabled() || !objcg)
return;
- flags &= ~__GFP_ACCOUNT;
for (i = 0; i < size; i++) {
if (likely(p[i])) {
page = virt_to_head_page(p[i]);
@@ -506,15 +494,24 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
}
static inline void slab_post_alloc_hook(struct kmem_cache *s,
- struct obj_cgroup *objcg,
- gfp_t flags, size_t size, void **p)
+ struct obj_cgroup *objcg, gfp_t flags,
+ size_t size, void **p, bool init)
{
size_t i;
flags &= gfp_allowed_mask;
+
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_alloc and initialization memset must be
+ * kept together to avoid discrepancies in behavior.
+ *
+ * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
+ */
for (i = 0; i < size; i++) {
- p[i] = kasan_slab_alloc(s, p[i], flags);
- /* As p[i] might get tagged, call kmemleak hook after KASAN. */
+ p[i] = kasan_slab_alloc(s, p[i], flags, init);
+ if (p[i] && init && !kasan_has_integrated_init())
+ memset(p[i], 0, s->object_size);
kmemleak_alloc_recursive(p[i], s->object_size, 1,
s->flags, flags);
}
@@ -601,7 +598,8 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
{
- if (static_branch_unlikely(&init_on_alloc)) {
+ if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+ &init_on_alloc)) {
if (c->ctor)
return false;
if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
@@ -613,12 +611,20 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
static inline bool slab_want_init_on_free(struct kmem_cache *c)
{
- if (static_branch_unlikely(&init_on_free))
+ if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
+ &init_on_free))
return !(c->ctor ||
(c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)));
return false;
}
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
+void debugfs_slab_release(struct kmem_cache *);
+#else
+static inline void debugfs_slab_release(struct kmem_cache *s) { }
+#endif
+
+#ifdef CONFIG_PRINTK
#define KS_ADDRS_COUNT 16
struct kmem_obj_info {
void *kp_ptr;
@@ -630,5 +636,6 @@ struct kmem_obj_info {
void *kp_stack[KS_ADDRS_COUNT];
};
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
+#endif
#endif /* MM_SLAB_H */
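
Editor's note: as the slab_want_init_on_alloc()/slab_want_init_on_free() hunks above show, caches with a constructor (or SLAB_TYPESAFE_BY_RCU / SLAB_POISON) opt out of init_on_alloc/init_on_free wiping. A hedged sketch of such a cache; the structure and names are illustrative:

#include <linux/init.h>
#include <linux/slab.h>

struct widget {
        int state;
};

/* A constructor means objects are never pre-zeroed by init_on_alloc;
 * the ctor is responsible for putting them into a known state instead. */
static void widget_ctor(void *obj)
{
        struct widget *w = obj;

        w->state = 1;
}

static struct kmem_cache *widget_cache;

static int __init widget_cache_init(void)
{
        widget_cache = kmem_cache_create("widget", sizeof(struct widget), 0,
                                         SLAB_HWCACHE_ALIGN, widget_ctor);
        return widget_cache ? 0 : -ENOMEM;
}
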
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 88e833986332..c126e6f6b5a5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -71,11 +71,19 @@ static int __init setup_slab_nomerge(char *str)
return 1;
}
+static int __init setup_slab_merge(char *str)
+{
+ slab_nomerge = false;
+ return 1;
+}
+
#ifdef CONFIG_SLUB
__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
+__setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
#endif
__setup("slab_nomerge", setup_slab_nomerge);
+__setup("slab_merge", setup_slab_merge);
/*
* Determine the size of a slab object
@@ -89,8 +97,7 @@ EXPORT_SYMBOL(kmem_cache_size);
#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
- if (!name || in_interrupt() || size < sizeof(void *) ||
- size > KMALLOC_MAX_SIZE) {
+ if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
pr_err("kmem_cache_create(%s) integrity check failed\n", name);
return -EINVAL;
}
@@ -310,6 +317,16 @@ kmem_cache_create_usercopy(const char *name,
const char *cache_name;
int err;
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * If no slub_debug was enabled globally, the static key is not yet
+ * enabled by setup_slub_debug(). Enable it if the cache is being
+ * created with any of the debugging flags passed explicitly.
+ */
+ if (flags & SLAB_DEBUG_FLAGS)
+ static_branch_enable(&slub_debug_enabled);
+#endif
+
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
@@ -360,11 +377,11 @@ out_unlock:
if (err) {
if (flags & SLAB_PANIC)
- panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
- name, err);
+ panic("%s: Failed to create slab '%s'. Error %d\n",
+ __func__, name, err);
else {
- pr_warn("kmem_cache_create(%s) failed with error %d\n",
- name, err);
+ pr_warn("%s(%s) failed with error %d\n",
+ __func__, name, err);
dump_stack();
}
return NULL;
@@ -431,6 +448,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
rcu_barrier();
list_for_each_entry_safe(s, s2, &to_destroy, list) {
+ debugfs_slab_release(s);
kfence_shutdown_cache(s);
#ifdef SLAB_SUPPORTS_SYSFS
sysfs_slab_release(s);
@@ -458,6 +476,7 @@ static int shutdown_cache(struct kmem_cache *s)
schedule_work(&slab_caches_to_rcu_destroy_work);
} else {
kfence_shutdown_cache(s);
+ debugfs_slab_release(s);
#ifdef SLAB_SUPPORTS_SYSFS
sysfs_slab_unlink(s);
sysfs_slab_release(s);
@@ -491,8 +510,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
err = shutdown_cache(s);
if (err) {
- pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
- s->name);
+ pr_err("%s %s: Slab cache still has objects\n",
+ __func__, s->name);
dump_stack();
}
out_unlock:
@@ -526,6 +545,7 @@ bool slab_is_available(void)
return slab_state >= UP;
}
+#ifdef CONFIG_PRINTK
/**
* kmem_valid_obj - does the pointer reference a valid slab object?
* @object: pointer to query.
@@ -544,6 +564,7 @@ bool kmem_valid_obj(void *object)
page = virt_to_head_page(object);
return PageSlab(page);
}
+EXPORT_SYMBOL_GPL(kmem_valid_obj);
/**
* kmem_dump_obj - Print available slab provenance information
@@ -600,6 +621,8 @@ void kmem_dump_obj(void *object)
pr_info(" %pS\n", kp.kp_stack[i]);
}
}
+EXPORT_SYMBOL_GPL(kmem_dump_obj);
+#endif
#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
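
Editor's note: with kmem_valid_obj() and kmem_dump_obj() now exported (and built only under CONFIG_PRINTK), a module's debug path can report slab provenance for a suspect pointer. A minimal sketch; mem_dump_obj() is the more general front end, the slab-specific pair is shown directly here only for illustration:

#include <linux/printk.h>
#include <linux/slab.h>

static void example_report_pointer(void *p)
{
        if (kmem_valid_obj(p))
                kmem_dump_obj(p);       /* cache name, offset, allocation stack */
        else
                pr_info("%px is not a slab object\n", p);
}
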
@@ -715,26 +738,30 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
}
#ifdef CONFIG_ZONE_DMA
-#define INIT_KMALLOC_INFO(__size, __short_size) \
-{ \
- .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
- .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
- .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \
- .size = __size, \
-}
+#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
+#else
+#define KMALLOC_DMA_NAME(sz)
+#endif
+
+#ifdef CONFIG_MEMCG_KMEM
+#define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
#else
+#define KMALLOC_CGROUP_NAME(sz)
+#endif
+
#define INIT_KMALLOC_INFO(__size, __short_size) \
{ \
.name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
+ KMALLOC_CGROUP_NAME(__short_size) \
+ KMALLOC_DMA_NAME(__short_size) \
.size = __size, \
}
-#endif
/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
- * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
- * kmalloc-67108864.
+ * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
+ * kmalloc-32M.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(0, 0),
@@ -762,8 +789,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(4194304, 4M),
INIT_KMALLOC_INFO(8388608, 8M),
INIT_KMALLOC_INFO(16777216, 16M),
- INIT_KMALLOC_INFO(33554432, 32M),
- INIT_KMALLOC_INFO(67108864, 64M)
+ INIT_KMALLOC_INFO(33554432, 32M)
};
/*
@@ -816,13 +842,27 @@ void __init setup_kmalloc_cache_index_table(void)
static void __init
new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
{
- if (type == KMALLOC_RECLAIM)
+ if (type == KMALLOC_RECLAIM) {
flags |= SLAB_RECLAIM_ACCOUNT;
+ } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
+ if (cgroup_memory_nokmem) {
+ kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
+ return;
+ }
+ flags |= SLAB_ACCOUNT;
+ }
kmalloc_caches[type][idx] = create_kmalloc_cache(
kmalloc_info[idx].name[type],
kmalloc_info[idx].size, flags, 0,
kmalloc_info[idx].size);
+
+ /*
+ * If CONFIG_MEMCG_KMEM is enabled, disable cache merging for
+ * KMALLOC_NORMAL caches.
+ */
+ if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL))
+ kmalloc_caches[type][idx]->refcount = -1;
}
/*
@@ -835,6 +875,9 @@ void __init create_kmalloc_caches(slab_flags_t flags)
int i;
enum kmalloc_cache_type type;
+ /*
+ * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
+ */
for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
if (!kmalloc_caches[type][i])
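
Editor's note: for reference, with both CONFIG_ZONE_DMA and CONFIG_MEMCG_KMEM enabled, the reworked macro above expands an entry such as INIT_KMALLOC_INFO(4096, 4k) to roughly the following initializer (a worked expansion, not text from the patch):

{
        .name[KMALLOC_NORMAL]   = "kmalloc-4k",
        .name[KMALLOC_RECLAIM]  = "kmalloc-rcl-4k",
        .name[KMALLOC_CGROUP]   = "kmalloc-cg-4k",
        .name[KMALLOC_DMA]      = "dma-kmalloc-4k",
        .size                   = 4096,
}
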
diff --git a/mm/slob.c b/mm/slob.c
index 0578429b991b..74d3f6e60666 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -461,11 +461,13 @@ out:
spin_unlock_irqrestore(&slob_lock, flags);
}
+#ifdef CONFIG_PRINTK
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
{
kpp->kp_ptr = object;
kpp->kp_page = page;
}
+#endif
/*
* End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
diff --git a/mm/slub.c b/mm/slub.c
index 3021ce9bf1b3..3bc8b940c933 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3,7 +3,7 @@
* SLUB: A slab allocator that limits cache line use instead of queuing
* objects in per cpu and per node lists.
*
- * The allocator synchronizes using per slab locks or atomic operatios
+ * The allocator synchronizes using per slab locks or atomic operations
* and only uses a centralized lock to manage a pool of partial slabs.
*
* (C) 2007 SGI, Christoph Lameter
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
+#include <linux/swab.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
@@ -35,7 +36,9 @@
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
#include <linux/random.h>
+#include <kunit/test.h>
+#include <linux/debugfs.h>
#include <trace/events/kmem.h>
#include "internal.h"
@@ -116,12 +119,26 @@
*/
#ifdef CONFIG_SLUB_DEBUG
+
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
-#endif
+
+static inline bool __slub_debug_enabled(void)
+{
+ return static_branch_unlikely(&slub_debug_enabled);
+}
+
+#else /* CONFIG_SLUB_DEBUG */
+
+static inline bool __slub_debug_enabled(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_SLUB_DEBUG */
static inline bool kmem_cache_debug(struct kmem_cache *s)
{
@@ -153,14 +170,11 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
* - Variable sizing of the per node arrays
*/
-/* Enable to test recovery from slab corruption on boot */
-#undef SLUB_RESILIENCY_TEST
-
/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG
/*
- * Mininum number of partial slabs. These will be left on the partial
+ * Minimum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
*/
#define MIN_PARTIAL 5
@@ -225,6 +239,12 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
#endif
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
+static void debugfs_slab_add(struct kmem_cache *);
+#else
+static inline void debugfs_slab_add(struct kmem_cache *s) { }
+#endif
+
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
@@ -301,6 +321,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
if (!debug_pagealloc_enabled_static())
return get_freepointer(s, object);
+ object = kasan_reset_tag(object);
freepointer_addr = (unsigned long)object + s->offset;
copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
return freelist_ptr(s, p, freepointer_addr);
@@ -447,6 +468,26 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);
+#if IS_ENABLED(CONFIG_KUNIT)
+static bool slab_add_kunit_errors(void)
+{
+ struct kunit_resource *resource;
+
+ if (likely(!current->kunit_test))
+ return false;
+
+ resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
+ if (!resource)
+ return false;
+
+ (*(int *)resource->data)++;
+ kunit_put_resource(resource);
+ return true;
+}
+#else
+static inline bool slab_add_kunit_errors(void) { return false; }
+#endif
+
/*
* Determine a map of object in use on a page.
*
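
Editor's note: the slab_add_kunit_errors() hook above only counts when the running KUnit test has registered a named "slab_errors" resource. A hedged sketch of the test side, modelled loosely on lib/slub_kunit.c; the cache name and expectations are illustrative, and a KASAN build may also need kasan_disable_current() around the bad write:

#include <kunit/test.h>
#include <linux/slab.h>

/* Declared in mm/slab.h by the hunks in this series. */
long validate_slab_cache(struct kmem_cache *s);

static struct kunit_resource resource;
static int slab_errors;

static int test_init(struct kunit *test)
{
        slab_errors = 0;
        kunit_add_named_resource(test, NULL, NULL, &resource,
                                 "slab_errors", &slab_errors);
        return 0;
}

static void test_clobber_redzone(struct kunit *test)
{
        struct kmem_cache *s = kmem_cache_create("TestSlub_RZ", 64, 0,
                                                 SLAB_RED_ZONE, NULL);
        u8 *p = kmem_cache_alloc(s, GFP_KERNEL);

        p[64] = 0x12;                   /* scribble over the right redzone */
        validate_slab_cache(s);         /* detects and repairs the damage */
        KUNIT_EXPECT_NE(test, 0, slab_errors);

        kmem_cache_free(s, p);
        kmem_cache_destroy(s);
}
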
@@ -624,7 +665,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time)
if (!t->addr)
return;
- pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+ pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
{
@@ -650,8 +691,9 @@ void print_tracking(struct kmem_cache *s, void *object)
static void print_page_info(struct page *page)
{
- pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
- page, page->objects, page->inuse, page->freelist, page->flags);
+ pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n",
+ page, page->objects, page->inuse, page->freelist,
+ page->flags, &page->flags);
}
@@ -666,16 +708,18 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
pr_err("=============================================================================\n");
pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
pr_err("-----------------------------------------------------------------------------\n\n");
-
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
va_end(args);
}
+__printf(2, 3)
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
struct va_format vaf;
va_list args;
+ if (slab_add_kunit_errors())
+ return;
+
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
@@ -706,19 +750,19 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
print_page_info(page);
- pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+ pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
s->red_left_pad);
else if (p > addr + 16)
print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
- print_section(KERN_ERR, "Object ", p,
+ print_section(KERN_ERR, "Object ", p,
min_t(unsigned int, s->object_size, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p + s->object_size,
+ print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
off = get_info_end(s);
@@ -730,7 +774,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section(KERN_ERR, "Padding ", p + off,
+ print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
dump_stack();
@@ -739,8 +783,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason)
{
+ if (slab_add_kunit_errors())
+ return;
+
slab_bug(s, "%s", reason);
print_trailer(s, page, object);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
@@ -749,12 +797,16 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
va_list args;
char buf[100];
+ if (slab_add_kunit_errors())
+ return;
+
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
slab_bug(s, "%s", buf);
print_page_info(page);
dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
@@ -776,7 +828,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
void *from, void *to)
{
- slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
+ slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
memset(from, data, to - from);
}
@@ -798,12 +850,17 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
while (end > fault && end[-1] == value)
end--;
+ if (slab_add_kunit_errors())
+ goto skip_bug_print;
+
slab_bug(s, "%s overwritten", what);
- pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
fault, end - 1, fault - addr,
fault[0], value);
print_trailer(s, page, object);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+skip_bug_print:
restore_bytes(s, what, value, fault, end);
return 0;
}
@@ -832,7 +889,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
*
* A. Free pointer (if we cannot overwrite object on free)
* B. Tracking data for SLAB_STORE_USER
- * C. Padding to reach required alignment boundary or at mininum
+ * C. Padding to reach required alignment boundary or at minimum
* one word if debugging is on to be able to detect writes
* before the word boundary.
*
@@ -907,11 +964,11 @@ static int check_object(struct kmem_cache *s, struct page *page,
u8 *endobject = object + s->object_size;
if (s->flags & SLAB_RED_ZONE) {
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, page, object, "Left Redzone",
object - s->red_left_pad, val, s->red_left_pad))
return 0;
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, page, object, "Right Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
@@ -926,7 +983,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
(!check_bytes_and_report(s, page, p, "Poison", p,
POISON_FREE, s->object_size - 1) ||
- !check_bytes_and_report(s, page, p, "Poison",
+ !check_bytes_and_report(s, page, p, "End Poison",
p + s->object_size - 1, POISON_END, 1)))
return 0;
/*
@@ -1025,13 +1082,13 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
page->objects, max_objects);
page->objects = max_objects;
- slab_fix(s, "Number of objects adjusted.");
+ slab_fix(s, "Number of objects adjusted");
}
if (page->inuse != page->objects - nr) {
slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
page->inuse, page->objects - nr);
page->inuse = page->objects - nr;
- slab_fix(s, "Object count adjusted.");
+ slab_fix(s, "Object count adjusted");
}
return search == NULL;
}
@@ -1395,6 +1452,8 @@ static int __init setup_slub_debug(char *str)
out:
if (slub_debug != 0 || slub_debug_string)
static_branch_enable(&slub_debug_enabled);
+ else
+ static_branch_disable(&slub_debug_enabled);
if ((static_branch_unlikely(&init_on_alloc) ||
static_branch_unlikely(&init_on_free)) &&
(slub_debug & SLAB_POISON))
@@ -1532,7 +1591,8 @@ static __always_inline void kfree_hook(void *x)
kasan_kfree_large(x);
}
-static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
+static __always_inline bool slab_free_hook(struct kmem_cache *s,
+ void *x, bool init)
{
kmemleak_free_recursive(x, s->flags);
@@ -1558,8 +1618,25 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
__kcsan_check_access(x, s->object_size,
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
- /* KASAN might put x into memory quarantine, delaying its reuse */
- return kasan_slab_free(s, x);
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_free and initialization memset's must be
+ * kept together to avoid discrepancies in behavior.
+ *
+ * The initialization memset's clear the object and the metadata,
+ * but don't touch the SLAB redzone.
+ */
+ if (init) {
+ int rsize;
+
+ if (!kasan_has_integrated_init())
+ memset(kasan_reset_tag(x), 0, s->object_size);
+ rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
+ memset((char *)kasan_reset_tag(x) + s->inuse, 0,
+ s->size - s->inuse - rsize);
+ }
+ /* KASAN might put x into memory quarantine, delaying its reuse. */
+ return kasan_slab_free(s, x, init);
}
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
@@ -1569,10 +1646,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
void *object;
void *next = *head;
void *old_tail = *tail ? *tail : *head;
- int rsize;
if (is_kfence_address(next)) {
- slab_free_hook(s, next);
+ slab_free_hook(s, next, false);
return true;
}
@@ -1584,20 +1660,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
object = next;
next = get_freepointer(s, object);
- if (slab_want_init_on_free(s)) {
- /*
- * Clear the object and the metadata, but don't touch
- * the redzone.
- */
- memset(kasan_reset_tag(object), 0, s->object_size);
- rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
- : 0;
- memset((char *)kasan_reset_tag(object) + s->inuse, 0,
- s->size - s->inuse - rsize);
-
- }
/* If object's reuse doesn't have to be delayed */
- if (!slab_free_hook(s, object)) {
+ if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
/* Move object to the new freelist */
set_freepointer(s, object, *head);
*head = object;
@@ -2822,6 +2886,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct page *page;
unsigned long tid;
struct obj_cgroup *objcg = NULL;
+ bool init = false;
s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
if (!s)
@@ -2899,12 +2964,10 @@ redo:
}
maybe_wipe_obj_freeptr(s, object);
-
- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
- memset(kasan_reset_tag(object), 0, s->object_size);
+ init = slab_want_init_on_alloc(gfpflags, s);
out:
- slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
return object;
}
@@ -3236,7 +3299,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
}
if (is_kfence_address(object)) {
- slab_free_hook(df->s, object);
+ slab_free_hook(df->s, object, false);
__kfence_free(object);
p[size] = NULL; /* mark object processed */
return size;
@@ -3356,20 +3419,16 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
c->tid = next_tid(c->tid);
local_irq_enable();
- /* Clear memory outside IRQ disabled fastpath loop */
- if (unlikely(slab_want_init_on_alloc(flags, s))) {
- int j;
-
- for (j = 0; j < i; j++)
- memset(kasan_reset_tag(p[j]), 0, s->object_size);
- }
-
- /* memcg and kmem_cache debug support */
- slab_post_alloc_hook(s, objcg, flags, size, p);
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled fastpath loop.
+ */
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s));
return i;
error:
local_irq_enable();
- slab_post_alloc_hook(s, objcg, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3390,7 +3449,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk);
*/
/*
- * Mininum / Maximum order of slab pages. This influences locking overhead
+ * Minimum / Maximum order of slab pages. This influences locking overhead
* and slab fragmentation. A higher order reduces the number of partial slabs
* and increases the number of allocations possible without having to
* take the list_lock.
@@ -3421,7 +3480,7 @@ static unsigned int slub_min_objects;
*
* Higher order allocations also allow the placement of more objects in a
* slab and thereby reduce object handling overhead. If the user has
- * requested a higher mininum order then we start with that one instead of
+ * requested a higher minimum order then we start with that one instead of
* the smallest order which will fit the object.
*/
static inline unsigned int slab_order(unsigned int size,
@@ -3579,7 +3638,7 @@ static void early_kmem_cache_node_alloc(int node)
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL);
+ n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
page->freelist = get_freepointer(kmem_cache_node, n);
page->inuse = 1;
page->frozen = 0;
@@ -3687,7 +3746,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
- unsigned int freepointer_area;
unsigned int order;
/*
@@ -3696,13 +3754,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* the possible location of the free pointer.
*/
size = ALIGN(size, sizeof(void *));
- /*
- * This is the area of the object where a freepointer can be
- * safely written. If redzoning adds more to the inuse size, we
- * can't use that portion for writing the freepointer, so
- * s->offset must be limited within this for the general case.
- */
- freepointer_area = size;
#ifdef CONFIG_SLUB_DEBUG
/*
@@ -3728,19 +3779,21 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
/*
* With that we have determined the number of bytes in actual use
- * by the object. This is the potential offset to the free pointer.
+ * by the object and redzoning.
*/
s->inuse = size;
- if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
- s->ctor)) {
+ if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+ ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
+ s->ctor) {
/*
* Relocate free pointer after the object if it is not
* permitted to overwrite the first word of the object on
* kmem_cache_free.
*
* This is the case if we do RCU, have a constructor or
- * destructor or are poisoning the objects.
+ * destructor, are poisoning the objects, or are
+ * redzoning an object smaller than sizeof(void *).
*
* The assumption that s->offset >= s->inuse means free
* pointer is outside of the object is used in the
@@ -3749,13 +3802,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
s->offset = size;
size += sizeof(void *);
- } else if (freepointer_area > sizeof(void *)) {
+ } else {
/*
* Store freelist pointer near middle of object to keep
* it away from the edges of the object to avoid small
* sized over/underflows from neighboring allocations.
*/
- s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
+ s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
}
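A quick worked example of the new placement: for a hypothetical 12-byte cache with no constructor, ALIGN_DOWN(12 / 2, 8) is 0, so the 8-byte free pointer stays inside the object, whereas the old ALIGN() on freepointer_area would have placed it at offset 8 and let it spill past object_size into the right redzone. A throwaway userspace check (ALIGN_DOWN re-declared here for power-of-two alignment only):

#include <assert.h>
#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

int main(void)
{
	unsigned int object_size = 12;	/* hypothetical cache, no ctor/RCU/poison */
	unsigned int offset = ALIGN_DOWN(object_size / 2, sizeof(void *));

	/* The free pointer must not cross into the redzone. */
	assert(offset + sizeof(void *) <= object_size);
	printf("free pointer at offset %u\n", offset);
	return 0;
}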
#ifdef CONFIG_SLUB_DEBUG
@@ -3898,7 +3951,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
for_each_object(p, s, addr, page->objects) {
if (!test_bit(__obj_to_index(s, addr, p), map)) {
- pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
+ pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
}
@@ -3963,6 +4016,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
return 0;
}
+#ifdef CONFIG_PRINTK
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
{
void *base;
@@ -4002,6 +4056,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
#endif
#endif
}
+#endif
/********************************************************************
* Kmalloc subsystem
@@ -4454,6 +4509,10 @@ void __init kmem_cache_init(void)
if (debug_guardpage_minorder())
slub_max_order = 0;
+ /* Print slub debugging pointers without hashing */
+ if (__slub_debug_enabled())
+ no_hash_pointers_enable(NULL);
+
kmem_cache_node = &boot_kmem_cache_node;
kmem_cache = &boot_kmem_cache;
@@ -4542,6 +4601,9 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
if (err)
__kmem_cache_release(s);
+ if (s->flags & SLAB_STORE_USER)
+ debugfs_slab_add(s);
+
return err;
}
@@ -4650,9 +4712,11 @@ static int validate_slab_node(struct kmem_cache *s,
validate_slab(s, page);
count++;
}
- if (count != n->nr_partial)
+ if (count != n->nr_partial) {
pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
s->name, count, n->nr_partial);
+ slab_add_kunit_errors();
+ }
if (!(s->flags & SLAB_STORE_USER))
goto out;
@@ -4661,16 +4725,18 @@ static int validate_slab_node(struct kmem_cache *s,
validate_slab(s, page);
count++;
}
- if (count != atomic_long_read(&n->nr_slabs))
+ if (count != atomic_long_read(&n->nr_slabs)) {
pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
s->name, count, atomic_long_read(&n->nr_slabs));
+ slab_add_kunit_errors();
+ }
out:
spin_unlock_irqrestore(&n->list_lock, flags);
return count;
}
-static long validate_slab_cache(struct kmem_cache *s)
+long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
@@ -4682,6 +4748,9 @@ static long validate_slab_cache(struct kmem_cache *s)
return count;
}
+EXPORT_SYMBOL(validate_slab_cache);
+
+#ifdef CONFIG_DEBUG_FS
/*
* Generate lists of code addresses where slabcache objects are allocated
* and freed.
@@ -4705,6 +4774,8 @@ struct loc_track {
struct location *loc;
};
+static struct dentry *slab_debugfs_root;
+
static void free_loc_track(struct loc_track *t)
{
if (t->max)
@@ -4821,144 +4892,9 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
add_location(t, s, get_track(s, p, alloc));
put_map(map);
}
-
-static int list_locations(struct kmem_cache *s, char *buf,
- enum track_item alloc)
-{
- int len = 0;
- unsigned long i;
- struct loc_track t = { 0, 0, NULL };
- int node;
- struct kmem_cache_node *n;
-
- if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
- GFP_KERNEL)) {
- return sysfs_emit(buf, "Out of memory\n");
- }
- /* Push back cpu slabs */
- flush_all(s);
-
- for_each_kmem_cache_node(s, node, n) {
- unsigned long flags;
- struct page *page;
-
- if (!atomic_long_read(&n->nr_slabs))
- continue;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, slab_list)
- process_slab(&t, s, page, alloc);
- list_for_each_entry(page, &n->full, slab_list)
- process_slab(&t, s, page, alloc);
- spin_unlock_irqrestore(&n->list_lock, flags);
- }
-
- for (i = 0; i < t.count; i++) {
- struct location *l = &t.loc[i];
-
- len += sysfs_emit_at(buf, len, "%7ld ", l->count);
-
- if (l->addr)
- len += sysfs_emit_at(buf, len, "%pS", (void *)l->addr);
- else
- len += sysfs_emit_at(buf, len, "<not-available>");
-
- if (l->sum_time != l->min_time)
- len += sysfs_emit_at(buf, len, " age=%ld/%ld/%ld",
- l->min_time,
- (long)div_u64(l->sum_time,
- l->count),
- l->max_time);
- else
- len += sysfs_emit_at(buf, len, " age=%ld", l->min_time);
-
- if (l->min_pid != l->max_pid)
- len += sysfs_emit_at(buf, len, " pid=%ld-%ld",
- l->min_pid, l->max_pid);
- else
- len += sysfs_emit_at(buf, len, " pid=%ld",
- l->min_pid);
-
- if (num_online_cpus() > 1 &&
- !cpumask_empty(to_cpumask(l->cpus)))
- len += sysfs_emit_at(buf, len, " cpus=%*pbl",
- cpumask_pr_args(to_cpumask(l->cpus)));
-
- if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
- len += sysfs_emit_at(buf, len, " nodes=%*pbl",
- nodemask_pr_args(&l->nodes));
-
- len += sysfs_emit_at(buf, len, "\n");
- }
-
- free_loc_track(&t);
- if (!t.count)
- len += sysfs_emit_at(buf, len, "No data\n");
-
- return len;
-}
+#endif /* CONFIG_DEBUG_FS */
#endif /* CONFIG_SLUB_DEBUG */
-#ifdef SLUB_RESILIENCY_TEST
-static void __init resiliency_test(void)
-{
- u8 *p;
- int type = KMALLOC_NORMAL;
-
- BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
-
- pr_err("SLUB resiliency testing\n");
- pr_err("-----------------------\n");
- pr_err("A. Corruption after allocation\n");
-
- p = kzalloc(16, GFP_KERNEL);
- p[16] = 0x12;
- pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
- p + 16);
-
- validate_slab_cache(kmalloc_caches[type][4]);
-
- /* Hmmm... The next two are dangerous */
- p = kzalloc(32, GFP_KERNEL);
- p[32 + sizeof(void *)] = 0x34;
- pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
- p);
- pr_err("If allocated object is overwritten then not detectable\n\n");
-
- validate_slab_cache(kmalloc_caches[type][5]);
- p = kzalloc(64, GFP_KERNEL);
- p += 64 + (get_cycles() & 0xff) * sizeof(void *);
- *p = 0x56;
- pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
- p);
- pr_err("If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches[type][6]);
-
- pr_err("\nB. Corruption after free\n");
- p = kzalloc(128, GFP_KERNEL);
- kfree(p);
- *p = 0x78;
- pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[type][7]);
-
- p = kzalloc(256, GFP_KERNEL);
- kfree(p);
- p[50] = 0x9a;
- pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[type][8]);
-
- p = kzalloc(512, GFP_KERNEL);
- kfree(p);
- p[512] = 0xab;
- pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[type][9]);
-}
-#else
-#ifdef CONFIG_SYSFS
-static void resiliency_test(void) {};
-#endif
-#endif /* SLUB_RESILIENCY_TEST */
-
#ifdef CONFIG_SYSFS
enum slab_stat_type {
SL_ALL, /* All slabs */
@@ -5346,21 +5282,6 @@ static ssize_t validate_store(struct kmem_cache *s,
}
SLAB_ATTR(validate);
-static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_ALLOC);
-}
-SLAB_ATTR_RO(alloc_calls);
-
-static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_FREE);
-}
-SLAB_ATTR_RO(free_calls);
#endif /* CONFIG_SLUB_DEBUG */
#ifdef CONFIG_FAILSLAB
@@ -5524,8 +5445,6 @@ static struct attribute *slab_attrs[] = {
&poison_attr.attr,
&store_user_attr.attr,
&validate_attr.attr,
- &alloc_calls_attr.attr,
- &free_calls_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
@@ -5807,13 +5726,179 @@ static int __init slab_sysfs_init(void)
}
mutex_unlock(&slab_mutex);
- resiliency_test();
return 0;
}
__initcall(slab_sysfs_init);
#endif /* CONFIG_SYSFS */
+#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
+static int slab_debugfs_show(struct seq_file *seq, void *v)
+{
+
+ struct location *l;
+ unsigned int idx = *(unsigned int *)v;
+ struct loc_track *t = seq->private;
+
+ if (idx < t->count) {
+ l = &t->loc[idx];
+
+ seq_printf(seq, "%7ld ", l->count);
+
+ if (l->addr)
+ seq_printf(seq, "%pS", (void *)l->addr);
+ else
+ seq_puts(seq, "<not-available>");
+
+ if (l->sum_time != l->min_time) {
+ seq_printf(seq, " age=%ld/%llu/%ld",
+ l->min_time, div_u64(l->sum_time, l->count),
+ l->max_time);
+ } else
+ seq_printf(seq, " age=%ld", l->min_time);
+
+ if (l->min_pid != l->max_pid)
+ seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
+ else
+ seq_printf(seq, " pid=%ld",
+ l->min_pid);
+
+ if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
+ seq_printf(seq, " cpus=%*pbl",
+ cpumask_pr_args(to_cpumask(l->cpus)));
+
+ if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
+ seq_printf(seq, " nodes=%*pbl",
+ nodemask_pr_args(&l->nodes));
+
+ seq_puts(seq, "\n");
+ }
+
+ if (!idx && !t->count)
+ seq_puts(seq, "No data\n");
+
+ return 0;
+}
+
+static void slab_debugfs_stop(struct seq_file *seq, void *v)
+{
+}
+
+static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+ struct loc_track *t = seq->private;
+
+ v = ppos;
+ ++*ppos;
+ if (*ppos <= t->count)
+ return v;
+
+ return NULL;
+}
+
+static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
+{
+ return ppos;
+}
+
+static const struct seq_operations slab_debugfs_sops = {
+ .start = slab_debugfs_start,
+ .next = slab_debugfs_next,
+ .stop = slab_debugfs_stop,
+ .show = slab_debugfs_show,
+};
+
+static int slab_debug_trace_open(struct inode *inode, struct file *filep)
+{
+
+ struct kmem_cache_node *n;
+ enum track_item alloc;
+ int node;
+ struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
+ sizeof(struct loc_track));
+ struct kmem_cache *s = file_inode(filep)->i_private;
+
+ if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
+ alloc = TRACK_ALLOC;
+ else
+ alloc = TRACK_FREE;
+
+ if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
+ seq_release_private(inode, filep);
+ return -ENOMEM;
+ }
+
+ /* Push back cpu slabs */
+ flush_all(s);
+
+ for_each_kmem_cache_node(s, node, n) {
+ unsigned long flags;
+ struct page *page;
+
+ if (!atomic_long_read(&n->nr_slabs))
+ continue;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->partial, slab_list)
+ process_slab(t, s, page, alloc);
+ list_for_each_entry(page, &n->full, slab_list)
+ process_slab(t, s, page, alloc);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
+
+ return 0;
+}
+
+static int slab_debug_trace_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct loc_track *t = seq->private;
+
+ free_loc_track(t);
+ return seq_release_private(inode, file);
+}
+
+static const struct file_operations slab_debugfs_fops = {
+ .open = slab_debug_trace_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = slab_debug_trace_release,
+};
+
+static void debugfs_slab_add(struct kmem_cache *s)
+{
+ struct dentry *slab_cache_dir;
+
+ if (unlikely(!slab_debugfs_root))
+ return;
+
+ slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
+
+ debugfs_create_file("alloc_traces", 0400,
+ slab_cache_dir, s, &slab_debugfs_fops);
+
+ debugfs_create_file("free_traces", 0400,
+ slab_cache_dir, s, &slab_debugfs_fops);
+}
+
+void debugfs_slab_release(struct kmem_cache *s)
+{
+ debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root));
+}
+
+static int __init slab_debugfs_init(void)
+{
+ struct kmem_cache *s;
+
+ slab_debugfs_root = debugfs_create_dir("slab", NULL);
+
+ list_for_each_entry(s, &slab_caches, list)
+ if (s->flags & SLAB_STORE_USER)
+ debugfs_slab_add(s);
+
+ return 0;
+
+}
+__initcall(slab_debugfs_init);
+#endif
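Once a cache stores user tracking (e.g. booted with slub_debug=U,kmalloc-64), the new files can be read from userspace; a throwaway reader, assuming debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/sys/kernel/debug/slab/kmalloc-64/alloc_traces", "r");

	if (!f) {
		perror("alloc_traces");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* one "count addr age pid cpus nodes" record per line */
	fclose(f);
	return 0;
}

free_traces is read the same way, backed by the TRACK_FREE records instead.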
/*
* The /proc/slabinfo ABI
*/
diff --git a/mm/sparse.c b/mm/sparse.c
index 7bd23f9d6cef..7272f7a1449d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -257,7 +257,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
if (unlikely(!mem_section)) {
unsigned long size, align;
- size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
+ size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
align = 1 << (INTERNODE_CACHE_SHIFT);
mem_section = memblock_alloc(size, align);
if (!mem_section)
@@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
return sizeof(struct mem_section_usage) + usemap_size();
}
+static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
+{
+#ifndef CONFIG_NUMA
+ return __pa_symbol(pgdat);
+#else
+ return __pa(pgdat);
+#endif
+}
+
#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
@@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
* from the same section as the pgdat where possible to avoid
* this problem.
*/
- goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+ goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
@@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
}
usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
- pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+ pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
return;
@@ -547,6 +556,7 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
__func__, nid);
pnum_begin = pnum;
+ sparse_buffer_fini();
goto failed;
}
check_usemap_section_nr(nid, usage);
@@ -623,7 +633,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
}
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -644,7 +653,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
ms->section_mem_map &= ~SECTION_IS_ONLINE;
}
}
-#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page * __meminit populate_section_memmap(unsigned long pfn,
diff --git a/mm/swap.c b/mm/swap.c
index 31b844d4ed94..6c11db780467 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -36,6 +36,7 @@
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
+#include <linux/buffer_head.h>
#include "internal.h"
@@ -94,7 +95,7 @@ static void __put_single_page(struct page *page)
{
__page_cache_release(page);
mem_cgroup_uncharge(page);
- free_unref_page(page);
+ free_unref_page(page, 0);
}
static void __put_compound_page(struct page *page)
@@ -235,6 +236,18 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
}
}
+/* return true if the pagevec needs to be drained */
+static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
+{
+ bool ret = false;
+
+ if (!pagevec_add(pvec, page) || PageCompound(page) ||
+ lru_cache_disabled())
+ ret = true;
+
+ return ret;
+}
+
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
@@ -252,7 +265,7 @@ void rotate_reclaimable_page(struct page *page)
get_page(page);
local_lock_irqsave(&lru_rotate.lock, flags);
pvec = this_cpu_ptr(&lru_rotate.pvec);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
@@ -300,7 +313,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
void lru_note_cost_page(struct page *page)
{
- lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
+ lru_note_cost(mem_cgroup_page_lruvec(page),
page_is_file_lru(page), thp_nr_pages(page));
}
@@ -343,7 +356,7 @@ static void activate_page(struct page *page)
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.activate_page);
get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, __activate_page);
local_unlock(&lru_pvecs.lock);
}
@@ -458,7 +471,7 @@ void lru_cache_add(struct page *page)
get_page(page);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
__pagevec_lru_add(pvec);
local_unlock(&lru_pvecs.lock);
}
@@ -483,7 +496,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
int nr_pages = thp_nr_pages(page);
/*
- * We use the irq-unsafe __mod_zone_page_stat because this
+ * We use the irq-unsafe __mod_zone_page_state because this
* counter is not modified from interrupt context, and the pte
* lock is held(spinlock), which implies preemption disabled.
*/
@@ -629,6 +642,7 @@ void lru_add_drain_cpu(int cpu)
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
activate_page_drain(cpu);
+ invalidate_bh_lrus_cpu(cpu);
}
/**
@@ -654,7 +668,7 @@ void deactivate_file_page(struct page *page)
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
local_unlock(&lru_pvecs.lock);
}
@@ -676,7 +690,7 @@ void deactivate_page(struct page *page)
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_fn);
local_unlock(&lru_pvecs.lock);
}
@@ -698,7 +712,7 @@ void mark_page_lazyfree(struct page *page)
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
+ if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
local_unlock(&lru_pvecs.lock);
}
@@ -735,7 +749,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
* Calling this function with cpu hotplug locks held can actually lead
* to obscure indirect dependencies via WQ context.
*/
-void lru_add_drain_all(void)
+inline void __lru_add_drain_all(bool force_all_cpus)
{
/*
* lru_drain_gen - Global pages generation number
@@ -780,7 +794,7 @@ void lru_add_drain_all(void)
* (C) Exit the draining operation if a newer generation, from another
* lru_add_drain_all(), was already scheduled for draining. Check (A).
*/
- if (unlikely(this_gen != lru_drain_gen))
+ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
goto done;
/*
@@ -794,7 +808,7 @@ void lru_add_drain_all(void)
* below which drains the page vectors.
*
* Let x, y, and z represent some system CPU numbers, where x < y < z.
- * Assume CPU #z is is in the middle of the for_each_online_cpu loop
+ * Assume CPU #z is in the middle of the for_each_online_cpu loop
* below and has already reached CPU #y's per-cpu data. CPU #x comes
* along, adds some pages to its per-cpu vectors, then calls
* lru_add_drain_all().
@@ -810,12 +824,14 @@ void lru_add_drain_all(void)
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
- if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
+ if (force_all_cpus ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
- need_activate_page_drain(cpu)) {
+ need_activate_page_drain(cpu) ||
+ has_bh_in_lru(cpu, NULL)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
__cpumask_set_cpu(cpu, &has_work);
@@ -828,6 +844,11 @@ void lru_add_drain_all(void)
done:
mutex_unlock(&lock);
}
+
+void lru_add_drain_all(void)
+{
+ __lru_add_drain_all(false);
+}
#else
void lru_add_drain_all(void)
{
@@ -835,6 +856,34 @@ void lru_add_drain_all(void)
}
#endif /* CONFIG_SMP */
+atomic_t lru_disable_count = ATOMIC_INIT(0);
+
+/*
+ * lru_cache_disable() needs to be called before we start compiling
+ * a list of pages to be migrated using isolate_lru_page().
+ * It drains the pages on the LRU caches and then disables them on all
+ * CPUs until lru_cache_enable() is called.
+ *
+ * Must be paired with a call to lru_cache_enable().
+ */
+void lru_cache_disable(void)
+{
+ atomic_inc(&lru_disable_count);
+#ifdef CONFIG_SMP
+ /*
+ * lru_add_drain_all() in force mode will schedule draining on all
+ * online CPUs, so any calls of lru_cache_disabled() wrapped by
+ * local_lock or with preemption disabled are ordered by that.
+ * The atomic operation doesn't need stronger ordering
+ * requirements because that is enforced by the scheduling
+ * guarantees.
+ */
+ __lru_add_drain_all(true);
+#else
+ lru_add_drain();
+#endif
+}
+
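The header-side counterparts are not part of this hunk; roughly what include/linux/swap.h is assumed to gain in this series:

extern atomic_t lru_disable_count;

static inline bool lru_cache_disabled(void)
{
	return atomic_read(&lru_disable_count);
}

/* Pairs with lru_cache_disable(); re-enables per-CPU LRU batching. */
static inline void lru_cache_enable(void)
{
	atomic_dec(&lru_disable_count);
}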
/**
* release_pages - batched put_page()
* @pages: array of pages to release
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index be9de6d5b516..a66f3e0ec973 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -16,7 +16,7 @@
* to local caches without needing to acquire swap_info
* lock. We do not reuse the returned slots directly but
* move them back to the global pool in a batch. This
- * allows the slots to coaellesce and reduce fragmentation.
+ * allows the slots to coalesce and reduce fragmentation.
*
* The swap entry allocated is marked with SWAP_HAS_CACHE
* flag in map_count that prevents it from being allocated
@@ -43,8 +43,6 @@ static DEFINE_MUTEX(swap_slots_cache_mutex);
static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
static void __drain_swap_slots_cache(unsigned int type);
-static void deactivate_swap_slots_cache(void);
-static void reactivate_swap_slots_cache(void);
#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
#define SLOTS_CACHE 0x1
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3cdee7b11da9..c56aa9ac050d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -114,8 +114,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
SetPageSwapCache(page);
do {
- unsigned long nr_shadows = 0;
-
xas_lock_irq(&xas);
xas_create_range(&xas);
if (xas_error(&xas))
@@ -124,7 +122,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
old = xas_load(&xas);
if (xa_is_value(old)) {
- nr_shadows++;
if (shadowp)
*shadowp = old;
}
@@ -132,7 +129,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
xas_store(&xas, page);
xas_next(&xas);
}
- address_space->nrexceptional -= nr_shadows;
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
__mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
@@ -172,8 +168,6 @@ void __delete_from_swap_cache(struct page *page,
xas_next(&xas);
}
ClearPageSwapCache(page);
- if (shadow)
- address_space->nrexceptional += nr;
address_space->nrpages -= nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
__mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
@@ -263,7 +257,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
void *old;
for (;;) {
- unsigned long nr_shadows = 0;
swp_entry_t entry = swp_entry(type, curr);
struct address_space *address_space = swap_address_space(entry);
XA_STATE(xas, &address_space->i_pages, curr);
@@ -273,9 +266,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
if (!xa_is_value(old))
continue;
xas_store(&xas, NULL);
- nr_shadows++;
}
- address_space->nrexceptional -= nr_shadows;
xa_unlock_irq(&address_space->i_pages);
/* search the next swapcache until we meet end */
@@ -295,7 +286,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
* try_to_free_swap() _with_ the lock.
* - Marcelo
*/
-static inline void free_swap_cache(struct page *page)
+void free_swap_cache(struct page *page)
{
if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
try_to_free_swap(page);
@@ -497,16 +488,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
__SetPageLocked(page);
__SetPageSwapBacked(page);
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
- put_swap_page(page, entry);
+ if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
goto fail_unlock;
- }
- if (mem_cgroup_charge(page, NULL, gfp_mask)) {
- delete_from_swap_cache(page);
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
+ if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
- }
+
+ mem_cgroup_swapin_uncharge_swap(entry);
if (shadow)
workingset_refault(page, shadow);
@@ -517,6 +506,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return page;
fail_unlock:
+ put_swap_page(page, entry);
unlock_page(page);
put_page(page);
return NULL;
@@ -703,7 +693,12 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
void exit_swap_address_space(unsigned int type)
{
- kvfree(swapper_spaces[type]);
+ int i;
+ struct address_space *spaces = swapper_spaces[type];
+
+ for (i = 0; i < nr_swapper_spaces[type]; i++)
+ VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
+ kvfree(spaces);
nr_swapper_spaces[type] = 0;
swapper_spaces[type] = NULL;
}
@@ -726,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long ra_val;
- swp_entry_t entry;
unsigned long faddr, pfn, fpfn;
unsigned long start, end;
pte_t *pte, *orig_pte;
@@ -744,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
faddr = vmf->address;
orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
- entry = pte_to_swp_entry(*pte);
- if ((unlikely(non_swap_entry(entry)))) {
- pte_unmap(orig_pte);
- return;
- }
fpfn = PFN_DOWN(faddr);
ra_val = GET_SWAP_RA_VAL(vma);
@@ -797,7 +786,7 @@ static void swap_ra_info(struct vm_fault *vmf,
*
* Returns the struct page for entry and addr, after queueing swapin.
*
- * Primitive swap readahead code. We simply read in a few pages whoes
+ * Primitive swap readahead code. We simply read in a few pages whose
* virtual addresses are around the fault address in the same vma.
*
* Caller must hold read mmap_lock if vmf->vma is not NULL.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 084a5b9a18e5..e898c879a434 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
+#include <linux/completion.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
@@ -99,11 +100,10 @@ atomic_t nr_rotate_swap = ATOMIC_INIT(0);
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
- if (type >= READ_ONCE(nr_swapfiles))
+ if (type >= MAX_SWAPFILES)
return NULL;
- smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
- return READ_ONCE(swap_info[type]);
+ return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}
static inline unsigned char swap_count(unsigned char ent)
@@ -452,10 +452,10 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
unsigned int idx)
{
/*
- * If scan_swap_map() can't find a free cluster, it will check
+ * If scan_swap_map_slots() can't find a free cluster, it will check
* si->swap_map directly. To make sure the discarding cluster isn't
- * taken by scan_swap_map(), mark the swap entries bad (occupied). It
- * will be cleared after discard
+ * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
+ * It will be cleared after discard
*/
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
SWAP_MAP_BAD, SWAPFILE_CLUSTER);
@@ -511,6 +511,14 @@ static void swap_discard_work(struct work_struct *work)
spin_unlock(&si->lock);
}
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+ struct swap_info_struct *si;
+
+ si = container_of(ref, struct swap_info_struct, users);
+ complete(&si->comp);
+}
+
static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
struct swap_cluster_info *ci = si->cluster_info;
@@ -580,7 +588,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
}
/*
- * It's possible scan_swap_map() uses a free cluster in the middle of free
+ * It's possible scan_swap_map_slots() uses a free cluster in the middle of free
* cluster list. Avoiding such abuse to avoid list corruption.
*/
static bool
@@ -1028,21 +1036,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
swap_range_free(si, offset, SWAPFILE_CLUSTER);
}
-static unsigned long scan_swap_map(struct swap_info_struct *si,
- unsigned char usage)
-{
- swp_entry_t entry;
- int n_ret;
-
- n_ret = scan_swap_map_slots(si, usage, 1, &entry);
-
- if (n_ret)
- return swp_offset(entry);
- else
- return 0;
-
-}
-
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
unsigned long size = swap_entry_size(entry_size);
@@ -1105,14 +1098,14 @@ start_over:
nextsi:
/*
* if we got here, it's likely that si was almost full before,
- * and since scan_swap_map() can drop the si->lock, multiple
- * callers probably all tried to get a page from the same si
- * and it filled up before we could get one; or, the si filled
- * up between us dropping swap_avail_lock and taking si->lock.
- * Since we dropped the swap_avail_lock, the swap_avail_head
- * list may have been modified; so if next is still in the
- * swap_avail_head list then try it, otherwise start over
- * if we have not gotten any slots.
+ * and since scan_swap_map_slots() can drop the si->lock,
+ * multiple callers probably all tried to get a page from the
+ * same si and it filled up before we could get one; or, the si
+ * filled up between us dropping swap_avail_lock and taking
+ * si->lock. Since we dropped the swap_avail_lock, the
+ * swap_avail_head list may have been modified; so if next is
+ * still in the swap_avail_head list then try it, otherwise
+ * start over if we have not gotten any slots.
*/
if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
@@ -1128,30 +1121,6 @@ noswap:
return n_ret;
}
-/* The only caller of this function is now suspend routine */
-swp_entry_t get_swap_page_of_type(int type)
-{
- struct swap_info_struct *si = swap_type_to_swap_info(type);
- pgoff_t offset;
-
- if (!si)
- goto fail;
-
- spin_lock(&si->lock);
- if (si->flags & SWP_WRITEOK) {
- /* This is called for allocating swap entry, not cache */
- offset = scan_swap_map(si, 1);
- if (offset) {
- atomic_long_dec(&nr_swap_pages);
- spin_unlock(&si->lock);
- return swp_entry(type, offset);
- }
- }
- spin_unlock(&si->lock);
-fail:
- return (swp_entry_t) {0};
-}
-
static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
@@ -1270,18 +1239,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
* via preventing the swap device from being swapoff, until
* put_swap_device() is called. Otherwise return NULL.
*
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
* Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc. The caller must
- * be prepared for that. For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc. The
+ * caller must be prepared for that. For example, the following
+ * situation is possible.
*
* CPU1 CPU2
* do_swap_page()
@@ -1309,21 +1272,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
si = swp_swap_info(entry);
if (!si)
goto bad_nofile;
-
- rcu_read_lock();
- if (data_race(!(si->flags & SWP_VALID)))
- goto unlock_out;
+ if (!percpu_ref_tryget_live(&si->users))
+ goto out;
+ /*
+ * Guarantee the si->users are checked before accessing other
+ * fields of swap_info_struct.
+ *
+ * Paired with the spin_unlock() after setup_swap_info() in
+ * enable_swap_info().
+ */
+ smp_rmb();
offset = swp_offset(entry);
if (offset >= si->max)
- goto unlock_out;
+ goto put_out;
return si;
bad_nofile:
pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
return NULL;
-unlock_out:
- rcu_read_unlock();
+put_out:
+ percpu_ref_put(&si->users);
return NULL;
}
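A hedged sketch of the caller pattern the comment above describes: pin the device, do the swap-cache work, drop the reference. The helper name is illustrative:

static struct page *example_lookup_swap_cache(swp_entry_t entry)
{
	struct swap_info_struct *si;
	struct page *page;

	si = get_swap_device(entry);	/* percpu_ref_tryget_live() */
	if (!si)
		return NULL;		/* swapoff already drained users */

	page = find_get_page(swap_address_space(entry), swp_offset(entry));

	put_swap_device(si);		/* percpu_ref_put(&si->users) */
	return page;
}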
@@ -1803,6 +1772,24 @@ int free_swap_and_cache(swp_entry_t entry)
}
#ifdef CONFIG_HIBERNATION
+
+swp_entry_t get_swap_page_of_type(int type)
+{
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
+ swp_entry_t entry = {0};
+
+ if (!si)
+ goto fail;
+
+ /* This is called for allocating swap entry, not cache */
+ spin_lock(&si->lock);
+ if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
+ atomic_long_dec(&nr_swap_pages);
+ spin_unlock(&si->lock);
+fail:
+ return entry;
+}
+
/*
* Find the swap type that corresponds to given device (if any).
*
@@ -1900,7 +1887,7 @@ unsigned int count_swap_pages(int type, int free)
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
- return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
+ return pte_same(pte_swp_clear_flags(pte), swp_pte);
}
/*
@@ -2466,7 +2453,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
static void _enable_swap_info(struct swap_info_struct *p)
{
- p->flags |= SWP_WRITEOK | SWP_VALID;
+ p->flags |= SWP_WRITEOK;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2497,10 +2484,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
/*
- * Guarantee swap_map, cluster_info, etc. fields are valid
- * between get/put_swap_device() if SWP_VALID bit is set
+ * Finished initializing swap device, now it's safe to reference it.
*/
- synchronize_rcu();
+ percpu_ref_resurrect(&p->users);
spin_lock(&swap_lock);
spin_lock(&p->lock);
_enable_swap_info(p);
@@ -2616,16 +2602,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
reenable_swap_slots_cache_unlock();
- spin_lock(&swap_lock);
- spin_lock(&p->lock);
- p->flags &= ~SWP_VALID; /* mark swap device as invalid */
- spin_unlock(&p->lock);
- spin_unlock(&swap_lock);
/*
- * wait for swap operations protected by get/put_swap_device()
- * to complete
+ * Wait for swap operations protected by get/put_swap_device()
+ * to complete.
+ *
+ * We need synchronize_rcu() here to protect access to the
+ * swap cache data structures.
*/
+ percpu_ref_kill(&p->users);
synchronize_rcu();
+ wait_for_completion(&p->comp);
flush_work(&p->discard_work);
@@ -2641,7 +2627,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_lock(&p->lock);
drain_mmlist();
- /* wait for anyone still in scan_swap_map */
+ /* wait for anyone still in scan_swap_map_slots */
p->highest_bit = 0; /* cuts scans short */
while (p->flags >= SWP_SCANNING) {
spin_unlock(&p->lock);
@@ -2780,7 +2766,7 @@ static int swap_show(struct seq_file *swap, void *v)
unsigned int bytes, inuse;
if (si == SEQ_START_TOKEN) {
- seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
+ seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
return 0;
}
@@ -2857,6 +2843,12 @@ static struct swap_info_struct *alloc_swap_info(void)
if (!p)
return ERR_PTR(-ENOMEM);
+ if (percpu_ref_init(&p->users, swap_users_ref_free,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
+ kvfree(p);
+ return ERR_PTR(-ENOMEM);
+ }
+
spin_lock(&swap_lock);
for (type = 0; type < nr_swapfiles; type++) {
if (!(swap_info[type]->flags & SWP_USED))
@@ -2864,19 +2856,18 @@ static struct swap_info_struct *alloc_swap_info(void)
}
if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
+ percpu_ref_exit(&p->users);
kvfree(p);
return ERR_PTR(-EPERM);
}
if (type >= nr_swapfiles) {
p->type = type;
- WRITE_ONCE(swap_info[type], p);
/*
- * Write swap_info[type] before nr_swapfiles, in case a
- * racing procfs swap_start() or swap_next() is reading them.
- * (We never shrink nr_swapfiles, we never free this entry.)
+ * Publish the swap_info_struct after initializing it.
+ * Note that kvzalloc() above zeroes all its fields.
*/
- smp_wmb();
- WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
+ smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
+ nr_swapfiles++;
} else {
defer = p;
p = swap_info[type];
@@ -2891,9 +2882,13 @@ static struct swap_info_struct *alloc_swap_info(void)
plist_node_init(&p->avail_lists[i], 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
- kvfree(defer);
+ if (defer) {
+ percpu_ref_exit(&defer->users);
+ kvfree(defer);
+ }
spin_lock_init(&p->lock);
spin_lock_init(&p->cont_lock);
+ init_completion(&p->comp);
return p;
}
@@ -3284,7 +3279,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sizeof(long),
GFP_KERNEL);
- if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+ if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
/*
* When discard is enabled for swap with no particular
* policy flagged, we set all swap discard flags here in
diff --git a/mm/truncate.c b/mm/truncate.c
index 455944264663..234ddd879caa 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -40,7 +40,6 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
if (xas_load(&xas) != entry)
return;
xas_store(&xas, NULL);
- mapping->nrexceptional--;
}
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
@@ -168,13 +167,10 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static void
-truncate_cleanup_page(struct address_space *mapping, struct page *page)
+static void truncate_cleanup_page(struct page *page)
{
- if (page_mapped(page)) {
- unsigned int nr = thp_nr_pages(page);
- unmap_mapping_pages(mapping, page->index, nr, false);
- }
+ if (page_mapped(page))
+ unmap_mapping_page(page);
if (page_has_private(page))
do_invalidatepage(page, 0, thp_size(page));
@@ -219,7 +215,7 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
if (page->mapping != mapping)
return -EIO;
- truncate_cleanup_page(mapping, page);
+ truncate_cleanup_page(page);
delete_from_page_cache(page);
return 0;
}
@@ -295,7 +291,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
pgoff_t index;
int i;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ if (mapping_empty(mapping))
goto out;
/* Offsets within partial pages */
@@ -326,7 +322,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = indices[pagevec_count(&pvec) - 1] + 1;
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
for (i = 0; i < pagevec_count(&pvec); i++)
- truncate_cleanup_page(mapping, pvec.pages[i]);
+ truncate_cleanup_page(pvec.pages[i]);
delete_from_page_cache_batch(mapping, &pvec);
for (i = 0; i < pagevec_count(&pvec); i++)
unlock_page(pvec.pages[i]);
@@ -440,9 +436,6 @@ EXPORT_SYMBOL(truncate_inode_pages);
*/
void truncate_inode_pages_final(struct address_space *mapping)
{
- unsigned long nrexceptional;
- unsigned long nrpages;
-
/*
* Page reclaim can not participate in regular inode lifetime
* management (can't call iput()) and thus can race with the
@@ -452,16 +445,7 @@ void truncate_inode_pages_final(struct address_space *mapping)
*/
mapping_set_exiting(mapping);
- /*
- * When reclaim installs eviction entries, it increases
- * nrexceptional first, then decreases nrpages. Make sure we see
- * this in the right order or we might miss an entry.
- */
- nrpages = mapping->nrpages;
- smp_rmb();
- nrexceptional = mapping->nrexceptional;
-
- if (nrpages || nrexceptional) {
+ if (!mapping_empty(mapping)) {
/*
* As truncation uses a lockless tree lookup, cycle
* the tree lock to make sure any ongoing tree
@@ -633,7 +617,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
int ret2 = 0;
int did_range_unmap = 0;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ if (mapping_empty(mapping))
goto out;
pagevec_init(&pvec);
@@ -652,6 +636,16 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
continue;
}
+ if (!did_range_unmap && page_mapped(page)) {
+ /*
+ * If page is mapped, before taking its lock,
+ * zap the rest of the file in one hit.
+ */
+ unmap_mapping_pages(mapping, index,
+ (1 + end - index), false);
+ did_range_unmap = 1;
+ }
+
lock_page(page);
WARN_ON(page_to_index(page) != index);
if (page->mapping != mapping) {
@@ -659,23 +653,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
continue;
}
wait_on_page_writeback(page);
- if (page_mapped(page)) {
- if (!did_range_unmap) {
- /*
- * Zap the rest of the file in one hit.
- */
- unmap_mapping_pages(mapping, index,
- (1 + end - index), false);
- did_range_unmap = 1;
- } else {
- /*
- * Just zap this page
- */
- unmap_mapping_pages(mapping, index,
- 1, false);
- }
- }
+
+ if (page_mapped(page))
+ unmap_mapping_page(page);
BUG_ON(page_mapped(page));
+
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9a3d451402d7..63a73e164d55 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage)
+ enum mcopy_atomic_mode mode)
{
int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
* by THP. Since we can not reliably insert a zero page, this
* feature is not supported.
*/
- if (zeropage) {
+ if (mode == MCOPY_ATOMIC_ZEROPAGE) {
mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -273,8 +273,6 @@ retry:
}
while (src_addr < src_start + len) {
- pte_t dst_pteval;
-
BUG_ON(dst_addr >= dst_start + len);
/*
@@ -290,23 +288,23 @@ retry:
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
- dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
+ dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
goto out_unlock;
}
- err = -EEXIST;
- dst_pteval = huge_ptep_get(dst_pte);
- if (!huge_pte_none(dst_pteval)) {
+ if (mode != MCOPY_ATOMIC_CONTINUE &&
+ !huge_pte_none(huge_ptep_get(dst_pte))) {
+ err = -EEXIST;
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
goto out_unlock;
}
err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
- dst_addr, src_addr, &page);
+ dst_addr, src_addr, mode, &page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
@@ -362,38 +360,38 @@ out:
* If a reservation for the page existed in the reservation
* map of a private mapping, the map was modified to indicate
* the reservation was consumed when the page was allocated.
- * We clear the PagePrivate flag now so that the global
+ * We clear the HPageRestoreReserve flag now so that the global
* reserve count will not be incremented in free_huge_page.
* The reservation map will still indicate the reservation
* was consumed and possibly prevent later page allocation.
* This is better than leaking a global reservation. If no
- * reservation existed, it is still safe to clear PagePrivate
- * as no adjustments to reservation counts were made during
- * allocation.
+ * reservation existed, it is still safe to clear
+ * HPageRestoreReserve as no adjustments to reservation counts
+ * were made during allocation.
*
* The reservation map for shared mappings indicates which
* pages have reservations. When a huge page is allocated
* for an address with a reservation, no change is made to
- * the reserve map. In this case PagePrivate will be set
- * to indicate that the global reservation count should be
+ * the reserve map. In this case HPageRestoreReserve will be
+ * set to indicate that the global reservation count should be
* incremented when the page is freed. This is the desired
* behavior. However, when a huge page is allocated for an
* address without a reservation a reservation entry is added
- * to the reservation map, and PagePrivate will not be set.
- * When the page is freed, the global reserve count will NOT
- * be incremented and it will appear as though we have leaked
- * reserved page. In this case, set PagePrivate so that the
- * global reserve count will be incremented to match the
- * reservation map entry which was created.
+ * to the reservation map, and HPageRestoreReserve will not be
+ * set. When the page is freed, the global reserve count will
+ * NOT be incremented and it will appear as though we have
+ * leaked reserved page. In this case, set HPageRestoreReserve
+ * so that the global reserve count will be incremented to
+ * match the reservation map entry which was created.
*
* Note that vm_alloc_shared is based on the flags of the vma
* for which the page was originally allocated. dst_vma could
* be different or NULL on error.
*/
if (vm_alloc_shared)
- SetPagePrivate(page);
+ SetHPageRestoreReserve(page);
else
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
put_page(page);
}
BUG_ON(copied < 0);
@@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage);
+ enum mcopy_atomic_mode mode);
#endif /* CONFIG_HUGETLB_PAGE */
static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage,
+ enum mcopy_atomic_mode mcopy_mode,
bool *mmap_changing,
__u64 mode)
{
@@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
long copied;
struct page *page;
bool wp_copy;
+ bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
/*
* Sanitize the command parameters:
@@ -527,10 +526,12 @@ retry:
*/
if (is_vm_hugetlb_page(dst_vma))
return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
- src_start, len, zeropage);
+ src_start, len, mcopy_mode);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
+ if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+ goto out_unlock;
/*
* Ensure the dst_vma has a anon_vma or this page
@@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
bool *mmap_changing, __u64 mode)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
- mmap_changing, mode);
+ return __mcopy_atomic(dst_mm, dst_start, src_start, len,
+ MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
}
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+ return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+ mmap_changing, 0);
+}
+
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, bool *mmap_changing)
+{
+ return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+ mmap_changing, 0);
}
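On the fs/userfaultfd.c side, the new mode is expected to be dispatched roughly as below; the struct and field names are assumptions of this sketch:

static int example_uffdio_continue(struct userfaultfd_ctx *ctx,
				   unsigned long start, unsigned long len)
{
	ssize_t ret;

	ret = mcopy_continue(ctx->mm, start, len, &ctx->mmap_changing);

	/* A positive return is the number of bytes mapped into place. */
	return ret > 0 ? 0 : ret;
}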
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
diff --git a/mm/util.c b/mm/util.c
index 54870226cea6..a8bf17f18a81 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -711,16 +711,6 @@ struct address_space *page_mapping(struct page *page)
}
EXPORT_SYMBOL(page_mapping);
-/*
- * For file cache pages, return the address_space, otherwise return NULL
- */
-struct address_space *page_mapping_file(struct page *page)
-{
- if (unlikely(PageSwapCache(page)))
- return NULL;
- return page_mapping(page);
-}
-
/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
{
@@ -775,7 +765,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
* The deviation of sync_overcommit_as could be big with loose policy
* like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
* strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
- * with the strict "NEVER", and to avoid possible race condtion (even
+ * with the strict "NEVER", and to avoid possible race condition (even
* though user usually won't too frequently do the switching to policy
* OVERCOMMIT_NEVER), the switch is done in the following order:
* 1. changing the batch
@@ -983,6 +973,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
return ret;
}
+#ifdef CONFIG_PRINTK
/**
* mem_dump_obj - Print available provenance information
* @object: object for which to find provenance information.
@@ -996,20 +987,26 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
*/
void mem_dump_obj(void *object)
{
+ const char *type;
+
if (kmem_valid_obj(object)) {
kmem_dump_obj(object);
return;
}
+
if (vmalloc_dump_obj(object))
return;
- if (!virt_addr_valid(object)) {
- if (object == NULL)
- pr_cont(" NULL pointer.\n");
- else if (object == ZERO_SIZE_PTR)
- pr_cont(" zero-size pointer.\n");
- else
- pr_cont(" non-paged memory.\n");
- return;
- }
- pr_cont(" non-slab/vmalloc memory.\n");
+
+ if (virt_addr_valid(object))
+ type = "non-slab/vmalloc memory";
+ else if (object == NULL)
+ type = "NULL pointer";
+ else if (object == ZERO_SIZE_PTR)
+ type = "zero-size pointer";
+ else
+ type = "non-paged memory";
+
+ pr_cont(" %s\n", type);
}
+EXPORT_SYMBOL_GPL(mem_dump_obj);
+#endif
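A hypothetical module snippet exercising the reworked helper on the pointer classes it now reports through a single pr_cont():

static void dump_obj_demo(void)
{
	void *slab_ptr = kmalloc(64, GFP_KERNEL);
	void *vmalloc_ptr = vmalloc(PAGE_SIZE);

	mem_dump_obj(slab_ptr);		/* slab provenance via kmem_dump_obj() */
	mem_dump_obj(vmalloc_ptr);	/* vmalloc provenance */
	mem_dump_obj(NULL);		/* prints " NULL pointer" */

	vfree(vmalloc_ptr);
	kfree(slab_ptr);
}

The EXPORT_SYMBOL_GPL added above is what lets a module like this call it at all.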
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4f5f8c907897..b2ec7f751bd0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,7 +34,7 @@
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
-
+#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -42,6 +42,19 @@
#include "internal.h"
#include "pgalloc-track.h"
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+ vmap_allow_huge = false;
+ return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)x;
@@ -68,6 +81,218 @@ static void free_work(struct work_struct *w)
}
/*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ u64 pfn;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PMD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pmd_supported(prot))
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PMD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PUD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pud_supported(prot))
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PUD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+ max_page_shift, mask))
+ return -ENOMEM;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < P4D_SHIFT)
+ return 0;
+
+ if (!arch_vmap_p4d_supported(prot))
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
+
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_P4D_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+ max_page_shift, mask))
+ return -ENOMEM;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_range_noflush(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+ pgtbl_mod_mask mask = 0;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+ max_page_shift, &mask);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
+}
+
+int vmap_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ int err;
+
+ err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
+ flush_cache_vmap(addr, end);
+
+ return err;
+}
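A hedged sketch of how an ioremap-style caller might use the new vmap_range(); the real wiring lives in mm/ioremap.c, and the helper name and max_page_shift choice here are illustrative:

static int example_map_device(unsigned long vaddr, phys_addr_t paddr,
			      unsigned long size, pgprot_t prot)
{
	/* Allow PMD/PUD leaf entries wherever alignment and arch support permit. */
	return vmap_range(vaddr, vaddr + size, paddr, prot, PUD_SHIFT);
}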
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
@@ -153,22 +378,20 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
} while (p4d++, addr = next, addr != end);
}
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @start: start of the VM area to unmap
- * @size: size of the VM area to unmap
+/*
+ * vunmap_range_noflush is similar to vunmap_range, but does not
+ * flush caches or TLBs.
*
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify
- * should have been allocated using get_vm_area() and its friends.
+ * The caller is responsible for calling flush_cache_vunmap() before calling
+ * this function, and flush_tlb_kernel_range() after it has returned
+ * successfully (and before the addresses are expected to cause a page fault
+ * or be re-mapped for something else, if TLB flushes are being delayed or
+ * coalesced).
*
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible
- * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
- * function and flush_tlb_kernel_range() after.
+ * This is an internal function only. Do not use outside mm/.
*/
-void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
+void vunmap_range_noflush(unsigned long start, unsigned long end)
{
- unsigned long end = start + size;
unsigned long next;
pgd_t *pgd;
unsigned long addr = start;
@@ -189,7 +412,23 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
arch_sync_kernel_mappings(start, end);
}
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+/**
+ * vunmap_range - unmap kernel virtual addresses
+ * @addr: start of the VM area to unmap
+ * @end: end of the VM area to unmap (non-inclusive)
+ *
+ * Clears any present PTEs in the virtual address range, flushes TLBs and
+ * caches. Any subsequent access to the address before it has been re-mapped
+ * is a kernel bug.
+ */
+void vunmap_range(unsigned long addr, unsigned long end)
+{
+ flush_cache_vunmap(addr, end);
+ vunmap_range_noflush(addr, end);
+ flush_tlb_kernel_range(addr, end);
+}
+
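The _noflush contract above is what the vmap teardown paths later in this patch (free_unmap_vmap_area() and vb_free()) rely on; the eager caller-side ordering, sketched here with hypothetical start/end variables, is:

        flush_cache_vunmap(start, end);         /* before the PTEs are cleared */
        vunmap_range_noflush(start, end);
        flush_tlb_kernel_range(start, end);     /* before the range is reused  */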
+static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -217,7 +456,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -229,13 +468,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -247,13 +486,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -265,37 +504,18 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
}
-/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
- * have been allocated using get_vm_area() and its friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible for
- * calling flush_cache_vmap() on to-be-mapped areas before calling this
- * function.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages)
{
unsigned long start = addr;
- unsigned long end = addr + size;
- unsigned long next;
pgd_t *pgd;
+ unsigned long next;
int err = 0;
int nr = 0;
pgtbl_mod_mask mask = 0;
@@ -306,7 +526,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
next = pgd_addr_end(addr, end);
if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
+ err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
@@ -317,14 +537,61 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
return 0;
}
-int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
- struct page **pages)
+/*
+ * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
+ * flush caches.
+ *
+ * The caller is responsible for calling flush_cache_vmap() after this
+ * function returns successfully and before the addresses are accessed.
+ *
+ * This is an internal function only. Do not use outside mm/.
+ */
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
{
- int ret;
+ unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
+
+ WARN_ON(page_shift < PAGE_SHIFT);
- ret = map_kernel_range_noflush(start, size, prot, pages);
- flush_cache_vmap(start, start + size);
- return ret;
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
+ page_shift == PAGE_SHIFT)
+ return vmap_small_pages_range_noflush(addr, end, prot, pages);
+
+ for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
+ int err;
+
+ err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+ __pa(page_address(pages[i])), prot,
+ page_shift);
+ if (err)
+ return err;
+
+ addr += 1UL << page_shift;
+ }
+
+ return 0;
+}
+
+/**
+ * vmap_pages_range - map pages to a kernel virtual address
+ * @addr: start of the VM area to map
+ * @end: end of the VM area to map (non-inclusive)
+ * @prot: page protection flags to use
+ * @pages: pages to map (always PAGE_SIZE pages)
+ * @page_shift: maximum shift that the pages may be mapped with, @pages must
+ * be aligned and contiguous up to at least this shift.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int vmap_pages_range(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ int err;
+
+ err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+ flush_cache_vmap(addr, end);
+ return err;
}
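A sketch of the huge page_shift case (assuming PMD_SIZE is 2MB, an order-9 __GFP_COMP allocation, and pre-declared addr, pages[], nid and err; not taken verbatim from the patch): @pages still holds one struct page pointer per PAGE_SIZE page, but because they are physically contiguous the whole range can be installed as a single PMD leaf.

        struct page *page = alloc_pages_node(nid, GFP_KERNEL | __GFP_COMP,
                                             PMD_SHIFT - PAGE_SHIFT);
        if (page) {
                unsigned int i;

                /* fill the per-PAGE_SIZE bookkeeping array */
                for (i = 0; i < (1U << (PMD_SHIFT - PAGE_SHIFT)); i++)
                        pages[i] = page + i;
                err = vmap_pages_range(addr, addr + PMD_SIZE, PAGE_KERNEL,
                                       pages, PMD_SHIFT);
        }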
int is_vmalloc_or_module_addr(const void *x)
@@ -343,7 +610,9 @@ int is_vmalloc_or_module_addr(const void *x)
}
/*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the struct page it maps. Huge vmap mappings will
+ * return the tail page that corresponds to the base page address, which
+ * matches small vmap mappings.
*/
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
@@ -363,25 +632,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pgd_none(*pgd))
return NULL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return NULL; /* XXX: no allowance for huge pgd */
+ if (WARN_ON_ONCE(pgd_bad(*pgd)))
+ return NULL;
+
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return NULL;
- pud = pud_offset(p4d, addr);
+ if (p4d_leaf(*p4d))
+ return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(p4d_bad(*p4d)))
+ return NULL;
- /*
- * Don't dereference bad PUD or PMD (below) entries. This will also
- * identify huge mappings, which we may encounter on architectures
- * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
- * identified as vmalloc addresses by is_vmalloc_addr(), but are
- * not [unambiguously] associated with a struct page, so there is
- * no correct value to return for them.
- */
- WARN_ON_ONCE(pud_bad(*pud));
- if (pud_none(*pud) || pud_bad(*pud))
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
return NULL;
+ if (pud_leaf(*pud))
+ return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pud_bad(*pud)))
+ return NULL;
+
pmd = pmd_offset(pud, addr);
- WARN_ON_ONCE(pmd_bad(*pmd));
- if (pmd_none(*pmd) || pmd_bad(*pmd))
+ if (pmd_none(*pmd))
+ return NULL;
+ if (pmd_leaf(*pmd))
+ return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
ptep = pte_offset_map(pmd, addr);
@@ -389,6 +666,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pte_present(pte))
page = pte_page(pte);
pte_unmap(ptep);
+
return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
@@ -1152,6 +1430,29 @@ static void free_vmap_area(struct vmap_area *va)
spin_unlock(&free_vmap_area_lock);
}
+static inline void
+preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
+{
+ struct vmap_area *va = NULL;
+
+ /*
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when the fit type of the free area is NE_FIT_TYPE. It guarantees
+ * that a CPU that does an allocation is preloaded.
+ *
+ * The preload is done in non-atomic context, which allows us to use
+ * more permissive allocation masks and so be more stable under low
+ * memory conditions and high memory pressure.
+ */
+ if (!this_cpu_read(ne_fit_preload_node))
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+
+ spin_lock(lock);
+
+ if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
+ kmem_cache_free(vmap_area_cachep, va);
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
* vstart and vend.
@@ -1161,7 +1462,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask)
{
- struct vmap_area *va, *pva;
+ struct vmap_area *va;
unsigned long addr;
int purged = 0;
int ret;
@@ -1187,43 +1488,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
- /*
- * Preload this CPU with one extra vmap_area object. It is used
- * when fit type of free area is NE_FIT_TYPE. Please note, it
- * does not guarantee that an allocation occurs on a CPU that
- * is preloaded, instead we minimize the case when it is not.
- * It can happen because of cpu migration, because there is a
- * race until the below spinlock is taken.
- *
- * The preload is done in non-atomic context, thus it allows us
- * to use more permissive allocation masks to be more stable under
- * low memory condition and high memory pressure. In rare case,
- * if not preloaded, GFP_NOWAIT is used.
- *
- * Set "pva" to NULL here, because of "retry" path.
- */
- pva = NULL;
-
- if (!this_cpu_read(ne_fit_preload_node))
- /*
- * Even if it fails we do not really care about that.
- * Just proceed as it is. If needed "overflow" path
- * will refill the cache we allocate from.
- */
- pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
-
- spin_lock(&free_vmap_area_lock);
-
- if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
- kmem_cache_free(vmap_area_cachep, pva);
+ preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
+ addr = __alloc_vmap_area(size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
- addr = __alloc_vmap_area(size, align, vstart, vend);
- spin_unlock(&free_vmap_area_lock);
-
if (unlikely(addr == vend))
goto overflow;
@@ -1231,7 +1503,6 @@ retry:
va->va_end = addr + size;
va->vm = NULL;
-
spin_lock(&vmap_area_lock);
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
@@ -1312,7 +1583,7 @@ static unsigned long lazy_max_pages(void)
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
/*
- * Serialize vmap purging. There is no actual criticial section protected
+ * Serialize vmap purging. There is no actual critical section protected
* by this lock, but we want to avoid concurrent calls for performance
* reasons and to make the pcpu_get_vm_areas more deterministic.
*/
@@ -1448,7 +1719,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
+ vunmap_range_noflush(va->va_start, va->va_end);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
@@ -1726,7 +1997,7 @@ static void vb_free(unsigned long addr, unsigned long size)
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
- unmap_kernel_range_noflush(addr, size);
+ vunmap_range_noflush(addr, addr + size);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(addr, addr + size);
@@ -1762,7 +2033,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
spin_lock(&vb->lock);
- if (vb->dirty) {
+ if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
unsigned long va_start = vb->va->va_start;
unsigned long s, e;
@@ -1879,16 +2150,36 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
kasan_unpoison_vmalloc(mem, size);
- if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
+ if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
+ pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
+
return mem;
}
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
+static inline unsigned int vm_area_page_order(struct vm_struct *vm)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ return vm->page_order;
+#else
+ return 0;
+#endif
+}
+
+static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ vm->page_order = order;
+#else
+ BUG_ON(order != 0);
+#endif
+}
+
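For orientation (numbers assume 4K base pages and a 2MB PMD): a 4MB area mapped at PMD granularity keeps area->nr_pages == 1024, since bookkeeping always counts PAGE_SIZE pages, while vm_area_page_order() returns 9; the allocation and freeing loops therefore walk area->pages[] in strides of 1U << 9 == 512 entries, and the backing memory is just two order-9 allocations.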
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -2023,23 +2314,6 @@ void __init vmalloc_init(void)
vmap_initialized = true;
}
-/**
- * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Similar to unmap_kernel_range_noflush() but flushes vcache before
- * the unmapping and tlb after.
- */
-void unmap_kernel_range(unsigned long addr, unsigned long size)
-{
- unsigned long end = addr + size;
-
- flush_cache_vunmap(addr, end);
- unmap_kernel_range_noflush(addr, size);
- flush_tlb_kernel_range(addr, end);
-}
-
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
@@ -2070,15 +2344,16 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
- unsigned long align, unsigned long flags, unsigned long start,
- unsigned long end, int node, gfp_t gfp_mask, const void *caller)
+ unsigned long align, unsigned long shift, unsigned long flags,
+ unsigned long start, unsigned long end, int node,
+ gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
unsigned long requested_size = size;
BUG_ON(in_interrupt());
- size = PAGE_ALIGN(size);
+ size = ALIGN(size, 1ul << shift);
if (unlikely(!size))
return NULL;
@@ -2110,8 +2385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, caller);
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
+ NUMA_NO_NODE, GFP_KERNEL, caller);
}
/**
@@ -2127,7 +2402,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL,
__builtin_return_address(0));
}
@@ -2135,7 +2411,8 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL, caller);
}
@@ -2199,6 +2476,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
{
int i;
+ /* HUGE_VMALLOC passes small pages to set_direct_map */
for (i = 0; i < area->nr_pages; i++)
if (page_address(area->pages[i]))
set_direct_map(area->pages[i]);
@@ -2208,6 +2486,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
{
unsigned long start = ULONG_MAX, end = 0;
+ unsigned int page_order = vm_area_page_order(area);
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int flush_dmap = 0;
int i;
@@ -2232,11 +2511,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
* map. Find the start and end range of the direct mappings to make sure
* the vm_unmap_aliases() flush includes the direct map.
*/
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
unsigned long addr = (unsigned long)page_address(area->pages[i]);
if (addr) {
+ unsigned long page_size;
+
+ page_size = PAGE_SIZE << page_order;
start = min(addr, start);
- end = max(addr + PAGE_SIZE, end);
+ end = max(addr + page_size, end);
flush_dmap = 1;
}
}
@@ -2277,13 +2559,15 @@ static void __vunmap(const void *addr, int deallocate_pages)
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
+ unsigned int page_order = vm_area_page_order(area);
int i;
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_pages(page, 0);
+ __free_pages(page, page_order);
+ cond_resched();
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2348,7 +2632,7 @@ static void __vfree(const void *addr)
* May sleep if called *not* from interrupt context.
* Must not be called in NMI context (strictly speaking, it could be
* if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea).
+ * conventions for vfree() arch-dependent would be a really bad idea).
*/
void vfree(const void *addr)
{
@@ -2402,6 +2686,7 @@ void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
+ unsigned long addr;
unsigned long size; /* In bytes */
might_sleep();
@@ -2414,8 +2699,9 @@ void *vmap(struct page **pages, unsigned int count,
if (!area)
return NULL;
- if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
- pages) < 0) {
+ addr = (unsigned long)area->addr;
+ if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
+ pages, PAGE_SHIFT) < 0) {
vunmap(area->addr);
return NULL;
}
@@ -2473,66 +2759,116 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */
+static inline unsigned int
+vm_area_alloc_pages(gfp_t gfp, int nid,
+ unsigned int order, unsigned long nr_pages, struct page **pages)
+{
+ unsigned int nr_allocated = 0;
+
+ /*
+ * For order-0 pages we make use of the bulk allocator; if
+ * the page array ends up only partly populated (or not at
+ * all) due to failures, fall back to the single page
+ * allocator, which is more permissive.
+ */
+ if (!order)
+ nr_allocated = alloc_pages_bulk_array_node(
+ gfp, nid, nr_pages, pages);
+ else
+ /*
+ * Compound pages required for remap_vmalloc_page if
+ * high-order pages.
+ */
+ gfp |= __GFP_COMP;
+
+ /* High-order pages or fallback path if "bulk" fails. */
+ while (nr_allocated < nr_pages) {
+ struct page *page;
+ int i;
+
+ page = alloc_pages_node(nid, gfp, order);
+ if (unlikely(!page))
+ break;
+
+ /*
+ * Careful, we allocate and map page-order pages, but
+ * tracking is done per PAGE_SIZE page so as to keep the
+ * vm_struct APIs independent of the physical/mapped size.
+ */
+ for (i = 0; i < (1U << order); i++)
+ pages[nr_allocated + i] = page + i;
+
+ if (gfpflags_allow_blocking(gfp))
+ cond_resched();
+
+ nr_allocated += 1U << order;
+ }
+
+ return nr_allocated;
+}
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+ pgprot_t prot, unsigned int page_shift,
+ int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)area->addr;
+ unsigned long size = get_vm_area_size(area);
unsigned long array_size;
- unsigned int i;
- struct page **pages;
+ unsigned int nr_small_pages = size >> PAGE_SHIFT;
+ unsigned int page_order;
- array_size = (unsigned long)nr_pages * sizeof(struct page *);
+ array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+ area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
area->caller);
} else {
- pages = kmalloc_node(array_size, nested_gfp, node);
+ area->pages = kmalloc_node(array_size, nested_gfp, node);
}
- if (!pages) {
+ if (!area->pages) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, failed to allocated page array size %lu",
+ nr_small_pages * PAGE_SIZE, array_size);
free_vm_area(area);
return NULL;
}
- area->pages = pages;
- area->nr_pages = nr_pages;
+ set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
+ page_order = vm_area_page_order(area);
- for (i = 0; i < area->nr_pages; i++) {
- struct page *page;
+ area->nr_pages = vm_area_alloc_pages(gfp_mask, node,
+ page_order, nr_small_pages, area->pages);
- if (node == NUMA_NO_NODE)
- page = alloc_page(gfp_mask);
- else
- page = alloc_pages_node(node, gfp_mask, 0);
+ atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (unlikely(!page)) {
- /* Successfully allocated i pages, free them in __vfree() */
- area->nr_pages = i;
- atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- goto fail;
- }
- area->pages[i] = page;
- if (gfpflags_allow_blocking(gfp_mask))
- cond_resched();
+ /*
+ * If not enough pages were obtained to satisfy the
+ * allocation request, free whatever was allocated via __vfree().
+ */
+ if (area->nr_pages != nr_small_pages) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, page order %u, failed to allocate pages",
+ area->nr_pages * PAGE_SIZE, page_order);
+ goto fail;
}
- atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
- prot, pages) < 0)
+ if (vmap_pages_range(addr, addr + size, prot, area->pages,
+ page_shift) < 0) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, failed to map pages",
+ area->nr_pages * PAGE_SIZE);
goto fail;
+ }
return area->addr;
fail:
- warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure, allocated %ld of %ld bytes",
- (area->nr_pages*PAGE_SIZE), area->size);
__vfree(area->addr);
return NULL;
}
@@ -2563,19 +2899,54 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
struct vm_struct *area;
void *addr;
unsigned long real_size = size;
+ unsigned long real_align = align;
+ unsigned int shift = PAGE_SHIFT;
- size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > totalram_pages())
- goto fail;
+ if (WARN_ON_ONCE(!size))
+ return NULL;
- area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
- vm_flags, start, end, node, gfp_mask, caller);
- if (!area)
+ if ((size >> PAGE_SHIFT) > totalram_pages()) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, exceeds total pages",
+ real_size);
+ return NULL;
+ }
+
+ if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
+ arch_vmap_pmd_supported(prot)) {
+ unsigned long size_per_node;
+
+ /*
+ * Try huge pages. Only try for PAGE_KERNEL allocations;
+ * others, like modules, don't yet expect huge pages in
+ * their allocations because apply_to_page_range does not
+ * support them.
+ */
+
+ size_per_node = size;
+ if (node == NUMA_NO_NODE)
+ size_per_node /= num_online_nodes();
+ if (size_per_node >= PMD_SIZE) {
+ shift = PMD_SHIFT;
+ align = max(real_align, 1UL << shift);
+ size = ALIGN(real_size, 1UL << shift);
+ }
+ }
+
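To make the sizing heuristic concrete (assuming a 2MB PMD_SIZE and four online nodes): a 16MB NUMA_NO_NODE request gives size_per_node = 4MB >= PMD_SIZE, so shift becomes PMD_SHIFT and align/size are rounded up to 2MB, while a 6MB request gives size_per_node = 1.5MB and stays on the small-page path. If the huge attempt fails later on, the fail: path below drops back to shift = PAGE_SHIFT with the original align and size and retries via the again: label.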
+again:
+ area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
+ VM_UNINITIALIZED | vm_flags, start, end, node,
+ gfp_mask, caller);
+ if (!area) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc error: size %lu, vm_struct allocation failed",
+ real_size);
goto fail;
+ }
- addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+ addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
if (!addr)
- return NULL;
+ goto fail;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2584,13 +2955,19 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
*/
clear_vm_uninitialized_flag(area);
+ size = PAGE_ALIGN(size);
kmemleak_vmalloc(area, size, gfp_mask);
return addr;
fail:
- warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure: %lu bytes", real_size);
+ if (shift > PAGE_SHIFT) {
+ shift = PAGE_SHIFT;
+ align = real_align;
+ size = real_size;
+ goto again;
+ }
+
return NULL;
}
@@ -2655,6 +3032,23 @@ void *vmalloc(unsigned long size)
EXPORT_SYMBOL(vmalloc);
/**
+ * vmalloc_no_huge - allocate virtually contiguous memory using small pages
+ * @size: allocation size
+ *
+ * Allocate enough non-huge pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_no_huge(unsigned long size)
+{
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
+ NUMA_NO_NODE, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_no_huge);
+
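A short usage sketch (the buffer and the reason given are assumptions, not from this patch): callers that depend on PAGE_SIZE granularity of the kernel mapping, for example because they change protections on individual 4K pages later, can opt out of huge mappings explicitly.

        /* must stay 4K-mapped; protections are changed per page later */
        buf = vmalloc_no_huge(size);
        if (!buf)
                return -ENOMEM;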
+/**
* vzalloc - allocate virtually contiguous memory with zero fill
* @size: allocation size
*
@@ -2739,7 +3133,7 @@ EXPORT_SYMBOL(vzalloc_node);
* 64b systems should always have either DMA or DMA32 zones. For others
* GFP_DMA32 should do the right thing and use the normal zone.
*/
-#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif
/**
@@ -2797,15 +3191,12 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
/*
* To do safe access to this _mapped_ area, we need
* lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calles for this _debug_
+ * overhead of vmalloc()/vfree() calls for this _debug_
* interface, rarely used. Instead of that, we'll use
* kmap() and get small overhead in this access function.
*/
if (p) {
- /*
- * we can expect USER0 is not used (see vread/vwrite's
- * function description)
- */
+ /* We can expect USER0 is not used -- see vread() */
void *map = kmap_atomic(p);
memcpy(buf, map + offset, length);
kunmap_atomic(map);
@@ -2820,43 +3211,6 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
return copied;
}
-static int aligned_vwrite(char *buf, char *addr, unsigned long count)
-{
- struct page *p;
- int copied = 0;
-
- while (count) {
- unsigned long offset, length;
-
- offset = offset_in_page(addr);
- length = PAGE_SIZE - offset;
- if (length > count)
- length = count;
- p = vmalloc_to_page(addr);
- /*
- * To do safe access to this _mapped_ area, we need
- * lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calles for this _debug_
- * interface, rarely used. Instead of that, we'll use
- * kmap() and get small overhead in this access function.
- */
- if (p) {
- /*
- * we can expect USER0 is not used (see vread/vwrite's
- * function description)
- */
- void *map = kmap_atomic(p);
- memcpy(map + offset, buf, length);
- kunmap_atomic(map);
- }
- addr += length;
- buf += length;
- copied += length;
- count -= length;
- }
- return copied;
-}
-
/**
* vread() - read vmalloc area in a safe way.
* @buf: buffer for reading data
@@ -2875,7 +3229,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
* Note: In usual ops, vread() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
* This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
+ * any information, as /proc/kcore.
*
* Return: number of bytes for which addr and buf should be increased
* (same number as @count) or %0 if [addr...addr+count) doesn't
@@ -2894,7 +3248,10 @@ long vread(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
+ va = __find_vmap_area((unsigned long)addr);
+ if (!va)
+ goto finished;
+ list_for_each_entry_from(va, &vmap_area_list, list) {
if (!count)
break;
@@ -2937,80 +3294,6 @@ finished:
}
/**
- * vwrite() - write vmalloc area in a safe way.
- * @buf: buffer for source data
- * @addr: vm address.
- * @count: number of bytes to be read.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from a buffer to the given addr. If specified range of
- * [addr...addr+count) includes some valid address, data is copied from
- * proper area of @buf. If there are memory holes, no copy to hole.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vwrite() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
- *
- * Return: number of bytes for which addr and buf should be
- * increased (same number as @count) or %0 if [addr...addr+count)
- * doesn't include any intersection with valid vmalloc area
- */
-long vwrite(char *buf, char *addr, unsigned long count)
-{
- struct vmap_area *va;
- struct vm_struct *vm;
- char *vaddr;
- unsigned long n, buflen;
- int copied = 0;
-
- /* Don't allow overflow */
- if ((unsigned long) addr + count < count)
- count = -(unsigned long) addr;
- buflen = count;
-
- spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
- if (!count)
- break;
-
- if (!va->vm)
- continue;
-
- vm = va->vm;
- vaddr = (char *) vm->addr;
- if (addr >= vaddr + get_vm_area_size(vm))
- continue;
- while (addr < vaddr) {
- if (count == 0)
- goto finished;
- buf++;
- addr++;
- count--;
- }
- n = vaddr + get_vm_area_size(vm) - addr;
- if (n > count)
- n = count;
- if (!(vm->flags & VM_IOREMAP)) {
- aligned_vwrite(buf, addr, n);
- copied++;
- }
- buf += n;
- addr += n;
- count -= n;
- }
-finished:
- spin_unlock(&vmap_area_lock);
- if (!copied)
- return 0;
- return buflen;
-}
-
-/**
* remap_vmalloc_range_partial - map vmalloc pages to userspace
* @vma: vma to cover
* @uaddr: target user address to start at
@@ -3072,7 +3355,6 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
return 0;
}
-EXPORT_SYMBOL(remap_vmalloc_range_partial);
/**
* remap_vmalloc_range - map vmalloc pages to userspace
@@ -3450,6 +3732,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
struct vm_struct *vm;
@@ -3462,6 +3745,7 @@ bool vmalloc_dump_obj(void *object)
vm->nr_pages, (unsigned long)vm->addr, vm->caller);
return true;
}
+#endif
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 562e87cbd7a1..d7c3cb8688dd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -185,39 +185,181 @@ static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG
-/*
- * We allow subsystems to populate their shrinker-related
- * LRU lists before register_shrinker_prepared() is called
- * for the shrinker, since we don't want to impose
- * restrictions on their internal registration order.
- * In this case shrink_slab_memcg() may find corresponding
- * bit is set in the shrinkers map.
- *
- * This value is used by the function to detect registering
- * shrinkers and to skip do_shrink_slab() calls for them.
- */
-#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+static int shrinker_nr_max;
+
+/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
+static inline int shrinker_map_size(int nr_items)
+{
+ return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
+}
+
+static inline int shrinker_defer_size(int nr_items)
+{
+ return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
+}
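Worked example (assuming 64-bit, so BITS_PER_LONG == 64, and shrinker_nr_max == 70): shrinker_map_size(70) = DIV_ROUND_UP(70, 64) * sizeof(unsigned long) = 2 * 8 = 16 bytes, and shrinker_defer_size(70) = round_up(70, 64) * sizeof(atomic_long_t) = 128 * 8 = 1024 bytes. Each per-node shrinker_info is then a single allocation of sizeof(*info) + 1040 bytes, with info->nr_deferred starting immediately after the struct and info->map starting defer_size bytes after that.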
+
+static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
+ int nid)
+{
+ return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+ lockdep_is_held(&shrinker_rwsem));
+}
+
+static int expand_one_shrinker_info(struct mem_cgroup *memcg,
+ int map_size, int defer_size,
+ int old_map_size, int old_defer_size)
+{
+ struct shrinker_info *new, *old;
+ struct mem_cgroup_per_node *pn;
+ int nid;
+ int size = map_size + defer_size;
+
+ for_each_node(nid) {
+ pn = memcg->nodeinfo[nid];
+ old = shrinker_info_protected(memcg, nid);
+ /* Not yet online memcg */
+ if (!old)
+ return 0;
+
+ new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+ if (!new)
+ return -ENOMEM;
+
+ new->nr_deferred = (atomic_long_t *)(new + 1);
+ new->map = (void *)new->nr_deferred + defer_size;
+
+ /* map: set all old bits, clear all new bits */
+ memset(new->map, (int)0xff, old_map_size);
+ memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
+ /* nr_deferred: copy old values, clear all new values */
+ memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
+ memset((void *)new->nr_deferred + old_defer_size, 0,
+ defer_size - old_defer_size);
+
+ rcu_assign_pointer(pn->shrinker_info, new);
+ kvfree_rcu(old, rcu);
+ }
+
+ return 0;
+}
+
+void free_shrinker_info(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_node *pn;
+ struct shrinker_info *info;
+ int nid;
+
+ for_each_node(nid) {
+ pn = memcg->nodeinfo[nid];
+ info = rcu_dereference_protected(pn->shrinker_info, true);
+ kvfree(info);
+ rcu_assign_pointer(pn->shrinker_info, NULL);
+ }
+}
+
+int alloc_shrinker_info(struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+ int nid, size, ret = 0;
+ int map_size, defer_size = 0;
+
+ down_write(&shrinker_rwsem);
+ map_size = shrinker_map_size(shrinker_nr_max);
+ defer_size = shrinker_defer_size(shrinker_nr_max);
+ size = map_size + defer_size;
+ for_each_node(nid) {
+ info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
+ if (!info) {
+ free_shrinker_info(memcg);
+ ret = -ENOMEM;
+ break;
+ }
+ info->nr_deferred = (atomic_long_t *)(info + 1);
+ info->map = (void *)info->nr_deferred + defer_size;
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
+ }
+ up_write(&shrinker_rwsem);
+
+ return ret;
+}
+
+static inline bool need_expand(int nr_max)
+{
+ return round_up(nr_max, BITS_PER_LONG) >
+ round_up(shrinker_nr_max, BITS_PER_LONG);
+}
+
+static int expand_shrinker_info(int new_id)
+{
+ int ret = 0;
+ int new_nr_max = new_id + 1;
+ int map_size, defer_size = 0;
+ int old_map_size, old_defer_size = 0;
+ struct mem_cgroup *memcg;
+
+ if (!need_expand(new_nr_max))
+ goto out;
+
+ if (!root_mem_cgroup)
+ goto out;
+
+ lockdep_assert_held(&shrinker_rwsem);
+
+ map_size = shrinker_map_size(new_nr_max);
+ defer_size = shrinker_defer_size(new_nr_max);
+ old_map_size = shrinker_map_size(shrinker_nr_max);
+ old_defer_size = shrinker_defer_size(shrinker_nr_max);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ ret = expand_one_shrinker_info(memcg, map_size, defer_size,
+ old_map_size, old_defer_size);
+ if (ret) {
+ mem_cgroup_iter_break(NULL, memcg);
+ goto out;
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+out:
+ if (!ret)
+ shrinker_nr_max = new_nr_max;
+
+ return ret;
+}
+
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+ if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+ struct shrinker_info *info;
+
+ rcu_read_lock();
+ info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ /* Pairs with smp mb in shrink_slab() */
+ smp_mb__before_atomic();
+ set_bit(shrinker_id, info->map);
+ rcu_read_unlock();
+ }
+}
static DEFINE_IDR(shrinker_idr);
-static int shrinker_nr_max;
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
int id, ret = -ENOMEM;
+ if (mem_cgroup_disabled())
+ return -ENOSYS;
+
down_write(&shrinker_rwsem);
/* This may call shrinker, so it must use down_read_trylock() */
- id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
if (id >= shrinker_nr_max) {
- if (memcg_expand_shrinker_maps(id)) {
+ if (expand_shrinker_info(id)) {
idr_remove(&shrinker_idr, id);
goto unlock;
}
-
- shrinker_nr_max = id + 1;
}
shrinker->id = id;
ret = 0;
@@ -232,9 +374,51 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
BUG_ON(id < 0);
- down_write(&shrinker_rwsem);
+ lockdep_assert_held(&shrinker_rwsem);
+
idr_remove(&shrinker_idr, id);
- up_write(&shrinker_rwsem);
+}
+
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+
+ info = shrinker_info_protected(memcg, nid);
+ return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ struct shrinker_info *info;
+
+ info = shrinker_info_protected(memcg, nid);
+ return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+}
+
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+ int i, nid;
+ long nr;
+ struct mem_cgroup *parent;
+ struct shrinker_info *child_info, *parent_info;
+
+ parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ parent = root_mem_cgroup;
+
+ /* Prevent concurrent shrinker_info expansion */
+ down_read(&shrinker_rwsem);
+ for_each_node(nid) {
+ child_info = shrinker_info_protected(memcg, nid);
+ parent_info = shrinker_info_protected(parent, nid);
+ for (i = 0; i < shrinker_nr_max; i++) {
+ nr = atomic_long_read(&child_info->nr_deferred[i]);
+ atomic_long_add(nr, &parent_info->nr_deferred[i]);
+ }
+ }
+ up_read(&shrinker_rwsem);
}
static bool cgroup_reclaim(struct scan_control *sc)
@@ -268,13 +452,25 @@ static bool writeback_throttling_sane(struct scan_control *sc)
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
- return 0;
+ return -ENOSYS;
}
static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+ struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
static bool cgroup_reclaim(struct scan_control *sc)
{
return false;
@@ -286,6 +482,39 @@ static bool writeback_throttling_sane(struct scan_control *sc)
}
#endif
+static long xchg_nr_deferred(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int nid = sc->nid;
+
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
+ if (sc->memcg &&
+ (shrinker->flags & SHRINKER_MEMCG_AWARE))
+ return xchg_nr_deferred_memcg(nid, shrinker,
+ sc->memcg);
+
+ return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+}
+
+static long add_nr_deferred(long nr, struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int nid = sc->nid;
+
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
+ if (sc->memcg &&
+ (shrinker->flags & SHRINKER_MEMCG_AWARE))
+ return add_nr_deferred_memcg(nr, nid, shrinker,
+ sc->memcg);
+
+ return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
+}
+
/*
* This misses isolated pages which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
@@ -335,8 +564,18 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
*/
int prealloc_shrinker(struct shrinker *shrinker)
{
- unsigned int size = sizeof(*shrinker->nr_deferred);
+ unsigned int size;
+ int err;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ err = prealloc_memcg_shrinker(shrinker);
+ if (err != -ENOSYS)
+ return err;
+ shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
+ }
+
+ size = sizeof(*shrinker->nr_deferred);
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -344,26 +583,17 @@ int prealloc_shrinker(struct shrinker *shrinker)
if (!shrinker->nr_deferred)
return -ENOMEM;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- if (prealloc_memcg_shrinker(shrinker))
- goto free_deferred;
- }
-
return 0;
-
-free_deferred:
- kfree(shrinker->nr_deferred);
- shrinker->nr_deferred = NULL;
- return -ENOMEM;
}
void free_prealloced_shrinker(struct shrinker *shrinker)
{
- if (!shrinker->nr_deferred)
- return;
-
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ down_write(&shrinker_rwsem);
unregister_memcg_shrinker(shrinker);
+ up_write(&shrinker_rwsem);
+ return;
+ }
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
@@ -373,10 +603,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
- idr_replace(&shrinker_idr, shrinker, shrinker->id);
-#endif
+ shrinker->flags |= SHRINKER_REGISTERED;
up_write(&shrinker_rwsem);
}
@@ -396,13 +623,16 @@ EXPORT_SYMBOL(register_shrinker);
*/
void unregister_shrinker(struct shrinker *shrinker)
{
- if (!shrinker->nr_deferred)
+ if (!(shrinker->flags & SHRINKER_REGISTERED))
return;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
- unregister_memcg_shrinker(shrinker);
+
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
+ shrinker->flags &= ~SHRINKER_REGISTERED;
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
up_write(&shrinker_rwsem);
+
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
@@ -419,14 +649,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
long freeable;
long nr;
long new_nr;
- int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
long scanned = 0, next_deferred;
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- nid = 0;
-
freeable = shrinker->count_objects(shrinker, shrinkctl);
if (freeable == 0 || freeable == SHRINK_EMPTY)
return freeable;
@@ -436,9 +662,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* and zero it so that other concurrent shrinker invocations
* don't also do this scanning work.
*/
- nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+ nr = xchg_nr_deferred(shrinker, shrinkctl);
- total_scan = nr;
if (shrinker->seeks) {
delta = freeable >> priority;
delta *= 4;
@@ -452,37 +677,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
delta = freeable / 2;
}
+ total_scan = nr >> priority;
total_scan += delta;
- if (total_scan < 0) {
- pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
- shrinker->scan_objects, total_scan);
- total_scan = freeable;
- next_deferred = nr;
- } else
- next_deferred = total_scan;
-
- /*
- * We need to avoid excessive windup on filesystem shrinkers
- * due to large numbers of GFP_NOFS allocations causing the
- * shrinkers to return -1 all the time. This results in a large
- * nr being built up so when a shrink that can do some work
- * comes along it empties the entire cache due to nr >>>
- * freeable. This is bad for sustaining a working set in
- * memory.
- *
- * Hence only allow the shrinker to scan the entire cache when
- * a large delta change is calculated directly.
- */
- if (delta < freeable / 4)
- total_scan = min(total_scan, freeable / 2);
-
- /*
- * Avoid risking looping forever due to too large nr value:
- * never try to free more than twice the estimate number of
- * freeable entries.
- */
- if (total_scan > freeable * 2)
- total_scan = freeable * 2;
+ total_scan = min(total_scan, (2 * freeable));
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
freeable, delta, total_scan, priority);
@@ -521,22 +718,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
cond_resched();
}
- if (next_deferred >= scanned)
- next_deferred -= scanned;
- else
- next_deferred = 0;
+ /*
+ * The deferred work is increased by any new work (delta) that wasn't
+ * done, and decreased by the old deferred work that was done now.
+ *
+ * It is capped at two times the number of freeable items.
+ */
+ next_deferred = max_t(long, (nr + delta - scanned), 0);
+ next_deferred = min(next_deferred, (2 * freeable));
+
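A worked example with illustrative numbers: with nr = 100 previously deferred items, delta = 40 of new work, scanned = 30 and freeable = 60, next_deferred = max(100 + 40 - 30, 0) = 110, which the cap then limits to at most 2 * 60 = 120, so 110 items are carried over to the next invocation.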
/*
* move the unused scan count back into the shrinker in a
- * manner that handles concurrent updates. If we exhausted the
- * scan, there is no need to do an update.
+ * manner that handles concurrent updates.
*/
- if (next_deferred > 0)
- new_nr = atomic_long_add_return(next_deferred,
- &shrinker->nr_deferred[nid]);
- else
- new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
- trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
+ trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
return freed;
}
@@ -544,7 +741,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
- struct memcg_shrinker_map *map;
+ struct shrinker_info *info;
unsigned long ret, freed = 0;
int i;
@@ -554,12 +751,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
if (!down_read_trylock(&shrinker_rwsem))
return 0;
- map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
- true);
- if (unlikely(!map))
+ info = shrinker_info_protected(memcg, nid);
+ if (unlikely(!info))
goto unlock;
- for_each_set_bit(i, map->map, shrinker_nr_max) {
+ for_each_set_bit(i, info->map, shrinker_nr_max) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -568,9 +764,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
shrinker = idr_find(&shrinker_idr, i);
- if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+ if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
if (!shrinker)
- clear_bit(i, map->map);
+ clear_bit(i, info->map);
continue;
}
@@ -581,7 +777,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
- clear_bit(i, map->map);
+ clear_bit(i, info->map);
/*
* After the shrinker reported that it had no objects to
* free, but before we cleared the corresponding bit in
@@ -590,7 +786,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
* case, we invoke the shrinker one more time and reset
* the bit if it reports that it is not empty anymore.
* The memory barrier here pairs with the barrier in
- * memcg_set_shrinker_bit():
+ * set_shrinker_bit():
*
* list_lru_add() shrink_slab_memcg()
* list_add_tail() clear_bit()
@@ -602,7 +798,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
if (ret == SHRINK_EMPTY)
ret = 0;
else
- memcg_set_shrinker_bit(memcg, nid, i);
+ set_shrinker_bit(memcg, nid, i);
}
freed += ret;
@@ -1507,8 +1703,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
LIST_HEAD(clean_pages);
list_for_each_entry_safe(page, next, page_list, lru) {
- if (page_is_file_lru(page) && !PageDirty(page) &&
- !__PageMovable(page) && !PageUnevictable(page)) {
+ if (!PageHuge(page) && page_is_file_lru(page) &&
+ !PageDirty(page) && !__PageMovable(page) &&
+ !PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
@@ -1818,8 +2015,8 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
*
* Returns the number of pages moved to the given lruvec.
*/
-static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list)
+static unsigned int move_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
{
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
@@ -1866,7 +2063,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
* All pages were isolated from the same lruvec (and isolation
* inhibits memcg migration).
*/
- VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
+ VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
add_page_to_lru_list(page, lruvec);
nr_pages = thp_nr_pages(page);
nr_moved += nr_pages;
@@ -1899,7 +2096,7 @@ static int current_may_throttle(void)
* shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
*/
-static noinline_for_stack unsigned long
+static unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, enum lru_list lru)
{
@@ -3525,6 +3722,38 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
return sc->nr_scanned >= sc->nr_to_reclaim;
}
+/* Page allocator PCP high watermark is lowered if reclaim is active. */
+static inline void
+update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
+{
+ int i;
+ struct zone *zone;
+
+ for (i = 0; i <= highest_zoneidx; i++) {
+ zone = pgdat->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ if (active)
+ set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
+ else
+ clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
+ }
+}
+
+static inline void
+set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
+{
+ update_reclaim_active(pgdat, highest_zoneidx, true);
+}
+
+static inline void
+clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
+{
+ update_reclaim_active(pgdat, highest_zoneidx, false);
+}
+
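The flag is consumed on the allocator side when sizing per-cpu lists; the exact clamp lives in the companion page allocator changes, so the check below is only an illustrative assumption:

        if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
                high = min(high, READ_ONCE(pcp->batch) << 2);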
/*
* For kswapd, balance_pgdat() will reclaim pages across a node from zones
* that are eligible for use by the caller until at least one zone is
@@ -3577,6 +3806,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
boosted = nr_boost_reclaim;
restart:
+ set_reclaim_active(pgdat, highest_zoneidx);
sc.priority = DEF_PRIORITY;
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
@@ -3710,6 +3940,8 @@ restart:
pgdat->kswapd_failures++;
out:
+ clear_reclaim_active(pgdat, highest_zoneidx);
+
/* If reclaim was boosted, account for the reclaim done in this pass */
if (boosted) {
unsigned long flags;
@@ -3862,7 +4094,7 @@ static int kswapd(void *p)
{
unsigned int alloc_order, reclaim_order;
unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
- pg_data_t *pgdat = (pg_data_t*)p;
+ pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -4086,14 +4318,6 @@ module_init(kswapd_init)
int node_reclaim_mode __read_mostly;
/*
- * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
- * ABI. New bits are OK, but existing bits can never change.
- */
-#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
-#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
-
-/*
* Priority for NODE_RECLAIM. This determines the fraction of pages
* of a node considered for each zone_reclaim. 4 scans 1/16th of
* a zone.
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 74b2c374b86c..b0534e068166 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -31,8 +31,6 @@
#include "internal.h"
-#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
-
#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
@@ -41,11 +39,12 @@ static void zero_zone_numa_counters(struct zone *zone)
{
int item, cpu;
- for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
- atomic_long_set(&zone->vm_numa_stat[item], 0);
- for_each_online_cpu(cpu)
- per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
+ atomic_long_set(&zone->vm_numa_event[item], 0);
+ for_each_online_cpu(cpu) {
+ per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
= 0;
+ }
}
}
@@ -63,8 +62,8 @@ static void zero_global_numa_counters(void)
{
int item;
- for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
- atomic_long_set(&vm_numa_stat[item], 0);
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ atomic_long_set(&vm_numa_event[item], 0);
}
static void invalid_numa_statistics(void)
@@ -161,10 +160,9 @@ void vm_events_fold_cpu(int cpu)
* vm_stat contains the global counters
*/
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
-atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
-EXPORT_SYMBOL(vm_numa_stat);
EXPORT_SYMBOL(vm_node_stat);
#ifdef CONFIG_SMP
@@ -266,7 +264,7 @@ void refresh_zone_stat_thresholds(void)
for_each_online_cpu(cpu) {
int pgdat_threshold;
- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
= threshold;
/* Base nodestat threshold on the largest populated zone. */
@@ -303,7 +301,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
threshold = (*calculate_pressure)(zone);
for_each_online_cpu(cpu)
- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
= threshold;
}
}
@@ -316,7 +314,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
long delta)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
long t;
@@ -389,7 +387,7 @@ EXPORT_SYMBOL(__mod_node_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
@@ -435,7 +433,7 @@ EXPORT_SYMBOL(__inc_node_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
@@ -495,7 +493,7 @@ EXPORT_SYMBOL(__dec_node_page_state);
static inline void mod_zone_state(struct zone *zone,
enum zone_stat_item item, long delta, int overstep_mode)
{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
s8 __percpu *p = pcp->vm_stat_diff + item;
long o, n, t, z;
@@ -706,8 +704,7 @@ EXPORT_SYMBOL(dec_node_page_state);
* Fold a differential into the global counters.
* Returns the number of counters updated.
*/
-#ifdef CONFIG_NUMA
-static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
+static int fold_diff(int *zone_diff, int *node_diff)
{
int i;
int changes = 0;
@@ -718,12 +715,6 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
changes++;
}
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- if (numa_diff[i]) {
- atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
- changes++;
- }
-
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
if (node_diff[i]) {
atomic_long_add(node_diff[i], &vm_node_stat[i]);
@@ -731,26 +722,34 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
}
return changes;
}
-#else
-static int fold_diff(int *zone_diff, int *node_diff)
+
+#ifdef CONFIG_NUMA
+static void fold_vm_zone_numa_events(struct zone *zone)
{
- int i;
- int changes = 0;
+ unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
+ int cpu;
+ enum numa_stat_item item;
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (zone_diff[i]) {
- atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
- changes++;
- }
+ for_each_online_cpu(cpu) {
+ struct per_cpu_zonestat *pzstats;
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- if (node_diff[i]) {
- atomic_long_add(node_diff[i], &vm_node_stat[i]);
- changes++;
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
}
- return changes;
+
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ zone_numa_event_add(zone_numa_events[item], zone, item);
+}
+
+void fold_vm_numa_events(void)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ fold_vm_zone_numa_events(zone);
}
-#endif /* CONFIG_NUMA */
+#endif
/*
* Update the zone counters for the current cpu.
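The hunk above replaces the old per-CPU s8/u16 NUMA stat diffs, which needed threshold handling, with plain unsigned long per-CPU event counters that are only folded into the zone and global atomics on demand, via the xchg() in fold_vm_zone_numa_events(). The counting side and the fold target live in the vmstat/mmzone headers rather than in this hunk; the sketch below is an assumption about their shape, reusing only names that appear in the calls above (zone_numa_event_add(), zone->per_cpu_zonestats, vm_numa_event).

/*
 * Sketch only: these helpers are not part of this hunk; the bodies are
 * an assumption about how the calls above fit together.
 */
static inline void zone_numa_event_add(long x, struct zone *zone,
                                       enum numa_stat_item item)
{
        /* Fold x into the per-zone and the global NUMA event counters. */
        atomic_long_add(x, &zone->vm_numa_event[item]);
        atomic_long_add(x, &vm_numa_event[item]);
}

static inline void __count_numa_event(struct zone *zone,
                                      enum numa_stat_item item)
{
        struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;

        /*
         * No threshold handling any more: the raw per-CPU counter is only
         * drained when the stats are read, via fold_vm_zone_numa_events().
         */
        raw_cpu_inc(pzstats->vm_numa_event[item]);
}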
@@ -774,41 +773,30 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
struct zone *zone;
int i;
int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
-#ifdef CONFIG_NUMA
- int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
-#endif
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
int changes = 0;
for_each_populated_zone(zone) {
- struct per_cpu_pageset __percpu *p = zone->pageset;
+ struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+#ifdef CONFIG_NUMA
+ struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
+#endif
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
int v;
- v = this_cpu_xchg(p->vm_stat_diff[i], 0);
+ v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
if (v) {
atomic_long_add(v, &zone->vm_stat[i]);
global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
/* 3 seconds idle till flush */
- __this_cpu_write(p->expire, 3);
+ __this_cpu_write(pcp->expire, 3);
#endif
}
}
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
- int v;
-
- v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
- if (v) {
-
- atomic_long_add(v, &zone->vm_numa_stat[i]);
- global_numa_diff[i] += v;
- __this_cpu_write(p->expire, 3);
- }
- }
if (do_pagesets) {
cond_resched();
@@ -819,23 +807,23 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
* Check if there are pages remaining in this pageset
* if not then there is nothing to expire.
*/
- if (!__this_cpu_read(p->expire) ||
- !__this_cpu_read(p->pcp.count))
+ if (!__this_cpu_read(pcp->expire) ||
+ !__this_cpu_read(pcp->count))
continue;
/*
* We never drain zones local to this processor.
*/
if (zone_to_nid(zone) == numa_node_id()) {
- __this_cpu_write(p->expire, 0);
+ __this_cpu_write(pcp->expire, 0);
continue;
}
- if (__this_cpu_dec_return(p->expire))
+ if (__this_cpu_dec_return(pcp->expire))
continue;
- if (__this_cpu_read(p->pcp.count)) {
- drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+ if (__this_cpu_read(pcp->count)) {
+ drain_zone_pages(zone, this_cpu_ptr(pcp));
changes++;
}
}
@@ -856,12 +844,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
}
}
-#ifdef CONFIG_NUMA
- changes += fold_diff(global_zone_diff, global_numa_diff,
- global_node_diff);
-#else
changes += fold_diff(global_zone_diff, global_node_diff);
-#endif
return changes;
}
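The refresh_cpu_vm_stats() hunks above also reflect the split of the old struct per_cpu_pageset into two per-CPU structures: struct per_cpu_pages (zone->per_cpu_pageset: the page lists plus count/high/batch and the NUMA expire countdown) and struct per_cpu_zonestat (zone->per_cpu_zonestats: the statistics side). A rough sketch, showing only the members this diff exercises; the real definitions in the mmzone header carry more fields and the exact types are approximate.

/* Sketch: only the members touched in this diff, types abbreviated. */
struct per_cpu_pages {
        int count;              /* number of pages on the pcp lists */
        int high;               /* high watermark, emptying needed */
        int batch;              /* chunk size for buddy add/remove */
#ifdef CONFIG_NUMA
        int expire;             /* when it hits 0, remote pcp lists are drained */
#endif
        /* ... per-migratetype page lists ... */
};

struct per_cpu_zonestat {
#ifdef CONFIG_SMP
        s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
        s8 stat_threshold;
#endif
#ifdef CONFIG_NUMA
        unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#endif
};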
@@ -876,36 +859,33 @@ void cpu_vm_stats_fold(int cpu)
struct zone *zone;
int i;
int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
-#ifdef CONFIG_NUMA
- int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
-#endif
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
for_each_populated_zone(zone) {
- struct per_cpu_pageset *p;
+ struct per_cpu_zonestat *pzstats;
- p = per_cpu_ptr(zone->pageset, cpu);
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (p->vm_stat_diff[i]) {
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ if (pzstats->vm_stat_diff[i]) {
int v;
- v = p->vm_stat_diff[i];
- p->vm_stat_diff[i] = 0;
+ v = pzstats->vm_stat_diff[i];
+ pzstats->vm_stat_diff[i] = 0;
atomic_long_add(v, &zone->vm_stat[i]);
global_zone_diff[i] += v;
}
-
+ }
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- if (p->vm_numa_stat_diff[i]) {
- int v;
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
+ if (pzstats->vm_numa_event[i]) {
+ unsigned long v;
- v = p->vm_numa_stat_diff[i];
- p->vm_numa_stat_diff[i] = 0;
- atomic_long_add(v, &zone->vm_numa_stat[i]);
- global_numa_diff[i] += v;
+ v = pzstats->vm_numa_event[i];
+ pzstats->vm_numa_event[i] = 0;
+ zone_numa_event_add(v, zone, i);
}
+ }
#endif
}
@@ -925,58 +905,39 @@ void cpu_vm_stats_fold(int cpu)
}
}
-#ifdef CONFIG_NUMA
- fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
-#else
fold_diff(global_zone_diff, global_node_diff);
-#endif
}
/*
* this is only called if !populated_zone(zone), which implies no other users of
- * pset->vm_stat_diff[] exsist.
+ * pset->vm_stat_diff[] exist.
*/
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
{
+ unsigned long v;
int i;
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (pset->vm_stat_diff[i]) {
- int v = pset->vm_stat_diff[i];
- pset->vm_stat_diff[i] = 0;
- atomic_long_add(v, &zone->vm_stat[i]);
- atomic_long_add(v, &vm_zone_stat[i]);
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ if (pzstats->vm_stat_diff[i]) {
+ v = pzstats->vm_stat_diff[i];
+ pzstats->vm_stat_diff[i] = 0;
+ zone_page_state_add(v, zone, i);
}
+ }
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- if (pset->vm_numa_stat_diff[i]) {
- int v = pset->vm_numa_stat_diff[i];
-
- pset->vm_numa_stat_diff[i] = 0;
- atomic_long_add(v, &zone->vm_numa_stat[i]);
- atomic_long_add(v, &vm_numa_stat[i]);
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
+ if (pzstats->vm_numa_event[i]) {
+ v = pzstats->vm_numa_event[i];
+ pzstats->vm_numa_event[i] = 0;
+ zone_numa_event_add(v, zone, i);
}
+ }
#endif
}
#endif
#ifdef CONFIG_NUMA
-void __inc_numa_state(struct zone *zone,
- enum numa_stat_item item)
-{
- struct per_cpu_pageset __percpu *pcp = zone->pageset;
- u16 __percpu *p = pcp->vm_numa_stat_diff + item;
- u16 v;
-
- v = __this_cpu_inc_return(*p);
-
- if (unlikely(v > NUMA_STATS_THRESHOLD)) {
- zone_numa_state_add(v, zone, item);
- __this_cpu_write(*p, 0);
- }
-}
-
/*
* Determine the per node value of a stat item. This function
* is called frequently in a NUMA machine, so try to be as
@@ -995,19 +956,16 @@ unsigned long sum_zone_node_page_state(int node,
return count;
}
-/*
- * Determine the per node value of a numa stat item. To avoid deviation,
- * the per cpu stat number in vm_numa_stat_diff[] is also included.
- */
-unsigned long sum_zone_numa_state(int node,
+/* Determine the per node value of a numa stat item. */
+unsigned long sum_zone_numa_event_state(int node,
enum numa_stat_item item)
{
struct zone *zones = NODE_DATA(node)->node_zones;
- int i;
unsigned long count = 0;
+ int i;
for (i = 0; i < MAX_NR_ZONES; i++)
- count += zone_numa_state_snapshot(zones + i, item);
+ count += zone_numa_event_state(zones + i, item);
return count;
}
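On the read side, sum_zone_numa_event_state() no longer folds in per-CPU deltas: exact snapshots are not promised for the NUMA events, and callers that want reasonably fresh numbers (such as vmstat_start() further down) call fold_vm_numa_events() first. The helpers used here are presumably simple atomic reads along these lines (a sketch, not part of this hunk):

static inline unsigned long zone_numa_event_state(struct zone *zone,
                                                  enum numa_stat_item item)
{
        return atomic_long_read(&zone->vm_numa_event[item]);
}

static inline unsigned long global_numa_event_state(enum numa_stat_item item)
{
        return atomic_long_read(&vm_numa_event[item]);
}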
@@ -1313,6 +1271,10 @@ const char * const vmstat_text[] = {
"htlb_buddy_alloc_success",
"htlb_buddy_alloc_fail",
#endif
+#ifdef CONFIG_CMA
+ "cma_alloc_success",
+ "cma_alloc_fail",
+#endif
"unevictable_pgs_culled",
"unevictable_pgs_scanned",
"unevictable_pgs_rescued",
@@ -1365,6 +1327,10 @@ const char * const vmstat_text[] = {
"swap_ra",
"swap_ra_hit",
#endif
+#ifdef CONFIG_X86
+ "direct_map_level2_splits",
+ "direct_map_level3_splits",
+#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
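The new vmstat_text entries surface as lines in /proc/vmstat: cma_alloc_success/cma_alloc_fail with CONFIG_CMA, and direct_map_level2_splits/direct_map_level3_splits on x86. A purely illustrative user-space snippet to pick them out:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f) {
                perror("/proc/vmstat");
                return 1;
        }
        /* Print only the counters added by the hunks above. */
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "cma_alloc_", 10) ||
                    !strncmp(line, "direct_map_level", 16))
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}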
@@ -1678,28 +1644,30 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone_page_state(zone, i));
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
- zone_numa_state_snapshot(zone, i));
+ zone_numa_event_state(zone, i));
#endif
seq_printf(m, "\n pagesets");
for_each_online_cpu(i) {
- struct per_cpu_pageset *pageset;
+ struct per_cpu_pages *pcp;
+ struct per_cpu_zonestat __maybe_unused *pzstats;
- pageset = per_cpu_ptr(zone->pageset, i);
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
seq_printf(m,
"\n cpu: %i"
"\n count: %i"
"\n high: %i"
"\n batch: %i",
i,
- pageset->pcp.count,
- pageset->pcp.high,
- pageset->pcp.batch);
+ pcp->count,
+ pcp->high,
+ pcp->batch);
#ifdef CONFIG_SMP
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
seq_printf(m, "\n vm stats threshold: %d",
- pageset->stat_threshold);
+ pzstats->stat_threshold);
#endif
}
seq_printf(m,
@@ -1732,7 +1700,7 @@ static const struct seq_operations zoneinfo_op = {
};
#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
- NR_VM_NUMA_STAT_ITEMS + \
+ NR_VM_NUMA_EVENT_ITEMS + \
NR_VM_NODE_STAT_ITEMS + \
NR_VM_WRITEBACK_STAT_ITEMS + \
(IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
@@ -1747,6 +1715,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
return NULL;
BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
+ fold_vm_numa_events();
v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
m->private = v;
if (!v)
@@ -1756,9 +1725,9 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
v += NR_VM_ZONE_STAT_ITEMS;
#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
- v[i] = global_numa_state(i);
- v += NR_VM_NUMA_STAT_ITEMS;
+ for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
+ v[i] = global_numa_event_state(i);
+ v += NR_VM_NUMA_EVENT_ITEMS;
#endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
@@ -1854,25 +1823,34 @@ int vmstat_refresh(struct ctl_table *table, int write,
if (err)
return err;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ /*
+ * Skip checking stats known to go negative occasionally.
+ */
+ switch (i) {
+ case NR_ZONE_WRITE_PENDING:
+ case NR_FREE_CMA_PAGES:
+ continue;
+ }
val = atomic_long_read(&vm_zone_stat[i]);
if (val < 0) {
pr_warn("%s: %s %ld\n",
__func__, zone_stat_name(i), val);
- err = -EINVAL;
}
}
-#ifdef CONFIG_NUMA
- for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
- val = atomic_long_read(&vm_numa_stat[i]);
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ /*
+ * Skip checking stats known to go negative occasionally.
+ */
+ switch (i) {
+ case NR_WRITEBACK:
+ continue;
+ }
+ val = atomic_long_read(&vm_node_stat[i]);
if (val < 0) {
pr_warn("%s: %s %ld\n",
- __func__, numa_stat_name(i), val);
- err = -EINVAL;
+ __func__, node_stat_name(i), val);
}
}
-#endif
- if (err)
- return err;
if (write)
*ppos += *lenp;
else
@@ -1910,19 +1888,16 @@ static bool need_update(int cpu)
struct zone *zone;
for_each_populated_zone(zone) {
- struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
+ struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
struct per_cpu_nodestat *n;
+
/*
* The fast way of checking if there are any vmstat diffs.
*/
- if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
- sizeof(p->vm_stat_diff[0])))
- return true;
-#ifdef CONFIG_NUMA
- if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
- sizeof(p->vm_numa_stat_diff[0])))
+ if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
+ sizeof(pzstats->vm_stat_diff[0])))
return true;
-#endif
+
if (last_pgdat == zone->zone_pgdat)
continue;
last_pgdat = zone->zone_pgdat;
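The vmstat_refresh() hunk above also changes behaviour: counters known to go transiently negative (NR_ZONE_WRITE_PENDING, NR_FREE_CMA_PAGES, NR_WRITEBACK) are skipped, and a negative value elsewhere now only produces a pr_warn() instead of failing the write with -EINVAL. vmstat_refresh() is the handler behind /proc/sys/vm/stat_refresh; a small illustrative user-space sketch that forces the per-CPU deltas to be folded before sampling /proc/vmstat:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/sys/vm/stat_refresh", O_WRONLY);

        if (fd < 0) {
                perror("stat_refresh");
                return 1;
        }
        /* Any write triggers the refresh; the written value is ignored. */
        if (write(fd, "1\n", 2) < 0)
                perror("write");
        close(fd);
        return 0;
}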
diff --git a/mm/workingset.c b/mm/workingset.c
index cd39902c1062..4f7a306ce75a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -408,7 +408,7 @@ void workingset_activation(struct page *page)
memcg = page_memcg_rcu(page);
if (!mem_cgroup_disabled() && !memcg)
goto out;
- lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ lruvec = mem_cgroup_page_lruvec(page);
workingset_age_nonresident(lruvec, thp_nr_pages(page));
out:
rcu_read_unlock();
@@ -554,7 +554,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out_invalid;
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
- mapping->nrexceptional -= node->nr_values;
xa_delete_node(node, workingset_update_node);
__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
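mem_cgroup_page_lruvec() now takes just the page; the pgdat argument was redundant because it can be derived from the page itself. The second hunk stops maintaining mapping->nrexceptional here. The new helper is defined in the memcontrol header, not in this diff; a sketch of its assumed shape:

/* Sketch (assumption): the lruvec is derived from the page alone. */
static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
{
        return mem_cgroup_lruvec(page_memcg(page), page_pgdat(page));
}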
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 9d889ad2bb86..7fe7adaaad01 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -391,7 +391,7 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool)
{
if (pool->inode)
iput(pool->inode);
- }
+}
/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
diff --git a/mm/zpool.c b/mm/zpool.c
index 5ed71207ced7..6d9ed48141e5 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -336,7 +336,7 @@ int zpool_shrink(struct zpool *zpool, unsigned int pages,
* This may hold locks, disable interrupts, and/or preemption,
* and the zpool_unmap_handle() must be called to undo those
* actions. The code that uses the mapped handle should complete
- * its operatons on the mapped handle memory quickly and unmap
+ * its operations on the mapped handle memory quickly and unmap
* as soon as possible. As the implementation may use per-cpu
* data, multiple handles should not be mapped concurrently on
* any cpu.
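The comment being fixed spells out the zpool_map_handle() contract: the mapping may take locks or disable preemption, so copy in or out quickly, call zpool_unmap_handle() as soon as possible, and never map two handles on the same CPU at once. A hedged sketch of that usage pattern; zpool_store_object() and its parameters are illustrative, the zpool_* calls are the existing API:

/*
 * Sketch of the contract described above: keep the mapped window
 * short-lived and unmap before doing anything that might sleep.
 */
static int zpool_store_object(struct zpool *zpool, const void *src,
                              size_t len, unsigned long *handle)
{
        void *dst;
        int ret;

        ret = zpool_malloc(zpool, len, GFP_KERNEL, handle);
        if (ret)
                return ret;

        dst = zpool_map_handle(zpool, *handle, ZPOOL_MM_WO);
        memcpy(dst, src, len);
        zpool_unmap_handle(zpool, *handle);     /* unmap as soon as possible */

        return 0;
}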
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 30c358b72025..19b563bc6c48 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -61,7 +61,7 @@
#define ZSPAGE_MAGIC 0x58
/*
- * This must be power of 2 and greater than of equal to sizeof(link_free).
+ * This must be power of 2 and greater than or equal to sizeof(link_free).
* These two conditions ensure that any 'struct link_free' itself doesn't
* span more than 1 page which avoids complex case of mapping 2 pages simply
* to restore link_free pointer values.
@@ -530,7 +530,7 @@ static void set_zspage_mapping(struct zspage *zspage,
* class maintains a list of zspages where each zspage is divided
* into equal sized chunks. Each allocation falls into one of these
* classes depending on its size. This function returns index of the
- * size class which has chunk size big enough to hold the give size.
+ * size class which has chunk size big enough to hold the given size.
*/
static int get_size_class_index(int size)
{
@@ -1227,7 +1227,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
* zs_map_object - get address of allocated object from handle.
* @pool: pool from which the object was allocated
* @handle: handle returned from zs_malloc
- * @mm: maping mode to use
+ * @mm: mapping mode to use
*
* Before using an object allocated from zs_malloc, it must be mapped using
* this function. When done with the object, it must be unmapped using
@@ -1987,8 +1987,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
head = obj_to_head(page, addr);
if (head & OBJ_ALLOCATED_TAG) {
handle = head & ~OBJ_ALLOCATED_TAG;
- if (!testpin_tag(handle))
- BUG();
+ BUG_ON(!testpin_tag(handle));
old_obj = handle_to_obj(handle);
obj_to_location(old_obj, &dummy, &obj_idx);
@@ -2035,8 +2034,7 @@ unpin_objects:
head = obj_to_head(page, addr);
if (head & OBJ_ALLOCATED_TAG) {
handle = head & ~OBJ_ALLOCATED_TAG;
- if (!testpin_tag(handle))
- BUG();
+ BUG_ON(!testpin_tag(handle));
unpin_tag(handle);
}
}
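The same discipline applies to the zs_map_object() kernel-doc fixed above: map, access, unmap, without holding the mapping across anything that might sleep. A sketch of the documented read path; zs_load() is an illustrative wrapper, not an existing helper:

/*
 * Sketch of the documented usage: map the object, copy out, and unmap
 * immediately.
 */
static void zs_load(struct zs_pool *pool, unsigned long handle,
                    void *dst, size_t len)
{
        void *src;

        src = zs_map_object(pool, handle, ZS_MM_RO);
        memcpy(dst, src, len);
        zs_unmap_object(pool, handle);
}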
diff --git a/mm/zswap.c b/mm/zswap.c
index 578d9f256920..20763267a219 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -614,7 +614,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
}
pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
- strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+ strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
if (!pool->acomp_ctx) {