summary refs log tree commit diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Makefile 2
-rw-r--r-- mm/backing-dev.c 47
-rw-r--r-- mm/bootmem.c 2
-rw-r--r-- mm/cma.c 15
-rw-r--r-- mm/compaction.c 29
-rw-r--r-- mm/filemap.c 55
-rw-r--r-- mm/gup.c 2
-rw-r--r-- mm/huge_memory.c 173
-rw-r--r-- mm/hugetlb.c 221
-rw-r--r-- mm/internal.h 11
-rw-r--r-- mm/kasan/kasan_init.c 15
-rw-r--r-- mm/kasan/report.c 3
-rw-r--r-- mm/khugepaged.c 26
-rw-r--r-- mm/madvise.c 8
-rw-r--r-- mm/memblock.c 88
-rw-r--r-- mm/memcontrol.c 45
-rw-r--r-- mm/memory.c 146
-rw-r--r-- mm/memory_hotplug.c 61
-rw-r--r-- mm/mempolicy.c 2
-rw-r--r-- mm/mmap.c 24
-rw-r--r-- mm/mmzone.c 2
-rw-r--r-- mm/mprotect.c 46
-rw-r--r-- mm/mremap.c 17
-rw-r--r-- mm/nommu.c 6
-rw-r--r-- mm/oom_kill.c 17
-rw-r--r-- mm/page-writeback.c 4
-rw-r--r-- mm/page_alloc.c 444
-rw-r--r-- mm/page_isolation.c 10
-rw-r--r-- mm/shmem.c 160
-rw-r--r-- mm/slab.c 18
-rw-r--r-- mm/slab.h 33
-rw-r--r-- mm/slab_common.c 299
-rw-r--r-- mm/slub.c 112
-rw-r--r-- mm/sparse.c 4
-rw-r--r-- mm/swap.c 6
-rw-r--r-- mm/swap_slots.c 342
-rw-r--r-- mm/swap_state.c 80
-rw-r--r-- mm/swapfile.c 536
-rw-r--r-- mm/truncate.c 75
-rw-r--r-- mm/usercopy.c 4
-rw-r--r-- mm/userfaultfd.c 277
-rw-r--r-- mm/vmalloc.c 6
-rw-r--r-- mm/vmscan.c 241
-rw-r--r-- mm/vmstat.c 2
-rw-r--r-- mm/workingset.c 5
-rw-r--r-- mm/z3fold.c 10
-rw-r--r-- mm/zsmalloc.c 6
-rw-r--r-- mm/zswap.c 30
48 files changed, 2814 insertions, 953 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 295bd7a9f76b..433eaf9a876e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -35,7 +35,7 @@ obj-y := filemap.o mempool.o oom_kill.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o vmacache.o \
+ compaction.o vmacache.o swap_slots.o \
interval_tree.o list_lru.o workingset.o \
debug.o $(mmu-y)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3bfed5ab2475..6d861d090e9f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -237,6 +237,7 @@ static __init int bdi_class_init(void)
bdi_class->dev_groups = bdi_dev_groups;
bdi_debug_init();
+
return 0;
}
postcore_initcall(bdi_class_init);
@@ -410,8 +411,8 @@ retry:
while (*node != NULL) {
parent = *node;
- congested = container_of(parent, struct bdi_writeback_congested,
- rb_node);
+ congested = rb_entry(parent, struct bdi_writeback_congested,
+ rb_node);
if (congested->blkcg_id < blkcg_id)
node = &parent->rb_left;
else if (congested->blkcg_id > blkcg_id)
@@ -758,15 +759,20 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!bdi->wb_congested)
return -ENOMEM;
+ atomic_set(&bdi->wb_congested->refcnt, 1);
+
err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (err) {
- kfree(bdi->wb_congested);
+ wb_congested_put(bdi->wb_congested);
return err;
}
return 0;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+{
+ wb_congested_put(bdi->wb_congested);
+}
#endif /* CONFIG_CGROUP_WRITEBACK */
@@ -776,6 +782,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->dev = NULL;
+ kref_init(&bdi->refcnt);
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = FPROP_FRAC_BASE;
@@ -791,6 +798,22 @@ int bdi_init(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_init);
+struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
+{
+ struct backing_dev_info *bdi;
+
+ bdi = kmalloc_node(sizeof(struct backing_dev_info),
+ gfp_mask | __GFP_ZERO, node_id);
+ if (!bdi)
+ return NULL;
+
+ if (bdi_init(bdi)) {
+ kfree(bdi);
+ return NULL;
+ }
+ return bdi;
+}
+
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
@@ -871,12 +894,26 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
}
-void bdi_exit(struct backing_dev_info *bdi)
+static void bdi_exit(struct backing_dev_info *bdi)
{
WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
}
+static void release_bdi(struct kref *ref)
+{
+ struct backing_dev_info *bdi =
+ container_of(ref, struct backing_dev_info, refcnt);
+
+ bdi_exit(bdi);
+ kfree(bdi);
+}
+
+void bdi_put(struct backing_dev_info *bdi)
+{
+ kref_put(&bdi->refcnt, release_bdi);
+}
+
void bdi_destroy(struct backing_dev_info *bdi)
{
bdi_unregister(bdi);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e8a55a3c9feb..9fedb27c6451 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -53,7 +53,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
static unsigned long __init bootmap_bytes(unsigned long pages)
{
- unsigned long bytes = DIV_ROUND_UP(pages, 8);
+ unsigned long bytes = DIV_ROUND_UP(pages, BITS_PER_BYTE);
return ALIGN(bytes, sizeof(long));
}
diff --git a/mm/cma.c b/mm/cma.c
index c960459eda7e..94b3460cd608 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -235,18 +235,13 @@ int __init cma_declare_contiguous(phys_addr_t base,
phys_addr_t highmem_start;
int ret = 0;
-#ifdef CONFIG_X86
/*
- * high_memory isn't direct mapped memory so retrieving its physical
- * address isn't appropriate. But it would be useful to check the
- * physical address of the highmem boundary so it's justifiable to get
- * the physical address from it. On x86 there is a validation check for
- * this case, so the following workaround is needed to avoid it.
+ * We can't use __pa(high_memory) directly, since high_memory
+ * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly)
+ * complain. Find the boundary by adding one to the last valid
+ * address.
*/
- highmem_start = __pa_nodebug(high_memory);
-#else
- highmem_start = __pa(high_memory);
-#endif
+ highmem_start = __pa(high_memory - 1) + 1;
pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
__func__, &size, &base, &limit, &alignment);
diff --git a/mm/compaction.c b/mm/compaction.c
index 949198d01260..0aa2757399ee 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -548,7 +548,7 @@ isolate_fail:
if (blockpfn == end_pfn)
update_pageblock_skip(cc, valid_page, total_isolated, false);
- count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
+ cc->total_free_scanned += nr_scanned;
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
return total_isolated;
@@ -931,7 +931,7 @@ isolate_fail:
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
- count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
+ cc->total_migrate_scanned += nr_scanned;
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
@@ -1631,6 +1631,9 @@ out:
zone->compact_cached_free_pfn = free_pfn;
}
+ count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
+ count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
+
trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync, ret);
@@ -1645,6 +1648,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
+ .total_migrate_scanned = 0,
+ .total_free_scanned = 0,
.order = order,
.gfp_mask = gfp_mask,
.zone = zone,
@@ -1757,6 +1762,8 @@ static void compact_node(int nid)
struct zone *zone;
struct compact_control cc = {
.order = -1,
+ .total_migrate_scanned = 0,
+ .total_free_scanned = 0,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
@@ -1883,6 +1890,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct zone *zone;
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
+ .total_migrate_scanned = 0,
+ .total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = true,
@@ -1891,7 +1900,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
};
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
cc.classzone_idx);
- count_vm_event(KCOMPACTD_WAKE);
+ count_compact_event(KCOMPACTD_WAKE);
for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
int status;
@@ -1909,6 +1918,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
cc.nr_freepages = 0;
cc.nr_migratepages = 0;
+ cc.total_migrate_scanned = 0;
+ cc.total_free_scanned = 0;
cc.zone = zone;
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1927,6 +1938,11 @@ static void kcompactd_do_work(pg_data_t *pgdat)
defer_compaction(zone, cc.order);
}
+ count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
+ cc.total_migrate_scanned);
+ count_compact_events(KCOMPACTD_FREE_SCANNED,
+ cc.total_free_scanned);
+
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
}
@@ -1950,6 +1966,13 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kcompactd_max_order < order)
pgdat->kcompactd_max_order = order;
+ /*
+ * Pairs with implicit barrier in wait_event_freezable()
+ * such that wakeups are not missed in the lockless
+ * waitqueue_active() call.
+ */
+ smp_acquire__after_ctrl_dep();
+
if (pgdat->kcompactd_classzone_idx > classzone_idx)
pgdat->kcompactd_classzone_idx = classzone_idx;
diff --git a/mm/filemap.c b/mm/filemap.c
index 82f26cde830c..416d563468a3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -138,7 +138,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
/* Wakeup waiters for exceptional entry lock */
dax_wake_mapping_entry_waiter(mapping, page->index, p,
- false);
+ true);
}
}
__radix_tree_replace(&mapping->page_tree, node, slot, page,
@@ -788,7 +788,7 @@ static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void
return autoremove_wake_function(wait, mode, sync, key);
}
-void wake_up_page_bit(struct page *page, int bit_nr)
+static void wake_up_page_bit(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
struct wait_page_key key;
@@ -821,7 +821,13 @@ void wake_up_page_bit(struct page *page, int bit_nr)
}
spin_unlock_irqrestore(&q->lock, flags);
}
-EXPORT_SYMBOL(wake_up_page_bit);
+
+static void wake_up_page(struct page *page, int bit)
+{
+ if (!PageWaiters(page))
+ return;
+ wake_up_page_bit(page, bit);
+}
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, bool lock)
@@ -912,6 +918,29 @@ void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);
+#ifndef clear_bit_unlock_is_negative_byte
+
+/*
+ * PG_waiters is the high bit in the same byte as PG_locked.
+ *
+ * On x86 (and on many other architectures), we can clear PG_locked and
+ * test the sign bit at the same time. But if the architecture does
+ * not support that special operation, we just do this all by hand
+ * instead.
+ *
+ * The read of PG_waiters has to be after (or concurrently with) PG_locked
+ * being cleared, but a memory barrier should be unnecessary since it is
+ * in the same byte as PG_locked.
+ */
+static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
+{
+ clear_bit_unlock(nr, mem);
+ /* smp_mb__after_atomic(); */
+ return test_bit(PG_waiters, mem);
+}
+
+#endif
+
/**
* unlock_page - unlock a locked page
* @page: the page
@@ -921,16 +950,19 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
- * The mb is necessary to enforce ordering between the clear_bit and the read
- * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
+ * Note that this depends on PG_waiters being the sign bit in the byte
+ * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
+ * clear the PG_locked bit and test PG_waiters at the same time fairly
+ * portably (architectures that do LL/SC can test any bit, while x86 can
+ * test the sign bit).
*/
void unlock_page(struct page *page)
{
+ BUILD_BUG_ON(PG_waiters != 7);
page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- clear_bit_unlock(PG_locked, &page->flags);
- smp_mb__after_atomic();
- wake_up_page(page, PG_locked);
+ if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
+ wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
@@ -987,7 +1019,7 @@ EXPORT_SYMBOL_GPL(page_endio);
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @page: the page to lock
+ * @__page: the page to lock
*/
void __lock_page(struct page *__page)
{
@@ -1765,6 +1797,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
cond_resched();
find_page:
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ goto out;
+ }
+
page = find_get_page(mapping, index);
if (!page) {
page_cache_sync_readahead(mapping,
diff --git a/mm/gup.c b/mm/gup.c
index 55315555489d..40abe4c90383 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -572,7 +572,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
- gup_flags);
+ gup_flags, nonblocking);
continue;
}
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 10eedbf14421..f9ecc2aeadfc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -142,42 +142,6 @@ static struct shrinker huge_zero_page_shrinker = {
};
#ifdef CONFIG_SYSFS
-
-static ssize_t triple_flag_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count,
- enum transparent_hugepage_flag enabled,
- enum transparent_hugepage_flag deferred,
- enum transparent_hugepage_flag req_madv)
-{
- if (!memcmp("defer", buf,
- min(sizeof("defer")-1, count))) {
- if (enabled == deferred)
- return -EINVAL;
- clear_bit(enabled, &transparent_hugepage_flags);
- clear_bit(req_madv, &transparent_hugepage_flags);
- set_bit(deferred, &transparent_hugepage_flags);
- } else if (!memcmp("always", buf,
- min(sizeof("always")-1, count))) {
- clear_bit(deferred, &transparent_hugepage_flags);
- clear_bit(req_madv, &transparent_hugepage_flags);
- set_bit(enabled, &transparent_hugepage_flags);
- } else if (!memcmp("madvise", buf,
- min(sizeof("madvise")-1, count))) {
- clear_bit(enabled, &transparent_hugepage_flags);
- clear_bit(deferred, &transparent_hugepage_flags);
- set_bit(req_madv, &transparent_hugepage_flags);
- } else if (!memcmp("never", buf,
- min(sizeof("never")-1, count))) {
- clear_bit(enabled, &transparent_hugepage_flags);
- clear_bit(req_madv, &transparent_hugepage_flags);
- clear_bit(deferred, &transparent_hugepage_flags);
- } else
- return -EINVAL;
-
- return count;
-}
-
static ssize_t enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -193,19 +157,28 @@ static ssize_t enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- ssize_t ret;
+ ssize_t ret = count;
- ret = triple_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_FLAG,
- TRANSPARENT_HUGEPAGE_FLAG,
- TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+ if (!memcmp("always", buf,
+ min(sizeof("always")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("madvise", buf,
+ min(sizeof("madvise")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("never", buf,
+ min(sizeof("never")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else
+ ret = -EINVAL;
if (ret > 0) {
int err = start_stop_khugepaged();
if (err)
ret = err;
}
-
return ret;
}
static struct kobj_attribute enabled_attr =
@@ -241,32 +214,58 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj,
return count;
}
-/*
- * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
- * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
- * memory just to allocate one more hugepage.
- */
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "[always] defer madvise never\n");
+ return sprintf(buf, "[always] defer defer+madvise madvise never\n");
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always [defer] madvise never\n");
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always defer [madvise] never\n");
- else
- return sprintf(buf, "always defer madvise [never]\n");
-
+ return sprintf(buf, "always [defer] defer+madvise madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always defer [defer+madvise] madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always defer defer+madvise [madvise] never\n");
+ return sprintf(buf, "always defer defer+madvise madvise [never]\n");
}
+
static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- return triple_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
- TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
- TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+ if (!memcmp("always", buf,
+ min(sizeof("always")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("defer", buf,
+ min(sizeof("defer")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("defer+madvise", buf,
+ min(sizeof("defer+madvise")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("madvise", buf,
+ min(sizeof("madvise")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("never", buf,
+ min(sizeof("never")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else
+ return -EINVAL;
+
+ return count;
}
static struct kobj_attribute defrag_attr =
__ATTR(defrag, 0644, defrag_show, defrag_store);
@@ -612,25 +611,28 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
}
/*
- * If THP defrag is set to always then directly reclaim/compact as necessary
- * If set to defer then do only background reclaim/compact and defer to khugepaged
- * If set to madvise and the VMA is flagged then directly reclaim/compact
- * When direct reclaim/compact is allowed, don't retry except for flagged VMA's
+ * always: directly stall for all thp allocations
+ * defer: wake kswapd and fail if not immediately available
+ * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
+ * fail if not immediately available
+ * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
+ * available
+ * never: never stall for any thp allocation
*/
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
- bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+ const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
- &transparent_hugepage_flags) && vma_madvised)
- return GFP_TRANSHUGE;
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
- &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
- &transparent_hugepage_flags))
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
-
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ __GFP_KSWAPD_RECLAIM);
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ 0);
return GFP_TRANSHUGE_LIGHT;
}
@@ -783,6 +785,12 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
assert_spin_locked(pmd_lockptr(mm, pmd));
+ /*
+ * When we COW a devmap PMD entry, we split it into PTEs, so we should
+ * not be in this function with `flags & FOLL_COW` set.
+ */
+ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
+
if (flags & FOLL_WRITE && !pmd_write(*pmd))
return NULL;
@@ -883,15 +891,17 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
{
pmd_t entry;
unsigned long haddr;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
+ if (write)
+ entry = pmd_mkdirty(entry);
haddr = vmf->address & HPAGE_PMD_MASK;
- if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry,
- vmf->flags & FAULT_FLAG_WRITE))
+ if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
unlock:
@@ -919,8 +929,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
- __GFP_OTHER_NODE, vma,
+ pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
vmf->address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], vma->vm_mm,
@@ -1127,6 +1136,16 @@ out_unlock:
return ret;
}
+/*
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+{
+ return pmd_write(pmd) ||
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+}
+
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
@@ -1137,7 +1156,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
assert_spin_locked(pmd_lockptr(mm, pmd));
- if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
goto out;
/* Avoid dumping huge zero page */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3edb759c5c7d..30e7709a5121 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -32,6 +32,7 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
+#include <linux/userfaultfd_k.h>
#include "internal.h"
int hugepages_treat_as_movable;
@@ -1773,23 +1774,32 @@ free:
}
/*
- * When releasing a hugetlb pool reservation, any surplus pages that were
- * allocated to satisfy the reservation must be explicitly freed if they were
- * never used.
- * Called with hugetlb_lock held.
+ * This routine has two main purposes:
+ * 1) Decrement the reservation count (resv_huge_pages) by the value passed
+ * in unused_resv_pages. This corresponds to the prior adjustments made
+ * to the associated reservation map.
+ * 2) Free any unused surplus pages that may have been allocated to satisfy
+ * the reservation. As many as unused_resv_pages may be freed.
+ *
+ * Called with hugetlb_lock held. However, the lock could be dropped (and
+ * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
+ * we must make sure nobody else can claim pages we are in the process of
+ * freeing. Do this by ensuring resv_huge_pages is always greater than the
+ * number of huge pages we plan to free when dropping the lock.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
- /* Uncommit the reservation */
- h->resv_huge_pages -= unused_resv_pages;
-
/* Cannot return gigantic pages currently */
if (hstate_is_gigantic(h))
- return;
+ goto out;
+ /*
+ * Part (or even all) of the reservation could have been backed
+ * by pre-allocated pages. Only free surplus pages.
+ */
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
/*
@@ -1799,12 +1809,22 @@ static void return_unused_surplus_pages(struct hstate *h,
* when the nodes with surplus pages have no free pages.
* free_pool_huge_page() will balance the the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
+ *
+ * Note that we decrement resv_huge_pages as we free the pages. If
+ * we drop the lock, resv_huge_pages will still be sufficiently large
+ * to cover subsequent pages we may free.
*/
while (nr_pages--) {
+ h->resv_huge_pages--;
+ unused_resv_pages--;
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
- break;
+ goto out;
cond_resched_lock(&hugetlb_lock);
}
+
+out:
+ /* Fully uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
}
@@ -3661,6 +3681,38 @@ retry:
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
+
+ /*
+ * Check for page in userfault range
+ */
+ if (userfaultfd_missing(vma)) {
+ u32 hash;
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = address,
+ .flags = flags,
+ /*
+ * Hard to debug if it ends up being
+ * used by a callee that assumes
+ * something about the other
+ * uninitialized fields... same as in
+ * memory.c
+ */
+ };
+
+ /*
+ * hugetlb_fault_mutex must be dropped before
+ * handling userfault. Reacquire after handling
+ * fault to make calling code simpler.
+ */
+ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
+ idx, address);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ ret = handle_userfault(&vmf, VM_UFFD_MISSING);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
page = alloc_huge_page(vma, address, 0);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
@@ -3929,10 +3981,113 @@ out_mutex:
return ret;
}
+/*
+ * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
+ * modifications for huge pages.
+ */
+int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ int vm_shared = dst_vma->vm_flags & VM_SHARED;
+ struct hstate *h = hstate_vma(dst_vma);
+ pte_t _dst_pte;
+ spinlock_t *ptl;
+ int ret;
+ struct page *page;
+
+ if (!*pagep) {
+ ret = -ENOMEM;
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (IS_ERR(page))
+ goto out;
+
+ ret = copy_huge_page_from_user(page,
+ (const void __user *) src_addr,
+ pages_per_huge_page(h), false);
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ ret = -EFAULT;
+ *pagep = page;
+ /* don't free the page */
+ goto out;
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+ set_page_huge_active(page);
+
+ /*
+ * If shared, add to page cache
+ */
+ if (vm_shared) {
+ struct address_space *mapping = dst_vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+ ret = huge_add_to_page_cache(page, mapping, idx);
+ if (ret)
+ goto out_release_nounlock;
+ }
+
+ ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
+ spin_lock(ptl);
+
+ ret = -EEXIST;
+ if (!huge_pte_none(huge_ptep_get(dst_pte)))
+ goto out_release_unlock;
+
+ if (vm_shared) {
+ page_dup_rmap(page, true);
+ } else {
+ ClearPagePrivate(page);
+ hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+ }
+
+ _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = huge_pte_mkdirty(_dst_pte);
+ _dst_pte = pte_mkyoung(_dst_pte);
+
+ set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
+ dst_vma->vm_flags & VM_WRITE);
+ hugetlb_count_add(pages_per_huge_page(h), dst_mm);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ spin_unlock(ptl);
+ if (vm_shared)
+ unlock_page(page);
+ ret = 0;
+out:
+ return ret;
+out_release_unlock:
+ spin_unlock(ptl);
+out_release_nounlock:
+ if (vm_shared)
+ unlock_page(page);
+ put_page(page);
+ goto out;
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags)
+ long i, unsigned int flags, int *nonblocking)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
@@ -3995,16 +4150,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
((flags & FOLL_WRITE) &&
!huge_pte_write(huge_ptep_get(pte)))) {
int ret;
+ unsigned int fault_flags = 0;
if (pte)
spin_unlock(ptl);
- ret = hugetlb_fault(mm, vma, vaddr,
- (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
- if (!(ret & VM_FAULT_ERROR))
- continue;
-
- remainder = 0;
- break;
+ if (flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (flags & FOLL_NOWAIT)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_RETRY_NOWAIT;
+ if (flags & FOLL_TRIED) {
+ VM_WARN_ON_ONCE(fault_flags &
+ FAULT_FLAG_ALLOW_RETRY);
+ fault_flags |= FAULT_FLAG_TRIED;
+ }
+ ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ remainder = 0;
+ break;
+ }
+ if (ret & VM_FAULT_RETRY) {
+ if (nonblocking)
+ *nonblocking = 0;
+ *nr_pages = 0;
+ /*
+ * VM_FAULT_RETRY must not return an
+ * error, it will return zero
+ * instead.
+ *
+ * No need to update "position" as the
+ * caller will not check it after
+ * *nr_pages is set to 0.
+ */
+ return i;
+ }
+ continue;
}
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
@@ -4033,6 +4215,11 @@ same_page:
spin_unlock(ptl);
}
*nr_pages = remainder;
+ /*
+ * setting position is actually required only if remainder is
+ * not zero but it's faster not to add a "if (remainder)"
+ * branch.
+ */
*position = vaddr;
return i ? i : -EFAULT;
diff --git a/mm/internal.h b/mm/internal.h
index 7aa2ea0a8623..8ab72f4374e0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -43,6 +43,11 @@ int do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
+static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+{
+ return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
+}
+
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
@@ -133,9 +138,9 @@ struct alloc_context {
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
static inline unsigned long
-__find_buddy_index(unsigned long page_idx, unsigned int order)
+__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
- return page_idx ^ (1 << order);
+ return page_pfn ^ (1 << order);
}
extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
@@ -175,6 +180,8 @@ struct compact_control {
struct list_head migratepages; /* List of pages being migrated */
unsigned long nr_freepages; /* Number of isolated free pages */
unsigned long nr_migratepages; /* Number of pages to migrate */
+ unsigned long total_migrate_scanned;
+ unsigned long total_free_scanned;
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index 3f9a41cf0ac6..31238dad85fb 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -15,6 +15,7 @@
#include <linux/kasan.h>
#include <linux/kernel.h>
#include <linux/memblock.h>
+#include <linux/mm.h>
#include <linux/pfn.h>
#include <asm/page.h>
@@ -49,7 +50,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
pte_t *pte = pte_offset_kernel(pmd, addr);
pte_t zero_pte;
- zero_pte = pfn_pte(PFN_DOWN(__pa(kasan_zero_page)), PAGE_KERNEL);
+ zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL);
zero_pte = pte_wrprotect(zero_pte);
while (addr + PAGE_SIZE <= end) {
@@ -69,7 +70,7 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
next = pmd_addr_end(addr, end);
if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) {
- pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+ pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
continue;
}
@@ -92,9 +93,9 @@ static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) {
pmd_t *pmd;
- pud_populate(&init_mm, pud, kasan_zero_pmd);
+ pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
pmd = pmd_offset(pud, addr);
- pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+ pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
continue;
}
@@ -135,11 +136,11 @@ void __init kasan_populate_zero_shadow(const void *shadow_start,
* puds,pmds, so pgd_populate(), pud_populate()
* is noops.
*/
- pgd_populate(&init_mm, pgd, kasan_zero_pud);
+ pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_pud));
pud = pud_offset(pgd, addr);
- pud_populate(&init_mm, pud, kasan_zero_pmd);
+ pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
pmd = pmd_offset(pud, addr);
- pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+ pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
continue;
}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index b82b3e215157..f479365530b6 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -13,6 +13,7 @@
*
*/
+#include <linux/ftrace.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/printk.h>
@@ -300,6 +301,8 @@ void kasan_report(unsigned long addr, size_t size,
if (likely(!kasan_report_enabled()))
return;
+ disable_trace_on_warning();
+
info.access_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e32389a97030..77ae3239c3de 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -943,7 +943,7 @@ static void collapse_huge_page(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
/* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
/*
* Before allocating the hugepage, release the mmap_sem read lock.
@@ -1242,7 +1242,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
struct vm_area_struct *vma;
unsigned long addr;
pmd_t *pmd, _pmd;
- bool deposited = false;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1267,26 +1266,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
- /*
- * now deposit the pgtable for arch that need it
- * otherwise free it.
- */
- if (arch_needs_pgtable_deposit()) {
- /*
- * The deposit should be visibile only after
- * collapse is seen by others.
- */
- smp_wmb();
- pgtable_trans_huge_deposit(vma->vm_mm, pmd,
- pmd_pgtable(_pmd));
- deposited = true;
- }
spin_unlock(ptl);
up_write(&vma->vm_mm->mmap_sem);
- if (!deposited) {
- atomic_long_dec(&vma->vm_mm->nr_ptes);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
- }
+ atomic_long_dec(&vma->vm_mm->nr_ptes);
+ pte_free(vma->vm_mm, pmd_pgtable(_pmd));
}
}
i_mmap_unlock_write(mapping);
@@ -1326,8 +1309,7 @@ static void collapse_shmem(struct mm_struct *mm,
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
/* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() |
- __GFP_OTHER_NODE | __GFP_THISNODE;
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
new_page = khugepaged_alloc_page(hpage, gfp, node);
if (!new_page) {
diff --git a/mm/madvise.c b/mm/madvise.c
index 0e3828eae9f8..b530a4986035 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -10,6 +10,7 @@
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
+#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
@@ -24,6 +25,8 @@
#include <asm/tlb.h>
+#include "internal.h"
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_sem for writing. Others, which simply traverse vmas, need
@@ -473,10 +476,11 @@ static long madvise_dontneed(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
*prev = vma;
- if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ if (!can_madv_dontneed_vma(vma))
return -EINVAL;
- zap_page_range(vma, start, end - start, NULL);
+ madvise_userfault_dontneed(vma, prev, start, end);
+ zap_page_range(vma, start, end - start);
return 0;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 7608bc305936..c004f52be419 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -611,10 +611,10 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
- (unsigned long long)base,
- (unsigned long long)base + size - 1,
- 0UL, (void *)_RET_IP_);
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("memblock_add: [%pa-%pa] %pF\n",
+ &base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
}
@@ -718,10 +718,10 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
- memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
- (unsigned long long)base,
- (unsigned long long)base + size - 1,
- (void *)_RET_IP_);
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg(" memblock_free: [%pa-%pa] %pF\n",
+ &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
return memblock_remove_range(&memblock.reserved, base, size);
@@ -729,10 +729,10 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
- memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
- (unsigned long long)base,
- (unsigned long long)base + size - 1,
- 0UL, (void *)_RET_IP_);
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n",
+ &base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
}
@@ -1105,6 +1105,31 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
*out_nid = r->nid;
}
+/**
+ * memblock_next_valid_pfn - find the next pfn after @pfn covered by memory
+ * @pfn: current page frame number
+ * @max_pfn: upper bound for the returned value
+ *
+ * Binary-search memblock.memory for the region containing the physical
+ * address of pfn + 1.  If such a region exists, pfn + 1 is returned.
+ * Otherwise the pfn of the base of the closest region above that address
+ * is returned.  The result never exceeds @max_pfn, and @max_pfn itself is
+ * returned when no region lies above pfn + 1.
+ */
+unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
+						      unsigned long max_pfn)
+{
+	struct memblock_type *type = &memblock.memory;
+	unsigned int right = type->cnt;
+	unsigned int mid, left = 0;
+	phys_addr_t addr = PFN_PHYS(pfn + 1);
+
+	do {
+		mid = (right + left) / 2;
+
+		if (addr < type->regions[mid].base)
+			right = mid;
+		else if (addr >= (type->regions[mid].base +
+				  type->regions[mid].size))
+			left = mid + 1;
+		else {
+			/* addr is within the region, so pfn + 1 is valid */
+			return min(pfn + 1, max_pfn);
+		}
+	} while (left < right);
+
+	/*
+	 * The search ended without a match; "right" indexes the first
+	 * region starting above addr.  If addr is above every region,
+	 * right still equals type->cnt and regions[right] would be read
+	 * past the last valid entry, so there is no further valid pfn.
+	 */
+	if (right == type->cnt)
+		return max_pfn;
+
+	return min(PHYS_PFN(type->regions[right].base), max_pfn);
+}
+
/**
* memblock_set_node - set node ID on memblock regions
* @base: base of area to set node ID for
@@ -1202,8 +1227,8 @@ phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys
alloc = __memblock_alloc_base(size, align, max_addr);
if (alloc == 0)
- panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
- (unsigned long long) size, (unsigned long long) max_addr);
+ panic("ERROR: Failed to allocate %pa bytes below %pa.\n",
+ &size, &max_addr);
return alloc;
}
@@ -1274,18 +1299,17 @@ static void * __init memblock_virt_alloc_internal(
if (max_addr > memblock.current_limit)
max_addr = memblock.current_limit;
-
again:
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
nid, flags);
- if (alloc)
+ if (alloc && !memblock_reserve(alloc, size))
goto done;
if (nid != NUMA_NO_NODE) {
alloc = memblock_find_in_range_node(size, align, min_addr,
max_addr, NUMA_NO_NODE,
flags);
- if (alloc)
+ if (alloc && !memblock_reserve(alloc, size))
goto done;
}
@@ -1303,7 +1327,6 @@ again:
return NULL;
done:
- memblock_reserve(alloc, size);
ptr = phys_to_virt(alloc);
memset(ptr, 0, size);
@@ -1615,8 +1638,7 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
if (idx == -1)
return 0;
- return memblock.memory.regions[idx].base <= base &&
- (memblock.memory.regions[idx].base +
+ return (memblock.memory.regions[idx].base +
memblock.memory.regions[idx].size) >= end;
}
@@ -1673,7 +1695,7 @@ phys_addr_t __init_memblock memblock_get_current_limit(void)
static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
{
- unsigned long long base, size;
+ phys_addr_t base, end, size;
unsigned long flags;
int idx;
struct memblock_region *rgn;
@@ -1685,23 +1707,24 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
base = rgn->base;
size = rgn->size;
+ end = base + size - 1;
flags = rgn->flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
#endif
- pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
- name, idx, base, base + size - 1, size, nid_buf, flags);
+ pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n",
+ name, idx, &base, &end, &size, nid_buf, flags);
}
}
void __init_memblock __memblock_dump_all(void)
{
pr_info("MEMBLOCK configuration:\n");
- pr_info(" memory size = %#llx reserved size = %#llx\n",
- (unsigned long long)memblock.memory.total_size,
- (unsigned long long)memblock.reserved.total_size);
+ pr_info(" memory size = %pa reserved size = %pa\n",
+ &memblock.memory.total_size,
+ &memblock.reserved.total_size);
memblock_dump(&memblock.memory, "memory");
memblock_dump(&memblock.reserved, "reserved");
@@ -1727,19 +1750,14 @@ static int memblock_debug_show(struct seq_file *m, void *private)
struct memblock_type *type = m->private;
struct memblock_region *reg;
int i;
+ phys_addr_t end;
for (i = 0; i < type->cnt; i++) {
reg = &type->regions[i];
- seq_printf(m, "%4d: ", i);
- if (sizeof(phys_addr_t) == 4)
- seq_printf(m, "0x%08lx..0x%08lx\n",
- (unsigned long)reg->base,
- (unsigned long)(reg->base + reg->size - 1));
- else
- seq_printf(m, "0x%016llx..0x%016llx\n",
- (unsigned long long)reg->base,
- (unsigned long long)(reg->base + reg->size - 1));
+ end = reg->base + reg->size - 1;
+ seq_printf(m, "%4d: ", i);
+ seq_printf(m, "%pa..%pa\n", &reg->base, &end);
}
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4048897e7b01..1fd6affcdde7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -317,6 +317,8 @@ void memcg_put_cache_ids(void)
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
+struct workqueue_struct *memcg_kmem_cache_wq;
+
#endif /* !CONFIG_SLOB */
/**
@@ -625,8 +627,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
unsigned long nr = 0;
- struct mem_cgroup_per_node *mz;
enum lru_list lru;
VM_BUG_ON((unsigned)nid >= nr_node_ids);
@@ -634,8 +636,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- mz = mem_cgroup_nodeinfo(memcg, nid);
- nr += mz->lru_size[lru];
+ nr += mem_cgroup_get_lru_size(lruvec, lru);
}
return nr;
}
@@ -1002,6 +1003,7 @@ out:
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
* @lru: index of lru list the page is sitting on
+ * @zid: zone id of the accounted pages
* @nr_pages: positive when adding or negative when removing
*
* This function must be called under lru_lock, just before a page is added
@@ -1009,27 +1011,25 @@ out:
* so as to allow it to check that lru_size 0 is consistent with list_empty).
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int nr_pages)
+ int zid, int nr_pages)
{
struct mem_cgroup_per_node *mz;
unsigned long *lru_size;
long size;
- bool empty;
if (mem_cgroup_disabled())
return;
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- lru_size = mz->lru_size + lru;
- empty = list_empty(lruvec->lists + lru);
+ lru_size = &mz->lru_zone_size[zid][lru];
if (nr_pages < 0)
*lru_size += nr_pages;
size = *lru_size;
- if (WARN_ONCE(size < 0 || empty != !size,
- "%s(%p, %d, %d): lru_size %ld but %sempty\n",
- __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+ if (WARN_ONCE(size < 0,
+ "%s(%p, %d, %d): lru_size %ld\n",
+ __func__, lruvec, lru, nr_pages, size)) {
VM_BUG_ON(1);
*lru_size = 0;
}
@@ -2145,8 +2145,6 @@ struct memcg_kmem_cache_create_work {
struct work_struct work;
};
-static struct workqueue_struct *memcg_kmem_cache_create_wq;
-
static void memcg_kmem_cache_create_func(struct work_struct *w)
{
struct memcg_kmem_cache_create_work *cw =
@@ -2178,7 +2176,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
- queue_work(memcg_kmem_cache_create_wq, &cw->work);
+ queue_work(memcg_kmem_cache_wq, &cw->work);
}
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
@@ -2839,6 +2837,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
*/
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
+ INIT_LIST_HEAD(&memcg->kmem_caches);
return 0;
}
@@ -4004,9 +4003,9 @@ static struct cftype mem_cgroup_legacy_files[] = {
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
- .seq_start = slab_start,
- .seq_next = slab_next,
- .seq_stop = slab_stop,
+ .seq_start = memcg_slab_start,
+ .seq_next = memcg_slab_next,
+ .seq_stop = memcg_slab_stop,
.seq_show = memcg_slab_show,
},
#endif
@@ -4355,9 +4354,9 @@ static int mem_cgroup_do_precharge(unsigned long count)
return ret;
}
- /* Try charges one by one with reclaim */
+ /* Try charges one by one with reclaim, but do not retry */
while (count--) {
- ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
+ ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
if (ret)
return ret;
mc.precharge++;
@@ -5779,12 +5778,12 @@ static int __init mem_cgroup_init(void)
#ifndef CONFIG_SLOB
/*
* Kmem cache creation is mostly done with the slab_mutex held,
- * so use a special workqueue to avoid stalling all worker
- * threads in case lots of cgroups are created simultaneously.
+ * so use a workqueue with limited concurrency to avoid stalling
+ * all worker threads in case lots of cgroups are created and
+ * destroyed simultaneously.
*/
- memcg_kmem_cache_create_wq =
- alloc_ordered_workqueue("memcg_kmem_cache_create", 0);
- BUG_ON(!memcg_kmem_cache_create_wq);
+ memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
+ BUG_ON(!memcg_kmem_cache_wq);
#endif
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
diff --git a/mm/memory.c b/mm/memory.c
index 7d23b5050248..7663068a33c6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1155,12 +1155,6 @@ again:
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
- /*
- * oom_reaper cannot tear down dirty
- * pages
- */
- if (unlikely(details && details->ignore_dirty))
- continue;
force_flush = 1;
set_page_dirty(page);
}
@@ -1179,8 +1173,8 @@ again:
}
continue;
}
- /* only check swap_entries if explicitly asked for in details */
- if (unlikely(details && !details->check_swap_entries))
+ /* If details->check_mapping, we leave swap entries. */
+ if (unlikely(details))
continue;
entry = pte_to_swp_entry(ptent);
@@ -1376,12 +1370,11 @@ void unmap_vmas(struct mmu_gather *tlb,
* @vma: vm_area_struct holding the applicable pages
* @start: starting address of pages to zap
* @size: number of bytes to zap
- * @details: details of shared cache invalidation
*
* Caller must protect the VMA list
*/
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long size, struct zap_details *details)
+ unsigned long size)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
@@ -1392,7 +1385,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
- unmap_single_vma(&tlb, vma, start, end, details);
+ unmap_single_vma(&tlb, vma, start, end, NULL);
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
@@ -3008,13 +3001,6 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page)
ret = 0;
count_vm_event(THP_FILE_MAPPED);
out:
- /*
- * If we are going to fallback to pte mapping, do a
- * withdraw with pmd lock held.
- */
- if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
- vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
- vmf->pmd);
spin_unlock(vmf->ptl);
return ret;
}
@@ -3055,20 +3041,18 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
- goto fault_handled;
+ return ret;
}
if (!vmf->pte) {
ret = pte_alloc_one_map(vmf);
if (ret)
- goto fault_handled;
+ return ret;
}
/* Re-check under ptl */
- if (unlikely(!pte_none(*vmf->pte))) {
- ret = VM_FAULT_NOPAGE;
- goto fault_handled;
- }
+ if (unlikely(!pte_none(*vmf->pte)))
+ return VM_FAULT_NOPAGE;
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
@@ -3088,15 +3072,8 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, vmf->address, vmf->pte);
- ret = 0;
-fault_handled:
- /* preallocated pagetable is unused: free it */
- if (vmf->prealloc_pte) {
- pte_free(vmf->vma->vm_mm, vmf->prealloc_pte);
- vmf->prealloc_pte = 0;
- }
- return ret;
+ return 0;
}
@@ -3360,15 +3337,24 @@ static int do_shared_fault(struct vm_fault *vmf)
static int do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+ int ret;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
- return VM_FAULT_SIGBUS;
- if (!(vmf->flags & FAULT_FLAG_WRITE))
- return do_read_fault(vmf);
- if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(vmf);
- return do_shared_fault(vmf);
+ ret = VM_FAULT_SIGBUS;
+ else if (!(vmf->flags & FAULT_FLAG_WRITE))
+ ret = do_read_fault(vmf);
+ else if (!(vma->vm_flags & VM_SHARED))
+ ret = do_cow_fault(vmf);
+ else
+ ret = do_shared_fault(vmf);
+
+ /* preallocated pagetable is unused: free it */
+ if (vmf->prealloc_pte) {
+ pte_free(vma->vm_mm, vmf->prealloc_pte);
+ vmf->prealloc_pte = 0;
+ }
+ return ret;
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3478,12 +3464,10 @@ out:
static int create_huge_pmd(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = vmf->vma;
- if (vma_is_anonymous(vma))
+ if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
- if (vma->vm_ops->pmd_fault)
- return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
- vmf->flags);
+ if (vmf->vma->vm_ops->pmd_fault)
+ return vmf->vma->vm_ops->pmd_fault(vmf);
return VM_FAULT_FALLBACK;
}
@@ -3492,8 +3476,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_wp_page(vmf, orig_pmd);
if (vmf->vma->vm_ops->pmd_fault)
- return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
- vmf->pmd, vmf->flags);
+ return vmf->vma->vm_ops->pmd_fault(vmf);
/* COW handled on pte level: split pmd */
VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
@@ -3779,8 +3762,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-static int __follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
@@ -3797,11 +3780,20 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- goto out;
- /* We cannot handle huge page PFN maps. Luckily they don't exist. */
- if (pmd_huge(*pmd))
+ if (pmd_huge(*pmd)) {
+ if (!pmdpp)
+ goto out;
+
+ *ptlp = pmd_lock(mm, pmd);
+ if (pmd_huge(*pmd)) {
+ *pmdpp = pmd;
+ return 0;
+ }
+ spin_unlock(*ptlp);
+ }
+
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
@@ -3817,17 +3809,31 @@ out:
return -EINVAL;
}
-int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
- spinlock_t **ptlp)
+static inline int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
{
int res;
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte(mm, address, ptepp, ptlp)));
+ !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
+ ptlp)));
return res;
}
+/*
+ * follow_pte_pmd - exported wrapper around __follow_pte_pmd()
+ *
+ * Forwards @mm, @address, @ptepp, @pmdpp and @ptlp to __follow_pte_pmd()
+ * and returns its result unchanged.  Because @pmdpp is passed through, a
+ * huge pmd can be reported to the caller: in that case __follow_pte_pmd()
+ * stores the pmd in *pmdpp and returns 0 with the pmd lock held in *ptlp.
+ *
+ * NOTE(review): __cond_lock() is presumably a lock annotation for static
+ * checkers; its condition here is true exactly when the call returns 0.
+ */
+int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+{
+ int res;
+
+ /* (void) is needed to make gcc happy */
+ (void) __cond_lock(*ptlp,
+ !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
+ ptlp)));
+ return res;
+}
+EXPORT_SYMBOL(follow_pte_pmd);
+
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
@@ -4139,6 +4145,38 @@ void copy_user_huge_page(struct page *dst, struct page *src,
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
}
}
+
+/*
+ * copy_huge_page_from_user - fill a huge page from a user-space buffer
+ * @dst_page: first of @pages_per_huge_page consecutive destination pages
+ * @usr_src: user-space source address
+ * @pages_per_huge_page: number of PAGE_SIZE pages to copy
+ * @allow_pagefault: map each page with kmap() when true, with
+ * kmap_atomic() when false
+ *
+ * Copies one PAGE_SIZE chunk per iteration and stops at the first chunk
+ * for which copy_from_user() reports a non-zero remainder.
+ *
+ * Returns the number of bytes that were not copied; 0 means the whole
+ * huge page was copied.
+ */
+long copy_huge_page_from_user(struct page *dst_page,
+ const void __user *usr_src,
+ unsigned int pages_per_huge_page,
+ bool allow_pagefault)
+{
+ void *src = (void *)usr_src;
+ void *page_kaddr;
+ unsigned long i, rc = 0;
+ /* start from the full size and subtract what each iteration copies */
+ unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+
+ for (i = 0; i < pages_per_huge_page; i++) {
+ if (allow_pagefault)
+ page_kaddr = kmap(dst_page + i);
+ else
+ page_kaddr = kmap_atomic(dst_page + i);
+ rc = copy_from_user(page_kaddr,
+ (const void __user *)(src + i * PAGE_SIZE),
+ PAGE_SIZE);
+ /* undo the mapping with the call that matches how it was made */
+ if (allow_pagefault)
+ kunmap(dst_page + i);
+ else
+ kunmap_atomic(page_kaddr);
+
+ /* rc is treated as the bytes left uncopied for this page */
+ ret_val -= (PAGE_SIZE - rc);
+ if (rc)
+ break;
+
+ cond_resched();
+ }
+ return ret_val;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e43142c15631..d67787d10ff0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -179,7 +179,7 @@ static void release_memory_resource(struct resource *res)
void get_page_bootmem(unsigned long info, struct page *page,
unsigned long type)
{
- page->lru.next = (struct list_head *) type;
+ page->freelist = (void *)type;
SetPagePrivate(page);
set_page_private(page, info);
page_ref_inc(page);
@@ -189,11 +189,12 @@ void put_page_bootmem(struct page *page)
{
unsigned long type;
- type = (unsigned long) page->lru.next;
+ type = (unsigned long) page->freelist;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (page_ref_dec_return(page) == 1) {
+ page->freelist = NULL;
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
@@ -1033,36 +1034,39 @@ static void node_states_set_node(int node, struct memory_notify *arg)
node_set_state(node, N_MEMORY);
}
-int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
- enum zone_type target)
+bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
+ enum zone_type target, int *zone_shift)
{
struct zone *zone = page_zone(pfn_to_page(pfn));
enum zone_type idx = zone_idx(zone);
int i;
+ *zone_shift = 0;
+
if (idx < target) {
/* pages must be at end of current zone */
if (pfn + nr_pages != zone_end_pfn(zone))
- return 0;
+ return false;
/* no zones in use between current zone and target */
for (i = idx + 1; i < target; i++)
if (zone_is_initialized(zone - idx + i))
- return 0;
+ return false;
}
if (target < idx) {
/* pages must be at beginning of current zone */
if (pfn != zone->zone_start_pfn)
- return 0;
+ return false;
/* no zones in use between current zone and target */
for (i = target + 1; i < idx; i++)
if (zone_is_initialized(zone - idx + i))
- return 0;
+ return false;
}
- return target - idx;
+ *zone_shift = target - idx;
+ return true;
}
/* Must be protected by mem_hotplug_begin() */
@@ -1089,10 +1093,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
!can_online_high_movable(zone))
return -EINVAL;
- if (online_type == MMOP_ONLINE_KERNEL)
- zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL);
- else if (online_type == MMOP_ONLINE_MOVABLE)
- zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE);
+ if (online_type == MMOP_ONLINE_KERNEL) {
+ if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
+ return -EINVAL;
+ } else if (online_type == MMOP_ONLINE_MOVABLE) {
+ if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
+ return -EINVAL;
+ }
zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
if (!zone)
@@ -1477,17 +1484,20 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) is belongs to the same zone.
+ * Confirm all pages in a range [start, end) belong to the same zone.
+ * When true, return its valid [start, end).
*/
-int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
+int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long *valid_start, unsigned long *valid_end)
{
unsigned long pfn, sec_end_pfn;
+ unsigned long start, end;
struct zone *zone = NULL;
struct page *page;
int i;
- for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn);
+ for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
pfn < end_pfn;
- pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) {
+ pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
/* Make sure the memory section is present first */
if (!present_section_nr(pfn_to_section_nr(pfn)))
continue;
@@ -1503,10 +1513,20 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
return 0;
+ if (!zone)
+ start = pfn + i;
zone = page_zone(page);
+ end = pfn + MAX_ORDER_NR_PAGES;
}
}
- return 1;
+
+ if (zone) {
+ *valid_start = start;
+ *valid_end = end;
+ return 1;
+ } else {
+ return 0;
+ }
}
/*
@@ -1833,6 +1853,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
long offlined_pages;
int ret, drain, retry_max, node;
unsigned long flags;
+ unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
@@ -1843,10 +1864,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
return -EINVAL;
/* This makes hotplug much easier...and readable.
we assume this for now. .*/
- if (!test_pages_in_a_zone(start_pfn, end_pfn))
+ if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
return -EINVAL;
- zone = page_zone(pfn_to_page(start_pfn));
+ zone = page_zone(pfn_to_page(valid_start));
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2e346645eb80..1e7873e40c9a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2017,8 +2017,8 @@ retry_cpuset:
nmask = policy_nodemask(gfp, pol);
zl = policy_zonelist(gfp, pol, node);
- mpol_cond_put(pol);
page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+ mpol_cond_put(pol);
out:
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
diff --git a/mm/mmap.c b/mm/mmap.c
index 3714aa4e6f81..2ffca2181a38 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2806,11 +2806,11 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
-static int do_brk(unsigned long addr, unsigned long request)
+static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
- unsigned long flags, len;
+ unsigned long len;
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
@@ -2821,7 +2821,10 @@ static int do_brk(unsigned long addr, unsigned long request)
if (!len)
return 0;
- flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ /* Until we need other flags, refuse anything except VM_EXEC. */
+ if ((flags & (~VM_EXEC)) != 0)
+ return -EINVAL;
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
if (offset_in_page(error))
@@ -2889,7 +2892,12 @@ out:
return 0;
}
-int vm_brk(unsigned long addr, unsigned long len)
+/* Forward to do_brk_flags() with no additional vm flags requested. */
+static int do_brk(unsigned long addr, unsigned long len)
+{
+ return do_brk_flags(addr, len, 0);
+}
+
+int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
int ret;
@@ -2898,13 +2906,19 @@ int vm_brk(unsigned long addr, unsigned long len)
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
- ret = do_brk(addr, len);
+ ret = do_brk_flags(addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
up_write(&mm->mmap_sem);
if (populate && !ret)
mm_populate(addr, len);
return ret;
}
+EXPORT_SYMBOL(vm_brk_flags);
+
+/* Forward to vm_brk_flags() with no additional vm flags requested. */
+int vm_brk(unsigned long addr, unsigned long len)
+{
+ return vm_brk_flags(addr, len, 0);
+}
EXPORT_SYMBOL(vm_brk);
/* Release all mmaps. */
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 5652be858e5e..a51c0a67ea3d 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -60,7 +60,7 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z,
* Find the next suitable zone to use for the allocation.
* Only filter based on nodemask if it's set
*/
- if (likely(nodes == NULL))
+ if (unlikely(nodes == NULL))
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
else
diff --git a/mm/mprotect.c b/mm/mprotect.c
index f9c07f54dd62..a45b4dc6a7f5 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -33,34 +33,6 @@
#include "internal.h"
-/*
- * For a prot_numa update we only hold mmap_sem for read so there is a
- * potential race with faulting where a pmd was temporarily none. This
- * function checks for a transhuge pmd under the appropriate lock. It
- * returns a pte if it was successfully locked or NULL if it raced with
- * a transhuge insertion.
- */
-static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, int prot_numa, spinlock_t **ptl)
-{
- pte_t *pte;
- spinlock_t *pmdl;
-
- /* !prot_numa is protected by mmap_sem held for write */
- if (!prot_numa)
- return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
-
- pmdl = pmd_lock(vma->vm_mm, pmd);
- if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
- spin_unlock(pmdl);
- return NULL;
- }
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
- spin_unlock(pmdl);
- return pte;
-}
-
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
@@ -71,7 +43,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long pages = 0;
int target_node = NUMA_NO_NODE;
- pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
+ /*
+ * Can be called with only the mmap_sem for reading by
+ * prot_numa so we must check the pmd isn't constantly
+ * changing from under us from pmd_none to pmd_trans_huge
+ * and/or the other way around.
+ */
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ /*
+ * The pmd points to a regular pte so the pmd can't change
+ * from under us even if the mmap_sem is only held for
+ * reading.
+ */
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (!pte)
return 0;
@@ -177,8 +163,6 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
__split_huge_pmd(vma, pmd, addr, false, NULL);
- if (pmd_trans_unstable(pmd))
- continue;
} else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
diff --git a/mm/mremap.c b/mm/mremap.c
index 30d7d2482eea..8779928d6a70 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/mm-arch-hooks.h>
+#include <linux/userfaultfd_k.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -250,7 +251,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
- unsigned long new_len, unsigned long new_addr, bool *locked)
+ unsigned long new_len, unsigned long new_addr,
+ bool *locked, struct vm_userfaultfd_ctx *uf)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -309,6 +311,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
old_addr = new_addr;
new_addr = err;
} else {
+ mremap_userfaultfd_prep(new_vma, uf);
arch_remap(mm, old_addr, old_addr + old_len,
new_addr, new_addr + new_len);
}
@@ -413,7 +416,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
}
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
- unsigned long new_addr, unsigned long new_len, bool *locked)
+ unsigned long new_addr, unsigned long new_len, bool *locked,
+ struct vm_userfaultfd_ctx *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -458,7 +462,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (offset_in_page(ret))
goto out1;
- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf);
if (!(offset_in_page(ret)))
goto out;
out1:
@@ -497,6 +501,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long ret = -EINVAL;
unsigned long charged = 0;
bool locked = false;
+ struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
return ret;
@@ -523,7 +528,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked);
+ &locked, &uf);
goto out;
}
@@ -592,7 +597,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
goto out;
}
- ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr,
+ &locked, &uf);
}
out:
if (offset_in_page(ret)) {
@@ -602,5 +608,6 @@ out:
up_write(&current->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
+ mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
return ret;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index e366354f777d..782e83a14535 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1191,7 +1191,7 @@ error_free:
enomem:
pr_err("Allocation of length %lu from process %d (%s) failed\n",
len, current->pid, current->comm);
- show_free_areas(0);
+ show_free_areas(0, NULL);
return -ENOMEM;
}
@@ -1412,13 +1412,13 @@ error_getting_vma:
kmem_cache_free(vm_region_jar, region);
pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
len, current->pid);
- show_free_areas(0);
+ show_free_areas(0, NULL);
return -ENOMEM;
error_getting_region:
pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
len, current->pid);
- show_free_areas(0);
+ show_free_areas(0, NULL);
return -ENOMEM;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ec9f11d4f094..8256788ac119 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -417,7 +417,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
if (oc->memcg)
mem_cgroup_print_oom_info(oc->memcg, p);
else
- show_mem(SHOW_MEM_FILTER_NODES);
+ show_mem(SHOW_MEM_FILTER_NODES, nm);
if (sysctl_oom_dump_tasks)
dump_tasks(oc->memcg, oc->nodemask);
}
@@ -465,8 +465,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
struct mmu_gather tlb;
struct vm_area_struct *vma;
- struct zap_details details = {.check_swap_entries = true,
- .ignore_dirty = true};
bool ret = true;
/*
@@ -510,14 +508,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
tlb_gather_mmu(&tlb, mm, 0, -1);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (is_vm_hugetlb_page(vma))
- continue;
-
- /*
- * mlocked VMAs require explicit munlocking before unmap.
- * Let's keep it simple here and skip such VMAs.
- */
- if (vma->vm_flags & VM_LOCKED)
+ if (!can_madv_dontneed_vma(vma))
continue;
/*
@@ -532,7 +523,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
- &details);
+ NULL);
}
tlb_finish_mmu(&tlb, 0, -1);
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
@@ -1013,7 +1004,7 @@ bool out_of_memory(struct oom_control *oc)
* make sure exclude 0 mask - all other users should have at least
* ___GFP_DIRECT_RECLAIM to get here.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
+ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
return true;
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 290e8b7d3181..216449825859 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1988,11 +1988,11 @@ void laptop_mode_timer_fn(unsigned long data)
* We want to write everything out, not just down to the dirty
* threshold
*/
- if (!bdi_has_dirty_io(&q->backing_dev_info))
+ if (!bdi_has_dirty_io(q->backing_dev_info))
return;
rcu_read_lock();
- list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
+ list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node)
if (wb_has_dirty_io(wb))
wb_start_writeback(wb, nr_pages, true,
WB_REASON_LAPTOP_TIMER);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c6d5f64feca..c21b33668133 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,6 +55,7 @@
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
+#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
@@ -714,7 +715,7 @@ static inline void rmv_page_order(struct page *page)
/*
* This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if
- * (a) the buddy is not in a hole &&
+ * (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
@@ -729,9 +730,6 @@ static inline void rmv_page_order(struct page *page)
static inline int page_is_buddy(struct page *page, struct page *buddy,
unsigned int order)
{
- if (!pfn_valid_within(page_to_pfn(buddy)))
- return 0;
-
if (page_is_guard(buddy) && page_order(buddy) == order) {
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
@@ -787,9 +785,8 @@ static inline void __free_one_page(struct page *page,
struct zone *zone, unsigned int order,
int migratetype)
{
- unsigned long page_idx;
- unsigned long combined_idx;
- unsigned long uninitialized_var(buddy_idx);
+ unsigned long combined_pfn;
+ unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
@@ -802,15 +799,16 @@ static inline void __free_one_page(struct page *page,
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
- page_idx = pfn & ((1 << MAX_ORDER) - 1);
-
- VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+ VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
while (order < max_order - 1) {
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
+
+ if (!pfn_valid_within(buddy_pfn))
+ goto done_merging;
if (!page_is_buddy(page, buddy, order))
goto done_merging;
/*
@@ -824,9 +822,9 @@ continue_merging:
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
}
- combined_idx = buddy_idx & page_idx;
- page = page + (combined_idx - page_idx);
- page_idx = combined_idx;
+ combined_pfn = buddy_pfn & pfn;
+ page = page + (combined_pfn - pfn);
+ pfn = combined_pfn;
order++;
}
if (max_order < MAX_ORDER) {
@@ -841,8 +839,8 @@ continue_merging:
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
@@ -865,12 +863,12 @@ done_merging:
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
struct page *higher_page, *higher_buddy;
- combined_idx = buddy_idx & page_idx;
- higher_page = page + (combined_idx - page_idx);
- buddy_idx = __find_buddy_index(combined_idx, order + 1);
- higher_buddy = higher_page + (buddy_idx - combined_idx);
+ combined_pfn = buddy_pfn & pfn;
+ higher_page = page + (combined_pfn - pfn);
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
@@ -1864,14 +1862,14 @@ int move_freepages(struct zone *zone,
#endif
for (page = start_page; page <= end_page;) {
- /* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+
if (!PageBuddy(page)) {
page++;
continue;
@@ -2583,30 +2581,22 @@ int __isolate_free_page(struct page *page, unsigned int order)
* Update NUMA hit/miss statistics
*
* Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
*/
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
- gfp_t flags)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
#ifdef CONFIG_NUMA
- int local_nid = numa_node_id();
enum zone_stat_item local_stat = NUMA_LOCAL;
- if (unlikely(flags & __GFP_OTHER_NODE)) {
+ if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
- local_nid = preferred_zone->node;
- }
- if (z->node == local_nid) {
+ if (z->node == preferred_zone->node)
__inc_zone_state(z, NUMA_HIT);
- __inc_zone_state(z, local_stat);
- } else {
+ else {
__inc_zone_state(z, NUMA_MISS);
__inc_zone_state(preferred_zone, NUMA_FOREIGN);
}
+ __inc_zone_state(z, local_stat);
#endif
}
@@ -2674,7 +2664,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
}
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
- zone_statistics(preferred_zone, zone, gfp_flags);
+ zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
@@ -3015,18 +3005,12 @@ static inline bool should_suppress_show_mem(void)
return ret;
}
-static DEFINE_RATELIMIT_STATE(nopage_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
- struct va_format vaf;
- va_list args;
+ static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
- debug_guardpage_minorder() > 0)
+ if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
return;
/*
@@ -3041,6 +3025,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
+ show_mem(filter, nodemask);
+}
+
+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+ debug_guardpage_minorder() > 0)
+ return;
+
pr_warn("%s: ", current->comm);
va_start(args, fmt);
@@ -3049,11 +3047,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
pr_cont("%pV", &vaf);
va_end(args);
- pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
+ pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
+ if (nodemask)
+ pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
+ else
+ pr_cont("(null)\n");
+
+ cpuset_print_current_mems_allowed();
dump_stack();
- if (!should_suppress_show_mem())
- show_mem(filter);
+ warn_alloc_show_mem(gfp_mask, nodemask);
+}
+
+static inline struct page *
+__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
+ unsigned int alloc_flags,
+ const struct alloc_context *ac)
+{
+ struct page *page;
+
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags|ALLOC_CPUSET, ac);
+ /*
+ * fallback to ignore cpuset restriction if our nodes
+ * are depleted
+ */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags, ac);
+
+ return page;
}
static inline struct page *
@@ -3091,47 +3114,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (page)
goto out;
- if (!(gfp_mask & __GFP_NOFAIL)) {
- /* Coredumps can quickly deplete all memory reserves */
- if (current->flags & PF_DUMPCORE)
- goto out;
- /* The OOM killer will not help higher order allocs */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- goto out;
- /* The OOM killer does not needlessly kill tasks for lowmem */
- if (ac->high_zoneidx < ZONE_NORMAL)
- goto out;
- if (pm_suspended_storage())
- goto out;
- /*
- * XXX: GFP_NOFS allocations should rather fail than rely on
- * other request to make a forward progress.
- * We are in an unfortunate situation where out_of_memory cannot
- * do much for this context but let's try it to at least get
- * access to memory reserved if the current task is killed (see
- * out_of_memory). Once filesystems are ready to handle allocation
- * failures more gracefully we should just bail out here.
- */
+ /* Coredumps can quickly deplete all memory reserves */
+ if (current->flags & PF_DUMPCORE)
+ goto out;
+ /* The OOM killer will not help higher order allocs */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto out;
+ /* The OOM killer does not needlessly kill tasks for lowmem */
+ if (ac->high_zoneidx < ZONE_NORMAL)
+ goto out;
+ if (pm_suspended_storage())
+ goto out;
+ /*
+ * XXX: GFP_NOFS allocations should rather fail than rely on
+ * other requests to make forward progress.
+ * We are in an unfortunate situation where out_of_memory cannot
+ * do much for this context but let's try it to at least get
+ * access to memory reserved if the current task is killed (see
+ * out_of_memory). Once filesystems are ready to handle allocation
+ * failures more gracefully we should just bail out here.
+ */
+
+ /* The OOM killer may not free memory on a specific node */
+ if (gfp_mask & __GFP_THISNODE)
+ goto out;
- /* The OOM killer may not free memory on a specific node */
- if (gfp_mask & __GFP_THISNODE)
- goto out;
- }
/* Exhausted what can be done so it's blamo time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
- if (gfp_mask & __GFP_NOFAIL) {
- page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
- /*
- * fallback to ignore cpuset restriction if our nodes
- * are depleted
- */
- if (!page)
- page = get_page_from_freelist(gfp_mask, order,
+ /*
+ * Help non-failing allocations by giving them access to memory
+ * reserves
+ */
+ if (gfp_mask & __GFP_NOFAIL)
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order,
ALLOC_NO_WATERMARKS, ac);
- }
}
out:
mutex_unlock(&oom_lock);
@@ -3200,6 +3218,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
{
int max_retries = MAX_COMPACT_RETRIES;
int min_priority;
+ bool ret = false;
+ int retries = *compaction_retries;
+ enum compact_priority priority = *compact_priority;
if (!order)
return false;
@@ -3221,8 +3242,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
* But do not retry if the given zonelist is not suitable for
* compaction.
*/
- if (compaction_withdrawn(compact_result))
- return compaction_zonelist_suitable(ac, order, alloc_flags);
+ if (compaction_withdrawn(compact_result)) {
+ ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+ goto out;
+ }
/*
* !costly requests are much more important than __GFP_REPEAT
@@ -3234,8 +3257,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
*/
if (order > PAGE_ALLOC_COSTLY_ORDER)
max_retries /= 4;
- if (*compaction_retries <= max_retries)
- return true;
+ if (*compaction_retries <= max_retries) {
+ ret = true;
+ goto out;
+ }
/*
* Make sure there are attempts at the highest priority if we exhausted
@@ -3244,12 +3269,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
check_priority:
min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+
if (*compact_priority > min_priority) {
(*compact_priority)--;
*compaction_retries = 0;
- return true;
+ ret = true;
}
- return false;
+out:
+ trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
+ return ret;
}
#else
static inline struct page *
@@ -3472,6 +3500,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
ac->nodemask) {
unsigned long available;
unsigned long reclaimable;
+ unsigned long min_wmark = min_wmark_pages(zone);
+ bool wmark;
available = reclaimable = zone_reclaimable_pages(zone);
available -= DIV_ROUND_UP((*no_progress_loops) * available,
@@ -3482,8 +3512,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* Would the allocation succeed if we reclaimed the whole
* available?
*/
- if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
- ac_classzone_idx(ac), alloc_flags, available)) {
+ wmark = __zone_watermark_ok(zone, order, min_wmark,
+ ac_classzone_idx(ac), alloc_flags, available);
+ trace_reclaim_retry_zone(z, order, reclaimable,
+ available, min_wmark, *no_progress_loops, wmark);
+ if (wmark) {
/*
* If we didn't make any progress and have a lot of
* dirty + writeback pages then we should wait for
@@ -3531,12 +3564,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
- enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
+ enum compact_priority compact_priority;
enum compact_result compact_result;
- int compaction_retries = 0;
- int no_progress_loops = 0;
+ int compaction_retries;
+ int no_progress_loops;
unsigned long alloc_start = jiffies;
unsigned int stall_timeout = 10 * HZ;
+ unsigned int cpuset_mems_cookie;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3557,6 +3591,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
+retry_cpuset:
+ compaction_retries = 0;
+ no_progress_loops = 0;
+ compact_priority = DEF_COMPACT_PRIORITY;
+ cpuset_mems_cookie = read_mems_allowed_begin();
+
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
@@ -3564,6 +3604,17 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ /*
+ * We need to recalculate the starting point for the zonelist iterator
+ * because we might have used different nodemask in the fast path, or
+ * there was a cpuset modification and we are retrying - otherwise we
+ * could end up iterating over non-eligible zones endlessly.
+ */
+ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->high_zoneidx, ac->nodemask);
+ if (!ac->preferred_zoneref->zone)
+ goto nopage;
+
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
@@ -3640,35 +3691,21 @@ retry:
goto got_pg;
/* Caller is not willing to reclaim, we can't balance anything */
- if (!can_direct_reclaim) {
- /*
- * All existing users of the __GFP_NOFAIL are blockable, so warn
- * of any new users that actually allow this type of allocation
- * to fail.
- */
- WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
+ if (!can_direct_reclaim)
goto nopage;
- }
- /* Avoid recursion of direct reclaim */
- if (current->flags & PF_MEMALLOC) {
- /*
- * __GFP_NOFAIL request from this context is rather bizarre
- * because we cannot reclaim anything and only can loop waiting
- * for somebody to do a work for us.
- */
- if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
- cond_resched();
- goto retry;
- }
- goto nopage;
+ /* Make sure we know about allocations which stall for too long */
+ if (time_after(jiffies, alloc_start + stall_timeout)) {
+ warn_alloc(gfp_mask, ac->nodemask,
+ "page allocation stalls for %ums, order:%u",
+ jiffies_to_msecs(jiffies-alloc_start), order);
+ stall_timeout += 10 * HZ;
}
- /* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+ /* Avoid recursion of direct reclaim */
+ if (current->flags & PF_MEMALLOC)
goto nopage;
-
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
@@ -3692,14 +3729,6 @@ retry:
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
goto nopage;
- /* Make sure we know about allocations which stall for too long */
- if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask,
- "page allocation stalls for %ums, order:%u",
- jiffies_to_msecs(jiffies-alloc_start), order);
- stall_timeout += 10 * HZ;
- }
-
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
@@ -3716,11 +3745,22 @@ retry:
&compaction_retries))
goto retry;
+ /*
+ * It's possible we raced with cpuset update so the OOM would be
+ * premature (see below the nopage: label for full explanation).
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry_cpuset;
+
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
+ /* Avoid allocations with no watermarks from looping endlessly */
+ if (test_thread_flag(TIF_MEMDIE))
+ goto nopage;
+
/* Retry as long as the OOM killer is making progress */
if (did_some_progress) {
no_progress_loops = 0;
@@ -3728,7 +3768,58 @@ retry:
}
nopage:
- warn_alloc(gfp_mask,
+ /*
+ * When updating a task's mems_allowed or mempolicy nodemask, it is
+ * possible to race with parallel threads in such a way that our
+ * allocation can fail while the mask is being updated. If we are about
+ * to fail, check if the cpuset changed during allocation and if so,
+ * retry.
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry_cpuset;
+
+ /*
+ * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
+ * we always retry
+ */
+ if (gfp_mask & __GFP_NOFAIL) {
+ /*
+ * All existing users of the __GFP_NOFAIL are blockable, so warn
+ * of any new users that actually require GFP_NOWAIT
+ */
+ if (WARN_ON_ONCE(!can_direct_reclaim))
+ goto fail;
+
+ /*
+ * PF_MEMALLOC request from this context is rather bizarre
+ * because we cannot reclaim anything and only can loop waiting
+ * for somebody to do the work for us
+ */
+ WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+
+ /*
+ * Non-failing costly orders are a hard requirement which we
+ * are not well prepared for, so let's warn about these users
+ * so that we can identify them and convert them to something
+ * else.
+ */
+ WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
+
+ /*
+ * Help non-failing allocations by giving them access to memory
+ * reserves but do not use ALLOC_NO_WATERMARKS because this
+ * could deplete whole memory reserves which would just make
+ * the situation worse
+ */
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
+ if (page)
+ goto got_pg;
+
+ cond_resched();
+ goto retry;
+ }
+fail:
+ warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
got_pg:
return page;
@@ -3742,7 +3833,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
struct page *page;
- unsigned int cpuset_mems_cookie;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = {
@@ -3779,9 +3869,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
-retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
-
/* Dirty zone balancing only done in the fast path */
ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -3792,8 +3879,13 @@ retry_cpuset:
*/
ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
ac.high_zoneidx, ac.nodemask);
- if (!ac.preferred_zoneref) {
+ if (!ac.preferred_zoneref->zone) {
page = NULL;
+ /*
+ * This might be due to race with cpuset_current_mems_allowed
+ * update, so make sure we retry with original nodemask in the
+ * slow path.
+ */
goto no_zone;
}
@@ -3802,6 +3894,7 @@ retry_cpuset:
if (likely(page))
goto out;
+no_zone:
/*
* Runtime PM, block IO and its error handling path can deadlock
* because I/O on the device might not complete.
@@ -3813,21 +3906,10 @@ retry_cpuset:
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
- if (cpusets_enabled())
+ if (unlikely(ac.nodemask != nodemask))
ac.nodemask = nodemask;
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
-no_zone:
- /*
- * When updating a task's mems_allowed, it is possible to race with
- * parallel threads in such a way that an allocation can fail while
- * the mask is being updated. If a page allocation is about to fail,
- * check if the cpuset changed during allocation and if so, retry.
- */
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
- alloc_mask = gfp_mask;
- goto retry_cpuset;
- }
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
@@ -3904,8 +3986,8 @@ EXPORT_SYMBOL(free_pages);
* drivers to provide a backing region of memory for use as either an
* sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
*/
-static struct page *__page_frag_refill(struct page_frag_cache *nc,
- gfp_t gfp_mask)
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+ gfp_t gfp_mask)
{
struct page *page = NULL;
gfp_t gfp = gfp_mask;
@@ -3925,22 +4007,23 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
return page;
}
-void __page_frag_drain(struct page *page, unsigned int order,
- unsigned int count)
+void __page_frag_cache_drain(struct page *page, unsigned int count)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
if (page_ref_sub_and_test(page, count)) {
+ unsigned int order = compound_order(page);
+
if (order == 0)
free_hot_cold_page(page, false);
else
__free_pages_ok(page, order);
}
}
-EXPORT_SYMBOL(__page_frag_drain);
+EXPORT_SYMBOL(__page_frag_cache_drain);
-void *__alloc_page_frag(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask)
+void *page_frag_alloc(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -3948,7 +4031,7 @@ void *__alloc_page_frag(struct page_frag_cache *nc,
if (unlikely(!nc->va)) {
refill:
- page = __page_frag_refill(nc, gfp_mask);
+ page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
@@ -3991,19 +4074,19 @@ refill:
return nc->va + offset;
}
-EXPORT_SYMBOL(__alloc_page_frag);
+EXPORT_SYMBOL(page_frag_alloc);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
*/
-void __free_page_frag(void *addr)
+void page_frag_free(void *addr)
{
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
__free_pages_ok(page, compound_order(page));
}
-EXPORT_SYMBOL(__free_page_frag);
+EXPORT_SYMBOL(page_frag_free);
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
@@ -4233,20 +4316,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
* Determine whether the node should be displayed or not, depending on whether
* SHOW_MEM_FILTER_NODES was passed to show_free_areas().
*/
-bool skip_free_areas_node(unsigned int flags, int nid)
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
{
- bool ret = false;
- unsigned int cpuset_mems_cookie;
-
if (!(flags & SHOW_MEM_FILTER_NODES))
- goto out;
+ return false;
- do {
- cpuset_mems_cookie = read_mems_allowed_begin();
- ret = !node_isset(nid, cpuset_current_mems_allowed);
- } while (read_mems_allowed_retry(cpuset_mems_cookie));
-out:
- return ret;
+ /*
+ * no node mask - aka implicit memory numa policy. Do not bother with
+ * the synchronization - read_mems_allowed_begin - because we do not
+ * have to be precise here.
+ */
+ if (!nodemask)
+ nodemask = &cpuset_current_mems_allowed;
+
+ return !node_isset(nid, *nodemask);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -4287,7 +4370,7 @@ static void show_migration_types(unsigned char type)
* SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
* cpuset.
*/
-void show_free_areas(unsigned int filter)
+void show_free_areas(unsigned int filter, nodemask_t *nodemask)
{
unsigned long free_pcp = 0;
int cpu;
@@ -4295,7 +4378,7 @@ void show_free_areas(unsigned int filter)
pg_data_t *pgdat;
for_each_populated_zone(zone) {
- if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
for_each_online_cpu(cpu)
@@ -4329,6 +4412,9 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_FREE_CMA_PAGES));
for_each_online_pgdat(pgdat) {
+ if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
+ continue;
+
printk("Node %d"
" active_anon:%lukB"
" inactive_anon:%lukB"
@@ -4378,7 +4464,7 @@ void show_free_areas(unsigned int filter)
for_each_populated_zone(zone) {
int i;
- if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
free_pcp = 0;
@@ -4443,7 +4529,7 @@ void show_free_areas(unsigned int filter)
unsigned long nr[MAX_ORDER], flags, total = 0;
unsigned char types[MAX_ORDER];
- if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
show_node(zone);
printk(KERN_CONT "%s: ", zone->name);
@@ -5064,8 +5150,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
if (context != MEMMAP_EARLY)
goto not_early;
- if (!early_pfn_valid(pfn))
+ if (!early_pfn_valid(pfn)) {
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ /*
+ * Skip to the pfn preceding the next valid one (or
+ * end_pfn), such that we hit a valid pfn (or end_pfn)
+ * on our next iteration of the loop.
+ */
+ pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
+#endif
continue;
+ }
if (!early_pfn_in_nid(pfn, nid))
continue;
if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
@@ -7255,6 +7350,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
+ .gfp_mask = GFP_KERNEL,
};
INIT_LIST_HEAD(&cc.migratepages);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a5594bfcc5ed..f4e17a57926a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -83,7 +83,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
unsigned long flags, nr_pages;
bool isolated_page = false;
unsigned int order;
- unsigned long page_idx, buddy_idx;
+ unsigned long pfn, buddy_pfn;
struct page *buddy;
zone = page_zone(page);
@@ -102,11 +102,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
if (PageBuddy(page)) {
order = page_order(page);
if (order >= pageblock_order) {
- page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ pfn = page_to_pfn(page);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
- if (pfn_valid_within(page_to_pfn(buddy)) &&
+ if (pfn_valid_within(buddy_pfn) &&
!is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
isolated_page = true;
diff --git a/mm/shmem.c b/mm/shmem.c
index bb53285a1d99..9c6d22ff44e2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -34,6 +34,8 @@
#include <linux/uio.h>
#include <linux/khugepaged.h>
+#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+
static struct vfsmount *shm_mnt;
#ifdef CONFIG_SHMEM
@@ -70,6 +72,8 @@ static struct vfsmount *shm_mnt;
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/rmap.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
@@ -115,13 +119,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
- gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
+ gfp_t gfp, struct vm_area_struct *vma,
+ struct vm_fault *vmf, int *fault_type);
int shmem_getpage(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp)
{
return shmem_getpage_gfp(inode, index, pagep, sgp,
- mapping_gfp_mask(inode->i_mapping), NULL, NULL);
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -190,6 +195,11 @@ static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;
+bool vma_is_shmem(struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &shmem_vm_ops;
+}
+
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
@@ -415,6 +425,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
struct shrink_control *sc, unsigned long nr_to_split)
{
LIST_HEAD(list), *pos, *next;
+ LIST_HEAD(to_remove);
struct inode *inode;
struct shmem_inode_info *info;
struct page *page;
@@ -441,9 +452,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
/* Check if there's anything to gain */
if (round_up(inode->i_size, PAGE_SIZE) ==
round_up(inode->i_size, HPAGE_PMD_SIZE)) {
- list_del_init(&info->shrinklist);
+ list_move(&info->shrinklist, &to_remove);
removed++;
- iput(inode);
goto next;
}
@@ -454,6 +464,13 @@ next:
}
spin_unlock(&sbinfo->shrinklist_lock);
+ list_for_each_safe(pos, next, &to_remove) {
+ info = list_entry(pos, struct shmem_inode_info, shrinklist);
+ inode = &info->vfs_inode;
+ list_del_init(&info->shrinklist);
+ iput(inode);
+ }
+
list_for_each_safe(pos, next, &list) {
int ret;
@@ -1563,7 +1580,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp, gfp_t gfp,
- struct mm_struct *fault_mm, int *fault_type)
+ struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -1617,7 +1634,7 @@ repeat:
* bring it back from swap or allocate.
*/
sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = fault_mm ? : current->mm;
+ charge_mm = vma ? vma->vm_mm : current->mm;
if (swap.val) {
/* Look it up and read it in.. */
@@ -1627,7 +1644,8 @@ repeat:
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT);
+ mem_cgroup_count_vm_event(charge_mm,
+ PGMAJFAULT);
}
/* Here we actually start the io */
page = shmem_swapin(swap, gfp, info, index);
@@ -1696,6 +1714,11 @@ repeat:
swap_free(swap);
} else {
+ if (vma && userfaultfd_missing(vma)) {
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
+ return 0;
+ }
+
/* shmem_symlink() */
if (mapping->a_ops != &shmem_aops)
goto alloc_nohuge;
@@ -1958,7 +1981,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
sgp = SGP_NOHUGE;
error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
- gfp, vma->vm_mm, &ret);
+ gfp, vma, vmf, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
return ret;
@@ -2168,10 +2191,123 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
bool shmem_mapping(struct address_space *mapping)
{
- if (!mapping->host)
- return false;
+ return mapping->a_ops == &shmem_aops;
+}
- return mapping->host->i_sb->s_op == &shmem_ops;
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ gfp_t gfp = mapping_gfp_mask(mapping);
+ pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct mem_cgroup *memcg;
+ spinlock_t *ptl;
+ void *page_kaddr;
+ struct page *page;
+ pte_t _dst_pte, *dst_pte;
+ int ret;
+
+ ret = -ENOMEM;
+ if (shmem_acct_block(info->flags, 1))
+ goto out;
+ if (sbinfo->max_blocks) {
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks) >= 0)
+ goto out_unacct_blocks;
+ percpu_counter_inc(&sbinfo->used_blocks);
+ }
+
+ if (!*pagep) {
+ page = shmem_alloc_page(gfp, info, pgoff);
+ if (!page)
+ goto out_dec_used_blocks;
+
+ page_kaddr = kmap_atomic(page);
+ ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
+ PAGE_SIZE);
+ kunmap_atomic(page_kaddr);
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ *pagep = page;
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -1);
+ shmem_unacct_blocks(info->flags, 1);
+ /* don't free the page */
+ return -EFAULT;
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ __SetPageUptodate(page);
+
+ ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+ if (ret)
+ goto out_release;
+
+ ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
+ if (!ret) {
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
+ radix_tree_preload_end();
+ }
+ if (ret)
+ goto out_release_uncharge;
+
+ mem_cgroup_commit_charge(page, memcg, false, false);
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+
+ ret = -EEXIST;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!pte_none(*dst_pte))
+ goto out_release_uncharge_unlock;
+
+ lru_cache_add_anon(page);
+
+ spin_lock(&info->lock);
+ info->alloced++;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+
+ inc_mm_counter(dst_mm, mm_counter_file(page));
+ page_add_file_rmap(page, false);
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ unlock_page(page);
+ pte_unmap_unlock(dst_pte, ptl);
+ ret = 0;
+out:
+ return ret;
+out_release_uncharge_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+out_release_uncharge:
+ mem_cgroup_cancel_charge(page, memcg, false);
+out_release:
+ unlock_page(page);
+ put_page(page);
+out_dec_used_blocks:
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -1);
+out_unacct_blocks:
+ shmem_unacct_blocks(info->flags, 1);
+ goto out;
}
#ifdef CONFIG_TMPFS
@@ -4133,7 +4269,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
BUG_ON(mapping->a_ops != &shmem_aops);
error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
- gfp, NULL, NULL);
+ gfp, NULL, NULL, NULL);
if (error)
page = ERR_PTR(error);
else
diff --git a/mm/slab.c b/mm/slab.c
index 29bc6c0dedd0..bd63450a9b16 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1288,7 +1288,8 @@ void __init kmem_cache_init(void)
* Initialize the caches that provide memory for the kmem_cache_node
* structures first. Without this, further allocations will bug.
*/
- kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
+ kmalloc_caches[INDEX_NODE] = create_kmalloc_cache(
+ kmalloc_info[INDEX_NODE].name,
kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
slab_state = PARTIAL_NODE;
setup_kmalloc_cache_index_table();
@@ -2332,6 +2333,13 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)
return (ret ? 1 : 0);
}
+#ifdef CONFIG_MEMCG
+void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
+{
+ __kmem_cache_shrink(cachep);
+}
+#endif
+
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
return __kmem_cache_shrink(cachep);
@@ -2457,7 +2465,6 @@ union freelist_init_state {
unsigned int pos;
unsigned int *list;
unsigned int count;
- unsigned int rand;
};
struct rnd_state rnd_state;
};
@@ -2483,8 +2490,7 @@ static bool freelist_state_initialize(union freelist_init_state *state,
} else {
state->list = cachep->random_seq;
state->count = count;
- state->pos = 0;
- state->rand = rand;
+ state->pos = rand % count;
ret = true;
}
return ret;
@@ -2493,7 +2499,9 @@ static bool freelist_state_initialize(union freelist_init_state *state,
/* Get the next entry on the list and randomize it using a random shift */
static freelist_idx_t next_random_slot(union freelist_init_state *state)
{
- return (state->list[state->pos++] + state->rand) % state->count;
+ if (state->pos >= state->count)
+ state->pos = 0;
+ return state->list[state->pos++];
}
/* Swap two freelist entries */
diff --git a/mm/slab.h b/mm/slab.h
index de6579dc362c..65e7c3fcac72 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -71,6 +71,12 @@ extern struct list_head slab_caches;
/* The slab cache that manages slab cache information */
extern struct kmem_cache *kmem_cache;
+/* A table of kmalloc cache names and sizes */
+extern const struct kmalloc_info_struct {
+ const char *name;
+ unsigned long size;
+} kmalloc_info[];
+
unsigned long calculate_alignment(unsigned long flags,
unsigned long align, unsigned long size);
@@ -162,6 +168,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
+void __kmemcg_cache_deactivate(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
@@ -195,17 +202,22 @@ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+
+/* List of all root caches. */
+extern struct list_head slab_root_caches;
+#define root_caches_node memcg_params.__root_caches_node
+
/*
* Iterate over all memcg caches of the given root cache. The caller must hold
* slab_mutex.
*/
#define for_each_memcg_cache(iter, root) \
- list_for_each_entry(iter, &(root)->memcg_params.list, \
- memcg_params.list)
+ list_for_each_entry(iter, &(root)->memcg_params.children, \
+ memcg_params.children_node)
static inline bool is_root_cache(struct kmem_cache *s)
{
- return s->memcg_params.is_root_cache;
+ return !s->memcg_params.root_cache;
}
static inline bool slab_equal_or_root(struct kmem_cache *s,
@@ -294,9 +306,16 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
}
extern void slab_init_memcg_params(struct kmem_cache *);
+extern void memcg_link_cache(struct kmem_cache *s);
+extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
+ void (*deact_fn)(struct kmem_cache *));
#else /* CONFIG_MEMCG && !CONFIG_SLOB */
+/* If !memcg, all caches are root. */
+#define slab_root_caches slab_caches
+#define root_caches_node list
+
#define for_each_memcg_cache(iter, root) \
for ((void)(iter), (void)(root); 0; )
@@ -341,6 +360,11 @@ static inline void memcg_uncharge_slab(struct page *page, int order,
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
+
+static inline void memcg_link_cache(struct kmem_cache *s)
+{
+}
+
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
@@ -488,6 +512,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
void *slab_start(struct seq_file *m, loff_t *pos);
void *slab_next(struct seq_file *m, void *p, loff_t *pos);
void slab_stop(struct seq_file *m, void *p);
+void *memcg_slab_start(struct seq_file *m, loff_t *pos);
+void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
+void memcg_slab_stop(struct seq_file *m, void *p);
int memcg_slab_show(struct seq_file *m, void *p);
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ae323841adb1..23ff74e61838 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -30,6 +30,11 @@ LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
+static LIST_HEAD(slab_caches_to_rcu_destroy);
+static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
+static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
+ slab_caches_to_rcu_destroy_workfn);
+
/*
* Set of flags that will prevent slab merging
*/
@@ -133,11 +138,14 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
}
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+
+LIST_HEAD(slab_root_caches);
+
void slab_init_memcg_params(struct kmem_cache *s)
{
- s->memcg_params.is_root_cache = true;
- INIT_LIST_HEAD(&s->memcg_params.list);
+ s->memcg_params.root_cache = NULL;
RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
+ INIT_LIST_HEAD(&s->memcg_params.children);
}
static int init_memcg_params(struct kmem_cache *s,
@@ -145,10 +153,11 @@ static int init_memcg_params(struct kmem_cache *s,
{
struct memcg_cache_array *arr;
- if (memcg) {
- s->memcg_params.is_root_cache = false;
- s->memcg_params.memcg = memcg;
+ if (root_cache) {
s->memcg_params.root_cache = root_cache;
+ s->memcg_params.memcg = memcg;
+ INIT_LIST_HEAD(&s->memcg_params.children_node);
+ INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
return 0;
}
@@ -177,9 +186,6 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
{
struct memcg_cache_array *old, *new;
- if (!is_root_cache(s))
- return 0;
-
new = kzalloc(sizeof(struct memcg_cache_array) +
new_array_size * sizeof(void *), GFP_KERNEL);
if (!new)
@@ -203,7 +209,7 @@ int memcg_update_all_caches(int num_memcgs)
int ret = 0;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
+ list_for_each_entry(s, &slab_root_caches, root_caches_node) {
ret = update_memcg_params(s, num_memcgs);
/*
* Instead of freeing the memory, we'll just leave the caches
@@ -215,6 +221,28 @@ int memcg_update_all_caches(int num_memcgs)
mutex_unlock(&slab_mutex);
return ret;
}
+
+void memcg_link_cache(struct kmem_cache *s)
+{
+ if (is_root_cache(s)) {
+ list_add(&s->root_caches_node, &slab_root_caches);
+ } else {
+ list_add(&s->memcg_params.children_node,
+ &s->memcg_params.root_cache->memcg_params.children);
+ list_add(&s->memcg_params.kmem_caches_node,
+ &s->memcg_params.memcg->kmem_caches);
+ }
+}
+
+static void memcg_unlink_cache(struct kmem_cache *s)
+{
+ if (is_root_cache(s)) {
+ list_del(&s->root_caches_node);
+ } else {
+ list_del(&s->memcg_params.children_node);
+ list_del(&s->memcg_params.kmem_caches_node);
+ }
+}
#else
static inline int init_memcg_params(struct kmem_cache *s,
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
@@ -225,6 +253,10 @@ static inline int init_memcg_params(struct kmem_cache *s,
static inline void destroy_memcg_params(struct kmem_cache *s)
{
}
+
+static inline void memcg_unlink_cache(struct kmem_cache *s)
+{
+}
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
/*
@@ -255,7 +287,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
{
struct kmem_cache *s;
- if (slab_nomerge || (flags & SLAB_NEVER_MERGE))
+ if (slab_nomerge)
return NULL;
if (ctor)
@@ -266,7 +298,10 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
size = ALIGN(size, align);
flags = kmem_cache_flags(size, flags, name, NULL);
- list_for_each_entry_reverse(s, &slab_caches, list) {
+ if (flags & SLAB_NEVER_MERGE)
+ return NULL;
+
+ list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
if (slab_unmergeable(s))
continue;
@@ -350,6 +385,7 @@ static struct kmem_cache *create_cache(const char *name,
s->refcount = 1;
list_add(&s->list, &slab_caches);
+ memcg_link_cache(s);
out:
if (err)
return ERR_PTR(err);
@@ -458,33 +494,58 @@ out_unlock:
}
EXPORT_SYMBOL(kmem_cache_create);
-static int shutdown_cache(struct kmem_cache *s,
- struct list_head *release, bool *need_rcu_barrier)
+static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
{
- if (__kmem_cache_shutdown(s) != 0)
- return -EBUSY;
+ LIST_HEAD(to_destroy);
+ struct kmem_cache *s, *s2;
+
+ /*
+ * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the
+ * @slab_caches_to_rcu_destroy list. The slab pages are freed
+ * through RCU and the associated kmem_cache are dereferenced
+ * while freeing the pages, so the kmem_caches should be freed only
+ * after the pending RCU operations are finished. As rcu_barrier()
+ * is a pretty slow operation, we batch all pending destructions
+ * asynchronously.
+ */
+ mutex_lock(&slab_mutex);
+ list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy);
+ mutex_unlock(&slab_mutex);
+
+ if (list_empty(&to_destroy))
+ return;
- if (s->flags & SLAB_DESTROY_BY_RCU)
- *need_rcu_barrier = true;
+ rcu_barrier();
- list_move(&s->list, release);
- return 0;
+ list_for_each_entry_safe(s, s2, &to_destroy, list) {
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_release(s);
+#else
+ slab_kmem_cache_release(s);
+#endif
+ }
}
-static void release_caches(struct list_head *release, bool need_rcu_barrier)
+static int shutdown_cache(struct kmem_cache *s)
{
- struct kmem_cache *s, *s2;
+ if (__kmem_cache_shutdown(s) != 0)
+ return -EBUSY;
- if (need_rcu_barrier)
- rcu_barrier();
+ memcg_unlink_cache(s);
+ list_del(&s->list);
- list_for_each_entry_safe(s, s2, release, list) {
+ if (s->flags & SLAB_DESTROY_BY_RCU) {
+ list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
+ schedule_work(&slab_caches_to_rcu_destroy_work);
+ } else {
#ifdef SLAB_SUPPORTS_SYSFS
- sysfs_slab_remove(s);
+ sysfs_slab_release(s);
#else
slab_kmem_cache_release(s);
#endif
}
+
+ return 0;
}
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
@@ -551,8 +612,6 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
goto out_unlock;
}
- list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
-
/*
* Since readers won't lock (see cache_from_memcg_idx()), we need a
* barrier here to ensure nobody will see the kmem_cache partially
@@ -568,6 +627,66 @@ out_unlock:
put_online_cpus();
}
+static void kmemcg_deactivate_workfn(struct work_struct *work)
+{
+ struct kmem_cache *s = container_of(work, struct kmem_cache,
+ memcg_params.deact_work);
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+
+ s->memcg_params.deact_fn(s);
+
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+
+ /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
+ css_put(&s->memcg_params.memcg->css);
+}
+
+static void kmemcg_deactivate_rcufn(struct rcu_head *head)
+{
+ struct kmem_cache *s = container_of(head, struct kmem_cache,
+ memcg_params.deact_rcu_head);
+
+ /*
+ * We need to grab blocking locks. Bounce to ->deact_work. The
+ * work item shares the space with the RCU head and can't be
+ * initialized earlier.
+ */
+ INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
+ queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work);
+}
+
+/**
+ * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
+ * sched RCU grace period
+ * @s: target kmem_cache
+ * @deact_fn: deactivation function to call
+ *
+ * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
+ * held after a sched RCU grace period. The slab is guaranteed to stay
+ * alive until @deact_fn is finished. This is to be used from
+ * __kmemcg_cache_deactivate().
+ */
+void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
+ void (*deact_fn)(struct kmem_cache *))
+{
+ if (WARN_ON_ONCE(is_root_cache(s)) ||
+ WARN_ON_ONCE(s->memcg_params.deact_fn))
+ return;
+
+ /* pin memcg so that @s doesn't get destroyed in the middle */
+ css_get(&s->memcg_params.memcg->css);
+
+ s->memcg_params.deact_fn = deact_fn;
+ call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
+}
+
void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
{
int idx;
@@ -579,41 +698,15 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
get_online_cpus();
get_online_mems();
-#ifdef CONFIG_SLUB
- /*
- * In case of SLUB, we need to disable empty slab caching to
- * avoid pinning the offline memory cgroup by freeable kmem
- * pages charged to it. SLAB doesn't need this, as it
- * periodically purges unused slabs.
- */
- mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
- c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL;
- if (c) {
- c->cpu_partial = 0;
- c->min_partial = 0;
- }
- }
- mutex_unlock(&slab_mutex);
- /*
- * kmem_cache->cpu_partial is checked locklessly (see
- * put_cpu_partial()). Make sure the change is visible.
- */
- synchronize_sched();
-#endif
-
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
- if (!is_root_cache(s))
- continue;
-
+ list_for_each_entry(s, &slab_root_caches, root_caches_node) {
arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
lockdep_is_held(&slab_mutex));
c = arr->entries[idx];
if (!c)
continue;
- __kmem_cache_shrink(c);
+ __kmemcg_cache_deactivate(c);
arr->entries[idx] = NULL;
}
mutex_unlock(&slab_mutex);
@@ -622,47 +715,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
put_online_cpus();
}
-static int __shutdown_memcg_cache(struct kmem_cache *s,
- struct list_head *release, bool *need_rcu_barrier)
-{
- BUG_ON(is_root_cache(s));
-
- if (shutdown_cache(s, release, need_rcu_barrier))
- return -EBUSY;
-
- list_del(&s->memcg_params.list);
- return 0;
-}
-
void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
- LIST_HEAD(release);
- bool need_rcu_barrier = false;
struct kmem_cache *s, *s2;
get_online_cpus();
get_online_mems();
mutex_lock(&slab_mutex);
- list_for_each_entry_safe(s, s2, &slab_caches, list) {
- if (is_root_cache(s) || s->memcg_params.memcg != memcg)
- continue;
+ list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
+ memcg_params.kmem_caches_node) {
/*
* The cgroup is about to be freed and therefore has no charges
* left. Hence, all its caches must be empty by now.
*/
- BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier));
+ BUG_ON(shutdown_cache(s));
}
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
-
- release_caches(&release, need_rcu_barrier);
}
-static int shutdown_memcg_caches(struct kmem_cache *s,
- struct list_head *release, bool *need_rcu_barrier)
+static int shutdown_memcg_caches(struct kmem_cache *s)
{
struct memcg_cache_array *arr;
struct kmem_cache *c, *c2;
@@ -681,13 +756,13 @@ static int shutdown_memcg_caches(struct kmem_cache *s,
c = arr->entries[i];
if (!c)
continue;
- if (__shutdown_memcg_cache(c, release, need_rcu_barrier))
+ if (shutdown_cache(c))
/*
* The cache still has objects. Move it to a temporary
* list so as not to try to destroy it for a second
* time while iterating over inactive caches below.
*/
- list_move(&c->memcg_params.list, &busy);
+ list_move(&c->memcg_params.children_node, &busy);
else
/*
* The cache is empty and will be destroyed soon. Clear
@@ -702,23 +777,22 @@ static int shutdown_memcg_caches(struct kmem_cache *s,
* Second, shutdown all caches left from memory cgroups that are now
* offline.
*/
- list_for_each_entry_safe(c, c2, &s->memcg_params.list,
- memcg_params.list)
- __shutdown_memcg_cache(c, release, need_rcu_barrier);
+ list_for_each_entry_safe(c, c2, &s->memcg_params.children,
+ memcg_params.children_node)
+ shutdown_cache(c);
- list_splice(&busy, &s->memcg_params.list);
+ list_splice(&busy, &s->memcg_params.children);
/*
* A cache being destroyed must be empty. In particular, this means
* that all per memcg caches attached to it must be empty too.
*/
- if (!list_empty(&s->memcg_params.list))
+ if (!list_empty(&s->memcg_params.children))
return -EBUSY;
return 0;
}
#else
-static inline int shutdown_memcg_caches(struct kmem_cache *s,
- struct list_head *release, bool *need_rcu_barrier)
+static inline int shutdown_memcg_caches(struct kmem_cache *s)
{
return 0;
}
@@ -734,8 +808,6 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
- LIST_HEAD(release);
- bool need_rcu_barrier = false;
int err;
if (unlikely(!s))
@@ -751,9 +823,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
- err = shutdown_memcg_caches(s, &release, &need_rcu_barrier);
+ err = shutdown_memcg_caches(s);
if (!err)
- err = shutdown_cache(s, &release, &need_rcu_barrier);
+ err = shutdown_cache(s);
if (err) {
pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
@@ -765,8 +837,6 @@ out_unlock:
put_online_mems();
put_online_cpus();
-
- release_caches(&release, need_rcu_barrier);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -828,6 +898,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
create_boot_cache(s, name, size, flags);
list_add(&s->list, &slab_caches);
+ memcg_link_cache(s);
s->refcount = 1;
return s;
}
@@ -912,10 +983,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
* kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
* kmalloc-67108864.
*/
-static struct {
- const char *name;
- unsigned long size;
-} const kmalloc_info[] __initconst = {
+const struct kmalloc_info_struct kmalloc_info[] __initconst = {
{NULL, 0}, {"kmalloc-96", 96},
{"kmalloc-192", 192}, {"kmalloc-8", 8},
{"kmalloc-16", 16}, {"kmalloc-32", 32},
@@ -1138,12 +1206,12 @@ static void print_slabinfo_header(struct seq_file *m)
void *slab_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&slab_mutex);
- return seq_list_start(&slab_caches, *pos);
+ return seq_list_start(&slab_root_caches, *pos);
}
void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
- return seq_list_next(p, &slab_caches, pos);
+ return seq_list_next(p, &slab_root_caches, pos);
}
void slab_stop(struct seq_file *m, void *p)
@@ -1195,25 +1263,44 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
static int slab_show(struct seq_file *m, void *p)
{
- struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+ struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node);
- if (p == slab_caches.next)
+ if (p == slab_root_caches.next)
print_slabinfo_header(m);
- if (is_root_cache(s))
- cache_show(s, m);
+ cache_show(s, m);
return 0;
}
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+void *memcg_slab_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ mutex_lock(&slab_mutex);
+ return seq_list_start(&memcg->kmem_caches, *pos);
+}
+
+void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ return seq_list_next(p, &memcg->kmem_caches, pos);
+}
+
+void memcg_slab_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&slab_mutex);
+}
+
int memcg_slab_show(struct seq_file *m, void *p)
{
- struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+ struct kmem_cache *s = list_entry(p, struct kmem_cache,
+ memcg_params.kmem_caches_node);
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- if (p == slab_caches.next)
+ if (p == memcg->kmem_caches.next)
print_slabinfo_header(m);
- if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
- cache_show(s, m);
+ cache_show(s, m);
return 0;
}
#endif
diff --git a/mm/slub.c b/mm/slub.c
index 067598a00849..7f4bc7027ed5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -214,11 +214,13 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void memcg_propagate_slab_attrs(struct kmem_cache *s);
+static void sysfs_slab_remove(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
+static inline void sysfs_slab_remove(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -496,10 +498,11 @@ static inline int check_valid_pointer(struct kmem_cache *s,
return 1;
}
-static void print_section(char *text, u8 *addr, unsigned int length)
+static void print_section(char *level, char *text, u8 *addr,
+ unsigned int length)
{
metadata_access_enable();
- print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
+ print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
length, 1);
metadata_access_disable();
}
@@ -636,14 +639,15 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
p, p - addr, get_freepointer(s, p));
if (s->flags & SLAB_RED_ZONE)
- print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+ s->red_left_pad);
else if (p > addr + 16)
- print_section("Bytes b4 ", p - 16, 16);
+ print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
- print_section("Object ", p, min_t(unsigned long, s->object_size,
- PAGE_SIZE));
+ print_section(KERN_ERR, "Object ", p,
+ min_t(unsigned long, s->object_size, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section("Redzone ", p + s->object_size,
+ print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
if (s->offset)
@@ -658,7 +662,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section("Padding ", p + off, size_from_object(s) - off);
+ print_section(KERN_ERR, "Padding ", p + off,
+ size_from_object(s) - off);
dump_stack();
}
@@ -820,7 +825,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
end--;
slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
- print_section("Padding ", end - remainder, remainder);
+ print_section(KERN_ERR, "Padding ", end - remainder, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
return 0;
@@ -973,7 +978,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
page->freelist);
if (!alloc)
- print_section("Object ", (void *)object,
+ print_section(KERN_INFO, "Object ", (void *)object,
s->object_size);
dump_stack();
@@ -1419,6 +1424,10 @@ static int init_cache_random_seq(struct kmem_cache *s)
int err;
unsigned long i, count = oo_objects(s->oo);
+ /* Bailout if already initialised */
+ if (s->random_seq)
+ return 0;
+
err = cache_random_seq_create(s, count, GFP_KERNEL);
if (err) {
pr_err("SLUB: Unable to initialize free list for %s\n",
@@ -1623,6 +1632,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
flags &= ~GFP_SLAB_BUG_MASK;
pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
invalid_mask, &invalid_mask, flags, &flags);
+ dump_stack();
}
return allocate_slab(s,
@@ -3679,6 +3689,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
+ sysfs_slab_remove(s);
return 0;
}
@@ -3945,6 +3956,42 @@ int __kmem_cache_shrink(struct kmem_cache *s)
return ret;
}
+#ifdef CONFIG_MEMCG
+static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
+{
+ /*
+ * Called with all the locks held after a sched RCU grace period.
+ * Even if @s becomes empty after shrinking, we can't know that @s
+ * doesn't have allocations already in-flight and thus can't
+ * destroy @s until the associated memcg is released.
+ *
+ * However, let's remove the sysfs files for empty caches here.
+ * Each cache has a lot of interface files which aren't
+ * particularly useful for empty draining caches; otherwise, we can
+ * easily end up with millions of unnecessary sysfs files on
+ * systems which have a lot of memory and transient cgroups.
+ */
+ if (!__kmem_cache_shrink(s))
+ sysfs_slab_remove(s);
+}
+
+void __kmemcg_cache_deactivate(struct kmem_cache *s)
+{
+ /*
+ * Disable empty slabs caching. Used to avoid pinning offline
+ * memory cgroups by kmem pages that can be freed.
+ */
+ s->cpu_partial = 0;
+ s->min_partial = 0;
+
+ /*
+ * s->cpu_partial is checked locklessly (see put_cpu_partial), so
+ * we have to make sure the change is visible before shrinking.
+ */
+ slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
+}
+#endif
+
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
@@ -4101,6 +4148,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
}
slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
+ memcg_link_cache(s);
return s;
}
@@ -4660,6 +4708,22 @@ enum slab_stat_type {
#define SO_OBJECTS (1 << SL_OBJECTS)
#define SO_TOTAL (1 << SL_TOTAL)
+#ifdef CONFIG_MEMCG
+static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
+
+static int __init setup_slub_memcg_sysfs(char *str)
+{
+ int v;
+
+ if (get_option(&str, &v) > 0)
+ memcg_sysfs_enabled = v;
+
+ return 1;
+}
+
+__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
+#endif
+
static ssize_t show_slab_objects(struct kmem_cache *s,
char *buf, unsigned long flags)
{
@@ -5563,8 +5627,14 @@ static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
const char *name;
+ struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
+ if (!kset) {
+ kobject_init(&s->kobj, &slab_ktype);
+ return 0;
+ }
+
if (unmergeable) {
/*
* Slabcache can never be merged so we can use the name proper.
@@ -5581,7 +5651,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
name = create_unique_id(s);
}
- s->kobj.kset = cache_kset(s);
+ s->kobj.kset = kset;
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
if (err)
goto out;
@@ -5591,7 +5661,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
goto out_del_kobj;
#ifdef CONFIG_MEMCG
- if (is_root_cache(s)) {
+ if (is_root_cache(s) && memcg_sysfs_enabled) {
s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
if (!s->memcg_kset) {
err = -ENOMEM;
@@ -5614,7 +5684,7 @@ out_del_kobj:
goto out;
}
-void sysfs_slab_remove(struct kmem_cache *s)
+static void sysfs_slab_remove(struct kmem_cache *s)
{
if (slab_state < FULL)
/*
@@ -5623,12 +5693,26 @@ void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
+ if (!s->kobj.state_in_sysfs)
+ /*
+ * For a memcg cache, this may be called during
+ * deactivation and again on shutdown. Remove only once.
+ * A cache is never shut down before deactivation is
+ * complete, so no need to worry about synchronization.
+ */
+ return;
+
#ifdef CONFIG_MEMCG
kset_unregister(s->memcg_kset);
#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
- kobject_put(&s->kobj);
+}
+
+void sysfs_slab_release(struct kmem_cache *s)
+{
+ if (slab_state >= FULL)
+ kobject_put(&s->kobj);
}
/*
diff --git a/mm/sparse.c b/mm/sparse.c
index 1e168bf2779a..db6bf3c97ea2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -662,12 +662,12 @@ static void free_map_bootmem(struct page *memmap)
>> PAGE_SHIFT;
for (i = 0; i < nr_pages; i++, page++) {
- magic = (unsigned long) page->lru.next;
+ magic = (unsigned long) page->freelist;
BUG_ON(magic == NODE_INFO);
maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
- removing_section_nr = page->private;
+ removing_section_nr = page_private(page);
/*
* When this function is called, the removing section is
diff --git a/mm/swap.c b/mm/swap.c
index 844baedd2429..aabf2e90fe32 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -971,12 +971,6 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
void __init swap_setup(void)
{
unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
-#ifdef CONFIG_SWAP
- int i;
-
- for (i = 0; i < MAX_SWAPFILES; i++)
- spin_lock_init(&swapper_spaces[i].tree_lock);
-#endif
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
new file mode 100644
index 000000000000..9b5bc86f96ad
--- /dev/null
+++ b/mm/swap_slots.c
@@ -0,0 +1,342 @@
+/*
+ * Manage cache of swap slots to be used for and returned from
+ * swap.
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * We allocate the swap slots from the global pool and put
+ * them into local per cpu caches. This has the advantage
+ * of not needing to acquire the swap_info lock every time
+ * we need a new slot.
+ *
+ * There is also opportunity to simply return the slot
+ * to local caches without needing to acquire swap_info
+ * lock. We do not reuse the returned slots directly but
+ * move them back to the global pool in a batch. This
+ * allows the slots to coalesce and reduce fragmentation.
+ *
+ * The swap entry allocated is marked with SWAP_HAS_CACHE
+ * flag in swap_map that prevents it from being allocated
+ * again from the global pool.
+ *
+ * The swap slots cache is protected by a mutex instead of
+ * a spin lock as when we search for slots with scan_swap_map,
+ * we can possibly sleep.
+ */
+
+#include <linux/swap_slots.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+
+#ifdef CONFIG_SWAP
+
+static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
+static bool swap_slot_cache_active;
+bool swap_slot_cache_enabled;
+static bool swap_slot_cache_initialized;
+DEFINE_MUTEX(swap_slots_cache_mutex);
+/* Serialize swap slots cache enable/disable operations */
+DEFINE_MUTEX(swap_slots_cache_enable_mutex);
+
+static void __drain_swap_slots_cache(unsigned int type);
+static void deactivate_swap_slots_cache(void);
+static void reactivate_swap_slots_cache(void);
+
+#define use_swap_slot_cache (swap_slot_cache_active && \
+ swap_slot_cache_enabled && swap_slot_cache_initialized)
+#define SLOTS_CACHE 0x1
+#define SLOTS_CACHE_RET 0x2
+
+static void deactivate_swap_slots_cache(void)
+{
+ mutex_lock(&swap_slots_cache_mutex);
+ swap_slot_cache_active = false;
+ __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
+ mutex_unlock(&swap_slots_cache_mutex);
+}
+
+static void reactivate_swap_slots_cache(void)
+{
+ mutex_lock(&swap_slots_cache_mutex);
+ swap_slot_cache_active = true;
+ mutex_unlock(&swap_slots_cache_mutex);
+}
+
+/* Must not be called with the cpu hotplug lock held */
+void disable_swap_slots_cache_lock(void)
+{
+ mutex_lock(&swap_slots_cache_enable_mutex);
+ swap_slot_cache_enabled = false;
+ if (swap_slot_cache_initialized) {
+ /* serialize with cpu hotplug operations */
+ get_online_cpus();
+ __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
+ put_online_cpus();
+ }
+}
+
+static void __reenable_swap_slots_cache(void)
+{
+ swap_slot_cache_enabled = has_usable_swap();
+}
+
+void reenable_swap_slots_cache_unlock(void)
+{
+ __reenable_swap_slots_cache();
+ mutex_unlock(&swap_slots_cache_enable_mutex);
+}
+
+static bool check_cache_active(void)
+{
+ long pages;
+
+ if (!swap_slot_cache_enabled || !swap_slot_cache_initialized)
+ return false;
+
+ pages = get_nr_swap_pages();
+ if (!swap_slot_cache_active) {
+ if (pages > num_online_cpus() *
+ THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
+ reactivate_swap_slots_cache();
+ goto out;
+ }
+
+ /* if the global pool of free swap slots is too low, deactivate the cache */
+ if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
+ deactivate_swap_slots_cache();
+out:
+ return swap_slot_cache_active;
+}
+
+static int alloc_swap_slot_cache(unsigned int cpu)
+{
+ struct swap_slots_cache *cache;
+ swp_entry_t *slots, *slots_ret;
+
+ /*
+ * Do allocation outside swap_slots_cache_mutex
+ * as vzalloc could trigger reclaim and get_swap_page,
+ * which can lock swap_slots_cache_mutex.
+ */
+ slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE);
+ if (!slots)
+ return -ENOMEM;
+
+ slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE);
+ if (!slots_ret) {
+ vfree(slots);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&swap_slots_cache_mutex);
+ cache = &per_cpu(swp_slots, cpu);
+ if (cache->slots || cache->slots_ret)
+ /* cache already allocated */
+ goto out;
+ if (!cache->lock_initialized) {
+ mutex_init(&cache->alloc_lock);
+ spin_lock_init(&cache->free_lock);
+ cache->lock_initialized = true;
+ }
+ cache->nr = 0;
+ cache->cur = 0;
+ cache->n_ret = 0;
+ cache->slots = slots;
+ slots = NULL;
+ cache->slots_ret = slots_ret;
+ slots_ret = NULL;
+out:
+ mutex_unlock(&swap_slots_cache_mutex);
+ if (slots)
+ vfree(slots);
+ if (slots_ret)
+ vfree(slots_ret);
+ return 0;
+}
+
+static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
+ bool free_slots)
+{
+ struct swap_slots_cache *cache;
+ swp_entry_t *slots = NULL;
+
+ cache = &per_cpu(swp_slots, cpu);
+ if ((type & SLOTS_CACHE) && cache->slots) {
+ mutex_lock(&cache->alloc_lock);
+ swapcache_free_entries(cache->slots + cache->cur, cache->nr);
+ cache->cur = 0;
+ cache->nr = 0;
+ if (free_slots && cache->slots) {
+ vfree(cache->slots);
+ cache->slots = NULL;
+ }
+ mutex_unlock(&cache->alloc_lock);
+ }
+ if ((type & SLOTS_CACHE_RET) && cache->slots_ret) {
+ spin_lock_irq(&cache->free_lock);
+ swapcache_free_entries(cache->slots_ret, cache->n_ret);
+ cache->n_ret = 0;
+ if (free_slots && cache->slots_ret) {
+ slots = cache->slots_ret;
+ cache->slots_ret = NULL;
+ }
+ spin_unlock_irq(&cache->free_lock);
+ if (slots)
+ vfree(slots);
+ }
+}
+
+static void __drain_swap_slots_cache(unsigned int type)
+{
+ unsigned int cpu;
+
+ /*
+ * This function is called during
+ * 1) swapoff, when we have to make sure no
+ * left over slots are in cache when we remove
+ * a swap device;
+ * 2) disabling of swap slot cache, when we run low
+ * on swap slots when allocating memory and need
+ * to return swap slots to global pool.
+ *
+ * We cannot acquire cpu hot plug lock here as
+ * this function can be invoked in the cpu
+ * hot plug path:
+ * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
+ * -> memory allocation -> direct reclaim -> get_swap_page
+ * -> drain_swap_slots_cache
+ *
+ * Hence the loop over current online cpu below could miss cpu that
+ * is being brought online but not yet marked as online.
+ * That is okay as we do not schedule and run anything on a
+ * cpu before it has been marked online. Hence, we will not
+ * fill any swap slots in slots cache of such cpu.
+ * There are no slots on such cpu that need to be drained.
+ */
+ for_each_online_cpu(cpu)
+ drain_slots_cache_cpu(cpu, type, false);
+}
+
+static int free_slot_cache(unsigned int cpu)
+{
+ mutex_lock(&swap_slots_cache_mutex);
+ drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true);
+ mutex_unlock(&swap_slots_cache_mutex);
+ return 0;
+}
+
+int enable_swap_slots_cache(void)
+{
+ int ret = 0;
+
+ mutex_lock(&swap_slots_cache_enable_mutex);
+ if (swap_slot_cache_initialized) {
+ __reenable_swap_slots_cache();
+ goto out_unlock;
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
+ alloc_swap_slot_cache, free_slot_cache);
+ if (ret < 0)
+ goto out_unlock;
+ swap_slot_cache_initialized = true;
+ __reenable_swap_slots_cache();
+out_unlock:
+ mutex_unlock(&swap_slots_cache_enable_mutex);
+ return 0;
+}
+
+/* called with swap slot cache's alloc lock held */
+static int refill_swap_slots_cache(struct swap_slots_cache *cache)
+{
+ if (!use_swap_slot_cache || cache->nr)
+ return 0;
+
+ cache->cur = 0;
+ if (swap_slot_cache_active)
+ cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots);
+
+ return cache->nr;
+}
+
+int free_swap_slot(swp_entry_t entry)
+{
+ struct swap_slots_cache *cache;
+
+ BUG_ON(!swap_slot_cache_initialized);
+
+ cache = &get_cpu_var(swp_slots);
+ if (use_swap_slot_cache && cache->slots_ret) {
+ spin_lock_irq(&cache->free_lock);
+ /* Swap slots cache may be deactivated before acquiring lock */
+ if (!use_swap_slot_cache) {
+ spin_unlock_irq(&cache->free_lock);
+ goto direct_free;
+ }
+ if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) {
+ /*
+ * Return slots to global pool.
+ * The current swap_map value is SWAP_HAS_CACHE.
+ * Set it to 0 to indicate it is available for
+ * allocation in global pool
+ */
+ swapcache_free_entries(cache->slots_ret, cache->n_ret);
+ cache->n_ret = 0;
+ }
+ cache->slots_ret[cache->n_ret++] = entry;
+ spin_unlock_irq(&cache->free_lock);
+ } else {
+direct_free:
+ swapcache_free_entries(&entry, 1);
+ }
+ put_cpu_var(swp_slots);
+
+ return 0;
+}
+
+swp_entry_t get_swap_page(void)
+{
+ swp_entry_t entry, *pentry;
+ struct swap_slots_cache *cache;
+
+ /*
+ * Preemption is allowed here, because we may sleep
+ * in refill_swap_slots_cache(). But it is safe, because
+ * accesses to the per-CPU data structure are protected by the
+ * mutex cache->alloc_lock.
+ *
+ * The alloc path here does not touch cache->slots_ret
+ * so cache->free_lock is not taken.
+ */
+ cache = raw_cpu_ptr(&swp_slots);
+
+ entry.val = 0;
+ if (check_cache_active()) {
+ mutex_lock(&cache->alloc_lock);
+ if (cache->slots) {
+repeat:
+ if (cache->nr) {
+ pentry = &cache->slots[cache->cur++];
+ entry = *pentry;
+ pentry->val = 0;
+ cache->nr--;
+ } else {
+ if (refill_swap_slots_cache(cache))
+ goto repeat;
+ }
+ }
+ mutex_unlock(&cache->alloc_lock);
+ if (entry.val)
+ return entry;
+ }
+
+ get_swap_pages(1, &entry);
+
+ return entry;
+}
+
+#endif /* CONFIG_SWAP */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 35d7e0ee1c77..473b71e052a8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,8 @@
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
+#include <linux/vmalloc.h>
+#include <linux/swap_slots.h>
#include <asm/pgtable.h>
@@ -32,15 +34,8 @@ static const struct address_space_operations swap_aops = {
#endif
};
-struct address_space swapper_spaces[MAX_SWAPFILES] = {
- [0 ... MAX_SWAPFILES - 1] = {
- .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
- .i_mmap_writable = ATOMIC_INIT(0),
- .a_ops = &swap_aops,
- /* swap cache doesn't use writeback related tags */
- .flags = 1 << AS_NO_WRITEBACK_TAGS,
- }
-};
+struct address_space *swapper_spaces[MAX_SWAPFILES];
+static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -53,11 +48,26 @@ static struct {
unsigned long total_swapcache_pages(void)
{
- int i;
+ unsigned int i, j, nr;
unsigned long ret = 0;
+ struct address_space *spaces;
- for (i = 0; i < MAX_SWAPFILES; i++)
- ret += swapper_spaces[i].nrpages;
+ rcu_read_lock();
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ /*
+ * The corresponding entries in nr_swapper_spaces and
+ * swapper_spaces will be reused only after at least
+ * one grace period. So it is impossible for them
+ * to belong to different usages.
+ */
+ nr = nr_swapper_spaces[i];
+ spaces = rcu_dereference(swapper_spaces[i]);
+ if (!nr || !spaces)
+ continue;
+ for (j = 0; j < nr; j++)
+ ret += spaces[j].nrpages;
+ }
+ rcu_read_unlock();
return ret;
}
@@ -315,6 +325,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
break;
/*
+ * Just skip read ahead for unused swap slot.
+ * During swap_off when swap_slot_cache is disabled,
+ * we have to handle the race between putting
+ * swap entry in swap cache and marking swap slot
+ * as SWAP_HAS_CACHE. That's done in later part of code or
+ * else swap_off will be aborted if we return NULL.
+ */
+ if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
+ break;
+
+ /*
* Get a new page to read into from swap.
*/
if (!new_page) {
@@ -505,3 +526,38 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
skip:
return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
+
+int init_swap_address_space(unsigned int type, unsigned long nr_pages)
+{
+ struct address_space *spaces, *space;
+ unsigned int i, nr;
+
+ nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+ spaces = vzalloc(sizeof(struct address_space) * nr);
+ if (!spaces)
+ return -ENOMEM;
+ for (i = 0; i < nr; i++) {
+ space = spaces + i;
+ INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
+ atomic_set(&space->i_mmap_writable, 0);
+ space->a_ops = &swap_aops;
+ /* swap cache doesn't use writeback related tags */
+ mapping_set_no_writeback_tags(space);
+ spin_lock_init(&space->tree_lock);
+ }
+ nr_swapper_spaces[type] = nr;
+ rcu_assign_pointer(swapper_spaces[type], spaces);
+
+ return 0;
+}
+
+void exit_swap_address_space(unsigned int type)
+{
+ struct address_space *spaces;
+
+ spaces = swapper_spaces[type];
+ nr_swapper_spaces[type] = 0;
+ rcu_assign_pointer(swapper_spaces[type], NULL);
+ synchronize_rcu();
+ kvfree(spaces);
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1c6e0321205d..2cac12cc9abe 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -34,6 +34,7 @@
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
+#include <linux/swap_slots.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -257,6 +258,47 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
info->data = 0;
}
+static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ struct swap_cluster_info *ci;
+
+ ci = si->cluster_info;
+ if (ci) {
+ ci += offset / SWAPFILE_CLUSTER;
+ spin_lock(&ci->lock);
+ }
+ return ci;
+}
+
+static inline void unlock_cluster(struct swap_cluster_info *ci)
+{
+ if (ci)
+ spin_unlock(&ci->lock);
+}
+
+static inline struct swap_cluster_info *lock_cluster_or_swap_info(
+ struct swap_info_struct *si,
+ unsigned long offset)
+{
+ struct swap_cluster_info *ci;
+
+ ci = lock_cluster(si, offset);
+ if (!ci)
+ spin_lock(&si->lock);
+
+ return ci;
+}
+
+static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
+{
+ if (ci)
+ unlock_cluster(ci);
+ else
+ spin_unlock(&si->lock);
+}
+
static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
return cluster_is_null(&list->head);
@@ -281,9 +323,17 @@ static void cluster_list_add_tail(struct swap_cluster_list *list,
cluster_set_next_flag(&list->head, idx, 0);
cluster_set_next_flag(&list->tail, idx, 0);
} else {
+ struct swap_cluster_info *ci_tail;
unsigned int tail = cluster_next(&list->tail);
- cluster_set_next(&ci[tail], idx);
+ /*
+ * Nested cluster lock, but both cluster locks are
+ * only acquired when we hold swap_info_struct->lock
+ */
+ ci_tail = ci + tail;
+ spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
+ cluster_set_next(ci_tail, idx);
+ unlock_cluster(ci_tail);
cluster_set_next_flag(&list->tail, idx, 0);
}
}
@@ -328,7 +378,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
*/
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
- struct swap_cluster_info *info;
+ struct swap_cluster_info *info, *ci;
unsigned int idx;
info = si->cluster_info;
@@ -341,10 +391,14 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
SWAPFILE_CLUSTER);
spin_lock(&si->lock);
- cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
+ ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
+ cluster_set_flag(ci, CLUSTER_FLAG_FREE);
+ unlock_cluster(ci);
cluster_list_add_tail(&si->free_clusters, info, idx);
+ ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
0, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
}
}
@@ -443,12 +497,13 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
* Try to get a swap entry from current cpu's swap entry pool (a cluster). This
* might involve allocating a new cluster for current CPU too.
*/
-static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
unsigned long *offset, unsigned long *scan_base)
{
struct percpu_cluster *cluster;
+ struct swap_cluster_info *ci;
bool found_free;
- unsigned long tmp;
+ unsigned long tmp, max;
new_cluster:
cluster = this_cpu_ptr(si->percpu_cluster);
@@ -466,7 +521,7 @@ new_cluster:
*scan_base = *offset = si->cluster_next;
goto new_cluster;
} else
- return;
+ return false;
}
found_free = false;
@@ -476,14 +531,21 @@ new_cluster:
* check if there is still free entry in the cluster
*/
tmp = cluster->next;
- while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
- SWAPFILE_CLUSTER) {
+ max = min_t(unsigned long, si->max,
+ (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
+ if (tmp >= max) {
+ cluster_set_null(&cluster->index);
+ goto new_cluster;
+ }
+ ci = lock_cluster(si, tmp);
+ while (tmp < max) {
if (!si->swap_map[tmp]) {
found_free = true;
break;
}
tmp++;
}
+ unlock_cluster(ci);
if (!found_free) {
cluster_set_null(&cluster->index);
goto new_cluster;
@@ -491,15 +553,22 @@ new_cluster:
cluster->next = tmp + 1;
*offset = tmp;
*scan_base = tmp;
+ return found_free;
}
-static unsigned long scan_swap_map(struct swap_info_struct *si,
- unsigned char usage)
+static int scan_swap_map_slots(struct swap_info_struct *si,
+ unsigned char usage, int nr,
+ swp_entry_t slots[])
{
+ struct swap_cluster_info *ci;
unsigned long offset;
unsigned long scan_base;
unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
+ int n_ret = 0;
+
+ if (nr > SWAP_BATCH)
+ nr = SWAP_BATCH;
/*
* We try to cluster swap pages by allocating them sequentially
@@ -517,8 +586,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
/* SSD algorithm */
if (si->cluster_info) {
- scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
- goto checks;
+ if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+ goto checks;
+ else
+ goto scan;
}
if (unlikely(!si->cluster_nr--)) {
@@ -562,8 +633,14 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
checks:
if (si->cluster_info) {
- while (scan_swap_map_ssd_cluster_conflict(si, offset))
- scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+ while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
+ /* take a break if we already got some slots */
+ if (n_ret)
+ goto done;
+ if (!scan_swap_map_try_ssd_cluster(si, &offset,
+ &scan_base))
+ goto scan;
+ }
}
if (!(si->flags & SWP_WRITEOK))
goto no_page;
@@ -572,9 +649,11 @@ checks:
if (offset > si->highest_bit)
scan_base = offset = si->lowest_bit;
+ ci = lock_cluster(si, offset);
/* reuse swap entry of cache-only swap if not busy. */
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
+ unlock_cluster(ci);
spin_unlock(&si->lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
spin_lock(&si->lock);
@@ -584,8 +663,13 @@ checks:
goto scan; /* check next one */
}
- if (si->swap_map[offset])
- goto scan;
+ if (si->swap_map[offset]) {
+ unlock_cluster(ci);
+ if (!n_ret)
+ goto scan;
+ else
+ goto done;
+ }
if (offset == si->lowest_bit)
si->lowest_bit++;
@@ -601,10 +685,45 @@ checks:
}
si->swap_map[offset] = usage;
inc_cluster_info_page(si, si->cluster_info, offset);
+ unlock_cluster(ci);
si->cluster_next = offset + 1;
- si->flags -= SWP_SCANNING;
+ slots[n_ret++] = swp_entry(si->type, offset);
+
+ /* got enough slots or reached max slots? */
+ if ((n_ret == nr) || (offset >= si->highest_bit))
+ goto done;
+
+ /* search for next available slot */
- return offset;
+ /* time to take a break? */
+ if (unlikely(--latency_ration < 0)) {
+ if (n_ret)
+ goto done;
+ spin_unlock(&si->lock);
+ cond_resched();
+ spin_lock(&si->lock);
+ latency_ration = LATENCY_LIMIT;
+ }
+
+ /* try to get more slots in cluster */
+ if (si->cluster_info) {
+ if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+ goto checks;
+ else
+ goto done;
+ }
+ /* non-ssd case */
+ ++offset;
+
+ /* non-ssd case, still more slots in cluster? */
+ if (si->cluster_nr && !si->swap_map[offset]) {
+ --si->cluster_nr;
+ goto checks;
+ }
+
+done:
+ si->flags -= SWP_SCANNING;
+ return n_ret;
scan:
spin_unlock(&si->lock);
@@ -642,17 +761,41 @@ scan:
no_page:
si->flags -= SWP_SCANNING;
- return 0;
+ return n_ret;
}
-swp_entry_t get_swap_page(void)
+static unsigned long scan_swap_map(struct swap_info_struct *si,
+ unsigned char usage)
+{
+ swp_entry_t entry;
+ int n_ret;
+
+ n_ret = scan_swap_map_slots(si, usage, 1, &entry);
+
+ if (n_ret)
+ return swp_offset(entry);
+ else
+ return 0;
+
+}
+
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
{
struct swap_info_struct *si, *next;
- pgoff_t offset;
+ long avail_pgs;
+ int n_ret = 0;
- if (atomic_long_read(&nr_swap_pages) <= 0)
+ avail_pgs = atomic_long_read(&nr_swap_pages);
+ if (avail_pgs <= 0)
goto noswap;
- atomic_long_dec(&nr_swap_pages);
+
+ if (n_goal > SWAP_BATCH)
+ n_goal = SWAP_BATCH;
+
+ if (n_goal > avail_pgs)
+ n_goal = avail_pgs;
+
+ atomic_long_sub(n_goal, &nr_swap_pages);
spin_lock(&swap_avail_lock);
@@ -678,14 +821,14 @@ start_over:
spin_unlock(&si->lock);
goto nextsi;
}
-
- /* This is called for allocating swap entry for cache */
- offset = scan_swap_map(si, SWAP_HAS_CACHE);
+ n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+ n_goal, swp_entries);
spin_unlock(&si->lock);
- if (offset)
- return swp_entry(si->type, offset);
+ if (n_ret)
+ goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n",
- si->type);
+ si->type);
+
spin_lock(&swap_avail_lock);
nextsi:
/*
@@ -696,7 +839,8 @@ nextsi:
* up between us dropping swap_avail_lock and taking si->lock.
* Since we dropped the swap_avail_lock, the swap_avail_head
* list may have been modified; so if next is still in the
- * swap_avail_head list then try it, otherwise start over.
+ * swap_avail_head list then try it, otherwise start over
+ * if we have not gotten any slots.
*/
if (plist_node_empty(&next->avail_list))
goto start_over;
@@ -704,9 +848,11 @@ nextsi:
spin_unlock(&swap_avail_lock);
- atomic_long_inc(&nr_swap_pages);
+check_out:
+ if (n_ret < n_goal)
+ atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages);
noswap:
- return (swp_entry_t) {0};
+ return n_ret;
}
/* The only caller of this function is now suspend routine */
@@ -731,7 +877,7 @@ swp_entry_t get_swap_page_of_type(int type)
return (swp_entry_t) {0};
}
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
unsigned long offset, type;
@@ -747,34 +893,76 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
offset = swp_offset(entry);
if (offset >= p->max)
goto bad_offset;
- if (!p->swap_map[offset])
- goto bad_free;
- spin_lock(&p->lock);
return p;
-bad_free:
- pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
- goto out;
bad_offset:
- pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
+ pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
goto out;
bad_device:
- pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
+ pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
goto out;
bad_nofile:
- pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
+ pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
+out:
+ return NULL;
+}
+
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
+{
+ struct swap_info_struct *p;
+
+ p = __swap_info_get(entry);
+ if (!p)
+ goto out;
+ if (!p->swap_map[swp_offset(entry)])
+ goto bad_free;
+ return p;
+
+bad_free:
+ pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
+ goto out;
out:
return NULL;
}
-static unsigned char swap_entry_free(struct swap_info_struct *p,
- swp_entry_t entry, unsigned char usage)
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+{
+ struct swap_info_struct *p;
+
+ p = _swap_info_get(entry);
+ if (p)
+ spin_lock(&p->lock);
+ return p;
+}
+
+static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
+ struct swap_info_struct *q)
{
+ struct swap_info_struct *p;
+
+ p = _swap_info_get(entry);
+
+ if (p != q) {
+ if (q != NULL)
+ spin_unlock(&q->lock);
+ if (p != NULL)
+ spin_lock(&p->lock);
+ }
+ return p;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
+{
+ struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
+ ci = lock_cluster_or_swap_info(p, offset);
+
count = p->swap_map[offset];
+
has_cache = count & SWAP_HAS_CACHE;
count &= ~SWAP_HAS_CACHE;
@@ -798,38 +986,52 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
}
usage = count | has_cache;
- p->swap_map[offset] = usage;
-
- /* free if no reference */
- if (!usage) {
- mem_cgroup_uncharge_swap(entry);
- dec_cluster_info_page(p, p->cluster_info, offset);
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit) {
- bool was_full = !p->highest_bit;
- p->highest_bit = offset;
- if (was_full && (p->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- WARN_ON(!plist_node_empty(&p->avail_list));
- if (plist_node_empty(&p->avail_list))
- plist_add(&p->avail_list,
- &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- }
- }
- atomic_long_inc(&nr_swap_pages);
- p->inuse_pages--;
- frontswap_invalidate_page(p->type, offset);
- if (p->flags & SWP_BLKDEV) {
- struct gendisk *disk = p->bdev->bd_disk;
- if (disk->fops->swap_slot_free_notify)
- disk->fops->swap_slot_free_notify(p->bdev,
- offset);
+ p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
+
+ unlock_cluster_or_swap_info(p, ci);
+
+ return usage;
+}
+
+static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char count;
+
+ ci = lock_cluster(p, offset);
+ count = p->swap_map[offset];
+ VM_BUG_ON(count != SWAP_HAS_CACHE);
+ p->swap_map[offset] = 0;
+ dec_cluster_info_page(p, p->cluster_info, offset);
+ unlock_cluster(ci);
+
+ mem_cgroup_uncharge_swap(entry);
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit) {
+ bool was_full = !p->highest_bit;
+
+ p->highest_bit = offset;
+ if (was_full && (p->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ WARN_ON(!plist_node_empty(&p->avail_list));
+ if (plist_node_empty(&p->avail_list))
+ plist_add(&p->avail_list,
+ &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
}
+ atomic_long_inc(&nr_swap_pages);
+ p->inuse_pages--;
+ frontswap_invalidate_page(p->type, offset);
+ if (p->flags & SWP_BLKDEV) {
+ struct gendisk *disk = p->bdev->bd_disk;
- return usage;
+ if (disk->fops->swap_slot_free_notify)
+ disk->fops->swap_slot_free_notify(p->bdev,
+ offset);
+ }
}
/*
@@ -840,10 +1042,10 @@ void swap_free(swp_entry_t entry)
{
struct swap_info_struct *p;
- p = swap_info_get(entry);
+ p = _swap_info_get(entry);
if (p) {
- swap_entry_free(p, entry, 1);
- spin_unlock(&p->lock);
+ if (!__swap_entry_free(p, entry, 1))
+ free_swap_slot(entry);
}
}
@@ -854,11 +1056,33 @@ void swapcache_free(swp_entry_t entry)
{
struct swap_info_struct *p;
- p = swap_info_get(entry);
+ p = _swap_info_get(entry);
if (p) {
- swap_entry_free(p, entry, SWAP_HAS_CACHE);
- spin_unlock(&p->lock);
+ if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
+ free_swap_slot(entry);
+ }
+}
+
+void swapcache_free_entries(swp_entry_t *entries, int n)
+{
+ struct swap_info_struct *p, *prev;
+ int i;
+
+ if (n <= 0)
+ return;
+
+ prev = NULL;
+ p = NULL;
+ for (i = 0; i < n; ++i) {
+ p = swap_info_get_cont(entries[i], prev);
+ if (p)
+ swap_entry_free(p, entries[i]);
+ else
+ break;
+ prev = p;
}
+ if (p)
+ spin_unlock(&p->lock);
}
/*
@@ -870,13 +1094,39 @@ int page_swapcount(struct page *page)
{
int count = 0;
struct swap_info_struct *p;
+ struct swap_cluster_info *ci;
swp_entry_t entry;
+ unsigned long offset;
entry.val = page_private(page);
- p = swap_info_get(entry);
+ p = _swap_info_get(entry);
if (p) {
- count = swap_count(p->swap_map[swp_offset(entry)]);
- spin_unlock(&p->lock);
+ offset = swp_offset(entry);
+ ci = lock_cluster_or_swap_info(p, offset);
+ count = swap_count(p->swap_map[offset]);
+ unlock_cluster_or_swap_info(p, ci);
+ }
+ return count;
+}
+
+/*
+ * How many references to @entry are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
+ */
+int __swp_swapcount(swp_entry_t entry)
+{
+ int count = 0;
+ pgoff_t offset;
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+
+ si = __swap_info_get(entry);
+ if (si) {
+ offset = swp_offset(entry);
+ ci = lock_cluster_or_swap_info(si, offset);
+ count = swap_count(si->swap_map[offset]);
+ unlock_cluster_or_swap_info(si, ci);
}
return count;
}
@@ -889,22 +1139,26 @@ int swp_swapcount(swp_entry_t entry)
{
int count, tmp_count, n;
struct swap_info_struct *p;
+ struct swap_cluster_info *ci;
struct page *page;
pgoff_t offset;
unsigned char *map;
- p = swap_info_get(entry);
+ p = _swap_info_get(entry);
if (!p)
return 0;
- count = swap_count(p->swap_map[swp_offset(entry)]);
+ offset = swp_offset(entry);
+
+ ci = lock_cluster_or_swap_info(p, offset);
+
+ count = swap_count(p->swap_map[offset]);
if (!(count & COUNT_CONTINUED))
goto out;
count &= ~COUNT_CONTINUED;
n = SWAP_MAP_MAX + 1;
- offset = swp_offset(entry);
page = vmalloc_to_page(p->swap_map + offset);
offset &= ~PAGE_MASK;
VM_BUG_ON(page_private(page) != SWP_CONTINUED);
@@ -919,7 +1173,7 @@ int swp_swapcount(swp_entry_t entry)
n *= (SWAP_CONT_MAX + 1);
} while (tmp_count & COUNT_CONTINUED);
out:
- spin_unlock(&p->lock);
+ unlock_cluster_or_swap_info(p, ci);
return count;
}
@@ -943,11 +1197,25 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
count = page_trans_huge_mapcount(page, total_mapcount);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
- if (count == 1 && !PageWriteback(page)) {
+ if (count != 1)
+ goto out;
+ if (!PageWriteback(page)) {
delete_from_swap_cache(page);
SetPageDirty(page);
+ } else {
+ swp_entry_t entry;
+ struct swap_info_struct *p;
+
+ entry.val = page_private(page);
+ p = swap_info_get(entry);
+ if (p->flags & SWP_STABLE_WRITES) {
+ spin_unlock(&p->lock);
+ return false;
+ }
+ spin_unlock(&p->lock);
}
}
+out:
return count <= 1;
}
@@ -997,21 +1265,23 @@ int free_swap_and_cache(swp_entry_t entry)
{
struct swap_info_struct *p;
struct page *page = NULL;
+ unsigned char count;
if (non_swap_entry(entry))
return 1;
- p = swap_info_get(entry);
+ p = _swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
+ count = __swap_entry_free(p, entry, 1);
+ if (count == SWAP_HAS_CACHE) {
page = find_get_page(swap_address_space(entry),
swp_offset(entry));
if (page && !trylock_page(page)) {
put_page(page);
page = NULL;
}
- }
- spin_unlock(&p->lock);
+ } else if (!count)
+ free_swap_slot(entry);
}
if (page) {
/*
@@ -1839,6 +2109,17 @@ static void reinsert_swap_info(struct swap_info_struct *p)
spin_unlock(&swap_lock);
}
+bool has_usable_swap(void)
+{
+ bool ret = true;
+
+ spin_lock(&swap_lock);
+ if (plist_head_empty(&swap_active_head))
+ ret = false;
+ spin_unlock(&swap_lock);
+ return ret;
+}
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -1909,6 +2190,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
+ disable_swap_slots_cache_lock();
+
set_current_oom_origin();
err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
clear_current_oom_origin();
@@ -1916,9 +2199,12 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (err) {
/* re-insert swap space back into swap_list */
reinsert_swap_info(p);
+ reenable_swap_slots_cache_unlock();
goto out_dput;
}
+ reenable_swap_slots_cache_unlock();
+
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -1961,6 +2247,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
vfree(frontswap_map);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
+ exit_swap_address_space(p->type);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
@@ -2284,6 +2571,13 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
return maxpages;
}
+#define SWAP_CLUSTER_INFO_COLS \
+ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+#define SWAP_CLUSTER_SPACE_COLS \
+ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
+#define SWAP_CLUSTER_COLS \
+ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
+
static int setup_swap_map_and_extents(struct swap_info_struct *p,
union swap_header *swap_header,
unsigned char *swap_map,
@@ -2291,11 +2585,12 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
unsigned long maxpages,
sector_t *span)
{
- int i;
+ unsigned int j, k;
unsigned int nr_good_pages;
int nr_extents;
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
+ unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
+ unsigned long i, idx;
nr_good_pages = maxpages - 1; /* omit header page */
@@ -2343,15 +2638,23 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
if (!cluster_info)
return nr_extents;
- for (i = 0; i < nr_clusters; i++) {
- if (!cluster_count(&cluster_info[idx])) {
+
+ /*
+ * Reduce false cache line sharing between adjacent cluster_info entries
+ * and spread consecutive free clusters over different address spaces.
+ */
+ for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
+ j = (k + col) % SWAP_CLUSTER_COLS;
+ for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
+ idx = i * SWAP_CLUSTER_COLS + j;
+ if (idx >= nr_clusters)
+ continue;
+ if (cluster_count(&cluster_info[idx]))
+ continue;
cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
cluster_list_add_tail(&p->free_clusters, cluster_info,
idx);
}
- idx++;
- if (idx == nr_clusters)
- idx = 0;
}
return nr_extents;
}
@@ -2448,8 +2751,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = -ENOMEM;
goto bad_swap;
}
+
+ if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
+ p->flags |= SWP_STABLE_WRITES;
+
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
int cpu;
+ unsigned long ci, nr_cluster;
p->flags |= SWP_SOLIDSTATE;
/*
@@ -2457,13 +2765,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
* SSD
*/
p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+ nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
- SWAPFILE_CLUSTER) * sizeof(*cluster_info));
+ cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info));
if (!cluster_info) {
error = -ENOMEM;
goto bad_swap;
}
+
+ for (ci = 0; ci < nr_cluster; ci++)
+ spin_lock_init(&((cluster_info + ci)->lock));
+
p->percpu_cluster = alloc_percpu(struct percpu_cluster);
if (!p->percpu_cluster) {
error = -ENOMEM;
@@ -2520,6 +2832,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
}
+ error = init_swap_address_space(p->type, maxpages);
+ if (error)
+ goto bad_swap;
+
mutex_lock(&swapon_mutex);
prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
@@ -2575,6 +2891,8 @@ out:
putname(name);
if (inode && S_ISREG(inode->i_mode))
inode_unlock(inode);
+ if (!error)
+ enable_swap_slots_cache();
return error;
}
@@ -2609,6 +2927,7 @@ void si_swapinfo(struct sysinfo *val)
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
+ struct swap_cluster_info *ci;
unsigned long offset, type;
unsigned char count;
unsigned char has_cache;
@@ -2622,10 +2941,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
goto bad_file;
p = swap_info[type];
offset = swp_offset(entry);
-
- spin_lock(&p->lock);
if (unlikely(offset >= p->max))
- goto unlock_out;
+ goto out;
+
+ ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -2668,7 +2987,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
p->swap_map[offset] = count | has_cache;
unlock_out:
- spin_unlock(&p->lock);
+ unlock_cluster_or_swap_info(p, ci);
out:
return err;
@@ -2757,6 +3076,7 @@ EXPORT_SYMBOL_GPL(__page_file_index);
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
struct page *head;
struct page *page;
struct page *list_page;
@@ -2780,6 +3100,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
offset = swp_offset(entry);
+
+ ci = lock_cluster(si, offset);
+
count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
@@ -2792,6 +3115,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
+ unlock_cluster(ci);
spin_unlock(&si->lock);
return -ENOMEM;
}
@@ -2840,6 +3164,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
list_add_tail(&page->lru, &head->lru);
page = NULL; /* now it's attached, don't free it */
out:
+ unlock_cluster(ci);
spin_unlock(&si->lock);
outer:
if (page)
@@ -2853,7 +3178,8 @@ outer:
* into, carry if so, or else fail until a new continuation page is allocated;
* when the original swap_map count is decremented from 0 with continuation,
* borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
+ * lock.
*/
static bool swap_count_continued(struct swap_info_struct *si,
pgoff_t offset, unsigned char count)
diff --git a/mm/truncate.c b/mm/truncate.c
index fd97f1dbce29..dd7b24e083c5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -24,20 +24,12 @@
#include <linux/rmap.h>
#include "internal.h"
-static void clear_exceptional_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
+ void *entry)
{
struct radix_tree_node *node;
void **slot;
- /* Handled by shmem itself */
- if (shmem_mapping(mapping))
- return;
-
- if (dax_mapping(mapping)) {
- dax_delete_mapping_entry(mapping, index);
- return;
- }
spin_lock_irq(&mapping->tree_lock);
/*
* Regular page slots are stabilized by the page lock even
@@ -55,6 +47,56 @@ unlock:
spin_unlock_irq(&mapping->tree_lock);
}
+/*
+ * Unconditionally remove exceptional entry. Usually called from truncate path.
+ */
+static void truncate_exceptional_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return;
+
+ if (dax_mapping(mapping)) {
+ dax_delete_mapping_entry(mapping, index);
+ return;
+ }
+ clear_shadow_entry(mapping, index, entry);
+}
+
+/*
+ * Invalidate exceptional entry if easily possible. This handles exceptional
+ * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and
+ * clean entries.
+ */
+static int invalidate_exceptional_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return 1;
+ if (dax_mapping(mapping))
+ return dax_invalidate_mapping_entry(mapping, index);
+ clear_shadow_entry(mapping, index, entry);
+ return 1;
+}
+
+/*
+ * Invalidate exceptional entry if clean. This handles exceptional entries for
+ * invalidate_inode_pages2() so for DAX it evicts only clean entries.
+ */
+static int invalidate_exceptional_entry2(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return 1;
+ if (dax_mapping(mapping))
+ return dax_invalidate_mapping_entry_sync(mapping, index);
+ clear_shadow_entry(mapping, index, entry);
+ return 1;
+}
+
/**
* do_invalidatepage - invalidate part or all of a page
* @page: the page which is affected
@@ -262,7 +304,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
break;
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ truncate_exceptional_entry(mapping, index,
+ page);
continue;
}
@@ -351,7 +394,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
}
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ truncate_exceptional_entry(mapping, index,
+ page);
continue;
}
@@ -470,7 +514,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
break;
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ invalidate_exceptional_entry(mapping, index,
+ page);
continue;
}
@@ -592,7 +637,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
break;
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ if (!invalidate_exceptional_entry2(mapping,
+ index, page))
+ ret = -EBUSY;
continue;
}
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 3c8da0af9695..8345299e3e3b 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -108,13 +108,13 @@ static inline const char *check_kernel_text_object(const void *ptr,
* __pa() is not just the reverse of __va(). This can be detected
* and checked:
*/
- textlow_linear = (unsigned long)__va(__pa(textlow));
+ textlow_linear = (unsigned long)lm_alias(textlow);
/* No different mapping: we're done. */
if (textlow_linear == textlow)
return NULL;
/* Check the secondary mapping... */
- texthigh_linear = (unsigned long)__va(__pa(texthigh));
+ texthigh_linear = (unsigned long)lm_alias(texthigh);
if (overlaps(ptr, n, textlow_linear, texthigh_linear))
return "<linear kernel text>";
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af817e5060fb..1e5c2f94e8a3 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -14,6 +14,9 @@
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -139,6 +142,234 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
return pmd;
}
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
+ * called with mmap_sem held, it will release mmap_sem before returning.
+ */
+static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage)
+{
+ int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
+ int vm_shared = dst_vma->vm_flags & VM_SHARED;
+ ssize_t err;
+ pte_t *dst_pte;
+ unsigned long src_addr, dst_addr;
+ long copied;
+ struct page *page;
+ struct hstate *h;
+ unsigned long vma_hpagesize;
+ pgoff_t idx;
+ u32 hash;
+ struct address_space *mapping;
+
+ /*
+ * There is no default zero huge page for all huge page sizes as
+ * supported by hugetlb. A PMD_SIZE huge page may exist as used
+ * by THP. Since we can not reliably insert a zero page, this
+ * feature is not supported.
+ */
+ if (zeropage) {
+ up_read(&dst_mm->mmap_sem);
+ return -EINVAL;
+ }
+
+ src_addr = src_start;
+ dst_addr = dst_start;
+ copied = 0;
+ page = NULL;
+ vma_hpagesize = vma_kernel_pagesize(dst_vma);
+
+ /*
+ * Validate alignment based on huge page size
+ */
+ err = -EINVAL;
+ if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
+ goto out_unlock;
+
+retry:
+ /*
+ * On routine entry dst_vma is set. If we had to drop mmap_sem and
+ * retry, dst_vma will be set to NULL and we must lookup again.
+ */
+ if (!dst_vma) {
+ err = -EINVAL;
+ dst_vma = find_vma(dst_mm, dst_start);
+ if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
+ goto out_unlock;
+
+ if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+ goto out_unlock;
+
+ /*
+ * Make sure the remaining dst range is both valid and
+ * fully within a single existing vma.
+ */
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ goto out_unlock;
+
+ vm_shared = dst_vma->vm_flags & VM_SHARED;
+ }
+
+ if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
+ (len - copied) & (vma_hpagesize - 1)))
+ goto out_unlock;
+
+ /*
+ * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges.
+ */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ goto out_unlock;
+
+ /*
+ * If not shared, ensure the dst_vma has an anon_vma.
+ */
+ err = -ENOMEM;
+ if (!vm_shared) {
+ if (unlikely(anon_vma_prepare(dst_vma)))
+ goto out_unlock;
+ }
+
+ h = hstate_vma(dst_vma);
+
+ while (src_addr < src_start + len) {
+ pte_t dst_pteval;
+
+ BUG_ON(dst_addr >= dst_start + len);
+ VM_BUG_ON(dst_addr & ~huge_page_mask(h));
+
+ /*
+ * Serialize via hugetlb_fault_mutex
+ */
+ idx = linear_page_index(dst_vma, dst_addr);
+ mapping = dst_vma->vm_file->f_mapping;
+ hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
+ idx, dst_addr);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ err = -ENOMEM;
+ dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+ if (!dst_pte) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out_unlock;
+ }
+
+ err = -EEXIST;
+ dst_pteval = huge_ptep_get(dst_pte);
+ if (!huge_pte_none(dst_pteval)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out_unlock;
+ }
+
+ err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
+ dst_addr, src_addr, &page);
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ vm_alloc_shared = vm_shared;
+
+ cond_resched();
+
+ if (unlikely(err == -EFAULT)) {
+ up_read(&dst_mm->mmap_sem);
+ BUG_ON(!page);
+
+ err = copy_huge_page_from_user(page,
+ (const void __user *)src_addr,
+ pages_per_huge_page(h), true);
+ if (unlikely(err)) {
+ err = -EFAULT;
+ goto out;
+ }
+ down_read(&dst_mm->mmap_sem);
+
+ dst_vma = NULL;
+ goto retry;
+ } else
+ BUG_ON(page);
+
+ if (!err) {
+ dst_addr += vma_hpagesize;
+ src_addr += vma_hpagesize;
+ copied += vma_hpagesize;
+
+ if (fatal_signal_pending(current))
+ err = -EINTR;
+ }
+ if (err)
+ break;
+ }
+
+out_unlock:
+ up_read(&dst_mm->mmap_sem);
+out:
+ if (page) {
+ /*
+ * We encountered an error and are about to free a newly
+ * allocated huge page.
+ *
+ * Reservation handling is very subtle, and is different for
+ * private and shared mappings. See the routine
+ * restore_reserve_on_error for details. Unfortunately, we
+ * can not call restore_reserve_on_error now as it would
+ * require holding mmap_sem.
+ *
+ * If a reservation for the page existed in the reservation
+ * map of a private mapping, the map was modified to indicate
+ * the reservation was consumed when the page was allocated.
+ * We clear the PagePrivate flag now so that the global
+ * reserve count will not be incremented in free_huge_page.
+ * The reservation map will still indicate the reservation
+ * was consumed and possibly prevent later page allocation.
+ * This is better than leaking a global reservation. If no
+ * reservation existed, it is still safe to clear PagePrivate
+ * as no adjustments to reservation counts were made during
+ * allocation.
+ *
+ * The reservation map for shared mappings indicates which
+ * pages have reservations. When a huge page is allocated
+ * for an address with a reservation, no change is made to
+ * the reserve map. In this case PagePrivate will be set
+ * to indicate that the global reservation count should be
+ * incremented when the page is freed. This is the desired
+ * behavior. However, when a huge page is allocated for an
+ * address without a reservation a reservation entry is added
+ * to the reservation map, and PagePrivate will not be set.
+ * When the page is freed, the global reserve count will NOT
+ * be incremented and it will appear as though we have leaked
+ * a reserved page. In this case, set PagePrivate so that the
+ * global reserve count will be incremented to match the
+ * reservation map entry which was created.
+ *
+ * Note that vm_alloc_shared is based on the flags of the vma
+ * for which the page was originally allocated. dst_vma could
+ * be different or NULL on error.
+ */
+ if (vm_alloc_shared)
+ SetPagePrivate(page);
+ else
+ ClearPagePrivate(page);
+ put_page(page);
+ }
+ BUG_ON(copied < 0);
+ BUG_ON(err > 0);
+ BUG_ON(!copied && !err);
+ return copied ? copied : err;
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+/* fail at build time if gcc attempts to use this */
+extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage);
+#endif /* CONFIG_HUGETLB_PAGE */
+
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
@@ -175,13 +406,28 @@ retry:
*/
err = -EINVAL;
dst_vma = find_vma(dst_mm, dst_start);
- if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+ if (!dst_vma)
goto out_unlock;
+ /*
+ * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+ * it will overwrite vm_ops, so vma_is_anonymous must return false.
+ */
+ if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+ dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+
if (dst_start < dst_vma->vm_start ||
dst_start + len > dst_vma->vm_end)
goto out_unlock;
/*
+ * If this is a HUGETLB vma, pass off to appropriate routine
+ */
+ if (is_vm_hugetlb_page(dst_vma))
+ return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+ src_start, len, zeropage);
+
+ /*
* Be strict and only allow __mcopy_atomic on userfaultfd
* registered ranges to prevent userland errors going
* unnoticed. As far as the VM consistency is concerned, it
@@ -193,11 +439,7 @@ retry:
if (!dst_vma->vm_userfaultfd_ctx.ctx)
goto out_unlock;
- /*
- * FIXME: only allow copying on anonymous vmas, tmpfs should
- * be added.
- */
- if (dst_vma->vm_ops)
+ if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
/*
@@ -206,7 +448,7 @@ retry:
* dst_vma.
*/
err = -ENOMEM;
- if (unlikely(anon_vma_prepare(dst_vma)))
+ if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
goto out_unlock;
while (src_addr < src_start + len) {
@@ -243,12 +485,21 @@ retry:
BUG_ON(pmd_none(*dst_pmd));
BUG_ON(pmd_trans_huge(*dst_pmd));
- if (!zeropage)
- err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, &page);
- else
- err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr);
+ if (vma_is_anonymous(dst_vma)) {
+ if (!zeropage)
+ err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr,
+ &page);
+ else
+ err = mfill_zeropage_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr);
+ } else {
+ err = -EINVAL; /* if zeropage is true return -EINVAL */
+ if (likely(!zeropage))
+ err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr,
+ src_addr, &page);
+ }
cond_resched();
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3ca82d44edd3..d89034a393f2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1662,7 +1662,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
- warn_alloc(gfp_mask,
+ warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
@@ -1724,7 +1724,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return addr;
fail:
- warn_alloc(gfp_mask,
+ warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure: %lu bytes", real_size);
return NULL;
}
@@ -2309,7 +2309,7 @@ EXPORT_SYMBOL_GPL(free_vm_area);
#ifdef CONFIG_SMP
static struct vmap_area *node_to_va(struct rb_node *n)
{
- return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
+ return rb_entry_safe(n, struct vmap_area, rb_node);
}
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6aa5b01d3e75..7bb23ff229b6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -234,12 +234,39 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
pgdat_reclaimable_pages(pgdat) * 6;
}
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
+/**
+ * lruvec_lru_size - Returns the number of pages on the given LRU list.
+ * @lruvec: lru vector
+ * @lru: lru to use
+ * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ */
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
+ unsigned long lru_size;
+ int zid;
+
if (!mem_cgroup_disabled())
- return mem_cgroup_get_lru_size(lruvec, lru);
+ lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+ else
+ lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
+
+ for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
+ unsigned long size;
+
+ if (!managed_zone(zone))
+ continue;
+
+ if (!mem_cgroup_disabled())
+ size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+ else
+ size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
+ NR_ZONE_LRU_BASE + lru);
+ lru_size -= min(size, lru_size);
+ }
+
+ return lru_size;
- return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
}
/*
@@ -902,6 +929,17 @@ static void page_check_dirty_writeback(struct page *page,
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
+struct reclaim_stat {
+ unsigned nr_dirty;
+ unsigned nr_unqueued_dirty;
+ unsigned nr_congested;
+ unsigned nr_writeback;
+ unsigned nr_immediate;
+ unsigned nr_activate;
+ unsigned nr_ref_keep;
+ unsigned nr_unmap_fail;
+};
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -909,22 +947,20 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct pglist_data *pgdat,
struct scan_control *sc,
enum ttu_flags ttu_flags,
- unsigned long *ret_nr_dirty,
- unsigned long *ret_nr_unqueued_dirty,
- unsigned long *ret_nr_congested,
- unsigned long *ret_nr_writeback,
- unsigned long *ret_nr_immediate,
+ struct reclaim_stat *stat,
bool force_reclaim)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
int pgactivate = 0;
- unsigned long nr_unqueued_dirty = 0;
- unsigned long nr_dirty = 0;
- unsigned long nr_congested = 0;
- unsigned long nr_reclaimed = 0;
- unsigned long nr_writeback = 0;
- unsigned long nr_immediate = 0;
+ unsigned nr_unqueued_dirty = 0;
+ unsigned nr_dirty = 0;
+ unsigned nr_congested = 0;
+ unsigned nr_reclaimed = 0;
+ unsigned nr_writeback = 0;
+ unsigned nr_immediate = 0;
+ unsigned nr_ref_keep = 0;
+ unsigned nr_unmap_fail = 0;
cond_resched();
@@ -1063,6 +1099,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGEREF_ACTIVATE:
goto activate_locked;
case PAGEREF_KEEP:
+ nr_ref_keep++;
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
@@ -1100,6 +1137,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
(ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
(ttu_flags | TTU_BATCH_FLUSH))) {
case SWAP_FAIL:
+ nr_unmap_fail++;
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
@@ -1266,11 +1304,16 @@ keep:
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
- *ret_nr_dirty += nr_dirty;
- *ret_nr_congested += nr_congested;
- *ret_nr_unqueued_dirty += nr_unqueued_dirty;
- *ret_nr_writeback += nr_writeback;
- *ret_nr_immediate += nr_immediate;
+ if (stat) {
+ stat->nr_dirty = nr_dirty;
+ stat->nr_congested = nr_congested;
+ stat->nr_unqueued_dirty = nr_unqueued_dirty;
+ stat->nr_writeback = nr_writeback;
+ stat->nr_immediate = nr_immediate;
+ stat->nr_activate = pgactivate;
+ stat->nr_ref_keep = nr_ref_keep;
+ stat->nr_unmap_fail = nr_unmap_fail;
+ }
return nr_reclaimed;
}
@@ -1282,7 +1325,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
- unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
+ unsigned long ret;
struct page *page, *next;
LIST_HEAD(clean_pages);
@@ -1295,8 +1338,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_UNMAP|TTU_IGNORE_ACCESS,
- &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
+ TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
@@ -1382,8 +1424,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
* be complete before mem_cgroup_update_lru_size due to a santity check.
*/
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
- enum lru_list lru, unsigned long *nr_zone_taken,
- unsigned long nr_taken)
+ enum lru_list lru, unsigned long *nr_zone_taken)
{
int zid;
@@ -1392,11 +1433,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
continue;
__update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
- }
-
#ifdef CONFIG_MEMCG
- mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
+ mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
#endif
+ }
+
}
/*
@@ -1428,6 +1469,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
+ unsigned long skipped = 0, total_skipped = 0;
unsigned long scan, nr_pages;
LIST_HEAD(pages_skipped);
@@ -1479,14 +1521,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
*/
if (!list_empty(&pages_skipped)) {
int zid;
- unsigned long total_skipped = 0;
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
if (!nr_skipped[zid])
continue;
__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
- total_skipped += nr_skipped[zid];
+ skipped += nr_skipped[zid];
}
/*
@@ -1494,14 +1535,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* close to unreclaimable. If the LRU list is empty, account
* skipped pages as a full scan.
*/
- scan += list_empty(src) ? total_skipped : total_skipped >> 2;
+ total_skipped = list_empty(src) ? skipped : skipped >> 2;
list_splice(&pages_skipped, src);
}
- *nr_scanned = scan;
- trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
- nr_taken, mode, is_file_lru(lru));
- update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
+ *nr_scanned = scan + total_skipped;
+ trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
+ scan, skipped, nr_taken, mode, lru);
+ update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
@@ -1660,30 +1701,6 @@ static int current_may_throttle(void)
bdi_write_congested(current->backing_dev_info);
}
-static bool inactive_reclaimable_pages(struct lruvec *lruvec,
- struct scan_control *sc, enum lru_list lru)
-{
- int zid;
- struct zone *zone;
- int file = is_file_lru(lru);
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
- if (!global_reclaim(sc))
- return true;
-
- for (zid = sc->reclaim_idx; zid >= 0; zid--) {
- zone = &pgdat->node_zones[zid];
- if (!managed_zone(zone))
- continue;
-
- if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
- LRU_FILE * file) >= SWAP_CLUSTER_MAX)
- return true;
- }
-
- return false;
-}
-
/*
* shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
@@ -1696,19 +1713,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- unsigned long nr_dirty = 0;
- unsigned long nr_congested = 0;
- unsigned long nr_unqueued_dirty = 0;
- unsigned long nr_writeback = 0;
- unsigned long nr_immediate = 0;
+ struct reclaim_stat stat = {};
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- if (!inactive_reclaimable_pages(lruvec, sc, lru))
- return 0;
-
while (unlikely(too_many_isolated(pgdat, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1745,9 +1755,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return 0;
nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
- &nr_dirty, &nr_unqueued_dirty, &nr_congested,
- &nr_writeback, &nr_immediate,
- false);
+ &stat, false);
spin_lock_irq(&pgdat->lru_lock);
@@ -1781,7 +1789,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* of pages under pages flagged for immediate reclaim and stall if any
* are encountered in the nr_immediate check below.
*/
- if (nr_writeback && nr_writeback == nr_taken)
+ if (stat.nr_writeback && stat.nr_writeback == nr_taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/*
@@ -1793,7 +1801,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* Tag a zone as congested if all the dirty pages scanned were
* backed by a congested BDI and wait_iff_congested will stall.
*/
- if (nr_dirty && nr_dirty == nr_congested)
+ if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
set_bit(PGDAT_CONGESTED, &pgdat->flags);
/*
@@ -1802,7 +1810,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* the pgdat PGDAT_DIRTY and kswapd will start writing pages from
* reclaim context.
*/
- if (nr_unqueued_dirty == nr_taken)
+ if (stat.nr_unqueued_dirty == nr_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
@@ -1811,7 +1819,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* that pages are cycling through the LRU faster than
* they are written so also forcibly stall.
*/
- if (nr_immediate && current_may_throttle())
+ if (stat.nr_immediate && current_may_throttle())
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
@@ -1826,6 +1834,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
nr_scanned, nr_reclaimed,
+ stat.nr_dirty, stat.nr_writeback,
+ stat.nr_congested, stat.nr_immediate,
+ stat.nr_activate, stat.nr_ref_keep,
+ stat.nr_unmap_fail,
sc->priority, file);
return nr_reclaimed;
}
@@ -1846,17 +1858,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
*
* The downside is that we have to touch page->_refcount against each page.
* But we had to alter page->flags anyway.
+ *
+ * Returns the number of pages moved to the given lru.
*/
-static void move_active_pages_to_lru(struct lruvec *lruvec,
+static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
struct list_head *list,
struct list_head *pages_to_free,
enum lru_list lru)
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- unsigned long pgmoved = 0;
struct page *page;
int nr_pages;
+ int nr_moved = 0;
while (!list_empty(list)) {
page = lru_to_page(list);
@@ -1868,7 +1882,6 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
nr_pages = hpage_nr_pages(page);
update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
- pgmoved += nr_pages;
if (put_page_testzero(page)) {
__ClearPageLRU(page);
@@ -1882,11 +1895,15 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, pages_to_free);
+ } else {
+ nr_moved += nr_pages;
}
}
if (!is_active_lru(lru))
- __count_vm_events(PGDEACTIVATE, pgmoved);
+ __count_vm_events(PGDEACTIVATE, nr_moved);
+
+ return nr_moved;
}
static void shrink_active_list(unsigned long nr_to_scan,
@@ -1902,7 +1919,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_inactive);
struct page *page;
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- unsigned long nr_rotated = 0;
+ unsigned nr_deactivate, nr_activate;
+ unsigned nr_rotated = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -1980,13 +1998,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
- move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
+ nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
+ nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&l_hold);
free_hot_cold_page_list(&l_hold, true);
+ trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
+ nr_deactivate, nr_rotated, sc->priority, file);
}
/*
@@ -2016,14 +2036,13 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct scan_control *sc)
+ struct scan_control *sc, bool trace)
{
unsigned long inactive_ratio;
- unsigned long inactive;
- unsigned long active;
+ unsigned long inactive, active;
+ enum lru_list inactive_lru = file * LRU_FILE;
+ enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
unsigned long gb;
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- int zid;
/*
* If we don't have swap space, anonymous page deactivation
@@ -2032,29 +2051,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
if (!file && !total_swap_pages)
return false;
- inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
- active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-
- /*
- * For zone-constrained allocations, it is necessary to check if
- * deactivations are required for lowmem to be reclaimed. This
- * calculates the inactive/active pages available in eligible zones.
- */
- for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
- struct zone *zone = &pgdat->node_zones[zid];
- unsigned long inactive_zone, active_zone;
-
- if (!managed_zone(zone))
- continue;
-
- inactive_zone = zone_page_state(zone,
- NR_ZONE_LRU_BASE + (file * LRU_FILE));
- active_zone = zone_page_state(zone,
- NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
-
- inactive -= min(inactive, inactive_zone);
- active -= min(active, active_zone);
- }
+ inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
+ active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -2062,6 +2060,13 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
else
inactive_ratio = 1;
+ if (trace)
+ trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
+ sc->reclaim_idx,
+ lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+ lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+ inactive_ratio, file);
+
return inactive * inactive_ratio < active;
}
@@ -2069,7 +2074,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru), sc))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2200,8 +2205,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true, sc) &&
- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+ if (!inactive_list_is_low(lruvec, true, sc, false) &&
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2227,10 +2232,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anon in [0], file in [1]
*/
- anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
+ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
spin_lock_irq(&pgdat->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2268,7 +2273,7 @@ out:
unsigned long size;
unsigned long scan;
- size = lruvec_lru_size(lruvec, lru);
+ size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
scan = size >> sc->priority;
if (!scan && pass && force_scan)
@@ -2425,7 +2430,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, sc))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -3075,7 +3080,7 @@ static void age_active_anon(struct pglist_data *pgdat,
do {
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false, sc))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7c28df36f50f..69f9aff39a2e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1038,6 +1038,8 @@ const char * const vmstat_text[] = {
"compact_fail",
"compact_success",
"compact_daemon_wake",
+ "compact_daemon_migrate_scanned",
+ "compact_daemon_free_scanned",
#endif
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/workingset.c b/mm/workingset.c
index 241fa5d6b3b2..a67f5796b995 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -267,7 +267,7 @@ bool workingset_refault(void *shadow)
}
lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
- active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
rcu_read_unlock();
/*
@@ -473,7 +473,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->exceptional))
goto out_invalid;
inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
- __radix_tree_delete_node(&mapping->page_tree, node);
+ __radix_tree_delete_node(&mapping->page_tree, node,
+ workingset_update_node, mapping);
out_invalid:
spin_unlock(&mapping->tree_lock);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 8f9e89ca1d31..207e5ddc87a2 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -50,7 +50,7 @@
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
-#define BUDDY_MASK ((1 << NCHUNKS_ORDER) - 1)
+#define BUDDY_MASK (0x3)
struct z3fold_pool;
struct z3fold_ops {
@@ -109,7 +109,7 @@ struct z3fold_header {
unsigned short middle_chunks;
unsigned short last_chunks;
unsigned short start_middle;
- unsigned short first_num:NCHUNKS_ORDER;
+ unsigned short first_num:2;
};
/*
@@ -179,7 +179,11 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
return (struct z3fold_header *)(handle & PAGE_MASK);
}
-/* Returns buddy number */
+/*
+ * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
+ * but that doesn't matter, because the masking will result in the
+ * correct buddy number.
+ */
static enum buddy handle_to_buddy(unsigned long handle)
{
struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 9cc3c0b2c2c1..a1f24989ac23 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -25,7 +25,7 @@
* Usage of struct page flags:
* PG_private: identifies the first component page
* PG_private2: identifies the last component page
- * PG_owner_priv_1: indentifies the huge component page
+ * PG_owner_priv_1: identifies the huge component page
*
*/
@@ -364,7 +364,7 @@ static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
{
return kmem_cache_alloc(pool->zspage_cachep,
flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
-};
+}
static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
{
@@ -2383,7 +2383,7 @@ struct zs_pool *zs_create_pool(const char *name)
goto err;
/*
- * Iterate reversly, because, size of size_class that we want to use
+ * Iterate reversely, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
*/
for (i = zs_size_classes - 1; i >= 0; i--) {
diff --git a/mm/zswap.c b/mm/zswap.c
index 067a0d62f318..cabf09e0128b 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -78,7 +78,13 @@ static u64 zswap_duplicate_entry;
/* Enable/disable zswap (disabled by default) */
static bool zswap_enabled;
-module_param_named(enabled, zswap_enabled, bool, 0644);
+static int zswap_enabled_param_set(const char *,
+ const struct kernel_param *);
+static struct kernel_param_ops zswap_enabled_param_ops = {
+ .set = zswap_enabled_param_set,
+ .get = param_get_bool,
+};
+module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
/* Crypto compressor to use */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
@@ -176,6 +182,9 @@ static atomic_t zswap_pools_count = ATOMIC_INIT(0);
/* used by param callback function */
static bool zswap_init_started;
+/* fatal error during init */
+static bool zswap_init_failed;
+
/*********************************
* helpers and fwd declarations
**********************************/
@@ -624,6 +633,11 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
char *s = strstrip((char *)val);
int ret;
+ if (zswap_init_failed) {
+ pr_err("can't set param, initialization failed\n");
+ return -ENODEV;
+ }
+
/* no change required */
if (!strcmp(s, *(char **)kp->arg))
return 0;
@@ -703,6 +717,17 @@ static int zswap_zpool_param_set(const char *val,
return __zswap_param_set(val, kp, NULL, zswap_compressor);
}
+static int zswap_enabled_param_set(const char *val,
+ const struct kernel_param *kp)
+{
+ if (zswap_init_failed) {
+ pr_err("can't enable, initialization failed\n");
+ return -ENODEV;
+ }
+
+ return param_set_bool(val, kp);
+}
+
/*********************************
* writeback code
**********************************/
@@ -1201,6 +1226,9 @@ hp_fail:
dstmem_fail:
zswap_entry_cache_destroy();
cache_fail:
+ /* if built-in, we aren't unloaded on failure; don't allow use */
+ zswap_init_failed = true;
+ zswap_enabled = false;
return -ENOMEM;
}
/* must be late so crypto has time to come up */