Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                |    9
-rw-r--r--  mm/Kconfig.debug          |    5
-rw-r--r--  mm/Makefile               |    3
-rw-r--r--  mm/backing-dev.c          |   28
-rw-r--r--  mm/bootmem.c              |   26
-rw-r--r--  mm/bounce.c               |   11
-rw-r--r--  mm/compaction.c           |   43
-rw-r--r--  mm/debug-pagealloc.c      |   59
-rw-r--r--  mm/dmapool.c              |    3
-rw-r--r--  mm/fadvise.c              |    3
-rw-r--r--  mm/failslab.c             |    2
-rw-r--r--  mm/filemap.c              |   43
-rw-r--r--  mm/filemap_xip.c          |    2
-rw-r--r--  mm/fremap.c               |    1
-rw-r--r--  mm/highmem.c              |    4
-rw-r--r--  mm/huge_memory.c          |  200
-rw-r--r--  mm/hugetlb.c              |   58
-rw-r--r--  mm/internal.h             |   46
-rw-r--r--  mm/kmemleak.c             |  160
-rw-r--r--  mm/ksm.c                  |   14
-rw-r--r--  mm/maccess.c              |    2
-rw-r--r--  mm/memblock.c             |  970
-rw-r--r--  mm/memcontrol.c           | 2078
-rw-r--r--  mm/memory-failure.c       |   15
-rw-r--r--  mm/memory.c               |    8
-rw-r--r--  mm/memory_hotplug.c       |    4
-rw-r--r--  mm/mempolicy.c            |   31
-rw-r--r--  mm/mempool.c              |  106
-rw-r--r--  mm/migrate.c              |  260
-rw-r--r--  mm/mlock.c                |   15
-rw-r--r--  mm/mm_init.c              |    2
-rw-r--r--  mm/mmap.c                 |   71
-rw-r--r--  mm/mmu_context.c          |    2
-rw-r--r--  mm/mmu_notifier.c         |    2
-rw-r--r--  mm/mmzone.c               |    1
-rw-r--r--  mm/mremap.c               |   51
-rw-r--r--  mm/nobootmem.c            |   47
-rw-r--r--  mm/nommu.c                |    4
-rw-r--r--  mm/oom_kill.c             |  102
-rw-r--r--  mm/page-writeback.c       | 1175
-rw-r--r--  mm/page_alloc.c           |  845
-rw-r--r--  mm/page_cgroup.c          |  174
-rw-r--r--  mm/percpu-vm.c            |   17
-rw-r--r--  mm/percpu.c               |   80
-rw-r--r--  mm/process_vm_access.c    |  496
-rw-r--r--  mm/quicklist.c            |    1
-rw-r--r--  mm/readahead.c            |    2
-rw-r--r--  mm/rmap.c                 |   69
-rw-r--r--  mm/shmem.c                |   31
-rw-r--r--  mm/slab.c                 |   65
-rw-r--r--  mm/slob.c                 |    2
-rw-r--r--  mm/slub.c                 |  712
-rw-r--r--  mm/sparse-vmemmap.c       |    1
-rw-r--r--  mm/sparse.c               |    2
-rw-r--r--  mm/swap.c                 |  178
-rw-r--r--  mm/swap_state.c           |   12
-rw-r--r--  mm/swapfile.c             |   18
-rw-r--r--  mm/thrash.c               |    2
-rw-r--r--  mm/truncate.c             |    2
-rw-r--r--  mm/util.c                 |    2
-rw-r--r--  mm/vmalloc.c              |  157
-rw-r--r--  mm/vmscan.c               | 1032
-rw-r--r--  mm/vmstat.c               |    5
63 files changed, 5778 insertions, 3793 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f2f1ca19ed53..e338407f1225 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -131,6 +131,15 @@ config SPARSEMEM_VMEMMAP
config HAVE_MEMBLOCK
boolean
+config HAVE_MEMBLOCK_NODE_MAP
+ boolean
+
+config ARCH_DISCARD_MEMBLOCK
+ boolean
+
+config NO_BOOTMEM
+ boolean
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 8b1a477162dc..4b2443254de2 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -4,6 +4,7 @@ config DEBUG_PAGEALLOC
depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
depends on !KMEMCHECK
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
@@ -22,3 +23,7 @@ config WANT_PAGE_DEBUG_FLAGS
config PAGE_POISONING
bool
select WANT_PAGE_DEBUG_FLAGS
+
+config PAGE_GUARD
+ bool
+ select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1bf..50ec00ef2a0e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,8 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o pagewalk.o pgtable-generic.o
+ vmalloc.o pagewalk.o pgtable-generic.o \
+ process_vm_access.o
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a87da524a4a0..7ba8feae11b8 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -97,6 +97,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiDirtyThresh: %10lu kB\n"
"DirtyThresh: %10lu kB\n"
"BackgroundThresh: %10lu kB\n"
+ "BdiDirtied: %10lu kB\n"
"BdiWritten: %10lu kB\n"
"BdiWriteBandwidth: %10lu kBps\n"
"b_dirty: %10lu\n"
@@ -109,6 +110,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
K(bdi_thresh),
K(dirty_thresh),
K(background_thresh),
+ (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
(unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
(unsigned long) K(bdi->write_bandwidth),
nr_dirty,
@@ -404,9 +406,8 @@ static int bdi_forker_thread(void *ptr)
/*
* In the following loop we are going to check whether we have
* some work to do without any synchronization with tasks
- * waking us up to do work for them. So we have to set task
- * state already here so that we don't miss wakeups coming
- * after we verify some condition.
+ * waking us up to do work for them. Set the task state here
+ * so that we don't miss wakeups after verifying conditions.
*/
set_current_state(TASK_INTERRUPTIBLE);
@@ -474,7 +475,8 @@ static int bdi_forker_thread(void *ptr)
* the bdi from the thread. Hopefully 1024 is
* large enough for efficient IO.
*/
- writeback_inodes_wb(&bdi->wb, 1024);
+ writeback_inodes_wb(&bdi->wb, 1024,
+ WB_REASON_FORKER_THREAD);
} else {
/*
* The spinlock makes sure we do not lose
@@ -598,14 +600,10 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
/*
* Finally, kill the kernel thread. We don't need to be RCU
- * safe anymore, since the bdi is gone from visibility. Force
- * unfreeze of the thread before calling kthread_stop(), otherwise
- * it would never exet if it is currently stuck in the refrigerator.
+ * safe anymore, since the bdi is gone from visibility.
*/
- if (bdi->wb.task) {
- thaw_process(bdi->wb.task);
+ if (bdi->wb.task)
kthread_stop(bdi->wb.task);
- }
}
/*
@@ -684,6 +682,8 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->bw_time_stamp = jiffies;
bdi->written_stamp = 0;
+ bdi->balanced_dirty_ratelimit = INIT_BW;
+ bdi->dirty_ratelimit = INIT_BW;
bdi->write_bandwidth = INIT_BW;
bdi->avg_write_bandwidth = INIT_BW;
@@ -720,6 +720,14 @@ void bdi_destroy(struct backing_dev_info *bdi)
bdi_unregister(bdi);
+ /*
+ * If bdi_unregister() had already been called earlier, the
+ * wakeup_timer could still be armed because bdi_prune_sb()
+ * can race with the bdi_wakeup_thread_delayed() calls from
+ * __mark_inode_dirty().
+ */
+ del_timer_sync(&bdi->wb.wakeup_timer);
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 01d5a4b3dd0c..668e94df8cf2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,7 +12,7 @@
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/bootmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
#include <linux/memblock.h>
@@ -56,7 +56,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
static unsigned long __init bootmap_bytes(unsigned long pages)
{
- unsigned long bytes = (pages + 7) / 8;
+ unsigned long bytes = DIV_ROUND_UP(pages, 8);
return ALIGN(bytes, sizeof(long));
}
@@ -171,7 +171,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
- int aligned;
struct page *page;
unsigned long start, end, pages, count = 0;
@@ -181,14 +180,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
start = bdata->node_min_pfn;
end = bdata->node_low_pfn;
- /*
- * If the start is aligned to the machines wordsize, we might
- * be able to free pages in bulks of that order.
- */
- aligned = !(start & (BITS_PER_LONG - 1));
-
- bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
- bdata - bootmem_node_data, start, end, aligned);
+ bdebug("nid=%td start=%lx end=%lx\n",
+ bdata - bootmem_node_data, start, end);
while (start < end) {
unsigned long *map, idx, vec;
@@ -196,12 +189,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
vec = ~map[idx / BITS_PER_LONG];
-
- if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
+ /*
+ * If we have a properly aligned and fully unreserved
+ * BITS_PER_LONG block of pages in front of us, free
+ * it in one go.
+ */
+ if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
int order = ilog2(BITS_PER_LONG);
__free_pages_bootmem(pfn_to_page(start), order);
count += BITS_PER_LONG;
+ start += BITS_PER_LONG;
} else {
unsigned long off = 0;
@@ -214,8 +212,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
vec >>= 1;
off++;
}
+ start = ALIGN(start + 1, BITS_PER_LONG);
}
- start += BITS_PER_LONG;
}
page = virt_to_page(bdata->node_bootmem_map);
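
Aside (not part of the patch): a minimal userspace sketch of the word-at-a-time bitmap scan that the reworked free_all_bootmem_core() loop above performs -- fully free, word-aligned blocks are released in one go, everything else bit by bit. The bitmap contents and the free_page_range() stub are hypothetical stand-ins for the real bootmem structures.

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)

static void free_page_range(unsigned long pfn, unsigned long count)
{
	printf("freeing %lu page(s) at pfn %lu\n", count, pfn);
}

/* assumes @start is BITS_PER_LONG aligned, as it is after the first pass */
static void free_bitmap(const unsigned long *map, unsigned long start,
			unsigned long end)
{
	while (start < end) {
		/* invert the word: a 1 bit now means "page is free" */
		unsigned long vec = ~map[start / BITS_PER_LONG];

		if (vec == ~0UL && start + BITS_PER_LONG <= end) {
			/* fully free, aligned block: release it in one go */
			free_page_range(start, BITS_PER_LONG);
		} else {
			/* otherwise walk the bits of this word one by one */
			unsigned long off;

			for (off = 0; off < BITS_PER_LONG && start + off < end; off++)
				if (vec & (1UL << off))
					free_page_range(start + off, 1);
		}
		start += BITS_PER_LONG;
	}
}

int main(void)
{
	/* word 0 fully free, word 1 still has two pages reserved */
	unsigned long map[2] = { 0UL, (1UL << 3) | (1UL << 17) };

	free_bitmap(map, 0, 2 * BITS_PER_LONG);
	return 0;
}
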
diff --git a/mm/bounce.c b/mm/bounce.c
index 1481de68184b..4e9ae722af83 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -4,7 +4,7 @@
*/
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
#include <linux/gfp.h>
#include <linux/bio.h>
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
+#include <linux/bootmem.h>
#include <asm/tlbflush.h>
#include <trace/events/block.h>
@@ -26,12 +27,10 @@ static mempool_t *page_pool, *isa_page_pool;
#ifdef CONFIG_HIGHMEM
static __init int init_emergency_pool(void)
{
- struct sysinfo i;
- si_meminfo(&i);
- si_swapinfo(&i);
-
- if (!i.totalhigh)
+#ifndef CONFIG_MEMORY_HOTPLUG
+ if (max_pfn <= max_low_pfn)
return 0;
+#endif
page_pool = mempool_create_page_pool(POOL_SIZE, 0);
BUG_ON(!page_pool);
diff --git a/mm/compaction.c b/mm/compaction.c
index 6cc604bd5649..71a58f67f481 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */
- /* Account for isolated anon and file pages */
- unsigned long nr_anon;
- unsigned long nr_file;
-
unsigned int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
struct page *page;
- unsigned int count[NR_LRU_LISTS] = { 0, };
+ unsigned int count[2] = { 0, };
- list_for_each_entry(page, &cc->migratepages, lru) {
- int lru = page_lru_base_type(page);
- count[lru]++;
- }
+ list_for_each_entry(page, &cc->migratepages, lru)
+ count[!!page_is_file_cache(page)]++;
- cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
}
/* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
+ isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
/* Do not scan outside zone boundaries */
low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -356,8 +349,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
continue;
}
+ if (!cc->sync)
+ mode |= ISOLATE_ASYNC_MIGRATE;
+
/* Try isolate the page */
- if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+ if (__isolate_lru_page(page, mode, 0) != 0)
continue;
VM_BUG_ON(PageTransCompound(page));
@@ -369,8 +365,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
nr_isolated++;
/* Avoid isolating too much */
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ ++low_pfn;
break;
+ }
}
acct_isolated(zone, cc);
@@ -559,7 +557,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
- cc->sync);
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
@@ -586,7 +584,7 @@ out:
return ret;
}
-unsigned long compact_zone_order(struct zone *zone,
+static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
bool sync)
{
@@ -673,6 +671,7 @@ static int compact_node(int nid)
.nr_freepages = 0,
.nr_migratepages = 0,
.order = -1,
+ .sync = true,
};
zone = &pgdat->node_zones[zoneid];
@@ -725,23 +724,23 @@ int sysctl_extfrag_handler(struct ctl_table *table, int write,
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
-ssize_t sysfs_compact_node(struct sys_device *dev,
- struct sysdev_attribute *attr,
+ssize_t sysfs_compact_node(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
compact_node(dev->id);
return count;
}
-static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
+static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
int compaction_register_node(struct node *node)
{
- return sysdev_create_file(&node->sysdev, &attr_compact);
+ return device_create_file(&node->dev, &dev_attr_compact);
}
void compaction_unregister_node(struct node *node)
{
- return sysdev_remove_file(&node->sysdev, &attr_compact);
+ return device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
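
Aside (not part of the patch): the reworked acct_isolated() above replaces the per-LRU-list array with a two-bucket count indexed by !!page_is_file_cache(page). A toy sketch of that idiom follows; struct page and its is_file flag are illustrative stand-ins.

#include <stdio.h>
#include <stdbool.h>

struct page { bool is_file; };

int main(void)
{
	struct page pages[] = {
		{ .is_file = false }, { .is_file = true },
		{ .is_file = true },  { .is_file = false },
	};
	/* count[0] = anon pages, count[1] = file-backed pages */
	unsigned int count[2] = { 0, };
	size_t i;

	for (i = 0; i < sizeof(pages) / sizeof(pages[0]); i++)
		count[!!pages[i].is_file]++;	/* the bool selects the bucket */

	printf("isolated: %u anon, %u file\n", count[0], count[1]);
	return 0;
}
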
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index a1e3324de2b5..789ff70c8a4a 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -1,7 +1,10 @@
#include <linux/kernel.h>
+#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/page-debug-flags.h>
#include <linux/poison.h>
+#include <linux/ratelimit.h>
static inline void set_page_poison(struct page *page)
{
@@ -18,28 +21,13 @@ static inline bool page_poison(struct page *page)
return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
}
-static void poison_highpage(struct page *page)
-{
- /*
- * Page poisoning for highmem pages is not implemented.
- *
- * This can be called from interrupt contexts.
- * So we need to create a new kmap_atomic slot for this
- * application and it will need interrupt protection.
- */
-}
-
static void poison_page(struct page *page)
{
- void *addr;
+ void *addr = kmap_atomic(page);
- if (PageHighMem(page)) {
- poison_highpage(page);
- return;
- }
set_page_poison(page);
- addr = page_address(page);
memset(addr, PAGE_POISON, PAGE_SIZE);
+ kunmap_atomic(addr);
}
static void poison_pages(struct page *page, int n)
@@ -59,14 +47,12 @@ static bool single_bit_flip(unsigned char a, unsigned char b)
static void check_poison_mem(unsigned char *mem, size_t bytes)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
unsigned char *start;
unsigned char *end;
- for (start = mem; start < mem + bytes; start++) {
- if (*start != PAGE_POISON)
- break;
- }
- if (start == mem + bytes)
+ start = memchr_inv(mem, PAGE_POISON, bytes);
+ if (!start)
return;
for (end = mem + bytes - 1; end > start; end--) {
@@ -74,7 +60,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
break;
}
- if (!printk_ratelimit())
+ if (!__ratelimit(&ratelimit))
return;
else if (start == end && single_bit_flip(*start, PAGE_POISON))
printk(KERN_ERR "pagealloc: single bit error\n");
@@ -86,27 +72,17 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
dump_stack();
}
-static void unpoison_highpage(struct page *page)
-{
- /*
- * See comment in poison_highpage().
- * Highmem pages should not be poisoned for now
- */
- BUG_ON(page_poison(page));
-}
-
static void unpoison_page(struct page *page)
{
- if (PageHighMem(page)) {
- unpoison_highpage(page);
+ void *addr;
+
+ if (!page_poison(page))
return;
- }
- if (page_poison(page)) {
- void *addr = page_address(page);
- check_poison_mem(addr, PAGE_SIZE);
- clear_page_poison(page);
- }
+ addr = kmap_atomic(page);
+ check_poison_mem(addr, PAGE_SIZE);
+ clear_page_poison(page);
+ kunmap_atomic(addr);
}
static void unpoison_pages(struct page *page, int n)
@@ -119,9 +95,6 @@ static void unpoison_pages(struct page *page, int n)
void kernel_map_pages(struct page *page, int numpages, int enable)
{
- if (!debug_pagealloc_enabled)
- return;
-
if (enable)
unpoison_pages(page, numpages);
else
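
Aside (not part of the patch): a self-contained userspace sketch of the poison check that check_poison_mem() above now performs -- locate the first byte that deviates from the poison pattern (the kernel uses memchr_inv() for this) and classify the damage as a single bit flip or wider corruption. The PAGE_POISON value, buffer and first_not_poison() helper here are illustrative, not the kernel's.

#include <stdio.h>
#include <string.h>

#define PAGE_POISON 0xaa

static int single_bit_flip(unsigned char a, unsigned char b)
{
	unsigned char diff = a ^ b;

	return diff && !(diff & (diff - 1));	/* exactly one bit differs */
}

/* open-coded stand-in for the kernel's memchr_inv() */
static const unsigned char *first_not_poison(const unsigned char *mem,
					     size_t bytes)
{
	size_t i;

	for (i = 0; i < bytes; i++)
		if (mem[i] != PAGE_POISON)
			return mem + i;
	return NULL;
}

static void check_poison_mem(const unsigned char *mem, size_t bytes)
{
	const unsigned char *start = first_not_poison(mem, bytes);
	const unsigned char *end;

	if (!start)
		return;			/* poison pattern intact */

	for (end = mem + bytes - 1; end > start; end--)
		if (*end != PAGE_POISON)
			break;

	if (start == end && single_bit_flip(*start, PAGE_POISON))
		printf("pagealloc: single bit error at offset %td\n",
		       start - mem);
	else
		printf("pagealloc: memory corruption, bytes %td..%td\n",
		       start - mem, end - mem);
}

int main(void)
{
	unsigned char buf[64];

	memset(buf, PAGE_POISON, sizeof(buf));
	buf[13] ^= 0x08;		/* inject a single bit flip */
	check_poison_mem(buf, sizeof(buf));
	return 0;
}
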
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fbb58e346888..c5ab33bca0a8 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -27,11 +27,12 @@
#include <linux/dmapool.h>
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/stat.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/types.h>
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 8d723c9e8b75..469491e0af79 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -117,7 +117,8 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
break;
case POSIX_FADV_DONTNEED:
if (!bdi_write_congested(mapping->backing_dev_info))
- filemap_flush(mapping);
+ __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
/* First and last FULL page! */
start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
diff --git a/mm/failslab.c b/mm/failslab.c
index 0dd7b8fec71c..fefaabaab76d 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -35,7 +35,7 @@ __setup("failslab=", setup_failslab);
static int __init failslab_debugfs_init(void)
{
struct dentry *dir;
- mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
if (IS_ERR(dir))
diff --git a/mm/filemap.c b/mm/filemap.c
index 7771871fa353..97f49ed35bd2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,7 +9,7 @@
* most "normal" filesystems (but you don't /have/ to use this:
* the NFS filesystem used to do this differently, for example)
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
@@ -393,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
int error;
- struct mem_cgroup *memcg = NULL;
VM_BUG_ON(!PageLocked(old));
VM_BUG_ON(!PageLocked(new));
VM_BUG_ON(new->mapping);
- /*
- * This is not page migration, but prepare_migration and
- * end_migration does enough work for charge replacement.
- *
- * In the longer term we probably want a specialized function
- * for moving the charge from old to new in a more efficient
- * manner.
- */
- error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
- if (error)
- return error;
-
error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (!error) {
struct address_space *mapping = old->mapping;
@@ -432,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
+ /* mem_cgroup codes must not be called under tree_lock */
+ mem_cgroup_replace_page_cache(old, new);
radix_tree_preload_end();
if (freepage)
freepage(old);
page_cache_release(old);
- mem_cgroup_end_migration(memcg, old, new, true);
- } else {
- mem_cgroup_end_migration(memcg, old, new, false);
}
return error;
@@ -1828,7 +1814,7 @@ repeat:
page = __page_cache_alloc(gfp | __GFP_COLD);
if (!page)
return ERR_PTR(-ENOMEM);
- err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+ err = add_to_page_cache_lru(page, mapping, index, gfp);
if (unlikely(err)) {
page_cache_release(page);
if (err == -EEXIST)
@@ -1925,10 +1911,7 @@ static struct page *wait_on_page_read(struct page *page)
* @gfp: the page allocator flags to use if allocating
*
* This is the same as "read_mapping_page(mapping, index, NULL)", but with
- * any new page allocations done using the specified allocation flags. Note
- * that the Radix tree operations will still use GFP_KERNEL, so you can't
- * expect to do this atomically or anything like that - but you can pass in
- * other page requirements.
+ * any new page allocations done using the specified allocation flags.
*
* If the page does not get brought uptodate, return -EIO.
*/
@@ -1971,7 +1954,7 @@ EXPORT_SYMBOL(read_cache_page);
*/
int should_remove_suid(struct dentry *dentry)
{
- mode_t mode = dentry->d_inode->i_mode;
+ umode_t mode = dentry->d_inode->i_mode;
int kill = 0;
/* suid always must be killed */
@@ -2115,6 +2098,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
} else {
const struct iovec *iov = i->iov;
size_t base = i->iov_offset;
+ unsigned long nr_segs = i->nr_segs;
/*
* The !iov->iov_len check ensures we skip over unlikely
@@ -2130,11 +2114,13 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
base += copy;
if (iov->iov_len == base) {
iov++;
+ nr_segs--;
base = 0;
}
}
i->iov = iov;
i->iov_offset = base;
+ i->nr_segs = nr_segs;
}
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -2351,8 +2337,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
int status;
+ gfp_t gfp_mask;
struct page *page;
gfp_t gfp_notmask = 0;
+
+ gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE;
if (flags & AOP_FLAG_NOFS)
gfp_notmask = __GFP_FS;
repeat:
@@ -2360,7 +2349,7 @@ repeat:
if (page)
goto found;
- page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
+ page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
if (!page)
return NULL;
status = add_to_page_cache_lru(page, mapping, index,
@@ -2404,7 +2393,6 @@ static ssize_t generic_perform_write(struct file *file,
iov_iter_count(i));
again:
-
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
@@ -2460,7 +2448,10 @@ again:
written += copied;
balance_dirty_pages_ratelimited(mapping);
-
+ if (fatal_signal_pending(current)) {
+ status = -EINTR;
+ break;
+ }
} while (iov_iter_count(i));
return written ? written : status;
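
Aside (not part of the patch): the iov_iter_advance() hunk above fixes nr_segs falling out of sync when the iterator crosses a segment boundary. Below is a simplified userspace model of that bookkeeping; struct iter and iter_advance() are illustrative, and the real kernel iterator additionally tracks a remaining byte count and skips zero-length segments.

#include <stdio.h>
#include <sys/uio.h>

/* simplified model of the kernel's iterator state */
struct iter {
	const struct iovec *iov;
	unsigned long nr_segs;
	size_t iov_offset;
};

static void iter_advance(struct iter *i, size_t bytes)
{
	const struct iovec *iov = i->iov;
	unsigned long nr_segs = i->nr_segs;
	size_t base = i->iov_offset;

	while (bytes && nr_segs) {
		size_t avail = iov->iov_len - base;
		size_t copy = bytes < avail ? bytes : avail;

		bytes -= copy;
		base += copy;
		if (base == iov->iov_len) {
			iov++;		/* move to the next segment ... */
			nr_segs--;	/* ... and keep nr_segs in sync (the fix) */
			base = 0;
		}
	}
	i->iov = iov;
	i->nr_segs = nr_segs;
	i->iov_offset = base;
}

int main(void)
{
	char a[8], b[8];
	struct iovec vec[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};
	struct iter it = { .iov = vec, .nr_segs = 2, .iov_offset = 0 };

	iter_advance(&it, 10);	/* crosses the first segment boundary */
	printf("segments left: %lu, offset in current: %zu\n",
	       it.nr_segs, it.iov_offset);
	return 0;
}
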
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 93356cd12828..f91b2f687343 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -10,7 +10,7 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/uio.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
diff --git a/mm/fremap.c b/mm/fremap.c
index b8e0e2d468af..9ed4fd432467 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -13,7 +13,6 @@
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/rmap.h>
-#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
diff --git a/mm/highmem.c b/mm/highmem.c
index 5ef672c07f75..57d82c6250c3 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -17,7 +17,7 @@
*/
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
@@ -250,7 +250,7 @@ void *kmap_high_get(struct page *page)
#endif
/**
- * kunmap_high - map a highmem page into memory
+ * kunmap_high - unmap a highmem page into memory
* @page: &struct page to unmap
*
* If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2d1587be269..b3ffc21ce801 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,7 +89,8 @@ struct khugepaged_scan {
struct list_head mm_head;
struct mm_slot *mm_slot;
unsigned long address;
-} khugepaged_scan = {
+};
+static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
@@ -486,41 +487,68 @@ static struct attribute_group khugepaged_attr_group = {
.attrs = khugepaged_attr,
.name = "khugepaged",
};
-#endif /* CONFIG_SYSFS */
-static int __init hugepage_init(void)
+static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
int err;
-#ifdef CONFIG_SYSFS
- static struct kobject *hugepage_kobj;
-#endif
-
- err = -EINVAL;
- if (!has_transparent_hugepage()) {
- transparent_hugepage_flags = 0;
- goto out;
- }
-#ifdef CONFIG_SYSFS
- err = -ENOMEM;
- hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
- if (unlikely(!hugepage_kobj)) {
+ *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+ if (unlikely(!*hugepage_kobj)) {
printk(KERN_ERR "hugepage: failed kobject create\n");
- goto out;
+ return -ENOMEM;
}
- err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
+ err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
if (err) {
printk(KERN_ERR "hugepage: failed register hugeage group\n");
- goto out;
+ goto delete_obj;
}
- err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
+ err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
if (err) {
printk(KERN_ERR "hugepage: failed register hugeage group\n");
- goto out;
+ goto remove_hp_group;
}
-#endif
+
+ return 0;
+
+remove_hp_group:
+ sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
+delete_obj:
+ kobject_put(*hugepage_kobj);
+ return err;
+}
+
+static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
+{
+ sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
+ sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
+ kobject_put(hugepage_kobj);
+}
+#else
+static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
+{
+ return 0;
+}
+
+static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
+{
+}
+#endif /* CONFIG_SYSFS */
+
+static int __init hugepage_init(void)
+{
+ int err;
+ struct kobject *hugepage_kobj;
+
+ if (!has_transparent_hugepage()) {
+ transparent_hugepage_flags = 0;
+ return -EINVAL;
+ }
+
+ err = hugepage_init_sysfs(&hugepage_kobj);
+ if (err)
+ return err;
err = khugepaged_slab_init();
if (err)
@@ -544,7 +572,9 @@ static int __init hugepage_init(void)
set_recommended_min_free_kbytes();
+ return 0;
out:
+ hugepage_exit_sysfs(hugepage_kobj);
return err;
}
module_init(hugepage_init)
@@ -829,7 +859,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
for (i = 0; i < HPAGE_PMD_NR; i++) {
copy_user_highpage(pages[i], page + i,
- haddr + PAGE_SHIFT*i, vma);
+ haddr + PAGE_SIZE * i, vma);
__SetPageUptodate(pages[i]);
cond_resched();
}
@@ -989,14 +1019,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON(!PageCompound(page));
if (flags & FOLL_GET)
- get_page(page);
+ get_page_foll(page);
out:
return page;
}
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
- pmd_t *pmd)
+ pmd_t *pmd, unsigned long addr)
{
int ret = 0;
@@ -1012,6 +1042,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pgtable = get_pmd_huge_pte(tlb->mm);
page = pmd_page(*pmd);
pmd_clear(pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
page_remove_rmap(page);
VM_BUG_ON(page_mapcount(page) < 0);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
@@ -1052,6 +1083,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
return ret;
}
+int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+ unsigned long old_addr,
+ unsigned long new_addr, unsigned long old_end,
+ pmd_t *old_pmd, pmd_t *new_pmd)
+{
+ int ret = 0;
+ pmd_t pmd;
+
+ struct mm_struct *mm = vma->vm_mm;
+
+ if ((old_addr & ~HPAGE_PMD_MASK) ||
+ (new_addr & ~HPAGE_PMD_MASK) ||
+ old_end - old_addr < HPAGE_PMD_SIZE ||
+ (new_vma->vm_flags & VM_NOHUGEPAGE))
+ goto out;
+
+ /*
+ * The destination pmd shouldn't be established, free_pgtables()
+ * should have release it.
+ */
+ if (WARN_ON(!pmd_none(*new_pmd))) {
+ VM_BUG_ON(pmd_trans_huge(*new_pmd));
+ goto out;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*old_pmd))) {
+ if (pmd_trans_splitting(*old_pmd)) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, old_pmd);
+ ret = -1;
+ } else {
+ pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ VM_BUG_ON(!pmd_none(*new_pmd));
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
+ spin_unlock(&mm->page_table_lock);
+ ret = 1;
+ }
+ } else {
+ spin_unlock(&mm->page_table_lock);
+ }
+out:
+ return ret;
+}
+
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot)
{
@@ -1070,7 +1146,6 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
entry = pmd_modify(entry, newprot);
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);
- flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
ret = 1;
}
} else
@@ -1153,22 +1228,39 @@ static int __split_huge_page_splitting(struct page *page,
static void __split_huge_page_refcount(struct page *page)
{
int i;
- unsigned long head_index = page->index;
struct zone *zone = page_zone(page);
- int zonestat;
+ int tail_count = 0;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
compound_lock(page);
+ /* complete memcg works before add pages to LRU */
+ mem_cgroup_split_huge_fixup(page);
- for (i = 1; i < HPAGE_PMD_NR; i++) {
+ for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
struct page *page_tail = page + i;
- /* tail_page->_count cannot change */
- atomic_sub(atomic_read(&page_tail->_count), &page->_count);
- BUG_ON(page_count(page) <= 0);
- atomic_add(page_mapcount(page) + 1, &page_tail->_count);
- BUG_ON(atomic_read(&page_tail->_count) <= 0);
+ /* tail_page->_mapcount cannot change */
+ BUG_ON(page_mapcount(page_tail) < 0);
+ tail_count += page_mapcount(page_tail);
+ /* check for overflow */
+ BUG_ON(tail_count < 0);
+ BUG_ON(atomic_read(&page_tail->_count) != 0);
+ /*
+ * tail_page->_count is zero and not changing from
+ * under us. But get_page_unless_zero() may be running
+ * from under us on the tail_page. If we used
+ * atomic_set() below instead of atomic_add(), we
+ * would then run atomic_set() concurrently with
+ * get_page_unless_zero(), and atomic_set() is
+ * implemented in C not using locked ops. spin_unlock
+ * on x86 sometime uses locked ops because of PPro
+ * errata 66, 92, so unless somebody can guarantee
+ * atomic_set() here would be safe on all archs (and
+ * not only on x86), it's safer to use atomic_add().
+ */
+ atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
+ &page_tail->_count);
/* after clearing PageTail the gup refcount can be released */
smp_mb();
@@ -1186,10 +1278,7 @@ static void __split_huge_page_refcount(struct page *page)
(1L << PG_uptodate)));
page_tail->flags |= (1L << PG_dirty);
- /*
- * 1) clear PageTail before overwriting first_page
- * 2) clear PageTail before clearing PageHead for VM_BUG_ON
- */
+ /* clear PageTail before overwriting first_page */
smp_wmb();
/*
@@ -1206,36 +1295,27 @@ static void __split_huge_page_refcount(struct page *page)
* status is achieved setting a reserved bit in the
* pmd, not by clearing the present bit.
*/
- BUG_ON(page_mapcount(page_tail));
page_tail->_mapcount = page->_mapcount;
BUG_ON(page_tail->mapping);
page_tail->mapping = page->mapping;
- page_tail->index = ++head_index;
+ page_tail->index = page->index + i;
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
BUG_ON(!PageDirty(page_tail));
BUG_ON(!PageSwapBacked(page_tail));
- mem_cgroup_split_huge_fixup(page, page_tail);
lru_add_page_tail(zone, page, page_tail);
}
+ atomic_sub(tail_count, &page->_count);
+ BUG_ON(atomic_read(&page->_count) <= 0);
__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
- /*
- * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
- * so adjust those appropriately if this page is on the LRU.
- */
- if (PageLRU(page)) {
- zonestat = NR_LRU_BASE + page_lru(page);
- __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
- }
-
ClearPageCompound(page);
compound_unlock(page);
spin_unlock_irq(&zone->lru_lock);
@@ -1906,7 +1986,7 @@ static void collapse_huge_page(struct mm_struct *mm,
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
set_pmd_at(mm, address, pmd, _pmd);
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache(vma, address, _pmd);
prepare_pmd_huge_pte(pgtable, mm);
mm->nr_ptes--;
spin_unlock(&mm->page_table_lock);
@@ -2024,6 +2104,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
+ __releases(&khugepaged_mm_lock)
+ __acquires(&khugepaged_mm_lock)
{
struct mm_slot *mm_slot;
struct mm_struct *mm;
@@ -2196,12 +2278,8 @@ static void khugepaged_do_scan(struct page **hpage)
static void khugepaged_alloc_sleep(void)
{
- DEFINE_WAIT(wait);
- add_wait_queue(&khugepaged_wait, &wait);
- schedule_timeout_interruptible(
- msecs_to_jiffies(
- khugepaged_alloc_sleep_millisecs));
- remove_wait_queue(&khugepaged_wait, &wait);
+ wait_event_freezable_timeout(khugepaged_wait, false,
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
}
#ifndef CONFIG_NUMA
@@ -2250,14 +2328,10 @@ static void khugepaged_loop(void)
if (unlikely(kthread_should_stop()))
break;
if (khugepaged_has_work()) {
- DEFINE_WAIT(wait);
if (!khugepaged_scan_sleep_millisecs)
continue;
- add_wait_queue(&khugepaged_wait, &wait);
- schedule_timeout_interruptible(
- msecs_to_jiffies(
- khugepaged_scan_sleep_millisecs));
- remove_wait_queue(&khugepaged_wait, &wait);
+ wait_event_freezable_timeout(khugepaged_wait, false,
+ msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
} else if (khugepaged_enabled())
wait_event_freezable(khugepaged_wait,
khugepaged_wait_event());
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dae27ba3be2c..ea8c3a4cd2ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -576,6 +576,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
__SetPageTail(p);
+ set_page_count(p, 0);
p->first_page = page;
}
}
@@ -799,7 +800,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
if (page && arch_prepare_hugepage(page)) {
__free_pages(page, huge_page_order(h));
- return NULL;
+ page = NULL;
}
spin_lock(&hugetlb_lock);
@@ -900,7 +901,6 @@ retry:
h->resv_huge_pages += delta;
ret = 0;
- spin_unlock(&hugetlb_lock);
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
@@ -914,6 +914,7 @@ retry:
VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
+ spin_unlock(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
free:
@@ -1591,9 +1592,9 @@ static void __init hugetlb_sysfs_init(void)
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
- * with node sysdevs in node_devices[] using a parallel array. The array
- * index of a node sysdev or _hstate == node id.
- * This is here to avoid any static dependency of the node sysdev driver, in
+ * with node devices in node_devices[] using a parallel array. The array
+ * index of a node device or _hstate == node id.
+ * This is here to avoid any static dependency of the node device driver, in
* the base kernel, on the hugetlb module.
*/
struct node_hstate {
@@ -1603,7 +1604,7 @@ struct node_hstate {
struct node_hstate node_hstates[MAX_NUMNODES];
/*
- * A subset of global hstate attributes for node sysdevs
+ * A subset of global hstate attributes for node devices
*/
static struct attribute *per_node_hstate_attrs[] = {
&nr_hugepages_attr.attr,
@@ -1617,7 +1618,7 @@ static struct attribute_group per_node_hstate_attr_group = {
};
/*
- * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
+ * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
* Returns node id via non-NULL nidp.
*/
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
@@ -1640,13 +1641,13 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
}
/*
- * Unregister hstate attributes from a single node sysdev.
+ * Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
if (!nhs->hugepages_kobj)
return; /* no hstate attributes */
@@ -1662,7 +1663,7 @@ void hugetlb_unregister_node(struct node *node)
}
/*
- * hugetlb module exit: unregister hstate attributes from node sysdevs
+ * hugetlb module exit: unregister hstate attributes from node devices
* that have them.
*/
static void hugetlb_unregister_all_nodes(void)
@@ -1670,7 +1671,7 @@ static void hugetlb_unregister_all_nodes(void)
int nid;
/*
- * disable node sysdev registrations.
+ * disable node device registrations.
*/
register_hugetlbfs_with_node(NULL, NULL);
@@ -1682,20 +1683,20 @@ static void hugetlb_unregister_all_nodes(void)
}
/*
- * Register hstate attributes for a single node sysdev.
+ * Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
void hugetlb_register_node(struct node *node)
{
struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
if (nhs->hugepages_kobj)
return; /* already allocated */
nhs->hugepages_kobj = kobject_create_and_add("hugepages",
- &node->sysdev.kobj);
+ &node->dev.kobj);
if (!nhs->hugepages_kobj)
return;
@@ -1706,7 +1707,7 @@ void hugetlb_register_node(struct node *node)
if (err) {
printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
" for node %d\n",
- h->name, node->sysdev.id);
+ h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
}
@@ -1715,8 +1716,8 @@ void hugetlb_register_node(struct node *node)
/*
* hugetlb init time: register hstate attributes for all registered node
- * sysdevs of nodes that have memory. All on-line nodes should have
- * registered their associated sysdev by this time.
+ * devices of nodes that have memory. All on-line nodes should have
+ * registered their associated device by this time.
*/
static void hugetlb_register_all_nodes(void)
{
@@ -1724,12 +1725,12 @@ static void hugetlb_register_all_nodes(void)
for_each_node_state(nid, N_HIGH_MEMORY) {
struct node *node = &node_devices[nid];
- if (node->sysdev.id == nid)
+ if (node->dev.id == nid)
hugetlb_register_node(node);
}
/*
- * Let the node sysdev driver know we're here so it can
+ * Let the node device driver know we're here so it can
* [un]register hstate attributes on node hotplug.
*/
register_hugetlbfs_with_node(hugetlb_register_node,
@@ -2314,8 +2315,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* from page cache lookup which is in HPAGE_SIZE units.
*/
address = address & huge_page_mask(h);
- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
- + (vma->vm_pgoff >> PAGE_SHIFT);
+ pgoff = vma_hugecache_offset(h, vma, address);
mapping = (struct address_space *)page_private(page);
/*
@@ -2348,6 +2348,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * cannot race with other handlers or page migration.
+ * Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
@@ -2407,7 +2410,14 @@ retry_avoidcopy:
BUG_ON(page_count(old_page) != 1);
BUG_ON(huge_pte_none(pte));
spin_lock(&mm->page_table_lock);
- goto retry_avoidcopy;
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ if (likely(pte_same(huge_ptep_get(ptep), pte)))
+ goto retry_avoidcopy;
+ /*
+ * race occurs while re-acquiring page_table_lock, and
+ * our job is done.
+ */
+ return 0;
}
WARN_ON_ONCE(1);
}
@@ -2422,6 +2432,8 @@ retry_avoidcopy:
* anon_vma prepared.
*/
if (unlikely(anon_vma_prepare(vma))) {
+ page_cache_release(new_page);
+ page_cache_release(old_page);
/* Caller expects lock to be held */
spin_lock(&mm->page_table_lock);
return VM_FAULT_OOM;
@@ -2627,6 +2639,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
+ address &= huge_page_mask(h);
+
ptep = huge_pte_offset(mm, address);
if (ptep) {
entry = huge_ptep_get(ptep);
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb49..2189af491783 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
atomic_dec(&page->_count);
}
+static inline void __get_page_tail_foll(struct page *page,
+ bool get_page_head)
+{
+ /*
+ * If we're getting a tail page, the elevated page->_count is
+ * required only in the head page and we will elevate the head
+ * page->_count and tail page->_mapcount.
+ *
+ * We elevate page_tail->_mapcount for tail pages to force
+ * page_tail->_count to be zero at all times to avoid getting
+ * false positives from get_page_unless_zero() with
+ * speculative page access (like in
+ * page_cache_get_speculative()) on tail pages.
+ */
+ VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+ VM_BUG_ON(atomic_read(&page->_count) != 0);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ if (get_page_head)
+ atomic_inc(&page->first_page->_count);
+ atomic_inc(&page->_mapcount);
+}
+
+/*
+ * This is meant to be called as the FOLL_GET operation of
+ * follow_page() and it must be called while holding the proper PT
+ * lock while the pte (or pmd_trans_huge) is still mapping the page.
+ */
+static inline void get_page_foll(struct page *page)
+{
+ if (unlikely(PageTail(page)))
+ /*
+ * This is safe only because
+ * __split_huge_page_refcount() can't run under
+ * get_page_foll() because we hold the proper PT lock.
+ */
+ __get_page_tail_foll(page, true);
+ else {
+ /*
+ * Getting a normal page or the head of a compound page
+ * requires to already have an elevated page->_count.
+ */
+ VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ atomic_inc(&page->_count);
+ }
+}
+
extern unsigned long highest_memmap_pfn;
/*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d6880f542f95..c833addd94d7 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -69,7 +69,7 @@
#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kthread.h>
#include <linux/prio_tree.h>
#include <linux/fs.h>
@@ -100,6 +100,7 @@
#include <linux/kmemcheck.h>
#include <linux/kmemleak.h>
+#include <linux/memory_hotplug.h>
/*
* Kmemleak configuration and common defines.
@@ -196,7 +197,9 @@ static atomic_t kmemleak_enabled = ATOMIC_INIT(0);
static atomic_t kmemleak_initialized = ATOMIC_INIT(0);
/* enables or disables early logging of the memory operations */
static atomic_t kmemleak_early_log = ATOMIC_INIT(1);
-/* set if a fata kmemleak error has occurred */
+/* set if a kmemleak warning was issued */
+static atomic_t kmemleak_warning = ATOMIC_INIT(0);
+/* set if a fatal kmemleak error has occurred */
static atomic_t kmemleak_error = ATOMIC_INIT(0);
/* minimum and maximum address that may be valid pointers */
@@ -228,8 +231,10 @@ static int kmemleak_skip_disable;
/* kmemleak operation type for early logging */
enum {
KMEMLEAK_ALLOC,
+ KMEMLEAK_ALLOC_PERCPU,
KMEMLEAK_FREE,
KMEMLEAK_FREE_PART,
+ KMEMLEAK_FREE_PERCPU,
KMEMLEAK_NOT_LEAK,
KMEMLEAK_IGNORE,
KMEMLEAK_SCAN_AREA,
@@ -259,9 +264,10 @@ static void kmemleak_disable(void);
/*
* Print a warning and dump the stack trace.
*/
-#define kmemleak_warn(x...) do { \
- pr_warning(x); \
- dump_stack(); \
+#define kmemleak_warn(x...) do { \
+ pr_warning(x); \
+ dump_stack(); \
+ atomic_set(&kmemleak_warning, 1); \
} while (0)
/*
@@ -403,8 +409,8 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
object = prio_tree_entry(node, struct kmemleak_object,
tree_node);
if (!alias && object->pointer != ptr) {
- pr_warning("Found object by alias at 0x%08lx\n", ptr);
- dump_stack();
+ kmemleak_warn("Found object by alias at 0x%08lx\n",
+ ptr);
dump_object_info(object);
object = NULL;
}
@@ -794,9 +800,13 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
unsigned long flags;
struct early_log *log;
+ if (atomic_read(&kmemleak_error)) {
+ /* kmemleak stopped recording, just count the requests */
+ crt_early_log++;
+ return;
+ }
+
if (crt_early_log >= ARRAY_SIZE(early_log)) {
- pr_warning("Early log buffer exceeded, "
- "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n");
kmemleak_disable();
return;
}
@@ -811,8 +821,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
log->ptr = ptr;
log->size = size;
log->min_count = min_count;
- if (op_type == KMEMLEAK_ALLOC)
- log->trace_len = __save_stack_trace(log->trace);
+ log->trace_len = __save_stack_trace(log->trace);
crt_early_log++;
local_irq_restore(flags);
}
@@ -846,6 +855,20 @@ out:
rcu_read_unlock();
}
+/*
+ * Log an early allocated block and populate the stack trace.
+ */
+static void early_alloc_percpu(struct early_log *log)
+{
+ unsigned int cpu;
+ const void __percpu *ptr = log->ptr;
+
+ for_each_possible_cpu(cpu) {
+ log->ptr = per_cpu_ptr(ptr, cpu);
+ early_alloc(log);
+ }
+}
+
/**
* kmemleak_alloc - register a newly allocated object
* @ptr: pointer to beginning of the object
@@ -873,6 +896,34 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
EXPORT_SYMBOL_GPL(kmemleak_alloc);
/**
+ * kmemleak_alloc_percpu - register a newly allocated __percpu object
+ * @ptr: __percpu pointer to beginning of the object
+ * @size: size of the object
+ *
+ * This function is called from the kernel percpu allocator when a new object
+ * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL
+ * allocation.
+ */
+void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
+{
+ unsigned int cpu;
+
+ pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size);
+
+ /*
+ * Percpu allocations are only scanned and not reported as leaks
+ * (min_count is set to 0).
+ */
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ for_each_possible_cpu(cpu)
+ create_object((unsigned long)per_cpu_ptr(ptr, cpu),
+ size, 0, GFP_KERNEL);
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
+
+/**
* kmemleak_free - unregister a previously registered object
* @ptr: pointer to beginning of the object
*
@@ -911,6 +962,28 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
EXPORT_SYMBOL_GPL(kmemleak_free_part);
/**
+ * kmemleak_free_percpu - unregister a previously registered __percpu object
+ * @ptr: __percpu pointer to beginning of the object
+ *
+ * This function is called from the kernel percpu allocator when an object
+ * (memory block) is freed (free_percpu).
+ */
+void __ref kmemleak_free_percpu(const void __percpu *ptr)
+{
+ unsigned int cpu;
+
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ for_each_possible_cpu(cpu)
+ delete_object_full((unsigned long)per_cpu_ptr(ptr,
+ cpu));
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
+
+/**
* kmemleak_not_leak - mark an allocated object as false positive
* @ptr: pointer to beginning of the object
*
@@ -1220,9 +1293,9 @@ static void kmemleak_scan(void)
#endif
/*
- * Struct page scanning for each node. The code below is not yet safe
- * with MEMORY_HOTPLUG.
+ * Struct page scanning for each node.
*/
+ lock_memory_hotplug();
for_each_online_node(i) {
pg_data_t *pgdat = NODE_DATA(i);
unsigned long start_pfn = pgdat->node_start_pfn;
@@ -1241,6 +1314,7 @@ static void kmemleak_scan(void)
scan_block(page, page + 1, NULL, 1);
}
}
+ unlock_memory_hotplug();
/*
* Scanning the task stacks (may introduce false negatives).
@@ -1467,9 +1541,6 @@ static const struct seq_operations kmemleak_seq_ops = {
static int kmemleak_open(struct inode *inode, struct file *file)
{
- if (!atomic_read(&kmemleak_enabled))
- return -EBUSY;
-
return seq_open(file, &kmemleak_seq_ops);
}
@@ -1543,6 +1614,9 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
int buf_size;
int ret;
+ if (!atomic_read(&kmemleak_enabled))
+ return -EBUSY;
+
buf_size = min(size, (sizeof(buf) - 1));
if (strncpy_from_user(buf, user_buf, buf_size) < 0)
return -EFAULT;
@@ -1602,20 +1676,24 @@ static const struct file_operations kmemleak_fops = {
};
/*
- * Perform the freeing of the kmemleak internal objects after waiting for any
- * current memory scan to complete.
+ * Stop the memory scanning thread and free the kmemleak internal objects if
+ * no previous scan thread (otherwise, kmemleak may still have some useful
+ * information on memory leaks).
*/
static void kmemleak_do_cleanup(struct work_struct *work)
{
struct kmemleak_object *object;
+ bool cleanup = scan_thread == NULL;
mutex_lock(&scan_mutex);
stop_scan_thread();
- rcu_read_lock();
- list_for_each_entry_rcu(object, &object_list, object_list)
- delete_object_full(object->pointer);
- rcu_read_unlock();
+ if (cleanup) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list)
+ delete_object_full(object->pointer);
+ rcu_read_unlock();
+ }
mutex_unlock(&scan_mutex);
}
@@ -1632,7 +1710,6 @@ static void kmemleak_disable(void)
return;
/* stop any memory operation tracing */
- atomic_set(&kmemleak_early_log, 0);
atomic_set(&kmemleak_enabled, 0);
/* check whether it is too early for a kernel thread */
@@ -1659,6 +1736,17 @@ static int kmemleak_boot_config(char *str)
}
early_param("kmemleak", kmemleak_boot_config);
+static void __init print_log_trace(struct early_log *log)
+{
+ struct stack_trace trace;
+
+ trace.nr_entries = log->trace_len;
+ trace.entries = log->trace;
+
+ pr_notice("Early log backtrace:\n");
+ print_stack_trace(&trace, 2);
+}
+
/*
* Kmemleak initialization.
*/
@@ -1681,12 +1769,18 @@ void __init kmemleak_init(void)
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
INIT_PRIO_TREE_ROOT(&object_tree_root);
+ if (crt_early_log >= ARRAY_SIZE(early_log))
+ pr_warning("Early log buffer exceeded (%d), please increase "
+ "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
+
/* the kernel is still in UP mode, so disabling the IRQs is enough */
local_irq_save(flags);
- if (!atomic_read(&kmemleak_error)) {
+ atomic_set(&kmemleak_early_log, 0);
+ if (atomic_read(&kmemleak_error)) {
+ local_irq_restore(flags);
+ return;
+ } else
atomic_set(&kmemleak_enabled, 1);
- atomic_set(&kmemleak_early_log, 0);
- }
local_irq_restore(flags);
/*
@@ -1701,12 +1795,18 @@ void __init kmemleak_init(void)
case KMEMLEAK_ALLOC:
early_alloc(log);
break;
+ case KMEMLEAK_ALLOC_PERCPU:
+ early_alloc_percpu(log);
+ break;
case KMEMLEAK_FREE:
kmemleak_free(log->ptr);
break;
case KMEMLEAK_FREE_PART:
kmemleak_free_part(log->ptr, log->size);
break;
+ case KMEMLEAK_FREE_PERCPU:
+ kmemleak_free_percpu(log->ptr);
+ break;
case KMEMLEAK_NOT_LEAK:
kmemleak_not_leak(log->ptr);
break;
@@ -1720,7 +1820,13 @@ void __init kmemleak_init(void)
kmemleak_no_scan(log->ptr);
break;
default:
- WARN_ON(1);
+ kmemleak_warn("Unknown early log operation: %d\n",
+ log->op_type);
+ }
+
+ if (atomic_read(&kmemleak_warning)) {
+ print_log_trace(log);
+ atomic_set(&kmemleak_warning, 0);
}
}
}
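
Aside (not part of the patch): the kmemleak hunks above extend the early-log mechanism -- operations arriving before kmemleak is initialised are recorded and later replayed from kmemleak_init() -- with the new per-CPU alloc/free entries. Here is a self-contained userspace sketch of that record-and-replay pattern; the tracker, enum values and handlers are illustrative stand-ins, and the real code also saves stack traces and disables itself on buffer overflow.

#include <stdio.h>
#include <stddef.h>

enum op_type { OP_ALLOC, OP_ALLOC_PERCPU, OP_FREE, OP_FREE_PERCPU };

struct early_log {
	enum op_type op_type;
	const void *ptr;
	size_t size;
};

#define EARLY_LOG_SIZE 16
static struct early_log early_log[EARLY_LOG_SIZE];
static int crt_early_log;
static int tracker_enabled;

static void log_early(enum op_type op, const void *ptr, size_t size)
{
	if (crt_early_log >= EARLY_LOG_SIZE) {
		crt_early_log++;	/* buffer full: just count the request */
		return;
	}
	early_log[crt_early_log++] = (struct early_log){ op, ptr, size };
}

static void track(enum op_type op, const void *ptr, size_t size)
{
	if (!tracker_enabled) {		/* too early: defer the operation */
		log_early(op, ptr, size);
		return;
	}
	printf("op %d ptr %p size %zu\n", op, ptr, size);
}

static void tracker_init(void)
{
	int i;

	tracker_enabled = 1;
	/* replay everything that was recorded before init */
	for (i = 0; i < crt_early_log && i < EARLY_LOG_SIZE; i++)
		track(early_log[i].op_type, early_log[i].ptr,
		      early_log[i].size);
	if (crt_early_log > EARLY_LOG_SIZE)
		fprintf(stderr, "early log buffer exceeded (%d)\n",
			crt_early_log);
}

int main(void)
{
	int x;

	track(OP_ALLOC_PERCPU, &x, sizeof(x));	/* recorded before init */
	tracker_init();				/* replayed here */
	track(OP_FREE_PERCPU, &x, 0);		/* handled directly */
	return 0;
}
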
diff --git a/mm/ksm.c b/mm/ksm.c
index 9a68b0cf0a1c..1925ffbfb27f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -28,6 +28,7 @@
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
+#include <linux/memcontrol.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
@@ -1571,6 +1572,16 @@ struct page *ksm_does_need_to_copy(struct page *page,
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page) {
+ /*
+ * The memcg-specific accounting when moving
+ * pages around the LRU lists relies on the
+ * page's owner (memcg) to be valid. Usually,
+ * pages are assigned to a new owner before
+ * being put on the LRU list, but since this
+ * is not the case here, the stale owner from
+ * a previous allocation cycle must be reset.
+ */
+ mem_cgroup_reset_owner(new_page);
copy_user_highpage(new_page, page, address, vma);
SetPageDirty(new_page);
@@ -1905,7 +1916,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
err = unmerge_and_remove_all_rmap_items();
- test_set_oom_score_adj(oom_score_adj);
+ compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
+ oom_score_adj);
if (err) {
ksm_run = KSM_RUN_STOP;
count = err;
diff --git a/mm/maccess.c b/mm/maccess.c
index 4cee182ab5f3..d53adf9ba84b 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,7 +1,7 @@
/*
* Access kernel memory without faulting.
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
diff --git a/mm/memblock.c b/mm/memblock.c
index ccbf97339592..2f55f19b7c86 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,12 +20,23 @@
#include <linux/seq_file.h>
#include <linux/memblock.h>
-struct memblock memblock __initdata_memblock;
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+
+struct memblock memblock __initdata_memblock = {
+ .memory.regions = memblock_memory_init_regions,
+ .memory.cnt = 1, /* empty dummy entry */
+ .memory.max = INIT_MEMBLOCK_REGIONS,
+
+ .reserved.regions = memblock_reserved_init_regions,
+ .reserved.cnt = 1, /* empty dummy entry */
+ .reserved.max = INIT_MEMBLOCK_REGIONS,
+
+ .current_limit = MEMBLOCK_ALLOC_ANYWHERE,
+};
int memblock_debug __initdata_memblock;
-int memblock_can_resize __initdata_memblock;
-static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
-static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
+static int memblock_can_resize __initdata_memblock;
/* inline so we don't get a warning when pr_debug is compiled out */
static inline const char *memblock_type_name(struct memblock_type *type)
@@ -38,27 +49,23 @@ static inline const char *memblock_type_name(struct memblock_type *type)
return "unknown";
}
-/*
- * Address comparison utilities
- */
-
-static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
-{
- return addr & ~(size - 1);
-}
-
-static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
+/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
+static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
{
- return (addr + (size - 1)) & ~(size - 1);
+ return *size = min(*size, (phys_addr_t)ULLONG_MAX - base);
}
+/*
+ * Address comparison utilities
+ */
static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
phys_addr_t base2, phys_addr_t size2)
{
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
-long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
{
unsigned long i;
@@ -72,83 +79,66 @@ long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_a
return (i < type->cnt) ? i : -1;
}
-/*
- * Find, allocate, deallocate or reserve unreserved regions. All allocations
- * are top-down.
+/**
+ * memblock_find_in_range_node - find free area in given range and node
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Find @size free area aligned to @align in the specified range and node.
+ *
+ * RETURNS:
+ * Found address on success, %0 on failure.
*/
-
-static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align)
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
+ phys_addr_t end, phys_addr_t size,
+ phys_addr_t align, int nid)
{
- phys_addr_t base, res_base;
- long j;
-
- /* In case, huge size is requested */
- if (end < size)
- return MEMBLOCK_ERROR;
-
- base = memblock_align_down((end - size), align);
+ phys_addr_t this_start, this_end, cand;
+ u64 i;
- /* Prevent allocations returning 0 as it's also used to
- * indicate an allocation failure
- */
- if (start == 0)
- start = PAGE_SIZE;
-
- while (start <= base) {
- j = memblock_overlaps_region(&memblock.reserved, base, size);
- if (j < 0)
- return base;
- res_base = memblock.reserved.regions[j].base;
- if (res_base < size)
- break;
- base = memblock_align_down(res_base - size, align);
- }
+ /* align @size to avoid excessive fragmentation on reserved array */
+ size = round_up(size, align);
- return MEMBLOCK_ERROR;
-}
-
-static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
- phys_addr_t align, phys_addr_t start, phys_addr_t end)
-{
- long i;
-
- BUG_ON(0 == size);
-
- /* Pump up max_addr */
+ /* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
end = memblock.current_limit;
- /* We do a top-down search, this tends to limit memory
- * fragmentation by keeping early boot allocs near the
- * top of memory
- */
- for (i = memblock.memory.cnt - 1; i >= 0; i--) {
- phys_addr_t memblockbase = memblock.memory.regions[i].base;
- phys_addr_t memblocksize = memblock.memory.regions[i].size;
- phys_addr_t bottom, top, found;
+ /* adjust @start to avoid underflow and allocating the first page */
+ start = max3(start, size, (phys_addr_t)PAGE_SIZE);
+ end = max(start, end);
- if (memblocksize < size)
- continue;
- if ((memblockbase + memblocksize) <= start)
- break;
- bottom = max(memblockbase, start);
- top = min(memblockbase + memblocksize, end);
- if (bottom >= top)
- continue;
- found = memblock_find_region(bottom, top, size, align);
- if (found != MEMBLOCK_ERROR)
- return found;
+ for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+
+ cand = round_down(this_end - size, align);
+ if (cand >= this_start)
+ return cand;
}
- return MEMBLOCK_ERROR;
+ return 0;
}
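
For illustration, a minimal userspace C sketch of the top-down search above: walk the free ranges highest-first, clamp each one to the caller's window, and take the highest aligned candidate that still fits. The range data and helper names are invented for the example; 0 doubles as the failure value, as in the function above.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t phys_addr_t;

struct range { phys_addr_t start, end; };               /* [start, end) */

static phys_addr_t round_down_p2(phys_addr_t x, phys_addr_t align)
{
        return x & ~(align - 1);                        /* align is a power of two */
}

static phys_addr_t clamp_addr(phys_addr_t v, phys_addr_t lo, phys_addr_t hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

static phys_addr_t find_in_range(const struct range *avail, int nr,
                                 phys_addr_t start, phys_addr_t end,
                                 phys_addr_t size, phys_addr_t align)
{
        for (int i = nr - 1; i >= 0; i--) {             /* top-down: highest range first */
                phys_addr_t this_start = clamp_addr(avail[i].start, start, end);
                phys_addr_t this_end = clamp_addr(avail[i].end, start, end);
                phys_addr_t cand;

                if (this_end < size)                    /* avoid underflow below */
                        continue;
                cand = round_down_p2(this_end - size, align);
                if (cand >= this_start)
                        return cand;                    /* highest suitable address */
        }
        return 0;                                       /* 0 means "not found" */
}

int main(void)
{
        struct range avail[] = { { 0x1000, 0x8000 }, { 0x100000, 0x180000 } };

        /* expect 0x17c000: top of the highest range, minus size, aligned down */
        printf("%#llx\n", (unsigned long long)
               find_in_range(avail, 2, 0x1000, 0x200000, 0x4000, 0x1000));
        return 0;
}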
-/*
- * Find a free area with specified alignment in a specific range.
+/**
+ * memblock_find_in_range - find free area in given range
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ *
+ * Find @size free area aligned to @align in the specified range.
+ *
+ * RETURNS:
+ * Found address on success, %0 on failure.
*/
-u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
+phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
+ phys_addr_t end, phys_addr_t size,
+ phys_addr_t align)
{
- return memblock_find_base(size, align, start, end);
+ return memblock_find_in_range_node(start, end, size, align,
+ MAX_NUMNODES);
}
/*
@@ -177,25 +167,21 @@ int __init_memblock memblock_reserve_reserved_regions(void)
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
{
- unsigned long i;
-
- for (i = r; i < type->cnt - 1; i++) {
- type->regions[i].base = type->regions[i + 1].base;
- type->regions[i].size = type->regions[i + 1].size;
- }
+ type->total_size -= type->regions[r].size;
+ memmove(&type->regions[r], &type->regions[r + 1],
+ (type->cnt - (r + 1)) * sizeof(type->regions[r]));
type->cnt--;
/* Special case for empty arrays */
if (type->cnt == 0) {
+ WARN_ON(type->total_size != 0);
type->cnt = 1;
type->regions[0].base = 0;
type->regions[0].size = 0;
+ memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}
}
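
For illustration, a small userspace C sketch of the removal above: the hole is closed with a single memmove and, when the last entry disappears, one zero-sized filler entry is kept so the array never becomes truly empty. Types and names are made up.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct region { uint64_t base, size; };

static void remove_region(struct region *r, int *cnt, int idx)
{
        /* slide the tail of the array down over the removed slot */
        memmove(&r[idx], &r[idx + 1], (*cnt - (idx + 1)) * sizeof(r[0]));
        (*cnt)--;
        if (*cnt == 0) {                /* keep a single empty filler entry */
                *cnt = 1;
                r[0].base = 0;
                r[0].size = 0;
        }
}

int main(void)
{
        struct region r[] = { { 0x1000, 0x1000 }, { 0x4000, 0x2000 } };
        int cnt = 2;

        remove_region(r, &cnt, 0);
        printf("cnt=%d first=%#llx size=%#llx\n", cnt,
               (unsigned long long)r[0].base, (unsigned long long)r[0].size);
        return 0;
}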
-/* Defined below but needed now */
-static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
-
static int __init_memblock memblock_double_array(struct memblock_type *type)
{
struct memblock_region *new_array, *old_array;
@@ -225,10 +211,10 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
*/
if (use_slab) {
new_array = kmalloc(new_size, GFP_KERNEL);
- addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
+ addr = new_array ? __pa(new_array) : 0;
} else
- addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
- if (addr == MEMBLOCK_ERROR) {
+ addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+ if (!addr) {
pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
memblock_type_name(type), type->max, type->max * 2);
return -1;
@@ -253,7 +239,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
return 0;
/* Add the new reserved region now. Should not fail ! */
- BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size));
+ BUG_ON(memblock_reserve(addr, new_size));
/* If the array wasn't our static init one, then free it. We only do
* that before SLAB is available as later on, we don't know whether
@@ -267,343 +253,514 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
return 0;
}
-extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
- phys_addr_t addr2, phys_addr_t size2)
+/**
+ * memblock_merge_regions - merge neighboring compatible regions
+ * @type: memblock type to scan
+ *
+ * Scan @type and merge neighboring compatible regions.
+ */
+static void __init_memblock memblock_merge_regions(struct memblock_type *type)
{
- return 1;
-}
+ int i = 0;
-static long __init_memblock memblock_add_region(struct memblock_type *type,
- phys_addr_t base, phys_addr_t size)
-{
- phys_addr_t end = base + size;
- int i, slot = -1;
+ /* cnt never goes below 1 */
+ while (i < type->cnt - 1) {
+ struct memblock_region *this = &type->regions[i];
+ struct memblock_region *next = &type->regions[i + 1];
- /* First try and coalesce this MEMBLOCK with others */
- for (i = 0; i < type->cnt; i++) {
- struct memblock_region *rgn = &type->regions[i];
- phys_addr_t rend = rgn->base + rgn->size;
-
- /* Exit if there's no possible hits */
- if (rgn->base > end || rgn->size == 0)
- break;
-
- /* Check if we are fully enclosed within an existing
- * block
- */
- if (rgn->base <= base && rend >= end)
- return 0;
+ if (this->base + this->size != next->base ||
+ memblock_get_region_node(this) !=
+ memblock_get_region_node(next)) {
+ BUG_ON(this->base + this->size > next->base);
+ i++;
+ continue;
+ }
- /* Check if we overlap or are adjacent with the bottom
- * of a block.
- */
- if (base < rgn->base && end >= rgn->base) {
- /* If we can't coalesce, create a new block */
- if (!memblock_memory_can_coalesce(base, size,
- rgn->base,
- rgn->size)) {
- /* Overlap & can't coalesce are mutually
- * exclusive, if you do that, be prepared
- * for trouble
- */
- WARN_ON(end != rgn->base);
- goto new_block;
- }
- /* We extend the bottom of the block down to our
- * base
- */
- rgn->base = base;
- rgn->size = rend - base;
+ this->size += next->size;
+ memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next));
+ type->cnt--;
+ }
+}
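
For illustration, a userspace C sketch of the merge pass above: neighbouring entries that are physically contiguous and carry the same node id collapse into one. The region array and names are invented for the example.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct region { uint64_t base, size; int nid; };

static void merge_regions(struct region *r, int *cnt)
{
        int i = 0;

        while (i < *cnt - 1) {
                struct region *this = &r[i], *next = &r[i + 1];

                if (this->base + this->size != next->base ||
                    this->nid != next->nid) {
                        i++;                            /* not mergeable, move on */
                        continue;
                }
                this->size += next->size;               /* absorb the next entry */
                memmove(next, next + 1, (*cnt - (i + 2)) * sizeof(*next));
                (*cnt)--;
        }
}

int main(void)
{
        struct region r[] = { { 0x0000, 0x1000, 0 },
                              { 0x1000, 0x2000, 0 },    /* contiguous, same nid */
                              { 0x4000, 0x1000, 1 } };
        int cnt = 3;

        merge_regions(r, &cnt);
        printf("%d regions, first spans %#llx bytes\n",
               cnt, (unsigned long long)r[0].size);     /* 2 regions, 0x3000 */
        return 0;
}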
- /* Return if we have nothing else to allocate
- * (fully coalesced)
- */
- if (rend >= end)
- return 0;
+/**
+ * memblock_insert_region - insert new memblock region
+ * @type: memblock type to insert into
+ * @idx: index for the insertion point
+ * @base: base address of the new region
+ * @size: size of the new region
+ *
+ * Insert new memblock region [@base,@base+@size) into @type at @idx.
+ * @type must already have extra room to accommodate the new region.
+ */
+static void __init_memblock memblock_insert_region(struct memblock_type *type,
+ int idx, phys_addr_t base,
+ phys_addr_t size, int nid)
+{
+ struct memblock_region *rgn = &type->regions[idx];
- /* We continue processing from the end of the
- * coalesced block.
- */
- base = rend;
- size = end - base;
- }
+ BUG_ON(type->cnt >= type->max);
+ memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
+ rgn->base = base;
+ rgn->size = size;
+ memblock_set_region_node(rgn, nid);
+ type->cnt++;
+ type->total_size += size;
+}
- /* Now check if we overlap or are adjacent with the
- * top of a block
- */
- if (base <= rend && end >= rend) {
- /* If we can't coalesce, create a new block */
- if (!memblock_memory_can_coalesce(rgn->base,
- rgn->size,
- base, size)) {
- /* Overlap & can't coalesce are mutually
- * exclusive, if you do that, be prepared
- * for trouble
- */
- WARN_ON(rend != base);
- goto new_block;
- }
- /* We adjust our base down to enclose the
- * original block and destroy it. It will be
- * part of our new allocation. Since we've
- * freed an entry, we know we won't fail
- * to allocate one later, so we won't risk
- * losing the original block allocation.
- */
- size += (base - rgn->base);
- base = rgn->base;
- memblock_remove_region(type, i--);
- }
- }
+/**
+ * memblock_add_region - add new memblock region
+ * @type: memblock type to add new region into
+ * @base: base address of the new region
+ * @size: size of the new region
+ * @nid: nid of the new region
+ *
+ * Add new memblock region [@base,@base+@size) into @type. The new region
+ * is allowed to overlap with existing ones - overlaps don't affect already
+ * existing regions. @type is guaranteed to be minimal (all neighbouring
+ * compatible regions are merged) after the addition.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int __init_memblock memblock_add_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size, int nid)
+{
+ bool insert = false;
+ phys_addr_t obase = base;
+ phys_addr_t end = base + memblock_cap_size(base, &size);
+ int i, nr_new;
- /* If the array is empty, special case, replace the fake
- * filler region and return
- */
- if ((type->cnt == 1) && (type->regions[0].size == 0)) {
+ /* special case for empty array */
+ if (type->regions[0].size == 0) {
+ WARN_ON(type->cnt != 1 || type->total_size);
type->regions[0].base = base;
type->regions[0].size = size;
+ memblock_set_region_node(&type->regions[0], nid);
+ type->total_size = size;
return 0;
}
-
- new_block:
- /* If we are out of space, we fail. It's too late to resize the array
- * but then this shouldn't have happened in the first place.
+repeat:
+ /*
+ * The following is executed twice. Once with %false @insert and
+ * then with %true. The first counts the number of regions needed
+ * to accommodate the new area. The second actually inserts them.
*/
- if (WARN_ON(type->cnt >= type->max))
- return -1;
+ base = obase;
+ nr_new = 0;
- /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
- for (i = type->cnt - 1; i >= 0; i--) {
- if (base < type->regions[i].base) {
- type->regions[i+1].base = type->regions[i].base;
- type->regions[i+1].size = type->regions[i].size;
- } else {
- type->regions[i+1].base = base;
- type->regions[i+1].size = size;
- slot = i + 1;
+ for (i = 0; i < type->cnt; i++) {
+ struct memblock_region *rgn = &type->regions[i];
+ phys_addr_t rbase = rgn->base;
+ phys_addr_t rend = rbase + rgn->size;
+
+ if (rbase >= end)
break;
+ if (rend <= base)
+ continue;
+ /*
+ * @rgn overlaps. If it separates the lower part of new
+ * area, insert that portion.
+ */
+ if (rbase > base) {
+ nr_new++;
+ if (insert)
+ memblock_insert_region(type, i++, base,
+ rbase - base, nid);
}
+ /* area below @rend is dealt with, forget about it */
+ base = min(rend, end);
}
- if (base < type->regions[0].base) {
- type->regions[0].base = base;
- type->regions[0].size = size;
- slot = 0;
+
+ /* insert the remaining portion */
+ if (base < end) {
+ nr_new++;
+ if (insert)
+ memblock_insert_region(type, i, base, end - base, nid);
}
- type->cnt++;
- /* The array is full ? Try to resize it. If that fails, we undo
- * our allocation and return an error
+ /*
+ * If this was the first round, resize array and repeat for actual
+ * insertions; otherwise, merge and return.
*/
- if (type->cnt == type->max && memblock_double_array(type)) {
- BUG_ON(slot < 0);
- memblock_remove_region(type, slot);
- return -1;
+ if (!insert) {
+ while (type->cnt + nr_new > type->max)
+ if (memblock_double_array(type) < 0)
+ return -ENOMEM;
+ insert = true;
+ goto repeat;
+ } else {
+ memblock_merge_regions(type);
+ return 0;
}
-
- return 0;
}
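
For illustration, a userspace C sketch of the two-pass scheme above: the first pass only counts how many entries the overlapping add would create, so the array could be grown before anything is touched; the second pass does the insertions. The merge pass and node ids are left out, and the fixed-capacity array and all names are invented.

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdbool.h>

#define MAX_REGIONS 16

struct region { uint64_t base, size; };

static struct region regions[MAX_REGIONS];
static int cnt;

static void insert_at(int idx, uint64_t base, uint64_t size)
{
        memmove(&regions[idx + 1], &regions[idx],
                (cnt - idx) * sizeof(regions[0]));
        regions[idx].base = base;
        regions[idx].size = size;
        cnt++;
}

static int add_region(uint64_t base, uint64_t size)
{
        uint64_t obase = base, end = base + size;
        bool insert = false;
        int i, nr_new;

repeat:
        base = obase;
        nr_new = 0;
        for (i = 0; i < cnt; i++) {
                uint64_t rbase = regions[i].base;
                uint64_t rend = rbase + regions[i].size;

                if (rbase >= end)
                        break;
                if (rend <= base)
                        continue;
                if (rbase > base) {             /* uncovered gap below regions[i] */
                        nr_new++;
                        if (insert)
                                insert_at(i++, base, rbase - base);
                }
                base = rend < end ? rend : end; /* skip the already covered part */
        }
        if (base < end) {                       /* remaining tail */
                nr_new++;
                if (insert)
                        insert_at(i, base, end - base);
        }
        if (!insert) {
                if (cnt + nr_new > MAX_REGIONS)
                        return -1;              /* the kernel would grow the array here */
                insert = true;
                goto repeat;
        }
        return 0;                               /* kernel follows this with a merge pass */
}

int main(void)
{
        add_region(0x2000, 0x1000);
        add_region(0x0000, 0x4000);     /* overlaps and surrounds the first add */
        for (int i = 0; i < cnt; i++)
                printf("[%#llx-%#llx)\n", (unsigned long long)regions[i].base,
                       (unsigned long long)(regions[i].base + regions[i].size));
        return 0;
}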
-long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
+ int nid)
{
- return memblock_add_region(&memblock.memory, base, size);
+ return memblock_add_region(&memblock.memory, base, size, nid);
+}
+int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES);
}
-static long __init_memblock __memblock_remove(struct memblock_type *type,
- phys_addr_t base, phys_addr_t size)
+/**
+ * memblock_isolate_range - isolate given range into disjoint memblocks
+ * @type: memblock type to isolate range for
+ * @base: base of range to isolate
+ * @size: size of range to isolate
+ * @start_rgn: out parameter for the start of isolated region
+ * @end_rgn: out parameter for the end of isolated region
+ *
+ * Walk @type and ensure that regions don't cross the boundaries defined by
+ * [@base,@base+@size). Crossing regions are split at the boundaries,
+ * which may create at most two more regions. The index of the first
+ * region inside the range is returned in *@start_rgn and end in *@end_rgn.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int __init_memblock memblock_isolate_range(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size,
+ int *start_rgn, int *end_rgn)
{
- phys_addr_t end = base + size;
+ phys_addr_t end = base + memblock_cap_size(base, &size);
int i;
- /* Walk through the array for collisions */
+ *start_rgn = *end_rgn = 0;
+
+ /* we'll create at most two more regions */
+ while (type->cnt + 2 > type->max)
+ if (memblock_double_array(type) < 0)
+ return -ENOMEM;
+
for (i = 0; i < type->cnt; i++) {
struct memblock_region *rgn = &type->regions[i];
- phys_addr_t rend = rgn->base + rgn->size;
+ phys_addr_t rbase = rgn->base;
+ phys_addr_t rend = rbase + rgn->size;
- /* Nothing more to do, exit */
- if (rgn->base > end || rgn->size == 0)
+ if (rbase >= end)
break;
-
- /* If we fully enclose the block, drop it */
- if (base <= rgn->base && end >= rend) {
- memblock_remove_region(type, i--);
+ if (rend <= base)
continue;
- }
- /* If we are fully enclosed within a block
- * then we need to split it and we are done
- */
- if (base > rgn->base && end < rend) {
- rgn->size = base - rgn->base;
- if (!memblock_add_region(type, end, rend - end))
- return 0;
- /* Failure to split is bad, we at least
- * restore the block before erroring
+ if (rbase < base) {
+ /*
+ * @rgn intersects from below. Split and continue
+ * to process the next region - the new top half.
+ */
+ rgn->base = base;
+ rgn->size -= base - rbase;
+ type->total_size -= base - rbase;
+ memblock_insert_region(type, i, rbase, base - rbase,
+ memblock_get_region_node(rgn));
+ } else if (rend > end) {
+ /*
+ * @rgn intersects from above. Split and redo the
+ * current region - the new bottom half.
*/
- rgn->size = rend - rgn->base;
- WARN_ON(1);
- return -1;
- }
-
- /* Check if we need to trim the bottom of a block */
- if (rgn->base < end && rend > end) {
- rgn->size -= end - rgn->base;
rgn->base = end;
- break;
+ rgn->size -= end - rbase;
+ type->total_size -= end - rbase;
+ memblock_insert_region(type, i--, rbase, end - rbase,
+ memblock_get_region_node(rgn));
+ } else {
+ /* @rgn is fully contained, record it */
+ if (!*end_rgn)
+ *start_rgn = i;
+ *end_rgn = i + 1;
}
+ }
- /* And check if we need to trim the top of a block */
- if (base < rend)
- rgn->size -= rend - base;
+ return 0;
+}
- }
+static int __init_memblock __memblock_remove(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
+{
+ int start_rgn, end_rgn;
+ int i, ret;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
+
+ for (i = end_rgn - 1; i >= start_rgn; i--)
+ memblock_remove_region(type, i);
return 0;
}
-long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
return __memblock_remove(&memblock.memory, base, size);
}
-long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
+ memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
+ (unsigned long long)base,
+ (unsigned long long)base + size,
+ (void *)_RET_IP_);
+
return __memblock_remove(&memblock.reserved, base, size);
}
-long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
struct memblock_type *_rgn = &memblock.reserved;
+ memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
+ (unsigned long long)base,
+ (unsigned long long)base + size,
+ (void *)_RET_IP_);
BUG_ON(0 == size);
- return memblock_add_region(_rgn, base, size);
+ return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
}
-phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+/**
+ * __next_free_mem_range - next function for for_each_free_mem_range()
+ * @idx: pointer to u64 loop variable
+ * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Find the first free area from *@idx which matches @nid, fill the out
+ * parameters, and update *@idx for the next iteration. The lower 32 bits of
+ * *@idx contain the index into the memory regions and the upper 32 bits index
+ * the areas before each reserved region. For example, if reserved regions
+ * look like the following,
+ *
+ * 0:[0-16), 1:[32-48), 2:[128-130)
+ *
+ * The upper 32 bits index the following regions.
+ *
+ * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX)
+ *
+ * As both region arrays are sorted, the function advances the two indices
+ * in lockstep and returns each intersection.
+ */
+void __init_memblock __next_free_mem_range(u64 *idx, int nid,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
{
- phys_addr_t found;
+ struct memblock_type *mem = &memblock.memory;
+ struct memblock_type *rsv = &memblock.reserved;
+ int mi = *idx & 0xffffffff;
+ int ri = *idx >> 32;
- /* We align the size to limit fragmentation. Without this, a lot of
- * small allocs quickly eat up the whole reserve array on sparc
- */
- size = memblock_align_up(size, align);
+ for ( ; mi < mem->cnt; mi++) {
+ struct memblock_region *m = &mem->regions[mi];
+ phys_addr_t m_start = m->base;
+ phys_addr_t m_end = m->base + m->size;
- found = memblock_find_base(size, align, 0, max_addr);
- if (found != MEMBLOCK_ERROR &&
- !memblock_add_region(&memblock.reserved, found, size))
- return found;
+ /* only memory regions are associated with nodes, check it */
+ if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+ continue;
- return 0;
+ /* scan areas before each reservation for intersection */
+ for ( ; ri < rsv->cnt + 1; ri++) {
+ struct memblock_region *r = &rsv->regions[ri];
+ phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
+ phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
+
+ /* if ri advanced past mi, break out to advance mi */
+ if (r_start >= m_end)
+ break;
+ /* if the two regions intersect, we're done */
+ if (m_start < r_end) {
+ if (out_start)
+ *out_start = max(m_start, r_start);
+ if (out_end)
+ *out_end = min(m_end, r_end);
+ if (out_nid)
+ *out_nid = memblock_get_region_node(m);
+ /*
+ * The region which ends first is advanced
+ * for the next iteration.
+ */
+ if (m_end <= r_end)
+ mi++;
+ else
+ ri++;
+ *idx = (u32)mi | (u64)ri << 32;
+ return;
+ }
+ }
+ }
+
+ /* signal end of iteration */
+ *idx = ULLONG_MAX;
}
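
For illustration, a userspace C sketch of the walk above: the free areas are the intersections of each memory region with the gaps before, between and after the sorted reserved regions. This sketch prints them all in one pass; the kernel version packs the two indices into a single u64 so the walk can be resumed one range at a time. Data and names are invented.

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };          /* [start, end) */

static const struct range memory[]   = { { 0x0, 0x100 }, { 0x200, 0x400 } };
static const struct range reserved[] = { { 0x10, 0x20 }, { 0x250, 0x300 } };
#define NR_MEM  2
#define NR_RSV  2

int main(void)
{
        for (int mi = 0; mi < NR_MEM; mi++) {
                uint64_t m_start = memory[mi].start, m_end = memory[mi].end;

                /* gaps: before rsv[0], between entries, and after the last one */
                for (int ri = 0; ri <= NR_RSV; ri++) {
                        uint64_t r_start = ri ? reserved[ri - 1].end : 0;
                        uint64_t r_end = ri < NR_RSV ? reserved[ri].start
                                                     : UINT64_MAX;
                        uint64_t lo = m_start > r_start ? m_start : r_start;
                        uint64_t hi = m_end < r_end ? m_end : r_end;

                        if (lo < hi)            /* non-empty intersection is free */
                                printf("free [%#llx-%#llx)\n",
                                       (unsigned long long)lo,
                                       (unsigned long long)hi);
                }
        }
        return 0;
}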
-phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+/**
+ * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
+ * @idx: pointer to u64 loop variable
+ * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Reverse of __next_free_mem_range().
+ */
+void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
{
- phys_addr_t alloc;
+ struct memblock_type *mem = &memblock.memory;
+ struct memblock_type *rsv = &memblock.reserved;
+ int mi = *idx & 0xffffffff;
+ int ri = *idx >> 32;
- alloc = __memblock_alloc_base(size, align, max_addr);
+ if (*idx == (u64)ULLONG_MAX) {
+ mi = mem->cnt - 1;
+ ri = rsv->cnt;
+ }
- if (alloc == 0)
- panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
- (unsigned long long) size, (unsigned long long) max_addr);
+ for ( ; mi >= 0; mi--) {
+ struct memblock_region *m = &mem->regions[mi];
+ phys_addr_t m_start = m->base;
+ phys_addr_t m_end = m->base + m->size;
- return alloc;
-}
+ /* only memory regions are associated with nodes, check it */
+ if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+ continue;
-phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
-{
- return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
-}
+ /* scan areas before each reservation for intersection */
+ for ( ; ri >= 0; ri--) {
+ struct memblock_region *r = &rsv->regions[ri];
+ phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
+ phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
+
+ /* if ri advanced past mi, break out to advance mi */
+ if (r_end <= m_start)
+ break;
+ /* if the two regions intersect, we're done */
+ if (m_end > r_start) {
+ if (out_start)
+ *out_start = max(m_start, r_start);
+ if (out_end)
+ *out_end = min(m_end, r_end);
+ if (out_nid)
+ *out_nid = memblock_get_region_node(m);
+
+ if (m_start >= r_start)
+ mi--;
+ else
+ ri--;
+ *idx = (u32)mi | (u64)ri << 32;
+ return;
+ }
+ }
+ }
+ *idx = ULLONG_MAX;
+}
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
- * Additional node-local allocators. Search for node memory is bottom up
- * and walks memblock regions within that node bottom-up as well, but allocation
- * within an memblock region is top-down. XXX I plan to fix that at some stage
- *
- * WARNING: Only available after early_node_map[] has been populated,
- * on some architectures, that is after all the calls to add_active_range()
- * have been done to populate it.
+ * Common iterator interface used to define for_each_mem_range().
*/
-
-phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
+void __init_memblock __next_mem_pfn_range(int *idx, int nid,
+ unsigned long *out_start_pfn,
+ unsigned long *out_end_pfn, int *out_nid)
{
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
- /*
- * This code originates from sparc which really wants use to walk by addresses
- * and returns the nid. This is not very convenient for early_pfn_map[] users
- * as the map isn't sorted yet, and it really wants to be walked by nid.
- *
- * For now, I implement the inefficient method below which walks the early
- * map multiple times. Eventually we may want to use an ARCH config option
- * to implement a completely different method for both case.
- */
- unsigned long start_pfn, end_pfn;
- int i;
+ struct memblock_type *type = &memblock.memory;
+ struct memblock_region *r;
+
+ while (++*idx < type->cnt) {
+ r = &type->regions[*idx];
- for (i = 0; i < MAX_NUMNODES; i++) {
- get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
- if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
+ if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
continue;
- *nid = i;
- return min(end, PFN_PHYS(end_pfn));
+ if (nid == MAX_NUMNODES || nid == r->nid)
+ break;
+ }
+ if (*idx >= type->cnt) {
+ *idx = -1;
+ return;
}
-#endif
- *nid = 0;
- return end;
+ if (out_start_pfn)
+ *out_start_pfn = PFN_UP(r->base);
+ if (out_end_pfn)
+ *out_end_pfn = PFN_DOWN(r->base + r->size);
+ if (out_nid)
+ *out_nid = r->nid;
}
-static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
- phys_addr_t size,
- phys_addr_t align, int nid)
+/**
+ * memblock_set_node - set node ID on memblock regions
+ * @base: base of area to set node ID for
+ * @size: size of area to set node ID for
+ * @nid: node ID to set
+ *
+ * Set the nid of memblock memory regions in [@base,@base+@size) to @nid.
+ * Regions which cross the area boundaries are split as necessary.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
+ int nid)
{
- phys_addr_t start, end;
+ struct memblock_type *type = &memblock.memory;
+ int start_rgn, end_rgn;
+ int i, ret;
- start = mp->base;
- end = start + mp->size;
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
- start = memblock_align_up(start, align);
- while (start < end) {
- phys_addr_t this_end;
- int this_nid;
+ for (i = start_rgn; i < end_rgn; i++)
+ type->regions[i].nid = nid;
- this_end = memblock_nid_range(start, end, &this_nid);
- if (this_nid == nid) {
- phys_addr_t ret = memblock_find_region(start, this_end, size, align);
- if (ret != MEMBLOCK_ERROR &&
- !memblock_add_region(&memblock.reserved, ret, size))
- return ret;
- }
- start = this_end;
- }
+ memblock_merge_regions(type);
+ return 0;
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
+static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
+ phys_addr_t align, phys_addr_t max_addr,
+ int nid)
+{
+ phys_addr_t found;
+
+ found = memblock_find_in_range_node(0, max_addr, size, align, nid);
+ if (found && !memblock_reserve(found, size))
+ return found;
- return MEMBLOCK_ERROR;
+ return 0;
}
phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- struct memblock_type *mem = &memblock.memory;
- int i;
+ return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+}
- BUG_ON(0 == size);
+phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+{
+ return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
+}
- /* We align the size to limit fragmentation. Without this, a lot of
- * small allocs quickly eat up the whole reserve array on sparc
- */
- size = memblock_align_up(size, align);
+phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+{
+ phys_addr_t alloc;
- /* We do a bottom-up search for a region with the right
- * nid since that's easier considering how memblock_nid_range()
- * works
- */
- for (i = 0; i < mem->cnt; i++) {
- phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
- size, align, nid);
- if (ret != MEMBLOCK_ERROR)
- return ret;
- }
+ alloc = __memblock_alloc_base(size, align, max_addr);
- return 0;
+ if (alloc == 0)
+ panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
+ (unsigned long long) size, (unsigned long long) max_addr);
+
+ return alloc;
+}
+
+phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
+{
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
@@ -612,7 +769,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
if (res)
return res;
- return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
@@ -620,10 +777,15 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
* Remaining API functions
*/
-/* You must call memblock_analyze() before this. */
phys_addr_t __init memblock_phys_mem_size(void)
{
- return memblock.memory_size;
+ return memblock.memory.total_size;
+}
+
+/* lowest address */
+phys_addr_t __init_memblock memblock_start_of_DRAM(void)
+{
+ return memblock.memory.regions[0].base;
}
phys_addr_t __init_memblock memblock_end_of_DRAM(void)
@@ -633,45 +795,28 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
}
-/* You must call memblock_analyze() after this. */
-void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
+void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
unsigned long i;
- phys_addr_t limit;
- struct memblock_region *p;
+ phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
- if (!memory_limit)
+ if (!limit)
return;
- /* Truncate the memblock regions to satisfy the memory limit. */
- limit = memory_limit;
+ /* find out max address */
for (i = 0; i < memblock.memory.cnt; i++) {
- if (limit > memblock.memory.regions[i].size) {
- limit -= memblock.memory.regions[i].size;
- continue;
- }
-
- memblock.memory.regions[i].size = limit;
- memblock.memory.cnt = i + 1;
- break;
- }
-
- memory_limit = memblock_end_of_DRAM();
+ struct memblock_region *r = &memblock.memory.regions[i];
- /* And truncate any reserves above the limit also. */
- for (i = 0; i < memblock.reserved.cnt; i++) {
- p = &memblock.reserved.regions[i];
-
- if (p->base > memory_limit)
- p->size = 0;
- else if ((p->base + p->size) > memory_limit)
- p->size = memory_limit - p->base;
-
- if (p->size == 0) {
- memblock_remove_region(&memblock.reserved, i);
- i--;
+ if (limit <= r->size) {
+ max_addr = r->base + limit;
+ break;
}
+ limit -= r->size;
}
+
+ /* truncate both memory and reserved regions */
+ __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX);
+ __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX);
}
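
For illustration, a userspace C sketch of the first half of the function above: walk the memory regions, consuming the limit until it falls inside one of them, and return the resulting cut-off address; everything above it would then be removed from both arrays. Names and sample data are invented.

#include <stdint.h>
#include <stdio.h>

struct region { uint64_t base, size; };

static uint64_t limit_to_max_addr(const struct region *r, int cnt,
                                  uint64_t limit)
{
        for (int i = 0; i < cnt; i++) {
                if (limit <= r[i].size)
                        return r[i].base + limit;       /* cut falls inside region i */
                limit -= r[i].size;
        }
        return UINT64_MAX;                      /* limit larger than all memory */
}

int main(void)
{
        struct region mem[] = { { 0x0, 0x1000 }, { 0x10000, 0x4000 } };

        /* a 0x2000-byte limit ends 0x1000 bytes into the second region */
        printf("max_addr = %#llx\n", (unsigned long long)
               limit_to_max_addr(mem, 2, 0x2000));
        return 0;
}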
static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
@@ -705,16 +850,18 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
{
int idx = memblock_search(&memblock.memory, base);
+ phys_addr_t end = base + memblock_cap_size(base, &size);
if (idx == -1)
return 0;
return memblock.memory.regions[idx].base <= base &&
(memblock.memory.regions[idx].base +
- memblock.memory.regions[idx].size) >= (base + size);
+ memblock.memory.regions[idx].size) >= end;
}
int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
+ memblock_cap_size(base, &size);
return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
}
@@ -724,86 +871,45 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
memblock.current_limit = limit;
}
-static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
+static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
{
unsigned long long base, size;
int i;
- pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
+ pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
- for (i = 0; i < region->cnt; i++) {
- base = region->regions[i].base;
- size = region->regions[i].size;
-
- pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
- name, i, base, base + size - 1, size);
+ for (i = 0; i < type->cnt; i++) {
+ struct memblock_region *rgn = &type->regions[i];
+ char nid_buf[32] = "";
+
+ base = rgn->base;
+ size = rgn->size;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ if (memblock_get_region_node(rgn) != MAX_NUMNODES)
+ snprintf(nid_buf, sizeof(nid_buf), " on node %d",
+ memblock_get_region_node(rgn));
+#endif
+ pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n",
+ name, i, base, base + size - 1, size, nid_buf);
}
}
-void __init_memblock memblock_dump_all(void)
+void __init_memblock __memblock_dump_all(void)
{
- if (!memblock_debug)
- return;
-
pr_info("MEMBLOCK configuration:\n");
- pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
+ pr_info(" memory size = %#llx reserved size = %#llx\n",
+ (unsigned long long)memblock.memory.total_size,
+ (unsigned long long)memblock.reserved.total_size);
memblock_dump(&memblock.memory, "memory");
memblock_dump(&memblock.reserved, "reserved");
}
-void __init memblock_analyze(void)
+void __init memblock_allow_resize(void)
{
- int i;
-
- /* Check marker in the unused last array entry */
- WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
- != MEMBLOCK_INACTIVE);
- WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
- != MEMBLOCK_INACTIVE);
-
- memblock.memory_size = 0;
-
- for (i = 0; i < memblock.memory.cnt; i++)
- memblock.memory_size += memblock.memory.regions[i].size;
-
- /* We allow resizing from there */
memblock_can_resize = 1;
}
-void __init memblock_init(void)
-{
- static int init_done __initdata = 0;
-
- if (init_done)
- return;
- init_done = 1;
-
- /* Hookup the initial arrays */
- memblock.memory.regions = memblock_memory_init_regions;
- memblock.memory.max = INIT_MEMBLOCK_REGIONS;
- memblock.reserved.regions = memblock_reserved_init_regions;
- memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
-
- /* Write a marker in the unused last array entry */
- memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
- memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
-
- /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
- * This simplifies the memblock_add() code below...
- */
- memblock.memory.regions[0].base = 0;
- memblock.memory.regions[0].size = 0;
- memblock.memory.cnt = 1;
-
- /* Ditto. */
- memblock.reserved.regions[0].base = 0;
- memblock.reserved.regions[0].size = 0;
- memblock.reserved.cnt = 1;
-
- memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
-}
-
static int __init early_memblock(char *p)
{
if (p && strstr(p, "debug"))
@@ -812,7 +918,7 @@ static int __init early_memblock(char *p)
}
early_param("memblock", early_memblock);
-#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
+#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)
static int memblock_debug_show(struct seq_file *m, void *private)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3508777837c7..602207be9853 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -33,6 +33,7 @@
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
+#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
@@ -49,6 +50,8 @@
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"
+#include <net/sock.h>
+#include <net/tcp_memcontrol.h>
#include <asm/uaccess.h>
@@ -120,16 +123,22 @@ struct mem_cgroup_stat_cpu {
unsigned long targets[MEM_CGROUP_NTARGETS];
};
+struct mem_cgroup_reclaim_iter {
+ /* css_id of the last scanned hierarchy member */
+ int position;
+ /* scan generation, increased every round-trip */
+ unsigned int generation;
+};
+
/*
* per-zone information in memory controller.
*/
struct mem_cgroup_per_zone {
- /*
- * spin_lock to protect the per cgroup LRU
- */
- struct list_head lists[NR_LRU_LISTS];
+ struct lruvec lruvec;
unsigned long count[NR_LRU_LISTS];
+ struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+
struct zone_reclaim_stat reclaim_stat;
struct rb_node tree_node; /* RB tree node */
unsigned long long usage_in_excess;/* Set to the value by which */
@@ -201,8 +210,8 @@ struct mem_cgroup_eventfd_list {
struct eventfd_ctx *eventfd;
};
-static void mem_cgroup_threshold(struct mem_cgroup *mem);
-static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
+static void mem_cgroup_threshold(struct mem_cgroup *memcg);
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
/*
* The memory controller data structure. The memory controller controls both
@@ -230,11 +239,6 @@ struct mem_cgroup {
* per zone LRU lists.
*/
struct mem_cgroup_lru_info info;
- /*
- * While reclaiming in a hierarchy, we cache the last child we
- * reclaimed from.
- */
- int last_scanned_child;
int last_scanned_node;
#if MAX_NUMNODES > 1
nodemask_t scan_nodes;
@@ -285,6 +289,10 @@ struct mem_cgroup {
*/
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
+
+#ifdef CONFIG_INET
+ struct tcp_memcontrol tcp_mem;
+#endif
};
/* Stuffs for move charges at task migration. */
@@ -359,32 +367,90 @@ enum charge_type {
#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
-#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
-static void mem_cgroup_get(struct mem_cgroup *mem);
-static void mem_cgroup_put(struct mem_cgroup *mem);
-static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(struct mem_cgroup *mem);
+static void mem_cgroup_get(struct mem_cgroup *memcg);
+static void mem_cgroup_put(struct mem_cgroup *memcg);
+
+/* Writing them here to avoid exposing memcg's inner layout */
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_INET
+#include <net/sock.h>
+#include <net/ip.h>
+
+static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
+void sock_update_memcg(struct sock *sk)
+{
+ if (static_branch(&memcg_socket_limit_enabled)) {
+ struct mem_cgroup *memcg;
+
+ BUG_ON(!sk->sk_prot->proto_cgroup);
+
+ /* Socket cloning can throw us here with sk_cgrp already
+ * filled. It won't, however, necessarily happen from
+ * process context. So the test for root memcg given
+ * the current task's memcg won't help us in this case.
+ *
+ * Respecting the original socket's memcg is a better
+ * decision in this case.
+ */
+ if (sk->sk_cgrp) {
+ BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
+ mem_cgroup_get(sk->sk_cgrp->memcg);
+ return;
+ }
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (!mem_cgroup_is_root(memcg)) {
+ mem_cgroup_get(memcg);
+ sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
+ }
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(sock_update_memcg);
+
+void sock_release_memcg(struct sock *sk)
+{
+ if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) {
+ struct mem_cgroup *memcg;
+ WARN_ON(!sk->sk_cgrp->memcg);
+ memcg = sk->sk_cgrp->memcg;
+ mem_cgroup_put(memcg);
+ }
+}
+
+struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
+{
+ if (!memcg || mem_cgroup_is_root(memcg))
+ return NULL;
+
+ return &memcg->tcp_mem.cg_proto;
+}
+EXPORT_SYMBOL(tcp_proto_cgroup);
+#endif /* CONFIG_INET */
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+
+static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
- return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+ return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
}
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
+struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
- return &mem->css;
+ return &memcg->css;
}
static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
+page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
int nid = page_to_nid(page);
int zid = page_zonenum(page);
- return mem_cgroup_zoneinfo(mem, nid, zid);
+ return mem_cgroup_zoneinfo(memcg, nid, zid);
}
static struct mem_cgroup_tree_per_zone *
@@ -403,7 +469,7 @@ soft_limit_tree_from_page(struct page *page)
}
static void
-__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz,
unsigned long long new_usage_in_excess)
@@ -437,7 +503,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
}
static void
-__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
{
@@ -448,17 +514,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
}
static void
-mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
{
spin_lock(&mctz->lock);
- __mem_cgroup_remove_exceeded(mem, mz, mctz);
+ __mem_cgroup_remove_exceeded(memcg, mz, mctz);
spin_unlock(&mctz->lock);
}
-static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
unsigned long long excess;
struct mem_cgroup_per_zone *mz;
@@ -471,9 +537,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.
*/
- for (; mem; mem = parent_mem_cgroup(mem)) {
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
- excess = res_counter_soft_limit_excess(&mem->res);
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ excess = res_counter_soft_limit_excess(&memcg->res);
/*
* We have to update the tree if mz is on RB-tree or
* mem is over its softlimit.
@@ -482,28 +548,28 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
spin_lock(&mctz->lock);
/* if on-tree, remove it */
if (mz->on_tree)
- __mem_cgroup_remove_exceeded(mem, mz, mctz);
+ __mem_cgroup_remove_exceeded(memcg, mz, mctz);
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.
*/
- __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
spin_unlock(&mctz->lock);
}
}
}
-static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
int node, zone;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
- for_each_node_state(node, N_POSSIBLE) {
+ for_each_node(node) {
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- mz = mem_cgroup_zoneinfo(mem, node, zone);
+ mz = mem_cgroup_zoneinfo(memcg, node, zone);
mctz = soft_limit_tree_node_zone(node, zone);
- mem_cgroup_remove_exceeded(mem, mz, mctz);
+ mem_cgroup_remove_exceeded(memcg, mz, mctz);
}
}
}
@@ -564,7 +630,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 * common workload, threshold and synchronization as vmstat[] should be
* implemented.
*/
-static long mem_cgroup_read_stat(struct mem_cgroup *mem,
+static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
long val = 0;
@@ -572,81 +638,73 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
get_online_cpus();
for_each_online_cpu(cpu)
- val += per_cpu(mem->stat->count[idx], cpu);
+ val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
- spin_lock(&mem->pcp_counter_lock);
- val += mem->nocpu_base.count[idx];
- spin_unlock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
+ val += memcg->nocpu_base.count[idx];
+ spin_unlock(&memcg->pcp_counter_lock);
#endif
put_online_cpus();
return val;
}
-static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
+static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
{
int val = (charge) ? 1 : -1;
- this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
-}
-
-void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
-{
- this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
+ this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}
-void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
-{
- this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
-}
-
-static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
+static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx)
{
unsigned long val = 0;
int cpu;
for_each_online_cpu(cpu)
- val += per_cpu(mem->stat->events[idx], cpu);
+ val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
- spin_lock(&mem->pcp_counter_lock);
- val += mem->nocpu_base.events[idx];
- spin_unlock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
+ val += memcg->nocpu_base.events[idx];
+ spin_unlock(&memcg->pcp_counter_lock);
#endif
return val;
}
-static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
+static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
bool file, int nr_pages)
{
preempt_disable();
if (file)
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+ nr_pages);
else
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+ nr_pages);
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
+ __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
else {
- __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
+ __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
+ __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
preempt_enable();
}
unsigned long
-mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
+mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
unsigned int lru_mask)
{
struct mem_cgroup_per_zone *mz;
enum lru_list l;
unsigned long ret = 0;
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
for_each_lru(l) {
if (BIT(l) & lru_mask)
@@ -656,90 +714,90 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
}
static unsigned long
-mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
u64 total = 0;
int zid;
for (zid = 0; zid < MAX_NR_ZONES; zid++)
- total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
+ total += mem_cgroup_zone_nr_lru_pages(memcg,
+ nid, zid, lru_mask);
return total;
}
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
unsigned int lru_mask)
{
int nid;
u64 total = 0;
for_each_node_state(nid, N_HIGH_MEMORY)
- total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
+ total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
return total;
}
-static bool __memcg_event_check(struct mem_cgroup *mem, int target)
+static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_target target)
{
unsigned long val, next;
- val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
- next = this_cpu_read(mem->stat->targets[target]);
+ val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+ next = __this_cpu_read(memcg->stat->targets[target]);
/* from time_after() in jiffies.h */
- return ((long)next - (long)val < 0);
-}
-
-static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
-{
- unsigned long val, next;
-
- val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
-
- switch (target) {
- case MEM_CGROUP_TARGET_THRESH:
- next = val + THRESHOLDS_EVENTS_TARGET;
- break;
- case MEM_CGROUP_TARGET_SOFTLIMIT:
- next = val + SOFTLIMIT_EVENTS_TARGET;
- break;
- case MEM_CGROUP_TARGET_NUMAINFO:
- next = val + NUMAINFO_EVENTS_TARGET;
- break;
- default:
- return;
+ if ((long)next - (long)val < 0) {
+ switch (target) {
+ case MEM_CGROUP_TARGET_THRESH:
+ next = val + THRESHOLDS_EVENTS_TARGET;
+ break;
+ case MEM_CGROUP_TARGET_SOFTLIMIT:
+ next = val + SOFTLIMIT_EVENTS_TARGET;
+ break;
+ case MEM_CGROUP_TARGET_NUMAINFO:
+ next = val + NUMAINFO_EVENTS_TARGET;
+ break;
+ default:
+ break;
+ }
+ __this_cpu_write(memcg->stat->targets[target], next);
+ return true;
}
-
- this_cpu_write(mem->stat->targets[target], next);
+ return false;
}
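
For illustration, a userspace C sketch of the ratelimit check above: an ever-increasing event counter is compared against a per-target "next" mark, and when the counter passes it the mark is advanced by the target's step and the caller gets to do the expensive work. Constants and names are invented.

#include <stdio.h>
#include <stdbool.h>

#define THRESH_STEP     128

static unsigned long events;            /* total events so far */
static unsigned long next_thresh;       /* fire again once events passes this */

static bool event_ratelimit(void)
{
        /* same signed comparison trick as time_after() */
        if ((long)next_thresh - (long)events < 0) {
                next_thresh = events + THRESH_STEP;
                return true;
        }
        return false;
}

int main(void)
{
        int fired = 0;

        for (int i = 0; i < 1000; i++) {
                events++;
                if (event_ratelimit())
                        fired++;
        }
        printf("fired %d times for 1000 events\n", fired);      /* roughly 1000/128 */
        return 0;
}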
/*
* Check events in order.
*
*/
-static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
+static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
+ preempt_disable();
/* threshold event is triggered in finer grain than soft limit */
- if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
- mem_cgroup_threshold(mem);
- __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
- if (unlikely(__memcg_event_check(mem,
- MEM_CGROUP_TARGET_SOFTLIMIT))) {
- mem_cgroup_update_tree(mem, page);
- __mem_cgroup_target_update(mem,
- MEM_CGROUP_TARGET_SOFTLIMIT);
- }
+ if (unlikely(mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_THRESH))) {
+ bool do_softlimit, do_numainfo;
+
+ do_softlimit = mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
- if (unlikely(__memcg_event_check(mem,
- MEM_CGROUP_TARGET_NUMAINFO))) {
- atomic_inc(&mem->numainfo_events);
- __mem_cgroup_target_update(mem,
- MEM_CGROUP_TARGET_NUMAINFO);
- }
+ do_numainfo = mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_NUMAINFO);
#endif
- }
+ preempt_enable();
+
+ mem_cgroup_threshold(memcg);
+ if (unlikely(do_softlimit))
+ mem_cgroup_update_tree(memcg, page);
+#if MAX_NUMNODES > 1
+ if (unlikely(do_numainfo))
+ atomic_inc(&memcg->numainfo_events);
+#endif
+ } else
+ preempt_enable();
}
-static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
+struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
return container_of(cgroup_subsys_state(cont,
mem_cgroup_subsys_id), struct mem_cgroup,
@@ -762,7 +820,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
if (!mm)
return NULL;
@@ -773,115 +831,148 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
*/
rcu_read_lock();
do {
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!mem))
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
break;
- } while (!css_tryget(&mem->css));
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
- return mem;
+ return memcg;
}
-/* The caller has to guarantee "mem" exists before calling this */
-static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
+/**
+ * mem_cgroup_iter - iterate over memory cgroup hierarchy
+ * @root: hierarchy root
+ * @prev: previously returned memcg, NULL on first invocation
+ * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ *
+ * Returns references to children of the hierarchy below @root, or
+ * @root itself, or %NULL after a full round-trip.
+ *
+ * Caller must pass the return value in @prev on subsequent
+ * invocations for reference counting, or use mem_cgroup_iter_break()
+ * to cancel a hierarchy walk before the round-trip is complete.
+ *
+ * Reclaimers can specify a zone and a priority level in @reclaim to
+ * divide up the memcgs in the hierarchy among all concurrent
+ * reclaimers operating on the same zone and priority.
+ */
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+ struct mem_cgroup *prev,
+ struct mem_cgroup_reclaim_cookie *reclaim)
{
- struct cgroup_subsys_state *css;
- int found;
+ struct mem_cgroup *memcg = NULL;
+ int id = 0;
- if (!mem) /* ROOT cgroup has the smallest ID */
- return root_mem_cgroup; /*css_put/get against root is ignored*/
- if (!mem->use_hierarchy) {
- if (css_tryget(&mem->css))
- return mem;
+ if (mem_cgroup_disabled())
return NULL;
- }
- rcu_read_lock();
- /*
- * searching a memory cgroup which has the smallest ID under given
- * ROOT cgroup. (ID >= 1)
- */
- css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
- if (css && css_tryget(css))
- mem = container_of(css, struct mem_cgroup, css);
- else
- mem = NULL;
- rcu_read_unlock();
- return mem;
-}
-static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
- struct mem_cgroup *root,
- bool cond)
-{
- int nextid = css_id(&iter->css) + 1;
- int found;
- int hierarchy_used;
- struct cgroup_subsys_state *css;
+ if (!root)
+ root = root_mem_cgroup;
- hierarchy_used = iter->use_hierarchy;
+ if (prev && !reclaim)
+ id = css_id(&prev->css);
- css_put(&iter->css);
- /* If no ROOT, walk all, ignore hierarchy */
- if (!cond || (root && !hierarchy_used))
- return NULL;
+ if (prev && prev != root)
+ css_put(&prev->css);
- if (!root)
- root = root_mem_cgroup;
+ if (!root->use_hierarchy && root != root_mem_cgroup) {
+ if (prev)
+ return NULL;
+ return root;
+ }
- do {
- iter = NULL;
- rcu_read_lock();
+ while (!memcg) {
+ struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+ struct cgroup_subsys_state *css;
+
+ if (reclaim) {
+ int nid = zone_to_nid(reclaim->zone);
+ int zid = zone_idx(reclaim->zone);
+ struct mem_cgroup_per_zone *mz;
+
+ mz = mem_cgroup_zoneinfo(root, nid, zid);
+ iter = &mz->reclaim_iter[reclaim->priority];
+ if (prev && reclaim->generation != iter->generation)
+ return NULL;
+ id = iter->position;
+ }
- css = css_get_next(&mem_cgroup_subsys, nextid,
- &root->css, &found);
- if (css && css_tryget(css))
- iter = container_of(css, struct mem_cgroup, css);
+ rcu_read_lock();
+ css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
+ if (css) {
+ if (css == &root->css || css_tryget(css))
+ memcg = container_of(css,
+ struct mem_cgroup, css);
+ } else
+ id = 0;
rcu_read_unlock();
- /* If css is NULL, no more cgroups will be found */
- nextid = found + 1;
- } while (css && !iter);
- return iter;
+ if (reclaim) {
+ iter->position = id;
+ if (!css)
+ iter->generation++;
+ else if (!prev && memcg)
+ reclaim->generation = iter->generation;
+ }
+
+ if (prev && !css)
+ return NULL;
+ }
+ return memcg;
}
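
For illustration, a heavily simplified userspace C sketch of the shared-cursor idea behind the reclaim cookie above: callers that pass the same cookie continue where the previous caller stopped, so concurrent walkers divide the members among themselves, and a generation counter tells a walker when somebody else already finished the round. Locking, reference counting and the cgroup hierarchy are all omitted; every name is invented.

#include <stdio.h>

#define NR_MEMBERS 4

struct reclaim_iter {
        int position;                   /* next member to hand out */
        unsigned int generation;        /* bumped on every completed round */
};

/* returns a member index, or -1 once the shared walk wraps around */
static int shared_iter_next(struct reclaim_iter *iter, unsigned int *my_gen,
                            int first_call)
{
        if (!first_call && *my_gen != iter->generation)
                return -1;              /* someone already finished this round */

        if (iter->position >= NR_MEMBERS) {
                iter->position = 0;
                iter->generation++;
                return -1;              /* round-trip complete */
        }
        if (first_call)
                *my_gen = iter->generation;
        return iter->position++;
}

int main(void)
{
        struct reclaim_iter iter = { 0, 0 };
        unsigned int gen_a, gen_b;
        int m;

        /* two "reclaimers" interleaving on the same cookie share the walk */
        m = shared_iter_next(&iter, &gen_a, 1); printf("A got %d\n", m);
        m = shared_iter_next(&iter, &gen_b, 1); printf("B got %d\n", m);
        m = shared_iter_next(&iter, &gen_a, 0); printf("A got %d\n", m);
        m = shared_iter_next(&iter, &gen_b, 0); printf("B got %d\n", m);
        return 0;
}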
-/*
- * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
- * be careful that "break" loop is not allowed. We have reference count.
- * Instead of that modify "cond" to be false and "continue" to exit the loop.
- */
-#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
- for (iter = mem_cgroup_start_loop(root);\
- iter != NULL;\
- iter = mem_cgroup_get_next(iter, root, cond))
-#define for_each_mem_cgroup_tree(iter, root) \
- for_each_mem_cgroup_tree_cond(iter, root, true)
+/**
+ * mem_cgroup_iter_break - abort a hierarchy walk prematurely
+ * @root: hierarchy root
+ * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
+ */
+void mem_cgroup_iter_break(struct mem_cgroup *root,
+ struct mem_cgroup *prev)
+{
+ if (!root)
+ root = root_mem_cgroup;
+ if (prev && prev != root)
+ css_put(&prev->css);
+}
-#define for_each_mem_cgroup_all(iter) \
- for_each_mem_cgroup_tree_cond(iter, NULL, true)
+/*
+ * Iteration constructs for visiting all cgroups (under a tree). If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root) \
+ for (iter = mem_cgroup_iter(root, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(root, iter, NULL))
+#define for_each_mem_cgroup(iter) \
+ for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(NULL, iter, NULL))
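
For illustration, a userspace C sketch of the usage discipline the comment above describes: each value returned by the iterator pins a reference, so leaving the loop early has to go through a break helper that drops it. The toy refcounted nodes stand in for css references; everything here is invented.

#include <stdio.h>

struct node { int id; int refcnt; };

static struct node nodes[3] = { { 0, 0 }, { 1, 0 }, { 2, 0 } };

static struct node *iter_next(struct node *prev)
{
        int next = prev ? prev->id + 1 : 0;

        if (prev)
                prev->refcnt--;                 /* caller hands its reference back */
        if (next >= 3)
                return NULL;
        nodes[next].refcnt++;                   /* returned node is pinned */
        return &nodes[next];
}

static void iter_break(struct node *last)
{
        if (last)
                last->refcnt--;                 /* drop the dangling reference */
}

int main(void)
{
        struct node *it;

        for (it = iter_next(NULL); it; it = iter_next(it)) {
                if (it->id == 1) {
                        iter_break(it);         /* early exit: balance the ref */
                        break;
                }
        }
        for (int i = 0; i < 3; i++)
                printf("node %d refcnt %d\n", i, nodes[i].refcnt);      /* all 0 */
        return 0;
}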
-static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
- return (mem == root_mem_cgroup);
+ return (memcg == root_mem_cgroup);
}
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
if (!mm)
return;
rcu_read_lock();
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!mem))
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
goto out;
switch (idx) {
- case PGMAJFAULT:
- mem_cgroup_pgmajfault(mem, 1);
- break;
case PGFAULT:
- mem_cgroup_pgfault(mem, 1);
+ this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+ break;
+ case PGMAJFAULT:
+ this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
break;
default:
BUG();
@@ -891,6 +982,27 @@ out:
}
EXPORT_SYMBOL(mem_cgroup_count_vm_event);
+/**
+ * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
+ * @zone: zone of the wanted lruvec
+ * @memcg: memcg of the wanted lruvec
+ *
+ * Returns the lru list vector holding pages for the given @zone and
+ * @mem. This can be the global zone lruvec, if the memory controller
+ * is disabled.
+ */
+struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_zone *mz;
+
+ if (mem_cgroup_disabled())
+ return &zone->lruvec;
+
+ mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
+ return &mz->lruvec;
+}
+
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
@@ -905,211 +1017,156 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
* When moving account, the page is not on LRU. It's isolated.
*/
-void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
-{
- struct page_cgroup *pc;
- struct mem_cgroup_per_zone *mz;
-
- if (mem_cgroup_disabled())
- return;
- pc = lookup_page_cgroup(page);
- /* can happen while we handle swapcache. */
- if (!TestClearPageCgroupAcctLRU(pc))
- return;
- VM_BUG_ON(!pc->mem_cgroup);
- /*
- * We don't check PCG_USED bit. It's cleared when the "page" is finally
- * removed from global LRU.
- */
- mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
- /* huge page split is done under lru_lock. so, we have no races. */
- MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
- if (mem_cgroup_is_root(pc->mem_cgroup))
- return;
- VM_BUG_ON(list_empty(&pc->lru));
- list_del_init(&pc->lru);
-}
-
-void mem_cgroup_del_lru(struct page *page)
-{
- mem_cgroup_del_lru_list(page, page_lru(page));
-}
-
-/*
- * Writeback is about to end against a page which has been marked for immediate
- * reclaim. If it still appears to be reclaimable, move it to the tail of the
- * inactive list.
+/**
+ * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec
+ * @zone: zone of the page
+ * @page: the page
+ * @lru: current lru
+ *
+ * This function accounts for @page being added to @lru, and returns
+ * the lruvec for the given @zone and the memcg @page is charged to.
+ *
+ * The callsite is then responsible for physically linking the page to
+ * the returned lruvec->lists[@lru].
*/
-void mem_cgroup_rotate_reclaimable_page(struct page *page)
+struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
+ enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *memcg;
struct page_cgroup *pc;
- enum lru_list lru = page_lru(page);
if (mem_cgroup_disabled())
- return;
+ return &zone->lruvec;
pc = lookup_page_cgroup(page);
- /* unused or root page is not rotated. */
- if (!PageCgroupUsed(pc))
- return;
- /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
- smp_rmb();
- if (mem_cgroup_is_root(pc->mem_cgroup))
- return;
- mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
- list_move_tail(&pc->lru, &mz->lists[lru]);
+ memcg = pc->mem_cgroup;
+ mz = page_cgroup_zoneinfo(memcg, page);
+ /* compound_order() is stabilized through lru_lock */
+ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
+ return &mz->lruvec;
}
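To make the documented split of responsibilities concrete, here is a hedged callsite sketch (hypothetical function, assumed to run under zone->lru_lock as the surrounding accounting requires): the memcg hook only updates MEM_CGROUP_ZSTAT and returns the lruvec, while the caller links the page physically.

/* Sketch only: charge-aware LRU insertion as a caller would do it. */
static void sketch_add_page_to_lru(struct zone *zone, struct page *page,
                                   enum lru_list lru)
{
        struct lruvec *lruvec;

        lruvec = mem_cgroup_lru_add_list(zone, page, lru);
        list_add(&page->lru, &lruvec->lists[lru]);
        SetPageLRU(page);
}

The removal side is symmetric: mem_cgroup_lru_del_list() only adjusts the statistics, and the caller then performs the list_del(&page->lru) itself.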
-void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
+/**
+ * mem_cgroup_lru_del_list - account for removing an lru page
+ * @page: the page
+ * @lru: target lru
+ *
+ * This function accounts for @page being removed from @lru.
+ *
+ * The callsite is then responsible for physically unlinking
+ * @page->lru.
+ */
+void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *memcg;
struct page_cgroup *pc;
if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(page);
- /* unused or root page is not rotated. */
- if (!PageCgroupUsed(pc))
- return;
- /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
- smp_rmb();
- if (mem_cgroup_is_root(pc->mem_cgroup))
- return;
- mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
- list_move(&pc->lru, &mz->lists[lru]);
-}
-
-void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
-{
- struct page_cgroup *pc;
- struct mem_cgroup_per_zone *mz;
-
- if (mem_cgroup_disabled())
- return;
- pc = lookup_page_cgroup(page);
- VM_BUG_ON(PageCgroupAcctLRU(pc));
- if (!PageCgroupUsed(pc))
- return;
- /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
- smp_rmb();
- mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
+ memcg = pc->mem_cgroup;
+ VM_BUG_ON(!memcg);
+ mz = page_cgroup_zoneinfo(memcg, page);
/* huge page split is done under lru_lock. so, we have no races. */
- MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
- SetPageCgroupAcctLRU(pc);
- if (mem_cgroup_is_root(pc->mem_cgroup))
- return;
- list_add(&pc->lru, &mz->lists[lru]);
-}
-
-/*
- * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
- * while it's linked to lru because the page may be reused after it's fully
- * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
- * It's done under lock_page and expected that zone->lru_lock isnever held.
- */
-static void mem_cgroup_lru_del_before_commit(struct page *page)
-{
- unsigned long flags;
- struct zone *zone = page_zone(page);
- struct page_cgroup *pc = lookup_page_cgroup(page);
-
- /*
- * Doing this check without taking ->lru_lock seems wrong but this
- * is safe. Because if page_cgroup's USED bit is unset, the page
- * will not be added to any memcg's LRU. If page_cgroup's USED bit is
- * set, the commit after this will fail, anyway.
- * This all charge/uncharge is done under some mutual execustion.
- * So, we don't need to taking care of changes in USED bit.
- */
- if (likely(!PageLRU(page)))
- return;
-
- spin_lock_irqsave(&zone->lru_lock, flags);
- /*
- * Forget old LRU when this page_cgroup is *not* used. This Used bit
- * is guarded by lock_page() because the page is SwapCache.
- */
- if (!PageCgroupUsed(pc))
- mem_cgroup_del_lru_list(page, page_lru(page));
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page)));
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
}
-static void mem_cgroup_lru_add_after_commit(struct page *page)
+void mem_cgroup_lru_del(struct page *page)
{
- unsigned long flags;
- struct zone *zone = page_zone(page);
- struct page_cgroup *pc = lookup_page_cgroup(page);
-
- /* taking care of that the page is added to LRU while we commit it */
- if (likely(!PageLRU(page)))
- return;
- spin_lock_irqsave(&zone->lru_lock, flags);
- /* link when the page is linked to LRU but page_cgroup isn't */
- if (PageLRU(page) && !PageCgroupAcctLRU(pc))
- mem_cgroup_add_lru_list(page, page_lru(page));
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ mem_cgroup_lru_del_list(page, page_lru(page));
}
-
-void mem_cgroup_move_lists(struct page *page,
- enum lru_list from, enum lru_list to)
+/**
+ * mem_cgroup_lru_move_lists - account for moving a page between lrus
+ * @zone: zone of the page
+ * @page: the page
+ * @from: current lru
+ * @to: target lru
+ *
+ * This function accounts for @page being moved between the lrus @from
+ * and @to, and returns the lruvec for the given @zone and the memcg
+ * @page is charged to.
+ *
+ * The callsite is then responsible for physically relinking
+ * @page->lru to the returned lruvec->lists[@to].
+ */
+struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
+ struct page *page,
+ enum lru_list from,
+ enum lru_list to)
{
- if (mem_cgroup_disabled())
- return;
- mem_cgroup_del_lru_list(page, from);
- mem_cgroup_add_lru_list(page, to);
+ /* XXX: Optimize this, especially for @from == @to */
+ mem_cgroup_lru_del_list(page, from);
+ return mem_cgroup_lru_add_list(zone, page, to);
}
/*
- * Checks whether given mem is same or in the root_mem's
+ * Checks whether given mem is same or in the root_mem_cgroup's
* hierarchy subtree
*/
-static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
- struct mem_cgroup *mem)
+static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+ struct mem_cgroup *memcg)
{
- if (root_mem != mem) {
- return (root_mem->use_hierarchy &&
- css_is_ancestor(&mem->css, &root_mem->css));
+ if (root_memcg != memcg) {
+ return (root_memcg->use_hierarchy &&
+ css_is_ancestor(&memcg->css, &root_memcg->css));
}
return true;
}
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
+int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
{
int ret;
struct mem_cgroup *curr = NULL;
struct task_struct *p;
p = find_lock_task_mm(task);
- if (!p)
- return 0;
- curr = try_get_mem_cgroup_from_mm(p->mm);
- task_unlock(p);
+ if (p) {
+ curr = try_get_mem_cgroup_from_mm(p->mm);
+ task_unlock(p);
+ } else {
+ /*
+ * All threads may have already detached their mm's, but the oom
+ * killer still needs to detect if they have already been oom
+ * killed to prevent needlessly killing additional tasks.
+ */
+ task_lock(task);
+ curr = mem_cgroup_from_task(task);
+ if (curr)
+ css_get(&curr->css);
+ task_unlock(task);
+ }
if (!curr)
return 0;
/*
- * We should check use_hierarchy of "mem" not "curr". Because checking
+ * We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
- * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
- * hierarchy(even if use_hierarchy is disabled in "mem").
+ * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
+ * hierarchy(even if use_hierarchy is disabled in "memcg").
*/
- ret = mem_cgroup_same_or_subtree(mem, curr);
+ ret = mem_cgroup_same_or_subtree(memcg, curr);
css_put(&curr->css);
return ret;
}
-static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
- unsigned long active;
+ unsigned long inactive_ratio;
+ int nid = zone_to_nid(zone);
+ int zid = zone_idx(zone);
unsigned long inactive;
+ unsigned long active;
unsigned long gb;
- unsigned long inactive_ratio;
- inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
- active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
+ inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_INACTIVE_ANON));
+ active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_ACTIVE_ANON));
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -1117,39 +1174,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
else
inactive_ratio = 1;
- if (present_pages) {
- present_pages[0] = inactive;
- present_pages[1] = active;
- }
-
- return inactive_ratio;
-}
-
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
-{
- unsigned long active;
- unsigned long inactive;
- unsigned long present_pages[2];
- unsigned long inactive_ratio;
-
- inactive_ratio = calc_inactive_ratio(memcg, present_pages);
-
- inactive = present_pages[0];
- active = present_pages[1];
-
- if (inactive * inactive_ratio < active)
- return 1;
-
- return 0;
+ return inactive * inactive_ratio < active;
}
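A worked reading of the check above (illustrative numbers, not from the patch): with 4 KiB pages, 30 - PAGE_SHIFT is 18, so gb is the zone's anon working set in whole GiB. A memcg zone holding less than 1 GiB of anon pages therefore gets inactive_ratio = 1, and its inactive anon list is reported as low exactly when inactive < active; larger working sets use the ratio computed in the branch elided by the hunk boundary.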
-int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
+int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
unsigned long active;
unsigned long inactive;
+ int zid = zone_idx(zone);
+ int nid = zone_to_nid(zone);
- inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
- active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
+ inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_INACTIVE_FILE));
+ active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+ BIT(LRU_ACTIVE_FILE));
return (active > inactive);
}
@@ -1182,67 +1220,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
return &mz->reclaim_stat;
}
-unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
- struct list_head *dst,
- unsigned long *scanned, int order,
- int mode, struct zone *z,
- struct mem_cgroup *mem_cont,
- int active, int file)
-{
- unsigned long nr_taken = 0;
- struct page *page;
- unsigned long scan;
- LIST_HEAD(pc_list);
- struct list_head *src;
- struct page_cgroup *pc, *tmp;
- int nid = zone_to_nid(z);
- int zid = zone_idx(z);
- struct mem_cgroup_per_zone *mz;
- int lru = LRU_FILE * file + active;
- int ret;
-
- BUG_ON(!mem_cont);
- mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
- src = &mz->lists[lru];
-
- scan = 0;
- list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
- if (scan >= nr_to_scan)
- break;
-
- if (unlikely(!PageCgroupUsed(pc)))
- continue;
-
- page = lookup_cgroup_page(pc);
-
- if (unlikely(!PageLRU(page)))
- continue;
-
- scan++;
- ret = __isolate_lru_page(page, mode, file);
- switch (ret) {
- case 0:
- list_move(&page->lru, dst);
- mem_cgroup_del_lru(page);
- nr_taken += hpage_nr_pages(page);
- break;
- case -EBUSY:
- /* we don't affect global LRU but rotate in our LRU */
- mem_cgroup_rotate_lru_list(page, page_lru(page));
- break;
- default:
- break;
- }
- }
-
- *scanned = scan;
-
- trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
- 0, 0, 0, mode);
-
- return nr_taken;
-}
-
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1253,13 +1230,13 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
* Returns the maximum amount of memory @mem can be charged with, in
* pages.
*/
-static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
+static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
unsigned long long margin;
- margin = res_counter_margin(&mem->res);
+ margin = res_counter_margin(&memcg->res);
if (do_swap_account)
- margin = min(margin, res_counter_margin(&mem->memsw));
+ margin = min(margin, res_counter_margin(&memcg->memsw));
return margin >> PAGE_SHIFT;
}
@@ -1274,33 +1251,33 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
return memcg->swappiness;
}
-static void mem_cgroup_start_move(struct mem_cgroup *mem)
+static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
int cpu;
get_online_cpus();
- spin_lock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
for_each_online_cpu(cpu)
- per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
- mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
- spin_unlock(&mem->pcp_counter_lock);
+ per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+ memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
+ spin_unlock(&memcg->pcp_counter_lock);
put_online_cpus();
synchronize_rcu();
}
-static void mem_cgroup_end_move(struct mem_cgroup *mem)
+static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
int cpu;
- if (!mem)
+ if (!memcg)
return;
get_online_cpus();
- spin_lock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
for_each_online_cpu(cpu)
- per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
- mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
- spin_unlock(&mem->pcp_counter_lock);
+ per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+ memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
+ spin_unlock(&memcg->pcp_counter_lock);
put_online_cpus();
}
/*
@@ -1315,13 +1292,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem)
 * waiting at high memory pressure caused by "move".
*/
-static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
{
VM_BUG_ON(!rcu_read_lock_held());
- return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+ return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}
-static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
struct mem_cgroup *from;
struct mem_cgroup *to;
@@ -1336,17 +1313,17 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
if (!from)
goto unlock;
- ret = mem_cgroup_same_or_subtree(mem, from)
- || mem_cgroup_same_or_subtree(mem, to);
+ ret = mem_cgroup_same_or_subtree(memcg, from)
+ || mem_cgroup_same_or_subtree(memcg, to);
unlock:
spin_unlock(&mc.lock);
return ret;
}
-static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
if (mc.moving_task && current != mc.moving_task) {
- if (mem_cgroup_under_move(mem)) {
+ if (mem_cgroup_under_move(memcg)) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
@@ -1430,12 +1407,12 @@ done:
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
-static int mem_cgroup_count_children(struct mem_cgroup *mem)
+static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
int num = 0;
struct mem_cgroup *iter;
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
num++;
return num;
}
@@ -1459,41 +1436,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
return min(limit, memsw);
}
-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
- */
-static struct mem_cgroup *
-mem_cgroup_select_victim(struct mem_cgroup *root_mem)
+static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
+ gfp_t gfp_mask,
+ unsigned long flags)
{
- struct mem_cgroup *ret = NULL;
- struct cgroup_subsys_state *css;
- int nextid, found;
+ unsigned long total = 0;
+ bool noswap = false;
+ int loop;
- if (!root_mem->use_hierarchy) {
- css_get(&root_mem->css);
- ret = root_mem;
- }
-
- while (!ret) {
- rcu_read_lock();
- nextid = root_mem->last_scanned_child + 1;
- css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
- &found);
- if (css && css_tryget(css))
- ret = container_of(css, struct mem_cgroup, css);
+ if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
+ noswap = true;
+ if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
+ noswap = true;
- rcu_read_unlock();
- /* Updates scanning parameter */
- if (!css) {
- /* this means start scan from ID:1 */
- root_mem->last_scanned_child = 0;
- } else
- root_mem->last_scanned_child = found;
+ for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
+ if (loop)
+ drain_all_stock_async(memcg);
+ total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
+ /*
+ * Allow limit shrinkers, which are triggered directly
+ * by userspace, to catch signals and stop reclaim
+ * after minimal progress, regardless of the margin.
+ */
+ if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
+ break;
+ if (mem_cgroup_margin(memcg))
+ break;
+ /*
+ * If nothing was reclaimed after two attempts, there
+ * may be no reclaimable pages in this hierarchy.
+ */
+ if (loop && !total)
+ break;
}
-
- return ret;
+ return total;
}
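A brief caller sketch (hypothetical wrapper; the flag name and the mem_cgroup_reclaim() signature are the ones introduced above): a userspace-driven limit shrink passes MEM_CGROUP_RECLAIM_SHRINK so the loop exits after minimal progress instead of insisting on a free margin.

/* Sketch only: drive reclaim on behalf of a limit write. */
static bool sketch_shrink_for_limit(struct mem_cgroup *memcg, gfp_t gfp_mask)
{
        return mem_cgroup_reclaim(memcg, gfp_mask,
                                  MEM_CGROUP_RECLAIM_SHRINK) > 0;
}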
/**
@@ -1506,14 +1482,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
* reclaimable pages on a node. Returns true if there are any reclaimable
* pages in the node.
*/
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
- if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
+ if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
- if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
+ if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
return true;
return false;
@@ -1526,29 +1502,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
* nodes based on the zonelist. So update the list loosely once per 10 secs.
*
*/
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
int nid;
/*
* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
* pagein/pageout changes since the last update.
*/
- if (!atomic_read(&mem->numainfo_events))
+ if (!atomic_read(&memcg->numainfo_events))
return;
- if (atomic_inc_return(&mem->numainfo_updating) > 1)
+ if (atomic_inc_return(&memcg->numainfo_updating) > 1)
return;
/* make a nodemask where this memcg uses memory from */
- mem->scan_nodes = node_states[N_HIGH_MEMORY];
+ memcg->scan_nodes = node_states[N_HIGH_MEMORY];
for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
- if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
- node_clear(nid, mem->scan_nodes);
+ if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
+ node_clear(nid, memcg->scan_nodes);
}
- atomic_set(&mem->numainfo_events, 0);
- atomic_set(&mem->numainfo_updating, 0);
+ atomic_set(&memcg->numainfo_events, 0);
+ atomic_set(&memcg->numainfo_updating, 0);
}
/*
@@ -1563,16 +1539,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
*
* Now, we use round-robin. Better algorithm is welcomed.
*/
-int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
int node;
- mem_cgroup_may_update_nodemask(mem);
- node = mem->last_scanned_node;
+ mem_cgroup_may_update_nodemask(memcg);
+ node = memcg->last_scanned_node;
- node = next_node(node, mem->scan_nodes);
+ node = next_node(node, memcg->scan_nodes);
if (node == MAX_NUMNODES)
- node = first_node(mem->scan_nodes);
+ node = first_node(memcg->scan_nodes);
/*
* We call this when we hit limit, not when pages are added to LRU.
* No LRU may hold pages because all pages are UNEVICTABLE or
@@ -1582,7 +1558,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
- mem->last_scanned_node = node;
+ memcg->last_scanned_node = node;
return node;
}
@@ -1592,7 +1568,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 * unused nodes. But scan_nodes is lazily updated and may not contain
* enough new information. We need to do double check.
*/
-bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
int nid;
@@ -1600,12 +1576,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
* quick check...making use of scan_node.
* We can skip unused nodes.
*/
- if (!nodes_empty(mem->scan_nodes)) {
- for (nid = first_node(mem->scan_nodes);
+ if (!nodes_empty(memcg->scan_nodes)) {
+ for (nid = first_node(memcg->scan_nodes);
nid < MAX_NUMNODES;
- nid = next_node(nid, mem->scan_nodes)) {
+ nid = next_node(nid, memcg->scan_nodes)) {
- if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+ if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
}
@@ -1613,81 +1589,55 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
* Check rest of nodes.
*/
for_each_node_state(nid, N_HIGH_MEMORY) {
- if (node_isset(nid, mem->scan_nodes))
+ if (node_isset(nid, memcg->scan_nodes))
continue;
- if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+ if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
return false;
}
#else
-int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
-bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
- return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+ return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
}
#endif
-/*
- * Scan the hierarchy if needed to reclaim memory. We remember the last child
- * we reclaimed from, so that we don't end up penalizing one child extensively
- * based on its position in the children list.
- *
- * root_mem is the original ancestor that we've been reclaim from.
- *
- * We give up and return to the caller when we visit root_mem twice.
- * (other groups can be removed while we're walking....)
- *
- * If shrink==true, for avoiding to free too much, this returns immedieately.
- */
-static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
- struct zone *zone,
- gfp_t gfp_mask,
- unsigned long reclaim_options,
- unsigned long *total_scanned)
-{
- struct mem_cgroup *victim;
- int ret, total = 0;
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+ struct zone *zone,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
+{
+ struct mem_cgroup *victim = NULL;
+ int total = 0;
int loop = 0;
- bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
- bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
- bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
unsigned long excess;
unsigned long nr_scanned;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .zone = zone,
+ .priority = 0,
+ };
- excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
-
- /* If memsw_is_minimum==1, swap-out is of-no-use. */
- if (!check_soft && !shrink && root_mem->memsw_is_minimum)
- noswap = true;
+ excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
while (1) {
- victim = mem_cgroup_select_victim(root_mem);
- if (victim == root_mem) {
+ victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+ if (!victim) {
loop++;
- /*
- * We are not draining per cpu cached charges during
- * soft limit reclaim because global reclaim doesn't
- * care about charges. It tries to free some memory and
- * charges will not give any.
- */
- if (!check_soft && loop >= 1)
- drain_all_stock_async(root_mem);
if (loop >= 2) {
/*
* If we have not been able to reclaim
 * anything, it might be because there are
* no reclaimable pages under this hierarchy
*/
- if (!check_soft || !total) {
- css_put(&victim->css);
+ if (!total)
break;
- }
/*
* We want to do more targeted reclaim.
 * excess >> 2 is not too excessive so as to
@@ -1695,40 +1645,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
- (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
- css_put(&victim->css);
+ (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
break;
- }
}
- }
- if (!mem_cgroup_reclaimable(victim, noswap)) {
- /* this cgroup's local usage == 0 */
- css_put(&victim->css);
continue;
}
- /* we use swappiness of local cgroup */
- if (check_soft) {
- ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
- noswap, zone, &nr_scanned);
- *total_scanned += nr_scanned;
- } else
- ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
- noswap);
- css_put(&victim->css);
- /*
- * At shrinking usage, we can't check we should stop here or
- * reclaim more. It's depends on callers. last_scanned_child
- * will work enough for keeping fairness under tree.
- */
- if (shrink)
- return ret;
- total += ret;
- if (check_soft) {
- if (!res_counter_soft_limit_excess(&root_mem->res))
- return total;
- } else if (mem_cgroup_margin(root_mem))
- return total;
+ if (!mem_cgroup_reclaimable(victim, false))
+ continue;
+ total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+ zone, &nr_scanned);
+ *total_scanned += nr_scanned;
+ if (!res_counter_soft_limit_excess(&root_memcg->res))
+ break;
}
+ mem_cgroup_iter_break(root_memcg, victim);
return total;
}
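Reduced to its skeleton, the cookie-based walk used above looks like the sketch below (illustrative only; struct mem_cgroup_reclaim_cookie, mem_cgroup_iter() and mem_cgroup_iter_break() are the interfaces this patch introduces, and the early-exit test reuses the soft-limit check from the function itself).

/* Sketch only: one soft-limit pass over @root's hierarchy for @zone. */
static void sketch_soft_limit_pass(struct mem_cgroup *root, struct zone *zone)
{
        struct mem_cgroup_reclaim_cookie reclaim = {
                .zone = zone,
                .priority = 0,
        };
        struct mem_cgroup *pos = NULL;

        while ((pos = mem_cgroup_iter(root, pos, &reclaim))) {
                /* ... shrink @pos here ... */
                if (!res_counter_soft_limit_excess(&root->res)) {
                        /* target met: leaving early needs iter_break */
                        mem_cgroup_iter_break(root, pos);
                        break;
                }
        }
}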
@@ -1737,19 +1667,19 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
* If someone is running, return false.
* Has to be called with memcg_oom_lock
*/
-static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter, *failed = NULL;
- bool cond = true;
- for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+ for_each_mem_cgroup_tree(iter, memcg) {
if (iter->oom_lock) {
/*
* this subtree of our hierarchy is already locked
* so we cannot give a lock.
*/
failed = iter;
- cond = false;
+ mem_cgroup_iter_break(memcg, iter);
+ break;
} else
iter->oom_lock = true;
}
@@ -1761,11 +1691,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
* OK, we failed to lock the whole subtree so we have to clean up
* what we set up to the failing subtree
*/
- cond = true;
- for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+ for_each_mem_cgroup_tree(iter, memcg) {
if (iter == failed) {
- cond = false;
- continue;
+ mem_cgroup_iter_break(memcg, iter);
+ break;
}
iter->oom_lock = false;
}
@@ -1775,24 +1704,24 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
/*
* Has to be called with memcg_oom_lock
*/
-static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
+static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
return 0;
}
-static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
atomic_inc(&iter->under_oom);
}
-static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
@@ -1801,7 +1730,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
atomic_add_unless(&iter->under_oom, -1, 0);
}
@@ -1816,85 +1745,85 @@ struct oom_wait_info {
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
- struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
- *oom_wait_mem;
+ struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
+ *oom_wait_memcg;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
- oom_wait_mem = oom_wait_info->mem;
+ oom_wait_memcg = oom_wait_info->mem;
/*
* Both of oom_wait_info->mem and wake_mem are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
- if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
- && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
+ if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
+ && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
return 0;
return autoremove_wake_function(wait, mode, sync, arg);
}
-static void memcg_wakeup_oom(struct mem_cgroup *mem)
+static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
- /* for filtering, pass "mem" as argument. */
- __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
+ /* for filtering, pass "memcg" as argument. */
+ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
-static void memcg_oom_recover(struct mem_cgroup *mem)
+static void memcg_oom_recover(struct mem_cgroup *memcg)
{
- if (mem && atomic_read(&mem->under_oom))
- memcg_wakeup_oom(mem);
+ if (memcg && atomic_read(&memcg->under_oom))
+ memcg_wakeup_oom(memcg);
}
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
-bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
+bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
{
struct oom_wait_info owait;
bool locked, need_to_kill;
- owait.mem = mem;
+ owait.mem = memcg;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
need_to_kill = true;
- mem_cgroup_mark_under_oom(mem);
+ mem_cgroup_mark_under_oom(memcg);
- /* At first, try to OOM lock hierarchy under mem.*/
+ /* At first, try to OOM lock hierarchy under memcg.*/
spin_lock(&memcg_oom_lock);
- locked = mem_cgroup_oom_lock(mem);
+ locked = mem_cgroup_oom_lock(memcg);
/*
* Even if signal_pending(), we can't quit charge() loop without
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
* under OOM is always welcomed, use TASK_KILLABLE here.
*/
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
- if (!locked || mem->oom_kill_disable)
+ if (!locked || memcg->oom_kill_disable)
need_to_kill = false;
if (locked)
- mem_cgroup_oom_notify(mem);
+ mem_cgroup_oom_notify(memcg);
spin_unlock(&memcg_oom_lock);
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(mem, mask);
+ mem_cgroup_out_of_memory(memcg, mask);
} else {
schedule();
finish_wait(&memcg_oom_waitq, &owait.wait);
}
spin_lock(&memcg_oom_lock);
if (locked)
- mem_cgroup_oom_unlock(mem);
- memcg_wakeup_oom(mem);
+ mem_cgroup_oom_unlock(memcg);
+ memcg_wakeup_oom(memcg);
spin_unlock(&memcg_oom_lock);
- mem_cgroup_unmark_under_oom(mem);
+ mem_cgroup_unmark_under_oom(memcg);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false;
/* Give chance to dying process */
- schedule_timeout(1);
+ schedule_timeout_uninterruptible(1);
return true;
}
@@ -1925,25 +1854,25 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx, int val)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
unsigned long uninitialized_var(flags);
- if (unlikely(!pc))
+ if (mem_cgroup_disabled())
return;
rcu_read_lock();
- mem = pc->mem_cgroup;
- if (unlikely(!mem || !PageCgroupUsed(pc)))
+ memcg = pc->mem_cgroup;
+ if (unlikely(!memcg || !PageCgroupUsed(pc)))
goto out;
/* pc->mem_cgroup is unstable ? */
- if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
+ if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
/* take a lock against to access pc->mem_cgroup */
move_lock_page_cgroup(pc, &flags);
need_unlock = true;
- mem = pc->mem_cgroup;
- if (!mem || !PageCgroupUsed(pc))
+ memcg = pc->mem_cgroup;
+ if (!memcg || !PageCgroupUsed(pc))
goto out;
}
@@ -1959,7 +1888,7 @@ void mem_cgroup_update_page_stat(struct page *page,
BUG();
}
- this_cpu_add(mem->stat->count[idx], val);
+ this_cpu_add(memcg->stat->count[idx], val);
out:
if (unlikely(need_unlock))
@@ -1990,13 +1919,13 @@ static DEFINE_MUTEX(percpu_charge_mutex);
* cgroup which is not current target, returns false. This stock will be
* refilled.
*/
-static bool consume_stock(struct mem_cgroup *mem)
+static bool consume_stock(struct mem_cgroup *memcg)
{
struct memcg_stock_pcp *stock;
bool ret = true;
stock = &get_cpu_var(memcg_stock);
- if (mem == stock->cached && stock->nr_pages)
+ if (memcg == stock->cached && stock->nr_pages)
stock->nr_pages--;
else /* need to call res_counter_charge */
ret = false;
@@ -2037,24 +1966,24 @@ static void drain_local_stock(struct work_struct *dummy)
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
-static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
- if (stock->cached != mem) { /* reset if necessary */
+ if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
- stock->cached = mem;
+ stock->cached = memcg;
}
stock->nr_pages += nr_pages;
put_cpu_var(memcg_stock);
}
/*
- * Drains all per-CPU charge caches for given root_mem resp. subtree
+ * Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it. sync flag says whether we should block
* until the work is done.
*/
-static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
+static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
{
int cpu, curcpu;
@@ -2063,12 +1992,12 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
curcpu = get_cpu();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
- mem = stock->cached;
- if (!mem || !stock->nr_pages)
+ memcg = stock->cached;
+ if (!memcg || !stock->nr_pages)
continue;
- if (!mem_cgroup_same_or_subtree(root_mem, mem))
+ if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
continue;
if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
@@ -2097,23 +2026,23 @@ out:
* expects some charges will be back to res_counter later but cannot wait for
* it.
*/
-static void drain_all_stock_async(struct mem_cgroup *root_mem)
+static void drain_all_stock_async(struct mem_cgroup *root_memcg)
{
/*
* If someone calls draining, avoid adding more kworker runs.
*/
if (!mutex_trylock(&percpu_charge_mutex))
return;
- drain_all_stock(root_mem, false);
+ drain_all_stock(root_memcg, false);
mutex_unlock(&percpu_charge_mutex);
}
/* This is a synchronous drain interface. */
-static void drain_all_stock_sync(struct mem_cgroup *root_mem)
+static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
{
/* called when force_empty is called */
mutex_lock(&percpu_charge_mutex);
- drain_all_stock(root_mem, true);
+ drain_all_stock(root_memcg, true);
mutex_unlock(&percpu_charge_mutex);
}
@@ -2121,35 +2050,35 @@ static void drain_all_stock_sync(struct mem_cgroup *root_mem)
* This function drains percpu counter value from DEAD cpu and
* move it to local cpu. Note that this function can be preempted.
*/
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
{
int i;
- spin_lock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
- long x = per_cpu(mem->stat->count[i], cpu);
+ long x = per_cpu(memcg->stat->count[i], cpu);
- per_cpu(mem->stat->count[i], cpu) = 0;
- mem->nocpu_base.count[i] += x;
+ per_cpu(memcg->stat->count[i], cpu) = 0;
+ memcg->nocpu_base.count[i] += x;
}
for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
- unsigned long x = per_cpu(mem->stat->events[i], cpu);
+ unsigned long x = per_cpu(memcg->stat->events[i], cpu);
- per_cpu(mem->stat->events[i], cpu) = 0;
- mem->nocpu_base.events[i] += x;
+ per_cpu(memcg->stat->events[i], cpu) = 0;
+ memcg->nocpu_base.events[i] += x;
}
/* need to clear ON_MOVE value, works as a kind of lock. */
- per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
- spin_unlock(&mem->pcp_counter_lock);
+ per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
+ spin_unlock(&memcg->pcp_counter_lock);
}
-static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
+static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
{
int idx = MEM_CGROUP_ON_MOVE;
- spin_lock(&mem->pcp_counter_lock);
- per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
- spin_unlock(&mem->pcp_counter_lock);
+ spin_lock(&memcg->pcp_counter_lock);
+ per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
+ spin_unlock(&memcg->pcp_counter_lock);
}
static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
@@ -2161,7 +2090,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
struct mem_cgroup *iter;
if ((action == CPU_ONLINE)) {
- for_each_mem_cgroup_all(iter)
+ for_each_mem_cgroup(iter)
synchronize_mem_cgroup_on_move(iter, cpu);
return NOTIFY_OK;
}
@@ -2169,7 +2098,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
- for_each_mem_cgroup_all(iter)
+ for_each_mem_cgroup(iter)
mem_cgroup_drain_pcp_counter(iter, cpu);
stock = &per_cpu(memcg_stock, cpu);
@@ -2187,7 +2116,7 @@ enum {
CHARGE_OOM_DIE, /* the current is killed because of OOM */
};
-static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages, bool oom_check)
{
unsigned long csize = nr_pages * PAGE_SIZE;
@@ -2196,16 +2125,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
unsigned long flags = 0;
int ret;
- ret = res_counter_charge(&mem->res, csize, &fail_res);
+ ret = res_counter_charge(&memcg->res, csize, &fail_res);
if (likely(!ret)) {
if (!do_swap_account)
return CHARGE_OK;
- ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+ ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
if (likely(!ret))
return CHARGE_OK;
- res_counter_uncharge(&mem->res, csize);
+ res_counter_uncharge(&memcg->res, csize);
mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
} else
@@ -2223,8 +2152,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
if (!(gfp_mask & __GFP_WAIT))
return CHARGE_WOULDBLOCK;
- ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags, NULL);
+ ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
return CHARGE_RETRY;
/*
@@ -2257,18 +2185,35 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
}
/*
- * Unlike exported interface, "oom" parameter is added. if oom==true,
- * oom-killer can be invoked.
+ * __mem_cgroup_try_charge() does
+ * 1. detect memcg to be charged against from passed *mm and *ptr,
+ * 2. update res_counter
+ * 3. call memory reclaim if necessary.
+ *
+ * In some special cases, if the task is dying (fatal_signal_pending() or
+ * TIF_MEMDIE set), this function returns -EINTR while writing root_mem_cgroup
+ * to *ptr. There are two reasons for this. 1: dying threads should quit as soon
+ * as possible without any hazards. 2: all pages should have a valid
+ * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
+ * pointer, that is treated as a charge to root_mem_cgroup.
+ *
+ * So __mem_cgroup_try_charge() will return
+ * 0 ... on success, filling *ptr with a valid memcg pointer.
+ * -ENOMEM ... charge failure because of resource limits.
+ * -EINTR ... if the thread is dying. *ptr is filled with root_mem_cgroup.
+ *
+ * Unlike the exported interface, an "oom" parameter is added. if oom==true,
+ * the oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
gfp_t gfp_mask,
unsigned int nr_pages,
- struct mem_cgroup **memcg,
+ struct mem_cgroup **ptr,
bool oom)
{
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
int ret;
/*
@@ -2286,17 +2231,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- if (!*memcg && !mm)
- goto bypass;
+ if (!*ptr && !mm)
+ *ptr = root_mem_cgroup;
again:
- if (*memcg) { /* css should be a valid one */
- mem = *memcg;
- VM_BUG_ON(css_is_removed(&mem->css));
- if (mem_cgroup_is_root(mem))
+ if (*ptr) { /* css should be a valid one */
+ memcg = *ptr;
+ VM_BUG_ON(css_is_removed(&memcg->css));
+ if (mem_cgroup_is_root(memcg))
goto done;
- if (nr_pages == 1 && consume_stock(mem))
+ if (nr_pages == 1 && consume_stock(memcg))
goto done;
- css_get(&mem->css);
+ css_get(&memcg->css);
} else {
struct task_struct *p;
@@ -2304,7 +2249,7 @@ again:
p = rcu_dereference(mm->owner);
/*
* Because we don't have task_lock(), "p" can exit.
- * In that case, "mem" can point to root or p can be NULL with
+ * In that case, "memcg" can point to root or p can be NULL with
 * race with swapoff. Then, we have a small risk of mis-accounting.
 * But such kind of mis-account by race always happens because
 * we don't have cgroup_mutex(). It's overkill and we allow that
 * small race, here.
 * (*) swapoff et al. will charge against mm-struct not against
* task-struct. So, mm->owner can be NULL.
*/
- mem = mem_cgroup_from_task(p);
- if (!mem || mem_cgroup_is_root(mem)) {
+ memcg = mem_cgroup_from_task(p);
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ if (mem_cgroup_is_root(memcg)) {
rcu_read_unlock();
goto done;
}
- if (nr_pages == 1 && consume_stock(mem)) {
+ if (nr_pages == 1 && consume_stock(memcg)) {
/*
 * It seems dangerous to access memcg without css_get().
 * But considering how consume_stock works, it's not
@@ -2330,7 +2277,7 @@ again:
goto done;
}
/* after here, we may be blocked. we need to get refcnt */
- if (!css_tryget(&mem->css)) {
+ if (!css_tryget(&memcg->css)) {
rcu_read_unlock();
goto again;
}
@@ -2342,7 +2289,7 @@ again:
/* If killed, bypass charge */
if (fatal_signal_pending(current)) {
- css_put(&mem->css);
+ css_put(&memcg->css);
goto bypass;
}
@@ -2352,44 +2299,44 @@ again:
nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
+ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
switch (ret) {
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
batch = nr_pages;
- css_put(&mem->css);
- mem = NULL;
+ css_put(&memcg->css);
+ memcg = NULL;
goto again;
case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
- css_put(&mem->css);
+ css_put(&memcg->css);
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
if (!oom) {
- css_put(&mem->css);
+ css_put(&memcg->css);
goto nomem;
}
/* If oom, we never return -ENOMEM */
nr_oom_retries--;
break;
case CHARGE_OOM_DIE: /* Killed by OOM Killer */
- css_put(&mem->css);
+ css_put(&memcg->css);
goto bypass;
}
} while (ret != CHARGE_OK);
if (batch > nr_pages)
- refill_stock(mem, batch - nr_pages);
- css_put(&mem->css);
+ refill_stock(memcg, batch - nr_pages);
+ css_put(&memcg->css);
done:
- *memcg = mem;
+ *ptr = memcg;
return 0;
nomem:
- *memcg = NULL;
+ *ptr = NULL;
return -ENOMEM;
bypass:
- *memcg = NULL;
- return 0;
+ *ptr = root_mem_cgroup;
+ return -EINTR;
}
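The contract spelled out in the comment block above boils down to the caller-side sketch below (hypothetical wrapper; the -EINTR and -ENOMEM handling mirrors what mem_cgroup_charge_common() and the swap-in path later in this patch actually do).

/* Sketch only: charge one page; 0 means "commit to *memcgp". */
static int sketch_charge_one(struct mm_struct *mm, gfp_t gfp_mask,
                             struct mem_cgroup **memcgp)
{
        int ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);

        if (ret == -EINTR)      /* dying task: *memcgp is root_mem_cgroup */
                return 0;
        return ret;             /* -ENOMEM: fail the charge */
}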
/*
@@ -2397,15 +2344,15 @@ bypass:
* This function is for that and do uncharge, put css's refcnt.
* gotten by try_charge().
*/
-static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
unsigned int nr_pages)
{
- if (!mem_cgroup_is_root(mem)) {
+ if (!mem_cgroup_is_root(memcg)) {
unsigned long bytes = nr_pages * PAGE_SIZE;
- res_counter_uncharge(&mem->res, bytes);
+ res_counter_uncharge(&memcg->res, bytes);
if (do_swap_account)
- res_counter_uncharge(&mem->memsw, bytes);
+ res_counter_uncharge(&memcg->memsw, bytes);
}
}
@@ -2430,7 +2377,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
unsigned short id;
swp_entry_t ent;
@@ -2440,23 +2387,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
- mem = pc->mem_cgroup;
- if (mem && !css_tryget(&mem->css))
- mem = NULL;
+ memcg = pc->mem_cgroup;
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
} else if (PageSwapCache(page)) {
ent.val = page_private(page);
- id = lookup_swap_cgroup(ent);
+ id = lookup_swap_cgroup_id(ent);
rcu_read_lock();
- mem = mem_cgroup_lookup(id);
- if (mem && !css_tryget(&mem->css))
- mem = NULL;
+ memcg = mem_cgroup_lookup(id);
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
rcu_read_unlock();
}
unlock_page_cgroup(pc);
- return mem;
+ return memcg;
}
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
struct page *page,
unsigned int nr_pages,
struct page_cgroup *pc,
@@ -2465,14 +2412,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- __mem_cgroup_cancel_charge(mem, nr_pages);
+ __mem_cgroup_cancel_charge(memcg, nr_pages);
return;
}
/*
 * we don't need page_cgroup_lock about tail pages, because they are not
* accessed by any other context at this point.
*/
- pc->mem_cgroup = mem;
+ pc->mem_cgroup = memcg;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
* Especially when a page_cgroup is taken from a page, pc->mem_cgroup
@@ -2495,57 +2442,43 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
break;
}
- mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
+ mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
unlock_page_cgroup(pc);
+ WARN_ON_ONCE(PageLRU(page));
/*
* "charge_statistics" updated event counter. Then, check it.
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
*/
- memcg_check_events(mem, page);
+ memcg_check_events(memcg, page);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
- (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+ (1 << PCG_MIGRATION))
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock, 'splitting on pmd' and compund_lock.
+ * zone->lru_lock, 'splitting on pmd' and compound_lock.
+ * charge/uncharge will never happen and move_account() is done under
+ * compound_lock(), so we don't have to take care of races.
*/
-void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+void mem_cgroup_split_huge_fixup(struct page *head)
{
struct page_cgroup *head_pc = lookup_page_cgroup(head);
- struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
- unsigned long flags;
+ struct page_cgroup *pc;
+ int i;
if (mem_cgroup_disabled())
return;
- /*
- * We have no races with charge/uncharge but will have races with
- * page state accounting.
- */
- move_lock_page_cgroup(head_pc, &flags);
-
- tail_pc->mem_cgroup = head_pc->mem_cgroup;
- smp_wmb(); /* see __commit_charge() */
- if (PageCgroupAcctLRU(head_pc)) {
- enum lru_list lru;
- struct mem_cgroup_per_zone *mz;
-
- /*
- * LRU flags cannot be copied because we need to add tail
- *.page to LRU by generic call and our hook will be called.
- * We hold lru_lock, then, reduce counter directly.
- */
- lru = page_lru(head);
- mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
- MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ pc = head_pc + i;
+ pc->mem_cgroup = head_pc->mem_cgroup;
+ smp_wmb();/* see __commit_charge() */
+ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
}
- tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
- move_unlock_page_cgroup(head_pc, &flags);
}
-#endif
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/**
* mem_cgroup_move_account - move account of the page
@@ -2660,7 +2593,7 @@ static int mem_cgroup_move_parent(struct page *page,
parent = mem_cgroup_from_cont(pcg);
ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
- if (ret || !parent)
+ if (ret)
goto put_back;
if (nr_pages > 1)
@@ -2689,7 +2622,7 @@ out:
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
struct page_cgroup *pc;
bool oom = true;
@@ -2706,13 +2639,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
}
pc = lookup_page_cgroup(page);
- BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
-
- ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
- if (ret || !mem)
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
+ if (ret == -ENOMEM)
return ret;
-
- __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
+ __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
return 0;
}
@@ -2721,19 +2651,11 @@ int mem_cgroup_newpage_charge(struct page *page,
{
if (mem_cgroup_disabled())
return 0;
- /*
- * If already mapped, we don't have to account.
- * If page cache, page->mapping has address_space.
- * But page->mapping may have out-of-use anon_vma pointer,
- * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
- * is NULL.
- */
- if (page_mapped(page) || (page->mapping && !PageAnon(page)))
- return 0;
- if (unlikely(!mm))
- mm = &init_mm;
+ VM_BUG_ON(page_mapped(page));
+ VM_BUG_ON(page->mapping && !PageAnon(page));
+ VM_BUG_ON(!mm);
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_MAPPED);
+ MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
static void
@@ -2741,25 +2663,39 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
enum charge_type ctype);
static void
-__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
+__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
enum charge_type ctype)
{
struct page_cgroup *pc = lookup_page_cgroup(page);
+ struct zone *zone = page_zone(page);
+ unsigned long flags;
+ bool removed = false;
+
/*
	 * In some cases, SwapCache, FUSE(splice_buf->radixtree), the page
	 * is already on the LRU. It means the page may be on some other page_cgroup's
* LRU. Take care of it.
*/
- mem_cgroup_lru_del_before_commit(page);
- __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
- mem_cgroup_lru_add_after_commit(page);
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ if (PageLRU(page)) {
+ del_page_from_lru_list(zone, page, page_lru(page));
+ ClearPageLRU(page);
+ removed = true;
+ }
+ __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
+ if (removed) {
+ add_page_to_lru_list(zone, page, page_lru(page));
+ SetPageLRU(page);
+ }
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
return;
}
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
+ enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
int ret;
if (mem_cgroup_disabled())
@@ -2769,31 +2705,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (unlikely(!mm))
mm = &init_mm;
+ if (!page_is_file_cache(page))
+ type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- if (page_is_file_cache(page)) {
- ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
- if (ret || !mem)
- return ret;
-
- /*
- * FUSE reuses pages without going through the final
- * put that would remove them from the LRU list, make
- * sure that they get relinked properly.
- */
- __mem_cgroup_commit_charge_lrucare(page, mem,
- MEM_CGROUP_CHARGE_TYPE_CACHE);
- return ret;
- }
- /* shmem */
- if (PageSwapCache(page)) {
- ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
+ if (!PageSwapCache(page))
+ ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
+ else { /* page is swapcache/shmem */
+ ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
if (!ret)
- __mem_cgroup_commit_charge_swapin(page, mem,
- MEM_CGROUP_CHARGE_TYPE_SHMEM);
- } else
- ret = mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_SHMEM);
-
+ __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ }
return ret;
}
@@ -2805,12 +2726,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
*/
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct page *page,
- gfp_t mask, struct mem_cgroup **ptr)
+ gfp_t mask, struct mem_cgroup **memcgp)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
int ret;
- *ptr = NULL;
+ *memcgp = NULL;
if (mem_cgroup_disabled())
return 0;
@@ -2825,30 +2746,35 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
*/
if (!PageSwapCache(page))
goto charge_cur_mm;
- mem = try_get_mem_cgroup_from_page(page);
- if (!mem)
+ memcg = try_get_mem_cgroup_from_page(page);
+ if (!memcg)
goto charge_cur_mm;
- *ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
- css_put(&mem->css);
+ *memcgp = memcg;
+ ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
+ css_put(&memcg->css);
+ if (ret == -EINTR)
+ ret = 0;
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
+ ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
+ if (ret == -EINTR)
+ ret = 0;
+ return ret;
}
static void
-__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
+__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
enum charge_type ctype)
{
if (mem_cgroup_disabled())
return;
- if (!ptr)
+ if (!memcg)
return;
- cgroup_exclude_rmdir(&ptr->css);
+ cgroup_exclude_rmdir(&memcg->css);
- __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
+ __mem_cgroup_commit_charge_lrucare(page, memcg, ctype);
/*
* Now swap is on-memory. This means this page may be
* counted both as mem and swap....double count.
@@ -2858,21 +2784,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
*/
if (do_swap_account && PageSwapCache(page)) {
swp_entry_t ent = {.val = page_private(page)};
+ struct mem_cgroup *swap_memcg;
unsigned short id;
- struct mem_cgroup *memcg;
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
- if (memcg) {
+ swap_memcg = mem_cgroup_lookup(id);
+ if (swap_memcg) {
/*
 * This recorded memcg can be an obsolete one. So, avoid
* calling css_tryget
*/
- if (!mem_cgroup_is_root(memcg))
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
- mem_cgroup_swap_statistics(memcg, false);
- mem_cgroup_put(memcg);
+ if (!mem_cgroup_is_root(swap_memcg))
+ res_counter_uncharge(&swap_memcg->memsw,
+ PAGE_SIZE);
+ mem_cgroup_swap_statistics(swap_memcg, false);
+ mem_cgroup_put(swap_memcg);
}
rcu_read_unlock();
}
@@ -2881,25 +2808,26 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
* So, rmdir()->pre_destroy() can be called while we do this charge.
* In that case, we need to call pre_destroy() again. check it here.
*/
- cgroup_release_and_wakeup_rmdir(&ptr->css);
+ cgroup_release_and_wakeup_rmdir(&memcg->css);
}
-void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
+void mem_cgroup_commit_charge_swapin(struct page *page,
+ struct mem_cgroup *memcg)
{
- __mem_cgroup_commit_charge_swapin(page, ptr,
- MEM_CGROUP_CHARGE_TYPE_MAPPED);
+ __mem_cgroup_commit_charge_swapin(page, memcg,
+ MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
-void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
+void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
return;
- if (!mem)
+ if (!memcg)
return;
- __mem_cgroup_cancel_charge(mem, 1);
+ __mem_cgroup_cancel_charge(memcg, 1);
}
-static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
+static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
unsigned int nr_pages,
const enum charge_type ctype)
{
@@ -2917,7 +2845,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
* uncharges. Then, it's ok to ignore memcg's refcnt.
*/
if (!batch->memcg)
- batch->memcg = mem;
+ batch->memcg = memcg;
/*
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
* In those cases, all pages freed continuously can be expected to be in
@@ -2937,7 +2865,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
* merge a series of uncharges to an uncharge of res_counter.
* If not, we uncharge res_counter one by one.
*/
- if (batch->memcg != mem)
+ if (batch->memcg != memcg)
goto direct_uncharge;
/* remember freed charge and uncharge it later */
batch->nr_pages++;
@@ -2945,11 +2873,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
batch->memsw_nr_pages++;
return;
direct_uncharge:
- res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
+ res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
if (uncharge_memsw)
- res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
- if (unlikely(batch->memcg != mem))
- memcg_oom_recover(mem);
+ res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
+ if (unlikely(batch->memcg != memcg))
+ memcg_oom_recover(memcg);
return;
}
@@ -2959,7 +2887,7 @@ direct_uncharge:
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
struct page_cgroup *pc;
@@ -2977,12 +2905,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
* Check if our page_cgroup is valid
*/
pc = lookup_page_cgroup(page);
- if (unlikely(!pc || !PageCgroupUsed(pc)))
+ if (unlikely(!PageCgroupUsed(pc)))
return NULL;
lock_page_cgroup(pc);
- mem = pc->mem_cgroup;
+ memcg = pc->mem_cgroup;
if (!PageCgroupUsed(pc))
goto unlock_out;
@@ -3005,7 +2933,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
+ mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
ClearPageCgroupUsed(pc);
/*
@@ -3017,18 +2945,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
unlock_page_cgroup(pc);
/*
- * even after unlock, we have mem->res.usage here and this memcg
+ * even after unlock, we have memcg->res.usage here and this memcg
* will never be freed.
*/
- memcg_check_events(mem, page);
+ memcg_check_events(memcg, page);
if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
- mem_cgroup_swap_statistics(mem, true);
- mem_cgroup_get(mem);
+ mem_cgroup_swap_statistics(memcg, true);
+ mem_cgroup_get(memcg);
}
- if (!mem_cgroup_is_root(mem))
- mem_cgroup_do_uncharge(mem, nr_pages, ctype);
+ if (!mem_cgroup_is_root(memcg))
+ mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
- return mem;
+ return memcg;
unlock_out:
unlock_page_cgroup(pc);
@@ -3040,8 +2968,7 @@ void mem_cgroup_uncharge_page(struct page *page)
/* early check. */
if (page_mapped(page))
return;
- if (page->mapping && !PageAnon(page))
- return;
+ VM_BUG_ON(page->mapping && !PageAnon(page));
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
@@ -3099,6 +3026,23 @@ void mem_cgroup_uncharge_end(void)
batch->memcg = NULL;
}
+/*
+ * A function for resetting pc->mem_cgroup for newly allocated pages.
+ * This function should be called if the newpage will be added to the LRU
+ * before we start accounting it.
+ */
+void mem_cgroup_reset_owner(struct page *newpage)
+{
+ struct page_cgroup *pc;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ pc = lookup_page_cgroup(newpage);
+ VM_BUG_ON(PageCgroupUsed(pc));
+ pc->mem_cgroup = root_mem_cgroup;
+}
+
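The one caller visible in this diff is the migration path further down (mm/migrate.c), where the freshly allocated target page has its owner reset before anything can put it on an LRU list. A condensed sketch of that usage (the helper name is hypothetical, error handling omitted):

/* Sketch: give a freshly allocated migration target a valid owner. */
static struct page *new_migration_target(struct page *page,
					 new_page_t get_new_page,
					 unsigned long private)
{
	int *result = NULL;
	struct page *newpage = get_new_page(page, private, &result);

	if (!newpage)
		return NULL;

	/* not accounted yet; point pc->mem_cgroup at root_mem_cgroup */
	mem_cgroup_reset_owner(newpage);
	return newpage;
}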
#ifdef CONFIG_SWAP
/*
* called after __delete_from_swap_cache() and drop "page" account.
@@ -3216,14 +3160,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
* page belongs to.
*/
int mem_cgroup_prepare_migration(struct page *page,
- struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
+ struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
enum charge_type ctype;
int ret = 0;
- *ptr = NULL;
+ *memcgp = NULL;
VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
@@ -3232,8 +3176,8 @@ int mem_cgroup_prepare_migration(struct page *page,
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
- mem = pc->mem_cgroup;
- css_get(&mem->css);
+ memcg = pc->mem_cgroup;
+ css_get(&memcg->css);
/*
* At migrating an anonymous page, its mapcount goes down
* to 0 and uncharge() will be called. But, even if it's fully
@@ -3271,13 +3215,13 @@ int mem_cgroup_prepare_migration(struct page *page,
* If the page is not charged at this point,
* we return here.
*/
- if (!mem)
+ if (!memcg)
return 0;
- *ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
- css_put(&mem->css);/* drop extra refcnt */
- if (ret || *ptr == NULL) {
+ *memcgp = memcg;
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
+ css_put(&memcg->css);/* drop extra refcnt */
+ if (ret) {
if (PageAnon(page)) {
lock_page_cgroup(pc);
ClearPageCgroupMigration(pc);
@@ -3287,6 +3231,7 @@ int mem_cgroup_prepare_migration(struct page *page,
*/
mem_cgroup_uncharge_page(page);
}
+ /* we'll need to revisit this error code (we have -EINTR) */
return -ENOMEM;
}
/*
@@ -3302,21 +3247,21 @@ int mem_cgroup_prepare_migration(struct page *page,
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
+ __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
return ret;
}
/* remove redundant charge if migration failed*/
-void mem_cgroup_end_migration(struct mem_cgroup *mem,
+void mem_cgroup_end_migration(struct mem_cgroup *memcg,
struct page *oldpage, struct page *newpage, bool migration_ok)
{
struct page *used, *unused;
struct page_cgroup *pc;
- if (!mem)
+ if (!memcg)
return;
/* blocks rmdir() */
- cgroup_exclude_rmdir(&mem->css);
+ cgroup_exclude_rmdir(&memcg->css);
if (!migration_ok) {
used = oldpage;
unused = newpage;
@@ -3352,7 +3297,41 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
* So, rmdir()->pre_destroy() can be called while we do this charge.
* In that case, we need to call pre_destroy() again. check it here.
*/
- cgroup_release_and_wakeup_rmdir(&mem->css);
+ cgroup_release_and_wakeup_rmdir(&memcg->css);
+}
+
+/*
+ * When replacing a page-cache page, the newpage is not under any memcg
+ * but it is already on the LRU. So this function doesn't touch res_counter
+ * but it does handle the LRU in the correct way. Both pages are locked, so
+ * we cannot race with uncharge.
+ */
+void mem_cgroup_replace_page_cache(struct page *oldpage,
+ struct page *newpage)
+{
+ struct mem_cgroup *memcg;
+ struct page_cgroup *pc;
+ enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ pc = lookup_page_cgroup(oldpage);
+ /* fix accounting on old pages */
+ lock_page_cgroup(pc);
+ memcg = pc->mem_cgroup;
+ mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
+ ClearPageCgroupUsed(pc);
+ unlock_page_cgroup(pc);
+
+ if (PageSwapBacked(oldpage))
+ type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+
+ /*
+ * Even if newpage->mapping was NULL before starting replacement,
+ * the newpage may already be on the LRU (or on a pagevec bound for the
+ * LRU). We take the LRU lock while we overwrite pc->mem_cgroup.
+ */
+ __mem_cgroup_commit_charge_lrucare(newpage, memcg, type);
}
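Per the diffstat, mm/filemap.c changes in the same series; the expected caller is the page-cache replacement path, which swaps the radix-tree slot while both pages are locked and then lets this hook fix up the memcg statistics and LRU state. A condensed sketch of that caller shape (the details are simplified assumptions, not the actual filemap.c hunk):

/* Sketch of the assumed caller; the radix-tree manipulation is elided. */
static void replace_in_page_cache_sketch(struct page *old, struct page *new)
{
	VM_BUG_ON(!PageLocked(old));
	VM_BUG_ON(!PageLocked(new));

	/* ... @old's slot in the mapping's radix tree is replaced by @new ... */

	/* move the accounting: uncharge @old's stats, commit @new under lru_lock */
	mem_cgroup_replace_page_cache(old, new);
}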
#ifdef CONFIG_DEBUG_VM
@@ -3361,6 +3340,11 @@ static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
+ /*
+ * Can be NULL while feeding pages into the page allocator for
+ * the first time, i.e. during boot or memory hotplug;
+ * or when mem_cgroup_disabled().
+ */
if (likely(pc) && PageCgroupUsed(pc))
return pc;
return NULL;
@@ -3380,23 +3364,8 @@ void mem_cgroup_print_bad_page(struct page *page)
pc = lookup_page_cgroup_used(page);
if (pc) {
- int ret = -1;
- char *path;
-
- printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
+ printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
pc, pc->flags, pc->mem_cgroup);
-
- path = kmalloc(PATH_MAX, GFP_KERNEL);
- if (path) {
- rcu_read_lock();
- ret = cgroup_path(pc->mem_cgroup->css.cgroup,
- path, PATH_MAX);
- rcu_read_unlock();
- }
-
- printk(KERN_CONT "(%s)\n",
- (ret < 0) ? "cannot get the path" : path);
- kfree(path);
}
}
#endif
@@ -3431,7 +3400,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
- * We have to guarantee mem->res.limit < mem->memsw.limit.
+ * We have to guarantee memcg->res.limit < memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3457,9 +3426,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
- MEM_CGROUP_RECLAIM_SHRINK,
- NULL);
+ mem_cgroup_reclaim(memcg, GFP_KERNEL,
+ MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -3493,7 +3461,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
- * We have to guarantee mem->res.limit < mem->memsw.limit.
+ * We have to guarantee memcg->res.limit < memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3517,10 +3485,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
- MEM_CGROUP_RECLAIM_NOSWAP |
- MEM_CGROUP_RECLAIM_SHRINK,
- NULL);
+ mem_cgroup_reclaim(memcg, GFP_KERNEL,
+ MEM_CGROUP_RECLAIM_NOSWAP |
+ MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -3563,10 +3530,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break;
nr_scanned = 0;
- reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
- gfp_mask,
- MEM_CGROUP_RECLAIM_SOFT,
- &nr_scanned);
+ reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
+ gfp_mask, &nr_scanned);
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
spin_lock(&mctz->lock);
@@ -3631,25 +3596,26 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
* This routine traverses the page_cgroups on the given list and drops them all.
* *And* this routine doesn't reclaim the pages themselves, it just removes the page_cgroups.
*/
-static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
+static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
int node, int zid, enum lru_list lru)
{
- struct zone *zone;
struct mem_cgroup_per_zone *mz;
- struct page_cgroup *pc, *busy;
unsigned long flags, loop;
struct list_head *list;
+ struct page *busy;
+ struct zone *zone;
int ret = 0;
zone = &NODE_DATA(node)->node_zones[zid];
- mz = mem_cgroup_zoneinfo(mem, node, zid);
- list = &mz->lists[lru];
+ mz = mem_cgroup_zoneinfo(memcg, node, zid);
+ list = &mz->lruvec.lists[lru];
loop = MEM_CGROUP_ZSTAT(mz, lru);
/* give some margin against EBUSY etc...*/
loop += 256;
busy = NULL;
while (loop--) {
+ struct page_cgroup *pc;
struct page *page;
ret = 0;
@@ -3658,24 +3624,24 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
spin_unlock_irqrestore(&zone->lru_lock, flags);
break;
}
- pc = list_entry(list->prev, struct page_cgroup, lru);
- if (busy == pc) {
- list_move(&pc->lru, list);
+ page = list_entry(list->prev, struct page, lru);
+ if (busy == page) {
+ list_move(&page->lru, list);
busy = NULL;
spin_unlock_irqrestore(&zone->lru_lock, flags);
continue;
}
spin_unlock_irqrestore(&zone->lru_lock, flags);
- page = lookup_cgroup_page(pc);
+ pc = lookup_page_cgroup(page);
- ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
- if (ret == -ENOMEM)
+ ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
+ if (ret == -ENOMEM || ret == -EINTR)
break;
if (ret == -EBUSY || ret == -EINVAL) {
/* found lock contention or "pc" is obsolete. */
- busy = pc;
+ busy = page;
cond_resched();
} else
busy = NULL;
@@ -3690,14 +3656,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
* make mem_cgroup's charge to be 0 if there is no task.
* This enables deleting this mem_cgroup.
*/
-static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
{
int ret;
int node, zid, shrink;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct cgroup *cgrp = mem->css.cgroup;
+ struct cgroup *cgrp = memcg->css.cgroup;
- css_get(&mem->css);
+ css_get(&memcg->css);
shrink = 0;
/* should free all ? */
@@ -3713,14 +3679,14 @@ move_account:
goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
- drain_all_stock_sync(mem);
+ drain_all_stock_sync(memcg);
ret = 0;
- mem_cgroup_start_move(mem);
+ mem_cgroup_start_move(memcg);
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
enum lru_list l;
for_each_lru(l) {
- ret = mem_cgroup_force_empty_list(mem,
+ ret = mem_cgroup_force_empty_list(memcg,
node, zid, l);
if (ret)
break;
@@ -3729,16 +3695,16 @@ move_account:
if (ret)
break;
}
- mem_cgroup_end_move(mem);
- memcg_oom_recover(mem);
+ mem_cgroup_end_move(memcg);
+ memcg_oom_recover(memcg);
/* it seems parent cgroup doesn't have enough mem */
if (ret == -ENOMEM)
goto try_to_free;
cond_resched();
/* "ret" should also be checked to ensure all lists are empty. */
- } while (mem->res.usage > 0 || ret);
+ } while (memcg->res.usage > 0 || ret);
out:
- css_put(&mem->css);
+ css_put(&memcg->css);
return ret;
try_to_free:
@@ -3751,14 +3717,14 @@ try_to_free:
lru_add_drain_all();
/* try to free all pages in this cgroup */
shrink = 1;
- while (nr_retries && mem->res.usage > 0) {
+ while (nr_retries && memcg->res.usage > 0) {
int progress;
if (signal_pending(current)) {
ret = -EINTR;
goto out;
}
- progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
+ progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
false);
if (!progress) {
nr_retries--;
@@ -3787,12 +3753,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
u64 val)
{
int retval = 0;
- struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
struct cgroup *parent = cont->parent;
- struct mem_cgroup *parent_mem = NULL;
+ struct mem_cgroup *parent_memcg = NULL;
if (parent)
- parent_mem = mem_cgroup_from_cont(parent);
+ parent_memcg = mem_cgroup_from_cont(parent);
cgroup_lock();
/*
@@ -3803,10 +3769,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
* For the root cgroup, parent_mem is NULL, we allow value to be
* set if there are no children.
*/
- if ((!parent_mem || !parent_mem->use_hierarchy) &&
+ if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
(val == 1 || val == 0)) {
if (list_empty(&cont->children))
- mem->use_hierarchy = val;
+ memcg->use_hierarchy = val;
else
retval = -EBUSY;
} else
@@ -3817,14 +3783,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
}
-static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
struct mem_cgroup *iter;
long val = 0;
/* Per-cpu values can be negative, use a signed accumulator */
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
val += mem_cgroup_read_stat(iter, idx);
if (val < 0) /* race ? */
@@ -3832,29 +3798,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
return val;
}
-static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
u64 val;
- if (!mem_cgroup_is_root(mem)) {
+ if (!mem_cgroup_is_root(memcg)) {
if (!swap)
- return res_counter_read_u64(&mem->res, RES_USAGE);
+ return res_counter_read_u64(&memcg->res, RES_USAGE);
else
- return res_counter_read_u64(&mem->memsw, RES_USAGE);
+ return res_counter_read_u64(&memcg->memsw, RES_USAGE);
}
- val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
- val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
+ val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
if (swap)
- val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
return val << PAGE_SHIFT;
}
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
u64 val;
int type, name;
@@ -3863,15 +3829,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
switch (type) {
case _MEM:
if (name == RES_USAGE)
- val = mem_cgroup_usage(mem, false);
+ val = mem_cgroup_usage(memcg, false);
else
- val = res_counter_read_u64(&mem->res, name);
+ val = res_counter_read_u64(&memcg->res, name);
break;
case _MEMSWAP:
if (name == RES_USAGE)
- val = mem_cgroup_usage(mem, true);
+ val = mem_cgroup_usage(memcg, true);
else
- val = res_counter_read_u64(&mem->memsw, name);
+ val = res_counter_read_u64(&memcg->memsw, name);
break;
default:
BUG();
@@ -3959,24 +3925,24 @@ out:
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
int type, name;
- mem = mem_cgroup_from_cont(cont);
+ memcg = mem_cgroup_from_cont(cont);
type = MEMFILE_TYPE(event);
name = MEMFILE_ATTR(event);
switch (name) {
case RES_MAX_USAGE:
if (type == _MEM)
- res_counter_reset_max(&mem->res);
+ res_counter_reset_max(&memcg->res);
else
- res_counter_reset_max(&mem->memsw);
+ res_counter_reset_max(&memcg->memsw);
break;
case RES_FAILCNT:
if (type == _MEM)
- res_counter_reset_failcnt(&mem->res);
+ res_counter_reset_failcnt(&memcg->res);
else
- res_counter_reset_failcnt(&mem->memsw);
+ res_counter_reset_failcnt(&memcg->memsw);
break;
}
@@ -3993,7 +3959,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
if (val >= (1 << NR_MOVE_TYPE))
return -EINVAL;
@@ -4003,7 +3969,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
* inconsistent.
*/
cgroup_lock();
- mem->move_charge_at_immigrate = val;
+ memcg->move_charge_at_immigrate = val;
cgroup_unlock();
return 0;
@@ -4060,49 +4026,49 @@ struct {
static void
-mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
+mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
{
s64 val;
/* per cpu stat */
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
+ val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
s->stat[MCS_CACHE] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
+ val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
+ val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
s->stat[MCS_PGPGOUT] += val;
if (do_swap_account) {
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
+ val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
- val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
s->stat[MCS_PGFAULT] += val;
- val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
s->stat[MCS_PGMAJFAULT] += val;
/* per zone stat */
- val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
+ val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
+ val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
+ val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
+ val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
- val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
+ val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
}
static void
-mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
+mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
{
struct mem_cgroup *iter;
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
mem_cgroup_get_local_stat(iter, s);
}
@@ -4188,8 +4154,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
}
#ifdef CONFIG_DEBUG_VM
- cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
-
{
int nid, zid;
struct mem_cgroup_per_zone *mz;
@@ -4326,20 +4290,20 @@ static int compare_thresholds(const void *a, const void *b)
return _a->threshold - _b->threshold;
}
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
struct mem_cgroup_eventfd_list *ev;
- list_for_each_entry(ev, &mem->oom_notify, list)
+ list_for_each_entry(ev, &memcg->oom_notify, list)
eventfd_signal(ev->eventfd, 1);
return 0;
}
-static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
- for_each_mem_cgroup_tree(iter, mem)
+ for_each_mem_cgroup_tree(iter, memcg)
mem_cgroup_oom_notify_cb(iter);
}
@@ -4529,7 +4493,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *ev, *tmp;
int type = MEMFILE_TYPE(cft->private);
@@ -4537,7 +4501,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
spin_lock(&memcg_oom_lock);
- list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
+ list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
if (ev->eventfd == eventfd) {
list_del(&ev->list);
kfree(ev);
@@ -4550,11 +4514,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
struct cftype *cft, struct cgroup_map_cb *cb)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
- cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
+ cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
- if (atomic_read(&mem->under_oom))
+ if (atomic_read(&memcg->under_oom))
cb->fill(cb, "under_oom", 1);
else
cb->fill(cb, "under_oom", 0);
@@ -4564,7 +4528,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup *parent;
/* cannot set to root cgroup and only 0 and 1 are allowed */
@@ -4576,13 +4540,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
cgroup_lock();
/* oom-kill-disable is a flag for subhierarchy. */
if ((parent->use_hierarchy) ||
- (mem->use_hierarchy && !list_empty(&cgrp->children))) {
+ (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
cgroup_unlock();
return -EINVAL;
}
- mem->oom_kill_disable = val;
+ memcg->oom_kill_disable = val;
if (!val)
- memcg_oom_recover(mem);
+ memcg_oom_recover(memcg);
cgroup_unlock();
return 0;
}
@@ -4603,6 +4567,36 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
}
#endif /* CONFIG_NUMA */
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+ /*
+ * Part of this would be better living in a separate allocation
+ * function, leaving us with just the cgroup tree population work.
+ * We, however, depend on state such as network's proto_list that
+ * is only initialized after cgroup creation. I found that the least
+ * cumbersome way to deal with this is to defer it all to populate time.
+ */
+ return mem_cgroup_sockets_init(cont, ss);
+};
+
+static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ mem_cgroup_sockets_destroy(cont, ss);
+}
+#else
+static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+ return 0;
+}
+
+static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+}
+#endif
+
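The #else stubs above exist so that the populate and destroy paths later in this file can call register_kmem_files() and kmem_cgroup_destroy() unconditionally; with CONFIG_CGROUP_MEM_RES_CTLR_KMEM off, the calls compile down to nothing. The same pattern in miniature, with hypothetical names:

#ifdef CONFIG_FEATURE_X
int feature_x_init(struct cgroup *cont, struct cgroup_subsys *ss);
#else
/* no-op stub: callers need no #ifdef of their own */
static inline int feature_x_init(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
#endif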
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -4718,7 +4712,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
}
#endif
-static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup_per_zone *mz;
@@ -4738,21 +4732,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
if (!pn)
return 1;
- mem->info.nodeinfo[node] = pn;
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
- INIT_LIST_HEAD(&mz->lists[l]);
+ INIT_LIST_HEAD(&mz->lruvec.lists[l]);
mz->usage_in_excess = 0;
mz->on_tree = false;
- mz->mem = mem;
+ mz->mem = memcg;
}
+ memcg->info.nodeinfo[node] = pn;
return 0;
}
-static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
- kfree(mem->info.nodeinfo[node]);
+ kfree(memcg->info.nodeinfo[node]);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -4794,52 +4788,53 @@ out_free:
* Removal of cgroup itself succeeds regardless of refs from swap.
*/
-static void __mem_cgroup_free(struct mem_cgroup *mem)
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- mem_cgroup_remove_from_trees(mem);
- free_css_id(&mem_cgroup_subsys, &mem->css);
+ mem_cgroup_remove_from_trees(memcg);
+ free_css_id(&mem_cgroup_subsys, &memcg->css);
- for_each_node_state(node, N_POSSIBLE)
- free_mem_cgroup_per_zone_info(mem, node);
+ for_each_node(node)
+ free_mem_cgroup_per_zone_info(memcg, node);
- free_percpu(mem->stat);
+ free_percpu(memcg->stat);
if (sizeof(struct mem_cgroup) < PAGE_SIZE)
- kfree(mem);
+ kfree(memcg);
else
- vfree(mem);
+ vfree(memcg);
}
-static void mem_cgroup_get(struct mem_cgroup *mem)
+static void mem_cgroup_get(struct mem_cgroup *memcg)
{
- atomic_inc(&mem->refcnt);
+ atomic_inc(&memcg->refcnt);
}
-static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
+static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
{
- if (atomic_sub_and_test(count, &mem->refcnt)) {
- struct mem_cgroup *parent = parent_mem_cgroup(mem);
- __mem_cgroup_free(mem);
+ if (atomic_sub_and_test(count, &memcg->refcnt)) {
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ __mem_cgroup_free(memcg);
if (parent)
mem_cgroup_put(parent);
}
}
-static void mem_cgroup_put(struct mem_cgroup *mem)
+static void mem_cgroup_put(struct mem_cgroup *memcg)
{
- __mem_cgroup_put(mem, 1);
+ __mem_cgroup_put(memcg, 1);
}
/*
* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
*/
-static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
+struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
- if (!mem->res.parent)
+ if (!memcg->res.parent)
return NULL;
- return mem_cgroup_from_res_counter(mem->res.parent, res);
+ return mem_cgroup_from_res_counter(memcg->res.parent, res);
}
+EXPORT_SYMBOL(parent_mem_cgroup);
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static void __init enable_swap_cgroup(void)
@@ -4859,13 +4854,13 @@ static int mem_cgroup_soft_limit_tree_init(void)
struct mem_cgroup_tree_per_zone *rtpz;
int tmp, node, zone;
- for_each_node_state(node, N_POSSIBLE) {
+ for_each_node(node) {
tmp = node;
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
if (!rtpn)
- return 1;
+ goto err_cleanup;
soft_limit_tree.rb_tree_per_node[node] = rtpn;
@@ -4876,21 +4871,31 @@ static int mem_cgroup_soft_limit_tree_init(void)
}
}
return 0;
+
+err_cleanup:
+ for_each_node(node) {
+ if (!soft_limit_tree.rb_tree_per_node[node])
+ break;
+ kfree(soft_limit_tree.rb_tree_per_node[node]);
+ soft_limit_tree.rb_tree_per_node[node] = NULL;
+ }
+ return 1;
+
}
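The new err_cleanup label replaces the old bare "return 1" so that a half-built soft-limit tree is torn down rather than leaked: allocation stops at the first node whose rtpn could not be allocated, and the cleanup loop frees everything allocated before it, stopping at the first NULL slot. The same partial-allocation rollback pattern in isolation (hypothetical table, sketch only):

/* Sketch: allocate one entry per node, undoing all of them on failure. */
static void *table[MAX_NUMNODES];

static int alloc_per_node_table(void)
{
	int node;

	for_each_node(node) {
		table[node] = kzalloc(sizeof(long), GFP_KERNEL);
		if (!table[node])
			goto err_cleanup;
	}
	return 0;

err_cleanup:
	for_each_node(node) {
		if (!table[node])
			break;		/* nothing was allocated past here */
		kfree(table[node]);
		table[node] = NULL;
	}
	return 1;
}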
static struct cgroup_subsys_state * __ref
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
- struct mem_cgroup *mem, *parent;
+ struct mem_cgroup *memcg, *parent;
long error = -ENOMEM;
int node;
- mem = mem_cgroup_alloc();
- if (!mem)
+ memcg = mem_cgroup_alloc();
+ if (!memcg)
return ERR_PTR(error);
- for_each_node_state(node, N_POSSIBLE)
- if (alloc_mem_cgroup_per_zone_info(mem, node))
+ for_each_node(node)
+ if (alloc_mem_cgroup_per_zone_info(memcg, node))
goto free_out;
/* root ? */
@@ -4898,9 +4903,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
int cpu;
enable_swap_cgroup();
parent = NULL;
- root_mem_cgroup = mem;
if (mem_cgroup_soft_limit_tree_init())
goto free_out;
+ root_mem_cgroup = memcg;
for_each_possible_cpu(cpu) {
struct memcg_stock_pcp *stock =
&per_cpu(memcg_stock, cpu);
@@ -4909,13 +4914,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
} else {
parent = mem_cgroup_from_cont(cont->parent);
- mem->use_hierarchy = parent->use_hierarchy;
- mem->oom_kill_disable = parent->oom_kill_disable;
+ memcg->use_hierarchy = parent->use_hierarchy;
+ memcg->oom_kill_disable = parent->oom_kill_disable;
}
if (parent && parent->use_hierarchy) {
- res_counter_init(&mem->res, &parent->res);
- res_counter_init(&mem->memsw, &parent->memsw);
+ res_counter_init(&memcg->res, &parent->res);
+ res_counter_init(&memcg->memsw, &parent->memsw);
/*
* We increment refcnt of the parent to ensure that we can
* safely access it on res_counter_charge/uncharge.
@@ -4924,39 +4929,39 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
*/
mem_cgroup_get(parent);
} else {
- res_counter_init(&mem->res, NULL);
- res_counter_init(&mem->memsw, NULL);
+ res_counter_init(&memcg->res, NULL);
+ res_counter_init(&memcg->memsw, NULL);
}
- mem->last_scanned_child = 0;
- mem->last_scanned_node = MAX_NUMNODES;
- INIT_LIST_HEAD(&mem->oom_notify);
+ memcg->last_scanned_node = MAX_NUMNODES;
+ INIT_LIST_HEAD(&memcg->oom_notify);
if (parent)
- mem->swappiness = mem_cgroup_swappiness(parent);
- atomic_set(&mem->refcnt, 1);
- mem->move_charge_at_immigrate = 0;
- mutex_init(&mem->thresholds_lock);
- return &mem->css;
+ memcg->swappiness = mem_cgroup_swappiness(parent);
+ atomic_set(&memcg->refcnt, 1);
+ memcg->move_charge_at_immigrate = 0;
+ mutex_init(&memcg->thresholds_lock);
+ return &memcg->css;
free_out:
- __mem_cgroup_free(mem);
- root_mem_cgroup = NULL;
+ __mem_cgroup_free(memcg);
return ERR_PTR(error);
}
static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- return mem_cgroup_force_empty(mem, false);
+ return mem_cgroup_force_empty(memcg, false);
}
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
- struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+
+ kmem_cgroup_destroy(ss, cont);
- mem_cgroup_put(mem);
+ mem_cgroup_put(memcg);
}
static int mem_cgroup_populate(struct cgroup_subsys *ss,
@@ -4969,6 +4974,10 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
if (!ret)
ret = register_memsw_files(cont, ss);
+
+ if (!ret)
+ ret = register_kmem_files(cont, ss);
+
return ret;
}
@@ -4979,9 +4988,9 @@ static int mem_cgroup_do_precharge(unsigned long count)
{
int ret = 0;
int batch_count = PRECHARGE_COUNT_AT_ONCE;
- struct mem_cgroup *mem = mc.to;
+ struct mem_cgroup *memcg = mc.to;
- if (mem_cgroup_is_root(mem)) {
+ if (mem_cgroup_is_root(memcg)) {
mc.precharge += count;
/* we don't need css_get for root */
return ret;
@@ -4990,16 +4999,16 @@ static int mem_cgroup_do_precharge(unsigned long count)
if (count > 1) {
struct res_counter *dummy;
/*
- * "mem" cannot be under rmdir() because we've already checked
+ * "memcg" cannot be under rmdir() because we've already checked
* by cgroup_lock_live_cgroup() that it is not removed and we
* are still under the same cgroup_mutex. So we can postpone
* css_get().
*/
- if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
+ if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
goto one_by_one;
- if (do_swap_account && res_counter_charge(&mem->memsw,
+ if (do_swap_account && res_counter_charge(&memcg->memsw,
PAGE_SIZE * count, &dummy)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE * count);
+ res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
goto one_by_one;
}
mc.precharge += count;
@@ -5016,10 +5025,11 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
- if (ret || !mem)
+ ret = __mem_cgroup_try_charge(NULL,
+ GFP_KERNEL, 1, &memcg, false);
+ if (ret)
/* mem_cgroup_clear_mc() will do uncharge later */
- return -ENOMEM;
+ return ret;
mc.precharge++;
}
return ret;
@@ -5164,7 +5174,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && !ret &&
- css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
+ css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
@@ -5288,16 +5298,17 @@ static void mem_cgroup_clear_mc(void)
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p)
+ struct cgroup_taskset *tset)
{
+ struct task_struct *p = cgroup_taskset_first(tset);
int ret = 0;
- struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
- if (mem->move_charge_at_immigrate) {
+ if (memcg->move_charge_at_immigrate) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
- VM_BUG_ON(from == mem);
+ VM_BUG_ON(from == memcg);
mm = get_task_mm(p);
if (!mm)
@@ -5312,7 +5323,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
- mc.to = mem;
+ mc.to = memcg;
spin_unlock(&mc.lock);
/* We set mc.moving_task later */
@@ -5327,7 +5338,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p)
+ struct cgroup_taskset *tset)
{
mem_cgroup_clear_mc();
}
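The attach callbacks now take a struct cgroup_taskset rather than a single task; memcg only cares about the thread-group leader, so cgroup_taskset_first() is enough. A controller that needs to inspect every task in the set would walk it with cgroup_taskset_next() instead; a sketch of that shape (the rejection rule is an arbitrary example, and cgroup_taskset_next() is assumed from the same cgroup rework rather than shown in this diff):

/* Sketch: iterate the whole taskset instead of taking only the first task. */
static int can_attach_all_tasks_sketch(struct cgroup_taskset *tset)
{
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset)) {
		if (task->flags & PF_KTHREAD)	/* arbitrary example policy */
			return -EINVAL;
	}
	return 0;
}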
@@ -5444,9 +5455,9 @@ retry:
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
- struct cgroup *old_cont,
- struct task_struct *p)
+ struct cgroup_taskset *tset)
{
+ struct task_struct *p = cgroup_taskset_first(tset);
struct mm_struct *mm = get_task_mm(p);
if (mm) {
@@ -5461,19 +5472,18 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p)
+ struct cgroup_taskset *tset)
{
return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p)
+ struct cgroup_taskset *tset)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
- struct cgroup *old_cont,
- struct task_struct *p)
+ struct cgroup_taskset *tset)
{
}
#endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2b43ba051ac9..56080ea36140 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -42,6 +42,7 @@
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
+#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
@@ -1310,7 +1311,7 @@ int unpoison_memory(unsigned long pfn)
* to the end.
*/
if (PageHuge(page)) {
- pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
return 0;
}
if (TestClearPageHWPoison(p))
@@ -1419,7 +1420,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (PageHWPoison(hpage)) {
put_page(hpage);
- pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+ pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
return -EBUSY;
}
@@ -1433,8 +1434,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
list_for_each_entry_safe(page1, page2, &pagelist, lru)
put_page(page1);
- pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
- pfn, ret, page->flags);
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
return ret;
@@ -1505,7 +1506,7 @@ int soft_offline_page(struct page *page, int flags)
}
if (!PageLRU(page)) {
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
- pfn, page->flags);
+ pfn, page->flags);
return -EIO;
}
@@ -1556,7 +1557,7 @@ int soft_offline_page(struct page *page, int flags)
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- 0, true);
+ 0, MIGRATE_SYNC);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1566,7 +1567,7 @@ int soft_offline_page(struct page *page, int flags)
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
- pfn, ret, page_count(page), page->flags);
+ pfn, ret, page_count(page), page->flags);
}
if (ret)
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index a56e3ba816b2..5e30583c2605 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -47,7 +47,7 @@
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
@@ -293,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
struct mmu_gather_batch *batch;
- tlb->need_flush = 1;
+ VM_BUG_ON(!tlb->need_flush);
if (tlb_fast_mode(tlb)) {
free_page_and_swap_cache(page);
@@ -1231,7 +1231,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
if (next-addr != HPAGE_PMD_SIZE) {
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
split_huge_page_pmd(vma->vm_mm, pmd);
- } else if (zap_huge_pmd(tlb, vma, pmd))
+ } else if (zap_huge_pmd(tlb, vma, pmd, addr))
continue;
/* fall through */
}
@@ -1503,7 +1503,7 @@ split_fallthrough:
}
if (flags & FOLL_GET)
- get_page(page);
+ get_page_foll(page);
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6e7d8b21dbfa..6629fafd6ce4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -11,7 +11,7 @@
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
@@ -809,7 +809,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
}
/* this function returns # of failed pages */
ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
- true, true);
+ true, MIGRATE_SYNC);
if (ret)
putback_lru_pages(&source);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9c51f9f58cac..06b145fb64ab 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -75,7 +75,7 @@
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
@@ -111,7 +111,7 @@ enum zone_type policy_zone = 0;
/*
* run-time system-wide default policy => local allocation
*/
-struct mempolicy default_policy = {
+static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
.mode = MPOL_PREFERRED,
.flags = MPOL_F_LOCAL,
@@ -636,6 +636,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
+ pgoff_t pgoff;
unsigned long vmstart;
unsigned long vmend;
@@ -643,13 +644,21 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (!vma || vma->vm_start > start)
return -EFAULT;
+ if (start > vma->vm_start)
+ prev = vma;
+
for (; vma && vma->vm_start < end; prev = vma, vma = next) {
next = vma->vm_next;
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
+ if (mpol_equal(vma_policy(vma), new_pol))
+ continue;
+
+ pgoff = vma->vm_pgoff +
+ ((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
- vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma->anon_vma, vma->vm_file, pgoff,
new_pol);
if (prev) {
vma = prev;
@@ -933,7 +942,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
- false, true);
+ false, MIGRATE_SYNC);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1974,28 +1983,28 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
}
/* Slow path of a mempolicy comparison */
-int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
+bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
if (!a || !b)
- return 0;
+ return false;
if (a->mode != b->mode)
- return 0;
+ return false;
if (a->flags != b->flags)
- return 0;
+ return false;
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
- return 0;
+ return false;
switch (a->mode) {
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
- return nodes_equal(a->v.nodes, b->v.nodes);
+ return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
return a->v.preferred_node == b->v.preferred_node;
default:
BUG();
- return 0;
+ return false;
}
}
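For reference, the mbind_range() hunk earlier in this file stops passing the unsplit vma's vm_pgoff to vma_merge(): when the policy range starts inside a vma, the offset handed to the merge check has to be advanced by the number of pages between vma->vm_start and vmstart, otherwise merging against the neighbouring vma is decided on the wrong file offset. A standalone check of that arithmetic with made-up values:

/* Standalone illustration of the pgoff adjustment; the numbers are made up. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long vm_start = 0x700000000000UL;	/* vma->vm_start */
	unsigned long vm_pgoff = 0x100;			/* vma->vm_pgoff, in pages */
	unsigned long vmstart  = vm_start + 16 * 4096;	/* range begins 16 pages in */

	/* the expression added by the patch */
	unsigned long pgoff = vm_pgoff + ((vmstart - vm_start) >> PAGE_SHIFT);

	printf("pgoff for vma_merge(): %#lx\n", pgoff);	/* prints 0x110 */
	return 0;
}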
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a3bc3d4d554..d9049811f352 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -10,7 +10,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
@@ -27,7 +27,15 @@ static void *remove_element(mempool_t *pool)
return pool->elements[--pool->curr_nr];
}
-static void free_pool(mempool_t *pool)
+/**
+ * mempool_destroy - deallocate a memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ *
+ * Free all reserved elements in @pool and @pool itself. This function
+ * only sleeps if the free_fn() function sleeps.
+ */
+void mempool_destroy(mempool_t *pool)
{
while (pool->curr_nr) {
void *element = remove_element(pool);
@@ -36,6 +44,7 @@ static void free_pool(mempool_t *pool)
kfree(pool->elements);
kfree(pool);
}
+EXPORT_SYMBOL(mempool_destroy);
/**
* mempool_create - create a memory pool
@@ -86,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
element = pool->alloc(GFP_KERNEL, pool->pool_data);
if (unlikely(!element)) {
- free_pool(pool);
+ mempool_destroy(pool);
return NULL;
}
add_element(pool, element);
@@ -172,23 +181,6 @@ out:
EXPORT_SYMBOL(mempool_resize);
/**
- * mempool_destroy - deallocate a memory pool
- * @pool: pointer to the memory pool which was allocated via
- * mempool_create().
- *
- * this function only sleeps if the free_fn() function sleeps. The caller
- * has to guarantee that all elements have been returned to the pool (ie:
- * freed) prior to calling mempool_destroy().
- */
-void mempool_destroy(mempool_t *pool)
-{
- /* Check for outstanding elements */
- BUG_ON(pool->curr_nr != pool->min_nr);
- free_pool(pool);
-}
-EXPORT_SYMBOL(mempool_destroy);
-
-/**
* mempool_alloc - allocate an element from a specific memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
@@ -224,28 +216,40 @@ repeat_alloc:
if (likely(pool->curr_nr)) {
element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
+ /* paired with rmb in mempool_free(), read comment there */
+ smp_wmb();
return element;
}
- spin_unlock_irqrestore(&pool->lock, flags);
- /* We must not sleep in the GFP_ATOMIC case */
- if (!(gfp_mask & __GFP_WAIT))
+ /*
+ * We use gfp mask w/o __GFP_WAIT or IO for the first round. If
+ * alloc failed with that and @pool was empty, retry immediately.
+ */
+ if (gfp_temp != gfp_mask) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ gfp_temp = gfp_mask;
+ goto repeat_alloc;
+ }
+
+ /* We must not sleep if !__GFP_WAIT */
+ if (!(gfp_mask & __GFP_WAIT)) {
+ spin_unlock_irqrestore(&pool->lock, flags);
return NULL;
+ }
- /* Now start performing page reclaim */
- gfp_temp = gfp_mask;
+ /* Let's wait for someone else to return an element to @pool */
init_wait(&wait);
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
- smp_mb();
- if (!pool->curr_nr) {
- /*
- * FIXME: this should be io_schedule(). The timeout is there
- * as a workaround for some DM problems in 2.6.18.
- */
- io_schedule_timeout(5*HZ);
- }
- finish_wait(&pool->wait, &wait);
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ /*
+ * FIXME: this should be io_schedule(). The timeout is there as a
+ * workaround for some DM problems in 2.6.18.
+ */
+ io_schedule_timeout(5*HZ);
+
+ finish_wait(&pool->wait, &wait);
goto repeat_alloc;
}
EXPORT_SYMBOL(mempool_alloc);
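The reworked mempool_alloc() now tries the backing allocator with a trimmed gfp mask first, falls back to the reserved pool, retries once with the caller's full mask, and only then sleeps on pool->wait. A control-flow sketch of that ordering (locking, the wait-queue handling and the exact construction of gfp_temp are simplified assumptions; the initial masking is outside this hunk):

/* Control-flow sketch of the new allocation order; not the real function. */
static void *mempool_alloc_sketch(mempool_t *pool, gfp_t gfp_mask)
{
	gfp_t gfp_temp = gfp_mask & ~(__GFP_WAIT | __GFP_IO);	/* assumed */
	void *element;

	for (;;) {
		/* 1) try the underlying allocator, without sleeping or IO */
		element = pool->alloc(gfp_temp, pool->pool_data);
		if (element)
			return element;

		/* 2) fall back to the reserved elements (locking elided) */
		if (pool->curr_nr)
			return remove_element(pool);

		/* 3) pool empty: retry once with the caller's full mask */
		if (gfp_temp != gfp_mask) {
			gfp_temp = gfp_mask;
			continue;
		}

		/* 4) atomic callers give up here */
		if (!(gfp_mask & __GFP_WAIT))
			return NULL;

		/* 5) wait for mempool_free() to return an element, then retry */
		io_schedule_timeout(5 * HZ);
	}
}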
@@ -265,7 +269,39 @@ void mempool_free(void *element, mempool_t *pool)
if (unlikely(element == NULL))
return;
- smp_mb();
+ /*
+ * Paired with the wmb in mempool_alloc(). The preceding read is
+ * for @element and the following @pool->curr_nr. This ensures
+ * that the visible value of @pool->curr_nr is from after the
+ * allocation of @element. This is necessary for fringe cases
+ * where @element was passed to this task without going through
+ * barriers.
+ *
+ * For example, assume @p is %NULL at the beginning and one task
+ * performs "p = mempool_alloc(...);" while another task is doing
+ * "while (!p) cpu_relax(); mempool_free(p, ...);". This function
+ * may end up using curr_nr value which is from before allocation
+ * of @p without the following rmb.
+ */
+ smp_rmb();
+
+ /*
+ * For correctness, we need a test which is guaranteed to trigger
+ * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr
+ * without locking achieves that and refilling as soon as possible
+ * is desirable.
+ *
+ * Because curr_nr visible here is always a value after the
+ * allocation of @element, any task which decremented curr_nr below
+ * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
+ * incremented to min_nr afterwards. If curr_nr gets incremented
+ * to min_nr after the allocation of @element, the elements
+ * allocated after that are subject to the same guarantee.
+ *
+ * Waiters happen iff curr_nr is 0 and the above guarantee also
+ * ensures that there will be frees which return elements to the
+ * pool waking up the waiters.
+ */
if (pool->curr_nr < pool->min_nr) {
spin_lock_irqsave(&pool->lock, flags);
if (pool->curr_nr < pool->min_nr) {
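The point of the smp_wmb()/smp_rmb() pair added in this file is that the freeing side must not act on a pool->curr_nr value older than the allocation of the element it is freeing; otherwise the "curr_nr < min_nr implies someone will refill and wake waiters" reasoning in the comment above breaks. The pairing in isolation, reduced to the fringe case that comment describes (sketch only; shared variables stand in for the pool):

#define MIN_NR 4

static void *shared_p;			/* pointer handed between tasks unordered */
static int curr_nr = MIN_NR;

static void alloc_side(void *element)	/* models mempool_alloc() plus "p = ..." */
{
	curr_nr--;			/* remove_element() under pool->lock */
	smp_wmb();			/* pairs with the smp_rmb() below */
	shared_p = element;		/* publish without any further barrier */
}

static void free_side(void)		/* models "while (!p) cpu_relax(); mempool_free(p, ...)" */
{
	while (!shared_p)
		cpu_relax();
	smp_rmb();			/* curr_nr read below is not older than shared_p */
	if (curr_nr < MIN_NR)
		;			/* take pool->lock, refill, wake waiters */
}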
diff --git a/mm/migrate.c b/mm/migrate.c
index 14d0a6a632f6..9871a56d82c3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -13,7 +13,7 @@
*/
#include <linux/migrate.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
@@ -39,8 +39,6 @@
#include "internal.h"
-#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
-
/*
* migrate_prep() needs to be called before we start compiling a list of pages
* to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
@@ -181,8 +179,6 @@ static void remove_migration_ptes(struct page *old, struct page *new)
* Something used the pte of a page under migration. We need to
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
- *
- * This function is called from do_swap_page().
*/
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address)
@@ -220,6 +216,56 @@ out:
pte_unmap_unlock(ptep, ptl);
}
+#ifdef CONFIG_BLOCK
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ struct buffer_head *bh = head;
+
+ /* Simple case, sync compaction */
+ if (mode != MIGRATE_ASYNC) {
+ do {
+ get_bh(bh);
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return true;
+ }
+
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
+ do {
+ get_bh(bh);
+ if (!trylock_buffer(bh)) {
+ /*
+ * We failed to lock the buffer and cannot stall in
+ * async migration. Release the taken locks
+ */
+ struct buffer_head *failed_bh = bh;
+ put_bh(failed_bh);
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ put_bh(bh);
+ bh = bh->b_this_page;
+ }
+ return false;
+ }
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return true;
+}
+#else
+static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ return true;
+}
+#endif /* CONFIG_BLOCK */
+
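enum migrate_mode, used by buffer_migrate_lock_buffers() and by everything below, is defined elsewhere in this series (include/linux/migrate_mode.h); the definition is not part of this diff, but the mode checks here only make sense with the following three levels, so they are reproduced as an assumption:

/* Assumed definition from include/linux/migrate_mode.h (not in this diff):
 *
 * MIGRATE_ASYNC      - never block: trylock only, skip dirty/writeback pages
 * MIGRATE_SYNC_LIGHT - may block on most locks, but does not wait for
 *                      writeback and does not issue writeout itself
 * MIGRATE_SYNC       - full synchronous migration: may wait and write back
 */
enum migrate_mode {
	MIGRATE_ASYNC,
	MIGRATE_SYNC_LIGHT,
	MIGRATE_SYNC,
};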
/*
* Replace the page in the mapping.
*
@@ -229,7 +275,8 @@ out:
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
static int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ struct buffer_head *head, enum migrate_mode mode)
{
int expected_count;
void **pslot;
@@ -259,6 +306,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
/*
+ * In the async migration case of moving a page with buffers, lock the
+ * buffers using trylock before the mapping is moved. If the mapping
+ * were moved first and we then failed to lock the buffers, we could
+ * not move the mapping back due to the elevated page count and would
+ * have to block waiting on other references to be dropped.
+ */
+ if (mode == MIGRATE_ASYNC && head &&
+ !buffer_migrate_lock_buffers(head, mode)) {
+ page_unfreeze_refs(page, expected_count);
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ /*
* Now we know that no one else is looking at the page.
*/
get_page(newpage); /* add cache reference */
@@ -269,12 +330,12 @@ static int migrate_page_move_mapping(struct address_space *mapping,
radix_tree_replace_slot(pslot, newpage);
- page_unfreeze_refs(page, expected_count);
/*
- * Drop cache reference from old page.
+ * Drop cache reference from old page by unfreezing
+ * to one less reference.
* We know this isn't the last reference.
*/
- __put_page(page);
+ page_unfreeze_refs(page, expected_count - 1);
/*
* If moved to a different zone then also account
@@ -334,9 +395,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
radix_tree_replace_slot(pslot, newpage);
- page_unfreeze_refs(page, expected_count);
-
- __put_page(page);
+ page_unfreeze_refs(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
return 0;
@@ -415,13 +474,14 @@ EXPORT_SYMBOL(fail_migrate_page);
* Pages are locked upon entry and exit.
*/
int migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
int rc;
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page);
+ rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
if (rc)
return rc;
@@ -438,28 +498,28 @@ EXPORT_SYMBOL(migrate_page);
* exist.
*/
int buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page, enum migrate_mode mode)
{
struct buffer_head *bh, *head;
int rc;
if (!page_has_buffers(page))
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
head = page_buffers(page);
- rc = migrate_page_move_mapping(mapping, newpage, page);
+ rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
if (rc)
return rc;
- bh = head;
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
-
- } while (bh != head);
+ /*
+ * In the async case, migrate_page_move_mapping locked the buffers
+ * with an IRQ-safe spinlock held. In the sync case, the buffers
+ * need to be locked now
+ */
+ if (mode != MIGRATE_ASYNC)
+ BUG_ON(!buffer_migrate_lock_buffers(head, mode));
ClearPagePrivate(page);
set_page_private(newpage, page_private(page));
@@ -536,10 +596,14 @@ static int writeout(struct address_space *mapping, struct page *page)
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page, enum migrate_mode mode)
{
- if (PageDirty(page))
+ if (PageDirty(page)) {
+ /* Only writeback pages in full synchronous migration */
+ if (mode != MIGRATE_SYNC)
+ return -EBUSY;
return writeout(mapping, page);
+ }
/*
* Buffers may be managed in a filesystem specific way.
@@ -549,7 +613,7 @@ static int fallback_migrate_page(struct address_space *mapping,
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
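Every address_space_operations ->migratepage implementation now takes the enum migrate_mode argument; migrate_page() and buffer_migrate_page() above show the new prototype. As a sketch, a filesystem with no private page metadata would be wired up like this after the conversion (the names are illustrative, not taken from any particular filesystem):

/* Illustrative wiring only; a real filesystem fills in many more aops. */
static int examplefs_migratepage(struct address_space *mapping,
				 struct page *newpage, struct page *page,
				 enum migrate_mode mode)
{
	/* nothing fs-specific to move: defer to the generic helper */
	return migrate_page(mapping, newpage, page, mode);
}

static const struct address_space_operations examplefs_aops = {
	.migratepage	= examplefs_migratepage,
};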
/*
@@ -564,7 +628,7 @@ static int fallback_migrate_page(struct address_space *mapping,
* == 0 - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
- int remap_swapcache, bool sync)
+ int remap_swapcache, enum migrate_mode mode)
{
struct address_space *mapping;
int rc;
@@ -585,29 +649,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
mapping = page_mapping(page);
if (!mapping)
- rc = migrate_page(mapping, newpage, page);
- else {
+ rc = migrate_page(mapping, newpage, page, mode);
+ else if (mapping->a_ops->migratepage)
/*
- * Do not writeback pages if !sync and migratepage is
- * not pointing to migrate_page() which is nonblocking
- * (swapcache/tmpfs uses migratepage = migrate_page).
+ * Most pages have a mapping and most filesystems provide a
+ * migratepage callback. Anonymous pages are part of swap
+ * space which also has its own migratepage callback. This
+ * is the most common path for page migration.
*/
- if (PageDirty(page) && !sync &&
- mapping->a_ops->migratepage != migrate_page)
- rc = -EBUSY;
- else if (mapping->a_ops->migratepage)
- /*
- * Most pages have a mapping and most filesystems
- * should provide a migration function. Anonymous
- * pages are part of swap space which also has its
- * own migration function. This is the most common
- * path for page migration.
- */
- rc = mapping->a_ops->migratepage(mapping,
- newpage, page);
- else
- rc = fallback_migrate_page(mapping, newpage, page);
- }
+ rc = mapping->a_ops->migratepage(mapping,
+ newpage, page, mode);
+ else
+ rc = fallback_migrate_page(mapping, newpage, page, mode);
if (rc) {
newpage->mapping = NULL;
@@ -621,38 +674,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
return rc;
}
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, bool offlining, bool sync)
+static int __unmap_and_move(struct page *page, struct page *newpage,
+ int force, bool offlining, enum migrate_mode mode)
{
- int rc = 0;
- int *result = NULL;
- struct page *newpage = get_new_page(page, private, &result);
+ int rc = -EAGAIN;
int remap_swapcache = 1;
int charge = 0;
struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;
- if (!newpage)
- return -ENOMEM;
-
- if (page_count(page) == 1) {
- /* page was freed from under us. So we are done. */
- goto move_newpage;
- }
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page)))
- goto move_newpage;
-
- /* prepare cgroup just returns 0 or -ENOMEM */
- rc = -EAGAIN;
-
if (!trylock_page(page)) {
- if (!force || !sync)
- goto move_newpage;
+ if (!force || mode == MIGRATE_ASYNC)
+ goto out;
/*
* It's not safe for direct compaction to call lock_page.
@@ -668,7 +701,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* altogether.
*/
if (current->flags & PF_MEMALLOC)
- goto move_newpage;
+ goto out;
lock_page(page);
}
@@ -697,10 +730,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
if (PageWriteback(page)) {
/*
- * For !sync, there is no point retrying as the retry loop
- * is expected to be too short for PageWriteback to be cleared
+	 * Only in the case of a full synchronous migration is it
+ * necessary to wait for PageWriteback. In the async case,
+ * the retry loop is too short and in the sync-light case,
+ * the overhead of stalling is too much
*/
- if (!sync) {
+ if (mode != MIGRATE_SYNC) {
rc = -EBUSY;
goto uncharge;
}
@@ -771,7 +806,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, remap_swapcache, sync);
+ rc = move_to_new_page(newpage, page, remap_swapcache, mode);
if (rc && remap_swapcache)
remove_migration_ptes(page, page);
@@ -785,27 +820,55 @@ uncharge:
mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
+out:
+ return rc;
+}
+
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+ struct page *page, int force, bool offlining,
+ enum migrate_mode mode)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *newpage = get_new_page(page, private, &result);
+
+ if (!newpage)
+ return -ENOMEM;
+
+ mem_cgroup_reset_owner(newpage);
+
+ if (page_count(page) == 1) {
+ /* page was freed from under us. So we are done. */
+ goto out;
+ }
-move_newpage:
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page)))
+ goto out;
+
+ rc = __unmap_and_move(page, newpage, force, offlining, mode);
+out:
if (rc != -EAGAIN) {
- /*
- * A page that has been migrated has all references
- * removed and will be freed. A page that has not been
- * migrated will have kepts its references and be
- * restored.
- */
- list_del(&page->lru);
+ /*
+ * A page that has been migrated has all references
+ * removed and will be freed. A page that has not been
+	 * migrated will have kept its references and be
+ * restored.
+ */
+ list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
putback_lru_page(page);
}
-
/*
* Move the new page to the LRU. If migration was not successful
* then this will free the page.
*/
putback_lru_page(newpage);
-
if (result) {
if (rc)
*result = rc;
@@ -835,7 +898,8 @@ move_newpage:
*/
static int unmap_and_move_huge_page(new_page_t get_new_page,
unsigned long private, struct page *hpage,
- int force, bool offlining, bool sync)
+ int force, bool offlining,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -848,7 +912,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
rc = -EAGAIN;
if (!trylock_page(hpage)) {
- if (!force || !sync)
+ if (!force || mode != MIGRATE_SYNC)
goto out;
lock_page(hpage);
}
@@ -859,16 +923,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, 1, sync);
+ rc = move_to_new_page(new_hpage, hpage, 1, mode);
if (rc)
remove_migration_ptes(hpage, hpage);
if (anon_vma)
put_anon_vma(anon_vma);
-out:
unlock_page(hpage);
+out:
if (rc != -EAGAIN) {
list_del(&hpage->lru);
put_page(hpage);
@@ -902,7 +966,7 @@ out:
*/
int migrate_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- bool sync)
+ enum migrate_mode mode)
{
int retry = 1;
int nr_failed = 0;
@@ -923,7 +987,7 @@ int migrate_pages(struct list_head *from,
rc = unmap_and_move(get_new_page, private,
page, pass > 2, offlining,
- sync);
+ mode);
switch(rc) {
case -ENOMEM:
@@ -953,7 +1017,7 @@ out:
int migrate_huge_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- bool sync)
+ enum migrate_mode mode)
{
int retry = 1;
int nr_failed = 0;
@@ -970,7 +1034,7 @@ int migrate_huge_pages(struct list_head *from,
rc = unmap_and_move_huge_page(get_new_page,
private, page, pass > 2, offlining,
- sync);
+ mode);
switch(rc) {
case -ENOMEM:
@@ -1099,7 +1163,7 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0, true);
+ (unsigned long)pm, 0, MIGRATE_SYNC);
if (err)
putback_lru_pages(&pagelist);
}
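
For orientation, the bool "sync" flag replaced throughout this file becomes a
three-valued migrate_mode, and the hunks above gate progressively more blocking
work on it. Below is a hedged, userspace-only sketch of that decision table;
the enum mirrors the one introduced elsewhere in this series, and none of this
is kernel code.

/*
 * Sketch (assumption: enum values as introduced by this series).
 * Summarizes which blocking operations each migrate mode permits,
 * as seen in the hunks above.
 */
#include <stdbool.h>
#include <stdio.h>

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

/* May we block in lock_page() when trylock fails (and force is set)? */
static bool may_block_on_page_lock(enum migrate_mode mode)
{
	return mode != MIGRATE_ASYNC;
}

/* May we wait for PageWriteback to clear? */
static bool may_wait_writeback(enum migrate_mode mode)
{
	return mode == MIGRATE_SYNC;
}

/* May fallback_migrate_page() write a dirty page out itself? */
static bool may_writeout_dirty(enum migrate_mode mode)
{
	return mode == MIGRATE_SYNC;
}

int main(void)
{
	static const char *names[] = { "ASYNC", "SYNC_LIGHT", "SYNC" };

	for (int m = MIGRATE_ASYNC; m <= MIGRATE_SYNC; m++)
		printf("%-10s lock:%d writeback-wait:%d writeout:%d\n",
		       names[m], may_block_on_page_lock(m),
		       may_wait_writeback(m), may_writeout_dirty(m));
	return 0;
}
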
diff --git a/mm/mlock.c b/mm/mlock.c
index 048260c4e02e..4f4f53bdc65d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -14,7 +14,7 @@
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
@@ -110,7 +110,15 @@ void munlock_vma_page(struct page *page)
if (TestClearPageMlocked(page)) {
dec_zone_page_state(page, NR_MLOCK);
if (!isolate_lru_page(page)) {
- int ret = try_to_munlock(page);
+ int ret = SWAP_AGAIN;
+
+ /*
+ * Optimization: if the page was mapped just once,
+ * that's our mapping and we don't need to check all the
+ * other vmas.
+ */
+ if (page_mapcount(page) > 1)
+ ret = try_to_munlock(page);
/*
* did try_to_unlock() succeed or punt?
*/
@@ -549,7 +557,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
if (!can_do_mlock())
goto out;
- lru_add_drain_all(); /* flush pagevec */
+ if (flags & MCL_CURRENT)
+ lru_add_drain_all(); /* flush pagevec */
down_write(&current->mm->mmap_sem);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4e0e26591dfa..1ffd97ae26d7 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -8,7 +8,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/kobject.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
diff --git a/mm/mmap.c b/mm/mmap.c
index a65efd4db3e1..3f758c7f4c81 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -22,7 +22,7 @@
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
@@ -1603,39 +1603,19 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL(find_vma);
-/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
+/*
+ * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
+ * Note: pprev is set to NULL when return value is NULL.
+ */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
- struct vm_area_struct *vma = NULL, *prev = NULL;
- struct rb_node *rb_node;
- if (!mm)
- goto out;
-
- /* Guard against addr being lower than the first VMA */
- vma = mm->mmap;
-
- /* Go through the RB tree quickly. */
- rb_node = mm->mm_rb.rb_node;
-
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
- vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
-
- if (addr < vma_tmp->vm_end) {
- rb_node = rb_node->rb_left;
- } else {
- prev = vma_tmp;
- if (!prev->vm_next || (addr < prev->vm_next->vm_end))
- break;
- rb_node = rb_node->rb_right;
- }
- }
+ struct vm_area_struct *vma;
-out:
- *pprev = prev;
- return prev ? prev->vm_next : vma;
+ vma = find_vma(mm, addr);
+ *pprev = vma ? vma->vm_prev : NULL;
+ return vma;
}
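
The rewritten find_vma_prev() above leans entirely on the vm_prev back-pointer
the VMA list already maintains, so the hand-rolled rbtree walk can go. A toy
userspace sketch of the same contract follows; the types and names are
illustrative only, not the mm structures.

/* Sketch: with a prev pointer per node, "find a node and its predecessor"
 * collapses to a single lookup, which is all the new find_vma_prev() does. */
#include <stddef.h>
#include <stdio.h>

struct tiny_vma {
	unsigned long vm_start, vm_end;
	struct tiny_vma *vm_next, *vm_prev;
};

/* First vma with addr < vm_end, like find_vma(); NULL if none. */
static struct tiny_vma *tiny_find_vma(struct tiny_vma *head, unsigned long addr)
{
	for (struct tiny_vma *v = head; v; v = v->vm_next)
		if (addr < v->vm_end)
			return v;
	return NULL;
}

static struct tiny_vma *tiny_find_vma_prev(struct tiny_vma *head,
					   unsigned long addr,
					   struct tiny_vma **pprev)
{
	struct tiny_vma *vma = tiny_find_vma(head, addr);

	*pprev = vma ? vma->vm_prev : NULL;	/* NULL when vma is NULL, as documented */
	return vma;
}

int main(void)
{
	struct tiny_vma a = { 0x1000, 0x2000, NULL, NULL };
	struct tiny_vma b = { 0x4000, 0x5000, NULL, &a };
	struct tiny_vma *prev;

	a.vm_next = &b;
	tiny_find_vma_prev(&a, 0x4800, &prev);
	printf("prev starts at %#lx\n", prev->vm_start);	/* 0x1000 */
	return 0;
}
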
/*
@@ -2322,13 +2302,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
struct vm_area_struct *new_vma, *prev;
struct rb_node **rb_link, *rb_parent;
struct mempolicy *pol;
+ bool faulted_in_anon_vma = true;
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
*/
- if (!vma->vm_file && !vma->anon_vma)
+ if (unlikely(!vma->vm_file && !vma->anon_vma)) {
pgoff = addr >> PAGE_SHIFT;
+ faulted_in_anon_vma = false;
+ }
find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
@@ -2337,9 +2320,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
/*
* Source vma may have been merged into new_vma
*/
- if (vma_start >= new_vma->vm_start &&
- vma_start < new_vma->vm_end)
+ if (unlikely(vma_start >= new_vma->vm_start &&
+ vma_start < new_vma->vm_end)) {
+ /*
+ * The only way we can get a vma_merge with
+ * self during an mremap is if the vma hasn't
+ * been faulted in yet and we were allowed to
+ * reset the dst vma->vm_pgoff to the
+ * destination address of the mremap to allow
+ * the merge to happen. mremap must change the
+ * vm_pgoff linearity between src and dst vmas
+ * (in turn preventing a vma_merge) to be
+ * safe. It is only safe to keep the vm_pgoff
+ * linear if there are no pages mapped yet.
+ */
+ VM_BUG_ON(faulted_in_anon_vma);
*vmap = new_vma;
+ } else
+ anon_vma_moveto_tail(new_vma);
} else {
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new_vma) {
@@ -2558,7 +2556,6 @@ int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- int ret = -EINTR;
BUG_ON(down_read_trylock(&mm->mmap_sem));
@@ -2579,13 +2576,11 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_anon_vma(mm, avc->anon_vma);
}
- ret = 0;
+ return 0;
out_unlock:
- if (ret)
- mm_drop_all_locks(mm);
-
- return ret;
+ mm_drop_all_locks(mm);
+ return -EINTR;
}
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 9e82e937000e..cf332bc0080a 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,7 +5,7 @@
#include <linux/mm.h>
#include <linux/mmu_context.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <asm/mmu_context.h>
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de4088e..9a611d3a1848 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -11,7 +11,7 @@
#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/rcupdate.h>
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d1760213..7cf7b7ddc7c5 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -8,7 +8,6 @@
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
-#include <linux/module.h>
struct pglist_data *first_online_pgdat(void)
{
diff --git a/mm/mremap.c b/mm/mremap.c
index 506fa44403df..87bb8393e7d2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return NULL;
pmd = pmd_offset(pud, addr);
- split_huge_page_pmd(mm, pmd);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none(*pmd))
return NULL;
return pmd;
@@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
- return NULL;
return pmd;
}
@@ -80,11 +77,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
- unsigned long old_start;
- old_start = old_addr;
- mmu_notifier_invalidate_range_start(vma->vm_mm,
- old_start, old_end);
if (vma->vm_file) {
/*
* Subtle point from Rajesh Venkatasubramanian: before
@@ -111,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
new_pte++, new_addr += PAGE_SIZE) {
if (pte_none(*old_pte))
continue;
- pte = ptep_clear_flush(vma, old_addr, old_pte);
+ pte = ptep_get_and_clear(mm, old_addr, old_pte);
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
set_pte_at(mm, new_addr, new_pte, pte);
}
@@ -123,7 +116,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);
- mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -134,22 +126,43 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
{
unsigned long extent, next, old_end;
pmd_t *old_pmd, *new_pmd;
+ bool need_flush = false;
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
+ mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
+
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
next = (old_addr + PMD_SIZE) & PMD_MASK;
- if (next - 1 > old_end)
- next = old_end;
+ /* even if next overflowed, extent below will be ok */
extent = next - old_addr;
+ if (extent > old_end - old_addr)
+ extent = old_end - old_addr;
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
+ if (pmd_trans_huge(*old_pmd)) {
+ int err = 0;
+ if (extent == HPAGE_PMD_SIZE)
+ err = move_huge_pmd(vma, new_vma, old_addr,
+ new_addr, old_end,
+ old_pmd, new_pmd);
+ if (err > 0) {
+ need_flush = true;
+ continue;
+ } else if (!err) {
+ split_huge_page_pmd(vma->vm_mm, old_pmd);
+ }
+ VM_BUG_ON(pmd_trans_huge(*old_pmd));
+ }
+ if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
+ new_pmd, new_addr))
+ break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
extent = next - new_addr;
@@ -157,7 +170,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
extent = LATENCY_LIMIT;
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
new_vma, new_pmd, new_addr);
+ need_flush = true;
}
+ if (likely(need_flush))
+ flush_tlb_range(vma, old_end-len, old_addr);
+
+ mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
return len + old_addr - old_end; /* how much done */
}
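
A userspace sketch of the rewritten extent arithmetic above, showing why the
clamp keeps it correct even when "next" wraps past the top of the address
space; PMD_SIZE here is an assumed 2MB, and this is not kernel code.

#include <stdio.h>

#define PMD_SIZE (1UL << 21)		/* assumption: 2MB PMDs */
#define PMD_MASK (~(PMD_SIZE - 1))

static unsigned long extent_for(unsigned long old_addr, unsigned long old_end)
{
	unsigned long next = (old_addr + PMD_SIZE) & PMD_MASK;
	unsigned long extent = next - old_addr;	/* fine even if next wrapped */

	if (extent > old_end - old_addr)
		extent = old_end - old_addr;
	return extent;
}

int main(void)
{
	unsigned long last_pmd = 0UL - PMD_SIZE;

	/* 3 pages left before the next PMD boundary */
	printf("%lu\n", extent_for(0x200000UL - 3 * 4096, 0x400000UL));	/* 12288 */
	/* old_end closer than the PMD boundary: clamp to what remains */
	printf("%lu\n", extent_for(0x200000UL, 0x200000UL + 4096));	/* 4096 */
	/* top of address space: next wraps to 0, the clamp still bounds it */
	printf("%lu\n", extent_for(last_pmd, last_pmd + 4096));		/* 4096 */
	return 0;
}
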
@@ -203,6 +221,15 @@ static unsigned long move_vma(struct vm_area_struct *vma,
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
if (moved_len < old_len) {
/*
+ * Before moving the page tables from the new vma to
+ * the old vma, we need to be sure the old vma is
+ * queued after new vma in the same_anon_vma list to
+ * prevent SMP races with rmap_walk (that could lead
+ * rmap_walk to miss some page table).
+ */
+ anon_vma_moveto_tail(vma);
+
+ /*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6e93dc7f2586..24f0fc1a56d6 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -12,7 +12,7 @@
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/bootmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
#include <linux/memblock.h>
@@ -41,14 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
if (limit > memblock.current_limit)
limit = memblock.current_limit;
- addr = find_memory_core_early(nid, size, align, goal, limit);
-
- if (addr == MEMBLOCK_ERROR)
+ addr = memblock_find_in_range_node(goal, limit, size, align, nid);
+ if (!addr)
return NULL;
ptr = phys_to_virt(addr);
memset(ptr, 0, size);
- memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
+ memblock_reserve(addr, size);
/*
* The min_count is set to 0 so that bootmem allocated blocks
* are never reported as leaks.
@@ -107,23 +106,27 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
__free_pages_bootmem(pfn_to_page(i), 0);
}
-unsigned long __init free_all_memory_core_early(int nodeid)
+unsigned long __init free_low_memory_core_early(int nodeid)
{
- int i;
- u64 start, end;
unsigned long count = 0;
- struct range *range = NULL;
- int nr_range;
-
- nr_range = get_free_all_memory_range(&range, nodeid);
-
- for (i = 0; i < nr_range; i++) {
- start = range[i].start;
- end = range[i].end;
- count += end - start;
- __free_pages_memory(start, end);
+ phys_addr_t start, end;
+ u64 i;
+
+ /* free reserved array temporarily so that it's treated as free area */
+ memblock_free_reserved_regions();
+
+ for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+ unsigned long start_pfn = PFN_UP(start);
+ unsigned long end_pfn = min_t(unsigned long,
+ PFN_DOWN(end), max_low_pfn);
+ if (start_pfn < end_pfn) {
+ __free_pages_memory(start_pfn, end_pfn);
+ count += end_pfn - start_pfn;
+ }
}
+ /* put region array back? */
+ memblock_reserve_reserved_regions();
return count;
}
@@ -137,7 +140,7 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
- /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+ /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
return 0;
}
@@ -155,7 +158,7 @@ unsigned long __init free_all_bootmem(void)
* Use MAX_NUMNODES will make sure all ranges in early_node_map[]
* will be used instead of only Node0 related
*/
- return free_all_memory_core_early(MAX_NUMNODES);
+ return free_low_memory_core_early(MAX_NUMNODES);
}
/**
@@ -172,7 +175,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
kmemleak_free_part(__va(physaddr), size);
- memblock_x86_free_range(physaddr, physaddr + size);
+ memblock_free(physaddr, size);
}
/**
@@ -187,7 +190,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
void __init free_bootmem(unsigned long addr, unsigned long size)
{
kmemleak_free_part(__va(addr), size);
- memblock_x86_free_range(addr, addr + size);
+ memblock_free(addr, size);
}
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
diff --git a/mm/nommu.c b/mm/nommu.c
index 4358032566e9..b982290fd962 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -13,7 +13,7 @@
* Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
@@ -454,7 +454,7 @@ void __attribute__((weak)) vmalloc_sync_all(void)
* between processes, it syncs the pagetable across all
* processes.
*/
-struct vm_struct *alloc_vm_area(size_t size)
+struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
BUG();
return NULL;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 626303b52f3c..2958fd8e7c9a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,18 +26,43 @@
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/ptrace.h>
+#include <linux/freezer.h>
+#include <linux/ftrace.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/oom.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
+/*
+ * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
+ * @old_val: old oom_score_adj for compare
+ * @new_val: new oom_score_adj for swap
+ *
+ * Sets the oom_score_adj value for current to @new_val iff its present value is
+ * @old_val. Usually used to reinstate a previous value to prevent racing with
+ * userspace tuning the value in the interim.
+ */
+void compare_swap_oom_score_adj(int old_val, int new_val)
+{
+ struct sighand_struct *sighand = current->sighand;
+
+ spin_lock_irq(&sighand->siglock);
+ if (current->signal->oom_score_adj == old_val)
+ current->signal->oom_score_adj = new_val;
+ trace_oom_score_adj_update(current);
+ spin_unlock_irq(&sighand->siglock);
+}
+
/**
* test_set_oom_score_adj() - set current's oom_score_adj and return old value
* @new_val: new oom_score_adj value
@@ -53,13 +78,8 @@ int test_set_oom_score_adj(int new_val)
spin_lock_irq(&sighand->siglock);
old_val = current->signal->oom_score_adj;
- if (new_val != old_val) {
- if (new_val == OOM_SCORE_ADJ_MIN)
- atomic_inc(&current->mm->oom_disable_count);
- else if (old_val == OOM_SCORE_ADJ_MIN)
- atomic_dec(&current->mm->oom_disable_count);
- current->signal->oom_score_adj = new_val;
- }
+ current->signal->oom_score_adj = new_val;
+ trace_oom_score_adj_update(current);
spin_unlock_irq(&sighand->siglock);
return old_val;
@@ -132,7 +152,7 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
- const struct mem_cgroup *mem, const nodemask_t *nodemask)
+ const struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
if (is_global_init(p))
return true;
@@ -140,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p,
return true;
/* When mem_cgroup_out_of_memory() and p is not member of the group */
- if (mem && !task_in_mem_cgroup(p, mem))
+ if (memcg && !task_in_mem_cgroup(p, memcg))
return true;
/* p may not have freeable memory in nodemask */
@@ -159,24 +179,19 @@ static bool oom_unkillable_task(struct task_struct *p,
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
-unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
const nodemask_t *nodemask, unsigned long totalpages)
{
- int points;
+ long points;
- if (oom_unkillable_task(p, mem, nodemask))
+ if (oom_unkillable_task(p, memcg, nodemask))
return 0;
p = find_lock_task_mm(p);
if (!p)
return 0;
- /*
- * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
- * so the entire heuristic doesn't need to be executed for something
- * that cannot be killed.
- */
- if (atomic_read(&p->mm->oom_disable_count)) {
+ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
task_unlock(p);
return 0;
}
@@ -293,7 +308,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* (not docbooked, we don't want this one cluttering up the manual)
*/
static struct task_struct *select_bad_process(unsigned int *ppoints,
- unsigned long totalpages, struct mem_cgroup *mem,
+ unsigned long totalpages, struct mem_cgroup *memcg,
const nodemask_t *nodemask)
{
struct task_struct *g, *p;
@@ -305,7 +320,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
if (p->exit_state)
continue;
- if (oom_unkillable_task(p, mem, nodemask))
+ if (oom_unkillable_task(p, memcg, nodemask))
continue;
/*
@@ -317,8 +332,11 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
* blocked waiting for another task which itself is waiting
* for memory. Is there a better alternative?
*/
- if (test_tsk_thread_flag(p, TIF_MEMDIE))
+ if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
+ if (unlikely(frozen(p)))
+ __thaw_task(p);
return ERR_PTR(-1UL);
+ }
if (!p->mm)
continue;
@@ -346,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
}
}
- points = oom_badness(p, mem, nodemask, totalpages);
+ points = oom_badness(p, memcg, nodemask, totalpages);
if (points > *ppoints) {
chosen = p;
*ppoints = points;
@@ -369,14 +387,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
*
* Call with tasklist_lock read-locked.
*/
-static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
+static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *task;
pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
for_each_process(p) {
- if (oom_unkillable_task(p, mem, nodemask))
+ if (oom_unkillable_task(p, memcg, nodemask))
continue;
task = find_lock_task_mm(p);
@@ -399,7 +417,7 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
}
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
- struct mem_cgroup *mem, const nodemask_t *nodemask)
+ struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
@@ -409,14 +427,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
cpuset_print_task_mems_allowed(current);
task_unlock(current);
dump_stack();
- mem_cgroup_print_oom_info(mem, p);
+ mem_cgroup_print_oom_info(memcg, p);
show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
- dump_tasks(mem, nodemask);
+ dump_tasks(memcg, nodemask);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
-static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
+static int oom_kill_task(struct task_struct *p)
{
struct task_struct *q;
struct mm_struct *mm;
@@ -435,7 +453,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
task_unlock(p);
/*
- * Kill all processes sharing p->mm in other thread groups, if any.
+ * Kill all user processes sharing p->mm in other thread groups, if any.
* They don't get access to memory reserves or a higher scheduler
* priority, though, to avoid depletion of all memory or task
* starvation. This prevents mm->mmap_sem livelock when an oom killed
@@ -445,7 +463,11 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
* signal.
*/
for_each_process(q)
- if (q->mm == mm && !same_thread_group(q, p)) {
+ if (q->mm == mm && !same_thread_group(q, p) &&
+ !(q->flags & PF_KTHREAD)) {
+ if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ continue;
+
task_lock(q); /* Protect ->comm from prctl() */
pr_err("Kill process %d (%s) sharing same memory\n",
task_pid_nr(q), q->comm);
@@ -462,7 +484,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *mem, nodemask_t *nodemask,
+ struct mem_cgroup *memcg, nodemask_t *nodemask,
const char *message)
{
struct task_struct *victim = p;
@@ -471,7 +493,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int victim_points = 0;
if (printk_ratelimit())
- dump_header(p, gfp_mask, order, mem, nodemask);
+ dump_header(p, gfp_mask, order, memcg, nodemask);
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -502,7 +524,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* oom_badness() returns 0 if the thread is unkillable
*/
- child_points = oom_badness(child, mem, nodemask,
+ child_points = oom_badness(child, memcg, nodemask,
totalpages);
if (child_points > victim_points) {
victim = child;
@@ -511,7 +533,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
}
} while_each_thread(p, t);
- return oom_kill_task(victim, mem);
+ return oom_kill_task(victim);
}
/*
@@ -539,7 +561,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
+void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
{
unsigned long limit;
unsigned int points = 0;
@@ -556,14 +578,14 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
}
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
- limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
+ limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
retry:
- p = select_bad_process(&points, limit, mem, NULL);
+ p = select_bad_process(&points, limit, memcg, NULL);
if (!p || PTR_ERR(p) == -1UL)
goto out;
- if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
+ if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
"Memory cgroup out of memory"))
goto retry;
out:
@@ -722,7 +744,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
read_lock(&tasklist_lock);
if (sysctl_oom_kill_allocating_task &&
!oom_unkillable_task(current, NULL, nodemask) &&
- current->mm && !atomic_read(&current->mm->oom_disable_count)) {
+ current->mm) {
/*
* oom_kill_process() needs tasklist_lock held. If it returns
* non-zero, current could not be killed so we must fallback to
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0e309cd1b5b9..363ba7082ef5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -12,7 +12,7 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -32,7 +32,7 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
-#include <linux/buffer_head.h>
+#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <trace/events/writeback.h>
@@ -42,30 +42,24 @@
#define MAX_PAUSE max(HZ/5, 1)
/*
+ * Try to keep balance_dirty_pages() call intervals higher than this many pages
+ * by raising pause time to max_pause when the interval falls below it.
+ */
+#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
+
+/*
* Estimate write bandwidth at 200ms intervals.
*/
#define BANDWIDTH_INTERVAL max(HZ/5, 1)
+#define RATELIMIT_CALC_SHIFT 10
+
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
static long ratelimit_pages = 32;
-/*
- * When balance_dirty_pages decides that the caller needs to perform some
- * non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than dirtied pages to ensure that reasonably
- * large amounts of I/O are submitted.
- */
-static inline long sync_writeback_pages(unsigned long dirtied)
-{
- if (dirtied < ratelimit_pages)
- dirtied = ratelimit_pages;
-
- return dirtied + dirtied / 2;
-}
-
/* The following parameters are exported via /proc/sys/vm */
/*
@@ -140,7 +134,191 @@ unsigned long global_dirty_limit;
*
*/
static struct prop_descriptor vm_completions;
-static struct prop_descriptor vm_dirties;
+
+/*
+ * Work out the current dirty-memory clamping and background writeout
+ * thresholds.
+ *
+ * The main aim here is to lower them aggressively if there is a lot of mapped
+ * memory around, to avoid stressing page reclaim with lots of unreclaimable
+ * pages. It is better to clamp down on writers than to start swapping and
+ * performing lots of scanning.
+ *
+ * We only allow 1/2 of the currently-unmapped memory to be dirtied.
+ *
+ * We don't permit the clamping level to fall below 5% - that is getting rather
+ * excessive.
+ *
+ * We make sure that the background writeout level is below the adjusted
+ * clamping level.
+ */
+
+/*
+ * In a memory zone, there is a certain amount of pages we consider
+ * available for the page cache, which is essentially the number of
+ * free and reclaimable pages, minus some zone reserves to protect
+ * lowmem and the ability to uphold the zone's watermarks without
+ * requiring writeback.
+ *
+ * This number of dirtyable pages is the base value of which the
+ * user-configurable dirty ratio is the effective number of pages that
+ * are allowed to be actually dirtied. Per individual zone, or
+ * globally by using the sum of dirtyable pages over all zones.
+ *
+ * Because the user is allowed to specify the dirty limit globally as
+ * absolute number of bytes, calculating the per-zone dirty limit can
+ * require translating the configured limit into a percentage of
+ * global dirtyable memory first.
+ */
+
+static unsigned long highmem_dirtyable_memory(unsigned long total)
+{
+#ifdef CONFIG_HIGHMEM
+ int node;
+ unsigned long x = 0;
+
+ for_each_node_state(node, N_HIGH_MEMORY) {
+ struct zone *z =
+ &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+
+ x += zone_page_state(z, NR_FREE_PAGES) +
+ zone_reclaimable_pages(z) - z->dirty_balance_reserve;
+ }
+ /*
+ * Make sure that the number of highmem pages is never larger
+ * than the number of the total dirtyable memory. This can only
+ * occur in very strange VM situations but we want to make sure
+ * that this does not occur.
+ */
+ return min(x, total);
+#else
+ return 0;
+#endif
+}
+
+/**
+ * global_dirtyable_memory - number of globally dirtyable pages
+ *
+ * Returns the global number of pages potentially available for dirty
+ * page cache. This is the base value for the global dirty limits.
+ */
+unsigned long global_dirtyable_memory(void)
+{
+ unsigned long x;
+
+ x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
+ dirty_balance_reserve;
+
+ if (!vm_highmem_is_dirtyable)
+ x -= highmem_dirtyable_memory(x);
+
+ return x + 1; /* Ensure that we never return 0 */
+}
+
+/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * real-time tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+ unsigned long background;
+ unsigned long dirty;
+ unsigned long uninitialized_var(available_memory);
+ struct task_struct *tsk;
+
+ if (!vm_dirty_bytes || !dirty_background_bytes)
+ available_memory = global_dirtyable_memory();
+
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ else
+ dirty = (vm_dirty_ratio * available_memory) / 100;
+
+ if (dirty_background_bytes)
+ background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+ else
+ background = (dirty_background_ratio * available_memory) / 100;
+
+ if (background >= dirty)
+ background = dirty / 2;
+ tsk = current;
+ if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ background += background / 4;
+ dirty += dirty / 4;
+ }
+ *pbackground = background;
+ *pdirty = dirty;
+ trace_global_dirty_state(background, dirty);
+}
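
As a quick worked example of the ratio-based branch above, redone in userspace
with assumed sysctl values; the byte-based sysctls and the boost for
PF_LESS_THROTTLE/rt tasks follow the same shape.

#include <stdio.h>

int main(void)
{
	unsigned long available_memory = 1000000;	/* dirtyable pages (assumed) */
	unsigned long vm_dirty_ratio = 20;		/* vm.dirty_ratio */
	unsigned long dirty_background_ratio = 10;	/* vm.dirty_background_ratio */

	unsigned long dirty = vm_dirty_ratio * available_memory / 100;
	unsigned long background = dirty_background_ratio * available_memory / 100;

	if (background >= dirty)
		background = dirty / 2;

	printf("dirty throttle threshold: %lu pages\n", dirty);		/* 200000 */
	printf("background writeback at:  %lu pages\n", background);	/* 100000 */

	/* PF_LESS_THROTTLE or real-time tasks get a 25% higher allowance */
	printf("boosted: %lu / %lu pages\n",
	       background + background / 4, dirty + dirty / 4);
	return 0;
}
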
+
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ /*
+ * The effective global number of dirtyable pages may exclude
+ * highmem as a big-picture measure to keep the ratio between
+ * dirty memory and lowmem reasonable.
+ *
+ * But this function is purely about the individual zone and a
+ * highmem zone can hold its share of dirty pages, so we don't
+ * care about vm_highmem_is_dirtyable here.
+ */
+ return zone_page_state(zone, NR_FREE_PAGES) +
+ zone_reclaimable_pages(zone) -
+ zone->dirty_balance_reserve;
+}
+
+/**
+ * zone_dirty_limit - maximum number of dirty pages allowed in a zone
+ * @zone: the zone
+ *
+ * Returns the maximum number of dirty pages allowed in a zone, based
+ * on the zone's dirtyable memory.
+ */
+static unsigned long zone_dirty_limit(struct zone *zone)
+{
+ unsigned long zone_memory = zone_dirtyable_memory(zone);
+ struct task_struct *tsk = current;
+ unsigned long dirty;
+
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
+ zone_memory / global_dirtyable_memory();
+ else
+ dirty = vm_dirty_ratio * zone_memory / 100;
+
+ if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ dirty += dirty / 4;
+
+ return dirty;
+}
+
+/**
+ * zone_dirty_ok - tells whether a zone is within its dirty limits
+ * @zone: the zone to check
+ *
+ * Returns %true when the dirty pages in @zone are within the zone's
+ * dirty limit, %false if the limit is exceeded.
+ */
+bool zone_dirty_ok(struct zone *zone)
+{
+ unsigned long limit = zone_dirty_limit(zone);
+
+ return zone_page_state(zone, NR_FILE_DIRTY) +
+ zone_page_state(zone, NR_UNSTABLE_NFS) +
+ zone_page_state(zone, NR_WRITEBACK) <= limit;
+}
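
Elsewhere in this series the allocator is taught to prefer zones that still
pass this check when a page is about to be dirtied; the sketch below only
illustrates that idea, with made-up types and a made-up fallback policy, not
the actual allocator code.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy stand-in for a zone: just the two numbers the check needs. */
struct zone {
	unsigned long nr_dirty;		/* FILE_DIRTY + UNSTABLE_NFS + WRITEBACK */
	unsigned long dirty_limit;	/* what zone_dirty_limit() would return */
};

static bool zone_dirty_ok(const struct zone *z)
{
	return z->nr_dirty <= z->dirty_limit;
}

/* Prefer a zone that is still under its dirty limit for a write. */
static const struct zone *pick_zone_for_write(const struct zone *zones, size_t nr)
{
	for (size_t i = 0; i < nr; i++)
		if (zone_dirty_ok(&zones[i]))
			return &zones[i];
	return nr ? &zones[0] : NULL;	/* all over the limit: fall back */
}

int main(void)
{
	struct zone zones[] = {
		{ .nr_dirty = 900, .dirty_limit = 800 },	/* over its limit  */
		{ .nr_dirty = 100, .dirty_limit = 800 },	/* under its limit */
	};

	printf("picked zone %td\n", pick_zone_for_write(zones, 2) - zones);	/* 1 */
	return 0;
}
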
/*
* couple the period to the dirty_ratio:
@@ -154,7 +332,7 @@ static int calc_period_shift(void)
if (vm_dirty_bytes)
dirty_total = vm_dirty_bytes / PAGE_SIZE;
else
- dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+ dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
100;
return 2 + ilog2(dirty_total - 1);
}
@@ -166,7 +344,8 @@ static void update_completion_period(void)
{
int shift = calc_period_shift();
prop_change_shift(&vm_completions, shift);
- prop_change_shift(&vm_dirties, shift);
+
+ writeback_set_ratelimit();
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -208,7 +387,6 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
return ret;
}
-
int dirty_bytes_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -245,11 +423,6 @@ void bdi_writeout_inc(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL_GPL(bdi_writeout_inc);
-void task_dirty_inc(struct task_struct *tsk)
-{
- prop_inc_single(&vm_dirties, &tsk->dirties);
-}
-
/*
* Obtain an accurate fraction of the BDI's portion.
*/
@@ -260,52 +433,10 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
numerator, denominator);
}
-static inline void task_dirties_fraction(struct task_struct *tsk,
- long *numerator, long *denominator)
-{
- prop_fraction_single(&vm_dirties, &tsk->dirties,
- numerator, denominator);
-}
-
-/*
- * task_dirty_limit - scale down dirty throttling threshold for one task
- *
- * task specific dirty limit:
- *
- * dirty -= (dirty/8) * p_{t}
- *
- * To protect light/slow dirtying tasks from heavier/fast ones, we start
- * throttling individual tasks before reaching the bdi dirty limit.
- * Relatively low thresholds will be allocated to heavy dirtiers. So when
- * dirty pages grow large, heavy dirtiers will be throttled first, which will
- * effectively curb the growth of dirty pages. Light dirtiers with high enough
- * dirty threshold may never get throttled.
- */
-#define TASK_LIMIT_FRACTION 8
-static unsigned long task_dirty_limit(struct task_struct *tsk,
- unsigned long bdi_dirty)
-{
- long numerator, denominator;
- unsigned long dirty = bdi_dirty;
- u64 inv = dirty / TASK_LIMIT_FRACTION;
-
- task_dirties_fraction(tsk, &numerator, &denominator);
- inv *= numerator;
- do_div(inv, denominator);
-
- dirty -= inv;
-
- return max(dirty, bdi_dirty/2);
-}
-
-/* Minimum limit for any task */
-static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
-{
- return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
-}
-
/*
- *
+ * bdi_min_ratio keeps the sum of the minimum dirty shares of all
+ * registered backing devices, which, for obvious reasons, can not
+ * exceed 100%.
*/
static unsigned int bdi_min_ratio;
@@ -350,65 +481,10 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
}
EXPORT_SYMBOL(bdi_set_max_ratio);
-/*
- * Work out the current dirty-memory clamping and background writeout
- * thresholds.
- *
- * The main aim here is to lower them aggressively if there is a lot of mapped
- * memory around. To avoid stressing page reclaim with lots of unreclaimable
- * pages. It is better to clamp down on writers than to start swapping, and
- * performing lots of scanning.
- *
- * We only allow 1/2 of the currently-unmapped memory to be dirtied.
- *
- * We don't permit the clamping level to fall below 5% - that is getting rather
- * excessive.
- *
- * We make sure that the background writeout level is below the adjusted
- * clamping level.
- */
-
-static unsigned long highmem_dirtyable_memory(unsigned long total)
-{
-#ifdef CONFIG_HIGHMEM
- int node;
- unsigned long x = 0;
-
- for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z =
- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
-
- x += zone_page_state(z, NR_FREE_PAGES) +
- zone_reclaimable_pages(z);
- }
- /*
- * Make sure that the number of highmem pages is never larger
- * than the number of the total dirtyable memory. This can only
- * occur in very strange VM situations but we want to make sure
- * that this does not occur.
- */
- return min(x, total);
-#else
- return 0;
-#endif
-}
-
-/**
- * determine_dirtyable_memory - amount of memory that may be used
- *
- * Returns the numebr of pages that can currently be freed and used
- * by the kernel for direct mappings.
- */
-unsigned long determine_dirtyable_memory(void)
+static unsigned long dirty_freerun_ceiling(unsigned long thresh,
+ unsigned long bg_thresh)
{
- unsigned long x;
-
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
-
- if (!vm_highmem_is_dirtyable)
- x -= highmem_dirtyable_memory(x);
-
- return x + 1; /* Ensure that we never return 0 */
+ return (thresh + bg_thresh) / 2;
}
static unsigned long hard_dirty_limit(unsigned long thresh)
@@ -416,47 +492,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
return max(thresh, global_dirty_limit);
}
-/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
- *
- * Calculate the dirty thresholds based on sysctl parameters
- * - vm.dirty_background_ratio or vm.dirty_background_bytes
- * - vm.dirty_ratio or vm.dirty_bytes
- * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
- */
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
-{
- unsigned long background;
- unsigned long dirty;
- unsigned long uninitialized_var(available_memory);
- struct task_struct *tsk;
-
- if (!vm_dirty_bytes || !dirty_background_bytes)
- available_memory = determine_dirtyable_memory();
-
- if (vm_dirty_bytes)
- dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
- else
- dirty = (vm_dirty_ratio * available_memory) / 100;
-
- if (dirty_background_bytes)
- background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
- else
- background = (dirty_background_ratio * available_memory) / 100;
-
- if (background >= dirty)
- background = dirty / 2;
- tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
- background += background / 4;
- dirty += dirty / 4;
- }
- *pbackground = background;
- *pdirty = dirty;
- trace_global_dirty_state(background, dirty);
-}
-
/**
* bdi_dirty_limit - @bdi's share of dirty throttling threshold
* @bdi: the backing_dev_info to query
@@ -464,8 +499,13 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
*
* Returns @bdi's dirty limit in pages. The term "dirty" in the context of
* dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
- * And the "limit" in the name is not seriously taken as hard limit in
- * balance_dirty_pages().
+ *
+ * Note that balance_dirty_pages() will only seriously take it as a hard limit
+ * when sleeping max_pause per page is not enough to keep the dirty pages under
+ * control. For example, when the device is completely stalled due to some error
+ * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
+ * In the other normal situations, it acts more gently by throttling the tasks
+ * more (rather than completely block them) when the bdi dirty pages go high.
*
* It allocates high/low dirty limits to fast/slow devices, in order to prevent
* - starving fast devices
@@ -495,6 +535,205 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
return bdi_dirty;
}
+/*
+ * Dirty position control.
+ *
+ * (o) global/bdi setpoints
+ *
+ * We want the dirty pages be balanced around the global/bdi setpoints.
+ * When the number of dirty pages is higher/lower than the setpoint, the
+ * dirty position control ratio (and hence task dirty ratelimit) will be
+ * decreased/increased to bring the dirty pages back to the setpoint.
+ *
+ * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
+ *
+ * if (dirty < setpoint) scale up pos_ratio
+ * if (dirty > setpoint) scale down pos_ratio
+ *
+ * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
+ * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
+ *
+ * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
+ *
+ * (o) global control line
+ *
+ * ^ pos_ratio
+ * |
+ * | |<===== global dirty control scope ======>|
+ * 2.0 .............*
+ * | .*
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * 1.0 ................................*
+ * | . . *
+ * | . . *
+ * | . . *
+ * | . . *
+ * | . . *
+ * 0 +------------.------------------.----------------------*------------->
+ * freerun^ setpoint^ limit^ dirty pages
+ *
+ * (o) bdi control line
+ *
+ * ^ pos_ratio
+ * |
+ * | *
+ * | *
+ * | *
+ * | *
+ * | * |<=========== span ============>|
+ * 1.0 .......................*
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * 1/4 ...............................................* * * * * * * * * * * *
+ * | . .
+ * | . .
+ * | . .
+ * 0 +----------------------.-------------------------------.------------->
+ * bdi_setpoint^ x_intercept^
+ *
+ * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
+ * be smoothly throttled down to normal if it starts high in situations like
+ * - start writing to a slow SD card and a fast disk at the same time. The SD
+ * card's bdi_dirty may rush to many times higher than bdi_setpoint.
+ * - the bdi dirty thresh drops quickly due to change of JBOD workload
+ */
+static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long bg_thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty)
+{
+ unsigned long write_bw = bdi->avg_write_bandwidth;
+ unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
+ unsigned long limit = hard_dirty_limit(thresh);
+ unsigned long x_intercept;
+ unsigned long setpoint; /* dirty pages' target balance point */
+ unsigned long bdi_setpoint;
+ unsigned long span;
+ long long pos_ratio; /* for scaling up/down the rate limit */
+ long x;
+
+ if (unlikely(dirty >= limit))
+ return 0;
+
+ /*
+ * global setpoint
+ *
+ * setpoint - dirty 3
+ * f(dirty) := 1.0 + (----------------)
+ * limit - setpoint
+ *
+ * it's a 3rd order polynomial that subjects to
+ *
+ * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
+ * (2) f(setpoint) = 1.0 => the balance point
+ * (3) f(limit) = 0 => the hard limit
+ * (4) df/dx <= 0 => negative feedback control
+ * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
+ * => fast response on large errors; small oscillation near setpoint
+ */
+ setpoint = (freerun + limit) / 2;
+ x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
+ limit - setpoint + 1);
+ pos_ratio = x;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
+
+ /*
+ * We have computed basic pos_ratio above based on global situation. If
+ * the bdi is over/under its share of dirty pages, we want to scale
+ * pos_ratio further down/up. That is done by the following mechanism.
+ */
+
+ /*
+ * bdi setpoint
+ *
+ * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
+ *
+ * x_intercept - bdi_dirty
+ * := --------------------------
+ * x_intercept - bdi_setpoint
+ *
+ * The main bdi control line is a linear function that subjects to
+ *
+ * (1) f(bdi_setpoint) = 1.0
+ * (2) k = - 1 / (8 * write_bw) (in single bdi case)
+ * or equally: x_intercept = bdi_setpoint + 8 * write_bw
+ *
+ * For single bdi case, the dirty pages are observed to fluctuate
+ * regularly within range
+ * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
+	 * for various filesystems, where (2) can yield a reasonable 12.5%
+ * fluctuation range for pos_ratio.
+ *
+ * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
+ * own size, so move the slope over accordingly and choose a slope that
+ * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
+ */
+ if (unlikely(bdi_thresh > thresh))
+ bdi_thresh = thresh;
+ /*
+ * It's very possible that bdi_thresh is close to 0 not because the
+ * device is slow, but that it has remained inactive for long time.
+	 * Honour such devices with a reasonably good (hopefully IO efficient)
+ * threshold, so that the occasional writes won't be blocked and active
+ * writes can rampup the threshold quickly.
+ */
+ bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+ /*
+ * scale global setpoint to bdi's:
+ * bdi_setpoint = setpoint * bdi_thresh / thresh
+ */
+ x = div_u64((u64)bdi_thresh << 16, thresh + 1);
+ bdi_setpoint = setpoint * (u64)x >> 16;
+ /*
+ * Use span=(8*write_bw) in single bdi case as indicated by
+ * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+ *
+ * bdi_thresh thresh - bdi_thresh
+ * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
+ * thresh thresh
+ */
+ span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
+ x_intercept = bdi_setpoint + span;
+
+ if (bdi_dirty < x_intercept - span / 4) {
+ pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
+ x_intercept - bdi_setpoint + 1);
+ } else
+ pos_ratio /= 4;
+
+ /*
+ * bdi reserve area, safeguard against dirty pool underrun and disk idle
+ * It may push the desired control point of global dirty pages higher
+ * than setpoint.
+ */
+ x_intercept = bdi_thresh / 2;
+ if (bdi_dirty < x_intercept) {
+ if (bdi_dirty > x_intercept / 8)
+ pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
+ else
+ pos_ratio *= 8;
+ }
+
+ return pos_ratio;
+}
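
A userspace sketch of the global, cubic part of the control line above, with
assumed numbers; the kernel uses shifts and div_s64(), the plain
multiply/divide here is equivalent up to rounding.

#include <stdio.h>

#define RATELIMIT_CALC_UNIT 1024LL	/* 1 << RATELIMIT_CALC_SHIFT */

static long long global_pos_ratio(long long freerun, long long limit,
				  long long dirty)
{
	long long setpoint = (freerun + limit) / 2;
	long long x, pos_ratio;

	if (dirty >= limit)
		return 0;

	/* x = (setpoint - dirty) / (limit - setpoint), in 1/1024 units */
	x = (setpoint - dirty) * RATELIMIT_CALC_UNIT / (limit - setpoint + 1);
	pos_ratio = x;
	pos_ratio = pos_ratio * x / RATELIMIT_CALC_UNIT;
	pos_ratio = pos_ratio * x / RATELIMIT_CALC_UNIT;	/* now x^3 */
	return pos_ratio + RATELIMIT_CALC_UNIT;			/* 1.0 + x^3 */
}

int main(void)
{
	long long freerun = 100000, limit = 200000;	/* pages, assumed */

	printf("at freerun (100000):     %lld/1024 (~2.0)\n",
	       global_pos_ratio(freerun, limit, 100000));
	printf("at setpoint (150000):    %lld/1024 (1.0)\n",
	       global_pos_ratio(freerun, limit, 150000));
	printf("below setpoint (125000): %lld/1024 (~1.12)\n",
	       global_pos_ratio(freerun, limit, 125000));
	printf("above setpoint (175000): %lld/1024 (~0.88)\n",
	       global_pos_ratio(freerun, limit, 175000));
	printf("at limit (200000):       %lld/1024 (0)\n",
	       global_pos_ratio(freerun, limit, 200000));
	return 0;
}
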
+
static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
unsigned long elapsed,
unsigned long written)
@@ -591,8 +830,158 @@ static void global_update_bandwidth(unsigned long thresh,
spin_unlock(&dirty_lock);
}
+/*
+ * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+ *
+ * Normal bdi tasks will be curbed at or below it in long term.
+ * Obviously it should be around (write_bw / N) when there are N dd tasks.
+ */
+static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long bg_thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long dirtied,
+ unsigned long elapsed)
+{
+ unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
+ unsigned long limit = hard_dirty_limit(thresh);
+ unsigned long setpoint = (freerun + limit) / 2;
+ unsigned long write_bw = bdi->avg_write_bandwidth;
+ unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+ unsigned long dirty_rate;
+ unsigned long task_ratelimit;
+ unsigned long balanced_dirty_ratelimit;
+ unsigned long pos_ratio;
+ unsigned long step;
+ unsigned long x;
+
+ /*
+ * The dirty rate will match the writeout rate in long term, except
+ * when dirty pages are truncated by userspace or re-dirtied by FS.
+ */
+ dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+
+ pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
+ bdi_thresh, bdi_dirty);
+ /*
+ * task_ratelimit reflects each dd's dirty rate for the past 200ms.
+ */
+ task_ratelimit = (u64)dirty_ratelimit *
+ pos_ratio >> RATELIMIT_CALC_SHIFT;
+ task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
+
+ /*
+ * A linear estimation of the "balanced" throttle rate. The theory is,
+ * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
+ * dirty_rate will be measured to be (N * task_ratelimit). So the below
+ * formula will yield the balanced rate limit (write_bw / N).
+ *
+ * Note that the expanded form is not a pure rate feedback:
+ * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
+ * but also takes pos_ratio into account:
+ * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
+ *
+ * (1) is not realistic because pos_ratio also takes part in balancing
+ * the dirty rate. Consider the state
+ * pos_ratio = 0.5 (3)
+ * rate = 2 * (write_bw / N) (4)
+ * If (1) is used, it will stuck in that state! Because each dd will
+ * be throttled at
+ * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
+ * yielding
+ * dirty_rate = N * task_ratelimit = write_bw (6)
+ * put (6) into (1) we get
+ * rate_(i+1) = rate_(i) (7)
+ *
+ * So we end up using (2) to always keep
+ * rate_(i+1) ~= (write_bw / N) (8)
+ * regardless of the value of pos_ratio. As long as (8) is satisfied,
+ * pos_ratio is able to drive itself to 1.0, which is not only where
+ * the dirty count meet the setpoint, but also where the slope of
+ * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
+ */
+ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
+ dirty_rate | 1);
+ /*
+ * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
+ */
+ if (unlikely(balanced_dirty_ratelimit > write_bw))
+ balanced_dirty_ratelimit = write_bw;
+
+ /*
+ * We could safely do this and return immediately:
+ *
+ * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
+ *
+ * However to get a more stable dirty_ratelimit, the below elaborated
+	 * code makes use of task_ratelimit to filter out singular points and
+ * limit the step size.
+ *
+ * The below code essentially only uses the relative value of
+ *
+ * task_ratelimit - dirty_ratelimit
+ * = (pos_ratio - 1) * dirty_ratelimit
+ *
+ * which reflects the direction and size of dirty position error.
+ */
+
+ /*
+ * dirty_ratelimit will follow balanced_dirty_ratelimit iff
+ * task_ratelimit is on the same side of dirty_ratelimit, too.
+ * For example, when
+ * - dirty_ratelimit > balanced_dirty_ratelimit
+ * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
+ * lowering dirty_ratelimit will help meet both the position and rate
+ * control targets. Otherwise, don't update dirty_ratelimit if it will
+ * only help meet the rate target. After all, what the users ultimately
+ * feel and care are stable dirty rate and small position error.
+ *
+ * |task_ratelimit - dirty_ratelimit| is used to limit the step size
+	 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
+	 * and filter out the singular points of balanced_dirty_ratelimit, which
+	 * keeps jumping around randomly and can even leap far away at times
+ * due to the small 200ms estimation period of dirty_rate (we want to
+ * keep that period small to reduce time lags).
+ */
+ step = 0;
+ if (dirty < setpoint) {
+ x = min(bdi->balanced_dirty_ratelimit,
+ min(balanced_dirty_ratelimit, task_ratelimit));
+ if (dirty_ratelimit < x)
+ step = x - dirty_ratelimit;
+ } else {
+ x = max(bdi->balanced_dirty_ratelimit,
+ max(balanced_dirty_ratelimit, task_ratelimit));
+ if (dirty_ratelimit > x)
+ step = dirty_ratelimit - x;
+ }
+
+ /*
+ * Don't pursue 100% rate matching. It's impossible since the balanced
+ * rate itself is constantly fluctuating. So decrease the track speed
+ * when it gets close to the target. Helps eliminate pointless tremors.
+ */
+ step >>= dirty_ratelimit / (2 * step + 1);
+ /*
+ * Limit the tracking speed to avoid overshooting.
+ */
+ step = (step + 7) / 8;
+
+ if (dirty_ratelimit < balanced_dirty_ratelimit)
+ dirty_ratelimit += step;
+ else
+ dirty_ratelimit -= step;
+
+ bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+ bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+
+ trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+}
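
A worked example of the "balanced" estimate derived in the comment above, with
assumed numbers; a userspace sketch, not kernel code.

#include <stdio.h>

int main(void)
{
	unsigned long write_bw = 100;		/* device writeout rate, MB/s (assumed) */
	unsigned long task_ratelimit = 35;	/* what each dd is throttled to now */
	unsigned long ndd = 4;

	/* If N tasks all dirty at task_ratelimit, that is the dirty rate we measure. */
	unsigned long dirty_rate = ndd * task_ratelimit;		/* 140 MB/s */

	/* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate */
	unsigned long balanced = task_ratelimit * write_bw / dirty_rate;

	printf("balanced per-task rate: %lu MB/s (write_bw / N = %lu)\n",
	       balanced, write_bw / ndd);				/* 25 == 25 */
	return 0;
}
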
+
void __bdi_update_bandwidth(struct backing_dev_info *bdi,
unsigned long thresh,
+ unsigned long bg_thresh,
unsigned long dirty,
unsigned long bdi_thresh,
unsigned long bdi_dirty,
@@ -600,6 +989,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
{
unsigned long now = jiffies;
unsigned long elapsed = now - bdi->bw_time_stamp;
+ unsigned long dirtied;
unsigned long written;
/*
@@ -608,6 +998,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
if (elapsed < BANDWIDTH_INTERVAL)
return;
+ dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
/*
@@ -617,18 +1008,23 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
goto snapshot;
- if (thresh)
+ if (thresh) {
global_update_bandwidth(thresh, dirty, now);
-
+ bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
+ bdi_thresh, bdi_dirty,
+ dirtied, elapsed);
+ }
bdi_update_write_bandwidth(bdi, elapsed, written);
snapshot:
+ bdi->dirtied_stamp = dirtied;
bdi->written_stamp = written;
bdi->bw_time_stamp = now;
}
static void bdi_update_bandwidth(struct backing_dev_info *bdi,
unsigned long thresh,
+ unsigned long bg_thresh,
unsigned long dirty,
unsigned long bdi_thresh,
unsigned long bdi_dirty,
@@ -637,37 +1033,161 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
return;
spin_lock(&bdi->wb.list_lock);
- __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
- start_time);
+ __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
+ bdi_thresh, bdi_dirty, start_time);
spin_unlock(&bdi->wb.list_lock);
}
/*
+ * After a task has dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * will check whether it needs to start dirty throttling.
+ *
+ * If dirty_poll_interval is too low, big NUMA machines will call the expensive
+ * global_page_state() too often. So scale it near-sqrt to the safety margin
+ * (the number of pages we may dirty without exceeding the dirty limits).
+ */
+static unsigned long dirty_poll_interval(unsigned long dirty,
+ unsigned long thresh)
+{
+ if (thresh > dirty)
+ return 1UL << (ilog2(thresh - dirty) >> 1);
+
+ return 1;
+}
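
For reference, the near-sqrt scaling can be reproduced standalone; ilog2() is
reimplemented below because this is a userspace sketch, not the kernel helper.

#include <stdio.h>

/* userspace stand-in for the kernel's ilog2() */
static unsigned int ilog2_ul(unsigned long v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

/* pages a task may dirty before polling the global counters again:
 * roughly sqrt(margin), where margin = thresh - dirty */
static unsigned long poll_interval(unsigned long dirty, unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2_ul(thresh - dirty) >> 1);
	return 1;
}

int main(void)
{
	/* 1M pages of headroom -> poll every 1024 pages */
	printf("%lu\n", poll_interval(0, 1UL << 20));
	return 0;
}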
+
+static long bdi_max_pause(struct backing_dev_info *bdi,
+ unsigned long bdi_dirty)
+{
+ long bw = bdi->avg_write_bandwidth;
+ long t;
+
+ /*
+ * Limit the pause time for small memory systems. If we sleep for too
+ * long, a small pool of dirty/writeback pages may go empty and the
+ * disk may go idle.
+ *
+ * 8 serves as the safety ratio.
+ */
+ t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+ t++;
+
+ return min_t(long, t, MAX_PAUSE);
+}
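
The cap above works out to roughly 1/8 of the time needed to write out
bdi_dirty pages at the current bandwidth. A standalone sketch, with HZ and
MAX_PAUSE assumed locally rather than taken from the kernel headers:

#include <stdio.h>

#define HZ		1000		/* assumed tick rate for this sketch */
#define MAX_PAUSE	(HZ / 5)	/* 200ms hard cap, as in the patch */

static unsigned long roundup_pow_of_two_ul(unsigned long v)
{
	unsigned long r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

/* cap the sleep at ~1/8 of the time it takes to write out bdi_dirty
 * pages at bandwidth bw (pages/s), so the dirty pool cannot drain and
 * leave the disk idle during the pause */
static long max_pause(long bw, unsigned long bdi_dirty)
{
	long t = bdi_dirty / (1 + bw / roundup_pow_of_two_ul(1 + HZ / 8));

	t++;
	return t < MAX_PAUSE ? t : MAX_PAUSE;
}

int main(void)
{
	/* ~100MB/s in 4k pages, 1000 dirty pages -> a few jiffies */
	printf("%ld\n", max_pause(25600, 1000));
	return 0;
}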
+
+static long bdi_min_pause(struct backing_dev_info *bdi,
+ long max_pause,
+ unsigned long task_ratelimit,
+ unsigned long dirty_ratelimit,
+ int *nr_dirtied_pause)
+{
+ long hi = ilog2(bdi->avg_write_bandwidth);
+ long lo = ilog2(bdi->dirty_ratelimit);
+ long t; /* target pause */
+ long pause; /* estimated next pause */
+ int pages; /* target nr_dirtied_pause */
+
+ /* target for 10ms pause on 1-dd case */
+ t = max(1, HZ / 100);
+
+ /*
+ * Scale up pause time for concurrent dirtiers in order to reduce CPU
+ * overheads.
+ *
+ * (N * 10ms) on 2^N concurrent tasks.
+ */
+ if (hi > lo)
+ t += (hi - lo) * (10 * HZ) / 1024;
+
+ /*
+ * This is a bit convoluted. We try to base the next nr_dirtied_pause
+ * on the much more stable dirty_ratelimit. However the next pause time
+ * will be computed based on task_ratelimit and the two rate limits may
+ * diverge considerably at times. Especially if task_ratelimit goes
+ * below dirty_ratelimit/2 and the target pause is max_pause, the next
+ * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
+ * result task_ratelimit won't be executed faithfully, which could
+ * eventually bring down dirty_ratelimit.
+ *
+ * We apply two rules to fix it up:
+ * 1) try to estimate the next pause time and if necessary, use a lower
+ * nr_dirtied_pause so as not to exceed max_pause. When this happens,
+ * nr_dirtied_pause will be "dancing" with task_ratelimit.
+ * 2) limit the target pause time to max_pause/2, so that the normal
+ * small fluctuations of task_ratelimit won't trigger rule (1) and
+ * nr_dirtied_pause will remain as stable as dirty_ratelimit.
+ */
+ t = min(t, 1 + max_pause / 2);
+ pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+
+ /*
+ * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
+ * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
+ * When the 16 consecutive reads are often interrupted by some dirty
+ * throttling pause during the async writes, cfq will go idle
+ * (deadline is fine). So push nr_dirtied_pause as high as possible,
+ * until it reaches DIRTY_POLL_THRESH=32 pages.
+ */
+ if (pages < DIRTY_POLL_THRESH) {
+ t = max_pause;
+ pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+ if (pages > DIRTY_POLL_THRESH) {
+ pages = DIRTY_POLL_THRESH;
+ t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
+ }
+ }
+
+ pause = HZ * pages / (task_ratelimit + 1);
+ if (pause > max_pause) {
+ t = max_pause;
+ pages = task_ratelimit * t / roundup_pow_of_two(HZ);
+ }
+
+ *nr_dirtied_pause = pages;
+ /*
+ * The minimal pause time will normally be half the target pause time.
+ */
+ return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
+}
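
The base 10ms target and its growth with concurrency can be illustrated in
isolation. The sketch assumes HZ=1000 and takes log2 of the dirtier count
directly, instead of inferring it from the bandwidth/ratelimit ratio as the
kernel code does.

#include <stdio.h>

#define HZ 1000		/* assumed for this sketch */

/* Target pause in jiffies for 2^n concurrent dirtiers: 10ms for a single
 * dirtier, plus roughly another 10ms for every doubling of the task count. */
static long target_pause(int log2_dirtiers)
{
	long t = HZ / 100;				/* 10ms base */

	t += log2_dirtiers * (10 * HZ) / 1024;		/* ~10ms per doubling */
	return t;
}

int main(void)
{
	printf("1 dd:   %ld jiffies\n", target_pause(0));
	printf("16 dd:  %ld jiffies\n", target_pause(4));
	printf("256 dd: %ld jiffies\n", target_pause(8));
	return 0;
}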
+
+/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
- * the caller to perform writeback if the system is over `vm_dirty_ratio'.
+ * the caller to wait once it crosses (background_thresh + dirty_thresh) / 2.
* If we're over `background_thresh' then the writeback threads are woken to
* perform some writeout.
*/
static void balance_dirty_pages(struct address_space *mapping,
- unsigned long write_chunk)
+ unsigned long pages_dirtied)
{
- unsigned long nr_reclaimable, bdi_nr_reclaimable;
+ unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
+ unsigned long bdi_reclaimable;
unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
unsigned long bdi_dirty;
+ unsigned long freerun;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long task_bdi_thresh;
- unsigned long min_task_bdi_thresh;
- unsigned long pages_written = 0;
- unsigned long pause = 1;
+ long period;
+ long pause;
+ long max_pause;
+ long min_pause;
+ int nr_dirtied_pause;
bool dirty_exceeded = false;
- bool clear_dirty_exceeded = true;
+ unsigned long task_ratelimit;
+ unsigned long dirty_ratelimit;
+ unsigned long pos_ratio;
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long start_time = jiffies;
for (;;) {
+ unsigned long now = jiffies;
+
+ /*
+ * Unstable writes are a feature of certain networked
+ * filesystems (e.g. NFS) in which data may have been
+ * written to the server's write cache, but has not yet
+ * been flushed to permanent storage.
+ */
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
@@ -679,12 +1199,33 @@ static void balance_dirty_pages(struct address_space *mapping,
* catch-up. This avoids (excessively) small writeouts
* when the bdi limits are ramping up.
*/
- if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
+ freerun = dirty_freerun_ceiling(dirty_thresh,
+ background_thresh);
+ if (nr_dirty <= freerun) {
+ current->dirty_paused_when = now;
+ current->nr_dirtied = 0;
+ current->nr_dirtied_pause =
+ dirty_poll_interval(nr_dirty, dirty_thresh);
break;
+ }
+ if (unlikely(!writeback_in_progress(bdi)))
+ bdi_start_background_writeback(bdi);
+
+ /*
+ * bdi_thresh is not treated as a hard limiting factor the way
+ * dirty_thresh is, for these reasons:
+ * - in a JBOD setup, bdi_thresh can fluctuate a lot
+ * - in a system with HDD and USB key, the USB key may somehow
+ * go into state (bdi_dirty >> bdi_thresh) either because
+ * bdi_dirty starts high, or because bdi_thresh drops low.
+ * In this case we don't want to hard throttle the USB key
+ * dirtiers for 100 seconds until bdi_dirty drops under
+ * bdi_thresh. Instead the auxiliary bdi control line in
+ * bdi_position_ratio() will let the dirtier task progress
+ * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+ */
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
- min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
- task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
@@ -696,80 +1237,126 @@ static void balance_dirty_pages(struct address_space *mapping,
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
- if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
- bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- bdi_dirty = bdi_nr_reclaimable +
+ if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
+ bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+ bdi_dirty = bdi_reclaimable +
bdi_stat_sum(bdi, BDI_WRITEBACK);
} else {
- bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_dirty = bdi_nr_reclaimable +
+ bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+ bdi_dirty = bdi_reclaimable +
bdi_stat(bdi, BDI_WRITEBACK);
}
- /*
- * The bdi thresh is somehow "soft" limit derived from the
- * global "hard" limit. The former helps to prevent heavy IO
- * bdi or process from holding back light ones; The latter is
- * the last resort safeguard.
- */
- dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
+ dirty_exceeded = (bdi_dirty > bdi_thresh) &&
(nr_dirty > dirty_thresh);
- clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
- (nr_dirty <= dirty_thresh);
-
- if (!dirty_exceeded)
- break;
-
- if (!bdi->dirty_exceeded)
+ if (dirty_exceeded && !bdi->dirty_exceeded)
bdi->dirty_exceeded = 1;
- bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
- bdi_thresh, bdi_dirty, start_time);
-
- /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- * Only move pages to writeback if this bdi is over its
- * threshold otherwise wait until the disk writes catch
- * up.
+ bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
+ nr_dirty, bdi_thresh, bdi_dirty,
+ start_time);
+
+ dirty_ratelimit = bdi->dirty_ratelimit;
+ pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
+ background_thresh, nr_dirty,
+ bdi_thresh, bdi_dirty);
+ task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
+ RATELIMIT_CALC_SHIFT;
+ max_pause = bdi_max_pause(bdi, bdi_dirty);
+ min_pause = bdi_min_pause(bdi, max_pause,
+ task_ratelimit, dirty_ratelimit,
+ &nr_dirtied_pause);
+
+ if (unlikely(task_ratelimit == 0)) {
+ period = max_pause;
+ pause = max_pause;
+ goto pause;
+ }
+ period = HZ * pages_dirtied / task_ratelimit;
+ pause = period;
+ if (current->dirty_paused_when)
+ pause -= now - current->dirty_paused_when;
+ /*
+ * For less than 1s think time (ext3/4 may block the dirtier
+ * for up to 800ms from time to time on 1-HDD; so does xfs,
+ * though much less frequently), try to compensate for it in
+ * future periods by updating the virtual time; otherwise just
+ * do a reset, as it may be a light dirtier.
*/
- trace_balance_dirty_start(bdi);
- if (bdi_nr_reclaimable > task_bdi_thresh) {
- pages_written += writeback_inodes_wb(&bdi->wb,
- write_chunk);
- trace_balance_dirty_written(bdi, pages_written);
- if (pages_written >= write_chunk)
- break; /* We've done our duty */
+ if (pause < min_pause) {
+ trace_balance_dirty_pages(bdi,
+ dirty_thresh,
+ background_thresh,
+ nr_dirty,
+ bdi_thresh,
+ bdi_dirty,
+ dirty_ratelimit,
+ task_ratelimit,
+ pages_dirtied,
+ period,
+ min(pause, 0L),
+ start_time);
+ if (pause < -HZ) {
+ current->dirty_paused_when = now;
+ current->nr_dirtied = 0;
+ } else if (period) {
+ current->dirty_paused_when += period;
+ current->nr_dirtied = 0;
+ } else if (current->nr_dirtied_pause <= pages_dirtied)
+ current->nr_dirtied_pause += pages_dirtied;
+ break;
}
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ if (unlikely(pause > max_pause)) {
+ /* for occasional dropped task_ratelimit */
+ now += min(pause - max_pause, max_pause);
+ pause = max_pause;
+ }
+
+pause:
+ trace_balance_dirty_pages(bdi,
+ dirty_thresh,
+ background_thresh,
+ nr_dirty,
+ bdi_thresh,
+ bdi_dirty,
+ dirty_ratelimit,
+ task_ratelimit,
+ pages_dirtied,
+ period,
+ pause,
+ start_time);
+ __set_current_state(TASK_KILLABLE);
io_schedule_timeout(pause);
- trace_balance_dirty_wait(bdi);
- dirty_thresh = hard_dirty_limit(dirty_thresh);
+ current->dirty_paused_when = now + pause;
+ current->nr_dirtied = 0;
+ current->nr_dirtied_pause = nr_dirtied_pause;
+
/*
- * max-pause area. If dirty exceeded but still within this
- * area, no need to sleep for more than 200ms: (a) 8 pages per
- * 200ms is typically more than enough to curb heavy dirtiers;
- * (b) the pause time limit makes the dirtiers more responsive.
+ * This is typically equal to (nr_dirty < dirty_thresh) and can
+ * also keep "1000+ dd on a slow USB stick" under control.
*/
- if (nr_dirty < dirty_thresh &&
- bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 &&
- time_after(jiffies, start_time + MAX_PAUSE))
+ if (task_ratelimit)
break;
/*
- * Increase the delay for each loop, up to our previous
- * default of taking a 100ms nap.
+ * In the case of an unresponsive NFS server whose dirty pages
+ * exceed dirty_thresh, give the other good bdi's a pipe to go
+ * through, so that tasks on them still remain responsive.
+ *
+ * In theory 1 page is enough to keep the consumer-producer
+ * pipe going: the flusher cleans 1 page => the task dirties 1
+ * more page. However bdi_dirty has accounting errors. So use
+ * the larger and more IO friendly bdi_stat_error.
*/
- pause <<= 1;
- if (pause > HZ / 10)
- pause = HZ / 10;
+ if (bdi_dirty <= bdi_stat_error(bdi))
+ break;
+
+ if (fatal_signal_pending(current))
+ break;
}
- /* Clear dirty_exceeded flag only when no task can exceed the limit */
- if (clear_dirty_exceeded && bdi->dirty_exceeded)
+ if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -783,8 +1370,10 @@ static void balance_dirty_pages(struct address_space *mapping,
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
- if ((laptop_mode && pages_written) ||
- (!laptop_mode && (nr_reclaimable > background_thresh)))
+ if (laptop_mode)
+ return;
+
+ if (nr_reclaimable > background_thresh)
bdi_start_background_writeback(bdi);
}
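
The core of the pause computation in the loop above reduces to a few lines of
fixed-point arithmetic. Below is a self-contained sketch; HZ and
RATELIMIT_CALC_SHIFT are assumed values, and the zero-ratelimit case is reduced
to a sentinel instead of the max_pause fallback.

#include <stdio.h>

#define HZ			1000	/* assumed */
#define RATELIMIT_CALC_SHIFT	10	/* pos_ratio is a 10-bit fixed-point factor */

/* Scale the per-bdi rate by the position feedback, then convert the pages
 * dirtied since the last pause into a sleep length, crediting think time. */
static long compute_pause(unsigned long dirty_ratelimit,	/* pages/s */
			  unsigned long pos_ratio,		/* << 10 fixed point */
			  unsigned long pages_dirtied,
			  long think_time)			/* jiffies */
{
	unsigned long task_ratelimit;
	long period, pause;

	task_ratelimit = ((unsigned long long)dirty_ratelimit * pos_ratio)
				>> RATELIMIT_CALC_SHIFT;
	if (!task_ratelimit)
		return -1;		/* caller would fall back to max_pause */

	period = HZ * pages_dirtied / task_ratelimit;
	pause = period - think_time;	/* may go negative: no sleep needed */
	return pause;
}

int main(void)
{
	/* ~20MB/s budget in 4k pages, exactly at the setpoint, 32 pages dirtied */
	printf("%ld jiffies\n", compute_pause(5120, 1UL << 10, 32, 0));
	return 0;
}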
@@ -798,7 +1387,23 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
}
}
-static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+static DEFINE_PER_CPU(int, bdp_ratelimits);
+
+/*
+ * Normal tasks are throttled by
+ * loop {
+ * dirty tsk->nr_dirtied_pause pages;
+ * take a snap in balance_dirty_pages();
+ * }
+ * However there is a worst case. If every task exits immediately after dirtying
+ * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
+ * called to throttle the page dirties. The solution is to save the not yet
+ * throttled page dirties in dirty_throttle_leaks on task exit and charge them
+ * randomly into the running tasks. This works well for the above worst case,
+ * as the new task will pick up and accumulate the old task's leaked dirty
+ * count and eventually get throttled.
+ */
+DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
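
A rough userspace model of this handoff is shown below; dirty_throttle_leaks is
per-CPU in the kernel but a plain global here, and struct task is merely a
stand-in for task_struct.

#include <stdio.h>

static int dirty_throttle_leaks;

struct task {
	int nr_dirtied;
	int nr_dirtied_pause;
};

/* a task exits before reaching its throttle point: park its dirties */
static void task_exit(struct task *t)
{
	dirty_throttle_leaks += t->nr_dirtied;
}

/* on the dirtying path: pick up leaked dirties so that some running
 * task is eventually charged and throttled for them */
static void pick_up_leaks(struct task *t)
{
	int take = dirty_throttle_leaks;
	int room = t->nr_dirtied_pause - t->nr_dirtied;

	if (take > room)
		take = room;
	if (take > 0) {
		dirty_throttle_leaks -= take;
		t->nr_dirtied += take;
	}
}

int main(void)
{
	struct task gone = { .nr_dirtied = 30, .nr_dirtied_pause = 32 };
	struct task live = { .nr_dirtied = 5,  .nr_dirtied_pause = 32 };

	task_exit(&gone);
	pick_up_leaks(&live);
	printf("leaks=%d live.nr_dirtied=%d\n",
	       dirty_throttle_leaks, live.nr_dirtied);
	return 0;
}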
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -818,31 +1423,45 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
- unsigned long ratelimit;
- unsigned long *p;
+ int ratelimit;
+ int *p;
if (!bdi_cap_account_dirty(bdi))
return;
- ratelimit = ratelimit_pages;
- if (mapping->backing_dev_info->dirty_exceeded)
- ratelimit = 8;
+ ratelimit = current->nr_dirtied_pause;
+ if (bdi->dirty_exceeded)
+ ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
+ preempt_disable();
/*
- * Check the rate limiting. Also, we do not want to throttle real-time
- * tasks in balance_dirty_pages(). Period.
+ * This prevents one CPU from accumulating too many dirtied pages without
+ * calling into balance_dirty_pages(), which can happen when there are
+ * 1000+ tasks that all start dirtying pages at exactly the same
+ * time, and hence all honour a too-large initial task->nr_dirtied_pause.
*/
- preempt_disable();
p = &__get_cpu_var(bdp_ratelimits);
- *p += nr_pages_dirtied;
- if (unlikely(*p >= ratelimit)) {
- ratelimit = sync_writeback_pages(*p);
+ if (unlikely(current->nr_dirtied >= ratelimit))
*p = 0;
- preempt_enable();
- balance_dirty_pages(mapping, ratelimit);
- return;
+ else if (unlikely(*p >= ratelimit_pages)) {
+ *p = 0;
+ ratelimit = 0;
+ }
+ /*
+ * Pick up the dirtied pages by the exited tasks. This avoids lots of
+ * short-lived tasks (e.g. gcc invocations in a kernel build) escaping
+ * the dirty throttling and livelocking other long-running dirtiers.
+ */
+ p = &__get_cpu_var(dirty_throttle_leaks);
+ if (*p > 0 && current->nr_dirtied < ratelimit) {
+ nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
+ *p -= nr_pages_dirtied;
+ current->nr_dirtied += nr_pages_dirtied;
}
preempt_enable();
+
+ if (unlikely(current->nr_dirtied >= ratelimit))
+ balance_dirty_pages(mapping, current->nr_dirtied);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
@@ -898,7 +1517,8 @@ void laptop_mode_timer_fn(unsigned long data)
* threshold
*/
if (bdi_has_dirty_io(&q->backing_dev_info))
- bdi_start_writeback(&q->backing_dev_info, nr_pages);
+ bdi_start_writeback(&q->backing_dev_info, nr_pages,
+ WB_REASON_LAPTOP_TIMER);
}
/*
@@ -937,22 +1557,17 @@ void laptop_sync_completion(void)
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
- * thresholds before writeback cuts in.
- *
- * But the limit should not be set too high. Because it also controls the
- * amount of memory which the balance_dirty_pages() caller has to write back.
- * If this is too large then the caller will block on the IO queue all the
- * time. So limit it to four megabytes - the balance_dirty_pages() caller
- * will write six megabyte chunks, max.
+ * thresholds.
*/
void writeback_set_ratelimit(void)
{
- ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
- if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
- ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
}
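
The new sizing rule is plain integer arithmetic; the sketch below restates it
with assumed inputs so the ~3% split across CPUs is easy to check.

#include <stdio.h>

/* Split 1/32 (~3%) of the dirty threshold across all online CPUs,
 * with a floor of 16 pages. */
static unsigned long calc_ratelimit_pages(unsigned long dirty_thresh,
					  unsigned int online_cpus)
{
	unsigned long ratelimit = dirty_thresh / (online_cpus * 32);

	if (ratelimit < 16)
		ratelimit = 16;
	return ratelimit;
}

int main(void)
{
	/* e.g. a 400k-page dirty threshold on an 8-CPU box */
	printf("%lu pages\n", calc_ratelimit_pages(400000, 8));
	return 0;
}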
static int __cpuinit
@@ -994,7 +1609,6 @@ void __init page_writeback_init(void)
shift = calc_period_shift();
prop_descriptor_init(&vm_completions, shift);
- prop_descriptor_init(&vm_dirties, shift);
}
/**
@@ -1322,8 +1936,10 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
- task_dirty_inc(current);
+ __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
task_io_account_write(PAGE_CACHE_SIZE);
+ current->nr_dirtied++;
+ this_cpu_inc(bdp_ratelimits);
}
}
EXPORT_SYMBOL(account_page_dirtied);
@@ -1384,6 +2000,24 @@ int __set_page_dirty_nobuffers(struct page *page)
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
/*
+ * Call this whenever redirtying a page, to de-account the dirty counters
+ * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
+ * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
+ * systematic errors in balanced_dirty_ratelimit and the dirty pages position
+ * control.
+ */
+void account_page_redirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ if (mapping && mapping_cap_account_dirty(mapping)) {
+ current->nr_dirtied--;
+ dec_zone_page_state(page, NR_DIRTIED);
+ dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ }
+}
+EXPORT_SYMBOL(account_page_redirty);
+
+/*
* When a writepage implementation decides that it doesn't want to write this
* page for some reason, it should redirty the locked page via
* redirty_page_for_writepage() and it should then unlock the page and return 0
@@ -1391,6 +2025,7 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
wbc->pages_skipped++;
+ account_page_redirty(page);
return __set_page_dirty_nobuffers(page);
}
EXPORT_SYMBOL(redirty_page_for_writepage);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e8ecb6e021c..0027d8f4a1bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
#include <linux/ftrace_event.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory. This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~GFP_IOFS;
}
+
+bool pm_suspended_storage(void)
+{
+ if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+ return false;
+ return true;
+}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -181,39 +197,17 @@ static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
- /*
- * MAX_ACTIVE_REGIONS determines the maximum number of distinct
- * ranges of memory (RAM) that may be registered with add_active_range().
- * Ranges passed to add_active_range() will be merged if possible
- * so the number of times add_active_range() can be called is
- * related to the number of nodes and the number of holes
- */
- #ifdef CONFIG_MAX_ACTIVE_REGIONS
- /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
- #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
- #else
- #if MAX_NUMNODES >= 32
- /* If there can be many nodes, allow up to 50 holes per node */
- #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
- #else
- /* By default, allow up to 256 distinct regions */
- #define MAX_ACTIVE_REGIONS 256
- #endif
- #endif
-
- static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
- static int __meminitdata nr_nodemap_entries;
- static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
- static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
- static unsigned long __initdata required_kernelcore;
- static unsigned long __initdata required_movablecore;
- static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
-
- /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
- int movable_zone;
- EXPORT_SYMBOL(movable_zone);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __initdata required_kernelcore;
+static unsigned long __initdata required_movablecore;
+static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+
+/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+int movable_zone;
+EXPORT_SYMBOL(movable_zone);
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
@@ -318,6 +312,7 @@ static void bad_page(struct page *page)
current->comm, page_to_pfn(page));
dump_page(page);
+ print_modules();
dump_stack();
out:
/* Leave bad fields for debug, except PageBuddy could make trouble */
@@ -332,8 +327,8 @@ out:
*
* The remaining PAGE_SIZE pages are called "tail pages".
*
- * All pages have PG_compound set. All pages have their ->private pointing at
- * the head page (even the head page has this).
+ * All pages have PG_compound set. All tail pages have their ->first_page
+ * pointing at the head page.
*
* The first tail page's ->lru.next holds the address of the compound page's
* put_page() function. Its ->lru.prev holds the order of allocation.
@@ -355,8 +350,8 @@ void prep_compound_page(struct page *page, unsigned long order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
-
__SetPageTail(p);
+ set_page_count(p, 0);
p->first_page = page;
}
}
@@ -402,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
clear_highpage(page + i);
}
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+ unsigned long res;
+
+ if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+ printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+ return 0;
+ }
+ _debug_guardpage_minorder = res;
+ printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+ return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+ __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+ __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
static inline void set_page_order(struct page *page, int order)
{
set_page_private(page, order);
@@ -459,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
+ if (page_is_guard(buddy) && page_order(buddy) == order) {
+ VM_BUG_ON(page_count(buddy) != 0);
+ return 1;
+ }
+
if (PageBuddy(buddy) && page_order(buddy) == order) {
VM_BUG_ON(page_count(buddy) != 0);
return 1;
@@ -515,11 +546,19 @@ static inline void __free_one_page(struct page *page,
buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
break;
-
- /* Our buddy is free, merge with it and move up one order. */
- list_del(&buddy->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(buddy);
+ /*
+ * Our buddy is free, or it is a CONFIG_DEBUG_PAGEALLOC guard page;
+ * merge with it and move up one order.
+ */
+ if (page_is_guard(buddy)) {
+ clear_page_guard_flag(buddy);
+ set_page_private(page, 0);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+ } else {
+ list_del(&buddy->lru);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(buddy);
+ }
combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
@@ -653,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
int i;
int bad = 0;
- trace_mm_page_free_direct(page, order);
+ trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
if (PageAnon(page))
@@ -691,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-/*
- * permit the bootmem allocator to evade page validation on high-order frees
- */
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
- if (order == 0) {
- __ClearPageReserved(page);
- set_page_count(page, 0);
- set_page_refcounted(page);
- __free_page(page);
- } else {
- int loop;
+ unsigned int nr_pages = 1 << order;
+ unsigned int loop;
- prefetchw(page);
- for (loop = 0; loop < BITS_PER_LONG; loop++) {
- struct page *p = &page[loop];
+ prefetchw(page);
+ for (loop = 0; loop < nr_pages; loop++) {
+ struct page *p = &page[loop];
- if (loop + 1 < BITS_PER_LONG)
- prefetchw(p + 1);
- __ClearPageReserved(p);
- set_page_count(p, 0);
- }
-
- set_page_refcounted(page);
- __free_pages(page, order);
+ if (loop + 1 < nr_pages)
+ prefetchw(p + 1);
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
}
+
+ set_page_refcounted(page);
+ __free_pages(page, order);
}
@@ -745,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page,
high--;
size >>= 1;
VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (high < debug_guardpage_minorder()) {
+ /*
+ * Mark them as guard pages (or a single guard page), so that
+ * they can be merged back into the allocator when the buddy
+ * is freed. The corresponding page table entries will not be
+ * touched; the pages will stay not-present in the virtual
+ * address space.
+ */
+ INIT_LIST_HEAD(&page[size].lru);
+ set_page_guard_flag(&page[size]);
+ set_page_private(&page[size], high);
+ /* Guard pages are not available for any usage */
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+ continue;
+ }
+#endif
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
@@ -1210,6 +1257,19 @@ out:
}
/*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ trace_mm_page_free_batched(page, cold);
+ free_hot_cold_page(page, cold);
+ }
+}
+
+/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
* Each sub-page must be freed individually.
@@ -1407,7 +1467,7 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
static int __init fail_page_alloc_debugfs(void)
{
- mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
struct dentry *dir;
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
@@ -1456,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
long min = mark;
int o;
- free_pages -= (1 << order) + 1;
+ free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
@@ -1666,6 +1726,35 @@ zonelist_scan:
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
+ /*
+ * When allocating a page cache page for writing, we
+ * want to get it from a zone that is within its dirty
+ * limit, such that no single zone holds more than its
+ * proportional share of globally allowed dirty pages.
+ * The dirty limits take into account the zone's
+ * lowmem reserves and high watermark so that kswapd
+ * should be able to balance it without having to
+ * write pages from its LRU list.
+ *
+ * This may look like it could increase pressure on
+ * lower zones by failing allocations in higher zones
+ * before they are full. But the pages that do spill
+ * over are limited as the lower zones are protected
+ * by this very same mechanism. It should not become
+ * a practical burden to them.
+ *
+ * XXX: For now, allow allocations to potentially
+ * exceed the per-zone dirty limit in the slowpath
+ * (ALLOC_WMARK_LOW unset) before going into reclaim,
+ * which is important when on a NUMA setup the allowed
+ * zones are together not big enough to reach the
+ * global limit. The proper fix for these situations
+ * will require awareness of zones in the
+ * dirty-throttling and the flusher threads.
+ */
+ if ((alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+ goto this_zone_full;
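
A rough model of the check delegated to zone_dirty_ok() follows; the real
helper lives in mm/page-writeback.c and also accounts for lowmem reserves and
the high watermark, so the names and formula here are an approximation only.

#include <stdio.h>

/* Approximation: a zone is an acceptable target for a __GFP_WRITE
 * allocation only while its dirty+writeback pages stay under its
 * proportional share of the global dirty limit. */
struct zone_model {
	unsigned long dirtyable_pages;	/* pages usable for page cache */
	unsigned long nr_dirty;		/* dirty + writeback in this zone */
};

static int zone_dirty_ok_model(const struct zone_model *z,
			       unsigned long global_dirtyable,
			       unsigned long global_dirty_limit)
{
	/* same dirty ratio as the global limit, applied to this zone's share */
	unsigned long zone_limit =
		(unsigned long long)global_dirty_limit *
			z->dirtyable_pages / global_dirtyable;

	return z->nr_dirty <= zone_limit;
}

int main(void)
{
	struct zone_model normal = {
		.dirtyable_pages = 200000,
		.nr_dirty = 35000,
	};

	/* 1M dirtyable pages overall, 200k-page global dirty limit */
	printf("%s\n", zone_dirty_ok_model(&normal, 1000000, 200000) ?
	       "ok to dirty here" : "skip this zone");
	return 0;
}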
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1753,10 +1842,10 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
{
- va_list args;
unsigned int filter = SHOW_MEM_FILTER_NODES;
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+ debug_guardpage_minorder() > 0)
return;
/*
@@ -1772,14 +1861,21 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
filter &= ~SHOW_MEM_FILTER_NODES;
if (fmt) {
- printk(KERN_WARNING);
+ struct va_format vaf;
+ va_list args;
+
va_start(args, fmt);
- vprintk(fmt, args);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_warn("%pV", &vaf);
+
va_end(args);
}
- pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
- current->comm, order, gfp_mask);
+ pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+ current->comm, order, gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
@@ -1788,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
static inline int
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+ unsigned long did_some_progress,
unsigned long pages_reclaimed)
{
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
return 0;
+ /* Always retry if specifically requested */
+ if (gfp_mask & __GFP_NOFAIL)
+ return 1;
+
+ /*
+ * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+ * making forward progress without invoking OOM. Suspend also disables
+ * storage devices so kswapd will not help. Bail if we are suspending.
+ */
+ if (!did_some_progress && pm_suspended_storage())
+ return 0;
+
/*
* In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
* means __GFP_NOFAIL, but that may not be true in other
@@ -1812,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
return 1;
- /*
- * Don't let big-order allocations loop unless the caller
- * explicitly requests that.
- */
- if (gfp_mask & __GFP_NOFAIL)
- return 1;
-
return 0;
}
@@ -1879,14 +1981,20 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
struct page *page;
- if (!order || compaction_deferred(preferred_zone))
+ if (!order)
return NULL;
+ if (compaction_deferred(preferred_zone)) {
+ *deferred_compaction = true;
+ return NULL;
+ }
+
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration);
@@ -1914,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
- defer_compaction(preferred_zone);
+
+ /*
+ * As async compaction considers a subset of pageblocks, only
+ * defer if the failure was a sync compaction failure.
+ */
+ if (sync_migration)
+ defer_compaction(preferred_zone);
cond_resched();
}
@@ -1926,8 +2040,9 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
return NULL;
}
@@ -2077,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
bool sync_migration = false;
+ bool deferred_compaction = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -2157,12 +2273,22 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
sync_migration = true;
+ /*
+ * If compaction is deferred for high-order allocations, it is because
+ * sync compaction recently failed. In this is the case and the caller
+ * has requested the system not be heavily disrupted, fail the
+ * allocation now instead of entering direct reclaim
+ */
+ if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+ goto nopage;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
@@ -2211,7 +2337,8 @@ rebalance:
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
- if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+ if (should_alloc_retry(gfp_mask, order, did_some_progress,
+ pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
@@ -2225,8 +2352,9 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
}
@@ -2321,16 +2449,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page);
-void __pagevec_free(struct pagevec *pvec)
-{
- int i = pagevec_count(pvec);
-
- while (--i >= 0) {
- trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
- free_hot_cold_page(pvec->pages[i], pvec->cold);
- }
-}
-
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
@@ -3370,9 +3488,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
unsigned long block_migratetype;
int reserve;
- /* Get the start pfn, end pfn and the number of blocks to reserve */
+ /*
+ * Get the start pfn, end pfn and the number of blocks to reserve.
+ * We have to be careful to be aligned to pageblock_nr_pages to
+ * make sure that we always check pfn_valid for the first page in
+ * the block.
+ */
start_pfn = zone->zone_start_pfn;
end_pfn = start_pfn + zone->spanned_pages;
+ start_pfn = roundup(start_pfn, pageblock_nr_pages);
reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
pageblock_order;
@@ -3394,25 +3518,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
if (page_to_nid(page) != zone_to_nid(zone))
continue;
- /* Blocks with reserved pages will never free, skip them. */
- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
- if (pageblock_is_reserved(pfn, block_end_pfn))
- continue;
-
block_migratetype = get_pageblock_migratetype(page);
- /* If this block is reserved, account for it */
- if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
- reserve--;
- continue;
- }
+ /* Only test what is necessary when the reserves are not met */
+ if (reserve > 0) {
+ /*
+ * Blocks with reserved pages will never free, skip
+ * them.
+ */
+ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+ if (pageblock_is_reserved(pfn, block_end_pfn))
+ continue;
- /* Suitable for reserving if this block is movable */
- if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
- set_pageblock_migratetype(page, MIGRATE_RESERVE);
- move_freepages_block(zone, page, MIGRATE_RESERVE);
- reserve--;
- continue;
+ /* If this block is reserved, account for it */
+ if (block_migratetype == MIGRATE_RESERVE) {
+ reserve--;
+ continue;
+ }
+
+ /* Suitable for reserving if this block is movable */
+ if (block_migratetype == MIGRATE_MOVABLE) {
+ set_pageblock_migratetype(page,
+ MIGRATE_RESERVE);
+ move_freepages_block(zone, page,
+ MIGRATE_RESERVE);
+ reserve--;
+ continue;
+ }
}
/*
@@ -3724,35 +3856,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
return 0;
}
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-/*
- * Basic iterator support. Return the first range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns first region regardless of node
- */
-static int __meminit first_active_region_index_in_nid(int nid)
-{
- int i;
-
- for (i = 0; i < nr_nodemap_entries; i++)
- if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
- return i;
-
- return -1;
-}
-
-/*
- * Basic iterator support. Return the next active range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns next region regardless of node
- */
-static int __meminit next_active_region_index_in_nid(int index, int nid)
-{
- for (index = index + 1; index < nr_nodemap_entries; index++)
- if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
- return index;
-
- return -1;
-}
-
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@ -3762,15 +3866,12 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
*/
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
- int i;
-
- for (i = 0; i < nr_nodemap_entries; i++) {
- unsigned long start_pfn = early_node_map[i].start_pfn;
- unsigned long end_pfn = early_node_map[i].end_pfn;
+ unsigned long start_pfn, end_pfn;
+ int i, nid;
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
if (start_pfn <= pfn && pfn < end_pfn)
- return early_node_map[i].nid;
- }
+ return nid;
/* This is a memory hole */
return -1;
}
@@ -3799,11 +3900,6 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
}
#endif
-/* Basic iterator support to walk early_node_map[] */
-#define for_each_active_range_index_in_nid(i, nid) \
- for (i = first_active_region_index_in_nid(nid); i != -1; \
- i = next_active_region_index_in_nid(i, nid))
-
/**
* free_bootmem_with_active_regions - Call free_bootmem_node for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -3813,122 +3909,34 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
* add_active_ranges() contain no holes and may be freed, this
* function may be used instead of calling free_bootmem() manually.
*/
-void __init free_bootmem_with_active_regions(int nid,
- unsigned long max_low_pfn)
-{
- int i;
-
- for_each_active_range_index_in_nid(i, nid) {
- unsigned long size_pages = 0;
- unsigned long end_pfn = early_node_map[i].end_pfn;
-
- if (early_node_map[i].start_pfn >= max_low_pfn)
- continue;
-
- if (end_pfn > max_low_pfn)
- end_pfn = max_low_pfn;
-
- size_pages = end_pfn - early_node_map[i].start_pfn;
- free_bootmem_node(NODE_DATA(early_node_map[i].nid),
- PFN_PHYS(early_node_map[i].start_pfn),
- size_pages << PAGE_SHIFT);
- }
-}
-
-#ifdef CONFIG_HAVE_MEMBLOCK
-/*
- * Basic iterator support. Return the last range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns last region regardless of node
- */
-static int __meminit last_active_region_index_in_nid(int nid)
-{
- int i;
-
- for (i = nr_nodemap_entries - 1; i >= 0; i--)
- if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
- return i;
-
- return -1;
-}
-
-/*
- * Basic iterator support. Return the previous active range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns next region regardless of node
- */
-static int __meminit previous_active_region_index_in_nid(int index, int nid)
-{
- for (index = index - 1; index >= 0; index--)
- if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
- return index;
-
- return -1;
-}
-
-#define for_each_active_range_index_in_nid_reverse(i, nid) \
- for (i = last_active_region_index_in_nid(nid); i != -1; \
- i = previous_active_region_index_in_nid(i, nid))
-
-u64 __init find_memory_core_early(int nid, u64 size, u64 align,
- u64 goal, u64 limit)
+void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
{
- int i;
-
- /* Need to go over early_node_map to find out good range for node */
- for_each_active_range_index_in_nid_reverse(i, nid) {
- u64 addr;
- u64 ei_start, ei_last;
- u64 final_start, final_end;
-
- ei_last = early_node_map[i].end_pfn;
- ei_last <<= PAGE_SHIFT;
- ei_start = early_node_map[i].start_pfn;
- ei_start <<= PAGE_SHIFT;
-
- final_start = max(ei_start, goal);
- final_end = min(ei_last, limit);
-
- if (final_start >= final_end)
- continue;
-
- addr = memblock_find_in_range(final_start, final_end, size, align);
+ unsigned long start_pfn, end_pfn;
+ int i, this_nid;
- if (addr == MEMBLOCK_ERROR)
- continue;
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
+ start_pfn = min(start_pfn, max_low_pfn);
+ end_pfn = min(end_pfn, max_low_pfn);
- return addr;
+ if (start_pfn < end_pfn)
+ free_bootmem_node(NODE_DATA(this_nid),
+ PFN_PHYS(start_pfn),
+ (end_pfn - start_pfn) << PAGE_SHIFT);
}
-
- return MEMBLOCK_ERROR;
}
-#endif
int __init add_from_early_node_map(struct range *range, int az,
int nr_range, int nid)
{
+ unsigned long start_pfn, end_pfn;
int i;
- u64 start, end;
/* need to go over early_node_map to find out good range for node */
- for_each_active_range_index_in_nid(i, nid) {
- start = early_node_map[i].start_pfn;
- end = early_node_map[i].end_pfn;
- nr_range = add_range(range, az, nr_range, start, end);
- }
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
+ nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
return nr_range;
}
-void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
-{
- int i;
- int ret;
-
- for_each_active_range_index_in_nid(i, nid) {
- ret = work_fn(early_node_map[i].start_pfn,
- early_node_map[i].end_pfn, data);
- if (ret)
- break;
- }
-}
/**
* sparse_memory_present_with_active_regions - Call memory_present for each active range
* @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -3939,12 +3947,11 @@ void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
*/
void __init sparse_memory_present_with_active_regions(int nid)
{
- int i;
+ unsigned long start_pfn, end_pfn;
+ int i, this_nid;
- for_each_active_range_index_in_nid(i, nid)
- memory_present(early_node_map[i].nid,
- early_node_map[i].start_pfn,
- early_node_map[i].end_pfn);
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
+ memory_present(this_nid, start_pfn, end_pfn);
}
/**
@@ -3961,13 +3968,15 @@ void __init sparse_memory_present_with_active_regions(int nid)
void __meminit get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
{
+ unsigned long this_start_pfn, this_end_pfn;
int i;
+
*start_pfn = -1UL;
*end_pfn = 0;
- for_each_active_range_index_in_nid(i, nid) {
- *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
- *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+ for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
+ *start_pfn = min(*start_pfn, this_start_pfn);
+ *end_pfn = max(*end_pfn, this_end_pfn);
}
if (*start_pfn == -1UL)
@@ -4070,46 +4079,16 @@ unsigned long __meminit __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
- int i = 0;
- unsigned long prev_end_pfn = 0, hole_pages = 0;
- unsigned long start_pfn;
-
- /* Find the end_pfn of the first active range of pfns in the node */
- i = first_active_region_index_in_nid(nid);
- if (i == -1)
- return 0;
-
- prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
-
- /* Account for ranges before physical memory on this node */
- if (early_node_map[i].start_pfn > range_start_pfn)
- hole_pages = prev_end_pfn - range_start_pfn;
-
- /* Find all holes for the zone within the node */
- for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
-
- /* No need to continue if prev_end_pfn is outside the zone */
- if (prev_end_pfn >= range_end_pfn)
- break;
-
- /* Make sure the end of the zone is not within the hole */
- start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
- prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+ unsigned long nr_absent = range_end_pfn - range_start_pfn;
+ unsigned long start_pfn, end_pfn;
+ int i;
- /* Update the hole size cound and move on */
- if (start_pfn > range_start_pfn) {
- BUG_ON(prev_end_pfn > start_pfn);
- hole_pages += start_pfn - prev_end_pfn;
- }
- prev_end_pfn = early_node_map[i].end_pfn;
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+ end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+ nr_absent -= end_pfn - start_pfn;
}
-
- /* Account for ranges past physical memory on this node */
- if (range_end_pfn > prev_end_pfn)
- hole_pages += range_end_pfn -
- max(range_start_pfn, prev_end_pfn);
-
- return hole_pages;
+ return nr_absent;
}
/**
@@ -4130,14 +4109,14 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *ignored)
{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long node_start_pfn, node_end_pfn;
unsigned long zone_start_pfn, zone_end_pfn;
get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
- zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
- node_start_pfn);
- zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
- node_end_pfn);
+ zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
@@ -4145,7 +4124,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
}
-#else
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *zones_size)
@@ -4163,7 +4142,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
return zholes_size[zone_type];
}
-#endif
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
@@ -4283,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
- enum lru_list l;
+ enum lru_list lru;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4333,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- for_each_lru(l)
- INIT_LIST_HEAD(&zone->lru[l].list);
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4386,10 +4365,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
@@ -4414,7 +4393,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
free_area_init_core(pgdat, zones_size, zholes_size);
}
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
#if MAX_NUMNODES > 1
/*
@@ -4436,170 +4415,6 @@ static inline void setup_nr_node_ids(void)
#endif
/**
- * add_active_range - Register a range of PFNs backed by physical memory
- * @nid: The node ID the range resides on
- * @start_pfn: The start PFN of the available physical memory
- * @end_pfn: The end PFN of the available physical memory
- *
- * These ranges are stored in an early_node_map[] and later used by
- * free_area_init_nodes() to calculate zone sizes and holes. If the
- * range spans a memory hole, it is up to the architecture to ensure
- * the memory is not freed by the bootmem allocator. If possible
- * the range being registered will be merged with existing ranges.
- */
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- int i;
-
- mminit_dprintk(MMINIT_TRACE, "memory_register",
- "Entering add_active_range(%d, %#lx, %#lx) "
- "%d entries of %d used\n",
- nid, start_pfn, end_pfn,
- nr_nodemap_entries, MAX_ACTIVE_REGIONS);
-
- mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
-
- /* Merge with existing active regions if possible */
- for (i = 0; i < nr_nodemap_entries; i++) {
- if (early_node_map[i].nid != nid)
- continue;
-
- /* Skip if an existing region covers this new one */
- if (start_pfn >= early_node_map[i].start_pfn &&
- end_pfn <= early_node_map[i].end_pfn)
- return;
-
- /* Merge forward if suitable */
- if (start_pfn <= early_node_map[i].end_pfn &&
- end_pfn > early_node_map[i].end_pfn) {
- early_node_map[i].end_pfn = end_pfn;
- return;
- }
-
- /* Merge backward if suitable */
- if (start_pfn < early_node_map[i].start_pfn &&
- end_pfn >= early_node_map[i].start_pfn) {
- early_node_map[i].start_pfn = start_pfn;
- return;
- }
- }
-
- /* Check that early_node_map is large enough */
- if (i >= MAX_ACTIVE_REGIONS) {
- printk(KERN_CRIT "More than %d memory regions, truncating\n",
- MAX_ACTIVE_REGIONS);
- return;
- }
-
- early_node_map[i].nid = nid;
- early_node_map[i].start_pfn = start_pfn;
- early_node_map[i].end_pfn = end_pfn;
- nr_nodemap_entries = i + 1;
-}
-
-/**
- * remove_active_range - Shrink an existing registered range of PFNs
- * @nid: The node id the range is on that should be shrunk
- * @start_pfn: The new PFN of the range
- * @end_pfn: The new PFN of the range
- *
- * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- * The map is kept near the end physical page range that has already been
- * registered. This function allows an arch to shrink an existing registered
- * range.
- */
-void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- int i, j;
- int removed = 0;
-
- printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
- nid, start_pfn, end_pfn);
-
- /* Find the old active region end and shrink */
- for_each_active_range_index_in_nid(i, nid) {
- if (early_node_map[i].start_pfn >= start_pfn &&
- early_node_map[i].end_pfn <= end_pfn) {
- /* clear it */
- early_node_map[i].start_pfn = 0;
- early_node_map[i].end_pfn = 0;
- removed = 1;
- continue;
- }
- if (early_node_map[i].start_pfn < start_pfn &&
- early_node_map[i].end_pfn > start_pfn) {
- unsigned long temp_end_pfn = early_node_map[i].end_pfn;
- early_node_map[i].end_pfn = start_pfn;
- if (temp_end_pfn > end_pfn)
- add_active_range(nid, end_pfn, temp_end_pfn);
- continue;
- }
- if (early_node_map[i].start_pfn >= start_pfn &&
- early_node_map[i].end_pfn > end_pfn &&
- early_node_map[i].start_pfn < end_pfn) {
- early_node_map[i].start_pfn = end_pfn;
- continue;
- }
- }
-
- if (!removed)
- return;
-
- /* remove the blank ones */
- for (i = nr_nodemap_entries - 1; i > 0; i--) {
- if (early_node_map[i].nid != nid)
- continue;
- if (early_node_map[i].end_pfn)
- continue;
- /* we found it, get rid of it */
- for (j = i; j < nr_nodemap_entries - 1; j++)
- memcpy(&early_node_map[j], &early_node_map[j+1],
- sizeof(early_node_map[j]));
- j = nr_nodemap_entries - 1;
- memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
- nr_nodemap_entries--;
- }
-}
-
-/**
- * remove_all_active_ranges - Remove all currently registered regions
- *
- * During discovery, it may be found that a table like SRAT is invalid
- * and an alternative discovery method must be used. This function removes
- * all currently registered regions.
- */
-void __init remove_all_active_ranges(void)
-{
- memset(early_node_map, 0, sizeof(early_node_map));
- nr_nodemap_entries = 0;
-}
-
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
-{
- struct node_active_region *arange = (struct node_active_region *)a;
- struct node_active_region *brange = (struct node_active_region *)b;
-
- /* Done this way to avoid overflows */
- if (arange->start_pfn > brange->start_pfn)
- return 1;
- if (arange->start_pfn < brange->start_pfn)
- return -1;
-
- return 0;
-}
-
-/* sort the node_map by start_pfn */
-void __init sort_node_map(void)
-{
- sort(early_node_map, (size_t)nr_nodemap_entries,
- sizeof(struct node_active_region),
- cmp_node_active_region, NULL);
-}
-
-/**
* node_map_pfn_alignment - determine the maximum internode alignment
*
* This function should be called after node map is populated and sorted.
@@ -4621,15 +4436,11 @@ void __init sort_node_map(void)
unsigned long __init node_map_pfn_alignment(void)
{
unsigned long accl_mask = 0, last_end = 0;
+ unsigned long start, end, mask;
int last_nid = -1;
- int i;
-
- for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
- int nid = early_node_map[i].nid;
- unsigned long start = early_node_map[i].start_pfn;
- unsigned long end = early_node_map[i].end_pfn;
- unsigned long mask;
+ int i, nid;
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
if (!start || last_nid < 0 || last_nid == nid) {
last_nid = nid;
last_end = end;
@@ -4656,12 +4467,12 @@ unsigned long __init node_map_pfn_alignment(void)
/* Find the lowest pfn for a node */
static unsigned long __init find_min_pfn_for_node(int nid)
{
- int i;
unsigned long min_pfn = ULONG_MAX;
+ unsigned long start_pfn;
+ int i;
- /* Assuming a sorted map, the first range found has the starting pfn */
- for_each_active_range_index_in_nid(i, nid)
- min_pfn = min(min_pfn, early_node_map[i].start_pfn);
+ for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
+ min_pfn = min(min_pfn, start_pfn);
if (min_pfn == ULONG_MAX) {
printk(KERN_WARNING
@@ -4690,15 +4501,16 @@ unsigned long __init find_min_pfn_with_active_regions(void)
*/
static unsigned long __init early_calculate_totalpages(void)
{
- int i;
unsigned long totalpages = 0;
+ unsigned long start_pfn, end_pfn;
+ int i, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ unsigned long pages = end_pfn - start_pfn;
- for (i = 0; i < nr_nodemap_entries; i++) {
- unsigned long pages = early_node_map[i].end_pfn -
- early_node_map[i].start_pfn;
totalpages += pages;
if (pages)
- node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
+ node_set_state(nid, N_HIGH_MEMORY);
}
return totalpages;
}
@@ -4753,6 +4565,8 @@ restart:
/* Spread kernelcore memory as evenly as possible throughout nodes */
kernelcore_node = required_kernelcore / usable_nodes;
for_each_node_state(nid, N_HIGH_MEMORY) {
+ unsigned long start_pfn, end_pfn;
+
/*
* Recalculate kernelcore_node if the division per node
* now exceeds what is necessary to satisfy the requested
@@ -4769,13 +4583,10 @@ restart:
kernelcore_remaining = kernelcore_node;
/* Go through each range of PFNs within this node */
- for_each_active_range_index_in_nid(i, nid) {
- unsigned long start_pfn, end_pfn;
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
unsigned long size_pages;
- start_pfn = max(early_node_map[i].start_pfn,
- zone_movable_pfn[nid]);
- end_pfn = early_node_map[i].end_pfn;
+ start_pfn = max(start_pfn, zone_movable_pfn[nid]);
if (start_pfn >= end_pfn)
continue;
@@ -4856,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat)
for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
- if (zone->present_pages)
+ if (zone->present_pages) {
node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+ break;
+ }
}
#endif
}
@@ -4877,11 +4690,8 @@ static void check_for_regular_memory(pg_data_t *pgdat)
*/
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
- unsigned long nid;
- int i;
-
- /* Sort early_node_map as initialisation assumes it is sorted */
- sort_node_map();
+ unsigned long start_pfn, end_pfn;
+ int i, nid;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
@@ -4928,11 +4738,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
}
/* Print out the early_node_map[] */
- printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
- for (i = 0; i < nr_nodemap_entries; i++)
- printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
- early_node_map[i].start_pfn,
- early_node_map[i].end_pfn);
+ printk("Early memory PFN ranges\n");
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+ printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
/* Initialise every node */
mminit_verify_pageflags_layout();
@@ -4985,7 +4793,7 @@ static int __init cmdline_parse_movablecore(char *p)
early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
/**
* set_dma_reserve - set the specified number of pages reserved in the first zone
@@ -5069,8 +4877,19 @@ static void calculate_totalreserve_pages(void)
if (max > zone->present_pages)
max = zone->present_pages;
reserve_pages += max;
+ /*
+ * Lowmem reserves are not available to
+ * GFP_HIGHUSER page cache allocations and
+ * kswapd tries to balance zones to their high
+ * watermark. As a result, neither should be
+ * regarded as dirtyable memory, to prevent a
+ * situation where reclaim has to clean pages
+ * in order to balance the zones.
+ */
+ zone->dirty_balance_reserve = max;
}
}
+ dirty_balance_reserve = reserve_pages;
totalreserve_pages = reserve_pages;
}
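
The dirty_balance_reserve accumulated above is meant to be subtracted from what the writeback code treats as dirtyable memory. A rough sketch of that consumer side; the helper name and the exact page counters are illustrative and not taken from this diff:

static unsigned long sketch_dirtyable_memory(void)
{
	unsigned long x;

	/* Free plus file-backed pages, minus the reserves kswapd defends. */
	x = global_page_state(NR_FREE_PAGES) + global_page_state(NR_FILE_PAGES);
	if (x > dirty_balance_reserve)
		x -= dirty_balance_reserve;
	else
		x = 0;
	return x;
}
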
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 39d216d535ea..de1616aa9b1e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,13 +11,6 @@
#include <linux/swapops.h>
#include <linux/kmemleak.h>
-static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
-{
- pc->flags = 0;
- set_page_cgroup_array_id(pc, id);
- pc->mem_cgroup = NULL;
- INIT_LIST_HEAD(&pc->lru);
-}
static unsigned long total_usage;
#if !defined(CONFIG_SPARSEMEM)
@@ -35,35 +28,27 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
struct page_cgroup *base;
base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_cgroup arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
if (unlikely(!base))
return NULL;
-
+#endif
offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
return base + offset;
}
-struct page *lookup_cgroup_page(struct page_cgroup *pc)
-{
- unsigned long pfn;
- struct page *page;
- pg_data_t *pgdat;
-
- pgdat = NODE_DATA(page_cgroup_array_id(pc));
- pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
- page = pfn_to_page(pfn);
- VM_BUG_ON(pc != lookup_page_cgroup(page));
- return page;
-}
-
static int __init alloc_node_page_cgroup(int nid)
{
- struct page_cgroup *base, *pc;
+ struct page_cgroup *base;
unsigned long table_size;
- unsigned long start_pfn, nr_pages, index;
+ unsigned long nr_pages;
- start_pfn = NODE_DATA(nid)->node_start_pfn;
nr_pages = NODE_DATA(nid)->node_spanned_pages;
-
if (!nr_pages)
return 0;
@@ -73,10 +58,6 @@ static int __init alloc_node_page_cgroup(int nid)
table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (!base)
return -ENOMEM;
- for (index = 0; index < nr_pages; index++) {
- pc = base + index;
- init_page_cgroup(pc, nid);
- }
NODE_DATA(nid)->node_page_cgroup = base;
total_usage += table_size;
return 0;
@@ -111,67 +92,45 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
-
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_cgroup arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
if (!section->page_cgroup)
return NULL;
+#endif
return section->page_cgroup + pfn;
}
-struct page *lookup_cgroup_page(struct page_cgroup *pc)
-{
- struct mem_section *section;
- struct page *page;
- unsigned long nr;
-
- nr = page_cgroup_array_id(pc);
- section = __nr_to_section(nr);
- page = pfn_to_page(pc - section->page_cgroup);
- VM_BUG_ON(pc != lookup_page_cgroup(page));
- return page;
-}
-
static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
+ gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
void *addr = NULL;
- addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
- if (addr)
+ addr = alloc_pages_exact_nid(nid, size, flags);
+ if (addr) {
+ kmemleak_alloc(addr, size, 1, flags);
return addr;
+ }
if (node_state(nid, N_HIGH_MEMORY))
- addr = vmalloc_node(size, nid);
+ addr = vzalloc_node(size, nid);
else
- addr = vmalloc(size);
+ addr = vzalloc(size);
return addr;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static void free_page_cgroup(void *addr)
-{
- if (is_vmalloc_addr(addr)) {
- vfree(addr);
- } else {
- struct page *page = virt_to_page(addr);
- size_t table_size =
- sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-
- BUG_ON(PageReserved(page));
- free_pages_exact(addr, table_size);
- }
-}
-#endif
-
static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
- struct page_cgroup *base, *pc;
struct mem_section *section;
+ struct page_cgroup *base;
unsigned long table_size;
- unsigned long nr;
- int index;
- nr = pfn_to_section_nr(pfn);
- section = __nr_to_section(nr);
+ section = __pfn_to_section(pfn);
if (section->page_cgroup)
return 0;
@@ -191,10 +150,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
return -ENOMEM;
}
- for (index = 0; index < PAGES_PER_SECTION; index++) {
- pc = base + index;
- init_page_cgroup(pc, nr);
- }
/*
* The passed "pfn" may not be aligned to SECTION. For the calculation
* we need to apply a mask.
@@ -205,6 +160,20 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_page_cgroup(void *addr)
+{
+ if (is_vmalloc_addr(addr)) {
+ vfree(addr);
+ } else {
+ struct page *page = virt_to_page(addr);
+ size_t table_size =
+ sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+
+ BUG_ON(PageReserved(page));
+ free_pages_exact(addr, table_size);
+ }
+}
+
void __free_page_cgroup(unsigned long pfn)
{
struct mem_section *ms;
@@ -357,13 +326,12 @@ struct swap_cgroup_ctrl {
spinlock_t lock;
};
-struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
struct swap_cgroup {
unsigned short id;
};
#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
-#define SC_POS_MASK (SC_PER_PAGE - 1)
/*
* SwapCgroup implements "lookup" and "exchange" operations.
@@ -405,6 +373,21 @@ not_enough_page:
return -ENOMEM;
}
+static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
+ struct swap_cgroup_ctrl **ctrlp)
+{
+ pgoff_t offset = swp_offset(ent);
+ struct swap_cgroup_ctrl *ctrl;
+ struct page *mappage;
+
+ ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+ if (ctrlp)
+ *ctrlp = ctrl;
+
+ mappage = ctrl->map[offset / SC_PER_PAGE];
+ return page_address(mappage) + offset % SC_PER_PAGE;
+}
+
/**
* swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
* @end: swap entry to be cmpxchged
@@ -417,21 +400,13 @@ not_enough_page:
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
unsigned short old, unsigned short new)
{
- int type = swp_type(ent);
- unsigned long offset = swp_offset(ent);
- unsigned long idx = offset / SC_PER_PAGE;
- unsigned long pos = offset & SC_POS_MASK;
struct swap_cgroup_ctrl *ctrl;
- struct page *mappage;
struct swap_cgroup *sc;
unsigned long flags;
unsigned short retval;
- ctrl = &swap_cgroup_ctrl[type];
+ sc = lookup_swap_cgroup(ent, &ctrl);
- mappage = ctrl->map[idx];
- sc = page_address(mappage);
- sc += pos;
spin_lock_irqsave(&ctrl->lock, flags);
retval = sc->id;
if (retval == old)
@@ -452,21 +427,13 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
*/
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
- int type = swp_type(ent);
- unsigned long offset = swp_offset(ent);
- unsigned long idx = offset / SC_PER_PAGE;
- unsigned long pos = offset & SC_POS_MASK;
struct swap_cgroup_ctrl *ctrl;
- struct page *mappage;
struct swap_cgroup *sc;
unsigned short old;
unsigned long flags;
- ctrl = &swap_cgroup_ctrl[type];
+ sc = lookup_swap_cgroup(ent, &ctrl);
- mappage = ctrl->map[idx];
- sc = page_address(mappage);
- sc += pos;
spin_lock_irqsave(&ctrl->lock, flags);
old = sc->id;
sc->id = id;
@@ -476,28 +443,14 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
}
/**
- * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
+ * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
* @ent: swap entry to be looked up.
*
* Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
*/
-unsigned short lookup_swap_cgroup(swp_entry_t ent)
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
- int type = swp_type(ent);
- unsigned long offset = swp_offset(ent);
- unsigned long idx = offset / SC_PER_PAGE;
- unsigned long pos = offset & SC_POS_MASK;
- struct swap_cgroup_ctrl *ctrl;
- struct page *mappage;
- struct swap_cgroup *sc;
- unsigned short ret;
-
- ctrl = &swap_cgroup_ctrl[type];
- mappage = ctrl->map[idx];
- sc = page_address(mappage);
- sc += pos;
- ret = sc->id;
- return ret;
+ return lookup_swap_cgroup(ent, NULL)->id;
}
int swap_cgroup_swapon(int type, unsigned long max_pages)
@@ -513,11 +466,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
array_size = length * sizeof(void *);
- array = vmalloc(array_size);
+ array = vzalloc(array_size);
if (!array)
goto nomem;
- memset(array, 0, array_size);
ctrl = &swap_cgroup_ctrl[type];
mutex_lock(&swap_cgroup_mutex);
ctrl->length = length;
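
The swap_cgroup hunks above fold the repeated map-index arithmetic into a single lookup_swap_cgroup() helper and rename the id accessor to lookup_swap_cgroup_id(). A hypothetical caller of the renamed accessor, purely for illustration:

/* 0 is the invalid css id, per the kerneldoc above. */
static bool sketch_swap_entry_has_memcg(swp_entry_t ent)
{
	return lookup_swap_cgroup_id(ent) != 0;
}
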
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index ea534960a04b..12a48a88c0d8 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -50,14 +50,13 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
if (!pages || !bitmap) {
if (may_alloc && !pages)
- pages = pcpu_mem_alloc(pages_size);
+ pages = pcpu_mem_zalloc(pages_size);
if (may_alloc && !bitmap)
- bitmap = pcpu_mem_alloc(bitmap_size);
+ bitmap = pcpu_mem_zalloc(bitmap_size);
if (!pages || !bitmap)
return NULL;
}
- memset(pages, 0, pages_size);
bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
*bitmapp = bitmap;
@@ -143,8 +142,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_cache_vunmap(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
@@ -206,8 +205,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_tlb_kernel_range(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -284,8 +283,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_cache_vmap(
- pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
- pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}
/**
diff --git a/mm/percpu.c b/mm/percpu.c
index bf80e55dbed7..f47af9123af7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -67,6 +67,7 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
+#include <linux/kmemleak.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
@@ -116,9 +117,9 @@ static int pcpu_atom_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;
-/* cpus with the lowest and highest unit numbers */
-static unsigned int pcpu_first_unit_cpu __read_mostly;
-static unsigned int pcpu_last_unit_cpu __read_mostly;
+/* cpus with the lowest and highest unit addresses */
+static unsigned int pcpu_low_unit_cpu __read_mostly;
+static unsigned int pcpu_high_unit_cpu __read_mostly;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
@@ -273,11 +274,11 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
(rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
/**
- * pcpu_mem_alloc - allocate memory
+ * pcpu_mem_zalloc - allocate memory
* @size: bytes to allocate
*
* Allocate @size bytes. If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vmalloc() is used. The returned
+ * kzalloc() is used; otherwise, vzalloc() is used. The returned
* memory is always zeroed.
*
* CONTEXT:
@@ -286,7 +287,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
-static void *pcpu_mem_alloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size)
{
if (WARN_ON_ONCE(!slab_is_available()))
return NULL;
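
As the updated comment notes, pcpu_mem_zalloc() dispatches on size: kzalloc() for sub-page requests, vzalloc() for larger ones, always returning zeroed memory. A standalone restatement of that policy (illustrative, not the function body itself):

static void *sketch_zalloc_by_size(size_t size)
{
	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	/* Larger requests come from the vmalloc area, already zeroed. */
	return vzalloc(size);
}
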
@@ -302,7 +303,7 @@ static void *pcpu_mem_alloc(size_t size)
* @ptr: memory to free
* @size: size of the area
*
- * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc().
+ * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
*/
static void pcpu_mem_free(void *ptr, size_t size)
{
@@ -384,7 +385,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
unsigned long flags;
- new = pcpu_mem_alloc(new_size);
+ new = pcpu_mem_zalloc(new_size);
if (!new)
return -ENOMEM;
@@ -604,11 +605,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
struct pcpu_chunk *chunk;
- chunk = pcpu_mem_alloc(pcpu_chunk_struct_size);
+ chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
if (!chunk)
return NULL;
- chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+ chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
+ sizeof(chunk->map[0]));
if (!chunk->map) {
kfree(chunk);
return NULL;
@@ -709,6 +711,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
const char *err;
int slot, off, new_alloc;
unsigned long flags;
+ void __percpu *ptr;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -801,7 +804,9 @@ area_found:
mutex_unlock(&pcpu_alloc_mutex);
/* return address relative to base address */
- return __addr_to_pcpu_ptr(chunk->base_addr + off);
+ ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
+ kmemleak_alloc_percpu(ptr, size);
+ return ptr;
fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
@@ -915,6 +920,8 @@ void free_percpu(void __percpu *ptr)
if (!ptr)
return;
+ kmemleak_free_percpu(ptr);
+
addr = __pcpu_ptr_to_addr(ptr);
spin_lock_irqsave(&pcpu_lock, flags);
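
Together with the kmemleak_alloc_percpu() call added to pcpu_alloc() above, this gives every dynamic percpu object a matched report/unreport pair in kmemleak. A lifecycle sketch under that assumption (the wrapper names below are made up):

static int __percpu *sketch_counter_create(void)
{
	/* pcpu_alloc() reports the object via kmemleak_alloc_percpu() */
	return alloc_percpu(int);
}

static void sketch_counter_destroy(int __percpu *ctr)
{
	/* free_percpu() unreports it via kmemleak_free_percpu() */
	free_percpu(ctr);
}
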
@@ -977,6 +984,17 @@ bool is_kernel_percpu_address(unsigned long addr)
* address. The caller is responsible for ensuring @addr stays valid
* until this function finishes.
*
+ * The percpu allocator has special setup for the first chunk, which
+ * currently supports either embedding in the linear address space or a
+ * vmalloc mapping; from the second chunk onwards the backing allocator
+ * (currently either vm or km) provides the translation.
+ *
+ * The addr can be translated simply without checking whether it falls into
+ * the first chunk, but the current code better reflects how the percpu
+ * allocator actually works, and the verification can catch bugs both in the
+ * percpu allocator itself and in per_cpu_ptr_to_phys() callers, so we keep
+ * it.
+ *
* RETURNS:
* The physical address for @addr.
*/
@@ -984,19 +1002,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
bool in_first_chunk = false;
- unsigned long first_start, first_end;
+ unsigned long first_low, first_high;
unsigned int cpu;
/*
- * The following test on first_start/end isn't strictly
+ * The following test on unit_low/high isn't strictly
* necessary but will speed up lookups of addresses which
* aren't in the first chunk.
*/
- first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0);
- first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu,
- pcpu_unit_pages);
- if ((unsigned long)addr >= first_start &&
- (unsigned long)addr < first_end) {
+ first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
+ first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
+ pcpu_unit_pages);
+ if ((unsigned long)addr >= first_low &&
+ (unsigned long)addr < first_high) {
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(base, cpu);
@@ -1011,9 +1029,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
if (!is_vmalloc_addr(addr))
return __pa(addr);
else
- return page_to_phys(vmalloc_to_page(addr));
+ return page_to_phys(vmalloc_to_page(addr)) +
+ offset_in_page(addr);
} else
- return page_to_phys(pcpu_addr_to_page(addr));
+ return page_to_phys(pcpu_addr_to_page(addr)) +
+ offset_in_page(addr);
}
/**
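
The offset_in_page() additions above fix per_cpu_ptr_to_phys() for addresses that are not page aligned: the result is the backing page's physical base plus the sub-page offset. An isolated restatement of that arithmetic (illustrative helper, not in the patch):

static phys_addr_t sketch_vmalloc_virt_to_phys(void *addr)
{
	/* Physical base of the backing page plus the offset inside it. */
	return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr);
}
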
@@ -1233,7 +1253,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
- pcpu_first_unit_cpu = NR_CPUS;
+
+ pcpu_low_unit_cpu = NR_CPUS;
+ pcpu_high_unit_cpu = NR_CPUS;
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
const struct pcpu_group_info *gi = &ai->groups[group];
@@ -1253,9 +1275,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
unit_map[cpu] = unit + i;
unit_off[cpu] = gi->base_offset + i * ai->unit_size;
- if (pcpu_first_unit_cpu == NR_CPUS)
- pcpu_first_unit_cpu = cpu;
- pcpu_last_unit_cpu = cpu;
+ /* determine low/high unit_cpu */
+ if (pcpu_low_unit_cpu == NR_CPUS ||
+ unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
+ pcpu_low_unit_cpu = cpu;
+ if (pcpu_high_unit_cpu == NR_CPUS ||
+ unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
+ pcpu_high_unit_cpu = cpu;
}
}
pcpu_nr_units = unit;
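
The low/high bookkeeping above matters because, with sparse unit layouts, the first and last cpus in enumeration order need not own the lowest and highest unit addresses. A generic restatement of the selection (illustrative only):

static void sketch_track_extremes(unsigned int cpu, unsigned long off,
				  unsigned int *low_cpu, unsigned long *low_off,
				  unsigned int *high_cpu, unsigned long *high_off)
{
	if (*low_cpu == NR_CPUS || off < *low_off) {
		*low_cpu = cpu;
		*low_off = off;
	}
	if (*high_cpu == NR_CPUS || off > *high_off) {
		*high_cpu = cpu;
		*high_off = off;
	}
}
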
@@ -1619,6 +1645,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
rc = -ENOMEM;
goto out_free_areas;
}
+ /* kmemleak tracks the percpu allocations separately */
+ kmemleak_free(ptr);
areas[group] = ptr;
base = min(ptr, base);
@@ -1733,6 +1761,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
"for cpu%u\n", psize_str, cpu);
goto enomem;
}
+ /* kmemleak tracks the percpu allocations separately */
+ kmemleak_free(ptr);
pages[j++] = virt_to_page(ptr);
}
@@ -1889,7 +1919,7 @@ void __init percpu_init_late(void)
BUILD_BUG_ON(size > PAGE_SIZE);
- map = pcpu_mem_alloc(size);
+ map = pcpu_mem_zalloc(size);
BUG_ON(!map);
spin_lock_irqsave(&pcpu_lock, flags);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
new file mode 100644
index 000000000000..e920aa3ce104
--- /dev/null
+++ b/mm/process_vm_access.c
@@ -0,0 +1,496 @@
+/*
+ * linux/mm/process_vm_access.c
+ *
+ * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+
+/**
+ * process_vm_rw_pages - read/write pages from task specified
+ * @task: task to read/write from
+ * @mm: mm for task
+ * @process_pages: struct pages area that can store at least
+ * nr_pages_to_copy struct page pointers
+ * @pa: address of page in task to start copying from/to
+ * @start_offset: offset in page to start copying from/to
+ * @len: number of bytes to copy
+ * @lvec: iovec array specifying where to copy to/from
+ * @lvec_cnt: number of elements in iovec array
+ * @lvec_current: index in iovec array we are up to
+ * @lvec_offset: offset in bytes from current iovec iov_base we are up to
+ * @vm_write: 0 means copy from, 1 means copy to
+ * @nr_pages_to_copy: number of pages to copy
+ * @bytes_copied: returns number of bytes successfully copied
+ * Returns 0 on success, error code otherwise
+ */
+static int process_vm_rw_pages(struct task_struct *task,
+ struct mm_struct *mm,
+ struct page **process_pages,
+ unsigned long pa,
+ unsigned long start_offset,
+ unsigned long len,
+ const struct iovec *lvec,
+ unsigned long lvec_cnt,
+ unsigned long *lvec_current,
+ size_t *lvec_offset,
+ int vm_write,
+ unsigned int nr_pages_to_copy,
+ ssize_t *bytes_copied)
+{
+ int pages_pinned;
+ void *target_kaddr;
+ int pgs_copied = 0;
+ int j;
+ int ret;
+ ssize_t bytes_to_copy;
+ ssize_t rc = 0;
+
+ *bytes_copied = 0;
+
+ /* Get the pages we're interested in */
+ down_read(&mm->mmap_sem);
+ pages_pinned = get_user_pages(task, mm, pa,
+ nr_pages_to_copy,
+ vm_write, 0, process_pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ if (pages_pinned != nr_pages_to_copy) {
+ rc = -EFAULT;
+ goto end;
+ }
+
+ /* Do the copy for each page */
+ for (pgs_copied = 0;
+ (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
+ pgs_copied++) {
+ /* Make sure we have a non zero length iovec */
+ while (*lvec_current < lvec_cnt
+ && lvec[*lvec_current].iov_len == 0)
+ (*lvec_current)++;
+ if (*lvec_current == lvec_cnt)
+ break;
+
+ /*
+ * Will copy smallest of:
+ * - bytes remaining in page
+ * - bytes remaining in destination iovec
+ */
+ bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
+ len - *bytes_copied);
+ bytes_to_copy = min_t(ssize_t, bytes_to_copy,
+ lvec[*lvec_current].iov_len
+ - *lvec_offset);
+
+ target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
+
+ if (vm_write)
+ ret = copy_from_user(target_kaddr,
+ lvec[*lvec_current].iov_base
+ + *lvec_offset,
+ bytes_to_copy);
+ else
+ ret = copy_to_user(lvec[*lvec_current].iov_base
+ + *lvec_offset,
+ target_kaddr, bytes_to_copy);
+ kunmap(process_pages[pgs_copied]);
+ if (ret) {
+ *bytes_copied += bytes_to_copy - ret;
+ pgs_copied++;
+ rc = -EFAULT;
+ goto end;
+ }
+ *bytes_copied += bytes_to_copy;
+ *lvec_offset += bytes_to_copy;
+ if (*lvec_offset == lvec[*lvec_current].iov_len) {
+ /*
+ * Need to copy remaining part of page into the
+ * next iovec if there are any bytes left in page
+ */
+ (*lvec_current)++;
+ *lvec_offset = 0;
+ start_offset = (start_offset + bytes_to_copy)
+ % PAGE_SIZE;
+ if (start_offset)
+ pgs_copied--;
+ } else {
+ start_offset = 0;
+ }
+ }
+
+end:
+ if (vm_write) {
+ for (j = 0; j < pages_pinned; j++) {
+ if (j < pgs_copied)
+ set_page_dirty_lock(process_pages[j]);
+ put_page(process_pages[j]);
+ }
+ } else {
+ for (j = 0; j < pages_pinned; j++)
+ put_page(process_pages[j]);
+ }
+
+ return rc;
+}
+
+/*
+ * Maximum size (in bytes) of the kmalloc'd area used to hold struct page
+ * pointers during the copy.
+ */
+#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
+
+/**
+ * process_vm_rw_single_vec - read/write pages from task specified
+ * @addr: start memory address of target process
+ * @len: size of area to copy to/from
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @lvec_cnt: number of elements in iovec array
+ * @lvec_current: index in iovec array we are up to
+ * @lvec_offset: offset in bytes from current iovec iov_base we are up to
+ * @process_pages: struct pages area that can store at least
+ * nr_pages_to_copy struct page pointers
+ * @mm: mm for task
+ * @task: task to read/write from
+ * @vm_write: 0 means copy from, 1 means copy to
+ * @bytes_copied: returns number of bytes successfully copied
+ * Returns 0 on success or an error code on failure
+ */
+static int process_vm_rw_single_vec(unsigned long addr,
+ unsigned long len,
+ const struct iovec *lvec,
+ unsigned long lvec_cnt,
+ unsigned long *lvec_current,
+ size_t *lvec_offset,
+ struct page **process_pages,
+ struct mm_struct *mm,
+ struct task_struct *task,
+ int vm_write,
+ ssize_t *bytes_copied)
+{
+ unsigned long pa = addr & PAGE_MASK;
+ unsigned long start_offset = addr - pa;
+ unsigned long nr_pages;
+ ssize_t bytes_copied_loop;
+ ssize_t rc = 0;
+ unsigned long nr_pages_copied = 0;
+ unsigned long nr_pages_to_copy;
+ unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
+ / sizeof(struct page *);
+
+ *bytes_copied = 0;
+
+ /* Work out address and page range required */
+ if (len == 0)
+ return 0;
+ nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
+
+ while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
+ nr_pages_to_copy = min(nr_pages - nr_pages_copied,
+ max_pages_per_loop);
+
+ rc = process_vm_rw_pages(task, mm, process_pages, pa,
+ start_offset, len,
+ lvec, lvec_cnt,
+ lvec_current, lvec_offset,
+ vm_write, nr_pages_to_copy,
+ &bytes_copied_loop);
+ start_offset = 0;
+ *bytes_copied += bytes_copied_loop;
+
+ if (rc < 0) {
+ return rc;
+ } else {
+ len -= bytes_copied_loop;
+ nr_pages_copied += nr_pages_to_copy;
+ pa += nr_pages_to_copy * PAGE_SIZE;
+ }
+ }
+
+ return rc;
+}
+
+/* Maximum number of entries for process pages array
+ which lives on stack */
+#define PVM_MAX_PP_ARRAY_COUNT 16
+
+/**
+ * process_vm_rw_core - core of reading/writing pages from task specified
+ * @pid: PID of process to read/write from/to
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @liovcnt: size of lvec array
+ * @rvec: iovec array specifying where to copy to/from in the other process
+ * @riovcnt: size of rvec array
+ * @flags: currently unused
+ * @vm_write: 0 if reading from other process, 1 if writing to other process
+ * Returns the number of bytes read/written or an error code. May
+ * return fewer bytes than expected if an error occurs during the copying
+ * process.
+ */
+static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
+ unsigned long liovcnt,
+ const struct iovec *rvec,
+ unsigned long riovcnt,
+ unsigned long flags, int vm_write)
+{
+ struct task_struct *task;
+ struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
+ struct page **process_pages = pp_stack;
+ struct mm_struct *mm;
+ unsigned long i;
+ ssize_t rc = 0;
+ ssize_t bytes_copied_loop;
+ ssize_t bytes_copied = 0;
+ unsigned long nr_pages = 0;
+ unsigned long nr_pages_iov;
+ unsigned long iov_l_curr_idx = 0;
+ size_t iov_l_curr_offset = 0;
+ ssize_t iov_len;
+
+ /*
+ * Work out how many pages' worth of struct page pointers we're going to
+ * need when eventually calling get_user_pages()
+ */
+ for (i = 0; i < riovcnt; i++) {
+ iov_len = rvec[i].iov_len;
+ if (iov_len > 0) {
+ nr_pages_iov = ((unsigned long)rvec[i].iov_base
+ + iov_len)
+ / PAGE_SIZE - (unsigned long)rvec[i].iov_base
+ / PAGE_SIZE + 1;
+ nr_pages = max(nr_pages, nr_pages_iov);
+ }
+ }
+
+ if (nr_pages == 0)
+ return 0;
+
+ if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
+ /* For reliability don't try to kmalloc more than
+ 2 pages worth */
+ process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
+ sizeof(struct page *)*nr_pages),
+ GFP_KERNEL);
+
+ if (!process_pages)
+ return -ENOMEM;
+ }
+
+ /* Get process information */
+ rcu_read_lock();
+ task = find_task_by_vpid(pid);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+ if (!task) {
+ rc = -ESRCH;
+ goto free_proc_pages;
+ }
+
+ task_lock(task);
+ if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+ task_unlock(task);
+ rc = -EPERM;
+ goto put_task_struct;
+ }
+ mm = task->mm;
+
+ if (!mm || (task->flags & PF_KTHREAD)) {
+ task_unlock(task);
+ rc = -EINVAL;
+ goto put_task_struct;
+ }
+
+ atomic_inc(&mm->mm_users);
+ task_unlock(task);
+
+ for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
+ rc = process_vm_rw_single_vec(
+ (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
+ lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
+ process_pages, mm, task, vm_write, &bytes_copied_loop);
+ bytes_copied += bytes_copied_loop;
+ if (rc != 0) {
+ /* If we have managed to copy any data at all then
+ we return the number of bytes copied. Otherwise
+ we return the error code */
+ if (bytes_copied)
+ rc = bytes_copied;
+ goto put_mm;
+ }
+ }
+
+ rc = bytes_copied;
+put_mm:
+ mmput(mm);
+
+put_task_struct:
+ put_task_struct(task);
+
+free_proc_pages:
+ if (process_pages != pp_stack)
+ kfree(process_pages);
+ return rc;
+}
+
+/**
+ * process_vm_rw - check iovecs before calling core routine
+ * @pid: PID of process to read/write from/to
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @liovcnt: size of lvec array
+ * @rvec: iovec array specifying where to copy to/from in the other process
+ * @riovcnt: size of rvec array
+ * @flags: currently unused
+ * @vm_write: 0 if reading from other process, 1 if writing to other process
+ * Returns the number of bytes read/written or an error code. May
+ * return fewer bytes than expected if an error occurs during the copying
+ * process.
+ */
+static ssize_t process_vm_rw(pid_t pid,
+ const struct iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags, int vm_write)
+{
+ struct iovec iovstack_l[UIO_FASTIOV];
+ struct iovec iovstack_r[UIO_FASTIOV];
+ struct iovec *iov_l = iovstack_l;
+ struct iovec *iov_r = iovstack_r;
+ ssize_t rc;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ /* Check iovecs */
+ if (vm_write)
+ rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
+ iovstack_l, &iov_l, 1);
+ else
+ rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
+ iovstack_l, &iov_l, 1);
+ if (rc <= 0)
+ goto free_iovecs;
+
+ rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV,
+ iovstack_r, &iov_r, 0);
+ if (rc <= 0)
+ goto free_iovecs;
+
+ rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
+ vm_write);
+
+free_iovecs:
+ if (iov_r != iovstack_r)
+ kfree(iov_r);
+ if (iov_l != iovstack_l)
+ kfree(iov_l);
+
+ return rc;
+}
+
+SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
+ unsigned long, liovcnt, const struct iovec __user *, rvec,
+ unsigned long, riovcnt, unsigned long, flags)
+{
+ return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
+}
+
+SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
+ const struct iovec __user *, lvec,
+ unsigned long, liovcnt, const struct iovec __user *, rvec,
+ unsigned long, riovcnt, unsigned long, flags)
+{
+ return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
+}
+
+#ifdef CONFIG_COMPAT
+
+asmlinkage ssize_t
+compat_process_vm_rw(compat_pid_t pid,
+ const struct compat_iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct compat_iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags, int vm_write)
+{
+ struct iovec iovstack_l[UIO_FASTIOV];
+ struct iovec iovstack_r[UIO_FASTIOV];
+ struct iovec *iov_l = iovstack_l;
+ struct iovec *iov_r = iovstack_r;
+ ssize_t rc = -EFAULT;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
+ goto out;
+
+ if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
+ goto out;
+
+ if (vm_write)
+ rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
+ UIO_FASTIOV, iovstack_l,
+ &iov_l, 1);
+ else
+ rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
+ UIO_FASTIOV, iovstack_l,
+ &iov_l, 1);
+ if (rc <= 0)
+ goto free_iovecs;
+ rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt,
+ UIO_FASTIOV, iovstack_r,
+ &iov_r, 0);
+ if (rc <= 0)
+ goto free_iovecs;
+
+ rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
+ vm_write);
+
+free_iovecs:
+ if (iov_r != iovstack_r)
+ kfree(iov_r);
+ if (iov_l != iovstack_l)
+ kfree(iov_l);
+
+out:
+ return rc;
+}
+
+asmlinkage ssize_t
+compat_sys_process_vm_readv(compat_pid_t pid,
+ const struct compat_iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct compat_iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags)
+{
+ return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
+ riovcnt, flags, 0);
+}
+
+asmlinkage ssize_t
+compat_sys_process_vm_writev(compat_pid_t pid,
+ const struct compat_iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct compat_iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags)
+{
+ return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
+ riovcnt, flags, 1);
+}
+
+#endif
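
For orientation, a userspace sketch of the syscalls this file wires up. It assumes a libc that exposes the process_vm_readv() wrapper (glibc 2.15 and later); otherwise syscall(2) would be needed. Error handling is trimmed:

#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Copy len bytes from remote_addr in process pid into local_buf. */
ssize_t sketch_read_remote(pid_t pid, void *remote_addr,
			   void *local_buf, size_t len)
{
	struct iovec local  = { .iov_base = local_buf,   .iov_len = len };
	struct iovec remote = { .iov_base = remote_addr, .iov_len = len };
	ssize_t n = process_vm_readv(pid, &local, 1, &remote, 1, 0);

	if (n < 0)
		perror("process_vm_readv");
	return n;	/* bytes actually copied, or -1 on error */
}
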
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 2876349339a7..942212970529 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -17,7 +17,6 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
-#include <linux/module.h>
#include <linux/quicklist.h>
DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
diff --git a/mm/readahead.c b/mm/readahead.c
index 867f9dd82dcd..cbcbb02f3e28 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -11,7 +11,7 @@
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index 8005080fb9e3..c8454e06b6c8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -51,7 +51,7 @@
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
@@ -272,6 +272,51 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
}
/*
+ * Some rmap walks that need to find all ptes/hugepmds without false
+ * negatives (like migrate and split_huge_page) must be safe while running
+ * concurrently with operations that copy or move pagetables (like mremap()
+ * and fork()). They depend on the anon_vma "same_anon_vma"
+ * list to be in a certain order: the dst_vma must be placed after the
+ * src_vma in the list. This is always guaranteed by fork() but
+ * mremap() needs to call this function to enforce it in case the
+ * dst_vma isn't newly allocated and chained with the anon_vma_clone()
+ * function but just an extension of a pre-existing vma through
+ * vma_merge.
+ *
+ * NOTE: the same_anon_vma list can still be changed by other
+ * processes while mremap runs because mremap doesn't hold the
+ * anon_vma mutex to prevent modifications to the list while it
+ * runs. All we need to enforce is that the relative order of this
+ * process's vmas isn't changing (we don't care about other vmas'
+ * order). Each vma corresponds to an anon_vma_chain structure so
+ * there's no risk that other processes calling anon_vma_moveto_tail()
+ * and changing the same_anon_vma list under mremap() will screw with
+ * the relative order of this process's vmas in the list, because
+ * they can't alter the order of any vma that belongs to this
+ * process. And there can't be another anon_vma_moveto_tail() running
+ * concurrently with mremap() coming from this process because we hold
+ * the mmap_sem for the whole mremap(). fork() ordering dependency
+ * also shouldn't be affected because fork() only cares that the
+ * parent vmas are placed in the list before the child vmas and
+ * anon_vma_moveto_tail() won't reorder vmas from either the fork()
+ * parent or child.
+ */
+void anon_vma_moveto_tail(struct vm_area_struct *dst)
+{
+ struct anon_vma_chain *pavc;
+ struct anon_vma *root = NULL;
+
+ list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = pavc->anon_vma;
+ VM_BUG_ON(pavc->vma != dst);
+ root = lock_anon_vma_root(root, anon_vma);
+ list_del(&pavc->same_anon_vma);
+ list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
+ }
+ unlock_anon_vma_root(root);
+}
+
+/*
* Attach vma to its own anon_vma, as well as to the anon_vmas that
* the corresponding VMA in the parent process is attached to.
* Returns 0 on success, non-zero on failure.
@@ -728,7 +773,7 @@ out:
}
static int page_referenced_anon(struct page *page,
- struct mem_cgroup *mem_cont,
+ struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
unsigned int mapcount;
@@ -751,7 +796,7 @@ static int page_referenced_anon(struct page *page,
* counting on behalf of references from different
* cgroups
*/
- if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
continue;
referenced += page_referenced_one(page, vma, address,
&mapcount, vm_flags);
@@ -766,7 +811,7 @@ static int page_referenced_anon(struct page *page,
/**
* page_referenced_file - referenced check for object-based rmap
* @page: the page we're checking references on.
- * @mem_cont: target memory controller
+ * @memcg: target memory control group
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
*
* For an object-based mapped page, find all the places it is mapped and
@@ -777,7 +822,7 @@ static int page_referenced_anon(struct page *page,
* This function is only called from page_referenced for object-based pages.
*/
static int page_referenced_file(struct page *page,
- struct mem_cgroup *mem_cont,
+ struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
unsigned int mapcount;
@@ -819,7 +864,7 @@ static int page_referenced_file(struct page *page,
* counting on behalf of references from different
* cgroups
*/
- if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
continue;
referenced += page_referenced_one(page, vma, address,
&mapcount, vm_flags);
@@ -835,7 +880,7 @@ static int page_referenced_file(struct page *page,
* page_referenced - test if the page was referenced
* @page: the page to test
* @is_locked: caller holds lock on the page
- * @mem_cont: target memory controller
+ * @memcg: target memory cgroup
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
*
* Quick test_and_clear_referenced for all mappings to a page,
@@ -843,7 +888,7 @@ static int page_referenced_file(struct page *page,
*/
int page_referenced(struct page *page,
int is_locked,
- struct mem_cgroup *mem_cont,
+ struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
int referenced = 0;
@@ -859,13 +904,13 @@ int page_referenced(struct page *page,
}
}
if (unlikely(PageKsm(page)))
- referenced += page_referenced_ksm(page, mem_cont,
+ referenced += page_referenced_ksm(page, memcg,
vm_flags);
else if (PageAnon(page))
- referenced += page_referenced_anon(page, mem_cont,
+ referenced += page_referenced_anon(page, memcg,
vm_flags);
else if (page->mapping)
- referenced += page_referenced_file(page, mem_cont,
+ referenced += page_referenced_file(page, memcg,
vm_flags);
if (we_locked)
unlock_page(page);
@@ -1164,7 +1209,7 @@ void page_remove_rmap(struct page *page)
/*
* Subfunctions of try_to_unmap: try_to_unmap_one called
- * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
+ * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
*/
int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, enum ttu_flags flags)
diff --git a/mm/shmem.c b/mm/shmem.c
index 32f6763f16fb..feead1943d92 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,7 +28,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
static struct vfsmount *shm_mnt;
@@ -1068,6 +1068,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
user_shm_unlock(inode->i_size, user);
info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping);
+ /*
+ * Ensure that a racing putback_lru_page() can see
+ * the pages of this mapping are evictable when we
+ * skip them due to !PageLRU during the scan.
+ */
+ smp_mb__after_clear_bit();
scan_mapping_unevictable_pages(file->f_mapping);
}
retval = 0;
@@ -1086,7 +1092,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
}
static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
- int mode, dev_t dev, unsigned long flags)
+ umode_t mode, dev_t dev, unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
@@ -1450,7 +1456,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
* File creation. Allocate an inode, and we're done..
*/
static int
-shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
int error = -ENOSPC;
@@ -1458,7 +1464,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
error = security_inode_init_security(inode, dir,
- &dentry->d_name, NULL,
+ &dentry->d_name,
NULL, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
@@ -1483,7 +1489,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
return error;
}
-static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int error;
@@ -1493,7 +1499,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return 0;
}
-static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
+static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct nameidata *nd)
{
return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
@@ -1598,7 +1604,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
if (!inode)
return -ENOSPC;
- error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
+ error = security_inode_init_security(inode, dir, &dentry->d_name,
NULL, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
@@ -2112,9 +2118,9 @@ out:
return error;
}
-static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int shmem_show_options(struct seq_file *seq, struct dentry *root)
{
- struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
if (sbinfo->max_blocks != shmem_default_max_blocks())
seq_printf(seq, ",size=%luk",
@@ -2122,7 +2128,7 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (sbinfo->max_inodes != shmem_default_max_inodes())
seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
- seq_printf(seq, ",mode=%03o", sbinfo->mode);
+ seq_printf(seq, ",mode=%03ho", sbinfo->mode);
if (sbinfo->uid != 0)
seq_printf(seq, ",uid=%u", sbinfo->uid);
if (sbinfo->gid != 0)
@@ -2228,13 +2234,12 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
static void shmem_destroy_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
static void shmem_destroy_inode(struct inode *inode)
{
- if ((inode->i_mode & S_IFMT) == S_IFREG)
+ if (S_ISREG(inode->i_mode))
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
call_rcu(&inode->i_rcu, shmem_destroy_callback);
}
@@ -2497,7 +2502,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
d_instantiate(path.dentry, inode);
inode->i_size = size;
- inode->i_nlink = 0; /* It is unlinked */
+ clear_nlink(inode); /* It is unlinked */
#ifndef CONFIG_MMU
error = ramfs_nommu_expand_for_mapping(inode, size);
if (error)
diff --git a/mm/slab.c b/mm/slab.c
index 6d90a091fdca..f0bd7857ab3b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -121,6 +121,8 @@
#include <asm/tlbflush.h>
#include <asm/page.h>
+#include <trace/events/kmem.h>
+
/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -479,11 +481,13 @@ EXPORT_SYMBOL(slab_buffer_size);
#endif
/*
- * Do not go above this order unless 0 objects fit into the slab.
+ * Do not go above this order unless 0 objects fit into the slab or
+ * overridden on the command line.
*/
-#define BREAK_GFP_ORDER_HI 1
-#define BREAK_GFP_ORDER_LO 0
-static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+#define SLAB_MAX_ORDER_HI 1
+#define SLAB_MAX_ORDER_LO 0
+static int slab_max_order = SLAB_MAX_ORDER_LO;
+static bool slab_max_order_set __initdata;
/*
* Functions for storing/retrieving the cachep and or slab from the page
@@ -595,6 +599,7 @@ static enum {
PARTIAL_AC,
PARTIAL_L3,
EARLY,
+ LATE,
FULL
} g_cpucache_up;
@@ -671,7 +676,7 @@ static void init_node_lock_keys(int q)
{
struct cache_sizes *s = malloc_sizes;
- if (g_cpucache_up != FULL)
+ if (g_cpucache_up < LATE)
return;
for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
@@ -851,6 +856,17 @@ static int __init noaliencache_setup(char *s)
}
__setup("noaliencache", noaliencache_setup);
+static int __init slab_max_order_setup(char *str)
+{
+ get_option(&str, &slab_max_order);
+ slab_max_order = slab_max_order < 0 ? 0 :
+ min(slab_max_order, MAX_ORDER - 1);
+ slab_max_order_set = true;
+
+ return 1;
+}
+__setup("slab_max_order=", slab_max_order_setup);
+
#ifdef CONFIG_NUMA
/*
* Special reaping functions for NUMA systems called from cache_reap().
@@ -1499,10 +1515,11 @@ void __init kmem_cache_init(void)
/*
* Fragmentation resistance on low memory - only use bigger
- * page orders on machines with more than 32MB of memory.
+ * page orders on machines with more than 32MB of memory if
+ * not overridden on the command line.
*/
- if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
- slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+ if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
+ slab_max_order = SLAB_MAX_ORDER_HI;
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
@@ -1666,6 +1683,8 @@ void __init kmem_cache_init_late(void)
{
struct kmem_cache *cachep;
+ g_cpucache_up = LATE;
+
/* Annotate slab for lockdep -- annotate the malloc caches */
init_lock_keys();
@@ -1851,15 +1870,15 @@ static void dump_line(char *data, int offset, int limit)
unsigned char error = 0;
int bad_count = 0;
- printk(KERN_ERR "%03x:", offset);
+ printk(KERN_ERR "%03x: ", offset);
for (i = 0; i < limit; i++) {
if (data[offset + i] != POISON_FREE) {
error = data[offset + i];
bad_count++;
}
- printk(" %02x", (unsigned char)data[offset + i]);
}
- printk("\n");
+ print_hex_dump(KERN_CONT, "", 0, 16, 1,
+ &data[offset], limit, 1);
if (bad_count == 1) {
error ^= POISON_FREE;
@@ -1927,8 +1946,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Print header */
if (lines == 0) {
printk(KERN_ERR
- "Slab corruption: %s start=%p, len=%d\n",
- cachep->name, realobj, size);
+ "Slab corruption (%s): %s start=%p, len=%d\n",
+ print_tainted(), cachep->name, realobj, size);
print_objinfo(cachep, objp, 0);
}
/* Hexdump the affected line */
@@ -2112,7 +2131,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
- if (gfporder >= slab_break_gfp_order)
+ if (gfporder >= slab_max_order)
break;
/*
@@ -3037,16 +3056,12 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
if (entries != cachep->num - slabp->inuse) {
bad:
printk(KERN_ERR "slab: Internal list corruption detected in "
- "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
- cachep->name, cachep->num, slabp, slabp->inuse);
- for (i = 0;
- i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
- i++) {
- if (i % 16 == 0)
- printk("\n%03x:", i);
- printk(" %02x", ((unsigned char *)slabp)[i]);
- }
- printk("\n");
+ "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
+ cachep->name, cachep->num, slabp, slabp->inuse,
+ print_tainted());
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
+ sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
+ 1);
BUG();
}
}
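
Both slab.c hunks above replace open-coded hex loops with print_hex_dump(): 16 bytes per row, 1-byte groups, with the trailing ASCII column. A minimal illustrative call for an arbitrary buffer (not part of the patch):

static void sketch_dump(const void *buf, size_t len)
{
	print_hex_dump(KERN_ERR, "slab: ", DUMP_PREFIX_OFFSET, 16, 1,
		       buf, len, true);
}
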
@@ -4584,7 +4599,7 @@ static const struct file_operations proc_slabstats_operations = {
static int __init slab_proc_init(void)
{
- proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
+ proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
#ifdef CONFIG_DEBUG_SLAB_LEAK
proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
#endif
diff --git a/mm/slob.c b/mm/slob.c
index bf3918187165..8105be42cad1 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -63,7 +63,7 @@
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/cache.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/kmemleak.h>
diff --git a/mm/slub.c b/mm/slub.c
index 7c54fe83a90c..4907563ef7ff 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -366,9 +366,10 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
const char *n)
{
VM_BUG_ON(!irqs_disabled());
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&page->freelist,
+ if (cmpxchg_double(&page->freelist, &page->counters,
freelist_old, counters_old,
freelist_new, counters_new))
return 1;
@@ -400,9 +401,10 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_new, unsigned long counters_new,
const char *n)
{
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&page->freelist,
+ if (cmpxchg_double(&page->freelist, &page->counters,
freelist_old, counters_old,
freelist_new, counters_new))
return 1;
@@ -467,34 +469,8 @@ static int disable_higher_order_debug;
*/
static void print_section(char *text, u8 *addr, unsigned int length)
{
- int i, offset;
- int newline = 1;
- char ascii[17];
-
- ascii[16] = 0;
-
- for (i = 0; i < length; i++) {
- if (newline) {
- printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
- newline = 0;
- }
- printk(KERN_CONT " %02x", addr[i]);
- offset = i % 16;
- ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
- if (offset == 15) {
- printk(KERN_CONT " %s\n", ascii);
- newline = 1;
- }
- }
- if (!newline) {
- i %= 16;
- while (i < 16) {
- printk(KERN_CONT " ");
- ascii[i] = ' ';
- i++;
- }
- printk(KERN_CONT " %s\n", ascii);
- }
+ print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
+ length, 1);
}
static struct track *get_track(struct kmem_cache *s, void *object,
@@ -596,7 +572,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
va_end(args);
printk(KERN_ERR "========================================"
"=====================================\n");
- printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+ printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
printk(KERN_ERR "----------------------------------------"
"-------------------------------------\n\n");
}
@@ -625,12 +601,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
p, p - addr, get_freepointer(s, p));
if (p > addr + 16)
- print_section("Bytes b4", p - 16, 16);
-
- print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
+ print_section("Bytes b4 ", p - 16, 16);
+ print_section("Object ", p, min_t(unsigned long, s->objsize,
+ PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section("Redzone", p + s->objsize,
+ print_section("Redzone ", p + s->objsize,
s->inuse - s->objsize);
if (s->offset)
@@ -643,7 +619,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (off != s->size)
/* Beginning of the filler is the free pointer */
- print_section("Padding", p + off, s->size - off);
+ print_section("Padding ", p + off, s->size - off);
dump_stack();
}
@@ -681,49 +657,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
memset(p + s->objsize, val, s->inuse - s->objsize);
}
-static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
-{
- while (bytes) {
- if (*start != value)
- return start;
- start++;
- bytes--;
- }
- return NULL;
-}
-
-static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
-{
- u64 value64;
- unsigned int words, prefix;
-
- if (bytes <= 16)
- return check_bytes8(start, value, bytes);
-
- value64 = value | value << 8 | value << 16 | value << 24;
- value64 = (value64 & 0xffffffff) | value64 << 32;
- prefix = 8 - ((unsigned long)start) % 8;
-
- if (prefix) {
- u8 *r = check_bytes8(start, value, prefix);
- if (r)
- return r;
- start += prefix;
- bytes -= prefix;
- }
-
- words = bytes / 8;
-
- while (words) {
- if (*(u64 *)start != value64)
- return check_bytes8(start, value, 8);
- start += 8;
- words--;
- }
-
- return check_bytes8(start, value, bytes % 8);
-}
-
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
void *from, void *to)
{
@@ -738,7 +671,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
u8 *fault;
u8 *end;
- fault = check_bytes(start, value, bytes);
+ fault = memchr_inv(start, value, bytes);
if (!fault)
return 1;
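
check_bytes()/check_bytes8() are dropped in favour of the generic memchr_inv(), which returns the first byte that differs from the given value, or NULL if the whole range matches. A tiny illustrative use of that contract (not from the patch):

static bool sketch_range_all_poisoned(const u8 *start, size_t bytes)
{
	/* NULL means every byte equals POISON_FREE. */
	return memchr_inv(start, POISON_FREE, bytes) == NULL;
}
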
@@ -831,14 +764,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
if (!remainder)
return 1;
- fault = check_bytes(end - remainder, POISON_INUSE, remainder);
+ fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
if (!fault)
return 1;
while (end > fault && end[-1] == POISON_INUSE)
end--;
slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
- print_section("Padding", end - remainder, remainder);
+ print_section("Padding ", end - remainder, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
return 0;
@@ -987,7 +920,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
page->freelist);
if (!alloc)
- print_section("Object", (void *)object, s->objsize);
+ print_section("Object ", (void *)object, s->objsize);
dump_stack();
}
@@ -1447,7 +1380,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, last, NULL);
page->freelist = start;
- page->inuse = 0;
+ page->inuse = page->objects;
page->frozen = 1;
out:
return page;
@@ -1534,7 +1467,7 @@ static inline void add_partial(struct kmem_cache_node *n,
struct page *page, int tail)
{
n->nr_partial++;
- if (tail)
+ if (tail == DEACTIVATE_TO_TAIL)
list_add_tail(&page->lru, &n->partial);
else
list_add(&page->lru, &n->partial);
@@ -1554,10 +1487,13 @@ static inline void remove_partial(struct kmem_cache_node *n,
* Lock slab, remove from the partial list and put the object into the
* per cpu freelist.
*
+ * Returns a list of objects or NULL if it fails.
+ *
* Must hold list_lock.
*/
-static inline int acquire_slab(struct kmem_cache *s,
- struct kmem_cache_node *n, struct page *page)
+static inline void *acquire_slab(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct page *page,
+ int mode)
{
void *freelist;
unsigned long counters;
@@ -1572,7 +1508,8 @@ static inline int acquire_slab(struct kmem_cache *s,
freelist = page->freelist;
counters = page->counters;
new.counters = counters;
- new.inuse = page->objects;
+ if (mode)
+ new.inuse = page->objects;
VM_BUG_ON(new.frozen);
new.frozen = 1;
@@ -1583,32 +1520,19 @@ static inline int acquire_slab(struct kmem_cache *s,
"lock and freeze"));
remove_partial(n, page);
-
- if (freelist) {
- /* Populate the per cpu freelist */
- this_cpu_write(s->cpu_slab->freelist, freelist);
- this_cpu_write(s->cpu_slab->page, page);
- this_cpu_write(s->cpu_slab->node, page_to_nid(page));
- return 1;
- } else {
- /*
- * Slab page came from the wrong list. No object to allocate
- * from. Put it onto the correct list and continue partial
- * scan.
- */
- printk(KERN_ERR "SLUB: %s : Page without available objects on"
- " partial list\n", s->name);
- return 0;
- }
+ return freelist;
}
+static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+
/*
* Try to allocate a partial slab from a specific node.
*/
-static struct page *get_partial_node(struct kmem_cache *s,
- struct kmem_cache_node *n)
+static void *get_partial_node(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct kmem_cache_cpu *c)
{
- struct page *page;
+ struct page *page, *page2;
+ void *object = NULL;
/*
* Racy check. If we mistakenly see no partial slabs then we
@@ -1620,26 +1544,43 @@ static struct page *get_partial_node(struct kmem_cache *s,
return NULL;
spin_lock(&n->list_lock);
- list_for_each_entry(page, &n->partial, lru)
- if (acquire_slab(s, n, page))
- goto out;
- page = NULL;
-out:
+ list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ void *t = acquire_slab(s, n, page, object == NULL);
+ int available;
+
+ if (!t)
+ break;
+
+ if (!object) {
+ c->page = page;
+ c->node = page_to_nid(page);
+ stat(s, ALLOC_FROM_PARTIAL);
+ object = t;
+ available = page->objects - page->inuse;
+ } else {
+ page->freelist = t;
+ available = put_cpu_partial(s, page, 0);
+ }
+ if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
+ break;
+
+ }
spin_unlock(&n->list_lock);
- return page;
+ return object;
}
/*
* Get a page from somewhere. Search in increasing NUMA distances.
*/
-static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
+static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ struct kmem_cache_cpu *c)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
- struct page *page;
+ void *object;
/*
* The defrag ratio allows a configuration of the tradeoffs between
@@ -1672,10 +1613,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
n->nr_partial > s->min_partial) {
- page = get_partial_node(s, n);
- if (page) {
+ object = get_partial_node(s, n, c);
+ if (object) {
put_mems_allowed();
- return page;
+ return object;
}
}
}
@@ -1687,16 +1628,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
/*
* Get a partial page, lock it and return it.
*/
-static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
+static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
+ struct kmem_cache_cpu *c)
{
- struct page *page;
+ void *object;
int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
- page = get_partial_node(s, get_node(s, searchnode));
- if (page || node != NUMA_NO_NODE)
- return page;
+ object = get_partial_node(s, get_node(s, searchnode), c);
+ if (object || node != NUMA_NO_NODE)
+ return object;
- return get_any_partial(s, flags);
+ return get_any_partial(s, flags, c);
}
#ifdef CONFIG_PREEMPT
@@ -1765,9 +1707,6 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
for_each_possible_cpu(cpu)
per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
}
-/*
- * Remove the cpu slab
- */
/*
* Remove the cpu slab
@@ -1781,13 +1720,13 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
enum slab_modes l = M_NONE, m = M_NONE;
void *freelist;
void *nextfree;
- int tail = 0;
+ int tail = DEACTIVATE_TO_HEAD;
struct page new;
struct page old;
if (page->freelist) {
stat(s, DEACTIVATE_REMOTE_FREES);
- tail = 1;
+ tail = DEACTIVATE_TO_TAIL;
}
c->tid = next_tid(c->tid);
@@ -1893,7 +1832,7 @@ redo:
if (m == M_PARTIAL) {
add_partial(n, page, tail);
- stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
+ stat(s, tail);
} else if (m == M_FULL) {
@@ -1920,6 +1859,135 @@ redo:
}
}
+/* Unfreeze all the cpu partial slabs */
+static void unfreeze_partials(struct kmem_cache *s)
+{
+ struct kmem_cache_node *n = NULL;
+ struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
+ struct page *page, *discard_page = NULL;
+
+ while ((page = c->partial)) {
+ enum slab_modes { M_PARTIAL, M_FREE };
+ enum slab_modes l, m;
+ struct page new;
+ struct page old;
+
+ c->partial = page->next;
+ l = M_FREE;
+
+ do {
+
+ old.freelist = page->freelist;
+ old.counters = page->counters;
+ VM_BUG_ON(!old.frozen);
+
+ new.counters = old.counters;
+ new.freelist = old.freelist;
+
+ new.frozen = 0;
+
+ if (!new.inuse && (!n || n->nr_partial > s->min_partial))
+ m = M_FREE;
+ else {
+ struct kmem_cache_node *n2 = get_node(s,
+ page_to_nid(page));
+
+ m = M_PARTIAL;
+ if (n != n2) {
+ if (n)
+ spin_unlock(&n->list_lock);
+
+ n = n2;
+ spin_lock(&n->list_lock);
+ }
+ }
+
+ if (l != m) {
+ if (l == M_PARTIAL) {
+ remove_partial(n, page);
+ stat(s, FREE_REMOVE_PARTIAL);
+ } else {
+ add_partial(n, page,
+ DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+
+ l = m;
+ }
+
+ } while (!cmpxchg_double_slab(s, page,
+ old.freelist, old.counters,
+ new.freelist, new.counters,
+ "unfreezing slab"));
+
+ if (m == M_FREE) {
+ page->next = discard_page;
+ discard_page = page;
+ }
+ }
+
+ if (n)
+ spin_unlock(&n->list_lock);
+
+ while (discard_page) {
+ page = discard_page;
+ discard_page = discard_page->next;
+
+ stat(s, DEACTIVATE_EMPTY);
+ discard_slab(s, page);
+ stat(s, FREE_SLAB);
+ }
+}
+
+/*
+ * Put a page that was just frozen (in __slab_free) into a partial page
+ * slot if available. This is done without disabling interrupts or
+ * preemption. The cmpxchg is racy and may put the partial page
+ * onto a random cpu's partial slot.
+ *
+ * If we did not find a slot then simply move all the partials to the
+ * per node partial list.
+ */
+int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+{
+ struct page *oldpage;
+ int pages;
+ int pobjects;
+
+ do {
+ pages = 0;
+ pobjects = 0;
+ oldpage = this_cpu_read(s->cpu_slab->partial);
+
+ if (oldpage) {
+ pobjects = oldpage->pobjects;
+ pages = oldpage->pages;
+ if (drain && pobjects > s->cpu_partial) {
+ unsigned long flags;
+ /*
+ * partial array is full. Move the existing
+ * set to the per node partial list.
+ */
+ local_irq_save(flags);
+ unfreeze_partials(s);
+ local_irq_restore(flags);
+ pobjects = 0;
+ pages = 0;
+ }
+ }
+
+ pages++;
+ pobjects += page->objects - page->inuse;
+
+ page->pages = pages;
+ page->pobjects = pobjects;
+ page->next = oldpage;
+
+ } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
+ stat(s, CPU_PARTIAL_FREE);
+ return pobjects;
+}
+
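
/*
 * Illustrative sketch, not part of the patch: put_cpu_partial() above
 * links the freshly frozen page onto the per-cpu 'partial' list with a
 * this_cpu_cmpxchg() retry loop rather than taking a lock.  A minimal
 * userspace model of that push pattern with C11 atomics (the pages/
 * pobjects accounting and the overflow drain are left out):
 */
#include <stdatomic.h>

struct pnode {
	struct pnode *next;
	int free_objects;		/* stands in for page->pobjects */
};

static _Atomic(struct pnode *) partial_head;

static void push_partial(struct pnode *n)
{
	struct pnode *old;

	do {
		old = atomic_load(&partial_head);
		n->next = old;			/* like page->next = oldpage */
	} while (!atomic_compare_exchange_weak(&partial_head, &old, n));
}
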
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
stat(s, CPUSLAB_FLUSH);
@@ -1935,8 +2003,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- if (likely(c && c->page))
- flush_slab(s, c);
+ if (likely(c)) {
+ if (c->page)
+ flush_slab(s, c);
+
+ unfreeze_partials(s);
+ }
}
static void flush_cpu_slab(void *d)
@@ -2027,12 +2099,70 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
}
}
+static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
+ int node, struct kmem_cache_cpu **pc)
+{
+ void *object;
+ struct kmem_cache_cpu *c;
+ struct page *page = new_slab(s, flags, node);
+
+ if (page) {
+ c = __this_cpu_ptr(s->cpu_slab);
+ if (c->page)
+ flush_slab(s, c);
+
+ /*
+ * No other reference to the page yet so we can
+ * muck around with it freely without cmpxchg
+ */
+ object = page->freelist;
+ page->freelist = NULL;
+
+ stat(s, ALLOC_SLAB);
+ c->node = page_to_nid(page);
+ c->page = page;
+ *pc = c;
+ } else
+ object = NULL;
+
+ return object;
+}
+
+/*
+ * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
+ * or deactivate the page.
+ *
+ * The page is still frozen if the return value is not NULL.
+ *
+ * If this function returns NULL then the page has been unfrozen.
+ */
+static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+{
+ struct page new;
+ unsigned long counters;
+ void *freelist;
+
+ do {
+ freelist = page->freelist;
+ counters = page->counters;
+ new.counters = counters;
+ VM_BUG_ON(!new.frozen);
+
+ new.inuse = page->objects;
+ new.frozen = freelist != NULL;
+
+ } while (!cmpxchg_double_slab(s, page,
+ freelist, counters,
+ NULL, new.counters,
+ "get_freelist"));
+
+ return freelist;
+}
+
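
/*
 * Illustrative sketch, not part of the patch: the state transition that
 * get_freelist() commits atomically with cmpxchg_double_slab().  The cpu
 * takes the whole page->freelist, marks every object as in use, and the
 * page only stays frozen while there was a freelist left to hand out.
 */
struct slab_snapshot {
	void *freelist;
	unsigned int inuse;
	unsigned int frozen;
};

static struct slab_snapshot take_freelist(struct slab_snapshot old,
					  unsigned int objects)
{
	struct slab_snapshot new = old;

	new.freelist = NULL;			/* cpu owns the list now      */
	new.inuse = objects;			/* all objects are accounted  */
	new.frozen = old.freelist != NULL;	/* unfreeze when nothing left */
	return new;
}
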
/*
* Slow path. The lockless freelist is empty or we need to perform
* debugging duties.
*
- * Interrupts are disabled.
- *
* Processing is still very fast if new objects have been freed to the
* regular freelist. In that case we simply take over the regular freelist
* as the lockless freelist and zap the regular freelist.
@@ -2049,10 +2179,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void **object;
- struct page *page;
unsigned long flags;
- struct page new;
- unsigned long counters;
local_irq_save(flags);
#ifdef CONFIG_PREEMPT
@@ -2064,46 +2191,25 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
c = this_cpu_ptr(s->cpu_slab);
#endif
- /* We handle __GFP_ZERO in the caller */
- gfpflags &= ~__GFP_ZERO;
-
- page = c->page;
- if (!page)
+ if (!c->page)
goto new_slab;
-
+redo:
if (unlikely(!node_match(c, node))) {
stat(s, ALLOC_NODE_MISMATCH);
deactivate_slab(s, c);
goto new_slab;
}
- stat(s, ALLOC_SLOWPATH);
-
- do {
- object = page->freelist;
- counters = page->counters;
- new.counters = counters;
- VM_BUG_ON(!new.frozen);
-
- /*
- * If there is no object left then we use this loop to
- * deactivate the slab which is simple since no objects
- * are left in the slab and therefore we do not need to
- * put the page back onto the partial list.
- *
- * If there are objects left then we retrieve them
- * and use them to refill the per cpu queue.
- */
+ /* must check again c->freelist in case of cpu migration or IRQ */
+ object = c->freelist;
+ if (object)
+ goto load_freelist;
- new.inuse = page->objects;
- new.frozen = object != NULL;
+ stat(s, ALLOC_SLOWPATH);
- } while (!__cmpxchg_double_slab(s, page,
- object, counters,
- NULL, new.counters,
- "__slab_alloc"));
+ object = get_freelist(s, c->page);
- if (unlikely(!object)) {
+ if (!object) {
c->page = NULL;
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
@@ -2112,58 +2218,47 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
stat(s, ALLOC_REFILL);
load_freelist:
- VM_BUG_ON(!page->frozen);
c->freelist = get_freepointer(s, object);
c->tid = next_tid(c->tid);
local_irq_restore(flags);
return object;
new_slab:
- page = get_partial(s, gfpflags, node);
- if (page) {
- stat(s, ALLOC_FROM_PARTIAL);
- object = c->freelist;
- if (kmem_cache_debug(s))
- goto debug;
- goto load_freelist;
+ if (c->partial) {
+ c->page = c->partial;
+ c->partial = c->page->next;
+ c->node = page_to_nid(c->page);
+ stat(s, CPU_PARTIAL_ALLOC);
+ c->freelist = NULL;
+ goto redo;
}
- page = new_slab(s, gfpflags, node);
+ /* Then do expensive stuff like retrieving pages from the partial lists */
+ object = get_partial(s, gfpflags, node, c);
- if (page) {
- c = __this_cpu_ptr(s->cpu_slab);
- if (c->page)
- flush_slab(s, c);
+ if (unlikely(!object)) {
- /*
- * No other reference to the page yet so we can
- * muck around with it freely without cmpxchg
- */
- object = page->freelist;
- page->freelist = NULL;
- page->inuse = page->objects;
+ object = new_slab_objects(s, gfpflags, node, &c);
- stat(s, ALLOC_SLAB);
- c->node = page_to_nid(page);
- c->page = page;
+ if (unlikely(!object)) {
+ if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
+ slab_out_of_memory(s, gfpflags, node);
- if (kmem_cache_debug(s))
- goto debug;
- goto load_freelist;
+ local_irq_restore(flags);
+ return NULL;
+ }
}
- if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
- slab_out_of_memory(s, gfpflags, node);
- local_irq_restore(flags);
- return NULL;
-debug:
- if (!object || !alloc_debug_processing(s, page, object, addr))
- goto new_slab;
+ if (likely(!kmem_cache_debug(s)))
+ goto load_freelist;
+
+ /* Only entered in the debug case */
+ if (!alloc_debug_processing(s, c->page, object, addr))
+ goto new_slab; /* Slab failed checks. Next slab needed */
c->freelist = get_freepointer(s, object);
deactivate_slab(s, c);
- c->page = NULL;
c->node = NUMA_NO_NODE;
local_irq_restore(flags);
return object;
@@ -2226,7 +2321,7 @@ redo:
* Since this is without lock semantics the protection is only against
* code executing on this cpu *not* from access by other cpus.
*/
- if (unlikely(!irqsafe_cpu_cmpxchg_double(
+ if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
object, tid,
get_freepointer_safe(s, object), next_tid(tid)))) {
@@ -2333,16 +2428,29 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
was_frozen = new.frozen;
new.inuse--;
if ((!new.inuse || !prior) && !was_frozen && !n) {
- n = get_node(s, page_to_nid(page));
- /*
- * Speculatively acquire the list_lock.
- * If the cmpxchg does not succeed then we may
- * drop the list_lock without any processing.
- *
- * Otherwise the list_lock will synchronize with
- * other processors updating the list of slabs.
- */
- spin_lock_irqsave(&n->list_lock, flags);
+
+ if (!kmem_cache_debug(s) && !prior)
+
+ /*
+ * Slab was not on any list before and will now be partially
+ * empty. We can defer the list move and instead freeze it.
+ */
+ new.frozen = 1;
+
+ else { /* Needs to be taken off a list */
+
+ n = get_node(s, page_to_nid(page));
+ /*
+ * Speculatively acquire the list_lock.
+ * If the cmpxchg does not succeed then we may
+ * drop the list_lock without any processing.
+ *
+ * Otherwise the list_lock will synchronize with
+ * other processors updating the list of slabs.
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ }
}
inuse = new.inuse;
@@ -2352,7 +2460,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
"__slab_free"));
if (likely(!n)) {
- /*
+
+ /*
+ * If we just froze the page then put it onto the
+ * per cpu partial list.
+ */
+ if (new.frozen && !was_frozen)
+ put_cpu_partial(s, page, 1);
+
+ /*
* The list lock was not taken therefore no list
* activity can be necessary.
*/
@@ -2377,7 +2493,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
*/
if (unlikely(!prior)) {
remove_full(s, page);
- add_partial(n, page, 1);
+ add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
}
@@ -2421,7 +2537,6 @@ static __always_inline void slab_free(struct kmem_cache *s,
slab_free_hook(s, x);
redo:
-
/*
* Determine the currently cpus per cpu slab.
* The cpu may change afterward. However that does not matter since
@@ -2436,7 +2551,7 @@ redo:
if (likely(page == c->page)) {
set_freepointer(s, object, c->freelist);
- if (unlikely(!irqsafe_cpu_cmpxchg_double(
+ if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
c->freelist, tid,
object, next_tid(tid)))) {
@@ -2685,7 +2800,7 @@ static void early_kmem_cache_node_alloc(int node)
n = page->freelist;
BUG_ON(!n);
page->freelist = get_freepointer(kmem_cache_node, n);
- page->inuse++;
+ page->inuse = 1;
page->frozen = 0;
kmem_cache_node->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
@@ -2695,7 +2810,7 @@ static void early_kmem_cache_node_alloc(int node)
init_kmem_cache_node(n, kmem_cache_node);
inc_slabs_node(kmem_cache_node, node, page->objects);
- add_partial(n, page, 0);
+ add_partial(n, page, DEACTIVATE_TO_HEAD);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2901,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s,
}
}
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
/* Enable fast mode */
s->flags |= __CMPXCHG_DOUBLE;
@@ -2911,7 +3027,36 @@ static int kmem_cache_open(struct kmem_cache *s,
* The larger the object size is, the more pages we want on the partial
* list to avoid pounding the page allocator excessively.
*/
- set_min_partial(s, ilog2(s->size));
+ set_min_partial(s, ilog2(s->size) / 2);
+
+ /*
+ * cpu_partial determines the maximum number of objects kept in the
+ * per cpu partial lists of a processor.
+ *
+ * Per cpu partial lists mainly contain slabs that just have one
+ * object freed. If they are used for allocation then they can be
+ * filled up again with minimal effort. The slab will never hit the
+ * per node partial lists and therefore no locking will be required.
+ *
+ * This setting also determines
+ *
+ * A) The number of objects from per cpu partial slabs dumped to the
+ * per node list when we reach the limit.
+ * B) The number of objects in cpu partial slabs to extract from the
+ * per node list when we run out of per cpu objects. We only fetch 50%
+ * to keep some capacity around for frees.
+ */
+ if (kmem_cache_debug(s))
+ s->cpu_partial = 0;
+ else if (s->size >= PAGE_SIZE)
+ s->cpu_partial = 2;
+ else if (s->size >= 1024)
+ s->cpu_partial = 6;
+ else if (s->size >= 256)
+ s->cpu_partial = 13;
+ else
+ s->cpu_partial = 30;
+
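
/*
 * Illustrative sketch, not part of the patch: the cpu_partial limit chosen
 * above expressed as a stand-alone helper.  The thresholds are the ones
 * introduced in this hunk; PAGE_SIZE is assumed to be the usual 4096 here.
 */
#define MODEL_PAGE_SIZE 4096

static unsigned int cpu_partial_for_size(unsigned long size, int debug)
{
	if (debug)			/* debug caches bypass cpu partials */
		return 0;
	if (size >= MODEL_PAGE_SIZE)
		return 2;
	if (size >= 1024)
		return 6;
	if (size >= 256)
		return 13;
	return 30;
}
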
s->refcount = 1;
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
@@ -2970,13 +3115,13 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
/*
* Attempt to free all partial slabs on a node.
+ * This is called from kmem_cache_close(). We must be the last thread
+ * using the cache and therefore we do not need to lock anymore.
*/
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
- unsigned long flags;
struct page *page, *h;
- spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
remove_partial(n, page);
@@ -2986,7 +3131,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
"Objects remaining on kmem_cache_close()");
}
}
- spin_unlock_irqrestore(&n->list_lock, flags);
}
/*
@@ -3020,6 +3164,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
+ up_write(&slub_lock);
if (kmem_cache_close(s)) {
printk(KERN_ERR "SLUB %s: %s called for cache that "
"still has objects.\n", s->name, __func__);
@@ -3028,8 +3173,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
sysfs_slab_remove(s);
- }
- up_write(&slub_lock);
+ } else
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -3347,23 +3492,23 @@ int kmem_cache_shrink(struct kmem_cache *s)
* list_lock. page->inuse here is the upper limit.
*/
list_for_each_entry_safe(page, t, &n->partial, lru) {
- if (!page->inuse) {
- remove_partial(n, page);
- discard_slab(s, page);
- } else {
- list_move(&page->lru,
- slabs_by_inuse + page->inuse);
- }
+ list_move(&page->lru, slabs_by_inuse + page->inuse);
+ if (!page->inuse)
+ n->nr_partial--;
}
/*
* Rebuild the partial list with the slabs filled up most
* first and the least used slabs at the end.
*/
- for (i = objects - 1; i >= 0; i--)
+ for (i = objects - 1; i > 0; i--)
list_splice(slabs_by_inuse + i, n->partial.prev);
spin_unlock_irqrestore(&n->list_lock, flags);
+
+ /* Release empty slabs */
+ list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+ discard_slab(s, page);
}
kfree(slabs_by_inuse);
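
/*
 * Illustrative sketch, not part of the patch: the shrink pass above is a
 * one-pass bucket sort.  Each slab is filed under its 'inuse' count,
 * bucket 0 collects the completely empty slabs (freed afterwards), and
 * the buckets are spliced back from fullest to emptiest so that future
 * allocations hit nearly full slabs first.  A userspace model of the
 * bucketing step; buckets[] needs objects_per_slab + 1 NULL entries:
 */
struct mslab {
	struct mslab *next;
	unsigned int inuse;		/* 0 .. objects_per_slab */
};

static void bucket_by_inuse(struct mslab *partial, struct mslab **buckets,
			    unsigned int objects_per_slab)
{
	while (partial) {
		struct mslab *s = partial;

		partial = s->next;
		if (s->inuse > objects_per_slab)
			s->inuse = objects_per_slab;	/* defensive clamp */
		s->next = buckets[s->inuse];
		buckets[s->inuse] = s;
	}
	/* buckets[0] now holds the empty slabs that would be discarded */
}
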
@@ -3529,6 +3674,9 @@ void __init kmem_cache_init(void)
struct kmem_cache *temp_kmem_cache_node;
unsigned long kmalloc_size;
+ if (debug_guardpage_minorder())
+ slub_max_order = 0;
+
kmem_size = offsetof(struct kmem_cache, node) +
nr_node_ids * sizeof(struct kmem_cache_node *);
@@ -4319,22 +4467,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
for_each_possible_cpu(cpu) {
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ int node = ACCESS_ONCE(c->node);
+ struct page *page;
- if (!c || c->node < 0)
+ if (node < 0)
continue;
-
- if (c->page) {
- if (flags & SO_TOTAL)
- x = c->page->objects;
+ page = ACCESS_ONCE(c->page);
+ if (page) {
+ if (flags & SO_TOTAL)
+ x = page->objects;
else if (flags & SO_OBJECTS)
- x = c->page->inuse;
+ x = page->inuse;
else
x = 1;
total += x;
- nodes[c->node] += x;
+ nodes[node] += x;
+ }
+ page = c->partial;
+
+ if (page) {
+ x = page->pobjects;
+ total += x;
+ nodes[node] += x;
}
- per_cpu[c->node]++;
+ per_cpu[node]++;
}
}
@@ -4412,11 +4569,12 @@ struct slab_attribute {
};
#define SLAB_ATTR_RO(_name) \
- static struct slab_attribute _name##_attr = __ATTR_RO(_name)
+ static struct slab_attribute _name##_attr = \
+ __ATTR(_name, 0400, _name##_show, NULL)
#define SLAB_ATTR(_name) \
static struct slab_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+ __ATTR(_name, 0600, _name##_show, _name##_store)
static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
@@ -4485,6 +4643,29 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
}
SLAB_ATTR(min_partial);
+static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->cpu_partial);
+}
+
+static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ unsigned long objects;
+ int err;
+
+ err = strict_strtoul(buf, 10, &objects);
+ if (err)
+ return err;
+ if (objects && kmem_cache_debug(s))
+ return -EINVAL;
+
+ s->cpu_partial = objects;
+ flush_all(s);
+ return length;
+}
+SLAB_ATTR(cpu_partial);
+
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
if (!s->ctor)
@@ -4523,6 +4704,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(objects_partial);
+static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
+{
+ int objects = 0;
+ int pages = 0;
+ int cpu;
+ int len;
+
+ for_each_online_cpu(cpu) {
+ struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
+
+ if (page) {
+ pages += page->pages;
+ objects += page->pobjects;
+ }
+ }
+
+ len = sprintf(buf, "%d(%d)", objects, pages);
+
+#ifdef CONFIG_SMP
+ for_each_online_cpu(cpu) {
+ struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
+
+ if (page && len < PAGE_SIZE - 20)
+ len += sprintf(buf + len, " C%d=%d(%d)", cpu,
+ page->pobjects, page->pages);
+ }
+#endif
+ return len + sprintf(buf + len, "\n");
+}
+SLAB_ATTR_RO(slabs_cpu_partial);
+
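
/*
 * Illustrative note, not part of the patch: given the format strings in
 * slabs_cpu_partial_show(), reading the read-only sysfs file (under
 * /sys/kernel/slab/<cache>/slabs_cpu_partial) yields a summary followed
 * by one entry per cpu on SMP, e.g. (values made up):
 *
 *	26(4) C0=7(1) C1=13(2) C2=6(1)
 *
 * i.e. "<objects>(<pages>)" overall, then " C<cpu>=<objects>(<pages>)".
 */
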
static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4845,6 +5057,8 @@ STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
+STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
+STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
#endif
static struct attribute *slab_attrs[] = {
@@ -4853,6 +5067,7 @@ static struct attribute *slab_attrs[] = {
&objs_per_slab_attr.attr,
&order_attr.attr,
&min_partial_attr.attr,
+ &cpu_partial_attr.attr,
&objects_attr.attr,
&objects_partial_attr.attr,
&partial_attr.attr,
@@ -4865,6 +5080,7 @@ static struct attribute *slab_attrs[] = {
&destroy_by_rcu_attr.attr,
&shrink_attr.attr,
&reserved_attr.attr,
+ &slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
&total_objects_attr.attr,
&slabs_attr.attr,
@@ -4906,6 +5122,8 @@ static struct attribute *slab_attrs[] = {
&order_fallback_attr.attr,
&cmpxchg_double_fail_attr.attr,
&cmpxchg_double_cpu_fail_attr.attr,
+ &cpu_partial_alloc_attr.attr,
+ &cpu_partial_free_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
@@ -5257,7 +5475,7 @@ static const struct file_operations proc_slabinfo_operations = {
static int __init slab_proc_init(void)
{
- proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
+ proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
return 0;
}
module_init(slab_proc_init);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 64b984091edb..1b7e22ab9b09 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -21,7 +21,6 @@
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
diff --git a/mm/sparse.c b/mm/sparse.c
index 858e1dff9b2a..61d7cde23111 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -6,7 +6,7 @@
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include "internal.h"
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b3..b0f529b38979 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -21,9 +21,8 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm_inline.h>
-#include <linux/buffer_head.h> /* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
@@ -54,7 +53,7 @@ static void __page_cache_release(struct page *page)
spin_lock_irqsave(&zone->lru_lock, flags);
VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
- del_page_from_lru(zone, page);
+ del_page_from_lru_list(zone, page, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
}
@@ -78,39 +77,22 @@ static void put_compound_page(struct page *page)
{
if (unlikely(PageTail(page))) {
/* __split_huge_page_refcount can run under us */
- struct page *page_head = page->first_page;
- smp_rmb();
- /*
- * If PageTail is still set after smp_rmb() we can be sure
- * that the page->first_page we read wasn't a dangling pointer.
- * See __split_huge_page_refcount() smp_wmb().
- */
- if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
+ struct page *page_head = compound_trans_head(page);
+
+ if (likely(page != page_head &&
+ get_page_unless_zero(page_head))) {
unsigned long flags;
/*
- * Verify that our page_head wasn't converted
- * to a a regular page before we got a
- * reference on it.
- */
- if (unlikely(!PageHead(page_head))) {
- /* PageHead is cleared after PageTail */
- smp_rmb();
- VM_BUG_ON(PageTail(page));
- goto out_put_head;
- }
- /*
- * Only run compound_lock on a valid PageHead,
- * after having it pinned with
- * get_page_unless_zero() above.
+ * page_head wasn't a dangling pointer but it
+ * may not be a head page anymore by the time
+ * we obtain the lock. That is ok as long as it
+ * can't be freed from under us.
*/
- smp_mb();
- /* page_head wasn't a dangling pointer */
flags = compound_lock_irqsave(page_head);
if (unlikely(!PageTail(page))) {
/* __split_huge_page_refcount run before us */
compound_unlock_irqrestore(page_head, flags);
VM_BUG_ON(PageHead(page_head));
- out_put_head:
if (put_page_testzero(page_head))
__put_single_page(page_head);
out_put_single:
@@ -121,16 +103,17 @@ static void put_compound_page(struct page *page)
VM_BUG_ON(page_head != page->first_page);
/*
* We can release the refcount taken by
- * get_page_unless_zero now that
- * split_huge_page_refcount is blocked on the
- * compound_lock.
+ * get_page_unless_zero() now that
+ * __split_huge_page_refcount() is blocked on
+ * the compound_lock.
*/
if (put_page_testzero(page_head))
VM_BUG_ON(1);
/* __split_huge_page_refcount will wait now */
- VM_BUG_ON(atomic_read(&page->_count) <= 0);
- atomic_dec(&page->_count);
+ VM_BUG_ON(page_mapcount(page) <= 0);
+ atomic_dec(&page->_mapcount);
VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+ VM_BUG_ON(atomic_read(&page->_count) != 0);
compound_unlock_irqrestore(page_head, flags);
if (put_page_testzero(page_head)) {
if (PageHead(page_head))
@@ -160,6 +143,45 @@ void put_page(struct page *page)
}
EXPORT_SYMBOL(put_page);
+/*
+ * This function is exported but must not be called by anything other
+ * than get_page(). It implements the slow path of get_page().
+ */
+bool __get_page_tail(struct page *page)
+{
+ /*
+ * This takes care of get_page() if run on a tail page
+ * returned by one of the get_user_pages/follow_page variants.
+ * get_user_pages/follow_page itself doesn't need the compound
+ * lock because it runs __get_page_tail_foll() under the
+ * proper PT lock that already serializes against
+ * split_huge_page().
+ */
+ unsigned long flags;
+ bool got = false;
+ struct page *page_head = compound_trans_head(page);
+
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
+ /*
+ * page_head wasn't a dangling pointer but it
+ * may not be a head page anymore by the time
+ * we obtain the lock. That is ok as long as it
+ * can't be freed from under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ /* here __split_huge_page_refcount won't run anymore */
+ if (likely(PageTail(page))) {
+ __get_page_tail_foll(page, false);
+ got = true;
+ }
+ compound_unlock_irqrestore(page_head, flags);
+ if (unlikely(!got))
+ put_page(page_head);
+ }
+ return got;
+}
+EXPORT_SYMBOL(__get_page_tail);
+
/**
* put_pages_list() - release a list of pages
* @pages: list of pages threaded on page->lru
@@ -209,12 +231,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
static void pagevec_move_tail_fn(struct page *page, void *arg)
{
int *pgmoved = arg;
- struct zone *zone = page_zone(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
enum lru_list lru = page_lru_base_type(page);
- list_move_tail(&page->lru, &zone->lru[lru].list);
- mem_cgroup_rotate_reclaimable_page(page);
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lru_move_lists(page_zone(page),
+ page, lru, lru);
+ list_move_tail(&page->lru, &lruvec->lists[lru]);
(*pgmoved)++;
}
}
@@ -345,7 +369,6 @@ void mark_page_accessed(struct page *page)
SetPageReferenced(page);
}
}
-
EXPORT_SYMBOL(mark_page_accessed);
void __lru_cache_add(struct page *page, enum lru_list lru)
@@ -354,7 +377,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
page_cache_get(page);
if (!pagevec_add(pvec, page))
- ____pagevec_lru_add(pvec, lru);
+ __pagevec_lru_add(pvec, lru);
put_cpu_var(lru_add_pvecs);
}
EXPORT_SYMBOL(__lru_cache_add);
@@ -453,12 +476,13 @@ static void lru_deactivate_fn(struct page *page, void *arg)
*/
SetPageReclaim(page);
} else {
+ struct lruvec *lruvec;
/*
* The page's writeback has completed while it was on the pagevec;
* move the page to the tail of the inactive list.
*/
- list_move_tail(&page->lru, &zone->lru[lru].list);
- mem_cgroup_rotate_reclaimable_page(page);
+ lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
+ list_move_tail(&page->lru, &lruvec->lists[lru]);
__count_vm_event(PGROTATED);
}
@@ -481,7 +505,7 @@ static void drain_cpu_pagevecs(int cpu)
for_each_lru(lru) {
pvec = &pvecs[lru - LRU_BASE];
if (pagevec_count(pvec))
- ____pagevec_lru_add(pvec, lru);
+ __pagevec_lru_add(pvec, lru);
}
pvec = &per_cpu(lru_rotate_pvecs, cpu);
@@ -562,11 +586,10 @@ int lru_add_drain_all(void)
void release_pages(struct page **pages, int nr, int cold)
{
int i;
- struct pagevec pages_to_free;
+ LIST_HEAD(pages_to_free);
struct zone *zone = NULL;
unsigned long uninitialized_var(flags);
- pagevec_init(&pages_to_free, cold);
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
@@ -594,22 +617,15 @@ void release_pages(struct page **pages, int nr, int cold)
}
VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
- del_page_from_lru(zone, page);
+ del_page_from_lru_list(zone, page, page_off_lru(page));
}
- if (!pagevec_add(&pages_to_free, page)) {
- if (zone) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = NULL;
- }
- __pagevec_free(&pages_to_free);
- pagevec_reinit(&pages_to_free);
- }
+ list_add(&page->lru, &pages_to_free);
}
if (zone)
spin_unlock_irqrestore(&zone->lru_lock, flags);
- pagevec_free(&pages_to_free);
+ free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);
@@ -629,9 +645,9 @@ void __pagevec_release(struct pagevec *pvec)
release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
pagevec_reinit(pvec);
}
-
EXPORT_SYMBOL(__pagevec_release);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct zone* zone,
struct page *page, struct page *page_tail)
@@ -639,7 +655,6 @@ void lru_add_page_tail(struct zone* zone,
int active;
enum lru_list lru;
const int file = 0;
- struct list_head *head;
VM_BUG_ON(!PageHead(page));
VM_BUG_ON(PageCompound(page_tail));
@@ -658,18 +673,30 @@ void lru_add_page_tail(struct zone* zone,
lru = LRU_INACTIVE_ANON;
}
update_page_reclaim_stat(zone, page_tail, file, active);
- if (likely(PageLRU(page)))
- head = page->lru.prev;
- else
- head = &zone->lru[lru].list;
- __add_page_to_lru_list(zone, page_tail, lru, head);
} else {
SetPageUnevictable(page_tail);
- add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
+ lru = LRU_UNEVICTABLE;
+ }
+
+ if (likely(PageLRU(page)))
+ list_add_tail(&page_tail->lru, &page->lru);
+ else {
+ struct list_head *list_head;
+ /*
+ * Head page has not yet been counted, as an hpage,
+ * so we must account for each subpage individually.
+ *
+ * Use the standard add function to put page_tail on the list,
+ * but then correct its position so they all end up in order.
+ */
+ add_page_to_lru_list(zone, page_tail, lru);
+ list_head = page_tail->lru.prev;
+ list_move_tail(&page_tail->lru, list_head);
}
}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void ____pagevec_lru_add_fn(struct page *page, void *arg)
+static void __pagevec_lru_add_fn(struct page *page, void *arg)
{
enum lru_list lru = (enum lru_list)arg;
struct zone *zone = page_zone(page);
@@ -691,32 +718,13 @@ static void ____pagevec_lru_add_fn(struct page *page, void *arg)
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
*/
-void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
+void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
VM_BUG_ON(is_unevictable_lru(lru));
- pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
-}
-
-EXPORT_SYMBOL(____pagevec_lru_add);
-
-/*
- * Try to drop buffers from the pages in a pagevec
- */
-void pagevec_strip(struct pagevec *pvec)
-{
- int i;
-
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
-
- if (page_has_private(page) && trylock_page(page)) {
- if (page_has_private(page))
- try_to_release_page(page, 0);
- unlock_page(page);
- }
- }
+ pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
}
+EXPORT_SYMBOL(__pagevec_lru_add);
/**
* pagevec_lookup - gang pagecache lookup
@@ -740,7 +748,6 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
return pagevec_count(pvec);
}
-
EXPORT_SYMBOL(pagevec_lookup);
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
@@ -750,7 +757,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
nr_pages, pvec->pages);
return pagevec_count(pvec);
}
-
EXPORT_SYMBOL(pagevec_lookup_tag);
/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 46680461785b..470038a91873 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -6,7 +6,6 @@
*
* Rewritten to use page cache, (C) 1998 Stephen Tweedie
*/
-#include <linux/module.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
@@ -14,7 +13,6 @@
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
-#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
@@ -302,6 +300,16 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
new_page = alloc_page_vma(gfp_mask, vma, addr);
if (!new_page)
break; /* Out of memory */
+ /*
+ * The memcg-specific accounting when moving
+ * pages around the LRU lists relies on the
+ * page's owner (memcg) to be valid. Usually,
+ * pages are assigned to a new owner before
+ * being put on the LRU list, but since this
+ * is not the case here, the stale owner from
+ * a previous allocation cycle must be reset.
+ */
+ mem_cgroup_reset_owner(new_page);
}
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 17bc224bce68..d999f090dfda 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -21,7 +21,6 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
@@ -668,10 +667,10 @@ int try_to_free_swap(struct page *page)
* original page might be freed under memory pressure, then
* later read back in from swap, now with the wrong data.
*
- * Hibernation clears bits from gfp_allowed_mask to prevent
- * memory reclaim from writing to disk, so check that here.
+ * Hibernation suspends storage while it is writing the image
+ * to disk, so check that here.
*/
- if (!(gfp_allowed_mask & __GFP_IO))
+ if (pm_suspended_storage())
return 0;
delete_from_swap_cache(page);
@@ -848,12 +847,13 @@ unsigned int count_swap_pages(int type, int free)
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct page *page)
{
- struct mem_cgroup *ptr;
+ struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
int ret = 1;
- if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
+ if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
+ GFP_KERNEL, &memcg)) {
ret = -ENOMEM;
goto out_nolock;
}
@@ -861,7 +861,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
if (ret > 0)
- mem_cgroup_cancel_charge_swapin(ptr);
+ mem_cgroup_cancel_charge_swapin(memcg);
ret = 0;
goto out;
}
@@ -872,7 +872,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
page_add_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge_swapin(page, ptr);
+ mem_cgroup_commit_charge_swapin(page, memcg);
swap_free(entry);
/*
* Move the page to the active list so it is not
@@ -1617,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
err = try_to_unuse(type);
- test_set_oom_score_adj(oom_score_adj);
+ compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
if (err) {
/*
diff --git a/mm/thrash.c b/mm/thrash.c
index e53f7d02c17c..57ad495dbd54 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -29,7 +29,7 @@
static DEFINE_SPINLOCK(swap_token_lock);
struct mm_struct *swap_token_mm;
-struct mem_cgroup *swap_token_memcg;
+static struct mem_cgroup *swap_token_memcg;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
diff --git a/mm/truncate.c b/mm/truncate.c
index b40ac6d4e86e..632b15e29f74 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -12,7 +12,7 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
diff --git a/mm/util.c b/mm/util.c
index 88ea1bd661c0..136ac4f322b8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <asm/uaccess.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5016f19e1661..86ce9a526c17 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -256,7 +256,7 @@ struct vmap_area {
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
struct list_head purge_list; /* "lazy purge" list */
- void *private;
+ struct vm_struct *vm;
struct rcu_head rcu_head;
};
@@ -1118,6 +1118,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
EXPORT_SYMBOL(vm_map_ram);
/**
+ * vm_area_add_early - add vmap area early during boot
+ * @vm: vm_struct to add
+ *
+ * This function is used to add a fixed kernel vm area to vmlist before
+ * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
+ * should contain proper values and the other fields should be zero.
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_add_early(struct vm_struct *vm)
+{
+ struct vm_struct *tmp, **p;
+
+ BUG_ON(vmap_initialized);
+ for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
+ if (tmp->addr >= vm->addr) {
+ BUG_ON(tmp->addr < vm->addr + vm->size);
+ break;
+ } else
+ BUG_ON(tmp->addr + tmp->size > vm->addr);
+ }
+ vm->next = *p;
+ *p = vm;
+}
+
+/**
* vm_area_register_early - register vmap area early during boot
* @vm: vm_struct to register
* @align: requested alignment
@@ -1139,8 +1165,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
vm->addr = (void *)addr;
- vm->next = vmlist;
- vmlist = vm;
+ vm_area_add_early(vm);
}
void __init vmalloc_init(void)
@@ -1253,18 +1278,22 @@ EXPORT_SYMBOL_GPL(map_vm_area);
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
-static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, void *caller)
{
- struct vm_struct *tmp, **p;
-
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
- va->private = vm;
+ va->vm = vm;
va->flags |= VM_VM_AREA;
+}
+
+static void insert_vmalloc_vmlist(struct vm_struct *vm)
+{
+ struct vm_struct *tmp, **p;
+ vm->flags &= ~VM_UNLIST;
write_lock(&vmlist_lock);
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
if (tmp->addr >= vm->addr)
@@ -1275,11 +1304,18 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
write_unlock(&vmlist_lock);
}
+static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, void *caller)
+{
+ setup_vmalloc_vm(vm, va, flags, caller);
+ insert_vmalloc_vmlist(vm);
+}
+
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask, void *caller)
{
- static struct vmap_area *va;
+ struct vmap_area *va;
struct vm_struct *area;
BUG_ON(in_interrupt());
@@ -1313,7 +1349,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}
- insert_vmalloc_vm(area, va, flags, caller);
+ /*
+ * When this function is called from __vmalloc_node_range,
+ * we do not add vm_struct to vmlist here to avoid
+ * accessing uninitialized members of vm_struct such as
+ * pages and nr_pages fields. They will be set later.
+ * To distinguish it from others, we use a VM_UNLIST flag.
+ */
+ if (flags & VM_UNLIST)
+ setup_vmalloc_vm(area, va, flags, caller);
+ else
+ insert_vmalloc_vm(area, va, flags, caller);
+
return area;
}
@@ -1361,7 +1408,7 @@ static struct vm_struct *find_vm_area(const void *addr)
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA)
- return va->private;
+ return va->vm;
return NULL;
}
@@ -1380,18 +1427,21 @@ struct vm_struct *remove_vm_area(const void *addr)
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA) {
- struct vm_struct *vm = va->private;
- struct vm_struct *tmp, **p;
- /*
- * remove from list and disallow access to this vm_struct
- * before unmap. (address range confliction is maintained by
- * vmap.)
- */
- write_lock(&vmlist_lock);
- for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
- ;
- *p = tmp->next;
- write_unlock(&vmlist_lock);
+ struct vm_struct *vm = va->vm;
+
+ if (!(vm->flags & VM_UNLIST)) {
+ struct vm_struct *tmp, **p;
+ /*
+ * remove from list and disallow access to
+ * this vm_struct before unmap. (address range
+ * confliction is maintained by vmap.)
+ */
+ write_lock(&vmlist_lock);
+ for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
+ ;
+ *p = tmp->next;
+ write_unlock(&vmlist_lock);
+ }
vmap_debug_free_range(va->va_start, va->va_end);
free_unmap_vmap_area(va);
@@ -1568,8 +1618,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
- warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, "
- "allocated %ld of %ld bytes\n",
+ warn_alloc_failed(gfp_mask, order,
+ "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
(area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
@@ -1600,15 +1650,22 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
size = PAGE_ALIGN(size);
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
- return NULL;
-
- area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
- gfp_mask, caller);
+ goto fail;
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
+ start, end, node, gfp_mask, caller);
if (!area)
- return NULL;
+ goto fail;
addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
+ if (!addr)
+ return NULL;
+
+ /*
+ * In this function, a newly allocated vm_struct is not added
+ * to the vmlist by __get_vm_area_node(), so it is added here.
+ */
+ insert_vmalloc_vmlist(area);
/*
* A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -1618,6 +1675,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
kmemleak_alloc(addr, real_size, 3, gfp_mask);
return addr;
+
+fail:
+ warn_alloc_failed(gfp_mask, 0,
+ "vmalloc: allocation failure: %lu bytes\n",
+ real_size);
+ return NULL;
}
/**
@@ -2105,23 +2168,30 @@ void __attribute__((weak)) vmalloc_sync_all(void)
static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
{
- /* apply_to_page_range() does all the hard work. */
+ pte_t ***p = data;
+
+ if (p) {
+ *(*p) = pte;
+ (*p)++;
+ }
return 0;
}
/**
* alloc_vm_area - allocate a range of kernel address space
* @size: size of the area
+ * @ptes: returns the PTEs for the address space
*
* Returns: NULL on failure, vm_struct on success
*
* This function reserves a range of kernel address space, and
* allocates pagetables to map that range. No actual mappings
- * are created. If the kernel address space is not shared
- * between processes, it syncs the pagetable across all
- * processes.
+ * are created.
+ *
+ * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
+ * allocated for the VM area are returned.
*/
-struct vm_struct *alloc_vm_area(size_t size)
+struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
struct vm_struct *area;
@@ -2135,19 +2205,11 @@ struct vm_struct *alloc_vm_area(size_t size)
* of kernel virtual address space and mapped into init_mm.
*/
if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
- area->size, f, NULL)) {
+ size, f, ptes ? &ptes : NULL)) {
free_vm_area(area);
return NULL;
}
- /*
- * If the allocated address space is passed to a hypercall
- * before being used then we cannot rely on a page fault to
- * trigger an update of the page tables. So sync all the page
- * tables here.
- */
- vmalloc_sync_all();
-
return area;
}
EXPORT_SYMBOL_GPL(alloc_vm_area);
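
/*
 * Illustrative sketch, not part of the patch (kernel context assumed):
 * with the new @ptes argument a caller can reserve a window of kernel
 * address space and receive the init_mm PTEs backing it, to be pointed
 * at foreign or shared frames later (the Xen drivers are the kind of
 * user this is aimed at).  map_window() and its error handling are this
 * sketch's own, not an existing kernel helper.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static struct vm_struct *map_window(size_t size, pte_t ***ptes_out)
{
	unsigned long nr = size >> PAGE_SHIFT;
	pte_t **ptes = kcalloc(nr, sizeof(*ptes), GFP_KERNEL);
	struct vm_struct *area;

	if (!ptes)
		return NULL;

	area = alloc_vm_area(size, ptes);	/* reserves space, no mappings */
	if (!area) {
		kfree(ptes);
		return NULL;
	}

	*ptes_out = ptes;	/* ptes[i] is the PTE for page i of the area */
	return area;		/* undo later with free_vm_area(area)        */
}
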
@@ -2316,7 +2378,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
if (!vas || !vms)
- goto err_free;
+ goto err_free2;
for (area = 0; area < nr_vms; area++) {
vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
@@ -2414,11 +2476,10 @@ found:
err_free:
for (area = 0; area < nr_vms; area++) {
- if (vas)
- kfree(vas[area]);
- if (vms)
- kfree(vms[area]);
+ kfree(vas[area]);
+ kfree(vms[area]);
}
+err_free2:
kfree(vas);
kfree(vms);
return NULL;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b55699cd9067..2880396f7953 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -103,8 +103,11 @@ struct scan_control {
*/
reclaim_mode_t reclaim_mode;
- /* Which cgroup do we reclaim from */
- struct mem_cgroup *mem_cgroup;
+ /*
+ * The memory cgroup that hit its limit and as a result is the
+ * primary target of this reclaim invocation.
+ */
+ struct mem_cgroup *target_mem_cgroup;
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -113,6 +116,11 @@ struct scan_control {
nodemask_t *nodemask;
};
+struct mem_cgroup_zone {
+ struct mem_cgroup *mem_cgroup;
+ struct zone *zone;
+};
+
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
#ifdef ARCH_HAS_PREFETCH
@@ -153,28 +161,45 @@ static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-#define scanning_global_lru(sc) (!(sc)->mem_cgroup)
+static bool global_reclaim(struct scan_control *sc)
+{
+ return !sc->target_mem_cgroup;
+}
+
+static bool scanning_global_lru(struct mem_cgroup_zone *mz)
+{
+ return !mz->mem_cgroup;
+}
#else
-#define scanning_global_lru(sc) (1)
+static bool global_reclaim(struct scan_control *sc)
+{
+ return true;
+}
+
+static bool scanning_global_lru(struct mem_cgroup_zone *mz)
+{
+ return true;
+}
#endif
-static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
- struct scan_control *sc)
+static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
{
- if (!scanning_global_lru(sc))
- return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
+ if (!scanning_global_lru(mz))
+ return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
- return &zone->reclaim_stat;
+ return &mz->zone->reclaim_stat;
}
-static unsigned long zone_nr_lru_pages(struct zone *zone,
- struct scan_control *sc, enum lru_list lru)
+static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
+ enum lru_list lru)
{
- if (!scanning_global_lru(sc))
- return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
- zone_to_nid(zone), zone_idx(zone), BIT(lru));
+ if (!scanning_global_lru(mz))
+ return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
+ zone_to_nid(mz->zone),
+ zone_idx(mz->zone),
+ BIT(lru));
- return zone_page_state(zone, NR_LRU_BASE + lru);
+ return zone_page_state(mz->zone, NR_LRU_BASE + lru);
}
@@ -183,7 +208,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
*/
void register_shrinker(struct shrinker *shrinker)
{
- shrinker->nr = 0;
+ atomic_long_set(&shrinker->nr_in_batch, 0);
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
@@ -247,25 +272,26 @@ unsigned long shrink_slab(struct shrink_control *shrink,
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
- unsigned long total_scan;
- unsigned long max_pass;
+ long total_scan;
+ long max_pass;
int shrink_ret = 0;
long nr;
long new_nr;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
+ max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+ if (max_pass <= 0)
+ continue;
+
/*
* copy the current shrinker scan count into a local variable
* and zero it so that other concurrent shrinker invocations
* don't also do this scanning work.
*/
- do {
- nr = shrinker->nr;
- } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+ nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
total_scan = nr;
- max_pass = do_shrinker_shrink(shrinker, shrink, 0);
delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
@@ -325,12 +351,11 @@ unsigned long shrink_slab(struct shrink_control *shrink,
* manner that handles concurrent updates. If we exhausted the
* scan, there is no need to do an update.
*/
- do {
- nr = shrinker->nr;
- new_nr = total_scan + nr;
- if (total_scan <= 0)
- break;
- } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+ if (total_scan > 0)
+ new_nr = atomic_long_add_return(total_scan,
+ &shrinker->nr_in_batch);
+ else
+ new_nr = atomic_long_read(&shrinker->nr_in_batch);
trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
}
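
/*
 * Illustrative sketch, not part of the patch: the open-coded cmpxchg
 * loops on shrinker->nr are replaced above by plain atomic operations on
 * nr_in_batch.  The pattern is "claim everything pending with an
 * exchange, work on a private copy, then add back whatever was not
 * consumed".  A userspace model with C11 atomics:
 */
#include <stdatomic.h>

static atomic_long nr_in_batch;	/* deferred scan work shared by callers */

static void scan_model(long newly_deferred, long (*do_scan)(long))
{
	/* claim the pending count so concurrent callers don't rescan it */
	long total = atomic_exchange(&nr_in_batch, 0) + newly_deferred;
	long done = do_scan(total);

	/* hand any unscanned remainder back for the next invocation */
	if (total - done > 0)
		atomic_fetch_add(&nr_in_batch, total - done);
}
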
@@ -495,15 +520,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
return PAGE_ACTIVATE;
}
- /*
- * Wait on writeback if requested to. This happens when
- * direct reclaiming a large contiguous area and the
- * first attempt to free a range of pages fails.
- */
- if (PageWriteback(page) &&
- (sc->reclaim_mode & RECLAIM_MODE_SYNC))
- wait_on_page_writeback(page);
-
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
@@ -642,13 +658,14 @@ redo:
lru = LRU_UNEVICTABLE;
add_page_to_unevictable_list(page);
/*
- * When racing with an mlock clearing (page is
- * unlocked), make sure that if the other thread does
- * not observe our setting of PG_lru and fails
- * isolation, we see PG_mlocked cleared below and move
+ * When racing with an mlock or AS_UNEVICTABLE clearing
+ * (page is unlocked) make sure that if the other thread
+ * does not observe our setting of PG_lru and fails
+ * isolation/check_move_unevictable_page,
+ * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
* the page back to the evictable list.
*
- * The other side is TestClearPageMlocked().
+ * The other side is TestClearPageMlocked() or shmem_lock().
*/
smp_mb();
}
@@ -685,12 +702,13 @@ enum page_references {
};
static enum page_references page_check_references(struct page *page,
+ struct mem_cgroup_zone *mz,
struct scan_control *sc)
{
int referenced_ptes, referenced_page;
unsigned long vm_flags;
- referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
+ referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
referenced_page = TestClearPageReferenced(page);
/* Lumpy reclaim - ignore references */
@@ -723,7 +741,13 @@ static enum page_references page_check_references(struct page *page,
*/
SetPageReferenced(page);
- if (referenced_page)
+ if (referenced_page || referenced_ptes > 1)
+ return PAGEREF_ACTIVATE;
+
+ /*
+ * Activate file-backed executable pages after first usage.
+ */
+ if (vm_flags & VM_EXEC)
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
@@ -736,30 +760,15 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_RECLAIM;
}
-static noinline_for_stack void free_page_list(struct list_head *free_pages)
-{
- struct pagevec freed_pvec;
- struct page *page, *tmp;
-
- pagevec_init(&freed_pvec, 1);
-
- list_for_each_entry_safe(page, tmp, free_pages, lru) {
- list_del(&page->lru);
- if (!pagevec_add(&freed_pvec, page)) {
- __pagevec_free(&freed_pvec);
- pagevec_reinit(&freed_pvec);
- }
- }
-
- pagevec_free(&freed_pvec);
-}
-
/*
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct zone *zone,
- struct scan_control *sc)
+ struct mem_cgroup_zone *mz,
+ struct scan_control *sc,
+ int priority,
+ unsigned long *ret_nr_dirty,
+ unsigned long *ret_nr_writeback)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
@@ -767,6 +776,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
unsigned long nr_dirty = 0;
unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
+ unsigned long nr_writeback = 0;
cond_resched();
@@ -785,7 +795,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep;
VM_BUG_ON(PageActive(page));
- VM_BUG_ON(page_zone(page) != zone);
+ VM_BUG_ON(page_zone(page) != mz->zone);
sc->nr_scanned++;
@@ -803,13 +813,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
if (PageWriteback(page)) {
+ nr_writeback++;
/*
- * Synchronous reclaim is performed in two passes,
- * first an asynchronous pass over the list to
- * start parallel writeback, and a second synchronous
- * pass to wait for the IO to complete. Wait here
- * for any page for which writeback has already
- * started.
+ * Synchronous reclaim cannot queue pages for
+ * writeback due to the possibility of stack overflow
+ * but if it encounters a page under writeback, wait
+ * for the IO to complete.
*/
if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
may_enter_fs)
@@ -820,7 +829,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
- references = page_check_references(page, sc);
+ references = page_check_references(page, mz, sc);
switch (references) {
case PAGEREF_ACTIVATE:
goto activate_locked;
@@ -865,6 +874,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageDirty(page)) {
nr_dirty++;
+ /*
+ * Only kswapd can writeback filesystem pages to
+ * avoid risk of stack overflow but do not writeback
+ * unless under significant pressure.
+ */
+ if (page_is_file_cache(page) &&
+ (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+ /*
+ * Immediately reclaim when written back.
+ * Similar in principle to deactivate_page()
+ * except we already have the page isolated
+ * and know it's dirty
+ */
+ inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+ SetPageReclaim(page);
+
+ goto keep_locked;
+ }
+
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
@@ -992,13 +1020,15 @@ keep_lumpy:
* back off and wait for congestion to clear because further reclaim
* will encounter the same problem
*/
- if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
- zone_set_flag(zone, ZONE_CONGESTED);
+ if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
+ zone_set_flag(mz->zone, ZONE_CONGESTED);
- free_page_list(&free_pages);
+ free_hot_cold_page_list(&free_pages, 1);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
+ *ret_nr_dirty += nr_dirty;
+ *ret_nr_writeback += nr_writeback;
return nr_reclaimed;
}
@@ -1012,23 +1042,27 @@ keep_lumpy:
*
* returns 0 on success, -ve errno on failure.
*/
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
{
+ bool all_lru_mode;
int ret = -EINVAL;
/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;
+ all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+ (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
/*
* When checking the active state, we need to be sure we are
* dealing with comparable boolean values. Take the logical not
* of each.
*/
- if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+ if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
return ret;
- if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+ if (!all_lru_mode && !!page_is_file_cache(page) != file)
return ret;
/*
@@ -1041,6 +1075,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)
ret = -EBUSY;
+ /*
+ * To minimise LRU disruption, the caller can indicate that it only
+ * wants to isolate pages it will be able to operate on without
+ * blocking - clean pages for the most part.
+ *
+ * ISOLATE_CLEAN means that only clean pages should be isolated. This
+ * is used by reclaim when it cannot write to backing storage
+ *
+ * ISOLATE_ASYNC_MIGRATE is used to indicate that the caller only wants
+ * pages that can be migrated without blocking
+ */
+ if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+ /* All the caller can do on PageWriteback is block */
+ if (PageWriteback(page))
+ return ret;
+
+ if (PageDirty(page)) {
+ struct address_space *mapping;
+
+ /* ISOLATE_CLEAN means only clean pages */
+ if (mode & ISOLATE_CLEAN)
+ return ret;
+
+ /*
+ * Only pages without mappings or that have a
+ * ->migratepage callback are possible to migrate
+ * without blocking
+ */
+ mapping = page_mapping(page);
+ if (mapping && !mapping->a_ops->migratepage)
+ return ret;
+ }
+ }
+
+ if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+ return ret;
+
if (likely(get_page_unless_zero(page))) {
/*
* Be careful not to clear PageLRU until after we're
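
__isolate_lru_page() now takes a bitmask (isolate_mode_t) instead of the old ISOLATE_INACTIVE/ISOLATE_ACTIVE/ISOLATE_BOTH values, and ISOLATE_CLEAN, ISOLATE_ASYNC_MIGRATE and ISOLATE_UNMAPPED further restrict which pages a caller will accept. A compact sketch of composing and testing such a mask; the flag values here are assumptions for illustration, not copied from the kernel headers.

#include <stdio.h>

#define ISOLATE_INACTIVE	(1 << 0)
#define ISOLATE_ACTIVE		(1 << 1)
#define ISOLATE_UNMAPPED	(1 << 2)
#define ISOLATE_CLEAN		(1 << 3)
#define ISOLATE_ASYNC_MIGRATE	(1 << 4)

int main(void)
{
	unsigned mode = ISOLATE_INACTIVE;

	/* lumpy reclaim also takes active pages, mirroring the hunks above */
	mode |= ISOLATE_ACTIVE;
	/* a !may_writepage scan only wants clean pages */
	mode |= ISOLATE_CLEAN;

	printf("both LRUs: %d, clean only: %d\n",
	       (mode & (ISOLATE_ACTIVE | ISOLATE_INACTIVE)) ==
			(ISOLATE_ACTIVE | ISOLATE_INACTIVE),
	       !!(mode & ISOLATE_CLEAN));
	return 0;
}
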
@@ -1065,24 +1136,36 @@ int __isolate_lru_page(struct page *page, int mode, int file)
* Appropriate locks must be held before calling this function.
*
* @nr_to_scan: The number of pages to look through on the list.
- * @src: The LRU list to pull pages off.
+ * @mz: The mem_cgroup_zone to pull pages from.
* @dst: The temp list to put pages on to.
- * @scanned: The number of pages that were scanned.
+ * @nr_scanned: The number of pages that were scanned.
* @order: The caller's attempted allocation order
* @mode: One of the LRU isolation modes
+ * @active: True [1] if isolating active pages
* @file: True [1] if isolating file [!anon] pages
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
- struct list_head *src, struct list_head *dst,
- unsigned long *scanned, int order, int mode, int file)
+ struct mem_cgroup_zone *mz, struct list_head *dst,
+ unsigned long *nr_scanned, int order, isolate_mode_t mode,
+ int active, int file)
{
+ struct lruvec *lruvec;
+ struct list_head *src;
unsigned long nr_taken = 0;
unsigned long nr_lumpy_taken = 0;
unsigned long nr_lumpy_dirty = 0;
unsigned long nr_lumpy_failed = 0;
unsigned long scan;
+ int lru = LRU_BASE;
+
+ lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
+ if (active)
+ lru += LRU_ACTIVE;
+ if (file)
+ lru += LRU_FILE;
+ src = &lruvec->lists[lru];
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
struct page *page;
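
isolate_lru_pages() now locates its source list itself: LRU_BASE plus LRU_ACTIVE and/or LRU_FILE selects one of the four evictable lists in the per-memcg lruvec. A tiny sketch of that index arithmetic; the enum mirrors the kernel's LRU_* layout and is treated here as an assumption.

#include <stdio.h>

enum lru_list {
	LRU_BASE,
	LRU_ACTIVE = 1,
	LRU_FILE = 2,
	LRU_INACTIVE_ANON = LRU_BASE,
	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
	LRU_ACTIVE_FILE = LRU_BASE + LRU_ACTIVE + LRU_FILE,
};

/* Same computation as the hunk above: base + active + file. */
static int pick_lru(int active, int file)
{
	int lru = LRU_BASE;

	if (active)
		lru += LRU_ACTIVE;
	if (file)
		lru += LRU_FILE;
	return lru;
}

int main(void)
{
	printf("%d %d %d %d\n", pick_lru(0, 0), pick_lru(1, 0),
	       pick_lru(0, 1), pick_lru(1, 1));	/* 0 1 2 3 */
	return 0;
}
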
@@ -1098,15 +1181,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
switch (__isolate_lru_page(page, mode, file)) {
case 0:
+ mem_cgroup_lru_del(page);
list_move(&page->lru, dst);
- mem_cgroup_del_lru(page);
nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* else it is being freed elsewhere */
list_move(&page->lru, src);
- mem_cgroup_rotate_lru_list(page, page_lru(page));
continue;
default:
@@ -1151,18 +1233,22 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* anon page which don't already have a swap slot is
* pointless.
*/
- if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
+ if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
!PageSwapCache(cursor_page))
break;
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
+ unsigned int isolated_pages;
+
+ mem_cgroup_lru_del(cursor_page);
list_move(&cursor_page->lru, dst);
- mem_cgroup_del_lru(cursor_page);
- nr_taken += hpage_nr_pages(page);
- nr_lumpy_taken++;
+ isolated_pages = hpage_nr_pages(cursor_page);
+ nr_taken += isolated_pages;
+ nr_lumpy_taken += isolated_pages;
if (PageDirty(cursor_page))
- nr_lumpy_dirty++;
+ nr_lumpy_dirty += isolated_pages;
scan++;
+ pfn += isolated_pages - 1;
} else {
/*
* Check if the page is freed already.
@@ -1188,57 +1274,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
nr_lumpy_failed++;
}
- *scanned = scan;
+ *nr_scanned = scan;
trace_mm_vmscan_lru_isolate(order,
nr_to_scan, scan,
nr_taken,
nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
- mode);
+ mode, file);
return nr_taken;
}
-static unsigned long isolate_pages_global(unsigned long nr,
- struct list_head *dst,
- unsigned long *scanned, int order,
- int mode, struct zone *z,
- int active, int file)
-{
- int lru = LRU_BASE;
- if (active)
- lru += LRU_ACTIVE;
- if (file)
- lru += LRU_FILE;
- return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
- mode, file);
-}
-
-/*
- * clear_active_flags() is a helper for shrink_active_list(), clearing
- * any active bits from the pages in the list.
- */
-static unsigned long clear_active_flags(struct list_head *page_list,
- unsigned int *count)
-{
- int nr_active = 0;
- int lru;
- struct page *page;
-
- list_for_each_entry(page, page_list, lru) {
- int numpages = hpage_nr_pages(page);
- lru = page_lru_base_type(page);
- if (PageActive(page)) {
- lru += LRU_ACTIVE;
- ClearPageActive(page);
- nr_active += numpages;
- }
- if (count)
- count[lru] += numpages;
- }
-
- return nr_active;
-}
-
/**
* isolate_lru_page - tries to isolate a page from its LRU list
* @page: page to isolate from its LRU list
@@ -1298,7 +1343,7 @@ static int too_many_isolated(struct zone *zone, int file,
if (current_is_kswapd())
return 0;
- if (!scanning_global_lru(sc))
+ if (!global_reclaim(sc))
return 0;
if (file) {
@@ -1312,27 +1357,21 @@ static int too_many_isolated(struct zone *zone, int file,
return isolated > inactive;
}
-/*
- * TODO: Try merging with migrations version of putback_lru_pages
- */
static noinline_for_stack void
-putback_lru_pages(struct zone *zone, struct scan_control *sc,
- unsigned long nr_anon, unsigned long nr_file,
- struct list_head *page_list)
+putback_inactive_pages(struct mem_cgroup_zone *mz,
+ struct list_head *page_list)
{
- struct page *page;
- struct pagevec pvec;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
-
- pagevec_init(&pvec, 1);
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+ struct zone *zone = mz->zone;
+ LIST_HEAD(pages_to_free);
/*
* Put back any unfreeable pages.
*/
- spin_lock(&zone->lru_lock);
while (!list_empty(page_list)) {
+ struct page *page = lru_to_page(page_list);
int lru;
- page = lru_to_page(page_list);
+
VM_BUG_ON(PageLRU(page));
list_del(&page->lru);
if (unlikely(!page_evictable(page, NULL))) {
@@ -1349,30 +1388,53 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
int numpages = hpage_nr_pages(page);
reclaim_stat->recent_rotated[file] += numpages;
}
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
+ if (put_page_testzero(page)) {
+ __ClearPageLRU(page);
+ __ClearPageActive(page);
+ del_page_from_lru_list(zone, page, lru);
+
+ if (unlikely(PageCompound(page))) {
+ spin_unlock_irq(&zone->lru_lock);
+ (*get_compound_page_dtor(page))(page);
+ spin_lock_irq(&zone->lru_lock);
+ } else
+ list_add(&page->lru, &pages_to_free);
}
}
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
+ /*
+ * To save our caller's stack, now use input list for pages to free.
+ */
+ list_splice(&pages_to_free, page_list);
}
-static noinline_for_stack void update_isolated_counts(struct zone *zone,
- struct scan_control *sc,
- unsigned long *nr_anon,
- unsigned long *nr_file,
- struct list_head *isolated_list)
+static noinline_for_stack void
+update_isolated_counts(struct mem_cgroup_zone *mz,
+ struct list_head *page_list,
+ unsigned long *nr_anon,
+ unsigned long *nr_file)
{
- unsigned long nr_active;
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+ struct zone *zone = mz->zone;
unsigned int count[NR_LRU_LISTS] = { 0, };
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+ unsigned long nr_active = 0;
+ struct page *page;
+ int lru;
+
+ /*
+ * Count pages and clear active flags
+ */
+ list_for_each_entry(page, page_list, lru) {
+ int numpages = hpage_nr_pages(page);
+ lru = page_lru_base_type(page);
+ if (PageActive(page)) {
+ lru += LRU_ACTIVE;
+ ClearPageActive(page);
+ nr_active += numpages;
+ }
+ count[lru] += numpages;
+ }
- nr_active = clear_active_flags(isolated_list, count);
__count_vm_events(PGDEACTIVATE, nr_active);
__mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1386,15 +1448,13 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
reclaim_stat->recent_scanned[0] += *nr_anon;
reclaim_stat->recent_scanned[1] += *nr_file;
}
/*
- * Returns true if the caller should wait to clean dirty/writeback pages.
+ * Returns true if a direct reclaim should wait on pages under writeback.
*
* If we are direct reclaiming for contiguous pages and we do not reclaim
* everything in the list, try again and wait for writeback IO to complete.
@@ -1416,7 +1476,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
return false;
- /* If we have relaimed everything on the isolated list, no stall */
+ /* If we have reclaimed everything on the isolated list, no stall */
if (nr_freed == nr_taken)
return false;
@@ -1439,8 +1499,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
* of reclaimed pages
*/
static noinline_for_stack unsigned long
-shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
- struct scan_control *sc, int priority, int file)
+shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
+ struct scan_control *sc, int priority, int file)
{
LIST_HEAD(page_list);
unsigned long nr_scanned;
@@ -1448,6 +1508,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
unsigned long nr_taken;
unsigned long nr_anon;
unsigned long nr_file;
+ unsigned long nr_dirty = 0;
+ unsigned long nr_writeback = 0;
+ isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
+ struct zone *zone = mz->zone;
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1458,15 +1522,22 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
}
set_reclaim_mode(priority, sc, false);
+ if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+ reclaim_mode |= ISOLATE_ACTIVE;
+
lru_add_drain();
+
+ if (!sc->may_unmap)
+ reclaim_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ reclaim_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
- if (scanning_global_lru(sc)) {
- nr_taken = isolate_pages_global(nr_to_scan,
- &page_list, &nr_scanned, sc->order,
- sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
- zone, 0, file);
+ nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list,
+ &nr_scanned, sc->order,
+ reclaim_mode, 0, file);
+ if (global_reclaim(sc)) {
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1474,17 +1545,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
else
__count_zone_vm_events(PGSCAN_DIRECT, zone,
nr_scanned);
- } else {
- nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
- &page_list, &nr_scanned, sc->order,
- sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
- zone, sc->mem_cgroup,
- 0, file);
- /*
- * mem_cgroup_isolate_pages() keeps track of
- * scanned pages on its own.
- */
}
if (nr_taken == 0) {
@@ -1492,24 +1552,63 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
return 0;
}
- update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
+ update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
+
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
spin_unlock_irq(&zone->lru_lock);
- nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+ nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
+ &nr_dirty, &nr_writeback);
/* Check if we should synchronously wait for writeback */
if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
set_reclaim_mode(priority, sc, true);
- nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+ nr_reclaimed += shrink_page_list(&page_list, mz, sc,
+ priority, &nr_dirty, &nr_writeback);
}
- local_irq_disable();
+ spin_lock_irq(&zone->lru_lock);
+
if (current_is_kswapd())
__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
- putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+ putback_inactive_pages(mz, &page_list);
+
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+
+ spin_unlock_irq(&zone->lru_lock);
+
+ free_hot_cold_page_list(&page_list, 1);
+
+ /*
+ * If reclaim is isolating dirty pages under writeback, it implies
+ * that the long-lived page allocation rate is exceeding the page
+ * laundering rate. Either the global limits are not being effective
+ * at throttling processes due to the page distribution throughout
+ * zones or there is heavy usage of a slow backing device. The
+ * only option is to throttle from reclaim context which is not ideal
+ * as there is no guarantee the dirtying process is being throttled
+ * in the same way that balance_dirty_pages() does.
+ *
+ * This scales the number of dirty pages that must be under writeback
+ * before throttling depending on priority. It is a simple backoff
+ * function that has the most effect in the range DEF_PRIORITY to
+ * DEF_PRIORITY-2, which is the range where reclaim is considered
+ * to be in trouble.
+ *
+ * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
+ * DEF_PRIORITY-1 50% must be PageWriteback
+ * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
+ * ...
+ * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+ * isolated page is PageWriteback
+ */
+ if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+ wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
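
The throttle check added above scales with priority: at DEF_PRIORITY every isolated page must be under writeback before the shrinker waits, and each lower priority halves that requirement until any writeback page is enough. A small sketch of the threshold arithmetic, assuming DEF_PRIORITY is 12 and SWAP_CLUSTER_MAX (32) pages were isolated:

#include <stdio.h>

#define DEF_PRIORITY 12

int main(void)
{
	unsigned long nr_taken = 32;	/* e.g. SWAP_CLUSTER_MAX */
	int priority;

	/* throttle when nr_writeback >= nr_taken >> (DEF_PRIORITY - priority) */
	for (priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--)
		printf("priority %2d: throttle at %lu writeback pages\n",
		       priority, nr_taken >> (DEF_PRIORITY - priority));
	return 0;
}
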
@@ -1539,30 +1638,47 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
static void move_active_pages_to_lru(struct zone *zone,
struct list_head *list,
+ struct list_head *pages_to_free,
enum lru_list lru)
{
unsigned long pgmoved = 0;
- struct pagevec pvec;
struct page *page;
- pagevec_init(&pvec, 1);
+ if (buffer_heads_over_limit) {
+ spin_unlock_irq(&zone->lru_lock);
+ list_for_each_entry(page, list, lru) {
+ if (page_has_private(page) && trylock_page(page)) {
+ if (page_has_private(page))
+ try_to_release_page(page, 0);
+ unlock_page(page);
+ }
+ }
+ spin_lock_irq(&zone->lru_lock);
+ }
while (!list_empty(list)) {
+ struct lruvec *lruvec;
+
page = lru_to_page(list);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
- list_move(&page->lru, &zone->lru[lru].list);
- mem_cgroup_add_lru_list(page, lru);
+ lruvec = mem_cgroup_lru_add_list(zone, page, lru);
+ list_move(&page->lru, &lruvec->lists[lru]);
pgmoved += hpage_nr_pages(page);
- if (!pagevec_add(&pvec, page) || list_empty(list)) {
- spin_unlock_irq(&zone->lru_lock);
- if (buffer_heads_over_limit)
- pagevec_strip(&pvec);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
+ if (put_page_testzero(page)) {
+ __ClearPageLRU(page);
+ __ClearPageActive(page);
+ del_page_from_lru_list(zone, page, lru);
+
+ if (unlikely(PageCompound(page))) {
+ spin_unlock_irq(&zone->lru_lock);
+ (*get_compound_page_dtor(page))(page);
+ spin_lock_irq(&zone->lru_lock);
+ } else
+ list_add(&page->lru, pages_to_free);
}
}
__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1570,41 +1686,41 @@ static void move_active_pages_to_lru(struct zone *zone,
__count_vm_events(PGDEACTIVATE, pgmoved);
}
-static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
- struct scan_control *sc, int priority, int file)
+static void shrink_active_list(unsigned long nr_to_scan,
+ struct mem_cgroup_zone *mz,
+ struct scan_control *sc,
+ int priority, int file)
{
unsigned long nr_taken;
- unsigned long pgscanned;
+ unsigned long nr_scanned;
unsigned long vm_flags;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
unsigned long nr_rotated = 0;
+ isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
+ struct zone *zone = mz->zone;
lru_add_drain();
+
+ if (!sc->may_unmap)
+ reclaim_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ reclaim_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
- if (scanning_global_lru(sc)) {
- nr_taken = isolate_pages_global(nr_pages, &l_hold,
- &pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
- 1, file);
- zone->pages_scanned += pgscanned;
- } else {
- nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
- &pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
- sc->mem_cgroup, 1, file);
- /*
- * mem_cgroup_isolate_pages() keeps track of
- * scanned pages on its own.
- */
- }
+
+ nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold,
+ &nr_scanned, sc->order,
+ reclaim_mode, 1, file);
+ if (global_reclaim(sc))
+ zone->pages_scanned += nr_scanned;
reclaim_stat->recent_scanned[file] += nr_taken;
- __count_zone_vm_events(PGREFILL, zone, pgscanned);
+ __count_zone_vm_events(PGREFILL, zone, nr_scanned);
if (file)
__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
else
@@ -1622,7 +1738,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
continue;
}
- if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
+ if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
@@ -1655,12 +1771,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- move_active_pages_to_lru(zone, &l_active,
+ move_active_pages_to_lru(zone, &l_active, &l_hold,
LRU_ACTIVE + file * LRU_FILE);
- move_active_pages_to_lru(zone, &l_inactive,
+ move_active_pages_to_lru(zone, &l_inactive, &l_hold,
LRU_BASE + file * LRU_FILE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
+
+ free_hot_cold_page_list(&l_hold, 1);
}
#ifdef CONFIG_SWAP
@@ -1685,10 +1803,8 @@ static int inactive_anon_is_low_global(struct zone *zone)
* Returns true if the zone does not have enough inactive anon pages,
* meaning some active anon pages need to be deactivated.
*/
-static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
+static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
{
- int low;
-
/*
* If we don't have swap space, anonymous page deactivation
* is pointless.
@@ -1696,15 +1812,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
if (!total_swap_pages)
return 0;
- if (scanning_global_lru(sc))
- low = inactive_anon_is_low_global(zone);
- else
- low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
- return low;
+ if (!scanning_global_lru(mz))
+ return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
+ mz->zone);
+
+ return inactive_anon_is_low_global(mz->zone);
}
#else
-static inline int inactive_anon_is_low(struct zone *zone,
- struct scan_control *sc)
+static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
{
return 0;
}
@@ -1722,8 +1837,7 @@ static int inactive_file_is_low_global(struct zone *zone)
/**
* inactive_file_is_low - check if file pages need to be deactivated
- * @zone: zone to check
- * @sc: scan control of this context
+ * @mz: memory cgroup and zone to check
*
* When the system is doing streaming IO, memory pressure here
* ensures that active file pages get deactivated, until more
@@ -1735,45 +1849,44 @@ static int inactive_file_is_low_global(struct zone *zone)
* This uses a different ratio than the anonymous pages, because
* the page cache uses a use-once replacement algorithm.
*/
-static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
+static int inactive_file_is_low(struct mem_cgroup_zone *mz)
{
- int low;
+ if (!scanning_global_lru(mz))
+ return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
+ mz->zone);
- if (scanning_global_lru(sc))
- low = inactive_file_is_low_global(zone);
- else
- low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
- return low;
+ return inactive_file_is_low_global(mz->zone);
}
-static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
- int file)
+static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
{
if (file)
- return inactive_file_is_low(zone, sc);
+ return inactive_file_is_low(mz);
else
- return inactive_anon_is_low(zone, sc);
+ return inactive_anon_is_low(mz);
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct zone *zone, struct scan_control *sc, int priority)
+ struct mem_cgroup_zone *mz,
+ struct scan_control *sc, int priority)
{
int file = is_file_lru(lru);
if (is_active_lru(lru)) {
- if (inactive_list_is_low(zone, sc, file))
- shrink_active_list(nr_to_scan, zone, sc, priority, file);
+ if (inactive_list_is_low(mz, file))
+ shrink_active_list(nr_to_scan, mz, sc, priority, file);
return 0;
}
- return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
+ return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
}
-static int vmscan_swappiness(struct scan_control *sc)
+static int vmscan_swappiness(struct mem_cgroup_zone *mz,
+ struct scan_control *sc)
{
- if (scanning_global_lru(sc))
+ if (global_reclaim(sc))
return vm_swappiness;
- return mem_cgroup_swappiness(sc->mem_cgroup);
+ return mem_cgroup_swappiness(mz->mem_cgroup);
}
/*
@@ -1784,24 +1897,31 @@ static int vmscan_swappiness(struct scan_control *sc)
*
* nr[0] = anon pages to scan; nr[1] = file pages to scan
*/
-static void get_scan_count(struct zone *zone, struct scan_control *sc,
- unsigned long *nr, int priority)
+static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
+ unsigned long *nr, int priority)
{
unsigned long anon, file, free;
unsigned long anon_prio, file_prio;
unsigned long ap, fp;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
u64 fraction[2], denominator;
- enum lru_list l;
+ enum lru_list lru;
int noswap = 0;
bool force_scan = false;
- unsigned long nr_force_scan[2];
- /* kswapd does zone balancing and needs to scan this zone */
- if (scanning_global_lru(sc) && current_is_kswapd())
+ /*
+ * If the zone or memcg is small, nr[l] can be 0. This
+ * results in no scanning on this priority and a potential
+ * priority drop. Global direct reclaim can go to the next
+ * zone and tends to have no problems. Global kswapd is for
+ * zone balancing and it needs to scan a minimum amount. When
+ * reclaiming for a memcg, a priority drop can cause high
+ * latencies, so it's better to scan a minimum amount there as
+ * well.
+ */
+ if (current_is_kswapd() && mz->zone->all_unreclaimable)
force_scan = true;
- /* memcg may have small limit and need to avoid priority drop */
- if (!scanning_global_lru(sc))
+ if (!global_reclaim(sc))
force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
@@ -1810,26 +1930,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = 0;
fraction[1] = 1;
denominator = 1;
- nr_force_scan[0] = 0;
- nr_force_scan[1] = SWAP_CLUSTER_MAX;
goto out;
}
- anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
- file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
+ zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
+ file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
+ zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
- if (scanning_global_lru(sc)) {
- free = zone_page_state(zone, NR_FREE_PAGES);
+ if (global_reclaim(sc)) {
+ free = zone_page_state(mz->zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
force-scan anon pages. */
- if (unlikely(file + free <= high_wmark_pages(zone))) {
+ if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
- nr_force_scan[0] = SWAP_CLUSTER_MAX;
- nr_force_scan[1] = 0;
goto out;
}
}
@@ -1838,8 +1954,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
- anon_prio = vmscan_swappiness(sc);
- file_prio = 200 - vmscan_swappiness(sc);
+ anon_prio = vmscan_swappiness(mz, sc);
+ file_prio = 200 - vmscan_swappiness(mz, sc);
/*
* OK, so we have swap space and a fair amount of page cache
@@ -1852,7 +1968,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&mz->zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
@@ -1873,39 +1989,24 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&mz->zone->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
denominator = ap + fp + 1;
- if (force_scan) {
- unsigned long scan = SWAP_CLUSTER_MAX;
- nr_force_scan[0] = div64_u64(scan * ap, denominator);
- nr_force_scan[1] = div64_u64(scan * fp, denominator);
- }
out:
- for_each_evictable_lru(l) {
- int file = is_file_lru(l);
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
unsigned long scan;
- scan = zone_nr_lru_pages(zone, sc, l);
+ scan = zone_nr_lru_pages(mz, lru);
if (priority || noswap) {
scan >>= priority;
+ if (!scan && force_scan)
+ scan = SWAP_CLUSTER_MAX;
scan = div64_u64(scan * fraction[file], denominator);
}
-
- /*
- * If zone is small or memcg is small, nr[l] can be 0.
- * This results no-scan on this priority and priority drop down.
- * For global direct reclaim, it can visit next zone and tend
- * not to have problems. For global kswapd, it's for zone
- * balancing and it need to scan a small amounts. When using
- * memcg, priority drop can cause big latency. So, it's better
- * to scan small amount. See may_noscan above.
- */
- if (!scan && force_scan)
- scan = nr_force_scan[file];
- nr[l] = scan;
+ nr[lru] = scan;
}
}
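
With the force_scan handling moved inline, the per-LRU target is now scan >> priority, bumped to SWAP_CLUSTER_MAX if that rounds to zero and a minimum scan is required, then split by the anon/file fraction. A simplified sketch of that split (SWAP_CLUSTER_MAX assumed to be 32; fraction/denominator stand in for the recent_scanned/rotated ratios):

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long scan_target(unsigned long lru_size, int priority,
				 int force_scan, unsigned long fraction,
				 unsigned long denominator)
{
	unsigned long scan = lru_size >> priority;

	if (!scan && force_scan)
		scan = SWAP_CLUSTER_MAX;	/* avoid a useless priority drop */
	return scan * fraction / denominator;
}

int main(void)
{
	/* a small memcg: 1000 pages on the list at priority 12 */
	printf("%lu %lu\n",
	       scan_target(1000, 12, 0, 1, 2),	/* rounds to 0, skipped */
	       scan_target(1000, 12, 1, 1, 2));	/* forced up to 16 */
	return 0;
}
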
@@ -1916,7 +2017,7 @@ out:
* back to the allocator and call try_to_compact_zone(), we ensure that
* there are enough free pages for it to be likely successful
*/
-static inline bool should_continue_reclaim(struct zone *zone,
+static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
unsigned long nr_reclaimed,
unsigned long nr_scanned,
struct scan_control *sc)
@@ -1956,14 +2057,15 @@ static inline bool should_continue_reclaim(struct zone *zone,
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
- inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+ if (nr_swap_pages > 0)
+ inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
/* If compaction would go ahead or the allocation would succeed, stop */
- switch (compaction_suitable(zone, sc->order)) {
+ switch (compaction_suitable(mz->zone, sc->order)) {
case COMPACT_PARTIAL:
case COMPACT_CONTINUE:
return false;
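
should_continue_reclaim() now counts inactive anon pages toward the "enough left to keep reclaiming" estimate only when swap is available, while the compaction target stays at 2^(order+1) pages. A sketch of that arithmetic with made-up page counts (not kernel state):

#include <stdbool.h>
#include <stdio.h>

static bool continue_reclaim(int order, unsigned long nr_reclaimed,
			     unsigned long inactive_file,
			     unsigned long inactive_anon,
			     unsigned long nr_swap_pages)
{
	unsigned long pages_for_compaction = 2UL << order;
	unsigned long inactive_lru_pages = inactive_file;

	if (nr_swap_pages > 0)		/* anon only counts if it can be swapped */
		inactive_lru_pages += inactive_anon;

	return nr_reclaimed < pages_for_compaction &&
	       inactive_lru_pages > pages_for_compaction;
}

int main(void)
{
	/* an order-9 (THP-sized) request needs 1024 pages of headroom */
	printf("%d %d\n",
	       continue_reclaim(9, 100, 2000, 5000, 0),	/* 1: keep going */
	       continue_reclaim(9, 100, 500, 5000, 0));	/* 0: too little file LRU */
	return 0;
}
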
@@ -1975,30 +2077,32 @@ static inline bool should_continue_reclaim(struct zone *zone,
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_zone(int priority, struct zone *zone,
- struct scan_control *sc)
+static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
+ struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
- enum lru_list l;
+ enum lru_list lru;
unsigned long nr_reclaimed, nr_scanned;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ struct blk_plug plug;
restart:
nr_reclaimed = 0;
nr_scanned = sc->nr_scanned;
- get_scan_count(zone, sc, nr, priority);
+ get_scan_count(mz, sc, nr, priority);
+ blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
- for_each_evictable_lru(l) {
- if (nr[l]) {
+ for_each_evictable_lru(lru) {
+ if (nr[lru]) {
nr_to_scan = min_t(unsigned long,
- nr[l], SWAP_CLUSTER_MAX);
- nr[l] -= nr_to_scan;
+ nr[lru], SWAP_CLUSTER_MAX);
+ nr[lru] -= nr_to_scan;
- nr_reclaimed += shrink_list(l, nr_to_scan,
- zone, sc, priority);
+ nr_reclaimed += shrink_list(lru, nr_to_scan,
+ mz, sc, priority);
}
}
/*
@@ -2012,23 +2116,96 @@ restart:
if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
break;
}
+ blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_anon_is_low(zone, sc))
- shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+ if (inactive_anon_is_low(mz))
+ shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
/* reclaim/compaction might need reclaim to continue */
- if (should_continue_reclaim(zone, nr_reclaimed,
+ if (should_continue_reclaim(mz, nr_reclaimed,
sc->nr_scanned - nr_scanned, sc))
goto restart;
throttle_vm_writeout(sc->gfp_mask);
}
+static void shrink_zone(int priority, struct zone *zone,
+ struct scan_control *sc)
+{
+ struct mem_cgroup *root = sc->target_mem_cgroup;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .zone = zone,
+ .priority = priority,
+ };
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_iter(root, NULL, &reclaim);
+ do {
+ struct mem_cgroup_zone mz = {
+ .mem_cgroup = memcg,
+ .zone = zone,
+ };
+
+ shrink_mem_cgroup_zone(priority, &mz, sc);
+ /*
+ * Limit reclaim has historically picked one memcg and
+ * scanned it with decreasing priority levels until
+ * nr_to_reclaim had been reclaimed. This priority
+ * cycle is thus over after a single memcg.
+ *
+ * Direct reclaim and kswapd, on the other hand, have
+ * to scan all memory cgroups to fulfill the overall
+ * scan target for the zone.
+ */
+ if (!global_reclaim(sc)) {
+ mem_cgroup_iter_break(root, memcg);
+ break;
+ }
+ memcg = mem_cgroup_iter(root, memcg, &reclaim);
+ } while (memcg);
+}
+
+/* Returns true if compaction should go ahead for a high-order request */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+ unsigned long balance_gap, watermark;
+ bool watermark_ok;
+
+ /* Do not consider compaction for orders reclaim is meant to satisfy */
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
+ return false;
+
+ /*
+ * Compaction takes time to run and there are potentially other
+ * callers using the pages just freed. Continue reclaiming until
+ * there is a buffer of free pages available to give compaction
+ * a reasonable chance of completing and allocating the page
+ */
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
+ watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+
+ /*
+ * If compaction is deferred, reclaim up to a point where
+ * compaction will have a chance of success when re-enabled
+ */
+ if (compaction_deferred(zone))
+ return watermark_ok;
+
+ /* If compaction is not ready to start, keep reclaiming */
+ if (!compaction_suitable(zone, sc->order))
+ return false;
+
+ return watermark_ok;
+}
+
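
compaction_ready() keeps reclaim going until there is a buffer of free pages above the high watermark: the gap is the smaller of the low watermark and present_pages divided by KSWAPD_ZONE_BALANCE_GAP_RATIO, plus 2^(order+1) pages for the allocation itself. A sketch of that target, assuming KSWAPD_ZONE_BALANCE_GAP_RATIO is 100 as in this kernel:

#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100

static unsigned long compaction_watermark(unsigned long high_wmark,
					  unsigned long low_wmark,
					  unsigned long present_pages,
					  int order)
{
	unsigned long balance_gap =
		(present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
		KSWAPD_ZONE_BALANCE_GAP_RATIO;

	if (low_wmark < balance_gap)
		balance_gap = low_wmark;	/* min(low_wmark, gap) */

	return high_wmark + balance_gap + (2UL << order);
}

int main(void)
{
	/* a 1GiB zone (262144 4K pages) and an order-9 request */
	printf("%lu\n", compaction_watermark(3072, 1024, 262144, 9));	/* 5120 */
	return 0;
}
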
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2044,14 +2221,20 @@ restart:
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is ready to begin. This indicates to
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
*/
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ bool aborted_reclaim = false;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2061,11 +2244,26 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
* Take care memory controller reclaiming has small influence
* to global LRU.
*/
- if (scanning_global_lru(sc)) {
+ if (global_reclaim(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ if (COMPACTION_BUILD) {
+ /*
+ * If we already have plenty of memory free for
+ * compaction in this zone, don't free any more.
+ * Even though compaction is invoked for any
+ * non-zero order, only frequent costly order
+ * reclamation is disruptive enough to become a
+ * noticable problem, like transparent huge page
+ * allocations.
+ */
+ if (compaction_ready(zone, sc)) {
+ aborted_reclaim = true;
+ continue;
+ }
+ }
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
@@ -2083,6 +2281,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
shrink_zone(priority, zone, sc);
}
+
+ return aborted_reclaim;
}
static bool zone_reclaimable(struct zone *zone)
@@ -2136,23 +2336,25 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct zoneref *z;
struct zone *zone;
unsigned long writeback_threshold;
+ bool aborted_reclaim;
get_mems_allowed();
delayacct_freepages_start();
- if (scanning_global_lru(sc))
+ if (global_reclaim(sc))
count_vm_event(ALLOCSTALL);
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc->nr_scanned = 0;
if (!priority)
- disable_swap_token(sc->mem_cgroup);
- shrink_zones(priority, zonelist, sc);
+ disable_swap_token(sc->target_mem_cgroup);
+ aborted_reclaim = shrink_zones(priority, zonelist, sc);
+
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
*/
- if (scanning_global_lru(sc)) {
+ if (global_reclaim(sc)) {
unsigned long lru_pages = 0;
for_each_zone_zonelist(zone, z, zonelist,
gfp_zone(sc->gfp_mask)) {
@@ -2181,7 +2383,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
if (total_scanned > writeback_threshold) {
- wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+ wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+ WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
@@ -2212,8 +2415,12 @@ out:
if (oom_killer_disabled)
return 0;
+ /* Aborted reclaim to try compaction? don't OOM, then */
+ if (aborted_reclaim)
+ return 1;
+
/* top priority shrink_zones still had more to do? don't OOM, then */
- if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
+ if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
return 1;
return 0;
@@ -2230,7 +2437,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.may_unmap = 1,
.may_swap = 1,
.order = order,
- .mem_cgroup = NULL,
+ .target_mem_cgroup = NULL,
.nodemask = nodemask,
};
struct shrink_control shrink = {
@@ -2250,7 +2457,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
gfp_t gfp_mask, bool noswap,
struct zone *zone,
unsigned long *nr_scanned)
@@ -2262,7 +2469,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
.may_unmap = 1,
.may_swap = !noswap,
.order = 0,
- .mem_cgroup = mem,
+ .target_mem_cgroup = memcg,
+ };
+ struct mem_cgroup_zone mz = {
+ .mem_cgroup = memcg,
+ .zone = zone,
};
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2279,7 +2490,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_zone(0, zone, &sc);
+ shrink_mem_cgroup_zone(0, &mz, &sc);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2287,7 +2498,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
return sc.nr_reclaimed;
}
-unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
gfp_t gfp_mask,
bool noswap)
{
@@ -2300,7 +2511,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.may_swap = !noswap,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.order = 0,
- .mem_cgroup = mem_cont,
+ .target_mem_cgroup = memcg,
.nodemask = NULL, /* we don't care the placement */
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2314,7 +2525,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
* take care of from where we get pages. So the node where we start the
* scan does not need to be the current node.
*/
- nid = mem_cgroup_select_victim_node(mem_cont);
+ nid = mem_cgroup_select_victim_node(memcg);
zonelist = NODE_DATA(nid)->node_zonelists;
@@ -2330,6 +2541,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
}
#endif
+static void age_active_anon(struct zone *zone, struct scan_control *sc,
+ int priority)
+{
+ struct mem_cgroup *memcg;
+
+ if (!total_swap_pages)
+ return;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct mem_cgroup_zone mz = {
+ .mem_cgroup = memcg,
+ .zone = zone,
+ };
+
+ if (inactive_anon_is_low(&mz))
+ shrink_active_list(SWAP_CLUSTER_MAX, &mz,
+ sc, priority, 0);
+
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ } while (memcg);
+}
+
/*
* pgdat_balanced is used when checking if a node is balanced for high-order
* allocations. Only zones that meet watermarks and are in a zone allowed
@@ -2450,7 +2684,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
*/
.nr_to_reclaim = ULONG_MAX,
.order = order,
- .mem_cgroup = NULL,
+ .target_mem_cgroup = NULL,
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
@@ -2489,9 +2723,7 @@ loop_again:
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming.
*/
- if (inactive_anon_is_low(zone, &sc))
- shrink_active_list(SWAP_CLUSTER_MAX, zone,
- &sc, priority, 0);
+ age_active_anon(zone, &sc, priority);
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
@@ -2690,6 +2922,8 @@ out:
/* If balanced, clear the congested flag */
zone_clear_flag(zone, ZONE_CONGESTED);
+ if (i <= *classzone_idx)
+ balanced += zone->present_pages;
}
}
@@ -2763,7 +2997,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
static int kswapd(void *p)
{
unsigned long order, new_order;
+ unsigned balanced_order;
int classzone_idx, new_classzone_idx;
+ int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -2794,7 +3030,9 @@ static int kswapd(void *p)
set_freezable();
order = new_order = 0;
+ balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+ balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
int ret;
@@ -2803,7 +3041,8 @@ static int kswapd(void *p)
* new request of a similar or harder type will succeed soon
* so consider going to sleep on the basis we reclaimed at
*/
- if (classzone_idx >= new_classzone_idx && order == new_order) {
+ if (balanced_classzone_idx >= new_classzone_idx &&
+ balanced_order == new_order) {
new_order = pgdat->kswapd_max_order;
new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
@@ -2818,9 +3057,12 @@ static int kswapd(void *p)
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, order, classzone_idx);
+ kswapd_try_to_sleep(pgdat, balanced_order,
+ balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
+ new_order = order;
+ new_classzone_idx = classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
@@ -2835,7 +3077,9 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- order = balance_pgdat(pgdat, order, &classzone_idx);
+ balanced_classzone_idx = classzone_idx;
+ balanced_order = balance_pgdat(pgdat, order,
+ &balanced_classzone_idx);
}
}
return 0;
@@ -3268,16 +3512,18 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
*/
static void check_move_unevictable_page(struct page *page, struct zone *zone)
{
- VM_BUG_ON(PageActive(page));
+ struct lruvec *lruvec;
+ VM_BUG_ON(PageActive(page));
retry:
ClearPageUnevictable(page);
if (page_evictable(page, NULL)) {
enum lru_list l = page_lru_base_type(page);
__dec_zone_state(zone, NR_UNEVICTABLE);
- list_move(&page->lru, &zone->lru[l].list);
- mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
+ lruvec = mem_cgroup_lru_move_lists(zone, page,
+ LRU_UNEVICTABLE, l);
+ list_move(&page->lru, &lruvec->lists[l]);
__inc_zone_state(zone, NR_INACTIVE_ANON + l);
__count_vm_event(UNEVICTABLE_PGRESCUED);
} else {
@@ -3285,8 +3531,9 @@ retry:
* rotate unevictable list
*/
SetPageUnevictable(page);
- list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
- mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
+ lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE,
+ LRU_UNEVICTABLE);
+ list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]);
if (page_evictable(page, NULL))
goto retry;
}
@@ -3347,66 +3594,13 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
}
-/**
- * scan_zone_unevictable_pages - check unevictable list for evictable pages
- * @zone - zone of which to scan the unevictable list
- *
- * Scan @zone's unevictable LRU lists to check for pages that have become
- * evictable. Move those that have to @zone's inactive list where they
- * become candidates for reclaim, unless shrink_inactive_zone() decides
- * to reactivate them. Pages that are still unevictable are rotated
- * back onto @zone's unevictable list.
- */
-#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-static void scan_zone_unevictable_pages(struct zone *zone)
+static void warn_scan_unevictable_pages(void)
{
- struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
- unsigned long scan;
- unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
-
- while (nr_to_scan > 0) {
- unsigned long batch_size = min(nr_to_scan,
- SCAN_UNEVICTABLE_BATCH_SIZE);
-
- spin_lock_irq(&zone->lru_lock);
- for (scan = 0; scan < batch_size; scan++) {
- struct page *page = lru_to_page(l_unevictable);
-
- if (!trylock_page(page))
- continue;
-
- prefetchw_prev_lru_page(page, l_unevictable, flags);
-
- if (likely(PageLRU(page) && PageUnevictable(page)))
- check_move_unevictable_page(page, zone);
-
- unlock_page(page);
- }
- spin_unlock_irq(&zone->lru_lock);
-
- nr_to_scan -= batch_size;
- }
-}
-
-
-/**
- * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
- *
- * A really big hammer: scan all zones' unevictable LRU lists to check for
- * pages that have become evictable. Move those back to the zones'
- * inactive list where they become candidates for reclaim.
- * This occurs when, e.g., we have unswappable pages on the unevictable lists,
- * and we add swap to the system. As such, it runs in the context of a task
- * that has possibly/probably made some previously unevictable pages
- * evictable.
- */
-static void scan_all_zones_unevictable_pages(void)
-{
- struct zone *zone;
-
- for_each_zone(zone) {
- scan_zone_unevictable_pages(zone);
- }
+ printk_once(KERN_WARNING
+ "%s: The scan_unevictable_pages sysctl/node-interface has been "
+ "disabled for lack of a legitimate use case. If you have "
+ "one, please send an email to linux-mm@kvack.org.\n",
+ current->comm);
}
/*
@@ -3419,11 +3613,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
void __user *buffer,
size_t *length, loff_t *ppos)
{
+ warn_scan_unevictable_pages();
proc_doulongvec_minmax(table, write, buffer, length, ppos);
-
- if (write && *(unsigned long *)table->data)
- scan_all_zones_unevictable_pages();
-
scan_unevictable_pages = 0;
return 0;
}
@@ -3434,45 +3625,34 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
* a specified node's per zone unevictable lists for evictable pages.
*/
-static ssize_t read_scan_unevictable_node(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t read_scan_unevictable_node(struct device *dev,
+ struct device_attribute *attr,
char *buf)
{
+ warn_scan_unevictable_pages();
return sprintf(buf, "0\n"); /* always zero; should fit... */
}
-static ssize_t write_scan_unevictable_node(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t write_scan_unevictable_node(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
- struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
- struct zone *zone;
- unsigned long res;
- unsigned long req = strict_strtoul(buf, 10, &res);
-
- if (!req)
- return 1; /* zero is no-op */
-
- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!populated_zone(zone))
- continue;
- scan_zone_unevictable_pages(zone);
- }
+ warn_scan_unevictable_pages();
return 1;
}
-static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
+static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
read_scan_unevictable_node,
write_scan_unevictable_node);
int scan_unevictable_register_node(struct node *node)
{
- return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
+ return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
}
void scan_unevictable_unregister_node(struct node *node)
{
- sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
+ device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
}
#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d52b13d28e8f..f600557a7659 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
*
* vm_stat contains the global counters
*/
-atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
@@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
}
EXPORT_SYMBOL(__dec_zone_page_state);
-#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
* If we have cmpxchg_local support then we do not need to incur the overhead
* that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
@@ -702,6 +702,7 @@ const char * const vmstat_text[] = {
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",
+ "nr_vmscan_immediate_reclaim",
"nr_writeback_temp",
"nr_isolated_anon",
"nr_isolated_file",