summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig3
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c4
-rw-r--r--mm/filemap.c8
-rw-r--r--mm/gup.c42
-rw-r--r--mm/huge_memory.c7
-rw-r--r--mm/hugetlb.c44
-rw-r--r--mm/hugetlb_cgroup.c6
-rw-r--r--mm/init-mm.c1
-rw-r--r--mm/ksm.c11
-rw-r--r--mm/memblock.c27
-rw-r--r--mm/memcontrol.c342
-rw-r--r--mm/memfd.c345
-rw-r--r--mm/memory.c36
-rw-r--r--mm/memory_hotplug.c23
-rw-r--r--mm/mmap.c4
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page_alloc.c74
-rw-r--r--mm/page_counter.c100
-rw-r--r--mm/shmem.c382
-rw-r--r--mm/slab.c4
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c112
-rw-r--r--mm/sparse.c6
-rw-r--r--mm/swap_slots.c10
-rw-r--r--mm/swap_state.c3
-rw-r--r--mm/userfaultfd.c22
-rw-r--r--mm/util.c6
-rw-r--r--mm/vmalloc.c41
-rw-r--r--mm/vmpressure.c35
-rw-r--r--mm/vmscan.c38
32 files changed, 1008 insertions, 737 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3e0b6e87f65d..00bffa7a5112 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -754,3 +754,6 @@ config GUP_BENCHMARK
performance of get_user_pages_fast().
See tools/testing/selftests/vm/gup_benchmark.c
+
+config ARCH_HAS_PTE_SPECIAL
+ bool
diff --git a/mm/Makefile b/mm/Makefile
index b4e54a9ae9c5..8716bdabe1e6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -105,3 +105,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_HMM) += hmm.o
+obj-$(CONFIG_MEMFD_CREATE) += memfd.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8fe3ebd6ac00..347cc834c04a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -557,7 +557,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
memcg = mem_cgroup_from_css(memcg_css);
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
blkcg = css_to_blkcg(blkcg_css);
- memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+ memcg_cgwb_list = &memcg->cgwb_list;
blkcg_cgwb_list = &blkcg->cgwb_list;
/* look up again under lock and discard on blkcg mismatch */
@@ -736,7 +736,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
*/
void wb_memcg_offline(struct mem_cgroup *memcg)
{
- struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+ struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
struct bdi_writeback *wb, *next;
spin_lock_irq(&cgwb_lock);
diff --git a/mm/filemap.c b/mm/filemap.c
index 0604cb02e6f3..52517f28e6f4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2489,7 +2489,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
*/
-int filemap_fault(struct vm_fault *vmf)
+vm_fault_t filemap_fault(struct vm_fault *vmf)
{
int error;
struct file *file = vmf->vma->vm_file;
@@ -2499,7 +2499,7 @@ int filemap_fault(struct vm_fault *vmf)
pgoff_t offset = vmf->pgoff;
pgoff_t max_off;
struct page *page;
- int ret = 0;
+ vm_fault_t ret = 0;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
@@ -2693,11 +2693,11 @@ next:
}
EXPORT_SYMBOL(filemap_map_pages);
-int filemap_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
- int ret = VM_FAULT_LOCKED;
+ vm_fault_t ret = VM_FAULT_LOCKED;
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
diff --git a/mm/gup.c b/mm/gup.c
index 541904a7c60f..1020c7f8f5ee 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -212,53 +212,69 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
unsigned int flags, unsigned int *page_mask)
{
- pmd_t *pmd;
+ pmd_t *pmd, pmdval;
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
pmd = pmd_offset(pudp, address);
- if (pmd_none(*pmd))
+ /*
+ * The READ_ONCE() will stabilize the pmdval in a register or
+ * on the stack so that it will stop changing under the code.
+ */
+ pmdval = READ_ONCE(*pmd);
+ if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
+ if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
return no_page_table(vma, flags);
}
- if (is_hugepd(__hugepd(pmd_val(*pmd)))) {
+ if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
page = follow_huge_pd(vma, address,
- __hugepd(pmd_val(*pmd)), flags,
+ __hugepd(pmd_val(pmdval)), flags,
PMD_SHIFT);
if (page)
return page;
return no_page_table(vma, flags);
}
retry:
- if (!pmd_present(*pmd)) {
+ if (!pmd_present(pmdval)) {
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(*pmd));
- if (is_pmd_migration_entry(*pmd))
+ !is_pmd_migration_entry(pmdval));
+ if (is_pmd_migration_entry(pmdval))
pmd_migration_entry_wait(mm, pmd);
+ pmdval = READ_ONCE(*pmd);
+ /*
+ * MADV_DONTNEED may convert the pmd to null because
+ * mmap_sem is held in read mode
+ */
+ if (pmd_none(pmdval))
+ return no_page_table(vma, flags);
goto retry;
}
- if (pmd_devmap(*pmd)) {
+ if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
if (page)
return page;
}
- if (likely(!pmd_trans_huge(*pmd)))
+ if (likely(!pmd_trans_huge(pmdval)))
return follow_page_pte(vma, address, pmd, flags);
- if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+ if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
return no_page_table(vma, flags);
retry_locked:
ptl = pmd_lock(mm, pmd);
+ if (unlikely(pmd_none(*pmd))) {
+ spin_unlock(ptl);
+ return no_page_table(vma, flags);
+ }
if (unlikely(!pmd_present(*pmd))) {
spin_unlock(ptl);
if (likely(!(flags & FOLL_MIGRATION)))
@@ -1354,7 +1370,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
}
-#ifdef __HAVE_ARCH_PTE_SPECIAL
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
@@ -1430,7 +1446,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
{
return 0;
}
-#endif /* __HAVE_ARCH_PTE_SPECIAL */
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ac5591d8622c..ba8fdc0b6e7f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -483,11 +483,8 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
static inline struct list_head *page_deferred_list(struct page *page)
{
- /*
- * ->lru in the tail pages is occupied by compound_head.
- * Let's use ->mapping + ->index in the second tail page as list_head.
- */
- return (struct list_head *)&page[2].mapping;
+ /* ->lru in the tail pages is occupied by compound_head. */
+ return &page[2].deferred_list;
}
void prep_transhuge_page(struct page *page)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 129088710510..696befffe6f7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3159,7 +3159,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
* hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
* this far.
*/
-static int hugetlb_vm_op_fault(struct vm_fault *vmf)
+static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
{
BUG();
return 0;
@@ -3686,6 +3686,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
pte_t new_pte;
spinlock_t *ptl;
+ unsigned long haddr = address & huge_page_mask(h);
/*
* Currently, we are forced to kill the process in the event the
@@ -3716,7 +3717,7 @@ retry:
u32 hash;
struct vm_fault vmf = {
.vma = vma,
- .address = address,
+ .address = haddr,
.flags = flags,
/*
* Hard to debug if it ends up being
@@ -3733,14 +3734,14 @@ retry:
* fault to make calling code simpler.
*/
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, address);
+ idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
- page = alloc_huge_page(vma, address, 0);
+ page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
if (ret == -ENOMEM)
@@ -3789,12 +3790,12 @@ retry:
* the spinlock.
*/
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
- if (vma_needs_reservation(h, vma, address) < 0) {
+ if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
/* Just decrements count, does not deallocate */
- vma_end_reservation(h, vma, address);
+ vma_end_reservation(h, vma, haddr);
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -3808,17 +3809,17 @@ retry:
if (anon_rmap) {
ClearPagePrivate(page);
- hugepage_add_new_anon_rmap(page, vma, address);
+ hugepage_add_new_anon_rmap(page, vma, haddr);
} else
page_dup_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
- set_huge_pte_at(mm, address, ptep, new_pte);
+ set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
+ ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl);
}
spin_unlock(ptl);
@@ -3830,7 +3831,7 @@ backout:
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
- restore_reserve_on_error(h, vma, address, page);
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
goto out;
}
@@ -3883,10 +3884,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
+ unsigned long haddr = address & huge_page_mask(h);
- address &= huge_page_mask(h);
-
- ptep = huge_pte_offset(mm, address, huge_page_size(h));
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
@@ -3896,20 +3896,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
} else {
- ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
if (!ptep)
return VM_FAULT_OOM;
}
mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
+ idx = vma_hugecache_offset(h, vma, haddr);
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -3939,16 +3939,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* consumed.
*/
if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
- if (vma_needs_reservation(h, vma, address) < 0) {
+ if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
}
/* Just decrements count, does not deallocate */
- vma_end_reservation(h, vma, address);
+ vma_end_reservation(h, vma, haddr);
if (!(vma->vm_flags & VM_MAYSHARE))
pagecache_page = hugetlbfs_pagecache_page(h,
- vma, address);
+ vma, haddr);
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -3973,16 +3973,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, address, ptep,
+ ret = hugetlb_cow(mm, vma, haddr, ptep,
pagecache_page, ptl);
goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (huge_ptep_set_access_flags(vma, address, ptep, entry,
+ if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
flags & FAULT_FLAG_WRITE))
- update_mmu_cache(vma, address, ptep);
+ update_mmu_cache(vma, haddr, ptep);
out_put_page:
if (page != pagecache_page)
unlock_page(page);
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index eec1150125b9..68c2f2f3c05b 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
limit = round_down(PAGE_COUNTER_MAX,
1 << huge_page_order(&hstates[idx]));
- ret = page_counter_limit(counter, limit);
+ ret = page_counter_set_max(counter, limit);
VM_BUG_ON(ret);
}
}
@@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
case RES_USAGE:
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
- return (u64)counter->limit * PAGE_SIZE;
+ return (u64)counter->max * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
@@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_LIMIT:
mutex_lock(&hugetlb_limit_mutex);
- ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
+ ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages);
mutex_unlock(&hugetlb_limit_mutex);
break;
default:
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f94d5d15ebc0..f0179c9c04c2 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -22,6 +22,7 @@ struct mm_struct init_mm = {
.mm_count = ATOMIC_INIT(1),
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
INIT_MM_CONTEXT(init_mm)
diff --git a/mm/ksm.c b/mm/ksm.c
index 7d6558f3bac9..e2d2886fb1df 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -840,6 +840,17 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
return err;
}
+static inline struct stable_node *page_stable_node(struct page *page)
+{
+ return PageKsm(page) ? page_rmapping(page) : NULL;
+}
+
+static inline void set_page_stable_node(struct page *page,
+ struct stable_node *stable_node)
+{
+ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
+}
+
#ifdef CONFIG_SYSFS
/*
* Only called through the sysfs control interface:
diff --git a/mm/memblock.c b/mm/memblock.c
index 5108356ad8aa..93ad42bc8a73 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -68,7 +68,7 @@ ulong __init_memblock choose_memblock_flags(void)
/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
{
- return *size = min(*size, (phys_addr_t)ULLONG_MAX - base);
+ return *size = min(*size, PHYS_ADDR_MAX - base);
}
/*
@@ -697,6 +697,11 @@ static int __init_memblock memblock_remove_range(struct memblock_type *type,
int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("memblock_remove: [%pa-%pa] %pS\n",
+ &base, &end, (void *)_RET_IP_);
+
return memblock_remove_range(&memblock.memory, base, size);
}
@@ -925,7 +930,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
r = &type_b->regions[idx_b];
r_start = idx_b ? r[-1].base + r[-1].size : 0;
r_end = idx_b < type_b->cnt ?
- r->base : (phys_addr_t)ULLONG_MAX;
+ r->base : PHYS_ADDR_MAX;
/*
* if idx_b advanced past idx_a,
@@ -1041,7 +1046,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
r = &type_b->regions[idx_b];
r_start = idx_b ? r[-1].base + r[-1].size : 0;
r_end = idx_b < type_b->cnt ?
- r->base : (phys_addr_t)ULLONG_MAX;
+ r->base : PHYS_ADDR_MAX;
/*
* if idx_b advanced past idx_a,
* break out to advance idx_a
@@ -1516,13 +1521,13 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
{
- phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+ phys_addr_t max_addr = PHYS_ADDR_MAX;
struct memblock_region *r;
/*
* translate the memory @limit size into the max address within one of
* the memory memblock regions, if the @limit exceeds the total size
- * of those regions, max_addr will keep original value ULLONG_MAX
+ * of those regions, max_addr will keep original value PHYS_ADDR_MAX
*/
for_each_memblock(memory, r) {
if (limit <= r->size) {
@@ -1537,7 +1542,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
- phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+ phys_addr_t max_addr = PHYS_ADDR_MAX;
if (!limit)
return;
@@ -1545,14 +1550,14 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
max_addr = __find_max_addr(limit);
/* @limit exceeds the total size of the memory, do nothing */
- if (max_addr == (phys_addr_t)ULLONG_MAX)
+ if (max_addr == PHYS_ADDR_MAX)
return;
/* truncate both memory and reserved regions */
memblock_remove_range(&memblock.memory, max_addr,
- (phys_addr_t)ULLONG_MAX);
+ PHYS_ADDR_MAX);
memblock_remove_range(&memblock.reserved, max_addr,
- (phys_addr_t)ULLONG_MAX);
+ PHYS_ADDR_MAX);
}
void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
@@ -1580,7 +1585,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
/* truncate the reserved regions */
memblock_remove_range(&memblock.reserved, 0, base);
memblock_remove_range(&memblock.reserved,
- base + size, (phys_addr_t)ULLONG_MAX);
+ base + size, PHYS_ADDR_MAX);
}
void __init memblock_mem_limit_remove_map(phys_addr_t limit)
@@ -1593,7 +1598,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
max_addr = __find_max_addr(limit);
/* @limit exceeds the total size of the memory, do nothing */
- if (max_addr == (phys_addr_t)ULLONG_MAX)
+ if (max_addr == PHYS_ADDR_MAX)
return;
memblock_cap_memory_range(0, max_addr);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1695f38630f1..c1e64d60ed02 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1034,13 +1034,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
unsigned long limit;
count = page_counter_read(&memcg->memory);
- limit = READ_ONCE(memcg->memory.limit);
+ limit = READ_ONCE(memcg->memory.max);
if (count < limit)
margin = limit - count;
if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
- limit = READ_ONCE(memcg->memsw.limit);
+ limit = READ_ONCE(memcg->memsw.max);
if (count <= limit)
margin = min(margin, limit - count);
else
@@ -1148,13 +1148,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)memcg->memory.limit), memcg->memory.failcnt);
+ K((u64)memcg->memory.max), memcg->memory.failcnt);
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memsw)),
- K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->kmem)),
- K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
for_each_mem_cgroup_tree(iter, memcg) {
pr_info("Memory cgroup stats for ");
@@ -1179,21 +1179,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
-unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
- unsigned long limit;
+ unsigned long max;
- limit = memcg->memory.limit;
+ max = memcg->memory.max;
if (mem_cgroup_swappiness(memcg)) {
- unsigned long memsw_limit;
- unsigned long swap_limit;
+ unsigned long memsw_max;
+ unsigned long swap_max;
- memsw_limit = memcg->memsw.limit;
- swap_limit = memcg->swap.limit;
- swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
- limit = min(limit + swap_limit, memsw_limit);
+ memsw_max = memcg->memsw.max;
+ swap_max = memcg->swap.max;
+ swap_max = min(swap_max, (unsigned long)total_swap_pages);
+ max = min(max + swap_max, memsw_max);
}
- return limit;
+ return max;
}
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -2444,12 +2444,13 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
}
#endif
-static DEFINE_MUTEX(memcg_limit_mutex);
+static DEFINE_MUTEX(memcg_max_mutex);
-static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
- unsigned long limit, bool memsw)
+static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
+ unsigned long max, bool memsw)
{
bool enlarge = false;
+ bool drained = false;
int ret;
bool limits_invariant;
struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
@@ -2460,26 +2461,32 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
break;
}
- mutex_lock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
/*
* Make sure that the new limit (memsw or memory limit) doesn't
- * break our basic invariant rule memory.limit <= memsw.limit.
+ * break our basic invariant rule memory.max <= memsw.max.
*/
- limits_invariant = memsw ? limit >= memcg->memory.limit :
- limit <= memcg->memsw.limit;
+ limits_invariant = memsw ? max >= memcg->memory.max :
+ max <= memcg->memsw.max;
if (!limits_invariant) {
- mutex_unlock(&memcg_limit_mutex);
+ mutex_unlock(&memcg_max_mutex);
ret = -EINVAL;
break;
}
- if (limit > counter->limit)
+ if (max > counter->max)
enlarge = true;
- ret = page_counter_limit(counter, limit);
- mutex_unlock(&memcg_limit_mutex);
+ ret = page_counter_set_max(counter, max);
+ mutex_unlock(&memcg_max_mutex);
if (!ret)
break;
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
if (!try_to_free_mem_cgroup_pages(memcg, 1,
GFP_KERNEL, !memsw)) {
ret = -EBUSY;
@@ -2603,6 +2610,9 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
+
+ drain_all_stock(memcg);
+
/* try to free all pages in this cgroup */
while (nr_retries && page_counter_read(&memcg->memory)) {
int progress;
@@ -2757,7 +2767,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
- return (u64)counter->limit * PAGE_SIZE;
+ return (u64)counter->max * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
@@ -2871,24 +2881,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
}
#endif /* !CONFIG_SLOB */
-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static int memcg_update_kmem_max(struct mem_cgroup *memcg,
+ unsigned long max)
{
int ret;
- mutex_lock(&memcg_limit_mutex);
- ret = page_counter_limit(&memcg->kmem, limit);
- mutex_unlock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
+ ret = page_counter_set_max(&memcg->kmem, max);
+ mutex_unlock(&memcg_max_mutex);
return ret;
}
-static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
+static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
int ret;
- mutex_lock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
- ret = page_counter_limit(&memcg->tcpmem, limit);
+ ret = page_counter_set_max(&memcg->tcpmem, max);
if (ret)
goto out;
@@ -2913,7 +2923,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
memcg->tcpmem_active = true;
}
out:
- mutex_unlock(&memcg_limit_mutex);
+ mutex_unlock(&memcg_max_mutex);
return ret;
}
@@ -2941,16 +2951,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
switch (MEMFILE_TYPE(of_cft(of)->private)) {
case _MEM:
- ret = mem_cgroup_resize_limit(memcg, nr_pages, false);
+ ret = mem_cgroup_resize_max(memcg, nr_pages, false);
break;
case _MEMSWAP:
- ret = mem_cgroup_resize_limit(memcg, nr_pages, true);
+ ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
- ret = memcg_update_kmem_limit(memcg, nr_pages);
+ ret = memcg_update_kmem_max(memcg, nr_pages);
break;
case _TCP:
- ret = memcg_update_tcp_limit(memcg, nr_pages);
+ ret = memcg_update_tcp_max(memcg, nr_pages);
break;
}
break;
@@ -3083,7 +3093,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
#endif /* CONFIG_NUMA */
/* Universal VM events cgroup1 shows, original sort order */
-unsigned int memcg1_events[] = {
+static const unsigned int memcg1_events[] = {
PGPGIN,
PGPGOUT,
PGFAULT,
@@ -3126,8 +3136,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
- memory = min(memory, mi->memory.limit);
- memsw = min(memsw, mi->memsw.limit);
+ memory = min(memory, mi->memory.max);
+ memsw = min(memsw, mi->memsw.max);
}
seq_printf(m, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
@@ -3562,11 +3572,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
#ifdef CONFIG_CGROUP_WRITEBACK
-struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
-{
- return &memcg->cgwb_list;
-}
-
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -3626,7 +3631,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
- unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+ unsigned long ceiling = min(memcg->memory.max, memcg->high);
unsigned long used = page_counter_read(&memcg->memory);
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -4270,7 +4275,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
- memcg->low = 0;
+ page_counter_set_min(&memcg->memory, 0);
+ page_counter_set_low(&memcg->memory, 0);
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
@@ -4319,12 +4325,13 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
- memcg->low = 0;
+ page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+ page_counter_set_min(&memcg->memory, 0);
+ page_counter_set_low(&memcg->memory, 0);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
memcg_wb_domain_size_changed(memcg);
@@ -5061,10 +5068,40 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
+static int memory_min_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long min = READ_ONCE(memcg->memory.min);
+
+ if (min == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
+
+ return 0;
+}
+
+static ssize_t memory_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long min;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &min);
+ if (err)
+ return err;
+
+ page_counter_set_min(&memcg->memory, min);
+
+ return nbytes;
+}
+
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = READ_ONCE(memcg->low);
+ unsigned long low = READ_ONCE(memcg->memory.low);
if (low == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5086,7 +5123,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
if (err)
return err;
- memcg->low = low;
+ page_counter_set_low(&memcg->memory, low);
return nbytes;
}
@@ -5131,7 +5168,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
static int memory_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->memory.limit);
+ unsigned long max = READ_ONCE(memcg->memory.max);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5155,7 +5192,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;
- xchg(&memcg->memory.limit, max);
+ xchg(&memcg->memory.max, max);
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -5296,6 +5333,12 @@ static struct cftype memory_files[] = {
.read_u64 = memory_current_read,
},
{
+ .name = "min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_min_show,
+ .write = memory_min_write,
+ },
+ {
.name = "low",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_low_show,
@@ -5344,54 +5387,140 @@ struct cgroup_subsys memory_cgrp_subsys = {
};
/**
- * mem_cgroup_low - check if memory consumption is below the normal range
+ * mem_cgroup_protected - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
- * Returns %true if memory consumption of @memcg, and that of all
- * ancestors up to (but not including) @root, is below the normal range.
+ * WARNING: This function is not stateless! It can only be used as part
+ * of a top-down tree iteration, not for isolated queries.
+ *
+ * Returns one of the following:
+ * MEMCG_PROT_NONE: cgroup memory is not protected
+ * MEMCG_PROT_LOW: cgroup memory is protected as long there is
+ * an unprotected supply of reclaimable memory from other cgroups.
+ * MEMCG_PROT_MIN: cgroup memory is protected
*
- * @root is exclusive; it is never low when looked at directly and isn't
- * checked when traversing the hierarchy.
+ * @root is exclusive; it is never protected when looked at directly
*
- * Excluding @root enables using memory.low to prioritize memory usage
- * between cgroups within a subtree of the hierarchy that is limited by
- * memory.high or memory.max.
+ * To provide a proper hierarchical behavior, effective memory.min/low values
+ * are used. Below is the description of how effective memory.low is calculated.
+ * Effective memory.min values is calculated in the same way.
*
- * For example, given cgroup A with children B and C:
+ * Effective memory.low is always equal or less than the original memory.low.
+ * If there is no memory.low overcommittment (which is always true for
+ * top-level memory cgroups), these two values are equal.
+ * Otherwise, it's a part of parent's effective memory.low,
+ * calculated as a cgroup's memory.low usage divided by sum of sibling's
+ * memory.low usages, where memory.low usage is the size of actually
+ * protected memory.
*
- * A
- * / \
- * B C
+ * low_usage
+ * elow = min( memory.low, parent->elow * ------------------ ),
+ * siblings_low_usage
*
- * and
+ * | memory.current, if memory.current < memory.low
+ * low_usage = |
+ | 0, otherwise.
*
- * 1. A/memory.current > A/memory.high
- * 2. A/B/memory.current < A/B/memory.low
- * 3. A/C/memory.current >= A/C/memory.low
*
- * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
- * should reclaim from 'C' until 'A' is no longer high or until we can
- * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
- * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered
- * low and we will reclaim indiscriminately from both 'B' and 'C'.
+ * Such definition of the effective memory.low provides the expected
+ * hierarchical behavior: parent's memory.low value is limiting
+ * children, unprotected memory is reclaimed first and cgroups,
+ * which are not using their guarantee do not affect actual memory
+ * distribution.
+ *
+ * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
+ *
+ * A A/memory.low = 2G, A/memory.current = 6G
+ * //\\
+ * BC DE B/memory.low = 3G B/memory.current = 2G
+ * C/memory.low = 1G C/memory.current = 2G
+ * D/memory.low = 0 D/memory.current = 2G
+ * E/memory.low = 10G E/memory.current = 0
+ *
+ * and the memory pressure is applied, the following memory distribution
+ * is expected (approximately):
+ *
+ * A/memory.current = 2G
+ *
+ * B/memory.current = 1.3G
+ * C/memory.current = 0.6G
+ * D/memory.current = 0
+ * E/memory.current = 0
+ *
+ * These calculations require constant tracking of the actual low usages
+ * (see propagate_protected_usage()), as well as recursive calculation of
+ * effective memory.low values. But as we do call mem_cgroup_protected()
+ * path for each memory cgroup top-down from the reclaim,
+ * it's possible to optimize this part, and save calculated elow
+ * for next usage. This part is intentionally racy, but it's ok,
+ * as memory.low is a best-effort mechanism.
*/
-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
+enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
{
+ struct mem_cgroup *parent;
+ unsigned long emin, parent_emin;
+ unsigned long elow, parent_elow;
+ unsigned long usage;
+
if (mem_cgroup_disabled())
- return false;
+ return MEMCG_PROT_NONE;
if (!root)
root = root_mem_cgroup;
if (memcg == root)
- return false;
+ return MEMCG_PROT_NONE;
+
+ usage = page_counter_read(&memcg->memory);
+ if (!usage)
+ return MEMCG_PROT_NONE;
+
+ emin = memcg->memory.min;
+ elow = memcg->memory.low;
+
+ parent = parent_mem_cgroup(memcg);
+ if (parent == root)
+ goto exit;
- for (; memcg != root; memcg = parent_mem_cgroup(memcg)) {
- if (page_counter_read(&memcg->memory) >= memcg->low)
- return false;
+ parent_emin = READ_ONCE(parent->memory.emin);
+ emin = min(emin, parent_emin);
+ if (emin && parent_emin) {
+ unsigned long min_usage, siblings_min_usage;
+
+ min_usage = min(usage, memcg->memory.min);
+ siblings_min_usage = atomic_long_read(
+ &parent->memory.children_min_usage);
+
+ if (min_usage && siblings_min_usage)
+ emin = min(emin, parent_emin * min_usage /
+ siblings_min_usage);
}
- return true;
+ parent_elow = READ_ONCE(parent->memory.elow);
+ elow = min(elow, parent_elow);
+ if (elow && parent_elow) {
+ unsigned long low_usage, siblings_low_usage;
+
+ low_usage = min(usage, memcg->memory.low);
+ siblings_low_usage = atomic_long_read(
+ &parent->memory.children_low_usage);
+
+ if (low_usage && siblings_low_usage)
+ elow = min(elow, parent_elow * low_usage /
+ siblings_low_usage);
+ }
+
+exit:
+ memcg->memory.emin = emin;
+ memcg->memory.elow = elow;
+
+ if (usage <= emin)
+ return MEMCG_PROT_MIN;
+ else if (usage <= elow)
+ return MEMCG_PROT_LOW;
+ else
+ return MEMCG_PROT_NONE;
}
/**
@@ -6012,10 +6141,17 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (!memcg)
return 0;
+ if (!entry.val) {
+ memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+ return 0;
+ }
+
memcg = mem_cgroup_id_get_online(memcg);
if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
+ memcg_memory_event(memcg, MEMCG_SWAP_MAX);
+ memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
mem_cgroup_id_put(memcg);
return -ENOMEM;
}
@@ -6067,7 +6203,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
return nr_swap_pages;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
- READ_ONCE(memcg->swap.limit) -
+ READ_ONCE(memcg->swap.max) -
page_counter_read(&memcg->swap));
return nr_swap_pages;
}
@@ -6088,7 +6224,7 @@ bool mem_cgroup_swap_full(struct page *page)
return false;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
+ if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
return true;
return false;
@@ -6122,7 +6258,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
static int swap_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->swap.limit);
+ unsigned long max = READ_ONCE(memcg->swap.max);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -6144,15 +6280,23 @@ static ssize_t swap_max_write(struct kernfs_open_file *of,
if (err)
return err;
- mutex_lock(&memcg_limit_mutex);
- err = page_counter_limit(&memcg->swap, max);
- mutex_unlock(&memcg_limit_mutex);
- if (err)
- return err;
+ xchg(&memcg->swap.max, max);
return nbytes;
}
+static int swap_events_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ seq_printf(m, "max %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
+ seq_printf(m, "fail %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
+
+ return 0;
+}
+
static struct cftype swap_files[] = {
{
.name = "swap.current",
@@ -6165,6 +6309,12 @@ static struct cftype swap_files[] = {
.seq_show = swap_max_show,
.write = swap_max_write,
},
+ {
+ .name = "swap.events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, swap_events_file),
+ .seq_show = swap_events_show,
+ },
{ } /* terminate */
};
diff --git a/mm/memfd.c b/mm/memfd.c
new file mode 100644
index 000000000000..27069518e3c5
--- /dev/null
+++ b/mm/memfd.c
@@ -0,0 +1,345 @@
+/*
+ * memfd_create system call and file sealing support
+ *
+ * Code was originally included in shmem.c, and broken out to facilitate
+ * use by hugetlbfs as well as tmpfs.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/khugepaged.h>
+#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
+#include <linux/memfd.h>
+#include <uapi/linux/memfd.h>
+
+/*
+ * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * so reuse a tag which we firmly believe is never set or cleared on tmpfs
+ * or hugetlbfs because they are memory only filesystems.
+ */
+#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
+#define LAST_SCAN 4 /* about 150ms max */
+
+static void memfd_tag_pins(struct address_space *mapping)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ pgoff_t start;
+ struct page *page;
+
+ lru_add_drain();
+ start = 0;
+ rcu_read_lock();
+
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
+ page = radix_tree_deref_slot(slot);
+ if (!page || radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ } else if (page_count(page) - page_mapcount(page) > 1) {
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_set(&mapping->i_pages, iter.index,
+ MEMFD_TAG_PINNED);
+ xa_unlock_irq(&mapping->i_pages);
+ }
+
+ if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
+ * via get_user_pages(), drivers might have some pending I/O without any active
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * and see whether it has an elevated ref-count. If so, we tag them and wait for
+ * them to be dropped.
+ * The caller must guarantee that no new user will acquire writable references
+ * to those pages to avoid races.
+ */
+static int memfd_wait_for_pins(struct address_space *mapping)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ pgoff_t start;
+ struct page *page;
+ int error, scan;
+
+ memfd_tag_pins(mapping);
+
+ error = 0;
+ for (scan = 0; scan <= LAST_SCAN; scan++) {
+ if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED))
+ break;
+
+ if (!scan)
+ lru_add_drain_all();
+ else if (schedule_timeout_killable((HZ << scan) / 200))
+ scan = LAST_SCAN;
+
+ start = 0;
+ rcu_read_lock();
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
+ start, MEMFD_TAG_PINNED) {
+
+ page = radix_tree_deref_slot(slot);
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ page = NULL;
+ }
+
+ if (page &&
+ page_count(page) - page_mapcount(page) != 1) {
+ if (scan < LAST_SCAN)
+ goto continue_resched;
+
+ /*
+ * On the last scan, we clean up all those tags
+ * we inserted; but make a note that we still
+ * found pages pinned.
+ */
+ error = -EBUSY;
+ }
+
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_clear(&mapping->i_pages,
+ iter.index, MEMFD_TAG_PINNED);
+ xa_unlock_irq(&mapping->i_pages);
+continue_resched:
+ if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ return error;
+}
+
+static unsigned int *memfd_file_seals_ptr(struct file *file)
+{
+ if (shmem_file(file))
+ return &SHMEM_I(file_inode(file))->seals;
+
+#ifdef CONFIG_HUGETLBFS
+ if (is_file_hugepages(file))
+ return &HUGETLBFS_I(file_inode(file))->seals;
+#endif
+
+ return NULL;
+}
+
+#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_SHRINK | \
+ F_SEAL_GROW | \
+ F_SEAL_WRITE)
+
+static int memfd_add_seals(struct file *file, unsigned int seals)
+{
+ struct inode *inode = file_inode(file);
+ unsigned int *file_seals;
+ int error;
+
+ /*
+ * SEALING
+ * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
+ * but restrict access to a specific subset of file operations. Seals
+ * can only be added, but never removed. This way, mutually untrusted
+ * parties can share common memory regions with a well-defined policy.
+ * A malicious peer can thus never perform unwanted operations on a
+ * shared object.
+ *
+ * Seals are only supported on special tmpfs or hugetlbfs files and
+ * always affect the whole underlying inode. Once a seal is set, it
+ * may prevent some kinds of access to the file. Currently, the
+ * following seals are defined:
+ * SEAL_SEAL: Prevent further seals from being set on this file
+ * SEAL_SHRINK: Prevent the file from shrinking
+ * SEAL_GROW: Prevent the file from growing
+ * SEAL_WRITE: Prevent write access to the file
+ *
+ * As we don't require any trust relationship between two parties, we
+ * must prevent seals from being removed. Therefore, sealing a file
+ * only adds a given set of seals to the file, it never touches
+ * existing seals. Furthermore, the "setting seals"-operation can be
+ * sealed itself, which basically prevents any further seal from being
+ * added.
+ *
+ * Semantics of sealing are only defined on volatile files. Only
+ * anonymous tmpfs and hugetlbfs files support sealing. More
+ * importantly, seals are never written to disk. Therefore, there's
+ * no plan to support it on other file types.
+ */
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EPERM;
+ if (seals & ~(unsigned int)F_ALL_SEALS)
+ return -EINVAL;
+
+ inode_lock(inode);
+
+ file_seals = memfd_file_seals_ptr(file);
+ if (!file_seals) {
+ error = -EINVAL;
+ goto unlock;
+ }
+
+ if (*file_seals & F_SEAL_SEAL) {
+ error = -EPERM;
+ goto unlock;
+ }
+
+ if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
+ error = mapping_deny_writable(file->f_mapping);
+ if (error)
+ goto unlock;
+
+ error = memfd_wait_for_pins(file->f_mapping);
+ if (error) {
+ mapping_allow_writable(file->f_mapping);
+ goto unlock;
+ }
+ }
+
+ *file_seals |= seals;
+ error = 0;
+
+unlock:
+ inode_unlock(inode);
+ return error;
+}
+
+static int memfd_get_seals(struct file *file)
+{
+ unsigned int *seals = memfd_file_seals_ptr(file);
+
+ return seals ? *seals : -EINVAL;
+}
+
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ long error;
+
+ switch (cmd) {
+ case F_ADD_SEALS:
+ /* disallow upper 32bit */
+ if (arg > UINT_MAX)
+ return -EINVAL;
+
+ error = memfd_add_seals(file, arg);
+ break;
+ case F_GET_SEALS:
+ error = memfd_get_seals(file);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+#define MFD_NAME_PREFIX "memfd:"
+#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
+#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
+
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ unsigned int *file_seals;
+ struct file *file;
+ int fd, error;
+ char *name;
+ long len;
+
+ if (!(flags & MFD_HUGETLB)) {
+ if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ return -EINVAL;
+ } else {
+ /* Allow huge page size encoding in flags. */
+ if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+ (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
+ return -EINVAL;
+ }
+
+ /* length includes terminating zero */
+ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
+ if (len <= 0)
+ return -EFAULT;
+ if (len > MFD_NAME_MAX_LEN + 1)
+ return -EINVAL;
+
+ name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ strcpy(name, MFD_NAME_PREFIX);
+ if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ /* terminating-zero may have changed after strnlen_user() returned */
+ if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+ if (fd < 0) {
+ error = fd;
+ goto err_name;
+ }
+
+ if (flags & MFD_HUGETLB) {
+ struct user_struct *user = NULL;
+
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ HUGETLB_ANONHUGE_INODE,
+ (flags >> MFD_HUGE_SHIFT) &
+ MFD_HUGE_MASK);
+ } else
+ file = shmem_file_setup(name, 0, VM_NORESERVE);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_fd;
+ }
+ file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+ file->f_flags |= O_RDWR | O_LARGEFILE;
+
+ if (flags & MFD_ALLOW_SEALING) {
+ file_seals = memfd_file_seals_ptr(file);
+ *file_seals &= ~F_SEAL_SEAL;
+ }
+
+ fd_install(fd, file);
+ kfree(name);
+ return fd;
+
+err_fd:
+ put_unused_fd(fd);
+err_name:
+ kfree(name);
+ return error;
+}
diff --git a/mm/memory.c b/mm/memory.c
index 5d8c2afb0730..7206a634270b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -817,17 +817,12 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
* PFNMAP mappings in order to support COWable mappings.
*
*/
-#ifdef __HAVE_ARCH_PTE_SPECIAL
-# define HAVE_PTE_SPECIAL 1
-#else
-# define HAVE_PTE_SPECIAL 0
-#endif
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, bool with_public_device)
{
unsigned long pfn = pte_pfn(pte);
- if (HAVE_PTE_SPECIAL) {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
@@ -862,7 +857,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
}
- /* !HAVE_PTE_SPECIAL case follows: */
+ /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
@@ -881,6 +876,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (is_zero_pfn(pfn))
return NULL;
+
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
@@ -904,7 +900,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
/*
* There is no pmd_special() but there may be special pmds, e.g.
* in a direct-access (dax) mapping, so let's just replicate the
- * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+ * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
*/
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
@@ -1932,7 +1928,8 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
+ !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
struct page *page;
/*
@@ -1954,12 +1951,25 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_mixed);
-int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+/*
+ * If the insertion of PTE failed because someone else already added a
+ * different entry in the mean time, we treat that as success as we assume
+ * the same entry was actually inserted.
+ */
+
+vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn)
{
- return __vm_insert_mixed(vma, addr, pfn, true);
+ int err;
+
+ err = __vm_insert_mixed(vma, addr, pfn, true);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+ return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
/*
* maps a range of physical memory into the requested pages. the old
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 25982467800b..7deb49f69e27 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1237,6 +1237,29 @@ static struct page *next_active_pageblock(struct page *page)
return page + pageblock_nr_pages;
}
+static bool is_pageblock_removable_nolock(struct page *page)
+{
+ struct zone *zone;
+ unsigned long pfn;
+
+ /*
+ * We have to be careful here because we are iterating over memory
+ * sections which are not zone aware so we might end up outside of
+ * the zone but still within the section.
+ * We have to take care about the node as well. If the node is offline
+ * its NODE_DATA will be NULL - see page_zone.
+ */
+ if (!node_online(page_to_nid(page)))
+ return false;
+
+ zone = page_zone(page);
+ pfn = page_to_pfn(page);
+ if (!zone_spans_pfn(zone, pfn))
+ return false;
+
+ return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
+}
+
/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
diff --git a/mm/mmap.c b/mm/mmap.c
index d817764a9974..d1eb87ef4b1a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3277,7 +3277,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
mm->data_vm += npages;
}
-static int special_mapping_fault(struct vm_fault *vmf);
+static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
/*
* Having a close hook prevents vma merging regardless of flags.
@@ -3316,7 +3316,7 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = {
.fault = special_mapping_fault,
};
-static int special_mapping_fault(struct vm_fault *vmf)
+static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
pgoff_t pgoff;
diff --git a/mm/nommu.c b/mm/nommu.c
index 13723736d38f..4452d8bd9ae4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1763,7 +1763,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
return -ENOMEM;
}
-int filemap_fault(struct vm_fault *vmf)
+vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8ba6cb88cf58..6694348b27e9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
int nid;
if (is_memcg_oom(oc)) {
- oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
+ oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
return CONSTRAINT_MEMCG;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22320ea27489..07b3c23762ad 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -705,16 +705,14 @@ static inline void rmv_page_order(struct page *page)
/*
* This function checks whether a page is free && is the buddy
- * we can do coalesce a page and its buddy if
+ * we can coalesce a page and its buddy if
* (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we set ->_mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE.
- * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
- * serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set PageBuddy.
+ * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
@@ -759,9 +757,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with _mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
- * field.
+ * free pages of length of (1 << order) and marked with PageBuddy.
+ * Page's order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
@@ -946,7 +943,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
}
switch (page - head_page) {
case 1:
- /* the first tail page: ->mapping is compound_mapcount() */
+ /* the first tail page: ->mapping may be compound_mapcount() */
if (unlikely(compound_mapcount(page))) {
bad_page(page, "nonzero compound_mapcount", 0);
goto out;
@@ -955,7 +952,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
case 2:
/*
* the second tail page: ->mapping is
- * page_deferred_list().next -- ignore value.
+ * deferred_list.next -- ignore value.
*/
break;
default:
@@ -3701,7 +3698,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
#endif /* CONFIG_COMPACTION */
#ifdef CONFIG_LOCKDEP
-struct lockdep_map __fs_reclaim_map =
+static struct lockdep_map __fs_reclaim_map =
STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
static bool __need_fs_reclaim(gfp_t gfp_mask)
@@ -3726,17 +3723,27 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
return true;
}
+void __fs_reclaim_acquire(void)
+{
+ lock_map_acquire(&__fs_reclaim_map);
+}
+
+void __fs_reclaim_release(void)
+{
+ lock_map_release(&__fs_reclaim_map);
+}
+
void fs_reclaim_acquire(gfp_t gfp_mask)
{
if (__need_fs_reclaim(gfp_mask))
- lock_map_acquire(&__fs_reclaim_map);
+ __fs_reclaim_acquire();
}
EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
void fs_reclaim_release(gfp_t gfp_mask)
{
if (__need_fs_reclaim(gfp_mask))
- lock_map_release(&__fs_reclaim_map);
+ __fs_reclaim_release();
}
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
@@ -3754,8 +3761,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- noreclaim_flag = memalloc_noreclaim_save();
fs_reclaim_acquire(gfp_mask);
+ noreclaim_flag = memalloc_noreclaim_save();
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
@@ -3763,8 +3770,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
ac->nodemask);
current->reclaim_state = NULL;
- fs_reclaim_release(gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
+ fs_reclaim_release(gfp_mask);
cond_resched();
@@ -4162,7 +4169,6 @@ retry:
* orientated.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
- ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
@@ -4326,8 +4332,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/* Determine whether to spread dirty pages and what the first usable zone */
-static inline void finalise_ac(gfp_t gfp_mask,
- unsigned int order, struct alloc_context *ac)
+static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
{
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -4358,7 +4363,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
- finalise_ac(gfp_mask, order, &ac);
+ finalise_ac(gfp_mask, &ac);
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
@@ -6229,18 +6234,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize, freesize, memmap_pages;
+ unsigned long size, freesize, memmap_pages;
unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
- realsize = freesize = zone->present_pages;
+ freesize = zone->present_pages;
/*
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
- memmap_pages = calc_memmap_size(size, realsize);
+ memmap_pages = calc_memmap_size(size, freesize);
if (!is_highmem_idx(j)) {
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
@@ -6272,7 +6277,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
- zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
+ zone->managed_pages = freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
#endif
@@ -7682,29 +7687,6 @@ unmovable:
return true;
}
-bool is_pageblock_removable_nolock(struct page *page)
-{
- struct zone *zone;
- unsigned long pfn;
-
- /*
- * We have to be careful here because we are iterating over memory
- * sections which are not zone aware so we might end up outside of
- * the zone but still within the section.
- * We have to take care about the node as well. If the node is offline
- * its NODE_DATA will be NULL - see page_zone.
- */
- if (!node_online(page_to_nid(page)))
- return false;
-
- zone = page_zone(page);
- pfn = page_to_pfn(page);
- if (!zone_spans_pfn(zone, pfn))
- return false;
-
- return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
-}
-
#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static unsigned long pfn_max_align_down(unsigned long pfn)
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 2a8df3ad60a4..de31470655f6 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,6 +13,40 @@
#include <linux/bug.h>
#include <asm/page.h>
+static void propagate_protected_usage(struct page_counter *c,
+ unsigned long usage)
+{
+ unsigned long protected, old_protected;
+ long delta;
+
+ if (!c->parent)
+ return;
+
+ if (c->min || atomic_long_read(&c->min_usage)) {
+ if (usage <= c->min)
+ protected = usage;
+ else
+ protected = 0;
+
+ old_protected = atomic_long_xchg(&c->min_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_min_usage);
+ }
+
+ if (c->low || atomic_long_read(&c->low_usage)) {
+ if (usage <= c->low)
+ protected = usage;
+ else
+ protected = 0;
+
+ old_protected = atomic_long_xchg(&c->low_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_low_usage);
+ }
+}
+
/**
* page_counter_cancel - take pages out of the local counter
* @counter: counter
@@ -22,7 +56,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
long new;
- new = atomic_long_sub_return(nr_pages, &counter->count);
+ new = atomic_long_sub_return(nr_pages, &counter->usage);
+ propagate_protected_usage(counter, new);
/* More uncharges than charges? */
WARN_ON_ONCE(new < 0);
}
@@ -41,7 +76,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
for (c = counter; c; c = c->parent) {
long new;
- new = atomic_long_add_return(nr_pages, &c->count);
+ new = atomic_long_add_return(nr_pages, &c->usage);
+ propagate_protected_usage(counter, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -82,9 +118,10 @@ bool page_counter_try_charge(struct page_counter *counter,
* we either see the new limit or the setter sees the
* counter has changed and retries.
*/
- new = atomic_long_add_return(nr_pages, &c->count);
- if (new > c->limit) {
- atomic_long_sub(nr_pages, &c->count);
+ new = atomic_long_add_return(nr_pages, &c->usage);
+ if (new > c->max) {
+ atomic_long_sub(nr_pages, &c->usage);
+ propagate_protected_usage(counter, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -93,6 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
+ propagate_protected_usage(counter, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
@@ -123,20 +161,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
}
/**
- * page_counter_limit - limit the number of pages allowed
+ * page_counter_set_max - set the maximum number of pages allowed
* @counter: counter
- * @limit: limit to set
+ * @nr_pages: limit to set
*
* Returns 0 on success, -EBUSY if the current number of pages on the
* counter already exceeds the specified limit.
*
* The caller must serialize invocations on the same counter.
*/
-int page_counter_limit(struct page_counter *counter, unsigned long limit)
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
for (;;) {
unsigned long old;
- long count;
+ long usage;
/*
* Update the limit while making sure that it's not
@@ -149,22 +187,56 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit)
* the limit, so if it sees the old limit, we see the
* modified counter and retry.
*/
- count = atomic_long_read(&counter->count);
+ usage = atomic_long_read(&counter->usage);
- if (count > limit)
+ if (usage > nr_pages)
return -EBUSY;
- old = xchg(&counter->limit, limit);
+ old = xchg(&counter->max, nr_pages);
- if (atomic_long_read(&counter->count) <= count)
+ if (atomic_long_read(&counter->usage) <= usage)
return 0;
- counter->limit = old;
+ counter->max = old;
cond_resched();
}
}
/**
+ * page_counter_set_min - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ counter->min = nr_pages;
+
+ for (c = counter; c; c = c->parent)
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
+ * page_counter_set_low - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ counter->low = nr_pages;
+
+ for (c = counter; c; c = c->parent)
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
* page_counter_memparse - memparse() for page counter limits
* @buf: string to parse
* @max: string meaning maximum possible value
diff --git a/mm/shmem.c b/mm/shmem.c
index 9d6c7e595415..e9a7ac74823d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -327,7 +327,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
struct radix_tree_node *node;
- void **pslot;
+ void __rcu **pslot;
void *item;
VM_BUG_ON(!expected);
@@ -395,7 +395,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
/* ifdef here to avoid bloating shmem.o when not necessary */
-int shmem_huge __read_mostly;
+static int shmem_huge __read_mostly;
#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static int shmem_parse_huge(const char *str)
@@ -571,6 +571,15 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
}
#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
+{
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
+ shmem_huge != SHMEM_HUGE_DENY)
+ return true;
+ return false;
+}
+
/*
* Like add_to_page_cache_locked, but error if expected item has gone.
*/
@@ -682,7 +691,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
struct page *page;
unsigned long swapped = 0;
@@ -988,6 +997,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat,
{
struct inode *inode = path->dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
spin_lock_irq(&info->lock);
@@ -995,6 +1005,10 @@ static int shmem_getattr(const struct path *path, struct kstat *stat,
spin_unlock_irq(&info->lock);
}
generic_fillattr(inode, stat);
+
+ if (is_huge_enabled(sb_info))
+ stat->blksize = HPAGE_PMD_SIZE;
+
return 0;
}
@@ -1098,13 +1112,19 @@ static void shmem_evict_inode(struct inode *inode)
static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
{
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
unsigned long found = -1;
unsigned int checked = 0;
rcu_read_lock();
radix_tree_for_each_slot(slot, root, &iter, 0) {
- if (*slot == item) {
+ void *entry = radix_tree_deref_slot(slot);
+
+ if (radix_tree_deref_retry(entry)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ if (entry == item) {
found = iter.index;
break;
}
@@ -1322,9 +1342,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (!swap.val)
goto redirty;
- if (mem_cgroup_try_charge_swap(page, swap))
- goto free_swap;
-
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the page is
@@ -1353,7 +1370,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
}
mutex_unlock(&shmem_swaplist_mutex);
-free_swap:
put_swap_page(page, swap);
redirty:
set_page_dirty(page);
@@ -1404,10 +1420,9 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
struct shmem_inode_info *info, pgoff_t index)
{
/* Create a pseudo vma that just contains the policy */
- vma->vm_start = 0;
+ memset(vma, 0, sizeof(*vma));
/* Bias interleave by inode number to distribute better across nodes */
vma->vm_pgoff = index + info->vfs_inode.i_ino;
- vma->vm_ops = NULL;
vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
}
@@ -1931,14 +1946,14 @@ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, in
return ret;
}
-static int shmem_fault(struct vm_fault *vmf)
+static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
enum sgp_type sgp;
- int error;
- int ret = VM_FAULT_LOCKED;
+ int err;
+ vm_fault_t ret = VM_FAULT_LOCKED;
/*
* Trinity finds that probing a hole which tmpfs is punching can
@@ -2006,10 +2021,10 @@ static int shmem_fault(struct vm_fault *vmf)
else if (vma->vm_flags & VM_HUGEPAGE)
sgp = SGP_HUGE;
- error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
+ err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
gfp, vma, vmf, &ret);
- if (error)
- return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
+ if (err)
+ return vmf_error(err);
return ret;
}
@@ -2616,241 +2631,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}
-/*
- * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
- * so reuse a tag which we firmly believe is never set or cleared on shmem.
- */
-#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE
-#define LAST_SCAN 4 /* about 150ms max */
-
-static void shmem_tag_pins(struct address_space *mapping)
-{
- struct radix_tree_iter iter;
- void **slot;
- pgoff_t start;
- struct page *page;
-
- lru_add_drain();
- start = 0;
- rcu_read_lock();
-
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- page = radix_tree_deref_slot(slot);
- if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- } else if (page_count(page) - page_mapcount(page) > 1) {
- xa_lock_irq(&mapping->i_pages);
- radix_tree_tag_set(&mapping->i_pages, iter.index,
- SHMEM_TAG_PINNED);
- xa_unlock_irq(&mapping->i_pages);
- }
-
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
- }
- rcu_read_unlock();
-}
-
-/*
- * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
- * via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
- * and see whether it has an elevated ref-count. If so, we tag them and wait for
- * them to be dropped.
- * The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
- */
-static int shmem_wait_for_pins(struct address_space *mapping)
-{
- struct radix_tree_iter iter;
- void **slot;
- pgoff_t start;
- struct page *page;
- int error, scan;
-
- shmem_tag_pins(mapping);
-
- error = 0;
- for (scan = 0; scan <= LAST_SCAN; scan++) {
- if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED))
- break;
-
- if (!scan)
- lru_add_drain_all();
- else if (schedule_timeout_killable((HZ << scan) / 200))
- scan = LAST_SCAN;
-
- start = 0;
- rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
- start, SHMEM_TAG_PINNED) {
-
- page = radix_tree_deref_slot(slot);
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
- page = NULL;
- }
-
- if (page &&
- page_count(page) - page_mapcount(page) != 1) {
- if (scan < LAST_SCAN)
- goto continue_resched;
-
- /*
- * On the last scan, we clean up all those tags
- * we inserted; but make a note that we still
- * found pages pinned.
- */
- error = -EBUSY;
- }
-
- xa_lock_irq(&mapping->i_pages);
- radix_tree_tag_clear(&mapping->i_pages,
- iter.index, SHMEM_TAG_PINNED);
- xa_unlock_irq(&mapping->i_pages);
-continue_resched:
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
- }
- rcu_read_unlock();
- }
-
- return error;
-}
-
-static unsigned int *memfd_file_seals_ptr(struct file *file)
-{
- if (file->f_op == &shmem_file_operations)
- return &SHMEM_I(file_inode(file))->seals;
-
-#ifdef CONFIG_HUGETLBFS
- if (file->f_op == &hugetlbfs_file_operations)
- return &HUGETLBFS_I(file_inode(file))->seals;
-#endif
-
- return NULL;
-}
-
-#define F_ALL_SEALS (F_SEAL_SEAL | \
- F_SEAL_SHRINK | \
- F_SEAL_GROW | \
- F_SEAL_WRITE)
-
-static int memfd_add_seals(struct file *file, unsigned int seals)
-{
- struct inode *inode = file_inode(file);
- unsigned int *file_seals;
- int error;
-
- /*
- * SEALING
- * Sealing allows multiple parties to share a shmem-file but restrict
- * access to a specific subset of file operations. Seals can only be
- * added, but never removed. This way, mutually untrusted parties can
- * share common memory regions with a well-defined policy. A malicious
- * peer can thus never perform unwanted operations on a shared object.
- *
- * Seals are only supported on special shmem-files and always affect
- * the whole underlying inode. Once a seal is set, it may prevent some
- * kinds of access to the file. Currently, the following seals are
- * defined:
- * SEAL_SEAL: Prevent further seals from being set on this file
- * SEAL_SHRINK: Prevent the file from shrinking
- * SEAL_GROW: Prevent the file from growing
- * SEAL_WRITE: Prevent write access to the file
- *
- * As we don't require any trust relationship between two parties, we
- * must prevent seals from being removed. Therefore, sealing a file
- * only adds a given set of seals to the file, it never touches
- * existing seals. Furthermore, the "setting seals"-operation can be
- * sealed itself, which basically prevents any further seal from being
- * added.
- *
- * Semantics of sealing are only defined on volatile files. Only
- * anonymous shmem files support sealing. More importantly, seals are
- * never written to disk. Therefore, there's no plan to support it on
- * other file types.
- */
-
- if (!(file->f_mode & FMODE_WRITE))
- return -EPERM;
- if (seals & ~(unsigned int)F_ALL_SEALS)
- return -EINVAL;
-
- inode_lock(inode);
-
- file_seals = memfd_file_seals_ptr(file);
- if (!file_seals) {
- error = -EINVAL;
- goto unlock;
- }
-
- if (*file_seals & F_SEAL_SEAL) {
- error = -EPERM;
- goto unlock;
- }
-
- if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
- error = mapping_deny_writable(file->f_mapping);
- if (error)
- goto unlock;
-
- error = shmem_wait_for_pins(file->f_mapping);
- if (error) {
- mapping_allow_writable(file->f_mapping);
- goto unlock;
- }
- }
-
- *file_seals |= seals;
- error = 0;
-
-unlock:
- inode_unlock(inode);
- return error;
-}
-
-static int memfd_get_seals(struct file *file)
-{
- unsigned int *seals = memfd_file_seals_ptr(file);
-
- return seals ? *seals : -EINVAL;
-}
-
-long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- long error;
-
- switch (cmd) {
- case F_ADD_SEALS:
- /* disallow upper 32bit */
- if (arg > UINT_MAX)
- return -EINVAL;
-
- error = memfd_add_seals(file, arg);
- break;
- case F_GET_SEALS:
- error = memfd_get_seals(file);
- break;
- default:
- error = -EINVAL;
- break;
- }
-
- return error;
-}
-
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -3428,6 +3208,15 @@ static int shmem_match(struct inode *ino, void *vfh)
return ino->i_ino == inum && fh[0] == ino->i_generation;
}
+/* Find any alias of inode, but prefer a hashed alias */
+static struct dentry *shmem_find_alias(struct inode *inode)
+{
+ struct dentry *alias = d_find_alias(inode);
+
+ return alias ?: d_find_any_alias(inode);
+}
+
+
static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
@@ -3444,7 +3233,7 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
shmem_match, fid->raw);
if (inode) {
- dentry = d_find_alias(inode);
+ dentry = shmem_find_alias(inode);
iput(inode);
}
@@ -3673,93 +3462,6 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-#define MFD_NAME_PREFIX "memfd:"
-#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
-#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
-
-SYSCALL_DEFINE2(memfd_create,
- const char __user *, uname,
- unsigned int, flags)
-{
- unsigned int *file_seals;
- struct file *file;
- int fd, error;
- char *name;
- long len;
-
- if (!(flags & MFD_HUGETLB)) {
- if (flags & ~(unsigned int)MFD_ALL_FLAGS)
- return -EINVAL;
- } else {
- /* Allow huge page size encoding in flags. */
- if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
- (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
- return -EINVAL;
- }
-
- /* length includes terminating zero */
- len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
- if (len <= 0)
- return -EFAULT;
- if (len > MFD_NAME_MAX_LEN + 1)
- return -EINVAL;
-
- name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
- if (!name)
- return -ENOMEM;
-
- strcpy(name, MFD_NAME_PREFIX);
- if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
- error = -EFAULT;
- goto err_name;
- }
-
- /* terminating-zero may have changed after strnlen_user() returned */
- if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
- error = -EFAULT;
- goto err_name;
- }
-
- fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
- if (fd < 0) {
- error = fd;
- goto err_name;
- }
-
- if (flags & MFD_HUGETLB) {
- struct user_struct *user = NULL;
-
- file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
- HUGETLB_ANONHUGE_INODE,
- (flags >> MFD_HUGE_SHIFT) &
- MFD_HUGE_MASK);
- } else
- file = shmem_file_setup(name, 0, VM_NORESERVE);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_fd;
- }
- file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
- file->f_flags |= O_RDWR | O_LARGEFILE;
-
- if (flags & MFD_ALLOW_SEALING) {
- file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
- }
-
- fd_install(fd, file);
- kfree(name);
- return fd;
-
-err_fd:
- put_unused_fd(fd);
-err_name:
- kfree(name);
- return error;
-}
-
#endif /* CONFIG_TMPFS */
static void shmem_put_super(struct super_block *sb)
diff --git a/mm/slab.c b/mm/slab.c
index 2f308253c3d7..36688f6c87eb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1235,8 +1235,6 @@ void __init kmem_cache_init(void)
{
int i;
- BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
- sizeof(struct rcu_head));
kmem_cache = &kmem_cache_boot;
if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
@@ -2665,6 +2663,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
invalid_mask, &invalid_mask, flags, &flags);
dump_stack();
}
+ WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
check_irq_off();
@@ -3071,6 +3070,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
+ WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
diff --git a/mm/slob.c b/mm/slob.c
index 623e8a5c46ce..307c2c9feb44 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -555,8 +555,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
flags, node);
}
- if (b && c->ctor)
+ if (b && c->ctor) {
+ WARN_ON_ONCE(flags & __GFP_ZERO);
c->ctor(b);
+ }
kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
return b;
diff --git a/mm/slub.c b/mm/slub.c
index 44aa7847324a..15505479c3ab 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -52,11 +52,11 @@
* and to synchronize major metadata changes to slab cache structures.
*
* The slab_lock is only used for debugging and on arches that do not
- * have the ability to do a cmpxchg_double. It only protects the second
- * double word in the page struct. Meaning
+ * have the ability to do a cmpxchg_double. It only protects:
* A. page->freelist -> List of object free in a page
- * B. page->counters -> Counters of objects
- * C. page->frozen -> frozen state
+ * B. page->inuse -> Number of objects in use
+ * C. page->objects -> Number of objects in page
+ * D. page->frozen -> frozen state
*
* If a slab is frozen then it is exempt from list management. It is not
* on any list. The processor that froze the slab is the one who can
@@ -316,16 +316,16 @@ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
return (p - addr) / s->size;
}
-static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved)
+static inline unsigned int order_objects(unsigned int order, unsigned int size)
{
- return (((unsigned int)PAGE_SIZE << order) - reserved) / size;
+ return ((unsigned int)PAGE_SIZE << order) / size;
}
static inline struct kmem_cache_order_objects oo_make(unsigned int order,
- unsigned int size, unsigned int reserved)
+ unsigned int size)
{
struct kmem_cache_order_objects x = {
- (order << OO_SHIFT) + order_objects(order, size, reserved)
+ (order << OO_SHIFT) + order_objects(order, size)
};
return x;
@@ -356,21 +356,6 @@ static __always_inline void slab_unlock(struct page *page)
__bit_spin_unlock(PG_locked, &page->flags);
}
-static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
-{
- struct page tmp;
- tmp.counters = counters_new;
- /*
- * page->counters can cover frozen/inuse/objects as well
- * as page->_refcount. If we assign to ->counters directly
- * we run the risk of losing updates to page->_refcount, so
- * be careful and only assign to the fields we need.
- */
- page->frozen = tmp.frozen;
- page->inuse = tmp.inuse;
- page->objects = tmp.objects;
-}
-
/* Interrupts must be disabled (for the fallback code to work right) */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
@@ -392,7 +377,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
- set_page_slub_counters(page, counters_new);
+ page->counters = counters_new;
slab_unlock(page);
return true;
}
@@ -431,7 +416,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
- set_page_slub_counters(page, counters_new);
+ page->counters = counters_new;
slab_unlock(page);
local_irq_restore(flags);
return true;
@@ -711,7 +696,7 @@ void object_err(struct kmem_cache *s, struct page *page,
print_trailer(s, page, object);
}
-static void slab_err(struct kmem_cache *s, struct page *page,
+static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
const char *fmt, ...)
{
va_list args;
@@ -847,7 +832,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
return 1;
start = page_address(page);
- length = (PAGE_SIZE << compound_order(page)) - s->reserved;
+ length = PAGE_SIZE << compound_order(page);
end = start + length;
remainder = length % s->size;
if (!remainder)
@@ -936,7 +921,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
return 0;
}
- maxobj = order_objects(compound_order(page), s->size, s->reserved);
+ maxobj = order_objects(compound_order(page), s->size);
if (page->objects > maxobj) {
slab_err(s, page, "objects %u > max %u",
page->objects, maxobj);
@@ -986,7 +971,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
nr++;
}
- max_objects = order_objects(compound_order(page), s->size, s->reserved);
+ max_objects = order_objects(compound_order(page), s->size);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
@@ -1694,24 +1679,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
- page_mapcount_reset(page);
+ page->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
memcg_uncharge_slab(page, order, s);
__free_pages(page, order);
}
-#define need_reserve_slab_rcu \
- (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
-
static void rcu_free_slab(struct rcu_head *h)
{
- struct page *page;
-
- if (need_reserve_slab_rcu)
- page = virt_to_head_page(h);
- else
- page = container_of((struct list_head *)h, struct page, lru);
+ struct page *page = container_of(h, struct page, rcu_head);
__free_slab(page->slab_cache, page);
}
@@ -1719,19 +1696,7 @@ static void rcu_free_slab(struct rcu_head *h)
static void free_slab(struct kmem_cache *s, struct page *page)
{
if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
- struct rcu_head *head;
-
- if (need_reserve_slab_rcu) {
- int order = compound_order(page);
- int offset = (PAGE_SIZE << order) - s->reserved;
-
- VM_BUG_ON(s->reserved != sizeof(*head));
- head = page_address(page) + offset;
- } else {
- head = &page->rcu_head;
- }
-
- call_rcu(head, rcu_free_slab);
+ call_rcu(&page->rcu_head, rcu_free_slab);
} else
__free_slab(s, page);
}
@@ -2444,6 +2409,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
struct kmem_cache_cpu *c = *pc;
struct page *page;
+ WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
+
freelist = get_partial(s, flags, node, c);
if (freelist)
@@ -3226,21 +3193,21 @@ static unsigned int slub_min_objects;
*/
static inline unsigned int slab_order(unsigned int size,
unsigned int min_objects, unsigned int max_order,
- unsigned int fract_leftover, unsigned int reserved)
+ unsigned int fract_leftover)
{
unsigned int min_order = slub_min_order;
unsigned int order;
- if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
+ if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
return get_order(size * MAX_OBJS_PER_PAGE) - 1;
- for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved));
+ for (order = max(min_order, (unsigned int)get_order(min_objects * size));
order <= max_order; order++) {
unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
unsigned int rem;
- rem = (slab_size - reserved) % size;
+ rem = slab_size % size;
if (rem <= slab_size / fract_leftover)
break;
@@ -3249,7 +3216,7 @@ static inline unsigned int slab_order(unsigned int size,
return order;
}
-static inline int calculate_order(unsigned int size, unsigned int reserved)
+static inline int calculate_order(unsigned int size)
{
unsigned int order;
unsigned int min_objects;
@@ -3266,7 +3233,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved)
min_objects = slub_min_objects;
if (!min_objects)
min_objects = 4 * (fls(nr_cpu_ids) + 1);
- max_objects = order_objects(slub_max_order, size, reserved);
+ max_objects = order_objects(slub_max_order, size);
min_objects = min(min_objects, max_objects);
while (min_objects > 1) {
@@ -3275,7 +3242,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved)
fraction = 16;
while (fraction >= 4) {
order = slab_order(size, min_objects,
- slub_max_order, fraction, reserved);
+ slub_max_order, fraction);
if (order <= slub_max_order)
return order;
fraction /= 2;
@@ -3287,14 +3254,14 @@ static inline int calculate_order(unsigned int size, unsigned int reserved)
* We were unable to place multiple objects in a slab. Now
* lets see if we can place a single object there.
*/
- order = slab_order(size, 1, slub_max_order, 1, reserved);
+ order = slab_order(size, 1, slub_max_order, 1);
if (order <= slub_max_order)
return order;
/*
* Doh this slab cannot be placed using slub_max_order.
*/
- order = slab_order(size, 1, MAX_ORDER, 1, reserved);
+ order = slab_order(size, 1, MAX_ORDER, 1);
if (order < MAX_ORDER)
return order;
return -ENOSYS;
@@ -3562,7 +3529,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
if (forced_order >= 0)
order = forced_order;
else
- order = calculate_order(size, s->reserved);
+ order = calculate_order(size);
if ((int)order < 0)
return 0;
@@ -3580,8 +3547,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
/*
* Determine the number of objects per slab
*/
- s->oo = oo_make(order, size, s->reserved);
- s->min = oo_make(get_order(size), size, s->reserved);
+ s->oo = oo_make(order, size);
+ s->min = oo_make(get_order(size), size);
if (oo_objects(s->oo) > oo_objects(s->max))
s->max = s->oo;
@@ -3591,14 +3558,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
- s->reserved = 0;
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
#endif
- if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
- s->reserved = sizeof(struct rcu_head);
-
if (!calculate_sizes(s, -1))
goto error;
if (disable_higher_order_debug) {
@@ -4239,12 +4202,6 @@ void __init kmem_cache_init(void)
SLAB_HWCACHE_ALIGN, 0, 0);
kmem_cache = bootstrap(&boot_kmem_cache);
-
- /*
- * Allocate kmem_cache_node properly from the kmem_cache slab.
- * kmem_cache_node is separately allocated so no need to
- * update any list pointers.
- */
kmem_cache_node = bootstrap(&boot_kmem_cache_node);
/* Now we can use the kmem_cache to allocate kmalloc slabs */
@@ -5117,12 +5074,6 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(destroy_by_rcu);
-static ssize_t reserved_show(struct kmem_cache *s, char *buf)
-{
- return sprintf(buf, "%u\n", s->reserved);
-}
-SLAB_ATTR_RO(reserved);
-
#ifdef CONFIG_SLUB_DEBUG
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
@@ -5435,7 +5386,6 @@ static struct attribute *slab_attrs[] = {
&reclaim_account_attr.attr,
&destroy_by_rcu_attr.attr,
&shrink_attr.attr,
- &reserved_attr.attr,
&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
&total_objects_attr.attr,
diff --git a/mm/sparse.c b/mm/sparse.c
index 73dc2fcc0eab..f13f2723950a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -190,15 +190,13 @@ static inline int next_present_section_nr(int section_nr)
section_nr++;
if (present_section_nr(section_nr))
return section_nr;
- } while ((section_nr < NR_MEM_SECTIONS) &&
- (section_nr <= __highest_present_section_nr));
+ } while ((section_nr <= __highest_present_section_nr));
return -1;
}
#define for_each_present_section_nr(start, section_nr) \
for (section_nr = next_present_section_nr(start-1); \
((section_nr >= 0) && \
- (section_nr < NR_MEM_SECTIONS) && \
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))
@@ -524,7 +522,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
map_count = 1;
}
/* ok, last chunk */
- alloc_func(data, pnum_begin, NR_MEM_SECTIONS,
+ alloc_func(data, pnum_begin, __highest_present_section_nr+1,
map_count, nodeid_begin);
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index f2641894f440..f51ac051c0c9 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -317,7 +317,7 @@ swp_entry_t get_swap_page(struct page *page)
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
get_swap_pages(1, true, &entry);
- return entry;
+ goto out;
}
/*
@@ -347,10 +347,14 @@ repeat:
}
mutex_unlock(&cache->alloc_lock);
if (entry.val)
- return entry;
+ goto out;
}
get_swap_pages(1, false, &entry);
-
+out:
+ if (mem_cgroup_try_charge_swap(page, entry)) {
+ put_swap_page(page, entry);
+ entry.val = 0;
+ }
return entry;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 07f9aa2340c3..ab8e59cd18ea 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -216,9 +216,6 @@ int add_to_swap(struct page *page)
if (!entry.val)
return 0;
- if (mem_cgroup_try_charge_swap(page, entry))
- goto fail;
-
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 39791b81ede7..5029f241908f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage)
+ bool zeropage,
+ bool *mmap_changing)
{
struct vm_area_struct *dst_vma;
ssize_t err;
@@ -431,6 +432,15 @@ retry:
down_read(&dst_mm->mmap_sem);
/*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ err = -EAGAIN;
+ if (mmap_changing && READ_ONCE(*mmap_changing))
+ goto out_unlock;
+
+ /*
* Make sure the vma is not shared, that the dst range is
* both valid and fully within a single existing vma.
*/
@@ -563,13 +573,15 @@ out:
}
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len)
+ unsigned long src_start, unsigned long len,
+ bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+ return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
+ mmap_changing);
}
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len)
+ unsigned long len, bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, true);
+ return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
}
diff --git a/mm/util.c b/mm/util.c
index c2d0a7cdb189..3351659200e6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -391,7 +391,8 @@ EXPORT_SYMBOL(vm_mmap);
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
* preferable to the vmalloc fallback, due to visible performance drawbacks.
*
- * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people.
+ * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
+ * fall back to vmalloc.
*/
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
@@ -402,7 +403,8 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
* vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
* so the given set of flags has to be compatible.
*/
- WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
+ if ((flags & GFP_KERNEL) != GFP_KERNEL)
+ return kmalloc_node(size, flags, node);
/*
* We want to attempt a large physically contiguous block first because
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 63a5f502da08..89efac3a020e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -603,26 +603,6 @@ static void unmap_vmap_area(struct vmap_area *va)
vunmap_page_range(va->va_start, va->va_end);
}
-static void vmap_debug_free_range(unsigned long start, unsigned long end)
-{
- /*
- * Unmap page tables and force a TLB flush immediately if pagealloc
- * debugging is enabled. This catches use after free bugs similarly to
- * those in linear kernel virtual address space after a page has been
- * freed.
- *
- * All the lazy freeing logic is still retained, in order to minimise
- * intrusiveness of this debugging feature.
- *
- * This is going to be *slow* (linear kernel virtual address debugging
- * doesn't do a broadcast TLB flush so it is a lot faster).
- */
- if (debug_pagealloc_enabled()) {
- vunmap_page_range(start, end);
- flush_tlb_kernel_range(start, end);
- }
-}
-
/*
* lazy_max_pages is the maximum amount of virtual address space we gather up
* before attempting to purge with a TLB flush.
@@ -756,6 +736,9 @@ static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
unmap_vmap_area(va);
+ if (debug_pagealloc_enabled())
+ flush_tlb_kernel_range(va->va_start, va->va_end);
+
free_vmap_area_noflush(va);
}
@@ -1053,6 +1036,10 @@ static void vb_free(const void *addr, unsigned long size)
vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+ if (debug_pagealloc_enabled())
+ flush_tlb_kernel_range((unsigned long)addr,
+ (unsigned long)addr + size);
+
spin_lock(&vb->lock);
/* Expand dirty range */
@@ -1141,16 +1128,16 @@ void vm_unmap_ram(const void *mem, unsigned int count)
BUG_ON(addr > VMALLOC_END);
BUG_ON(!PAGE_ALIGNED(addr));
- debug_check_no_locks_freed(mem, size);
- vmap_debug_free_range(addr, addr+size);
-
if (likely(count <= VMAP_MAX_ALLOC)) {
+ debug_check_no_locks_freed(mem, size);
vb_free(mem, size);
return;
}
va = find_vmap_area(addr);
BUG_ON(!va);
+ debug_check_no_locks_freed((void *)va->va_start,
+ (va->va_end - va->va_start));
free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
@@ -1499,7 +1486,6 @@ struct vm_struct *remove_vm_area(const void *addr)
va->flags |= VM_LAZY_FREE;
spin_unlock(&vmap_area_lock);
- vmap_debug_free_range(va->va_start, va->va_end);
kasan_free_shadow(vm);
free_unmap_vmap_area(va);
@@ -1519,16 +1505,17 @@ static void __vunmap(const void *addr, int deallocate_pages)
addr))
return;
- area = remove_vm_area(addr);
+ area = find_vmap_area((unsigned long)addr)->vm;
if (unlikely(!area)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
return;
}
- debug_check_no_locks_freed(addr, get_vm_area_size(area));
- debug_check_no_obj_freed(addr, get_vm_area_size(area));
+ debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
+ debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
+ remove_vm_area(addr);
if (deallocate_pages) {
int i;
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 85350ce2d25d..4854584ec436 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -342,26 +342,6 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
vmpressure(gfp, memcg, true, vmpressure_win, 0);
}
-static enum vmpressure_levels str_to_level(const char *arg)
-{
- enum vmpressure_levels level;
-
- for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++)
- if (!strcmp(vmpressure_str_levels[level], arg))
- return level;
- return -1;
-}
-
-static enum vmpressure_modes str_to_mode(const char *arg)
-{
- enum vmpressure_modes mode;
-
- for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++)
- if (!strcmp(vmpressure_str_modes[mode], arg))
- return mode;
- return -1;
-}
-
#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
/**
@@ -390,27 +370,26 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
char *token;
int ret = 0;
- spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL);
+ spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
if (!spec) {
ret = -ENOMEM;
goto out;
}
- strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN);
/* Find required level */
token = strsep(&spec, ",");
- level = str_to_level(token);
- if (level == -1) {
- ret = -EINVAL;
+ level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+ if (level < 0) {
+ ret = level;
goto out;
}
/* Find optional mode */
token = strsep(&spec, ",");
if (token) {
- mode = str_to_mode(token);
- if (mode == -1) {
- ret = -EINVAL;
+ mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+ if (mode < 0) {
+ ret = mode;
goto out;
}
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9270a4370d54..03822f86f288 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2544,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long reclaimed;
unsigned long scanned;
- if (mem_cgroup_low(root, memcg)) {
+ switch (mem_cgroup_protected(root, memcg)) {
+ case MEMCG_PROT_MIN:
+ /*
+ * Hard protection.
+ * If there is no reclaimable memory, OOM.
+ */
+ continue;
+ case MEMCG_PROT_LOW:
+ /*
+ * Soft protection.
+ * Respect the protection only as long as
+ * there is an unprotected supply
+ * of reclaimable memory from other cgroups.
+ */
if (!sc->memcg_low_reclaim) {
sc->memcg_low_skipped = 1;
continue;
}
memcg_memory_event(memcg, MEMCG_LOW);
+ break;
+ case MEMCG_PROT_NONE:
+ break;
}
reclaimed = sc->nr_reclaimed;
@@ -3318,11 +3334,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
.may_unmap = 1,
.may_swap = 1,
};
+
+ __fs_reclaim_acquire();
+
count_vm_event(PAGEOUTRUN);
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
+ bool ret;
sc.reclaim_idx = classzone_idx;
@@ -3395,7 +3415,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
- if (try_to_freeze() || kthread_should_stop())
+ __fs_reclaim_release();
+ ret = try_to_freeze();
+ __fs_reclaim_acquire();
+ if (ret || kthread_should_stop())
break;
/*
@@ -3412,6 +3435,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
out:
snapshot_refaults(NULL, pgdat);
+ __fs_reclaim_release();
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
@@ -3600,9 +3624,7 @@ kswapd_try_sleep:
*/
trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
alloc_order);
- fs_reclaim_acquire(GFP_KERNEL);
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
- fs_reclaim_release(GFP_KERNEL);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
}
@@ -3684,16 +3706,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
- noreclaim_flag = memalloc_noreclaim_save();
fs_reclaim_acquire(sc.gfp_mask);
+ noreclaim_flag = memalloc_noreclaim_save();
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
p->reclaim_state = NULL;
- fs_reclaim_release(sc.gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
+ fs_reclaim_release(sc.gfp_mask);
return nr_reclaimed;
}
@@ -3870,6 +3892,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
};
cond_resched();
+ fs_reclaim_acquire(sc.gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
* and we also need to be able to write out pages for RECLAIM_WRITE
@@ -3877,7 +3900,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- fs_reclaim_acquire(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -3892,9 +3914,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
}
p->reclaim_state = NULL;
- fs_reclaim_release(gfp_mask);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
+ fs_reclaim_release(sc.gfp_mask);
return sc.nr_reclaimed >= nr_pages;
}