summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/cleancache.c4
-rw-r--r--mm/filemap.c95
-rw-r--r--mm/huge_memory.c41
-rw-r--r--mm/kasan/Makefile1
-rw-r--r--mm/list_lru.c12
-rw-r--r--mm/memcontrol.c858
-rw-r--r--mm/memory.c12
-rw-r--r--mm/mincore.c3
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/percpu.c18
-rw-r--r--mm/process_vm_access.c2
-rw-r--r--mm/shmem.c25
-rw-r--r--mm/slab.h6
-rw-r--r--mm/slab_common.c14
-rw-r--r--mm/slub.c10
-rw-r--r--mm/swap_state.c5
-rw-r--r--mm/swapfile.c18
-rw-r--r--mm/truncate.c69
-rw-r--r--mm/util.c16
-rw-r--r--mm/vmscan.c37
-rw-r--r--mm/vmstat.c12
-rw-r--r--mm/workingset.c4
-rw-r--r--mm/zsmalloc.c14
23 files changed, 780 insertions, 498 deletions
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 8fc50811119b..ba5d8f3e6d68 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -22,7 +22,7 @@
* cleancache_ops is set by cleancache_register_ops to contain the pointers
* to the cleancache "backend" implementation functions.
*/
-static struct cleancache_ops *cleancache_ops __read_mostly;
+static const struct cleancache_ops *cleancache_ops __read_mostly;
/*
* Counters available via /sys/kernel/debug/cleancache (if debugfs is
@@ -49,7 +49,7 @@ static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
/*
* Register operations for cleancache. Returns 0 on success.
*/
-int cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(const struct cleancache_ops *ops)
{
if (cmpxchg(&cleancache_ops, NULL, ops))
return -EBUSY;
diff --git a/mm/filemap.c b/mm/filemap.c
index 847ee43c2806..bc943867d68c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -11,6 +11,7 @@
*/
#include <linux/export.h>
#include <linux/compiler.h>
+#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
@@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
if (shadow) {
- mapping->nrshadows++;
+ mapping->nrexceptional++;
/*
- * Make sure the nrshadows update is committed before
+ * Make sure the nrexceptional update is committed before
* the nrpages update so that final truncate racing
* with reclaim does not see both counters 0 at the
* same time and miss a shadow entry.
@@ -481,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
int err = 0;
+ if (dax_mapping(mapping) && mapping->nrexceptional) {
+ err = dax_writeback_mapping_range(mapping, lstart, lend);
+ if (err)
+ return err;
+ }
+
if (mapping->nrpages) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
@@ -579,9 +586,13 @@ static int page_cache_tree_insert(struct address_space *mapping,
p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
if (!radix_tree_exceptional_entry(p))
return -EEXIST;
+
+ if (WARN_ON(dax_mapping(mapping)))
+ return -EINVAL;
+
if (shadowp)
*shadowp = p;
- mapping->nrshadows--;
+ mapping->nrexceptional--;
if (node)
workingset_node_shadows_dec(node);
}
@@ -1245,9 +1256,9 @@ repeat:
if (radix_tree_deref_retry(page))
goto restart;
/*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Return
- * it without attempting to raise page count.
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
*/
goto export;
}
@@ -1494,6 +1505,74 @@ repeat:
}
EXPORT_SYMBOL(find_get_pages_tag);
+/**
+ * find_get_entries_tag - find and return entries that match @tag
+ * @mapping: the address_space to search
+ * @start: the starting page cache index
+ * @tag: the tag index
+ * @nr_entries: the maximum number of entries
+ * @entries: where the resulting entries are placed
+ * @indices: the cache indices corresponding to the entries in @entries
+ *
+ * Like find_get_entries, except we only return entries which are tagged with
+ * @tag.
+ */
+unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
+ int tag, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices)
+{
+ void **slot;
+ unsigned int ret = 0;
+ struct radix_tree_iter iter;
+
+ if (!nr_entries)
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_tagged(slot, &mapping->page_tree,
+ &iter, start, tag) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+
+ /*
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
+ */
+ goto export;
+ }
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+export:
+ indices[ret] = iter.index;
+ entries[ret] = page;
+ if (++ret == nr_entries)
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(find_get_entries_tag);
+
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -2684,11 +2763,11 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file->f_mapping->host;
ssize_t ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret > 0)
ret = __generic_file_write_iter(iocb, from);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret > 0) {
ssize_t err;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 50342eff7960..fd3a07b3e6f4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1560,7 +1560,8 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct mm_struct *mm = tlb->mm;
int ret = 0;
- if (!pmd_trans_huge_lock(pmd, vma, &ptl))
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
goto out_unlocked;
orig_pmd = *pmd;
@@ -1627,7 +1628,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
spinlock_t *ptl;
- if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
+ ptl = __pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
return 0;
/*
* For architectures like ppc64 we look at deposited pgtable
@@ -1690,7 +1692,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_sem prevents deadlock.
*/
- if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
+ old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
+ if (old_ptl) {
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1724,7 +1727,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
+ ptl = __pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
pmd_t entry;
bool preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
@@ -1760,14 +1764,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* Note that if it returns true, this routine returns without unlocking page
* table lock. So callers must unlock it.
*/
-bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
- spinlock_t **ptl)
+spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
- *ptl = pmd_lock(vma->vm_mm, pmd);
+ spinlock_t *ptl;
+ ptl = pmd_lock(vma->vm_mm, pmd);
if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
- return true;
- spin_unlock(*ptl);
- return false;
+ return ptl;
+ spin_unlock(ptl);
+ return NULL;
}
#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@ -2068,7 +2072,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (likely(writable)) {
if (likely(referenced)) {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
return 1;
}
@@ -2078,7 +2082,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
out:
release_pte_pages(pte, _pte);
- trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
return 0;
}
@@ -2320,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated, result = 0;
+ int isolated = 0, result = 0;
unsigned long hstart, hend;
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2576,7 +2580,7 @@ out_unmap:
collapse_huge_page(mm, address, hpage, vma, node);
}
out:
- trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
none_or_zero, result);
return ret;
}
@@ -3357,6 +3361,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct anon_vma *anon_vma;
int count, mapcount, ret;
bool mlocked;
+ unsigned long flags;
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
@@ -3396,7 +3401,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
lru_add_drain();
/* Prevent deferred_split_scan() touching ->_count */
- spin_lock(&split_queue_lock);
+ spin_lock_irqsave(&split_queue_lock, flags);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && count == 1) {
@@ -3404,11 +3409,11 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
split_queue_len--;
list_del(page_deferred_list(head));
}
- spin_unlock(&split_queue_lock);
+ spin_unlock_irqrestore(&split_queue_lock, flags);
__split_huge_page(page, list);
ret = 0;
} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- spin_unlock(&split_queue_lock);
+ spin_unlock_irqrestore(&split_queue_lock, flags);
pr_alert("total_mapcount: %u, page_count(): %u\n",
mapcount, count);
if (PageTail(page))
@@ -3416,7 +3421,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
dump_page(page, "total_mapcount(head) > 0");
BUG();
} else {
- spin_unlock(&split_queue_lock);
+ spin_unlock_irqrestore(&split_queue_lock, flags);
unfreeze_page(anon_vma, head);
ret = -EBUSY;
}
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 64710148941e..a61460d9f5b0 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,4 +1,5 @@
KASAN_SANITIZE := n
+UBSAN_SANITIZE_kasan.o := n
CFLAGS_REMOVE_kasan.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/mm/list_lru.c b/mm/list_lru.c
index afc71ea9a381..1d05cb9d363d 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -12,7 +12,7 @@
#include <linux/mutex.h>
#include <linux/memcontrol.h>
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static LIST_HEAD(list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
@@ -37,9 +37,9 @@ static void list_lru_register(struct list_lru *lru)
static void list_lru_unregister(struct list_lru *lru)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
/*
@@ -104,7 +104,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
{
return &nlru->lru;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
@@ -292,7 +292,7 @@ static void init_one_lru(struct list_lru_one *l)
l->nr_items = 0;
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
int begin, int end)
{
@@ -529,7 +529,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
struct lock_class_key *key)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0eda67376df4..d06cae2de783 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,7 +66,6 @@
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
-#include <net/tcp_memcontrol.h>
#include "slab.h"
#include <asm/uaccess.h>
@@ -83,6 +82,9 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
+/* Kernel memory accounting disabled? */
+static bool cgroup_memory_nokmem;
+
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
@@ -239,6 +241,7 @@ enum res_type {
_MEMSWAP,
_OOM_TYPE,
_KMEM,
+ _TCP,
};
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
@@ -247,13 +250,6 @@ enum res_type {
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
-/*
- * The memcg_create_mutex will be held whenever a new cgroup is created.
- * As a consequence, any change that needs to protect against new child cgroups
- * appearing has to hold it as well.
- */
-static DEFINE_MUTEX(memcg_create_mutex);
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -297,7 +293,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return mem_cgroup_from_css(css);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifndef CONFIG_SLOB
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
@@ -349,7 +345,7 @@ void memcg_put_cache_ids(void)
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* !CONFIG_SLOB */
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
@@ -370,13 +366,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
*
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
- *
- * XXX: The above description of behavior on the default hierarchy isn't
- * strictly true yet as replace_page_cache_page() can modify the
- * association before @page is released even on the default hierarchy;
- * however, the current and planned usages don't mix the the two functions
- * and replace_page_cache_page() will soon be updated to make the invariant
- * actually true.
*/
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
@@ -896,17 +885,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (css == &root->css)
break;
- if (css_tryget(css)) {
- /*
- * Make sure the memcg is initialized:
- * mem_cgroup_css_online() orders the the
- * initialization against setting the flag.
- */
- if (smp_load_acquire(&memcg->initialized))
- break;
-
- css_put(css);
- }
+ if (css_tryget(css))
+ break;
memcg = NULL;
}
@@ -1233,7 +1213,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont(":");
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
K(mem_cgroup_read_stat(iter, i)));
@@ -1272,9 +1252,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
limit = memcg->memory.limit;
if (mem_cgroup_swappiness(memcg)) {
unsigned long memsw_limit;
+ unsigned long swap_limit;
memsw_limit = memcg->memsw.limit;
- limit = min(limit + total_swap_pages, memsw_limit);
+ swap_limit = memcg->swap.limit;
+ swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
+ limit = min(limit + swap_limit, memsw_limit);
}
return limit;
}
@@ -2203,7 +2186,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
unlock_page_lru(page, isolated);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifndef CONFIG_SLOB
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2378,16 +2361,17 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct page_counter *counter;
int ret;
- if (!memcg_kmem_is_active(memcg))
+ if (!memcg_kmem_online(memcg))
return 0;
- if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
- return -ENOMEM;
-
ret = try_charge(memcg, gfp, nr_pages);
- if (ret) {
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ if (ret)
return ret;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+ !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+ cancel_charge(memcg, nr_pages);
+ return -ENOMEM;
}
page->mem_cgroup = memcg;
@@ -2416,7 +2400,9 @@ void __memcg_kmem_uncharge(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
@@ -2424,7 +2410,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
page->mem_cgroup = NULL;
css_put_many(&memcg->css, nr_pages);
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* !CONFIG_SLOB */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2684,14 +2670,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
bool ret;
- /*
- * The lock does not prevent addition or deletion of children, but
- * it prevents a new child from being initialized based on this
- * parent in css_online(), so it's enough to decide whether
- * hierarchically inherited attributes can still be changed or not.
- */
- lockdep_assert_held(&memcg_create_mutex);
-
rcu_read_lock();
ret = css_next_child(NULL, &memcg->css);
rcu_read_unlock();
@@ -2754,10 +2732,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
- mutex_lock(&memcg_create_mutex);
-
if (memcg->use_hierarchy == val)
- goto out;
+ return 0;
/*
* If parent's use_hierarchy is set, we can't make any modifications
@@ -2776,9 +2752,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
} else
retval = -EINVAL;
-out:
- mutex_unlock(&memcg_create_mutex);
-
return retval;
}
@@ -2794,6 +2767,18 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
return val;
}
+static unsigned long tree_events(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_index idx)
+{
+ struct mem_cgroup *iter;
+ unsigned long val = 0;
+
+ for_each_mem_cgroup_tree(iter, memcg)
+ val += mem_cgroup_read_events(iter, idx);
+
+ return val;
+}
+
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
unsigned long val;
@@ -2836,6 +2821,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
case _KMEM:
counter = &memcg->kmem;
break;
+ case _TCP:
+ counter = &memcg->tcpmem;
+ break;
default:
BUG();
}
@@ -2860,103 +2848,180 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-#ifdef CONFIG_MEMCG_KMEM
-static int memcg_activate_kmem(struct mem_cgroup *memcg,
- unsigned long nr_pages)
+#ifndef CONFIG_SLOB
+static int memcg_online_kmem(struct mem_cgroup *memcg)
{
- int err = 0;
int memcg_id;
BUG_ON(memcg->kmemcg_id >= 0);
- BUG_ON(memcg->kmem_acct_activated);
- BUG_ON(memcg->kmem_acct_active);
-
- /*
- * For simplicity, we won't allow this to be disabled. It also can't
- * be changed if the cgroup has children already, or if tasks had
- * already joined.
- *
- * If tasks join before we set the limit, a person looking at
- * kmem.usage_in_bytes will have no way to determine when it took
- * place, which makes the value quite meaningless.
- *
- * After it first became limited, changes in the value of the limit are
- * of course permitted.
- */
- mutex_lock(&memcg_create_mutex);
- if (cgroup_is_populated(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- err = -EBUSY;
- mutex_unlock(&memcg_create_mutex);
- if (err)
- goto out;
+ BUG_ON(memcg->kmem_state);
memcg_id = memcg_alloc_cache_id();
- if (memcg_id < 0) {
- err = memcg_id;
- goto out;
- }
-
- /*
- * We couldn't have accounted to this cgroup, because it hasn't got
- * activated yet, so this should succeed.
- */
- err = page_counter_limit(&memcg->kmem, nr_pages);
- VM_BUG_ON(err);
+ if (memcg_id < 0)
+ return memcg_id;
static_branch_inc(&memcg_kmem_enabled_key);
/*
- * A memory cgroup is considered kmem-active as soon as it gets
+ * A memory cgroup is considered kmem-online as soon as it gets
* kmemcg_id. Setting the id after enabling static branching will
* guarantee no one starts accounting before all call sites are
* patched.
*/
memcg->kmemcg_id = memcg_id;
- memcg->kmem_acct_activated = true;
- memcg->kmem_acct_active = true;
-out:
- return err;
+ memcg->kmem_state = KMEM_ONLINE;
+
+ return 0;
}
-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static int memcg_propagate_kmem(struct mem_cgroup *parent,
+ struct mem_cgroup *memcg)
{
- int ret;
+ int ret = 0;
mutex_lock(&memcg_limit_mutex);
- if (!memcg_kmem_is_active(memcg))
- ret = memcg_activate_kmem(memcg, limit);
- else
- ret = page_counter_limit(&memcg->kmem, limit);
+ /*
+ * If the parent cgroup is not kmem-online now, it cannot be
+ * onlined after this point, because it has at least one child
+ * already.
+ */
+ if (memcg_kmem_online(parent) ||
+ (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
+ ret = memcg_online_kmem(memcg);
mutex_unlock(&memcg_limit_mutex);
return ret;
}
-static int memcg_propagate_kmem(struct mem_cgroup *memcg)
+static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
- int ret = 0;
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *parent, *child;
+ int kmemcg_id;
+
+ if (memcg->kmem_state != KMEM_ONLINE)
+ return;
+ /*
+ * Clear the online state before clearing memcg_caches array
+ * entries. The slab_mutex in memcg_deactivate_kmem_caches()
+ * guarantees that no cache will be created for this cgroup
+ * after we are done (see memcg_create_kmem_cache()).
+ */
+ memcg->kmem_state = KMEM_ALLOCATED;
+ memcg_deactivate_kmem_caches(memcg);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ parent = parent_mem_cgroup(memcg);
if (!parent)
- return 0;
+ parent = root_mem_cgroup;
- mutex_lock(&memcg_limit_mutex);
/*
- * If the parent cgroup is not kmem-active now, it cannot be activated
- * after this point, because it has at least one child already.
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+ * to ones of the parent. After we have finished, all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty. The
+ * ordering is imposed by list_lru_node->lock taken by
+ * memcg_drain_all_list_lrus().
*/
- if (memcg_kmem_is_active(parent))
- ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
- mutex_unlock(&memcg_limit_mutex);
- return ret;
+ css_for_each_descendant_pre(css, &memcg->css) {
+ child = mem_cgroup_from_css(css);
+ BUG_ON(child->kmemcg_id != kmemcg_id);
+ child->kmemcg_id = parent->kmemcg_id;
+ if (!memcg->use_hierarchy)
+ break;
+ }
+ memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+ memcg_free_cache_id(kmemcg_id);
+}
+
+static void memcg_free_kmem(struct mem_cgroup *memcg)
+{
+ /* css_alloc() failed, offlining didn't happen */
+ if (unlikely(memcg->kmem_state == KMEM_ONLINE))
+ memcg_offline_kmem(memcg);
+
+ if (memcg->kmem_state == KMEM_ALLOCATED) {
+ memcg_destroy_kmem_caches(memcg);
+ static_branch_dec(&memcg_kmem_enabled_key);
+ WARN_ON(page_counter_read(&memcg->kmem));
+ }
}
#else
+static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static int memcg_online_kmem(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static void memcg_offline_kmem(struct mem_cgroup *memcg)
+{
+}
+static void memcg_free_kmem(struct mem_cgroup *memcg)
+{
+}
+#endif /* !CONFIG_SLOB */
+
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- return -EINVAL;
+ int ret = 0;
+
+ mutex_lock(&memcg_limit_mutex);
+ /* Top-level cgroup doesn't propagate from root */
+ if (!memcg_kmem_online(memcg)) {
+ if (cgroup_is_populated(memcg->css.cgroup) ||
+ (memcg->use_hierarchy && memcg_has_children(memcg)))
+ ret = -EBUSY;
+ if (ret)
+ goto out;
+ ret = memcg_online_kmem(memcg);
+ if (ret)
+ goto out;
+ }
+ ret = page_counter_limit(&memcg->kmem, limit);
+out:
+ mutex_unlock(&memcg_limit_mutex);
+ return ret;
+}
+
+static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
+{
+ int ret;
+
+ mutex_lock(&memcg_limit_mutex);
+
+ ret = page_counter_limit(&memcg->tcpmem, limit);
+ if (ret)
+ goto out;
+
+ if (!memcg->tcpmem_active) {
+ /*
+ * The active flag needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See sock_update_memcg() for
+ * details, and note that we don't mark any socket as belonging
+ * to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in sock_update_memcg(),
+ * because when this value change, the code to process it is not
+ * patched in yet.
+ */
+ static_branch_inc(&memcg_sockets_enabled_key);
+ memcg->tcpmem_active = true;
+ }
+out:
+ mutex_unlock(&memcg_limit_mutex);
+ return ret;
}
-#endif /* CONFIG_MEMCG_KMEM */
/*
* The user of this function is...
@@ -2990,6 +3055,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
case _KMEM:
ret = memcg_update_kmem_limit(memcg, nr_pages);
break;
+ case _TCP:
+ ret = memcg_update_tcp_limit(memcg, nr_pages);
+ break;
}
break;
case RES_SOFT_LIMIT:
@@ -3016,6 +3084,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
case _KMEM:
counter = &memcg->kmem;
break;
+ case _TCP:
+ counter = &memcg->tcpmem;
+ break;
default:
BUG();
}
@@ -3582,88 +3653,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
return 0;
}
-#ifdef CONFIG_MEMCG_KMEM
-static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- int ret;
-
- ret = memcg_propagate_kmem(memcg);
- if (ret)
- return ret;
-
- return tcp_init_cgroup(memcg, ss);
-}
-
-static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
-{
- struct cgroup_subsys_state *css;
- struct mem_cgroup *parent, *child;
- int kmemcg_id;
-
- if (!memcg->kmem_acct_active)
- return;
-
- /*
- * Clear the 'active' flag before clearing memcg_caches arrays entries.
- * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
- * guarantees no cache will be created for this cgroup after we are
- * done (see memcg_create_kmem_cache()).
- */
- memcg->kmem_acct_active = false;
-
- memcg_deactivate_kmem_caches(memcg);
-
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
-
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
- /*
- * Change kmemcg_id of this cgroup and all its descendants to the
- * parent's id, and then move all entries from this cgroup's list_lrus
- * to ones of the parent. After we have finished, all list_lrus
- * corresponding to this cgroup are guaranteed to remain empty. The
- * ordering is imposed by list_lru_node->lock taken by
- * memcg_drain_all_list_lrus().
- */
- css_for_each_descendant_pre(css, &memcg->css) {
- child = mem_cgroup_from_css(css);
- BUG_ON(child->kmemcg_id != kmemcg_id);
- child->kmemcg_id = parent->kmemcg_id;
- if (!memcg->use_hierarchy)
- break;
- }
- memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
-
- memcg_free_cache_id(kmemcg_id);
-}
-
-static void memcg_destroy_kmem(struct mem_cgroup *memcg)
-{
- if (memcg->kmem_acct_activated) {
- memcg_destroy_kmem_caches(memcg);
- static_branch_dec(&memcg_kmem_enabled_key);
- WARN_ON(page_counter_read(&memcg->kmem));
- }
- tcp_destroy_cgroup(memcg);
-}
-#else
-static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- return 0;
-}
-
-static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
-{
-}
-
-static void memcg_destroy_kmem(struct mem_cgroup *memcg)
-{
-}
-#endif
-
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
@@ -4051,7 +4040,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
.seq_show = memcg_numa_stat_show,
},
#endif
-#ifdef CONFIG_MEMCG_KMEM
{
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
@@ -4084,7 +4072,29 @@ static struct cftype mem_cgroup_legacy_files[] = {
.seq_show = memcg_slab_show,
},
#endif
-#endif
+ {
+ .name = "kmem.tcp.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.failcnt",
+ .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
{ }, /* terminate */
};
@@ -4123,147 +4133,92 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
kfree(memcg->nodeinfo[node]);
}
-static struct mem_cgroup *mem_cgroup_alloc(void)
-{
- struct mem_cgroup *memcg;
- size_t size;
-
- size = sizeof(struct mem_cgroup);
- size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-
- memcg = kzalloc(size, GFP_KERNEL);
- if (!memcg)
- return NULL;
-
- memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat)
- goto out_free;
-
- if (memcg_wb_domain_init(memcg, GFP_KERNEL))
- goto out_free_stat;
-
- return memcg;
-
-out_free_stat:
- free_percpu(memcg->stat);
-out_free:
- kfree(memcg);
- return NULL;
-}
-
-/*
- * At destroying mem_cgroup, references from swap_cgroup can remain.
- * (scanning all at force_empty is too costly...)
- *
- * Instead of clearing all references at force_empty, we remember
- * the number of reference from swap_cgroup and free mem_cgroup when
- * it goes down to 0.
- *
- * Removal of cgroup itself succeeds regardless of refs from swap.
- */
-
-static void __mem_cgroup_free(struct mem_cgroup *memcg)
+static void mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- cancel_work_sync(&memcg->high_work);
-
- mem_cgroup_remove_from_trees(memcg);
-
+ memcg_wb_domain_exit(memcg);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
-
free_percpu(memcg->stat);
- memcg_wb_domain_exit(memcg);
kfree(memcg);
}
-static struct cgroup_subsys_state * __ref
-mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- long error = -ENOMEM;
+ size_t size;
int node;
- memcg = mem_cgroup_alloc();
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+
+ memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
- return ERR_PTR(error);
+ return NULL;
+
+ memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!memcg->stat)
+ goto fail;
for_each_node(node)
if (alloc_mem_cgroup_per_zone_info(memcg, node))
- goto free_out;
+ goto fail;
- /* root ? */
- if (parent_css == NULL) {
- root_mem_cgroup = memcg;
- page_counter_init(&memcg->memory, NULL);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
- page_counter_init(&memcg->memsw, NULL);
- page_counter_init(&memcg->kmem, NULL);
- }
+ if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+ goto fail;
INIT_WORK(&memcg->high_work, high_work_func);
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
- memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
-#ifdef CONFIG_MEMCG_KMEM
+ memcg->socket_pressure = jiffies;
+#ifndef CONFIG_SLOB
memcg->kmemcg_id = -1;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
-#ifdef CONFIG_INET
- memcg->socket_pressure = jiffies;
-#endif
- return &memcg->css;
-
-free_out:
- __mem_cgroup_free(memcg);
- return ERR_PTR(error);
+ return memcg;
+fail:
+ mem_cgroup_free(memcg);
+ return NULL;
}
-static int
-mem_cgroup_css_online(struct cgroup_subsys_state *css)
+static struct cgroup_subsys_state * __ref
+mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
- int ret;
-
- if (css->id > MEM_CGROUP_ID_MAX)
- return -ENOSPC;
-
- if (!parent)
- return 0;
-
- mutex_lock(&memcg_create_mutex);
+ struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
+ struct mem_cgroup *memcg;
+ long error = -ENOMEM;
- memcg->use_hierarchy = parent->use_hierarchy;
- memcg->oom_kill_disable = parent->oom_kill_disable;
- memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg = mem_cgroup_alloc();
+ if (!memcg)
+ return ERR_PTR(error);
- if (parent->use_hierarchy) {
+ memcg->high = PAGE_COUNTER_MAX;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
+ if (parent) {
+ memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg->oom_kill_disable = parent->oom_kill_disable;
+ }
+ if (parent && parent->use_hierarchy) {
+ memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
-
- /*
- * No need to take a reference to the parent because cgroup
- * core guarantees its existence.
- */
+ page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
page_counter_init(&memcg->memory, NULL);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
+ page_counter_init(&memcg->tcpmem, NULL);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -4272,23 +4227,31 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent != root_mem_cgroup)
memory_cgrp_subsys.broken_hierarchy = true;
}
- mutex_unlock(&memcg_create_mutex);
- ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
- if (ret)
- return ret;
+ /* The following stuff does not apply to the root */
+ if (!parent) {
+ root_mem_cgroup = memcg;
+ return &memcg->css;
+ }
+
+ error = memcg_propagate_kmem(parent, memcg);
+ if (error)
+ goto fail;
-#ifdef CONFIG_INET
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
-#endif
- /*
- * Make sure the memcg is initialized: mem_cgroup_iter()
- * orders reading memcg->initialized against its callers
- * reading the memcg members.
- */
- smp_store_release(&memcg->initialized, 1);
+ return &memcg->css;
+fail:
+ mem_cgroup_free(memcg);
+ return NULL;
+}
+
+static int
+mem_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ if (css->id > MEM_CGROUP_ID_MAX)
+ return -ENOSPC;
return 0;
}
@@ -4310,10 +4273,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
- vmpressure_cleanup(&memcg->vmpressure);
-
- memcg_deactivate_kmem(memcg);
-
+ memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
}
@@ -4328,12 +4288,17 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- memcg_destroy_kmem(memcg);
-#ifdef CONFIG_INET
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
-#endif
- __mem_cgroup_free(memcg);
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
+ static_branch_dec(&memcg_sockets_enabled_key);
+
+ vmpressure_cleanup(&memcg->vmpressure);
+ cancel_work_sync(&memcg->high_work);
+ mem_cgroup_remove_from_trees(memcg);
+ memcg_free_kmem(memcg);
+ mem_cgroup_free(memcg);
}
/**
@@ -4673,7 +4638,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(ptl);
@@ -4861,7 +4827,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
union mc_target target;
struct page *page;
- if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
spin_unlock(ptl);
return 0;
@@ -5143,6 +5110,59 @@ static int memory_events_show(struct seq_file *m, void *v)
return 0;
}
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ int i;
+
+ /*
+ * Provide statistics on the state of the memory subsystem as
+ * well as cumulative event counters that show past behavior.
+ *
+ * This list is ordered following a combination of these gradients:
+ * 1) generic big picture -> specifics and details
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
+ *
+ * Current memory state:
+ */
+
+ seq_printf(m, "anon %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+ seq_printf(m, "file %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+ seq_printf(m, "sock %llu\n",
+ (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+
+ seq_printf(m, "file_mapped %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
+ PAGE_SIZE);
+ seq_printf(m, "file_dirty %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
+ PAGE_SIZE);
+ seq_printf(m, "file_writeback %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
+ PAGE_SIZE);
+
+ for (i = 0; i < NR_LRU_LISTS; i++) {
+ struct mem_cgroup *mi;
+ unsigned long val = 0;
+
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_nr_lru_pages(mi, BIT(i));
+ seq_printf(m, "%s %llu\n",
+ mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
+ }
+
+ /* Accumulated memory events */
+
+ seq_printf(m, "pgfault %lu\n",
+ tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+ seq_printf(m, "pgmajfault %lu\n",
+ tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+
+ return 0;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -5173,6 +5193,11 @@ static struct cftype memory_files[] = {
.file_offset = offsetof(struct mem_cgroup, events_file),
.seq_show = memory_events_show,
},
+ {
+ .name = "stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_stat_show,
+ },
{ } /* terminate */
};
@@ -5269,7 +5294,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
if (page->mem_cgroup)
goto out;
- if (do_memsw_account()) {
+ if (do_swap_account) {
swp_entry_t ent = { .val = page_private(page), };
unsigned short id = lookup_swap_cgroup_id(ent);
@@ -5504,7 +5529,8 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
- int isolated;
+ unsigned int nr_pages;
+ bool compound;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
@@ -5524,14 +5550,22 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
if (!memcg)
return;
- lock_page_lru(oldpage, &isolated);
- oldpage->mem_cgroup = NULL;
- unlock_page_lru(oldpage, isolated);
+ /* Force-charge the new page. The old one will be freed soon */
+ compound = PageTransHuge(newpage);
+ nr_pages = compound ? hpage_nr_pages(newpage) : 1;
+
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_charge(&memcg->memsw, nr_pages);
+ css_get_many(&memcg->css, nr_pages);
commit_charge(newpage, memcg, true);
-}
-#ifdef CONFIG_INET
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
+ memcg_check_events(memcg, newpage);
+ local_irq_enable();
+}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
@@ -5558,10 +5592,8 @@ void sock_update_memcg(struct sock *sk)
memcg = mem_cgroup_from_task(current);
if (memcg == root_mem_cgroup)
goto out;
-#ifdef CONFIG_MEMCG_KMEM
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
-#endif
if (css_tryget_online(&memcg->css))
sk->sk_memcg = memcg;
out:
@@ -5587,24 +5619,24 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
gfp_t gfp_mask = GFP_KERNEL;
-#ifdef CONFIG_MEMCG_KMEM
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- struct page_counter *counter;
+ struct page_counter *fail;
- if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated,
- nr_pages, &counter)) {
- memcg->tcp_mem.memory_pressure = 0;
+ if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
+ memcg->tcpmem_pressure = 0;
return true;
}
- page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages);
- memcg->tcp_mem.memory_pressure = 1;
+ page_counter_charge(&memcg->tcpmem, nr_pages);
+ memcg->tcpmem_pressure = 1;
return false;
}
-#endif
+
/* Don't block in the packet receive path */
if (in_softirq())
gfp_mask = GFP_NOWAIT;
+ this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
+
if (try_charge(memcg, gfp_mask, nr_pages) == 0)
return true;
@@ -5619,19 +5651,17 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
*/
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
-#ifdef CONFIG_MEMCG_KMEM
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- page_counter_uncharge(&memcg->tcp_mem.memory_allocated,
- nr_pages);
+ page_counter_uncharge(&memcg->tcpmem, nr_pages);
return;
}
-#endif
+
+ this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
+
page_counter_uncharge(&memcg->memory, nr_pages);
css_put_many(&memcg->css, nr_pages);
}
-#endif /* CONFIG_INET */
-
static int __init cgroup_memory(char *s)
{
char *token;
@@ -5641,6 +5671,8 @@ static int __init cgroup_memory(char *s)
continue;
if (!strcmp(token, "nosocket"))
cgroup_memory_nosocket = true;
+ if (!strcmp(token, "nokmem"))
+ cgroup_memory_nokmem = true;
}
return 0;
}
@@ -5730,32 +5762,107 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
memcg_check_events(memcg, page);
}
+/*
+ * mem_cgroup_try_charge_swap - try charging a swap entry
+ * @page: page being added to swap
+ * @entry: swap entry to charge
+ *
+ * Try to charge @entry to the memcg that @page belongs to.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ struct page_counter *counter;
+ unsigned short oldid;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+ return 0;
+
+ memcg = page->mem_cgroup;
+
+ /* Readahead page, never charged */
+ if (!memcg)
+ return 0;
+
+ if (!mem_cgroup_is_root(memcg) &&
+ !page_counter_try_charge(&memcg->swap, 1, &counter))
+ return -ENOMEM;
+
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ VM_BUG_ON_PAGE(oldid, page);
+ mem_cgroup_swap_statistics(memcg, true);
+
+ css_get(&memcg->css);
+ return 0;
+}
+
/**
* mem_cgroup_uncharge_swap - uncharge a swap entry
* @entry: swap entry to uncharge
*
- * Drop the memsw charge associated with @entry.
+ * Drop the swap charge associated with @entry.
*/
void mem_cgroup_uncharge_swap(swp_entry_t entry)
{
struct mem_cgroup *memcg;
unsigned short id;
- if (!do_memsw_account())
+ if (!do_swap_account)
return;
id = swap_cgroup_record(entry, 0);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memsw, 1);
+ if (!mem_cgroup_is_root(memcg)) {
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->swap, 1);
+ else
+ page_counter_uncharge(&memcg->memsw, 1);
+ }
mem_cgroup_swap_statistics(memcg, false);
css_put(&memcg->css);
}
rcu_read_unlock();
}
+long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
+{
+ long nr_swap_pages = get_nr_swap_pages();
+
+ if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return nr_swap_pages;
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ nr_swap_pages = min_t(long, nr_swap_pages,
+ READ_ONCE(memcg->swap.limit) -
+ page_counter_read(&memcg->swap));
+ return nr_swap_pages;
+}
+
+bool mem_cgroup_swap_full(struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (vm_swap_full())
+ return true;
+ if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return false;
+
+ memcg = page->mem_cgroup;
+ if (!memcg)
+ return false;
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
+ return true;
+
+ return false;
+}
+
/* for remember boot option*/
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
@@ -5773,6 +5880,63 @@ static int __init enable_swap_account(char *s)
}
__setup("swapaccount=", enable_swap_account);
+static u64 swap_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
+}
+
+static int swap_max_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long max = READ_ONCE(memcg->swap.limit);
+
+ if (max == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
+
+ return 0;
+}
+
+static ssize_t swap_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &max);
+ if (err)
+ return err;
+
+ mutex_lock(&memcg_limit_mutex);
+ err = page_counter_limit(&memcg->swap, max);
+ mutex_unlock(&memcg_limit_mutex);
+ if (err)
+ return err;
+
+ return nbytes;
+}
+
+static struct cftype swap_files[] = {
+ {
+ .name = "swap.current",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = swap_current_read,
+ },
+ {
+ .name = "swap.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_max_show,
+ .write = swap_max_write,
+ },
+ { } /* terminate */
+};
+
static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
@@ -5804,6 +5968,8 @@ static int __init mem_cgroup_swap_init(void)
{
if (!mem_cgroup_disabled() && really_do_swap_account) {
do_swap_account = 1;
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
+ swap_files));
WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
memsw_cgroup_files));
}
diff --git a/mm/memory.c b/mm/memory.c
index ff17850a52d9..93ce37989471 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1591,10 +1591,15 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
+ if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
struct page *page;
- page = pfn_t_to_page(pfn);
+ /*
+ * At this point we are committed to insert_page()
+ * regardless of whether the caller specified flags that
+ * result in pfn_t_has_page() == false.
+ */
+ page = pfn_to_page(pfn_t_to_pfn(pfn));
return insert_page(vma, addr, page, vma->vm_page_prot);
}
return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
@@ -2582,7 +2587,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
swap_free(entry);
- if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ if (mem_cgroup_swap_full(page) ||
+ (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (page != swapcache) {
diff --git a/mm/mincore.c b/mm/mincore.c
index 2a565ed8bb49..563f32045490 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -117,7 +117,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
- if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
memset(vec, 1, nr);
spin_unlock(ptl);
goto out;
diff --git a/mm/mlock.c b/mm/mlock.c
index e1e2b1207bf2..96f001041928 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -175,7 +175,7 @@ static void __munlock_isolation_failed(struct page *page)
*/
unsigned int munlock_vma_page(struct page *page)
{
- unsigned int nr_pages;
+ int nr_pages;
struct zone *zone = page_zone(page);
/* For try_to_munlock() and to serialize with page migration */
diff --git a/mm/percpu.c b/mm/percpu.c
index 8a943b97a053..998607adf6eb 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size)
/**
* pcpu_mem_free - free memory
* @ptr: memory to free
- * @size: size of the area
*
* Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
*/
-static void pcpu_mem_free(void *ptr, size_t size)
+static void pcpu_mem_free(void *ptr)
{
- if (size <= PAGE_SIZE)
- kfree(ptr);
- else
- vfree(ptr);
+ kvfree(ptr);
}
/**
@@ -463,8 +459,8 @@ out_unlock:
* pcpu_mem_free() might end up calling vfree() which uses
* IRQ-unsafe lock and thus can't be called under pcpu_lock.
*/
- pcpu_mem_free(old, old_size);
- pcpu_mem_free(new, new_size);
+ pcpu_mem_free(old);
+ pcpu_mem_free(new);
return 0;
}
@@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
sizeof(chunk->map[0]));
if (!chunk->map) {
- pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+ pcpu_mem_free(chunk);
return NULL;
}
@@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
- pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
- pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+ pcpu_mem_free(chunk->map);
+ pcpu_mem_free(chunk);
}
/**
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index e88d071648c2..5d453e58ddbf 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
goto free_proc_pages;
}
- mm = mm_access(task, PTRACE_MODE_ATTACH);
+ mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
if (!mm || IS_ERR(mm)) {
rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index b98e1011858c..440e2a7e6c1c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -701,8 +701,7 @@ static void shmem_evict_inode(struct inode *inode)
list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
}
- } else
- kfree(info->symlink);
+ }
simple_xattrs_free(&info->xattrs);
WARN_ON(inode->i_blocks);
@@ -912,6 +911,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (!swap.val)
goto redirty;
+ if (mem_cgroup_try_charge_swap(page, swap))
+ goto free_swap;
+
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the page is
@@ -940,6 +942,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
}
mutex_unlock(&shmem_swaplist_mutex);
+free_swap:
swapcache_free(swap);
redirty:
set_page_dirty(page);
@@ -1898,7 +1901,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
if (whence != SEEK_DATA && whence != SEEK_HOLE)
return generic_file_llseek_size(file, offset, whence,
MAX_LFS_FILESIZE, i_size_read(inode));
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* We're holding i_mutex so we can access i_size directly */
if (offset < 0)
@@ -1922,7 +1925,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return offset;
}
@@ -2087,7 +2090,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
if (seals & ~(unsigned int)F_ALL_SEALS)
return -EINVAL;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (info->seals & F_SEAL_SEAL) {
error = -EPERM;
@@ -2110,7 +2113,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
error = 0;
unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
EXPORT_SYMBOL_GPL(shmem_add_seals);
@@ -2160,7 +2163,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (mode & FALLOC_FL_PUNCH_HOLE) {
struct address_space *mapping = file->f_mapping;
@@ -2273,7 +2276,7 @@ undone:
inode->i_private = NULL;
spin_unlock(&inode->i_lock);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
@@ -2545,13 +2548,12 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
info = SHMEM_I(inode);
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
- info->symlink = kmemdup(symname, len, GFP_KERNEL);
- if (!info->symlink) {
+ inode->i_link = kmemdup(symname, len, GFP_KERNEL);
+ if (!inode->i_link) {
iput(inode);
return -ENOMEM;
}
inode->i_op = &shmem_short_symlink_operations;
- inode->i_link = info->symlink;
} else {
inode_nohighmem(inode);
error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
@@ -3128,6 +3130,7 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
static void shmem_destroy_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
+ kfree(inode->i_link);
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
diff --git a/mm/slab.h b/mm/slab.h
index c63b8699cfa3..834ad240c0bb 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -173,7 +173,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
/*
* Iterate over all memcg caches of the given root cache. The caller must hold
* slab_mutex.
@@ -251,7 +251,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
extern void slab_init_memcg_params(struct kmem_cache *);
-#else /* !CONFIG_MEMCG_KMEM */
+#else /* CONFIG_MEMCG && !CONFIG_SLOB */
#define for_each_memcg_cache(iter, root) \
for ((void)(iter), (void)(root); 0; )
@@ -292,7 +292,7 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e016178063e1..b50aef01ccf7 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -128,7 +128,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
return i;
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
void slab_init_memcg_params(struct kmem_cache *s)
{
s->memcg_params.is_root_cache = true;
@@ -221,7 +221,7 @@ static inline int init_memcg_params(struct kmem_cache *s,
static inline void destroy_memcg_params(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
/*
* Find a mergeable slab cache
@@ -477,7 +477,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier)
}
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
/*
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
* @memcg: The memory cgroup the new cache is for.
@@ -503,10 +503,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
mutex_lock(&slab_mutex);
/*
- * The memory cgroup could have been deactivated while the cache
+ * The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
- if (!memcg_kmem_is_active(memcg))
+ if (!memcg_kmem_online(memcg))
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -689,7 +689,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s,
{
return 0;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
void slab_kmem_cache_release(struct kmem_cache *s)
{
@@ -1123,7 +1123,7 @@ static int slab_show(struct seq_file *m, void *p)
return 0;
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
int memcg_slab_show(struct seq_file *m, void *p)
{
struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
diff --git a/mm/slub.c b/mm/slub.c
index b21fd24b08b1..2e1355ac056b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5207,7 +5207,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return -EIO;
err = attribute->store(s, buf, len);
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
struct kmem_cache *c;
@@ -5242,7 +5242,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
static void memcg_propagate_slab_attrs(struct kmem_cache *s)
{
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
int i;
char *buffer = NULL;
struct kmem_cache *root_cache;
@@ -5328,7 +5328,7 @@ static struct kset *slab_kset;
static inline struct kset *cache_kset(struct kmem_cache *s)
{
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (!is_root_cache(s))
return s->memcg_params.root_cache->memcg_kset;
#endif
@@ -5405,7 +5405,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
if (err)
goto out_del_kobj;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (is_root_cache(s)) {
s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
if (!s->memcg_kset) {
@@ -5438,7 +5438,7 @@ void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
kset_unregister(s->memcg_kset);
#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 676ff2991380..69cb2464e7dc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list)
if (!entry.val)
return 0;
+ if (mem_cgroup_try_charge_swap(page, entry)) {
+ swapcache_free(entry);
+ return 0;
+ }
+
if (unlikely(PageTransHuge(page)))
if (unlikely(split_huge_page_to_list(page, list))) {
swapcache_free(entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2bb30aa3a412..d2c37365e2d6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -785,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
count--;
}
- if (!count)
- mem_cgroup_uncharge_swap(entry);
-
usage = count | has_cache;
p->swap_map[offset] = usage;
/* free if no reference */
if (!usage) {
+ mem_cgroup_uncharge_swap(entry);
dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
@@ -1008,7 +1006,7 @@ int free_swap_and_cache(swp_entry_t entry)
* Also recheck PageSwapCache now page is locked (above).
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || vm_swap_full())) {
+ (!page_mapped(page) || mem_cgroup_swap_full(page))) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
@@ -1958,9 +1956,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
set_blocksize(bdev, old_block_size);
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
} else {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
inode->i_flags &= ~S_SWAPFILE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
filp_close(swap_file, NULL);
@@ -2185,7 +2183,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (IS_SWAPFILE(inode))
return -EBUSY;
} else
@@ -2418,7 +2416,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
mapping = swap_file->f_mapping;
inode = mapping->host;
- /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */
+ /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */
error = claim_swapfile(p, inode);
if (unlikely(error))
goto bad_swap;
@@ -2563,7 +2561,7 @@ bad_swap:
vfree(cluster_info);
if (swap_file) {
if (inode && S_ISREG(inode->i_mode)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
inode = NULL;
}
filp_close(swap_file, NULL);
@@ -2576,7 +2574,7 @@ out:
if (name)
putname(name);
if (inode && S_ISREG(inode->i_mode))
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 76e35ad97102..e3ee0e27cd17 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/backing-dev.h>
+#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
@@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping,
return;
spin_lock_irq(&mapping->tree_lock);
- /*
- * Regular page slots are stabilized by the page lock even
- * without the tree itself locked. These unlocked entries
- * need verification under the tree lock.
- */
- if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
- goto unlock;
- if (*slot != entry)
- goto unlock;
- radix_tree_replace_slot(slot, NULL);
- mapping->nrshadows--;
- if (!node)
- goto unlock;
- workingset_node_shadows_dec(node);
- /*
- * Don't track node without shadow entries.
- *
- * Avoid acquiring the list_lru lock if already untracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!workingset_node_shadows(node) &&
- !list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes, &node->private_list);
- __radix_tree_delete_node(&mapping->page_tree, node);
+
+ if (dax_mapping(mapping)) {
+ if (radix_tree_delete_item(&mapping->page_tree, index, entry))
+ mapping->nrexceptional--;
+ } else {
+ /*
+ * Regular page slots are stabilized by the page lock even
+ * without the tree itself locked. These unlocked entries
+ * need verification under the tree lock.
+ */
+ if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+ &slot))
+ goto unlock;
+ if (*slot != entry)
+ goto unlock;
+ radix_tree_replace_slot(slot, NULL);
+ mapping->nrexceptional--;
+ if (!node)
+ goto unlock;
+ workingset_node_shadows_dec(node);
+ /*
+ * Don't track node without shadow entries.
+ *
+ * Avoid acquiring the list_lru lock if already untracked.
+ * The list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!workingset_node_shadows(node) &&
+ !list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes,
+ &node->private_list);
+ __radix_tree_delete_node(&mapping->page_tree, node);
+ }
unlock:
spin_unlock_irq(&mapping->tree_lock);
}
@@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
int i;
cleancache_invalidate_inode(mapping);
- if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+ if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
return;
/* Offsets within partial pages */
@@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
*/
void truncate_inode_pages_final(struct address_space *mapping)
{
- unsigned long nrshadows;
+ unsigned long nrexceptional;
unsigned long nrpages;
/*
@@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping)
/*
* When reclaim installs eviction entries, it increases
- * nrshadows first, then decreases nrpages. Make sure we see
+ * nrexceptional first, then decreases nrpages. Make sure we see
* this in the right order or we might miss an entry.
*/
nrpages = mapping->nrpages;
smp_rmb();
- nrshadows = mapping->nrshadows;
+ nrexceptional = mapping->nrexceptional;
- if (nrpages || nrshadows) {
+ if (nrpages || nrexceptional) {
/*
* As truncation uses a lockless tree lookup, cycle
* the tree lock to make sure any ongoing tree
diff --git a/mm/util.c b/mm/util.c
index 6d1f9200f74e..c108a6542d05 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -476,17 +476,25 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
int res = 0;
unsigned int len;
struct mm_struct *mm = get_task_mm(task);
+ unsigned long arg_start, arg_end, env_start, env_end;
if (!mm)
goto out;
if (!mm->arg_end)
goto out_mm; /* Shh! No looking before we're done */
- len = mm->arg_end - mm->arg_start;
+ down_read(&mm->mmap_sem);
+ arg_start = mm->arg_start;
+ arg_end = mm->arg_end;
+ env_start = mm->env_start;
+ env_end = mm->env_end;
+ up_read(&mm->mmap_sem);
+
+ len = arg_end - arg_start;
if (len > buflen)
len = buflen;
- res = access_process_vm(task, mm->arg_start, buffer, len, 0);
+ res = access_process_vm(task, arg_start, buffer, len, 0);
/*
* If the nul at the end of args has been overwritten, then
@@ -497,10 +505,10 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
if (len < res) {
res = len;
} else {
- len = mm->env_end - mm->env_start;
+ len = env_end - env_start;
if (len > buflen - res)
len = buflen - res;
- res += access_process_vm(task, mm->env_start,
+ res += access_process_vm(task, env_start,
buffer+res, len, 0);
res = strnlen(buffer, res);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5ac86956ff9d..eb3dd37ccd7c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -46,6 +46,7 @@
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
+#include <linux/dax.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -411,7 +412,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
unsigned long freed = 0;
- if (memcg && !memcg_kmem_is_active(memcg))
+ if (memcg && !memcg_kmem_online(memcg))
return 0;
if (nr_scanned == 0)
@@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* inode reclaim needs to empty out the radix tree or
* the nodes are lost. Don't plant shadows behind its
* back.
+ *
+ * We also don't store shadows for DAX mappings because the
+ * only page cache pages found in these are zero pages
+ * covering holes, and because we don't want to mix DAX
+ * exceptional entries and shadow exceptional entries in the
+ * same page_tree.
*/
if (reclaimed && page_is_file_cache(page) &&
- !mapping_exiting(mapping))
+ !mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(mapping, page);
__delete_from_page_cache(page, shadow, memcg);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -1214,7 +1221,7 @@ cull_mlocked:
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && vm_swap_full())
+ if (PageSwapCache(page) && mem_cgroup_swap_full(page))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
@@ -1966,10 +1973,11 @@ enum scan_balance {
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
-static void get_scan_count(struct lruvec *lruvec, int swappiness,
+static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *nr,
unsigned long *lru_pages)
{
+ int swappiness = mem_cgroup_swappiness(memcg);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
@@ -1996,14 +2004,14 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
if (current_is_kswapd()) {
if (!zone_reclaimable(zone))
force_scan = true;
- if (!mem_cgroup_lruvec_online(lruvec))
+ if (!mem_cgroup_online(memcg))
force_scan = true;
}
if (!global_reclaim(sc))
force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
+ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2193,9 +2201,10 @@ static inline void init_tlb_ubc(void)
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
- struct scan_control *sc, unsigned long *lru_pages)
+static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
+ struct scan_control *sc, unsigned long *lru_pages)
{
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
@@ -2205,7 +2214,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
struct blk_plug plug;
bool scan_adjusted;
- get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
+ get_scan_count(lruvec, memcg, sc, nr, lru_pages);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2409,8 +2418,6 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
unsigned long lru_pages;
unsigned long reclaimed;
unsigned long scanned;
- struct lruvec *lruvec;
- int swappiness;
if (mem_cgroup_low(root, memcg)) {
if (!sc->may_thrash)
@@ -2418,12 +2425,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
mem_cgroup_events(memcg, MEMCG_LOW, 1);
}
- lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- swappiness = mem_cgroup_swappiness(memcg);
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
- shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
+ shrink_zone_memcg(zone, memcg, sc, &lru_pages);
zone_lru_pages += lru_pages;
if (memcg && is_classzone)
@@ -2893,8 +2898,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !noswap,
};
- struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- int swappiness = mem_cgroup_swappiness(memcg);
unsigned long lru_pages;
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2911,7 +2914,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_lruvec(lruvec, swappiness, &sc, &lru_pages);
+ shrink_zone_memcg(zone, memcg, &sc, &lru_pages);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 64bd0aa13f75..40b2c74ddf16 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1408,17 +1408,7 @@ static void vmstat_update(struct work_struct *w)
* Defer the checking for differentials to the
* shepherd thread on a different processor.
*/
- int r;
- /*
- * Shepherd work thread does not race since it never
- * changes the bit if its zero but the cpu
- * online / off line code may race if
- * worker threads are still allowed during
- * shutdown / startup.
- */
- r = cpumask_test_and_set_cpu(smp_processor_id(),
- cpu_stat_off);
- VM_BUG_ON(r);
+ cpumask_set_cpu(smp_processor_id(), cpu_stat_off);
}
}
diff --git a/mm/workingset.c b/mm/workingset.c
index aa017133744b..61ead9e5549d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
node->slots[i] = NULL;
BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
- BUG_ON(!mapping->nrshadows);
- mapping->nrshadows--;
+ BUG_ON(!mapping->nrexceptional);
+ mapping->nrexceptional--;
}
}
BUG_ON(node->count);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index e7414cec220b..2d7c4c11fc63 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -309,7 +309,12 @@ static void free_handle(struct zs_pool *pool, unsigned long handle)
static void record_obj(unsigned long handle, unsigned long obj)
{
- *(unsigned long *)handle = obj;
+ /*
+ * lsb of @obj represents handle lock while other bits
+ * represent object value the handle is pointing so
+ * updating shouldn't do store tearing.
+ */
+ WRITE_ONCE(*(unsigned long *)handle, obj);
}
/* zpool driver */
@@ -1635,6 +1640,13 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
free_obj = obj_malloc(d_page, class, handle);
zs_object_copy(free_obj, used_obj, class);
index++;
+ /*
+ * record_obj updates handle's value to free_obj and it will
+ * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
+ * breaks synchronization using pin_tag(e,g, zs_free) so
+ * let's keep the lock bit.
+ */
+ free_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, free_obj);
unpin_tag(handle);
obj_free(pool, class, used_obj);