summary refs log tree commit diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/filemap.c 25
-rw-r--r-- mm/huge_memory.c 57
-rw-r--r-- mm/memcontrol.c 1310
-rw-r--r-- mm/memory.c 81
-rw-r--r-- mm/migrate.c 38
-rw-r--r-- mm/mmap.c 30
-rw-r--r-- mm/nommu.c 5
-rw-r--r-- mm/rmap.c 20
-rw-r--r-- mm/shmem.c 369
-rw-r--r-- mm/slab.c 4
-rw-r--r-- mm/swap.c 36
-rw-r--r-- mm/swap_state.c 9
-rw-r--r-- mm/swapfile.c 21
-rw-r--r-- mm/truncate.c 9
-rw-r--r-- mm/util.c 9
-rw-r--r-- mm/vmscan.c 12
-rw-r--r-- mm/zswap.c 6
17 files changed, 1051 insertions, 990 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index af19a6b079f5..f501b56ec2c6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -31,6 +31,7 @@
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include <linux/rmap.h>
@@ -233,7 +234,6 @@ void delete_from_page_cache(struct page *page)
spin_lock_irq(&mapping->tree_lock);
__delete_from_page_cache(page, NULL);
spin_unlock_irq(&mapping->tree_lock);
- mem_cgroup_uncharge_cache_page(page);
if (freepage)
freepage(page);
@@ -489,8 +489,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
- /* mem_cgroup codes must not be called under tree_lock */
- mem_cgroup_replace_page_cache(old, new);
+ mem_cgroup_migrate(old, new, true);
radix_tree_preload_end();
if (freepage)
freepage(old);
@@ -548,19 +547,24 @@ static int __add_to_page_cache_locked(struct page *page,
pgoff_t offset, gfp_t gfp_mask,
void **shadowp)
{
+ int huge = PageHuge(page);
+ struct mem_cgroup *memcg;
int error;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
- error = mem_cgroup_charge_file(page, current->mm,
- gfp_mask & GFP_RECLAIM_MASK);
- if (error)
- return error;
+ if (!huge) {
+ error = mem_cgroup_try_charge(page, current->mm,
+ gfp_mask, &memcg);
+ if (error)
+ return error;
+ }
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error) {
- mem_cgroup_uncharge_cache_page(page);
+ if (!huge)
+ mem_cgroup_cancel_charge(page, memcg);
return error;
}
@@ -575,13 +579,16 @@ static int __add_to_page_cache_locked(struct page *page,
goto err_insert;
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
+ if (!huge)
+ mem_cgroup_commit_charge(page, memcg, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
err_insert:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
- mem_cgroup_uncharge_cache_page(page);
+ if (!huge)
+ mem_cgroup_cancel_charge(page, memcg);
page_cache_release(page);
return error;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3630d577e987..d9a21d06b862 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -715,13 +715,20 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
unsigned long haddr, pmd_t *pmd,
struct page *page)
{
+ struct mem_cgroup *memcg;
pgtable_t pgtable;
spinlock_t *ptl;
VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+ if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
+ return VM_FAULT_OOM;
+
pgtable = pte_alloc_one(mm, haddr);
- if (unlikely(!pgtable))
+ if (unlikely(!pgtable)) {
+ mem_cgroup_cancel_charge(page, memcg);
return VM_FAULT_OOM;
+ }
clear_huge_page(page, haddr, HPAGE_PMD_NR);
/*
@@ -734,7 +741,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_none(*pmd))) {
spin_unlock(ptl);
- mem_cgroup_uncharge_page(page);
+ mem_cgroup_cancel_charge(page, memcg);
put_page(page);
pte_free(mm, pgtable);
} else {
@@ -742,6 +749,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr);
+ mem_cgroup_commit_charge(page, memcg, false);
+ lru_cache_add_active_or_unevictable(page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -827,13 +836,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_TRANSHUGE))) {
- put_page(page);
- count_vm_event(THP_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
- }
if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
- mem_cgroup_uncharge_page(page);
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -979,6 +982,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct page *page,
unsigned long haddr)
{
+ struct mem_cgroup *memcg;
spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
@@ -999,20 +1003,21 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
__GFP_OTHER_NODE,
vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
- mem_cgroup_charge_anon(pages[i], mm,
- GFP_KERNEL))) {
+ mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
+ &memcg))) {
if (pages[i])
put_page(pages[i]);
- mem_cgroup_uncharge_start();
while (--i >= 0) {
- mem_cgroup_uncharge_page(pages[i]);
+ memcg = (void *)page_private(pages[i]);
+ set_page_private(pages[i], 0);
+ mem_cgroup_cancel_charge(pages[i], memcg);
put_page(pages[i]);
}
- mem_cgroup_uncharge_end();
kfree(pages);
ret |= VM_FAULT_OOM;
goto out;
}
+ set_page_private(pages[i], (unsigned long)memcg);
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1041,7 +1046,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
pte_t *pte, entry;
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ memcg = (void *)page_private(pages[i]);
+ set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vma, haddr);
+ mem_cgroup_commit_charge(pages[i], memcg, false);
+ lru_cache_add_active_or_unevictable(pages[i], vma);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -1065,12 +1074,12 @@ out:
out_free_pages:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- mem_cgroup_uncharge_start();
for (i = 0; i < HPAGE_PMD_NR; i++) {
- mem_cgroup_uncharge_page(pages[i]);
+ memcg = (void *)page_private(pages[i]);
+ set_page_private(pages[i], 0);
+ mem_cgroup_cancel_charge(pages[i], memcg);
put_page(pages[i]);
}
- mem_cgroup_uncharge_end();
kfree(pages);
goto out;
}
@@ -1081,6 +1090,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl;
int ret = 0;
struct page *page = NULL, *new_page;
+ struct mem_cgroup *memcg;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -1132,7 +1142,8 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm,
+ GFP_TRANSHUGE, &memcg))) {
put_page(new_page);
if (page) {
split_huge_page(page);
@@ -1161,7 +1172,7 @@ alloc:
put_user_huge_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(ptl);
- mem_cgroup_uncharge_page(new_page);
+ mem_cgroup_cancel_charge(new_page, memcg);
put_page(new_page);
goto out_mn;
} else {
@@ -1170,6 +1181,8 @@ alloc:
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_clear_flush(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
+ mem_cgroup_commit_charge(new_page, memcg, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
if (!page) {
@@ -2413,6 +2426,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spinlock_t *pmd_ptl, *pte_ptl;
int isolated;
unsigned long hstart, hend;
+ struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -2423,7 +2437,8 @@ static void collapse_huge_page(struct mm_struct *mm,
if (!new_page)
return;
- if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE)))
+ if (unlikely(mem_cgroup_try_charge(new_page, mm,
+ GFP_TRANSHUGE, &memcg)))
return;
/*
@@ -2510,6 +2525,8 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
+ mem_cgroup_commit_charge(new_page, memcg, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
@@ -2523,7 +2540,7 @@ out_up_write:
return;
out:
- mem_cgroup_uncharge_page(new_page);
+ mem_cgroup_cancel_charge(new_page, memcg);
goto out_up_write;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90dc501eaf3f..ec4dcf1b9562 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -754,9 +754,11 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
{
- spin_lock(&mctz->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mctz->lock, flags);
__mem_cgroup_remove_exceeded(mz, mctz);
- spin_unlock(&mctz->lock);
+ spin_unlock_irqrestore(&mctz->lock, flags);
}
@@ -779,7 +781,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
* mem is over its softlimit.
*/
if (excess || mz->on_tree) {
- spin_lock(&mctz->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mctz->lock, flags);
/* if on-tree, remove it */
if (mz->on_tree)
__mem_cgroup_remove_exceeded(mz, mctz);
@@ -788,7 +792,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
* If excess is 0, no tree ops.
*/
__mem_cgroup_insert_exceeded(mz, mctz, excess);
- spin_unlock(&mctz->lock);
+ spin_unlock_irqrestore(&mctz->lock, flags);
}
}
}
@@ -839,9 +843,9 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
struct mem_cgroup_per_zone *mz;
- spin_lock(&mctz->lock);
+ spin_lock_irq(&mctz->lock);
mz = __mem_cgroup_largest_soft_limit_node(mctz);
- spin_unlock(&mctz->lock);
+ spin_unlock_irq(&mctz->lock);
return mz;
}
@@ -882,13 +886,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
return val;
}
-static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
- bool charge)
-{
- int val = (charge) ? 1 : -1;
- this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
-}
-
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx)
{
@@ -909,13 +906,13 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- bool anon, int nr_pages)
+ int nr_pages)
{
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
* counted as CACHE even if it's on ANON LRU.
*/
- if (anon)
+ if (PageAnon(page))
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
nr_pages);
else
@@ -1013,7 +1010,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
*/
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
- preempt_disable();
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
@@ -1026,8 +1022,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
do_numainfo = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_NUMAINFO);
#endif
- preempt_enable();
-
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
mem_cgroup_update_tree(memcg, page);
@@ -1035,8 +1029,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
if (unlikely(do_numainfo))
atomic_inc(&memcg->numainfo_events);
#endif
- } else
- preempt_enable();
+ }
}
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -1347,20 +1340,6 @@ out:
return lruvec;
}
-/*
- * Following LRU functions are allowed to be used without PCG_LOCK.
- * Operations are called by routine of global LRU independently from memcg.
- * What we have to take care of here is validness of pc->mem_cgroup.
- *
- * Changes to pc->mem_cgroup happens when
- * 1. charge
- * 2. moving account
- * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
- * It is added to LRU before charge.
- * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
- * When moving account, the page is not on LRU. It's isolated.
- */
-
/**
* mem_cgroup_page_lruvec - return lruvec for adding an lru page
* @page: the page
@@ -2261,22 +2240,14 @@ cleanup:
*
* Notes: Race condition
*
- * We usually use lock_page_cgroup() for accessing page_cgroup member but
- * it tends to be costly. But considering some conditions, we doesn't need
- * to do so _always_.
- *
- * Considering "charge", lock_page_cgroup() is not required because all
- * file-stat operations happen after a page is attached to radix-tree. There
- * are no race with "charge".
+ * Charging occurs during page instantiation, while the page is
+ * unmapped and locked in page migration, or while the page table is
+ * locked in THP migration. No race is possible.
*
- * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
- * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
- * if there are race with "uncharge". Statistics itself is properly handled
- * by flags.
+ * Uncharge happens to pages with zero references, no race possible.
*
- * Considering "move", this is an only case we see a race. To make the race
- * small, we check memcg->moving_account and detect there are possibility
- * of race or not. If there is, we take a lock.
+ * Charge moving between groups is protected by checking mm->moving
+ * account and taking the move_lock in the slowpath.
*/
void __mem_cgroup_begin_update_page_stat(struct page *page,
@@ -2551,17 +2522,8 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
return NOTIFY_OK;
}
-/**
- * mem_cgroup_try_charge - try charging a memcg
- * @memcg: memcg to charge
- * @nr_pages: number of pages to charge
- *
- * Returns 0 if @memcg was charged successfully, -EINTR if the charge
- * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
- */
-static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
- gfp_t gfp_mask,
- unsigned int nr_pages)
+static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -2660,41 +2622,7 @@ done:
return ret;
}
-/**
- * mem_cgroup_try_charge_mm - try charging a mm
- * @mm: mm_struct to charge
- * @nr_pages: number of pages to charge
- * @oom: trigger OOM if reclaim fails
- *
- * Returns the charged mem_cgroup associated with the given mm_struct or
- * NULL the charge failed.
- */
-static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
- gfp_t gfp_mask,
- unsigned int nr_pages)
-
-{
- struct mem_cgroup *memcg;
- int ret;
-
- memcg = get_mem_cgroup_from_mm(mm);
- ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages);
- css_put(&memcg->css);
- if (ret == -EINTR)
- memcg = root_mem_cgroup;
- else if (ret)
- memcg = NULL;
-
- return memcg;
-}
-
-/*
- * Somemtimes we have to undo a charge we got by try_charge().
- * This function is for that and do uncharge, put css's refcnt.
- * gotten by try_charge().
- */
-static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
- unsigned int nr_pages)
+static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long bytes = nr_pages * PAGE_SIZE;
@@ -2732,6 +2660,16 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
return mem_cgroup_from_id(id);
}
+/*
+ * try_get_mem_cgroup_from_page - look up page's memcg association
+ * @page: the page
+ *
+ * Look up, get a css reference, and return the memcg that owns @page.
+ *
+ * The page must be locked to prevent racing with swap-in and page
+ * cache charges. If coming from an unlocked page table, the caller
+ * must ensure the page is on the LRU or this can race with charging.
+ */
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
struct mem_cgroup *memcg = NULL;
@@ -2742,7 +2680,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page);
- lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
if (memcg && !css_tryget_online(&memcg->css))
@@ -2756,23 +2693,46 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
memcg = NULL;
rcu_read_unlock();
}
- unlock_page_cgroup(pc);
return memcg;
}
-static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
- struct page *page,
- unsigned int nr_pages,
- enum charge_type ctype,
- bool lrucare)
+static void lock_page_lru(struct page *page, int *isolated)
+{
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ if (PageLRU(page)) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ *isolated = 1;
+ } else
+ *isolated = 0;
+}
+
+static void unlock_page_lru(struct page *page, int isolated)
+{
+ struct zone *zone = page_zone(page);
+
+ if (isolated) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ SetPageLRU(page);
+ add_page_to_lru_list(page, lruvec, page_lru(page));
+ }
+ spin_unlock_irq(&zone->lru_lock);
+}
+
+static void commit_charge(struct page *page, struct mem_cgroup *memcg,
+ bool lrucare)
{
struct page_cgroup *pc = lookup_page_cgroup(page);
- struct zone *uninitialized_var(zone);
- struct lruvec *lruvec;
- bool was_on_lru = false;
- bool anon;
+ int isolated;
- lock_page_cgroup(pc);
VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
@@ -2783,44 +2743,28 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
* may already be on some other mem_cgroup's LRU. Take care of it.
*/
- if (lrucare) {
- zone = page_zone(page);
- spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page)) {
- lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
- was_on_lru = true;
- }
- }
-
- pc->mem_cgroup = memcg;
- SetPageCgroupUsed(pc);
-
- if (lrucare) {
- if (was_on_lru) {
- lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
- add_page_to_lru_list(page, lruvec, page_lru(page));
- }
- spin_unlock_irq(&zone->lru_lock);
- }
-
- if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
- anon = true;
- else
- anon = false;
-
- mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
- unlock_page_cgroup(pc);
+ if (lrucare)
+ lock_page_lru(page, &isolated);
/*
- * "charge_statistics" updated event counter. Then, check it.
- * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
- * if they exceeds softlimit.
+ * Nobody should be changing or seriously looking at
+ * pc->mem_cgroup and pc->flags at this point:
+ *
+ * - the page is uncharged
+ *
+ * - the page is off-LRU
+ *
+ * - an anonymous fault has exclusive page access, except for
+ * a locked page table
+ *
+ * - a page cache insertion, a swapin fault, or a migration
+ * have the page locked
*/
- memcg_check_events(memcg, page);
+ pc->mem_cgroup = memcg;
+ pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
+
+ if (lrucare)
+ unlock_page_lru(page, isolated);
}
static DEFINE_MUTEX(set_limit_mutex);
@@ -2882,21 +2826,21 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
if (ret)
return ret;
- ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT);
+ ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);
if (ret == -EINTR) {
/*
- * mem_cgroup_try_charge() chosed to bypass to root due to
- * OOM kill or fatal signal. Since our only options are to
- * either fail the allocation or charge it to this cgroup, do
- * it as a temporary condition. But we can't fail. From a
- * kmem/slab perspective, the cache has already been selected,
- * by mem_cgroup_kmem_get_cache(), so it is too late to change
+ * try_charge() chose to bypass to root due to OOM kill or
+ * fatal signal. Since our only options are to either fail
+ * the allocation or charge it to this cgroup, do it as a
+ * temporary condition. But we can't fail. From a kmem/slab
+ * perspective, the cache has already been selected, by
+ * mem_cgroup_kmem_get_cache(), so it is too late to change
* our minds.
*
* This condition will only trigger if the task entered
- * memcg_charge_kmem in a sane state, but was OOM-killed during
- * mem_cgroup_try_charge() above. Tasks that were already
- * dying when the allocation triggers should have been already
+ * memcg_charge_kmem in a sane state, but was OOM-killed
+ * during try_charge() above. Tasks that were already dying
+ * when the allocation triggers should have been already
* directed to the root cgroup in memcontrol.h
*/
res_counter_charge_nofail(&memcg->res, size, &fail_res);
@@ -3447,7 +3391,6 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
/*
* Because tail pages are not marked as "used", set it. We're under
* zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -3468,7 +3411,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
for (i = 1; i < HPAGE_PMD_NR; i++) {
pc = head_pc + i;
pc->mem_cgroup = memcg;
- pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+ pc->flags = head_pc->flags;
}
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
HPAGE_PMD_NR);
@@ -3498,7 +3441,6 @@ static int mem_cgroup_move_account(struct page *page,
{
unsigned long flags;
int ret;
- bool anon = PageAnon(page);
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -3512,15 +3454,21 @@ static int mem_cgroup_move_account(struct page *page,
if (nr_pages > 1 && !PageTransHuge(page))
goto out;
- lock_page_cgroup(pc);
+ /*
+ * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
+ * of its source page while we change it: page migration takes
+ * both pages off the LRU, but page cache replacement doesn't.
+ */
+ if (!trylock_page(page))
+ goto out;
ret = -EINVAL;
if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
- goto unlock;
+ goto out_unlock;
move_lock_mem_cgroup(from, &flags);
- if (!anon && page_mapped(page)) {
+ if (!PageAnon(page) && page_mapped(page)) {
__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
nr_pages);
__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
@@ -3534,20 +3482,25 @@ static int mem_cgroup_move_account(struct page *page,
nr_pages);
}
- mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
+ /*
+ * It is safe to change pc->mem_cgroup here because the page
+ * is referenced, charged, and isolated - we can't race with
+ * uncharging, charging, migration, or LRU putback.
+ */
/* caller should have done css_get */
pc->mem_cgroup = to;
- mem_cgroup_charge_statistics(to, page, anon, nr_pages);
move_unlock_mem_cgroup(from, &flags);
ret = 0;
-unlock:
- unlock_page_cgroup(pc);
- /*
- * check events
- */
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(to, page, nr_pages);
memcg_check_events(to, page);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
memcg_check_events(from, page);
+ local_irq_enable();
+out_unlock:
+ unlock_page(page);
out:
return ret;
}
@@ -3618,449 +3571,12 @@ out:
return ret;
}
-int mem_cgroup_charge_anon(struct page *page,
- struct mm_struct *mm, gfp_t gfp_mask)
-{
- unsigned int nr_pages = 1;
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return 0;
-
- VM_BUG_ON_PAGE(page_mapped(page), page);
- VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
- VM_BUG_ON(!mm);
-
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
- memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages);
- if (!memcg)
- return -ENOMEM;
- __mem_cgroup_commit_charge(memcg, page, nr_pages,
- MEM_CGROUP_CHARGE_TYPE_ANON, false);
- return 0;
-}
-
-/*
- * While swap-in, try_charge -> commit or cancel, the page is locked.
- * And when try_charge() successfully returns, one refcnt to memcg without
- * struct page_cgroup is acquired. This refcnt will be consumed by
- * "commit()" or removed by "cancel()"
- */
-static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
- struct page *page,
- gfp_t mask,
- struct mem_cgroup **memcgp)
-{
- struct mem_cgroup *memcg = NULL;
- struct page_cgroup *pc;
- int ret;
-
- pc = lookup_page_cgroup(page);
- /*
- * Every swap fault against a single page tries to charge the
- * page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. The USED bit is protected by
- * the page lock, which serializes swap cache removal, which
- * in turn serializes uncharging.
- */
- if (PageCgroupUsed(pc))
- goto out;
- if (do_swap_account)
- memcg = try_get_mem_cgroup_from_page(page);
- if (!memcg)
- memcg = get_mem_cgroup_from_mm(mm);
- ret = mem_cgroup_try_charge(memcg, mask, 1);
- css_put(&memcg->css);
- if (ret == -EINTR)
- memcg = root_mem_cgroup;
- else if (ret)
- return ret;
-out:
- *memcgp = memcg;
- return 0;
-}
-
-int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
- gfp_t gfp_mask, struct mem_cgroup **memcgp)
-{
- if (mem_cgroup_disabled()) {
- *memcgp = NULL;
- return 0;
- }
- /*
- * A racing thread's fault, or swapoff, may have already
- * updated the pte, and even removed page from swap cache: in
- * those cases unuse_pte()'s pte_same() test will fail; but
- * there's also a KSM case which does need to charge the page.
- */
- if (!PageSwapCache(page)) {
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1);
- if (!memcg)
- return -ENOMEM;
- *memcgp = memcg;
- return 0;
- }
- return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
-}
-
-void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
-{
- if (mem_cgroup_disabled())
- return;
- if (!memcg)
- return;
- __mem_cgroup_cancel_charge(memcg, 1);
-}
-
-static void
-__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
- enum charge_type ctype)
-{
- if (mem_cgroup_disabled())
- return;
- if (!memcg)
- return;
-
- __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
- /*
- * Now swap is on-memory. This means this page may be
- * counted both as mem and swap....double count.
- * Fix it by uncharging from memsw. Basically, this SwapCache is stable
- * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
- * may call delete_from_swap_cache() before reach here.
- */
- if (do_swap_account && PageSwapCache(page)) {
- swp_entry_t ent = {.val = page_private(page)};
- mem_cgroup_uncharge_swap(ent);
- }
-}
-
-void mem_cgroup_commit_charge_swapin(struct page *page,
- struct mem_cgroup *memcg)
-{
- __mem_cgroup_commit_charge_swapin(page, memcg,
- MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
-int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask)
-{
- enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
- struct mem_cgroup *memcg;
- int ret;
-
- if (mem_cgroup_disabled())
- return 0;
- if (PageCompound(page))
- return 0;
-
- if (PageSwapCache(page)) { /* shmem */
- ret = __mem_cgroup_try_charge_swapin(mm, page,
- gfp_mask, &memcg);
- if (ret)
- return ret;
- __mem_cgroup_commit_charge_swapin(page, memcg, type);
- return 0;
- }
-
- memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1);
- if (!memcg)
- return -ENOMEM;
- __mem_cgroup_commit_charge(memcg, page, 1, type, false);
- return 0;
-}
-
-static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
- unsigned int nr_pages,
- const enum charge_type ctype)
-{
- struct memcg_batch_info *batch = NULL;
- bool uncharge_memsw = true;
-
- /* If swapout, usage of swap doesn't decrease */
- if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- uncharge_memsw = false;
-
- batch = &current->memcg_batch;
- /*
- * In usual, we do css_get() when we remember memcg pointer.
- * But in this case, we keep res->usage until end of a series of
- * uncharges. Then, it's ok to ignore memcg's refcnt.
- */
- if (!batch->memcg)
- batch->memcg = memcg;
- /*
- * do_batch > 0 when unmapping pages or inode invalidate/truncate.
- * In those cases, all pages freed continuously can be expected to be in
- * the same cgroup and we have chance to coalesce uncharges.
- * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
- * because we want to do uncharge as soon as possible.
- */
-
- if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
- goto direct_uncharge;
-
- if (nr_pages > 1)
- goto direct_uncharge;
-
- /*
- * In typical case, batch->memcg == mem. This means we can
- * merge a series of uncharges to an uncharge of res_counter.
- * If not, we uncharge res_counter ony by one.
- */
- if (batch->memcg != memcg)
- goto direct_uncharge;
- /* remember freed charge and uncharge it later */
- batch->nr_pages++;
- if (uncharge_memsw)
- batch->memsw_nr_pages++;
- return;
-direct_uncharge:
- res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
- if (uncharge_memsw)
- res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
- if (unlikely(batch->memcg != memcg))
- memcg_oom_recover(memcg);
-}
-
-/*
- * uncharge if !page_mapped(page)
- */
-static struct mem_cgroup *
-__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
- bool end_migration)
-{
- struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = 1;
- struct page_cgroup *pc;
- bool anon;
-
- if (mem_cgroup_disabled())
- return NULL;
-
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
- /*
- * Check if our page_cgroup is valid
- */
- pc = lookup_page_cgroup(page);
- if (unlikely(!PageCgroupUsed(pc)))
- return NULL;
-
- lock_page_cgroup(pc);
-
- memcg = pc->mem_cgroup;
-
- if (!PageCgroupUsed(pc))
- goto unlock_out;
-
- anon = PageAnon(page);
-
- switch (ctype) {
- case MEM_CGROUP_CHARGE_TYPE_ANON:
- /*
- * Generally PageAnon tells if it's the anon statistics to be
- * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
- * used before page reached the stage of being marked PageAnon.
- */
- anon = true;
- /* fallthrough */
- case MEM_CGROUP_CHARGE_TYPE_DROP:
- /* See mem_cgroup_prepare_migration() */
- if (page_mapped(page))
- goto unlock_out;
- /*
- * Pages under migration may not be uncharged. But
- * end_migration() /must/ be the one uncharging the
- * unused post-migration page and so it has to call
- * here with the migration bit still set. See the
- * res_counter handling below.
- */
- if (!end_migration && PageCgroupMigration(pc))
- goto unlock_out;
- break;
- case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
- if (!PageAnon(page)) { /* Shared memory */
- if (page->mapping && !page_is_file_cache(page))
- goto unlock_out;
- } else if (page_mapped(page)) /* Anon */
- goto unlock_out;
- break;
- default:
- break;
- }
-
- mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
-
- ClearPageCgroupUsed(pc);
- /*
- * pc->mem_cgroup is not cleared here. It will be accessed when it's
- * freed from LRU. This is safe because uncharged page is expected not
- * to be reused (freed soon). Exception is SwapCache, it's handled by
- * special functions.
- */
-
- unlock_page_cgroup(pc);
- /*
- * even after unlock, we have memcg->res.usage here and this memcg
- * will never be freed, so it's safe to call css_get().
- */
- memcg_check_events(memcg, page);
- if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
- mem_cgroup_swap_statistics(memcg, true);
- css_get(&memcg->css);
- }
- /*
- * Migration does not charge the res_counter for the
- * replacement page, so leave it alone when phasing out the
- * page that is unused after the migration.
- */
- if (!end_migration)
- mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
-
- return memcg;
-
-unlock_out:
- unlock_page_cgroup(pc);
- return NULL;
-}
-
-void mem_cgroup_uncharge_page(struct page *page)
-{
- /* early check. */
- if (page_mapped(page))
- return;
- VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
- /*
- * If the page is in swap cache, uncharge should be deferred
- * to the swap path, which also properly accounts swap usage
- * and handles memcg lifetime.
- *
- * Note that this check is not stable and reclaim may add the
- * page to swap cache at any time after this. However, if the
- * page is not in swap cache by the time page->mapcount hits
- * 0, there won't be any page table references to the swap
- * slot, and reclaim will free it and not actually write the
- * page to disk.
- */
- if (PageSwapCache(page))
- return;
- __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
-}
-
-void mem_cgroup_uncharge_cache_page(struct page *page)
-{
- VM_BUG_ON_PAGE(page_mapped(page), page);
- VM_BUG_ON_PAGE(page->mapping, page);
- __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
-}
-
-/*
- * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
- * In that cases, pages are freed continuously and we can expect pages
- * are in the same memcg. All these calls itself limits the number of
- * pages freed at once, then uncharge_start/end() is called properly.
- * This may be called prural(2) times in a context,
- */
-
-void mem_cgroup_uncharge_start(void)
-{
- current->memcg_batch.do_batch++;
- /* We can do nest. */
- if (current->memcg_batch.do_batch == 1) {
- current->memcg_batch.memcg = NULL;
- current->memcg_batch.nr_pages = 0;
- current->memcg_batch.memsw_nr_pages = 0;
- }
-}
-
-void mem_cgroup_uncharge_end(void)
-{
- struct memcg_batch_info *batch = &current->memcg_batch;
-
- if (!batch->do_batch)
- return;
-
- batch->do_batch--;
- if (batch->do_batch) /* If stacked, do nothing. */
- return;
-
- if (!batch->memcg)
- return;
- /*
- * This "batch->memcg" is valid without any css_get/put etc...
- * bacause we hide charges behind us.
- */
- if (batch->nr_pages)
- res_counter_uncharge(&batch->memcg->res,
- batch->nr_pages * PAGE_SIZE);
- if (batch->memsw_nr_pages)
- res_counter_uncharge(&batch->memcg->memsw,
- batch->memsw_nr_pages * PAGE_SIZE);
- memcg_oom_recover(batch->memcg);
- /* forget this pointer (for sanity check) */
- batch->memcg = NULL;
-}
-
-#ifdef CONFIG_SWAP
-/*
- * called after __delete_from_swap_cache() and drop "page" account.
- * memcg information is recorded to swap_cgroup of "ent"
- */
-void
-mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
-{
- struct mem_cgroup *memcg;
- int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
-
- if (!swapout) /* this was a swap cache but the swap is unused ! */
- ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
-
- memcg = __mem_cgroup_uncharge_common(page, ctype, false);
-
- /*
- * record memcg information, if swapout && memcg != NULL,
- * css_get() was called in uncharge().
- */
- if (do_swap_account && swapout && memcg)
- swap_cgroup_record(ent, mem_cgroup_id(memcg));
-}
-#endif
-
#ifdef CONFIG_MEMCG_SWAP
-/*
- * called from swap_entry_free(). remove record in swap_cgroup and
- * uncharge "memsw" account.
- */
-void mem_cgroup_uncharge_swap(swp_entry_t ent)
+static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
+ bool charge)
{
- struct mem_cgroup *memcg;
- unsigned short id;
-
- if (!do_swap_account)
- return;
-
- id = swap_cgroup_record(ent, 0);
- rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
- if (memcg) {
- /*
- * We uncharge this because swap is freed. This memcg can
- * be obsolete one. We avoid calling css_tryget_online().
- */
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
- mem_cgroup_swap_statistics(memcg, false);
- css_put(&memcg->css);
- }
- rcu_read_unlock();
+ int val = (charge) ? 1 : -1;
+ this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
}
/**
@@ -4112,175 +3628,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
}
#endif
-/*
- * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
- * page belongs to.
- */
-void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
- struct mem_cgroup **memcgp)
-{
- struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = 1;
- struct page_cgroup *pc;
- enum charge_type ctype;
-
- *memcgp = NULL;
-
- if (mem_cgroup_disabled())
- return;
-
- if (PageTransHuge(page))
- nr_pages <<= compound_order(page);
-
- pc = lookup_page_cgroup(page);
- lock_page_cgroup(pc);
- if (PageCgroupUsed(pc)) {
- memcg = pc->mem_cgroup;
- css_get(&memcg->css);
- /*
- * At migrating an anonymous page, its mapcount goes down
- * to 0 and uncharge() will be called. But, even if it's fully
- * unmapped, migration may fail and this page has to be
- * charged again. We set MIGRATION flag here and delay uncharge
- * until end_migration() is called
- *
- * Corner Case Thinking
- * A)
- * When the old page was mapped as Anon and it's unmap-and-freed
- * while migration was ongoing.
- * If unmap finds the old page, uncharge() of it will be delayed
- * until end_migration(). If unmap finds a new page, it's
- * uncharged when it make mapcount to be 1->0. If unmap code
- * finds swap_migration_entry, the new page will not be mapped
- * and end_migration() will find it(mapcount==0).
- *
- * B)
- * When the old page was mapped but migraion fails, the kernel
- * remaps it. A charge for it is kept by MIGRATION flag even
- * if mapcount goes down to 0. We can do remap successfully
- * without charging it again.
- *
- * C)
- * The "old" page is under lock_page() until the end of
- * migration, so, the old page itself will not be swapped-out.
- * If the new page is swapped out before end_migraton, our
- * hook to usual swap-out path will catch the event.
- */
- if (PageAnon(page))
- SetPageCgroupMigration(pc);
- }
- unlock_page_cgroup(pc);
- /*
- * If the page is not charged at this point,
- * we return here.
- */
- if (!memcg)
- return;
-
- *memcgp = memcg;
- /*
- * We charge new page before it's used/mapped. So, even if unlock_page()
- * is called before end_migration, we can catch all events on this new
- * page. In the case new page is migrated but not remapped, new page's
- * mapcount will be finally 0 and we call uncharge in end_migration().
- */
- if (PageAnon(page))
- ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
- else
- ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
- /*
- * The page is committed to the memcg, but it's not actually
- * charged to the res_counter since we plan on replacing the
- * old one and only one page is going to be left afterwards.
- */
- __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
-}
-
-/* remove redundant charge if migration failed*/
-void mem_cgroup_end_migration(struct mem_cgroup *memcg,
- struct page *oldpage, struct page *newpage, bool migration_ok)
-{
- struct page *used, *unused;
- struct page_cgroup *pc;
- bool anon;
-
- if (!memcg)
- return;
-
- if (!migration_ok) {
- used = oldpage;
- unused = newpage;
- } else {
- used = newpage;
- unused = oldpage;
- }
- anon = PageAnon(used);
- __mem_cgroup_uncharge_common(unused,
- anon ? MEM_CGROUP_CHARGE_TYPE_ANON
- : MEM_CGROUP_CHARGE_TYPE_CACHE,
- true);
- css_put(&memcg->css);
- /*
- * We disallowed uncharge of pages under migration because mapcount
- * of the page goes down to zero, temporarly.
- * Clear the flag and check the page should be charged.
- */
- pc = lookup_page_cgroup(oldpage);
- lock_page_cgroup(pc);
- ClearPageCgroupMigration(pc);
- unlock_page_cgroup(pc);
-
- /*
- * If a page is a file cache, radix-tree replacement is very atomic
- * and we can skip this check. When it was an Anon page, its mapcount
- * goes down to 0. But because we added MIGRATION flage, it's not
- * uncharged yet. There are several case but page->mapcount check
- * and USED bit check in mem_cgroup_uncharge_page() will do enough
- * check. (see prepare_charge() also)
- */
- if (anon)
- mem_cgroup_uncharge_page(used);
-}
-
-/*
- * At replace page cache, newpage is not under any memcg but it's on
- * LRU. So, this function doesn't touch res_counter but handles LRU
- * in correct way. Both pages are locked so we cannot race with uncharge.
- */
-void mem_cgroup_replace_page_cache(struct page *oldpage,
- struct page *newpage)
-{
- struct mem_cgroup *memcg = NULL;
- struct page_cgroup *pc;
- enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
-
- if (mem_cgroup_disabled())
- return;
-
- pc = lookup_page_cgroup(oldpage);
- /* fix accounting on old pages */
- lock_page_cgroup(pc);
- if (PageCgroupUsed(pc)) {
- memcg = pc->mem_cgroup;
- mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
- ClearPageCgroupUsed(pc);
- }
- unlock_page_cgroup(pc);
-
- /*
- * When called from shmem_replace_page(), in some cases the
- * oldpage has already been charged, and in some cases not.
- */
- if (!memcg)
- return;
- /*
- * Even if newpage->mapping was NULL before starting replacement,
- * the newpage may be on LRU(or pagevec for LRU) already. We lock
- * LRU while we overwrite pc->mem_cgroup.
- */
- __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
-}
-
#ifdef CONFIG_DEBUG_VM
static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
{
@@ -4479,7 +3826,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_mask, &nr_scanned);
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
- spin_lock(&mctz->lock);
+ spin_lock_irq(&mctz->lock);
/*
* If we failed to reclaim anything from this memory cgroup
@@ -4519,7 +3866,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
*/
/* If excess == 0, no tree ops */
__mem_cgroup_insert_exceeded(mz, mctz, excess);
- spin_unlock(&mctz->lock);
+ spin_unlock_irq(&mctz->lock);
css_put(&mz->memcg->css);
loop++;
/*
@@ -6319,20 +5666,19 @@ static int mem_cgroup_do_precharge(unsigned long count)
int ret;
/* Try a single bulk charge without reclaim first */
- ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+ ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
if (!ret) {
mc.precharge += count;
return ret;
}
if (ret == -EINTR) {
- __mem_cgroup_cancel_charge(root_mem_cgroup, count);
+ cancel_charge(root_mem_cgroup, count);
return ret;
}
/* Try charges one by one with reclaim */
while (count--) {
- ret = mem_cgroup_try_charge(mc.to,
- GFP_KERNEL & ~__GFP_NORETRY, 1);
+ ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
/*
* In case of failure, any residual charges against
* mc.to will be dropped by mem_cgroup_clear_mc()
@@ -6340,7 +5686,7 @@ static int mem_cgroup_do_precharge(unsigned long count)
* bypassed to root right away or they'll be lost.
*/
if (ret == -EINTR)
- __mem_cgroup_cancel_charge(root_mem_cgroup, 1);
+ cancel_charge(root_mem_cgroup, 1);
if (ret)
return ret;
mc.precharge++;
@@ -6482,9 +5828,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (page) {
pc = lookup_page_cgroup(page);
/*
- * Do only loose check w/o page_cgroup lock.
- * mem_cgroup_move_account() checks the pc is valid or not under
- * the lock.
+ * Do only loose check w/o serialization.
+ * mem_cgroup_move_account() checks the pc is valid or
+ * not under LRU exclusion.
*/
if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
@@ -6609,7 +5955,7 @@ static void __mem_cgroup_clear_mc(void)
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
- __mem_cgroup_cancel_charge(mc.to, mc.precharge);
+ cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
}
/*
@@ -6617,7 +5963,7 @@ static void __mem_cgroup_clear_mc(void)
* we must uncharge here.
*/
if (mc.moved_charge) {
- __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
+ cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
}
/* we must fixup refcnts and charges */
@@ -6946,6 +6292,398 @@ static void __init enable_swap_cgroup(void)
}
#endif
+#ifdef CONFIG_MEMCG_SWAP
+/**
+ * mem_cgroup_swapout - transfer a memsw charge to swap
+ * @page: page whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @page to @entry.
+ */
+void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+ struct page_cgroup *pc;
+ unsigned short oldid;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
+ if (!do_swap_account)
+ return;
+
+ pc = lookup_page_cgroup(page);
+
+ /* Readahead page, never charged */
+ if (!PageCgroupUsed(pc))
+ return;
+
+ VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
+
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
+ VM_BUG_ON_PAGE(oldid, page);
+
+ pc->flags &= ~PCG_MEMSW;
+ css_get(&pc->mem_cgroup->css);
+ mem_cgroup_swap_statistics(pc->mem_cgroup, true);
+}
+
+/**
+ * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * @entry: swap entry to uncharge
+ *
+ * Drop the memsw charge associated with @entry.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ unsigned short id;
+
+ if (!do_swap_account)
+ return;
+
+ id = swap_cgroup_record(entry, 0);
+ rcu_read_lock();
+ memcg = mem_cgroup_lookup(id);
+ if (memcg) {
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ mem_cgroup_swap_statistics(memcg, false);
+ css_put(&memcg->css);
+ }
+ rcu_read_unlock();
+}
+#endif
+
+/**
+ * mem_cgroup_try_charge - try charging a page
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp_mask: reclaim mode
+ * @memcgp: charged memcg return
+ *
+ * Try to charge @page to the memcg that @mm belongs to, reclaiming
+ * pages according to @gfp_mask if necessary.
+ *
+ * Returns 0 on success, with *@memcgp pointing to the charged memcg.
+ * Otherwise, an error code is returned.
+ *
+ * After page->mapping has been set up, the caller must finalize the
+ * charge with mem_cgroup_commit_charge(). Or abort the transaction
+ * with mem_cgroup_cancel_charge() in case page instantiation fails.
+ */
+int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, struct mem_cgroup **memcgp)
+{
+ struct mem_cgroup *memcg = NULL;
+ unsigned int nr_pages = 1;
+ int ret = 0;
+
+ if (mem_cgroup_disabled())
+ goto out;
+
+ if (PageSwapCache(page)) {
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+ /*
+ * Every swap fault against a single page tries to charge the
+ * page, bail as early as possible. shmem_unuse() encounters
+ * already charged pages, too. The USED bit is protected by
+ * the page lock, which serializes swap cache removal, which
+ * in turn serializes uncharging.
+ */
+ if (PageCgroupUsed(pc))
+ goto out;
+ }
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ }
+
+ if (do_swap_account && PageSwapCache(page))
+ memcg = try_get_mem_cgroup_from_page(page);
+ if (!memcg)
+ memcg = get_mem_cgroup_from_mm(mm);
+
+ ret = try_charge(memcg, gfp_mask, nr_pages);
+
+ css_put(&memcg->css);
+
+ if (ret == -EINTR) {
+ memcg = root_mem_cgroup;
+ ret = 0;
+ }
+out:
+ *memcgp = memcg;
+ return ret;
+}
+
+/**
+ * mem_cgroup_commit_charge - commit a page charge
+ * @page: page to charge
+ * @memcg: memcg to charge the page to
+ * @lrucare: page might be on LRU already
+ *
+ * Finalize a charge transaction started by mem_cgroup_try_charge(),
+ * after page->mapping has been set up. This must happen atomically
+ * as part of the page instantiation, i.e. under the page table lock
+ * for anonymous pages, under the page lock for page and swap cache.
+ *
+ * In addition, the page must not be on the LRU during the commit, to
+ * prevent racing with task migration. If it might be, use @lrucare.
+ *
+ * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
+ */
+void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
+ bool lrucare)
+{
+ unsigned int nr_pages = 1;
+
+ VM_BUG_ON_PAGE(!page->mapping, page);
+ VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
+
+ if (mem_cgroup_disabled())
+ return;
+ /*
+ * Swap faults will attempt to charge the same page multiple
+ * times. But reuse_swap_page() might have removed the page
+ * from swapcache already, so we can't check PageSwapCache().
+ */
+ if (!memcg)
+ return;
+
+ commit_charge(page, memcg, lrucare);
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ }
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ memcg_check_events(memcg, page);
+ local_irq_enable();
+
+ if (do_swap_account && PageSwapCache(page)) {
+ swp_entry_t entry = { .val = page_private(page) };
+ /*
+ * The swap entry might not get freed for a long time,
+ * let's not wait for it. The page already received a
+ * memory+swap charge, drop the swap entry duplicate.
+ */
+ mem_cgroup_uncharge_swap(entry);
+ }
+}
+
+/**
+ * mem_cgroup_cancel_charge - cancel a page charge
+ * @page: page whose charge to cancel
+ * @memcg: memcg returned by mem_cgroup_try_charge()
+ *
+ * Cancel a charge transaction started by mem_cgroup_try_charge().
+ */
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+{
+ unsigned int nr_pages = 1;
+
+ if (mem_cgroup_disabled())
+ return;
+ /*
+ * Swap faults will attempt to charge the same page multiple
+ * times. But reuse_swap_page() might have removed the page
+ * from swapcache already, so we can't check PageSwapCache().
+ */
+ if (!memcg)
+ return;
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ }
+
+ cancel_charge(memcg, nr_pages);
+}
+
+static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+ unsigned long nr_mem, unsigned long nr_memsw,
+ unsigned long nr_anon, unsigned long nr_file,
+ unsigned long nr_huge, struct page *dummy_page)
+{
+ unsigned long flags;
+
+ if (nr_mem)
+ res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);
+ if (nr_memsw)
+ res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);
+
+ memcg_oom_recover(memcg);
+
+ local_irq_save(flags);
+ __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
+ __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
+ __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
+ __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+ __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
+ memcg_check_events(memcg, dummy_page);
+ local_irq_restore(flags);
+}
+
+static void uncharge_list(struct list_head *page_list)
+{
+ struct mem_cgroup *memcg = NULL;
+ unsigned long nr_memsw = 0;
+ unsigned long nr_anon = 0;
+ unsigned long nr_file = 0;
+ unsigned long nr_huge = 0;
+ unsigned long pgpgout = 0;
+ unsigned long nr_mem = 0;
+ struct list_head *next;
+ struct page *page;
+
+ next = page_list->next;
+ do {
+ unsigned int nr_pages = 1;
+ struct page_cgroup *pc;
+
+ page = list_entry(next, struct page, lru);
+ next = page->lru.next;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
+ pc = lookup_page_cgroup(page);
+ if (!PageCgroupUsed(pc))
+ continue;
+
+ /*
+ * Nobody should be changing or seriously looking at
+ * pc->mem_cgroup and pc->flags at this point, we have
+ * fully exclusive access to the page.
+ */
+
+ if (memcg != pc->mem_cgroup) {
+ if (memcg) {
+ uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
+ nr_anon, nr_file, nr_huge, page);
+ pgpgout = nr_mem = nr_memsw = 0;
+ nr_anon = nr_file = nr_huge = 0;
+ }
+ memcg = pc->mem_cgroup;
+ }
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ nr_huge += nr_pages;
+ }
+
+ if (PageAnon(page))
+ nr_anon += nr_pages;
+ else
+ nr_file += nr_pages;
+
+ if (pc->flags & PCG_MEM)
+ nr_mem += nr_pages;
+ if (pc->flags & PCG_MEMSW)
+ nr_memsw += nr_pages;
+ pc->flags = 0;
+
+ pgpgout++;
+ } while (next != page_list);
+
+ if (memcg)
+ uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
+ nr_anon, nr_file, nr_huge, page);
+}
+
+/**
+ * mem_cgroup_uncharge - uncharge a page
+ * @page: page to uncharge
+ *
+ * Uncharge a page previously charged with mem_cgroup_try_charge() and
+ * mem_cgroup_commit_charge().
+ */
+void mem_cgroup_uncharge(struct page *page)
+{
+ struct page_cgroup *pc;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /* Don't touch page->lru of any random page, pre-check: */
+ pc = lookup_page_cgroup(page);
+ if (!PageCgroupUsed(pc))
+ return;
+
+ INIT_LIST_HEAD(&page->lru);
+ uncharge_list(&page->lru);
+}
+
+/**
+ * mem_cgroup_uncharge_list - uncharge a list of pages
+ * @page_list: list of pages to uncharge
+ *
+ * Uncharge a list of pages previously charged with
+ * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
+ */
+void mem_cgroup_uncharge_list(struct list_head *page_list)
+{
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!list_empty(page_list))
+ uncharge_list(page_list);
+}
+
+/**
+ * mem_cgroup_migrate - migrate a charge to another page
+ * @oldpage: currently charged page
+ * @newpage: page to transfer the charge to
+ * @lrucare: both pages might be on the LRU already
+ *
+ * Migrate the charge from @oldpage to @newpage.
+ *
+ * Both pages must be locked, @newpage->mapping must be set up.
+ */
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
+ bool lrucare)
+{
+ struct page_cgroup *pc;
+ int isolated;
+
+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
+ VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
+ VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
+ newpage);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /* Page cache replacement: new page already charged? */
+ pc = lookup_page_cgroup(newpage);
+ if (PageCgroupUsed(pc))
+ return;
+
+ /* Re-entrant migration: old page already uncharged? */
+ pc = lookup_page_cgroup(oldpage);
+ if (!PageCgroupUsed(pc))
+ return;
+
+ VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
+ VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
+
+ if (lrucare)
+ lock_page_lru(oldpage, &isolated);
+
+ pc->flags = 0;
+
+ if (lrucare)
+ unlock_page_lru(oldpage, isolated);
+
+ commit_charge(newpage, pc->mem_cgroup, lrucare);
+}
+
/*
* subsys_initcall() for memory controller.
*
diff --git a/mm/memory.c b/mm/memory.c
index 5c55270729f7..ab3537bcfed2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1292,7 +1292,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
details = NULL;
BUG_ON(addr >= end);
- mem_cgroup_uncharge_start();
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
@@ -1302,7 +1301,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
next = zap_pud_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
- mem_cgroup_uncharge_end();
}
@@ -2049,6 +2047,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *dirty_page = NULL;
unsigned long mmun_start = 0; /* For mmu_notifiers */
unsigned long mmun_end = 0; /* For mmu_notifiers */
+ struct mem_cgroup *memcg;
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
@@ -2204,7 +2203,7 @@ gotten:
}
__SetPageUptodate(new_page);
- if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
goto oom_free_new;
mmun_start = address & PAGE_MASK;
@@ -2234,6 +2233,8 @@ gotten:
*/
ptep_clear_flush(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
+ mem_cgroup_commit_charge(new_page, memcg, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -2271,7 +2272,7 @@ gotten:
new_page = old_page;
ret |= VM_FAULT_WRITE;
} else
- mem_cgroup_uncharge_page(new_page);
+ mem_cgroup_cancel_charge(new_page, memcg);
if (new_page)
page_cache_release(new_page);
@@ -2410,10 +2411,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
spinlock_t *ptl;
struct page *page, *swapcache;
+ struct mem_cgroup *memcg;
swp_entry_t entry;
pte_t pte;
int locked;
- struct mem_cgroup *ptr;
int exclusive = 0;
int ret = 0;
@@ -2489,7 +2490,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_page;
}
- if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -2514,10 +2515,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
* while the page is counted on swap but not yet in mapcount i.e.
* before page_add_anon_rmap() and swap_free(); try_to_free_swap()
* must be called after the swap_free(), or it will never succeed.
- * Because delete_from_swap_page() may be called by reuse_swap_page(),
- * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
- * in page->private. In this case, a record in swap_cgroup is silently
- * discarded at swap_free().
*/
inc_mm_counter_fast(mm, MM_ANONPAGES);
@@ -2533,12 +2530,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
set_pte_at(mm, address, page_table, pte);
- if (page == swapcache)
+ if (page == swapcache) {
do_page_add_anon_rmap(page, vma, address, exclusive);
- else /* ksm created a completely new copy */
+ mem_cgroup_commit_charge(page, memcg, true);
+ } else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, address);
- /* It's better to call commit-charge after rmap is established */
- mem_cgroup_commit_charge_swapin(page, ptr);
+ mem_cgroup_commit_charge(page, memcg, false);
+ lru_cache_add_active_or_unevictable(page, vma);
+ }
swap_free(entry);
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
@@ -2571,7 +2570,7 @@ unlock:
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge_swapin(ptr);
+ mem_cgroup_cancel_charge(page, memcg);
pte_unmap_unlock(page_table, ptl);
out_page:
unlock_page(page);
@@ -2627,6 +2626,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags)
{
+ struct mem_cgroup *memcg;
struct page *page;
spinlock_t *ptl;
pte_t entry;
@@ -2660,7 +2660,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
__SetPageUptodate(page);
- if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
goto oom_free_page;
entry = mk_pte(page, vma->vm_page_prot);
@@ -2673,6 +2673,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
+ mem_cgroup_commit_charge(page, memcg, false);
+ lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(mm, address, page_table, entry);
@@ -2682,7 +2684,7 @@ unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
release:
- mem_cgroup_uncharge_page(page);
+ mem_cgroup_cancel_charge(page, memcg);
page_cache_release(page);
goto unlock;
oom_free_page:
@@ -2919,6 +2921,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
struct page *fault_page, *new_page;
+ struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
int ret;
@@ -2930,7 +2933,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new_page)
return VM_FAULT_OOM;
- if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) {
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
page_cache_release(new_page);
return VM_FAULT_OOM;
}
@@ -2950,12 +2953,14 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto uncharge_out;
}
do_set_pte(vma, address, new_page, pte, true, true);
+ mem_cgroup_commit_charge(new_page, memcg, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
page_cache_release(fault_page);
return ret;
uncharge_out:
- mem_cgroup_uncharge_page(new_page);
+ mem_cgroup_cancel_charge(new_page, memcg);
page_cache_release(new_page);
return ret;
}
@@ -3425,44 +3430,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-#if !defined(__HAVE_ARCH_GATE_AREA)
-
-#if defined(AT_SYSINFO_EHDR)
-static struct vm_area_struct gate_vma;
-
-static int __init gate_vma_init(void)
-{
- gate_vma.vm_mm = NULL;
- gate_vma.vm_start = FIXADDR_USER_START;
- gate_vma.vm_end = FIXADDR_USER_END;
- gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
- gate_vma.vm_page_prot = __P101;
-
- return 0;
-}
-__initcall(gate_vma_init);
-#endif
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
-#ifdef AT_SYSINFO_EHDR
- return &gate_vma;
-#else
- return NULL;
-#endif
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
-#ifdef AT_SYSINFO_EHDR
- if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
- return 1;
-#endif
- return 0;
-}
-
-#endif /* __HAVE_ARCH_GATE_AREA */
-
static int __follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
diff --git a/mm/migrate.c b/mm/migrate.c
index be6dbf995c0c..f78ec9bd454d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -780,6 +780,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
if (rc != MIGRATEPAGE_SUCCESS) {
newpage->mapping = NULL;
} else {
+ mem_cgroup_migrate(page, newpage, false);
if (remap_swapcache)
remove_migration_ptes(page, newpage);
page->mapping = NULL;
@@ -795,7 +796,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
{
int rc = -EAGAIN;
int remap_swapcache = 1;
- struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;
if (!trylock_page(page)) {
@@ -821,9 +821,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
lock_page(page);
}
- /* charge against new page */
- mem_cgroup_prepare_migration(page, newpage, &mem);
-
if (PageWriteback(page)) {
/*
* Only in the case of a full synchronous migration is it
@@ -833,10 +830,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
*/
if (mode != MIGRATE_SYNC) {
rc = -EBUSY;
- goto uncharge;
+ goto out_unlock;
}
if (!force)
- goto uncharge;
+ goto out_unlock;
wait_on_page_writeback(page);
}
/*
@@ -872,7 +869,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
*/
remap_swapcache = 0;
} else {
- goto uncharge;
+ goto out_unlock;
}
}
@@ -885,7 +882,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* the page migration right away (proteced by page lock).
*/
rc = balloon_page_migrate(newpage, page, mode);
- goto uncharge;
+ goto out_unlock;
}
/*
@@ -904,7 +901,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
VM_BUG_ON_PAGE(PageAnon(page), page);
if (page_has_private(page)) {
try_to_free_buffers(page);
- goto uncharge;
+ goto out_unlock;
}
goto skip_unmap;
}
@@ -923,10 +920,7 @@ skip_unmap:
if (anon_vma)
put_anon_vma(anon_vma);
-uncharge:
- mem_cgroup_end_migration(mem, page, newpage,
- (rc == MIGRATEPAGE_SUCCESS ||
- rc == MIGRATEPAGE_BALLOON_SUCCESS));
+out_unlock:
unlock_page(page);
out:
return rc;
@@ -1786,7 +1780,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
- struct mem_cgroup *memcg = NULL;
int page_lru = page_is_file_cache(page);
unsigned long mmun_start = address & HPAGE_PMD_MASK;
unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
@@ -1852,15 +1845,6 @@ fail_putback:
goto out_unlock;
}
- /*
- * Traditional migration needs to prepare the memcg charge
- * transaction early to prevent the old page from being
- * uncharged when installing migration entries. Here we can
- * save the potential rollback and start the charge transfer
- * only when migration is already known to end successfully.
- */
- mem_cgroup_prepare_migration(page, new_page, &memcg);
-
orig_entry = *pmd;
entry = mk_pmd(new_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
@@ -1888,14 +1872,10 @@ fail_putback:
goto fail_putback;
}
+ mem_cgroup_migrate(page, new_page, false);
+
page_remove_rmap(page);
- /*
- * Finish the charge transaction under the page table lock to
- * prevent split_huge_page() from dividing up the charge
- * before it's fully transferred to the new page.
- */
- mem_cgroup_end_migration(memcg, page, new_page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mmap.c b/mm/mmap.c
index 64c9d736155c..c1f2ea4a0b99 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -221,7 +221,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
if (vma->vm_flags & VM_DENYWRITE)
atomic_inc(&file_inode(file)->i_writecount);
if (vma->vm_flags & VM_SHARED)
- mapping->i_mmap_writable--;
+ mapping_unmap_writable(mapping);
flush_dcache_mmap_lock(mapping);
if (unlikely(vma->vm_flags & VM_NONLINEAR))
@@ -622,7 +622,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
if (vma->vm_flags & VM_DENYWRITE)
atomic_dec(&file_inode(file)->i_writecount);
if (vma->vm_flags & VM_SHARED)
- mapping->i_mmap_writable++;
+ atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
if (unlikely(vma->vm_flags & VM_NONLINEAR))
@@ -1577,6 +1577,17 @@ munmap_back:
if (error)
goto free_vma;
}
+ if (vm_flags & VM_SHARED) {
+ error = mapping_map_writable(file->f_mapping);
+ if (error)
+ goto allow_write_and_free_vma;
+ }
+
+ /* ->mmap() can change vma->vm_file, but must guarantee that
+ * vma_link() below can deny write-access if VM_DENYWRITE is set
+ * and map writably if VM_SHARED is set. This usually means the
+ * new file must not have been exposed to user-space, yet.
+ */
vma->vm_file = get_file(file);
error = file->f_op->mmap(file, vma);
if (error)
@@ -1616,8 +1627,12 @@ munmap_back:
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
- if (vm_flags & VM_DENYWRITE)
- allow_write_access(file);
+ if (file) {
+ if (vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
+ }
file = vma->vm_file;
out:
perf_event_mmap(vma);
@@ -1646,14 +1661,17 @@ out:
return addr;
unmap_and_free_vma:
- if (vm_flags & VM_DENYWRITE)
- allow_write_access(file);
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
+ if (vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+allow_write_and_free_vma:
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
diff --git a/mm/nommu.c b/mm/nommu.c
index 4a852f6c5709..a881d9673c6b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1981,11 +1981,6 @@ error:
return -ENOMEM;
}
-int in_gate_area_no_mm(unsigned long addr)
-{
- return 0;
-}
-
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
BUG();
diff --git a/mm/rmap.c b/mm/rmap.c
index 22a4a7699cdb..3e8491c504f8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1032,25 +1032,6 @@ void page_add_new_anon_rmap(struct page *page,
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
hpage_nr_pages(page));
__page_set_anon_rmap(page, vma, address, 1);
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
- if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
- SetPageActive(page);
- lru_cache_add(page);
- return;
- }
-
- if (!TestSetPageMlocked(page)) {
- /*
- * We use the irq-unsafe __mod_zone_page_stat because this
- * counter is not modified from interrupt context, and the pte
- * lock is held(spinlock), which implies preemption disabled.
- */
- __mod_zone_page_state(page_zone(page), NR_MLOCK,
- hpage_nr_pages(page));
- count_vm_event(UNEVICTABLE_PGMLOCKED);
- }
- add_page_to_unevictable_list(page);
}
/**
@@ -1108,7 +1089,6 @@ void page_remove_rmap(struct page *page)
if (unlikely(PageHuge(page)))
goto out;
if (anon) {
- mem_cgroup_uncharge_page(page);
if (PageTransHuge(page))
__dec_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
diff --git a/mm/shmem.c b/mm/shmem.c
index 302d1cf7ad07..a42add14331c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -66,6 +66,9 @@ static struct vfsmount *shm_mnt;
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <uapi/linux/memfd.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -419,7 +422,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
pvec.pages, indices);
if (!pvec.nr)
break;
- mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -447,7 +449,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
- mem_cgroup_uncharge_end();
cond_resched();
index++;
}
@@ -495,7 +496,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
index = start;
continue;
}
- mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -531,7 +531,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
- mem_cgroup_uncharge_end();
index++;
}
@@ -551,6 +550,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
+ struct shmem_inode_info *info = SHMEM_I(inode);
int error;
error = inode_change_ok(inode, attr);
@@ -561,6 +561,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
+ /* protected by i_mutex */
+ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
+ (newsize > oldsize && (info->seals & F_SEAL_GROW)))
+ return -EPERM;
+
if (newsize != oldsize) {
error = shmem_reacct_size(SHMEM_I(inode)->flags,
oldsize, newsize);
@@ -621,7 +626,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
radswap = swp_to_radix_entry(swap);
index = radix_tree_locate_item(&mapping->page_tree, radswap);
if (index == -1)
- return 0;
+ return -EAGAIN; /* tell shmem_unuse we found nothing */
/*
* Move _head_ to start search for next from here.
@@ -680,7 +685,6 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
spin_unlock(&info->lock);
swap_free(swap);
}
- error = 1; /* not an error, but entry was found */
}
return error;
}
@@ -692,7 +696,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
{
struct list_head *this, *next;
struct shmem_inode_info *info;
- int found = 0;
+ struct mem_cgroup *memcg;
int error = 0;
/*
@@ -707,26 +711,32 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
+ error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
+ error = -EAGAIN;
mutex_lock(&shmem_swaplist_mutex);
list_for_each_safe(this, next, &shmem_swaplist) {
info = list_entry(this, struct shmem_inode_info, swaplist);
if (info->swapped)
- found = shmem_unuse_inode(info, swap, &page);
+ error = shmem_unuse_inode(info, swap, &page);
else
list_del_init(&info->swaplist);
cond_resched();
- if (found)
+ if (error != -EAGAIN)
break;
+ /* found nothing in this: move on to search the next */
}
mutex_unlock(&shmem_swaplist_mutex);
- if (found < 0)
- error = found;
+ if (error) {
+ if (error != -ENOMEM)
+ error = 0;
+ mem_cgroup_cancel_charge(page, memcg);
+ } else
+ mem_cgroup_commit_charge(page, memcg, true);
out:
unlock_page(page);
page_cache_release(page);
@@ -830,7 +840,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
}
mutex_unlock(&shmem_swaplist_mutex);
- swapcache_free(swap, NULL);
+ swapcache_free(swap);
redirty:
set_page_dirty(page);
if (wbc->for_reclaim)
@@ -1003,7 +1013,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
oldpage = newpage;
} else {
- mem_cgroup_replace_page_cache(oldpage, newpage);
+ mem_cgroup_migrate(oldpage, newpage, false);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
@@ -1030,6 +1040,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo;
+ struct mem_cgroup *memcg;
struct page *page;
swp_entry_t swap;
int error;
@@ -1108,8 +1119,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_charge_file(page, current->mm,
- gfp & GFP_RECLAIM_MASK);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
swp_to_radix_entry(swap));
@@ -1125,12 +1135,16 @@ repeat:
* Reset swap.val? No, leave it so "failed" goes back to
* "repeat": reading a hole and writing should succeed.
*/
- if (error)
+ if (error) {
+ mem_cgroup_cancel_charge(page, memcg);
delete_from_swap_cache(page);
+ }
}
if (error)
goto failed;
+ mem_cgroup_commit_charge(page, memcg, true);
+
spin_lock(&info->lock);
info->swapped--;
shmem_recalc_inode(inode);
@@ -1168,8 +1182,7 @@ repeat:
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_charge_file(page, current->mm,
- gfp & GFP_RECLAIM_MASK);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
if (error)
goto decused;
error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1179,9 +1192,10 @@ repeat:
radix_tree_preload_end();
}
if (error) {
- mem_cgroup_uncharge_cache_page(page);
+ mem_cgroup_cancel_charge(page, memcg);
goto decused;
}
+ mem_cgroup_commit_charge(page, memcg, false);
lru_cache_add_anon(page);
spin_lock(&info->lock);
@@ -1407,6 +1421,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
+ info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->swaplist);
simple_xattrs_init(&info->xattrs);
@@ -1465,7 +1480,17 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
+ struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+
+ /* i_mutex is held by caller */
+ if (unlikely(info->seals)) {
+ if (info->seals & F_SEAL_WRITE)
+ return -EPERM;
+ if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
+ return -EPERM;
+ }
+
return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}
@@ -1803,11 +1828,233 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}
+/*
+ * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * so reuse a tag which we firmly believe is never set or cleared on shmem.
+ */
+#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE
+#define LAST_SCAN 4 /* about 150ms max */
+/*
+ * shmem_tag_pins - tag the pages of @mapping that hold extra references.
+ *
+ * Walks every slot of mapping->page_tree under rcu_read_lock() and sets
+ * SHMEM_TAG_PINNED on each page whose page_count() exceeds its
+ * page_mapcount() by more than one.  Each tag is set under
+ * mapping->tree_lock.  lru_add_drain() runs first, presumably so that
+ * references held by this CPU's pagevecs are not counted as pins.
+ */
+static void shmem_tag_pins(struct address_space *mapping)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ pgoff_t start;
+ struct page *page;
+
+ lru_add_drain();
+ start = 0;
+ rcu_read_lock();
+
+restart:
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ page = radix_tree_deref_slot(slot);
+ if (!page || radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto restart; /* rescan from start */
+ } else if (page_count(page) - page_mapcount(page) > 1) {
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_tag_set(&mapping->page_tree, iter.index,
+ SHMEM_TAG_PINNED);
+ spin_unlock_irq(&mapping->tree_lock);
+ }
+
+ if (need_resched()) {
+ cond_resched_rcu();
+ start = iter.index + 1; /* resume after the current index */
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
+ * via get_user_pages(), drivers might have some pending I/O without any active
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * and see whether they have an elevated ref-count. If so, we tag them and wait
+ * for them to be dropped.
+ * The caller must guarantee that no new user will acquire writable references
+ * to those pages to avoid races.
+ *
+ * After shmem_tag_pins() has tagged the candidates, up to LAST_SCAN + 1 passes
+ * are made over the tagged entries, with a killable sleep of
+ * (HZ << scan) / 200 jiffies before every pass but the first. An entry whose
+ * page_count() exceeds its page_mapcount() by exactly one has its tag cleared;
+ * other pages stay tagged until the final pass, which clears every remaining
+ * tag.
+ *
+ * Returns 0 if no tagged page was still pinned on the final pass, -EBUSY
+ * otherwise.
+ */
+static int shmem_wait_for_pins(struct address_space *mapping)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ pgoff_t start;
+ struct page *page;
+ int error, scan;
+
+ shmem_tag_pins(mapping);
+
+ error = 0;
+ for (scan = 0; scan <= LAST_SCAN; scan++) {
+ if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
+ break; /* nothing left tagged: done */
+
+ if (!scan)
+ lru_add_drain_all();
+ else if (schedule_timeout_killable((HZ << scan) / 200))
+ scan = LAST_SCAN; /* nonzero return: make this the final pass */
+
+ start = 0;
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
+ start, SHMEM_TAG_PINNED) {
+
+ page = radix_tree_deref_slot(slot);
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto restart;
+
+ page = NULL; /* not a page: just drop the tag below */
+ }
+
+ if (page &&
+ page_count(page) - page_mapcount(page) != 1) {
+ if (scan < LAST_SCAN)
+ goto continue_resched; /* keep the tag for a later pass */
+
+ /*
+ * On the last scan, we clean up all those tags
+ * we inserted; but make a note that we still
+ * found pages pinned.
+ */
+ error = -EBUSY;
+ }
+
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_tag_clear(&mapping->page_tree,
+ iter.index, SHMEM_TAG_PINNED);
+ spin_unlock_irq(&mapping->tree_lock);
+continue_resched:
+ if (need_resched()) {
+ cond_resched_rcu();
+ start = iter.index + 1; /* resume after the current index */
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ return error;
+}
+
+#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_SHRINK | \
+ F_SEAL_GROW | \
+ F_SEAL_WRITE)
+/*
+ * shmem_add_seals - add the seals in @seals to the inode behind @file.
+ *
+ * Returns 0 on success; -EINVAL if @file does not use
+ * shmem_file_operations or @seals has bits outside F_ALL_SEALS; -EPERM if
+ * @file lacks FMODE_WRITE or F_SEAL_SEAL is already set on the inode; or
+ * the error from mapping_deny_writable() / shmem_wait_for_pins() when
+ * F_SEAL_WRITE is requested and not yet set.  info->seals is read and
+ * updated under inode->i_mutex.
+ */
+int shmem_add_seals(struct file *file, unsigned int seals)
+{
+ struct inode *inode = file_inode(file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ int error;
+
+ /*
+ * SEALING
+ * Sealing allows multiple parties to share a shmem-file but restrict
+ * access to a specific subset of file operations. Seals can only be
+ * added, but never removed. This way, mutually untrusted parties can
+ * share common memory regions with a well-defined policy. A malicious
+ * peer can thus never perform unwanted operations on a shared object.
+ *
+ * Seals are only supported on special shmem-files and always affect
+ * the whole underlying inode. Once a seal is set, it may prevent some
+ * kinds of access to the file. Currently, the following seals are
+ * defined:
+ * SEAL_SEAL: Prevent further seals from being set on this file
+ * SEAL_SHRINK: Prevent the file from shrinking
+ * SEAL_GROW: Prevent the file from growing
+ * SEAL_WRITE: Prevent write access to the file
+ *
+ * As we don't require any trust relationship between two parties, we
+ * must prevent seals from being removed. Therefore, sealing a file
+ * only adds a given set of seals to the file, it never touches
+ * existing seals. Furthermore, the "setting seals"-operation can be
+ * sealed itself, which basically prevents any further seal from being
+ * added.
+ *
+ * Semantics of sealing are only defined on volatile files. Only
+ * anonymous shmem files support sealing. More importantly, seals are
+ * never written to disk. Therefore, there's no plan to support it on
+ * other file types.
+ */
+
+ if (file->f_op != &shmem_file_operations)
+ return -EINVAL;
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EPERM;
+ if (seals & ~(unsigned int)F_ALL_SEALS)
+ return -EINVAL;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (info->seals & F_SEAL_SEAL) {
+ error = -EPERM;
+ goto unlock;
+ }
+
+ if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) {
+ error = mapping_deny_writable(file->f_mapping);
+ if (error)
+ goto unlock;
+
+ error = shmem_wait_for_pins(file->f_mapping);
+ if (error) {
+ mapping_allow_writable(file->f_mapping); /* pins remain: revert the deny above */
+ goto unlock;
+ }
+ }
+
+ info->seals |= seals; /* seals are only ever OR-ed in, never cleared here */
+ error = 0;
+
+unlock:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+EXPORT_SYMBOL_GPL(shmem_add_seals);
+/*
+ * shmem_get_seals - return the seals currently set on @file's inode, or
+ * -EINVAL if @file does not use shmem_file_operations.  The value is read
+ * without taking inode->i_mutex.
+ */
+int shmem_get_seals(struct file *file)
+{
+ if (file->f_op != &shmem_file_operations)
+ return -EINVAL;
+
+ return SHMEM_I(file_inode(file))->seals;
+}
+EXPORT_SYMBOL_GPL(shmem_get_seals);
+/*
+ * shmem_fcntl - dispatch the sealing fcntl commands for @file.
+ *
+ * F_ADD_SEALS hands @arg to shmem_add_seals() after rejecting values above
+ * UINT_MAX with -EINVAL; F_GET_SEALS returns shmem_get_seals(); any other
+ * @cmd yields -EINVAL.
+ */
+long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ long error;
+
+ switch (cmd) {
+ case F_ADD_SEALS:
+ /* disallow upper 32bit */
+ if (arg > UINT_MAX)
+ return -EINVAL;
+
+ error = shmem_add_seals(file, arg);
+ break;
+ case F_GET_SEALS:
+ error = shmem_get_seals(file);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
struct inode *inode = file_inode(file);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_falloc shmem_falloc;
pgoff_t start, index, end;
int error;
@@ -1823,6 +2070,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
+ /* protected by i_mutex */
+ if (info->seals & F_SEAL_WRITE) {
+ error = -EPERM;
+ goto out;
+ }
+
shmem_falloc.waitq = &shmem_falloc_waitq;
shmem_falloc.start = unmap_start >> PAGE_SHIFT;
shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
@@ -1849,6 +2102,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
if (error)
goto out;
+ if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
+ error = -EPERM;
+ goto out;
+ }
+
start = offset >> PAGE_CACHE_SHIFT;
end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
/* Try to avoid a swapstorm if len is impossible to satisfy */
@@ -2584,6 +2842,77 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
shmem_show_mpol(seq, sbinfo->mpol);
return 0;
}
+
+#define MFD_NAME_PREFIX "memfd:"
+#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
+#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
+
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
+/*
+ * memfd_create - create a shmem file named "memfd:<uname>" and install it
+ * in a new file descriptor.
+ *
+ * @uname: user-space pointer to the name; its length including the
+ *         terminating zero must not exceed MFD_NAME_MAX_LEN + 1.
+ * @flags: MFD_CLOEXEC and/or MFD_ALLOW_SEALING.
+ *
+ * F_SEAL_SEAL is cleared from the new inode only when MFD_ALLOW_SEALING is
+ * passed.
+ *
+ * Returns the new descriptor, or -EINVAL (unknown flag, name too long),
+ * -EFAULT (name unreadable or changed while being copied), -ENOMEM, or the
+ * error from get_unused_fd_flags() / shmem_file_setup().
+ */
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ struct shmem_inode_info *info;
+ struct file *file;
+ int fd, error;
+ char *name;
+ long len;
+
+ if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ return -EINVAL;
+
+ /* length includes terminating zero */
+ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
+ if (len <= 0)
+ return -EFAULT;
+ if (len > MFD_NAME_MAX_LEN + 1)
+ return -EINVAL;
+
+ name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); /* prefix + name + terminating zero */
+ if (!name)
+ return -ENOMEM;
+
+ strcpy(name, MFD_NAME_PREFIX);
+ if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ /* terminating-zero may have changed after strnlen_user() returned */
+ if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+ if (fd < 0) {
+ error = fd;
+ goto err_name;
+ }
+
+ file = shmem_file_setup(name, 0, VM_NORESERVE);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_fd;
+ }
+ info = SHMEM_I(file_inode(file));
+ file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+ file->f_flags |= O_RDWR | O_LARGEFILE;
+ if (flags & MFD_ALLOW_SEALING)
+ info->seals &= ~F_SEAL_SEAL;
+
+ fd_install(fd, file);
+ kfree(name); /* name is freed on the success path as well */
+ return fd;
+
+err_fd:
+ put_unused_fd(fd);
+err_name:
+ kfree(name);
+ return error;
+}
+
#endif /* CONFIG_TMPFS */
static void shmem_put_super(struct super_block *sb)
diff --git a/mm/slab.c b/mm/slab.c
index 2e60bf3dedbb..a467b308c682 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -470,6 +470,8 @@ static struct kmem_cache kmem_cache_boot = {
.name = "kmem_cache",
};
+#define BAD_ALIEN_MAGIC 0x01020304ul
+
static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -836,7 +838,7 @@ static int transfer_objects(struct array_cache *to,
static inline struct alien_cache **alloc_alien_cache(int node,
int limit, gfp_t gfp)
{
- return NULL;
+ return (struct alien_cache **)BAD_ALIEN_MAGIC;
}
static inline void free_alien_cache(struct alien_cache **ac_ptr)
diff --git a/mm/swap.c b/mm/swap.c
index c789d01c9ec3..6b2dc3897cd5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -62,6 +62,7 @@ static void __page_cache_release(struct page *page)
del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
+ mem_cgroup_uncharge(page);
}
static void __put_single_page(struct page *page)
@@ -687,6 +688,40 @@ void add_page_to_unevictable_list(struct page *page)
spin_unlock_irq(&zone->lru_lock);
}
+/**
+ * lru_cache_add_active_or_unevictable - add a page to the active or
+ * unevictable LRU list
+ * @page: the page to be added to LRU
+ * @vma: vma in which page is mapped for determining reclaimability
+ *
+ * Place @page on the active or unevictable LRU list, depending on its
+ * evictability. The page is treated as unevictable when @vma has VM_LOCKED
+ * set and none of the VM_SPECIAL bits; otherwise it is marked active and
+ * handed to lru_cache_add(). Note that if the page is not evictable, it
+ * goes directly back onto its zone's unevictable list; it does NOT use a
+ * per cpu pagevec. On that path the page is also marked Mlocked, and
+ * NR_MLOCK and UNEVICTABLE_PGMLOCKED are accounted if it was not already.
+ */
+void lru_cache_add_active_or_unevictable(struct page *page,
+ struct vm_area_struct *vma)
+{
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+ SetPageActive(page);
+ lru_cache_add(page);
+ return;
+ }
+
+ if (!TestSetPageMlocked(page)) {
+ /*
+ * We use the irq-unsafe __mod_zone_page_state because this
+ * counter is not modified from interrupt context, and the pte
+ * lock is held (spinlock), which implies preemption disabled.
+ */
+ __mod_zone_page_state(page_zone(page), NR_MLOCK,
+ hpage_nr_pages(page));
+ count_vm_event(UNEVICTABLE_PGMLOCKED);
+ }
+ add_page_to_unevictable_list(page);
+}
+
/*
* If the page can not be invalidated, it is moved to the
* inactive list to speed up its reclaim. It is moved to the
@@ -913,6 +948,7 @@ void release_pages(struct page **pages, int nr, bool cold)
if (zone)
spin_unlock_irqrestore(&zone->lru_lock, flags);
+ mem_cgroup_uncharge_list(&pages_to_free);
free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2972eee184a4..3e0ec83d000c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,6 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
struct address_space swapper_spaces[MAX_SWAPFILES] = {
[0 ... MAX_SWAPFILES - 1] = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+ .i_mmap_writable = ATOMIC_INIT(0),
.a_ops = &swap_aops,
.backing_dev_info = &swap_backing_dev_info,
}
@@ -176,7 +177,7 @@ int add_to_swap(struct page *page, struct list_head *list)
if (unlikely(PageTransHuge(page)))
if (unlikely(split_huge_page_to_list(page, list))) {
- swapcache_free(entry, NULL);
+ swapcache_free(entry);
return 0;
}
@@ -202,7 +203,7 @@ int add_to_swap(struct page *page, struct list_head *list)
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
*/
- swapcache_free(entry, NULL);
+ swapcache_free(entry);
return 0;
}
}
@@ -225,7 +226,7 @@ void delete_from_swap_cache(struct page *page)
__delete_from_swap_cache(page);
spin_unlock_irq(&address_space->tree_lock);
- swapcache_free(entry, page);
+ swapcache_free(entry);
page_cache_release(page);
}
@@ -386,7 +387,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
*/
- swapcache_free(entry, NULL);
+ swapcache_free(entry);
} while (err != -ENOMEM);
if (new_page)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4c524f7bd0bf..8798b2e0ac59 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -843,16 +843,13 @@ void swap_free(swp_entry_t entry)
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
-void swapcache_free(swp_entry_t entry, struct page *page)
+void swapcache_free(swp_entry_t entry)
{
struct swap_info_struct *p;
- unsigned char count;
p = swap_info_get(entry);
if (p) {
- count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
- if (page)
- mem_cgroup_uncharge_swapcache(page, entry, count != 0);
+ swap_entry_free(p, entry, SWAP_HAS_CACHE);
spin_unlock(&p->lock);
}
}
@@ -1106,15 +1103,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
if (unlikely(!page))
return -ENOMEM;
- if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
- GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
ret = -ENOMEM;
goto out_nolock;
}
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
- mem_cgroup_cancel_charge_swapin(memcg);
+ mem_cgroup_cancel_charge(page, memcg);
ret = 0;
goto out;
}
@@ -1124,11 +1120,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
- if (page == swapcache)
+ if (page == swapcache) {
page_add_anon_rmap(page, vma, addr);
- else /* ksm created a completely new copy */
+ mem_cgroup_commit_charge(page, memcg, true);
+ } else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge_swapin(page, memcg);
+ mem_cgroup_commit_charge(page, memcg, false);
+ lru_cache_add_active_or_unevictable(page, vma);
+ }
swap_free(entry);
/*
* Move the page to the active list so it is not
diff --git a/mm/truncate.c b/mm/truncate.c
index eda247307164..96d167372d89 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -281,7 +281,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
indices)) {
- mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -307,7 +306,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
- mem_cgroup_uncharge_end();
cond_resched();
index++;
}
@@ -369,7 +367,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
pagevec_release(&pvec);
break;
}
- mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -394,7 +391,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
- mem_cgroup_uncharge_end();
index++;
}
cleancache_invalidate_inode(mapping);
@@ -493,7 +489,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
indices)) {
- mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -522,7 +517,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
- mem_cgroup_uncharge_end();
cond_resched();
index++;
}
@@ -553,7 +547,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
BUG_ON(page_has_private(page));
__delete_from_page_cache(page, NULL);
spin_unlock_irq(&mapping->tree_lock);
- mem_cgroup_uncharge_cache_page(page);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -602,7 +595,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
indices)) {
- mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -655,7 +647,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
- mem_cgroup_uncharge_end();
cond_resched();
index++;
}
diff --git a/mm/util.c b/mm/util.c
index 7b6608df2ee8..093c973f1697 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -183,17 +183,14 @@ pid_t vm_is_stack(struct task_struct *task,
if (in_group) {
struct task_struct *t;
- rcu_read_lock();
- if (!pid_alive(task))
- goto done;
- t = task;
- do {
+ rcu_read_lock();
+ for_each_thread(task, t) {
if (vm_is_stack_for_task(t, vma)) {
ret = t->pid;
goto done;
}
- } while_each_thread(task, t);
+ }
done:
rcu_read_unlock();
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d2f65c856350..2836b5373b2e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -577,9 +577,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
+ mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
spin_unlock_irq(&mapping->tree_lock);
- swapcache_free(swap, page);
+ swapcache_free(swap);
} else {
void (*freepage)(struct page *);
void *shadow = NULL;
@@ -600,7 +601,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
shadow = workingset_eviction(mapping, page);
__delete_from_page_cache(page, shadow);
spin_unlock_irq(&mapping->tree_lock);
- mem_cgroup_uncharge_cache_page(page);
if (freepage != NULL)
freepage(page);
@@ -822,7 +822,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
cond_resched();
- mem_cgroup_uncharge_start();
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
@@ -1133,11 +1132,12 @@ keep:
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+ mem_cgroup_uncharge_list(&free_pages);
free_hot_cold_page_list(&free_pages, true);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
- mem_cgroup_uncharge_end();
+
*ret_nr_dirty += nr_dirty;
*ret_nr_congested += nr_congested;
*ret_nr_unqueued_dirty += nr_unqueued_dirty;
@@ -1437,6 +1437,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
+ mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&zone->lru_lock);
} else
@@ -1544,6 +1545,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&zone->lru_lock);
+ mem_cgroup_uncharge_list(&page_list);
free_hot_cold_page_list(&page_list, true);
/*
@@ -1658,6 +1660,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
+ mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&zone->lru_lock);
} else
@@ -1765,6 +1768,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
+ mem_cgroup_uncharge_list(&l_hold);
free_hot_cold_page_list(&l_hold, true);
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 032c21eeab2b..ea064c1a09ba 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -212,7 +212,7 @@ static int zswap_entry_cache_create(void)
return zswap_entry_cache == NULL;
}
-static void zswap_entry_cache_destory(void)
+static void __init zswap_entry_cache_destroy(void)
{
kmem_cache_destroy(zswap_entry_cache);
}
@@ -507,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
*/
- swapcache_free(entry, NULL);
+ swapcache_free(entry);
} while (err != -ENOMEM);
if (new_page)
@@ -941,7 +941,7 @@ static int __init init_zswap(void)
pcpufail:
zswap_comp_exit();
compfail:
- zswap_entry_cache_destory();
+ zswap_entry_cache_destroy();
cachefail:
zpool_destroy_pool(zswap_pool);
error: