summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c226
1 files changed, 221 insertions, 5 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6f5c0c517c49..597d58101872 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -87,6 +87,10 @@ int do_swap_account __read_mostly;
#define do_swap_account 0
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -752,15 +756,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg */
__mod_memcg_state(memcg, idx, val);
+ /* Update lruvec */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup_per_node *pi;
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
atomic_long_add(x, &pi->lruvec_stat[idx]);
x = 0;
@@ -3260,6 +3262,72 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
+{
+ unsigned long stat[MEMCG_NR_STAT];
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+ int min_idx, max_idx;
+
+ if (slab_only) {
+ min_idx = NR_SLAB_RECLAIMABLE;
+ max_idx = NR_SLAB_UNRECLAIMABLE;
+ } else {
+ min_idx = 0;
+ max_idx = MEMCG_NR_STAT;
+ }
+
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = min_idx; i < max_idx; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+ if (!slab_only)
+ max_idx = NR_VM_NODE_STAT_ITEMS;
+
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] += per_cpu(
+ pn->lruvec_stat_cpu->count[i], cpu);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ for (i = min_idx; i < max_idx; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+}
+
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct mem_cgroup *mi;
+ int cpu, i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+ cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -3309,7 +3377,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (!parent)
parent = root_mem_cgroup;
+ /*
+ * Deactivate and reparent kmem_caches. Then flush percpu
+ * slab statistics to have precise values at the parent and
+ * all ancestor levels. It's required to keep slab stats
+ * accurate after the reparenting of kmem_caches.
+ */
memcg_deactivate_kmem_caches(memcg, parent);
+ memcg_flush_percpu_vmstats(memcg, true);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
@@ -4101,6 +4176,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
#ifdef CONFIG_CGROUP_WRITEBACK
+#include <trace/events/writeback.h>
+
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -4184,6 +4261,130 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
}
}
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback. The former
+ * trackes ownership per-page while the latter per-inode. This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases. For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode. B owns the inode and
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback. A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underyling IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction. However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism. When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+ struct memcg_cgwb_frn *frn;
+ u64 now = get_jiffies_64();
+ u64 oldest_at = now;
+ int oldest = -1;
+ int i;
+
+ trace_track_foreign_dirty(page, wb);
+
+ /*
+ * Pick the slot to use. If there is already a slot for @wb, keep
+ * using it. If not replace the oldest one which isn't being
+ * written out.
+ */
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ frn = &memcg->cgwb_frn[i];
+ if (frn->bdi_id == wb->bdi->id &&
+ frn->memcg_id == wb->memcg_css->id)
+ break;
+ if (time_before64(frn->at, oldest_at) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ oldest = i;
+ oldest_at = frn->at;
+ }
+ }
+
+ if (i < MEMCG_CGWB_FRN_CNT) {
+ /*
+ * Re-using an existing one. Update timestamp lazily to
+ * avoid making the cacheline hot. We want them to be
+ * reasonably up-to-date and significantly shorter than
+ * dirty_expire_interval as that's what expires the record.
+ * Use the shorter of 1s and dirty_expire_interval / 8.
+ */
+ unsigned long update_intv =
+ min_t(unsigned long, HZ,
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+ if (time_before64(frn->at, now - update_intv))
+ frn->at = now;
+ } else if (oldest >= 0) {
+ /* replace the oldest free one */
+ frn = &memcg->cgwb_frn[oldest];
+ frn->bdi_id = wb->bdi->id;
+ frn->memcg_id = wb->memcg_css->id;
+ frn->at = now;
+ }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+ u64 now = jiffies_64;
+ int i;
+
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+ /*
+ * If the record is older than dirty_expire_interval,
+ * writeback on it has already started. No need to kick it
+ * off again. Also, don't start a new one if there's
+ * already one in flight.
+ */
+ if (time_after64(frn->at, now - intv) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ frn->at = 0;
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ WB_REASON_FOREIGN_FLUSH,
+ &frn->done);
+ }
+ }
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4682,6 +4883,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg, false);
+ memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
@@ -4700,6 +4907,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
struct mem_cgroup *memcg;
unsigned int size;
int node;
+ int __maybe_unused i;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4743,6 +4951,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ memcg->cgwb_frn[i].done =
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
@@ -4872,7 +5083,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ int __maybe_unused i;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);