summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorShakeel Butt <shakeelb@google.com>2022-10-24 08:28:41 +0300
committerAndrew Morton <akpm@linux-foundation.org>2022-12-01 02:58:40 +0300
commitf1a7941243c102a44e8847e3b94ff4ff3ec56f25 (patch)
tree3d1f4938bbf28c7b35003c1d18bbfce011a61480 /include
parent9cd6ffa60256e931503d347006049b8bef508203 (diff)
downloadlinux-f1a7941243c102a44e8847e3b94ff4ff3ec56f25.tar.xz
mm: convert mm's rss stats into percpu_counter
Currently mm_struct maintains rss_stats which are updated on page fault and the unmapping codepaths. For page fault codepath the updates are cached per thread with the batch of TASK_RSS_EVENTS_THRESH which is 64. The reason for caching is performance for multithreaded applications otherwise the rss_stats updates may become hotspot for such applications. However this optimization comes with the cost of error margin in the rss stats. The rss_stats for applications with large number of threads can be very skewed. At worst the error margin is (nr_threads * 64) and we have a lot of applications with 100s of threads, so the error margin can be very high. Internally we had to reduce TASK_RSS_EVENTS_THRESH to 32. Recently we started seeing the unbounded errors for rss_stats for specific applications which use TCP rx0cp. It seems like vm_insert_pages() codepath does not sync rss_stats at all. This patch converts the rss_stats into percpu_counter to convert the error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2). However this conversion enable us to get the accurate stats for situations where accuracy is more important than the cpu cost. This patch does not make such tradeoffs - we can just use percpu_counter_add_local() for the updates and percpu_counter_sum() (or percpu_counter_sync() + percpu_counter_read) for the readers. At the moment the readers are either procfs interface, oom_killer and memory reclaim which I think are not performance critical and should be ok with slow read. However I think we can make that change in a separate patch. Link: https://lkml.kernel.org/r/20221024052841.3291983-1-shakeelb@google.com Signed-off-by: Shakeel Butt <shakeelb@google.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/mm.h26
-rw-r--r--include/linux/mm_types.h7
-rw-r--r--include/linux/mm_types_task.h13
-rw-r--r--include/linux/percpu_counter.h1
-rw-r--r--include/linux/sched.h3
-rw-r--r--include/trace/events/kmem.h8
6 files changed, 14 insertions, 44 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f919befc8fac..0cb4e196d60b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2052,40 +2052,30 @@ static inline bool get_user_page_fast_only(unsigned long addr,
*/
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
- long val = atomic_long_read(&mm->rss_stat.count[member]);
-
-#ifdef SPLIT_RSS_COUNTING
- /*
- * counter is updated in asynchronous manner and may go to minus.
- * But it's never be expected number for users.
- */
- if (val < 0)
- val = 0;
-#endif
- return (unsigned long)val;
+ return percpu_counter_read_positive(&mm->rss_stat[member]);
}
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
+void mm_trace_rss_stat(struct mm_struct *mm, int member);
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
- long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+ percpu_counter_add(&mm->rss_stat[member], value);
- mm_trace_rss_stat(mm, member, count);
+ mm_trace_rss_stat(mm, member);
}
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
- long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+ percpu_counter_inc(&mm->rss_stat[member]);
- mm_trace_rss_stat(mm, member, count);
+ mm_trace_rss_stat(mm, member);
}
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
- long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+ percpu_counter_dec(&mm->rss_stat[member]);
- mm_trace_rss_stat(mm, member, count);
+ mm_trace_rss_stat(mm, member);
}
/* Optimized variant when page is already known not to be PageAnon */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2d5b1575ffe0..e86861ff5bbd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -18,6 +18,7 @@
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
+#include <linux/percpu_counter.h>
#include <asm/mmu.h>
@@ -626,11 +627,7 @@ struct mm_struct {
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
- /*
- * Special counters, in some configurations protected by the
- * page_table_lock, in other configurations by being atomic.
- */
- struct mm_rss_stat rss_stat;
+ struct percpu_counter rss_stat[NR_MM_COUNTERS];
struct linux_binfmt *binfmt;
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 0bb4b6da9993..5414b5c6a103 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -36,19 +36,6 @@ enum {
NR_MM_COUNTERS
};
-#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
-#define SPLIT_RSS_COUNTING
-/* per-thread cached information, */
-struct task_rss_stat {
- int events; /* for synchronization threshold */
- int count[NR_MM_COUNTERS];
-};
-#endif /* USE_SPLIT_PTE_PTLOCKS */
-
-struct mm_rss_stat {
- atomic_long_t count[NR_MM_COUNTERS];
-};
-
struct page_frag {
struct page *page;
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 8ed5fba6d156..bde6c4c1f405 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -13,7 +13,6 @@
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>
-#include <linux/gfp.h>
/* percpu_counter batch for local add or sub */
#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffb6eb55cd13..079d299fa465 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -870,9 +870,6 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
-#ifdef SPLIT_RSS_COUNTING
- struct task_rss_stat rss_stat;
-#endif
int exit_state;
int exit_code;
int exit_signal;
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 243073cfc29d..58688768ef0f 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -346,10 +346,9 @@ TRACE_MM_PAGES
TRACE_EVENT(rss_stat,
TP_PROTO(struct mm_struct *mm,
- int member,
- long count),
+ int member),
- TP_ARGS(mm, member, count),
+ TP_ARGS(mm, member),
TP_STRUCT__entry(
__field(unsigned int, mm_id)
@@ -362,7 +361,8 @@ TRACE_EVENT(rss_stat,
__entry->mm_id = mm_ptr_to_hash(mm);
__entry->curr = !!(current->mm == mm);
__entry->member = member;
- __entry->size = (count << PAGE_SHIFT);
+ __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
+ << PAGE_SHIFT);
),
TP_printk("mm_id=%u curr=%d type=%s size=%ldB",