author	Mel Gorman <mgorman@techsingularity.net>	2021-06-29 05:42:24 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2021-06-29 20:53:55 +0300
commit	74f44822097c665041010994502b5971d6cd9f04 (patch)
tree	308a8f1e7f8087b1f39ea0bc87813ae7c5519186 /mm/page_alloc.c
parent	c49c2c47dab6b8d45022b3fabf0642a0e62e3109 (diff)
mm/page_alloc: introduce vm.percpu_pagelist_high_fraction
This introduces a new sysctl vm.percpu_pagelist_high_fraction.  It is
similar to the old vm.percpu_pagelist_fraction.  The old sysctl increased
both pcp->batch and pcp->high, with the higher pcp->high potentially
reducing zone->lock contention.  However, the higher pcp->batch value also
potentially increased allocation latency while the PCP was refilled.
This sysctl only adjusts pcp->high so that zone->lock contention is
potentially reduced but allocation latency during a PCP refill remains
the same.

	# grep -E "high:|batch" /proc/zoneinfo | tail -2
	              high:  649
	              batch: 63

	# sysctl vm.percpu_pagelist_high_fraction=8
	# grep -E "high:|batch" /proc/zoneinfo | tail -2
	              high:  35071
	              batch: 63

	# sysctl vm.percpu_pagelist_high_fraction=64
	              high:  4383
	              batch: 63

	# sysctl vm.percpu_pagelist_high_fraction=0
	              high:  649
	              batch: 63

[mgorman@techsingularity.net: fix documentation]
  Link: https://lkml.kernel.org/r/20210528151010.GQ30378@techsingularity.net
Link: https://lkml.kernel.org/r/20210525080119.5455-7-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
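The high values in the example above follow from the calculation added to zone_highsize() in the diff below: by default, high is derived from the zone low watermark; with the sysctl set, it becomes a fraction of the zone's managed pages, and either way it is split across the CPUs local to the zone and clamped to at least batch*4.  The following is a minimal user-space sketch of that arithmetic only; the zone sizes and CPU count are made up for illustration and do not correspond to the example machine.

	#include <stdio.h>

	#define MIN_PERCPU_PAGELIST_HIGH_FRACTION 8	/* matches the new kernel limit */

	/*
	 * Illustrative sketch of the pcp->high calculation zone_highsize()
	 * performs after this patch.  In the kernel, the inputs come from the
	 * zone's low watermark, managed page count and local online CPUs.
	 */
	static long pcp_high(long managed_pages, long low_wmark,
			     int fraction, int nr_local_cpus, int batch)
	{
		long total_pages, high;

		if (fraction && fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION)
			return -1;	/* the sysctl handler rejects this with -EINVAL */

		if (!fraction)
			total_pages = low_wmark;		/* default behaviour */
		else
			total_pages = managed_pages / fraction;	/* sysctl override */

		high = total_pages / nr_local_cpus;

		/* The kernel also ensures high is at least batch * 4. */
		if (high < batch * 4)
			high = batch * 4;

		return high;
	}

	int main(void)
	{
		long managed = 2000000;	/* illustrative zone size in pages */
		long low_wmark = 5000;	/* illustrative low watermark in pages */
		int cpus = 8, batch = 63;

		printf("default     high = %ld\n", pcp_high(managed, low_wmark, 0, cpus, batch));
		printf("fraction=8  high = %ld\n", pcp_high(managed, low_wmark, 8, cpus, batch));
		printf("fraction=64 high = %ld\n", pcp_high(managed, low_wmark, 64, cpus, batch));
		return 0;
	}

As in the example output above, a larger fraction yields a smaller per-CPU high value, and 0 restores the watermark-based default.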
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	69
1 file changed, 62 insertions(+), 7 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index adf35ccfd8e5..cfc4071310fb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -120,6 +120,7 @@ typedef int __bitwise fpi_t;
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
struct pagesets {
local_lock_t lock;
@@ -192,6 +193,7 @@ EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
+int percpu_pagelist_high_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);
@@ -6725,17 +6727,32 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
#ifdef CONFIG_MMU
int high;
int nr_local_cpus;
+ unsigned long total_pages;
+
+ if (!percpu_pagelist_high_fraction) {
+ /*
+ * By default, the high value of the pcp is based on the zone
+ * low watermark so that if they are full then background
+ * reclaim will not be started prematurely.
+ */
+ total_pages = low_wmark_pages(zone);
+ } else {
+ /*
+ * If percpu_pagelist_high_fraction is configured, the high
+ * value is based on a fraction of the managed pages in the
+ * zone.
+ */
+ total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
+ }
/*
- * The high value of the pcp is based on the zone low watermark
- * so that if they are full then background reclaim will not be
- * started prematurely. The value is split across all online CPUs
- * local to the zone. Note that early in boot that CPUs may not be
- * online yet and that during CPU hotplug that the cpumask is not
- * yet updated when a CPU is being onlined.
+ * Split the high value across all online CPUs local to the zone. Note
+ * that early in boot that CPUs may not be online yet and that during
+ * CPU hotplug that the cpumask is not yet updated when a CPU is being
+ * onlined.
*/
nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online;
- high = low_wmark_pages(zone) / nr_local_cpus;
+ high = total_pages / nr_local_cpus;
/*
* Ensure high is at least batch*4. The multiple is based on the
@@ -8500,6 +8517,44 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+/*
+ * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu
+ * pagelist can have before it gets flushed back to buddy allocator.
+ */
+int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
+ int write, void *buffer, size_t *length, loff_t *ppos)
+{
+ struct zone *zone;
+ int old_percpu_pagelist_high_fraction;
+ int ret;
+
+ mutex_lock(&pcp_batch_high_lock);
+ old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
+
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (!write || ret < 0)
+ goto out;
+
+ /* Sanity checking to avoid pcp imbalance */
+ if (percpu_pagelist_high_fraction &&
+ percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
+ percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* No change? */
+ if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
+ goto out;
+
+ for_each_populated_zone(zone)
+ zone_set_pageset_high_and_batch(zone, 0);
+out:
+ mutex_unlock(&pcp_batch_high_lock);
+ return ret;
+}
+
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
/*
* Returns the number of pages that arch has reserved but
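The handler added in the last hunk enforces MIN_PERCPU_PAGELIST_HIGH_FRACTION for non-zero values and recalculates pcp->high for every populated zone on a successful write.  As a quick way to exercise that sanity check from user space, the sketch below writes to the sysctl's procfs file; it assumes root privileges, a kernel carrying this patch, and the usual /proc/sys mapping of vm.percpu_pagelist_high_fraction.

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/*
	 * Writing a non-zero value below MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
	 * should fail with EINVAL, because
	 * percpu_pagelist_high_fraction_sysctl_handler() rejects it.
	 */
	static void write_fraction(const char *val)
	{
		int fd = open("/proc/sys/vm/percpu_pagelist_high_fraction", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return;
		}

		if (write(fd, val, strlen(val)) < 0)
			printf("write \"%s\": %s\n", val, strerror(errno));
		else
			printf("write \"%s\": ok\n", val);

		close(fd);
	}

	int main(void)
	{
		write_fraction("4");	/* below the minimum: expect EINVAL */
		write_fraction("8");	/* smallest accepted non-zero fraction */
		write_fraction("0");	/* restore the default behaviour */
		return 0;
	}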