summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/gfp.h1
-rw-r--r--mm/page_alloc.c119
-rw-r--r--mm/vmstat.c8
3 files changed, 99 insertions, 29 deletions
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 665edc11fb9f..5b917e5b9350 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -320,6 +320,7 @@ extern void page_frag_free(void *addr);
#define free_page(addr) free_pages((addr), 0)
void page_alloc_init_cpuhp(void);
+int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 922836b6a01d..83af76a8cef9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2157,6 +2157,40 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
return i;
}
+/*
+ * Called from the vmstat counter updater to decay the PCP high.
+ * Return whether there are addition works to do.
+ */
+int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+{
+ int high_min, to_drain, batch;
+ int todo = 0;
+
+ high_min = READ_ONCE(pcp->high_min);
+ batch = READ_ONCE(pcp->batch);
+ /*
+ * Decrease pcp->high periodically to try to free possible
+ * idle PCP pages. And, avoid to free too many pages to
+ * control latency. This caps pcp->high decrement too.
+ */
+ if (pcp->high > high_min) {
+ pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+ pcp->high - (pcp->high >> 3), high_min);
+ if (pcp->high > high_min)
+ todo++;
+ }
+
+ to_drain = pcp->count - pcp->high;
+ if (to_drain > 0) {
+ spin_lock(&pcp->lock);
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
+ spin_unlock(&pcp->lock);
+ todo++;
+ }
+
+ return todo;
+}
+
#ifdef CONFIG_NUMA
/*
* Called from the vmstat counter updater to drain pagesets of this
@@ -2318,14 +2352,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
return true;
}
-static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
{
int min_nr_free, max_nr_free;
- int batch = READ_ONCE(pcp->batch);
- /* Free everything if batch freeing high-order pages. */
+ /* Free as much as possible if batch freeing high-order pages. */
if (unlikely(free_high))
- return pcp->count;
+ return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);
/* Check for PCP disabled or boot pageset */
if (unlikely(high < batch))
@@ -2340,7 +2373,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
* freeing of pages without any allocation.
*/
batch <<= pcp->free_factor;
- if (batch < max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX)
+ if (batch <= max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX)
pcp->free_factor++;
batch = clamp(batch, min_nr_free, max_nr_free);
@@ -2348,28 +2381,48 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
}
static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
- bool free_high)
+ int batch, bool free_high)
{
- int high = READ_ONCE(pcp->high_min);
+ int high, high_min, high_max;
+
+ high_min = READ_ONCE(pcp->high_min);
+ high_max = READ_ONCE(pcp->high_max);
+ high = pcp->high = clamp(pcp->high, high_min, high_max);
- if (unlikely(!high || free_high))
+ if (unlikely(!high))
return 0;
- if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
- return high;
+ if (unlikely(free_high)) {
+ pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+ high_min);
+ return 0;
+ }
/*
* If reclaim is active, limit the number of pages that can be
* stored on pcp lists
*/
- return min(READ_ONCE(pcp->batch) << 2, high);
+ if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
+ pcp->high = max(high - (batch << pcp->free_factor), high_min);
+ return min(batch << 2, pcp->high);
+ }
+
+ if (pcp->count >= high && high_min != high_max) {
+ int need_high = (batch << pcp->free_factor) + batch;
+
+ /* pcp->high should be large enough to hold batch freed pages */
+ if (pcp->high < need_high)
+ pcp->high = clamp(need_high, high_min, high_max);
+ }
+
+ return high;
}
static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
struct page *page, int migratetype,
unsigned int order)
{
- int high;
+ int high, batch;
int pindex;
bool free_high = false;
@@ -2384,6 +2437,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
+ batch = READ_ONCE(pcp->batch);
/*
* As high-order pages other than THP's stored on PCP can contribute
* to fragmentation, limit the number stored when PCP is heavily
@@ -2394,14 +2448,15 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
free_high = (pcp->free_factor &&
(pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
(!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
- pcp->count >= READ_ONCE(pcp->batch)));
+ pcp->count >= READ_ONCE(batch)));
pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
}
- high = nr_pcp_high(pcp, zone, free_high);
+ high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
- free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex);
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
+ pcp, pindex);
}
}
@@ -2685,24 +2740,38 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
return page;
}
-static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order)
+static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order)
{
- int high, batch, max_nr_alloc;
+ int high, base_batch, batch, max_nr_alloc;
+ int high_max, high_min;
- high = READ_ONCE(pcp->high_min);
- batch = READ_ONCE(pcp->batch);
+ base_batch = READ_ONCE(pcp->batch);
+ high_min = READ_ONCE(pcp->high_min);
+ high_max = READ_ONCE(pcp->high_max);
+ high = pcp->high = clamp(pcp->high, high_min, high_max);
/* Check for PCP disabled or boot pageset */
- if (unlikely(high < batch))
+ if (unlikely(high < base_batch))
return 1;
+ if (order)
+ batch = base_batch;
+ else
+ batch = (base_batch << pcp->alloc_factor);
+
/*
- * Double the number of pages allocated each time there is subsequent
- * allocation of order-0 pages without any freeing.
+ * If we had larger pcp->high, we could avoid to allocate from
+ * zone.
*/
+ if (high_min != high_max && !test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
+ high = pcp->high = min(high + batch, high_max);
+
if (!order) {
- max_nr_alloc = max(high - pcp->count - batch, batch);
- batch <<= pcp->alloc_factor;
+ max_nr_alloc = max(high - pcp->count - base_batch, base_batch);
+ /*
+ * Double the number of pages allocated each time there is
+ * subsequent allocation of order-0 pages without any freeing.
+ */
if (batch <= max_nr_alloc &&
pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
pcp->alloc_factor++;
@@ -2733,7 +2802,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
do {
if (list_empty(list)) {
- int batch = nr_pcp_alloc(pcp, order);
+ int batch = nr_pcp_alloc(pcp, zone, order);
int alloced;
alloced = rmqueue_bulk(zone, order,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 88ea95d4221c..359460deb377 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -816,9 +816,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
for_each_populated_zone(zone) {
struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-#ifdef CONFIG_NUMA
struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
-#endif
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
int v;
@@ -834,10 +832,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
#endif
}
}
-#ifdef CONFIG_NUMA
if (do_pagesets) {
cond_resched();
+
+ changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
+#ifdef CONFIG_NUMA
/*
* Deal with draining the remote pageset of this
* processor
@@ -866,8 +866,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
drain_zone_pages(zone, this_cpu_ptr(pcp));
changes++;
}
- }
#endif
+ }
}
for_each_online_pgdat(pgdat) {