summaryrefslogtreecommitdiff
path: root/mm/mempolicy.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--mm/mempolicy.c218
1 files changed, 214 insertions, 4 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1bdc7d0d1b0b..a8db92c23697 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -19,6 +19,13 @@
* for anonymous memory. For process policy an process counter
* is used.
*
+ * weighted interleave
+ * Allocate memory interleaved over a set of nodes based on
+ * a set of weights (per-node), with normal fallback if it
+ * fails. Otherwise operates the same as interleave.
+ * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
+ * on node 0 for every 1 page allocated on node 1.
+ *
* bind Only allocate memory on a specific set of nodes,
* no fallback.
* FIXME: memory is allocated starting with the first node
@@ -441,6 +448,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_preferred,
},
+ [MPOL_WEIGHTED_INTERLEAVE] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_nodemask,
+ },
};
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
@@ -858,8 +869,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
old = current->mempolicy;
current->mempolicy = new;
- if (new && new->mode == MPOL_INTERLEAVE)
+ if (new && (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
current->il_prev = MAX_NUMNODES-1;
+ current->il_weight = 0;
+ }
task_unlock(current);
mpol_put(old);
ret = 0;
@@ -884,6 +898,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
*nodes = pol->nodes;
break;
case MPOL_LOCAL:
@@ -968,6 +983,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
*policy = next_node_in(current->il_prev, pol->nodes);
+ } else if (pol == current->mempolicy &&
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
+ if (current->il_weight)
+ *policy = current->il_prev;
+ else
+ *policy = next_node_in(current->il_prev,
+ pol->nodes);
} else {
err = -EINVAL;
goto out;
@@ -1332,7 +1354,8 @@ static long do_mbind(unsigned long start, unsigned long len,
* VMAs, the nodes will still be interleaved from the targeted
* nodemask, but one by one may be selected differently.
*/
- if (new->mode == MPOL_INTERLEAVE) {
+ if (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE) {
struct page *page;
unsigned int order;
unsigned long addr = -EFAULT;
@@ -1780,7 +1803,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup
* @order: 0, or appropriate huge_page_order for interleaving
- * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
+ * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
+ * MPOL_WEIGHTED_INTERLEAVE
*
* Returns effective policy for a VMA at specified address.
* Falls back to current->mempolicy or system default policy, as necessary.
@@ -1797,7 +1821,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
pol = __get_vma_policy(vma, addr, ilx);
if (!pol)
pol = get_task_policy(current);
- if (pol->mode == MPOL_INTERLEAVE) {
+ if (pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
*ilx += vma->vm_pgoff >> order;
*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
}
@@ -1847,6 +1872,22 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
return zone >= dynamic_policy_zone;
}
+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
+{
+ unsigned int node = current->il_prev;
+
+ if (!current->il_weight || !node_isset(node, policy->nodes)) {
+ node = next_node_in(node, policy->nodes);
+ /* can only happen if nodemask is being rebound */
+ if (node == MAX_NUMNODES)
+ return node;
+ current->il_prev = node;
+ current->il_weight = get_il_weight(node);
+ }
+ current->il_weight--;
+ return node;
+}
+
/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
@@ -1881,6 +1922,9 @@ unsigned int mempolicy_slab_node(void)
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
+ case MPOL_WEIGHTED_INTERLEAVE:
+ return weighted_interleave_nodes(policy);
+
case MPOL_BIND:
case MPOL_PREFERRED_MANY:
{
@@ -1919,6 +1963,45 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
return nodes_weight(*mask);
}
+static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
+{
+ nodemask_t nodemask;
+ unsigned int target, nr_nodes;
+ u8 *table;
+ unsigned int weight_total = 0;
+ u8 weight;
+ int nid;
+
+ nr_nodes = read_once_policy_nodemask(pol, &nodemask);
+ if (!nr_nodes)
+ return numa_node_id();
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ /* calculate the total weight */
+ for_each_node_mask(nid, nodemask) {
+ /* detect system default usage */
+ weight = table ? table[nid] : 1;
+ weight = weight ? weight : 1;
+ weight_total += weight;
+ }
+
+ /* Calculate the node offset based on totals */
+ target = ilx % weight_total;
+ nid = first_node(nodemask);
+ while (target) {
+ /* detect system default usage */
+ weight = table ? table[nid] : 1;
+ weight = weight ? weight : 1;
+ if (target < weight)
+ break;
+ target -= weight;
+ nid = next_node_in(nid, nodemask);
+ }
+ rcu_read_unlock();
+ return nid;
+}
+
/*
* Do static interleaving for interleave index @ilx. Returns the ilx'th
* node in pol->nodes (starting from ilx=0), wrapping around if ilx
@@ -1979,6 +2062,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
*nid = (ilx == NO_INTERLEAVE_INDEX) ?
interleave_nodes(pol) : interleave_nid(pol, ilx);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ *nid = (ilx == NO_INTERLEAVE_INDEX) ?
+ weighted_interleave_nodes(pol) :
+ weighted_interleave_nid(pol, ilx);
+ break;
}
return nodemask;
@@ -2040,6 +2128,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
*mask = mempolicy->nodes;
break;
@@ -2140,6 +2229,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode != MPOL_INTERLEAVE &&
+ pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
(!nodemask || node_isset(nid, *nodemask))) {
/*
* First, try to allocate THP only on local node, but
@@ -2275,6 +2365,114 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
return total_allocated;
}
+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ struct task_struct *me = current;
+ unsigned long total_allocated = 0;
+ unsigned long nr_allocated = 0;
+ unsigned long rounds;
+ unsigned long node_pages, delta;
+ u8 *table, *weights, weight;
+ unsigned int weight_total = 0;
+ unsigned long rem_pages = nr_pages;
+ nodemask_t nodes;
+ int nnodes, node;
+ int resume_node = MAX_NUMNODES - 1;
+ u8 resume_weight = 0;
+ int prev_node;
+ int i;
+
+ if (!nr_pages)
+ return 0;
+
+ nnodes = read_once_policy_nodemask(pol, &nodes);
+ if (!nnodes)
+ return 0;
+
+ /* Continue allocating from most recent node and adjust the nr_pages */
+ node = me->il_prev;
+ weight = me->il_weight;
+ if (weight && node_isset(node, nodes)) {
+ node_pages = min(rem_pages, weight);
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ /* if that's all the pages, no need to interleave */
+ if (rem_pages <= weight) {
+ me->il_weight -= rem_pages;
+ return total_allocated;
+ }
+ /* Otherwise we adjust remaining pages, continue from there */
+ rem_pages -= weight;
+ }
+ /* clear active weight in case of an allocation failure */
+ me->il_weight = 0;
+ prev_node = node;
+
+ /* create a local copy of node weights to operate on outside rcu */
+ weights = kzalloc(nr_node_ids, GFP_KERNEL);
+ if (!weights)
+ return total_allocated;
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ if (table)
+ memcpy(weights, table, nr_node_ids);
+ rcu_read_unlock();
+
+ /* calculate total, detect system default usage */
+ for_each_node_mask(node, nodes) {
+ if (!weights[node])
+ weights[node] = 1;
+ weight_total += weights[node];
+ }
+
+ /*
+ * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
+ * Track which node weighted interleave should resume from.
+ *
+ * if (rounds > 0) and (delta == 0), resume_node will always be
+ * the node following prev_node and its weight.
+ */
+ rounds = rem_pages / weight_total;
+ delta = rem_pages % weight_total;
+ resume_node = next_node_in(prev_node, nodes);
+ resume_weight = weights[resume_node];
+ for (i = 0; i < nnodes; i++) {
+ node = next_node_in(prev_node, nodes);
+ weight = weights[node];
+ node_pages = weight * rounds;
+ /* If a delta exists, add this node's portion of the delta */
+ if (delta > weight) {
+ node_pages += weight;
+ delta -= weight;
+ } else if (delta) {
+ /* when delta is depleted, resume from that node */
+ node_pages += delta;
+ resume_node = node;
+ resume_weight = weight - delta;
+ delta = 0;
+ }
+ /* node_pages can be 0 if an allocation fails and rounds == 0 */
+ if (!node_pages)
+ break;
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ if (total_allocated == nr_pages)
+ break;
+ prev_node = node;
+ }
+ me->il_prev = resume_node;
+ me->il_weight = resume_weight;
+ kfree(weights);
+ return total_allocated;
+}
+
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
@@ -2315,6 +2513,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
return alloc_pages_bulk_array_interleave(gfp, pol,
nr_pages, page_array);
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return alloc_pages_bulk_array_weighted_interleave(
+ gfp, pol, nr_pages, page_array);
+
if (pol->mode == MPOL_PREFERRED_MANY)
return alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
@@ -2390,6 +2592,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
@@ -2526,6 +2729,10 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
polnid = interleave_nid(pol, ilx);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ polnid = weighted_interleave_nid(pol, ilx);
+ break;
+
case MPOL_PREFERRED:
if (node_isset(curnid, pol->nodes))
goto out;
@@ -2900,6 +3107,7 @@ static const char * const policy_modes[] =
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
+ [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
[MPOL_LOCAL] = "local",
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -2959,6 +3167,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
}
break;
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
@@ -3069,6 +3278,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
nodes = pol->nodes;
break;
default: