summaryrefslogtreecommitdiff
path: root/mm/memory-tiers.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory-tiers.c')
-rw-r--r--mm/memory-tiers.c168
1 files changed, 125 insertions, 43 deletions
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0537664620e5..9842acebd05e 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -6,6 +6,7 @@
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>
+#include <linux/sched/sysctl.h>
#include "internal.h"
@@ -36,14 +37,38 @@ struct node_memory_type_map {
static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
+/*
+ * The list is used to store all memory types that are not created
+ * by a device driver.
+ */
+static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
+nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE;
static const struct bus_type memory_tier_subsys = {
.name = "memory_tiering",
.dev_name = "memory_tier",
};
+#ifdef CONFIG_NUMA_BALANCING
+/**
+ * folio_use_access_time - check if a folio reuses cpupid for page access time
+ * @folio: folio to check
+ *
+ * folio's _last_cpupid field is repurposed by memory tiering. In memory
+ * tiering mode, cpupid of slow memory folio (not toptier memory) is used to
+ * record page access time.
+ *
+ * Return: the folio _last_cpupid is used to record page access time
+ */
+bool folio_use_access_time(struct folio *folio)
+{
+ return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(folio_nid(folio));
+}
+#endif
+
#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
@@ -108,6 +133,8 @@ static struct demotion_nodes *node_demotion __read_mostly;
static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
+/* The lock is used to protect `default_dram_perf*` info and nid. */
+static DEFINE_MUTEX(default_dram_perf_lock);
static bool default_dram_perf_error;
static struct access_coordinate default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
@@ -505,7 +532,8 @@ static inline void __init_node_memory_type(int node, struct memory_dev_type *mem
static struct memory_tier *set_node_memory_tier(int node)
{
struct memory_tier *memtier;
- struct memory_dev_type *memtype;
+ struct memory_dev_type *memtype = default_dram_type;
+ int adist = MEMTIER_ADISTANCE_DRAM;
pg_data_t *pgdat = NODE_DATA(node);
@@ -514,7 +542,16 @@ static struct memory_tier *set_node_memory_tier(int node)
if (!node_state(node, N_MEMORY))
return ERR_PTR(-EINVAL);
- __init_node_memory_type(node, default_dram_type);
+ mt_calc_adistance(node, &adist);
+ if (!node_memory_types[node].memtype) {
+ memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
+ if (IS_ERR(memtype)) {
+ memtype = default_dram_type;
+ pr_info("Failed to allocate a memory type. Fall back.\n");
+ }
+ }
+
+ __init_node_memory_type(node, memtype);
memtype = node_memory_types[node].memtype;
node_set(node, memtype->nodes);
@@ -623,6 +660,71 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
+struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
+{
+ struct memory_dev_type *mtype;
+
+ list_for_each_entry(mtype, memory_types, list)
+ if (mtype->adistance == adist)
+ return mtype;
+
+ mtype = alloc_memory_type(adist);
+ if (IS_ERR(mtype))
+ return mtype;
+
+ list_add(&mtype->list, memory_types);
+
+ return mtype;
+}
+EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);
+
+void mt_put_memory_types(struct list_head *memory_types)
+{
+ struct memory_dev_type *mtype, *mtn;
+
+ list_for_each_entry_safe(mtype, mtn, memory_types, list) {
+ list_del(&mtype->list);
+ put_memory_type(mtype);
+ }
+}
+EXPORT_SYMBOL_GPL(mt_put_memory_types);
+
+/*
+ * This is invoked via `late_initcall()` to initialize memory tiers for
+ * memory nodes, both with and without CPUs. After the initialization of
+ * firmware and devices, adistance algorithms are expected to be provided.
+ */
+static int __init memory_tier_late_init(void)
+{
+ int nid;
+ struct memory_tier *memtier;
+
+ get_online_mems();
+ guard(mutex)(&memory_tier_lock);
+
+ /* Assign each uninitialized N_MEMORY node to a memory tier. */
+ for_each_node_state(nid, N_MEMORY) {
+ /*
+ * Some device drivers may have initialized
+ * memory tiers, potentially bringing memory nodes
+ * online and configuring memory tiers.
+ * Exclude them here.
+ */
+ if (node_memory_types[nid].memtype)
+ continue;
+
+ memtier = set_node_memory_tier(nid);
+ if (IS_ERR(memtier))
+ continue;
+ }
+
+ establish_demotion_targets();
+ put_online_mems();
+
+ return 0;
+}
+late_initcall(memory_tier_late_init);
+
static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
pr_info(
@@ -634,25 +736,19 @@ static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
const char *source)
{
- int rc = 0;
-
- mutex_lock(&memory_tier_lock);
- if (default_dram_perf_error) {
- rc = -EIO;
- goto out;
- }
+ guard(mutex)(&default_dram_perf_lock);
+ if (default_dram_perf_error)
+ return -EIO;
if (perf->read_latency + perf->write_latency == 0 ||
- perf->read_bandwidth + perf->write_bandwidth == 0) {
- rc = -EINVAL;
- goto out;
- }
+ perf->read_bandwidth + perf->write_bandwidth == 0)
+ return -EINVAL;
if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
default_dram_perf = *perf;
default_dram_perf_ref_nid = nid;
default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
- goto out;
+ return 0;
}
/*
@@ -680,27 +776,25 @@ int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
pr_info(
" disable default DRAM node performance based abstract distance algorithm.\n");
default_dram_perf_error = true;
- rc = -EINVAL;
+ return -EINVAL;
}
-out:
- mutex_unlock(&memory_tier_lock);
- return rc;
+ return 0;
}
int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
+ guard(mutex)(&default_dram_perf_lock);
if (default_dram_perf_error)
return -EIO;
- if (default_dram_perf_ref_nid == NUMA_NO_NODE)
- return -ENOENT;
-
if (perf->read_latency + perf->write_latency == 0 ||
perf->read_bandwidth + perf->write_bandwidth == 0)
return -EINVAL;
- mutex_lock(&memory_tier_lock);
+ if (default_dram_perf_ref_nid == NUMA_NO_NODE)
+ return -ENOENT;
+
/*
* The abstract distance of a memory node is in direct proportion to
* its memory latency (read + write) and inversely proportional to its
@@ -713,7 +807,6 @@ int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
(default_dram_perf.read_latency + default_dram_perf.write_latency) *
(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
(perf->read_bandwidth + perf->write_bandwidth);
- mutex_unlock(&memory_tier_lock);
return 0;
}
@@ -809,8 +902,7 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
static int __init memory_tier_init(void)
{
- int ret, node;
- struct memory_tier *memtier;
+ int ret;
ret = subsys_virtual_register(&memory_tier_subsys, NULL);
if (ret)
@@ -821,30 +913,21 @@ static int __init memory_tier_init(void)
GFP_KERNEL);
WARN_ON(!node_demotion);
#endif
+
mutex_lock(&memory_tier_lock);
/*
* For now we can have 4 faster memory tiers with smaller adistance
* than default DRAM tier.
*/
- default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
+ default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
+ &default_memory_types);
+ mutex_unlock(&memory_tier_lock);
if (IS_ERR(default_dram_type))
panic("%s() failed to allocate default DRAM tier\n", __func__);
- /*
- * Look at all the existing N_MEMORY nodes and add them to
- * default memory tier or to a tier if we already have memory
- * types assigned.
- */
- for_each_node_state(node, N_MEMORY) {
- memtier = set_node_memory_tier(node);
- if (IS_ERR(memtier))
- /*
- * Continue with memtiers we are able to setup
- */
- break;
- }
- establish_demotion_targets();
- mutex_unlock(&memory_tier_lock);
+ /* Record nodes with memory and CPU to set default DRAM performance. */
+ nodes_and(default_dram_nodes, node_states[N_MEMORY],
+ node_states[N_CPU]);
hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
return 0;
@@ -858,8 +941,7 @@ bool numa_demotion_enabled = false;
static ssize_t demotion_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%s\n",
- numa_demotion_enabled ? "true" : "false");
+ return sysfs_emit(buf, "%s\n", str_true_false(numa_demotion_enabled));
}
static ssize_t demotion_enabled_store(struct kobject *kobj,