summaryrefslogtreecommitdiff
path: root/mm/memory-tiers.c
diff options
context:
space:
mode:
authorHuang Ying <ying.huang@intel.com>2023-09-26 09:06:27 +0300
committerAndrew Morton <akpm@linux-foundation.org>2023-10-17 01:44:39 +0300
commit3718c02dbd4c88d47b5af003acdb3d1112604ea3 (patch)
treed257b0b032c78f0647d4bf778c73d98af8a0f4cc /mm/memory-tiers.c
parentd0376aac59a166cd7bd9d1a9768e31e71002631b (diff)
downloadlinux-3718c02dbd4c88d47b5af003acdb3d1112604ea3.tar.xz
acpi, hmat: calculate abstract distance with HMAT
A memory tiering abstract distance calculation algorithm based on ACPI HMAT is implemented. The basic idea is as follows. The performance attributes of system default DRAM nodes are recorded as the baseline, whose abstract distance is MEMTIER_ADISTANCE_DRAM. Then, the ratio of the abstract distance of a memory node (target) to MEMTIER_ADISTANCE_DRAM is scaled based on the ratio of the performance attributes of the node to that of the default DRAM nodes. The functions to record the read/write latency/bandwidth of the default DRAM nodes and calculate abstract distance according to read/write latency/bandwidth ratio will be used by CXL CDAT (Coherent Device Attribute Table) and other memory device drivers. So, they are put in memory-tiers.c. Link: https://lkml.kernel.org/r/20230926060628.265989-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" <ying.huang@intel.com> Tested-by: Bharata B Rao <bharata@amd.com> Reviewed-by: Dave Jiang <dave.jiang@intel.com> Reviewed-by: Alistair Popple <apopple@nvidia.com> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Wei Xu <weixugc@google.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Yang Shi <shy828301@gmail.com> Cc: Rafael J Wysocki <rafael.j.wysocki@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/memory-tiers.c')
-rw-r--r--mm/memory-tiers.c103
1 files changed, 102 insertions, 1 deletions
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 4301e7e89223..085321c77123 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -37,7 +37,7 @@ struct node_memory_type_map {
static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
-static struct memory_dev_type *default_dram_type;
+struct memory_dev_type *default_dram_type;
static struct bus_type memory_tier_subsys = {
.name = "memory_tiering",
@@ -108,6 +108,11 @@ static struct demotion_nodes *node_demotion __read_mostly;
static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
+static bool default_dram_perf_error;	/* set when a DRAM node's perf mismatches the reference */
+static struct node_hmem_attrs default_dram_perf;	/* perf attributes copied from the reference DRAM node */
+static int default_dram_perf_ref_nid = NUMA_NO_NODE;	/* nid of the reference node; NUMA_NO_NODE until one is recorded */
+static const char *default_dram_perf_ref_source;	/* kstrdup() copy of the source string given with the reference */
+
static inline struct memory_tier *to_memory_tier(struct device *device)
{
return container_of(device, struct memory_tier, dev);
@@ -595,6 +600,102 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
+/*
+ * dump_hmem_attrs() - log the four performance attributes of a node
+ * @attrs: attributes to print; they are only read, never modified
+ * @prefix: string emitted at the very start of the log line, which lets
+ *          callers indent the output under a heading
+ *
+ * Emits a single pr_info() line with read/write latency and read/write
+ * bandwidth, in that order.
+ */
+static void dump_hmem_attrs(const struct node_hmem_attrs *attrs,
+			    const char *prefix)
+{
+	pr_info(
+"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
+		prefix, attrs->read_latency, attrs->write_latency,
+		attrs->read_bandwidth, attrs->write_bandwidth);
+}
+
+/*
+ * Report whether @val deviates from the reference value @ref by more
+ * than 10% of @ref.
+ */
+static bool dram_perf_attr_differs(unsigned int val, unsigned int ref)
+{
+	return abs(val - ref) * 10 > ref;
+}
+
+/*
+ * mt_set_default_dram_perf() - record or cross-check default DRAM performance
+ * @nid: node the attributes in @perf belong to
+ * @perf: read/write latency and bandwidth of node @nid
+ * @source: name of whoever supplied @perf; a copy is kept with the reference
+ *
+ * The first valid call stores @perf as the reference.  Every later call
+ * compares @perf against that reference; if any of the four attributes
+ * deviates by more than 10%, the mismatch is logged and the error flag
+ * is latched so that all further calls fail.
+ *
+ * Return: 0 on success, -EIO once the error flag has been latched,
+ * -EINVAL if @perf has zero total latency or zero total bandwidth, or
+ * if @perf mismatches the reference.
+ */
+int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+			     const char *source)
+{
+	int rc = 0;
+
+	mutex_lock(&memory_tier_lock);
+
+	if (default_dram_perf_error) {
+		rc = -EIO;
+	} else if (perf->read_latency + perf->write_latency == 0 ||
+		   perf->read_bandwidth + perf->write_bandwidth == 0) {
+		rc = -EINVAL;
+	} else if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
+		/* No reference yet: this node becomes the reference. */
+		default_dram_perf = *perf;
+		default_dram_perf_ref_nid = nid;
+		/* NOTE(review): the kstrdup() result is not checked here. */
+		default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
+	} else if (dram_perf_attr_differs(perf->read_latency,
+					  default_dram_perf.read_latency) ||
+		   dram_perf_attr_differs(perf->write_latency,
+					  default_dram_perf.write_latency) ||
+		   dram_perf_attr_differs(perf->read_bandwidth,
+					  default_dram_perf.read_bandwidth) ||
+		   dram_perf_attr_differs(perf->write_bandwidth,
+					  default_dram_perf.write_bandwidth)) {
+		/*
+		 * All default DRAM nodes are expected to perform alike,
+		 * because the reference is the base for the abstract
+		 * distance of every other memory node.  A node outside the
+		 * 10% band invalidates that base.
+		 */
+		pr_info(
+"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
+"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
+		pr_info(" performance of reference DRAM node %d:\n",
+			default_dram_perf_ref_nid);
+		dump_hmem_attrs(&default_dram_perf, " ");
+		pr_info(" performance of DRAM node %d:\n", nid);
+		dump_hmem_attrs(perf, " ");
+		pr_info(
+" disable default DRAM node performance based abstract distance algorithm.\n");
+		default_dram_perf_error = true;
+		rc = -EINVAL;
+	}
+
+	mutex_unlock(&memory_tier_lock);
+	return rc;
+}
+
+/**
+ * mt_perf_to_adistance() - derive an abstract distance from performance data
+ * @perf: read/write latency and bandwidth of the memory node to classify
+ * @adist: output; written only when 0 is returned
+ *
+ * The abstract distance is in direct proportion to the node's memory
+ * latency (read + write) and inversely proportional to its memory
+ * bandwidth (read + write).  MEMTIER_ADISTANCE_DRAM together with the
+ * recorded latency and bandwidth of the default DRAM nodes is the base.
+ *
+ * Return: 0 on success, -EIO if the default DRAM performance was found
+ * inconsistent, -ENOENT if no default DRAM performance has been recorded,
+ * -EINVAL if @perf has zero total latency or zero total bandwidth.
+ */
+int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+{
+	unsigned long latency, bandwidth, base_latency, base_bandwidth;
+	int rc = 0;
+
+	/*
+	 * The baseline and its state flags are written under
+	 * memory_tier_lock, so they are read under the same lock; otherwise
+	 * a caller could observe the reference nid without the matching
+	 * attributes.
+	 */
+	mutex_lock(&memory_tier_lock);
+	if (default_dram_perf_error) {
+		rc = -EIO;
+		goto out;
+	}
+
+	if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
+		rc = -ENOENT;
+		goto out;
+	}
+
+	/* Widen before adding so that the sums themselves cannot wrap. */
+	latency = (unsigned long)perf->read_latency + perf->write_latency;
+	bandwidth = (unsigned long)perf->read_bandwidth + perf->write_bandwidth;
+	if (latency == 0 || bandwidth == 0) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	base_latency = (unsigned long)default_dram_perf.read_latency +
+		       default_dram_perf.write_latency;
+	base_bandwidth = (unsigned long)default_dram_perf.read_bandwidth +
+			 default_dram_perf.write_bandwidth;
+
+	/*
+	 * Evaluate in unsigned long: the intermediate products of the
+	 * base distance, a latency sum and a bandwidth sum can exceed
+	 * 32 bits.  The multiply/divide order is kept as
+	 * ((base * lat) / base_lat * base_bw) / bw.
+	 */
+	*adist = MEMTIER_ADISTANCE_DRAM * latency / base_latency *
+		 base_bandwidth / bandwidth;
+
+out:
+	mutex_unlock(&memory_tier_lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(mt_perf_to_adistance);
+
/**
* register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
* @nb: The notifier block which describe the algorithm