From 32206ab36553be8714d9253ce33f4085681d369c Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:41:52 -0700 Subject: x86/intel_rdt: Provide pseudo-locking hooks within rdt_mount Stephen Rothwell reported that the Cache Pseudo-Locking enabling and the kernfs support for mounting with fs_context are conflicting. In preparation for a conflict-free merge between the two repos some no-op hooks are created within the RDT mount function being changed by the two features. The goal is for this commit to be placed on a minimal no-rebase branch to be consumed by both features. Reported-by: Stephen Rothwell Suggested-by: Al Viro Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Cc: David Howells Link: https://lkml.kernel.org/r/410697ead08978bd12111c0afc4ce9e7bd71a5fe.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 9 +++++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 10 +++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 39752825e376..be152b3b2543 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -468,4 +468,13 @@ void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); +/* + * Define the hooks for Cache Pseudo-Locking to use within rdt_mount(). + * These are no-ops provided for the new kernfs changes to use as a + * baseline in preparation for a conflict-free merge between it + * (kernfs changes) and the Cache Pseudo-Locking enabling. + */ +static inline int rdt_pseudo_lock_init(void) { return 0; } +static inline void rdt_pseudo_lock_release(void) { } + #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 749856a2e736..fa668f967062 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1289,10 +1289,16 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, rdtgroup_default.mon.mon_data_kn = kn_mondata; } + ret = rdt_pseudo_lock_init(); + if (ret) { + dentry = ERR_PTR(ret); + goto out_mondata; + } + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_mondata; + goto out_psl; if (rdt_alloc_capable) static_branch_enable_cpuslocked(&rdt_alloc_enable_key); @@ -1310,6 +1316,8 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out; +out_psl: + rdt_pseudo_lock_release(); out_mondata: if (rdt_mon_capable) kernfs_remove(kn_mondata); -- cgit v1.2.3 From eb956a636f90bd0297309a7eae6db388ae5bfc43 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:41:54 -0700 Subject: x86/intel_rdt: Introduce RDT resource group mode At this time there are no constraints on how bitmasks represented by schemata can be associated with closids represented by resource groups. A bitmask of one class of service can without any objections overlap with the bitmask of another class of service. The concept of "mode" is introduced in preparation for support of control over whether cache regions can be shared between classes of service. 
At this time the only mode reflects the current cache allocations where all can potentially be shared. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/87e88275597fbfa03ea9d41c1186bf012c831c01.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index be152b3b2543..fef254e85da4 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -80,6 +80,22 @@ enum rdt_group_type { RDT_NUM_GROUP, }; +/** + * enum rdtgrp_mode - Mode of a RDT resource group + * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations + * + * The mode of a resource group enables control over the allowed overlap + * between allocations associated with different resource groups (classes + * of service). User is able to modify the mode of a resource group by + * writing to the "mode" resctrl file associated with the resource group. + */ +enum rdtgrp_mode { + RDT_MODE_SHAREABLE = 0, + + /* Must be last */ + RDT_NUM_MODES, +}; + /** * struct mongroup - store mon group's data in resctrl fs. * @mon_data_kn kernlfs node for the mon_data directory -- cgit v1.2.3 From 472ef09b40c5802a2e014ae2503ad8619486f9e6 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:41:55 -0700 Subject: x86/intel_rdt: Associate mode with each RDT resource group Each RDT resource group is associated with a mode that will reflect the level of sharing of its allocations. The default, shareable, will be associated with each resource group on creation since it is zero and resource groups are created with kzalloc. The managing of the mode of a resource group will follow. The default resource group always remain though so ensure that it is reset to the default mode when the resctrl filesystem is unmounted. Also introduce a utility that can be used to determine the mode of a resource group when it is searched for based on its class of service. 
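Once the "mode" resctrl file is added by the following patch, the reset at unmount time can be observed with a sketch like the following (assuming resctrl is mounted at /sys/fs/resctrl; the default resource group's files live at the root of the mount):

umount /sys/fs/resctrl
mount -t resctrl resctrl /sys/fs/resctrl
cat /sys/fs/resctrl/mode
shareable
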
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/797e4e1de4e4fcdf5b5e0039354d6a28079e2015.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 3 +++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index fef254e85da4..482dd6d3b8c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -122,6 +122,7 @@ struct mongroup { * @type: indicates type of this rdtgroup - either * monitor only or ctrl_mon group * @mon: mongroup related data + * @mode: mode of resource group */ struct rdtgroup { struct kernfs_node *kn; @@ -132,6 +133,7 @@ struct rdtgroup { atomic_t waitcount; enum rdt_group_type type; struct mongroup mon; + enum rdtgrp_mode mode; }; /* rdtgroup.flags */ @@ -461,6 +463,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int alloc_rmid(void); void free_rmid(u32 rmid); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index fa668f967062..4a72061df7cd 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -126,6 +126,27 @@ static void closid_free(int closid) closid_free_map |= 1 << closid; } +/** + * rdtgroup_mode_by_closid - Return mode of resource group with closid + * @closid: closid if the resource group + * + * Each resource group is associated with a @closid. Here the mode + * of a resource group can be queried by searching for it using its closid. + * + * Return: mode as &enum rdtgrp_mode of resource group with closid @closid + */ +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) +{ + struct rdtgroup *rdtgrp; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->closid == closid) + return rdtgrp->mode; + } + + return RDT_NUM_MODES; +} + /* set uid and gid of rdtgroup dirs and files to that of the creator */ static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) { @@ -1491,6 +1512,7 @@ static void rdt_kill_sb(struct super_block *sb) reset_all_ctrls(r); cdp_disable_all(); rmdir_all_sub(); + rdtgroup_default.mode = RDT_MODE_SHAREABLE; static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); -- cgit v1.2.3 From d48d7a57f7181b36be748ad8fb806e8de3104e89 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:41:56 -0700 Subject: x86/intel_rdt: Introduce resource group's mode resctrl file A new resctrl file "mode" associated with each resource group is introduced. This file will display the resource group's current mode and an administrator can also use it to modify the resource group's mode. Only shareable mode is currently supported. 
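A minimal usage sketch, assuming resctrl is mounted at /sys/fs/resctrl and a resource group "p0" has been created (mount point and group name are illustrative):

cat /sys/fs/resctrl/p0/mode
shareable
echo shareable > /sys/fs/resctrl/p0/mode

Writing any other string is rejected with -EINVAL and "unknown/unsupported mode" is reported, until additional modes are introduced by later patches.
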
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/20ab78fda26a8c8d98e18ec555f6a1f728948972.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 83 ++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 4a72061df7cd..d00cce3186ad 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -147,6 +147,24 @@ enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) return RDT_NUM_MODES; } +static const char * const rdt_mode_str[] = { + [RDT_MODE_SHAREABLE] = "shareable", +}; + +/** + * rdtgroup_mode_str - Return the string representation of mode + * @mode: the resource group mode as &enum rdtgroup_mode + * + * Return: string representation of valid mode, "unknown" otherwise + */ +static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) +{ + if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) + return "unknown"; + + return rdt_mode_str[mode]; +} + /* set uid and gid of rdtgroup dirs and files to that of the creator */ static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) { @@ -761,6 +779,63 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, return nbytes; } +/* + * rdtgroup_mode_show - Display mode of this resource group + */ +static int rdtgroup_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); + + rdtgroup_kn_unlock(of->kn); + return 0; +} + +static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + enum rdtgrp_mode mode; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + rdt_last_cmd_clear(); + + mode = rdtgrp->mode; + + if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE)) + goto out; + + if (!strcmp(buf, "shareable")) { + rdtgrp->mode = RDT_MODE_SHAREABLE; + } else { + rdt_last_cmd_printf("unknown/unsupported mode\n"); + ret = -EINVAL; + } + +out: + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + /* rdtgroup information files for one cache resource. 
*/ static struct rftype res_common_files[] = { { @@ -874,6 +949,14 @@ static struct rftype res_common_files[] = { .seq_show = rdtgroup_schemata_show, .fflags = RF_CTRL_BASE, }, + { + .name = "mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mode_write, + .seq_show = rdtgroup_mode_show, + .fflags = RF_CTRL_BASE, + }, }; static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) -- cgit v1.2.3 From 0b9aa6562650f42da23c29814adc2318c135cef4 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:41:57 -0700 Subject: x86/intel_rdt: Introduce test to determine if closid is in use During CAT feature discovery the capacity bitmasks (CBMs) associated with all the classes of service are initialized to all ones, even if the class of service is not in use. Introduce a test that can be used to determine if a class of service is in use. This test enables code interested in parsing the CBMs to know if its values are meaningful or can be ignored. Temporarily mark the function as unused to silence compile warnings until it is used. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/798f8d89cd9b12df492d48c14bdc8ee3b39b1c6f.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index d00cce3186ad..659b643fcf94 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -126,6 +126,18 @@ static void closid_free(int closid) closid_free_map |= 1 << closid; } +/** + * closid_allocated - test if provided closid is in use + * @closid: closid to be tested + * + * Return: true if @closid is currently associated with a resource group, + * false if @closid is free + */ +static bool __attribute__ ((unused)) closid_allocated(unsigned int closid) +{ + return (closid_free_map & (1 << closid)) == 0; +} + /** * rdtgroup_mode_by_closid - Return mode of resource group with closid * @closid: closid if the resource group -- cgit v1.2.3 From 024d15be3855044faa8bddf829e3613961fd27ba Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:41:58 -0700 Subject: x86/intel_rdt: Make useful functions available internally In support of the work done to enable resource groups to have different modes some static functions need to be available for sharing amongst all RDT components. 
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/2af8fd6e937ae4fbdaa52dee1123823cb4993176.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 2 ++ arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 2 +- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 482dd6d3b8c5..f2fbf4059b3f 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -465,6 +465,8 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int update_domains(struct rdt_resource *r, int closid); +void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 116d57b248d3..2c23bb136ccc 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -175,7 +175,7 @@ next: return -EINVAL; } -static int update_domains(struct rdt_resource *r, int closid) +int update_domains(struct rdt_resource *r, int closid) { struct msr_param msr_param; cpumask_var_t cpu_mask; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 659b643fcf94..04ef140ebc84 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -121,7 +121,7 @@ static int closid_alloc(void) return closid; } -static void closid_free(int closid) +void closid_free(int closid) { closid_free_map |= 1 << closid; } -- cgit v1.2.3 From 95f0b77efa5749f19e7acfedcb8521da4b13ed0e Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:41:59 -0700 Subject: x86/intel_rdt: Initialize new resource group with sane defaults Currently when a new resource group is created its allocations would be those that belonged to the resource group to which its closid belonged previously. That is, we can encounter a case like: mkdir newgroup cat newgroup/schemata L2:0=ff;1=ff echo 'L2:0=0xf0;1=0xf0' > newgroup/schemata cat newgroup/schemata L2:0=0xf0;1=0xf0 rmdir newgroup mkdir newnewgroup cat newnewgroup/schemata L2:0=0xf0;1=0xf0 When the new group is created it would be reasonable to expect its allocations to be initialized with all regions that it can possibly use. At this time these regions would be all that are shareable by other resource groups as well as regions that are not currently used. If the available cache region is found to be non-contiguous the available region is adjusted to enforce validity. When a new resource group is created the hardware is initialized with these new default allocations. 
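With this change in place the example above is expected to end differently; a sketch, assuming the default resource group still holds the full bitmask and all groups remain in the default shareable mode:

mkdir newnewgroup
cat newnewgroup/schemata
L2:0=ff;1=ff

The new group starts with all shareable and currently unused regions instead of inheriting whatever its closid last held.
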
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/c468ed79340b63024111978e01430bb9589d85c0.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 115 ++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 04ef140ebc84..80ac09cf978b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -133,7 +133,7 @@ void closid_free(int closid) * Return: true if @closid is currently associated with a resource group, * false if @closid is free */ -static bool __attribute__ ((unused)) closid_allocated(unsigned int closid) +static bool closid_allocated(unsigned int closid) { return (closid_free_map & (1 << closid)) == 0; } @@ -1807,6 +1807,110 @@ out_destroy: return ret; } +/** + * cbm_ensure_valid - Enforce validity on provided CBM + * @_val: Candidate CBM + * @r: RDT resource to which the CBM belongs + * + * The provided CBM represents all cache portions available for use. This + * may be represented by a bitmap that does not consist of contiguous ones + * and thus be an invalid CBM. + * Here the provided CBM is forced to be a valid CBM by only considering + * the first set of contiguous bits as valid and clearing all bits. + * The intention here is to provide a valid default CBM with which a new + * resource group is initialized. The user can follow this with a + * modification to the CBM if the default does not satisfy the + * requirements. + */ +static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) +{ + /* + * Convert the u32 _val to an unsigned long required by all the bit + * operations within this function. No more than 32 bits of this + * converted value can be accessed because all bit operations are + * additionally provided with cbm_len that is initialized during + * hardware enumeration using five bits from the EAX register and + * thus never can exceed 32 bits. + */ + unsigned long *val = (unsigned long *)_val; + unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit; + + if (*val == 0) + return; + + first_bit = find_first_bit(val, cbm_len); + zero_bit = find_next_zero_bit(val, cbm_len, first_bit); + + /* Clear any remaining bits to ensure contiguous region */ + bitmap_clear(val, zero_bit, cbm_len - zero_bit); +} + +/** + * rdtgroup_init_alloc - Initialize the new RDT group's allocations + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. That is, all shareable and unused bits. + * + * All-zero CBM is invalid. If there are no more shareable bits available + * on any domain then the entire allocation will fail. 
+ */ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + u32 used_b = 0, unused_b = 0; + u32 closid = rdtgrp->closid; + struct rdt_resource *r; + enum rdtgrp_mode mode; + struct rdt_domain *d; + int i, ret; + u32 *ctrl; + + for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(d, &r->domains, list) { + d->have_new_ctrl = false; + d->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + ctrl = d->ctrl_val; + for (i = 0; i < r->num_closid; i++, ctrl++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + used_b |= *ctrl; + if (mode == RDT_MODE_SHAREABLE) + d->new_ctrl |= *ctrl; + } + } + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + d->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cbm_ensure_valid(&d->new_ctrl, r); + if (bitmap_weight((unsigned long *) &d->new_ctrl, + r->cache.cbm_len) < + r->cache.min_cbm_bits) { + rdt_last_cmd_printf("no space on %s:%d\n", + r->name, d->id); + return -ENOSPC; + } + d->have_new_ctrl = true; + } + } + + for_each_alloc_enabled_rdt_resource(r) { + ret = update_domains(r, rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("failed to initialize allocations\n"); + return ret; + } + rdtgrp->mode = RDT_MODE_SHAREABLE; + } + + return 0; +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, struct kernfs_node *prgrp_kn, const char *name, umode_t mode, @@ -1965,6 +2069,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, ret = 0; rdtgrp->closid = closid; + ret = rdtgroup_init_alloc(rdtgrp); + if (ret < 0) + goto out_id_free; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); if (rdt_mon_capable) { @@ -1975,15 +2083,16 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); if (ret) { rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_id_free; + goto out_del_list; } } goto out_unlock; +out_del_list: + list_del(&rdtgrp->rdtgroup_list); out_id_free: closid_free(closid); - list_del(&rdtgrp->rdtgroup_list); out_common_fail: mkdir_rdt_prepare_clean(rdtgrp); out_unlock: -- cgit v1.2.3 From 414dd2b4732949ddc972c2592a13c799434249c6 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:00 -0700 Subject: x86/intel_rdt: Introduce new "exclusive" mode At the moment all allocations are shareable. There is no way for a user to designate that an allocation associated with a resource group cannot be shared by another. Introduce the new mode "exclusive". When a resource group is marked as such it implies that no overlap is allowed between its allocation and that of another resource group. 
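A usage sketch of the intent; the ability to actually select this mode is added by the follow-on patch (mount point and group name are illustrative):

echo exclusive > /sys/fs/resctrl/p1/mode
cat /sys/fs/resctrl/p1/mode
exclusive

The write only succeeds if the group's current schemata does not overlap with the allocations of any other resource group or with cache regions used directly by hardware.
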
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/f6d24672a4280fe3b24cd2da9b5f50214439c1af.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 2 ++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 1 + 2 files changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index f2fbf4059b3f..c9033fd774d5 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -83,6 +83,7 @@ enum rdt_group_type { /** * enum rdtgrp_mode - Mode of a RDT resource group * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations + * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed * * The mode of a resource group enables control over the allowed overlap * between allocations associated with different resource groups (classes @@ -91,6 +92,7 @@ enum rdt_group_type { */ enum rdtgrp_mode { RDT_MODE_SHAREABLE = 0, + RDT_MODE_EXCLUSIVE, /* Must be last */ RDT_NUM_MODES, diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 80ac09cf978b..910febef55e8 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -161,6 +161,7 @@ enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) static const char * const rdt_mode_str[] = { [RDT_MODE_SHAREABLE] = "shareable", + [RDT_MODE_EXCLUSIVE] = "exclusive", }; /** -- cgit v1.2.3 From 49f7b4efa1101bbc143a960eff3a9c8f9d6f7358 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:01 -0700 Subject: x86/intel_rdt: Enable setting of exclusive mode The new "mode" file now accepts "exclusive" that means that the allocations of this resource group cannot be shared. Enable users to modify a resource group's mode to "exclusive". To succeed it is required that there is no overlap between resource group's current schemata and that of all the other active resource groups as well as cache regions potentially used by other hardware entities. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/83642cbba3c8c21db7fa6bb36fe7d385d3b275f2.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 97 +++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 910febef55e8..ba1410ced420 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -812,6 +812,93 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, return 0; } +/** + * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. 
+ * @exclusive: Only check if overlaps with exclusive resource groups + * + * Checks if provided @cbm intended to be used for @closid on domain + * @d overlaps with any other closids or other hardware usage associated + * with this domain. If @exclusive is true then only overlaps with + * resource groups in exclusive mode will be considered. If @exclusive + * is false then overlaps with any resource group or hardware entities + * will be considered. + * + * Return: false if CBM does not overlap, true if it does. + */ +static bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + u32 _cbm, int closid, bool exclusive) +{ + unsigned long *cbm = (unsigned long *)&_cbm; + unsigned long *ctrl_b; + enum rdtgrp_mode mode; + u32 *ctrl; + int i; + + /* Check for any overlap with regions used by hardware directly */ + if (!exclusive) { + if (bitmap_intersects(cbm, + (unsigned long *)&r->cache.shareable_bits, + r->cache.cbm_len)) + return true; + } + + /* Check for overlap with other resource groups */ + ctrl = d->ctrl_val; + for (i = 0; i < r->num_closid; i++, ctrl++) { + ctrl_b = (unsigned long *)ctrl; + if (closid_allocated(i) && i != closid) { + if (bitmap_intersects(cbm, ctrl_b, r->cache.cbm_len)) { + mode = rdtgroup_mode_by_closid(i); + if (exclusive) { + if (mode == RDT_MODE_EXCLUSIVE) + return true; + continue; + } + return true; + } + } + } + + return false; +} + +/** + * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * + * An exclusive resource group implies that there should be no sharing of + * its allocated resources. At the time this group is considered to be + * exclusive this test can determine if its current schemata supports this + * setting by testing for overlap with all other resource groups. + * + * Return: true if resource group can be exclusive, false if there is overlap + * with allocations of other resource groups and thus this resource group + * cannot be exclusive. + */ +static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) +{ + int closid = rdtgrp->closid; + struct rdt_resource *r; + struct rdt_domain *d; + + for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(d, &r->domains, list) { + if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], + rdtgrp->closid, false)) + return false; + } + } + + return true; +} + +/** + * rdtgroup_mode_write - Modify the resource group's mode + * + */ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -834,11 +921,19 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, mode = rdtgrp->mode; - if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE)) + if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || + (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE)) goto out; if (!strcmp(buf, "shareable")) { rdtgrp->mode = RDT_MODE_SHAREABLE; + } else if (!strcmp(buf, "exclusive")) { + if (!rdtgroup_mode_test_exclusive(rdtgrp)) { + rdt_last_cmd_printf("schemata overlaps\n"); + ret = -EINVAL; + goto out; + } + rdtgrp->mode = RDT_MODE_EXCLUSIVE; } else { rdt_last_cmd_printf("unknown/unsupported mode\n"); ret = -EINVAL; -- cgit v1.2.3 From 9af4c0a6dc1a1abf5336f2c3f951444db6b71da9 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:02 -0700 Subject: x86/intel_rdt: Making CBM name and type more explicit cbm_validate() receives a pointer to the variable that will be initialized with a validated capacity bitmask. 
The pointer points to a variable of type unsigned long that is immediately assigned to a variable of type u32 by the caller on return from cbm_validate(). Let cbm_validate() initialize a variable of type u32 directly. At this time also change tha variable name "data" within parse_cbm() to a name more reflective of the content: "cbm_val". This frees up the generic "data" to be used later when it is indeed used for a collection of input. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/5e29cf0209ea2deac9beacd35cbe5239a50959fb.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 2c23bb136ccc..b3da5b981dd8 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -87,7 +87,7 @@ int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. */ -static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) { unsigned long first_bit, zero_bit, val; unsigned int cbm_len = r->cache.cbm_len; @@ -128,16 +128,17 @@ static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) */ int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) { - unsigned long data; + u32 cbm_val; if (d->have_new_ctrl) { rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; } - if(!cbm_validate(buf, &data, r)) + if (!cbm_validate(buf, &cbm_val, r)) return -EINVAL; - d->new_ctrl = data; + + d->new_ctrl = cbm_val; d->have_new_ctrl = true; return 0; -- cgit v1.2.3 From 7604df6e16ae0b4dba6553ae74abcf7280a512fb Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:03 -0700 Subject: x86/intel_rdt: Support flexible data to parsing callbacks Each resource is associated with a configurable callback that should be used to parse the information provided for the particular resource from user space. In addition to the resource and domain pointers this callback is provided with just the character buffer being parsed. In support of flexible parsing the callback is modified to support a void pointer as argument. This enables resources that need more data than just the user provided data to pass its required data to the callback without affecting the signatures for the callbacks of all the other resources. 
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/34baacfced4d787d994ec7015e249e6c7e619053.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 6 +++--- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index c9033fd774d5..b2b2864150aa 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -371,7 +371,7 @@ struct rdt_resource { struct rdt_cache cache; struct rdt_membw membw; const char *format_str; - int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + int (*parse_ctrlval) (void *data, struct rdt_resource *r, struct rdt_domain *d); struct list_head evt_list; int num_rmid; @@ -379,8 +379,8 @@ struct rdt_resource { unsigned long fflags; }; -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d); extern struct mutex rdtgroup_mutex; diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index b3da5b981dd8..ab4bb8731825 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -64,9 +64,10 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) +int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; + char *buf = _buf; if (d->have_new_ctrl) { rdt_last_cmd_printf("duplicate domain %d\n", d->id); @@ -126,8 +127,9 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) +int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) { + char *buf = _data; u32 cbm_val; if (d->have_new_ctrl) { -- cgit v1.2.3 From 9ab9aa15c3096000678891b61f7309c8729928e0 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:04 -0700 Subject: x86/intel_rdt: Ensure requested schemata respects mode When the administrator requests a change in a resource group's schemata we have to ensure that the new schemata respects the current resource group as well as the other active resource groups' schemata. The new schemata is not allowed to overlap with the schemata of any exclusive resource groups. Similarly, if the resource group being changed is exclusive then its new schemata is not allowed to overlap with any schemata of any other active resource group. 
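An illustrative sketch of the enforcement, assuming group "p1" is in exclusive mode with schemata L2:0=0xf0;1=0xf0 and group "p0" is shareable (mount point and group names are illustrative):

echo 'L2:0=0x30;1=0x30' > /sys/fs/resctrl/p0/schemata

The write is rejected with -EINVAL and "overlaps with exclusive group" is reported via rdt_last_cmd_printf(), because 0x30 overlaps the exclusive allocation 0xf0.
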
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/b0c05b21110d3040fff45f4c1d2cfda8dba3f207.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 2 ++ arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 49 ++++++++++++++++++++++------- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 4 +-- 3 files changed, 41 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index b2b2864150aa..362c8fe695f3 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -465,6 +465,8 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + u32 _cbm, int closid, bool exclusive); enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index ab4bb8731825..0e6210a043f0 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -123,13 +123,19 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return true; } +struct rdt_cbm_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + /* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) { - char *buf = _data; + struct rdt_cbm_parse_data *data = _data; + struct rdtgroup *rdtgrp = data->rdtgrp; u32 cbm_val; if (d->have_new_ctrl) { @@ -137,8 +143,24 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) return -EINVAL; } - if (!cbm_validate(buf, &cbm_val, r)) + if (!cbm_validate(data->buf, &cbm_val, r)) + return -EINVAL; + + /* + * The CBM may not overlap with the CBM of another closid if + * either is exclusive. + */ + if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) { + rdt_last_cmd_printf("overlaps with exclusive group\n"); return -EINVAL; + } + + if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) { + if (rdtgrp->mode == RDT_MODE_EXCLUSIVE) { + rdt_last_cmd_printf("overlaps with other group\n"); + return -EINVAL; + } + } d->new_ctrl = cbm_val; d->have_new_ctrl = true; @@ -152,8 +174,10 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) * separated by ";". The "id" is in decimal, and must match one of * the "id"s for this resource. 
*/ -static int parse_line(char *line, struct rdt_resource *r) +static int parse_line(char *line, struct rdt_resource *r, + struct rdtgroup *rdtgrp) { + struct rdt_cbm_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; unsigned long dom_id; @@ -170,7 +194,9 @@ next: dom = strim(dom); list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { - if (r->parse_ctrlval(dom, r, d)) + data.buf = dom; + data.rdtgrp = rdtgrp; + if (r->parse_ctrlval(&data, r, d)) return -EINVAL; goto next; } @@ -223,13 +249,14 @@ done: return 0; } -static int rdtgroup_parse_resource(char *resname, char *tok, int closid) +static int rdtgroup_parse_resource(char *resname, char *tok, + struct rdtgroup *rdtgrp) { struct rdt_resource *r; for_each_alloc_enabled_rdt_resource(r) { - if (!strcmp(resname, r->name) && closid < r->num_closid) - return parse_line(tok, r); + if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid) + return parse_line(tok, r, rdtgrp); } rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname); return -EINVAL; @@ -242,7 +269,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, struct rdt_domain *dom; struct rdt_resource *r; char *tok, *resname; - int closid, ret = 0; + int ret = 0; /* Valid input requires a trailing newline */ if (nbytes == 0 || buf[nbytes - 1] != '\n') @@ -256,8 +283,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, } rdt_last_cmd_clear(); - closid = rdtgrp->closid; - for_each_alloc_enabled_rdt_resource(r) { list_for_each_entry(dom, &r->domains, list) dom->have_new_ctrl = false; @@ -275,13 +300,13 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, ret = -EINVAL; goto out; } - ret = rdtgroup_parse_resource(resname, tok, closid); + ret = rdtgroup_parse_resource(resname, tok, rdtgrp); if (ret) goto out; } for_each_alloc_enabled_rdt_resource(r) { - ret = update_domains(r, closid); + ret = update_domains(r, rdtgrp->closid); if (ret) goto out; } diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index ba1410ced420..75a43f8b400e 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -829,8 +829,8 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, * * Return: false if CBM does not overlap, true if it does. */ -static bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, - u32 _cbm, int closid, bool exclusive) +bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + u32 _cbm, int closid, bool exclusive) { unsigned long *cbm = (unsigned long *)&_cbm; unsigned long *ctrl_b; -- cgit v1.2.3 From e651901187ab8bc8a0a969144da6ae0e2e59a148 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:05 -0700 Subject: x86/intel_rdt: Introduce "bit_usage" to display cache allocations details With cache regions now explicitly marked as "shareable" or "exclusive" we would like to communicate to the user how portions of the cache are used. Introduce "bit_usage" that indicates for each resource how portions of the cache are configured to be used. 
To assist the user to distinguish whether the sharing is from software or hardware we add the following annotation: 0 - currently unused X - currently available for sharing and used by software and hardware H - currently used by hardware only but available for software use S - currently used and shareable by software only E - currently used exclusively by one resource group Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/105d44c40e582c2b7e2dccf0ae247e5e61137d4b.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 75a43f8b400e..e0ec41a6540e 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -714,6 +714,78 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, return 0; } +/** + * rdt_bit_usage_show - Display current usage of resources + * + * A domain is a shared resource that can now be allocated differently. Here + * we display the current regions of the domain as an annotated bitmask. + * For each domain of this resource its allocation bitmask + * is annotated as below to indicate the current usage of the corresponding bit: + * 0 - currently unused + * X - currently available for sharing and used by software and hardware + * H - currently used by hardware only but available for software use + * S - currently used and shareable by software only + * E - currently used exclusively by one resource group + */ +static int rdt_bit_usage_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + u32 sw_shareable, hw_shareable, exclusive; + struct rdt_domain *dom; + int i, hwb, swb, excl; + enum rdtgrp_mode mode; + bool sep = false; + u32 *ctrl; + + mutex_lock(&rdtgroup_mutex); + hw_shareable = r->cache.shareable_bits; + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_putc(seq, ';'); + ctrl = dom->ctrl_val; + sw_shareable = 0; + exclusive = 0; + seq_printf(seq, "%d=", dom->id); + for (i = 0; i < r->num_closid; i++, ctrl++) { + if (!closid_allocated(i)) + continue; + mode = rdtgroup_mode_by_closid(i); + switch (mode) { + case RDT_MODE_SHAREABLE: + sw_shareable |= *ctrl; + break; + case RDT_MODE_EXCLUSIVE: + exclusive |= *ctrl; + break; + case RDT_NUM_MODES: + WARN(1, + "invalid mode for closid %d\n", i); + break; + } + } + for (i = r->cache.cbm_len - 1; i >= 0; i--) { + hwb = test_bit(i, (unsigned long *)&hw_shareable); + swb = test_bit(i, (unsigned long *)&sw_shareable); + excl = test_bit(i, (unsigned long *)&exclusive); + if (hwb && swb) + seq_putc(seq, 'X'); + else if (hwb && !swb) + seq_putc(seq, 'H'); + else if (!hwb && swb) + seq_putc(seq, 'S'); + else if (excl) + seq_putc(seq, 'E'); + else /* Unused bits remain */ + seq_putc(seq, '0'); + } + sep = true; + } + seq_putc(seq, '\n'); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -995,6 +1067,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_shareable_bits_show, .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, + { + .name = "bit_usage", + .mode = 0444, 
+ .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bit_usage_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, { .name = "min_bandwidth", .mode = 0444, -- cgit v1.2.3 From d9b48c86eb380bb272d0ad8f5338d8dc941f3b32 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:06 -0700 Subject: x86/intel_rdt: Display resource groups' allocations' size in bytes The schemata file displays the allocations associated with each domain of each resource. The syntax of this file reflects the capacity bitmask (CBM) of the actual allocation. In order to determine the actual size of an allocation the user needs to dig through three different files to query the variables needed to compute it (the cache size, the CBM length, and the schemata). Introduce a new file "size" associated with each resource group that will mirror the schemata file syntax and display the size in bytes of each allocation. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/cc0058014c30adb88ca7d1a5abfadacbfb5edd0d.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 2 + arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 81 ++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 362c8fe695f3..ed20e91c28ed 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -467,6 +467,8 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, u32 _cbm, int closid, bool exclusive); +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, + u32 cbm); enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index e0ec41a6540e..cde0f4114d4e 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -20,6 +20,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -1016,6 +1017,78 @@ out: return ret ?: nbytes; } +/** + * rdtgroup_cbm_to_size - Translate CBM to size in bytes + * @r: RDT resource to which @d belongs. + * @d: RDT domain instance. + * @cbm: bitmask for which the size should be computed. + * + * The bitmask provided associated with the RDT domain instance @d will be + * translated into how many bytes it represents. The size in bytes is + * computed by first dividing the total cache size by the CBM length to + * determine how many bytes each bit in the bitmask represents. The result + * is multiplied with the number of bits set in the bitmask. 
+ */ +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, + struct rdt_domain *d, u32 cbm) +{ + struct cpu_cacheinfo *ci; + unsigned int size = 0; + int num_b, i; + + num_b = bitmap_weight((unsigned long *)&cbm, r->cache.cbm_len); + ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask)); + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == r->cache_level) { + size = ci->info_list[i].size / r->cache.cbm_len * num_b; + break; + } + } + + return size; +} + +/** + * rdtgroup_size_show - Display size in bytes of allocated regions + * + * The "size" file mirrors the layout of the "schemata" file, printing the + * size in bytes of each region instead of the capacity bitmask. + * + */ +static int rdtgroup_size_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + struct rdt_domain *d; + unsigned int size; + bool sep = false; + u32 cbm; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + for_each_alloc_enabled_rdt_resource(r) { + seq_printf(s, "%*s:", max_name_width, r->name); + list_for_each_entry(d, &r->domains, list) { + if (sep) + seq_putc(s, ';'); + cbm = d->ctrl_val[rdtgrp->closid]; + size = rdtgroup_cbm_to_size(r, d, cbm); + seq_printf(s, "%d=%u", d->id, size); + sep = true; + } + seq_putc(s, '\n'); + } + + rdtgroup_kn_unlock(of->kn); + + return 0; +} + /* rdtgroup information files for one cache resource. */ static struct rftype res_common_files[] = { { @@ -1144,6 +1217,14 @@ static struct rftype res_common_files[] = { .seq_show = rdtgroup_mode_show, .fflags = RF_CTRL_BASE, }, + { + .name = "size", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_size_show, + .fflags = RF_CTRL_BASE, + }, + }; static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) -- cgit v1.2.3 From bb9fec69cb41380428d6b8dab3a303189a9b480a Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:08 -0700 Subject: x86/intel_rdt: Introduce the Cache Pseudo-Locking modes The two modes used to manage Cache Pseudo-Locked regions are introduced. A resource group is assigned "pseudo-locksetup" mode when the user indicates that this resource group will be used for a Cache Pseudo-Locked region. When the Cache Pseudo-Locked region has been set up successfully after the user wrote the requested schemata to the "schemata" file, then the mode will automatically changed to "pseudo-locked". The user is not able to modify the mode to "pseudo-locked" by writing "pseudo-locked" to the "mode" file directly. 
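A sketch of the intended flow described above; the locking itself is wired up by later patches (mount point, group name, and schemata value are illustrative):

echo pseudo-locksetup > /sys/fs/resctrl/p2/mode
echo 'L2:0=0x3' > /sys/fs/resctrl/p2/schemata
cat /sys/fs/resctrl/p2/mode
pseudo-locked

Writing "pseudo-locked" directly to the "mode" file is rejected.
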
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/98d6ca129bbe7dd0932d1fcfeb3cbb65f29a8d9d.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 10 ++++++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 13 +++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index ed20e91c28ed..7b872f2cc294 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -84,15 +84,25 @@ enum rdt_group_type { * enum rdtgrp_mode - Mode of a RDT resource group * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed + * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking + * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations + * allowed AND the allocations are Cache Pseudo-Locked * * The mode of a resource group enables control over the allowed overlap * between allocations associated with different resource groups (classes * of service). User is able to modify the mode of a resource group by * writing to the "mode" resctrl file associated with the resource group. + * + * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by + * writing the appropriate text to the "mode" file. A resource group enters + * "pseudo-locked" mode after the schemata is written while the resource + * group is in "pseudo-locksetup" mode. */ enum rdtgrp_mode { RDT_MODE_SHAREABLE = 0, RDT_MODE_EXCLUSIVE, + RDT_MODE_PSEUDO_LOCKSETUP, + RDT_MODE_PSEUDO_LOCKED, /* Must be last */ RDT_NUM_MODES, diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index cde0f4114d4e..fef92b939f0f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -161,8 +161,10 @@ enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) } static const char * const rdt_mode_str[] = { - [RDT_MODE_SHAREABLE] = "shareable", - [RDT_MODE_EXCLUSIVE] = "exclusive", + [RDT_MODE_SHAREABLE] = "shareable", + [RDT_MODE_EXCLUSIVE] = "exclusive", + [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", + [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", }; /** @@ -759,6 +761,13 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, case RDT_MODE_EXCLUSIVE: exclusive |= *ctrl; break; + /* + * Temporarily handle pseudo-locking enums + * to silence compile warnings until handling + * added in later patches. + */ + case RDT_MODE_PSEUDO_LOCKSETUP: + case RDT_MODE_PSEUDO_LOCKED: case RDT_NUM_MODES: WARN(1, "invalid mode for closid %d\n", i); -- cgit v1.2.3 From 21220bb199f7d65c8f0a63ac7d3209e40fbdd706 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:09 -0700 Subject: x86/intel_rdt: Respect read and write access By default, if the opener has CAP_DAC_OVERRIDE, a kernfs file can be opened regardless of RW permissions. Writing to a kernfs file will thus succeed even if permissions are 0000. It's required to restrict the actions that can be performed on a resource group from userspace based on the mode of the resource group. This restriction will be done through a modification of the file permissions. 
That is, for example, if a resource group is locked then the user cannot add tasks to the resource group. For this restriction through file permissions to work it has to be ensured that the permissions are always respected. To do so the resctrl filesystem is created with the KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK flag that will result in open(2) failing with -EACCESS regardless of CAP_DAC_OVERRIDE if the permission does not have the respective read or write access. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/26f4fc25f110bfc07c2d2c8b2c4ee904922fedf7.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index fef92b939f0f..6eb716765a3f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -2545,7 +2545,8 @@ static int __init rdtgroup_setup_root(void) int ret; rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, - KERNFS_ROOT_CREATE_DEACTIVATED, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, &rdtgroup_default); if (IS_ERR(rdt_root)) return PTR_ERR(rdt_root); -- cgit v1.2.3 From f7a6e3f6f5ffa4926818ea2b3e8994ecef00b84f Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:10 -0700 Subject: x86/intel_rdt: Add utility to test if tasks assigned to resource group In considering changes to a resource group it becomes necessary to know whether tasks have been assigned to the resource group in question. Introduce a new utility that can be used to check if any tasks have been assigned to a particular resource group. 
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/be9ea3969ffd731dfd90c0ebcd5a0e0a2d135bb2.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 1 + arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 7b872f2cc294..11c76c4b9a5e 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -480,6 +480,7 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, u32 cbm); enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); +int rdtgroup_tasks_assigned(struct rdtgroup *r); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); void closid_free(int closid); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 6eb716765a3f..59e1de9fe2d5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -564,6 +564,32 @@ static int __rdtgroup_move_task(struct task_struct *tsk, return ret; } +/** + * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group + * @r: Resource group + * + * Return: 1 if tasks have been assigned to @r, 0 otherwise + */ +int rdtgroup_tasks_assigned(struct rdtgroup *r) +{ + struct task_struct *p, *t; + int ret = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + rcu_read_lock(); + for_each_process_thread(p, t) { + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) { + ret = 1; + break; + } + } + rcu_read_unlock(); + + return ret; +} + static int rdtgroup_task_write_permission(struct task_struct *task, struct kernfs_open_file *of) { -- cgit v1.2.3 From 125db711e3629977b5e1f06fa066abe6366db294 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:11 -0700 Subject: x86/intel_rdt: Add utility to restrict/restore access to resctrl files When a resource group is used for Cache Pseudo-Locking then the region of cache ends up being orphaned with no class of service referring to it. The resctrl files intended to manage how the classes of services are utilized thus become irrelevant. The fact that a resctrl file is not relevant can be communicated to the user by setting all of its permissions to zero. That is, its read, write, and execute permissions are unset for all users. Introduce two utilities, rdtgroup_kn_mode_restrict() and rdtgroup_kn_mode_restore(), that can be used to restrict and restore the permissions of a file or directory belonging to a resource group. 
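From userspace, the visible effect of restricting a file to mode 0000 is that open() fails with a permission error, even for root, because of the KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK change earlier in this series. A minimal userspace sketch, assuming a hypothetical resource group directory named "p1":

	/* Userspace sketch; the group name "p1" is hypothetical. */
	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/fs/resctrl/p1/tasks", O_WRONLY);

		if (fd < 0)	/* expect EACCES once the file mode is 0000 */
			printf("open failed: %s\n", strerror(errno));
		else
			close(fd);
		return 0;
	}
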
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/7afdbf5551b2f93cd45d61fbf5e01d87331f529a.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 2 + arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 95 ++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 11c76c4b9a5e..8d56776a3bd2 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -469,6 +469,8 @@ void rdt_last_cmd_printf(const char *fmt, ...); void rdt_ctrl_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name); struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, struct list_head **pos); ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 59e1de9fe2d5..08a412e0b47a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1290,6 +1290,101 @@ error: return ret; } +/** + * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * + * The permissions of named resctrl file, directory, or link are modified + * to not allow read, write, or execute by any user. + * + * WARNING: This function is intended to communicate to the user that the + * resctrl file has been locked down - that it is not relevant to the + * particular state the system finds itself in. It should not be relied + * on to protect from user access because after the file's permissions + * are restricted the user can still change the permissions using chmod + * from the command line. + * + * Return: 0 on success, <0 on failure. + */ +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn; + int ret = 0; + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + iattr.ia_mode = S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode = S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode = S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +/** + * rdtgroup_kn_mode_restore - Restore user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * + * Restore the permissions of the named file. If @name is a directory the + * permissions of its parent will be used. + * + * Return: 0 on success, <0 on failure. 
+ */ +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn, *parent; + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + iattr.ia_mode = rft->mode; + } + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + parent = kernfs_get_parent(kn); + if (parent) { + iattr.ia_mode |= parent->mode; + kernfs_put(parent); + } + iattr.ia_mode |= S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode |= S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode |= S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, unsigned long fflags) { -- cgit v1.2.3 From c966dac8a5ede5d5f9b730512d8bdbcec307fe38 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:12 -0700 Subject: x86/intel_rdt: Protect against resource group changes during locking We intend to modify file permissions to make the "tasks", "cpus", and "cpus_list" not accessible to the user when cache pseudo-locking in progress. Even so, it is still possible for the user to force the file permissions (using chmod) to make them writeable. Similarly, directory permissions will be modified to prevent future monitor group creation but the user can override these restrictions also. Add additional checks to the files we intend to restrict to ensure that no modifications from user space are attempted while setting up a pseudo-locking or after a pseudo-locked region is set up. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/0c5cb006e81ead0b8bfff2df530c5d3017fd31d1.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 10 +++++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 32 +++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 0e6210a043f0..bc79396c5dad 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -283,6 +283,16 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, } rdt_last_cmd_clear(); + /* + * No changes to pseudo-locked region allowed. It has to be removed + * and re-created instead. 
+ */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = -EINVAL; + rdt_last_cmd_puts("resource group is pseudo-locked\n"); + goto out; + } + for_each_alloc_enabled_rdt_resource(r) { list_for_each_entry(dom, &r->domains, list) dom->have_new_ctrl = false; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 08a412e0b47a..013cbfedc753 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -449,6 +449,13 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("pseudo-locking in progress\n"); + goto unlock; + } + if (is_cpu_list(of)) ret = cpulist_parse(buf, newmask); else @@ -651,13 +658,22 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } rdt_last_cmd_clear(); - if (rdtgrp) - ret = rdtgroup_move_task(pid, rdtgrp, of); - else - ret = -ENOENT; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("pseudo-locking in progress\n"); + goto unlock; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); +unlock: rdtgroup_kn_unlock(of->kn); return ret ?: nbytes; @@ -2315,6 +2331,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_unlock; } + if (rtype == RDTMON_GROUP && + (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { + ret = -EINVAL; + rdt_last_cmd_puts("pseudo-locking in progress\n"); + goto out_unlock; + } + /* allocate the rdtgroup. */ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { -- cgit v1.2.3 From 2a5d76a4fc6469ea9dd6f02fcd4dad3a129bc1c0 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:13 -0700 Subject: x86/intel_rdt: Utilities to restrict/restore access to specific files In support of Cache Pseudo-Locking we need to restrict access to specific resctrl files to protect the state of a resource group used for pseudo-locking from being changed in unsupported ways. Introduce two utilities that can be used to either restrict or restore the access to all files irrelevant to cache pseudo-locking when pseudo-locking in progress for the resource group. At this time introduce a new source file, intel_rdt_pseudo_lock.c, that will contain most of the code related to cache pseudo-locking. Temporarily mark these new functions as unused to silence compile warnings until they are used. 
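The "temporarily mark as unused" trick relies on the GCC/Clang unused attribute so that static functions without callers do not trip -Wunused-function until the follow-up patches wire them up. A tiny standalone illustration of the idiom (names are made up):

	/* Standalone illustration; builds cleanly with -Wall -Wunused-function
	 * because the attribute suppresses the warning for the uncalled helper. */
	static int __attribute__ ((unused)) helper_without_caller_yet(int x)
	{
		return x + 1;
	}

	int main(void)
	{
		return 0;
	}
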
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/ab6319d1244366be3f9b7f9fba1c3da4810a274b.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/Makefile | 3 +- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 113 ++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c (limited to 'arch') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 7a40196967cb..c4e02555563a 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -35,7 +35,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt_ctrlmondata.o intel_rdt_pseudo_lock.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c new file mode 100644 index 000000000000..dc79b3090ac5 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resource Director Technology (RDT) + * + * Pseudo-locking support built on top of Cache Allocation Technology (CAT) + * + * Copyright (C) 2018 Intel Corporation + * + * Author: Reinette Chatre + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "intel_rdt.h" + +/** + * rdtgroup_locksetup_user_restrict - Restrict user access to group + * @rdtgrp: resource group needing access restricted + * + * A resource group used for cache pseudo-locking cannot have cpus or tasks + * assigned to it. This is communicated to the user by restricting access + * to all the files that can be used to make such changes. + * + * Permissions restored with rdtgroup_locksetup_user_restore() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restriction of access an attempt will be made to restore permissions but + * the state of the mode of these files will be uncertain when a failure + * occurs. + */ +static int __attribute__ ((unused)) +rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); + if (ret) + goto err_cpus; + + if (rdt_mon_capable) { + ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list"); +err_cpus: + rdtgroup_kn_mode_restore(rdtgrp, "cpus"); +err_tasks: + rdtgroup_kn_mode_restore(rdtgrp, "tasks"); +out: + return ret; +} + +/** + * rdtgroup_locksetup_user_restore - Restore user access to group + * @rdtgrp: resource group needing access restored + * + * Restore all file access previously removed using + * rdtgroup_locksetup_user_restrict() + * + * Return: 0 on success, <0 on failure. 
If a failure occurs during the + * restoration of access an attempt will be made to restrict permissions + * again but the state of the mode of these files will be uncertain when + * a failure occurs. + */ +static int __attribute__ ((unused)) +rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks"); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus"); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list"); + if (ret) + goto err_cpus; + + if (rdt_mon_capable) { + ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups"); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); +err_cpus: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); +err_tasks: + rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); +out: + return ret; +} -- cgit v1.2.3 From bbcee99b67c5a8cc4e8037d561be9ed293961fd3 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:14 -0700 Subject: x86/intel_rdt: Add check to determine if monitoring in progress When a resource group is pseudo-locked it is orphaned without a class of service associated with it. We thus do not want any monitoring in progress on a resource group that will be used for pseudo-locking. Introduce a test that can be used to determine if pseudo-locking in progress on a resource group. Temporarily mark it as unused to avoid compile warnings until it is used. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/14fd9494f87ca72a213b3a197d1172d4e66ae196.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index dc79b3090ac5..8693dbe602a2 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -13,6 +13,19 @@ #include "intel_rdt.h" +/** + * rdtgroup_monitor_in_progress - Test if monitoring in progress + * @r: resource group being queried + * + * Return: 1 if monitor groups have been created for this resource + * group, 0 otherwise. + */ +static int __attribute__ ((unused)) +rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) +{ + return !list_empty(&rdtgrp->mon.crdtgrp_list); +} + /** * rdtgroup_locksetup_user_restrict - Restrict user access to group * @rdtgrp: resource group needing access restricted -- cgit v1.2.3 From e8140a2d13d429364f18ca41b1e4960708c3a40e Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:15 -0700 Subject: x86/intel_rdt: Introduce pseudo-locked region A pseudo-locked region is introduced representing an instance of a pseudo-locked cache region. Each cache instance (domain) can support one pseudo-locked region. Similarly a resource group can be used for one pseudo-locked region. Include a pointer to a pseudo-locked region from the domain and resource group structures. 
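With the plr pointers added to both struct rdtgroup and struct rdt_domain, either object can be queried for an attached pseudo-locked region. A minimal sketch of such a query against the domain side (the helper name is illustrative, not part of the patch):

	/* Illustrative helper: report whether a cache domain already carries a
	 * pseudo-locked region and, if so, which capacity bitmask it occupies. */
	static bool domain_plr_cbm(struct rdt_domain *d, u32 *cbm)
	{
		if (!d->plr)
			return false;

		*cbm = d->plr->cbm;
		return true;
	}
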
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/9f69eb159051067703bcbc714de62e69874d5dee.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 64 ++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 8d56776a3bd2..f3dcbcd03dfa 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -122,6 +122,20 @@ struct mongroup { u32 rmid; }; +/** + * struct pseudo_lock_region - pseudo-lock region information + * @r: RDT resource to which this pseudo-locked region + * belongs + * @d: RDT domain to which this pseudo-locked region + * belongs + * @cbm: bitmask of the pseudo-locked region + */ +struct pseudo_lock_region { + struct rdt_resource *r; + struct rdt_domain *d; + u32 cbm; +}; + /** * struct rdtgroup - store rdtgroup's data in resctrl file system. * @kn: kernfs node @@ -135,17 +149,19 @@ struct mongroup { * monitor only or ctrl_mon group * @mon: mongroup related data * @mode: mode of resource group + * @plr: pseudo-locked region */ struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - u32 closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; - enum rdt_group_type type; - struct mongroup mon; - enum rdtgrp_mode mode; + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; + enum rdtgrp_mode mode; + struct pseudo_lock_region *plr; }; /* rdtgroup.flags */ @@ -246,22 +262,24 @@ struct mbm_state { * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps * @new_ctrl: new ctrl value to be loaded * @have_new_ctrl: did user provide new_ctrl for this domain + * @plr: pseudo-locked region (if any) associated with domain */ struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; - struct delayed_work mbm_over; - struct delayed_work cqm_limbo; - int mbm_work_cpu; - int cqm_work_cpu; - u32 *ctrl_val; - u32 *mbps_val; - u32 new_ctrl; - bool have_new_ctrl; + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + u32 *ctrl_val; + u32 *mbps_val; + u32 new_ctrl; + bool have_new_ctrl; + struct pseudo_lock_region *plr; }; /** -- cgit v1.2.3 From 63657c1cdf89a37d3b69471ad4e5b55c77e86d3f Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:16 -0700 Subject: x86/intel_rdt: Support enter/exit of locksetup mode The locksetup mode is the way in which the user communicates that the resource group will be used for a pseudo-locked region. Locksetup mode should thus ensure that all restrictions on a resource group are met before locksetup mode can be entered. The resource group should also be configured to ensure that it cannot be modified in unsupported ways when a pseudo-locked region. Introduce the support where the request for entering locksetup mode can be validated. 
This includes: CDP is not active, no cpus or tasks are assigned to the resource group, monitoring is not in progress on the resource group. Once the resource group is determined ready for a pseudo-locked region it is configured to not allow future changes to these properties. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/b120f71ced30116bcc6c6f651e8a7906ae6b903d.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 2 + arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 187 +++++++++++++++++++++++++++- 2 files changed, 183 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index f3dcbcd03dfa..338fe47396a5 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -501,6 +501,8 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, u32 cbm); enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); int rdtgroup_tasks_assigned(struct rdtgroup *r); +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); void closid_free(int closid); diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 8693dbe602a2..ce8243c87877 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -11,8 +11,48 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include "intel_rdt.h" +/** + * pseudo_lock_init - Initialize a pseudo-lock region + * @rdtgrp: resource group to which new pseudo-locked region will belong + * + * A pseudo-locked region is associated with a resource group. When this + * association is created the pseudo-locked region is initialized. The + * details of the pseudo-locked region are not known at this time so only + * allocation is done and association established. + * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_init(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr; + + plr = kzalloc(sizeof(*plr), GFP_KERNEL); + if (!plr) + return -ENOMEM; + + rdtgrp->plr = plr; + return 0; +} + +/** + * pseudo_lock_free - Free a pseudo-locked region + * @rdtgrp: resource group to which pseudo-locked region belonged + * + * The pseudo-locked region's resources have already been released, or not + * yet created at this point. Now it can be freed and disassociated from the + * resource group. + * + * Return: void + */ +static void pseudo_lock_free(struct rdtgroup *rdtgrp) +{ + kfree(rdtgrp->plr); + rdtgrp->plr = NULL; +} + /** * rdtgroup_monitor_in_progress - Test if monitoring in progress * @r: resource group being queried @@ -20,8 +60,7 @@ * Return: 1 if monitor groups have been created for this resource * group, 0 otherwise. */ -static int __attribute__ ((unused)) -rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) +static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) { return !list_empty(&rdtgrp->mon.crdtgrp_list); } @@ -41,8 +80,7 @@ rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) * the state of the mode of these files will be uncertain when a failure * occurs. 
*/ -static int __attribute__ ((unused)) -rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) +static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) { int ret; @@ -89,8 +127,7 @@ out: * again but the state of the mode of these files will be uncertain when * a failure occurs. */ -static int __attribute__ ((unused)) -rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) +static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) { int ret; @@ -124,3 +161,141 @@ err_tasks: out: return ret; } + +/** + * rdtgroup_locksetup_enter - Resource group enters locksetup mode + * @rdtgrp: resource group requested to enter locksetup mode + * + * A resource group enters locksetup mode to reflect that it would be used + * to represent a pseudo-locked region and is in the process of being set + * up to do so. A resource group used for a pseudo-locked region would + * lose the closid associated with it so we cannot allow it to have any + * tasks or cpus assigned nor permit tasks or cpus to be assigned in the + * future. Monitoring of a pseudo-locked region is not allowed either. + * + * The above and more restrictions on a pseudo-locked region are checked + * for and enforced before the resource group enters the locksetup mode. + * + * Returns: 0 if the resource group successfully entered locksetup mode, <0 + * on failure. On failure the last_cmd_status buffer is updated with text to + * communicate details of failure to the user. + */ +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) +{ + int ret; + + /* + * The default resource group can neither be removed nor lose the + * default closid associated with it. + */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("cannot pseudo-lock default group\n"); + return -EINVAL; + } + + /* + * Cache Pseudo-locking not supported when CDP is enabled. + * + * Some things to consider if you would like to enable this + * support (using L3 CDP as example): + * - When CDP is enabled two separate resources are exposed, + * L3DATA and L3CODE, but they are actually on the same cache. + * The implication for pseudo-locking is that if a + * pseudo-locked region is created on a domain of one + * resource (eg. L3CODE), then a pseudo-locked region cannot + * be created on that same domain of the other resource + * (eg. L3DATA). This is because the creation of a + * pseudo-locked region involves a call to wbinvd that will + * affect all cache allocations on particular domain. + * - Considering the previous, it may be possible to only + * expose one of the CDP resources to pseudo-locking and + * hide the other. For example, we could consider to only + * expose L3DATA and since the L3 cache is unified it is + * still possible to place instructions there are execute it. + * - If only one region is exposed to pseudo-locking we should + * still keep in mind that availability of a portion of cache + * for pseudo-locking should take into account both resources. + * Similarly, if a pseudo-locked region is created in one + * resource, the portion of cache used by it should be made + * unavailable to all future allocations from both resources. 
+ */ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled || + rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) { + rdt_last_cmd_puts("CDP enabled\n"); + return -EINVAL; + } + + if (rdtgroup_monitor_in_progress(rdtgrp)) { + rdt_last_cmd_puts("monitoring in progress\n"); + return -EINVAL; + } + + if (rdtgroup_tasks_assigned(rdtgrp)) { + rdt_last_cmd_puts("tasks assigned to resource group\n"); + return -EINVAL; + } + + if (!cpumask_empty(&rdtgrp->cpu_mask)) { + rdt_last_cmd_puts("CPUs assigned to resource group\n"); + return -EINVAL; + } + + if (rdtgroup_locksetup_user_restrict(rdtgrp)) { + rdt_last_cmd_puts("unable to modify resctrl permissions\n"); + return -EIO; + } + + ret = pseudo_lock_init(rdtgrp); + if (ret) { + rdt_last_cmd_puts("unable to init pseudo-lock region\n"); + goto out_release; + } + + /* + * If this system is capable of monitoring a rmid would have been + * allocated when the control group was created. This is not needed + * anymore when this group would be used for pseudo-locking. This + * is safe to call on platforms not capable of monitoring. + */ + free_rmid(rdtgrp->mon.rmid); + + ret = 0; + goto out; + +out_release: + rdtgroup_locksetup_user_restore(rdtgrp); +out: + return ret; +} + +/** + * rdtgroup_locksetup_exit - resource group exist locksetup mode + * @rdtgrp: resource group + * + * When a resource group exits locksetup mode the earlier restrictions are + * lifted. + * + * Return: 0 on success, <0 on failure + */ +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) +{ + int ret; + + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) { + rdt_last_cmd_puts("out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + } + + ret = rdtgroup_locksetup_user_restore(rdtgrp); + if (ret) { + free_rmid(rdtgrp->mon.rmid); + return ret; + } + + pseudo_lock_free(rdtgrp); + return 0; +} -- cgit v1.2.3 From dfe9674b04ff6b819f9105650a008d164d81725e Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:17 -0700 Subject: x86/intel_rdt: Enable entering of pseudo-locksetup mode The user can request entering pseudo-locksetup mode by writing "pseudo-locksetup" to the mode file. Act on this request as well as support switching from a pseudo-locksetup mode (before pseudo-locked mode was entered). It is not supported to modify the mode once pseudo-locked mode has been entered. The schemata reflects the new mode by adding "uninitialized" to all resources. The size resctrl file reports zero for all cache domains in support of the uninitialized nature. Since there are no users of this class of service its allocations can be ignored when searching for appropriate default allocations for new resource groups. For the same reason resource groups in pseudo-locksetup mode are not considered when testing if new resource groups may overlap. 
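From userspace the transition is driven entirely through resctrl files: write "pseudo-locksetup" to the group's mode file, after which this patch makes the schemata file report "uninitialized" for each allocation-capable resource and the size file report zero. A rough userspace sketch of that sequence, using a hypothetical group directory "p1":

	/* Userspace sketch; paths and the group name "p1" are hypothetical. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[256];
		ssize_t n;
		int fd;

		fd = open("/sys/fs/resctrl/p1/mode", O_WRONLY);
		if (fd < 0 || write(fd, "pseudo-locksetup\n", 17) < 0)
			perror("enter pseudo-locksetup");
		if (fd >= 0)
			close(fd);

		fd = open("/sys/fs/resctrl/p1/schemata", O_RDONLY);
		if (fd >= 0) {
			n = read(fd, buf, sizeof(buf) - 1);
			if (n > 0) {
				buf[n] = '\0';
				fputs(buf, stdout);	/* e.g. "L3:uninitialized" */
			}
			close(fd);
		}
		return 0;
	}
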
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/56f553334708022903c296284e62db3bbc1ff150.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 16 +++++++---- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 41 +++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index bc79396c5dad..1ed273220ffa 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -156,7 +156,8 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) } if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) { - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE) { + if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { rdt_last_cmd_printf("overlaps with other group\n"); return -EINVAL; } @@ -356,10 +357,15 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { - closid = rdtgrp->closid; - for_each_alloc_enabled_rdt_resource(r) { - if (closid < r->num_closid) - show_doms(s, r, closid); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + for_each_alloc_enabled_rdt_resource(r) + seq_printf(s, "%s:uninitialized\n", r->name); + } else { + closid = rdtgrp->closid; + for_each_alloc_enabled_rdt_resource(r) { + if (closid < r->num_closid) + show_doms(s, r, closid); + } } } else { ret = -ENOENT; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 013cbfedc753..bd4e2de303dc 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -974,9 +974,10 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, ctrl = d->ctrl_val; for (i = 0; i < r->num_closid; i++, ctrl++) { ctrl_b = (unsigned long *)ctrl; - if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (closid_allocated(i) && i != closid && + mode != RDT_MODE_PSEUDO_LOCKSETUP) { if (bitmap_intersects(cbm, ctrl_b, r->cache.cbm_len)) { - mode = rdtgroup_mode_by_closid(i); if (exclusive) { if (mode == RDT_MODE_EXCLUSIVE) return true; @@ -1046,10 +1047,24 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, mode = rdtgrp->mode; if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || - (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE)) + (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || + (!strcmp(buf, "pseudo-locksetup") && + mode == RDT_MODE_PSEUDO_LOCKSETUP) || + (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) goto out; + if (mode == RDT_MODE_PSEUDO_LOCKED) { + rdt_last_cmd_printf("cannot change pseudo-locked group\n"); + ret = -EINVAL; + goto out; + } + if (!strcmp(buf, "shareable")) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } rdtgrp->mode = RDT_MODE_SHAREABLE; } else if (!strcmp(buf, "exclusive")) { if (!rdtgroup_mode_test_exclusive(rdtgrp)) { @@ -1057,7 +1072,17 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, ret = -EINVAL; goto out; } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = 
rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } rdtgrp->mode = RDT_MODE_EXCLUSIVE; + } else if (!strcmp(buf, "pseudo-locksetup")) { + ret = rdtgroup_locksetup_enter(rdtgrp); + if (ret) + goto out; + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; } else { rdt_last_cmd_printf("unknown/unsupported mode\n"); ret = -EINVAL; @@ -1127,8 +1152,12 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, list_for_each_entry(d, &r->domains, list) { if (sep) seq_putc(s, ';'); - cbm = d->ctrl_val[rdtgrp->closid]; - size = rdtgroup_cbm_to_size(r, d, cbm); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + size = 0; + } else { + cbm = d->ctrl_val[rdtgrp->closid]; + size = rdtgroup_cbm_to_size(r, d, cbm); + } seq_printf(s, "%d=%u", d->id, size); sep = true; } @@ -2277,6 +2306,8 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) for (i = 0; i < r->num_closid; i++, ctrl++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + break; used_b |= *ctrl; if (mode == RDT_MODE_SHAREABLE) d->new_ctrl |= *ctrl; -- cgit v1.2.3 From 17eafd076291ed23eeb17b28132fa33b0688bc57 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:18 -0700 Subject: x86/intel_rdt: Split resource group removal in two Resource groups used for pseudo-locking do not require the same work on removal as the other resource groups. The resource group removal is split in two in preparation for support of pseudo-locking resource groups. A single re-ordering occurs - the setting of the rdtgrp flag is moved to later. This flag is not used by any of the code between its original and new location. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/c8cbf7a7c72480b39bb946a929dbae96c0f9aca1.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index bd4e2de303dc..526cd4a97054 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -2620,6 +2620,21 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, return 0; } +static int rdtgroup_ctrl_remove(struct kernfs_node *kn, + struct rdtgroup *rdtgrp) +{ + rdtgrp->flags = RDT_DELETED; + list_del(&rdtgrp->rdtgroup_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + return 0; +} + static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { @@ -2645,7 +2660,6 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); - rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); free_rmid(rdtgrp->mon.rmid); @@ -2654,14 +2668,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, */ free_all_child_rdtgrp(rdtgrp); - list_del(&rdtgrp->rdtgroup_list); - - /* - * one extra hold on this, will drop when we kfree(rdtgrp) - * in rdtgroup_kn_unlock() - */ - kernfs_get(kn); - kernfs_remove(rdtgrp->kn); + 
rdtgroup_ctrl_remove(kn, rdtgrp); return 0; } -- cgit v1.2.3 From 72d505056604a305a4fcd8b268d2f6e979e17023 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:19 -0700 Subject: x86/intel_rdt: Add utilities to test pseudo-locked region possibility A pseudo-locked region does not have a class of service associated with it and thus not tracked in the array of control values maintained as part of the domain. Even so, when the user provides a new bitmask for another resource group it needs to be checked for interference with existing pseudo-locked regions. Additionally only one pseudo-locked region can be created in any cache hierarchy. Introduce two utilities in support of above scenarios: (1) a utility that can be used to test if a given capacity bitmask overlaps with any pseudo-locked regions associated with a particular cache instance, (2) a utility that can be used to test if a pseudo-locked region exists within a particular cache hierarchy. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/b8e31dbdcf22ddf71df46072647b47e7558abb32.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 2 + arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 74 +++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 338fe47396a5..b5d3ddfa94e6 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -503,6 +503,8 @@ enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); int rdtgroup_tasks_assigned(struct rdtgroup *r); int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm); +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); void closid_free(int closid); diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index ce8243c87877..b145a7386b10 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -299,3 +299,77 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) pseudo_lock_free(rdtgrp); return 0; } + +/** + * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked + * @d: RDT domain + * @_cbm: CBM to test + * + * @d represents a cache instance and @_cbm a capacity bitmask that is + * considered for it. Determine if @_cbm overlaps with any existing + * pseudo-locked region on @d. + * + * Return: true if @_cbm overlaps with pseudo-locked region on @d, false + * otherwise. 
+ */ +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm) +{ + unsigned long *cbm = (unsigned long *)&_cbm; + unsigned long *cbm_b; + unsigned int cbm_len; + + if (d->plr) { + cbm_len = d->plr->r->cache.cbm_len; + cbm_b = (unsigned long *)&d->plr->cbm; + if (bitmap_intersects(cbm, cbm_b, cbm_len)) + return true; + } + + return false; +} + +/** + * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy + * @d: RDT domain under test + * + * The setup of a pseudo-locked region affects all cache instances within + * the hierarchy of the region. It is thus essential to know if any + * pseudo-locked regions exist within a cache hierarchy to prevent any + * attempts to create new pseudo-locked regions in the same hierarchy. + * + * Return: true if a pseudo-locked region exists in the hierarchy of @d or + * if it is not possible to test due to memory allocation issue, + * false otherwise. + */ +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) +{ + cpumask_var_t cpu_with_psl; + struct rdt_resource *r; + struct rdt_domain *d_i; + bool ret = false; + + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) + return true; + + /* + * First determine which cpus have pseudo-locked regions + * associated with them. + */ + for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(d_i, &r->domains, list) { + if (d_i->plr) + cpumask_or(cpu_with_psl, cpu_with_psl, + &d_i->cpu_mask); + } + } + + /* + * Next test if new pseudo-locked region would intersect with + * existing region. + */ + if (cpumask_intersects(&d->cpu_mask, cpu_with_psl)) + ret = true; + + free_cpumask_var(cpu_with_psl); + return ret; +} -- cgit v1.2.3 From f2a177292bd052ce12ac453d2ceeb083fe07718a Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:20 -0700 Subject: x86/intel_rdt: Discover supported platforms via prefetch disable bits Knowing the model specific prefetch disable bits is required to support cache pseudo-locking because the hardware prefetchers need to be disabled when the kernel memory is pseudo-locked to cache. We add these bits only for platforms known to support cache pseudo-locking. When the user requests locksetup mode to be entered it will fail if the prefetch disabling bits are not known for the platform. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/3eef559aa9fd693a104ff99ff909cfee450c1695.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 75 +++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index b145a7386b10..cbba4bc17522 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -12,8 +12,73 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include "intel_rdt.h" +/* + * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware + * prefetcher state. Details about this register can be found in the MSR + * tables for specific platforms found in Intel's SDM. + */ +#define MSR_MISC_FEATURE_CONTROL 0x000001a4 + +/* + * The bits needed to disable hardware prefetching varies based on the + * platform. During initialization we will discover which bits to use. 
+ */ +static u64 prefetch_disable_bits; + +/** + * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * + * Capture the list of platforms that have been validated to support + * pseudo-locking. This includes testing to ensure pseudo-locked regions + * with low cache miss rates can be created under variety of load conditions + * as well as that these pseudo-locked regions can maintain their low cache + * miss rates under variety of load conditions for significant lengths of time. + * + * After a platform has been validated to support pseudo-locking its + * hardware prefetch disable bits are included here as they are documented + * in the SDM. + * + * Return: + * If platform is supported, the bits to disable hardware prefetchers, 0 + * if platform is not supported. + */ +static u64 get_prefetch_disable_bits(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return 0; + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_X: + /* + * SDM defines bits of MSR_MISC_FEATURE_CONTROL register + * as: + * 0 L2 Hardware Prefetcher Disable (R/W) + * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W) + * 2 DCU Hardware Prefetcher Disable (R/W) + * 3 DCU IP Prefetcher Disable (R/W) + * 63:4 Reserved + */ + return 0xF; + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_GEMINI_LAKE: + /* + * SDM defines bits of MSR_MISC_FEATURE_CONTROL register + * as: + * 0 L2 Hardware Prefetcher Disable (R/W) + * 1 Reserved + * 2 DCU Hardware Prefetcher Disable (R/W) + * 63:3 Reserved + */ + return 0x5; + } + + return 0; +} + /** * pseudo_lock_init - Initialize a pseudo-lock region * @rdtgrp: resource group to which new pseudo-locked region will belong @@ -225,6 +290,16 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) return -EINVAL; } + /* + * Not knowing the bits to disable prefetching implies that this + * platform does not support Cache Pseudo-Locking. + */ + prefetch_disable_bits = get_prefetch_disable_bits(); + if (prefetch_disable_bits == 0) { + rdt_last_cmd_puts("pseudo-locking not supported\n"); + return -EINVAL; + } + if (rdtgroup_monitor_in_progress(rdtgrp)) { rdt_last_cmd_puts("monitoring in progress\n"); return -EINVAL; -- cgit v1.2.3 From 018961ae5579016d46ee03587c0ecf673821eedb Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:21 -0700 Subject: x86/intel_rdt: Pseudo-lock region creation/removal core The user requests a pseudo-locked region by providing a schemata to a resource group that is in the pseudo-locksetup mode. This is the functionality that consumes the parsed user data and creates the pseudo-locked region. First, required information is deduced from user provided data. This includes, how much memory does the requested bitmask represent, which CPU the requested region is associated with, and what is the cache line size of that cache (to learn the stride needed for locking). Second, a contiguous block of memory matching the requested bitmask is allocated. Finally, pseudo-locking is performed. The resource group already has the allocation that reflects the requested bitmask. With this class of service active and interference minimized, the allocated memory is loaded into the cache. 
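The amount of memory allocated for a region follows directly from the capacity bitmask: the fraction of the cache instance covered by the set bits. The patch's rdtgroup_cbm_to_size() walks the cacheinfo leaves to find the real cache size; the standalone sketch below shows only the arithmetic, with made-up example values:

	/* Standalone sketch of the CBM-to-bytes calculation. */
	#include <stdio.h>

	static unsigned int cbm_to_size(unsigned int cache_bytes,
					unsigned int cbm_len, unsigned long cbm)
	{
		return cache_bytes / cbm_len * __builtin_popcountl(cbm);
	}

	int main(void)
	{
		/* e.g. a 24 MiB L3 with an 11-bit CBM, locking two bits (0x3) */
		printf("%u bytes\n", cbm_to_size(24 * 1024 * 1024, 11, 0x3));
		return 0;
	}
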
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/67391160bbf06143bc62d856d3d234eb152008b7.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 17 ++ arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 337 +++++++++++++++++++++++++++- 2 files changed, 353 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index b5d3ddfa94e6..f245aaae514e 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -129,11 +129,26 @@ struct mongroup { * @d: RDT domain to which this pseudo-locked region * belongs * @cbm: bitmask of the pseudo-locked region + * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread + * completion + * @thread_done: variable used by waitqueue to test if pseudo-locking + * thread completed + * @cpu: core associated with the cache on which the setup code + * will be run + * @line_size: size of the cache lines + * @size: size of pseudo-locked region in bytes + * @kmem: the kernel memory associated with pseudo-locked region */ struct pseudo_lock_region { struct rdt_resource *r; struct rdt_domain *d; u32 cbm; + wait_queue_head_t lock_thread_wq; + int thread_done; + int cpu; + unsigned int line_size; + unsigned int size; + void *kmem; }; /** @@ -505,6 +520,8 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm); bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); void closid_free(int closid); diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index cbba4bc17522..22387af30a9e 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -11,8 +11,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include #include +#include #include +#include #include "intel_rdt.h" /* @@ -79,6 +85,53 @@ static u64 get_prefetch_disable_bits(void) return 0; } +/** + * pseudo_lock_region_init - Initialize pseudo-lock region information + * @plr: pseudo-lock region + * + * Called after user provided a schemata to be pseudo-locked. From the + * schemata the &struct pseudo_lock_region is on entry already initialized + * with the resource, domain, and capacity bitmask. Here the information + * required for pseudo-locking is deduced from this data and &struct + * pseudo_lock_region initialized further. This information includes: + * - size in bytes of the region to be pseudo-locked + * - cache line size to know the stride with which data needs to be accessed + * to be pseudo-locked + * - a cpu associated with the cache instance on which the pseudo-locking + * flow can be executed + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. 
+ */ +static int pseudo_lock_region_init(struct pseudo_lock_region *plr) +{ + struct cpu_cacheinfo *ci; + int i; + + /* Pick the first cpu we find that is associated with the cache. */ + plr->cpu = cpumask_first(&plr->d->cpu_mask); + + if (!cpu_online(plr->cpu)) { + rdt_last_cmd_printf("cpu %u associated with cache not online\n", + plr->cpu); + return -ENODEV; + } + + ci = get_cpu_cacheinfo(plr->cpu); + + plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm); + + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == plr->r->cache_level) { + plr->line_size = ci->info_list[i].coherency_line_size; + return 0; + } + } + + rdt_last_cmd_puts("unable to determine cache line size\n"); + return -1; +} + /** * pseudo_lock_init - Initialize a pseudo-lock region * @rdtgrp: resource group to which new pseudo-locked region will belong @@ -98,10 +151,69 @@ static int pseudo_lock_init(struct rdtgroup *rdtgrp) if (!plr) return -ENOMEM; + init_waitqueue_head(&plr->lock_thread_wq); rdtgrp->plr = plr; return 0; } +/** + * pseudo_lock_region_clear - Reset pseudo-lock region data + * @plr: pseudo-lock region + * + * All content of the pseudo-locked region is reset - any memory allocated + * freed. + * + * Return: void + */ +static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) +{ + plr->size = 0; + plr->line_size = 0; + kfree(plr->kmem); + plr->kmem = NULL; + plr->r = NULL; + if (plr->d) + plr->d->plr = NULL; + plr->d = NULL; + plr->cbm = 0; +} + +/** + * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked + * @plr: pseudo-lock region + * + * Initialize the details required to set up the pseudo-locked region and + * allocate the contiguous memory that will be pseudo-locked to the cache. + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) +{ + int ret; + + ret = pseudo_lock_region_init(plr); + if (ret < 0) + return ret; + + /* + * We do not yet support contiguous regions larger than + * KMALLOC_MAX_SIZE. + */ + if (plr->size > KMALLOC_MAX_SIZE) { + rdt_last_cmd_puts("requested region exceeds maximum size\n"); + return -E2BIG; + } + + plr->kmem = kzalloc(plr->size, GFP_KERNEL); + if (!plr->kmem) { + rdt_last_cmd_puts("unable to allocate memory\n"); + return -ENOMEM; + } + + return 0; +} + /** * pseudo_lock_free - Free a pseudo-locked region * @rdtgrp: resource group to which pseudo-locked region belonged @@ -114,10 +226,142 @@ static int pseudo_lock_init(struct rdtgroup *rdtgrp) */ static void pseudo_lock_free(struct rdtgroup *rdtgrp) { + pseudo_lock_region_clear(rdtgrp->plr); kfree(rdtgrp->plr); rdtgrp->plr = NULL; } +/** + * pseudo_lock_fn - Load kernel memory into cache + * @_rdtgrp: resource group to which pseudo-lock region belongs + * + * This is the core pseudo-locking flow. + * + * First we ensure that the kernel memory cannot be found in the cache. + * Then, while taking care that there will be as little interference as + * possible, the memory to be loaded is accessed while core is running + * with class of service set to the bitmask of the pseudo-locked region. + * After this is complete no future CAT allocations will be allowed to + * overlap with this bitmask. + * + * Local register variables are utilized to ensure that the memory region + * to be locked is the only memory access made during the critical locking + * loop. + * + * Return: 0. Waiter on waitqueue will be woken on completion. 
+ */ +static int pseudo_lock_fn(void *_rdtgrp) +{ + struct rdtgroup *rdtgrp = _rdtgrp; + struct pseudo_lock_region *plr = rdtgrp->plr; + u32 rmid_p, closid_p; + unsigned long i; +#ifdef CONFIG_KASAN + /* + * The registers used for local register variables are also used + * when KASAN is active. When KASAN is active we use a regular + * variable to ensure we always use a valid pointer, but the cost + * is that this variable will enter the cache through evicting the + * memory we are trying to lock into the cache. Thus expect lower + * pseudo-locking success rate when KASAN is active. + */ + unsigned int line_size; + unsigned int size; + void *mem_r; +#else + register unsigned int line_size asm("esi"); + register unsigned int size asm("edi"); +#ifdef CONFIG_X86_64 + register void *mem_r asm("rbx"); +#else + register void *mem_r asm("ebx"); +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_KASAN */ + + /* + * Make sure none of the allocated memory is cached. If it is we + * will get a cache hit in below loop from outside of pseudo-locked + * region. + * wbinvd (as opposed to clflush/clflushopt) is required to + * increase likelihood that allocated cache portion will be filled + * with associated memory. + */ + native_wbinvd(); + + /* + * Always called with interrupts enabled. By disabling interrupts + * ensure that we will not be preempted during this critical section. + */ + local_irq_disable(); + + /* + * Call wrmsr and rdmsr as directly as possible to avoid tracing + * clobbering local register variables or affecting cache accesses. + * + * Disable the hardware prefetcher so that when the end of the memory + * being pseudo-locked is reached the hardware will not read beyond + * the buffer and evict pseudo-locked memory read earlier from the + * cache. + */ + __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + closid_p = this_cpu_read(pqr_state.cur_closid); + rmid_p = this_cpu_read(pqr_state.cur_rmid); + mem_r = plr->kmem; + size = plr->size; + line_size = plr->line_size; + /* + * Critical section begin: start by writing the closid associated + * with the capacity bitmask of the cache region being + * pseudo-locked followed by reading of kernel memory to load it + * into the cache. + */ + __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); + /* + * Cache was flushed earlier. Now access kernel memory to read it + * into cache region associated with just activated plr->closid. + * Loop over data twice: + * - In first loop the cache region is shared with the page walker + * as it populates the paging structure caches (including TLB). + * - In the second loop the paging structure caches are used and + * cache region is populated with the memory being referenced. + */ + for (i = 0; i < size; i += PAGE_SIZE) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + for (i = 0; i < size; i += line_size) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Critical section end: restore closid with capacity bitmask that + * does not overlap with pseudo-locked region. 
+ */ + __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); + + /* Re-enable the hardware prefetcher(s) */ + wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + local_irq_enable(); + + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + /** * rdtgroup_monitor_in_progress - Test if monitoring in progress * @r: resource group being queried @@ -399,7 +643,6 @@ bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm) if (bitmap_intersects(cbm, cbm_b, cbm_len)) return true; } - return false; } @@ -448,3 +691,95 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) free_cpumask_var(cpu_with_psl); return ret; } + +/** + * rdtgroup_pseudo_lock_create - Create a pseudo-locked region + * @rdtgrp: resource group to which pseudo-lock region belongs + * + * Called when a resource group in the pseudo-locksetup mode receives a + * valid schemata that should be pseudo-locked. Since the resource group is + * in pseudo-locksetup mode the &struct pseudo_lock_region has already been + * allocated and initialized with the essential information. If a failure + * occurs the resource group remains in the pseudo-locksetup mode with the + * &struct pseudo_lock_region associated with it, but cleared from all + * information and ready for the user to re-attempt pseudo-locking by + * writing the schemata again. + * + * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 + * on failure. Descriptive error will be written to last_cmd_status buffer. + */ +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + int ret; + + ret = pseudo_lock_region_alloc(plr); + if (ret < 0) + return ret; + + plr->thread_done = 0; + + thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, + cpu_to_node(plr->cpu), + "pseudo_lock/%u", plr->cpu); + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + rdt_last_cmd_printf("locking thread returned error %d\n", ret); + goto out_region; + } + + kthread_bind(thread, plr->cpu); + wake_up_process(thread); + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) { + /* + * If the thread does not get on the CPU for whatever + * reason and the process which sets up the region is + * interrupted then this will leave the thread in runnable + * state and once it gets on the CPU it will derefence + * the cleared, but not freed, plr struct resulting in an + * empty pseudo-locking loop. + */ + rdt_last_cmd_puts("locking thread interrupted\n"); + goto out_region; + } + + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; + closid_free(rdtgrp->closid); + ret = 0; + goto out; + +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region + * @rdtgrp: resource group to which the pseudo-locked region belongs + * + * The removal of a pseudo-locked region can be initiated when the resource + * group is removed from user space via a "rmdir" from userspace or the + * unmount of the resctrl filesystem. On removal the resource group does + * not go back to pseudo-locksetup mode before it is removed, instead it is + * removed directly. There is thus assymmetry with the creation where the + * &struct pseudo_lock_region is removed here while it was not created in + * rdtgroup_pseudo_lock_create(). 
+ * + * Return: void + */ +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) +{ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) + /* + * Default group cannot be a pseudo-locked region so we can + * free closid here. + */ + closid_free(rdtgrp->closid); + + pseudo_lock_free(rdtgrp); +} -- cgit v1.2.3 From e0bdfe8e36f3fbbdc91e70bf927f743ca23917b0 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:22 -0700 Subject: x86/intel_rdt: Support creation/removal of pseudo-locked region The user triggers the creation of a pseudo-locked region when writing a valid schemata to the schemata file of a resource group in the pseudo-locksetup mode. A valid schemata is one that: (1) does not overlap with any other resource group, (2) does not involve a cache that already contains a pseudo-locked region within its hierarchy. After a valid schemata is parsed the system is programmed to associate the to be pseudo-lock bitmask with the closid associated with the resource group. With the system set up the pseudo-locked region can be created. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/8929c3a9e2ba600e79649abe584aa28b8d0ff639.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 42 +++++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 25 +++++++++++++---- 2 files changed, 62 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 1ed273220ffa..6f4c0002b2c1 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -143,9 +143,26 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) return -EINVAL; } + /* + * Cannot set up more than one pseudo-locked region in a cache + * hierarchy. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + rdtgroup_pseudo_locked_in_hierarchy(d)) { + rdt_last_cmd_printf("pseudo-locked region in hierarchy\n"); + return -EINVAL; + } + if (!cbm_validate(data->buf, &cbm_val, r)) return -EINVAL; + if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_SHAREABLE) && + rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { + rdt_last_cmd_printf("CBM overlaps with pseudo-locked region\n"); + return -EINVAL; + } + /* * The CBM may not overlap with the CBM of another closid if * either is exclusive. @@ -199,6 +216,21 @@ next: data.rdtgrp = rdtgrp; if (r->parse_ctrlval(&data, r, d)) return -EINVAL; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * In pseudo-locking setup mode and just + * parsed a valid CBM that should be + * pseudo-locked. Only one locked region per + * resource group and domain so just do + * the required initialization for single + * region and return. + */ + rdtgrp->plr->r = r; + rdtgrp->plr->d = d; + rdtgrp->plr->cbm = d->new_ctrl; + d->plr = rdtgrp->plr; + return 0; + } goto next; } } @@ -322,6 +354,16 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * If pseudo-locking fails we keep the resource group in + * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service + * active and updated for just the domain the pseudo-locked + * region was requested for. 
+ */ + ret = rdtgroup_pseudo_lock_create(rdtgrp); + } + out: rdtgroup_kn_unlock(of->kn); return ret ?: nbytes; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 526cd4a97054..d3fb7bd8cbba 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1771,6 +1771,9 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); kernfs_unbreak_active_protection(kn); kernfs_put(rdtgrp->kn); kfree(rdtgrp); @@ -2002,6 +2005,10 @@ static void rmdir_all_sub(void) if (rdtgrp == &rdtgroup_default) continue; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + /* * Give any CPUs back to the default group. We cannot copy * cpu_online_mask because a CPU might have executed the @@ -2313,6 +2320,8 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) d->new_ctrl |= *ctrl; } } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); unused_b &= BIT_MASK(r->cache.cbm_len) - 1; d->new_ctrl |= unused_b; @@ -2696,13 +2705,19 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) * If the rdtgroup is a mon group and parent directory * is a valid "mon_groups" directory, remove the mon group. */ - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) - ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); - else if (rdtgrp->type == RDTMON_GROUP && - is_mon_groups(parent_kn, kn->name)) + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = rdtgroup_ctrl_remove(kn, rdtgrp); + } else { + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + } + } else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) { ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); - else + } else { ret = -EPERM; + } out: rdtgroup_kn_unlock(kn); -- cgit v1.2.3 From f4e80d67a527469245f391976d8665f934a16205 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:23 -0700 Subject: x86/intel_rdt: Resctrl files reflect pseudo-locked information Information about resources as well as resource groups are contained in a variety of resctrl files. Now that pseudo-locked regions can be created the files can be updated to present appropriate information to the user. Update the resource group's schemata file to show only the information of the pseudo-locked region. Update the resource group's size file to show the size in bytes of only the pseudo-locked region. Update the bit_usage file to use the letter 'P' for all pseudo-locked regions. 
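As an illustration (all values are hypothetical; the resource name, domain id and bitmask depend on the platform, and the "pseudo-locksetup"/"pseudo-locked" strings are assumed to be the ones accepted by the resource group "mode" file introduced earlier in this series), the user visible flow could look like:

  mkdir /sys/fs/resctrl/newlock
  echo pseudo-locksetup > /sys/fs/resctrl/newlock/mode
  echo 'L2:1=0x3' > /sys/fs/resctrl/newlock/schemata
  cat /sys/fs/resctrl/newlock/mode       # reads "pseudo-locked" on success
  cat /sys/fs/resctrl/newlock/schemata   # only the pseudo-locked region, e.g. "L2:1=0x3"
  cat /sys/fs/resctrl/newlock/size       # size in bytes of the region, e.g. "L2:1=262144"
  cat /sys/fs/resctrl/info/L2/bit_usage  # pseudo-locked bits reported as 'P'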
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/5ece82869b651c2178b278e00bca959f7626b6e9.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 3 +++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 31 +++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 6f4c0002b2c1..af358ca05160 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -402,6 +402,9 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { for_each_alloc_enabled_rdt_resource(r) seq_printf(s, "%s:uninitialized\n", r->name); + } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->r->name, + rdtgrp->plr->d->id, rdtgrp->plr->cbm); } else { closid = rdtgrp->closid; for_each_alloc_enabled_rdt_resource(r) { diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index d3fb7bd8cbba..f301919ae9b2 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -771,14 +771,16 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, * H - currently used by hardware only but available for software use * S - currently used and shareable by software only * E - currently used exclusively by one resource group + * P - currently pseudo-locked by one resource group */ static int rdt_bit_usage_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - u32 sw_shareable, hw_shareable, exclusive; + u32 sw_shareable = 0, hw_shareable = 0; + u32 exclusive = 0, pseudo_locked = 0; struct rdt_domain *dom; - int i, hwb, swb, excl; + int i, hwb, swb, excl, psl; enum rdtgrp_mode mode; bool sep = false; u32 *ctrl; @@ -803,12 +805,15 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, case RDT_MODE_EXCLUSIVE: exclusive |= *ctrl; break; + case RDT_MODE_PSEUDO_LOCKSETUP: /* - * Temporarily handle pseudo-locking enums - * to silence compile warnings until handling - * added in later patches. + * RDT_MODE_PSEUDO_LOCKSETUP is possible + * here but not included since the CBM + * associated with this CLOSID in this mode + * is not initialized and no task or cpu can be + * assigned this CLOSID. */ - case RDT_MODE_PSEUDO_LOCKSETUP: + break; case RDT_MODE_PSEUDO_LOCKED: case RDT_NUM_MODES: WARN(1, @@ -817,9 +822,11 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } } for (i = r->cache.cbm_len - 1; i >= 0; i--) { + pseudo_locked = dom->plr ? 
dom->plr->cbm : 0; hwb = test_bit(i, (unsigned long *)&hw_shareable); swb = test_bit(i, (unsigned long *)&sw_shareable); excl = test_bit(i, (unsigned long *)&exclusive); + psl = test_bit(i, (unsigned long *)&pseudo_locked); if (hwb && swb) seq_putc(seq, 'X'); else if (hwb && !swb) @@ -828,6 +835,8 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, seq_putc(seq, 'S'); else if (excl) seq_putc(seq, 'E'); + else if (psl) + seq_putc(seq, 'P'); else /* Unused bits remain */ seq_putc(seq, '0'); } @@ -1147,6 +1156,15 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, return -ENOENT; } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->r, + rdtgrp->plr->d, + rdtgrp->plr->cbm); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + goto out; + } + for_each_alloc_enabled_rdt_resource(r) { seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(d, &r->domains, list) { @@ -1164,6 +1182,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, seq_putc(s, '\n'); } +out: rdtgroup_kn_unlock(of->kn); return 0; -- cgit v1.2.3 From 0af6a48da481109affc4ea8295034f69993a91ef Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:24 -0700 Subject: x86/intel_rdt: Ensure RDT cleanup on exit The RDT system's initialization does not have the corresponding exit handling to ensure everything initialized on load is cleaned up also. Introduce the cleanup routines that complement all initialization. This includes the removal of a duplicate rdtgroup_init() declaration. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/a9e3a2bbd731d13915d2d7bf05d4f675b4fa109b.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 11 +++++++++++ arch/x86/kernel/cpu/intel_rdt.h | 3 +-- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 7 +++++++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index ec4754f81cbd..abb71ac70443 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -859,6 +859,8 @@ static __init bool get_rdt_resources(void) return (rdt_mon_capable || rdt_alloc_capable); } +static enum cpuhp_state rdt_online; + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -880,6 +882,7 @@ static int __init intel_rdt_late_init(void) cpuhp_remove_state(state); return ret; } + rdt_online = state; for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); @@ -891,3 +894,11 @@ static int __init intel_rdt_late_init(void) } late_initcall(intel_rdt_late_init); + +static void __exit intel_rdt_exit(void) +{ + cpuhp_remove_state(rdt_online); + rdtgroup_exit(); +} + +__exitcall(intel_rdt_exit); diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index f245aaae514e..2b3f7619be48 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -209,6 +209,7 @@ extern struct list_head rdt_all_groups; extern int max_name_width, max_data_width; int __init rdtgroup_init(void); +void __exit rdtgroup_exit(void); /** * struct rftype - describe each file in the resctrl file system @@ -431,8 +432,6 @@ extern struct rdt_resource 
rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); -int __init rdtgroup_init(void); - enum { RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index f301919ae9b2..960066249374 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -2828,3 +2828,10 @@ cleanup_root: return ret; } + +void __exit rdtgroup_exit(void) +{ + unregister_filesystem(&rdt_fs_type); + sysfs_remove_mount_point(fs_kobj, "resctrl"); + kernfs_destroy_root(rdt_root); +} -- cgit v1.2.3 From 37707ec6cba6668f22a47016fdee612b0ff62fe9 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:25 -0700 Subject: x86/intel_rdt: Create resctrl debug area In preparation for support of debugging of RDT sub features the user can now enable a RDT debugfs region. The debug area is always enabled when CONFIG_DEBUG_FS is set as advised in http://lkml.kernel.org/r/20180523080501.GA6822@kroah.com Also from same discussion in above linked email, no error checking on the debugfs creation return value since code should not behave differently when debugging passes or fails. Even on failure the returned value can be passed safely to other debugfs calls. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/9f553faf30866a6317f1aaaa2fe9f92de66a10d2.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 2 ++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 2b3f7619be48..a97be4094fa5 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -432,6 +432,8 @@ extern struct rdt_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +extern struct dentry *debugfs_resctrl; + enum { RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 960066249374..bc52c593a87c 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -56,6 +57,8 @@ static struct kernfs_node *kn_mondata; static struct seq_buf last_cmd_status; static char last_cmd_status_buf[512]; +struct dentry *debugfs_resctrl; + void rdt_last_cmd_clear(void) { lockdep_assert_held(&rdtgroup_mutex); @@ -2819,6 +2822,29 @@ int __init rdtgroup_init(void) if (ret) goto cleanup_mountpoint; + /* + * Adding the resctrl debugfs directory here may not be ideal since + * it would let the resctrl debugfs directory appear on the debugfs + * filesystem before the resctrl filesystem is mounted. + * It may also be ok since that would enable debugging of RDT before + * resctrl is mounted. + * The reason why the debugfs directory is created here and not in + * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and + * during the debugfs directory creation also &sb->s_type->i_mutex_key + * (the lockdep class of inode->i_rwsem). Other filesystem + * interactions (eg. 
SyS_getdents) have the lock ordering: + * &sb->s_type->i_mutex_key --> &mm->mmap_sem + * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex + * is taken, thus creating dependency: + * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause + * issues considering the other two lock dependencies. + * By creating the debugfs directory here we avoid a dependency + * that may cause deadlock (even though file operations cannot + * occur until the filesystem is mounted, but I do not know how to + * tell lockdep that). + */ + debugfs_resctrl = debugfs_create_dir("resctrl", NULL); + return 0; cleanup_mountpoint: @@ -2831,6 +2857,7 @@ cleanup_root: void __exit rdtgroup_exit(void) { + debugfs_remove_recursive(debugfs_resctrl); unregister_filesystem(&rdt_fs_type); sysfs_remove_mount_point(fs_kobj, "resctrl"); kernfs_destroy_root(rdt_root); -- cgit v1.2.3 From 443810fe6160542c78e24c66047edd7a3cc830c6 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:26 -0700 Subject: x86/intel_rdt: Create debugfs files for pseudo-locking testing There is no simple yes/no test to determine if pseudo-locking was successful. In order to test pseudo-locking we expose a debugfs file for each pseudo-locked region that will record the latency of reading the pseudo-locked memory at a stride of 32 bytes (hardcoded). These numbers will give us an idea of locking was successful or not since they will reflect cache hits and cache misses (hardware prefetching is disabled during the test). The new debugfs file "pseudo_lock_measure" will, when the pseudo_lock_mem_latency tracepoint is enabled, record the latency of accessing each cache line twice. Kernel tracepoints offer us histograms (when CONFIG_HIST_TRIGGERS is enabled) that is a simple way to visualize the memory access latency and immediately see any cache misses. 
For example, the hist trigger below before trigger of the measurement will display the memory access latency and instances at each latency: echo 'hist:keys=latency' > /sys/kernel/debug/tracing/events/resctrl/\ pseudo_lock_mem_latency/trigger echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable echo 1 > /sys/kernel/debug/resctrl//pseudo_lock_measure echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable cat /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/hist Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/6b2ea76181099d1b79ccfa7d3be24497ab2d1a45.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/intel_rdt.h | 3 + arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 176 +++++++++++++++++++++- arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h | 23 +++ 4 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h (limited to 'arch') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c4e02555563a..347137e80bf5 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o obj-$(CONFIG_INTEL_RDT) += intel_rdt_ctrlmondata.o intel_rdt_pseudo_lock.o +CFLAGS_intel_rdt_pseudo_lock.o = -I$(src) obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index a97be4094fa5..8a7102cfbec7 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -138,6 +138,8 @@ struct mongroup { * @line_size: size of the cache lines * @size: size of pseudo-locked region in bytes * @kmem: the kernel memory associated with pseudo-locked region + * @debugfs_dir: pointer to this region's directory in the debugfs + * filesystem */ struct pseudo_lock_region { struct rdt_resource *r; @@ -149,6 +151,7 @@ struct pseudo_lock_region { unsigned int line_size; unsigned int size; void *kmem; + struct dentry *debugfs_dir; }; /** diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 22387af30a9e..5851f3e20742 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,9 @@ #include #include "intel_rdt.h" +#define CREATE_TRACE_POINTS +#include "intel_rdt_pseudo_lock_event.h" + /* * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware * prefetcher state. Details about this register can be found in the MSR @@ -176,6 +180,7 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) plr->d->plr = NULL; plr->d = NULL; plr->cbm = 0; + plr->debugfs_dir = NULL; } /** @@ -692,6 +697,161 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) return ret; } +/** + * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory + * @_plr: pseudo-lock region to measure + * + * There is no deterministic way to test if a memory region is cached. 
One + * way is to measure how long it takes to read the memory, the speed of + * access is a good way to learn how close to the cpu the data was. Even + * more, if the prefetcher is disabled and the memory is read at a stride + * of half the cache line, then a cache miss will be easy to spot since the + * read of the first half would be significantly slower than the read of + * the second half. + * + * Return: 0. Waiter on waitqueue will be woken on completion. + */ +static int measure_cycles_lat_fn(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + unsigned long i; + u64 start, end; +#ifdef CONFIG_KASAN + /* + * The registers used for local register variables are also used + * when KASAN is active. When KASAN is active we use a regular + * variable to ensure we always use a valid pointer to access memory. + * The cost is that accessing this pointer, which could be in + * cache, will be included in the measurement of memory read latency. + */ + void *mem_r; +#else +#ifdef CONFIG_X86_64 + register void *mem_r asm("rbx"); +#else + register void *mem_r asm("ebx"); +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_KASAN */ + + local_irq_disable(); + /* + * The wrmsr call may be reordered with the assignment below it. + * Call wrmsr as directly as possible to avoid tracing clobbering + * local register variable used for memory pointer. + */ + __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + mem_r = plr->kmem; + /* + * Dummy execute of the time measurement to load the needed + * instructions into the L1 instruction cache. + */ + start = rdtsc_ordered(); + for (i = 0; i < plr->size; i += 32) { + start = rdtsc_ordered(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + end = rdtsc_ordered(); + trace_pseudo_lock_mem_latency((u32)(end - start)); + } + wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + local_irq_enable(); + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +/** + * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region + * + * The measurement of latency to access a pseudo-locked region should be + * done from a cpu that is associated with that pseudo-locked region. + * Determine which cpu is associated with this region and start a thread on + * that cpu to perform the measurement, wait for that thread to complete. 
+ * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + unsigned int cpu; + int ret; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out; + } + + plr->thread_done = 0; + cpu = cpumask_first(&plr->d->cpu_mask); + if (!cpu_online(cpu)) { + ret = -ENODEV; + goto out; + } + + thread = kthread_create_on_node(measure_cycles_lat_fn, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", cpu); + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + goto out; + } + kthread_bind(thread, cpu); + wake_up_process(thread); + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) + goto out; + + ret = 0; + +out: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +static ssize_t pseudo_lock_measure_trigger(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct rdtgroup *rdtgrp = file->private_data; + size_t buf_size; + char buf[32]; + int ret; + bool bv; + + buf_size = min(count, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + ret = strtobool(buf, &bv); + if (ret == 0 && bv) { + ret = debugfs_file_get(file->f_path.dentry); + if (ret) + return ret; + ret = pseudo_lock_measure_cycles(rdtgrp); + if (ret == 0) + ret = count; + debugfs_file_put(file->f_path.dentry); + } + + return ret; +} + +static const struct file_operations pseudo_measure_fops = { + .write = pseudo_lock_measure_trigger, + .open = simple_open, + .llseek = default_llseek, +}; + /** * rdtgroup_pseudo_lock_create - Create a pseudo-locked region * @rdtgrp: resource group to which pseudo-lock region belongs @@ -747,6 +907,15 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) goto out_region; } + if (!IS_ERR_OR_NULL(debugfs_resctrl)) { + plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, + debugfs_resctrl); + if (!IS_ERR_OR_NULL(plr->debugfs_dir)) + debugfs_create_file("pseudo_lock_measure", 0200, + plr->debugfs_dir, rdtgrp, + &pseudo_measure_fops); + } + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; closid_free(rdtgrp->closid); ret = 0; @@ -774,12 +943,17 @@ out: */ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { /* * Default group cannot be a pseudo-locked region so we can * free closid here. */ closid_free(rdtgrp->closid); + goto free; + } + + debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); +free: pseudo_lock_free(rdtgrp); } diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h new file mode 100644 index 000000000000..3cd0fa27d5fe --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM resctrl + +#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PSEUDO_LOCK_H + +#include + +TRACE_EVENT(pseudo_lock_mem_latency, + TP_PROTO(u32 latency), + TP_ARGS(latency), + TP_STRUCT__entry(__field(u32, latency)), + TP_fast_assign(__entry->latency = latency), + TP_printk("latency=%u", __entry->latency) + ); + +#endif /* _TRACE_PSEUDO_LOCK_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE intel_rdt_pseudo_lock_event +#include -- cgit v1.2.3 From 746e08590b864cf730d7bd23394e2d3fbb0f22b6 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:27 -0700 Subject: x86/intel_rdt: Create character device exposing pseudo-locked region After a pseudo-locked region is created it needs to be made available to user space for usage. A character device supporting mmap() is created for each pseudo-locked region. A user space application can now use mmap() system call to map pseudo-locked region into its virtual address space. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/fccbb9b20f07655ab0a4df9fa1c1babc0288aea0.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 14 +- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 285 ++++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 1 + 3 files changed, 291 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 8a7102cfbec7..b8e490a43290 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -138,6 +138,8 @@ struct mongroup { * @line_size: size of the cache lines * @size: size of pseudo-locked region in bytes * @kmem: the kernel memory associated with pseudo-locked region + * @minor: minor number of character device associated with this + * region * @debugfs_dir: pointer to this region's directory in the debugfs * filesystem */ @@ -151,6 +153,7 @@ struct pseudo_lock_region { unsigned int line_size; unsigned int size; void *kmem; + unsigned int minor; struct dentry *debugfs_dir; }; @@ -524,6 +527,8 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm); bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); +int rdt_pseudo_lock_init(void); +void rdt_pseudo_lock_release(void); int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); @@ -551,13 +556,4 @@ void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); -/* - * Define the hooks for Cache Pseudo-Locking to use within rdt_mount(). - * These are no-ops provided for the new kernfs changes to use as a - * baseline in preparation for a conflict-free merge between it - * (kernfs changes) and the Cache Pseudo-Locking enabling. 
- */ -static inline int rdt_pseudo_lock_init(void) { return 0; } -static inline void rdt_pseudo_lock_release(void) { } - #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 5851f3e20742..1c5e7f28da5b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -16,10 +16,14 @@ #include #include #include +#include #include +#include + #include #include #include + #include "intel_rdt.h" #define CREATE_TRACE_POINTS @@ -38,6 +42,14 @@ */ static u64 prefetch_disable_bits; +/* + * Major number assigned to and shared by all devices exposing + * pseudo-locked regions. + */ +static unsigned int pseudo_lock_major; +static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); +static struct class *pseudo_lock_class; + /** * get_prefetch_disable_bits - prefetch disable bits of supported platforms * @@ -89,6 +101,66 @@ static u64 get_prefetch_disable_bits(void) return 0; } +/** + * pseudo_lock_minor_get - Obtain available minor number + * @minor: Pointer to where new minor number will be stored + * + * A bitmask is used to track available minor numbers. Here the next free + * minor number is marked as unavailable and returned. + * + * Return: 0 on success, <0 on failure. + */ +static int pseudo_lock_minor_get(unsigned int *minor) +{ + unsigned long first_bit; + + first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); + + if (first_bit == MINORBITS) + return -ENOSPC; + + __clear_bit(first_bit, &pseudo_lock_minor_avail); + *minor = first_bit; + + return 0; +} + +/** + * pseudo_lock_minor_release - Return minor number to available + * @minor: The minor number made available + */ +static void pseudo_lock_minor_release(unsigned int minor) +{ + __set_bit(minor, &pseudo_lock_minor_avail); +} + +/** + * region_find_by_minor - Locate a pseudo-lock region by inode minor number + * @minor: The minor number of the device representing pseudo-locked region + * + * When the character device is accessed we need to determine which + * pseudo-locked region it belongs to. This is done by matching the minor + * number of the device to the pseudo-locked region it belongs. + * + * Minor numbers are assigned at the time a pseudo-locked region is associated + * with a cache instance. + * + * Return: On success return pointer to resource group owning the pseudo-locked + * region, NULL on failure. + */ +static struct rdtgroup *region_find_by_minor(unsigned int minor) +{ + struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->plr && rdtgrp->plr->minor == minor) { + rdtgrp_match = rdtgrp; + break; + } + } + return rdtgrp_match; +} + /** * pseudo_lock_region_init - Initialize pseudo-lock region information * @plr: pseudo-lock region @@ -872,6 +944,8 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) { struct pseudo_lock_region *plr = rdtgrp->plr; struct task_struct *thread; + unsigned int new_minor; + struct device *dev; int ret; ret = pseudo_lock_region_alloc(plr); @@ -916,11 +990,55 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) &pseudo_measure_fops); } + ret = pseudo_lock_minor_get(&new_minor); + if (ret < 0) { + rdt_last_cmd_puts("unable to obtain a new minor number\n"); + goto out_debugfs; + } + + /* + * Unlock access but do not release the reference. The + * pseudo-locked region will still be here on return. 
+ * + * The mutex has to be released temporarily to avoid a potential + * deadlock with the mm->mmap_sem semaphore which is obtained in + * the device_create() callpath below as well as before the mmap() + * callback is called. + */ + mutex_unlock(&rdtgroup_mutex); + + dev = device_create(pseudo_lock_class, NULL, + MKDEV(pseudo_lock_major, new_minor), + rdtgrp, "%s", rdtgrp->kn->name); + + mutex_lock(&rdtgroup_mutex); + + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + rdt_last_cmd_printf("failed to create character device: %d\n", + ret); + goto out_minor; + } + + /* We released the mutex - check if group was removed while we did so */ + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out_device; + } + + plr->minor = new_minor; + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; closid_free(rdtgrp->closid); ret = 0; goto out; +out_device: + device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); +out_minor: + pseudo_lock_minor_release(new_minor); +out_debugfs: + debugfs_remove_recursive(plr->debugfs_dir); out_region: pseudo_lock_region_clear(plr); out: @@ -943,6 +1061,8 @@ out: */ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { + struct pseudo_lock_region *plr = rdtgrp->plr; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { /* * Default group cannot be a pseudo-locked region so we can @@ -953,7 +1073,172 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) } debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); + device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); + pseudo_lock_minor_release(plr->minor); free: pseudo_lock_free(rdtgrp); } + +static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = region_find_by_minor(iminor(inode)); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + filp->private_data = rdtgrp; + atomic_inc(&rdtgrp->waitcount); + /* Perform a non-seekable open - llseek is not supported */ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + filp->private_data = NULL; + atomic_dec(&rdtgrp->waitcount); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +{ + /* Not supported */ + return -EINVAL; +} + +static const struct vm_operations_struct pseudo_mmap_ops = { + .mremap = pseudo_lock_dev_mremap, +}; + +static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long vsize = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct pseudo_lock_region *plr; + struct rdtgroup *rdtgrp; + unsigned long physical; + unsigned long psize; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + plr = rdtgrp->plr; + + /* + * Task is required to run with affinity to the cpus associated + * with the pseudo-locked region. If this is not the case the task + * may be scheduled elsewhere and invalidate entries in the + * pseudo-locked region. 
+ */ + if (!cpumask_subset(¤t->cpus_allowed, &plr->d->cpu_mask)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + physical = __pa(plr->kmem) >> PAGE_SHIFT; + psize = plr->size - off; + + if (off > plr->size) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + /* + * Ensure changes are carried directly to the memory being mapped, + * do not allow copy-on-write mapping. + */ + if (!(vma->vm_flags & VM_SHARED)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + if (vsize > psize) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + memset(plr->kmem + off, 0, vsize); + + if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, + vsize, vma->vm_page_prot)) { + mutex_unlock(&rdtgroup_mutex); + return -EAGAIN; + } + vma->vm_ops = &pseudo_mmap_ops; + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static const struct file_operations pseudo_lock_dev_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = NULL, + .write = NULL, + .open = pseudo_lock_dev_open, + .release = pseudo_lock_dev_release, + .mmap = pseudo_lock_dev_mmap, +}; + +static char *pseudo_lock_devnode(struct device *dev, umode_t *mode) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = dev_get_drvdata(dev); + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); +} + +int rdt_pseudo_lock_init(void) +{ + int ret; + + ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); + if (ret < 0) + return ret; + + pseudo_lock_major = ret; + + pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock"); + if (IS_ERR(pseudo_lock_class)) { + ret = PTR_ERR(pseudo_lock_class); + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + return ret; + } + + pseudo_lock_class->devnode = pseudo_lock_devnode; + return 0; +} + +void rdt_pseudo_lock_release(void) +{ + class_destroy(pseudo_lock_class); + pseudo_lock_class = NULL; + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + pseudo_lock_major = 0; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index bc52c593a87c..7b4a09d81a30 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -2067,6 +2067,7 @@ static void rdt_kill_sb(struct super_block *sb) reset_all_ctrls(r); cdp_disable_all(); rmdir_all_sub(); + rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); -- cgit v1.2.3 From 8a2fc0e1bc0cd856101927188884d7c370b62188 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:28 -0700 Subject: x86/intel_rdt: More precise L2 hit/miss measurements Intel Goldmont processors supports non-architectural precise events that can be used to give us more insight into the success of L2 cache pseudo-locking on these platforms. Introduce a new measurement trigger that will enable two precise events, MEM_LOAD_UOPS_RETIRED.L2_HIT and MEM_LOAD_UOPS_RETIRED.L2_MISS, while accessing pseudo-locked data. A new tracepoint, pseudo_lock_l2, is created to make these results visible to the user. 
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/06b1456da65b543479dac8d9493e41f92f175d6c.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 145 ++++++++++++++++++++-- arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h | 10 ++ 2 files changed, 146 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 1c5e7f28da5b..9e808c07a7f0 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "intel_rdt.h" @@ -63,6 +64,9 @@ static struct class *pseudo_lock_class; * hardware prefetch disable bits are included here as they are documented * in the SDM. * + * When adding a platform here also add support for its cache events to + * measure_cycles_perf_fn() + * * Return: * If platform is supported, the bits to disable hardware prefetchers, 0 * if platform is not supported. @@ -101,6 +105,16 @@ static u64 get_prefetch_disable_bits(void) return 0; } +/* + * Helper to write 64bit value to MSR without tracing. Used when + * use of the cache should be restricted and use of registers used + * for local variables avoided. + */ +static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val) +{ + __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); +} + /** * pseudo_lock_minor_get - Obtain available minor number * @minor: Pointer to where new minor number will be stored @@ -834,6 +848,107 @@ static int measure_cycles_lat_fn(void *_plr) return 0; } +static int measure_cycles_perf_fn(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + unsigned long long l2_hits, l2_miss; + u64 l2_hit_bits, l2_miss_bits; + unsigned long i; +#ifdef CONFIG_KASAN + /* + * The registers used for local register variables are also used + * when KASAN is active. When KASAN is active we use regular variables + * at the cost of including cache access latency to these variables + * in the measurements. + */ + unsigned int line_size; + unsigned int size; + void *mem_r; +#else + register unsigned int line_size asm("esi"); + register unsigned int size asm("edi"); +#ifdef CONFIG_X86_64 + register void *mem_r asm("rbx"); +#else + register void *mem_r asm("ebx"); +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_KASAN */ + + /* + * Non-architectural event for the Goldmont Microarchitecture + * from Intel x86 Architecture Software Developer Manual (SDM): + * MEM_LOAD_UOPS_RETIRED D1H (event number) + * Umask values: + * L1_HIT 01H + * L2_HIT 02H + * L1_MISS 08H + * L2_MISS 10H + */ + + /* + * Start by setting flags for IA32_PERFEVTSELx: + * OS (Operating system mode) 0x2 + * INT (APIC interrupt enable) 0x10 + * EN (Enable counter) 0x40 + * + * Then add the Umask value and event number to select performance + * event. + */ + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_GEMINI_LAKE: + l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1; + l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1; + break; + default: + goto out; + } + + local_irq_disable(); + /* + * Call wrmsr direcly to avoid the local register variables from + * being overwritten due to reordering of their assignment with + * the wrmsr calls. 
+ */ + __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + /* Disable events and reset counters */ + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0); + /* Set and enable the L2 counters */ + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits); + mem_r = plr->kmem; + size = plr->size; + line_size = plr->line_size; + for (i = 0; i < size; i += line_size) { + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Call wrmsr directly (no tracing) to not influence + * the cache access counters as they are disabled. + */ + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, + l2_hit_bits & ~(0x40ULL << 16)); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, + l2_miss_bits & ~(0x40ULL << 16)); + l2_hits = native_read_pmc(0); + l2_miss = native_read_pmc(1); + wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + local_irq_enable(); + trace_pseudo_lock_l2(l2_hits, l2_miss); + +out: + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + /** * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region * @@ -844,12 +959,12 @@ static int measure_cycles_lat_fn(void *_plr) * * Return: 0 on success, <0 on failure */ -static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp) +static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) { struct pseudo_lock_region *plr = rdtgrp->plr; struct task_struct *thread; unsigned int cpu; - int ret; + int ret = -1; cpus_read_lock(); mutex_lock(&rdtgroup_mutex); @@ -866,9 +981,19 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp) goto out; } - thread = kthread_create_on_node(measure_cycles_lat_fn, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", cpu); + if (sel == 1) + thread = kthread_create_on_node(measure_cycles_lat_fn, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else if (sel == 2) + thread = kthread_create_on_node(measure_cycles_perf_fn, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else + goto out; + if (IS_ERR(thread)) { ret = PTR_ERR(thread); goto out; @@ -897,19 +1022,21 @@ static ssize_t pseudo_lock_measure_trigger(struct file *file, size_t buf_size; char buf[32]; int ret; - bool bv; + int sel; buf_size = min(count, (sizeof(buf) - 1)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; buf[buf_size] = '\0'; - ret = strtobool(buf, &bv); - if (ret == 0 && bv) { + ret = kstrtoint(buf, 10, &sel); + if (ret == 0) { + if (sel != 1 && sel != 2) + return -EINVAL; ret = debugfs_file_get(file->f_path.dentry); if (ret) return ret; - ret = pseudo_lock_measure_cycles(rdtgrp); + ret = pseudo_lock_measure_cycles(rdtgrp, sel); if (ret == 0) ret = count; debugfs_file_put(file->f_path.dentry); diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h index 3cd0fa27d5fe..efad50d2ee2f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h @@ -15,6 +15,16 @@ TRACE_EVENT(pseudo_lock_mem_latency, TP_printk("latency=%u", __entry->latency) ); +TRACE_EVENT(pseudo_lock_l2, + TP_PROTO(u64 l2_hits, u64 l2_miss), + TP_ARGS(l2_hits, l2_miss), + TP_STRUCT__entry(__field(u64, l2_hits) + __field(u64, l2_miss)), + 
TP_fast_assign(__entry->l2_hits = l2_hits; + __entry->l2_miss = l2_miss;), + TP_printk("hits=%llu miss=%llu", + __entry->l2_hits, __entry->l2_miss)); + #endif /* _TRACE_PSEUDO_LOCK_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From f3be1e7b2cf8bc096386a3588fc640b0db6b28d7 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:29 -0700 Subject: x86/intel_rdt: Support L3 cache performance event of Broadwell Broadwell microarchitecture supports pseudo-locking. Add support for the L3 cache related performance events of these systems so that the success of pseudo-locking can be measured more accurately on these platforms. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/36c1414e9bd17c3faf440f32b644b9c879bcbae2.1529706536.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 56 +++++++++++++++++++++++ arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h | 10 ++++ 2 files changed, 66 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 9e808c07a7f0..dd1341557c9d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -850,6 +850,8 @@ static int measure_cycles_lat_fn(void *_plr) static int measure_cycles_perf_fn(void *_plr) { + unsigned long long l3_hits = 0, l3_miss = 0; + u64 l3_hit_bits = 0, l3_miss_bits = 0; struct pseudo_lock_region *plr = _plr; unsigned long long l2_hits, l2_miss; u64 l2_hit_bits, l2_miss_bits; @@ -883,6 +885,16 @@ static int measure_cycles_perf_fn(void *_plr) * L2_HIT 02H * L1_MISS 08H * L2_MISS 10H + * + * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event + * has two "no fix" errata associated with it: BDM35 and BDM100. 
On + * this platform we use the following events instead: + * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/) + * REFERENCES FFH + * MISS 3FH + * LONGEST_LAT_CACHE 2EH (Documented in SDM) + * REFERENCE 4FH + * MISS 41H */ /* @@ -901,6 +913,14 @@ static int measure_cycles_perf_fn(void *_plr) l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1; l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1; break; + case INTEL_FAM6_BROADWELL_X: + /* On BDW the l2_hit_bits count references, not hits */ + l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24; + l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24; + /* On BDW the l3_hit_bits count references, not hits */ + l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e; + l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e; + break; default: goto out; } @@ -917,9 +937,21 @@ static int measure_cycles_perf_fn(void *_plr) pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0); pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0); pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0); + if (l3_hit_bits > 0) { + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0); + } /* Set and enable the L2 counters */ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits); pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits); + if (l3_hit_bits > 0) { + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, + l3_hit_bits); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, + l3_miss_bits); + } mem_r = plr->kmem; size = plr->size; line_size = plr->line_size; @@ -937,11 +969,35 @@ static int measure_cycles_perf_fn(void *_plr) l2_hit_bits & ~(0x40ULL << 16)); pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits & ~(0x40ULL << 16)); + if (l3_hit_bits > 0) { + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, + l3_hit_bits & ~(0x40ULL << 16)); + pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, + l3_miss_bits & ~(0x40ULL << 16)); + } l2_hits = native_read_pmc(0); l2_miss = native_read_pmc(1); + if (l3_hit_bits > 0) { + l3_hits = native_read_pmc(2); + l3_miss = native_read_pmc(3); + } wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); local_irq_enable(); + /* + * On BDW we count references and misses, need to adjust. Sometimes + * the "hits" counter is a bit more than the references, for + * example, x references but x + 1 hits. To not report invalid + * hit values in this case we treat that as misses eaqual to + * references. + */ + if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) + l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss); trace_pseudo_lock_l2(l2_hits, l2_miss); + if (l3_hit_bits > 0) { + if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) + l3_hits -= (l3_miss > l3_hits ? 
l3_hits : l3_miss); + trace_pseudo_lock_l3(l3_hits, l3_miss); + } out: plr->thread_done = 1; diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h index efad50d2ee2f..2c041e6d9f05 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h @@ -25,6 +25,16 @@ TRACE_EVENT(pseudo_lock_l2, TP_printk("hits=%llu miss=%llu", __entry->l2_hits, __entry->l2_miss)); +TRACE_EVENT(pseudo_lock_l3, + TP_PROTO(u64 l3_hits, u64 l3_miss), + TP_ARGS(l3_hits, l3_miss), + TP_STRUCT__entry(__field(u64, l3_hits) + __field(u64, l3_miss)), + TP_fast_assign(__entry->l3_hits = l3_hits; + __entry->l3_miss = l3_miss;), + TP_printk("hits=%llu miss=%llu", + __entry->l3_hits, __entry->l3_miss)); + #endif /* _TRACE_PSEUDO_LOCK_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 6fc0de37f663278af160e8e1f0c38b27e6c06206 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Fri, 22 Jun 2018 15:42:30 -0700 Subject: x86/intel_rdt: Limit C-states dynamically when pseudo-locking active Deeper C-states impact cache content through shrinking of the cache or flushing entire cache to memory before reducing power to the cache. Deeper C-states will thus negatively impact the pseudo-locked regions. To avoid impacting pseudo-locked regions C-states are limited on pseudo-locked region creation so that cores associated with the pseudo-locked region are prevented from entering deeper C-states. This is accomplished by requesting a CPU latency target which will prevent the core from entering C6 across all supported platforms. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/1ef4f99dd6ba12fa6fb44c5a1141e75f952b9cd9.1529706536.git.reinette.chatre@intel.com --- Documentation/x86/intel_rdt_ui.txt | 4 +- arch/x86/kernel/cpu/intel_rdt.h | 2 + arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 85 ++++++++++++++++++++++++++++- 3 files changed, 87 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt index bcd0a6d2fcf8..acac30b67c62 100644 --- a/Documentation/x86/intel_rdt_ui.txt +++ b/Documentation/x86/intel_rdt_ui.txt @@ -461,8 +461,8 @@ in the cache via carefully configuring the CAT feature and controlling application behavior. There is no guarantee that data is placed in cache. Instructions like INVD, WBINVD, CLFLUSH, etc. can still evict “locked” data from cache. Power management C-states may shrink or -power off cache. It is thus recommended to limit the processor maximum -C-state, for example, by setting the processor.max_cstate kernel parameter. +power off cache. Deeper C-states will automatically be restricted on +pseudo-locked region creation. 
It is required that an application using a pseudo-locked region runs with affinity to the cores (or a subset of the cores) associated diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index b8e490a43290..2d9cbb9d7a58 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -142,6 +142,7 @@ struct mongroup { * region * @debugfs_dir: pointer to this region's directory in the debugfs * filesystem + * @pm_reqs: Power management QoS requests related to this region */ struct pseudo_lock_region { struct rdt_resource *r; @@ -155,6 +156,7 @@ struct pseudo_lock_region { void *kmem; unsigned int minor; struct dentry *debugfs_dir; + struct list_head pm_reqs; }; /** diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index dd1341557c9d..6e83f61552a5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -175,6 +176,76 @@ static struct rdtgroup *region_find_by_minor(unsigned int minor) return rdtgrp_match; } +/** + * pseudo_lock_pm_req - A power management QoS request list entry + * @list: Entry within the @pm_reqs list for a pseudo-locked region + * @req: PM QoS request + */ +struct pseudo_lock_pm_req { + struct list_head list; + struct dev_pm_qos_request req; +}; + +static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req, *next; + + list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { + dev_pm_qos_remove_request(&pm_req->req); + list_del(&pm_req->list); + kfree(pm_req); + } +} + +/** + * pseudo_lock_cstates_constrain - Restrict cores from entering C6 + * + * To prevent the cache from being affected by power management entering + * C6 has to be avoided. This is accomplished by requesting a latency + * requirement lower than lowest C6 exit latency of all supported + * platforms as found in the cpuidle state tables in the intel_idle driver. + * At this time it is possible to do so with a single latency requirement + * for all supported platforms. + * + * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, + * the ACPI latencies need to be considered while keeping in mind that C2 + * may be set to map to deeper sleep states. In this case the latency + * requirement needs to prevent entering C2 also. 
+ */ +static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req; + int cpu; + int ret; + + for_each_cpu(cpu, &plr->d->cpu_mask) { + pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); + if (!pm_req) { + rdt_last_cmd_puts("fail allocating mem for PM QoS\n"); + ret = -ENOMEM; + goto out_err; + } + ret = dev_pm_qos_add_request(get_cpu_device(cpu), + &pm_req->req, + DEV_PM_QOS_RESUME_LATENCY, + 30); + if (ret < 0) { + rdt_last_cmd_printf("fail to add latency req cpu%d\n", + cpu); + kfree(pm_req); + ret = -1; + goto out_err; + } + list_add(&pm_req->list, &plr->pm_reqs); + } + + return 0; + +out_err: + pseudo_lock_cstates_relax(plr); + return ret; +} + /** * pseudo_lock_region_init - Initialize pseudo-lock region information * @plr: pseudo-lock region @@ -242,6 +313,7 @@ static int pseudo_lock_init(struct rdtgroup *rdtgrp) return -ENOMEM; init_waitqueue_head(&plr->lock_thread_wq); + INIT_LIST_HEAD(&plr->pm_reqs); rdtgrp->plr = plr; return 0; } @@ -1135,6 +1207,12 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) if (ret < 0) return ret; + ret = pseudo_lock_cstates_constrain(plr); + if (ret < 0) { + ret = -EINVAL; + goto out_region; + } + plr->thread_done = 0; thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, @@ -1143,7 +1221,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) if (IS_ERR(thread)) { ret = PTR_ERR(thread); rdt_last_cmd_printf("locking thread returned error %d\n", ret); - goto out_region; + goto out_cstates; } kthread_bind(thread, plr->cpu); @@ -1161,7 +1239,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * empty pseudo-locking loop. */ rdt_last_cmd_puts("locking thread interrupted\n"); - goto out_region; + goto out_cstates; } if (!IS_ERR_OR_NULL(debugfs_resctrl)) { @@ -1222,6 +1300,8 @@ out_minor: pseudo_lock_minor_release(new_minor); out_debugfs: debugfs_remove_recursive(plr->debugfs_dir); +out_cstates: + pseudo_lock_cstates_relax(plr); out_region: pseudo_lock_region_clear(plr); out: @@ -1255,6 +1335,7 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) goto free; } + pseudo_lock_cstates_relax(plr); debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); pseudo_lock_minor_release(plr->minor); -- cgit v1.2.3 From ce730f1cc1255be152c879a2bc5f295d341d8036 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 30 Jun 2018 22:03:02 -0700 Subject: x86/intel_rdt: Move pseudo_lock_region_clear() The pseudo_lock_region_clear() function is moved to earlier in the file in preparation for its use in functions that currently appear before it. No functional change. 
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/ef098ec2a45501e23792289bff80ae3152141e2f.1530421026.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 46 ++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 6e83f61552a5..1860ec10302d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -246,6 +246,29 @@ out_err: return ret; } +/** + * pseudo_lock_region_clear - Reset pseudo-lock region data + * @plr: pseudo-lock region + * + * All content of the pseudo-locked region is reset - any memory allocated + * freed. + * + * Return: void + */ +static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) +{ + plr->size = 0; + plr->line_size = 0; + kfree(plr->kmem); + plr->kmem = NULL; + plr->r = NULL; + if (plr->d) + plr->d->plr = NULL; + plr->d = NULL; + plr->cbm = 0; + plr->debugfs_dir = NULL; +} + /** * pseudo_lock_region_init - Initialize pseudo-lock region information * @plr: pseudo-lock region @@ -318,29 +341,6 @@ static int pseudo_lock_init(struct rdtgroup *rdtgrp) return 0; } -/** - * pseudo_lock_region_clear - Reset pseudo-lock region data - * @plr: pseudo-lock region - * - * All content of the pseudo-locked region is reset - any memory allocated - * freed. - * - * Return: void - */ -static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) -{ - plr->size = 0; - plr->line_size = 0; - kfree(plr->kmem); - plr->kmem = NULL; - plr->r = NULL; - if (plr->d) - plr->d->plr = NULL; - plr->d = NULL; - plr->cbm = 0; - plr->debugfs_dir = NULL; -} - /** * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked * @plr: pseudo-lock region -- cgit v1.2.3 From 546d3c74277398a3d76d059bd2db47186bb47fc8 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 30 Jun 2018 22:03:03 -0700 Subject: x86/intel_rdt: Fix cleanup of plr structure on error When a resource group enters pseudo-locksetup mode a pseudo_lock_region is associated with it. When the user writes to the resource group's schemata file the CBM of the requested pseudo-locked region is entered into the pseudo_lock_region struct. If any part of pseudo-lock region creation fails the resource group will remain in pseudo-locksetup mode with the pseudo_lock_region associated with it. In case of failure during pseudo-lock region creation care needs to be taken to ensure that the pseudo_lock_region struct associated with the resource group is cleared from any pseudo-locking data - especially the CBM. This is because the existence of a pseudo_lock_region struct with a CBM is significant in other areas of the code, for example, the display of bit_usage and initialization of a new resource group. Fix the error path of pseudo-lock region creation to ensure that the pseudo_lock_region struct is cleared at each error exit. 
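A minimal sketch of the unwinding idiom this fix applies, written as ordinary userspace C rather than the kernel code itself; the structure, size cap, and error values are illustrative stand-ins for the pseudo_lock_region handling, not the actual implementation:

#include <stdlib.h>

#define MAX_REGION_SIZE (1u << 20)	/* illustrative cap, stands in for KMALLOC_MAX_SIZE */

struct region {
	void *kmem;
	unsigned long cbm;
	size_t size;
};

/* Reset all pseudo-locking state, including the CBM. */
static void region_clear(struct region *r)
{
	free(r->kmem);
	r->kmem = NULL;
	r->cbm = 0;
	r->size = 0;
}

/* Every error exit funnels through region_clear() before returning. */
static int region_alloc(struct region *r)
{
	int ret;

	if (r->size > MAX_REGION_SIZE) {
		ret = -1;		/* -E2BIG in the kernel */
		goto out_region;
	}

	r->kmem = calloc(1, r->size);
	if (!r->kmem) {
		ret = -1;		/* -ENOMEM in the kernel */
		goto out_region;
	}

	return 0;

out_region:
	region_clear(r);
	return ret;
}

Funnelling every failure through the clearing label is what guarantees that a resource group left in pseudo-locksetup mode never carries a stale CBM.
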
Fixes: 018961ae5579 ("x86/intel_rdt: Pseudo-lock region creation/removal core") Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/49b4782f6d204d122cee3499e642b2772a98d2b4.1530421026.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 1860ec10302d..8fd79c281ee6 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -290,6 +290,7 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) static int pseudo_lock_region_init(struct pseudo_lock_region *plr) { struct cpu_cacheinfo *ci; + int ret; int i; /* Pick the first cpu we find that is associated with the cache. */ @@ -298,7 +299,8 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) if (!cpu_online(plr->cpu)) { rdt_last_cmd_printf("cpu %u associated with cache not online\n", plr->cpu); - return -ENODEV; + ret = -ENODEV; + goto out_region; } ci = get_cpu_cacheinfo(plr->cpu); @@ -312,8 +314,11 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) } } + ret = -1; rdt_last_cmd_puts("unable to determine cache line size\n"); - return -1; +out_region: + pseudo_lock_region_clear(plr); + return ret; } /** @@ -365,16 +370,23 @@ static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) */ if (plr->size > KMALLOC_MAX_SIZE) { rdt_last_cmd_puts("requested region exceeds maximum size\n"); - return -E2BIG; + ret = -E2BIG; + goto out_region; } plr->kmem = kzalloc(plr->size, GFP_KERNEL); if (!plr->kmem) { rdt_last_cmd_puts("unable to allocate memory\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out_region; } - return 0; + ret = 0; + goto out; +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; } /** -- cgit v1.2.3 From 392487def48e4b596526a4a8c2c2ec4cfe73bf13 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 30 Jun 2018 22:17:32 -0700 Subject: x86/intel_rdt: Support restoration of subset of permissions As the mode of a resource group changes, the operations it can support may also change. One way in which the supported operations are managed is to modify the permissions of the files within the resource group's resctrl directory. At the moment only two possible permissions are supported: the default permissions, or no permissions for when the operation is "locked down". An operation on a resource group may have more possibilities than these. For example, if by default changes can be made to the resource group by writing to a resctrl file while the current settings can be obtained by reading from the file, then it may be possible that in another mode it is only possible to read the current settings, and not change them. Make it possible to modify some of the permissions of a resctrl file in support of a more flexible way to manage the operations on a resource group. In this preparation work the original behavior is maintained where all permissions are restored.
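To make the effect of the new mask argument concrete, a small standalone illustration (the 0644 default is an example value, not taken from the rftype table): the restored mode is the file's default mode ANDed with the caller-supplied mask.

#include <stdio.h>

/* Restored mode = the file's default mode ANDed with the caller's mask. */
static unsigned int restore_mode(unsigned int default_mode, unsigned int mask)
{
	return default_mode & mask;
}

int main(void)
{
	printf("full restore: %o\n", restore_mode(0644, 0777));	/* 644: current behavior */
	printf("read-only:    %o\n", restore_mode(0644, 0444));	/* 444: reads only */
	return 0;
}

Passing 0777 therefore keeps the existing all-permissions restore, while a later change can pass 0444 to re-expose a file as read-only.
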
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/8773aadfade7bcb2c48a45fa294a04d2c03bb0a1.1530421961.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 3 ++- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 14 +++++++------- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 6 ++++-- 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 2d9cbb9d7a58..4e588f36228f 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -512,7 +512,8 @@ void rdt_ctrl_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name); +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask); struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, struct list_head **pos); ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 8fd79c281ee6..5dfe4008c58f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -590,11 +590,11 @@ static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) goto out; err_cpus_list: - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list"); + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); err_cpus: - rdtgroup_kn_mode_restore(rdtgrp, "cpus"); + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); err_tasks: - rdtgroup_kn_mode_restore(rdtgrp, "tasks"); + rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); out: return ret; } @@ -615,20 +615,20 @@ static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) { int ret; - ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks"); + ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); if (ret) return ret; - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus"); + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); if (ret) goto err_tasks; - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list"); + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); if (ret) goto err_cpus; if (rdt_mon_capable) { - ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups"); + ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); if (ret) goto err_cpus_list; } diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 7b4a09d81a30..c20b51afd62d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1405,13 +1405,15 @@ int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) * rdtgroup_kn_mode_restore - Restore user access to named resctrl file * @r: The resource group with which the file is associated. * @name: Name of the file + * @mask: Mask of permissions that should be restored * * Restore the permissions of the named file. If @name is a directory the * permissions of its parent will be used. * * Return: 0 on success, <0 on failure. 
*/ -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name) +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask) { struct iattr iattr = {.ia_valid = ATTR_MODE,}; struct kernfs_node *kn, *parent; @@ -1423,7 +1425,7 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name) for (rft = rfts; rft < rfts + len; rft++) { if (!strcmp(rft->name, name)) - iattr.ia_mode = rft->mode; + iattr.ia_mode = rft->mode & mask; } kn = kernfs_find_and_get_ns(r->kn, name, NULL); -- cgit v1.2.3 From 33dc3e410a0d99f394905143b26d34f1fd64c962 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Sat, 30 Jun 2018 22:17:33 -0700 Subject: x86/intel_rdt: Make CPU information accessible for pseudo-locked regions When a resource group enters pseudo-locksetup mode it reflects that the platform supports cache pseudo-locking and the resource group is unused, ready to be used for a pseudo-locked region. Until it is set up as a pseudo-locked region the resource group is "locked down" such that no new tasks or cpus can be assigned to it. This is accomplished in a user visible way by making the cpus, cpus_list, and tasks resctrl files inaccessible (user cannot read from or write to these files). When the resource group changes to pseudo-locked mode it represents a cache pseudo-locked region. While not appropriate to make any changes to the cpus assigned to this region it is useful to make it easy for the user to see which cpus are associated with the pseudo-locked region. Modify the permissions of the cpus/cpus_list file when the resource group changes to pseudo-locked mode to support reading (not writing). The information presented to the user when reading the file is the set of cpus associated with the pseudo-locked region. Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/12756b7963b6abc1bffe8fb560b87b75da827bd1.1530421961.git.reinette.chatre@intel.com --- Documentation/x86/intel_rdt_ui.txt | 3 +++ arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 3 +++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 8 ++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt index acac30b67c62..f662d3c530e5 100644 --- a/Documentation/x86/intel_rdt_ui.txt +++ b/Documentation/x86/intel_rdt_ui.txt @@ -178,6 +178,9 @@ All groups contain the following files: CPUs to/from this group. As with the tasks file a hierarchy is maintained where MON groups may only include CPUs owned by the parent CTRL_MON group. + When the resource group is in pseudo-locked mode this file will + only be readable, reflecting the CPUs associated with the + pseudo-locked region.
"cpus_list": diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 5dfe4008c58f..751c78f9992f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -1303,6 +1303,9 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; closid_free(rdtgrp->closid); + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); + ret = 0; goto out; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index c20b51afd62d..d6d7ea7349d0 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -265,8 +265,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { - seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->cpu_mask)); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->plr->d->cpu_mask)); + else + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->cpu_mask)); } else { ret = -ENOENT; } -- cgit v1.2.3 From 2989360d9c6669d8ae64edc933088e640481b48b Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 11 Jul 2018 13:06:07 -0700 Subject: x86/intel_rdt: Fix possible circular lock dependency Lockdep is reporting a possible circular locking dependency: ====================================================== WARNING: possible circular locking dependency detected 4.18.0-rc1-test-test+ #4 Not tainted ------------------------------------------------------ user_example/766 is trying to acquire lock: 0000000073479a0f (rdtgroup_mutex){+.+.}, at: pseudo_lock_dev_mmap but task is already holding lock: 000000001ef7a35b (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0x9f/0x which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&mm->mmap_sem){++++}: _copy_to_user+0x1e/0x70 filldir+0x91/0x100 dcache_readdir+0x54/0x160 iterate_dir+0x142/0x190 __x64_sys_getdents+0xb9/0x170 do_syscall_64+0x86/0x200 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #1 (&sb->s_type->i_mutex_key#3){++++}: start_creating+0x60/0x100 debugfs_create_dir+0xc/0xc0 rdtgroup_pseudo_lock_create+0x217/0x4d0 rdtgroup_schemata_write+0x313/0x3d0 kernfs_fop_write+0xf0/0x1a0 __vfs_write+0x36/0x190 vfs_write+0xb7/0x190 ksys_write+0x52/0xc0 do_syscall_64+0x86/0x200 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (rdtgroup_mutex){+.+.}: __mutex_lock+0x80/0x9b0 pseudo_lock_dev_mmap+0x2f/0x170 mmap_region+0x3d6/0x610 do_mmap+0x387/0x580 vm_mmap_pgoff+0xcf/0x110 ksys_mmap_pgoff+0x170/0x1f0 do_syscall_64+0x86/0x200 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Chain exists of: rdtgroup_mutex --> &sb->s_type->i_mutex_key#3 --> &mm->mmap_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(&sb->s_type->i_mutex_key#3); lock(&mm->mmap_sem); lock(rdtgroup_mutex); *** DEADLOCK *** 1 lock held by user_example/766: #0: 000000001ef7a35b (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0x9f/0x110 rdtgroup_mutex is already being released temporarily during pseudo-lock region creation to prevent the potential deadlock between rdtgroup_mutex and mm->mmap_sem that is obtained during device_create(). Move the debugfs creation into this area to avoid the same circular dependency. 
Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/fffb57f9c6b8285904c9a60cc91ce21591af17fe.1531332480.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index 751c78f9992f..f80c58f8adc3 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -1254,19 +1254,10 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) goto out_cstates; } - if (!IS_ERR_OR_NULL(debugfs_resctrl)) { - plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, - debugfs_resctrl); - if (!IS_ERR_OR_NULL(plr->debugfs_dir)) - debugfs_create_file("pseudo_lock_measure", 0200, - plr->debugfs_dir, rdtgrp, - &pseudo_measure_fops); - } - ret = pseudo_lock_minor_get(&new_minor); if (ret < 0) { rdt_last_cmd_puts("unable to obtain a new minor number\n"); - goto out_debugfs; + goto out_cstates; } /* @@ -1275,11 +1266,20 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * * The mutex has to be released temporarily to avoid a potential * deadlock with the mm->mmap_sem semaphore which is obtained in - * the device_create() callpath below as well as before the mmap() - * callback is called. + * the device_create() and debugfs_create_dir() callpath below + * as well as before the mmap() callback is called. */ mutex_unlock(&rdtgroup_mutex); + if (!IS_ERR_OR_NULL(debugfs_resctrl)) { + plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, + debugfs_resctrl); + if (!IS_ERR_OR_NULL(plr->debugfs_dir)) + debugfs_create_file("pseudo_lock_measure", 0200, + plr->debugfs_dir, rdtgrp, + &pseudo_measure_fops); + } + dev = device_create(pseudo_lock_class, NULL, MKDEV(pseudo_lock_major, new_minor), rdtgrp, "%s", rdtgrp->kn->name); @@ -1290,7 +1290,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) ret = PTR_ERR(dev); rdt_last_cmd_printf("failed to create character device: %d\n", ret); - goto out_minor; + goto out_debugfs; } /* We released the mutex - check if group was removed while we did so */ @@ -1311,10 +1311,9 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) out_device: device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); -out_minor: - pseudo_lock_minor_release(new_minor); out_debugfs: debugfs_remove_recursive(plr->debugfs_dir); + pseudo_lock_minor_release(new_minor); out_cstates: pseudo_lock_cstates_relax(plr); out_region: -- cgit v1.2.3 From 4a7a54a55e7237386cacc73f173e74329773ac93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Aug 2018 13:06:35 +0200 Subject: x86/intel_rdt: Disable PMU access Peter is objecting to the direct PMU access in RDT. Right now the PMU usage is broken anyway as it is not coordinated with perf. Until this discussion settled, disable the PMU mechanics by simply rejecting the type '2' measurement in the resctrl file. 
Reported-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Cc: Reinette Chatre Cc: Dave Hansen Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: vikas.shivappa@linux.intel.com CC: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: hpa@zytor.com --- arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c index f80c58f8adc3..40f3903ae5d9 100644 --- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c +++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c @@ -1171,7 +1171,7 @@ static ssize_t pseudo_lock_measure_trigger(struct file *file, buf[buf_size] = '\0'; ret = kstrtoint(buf, 10, &sel); if (ret == 0) { - if (sel != 1 && sel != 2) + if (sel != 1) return -EINVAL; ret = debugfs_file_get(file->f_path.dentry); if (ret) -- cgit v1.2.3
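For readers decoding the perf-based measurement that this final patch rejects: the IA32_PERFEVTSELx literals programmed earlier in the series, such as (0x52ULL << 16) | (0x3f << 8) | 0x24, follow the field layout documented in the SDM. A minimal sketch of that encoding; the helper name and flag macros below are illustrative, not kernel APIs:

#include <stdint.h>

/* IA32_PERFEVTSELx: event select in bits 0-7, unit mask in bits 8-15. */
#define PERFEVTSEL_OS	(1ULL << 17)	/* count in ring 0 */
#define PERFEVTSEL_INT	(1ULL << 20)	/* APIC interrupt enable */
#define PERFEVTSEL_EN	(1ULL << 22)	/* counter enable */

static inline uint64_t perfevtsel(uint8_t event, uint8_t umask)
{
	/* 0x52ULL << 16 in the measurement code is exactly OS | INT | EN. */
	return PERFEVTSEL_OS | PERFEVTSEL_INT | PERFEVTSEL_EN |
	       ((uint64_t)umask << 8) | event;
}

/*
 * perfevtsel(0x24, 0x3f) == (0x52ULL << 16) | (0x3f << 8) | 0x24,
 * the Broadwell L2_RQSTS.MISS encoding used for l2_miss_bits; clearing
 * PERFEVTSEL_EN is what the ~(0x40ULL << 16) masking in the loop does.
 */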