From cfe7ddcbd72dc67ce5749cc6f451a2b0c6aec5b5 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:47 +0100 Subject: ARM, sched/topology: Remove SD_SHARE_POWERDOMAIN This flag was introduced in 2014 by commit: d77b3ed5c9f8 ("sched: Add a new SD_SHARE_POWERDOMAIN for sched_domain") but AFAIA it was never leveraged by the scheduler. The closest thing I can think of is EAS caring about frequency domains, and it does that by leveraging performance domains. Remove the flag. No change in functionality is expected. Suggested-by: Morten Rasmussen Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-2-valentin.schneider@arm.com --- kernel/sched/topology.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/sched/topology.c') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 007b0a6b0152..fe0396978fea 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -148,8 +148,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { + SD_SHARE_PKG_RESOURCES)) { if (sd->groups != sd->groups->next) return 0; } @@ -180,8 +179,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + SD_PREFER_SIBLING); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -1292,7 +1290,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; * SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1303,8 +1300,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_ASYM_PACKING) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, -- cgit v1.2.3 From 65c5e253168dbbb52c20026b0c5b7a2f344b9197 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:51 +0100 Subject: sched/topology: Verify SD_* flags setup when sched_debug is on Now that we have some description of what we expect the flags layout to be, we can use that to assert at runtime that the actual layout is sane. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-6-valentin.schneider@arm.com --- kernel/sched/topology.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel/sched/topology.c') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index fe0396978fea..cbdaf084ff32 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -29,6 +29,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; + unsigned long flags = sd->flags; + unsigned int idx; cpumask_clear(groupmask); @@ -43,6 +45,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + unsigned int flag = BIT(idx); + unsigned int meta_flags = sd_flag_debug[idx].meta_flags; + + if ((meta_flags & SDF_SHARED_CHILD) && sd->child && + !(sd->child->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in child\n", + sd_flag_debug[idx].name); + + if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && + !(sd->parent->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", + sd_flag_debug[idx].name); + } + printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { -- cgit v1.2.3 From 6f349818621dff364fc000909135d0065e9756c9 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:54 +0100 Subject: sched/topology: Use prebuilt SD flag degeneration mask Leverage SD_DEGENERATE_GROUPS_MASK in sd_degenerate() and sd_parent_degenerate(). Suggested-by: Peter Zijlstra Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-9-valentin.schneider@arm.com --- kernel/sched/topology.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel/sched/topology.c') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index cbdaf084ff32..38e6671c3bd7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -160,15 +160,9 @@ static int sd_degenerate(struct sched_domain *sd) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUCAPACITY | - SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES)) { - if (sd->groups != sd->groups->next) - return 0; - } + if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && + (sd->groups != sd->groups->next)) + return 0; /* Following flags don't use groups */ if (sd->flags & (SD_WAKE_AFFINE)) @@ -190,13 +184,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { - pflags &= ~(SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING); + pflags &= ~(SD_DEGENERATE_GROUPS_MASK | SD_PREFER_SIBLING); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } -- cgit v1.2.3 From ab65afb094c7b2e954e8d56ffbc7df843211c2c8 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:55 +0100 Subject: sched/topology: Remove SD_SERIALIZE degeneration special case If there is only a single NUMA node in the system, the only NUMA topology level that will be generated will be NODE (identity distance), which doesn't have SD_SERIALIZE. This means we don't need this special case in sd_parent_degenerate(), as having the NODE level "naturally" covers it. Thus, remove it. Suggested-by: Peter Zijlstra Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-10-valentin.schneider@arm.com --- kernel/sched/topology.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/sched/topology.c') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 38e6671c3bd7..c662c53736e2 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -183,11 +183,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 0; /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { + if (parent->groups == parent->groups->next) pflags &= ~(SD_DEGENERATE_GROUPS_MASK | SD_PREFER_SIBLING); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } + if (~cflags & pflags) return 0; -- cgit v1.2.3 From c200191d4c2c1aa2ffb62a984b756ac1f02dc55c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:56 +0100 Subject: sched/topology: Propagate SD_ASYM_CPUCAPACITY upwards We currently set this flag *only* on domains whose topology level exactly match the level where we detect asymmetry (as returned by asym_cpu_capacity_level()). This is rather problematic. Say there are two clusters in the system, one with a lone big CPU and the other with a mix of big and LITTLE CPUs (as is allowed by DynamIQ): DIE [ ] MC [ ][ ] 0 1 2 3 4 L L B B B asym_cpu_capacity_level() will figure out that the MC level is the one where all CPUs can see a CPU of max capacity, and we will thus set SD_ASYM_CPUCAPACITY at MC level for all CPUs. That lone big CPU will degenerate its MC domain, since it would be alone in there, and will end up with just a DIE domain. Since the flag was only set at MC, this CPU ends up not seeing any SD with the flag set, which is broken. Rather than clearing dflags at every topology level, clear it before entering the topology level loop. This will properly propagate upwards flags that are set starting from a certain level. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Quentin Perret Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-11-valentin.schneider@arm.com --- include/linux/sched/sd_flags.h | 4 +++- kernel/sched/topology.c | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/sched/topology.c') diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index ee5cbfc7189f..40ad0d5c32e3 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -83,9 +83,11 @@ SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD) /* * Domain members have different CPU capacities * + * SHARED_PARENT: Set from the topmost domain down to the first domain where + * asymmetry is detected. * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups. */ -SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_NEEDS_GROUPS) +SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) /* * Domain members share CPU capacity (i.e. SMT) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c662c53736e2..f36ed96f3197 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1988,11 +1988,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; + int dflags = 0; sd = NULL; for_each_sd_topology(tl) { - int dflags = 0; - if (tl == tl_asym) { dflags |= SD_ASYM_CPUCAPACITY; has_asym = true; -- cgit v1.2.3 From 3a6712c7685352a5d71eaf459a4fddfc3589f018 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:57 +0100 Subject: sched/topology: Mark SD_PREFER_SIBLING as SDF_NEEDS_GROUPS SD_PREFER_SIBLING is currently considered in sd_parent_degenerate() but not in sd_degenerate(). It too hinges on load balancing, and thus won't have any effect when set on a domain with a single group. Add it to SD_DEGENERATE_GROUPS_MASK. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-12-valentin.schneider@arm.com --- include/linux/sched/sd_flags.h | 4 +++- kernel/sched/topology.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/sched/topology.c') diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index 40ad0d5c32e3..d28fe67c3098 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -131,8 +131,10 @@ SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD) * * Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD * flag, but cleared below domains with SD_ASYM_CPUCAPACITY. + * + * NEEDS_GROUPS: Load balancing flag. */ -SD_FLAG(SD_PREFER_SIBLING, 0) +SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS) /* * sched_groups of this level overlap diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f36ed96f3197..c674aaab312c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -184,7 +184,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) - pflags &= ~(SD_DEGENERATE_GROUPS_MASK | SD_PREFER_SIBLING); + pflags &= ~SD_DEGENERATE_GROUPS_MASK; if (~cflags & pflags) return 0; -- cgit v1.2.3 From 4fc472f1214ef75e5450f207e23ff13af6eecad4 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 25 Aug 2020 14:32:16 +0100 Subject: sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h SD_DEGENERATE_GROUPS_MASK is only useful for sched/topology.c, but still gets defined for anyone who imports topology.h, leading to a flurry of unused variable warnings. Move it out of the header and place it next to the SD degeneration functions in sched/topology.c. Fixes: 4ee4ea443a5d ("sched/topology: Introduce SD metaflag for flags needing > 1 groups") Reported-by: Andy Shevchenko Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200825133216.9163-2-valentin.schneider@arm.com --- include/linux/sched/topology.h | 7 ------- kernel/sched/topology.c | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/sched/topology.c') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index b9b0dab4d067..9ef7bf686a9f 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -25,13 +25,6 @@ enum { }; #undef SD_FLAG -/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ -#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | -static const unsigned int SD_DEGENERATE_GROUPS_MASK = -#include -0; -#undef SD_FLAG - #ifdef CONFIG_SCHED_DEBUG struct sd_flag_debug { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c674aaab312c..aa1676abc544 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -154,6 +154,13 @@ static inline bool sched_debug(void) } #endif /* CONFIG_SCHED_DEBUG */ +/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | +static const unsigned int SD_DEGENERATE_GROUPS_MASK = +#include +0; +#undef SD_FLAG + static int sd_degenerate(struct sched_domain *sd) { if (cpumask_weight(sched_domain_span(sd)) == 1) -- cgit v1.2.3 From 848785df48835eefebe0c4eb5da7690690b0a8b7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 8 Sep 2020 19:49:56 +0100 Subject: sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL The last sd_flag_debug shuffle inadvertently moved its definition within an #ifdef CONFIG_SYSCTL region. While CONFIG_SYSCTL is indeed required to produce the sched domain ctl interface (which uses sd_flag_debug to output flag names), it isn't required to run any assertion on the sched_domain hierarchy itself. Move the definition of sd_flag_debug to a CONFIG_SCHED_DEBUG region of topology.c. Now at long last we have: - sd_flag_debug declared in include/linux/sched/topology.h iff CONFIG_SCHED_DEBUG=y - sd_flag_debug defined in kernel/sched/topology.c, conditioned by: - CONFIG_SCHED_DEBUG, with an explicit #ifdef block - CONFIG_SMP, as a requirement to compile topology.c With this change, all symbols pertaining to SD flag metadata (with the exception of __SD_FLAG_CNT) are now defined exclusively within topology.c Fixes: 8fca9494d4b4 ("sched/topology: Move sd_flag_debug out of linux/sched/topology.h") Reported-by: Randy Dunlap Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200908184956.23369-1-valentin.schneider@arm.com --- kernel/sched/debug.c | 6 ------ kernel/sched/topology.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/sched/topology.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0d7896d2a0b2..0655524700d2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,12 +245,6 @@ set_table_entry(struct ctl_table *entry, entry->proc_handler = proc_handler; } -#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, -const struct sd_flag_debug sd_flag_debug[] = { -#include -}; -#undef SD_FLAG - static int sd_ctl_doflags(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index aa1676abc544..249bec7b0a4c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -25,6 +25,12 @@ static inline bool sched_debug(void) return sched_debug_enabled; } +#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, +const struct sd_flag_debug sd_flag_debug[] = { +#include +}; +#undef SD_FLAG + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { -- cgit v1.2.3 From 2208cdaa56c957e20d8e16f28819aeb47851cb1e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:24:22 +0200 Subject: sched/fair: Reduce minimal imbalance threshold The 25% default imbalance threshold for DIE and NUMA domain is large enough to generate significant unfairness between threads. A typical example is the case of 11 threads running on 2x4 CPUs. The imbalance of 20% between the 2 groups of 4 cores is just low enough to not trigger the load balance between the 2 groups. We will have always the same 6 threads on one group of 4 CPUs and the other 5 threads on the other group of CPUS. With a fair time sharing in each group, we ends up with +20% running time for the group of 5 threads. Consider decreasing the imbalance threshold for overloaded case where we use the load to balance task and to ensure fair time sharing. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Acked-by: Hillf Danton Link: https://lkml.kernel.org/r/20200921072424.14813-3-vincent.guittot@linaro.org --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/topology.c') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 249bec7b0a4c..41df62884cea 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1349,7 +1349,7 @@ sd_init(struct sched_domain_topology_level *tl, .min_interval = sd_weight, .max_interval = 2*sd_weight, .busy_factor = 32, - .imbalance_pct = 125, + .imbalance_pct = 117, .cache_nice_tries = 0, -- cgit v1.2.3 From 6e7499135db724539ca887b3aa64122502875c71 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:24:24 +0200 Subject: sched/fair: Reduce busy load balance interval The busy_factor, which increases load balance interval when a cpu is busy, is set to 32 by default. This value generates some huge LB interval on large system like the THX2 made of 2 node x 28 cores x 4 threads. For such system, the interval increases from 112ms to 3584ms at MC level. And from 228ms to 7168ms at NUMA level. Even on smaller system, a lower busy factor has shown improvement on the fair distribution of the running time so let reduce it for all. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200921072424.14813-5-vincent.guittot@linaro.org --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/topology.c') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 41df62884cea..a3a2417fec54 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1348,7 +1348,7 @@ sd_init(struct sched_domain_topology_level *tl, *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, - .busy_factor = 32, + .busy_factor = 16, .imbalance_pct = 117, .cache_nice_tries = 0, -- cgit v1.2.3