From 4ce94eabac16b1d2c95762b40f49e5654ab288d7 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sat, 20 Feb 2021 15:17:07 -0800 Subject: x86/mm/tlb: Flush remote and local TLBs concurrently To improve TLB shootdown performance, flush the remote and local TLBs concurrently. Introduce flush_tlb_multi() that does so. Introduce paravirtual versions of flush_tlb_multi() for KVM, Xen and hyper-v (Xen and hyper-v are only compile-tested). While the updated smp infrastructure is capable of running a function on a single local core, it is not optimized for this case. The multiple function calls and the indirect branch introduce some overhead, and might make local TLB flushes slower than they were before the recent changes. Before calling the SMP infrastructure, check if only a local TLB flush is needed to restore the lost performance in this common case. This requires checking mm_cpumask() one more time, but unless this mask is updated very frequently, this should not impact performance negatively. Signed-off-by: Nadav Amit Signed-off-by: Ingo Molnar Reviewed-by: Michael Kelley # Hyper-v parts Reviewed-by: Juergen Gross # Xen and paravirt parts Reviewed-by: Dave Hansen Link: https://lore.kernel.org/r/20210220231712.2475218-5-namit@vmware.com --- include/trace/events/xen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index 3b61b587e137..44a3f565264d 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -346,7 +346,7 @@ TRACE_EVENT(xen_mmu_flush_tlb_one_user, TP_printk("addr %lx", __entry->addr) ); -TRACE_EVENT(xen_mmu_flush_tlb_others, +TRACE_EVENT(xen_mmu_flush_tlb_multi, TP_PROTO(const struct cpumask *cpus, struct mm_struct *mm, unsigned long addr, unsigned long end), TP_ARGS(cpus, mm, addr, end), -- cgit v1.2.3 From 291c4011dd7ac0cd0cebb727a75ee5a50d16dcf7 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sat, 20 Feb 2021 15:17:10 -0800 Subject: cpumask: Mark 
functions as pure cpumask_next_and() and cpumask_any_but() are pure, and marking them as such seems to generate different and presumably better code for native_flush_tlb_multi(). Signed-off-by: Nadav Amit Signed-off-by: Ingo Molnar Reviewed-by: Dave Hansen Link: https://lore.kernel.org/r/20210220231712.2475218-8-namit@vmware.com --- include/linux/cpumask.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 383684e30f12..c53364c4296d 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -235,7 +235,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); } -unsigned int cpumask_next(int n, const struct cpumask *srcp); +unsigned int __pure cpumask_next(int n, const struct cpumask *srcp); /** * cpumask_next_zero - get the next unset cpu in a cpumask @@ -252,8 +252,8 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); } -int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); -int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); +int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); +int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu); unsigned int cpumask_local_spread(unsigned int i, int node); int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); -- cgit v1.2.3 From a5aa5ce300597224ec76dacc8e63ba3ad7a18bbd Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sat, 20 Feb 2021 15:17:12 -0800 Subject: smp: Inline on_each_cpu_cond() and on_each_cpu() Simplify the code and avoid having an additional function on the stack by inlining on_each_cpu_cond() and on_each_cpu(). Suggested-by: Peter Zijlstra Signed-off-by: Nadav Amit [ Minor edits. 
] Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210220231712.2475218-10-namit@vmware.com --- include/linux/smp.h | 50 +++++++++++++++++++++++++++++++++-------------- kernel/smp.c | 56 ----------------------------------------------------- kernel/up.c | 38 +----------------------------------- 3 files changed, 37 insertions(+), 107 deletions(-) (limited to 'include') diff --git a/include/linux/smp.h b/include/linux/smp.h index 70c6f6284dcf..84a0b4828f66 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -50,30 +50,52 @@ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); +void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, + void *info, bool wait, const struct cpumask *mask); + +int smp_call_function_single_async(int cpu, call_single_data_t *csd); + /* * Call a function on all processors */ -void on_each_cpu(smp_call_func_t func, void *info, int wait); +static inline void on_each_cpu(smp_call_func_t func, void *info, int wait) +{ + on_each_cpu_cond_mask(NULL, func, info, wait, cpu_online_mask); +} -/* - * Call a function on processors specified by mask, which might include - * the local one. +/** + * on_each_cpu_mask(): Run a function on processors specified by + * cpumask, which may include the local processor. + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * If @wait is true, then returns once @func has returned. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. The + * exception is that it may be used during early boot while + * early_boot_irqs_disabled is set. 
*/ -void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, - void *info, bool wait); +static inline void on_each_cpu_mask(const struct cpumask *mask, + smp_call_func_t func, void *info, bool wait) +{ + on_each_cpu_cond_mask(NULL, func, info, wait, mask); +} /* * Call a function on each processor for which the supplied function * cond_func returns a positive value. This may include the local - * processor. + * processor. May be used during early boot while early_boot_irqs_disabled is + * set. Use local_irq_save/restore() instead of local_irq_disable/enable(). */ -void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, - void *info, bool wait); - -void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, - void *info, bool wait, const struct cpumask *mask); - -int smp_call_function_single_async(int cpu, call_single_data_t *csd); +static inline void on_each_cpu_cond(smp_cond_func_t cond_func, + smp_call_func_t func, void *info, bool wait) +{ + on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); +} #ifdef CONFIG_SMP diff --git a/kernel/smp.c b/kernel/smp.c index c8a5a1facc1a..b6375d775e93 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -847,55 +847,6 @@ void __init smp_init(void) smp_cpus_done(setup_max_cpus); } -/* - * Call a function on all processors. May be used during early boot while - * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead - * of local_irq_disable/enable(). - */ -void on_each_cpu(smp_call_func_t func, void *info, int wait) -{ - unsigned long flags; - - preempt_disable(); - smp_call_function(func, info, wait); - local_irq_save(flags); - func(info); - local_irq_restore(flags); - preempt_enable(); -} -EXPORT_SYMBOL(on_each_cpu); - -/** - * on_each_cpu_mask(): Run a function on processors specified by - * cpumask, which may include the local processor. - * @mask: The set of cpus to run on (only runs on online subset). - * @func: The function to run. 
This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed - * on other CPUs. - * - * If @wait is true, then returns once @func has returned. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. The - * exception is that it may be used during early boot while - * early_boot_irqs_disabled is set. - */ -void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, - void *info, bool wait) -{ - unsigned int scf_flags; - - scf_flags = SCF_RUN_LOCAL; - if (wait) - scf_flags |= SCF_WAIT; - - preempt_disable(); - smp_call_function_many_cond(mask, func, info, scf_flags, NULL); - preempt_enable(); -} -EXPORT_SYMBOL(on_each_cpu_mask); - /* * on_each_cpu_cond(): Call a function on each processor for which * the supplied function cond_func returns true, optionally waiting @@ -932,13 +883,6 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, } EXPORT_SYMBOL(on_each_cpu_cond_mask); -void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, - void *info, bool wait) -{ - on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); -} -EXPORT_SYMBOL(on_each_cpu_cond); - static void do_nothing(void *unused) { } diff --git a/kernel/up.c b/kernel/up.c index c6f323dcd45b..bf20b4a9af60 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -36,35 +36,6 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) } EXPORT_SYMBOL(smp_call_function_single_async); -void on_each_cpu(smp_call_func_t func, void *info, int wait) -{ - unsigned long flags; - - local_irq_save(flags); - func(info); - local_irq_restore(flags); -} -EXPORT_SYMBOL(on_each_cpu); - -/* - * Note we still need to test the mask even for UP - * because we actually can get an empty mask from - * code that on SMP might call us without the local - * CPU in the mask. 
- */ -void on_each_cpu_mask(const struct cpumask *mask, - smp_call_func_t func, void *info, bool wait) -{ - unsigned long flags; - - if (cpumask_test_cpu(0, mask)) { - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } -} -EXPORT_SYMBOL(on_each_cpu_mask); - /* * Preemption is disabled here to make sure the cond_func is called under the * same condtions in UP and SMP. @@ -75,7 +46,7 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, unsigned long flags; preempt_disable(); - if (cond_func(0, info)) { + if ((!cond_func || cond_func(0, info)) && cpumask_test_cpu(0, mask)) { local_irq_save(flags); func(info); local_irq_restore(flags); @@ -84,13 +55,6 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, } EXPORT_SYMBOL(on_each_cpu_cond_mask); -void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, - void *info, bool wait) -{ - on_each_cpu_cond_mask(cond_func, func, info, wait, NULL); -} -EXPORT_SYMBOL(on_each_cpu_cond); - int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { int ret; -- cgit v1.2.3