From 43650dcf6d6322ec2d0938bb51f755810ffa783a Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:08 -0700 Subject: x86/irq: Set up per host CPU posted interrupt descriptors To support posted MSIs, create a posted interrupt descriptor (PID) for each host CPU. Later on, when setting up interrupt affinity, the IOMMU's interrupt remapping table entry (IRTE) will point to the physical address of the matching CPU's PID. Each PID is initialized with the owner CPU's physical APICID as the destination. Originally-by: Thomas Gleixner Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-7-jacob.jun.pan@linux.intel.com --- arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/irq.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 605c26c009c8..25ef145586c6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -68,6 +68,7 @@ #include #include #include +#include #include "cpu.h" @@ -2227,6 +2228,8 @@ void cpu_init(void) barrier(); x2apic_setup(); + + intel_posted_msi_init(); } mmgrab(&init_mm); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 35fde0107901..dbb3a19b3004 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -334,6 +336,27 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) } #endif +#ifdef CONFIG_X86_POSTED_MSI + +/* Posted Interrupt Descriptors for coalesced MSIs to be posted */ +DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); + +void intel_posted_msi_init(void) +{ + u32 destination; + u32 apic_id; + + this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR); + + /* + * APIC destination ID is stored in bit 8:15 while in XAPIC mode. + * VT-d spec. CH 9.11 + */ + apic_id = this_cpu_read(x86_cpu_to_apicid); + destination = x2apic_enabled() ? apic_id : apic_id << 8; + this_cpu_write(posted_msi_pi_desc.ndst, destination); +} +#endif /* X86_POSTED_MSI */ #ifdef CONFIG_HOTPLUG_CPU /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -- cgit v1.2.3 From 6087c7f36ab293a06bc0bcf3857ed4d7eb1f9905 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:09 -0700 Subject: x86/irq: Factor out handler invocation from common_interrupt() Prepare for calling external interrupt handlers directly from the posted MSI demultiplexing loop. Extract the common code from common_interrupt() to avoid code duplication. Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-8-jacob.jun.pan@linux.intel.com --- arch/x86/kernel/irq.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index dbb3a19b3004..d652b0481899 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -242,18 +242,10 @@ static __always_inline void handle_irq(struct irq_desc *desc, __handle_irq(desc, regs); } -/* - * common_interrupt() handles all normal device IRQ's (the special SMP - * cross-CPU interrupts have their own entry points). - */ -DEFINE_IDTENTRY_IRQ(common_interrupt) +static __always_inline void call_irq_handler(int vector, struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; - /* entry code tells RCU that we're not quiescent. Check it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); - desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); @@ -268,7 +260,20 @@ DEFINE_IDTENTRY_IRQ(common_interrupt) __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } +} + +/* + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). + */ +DEFINE_IDTENTRY_IRQ(common_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* entry code tells RCU that we're not quiescent. Check it. */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + call_irq_handler(vector, regs); set_irq_regs(old_regs); } -- cgit v1.2.3 From 1b03d82ba15e895776f1f7da2bb56a9a60e6dfed Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:10 -0700 Subject: x86/irq: Install posted MSI notification handler All MSI vectors are multiplexed into a single notification vector when posted MSI is enabled. It is the responsibility of the notification vector handler to demultiplex MSI vectors. In the handler the MSI vector handlers are dispatched without IDT delivery for each pending MSI interrupt. For example, the interrupt flow will change as follows: (3 MSIs of different vectors arrive in a a high frequency burst) BEFORE: interrupt(MSI) irq_enter() handler() /* EOI */ irq_exit() process_softirq() interrupt(MSI) irq_enter() handler() /* EOI */ irq_exit() process_softirq() interrupt(MSI) irq_enter() handler() /* EOI */ irq_exit() process_softirq() AFTER: interrupt /* Posted MSI notification vector */ irq_enter() atomic_xchg(PIR) handler() handler() handler() pi_clear_on() apic_eoi() irq_exit() process_softirq() Except for the leading MSI, CPU notifications are skipped/coalesced. For MSIs which arrive at a low frequency, the demultiplexing loop does not wait for more interrupts to coalesce. Therefore, there's no additional latency other than the processing time. Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-9-jacob.jun.pan@linux.intel.com --- arch/x86/entry/entry_fred.c | 2 + arch/x86/include/asm/hardirq.h | 3 + arch/x86/include/asm/idtentry.h | 6 ++ arch/x86/kernel/idt.c | 3 + arch/x86/kernel/irq.c | 125 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 135 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 89c1476fcdd9..f004a4dc74c2 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -117,6 +117,8 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + + SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification), }; static bool fred_setup_done __initdata; diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index e7ab594b3a7a..c67fa6ad098a 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -44,6 +44,9 @@ typedef struct { unsigned int irq_hv_reenlightenment_count; unsigned int hyperv_stimer0_count; #endif +#ifdef CONFIG_X86_POSTED_MSI + unsigned int posted_msi_notification_count; +#endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 749c7411d2f1..d4f24499b256 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -751,6 +751,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested # define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif +# ifdef CONFIG_X86_POSTED_MSI +DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); +#else +# define fred_sysvec_posted_msi_notification NULL +# endif + #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index fc37c8d83daf..f445bec516a0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = { # endif INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), +# ifdef CONFIG_X86_POSTED_MSI + INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification), +# endif #endif }; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d652b0481899..578e4f6a5080 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -183,6 +183,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); +#endif +#ifdef CONFIG_X86_POSTED_MSI + seq_printf(p, "%*s: ", prec, "PMN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->posted_msi_notification_count); + seq_puts(p, " Posted MSI notification event\n"); #endif return 0; } @@ -242,16 +249,16 @@ static __always_inline void handle_irq(struct irq_desc *desc, __handle_irq(desc, regs); } -static __always_inline void call_irq_handler(int vector, struct pt_regs *regs) +static __always_inline int call_irq_handler(int vector, struct pt_regs *regs) { struct irq_desc *desc; + int ret = 0; desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); } else { - apic_eoi(); - + ret = -EINVAL; if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), @@ -260,6 +267,8 @@ static __always_inline void call_irq_handler(int vector, struct pt_regs *regs) __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } + + return ret; } /* @@ -273,7 +282,9 @@ DEFINE_IDTENTRY_IRQ(common_interrupt) /* entry code tells RCU that we're not quiescent. Check it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); - call_irq_handler(vector, regs); + if (unlikely(call_irq_handler(vector, regs))) + apic_eoi(); + set_irq_regs(old_regs); } @@ -361,6 +372,112 @@ void intel_posted_msi_init(void) destination = x2apic_enabled() ? apic_id : apic_id << 8; this_cpu_write(posted_msi_pi_desc.ndst, destination); } + +/* + * De-multiplexing posted interrupts is on the performance path, the code + * below is written to optimize the cache performance based on the following + * considerations: + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently + * accessed by both CPU and IOMMU. + * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg + * for checking and clearing posted interrupt request (PIR), a 256 bit field + * within the PID. + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache + * line when posting interrupts and setting control bits. + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID + * cache line. The cache line states after each operation are as follows: + * CPU IOMMU PID Cache line state + * --------------------------------------------------------------- + *...read64 exclusive + *...lock xchg64 modified + *... post/atomic swap invalid + *...------------------------------------------------------------- + * + * To reduce L1 data cache miss, it is important to avoid contention with + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used + * to dispatch interrupt handlers. + * + * In addition, the code is trying to keep the cache line state consistent + * as much as possible. e.g. when making a copy and clearing the PIR + * (assuming non-zero PIR bits are present in the entire PIR), it does: + * read, read, read, read, xchg, xchg, xchg, xchg + * instead of: + * read, xchg, read, xchg, read, xchg, read, xchg + */ +static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) +{ + int i, vec = FIRST_EXTERNAL_VECTOR; + unsigned long pir_copy[4]; + bool handled = false; + + for (i = 0; i < 4; i++) + pir_copy[i] = pir[i]; + + for (i = 0; i < 4; i++) { + if (!pir_copy[i]) + continue; + + pir_copy[i] = arch_xchg(&pir[i], 0); + handled = true; + } + + if (handled) { + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) + call_irq_handler(vec, regs); + } + + return handled; +} + +/* + * Performance data shows that 3 is good enough to harvest 90+% of the benefit + * on high IRQ rate workload. + */ +#define MAX_POSTED_MSI_COALESCING_LOOP 3 + +/* + * For MSIs that are delivered as posted interrupts, the CPU notifications + * can be coalesced if the MSIs arrive in high frequency bursts. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct pi_desc *pid; + int i = 0; + + pid = this_cpu_ptr(&posted_msi_pi_desc); + + inc_irq_stat(posted_msi_notification_count); + irq_enter(); + + /* + * Max coalescing count includes the extra round of handle_pending_pir + * after clearing the outstanding notification bit. Hence, at most + * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. + */ + while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { + if (!handle_pending_pir(pid->pir64, regs)) + break; + } + + /* + * Clear outstanding notification bit to allow new IRQ notifications, + * do this last to maximize the window of interrupt coalescing. + */ + pi_clear_on(pid); + + /* + * There could be a race of PI notification and the clearing of ON bit, + * process PIR bits one last time such that handling the new interrupts + * are not delayed until the next IRQ. + */ + handle_pending_pir(pid->pir64, regs); + + apic_eoi(); + irq_exit(); + set_irq_regs(old_regs); +} #endif /* X86_POSTED_MSI */ #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From fef05a078b6fa1e9047e0486f1f6daf70664fd12 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:11 -0700 Subject: x86/irq: Factor out common code for checking pending interrupts Use a common function for checking pending interrupt vector in APIC IRR instead of duplicated open coding them. Additional checks for posted MSI vectors can then be contained in this function. Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-10-jacob.jun.pan@linux.intel.com --- arch/x86/include/asm/apic.h | 11 +++++++++++ arch/x86/kernel/apic/vector.c | 5 ++--- arch/x86/kernel/irq.c | 5 ++--- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index e6ab0cf15ed5..50f9781fa3ed 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -500,6 +500,17 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector) return !!(irr & (1U << (vector % 32))); } +static inline bool is_vector_pending(unsigned int vector) +{ + unsigned int irr; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1 << (vector % 32))) + return true; + + return false; +} + /* * Warm reset vector position: */ diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 185738c72766..9eec52925fa3 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -965,7 +965,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) lockdep_assert_held(&vector_lock); hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { - unsigned int irr, vector = apicd->prev_vector; + unsigned int vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned @@ -979,8 +979,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) * fixup_irqs() was just called to scan IRR for set bits and * forward them to new destination CPUs via IPIs. */ - irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; - if (irr & (1U << (vector % 32))) { + if (check_irr && is_vector_pending(vector)) { pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); rearm = true; continue; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 578e4f6a5080..385e3a5fc304 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -484,7 +484,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { - unsigned int irr, vector; + unsigned int vector; struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; @@ -511,8 +511,7 @@ void fixup_irqs(void) if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); - if (irr & (1 << (vector % 32))) { + if (is_vector_pending(vector)) { desc = __this_cpu_read(vector_irq[vector]); raw_spin_lock(&desc->lock); -- cgit v1.2.3