From 699f67512f04cbaee965fad872702c06eaf440f6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:03 -0700 Subject: KVM: VMX: Move posted interrupt descriptor out of VMX code To prepare native usage of posted interrupts, move the PID declarations out of VMX code such that they can be shared. Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Acked-by: Sean Christopherson Link: https://lore.kernel.org/r/20240423174114.526704-2-jacob.jun.pan@linux.intel.com --- arch/x86/include/asm/posted_intr.h | 88 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 arch/x86/include/asm/posted_intr.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h new file mode 100644 index 000000000000..f0324c56f7af --- /dev/null +++ b/arch/x86/include/asm/posted_intr.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_POSTED_INTR_H +#define _X86_POSTED_INTR_H + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +#define PID_TABLE_ENTRY_VALID 1 + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +#endif /* _X86_POSTED_INTR_H */ -- cgit v1.2.3 From 4ec8fd037139a4d8afb2a5c7edb4a17f9449a035 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:04 -0700 Subject: x86/irq: Unionize PID.PIR for 64bit access w/o casting Make the PIR field into u64 such that atomic xchg64 can be used without ugly casting. Suggested-by: Thomas Gleixner Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-3-jacob.jun.pan@linux.intel.com --- arch/x86/include/asm/posted_intr.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h index f0324c56f7af..acf237b2882e 100644 --- a/arch/x86/include/asm/posted_intr.h +++ b/arch/x86/include/asm/posted_intr.h @@ -9,7 +9,10 @@ /* Posted-Interrupt Descriptor */ struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ + union { + u32 pir[8]; /* Posted interrupt requested */ + u64 pir64[4]; + }; union { struct { /* bit 256 - Outstanding Notification */ -- cgit v1.2.3 From 2254808b53d92c9fe7b645b2f43acc55f22cdce6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:05 -0700 Subject: x86/irq: Remove bitfields in posted interrupt descriptor Mixture of bitfields and types is weird and really not intuitive, remove bitfields and use typed data exclusively. Bitfields often result in inferior machine code. Suggested-by: Sean Christopherson Suggested-by: Thomas Gleixner Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-4-jacob.jun.pan@linux.intel.com Link: https://lore.kernel.org/all/20240404101735.402feec8@jacob-builder/T/#mf66e34a82a48f4d8e2926b5581eff59a122de53a --- arch/x86/include/asm/posted_intr.h | 21 ++++++++++++--------- arch/x86/kvm/vmx/posted_intr.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h index acf237b2882e..20e31891de15 100644 --- a/arch/x86/include/asm/posted_intr.h +++ b/arch/x86/include/asm/posted_intr.h @@ -15,17 +15,9 @@ struct pi_desc { }; union { struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ + u16 notifications; /* Suppress and outstanding bits */ u8 nv; - /* bit 287:280 - Reserved */ u8 rsvd_2; - /* bit 319:288 - Notification Destination */ u32 ndst; }; u64 control; @@ -88,4 +80,15 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc) return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); } +/* Non-atomic helpers */ +static inline void __pi_set_sn(struct pi_desc *pi_desc) +{ + pi_desc->notifications |= BIT(POSTED_INTR_SN); +} + +static inline void __pi_clear_sn(struct pi_desc *pi_desc) +{ + pi_desc->notifications &= ~BIT(POSTED_INTR_SN); +} + #endif /* _X86_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index af662312fd07..ec08fa3caf43 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -107,7 +107,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * handle task migration (@cpu != vcpu->cpu). */ new.ndst = dest; - new.sn = 0; + __pi_clear_sn(&new); /* * Restore the notification vector; in the blocking case, the @@ -157,7 +157,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); + WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking"); old.control = READ_ONCE(pi_desc->control); do { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 273d26492e41..becefaf95cab 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4845,7 +4845,7 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) * or POSTED_INTR_WAKEUP_VECTOR. */ vmx->pi_desc.nv = POSTED_INTR_VECTOR; - vmx->pi_desc.sn = 1; + __pi_set_sn(&vmx->pi_desc); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) -- cgit v1.2.3 From f5a3562ec9dd29e61735ccf098d8ba05cf6c7c72 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:07 -0700 Subject: x86/irq: Reserve a per CPU IDT vector for posted MSIs When posted MSI is enabled, all device MSIs are multiplexed into a single notification vector. MSI handlers will be de-multiplexed at run-time by system software without IDT delivery. Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-6-jacob.jun.pan@linux.intel.com --- arch/x86/include/asm/irq_vectors.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index d18bfb238f66..13aea8fc3d45 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -97,10 +97,16 @@ #define LOCAL_TIMER_VECTOR 0xec +/* + * Posted interrupt notification vector for all device MSIs delivered to + * the host kernel. + */ +#define POSTED_MSI_NOTIFICATION_VECTOR 0xeb + #define NR_VECTORS 256 #ifdef CONFIG_X86_LOCAL_APIC -#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR +#define FIRST_SYSTEM_VECTOR POSTED_MSI_NOTIFICATION_VECTOR #else #define FIRST_SYSTEM_VECTOR NR_VECTORS #endif -- cgit v1.2.3 From 43650dcf6d6322ec2d0938bb51f755810ffa783a Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:08 -0700 Subject: x86/irq: Set up per host CPU posted interrupt descriptors To support posted MSIs, create a posted interrupt descriptor (PID) for each host CPU. Later on, when setting up interrupt affinity, the IOMMU's interrupt remapping table entry (IRTE) will point to the physical address of the matching CPU's PID. Each PID is initialized with the owner CPU's physical APICID as the destination. Originally-by: Thomas Gleixner Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-7-jacob.jun.pan@linux.intel.com --- arch/x86/include/asm/hardirq.h | 3 +++ arch/x86/include/asm/posted_intr.h | 6 ++++++ arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/irq.c | 23 +++++++++++++++++++++++ 4 files changed, 35 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index fbc7722b87d1..e7ab594b3a7a 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -48,6 +48,9 @@ typedef struct { DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +#ifdef CONFIG_X86_POSTED_MSI +DECLARE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); +#endif #define __ARCH_IRQ_STAT #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h index 20e31891de15..6f84f6739d99 100644 --- a/arch/x86/include/asm/posted_intr.h +++ b/arch/x86/include/asm/posted_intr.h @@ -91,4 +91,10 @@ static inline void __pi_clear_sn(struct pi_desc *pi_desc) pi_desc->notifications &= ~BIT(POSTED_INTR_SN); } +#ifdef CONFIG_X86_POSTED_MSI +extern void intel_posted_msi_init(void); +#else +static inline void intel_posted_msi_init(void) {}; +#endif /* X86_POSTED_MSI */ + #endif /* _X86_POSTED_INTR_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 605c26c009c8..25ef145586c6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -68,6 +68,7 @@ #include #include #include +#include #include "cpu.h" @@ -2227,6 +2228,8 @@ void cpu_init(void) barrier(); x2apic_setup(); + + intel_posted_msi_init(); } mmgrab(&init_mm); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 35fde0107901..dbb3a19b3004 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -334,6 +336,27 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) } #endif +#ifdef CONFIG_X86_POSTED_MSI + +/* Posted Interrupt Descriptors for coalesced MSIs to be posted */ +DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); + +void intel_posted_msi_init(void) +{ + u32 destination; + u32 apic_id; + + this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR); + + /* + * APIC destination ID is stored in bit 8:15 while in XAPIC mode. + * VT-d spec. CH 9.11 + */ + apic_id = this_cpu_read(x86_cpu_to_apicid); + destination = x2apic_enabled() ? apic_id : apic_id << 8; + this_cpu_write(posted_msi_pi_desc.ndst, destination); +} +#endif /* X86_POSTED_MSI */ #ifdef CONFIG_HOTPLUG_CPU /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -- cgit v1.2.3 From 1b03d82ba15e895776f1f7da2bb56a9a60e6dfed Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:10 -0700 Subject: x86/irq: Install posted MSI notification handler All MSI vectors are multiplexed into a single notification vector when posted MSI is enabled. It is the responsibility of the notification vector handler to demultiplex MSI vectors. In the handler the MSI vector handlers are dispatched without IDT delivery for each pending MSI interrupt. For example, the interrupt flow will change as follows: (3 MSIs of different vectors arrive in a a high frequency burst) BEFORE: interrupt(MSI) irq_enter() handler() /* EOI */ irq_exit() process_softirq() interrupt(MSI) irq_enter() handler() /* EOI */ irq_exit() process_softirq() interrupt(MSI) irq_enter() handler() /* EOI */ irq_exit() process_softirq() AFTER: interrupt /* Posted MSI notification vector */ irq_enter() atomic_xchg(PIR) handler() handler() handler() pi_clear_on() apic_eoi() irq_exit() process_softirq() Except for the leading MSI, CPU notifications are skipped/coalesced. For MSIs which arrive at a low frequency, the demultiplexing loop does not wait for more interrupts to coalesce. Therefore, there's no additional latency other than the processing time. Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-9-jacob.jun.pan@linux.intel.com --- arch/x86/entry/entry_fred.c | 2 + arch/x86/include/asm/hardirq.h | 3 + arch/x86/include/asm/idtentry.h | 6 ++ arch/x86/kernel/idt.c | 3 + arch/x86/kernel/irq.c | 125 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 135 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 89c1476fcdd9..f004a4dc74c2 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -117,6 +117,8 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + + SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification), }; static bool fred_setup_done __initdata; diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index e7ab594b3a7a..c67fa6ad098a 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -44,6 +44,9 @@ typedef struct { unsigned int irq_hv_reenlightenment_count; unsigned int hyperv_stimer0_count; #endif +#ifdef CONFIG_X86_POSTED_MSI + unsigned int posted_msi_notification_count; +#endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 749c7411d2f1..d4f24499b256 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -751,6 +751,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested # define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif +# ifdef CONFIG_X86_POSTED_MSI +DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); +#else +# define fred_sysvec_posted_msi_notification NULL +# endif + #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index fc37c8d83daf..f445bec516a0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = { # endif INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), +# ifdef CONFIG_X86_POSTED_MSI + INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification), +# endif #endif }; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d652b0481899..578e4f6a5080 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -183,6 +183,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); +#endif +#ifdef CONFIG_X86_POSTED_MSI + seq_printf(p, "%*s: ", prec, "PMN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->posted_msi_notification_count); + seq_puts(p, " Posted MSI notification event\n"); #endif return 0; } @@ -242,16 +249,16 @@ static __always_inline void handle_irq(struct irq_desc *desc, __handle_irq(desc, regs); } -static __always_inline void call_irq_handler(int vector, struct pt_regs *regs) +static __always_inline int call_irq_handler(int vector, struct pt_regs *regs) { struct irq_desc *desc; + int ret = 0; desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); } else { - apic_eoi(); - + ret = -EINVAL; if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), @@ -260,6 +267,8 @@ static __always_inline void call_irq_handler(int vector, struct pt_regs *regs) __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } + + return ret; } /* @@ -273,7 +282,9 @@ DEFINE_IDTENTRY_IRQ(common_interrupt) /* entry code tells RCU that we're not quiescent. Check it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); - call_irq_handler(vector, regs); + if (unlikely(call_irq_handler(vector, regs))) + apic_eoi(); + set_irq_regs(old_regs); } @@ -361,6 +372,112 @@ void intel_posted_msi_init(void) destination = x2apic_enabled() ? apic_id : apic_id << 8; this_cpu_write(posted_msi_pi_desc.ndst, destination); } + +/* + * De-multiplexing posted interrupts is on the performance path, the code + * below is written to optimize the cache performance based on the following + * considerations: + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently + * accessed by both CPU and IOMMU. + * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg + * for checking and clearing posted interrupt request (PIR), a 256 bit field + * within the PID. + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache + * line when posting interrupts and setting control bits. + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID + * cache line. The cache line states after each operation are as follows: + * CPU IOMMU PID Cache line state + * --------------------------------------------------------------- + *...read64 exclusive + *...lock xchg64 modified + *... post/atomic swap invalid + *...------------------------------------------------------------- + * + * To reduce L1 data cache miss, it is important to avoid contention with + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used + * to dispatch interrupt handlers. + * + * In addition, the code is trying to keep the cache line state consistent + * as much as possible. e.g. when making a copy and clearing the PIR + * (assuming non-zero PIR bits are present in the entire PIR), it does: + * read, read, read, read, xchg, xchg, xchg, xchg + * instead of: + * read, xchg, read, xchg, read, xchg, read, xchg + */ +static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) +{ + int i, vec = FIRST_EXTERNAL_VECTOR; + unsigned long pir_copy[4]; + bool handled = false; + + for (i = 0; i < 4; i++) + pir_copy[i] = pir[i]; + + for (i = 0; i < 4; i++) { + if (!pir_copy[i]) + continue; + + pir_copy[i] = arch_xchg(&pir[i], 0); + handled = true; + } + + if (handled) { + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) + call_irq_handler(vec, regs); + } + + return handled; +} + +/* + * Performance data shows that 3 is good enough to harvest 90+% of the benefit + * on high IRQ rate workload. + */ +#define MAX_POSTED_MSI_COALESCING_LOOP 3 + +/* + * For MSIs that are delivered as posted interrupts, the CPU notifications + * can be coalesced if the MSIs arrive in high frequency bursts. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct pi_desc *pid; + int i = 0; + + pid = this_cpu_ptr(&posted_msi_pi_desc); + + inc_irq_stat(posted_msi_notification_count); + irq_enter(); + + /* + * Max coalescing count includes the extra round of handle_pending_pir + * after clearing the outstanding notification bit. Hence, at most + * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. + */ + while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { + if (!handle_pending_pir(pid->pir64, regs)) + break; + } + + /* + * Clear outstanding notification bit to allow new IRQ notifications, + * do this last to maximize the window of interrupt coalescing. + */ + pi_clear_on(pid); + + /* + * There could be a race of PI notification and the clearing of ON bit, + * process PIR bits one last time such that handling the new interrupts + * are not delayed until the next IRQ. + */ + handle_pending_pir(pid->pir64, regs); + + apic_eoi(); + irq_exit(); + set_irq_regs(old_regs); +} #endif /* X86_POSTED_MSI */ #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From fef05a078b6fa1e9047e0486f1f6daf70664fd12 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:11 -0700 Subject: x86/irq: Factor out common code for checking pending interrupts Use a common function for checking pending interrupt vector in APIC IRR instead of duplicated open coding them. Additional checks for posted MSI vectors can then be contained in this function. Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-10-jacob.jun.pan@linux.intel.com --- arch/x86/include/asm/apic.h | 11 +++++++++++ arch/x86/kernel/apic/vector.c | 5 ++--- arch/x86/kernel/irq.c | 5 ++--- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index e6ab0cf15ed5..50f9781fa3ed 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -500,6 +500,17 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector) return !!(irr & (1U << (vector % 32))); } +static inline bool is_vector_pending(unsigned int vector) +{ + unsigned int irr; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1 << (vector % 32))) + return true; + + return false; +} + /* * Warm reset vector position: */ diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 185738c72766..9eec52925fa3 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -965,7 +965,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) lockdep_assert_held(&vector_lock); hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { - unsigned int irr, vector = apicd->prev_vector; + unsigned int vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned @@ -979,8 +979,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) * fixup_irqs() was just called to scan IRR for set bits and * forward them to new destination CPUs via IPIs. */ - irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; - if (irr & (1U << (vector % 32))) { + if (check_irr && is_vector_pending(vector)) { pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); rearm = true; continue; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 578e4f6a5080..385e3a5fc304 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -484,7 +484,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { - unsigned int irr, vector; + unsigned int vector; struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; @@ -511,8 +511,7 @@ void fixup_irqs(void) if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); - if (irr & (1 << (vector % 32))) { + if (is_vector_pending(vector)) { desc = __this_cpu_read(vector_irq[vector]); raw_spin_lock(&desc->lock); -- cgit v1.2.3 From ce0a92871179f8ca58ae8e3cf50e726a163bf831 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:12 -0700 Subject: x86/irq: Extend checks for pending vectors to posted interrupts During interrupt affinity change, it is possible to have interrupts delivered to the old CPU after the affinity has changed to the new one. To prevent lost interrupts, local APIC IRR is checked on the old CPU. Similar checks must be done for posted MSIs given the same reason. Consider the following scenario: Device system agent iommu memory CPU/LAPIC 1 FEEX_XXXX 2 Interrupt request 3 Fetch IRTE -> 4 ->Atomic Swap PID.PIR(vec) Push to Global Observable(GO) 5 if (ON*) done;* else 6 send a notification -> * ON: outstanding notification, 1 will suppress new notifications If the affinity change happens between 3 and 5 in the IOMMU, the old CPU's posted interrupt request (PIR) could have the pending bit set for the vector being moved. Add a helper function to check individual vector status. Then use the helper to check for pending interrupts on the source CPU's PID. Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-11-jacob.jun.pan@linux.intel.com --- arch/x86/include/asm/apic.h | 3 ++- arch/x86/include/asm/posted_intr.h | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 50f9781fa3ed..5644c396713e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -14,6 +14,7 @@ #include #include #include +#include #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -508,7 +509,7 @@ static inline bool is_vector_pending(unsigned int vector) if (irr & (1 << (vector % 32))) return true; - return false; + return pi_pending_this_cpu(vector); } /* diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h index 6f84f6739d99..de788b400fba 100644 --- a/arch/x86/include/asm/posted_intr.h +++ b/arch/x86/include/asm/posted_intr.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_POSTED_INTR_H #define _X86_POSTED_INTR_H +#include #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 @@ -92,8 +93,25 @@ static inline void __pi_clear_sn(struct pi_desc *pi_desc) } #ifdef CONFIG_X86_POSTED_MSI +/* + * Not all external vectors are subject to interrupt remapping, e.g. IOMMU's + * own interrupts. Here we do not distinguish them since those vector bits in + * PIR will always be zero. + */ +static inline bool pi_pending_this_cpu(unsigned int vector) +{ + struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc); + + if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) + return false; + + return test_bit(vector, (unsigned long *)pid->pir); +} + extern void intel_posted_msi_init(void); #else +static inline bool pi_pending_this_cpu(unsigned int vector) { return false; } + static inline void intel_posted_msi_init(void) {}; #endif /* X86_POSTED_MSI */ -- cgit v1.2.3 From be9be07b22c96dc03d0ecc76b5a5f21c2dcb05a1 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Apr 2024 10:41:13 -0700 Subject: iommu/vt-d: Make posted MSI an opt-in command line option Add a command line opt-in option for posted MSI if CONFIG_X86_POSTED_MSI=y. Also introduce a helper function for testing if posted MSI is supported on the platform. Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423174114.526704-12-jacob.jun.pan@linux.intel.com --- Documentation/admin-guide/kernel-parameters.txt | 2 ++ arch/x86/include/asm/irq_remapping.h | 7 +++++++ drivers/iommu/irq_remapping.c | 5 ++++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 902ecd92a29f..dfbe9fdf97e4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2251,6 +2251,8 @@ no_x2apic_optout BIOS x2APIC opt-out request will be ignored nopost disable Interrupt Posting + posted_msi + enable MSIs delivered as posted interrupts iomem= Disable strict checking of access to MMIO memory strict regions from userspace. diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 7a2ed154a5e1..5036f13ab69f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -50,6 +50,13 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void) return x86_vector_domain; } +extern bool enable_posted_msi; + +static inline bool posted_msi_supported(void) +{ + return enable_posted_msi && irq_remapping_cap(IRQ_POSTING_CAP); +} + #else /* CONFIG_IRQ_REMAP */ static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index ee59647c2050..056fec6991bc 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -24,6 +24,8 @@ int no_x2apic_optout; int disable_irq_post = 0; +bool enable_posted_msi __ro_after_init; + static int disable_irq_remap; static struct irq_remap_ops *remap_ops; @@ -70,7 +72,8 @@ static __init int setup_irqremap(char *str) no_x2apic_optout = 1; else if (!strncmp(str, "nopost", 6)) disable_irq_post = 1; - + else if (IS_ENABLED(CONFIG_X86_POSTED_MSI) && !strncmp(str, "posted_msi", 10)) + enable_posted_msi = true; str += strcspn(str, ","); while (*str == ',') str++; -- cgit v1.2.3 From 6ecc2e7932fe8f132d3b671685f9995785f19e9a Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 6 May 2024 10:56:12 -0700 Subject: x86/irq: Use existing helper for pending vector check lapic_vector_set_in_irr() is already available, use it for checking pending vectors at the local APIC. No functional change. Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner Reviewed-by: Imran Khan Link: https://lore.kernel.org/r/20240506175612.1141095-1-jacob.jun.pan@linux.intel.com --- arch/x86/include/asm/apic.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 5644c396713e..467532b3e070 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -503,13 +503,7 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector) static inline bool is_vector_pending(unsigned int vector) { - unsigned int irr; - - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); - if (irr & (1 << (vector % 32))) - return true; - - return pi_pending_this_cpu(vector); + return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector); } /* -- cgit v1.2.3