summaryrefslogtreecommitdiff
path: root/kernel/watchdog.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-05-14 19:47:14 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2024-05-14 19:47:14 +0300
commit6bfd2d442af5c373042f196eef1915e1f6ac058a (patch)
tree3221ccd0185cf16d32639a7a15d0ab5768ed527d /kernel/watchdog.c
parenta9d9ce3fbc2761e69c5daeb99156a5d06eb79ae5 (diff)
parent382d2ffe86efb1e2fa803d2cf17e5bfc34e574f3 (diff)
downloadlinux-6bfd2d442af5c373042f196eef1915e1f6ac058a.tar.xz
Merge tag 'irq-core-2024-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull interrupt subsystem updates from Thomas Gleixner: "Core code: - Interrupt storm detection for the lockup watchdog: Lockups which are caused by interrupt storms are not easy to debug because there is no information about the events which make the lockup detector trigger. To make this more user friendly, provide an extenstion to interrupt statistics which allows to take snapshots and an interface to retrieve the delta to the snapshot. Use this new mechanism in the watchdog code to do a two stage lockup analysis by taking the snapshot and printing the deltas for the topmost active interrupts on the second trigger. Note: This contains both the interrupt and the watchdog changes as the latter depend on the former obviously. - Avoid summation loops in the /proc/interrupts output and use the global counter when possible - Skip suspended interrupts on CPU hotplug operations to ensure that they are not delivered before the system resumes the device drivers when coming out of suspend. - On CPU hot-unplug interrupts which are affine to the outgoing CPU are migrated to a different CPU in the affinity mask. This can fail when the CPUs have no vectors left. Instead of giving up try to migrate it to any online CPU and thereby breaking the affinity setting in order to prevent a stale device interrupt which targets an offline CPU - The usual small cleanups Driver code: - Support for the RISCV AIA MSI controller - Make the interrupt allocation for the Loongson PCH controller more flexible to prevent vector exhaustion - The usual set of cleanups and fixes all over the place" * tag 'irq-core-2024-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits) irqchip/gic-v3-its: Remove BUG_ON in its_vpe_irq_domain_alloc cpuidle: Avoid explicit cpumask allocation on stack irqchip/sifive-plic: Avoid explicit cpumask allocation on stack irqchip/riscv-aplic-direct: Avoid explicit cpumask allocation on stack irqchip/loongson-eiointc: Avoid explicit cpumask allocation on stack irqchip/gic-v3-its: Avoid explicit cpumask allocation on stack irqchip/irq-bcm6345-l1: Avoid explicit cpumask allocation on stack cpumask: Introduce cpumask_first_and_and() irqchip/irq-brcmstb-l2: Avoid saving mask on shutdown genirq: Reuse irq_is_nmi() genirq/cpuhotplug: Retry with cpu_online_mask when migration fails genirq/cpuhotplug: Skip suspended interrupts when restoring affinity arm64: dts: st: Add interrupt parent to pinctrl on stm32mp251 arm64: dts: st: Add exti1 and exti2 nodes on stm32mp251 ARM: dts: stm32: List exti parent interrupts on stm32mp131 ARM: dts: stm32: List exti parent interrupts on stm32mp151 arm64: Kconfig.platforms: Enable STM32_EXTI for ARCH_STM32 irqchip/stm32-exti: Mark events reserved with RIF configuration check irqchip/stm32-exti: Skip secure events irqchip/stm32-exti: Convert driver to standard PM ...
Diffstat (limited to 'kernel/watchdog.c')
-rw-r--r--kernel/watchdog.c215
1 files changed, 210 insertions, 5 deletions
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d7b2125503af..d12ff74889ed 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -12,20 +12,25 @@
#define pr_fmt(fmt) "watchdog: " fmt
-#include <linux/mm.h>
#include <linux/cpu.h>
-#include <linux/nmi.h>
#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/kernel_stat.h>
+#include <linux/kvm_para.h>
+#include <linux/math64.h>
+#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/tick.h>
+
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
-#include <linux/stop_machine.h>
#include <asm/irq_regs.h>
-#include <linux/kvm_para.h>
static DEFINE_MUTEX(watchdog_mutex);
@@ -35,6 +40,8 @@ static DEFINE_MUTEX(watchdog_mutex);
# define WATCHDOG_HARDLOCKUP_DEFAULT 0
#endif
+#define NUM_SAMPLE_PERIODS 5
+
unsigned long __read_mostly watchdog_enabled;
int __read_mostly watchdog_user_enabled = 1;
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
@@ -333,6 +340,188 @@ __setup("watchdog_thresh=", watchdog_thresh_setup);
static void __lockup_detector_cleanup(void);
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
+enum stats_per_group {
+ STATS_SYSTEM,
+ STATS_SOFTIRQ,
+ STATS_HARDIRQ,
+ STATS_IDLE,
+ NUM_STATS_PER_GROUP,
+};
+
+static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = {
+ CPUTIME_SYSTEM,
+ CPUTIME_SOFTIRQ,
+ CPUTIME_IRQ,
+ CPUTIME_IDLE,
+};
+
+static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]);
+static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]);
+static DEFINE_PER_CPU(u8, cpustat_tail);
+
+/*
+ * We don't need nanosecond resolution. A granularity of 16ms is
+ * sufficient for our precision, allowing us to use u16 to store
+ * cpustats, which will roll over roughly every ~1000 seconds.
+ * 2^24 ~= 16 * 10^6
+ */
+static u16 get_16bit_precision(u64 data_ns)
+{
+ return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */
+}
+
+static void update_cpustat(void)
+{
+ int i;
+ u8 util;
+ u16 old_stat, new_stat;
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+ u8 tail = __this_cpu_read(cpustat_tail);
+ u16 sample_period_16 = get_16bit_precision(sample_period);
+
+ kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
+
+ for (i = 0; i < NUM_STATS_PER_GROUP; i++) {
+ old_stat = __this_cpu_read(cpustat_old[i]);
+ new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
+ util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16);
+ __this_cpu_write(cpustat_util[tail][i], util);
+ __this_cpu_write(cpustat_old[i], new_stat);
+ }
+
+ __this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS);
+}
+
+static void print_cpustat(void)
+{
+ int i, group;
+ u8 tail = __this_cpu_read(cpustat_tail);
+ u64 sample_period_second = sample_period;
+
+ do_div(sample_period_second, NSEC_PER_SEC);
+
+ /*
+ * Outputting the "watchdog" prefix on every line is redundant and not
+ * concise, and the original alarm information is sufficient for
+ * positioning in logs, hence here printk() is used instead of pr_crit().
+ */
+ printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
+ smp_processor_id(), sample_period_second);
+
+ for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
+ group = (tail + i) % NUM_SAMPLE_PERIODS;
+ printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t"
+ "%3u%% hardirq,\t%3u%% idle\n", i + 1,
+ __this_cpu_read(cpustat_util[group][STATS_SYSTEM]),
+ __this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]),
+ __this_cpu_read(cpustat_util[group][STATS_HARDIRQ]),
+ __this_cpu_read(cpustat_util[group][STATS_IDLE]));
+ }
+}
+
+#define HARDIRQ_PERCENT_THRESH 50
+#define NUM_HARDIRQ_REPORT 5
+struct irq_counts {
+ int irq;
+ u32 counts;
+};
+
+static DEFINE_PER_CPU(bool, snapshot_taken);
+
+/* Tabulate the most frequent interrupts. */
+static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank)
+{
+ int i;
+ struct irq_counts new_count = {irq, counts};
+
+ for (i = 0; i < rank; i++) {
+ if (counts > irq_counts[i].counts)
+ swap(new_count, irq_counts[i]);
+ }
+}
+
+/*
+ * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period,
+ * then the cause of softlockup might be interrupt storm. In this case, it
+ * would be useful to start interrupt counting.
+ */
+static bool need_counting_irqs(void)
+{
+ u8 util;
+ int tail = __this_cpu_read(cpustat_tail);
+
+ tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT;
+ util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
+ return util > HARDIRQ_PERCENT_THRESH;
+}
+
+static void start_counting_irqs(void)
+{
+ if (!__this_cpu_read(snapshot_taken)) {
+ kstat_snapshot_irqs();
+ __this_cpu_write(snapshot_taken, true);
+ }
+}
+
+static void stop_counting_irqs(void)
+{
+ __this_cpu_write(snapshot_taken, false);
+}
+
+static void print_irq_counts(void)
+{
+ unsigned int i, count;
+ struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = {
+ {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}
+ };
+
+ if (__this_cpu_read(snapshot_taken)) {
+ for_each_active_irq(i) {
+ count = kstat_get_irq_since_snapshot(i);
+ tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT);
+ }
+
+ /*
+ * Outputting the "watchdog" prefix on every line is redundant and not
+ * concise, and the original alarm information is sufficient for
+ * positioning in logs, hence here printk() is used instead of pr_crit().
+ */
+ printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n",
+ smp_processor_id(), HARDIRQ_PERCENT_THRESH);
+
+ for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
+ if (irq_counts_sorted[i].irq == -1)
+ break;
+
+ printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
+ i + 1, irq_counts_sorted[i].counts,
+ irq_counts_sorted[i].irq);
+ }
+
+ /*
+ * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last
+ * sample_period, then we suspect the interrupt storm might be subsiding.
+ */
+ if (!need_counting_irqs())
+ stop_counting_irqs();
+ }
+}
+
+static void report_cpu_status(void)
+{
+ print_cpustat();
+ print_irq_counts();
+}
+#else
+static inline void update_cpustat(void) { }
+static inline void report_cpu_status(void) { }
+static inline bool need_counting_irqs(void) { return false; }
+static inline void start_counting_irqs(void) { }
+static inline void stop_counting_irqs(void) { }
+#endif
+
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
* lockups can have false positives under extreme conditions. So we generally
@@ -364,7 +553,7 @@ static void set_sample_period(void)
* and hard thresholds) to increment before the
* hardlockup detector generates a warning
*/
- sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS);
watchdog_update_hrtimer_threshold(sample_period);
}
@@ -434,6 +623,18 @@ static int is_softlockup(unsigned long touch_ts,
unsigned long now)
{
if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
+ /*
+ * If period_ts has not been updated during a sample_period, then
+ * in the subsequent few sample_periods, period_ts might also not
+ * be updated, which could indicate a potential softlockup. In
+ * this case, if we suspect the cause of the potential softlockup
+ * might be interrupt storm, then we need to count the interrupts
+ * to find which interrupt is storming.
+ */
+ if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) &&
+ need_counting_irqs())
+ start_counting_irqs();
+
/* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
@@ -456,6 +657,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
static int softlockup_fn(void *data)
{
update_touch_ts();
+ stop_counting_irqs();
complete(this_cpu_ptr(&softlockup_completion));
return 0;
@@ -504,6 +706,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
+ update_cpustat();
+
/* Reset the interval when touched by known problematic code. */
if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -539,6 +743,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+ report_cpu_status();
print_modules();
print_irqtrace_events(current);
if (regs)