 Documentation/admin-guide/kernel-parameters.txt |  12
 Documentation/core-api/workqueue.rst            |  32
 kernel/sched/core.c                             |   3
 kernel/workqueue.c                              | 322
 kernel/workqueue_internal.h                     |  24
 lib/Kconfig.debug                               |  13
 tools/workqueue/wq_monitor.py                   | 168
 7 files changed, 499 insertions(+), 75 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 87bd0e3271b0..487d5da5355d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6972,6 +6972,18 @@
it can be updated at runtime by writing to the
corresponding sysfs file.
+ workqueue.cpu_intensive_thresh_us=
+ Per-cpu work items which run for longer than this
+ threshold are automatically considered CPU intensive
+ and excluded from concurrency management to prevent
+ them from noticeably delaying other per-cpu work
+ items. Default is 10000 (10ms).
+
+ If CONFIG_WQ_CPU_INTENSIVE_REPORT is set, the kernel
+ will report the work functions which violate this
+ threshold repeatedly. They are likely good
+ candidates for using WQ_UNBOUND workqueues instead.
+
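+ For example, booting with
+ "workqueue.cpu_intensive_thresh_us=30000" raises the
+ threshold to 30ms; as the parameter is mode 0644, it can
+ also be updated at runtime by writing to its sysfs file
+ under /sys/module/workqueue/parameters/.
+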
workqueue.disable_numa
By default, all work items queued to unbound
workqueues are affine to the NUMA nodes they're
diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst
index 8ec4d6270b24..a4c9b9d1905f 100644
--- a/Documentation/core-api/workqueue.rst
+++ b/Documentation/core-api/workqueue.rst
@@ -348,6 +348,37 @@ Guidelines
level of locality in wq operations and work item execution.
+Monitoring
+==========
+
+Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::
+
+  $ tools/workqueue/wq_monitor.py events
+                             total  infl  CPUtime  CPUitsv  CMwake  mayday  rescued
+  events                     18545      0      6.1        0       5       -        -
+  events_highpri                 8      0      0.0        0       0       -        -
+  events_long                    3      0      0.0        0       0       -        -
+  events_unbound             38306      0      0.1        -       -       -        -
+  events_freezable               0      0      0.0        0       0       -        -
+  events_power_efficient     29598      0      0.2        0       0       -        -
+  events_freezable_power_       10      0      0.0        0       0       -        -
+  sock_diag_events               0      0      0.0        0       0       -        -
+
+                             total  infl  CPUtime  CPUitsv  CMwake  mayday  rescued
+  events                     18548      0      6.1        0       5       -        -
+  events_highpri                 8      0      0.0        0       0       -        -
+  events_long                    3      0      0.0        0       0       -        -
+  events_unbound             38322      0      0.1        -       -       -        -
+  events_freezable               0      0      0.0        0       0       -        -
+  events_power_efficient     29603      0      0.2        0       0       -        -
+  events_freezable_power_       10      0      0.0        0       0       -        -
+  sock_diag_events               0      0      0.0        0       0       -        -
+
+ ...
+
+See the command's help message for more info.
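+
+The -j/--json flag emits the same counters as JSON, one object per
+workqueue per sampling interval: ::
+
+  $ tools/workqueue/wq_monitor.py -j events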
+
+
Debugging
=========
@@ -387,6 +418,7 @@ the stack trace of the offending worker thread. ::
The work item's function should be trivially visible in the stack
trace.
+
Non-reentrance Conditions
=========================
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7eb6e2927390..23203daf7128 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5670,6 +5670,9 @@ void scheduler_tick(void)
perf_event_task_tick();
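+ /* kworkers can be flagged CPU_INTENSIVE from the tick; see wq_worker_tick() */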
+ if (curr->flags & PF_WQ_WORKER)
+ wq_worker_tick(curr);
+
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c913e333cce8..02a8f402eeb5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -126,6 +126,12 @@ enum {
* cpu or grabbing pool->lock is enough for read access. If
* POOL_DISASSOCIATED is set, it's identical to L.
*
+ * K: Only modified by worker while holding pool->lock. Can be safely read by
+ * self, while holding pool->lock or from IRQ context if %current is the
+ * kworker.
+ *
+ * S: Only modified by worker self.
+ *
* A: wq_pool_attach_mutex protected.
*
* PL: wq_pool_mutex protected.
@@ -200,6 +206,22 @@ struct worker_pool {
};
/*
+ * Per-pool_workqueue statistics. These can be monitored using
+ * tools/workqueue/wq_monitor.py.
+ */
+enum pool_workqueue_stats {
+ PWQ_STAT_STARTED, /* work items started execution */
+ PWQ_STAT_COMPLETED, /* work items completed execution */
+ PWQ_STAT_CPU_TIME, /* total CPU time consumed */
+ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
+ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
+ PWQ_STAT_MAYDAY, /* maydays to rescuer */
+ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
+
+ PWQ_NR_STATS,
+};
+
+/*
* The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
* of work_struct->data are used for flags and the remaining high bits
* point to the pwq; thus, pwqs need to be aligned at two's power of the
@@ -236,6 +258,8 @@ struct pool_workqueue {
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
+ u64 stats[PWQ_NR_STATS];
+
/*
* Release of unbound pwq is punted to system_wq. See put_pwq()
* and pwq_unbound_release_workfn() for details. pool_workqueue
@@ -310,6 +334,14 @@ static struct kmem_cache *pwq_cache;
static cpumask_var_t *wq_numa_possible_cpumask;
/* possible CPUs of each node */
+/*
+ * Per-cpu work items which run for longer than the following threshold are
+ * automatically considered CPU intensive and excluded from concurrency
+ * management to prevent them from noticeably delaying other per-cpu work items.
+ */
+static unsigned long wq_cpu_intensive_thresh_us = 10000;
+module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
+
static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);
@@ -867,6 +899,152 @@ static void wake_up_worker(struct worker_pool *pool)
}
/**
+ * worker_set_flags - set worker flags and adjust nr_running accordingly
+ * @worker: self
+ * @flags: flags to set
+ *
+ * Set @flags in @worker->flags and adjust nr_running accordingly.
+ *
+ * CONTEXT:
+ * raw_spin_lock_irq(pool->lock)
+ */
+static inline void worker_set_flags(struct worker *worker, unsigned int flags)
+{
+ struct worker_pool *pool = worker->pool;
+
+ WARN_ON_ONCE(worker->task != current);
+
+ /* If transitioning into NOT_RUNNING, adjust nr_running. */
+ if ((flags & WORKER_NOT_RUNNING) &&
+ !(worker->flags & WORKER_NOT_RUNNING)) {
+ pool->nr_running--;
+ }
+
+ worker->flags |= flags;
+}
+
+/**
+ * worker_clr_flags - clear worker flags and adjust nr_running accordingly
+ * @worker: self
+ * @flags: flags to clear
+ *
+ * Clear @flags in @worker->flags and adjust nr_running accordingly.
+ *
+ * CONTEXT:
+ * raw_spin_lock_irq(pool->lock)
+ */
+static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
+{
+ struct worker_pool *pool = worker->pool;
+ unsigned int oflags = worker->flags;
+
+ WARN_ON_ONCE(worker->task != current);
+
+ worker->flags &= ~flags;
+
+ /*
+ * If transitioning out of NOT_RUNNING, increment nr_running. Note
+ * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is a mask
+ * of multiple flags, not a single flag.
+ */
+ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
+ if (!(worker->flags & WORKER_NOT_RUNNING))
+ pool->nr_running++;
+}
+
+#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
+
+/*
+ * Concurrency-managed per-cpu work items that hog CPU for longer than
+ * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
+ * which prevents them from stalling other concurrency-managed work items. If a
+ * work function keeps triggering this mechanism, it's likely that the work item
+ * should be using an unbound workqueue instead.
+ *
+ * wq_cpu_intensive_report() tracks work functions which trigger such conditions
+ * and reports them so that they can be examined and converted to use unbound
+ * workqueues as appropriate. To avoid flooding the console, each violating work
+ * function is tracked and reported with exponential backoff.
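+ *
+ * A report looks like the following, where "example_work_fn" stands in
+ * for the offending work function:
+ *
+ *   workqueue: example_work_fn hogged CPU for >10000us 4 times,
+ *   consider switching to WQ_UNBOUND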
+ */
+#define WCI_MAX_ENTS 128
+
+struct wci_ent {
+ work_func_t func;
+ atomic64_t cnt;
+ struct hlist_node hash_node;
+};
+
+static struct wci_ent wci_ents[WCI_MAX_ENTS];
+static int wci_nr_ents;
+static DEFINE_RAW_SPINLOCK(wci_lock);
+static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));
+
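+/*
+ * Lockless lookup: entries are published with hash_add_rcu() and never
+ * removed, so walking the hash without wci_lock is safe, including from
+ * the IRQ context that wq_worker_tick() runs in.
+ */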
+static struct wci_ent *wci_find_ent(work_func_t func)
+{
+ struct wci_ent *ent;
+
+ hash_for_each_possible_rcu(wci_hash, ent, hash_node,
+ (unsigned long)func) {
+ if (ent->func == func)
+ return ent;
+ }
+ return NULL;
+}
+
+static void wq_cpu_intensive_report(work_func_t func)
+{
+ struct wci_ent *ent;
+
+restart:
+ ent = wci_find_ent(func);
+ if (ent) {
+ u64 cnt;
+
+ /*
+ * Start reporting from the fourth time and back off
+ * exponentially.
+ */
+ cnt = atomic64_inc_return_relaxed(&ent->cnt);
+ if (cnt >= 4 && is_power_of_2(cnt))
+ printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
+ ent->func, wq_cpu_intensive_thresh_us,
+ atomic64_read(&ent->cnt));
+ return;
+ }
+
+ /*
+ * @func is a new violation. Allocate a new entry for it. If wci_ents[]
+ * is exhausted, something went really wrong and we probably made enough
+ * noise already.
+ */
+ if (wci_nr_ents >= WCI_MAX_ENTS)
+ return;
+
+ raw_spin_lock(&wci_lock);
+
+ if (wci_nr_ents >= WCI_MAX_ENTS) {
+ raw_spin_unlock(&wci_lock);
+ return;
+ }
+
+ if (wci_find_ent(func)) {
+ raw_spin_unlock(&wci_lock);
+ goto restart;
+ }
+
+ ent = &wci_ents[wci_nr_ents++];
+ ent->func = func;
+ atomic64_set(&ent->cnt, 1);
+ hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
+
+ raw_spin_unlock(&wci_lock);
+}
+
+#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
+static void wq_cpu_intensive_report(work_func_t func) {}
+#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
+
+/**
* wq_worker_running - a worker is running again
* @task: task waking up
*
@@ -876,7 +1054,7 @@ void wq_worker_running(struct task_struct *task)
{
struct worker *worker = kthread_data(task);
- if (!worker->sleeping)
+ if (!READ_ONCE(worker->sleeping))
return;
/*
@@ -889,7 +1067,14 @@ void wq_worker_running(struct task_struct *task)
if (!(worker->flags & WORKER_NOT_RUNNING))
worker->pool->nr_running++;
preempt_enable();
- worker->sleeping = 0;
+
+ /*
+ * CPU intensive auto-detection cares about how long a work item hogged
+ * CPU without sleeping. Reset the starting timestamp on wakeup.
+ */
+ worker->current_at = worker->task->se.sum_exec_runtime;
+
+ WRITE_ONCE(worker->sleeping, 0);
}
/**
@@ -915,10 +1100,10 @@ void wq_worker_sleeping(struct task_struct *task)
pool = worker->pool;
/* Return if preempted before wq_worker_running() was reached */
- if (worker->sleeping)
+ if (READ_ONCE(worker->sleeping))
return;
- worker->sleeping = 1;
+ WRITE_ONCE(worker->sleeping, 1);
raw_spin_lock_irq(&pool->lock);
/*
@@ -932,12 +1117,66 @@ void wq_worker_sleeping(struct task_struct *task)
}
pool->nr_running--;
- if (need_more_worker(pool))
+ if (need_more_worker(pool)) {
+ worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
wake_up_worker(pool);
+ }
raw_spin_unlock_irq(&pool->lock);
}
/**
+ * wq_worker_tick - a scheduler tick occurred while a kworker is running
+ * @task: task currently running
+ *
+ * Called from scheduler_tick(). We're in the IRQ context and the current
+ * worker's fields which follow the 'K' locking rule can be accessed safely.
+ */
+void wq_worker_tick(struct task_struct *task)
+{
+ struct worker *worker = kthread_data(task);
+ struct pool_workqueue *pwq = worker->current_pwq;
+ struct worker_pool *pool = worker->pool;
+
+ if (!pwq)
+ return;
+
+ pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;
+
+ if (!wq_cpu_intensive_thresh_us)
+ return;
+
+ /*
+ * If the current worker is concurrency managed and hogged the CPU for
+ * longer than wq_cpu_intensive_thresh_us, it's automatically marked
+ * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
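+ * With the default 10000us threshold and HZ=1000, that amounts to
+ * roughly the tenth consecutive tick without sleeping;
+ * sum_exec_runtime is in nanoseconds, hence the NSEC_PER_USEC scaling
+ * in the comparison below.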
+ *
+ * @worker->sleeping being set means that @worker is in the process of
+ * switching out voluntarily and won't be contributing to
+ * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
+ * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
+ * double decrements. The task is releasing the CPU anyway. Let's skip.
+ * We probably want to make this prettier in the future.
+ */
+ if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
+ worker->task->se.sum_exec_runtime - worker->current_at <
+ wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
+ return;
+
+ raw_spin_lock(&pool->lock);
+
+ worker_set_flags(worker, WORKER_CPU_INTENSIVE);
+ wq_cpu_intensive_report(worker->current_func);
+ pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
+
+ if (need_more_worker(pool)) {
+ pwq->stats[PWQ_STAT_CM_WAKEUP]++;
+ wake_up_worker(pool);
+ }
+
+ raw_spin_unlock(&pool->lock);
+}
+
+/**
* wq_worker_last_func - retrieve worker's last work function
* @task: Task to retrieve last work function of.
*
@@ -969,60 +1208,6 @@ work_func_t wq_worker_last_func(struct task_struct *task)
}
/**
- * worker_set_flags - set worker flags and adjust nr_running accordingly
- * @worker: self
- * @flags: flags to set
- *
- * Set @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
- */
-static inline void worker_set_flags(struct worker *worker, unsigned int flags)
-{
- struct worker_pool *pool = worker->pool;
-
- WARN_ON_ONCE(worker->task != current);
-
- /* If transitioning into NOT_RUNNING, adjust nr_running. */
- if ((flags & WORKER_NOT_RUNNING) &&
- !(worker->flags & WORKER_NOT_RUNNING)) {
- pool->nr_running--;
- }
-
- worker->flags |= flags;
-}
-
-/**
- * worker_clr_flags - clear worker flags and adjust nr_running accordingly
- * @worker: self
- * @flags: flags to clear
- *
- * Clear @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
- */
-static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
-{
- struct worker_pool *pool = worker->pool;
- unsigned int oflags = worker->flags;
-
- WARN_ON_ONCE(worker->task != current);
-
- worker->flags &= ~flags;
-
- /*
- * If transitioning out of NOT_RUNNING, increment nr_running. Note
- * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
- * of multiple flags, not a single flag.
- */
- if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
- if (!(worker->flags & WORKER_NOT_RUNNING))
- pool->nr_running++;
-}
-
-/**
* find_worker_executing_work - find worker which is executing a work
* @pool: pool of interest
* @work: work to find worker for
@@ -1542,6 +1727,8 @@ out:
* We queue the work to a specific CPU, the caller must ensure it
* can't go away. Callers that fail to ensure that the specified
* CPU cannot go away will execute on a randomly chosen CPU.
+ * Note well, however, that callers specifying a CPU that has never
+ * been online will get a splat.
*
* Return: %false if @work was already on a queue, %true otherwise.
*/
@@ -2166,6 +2353,7 @@ static void send_mayday(struct work_struct *work)
get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
+ pwq->stats[PWQ_STAT_MAYDAY]++;
}
}
@@ -2303,7 +2491,6 @@ __acquires(&pool->lock)
{
struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
- bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
unsigned long work_data;
struct worker *collision;
#ifdef CONFIG_LOCKDEP
@@ -2340,6 +2527,7 @@ __acquires(&pool->lock)
worker->current_work = work;
worker->current_func = work->func;
worker->current_pwq = pwq;
+ worker->current_at = worker->task->se.sum_exec_runtime;
work_data = *work_data_bits(work);
worker->current_color = get_work_color(work_data);
@@ -2357,7 +2545,7 @@ __acquires(&pool->lock)
* of concurrency management and the next code block will chain
* execution of the pending work items.
*/
- if (unlikely(cpu_intensive))
+ if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/*
@@ -2404,6 +2592,7 @@ __acquires(&pool->lock)
* workqueues), so hiding them isn't a problem.
*/
lockdep_invariant_state(true);
+ pwq->stats[PWQ_STAT_STARTED]++;
trace_workqueue_execute_start(work);
worker->current_func(work);
/*
@@ -2411,6 +2600,7 @@ __acquires(&pool->lock)
* point will only record its address.
*/
trace_workqueue_execute_end(work, worker->current_func);
+ pwq->stats[PWQ_STAT_COMPLETED]++;
lock_map_release(&lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
@@ -2435,9 +2625,12 @@ __acquires(&pool->lock)
raw_spin_lock_irq(&pool->lock);
- /* clear cpu intensive status */
- if (unlikely(cpu_intensive))
- worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+ /*
+ * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
+ * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
+ * wq_cpu_intensive_thresh_us. Clear it.
+ */
+ worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
/* tag the worker for identification in schedule() */
worker->last_func = worker->current_func;
@@ -2654,6 +2847,7 @@ repeat:
if (first)
pool->watchdog_ts = jiffies;
move_linked_works(work, scheduled, &n);
+ pwq->stats[PWQ_STAT_RESCUED]++;
}
first = false;
}
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index e00b1204a8e9..6b1d66e28269 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -28,13 +28,18 @@ struct worker {
struct hlist_node hentry; /* L: while busy */
};
- struct work_struct *current_work; /* L: work being processed */
- work_func_t current_func; /* L: current_work's fn */
- struct pool_workqueue *current_pwq; /* L: current_work's pwq */
- unsigned int current_color; /* L: current_work's color */
- struct list_head scheduled; /* L: scheduled works */
+ struct work_struct *current_work; /* K: work being processed and its */
+ work_func_t current_func; /* K: function */
+ struct pool_workqueue *current_pwq; /* K: pwq */
+ u64 current_at; /* K: runtime at start or last wakeup */
+ unsigned int current_color; /* K: color */
+
+ int sleeping; /* S: is worker sleeping? */
- /* 64 bytes boundary on 64bit, 32 on 32bit */
+ /* used by the scheduler to determine a worker's last known identity */
+ work_func_t last_func; /* K: last work's fn */
+
+ struct list_head scheduled; /* L: scheduled works */
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* A: the associated pool */
@@ -42,10 +47,9 @@ struct worker {
struct list_head node; /* A: anchored at pool->workers */
/* A: runs through worker->node */
- unsigned long last_active; /* L: last active timestamp */
+ unsigned long last_active; /* K: last active timestamp */
unsigned int flags; /* X: flags */
int id; /* I: worker id */
- int sleeping; /* None */
/*
* Opaque string set with work_set_desc(). Printed out with task
@@ -55,9 +59,6 @@ struct worker {
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
-
- /* used by the scheduler to determine a worker's last known identity */
- work_func_t last_func;
};
/**
@@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void)
*/
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
+void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f2d6e9dd8021..1d5c3bc40113 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1134,6 +1134,19 @@ config WQ_WATCHDOG
state. This can be configured through kernel parameter
"workqueue.watchdog_thresh" and its sysfs counterpart.
+config WQ_CPU_INTENSIVE_REPORT
+ bool "Report per-cpu work items which hog CPU for too long"
+ depends on DEBUG_KERNEL
+ help
+ Say Y here to enable reporting of concurrency-managed per-cpu work
+ items that hog CPUs for longer than
+ workqueue.cpu_intensive_thresh_us. Workqueue automatically
+ detects and excludes them from concurrency management to prevent
+ them from stalling other per-cpu work items. Occasional
+ triggering may not necessarily indicate a problem. Repeated
+ triggering likely indicates that the work item should be switched
+ to use an unbound workqueue.
+
config TEST_LOCKUP
tristate "Test module to generate lockups"
depends on m
diff --git a/tools/workqueue/wq_monitor.py b/tools/workqueue/wq_monitor.py
new file mode 100644
index 000000000000..6e258d123e8c
--- /dev/null
+++ b/tools/workqueue/wq_monitor.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env drgn
+#
+# Copyright (C) 2023 Tejun Heo <tj@kernel.org>
+# Copyright (C) 2023 Meta Platforms, Inc. and affiliates.
+
+desc = """
+This is a drgn script to monitor workqueues. For more info on drgn, visit
+https://github.com/osandov/drgn.
+
+ total Total number of work items executed by the workqueue.
+
+ infl The number of currently in-flight work items.
+
+ CPUtime Total CPU time consumed by the workqueue in seconds. This is
+ sampled from scheduler ticks and only provides ballpark
+ measurement. "nohz_full=" CPUs are excluded from measurement.
+
+ CPUitsv The number of times a concurrency-managed work item hogged CPU
+ longer than the threshold (workqueue.cpu_intensive_thresh_us)
+ and got excluded from concurrency management to avoid stalling
+ other work items.
+
+ CMwake The number of concurrency-management wake-ups while executing a
+ work item of the workqueue.
+
+ mayday The number of times the rescuer was requested while waiting for
+ new worker creation.
+
+ rescued The number of work items executed by the rescuer.
+"""
+
+import sys
+import signal
+import os
+import re
+import time
+import json
+
+import drgn
+from drgn.helpers.linux.list import list_for_each_entry, list_empty
+from drgn.helpers.linux.cpumask import for_each_possible_cpu
+
+import argparse
+parser = argparse.ArgumentParser(description=desc,
+ formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument('workqueue', metavar='REGEX', nargs='*',
+ help='Target workqueue name patterns (all if empty)')
+parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1,
+ help='Monitoring interval (0 to print once and exit)')
+parser.add_argument('-j', '--json', action='store_true',
+ help='Output in json')
+args = parser.parse_args()
+
+def err(s):
+ print(s, file=sys.stderr, flush=True)
+ sys.exit(1)
+
+workqueues = prog['workqueues']
+
+WQ_UNBOUND = prog['WQ_UNBOUND']
+WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM']
+
+PWQ_STAT_STARTED = prog['PWQ_STAT_STARTED'] # work items started execution
+PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed execution
+PWQ_STAT_CPU_TIME = prog['PWQ_STAT_CPU_TIME'] # total CPU time consumed
+PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
+PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups
+PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer
+PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer
+PWQ_NR_STATS = prog['PWQ_NR_STATS']
+
+class WqStats:
+ def __init__(self, wq):
+ self.name = wq.name.string_().decode()
+ self.unbound = wq.flags & WQ_UNBOUND != 0
+ self.mem_reclaim = wq.flags & WQ_MEM_RECLAIM != 0
+ self.stats = [0] * PWQ_NR_STATS
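+ # stats are kept per pool_workqueue; sum across all of this wq's pwqs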
+ for pwq in list_for_each_entry('struct pool_workqueue', wq.pwqs.address_of_(), 'pwqs_node'):
+ for i in range(PWQ_NR_STATS):
+ self.stats[i] += int(pwq.stats[i])
+
+ def dict(self, now):
+ return { 'timestamp' : now,
+ 'name' : self.name,
+ 'unbound' : self.unbound,
+ 'mem_reclaim' : self.mem_reclaim,
+ 'started' : self.stats[PWQ_STAT_STARTED],
+ 'completed' : self.stats[PWQ_STAT_COMPLETED],
+ 'cpu_time' : self.stats[PWQ_STAT_CPU_TIME],
+ 'cpu_intensive' : self.stats[PWQ_STAT_CPU_INTENSIVE],
+ 'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP],
+ 'mayday' : self.stats[PWQ_STAT_MAYDAY],
+ 'rescued' : self.stats[PWQ_STAT_RESCUED], }
+
+ def table_header_str():
+ return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
+ f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'
+
+ def table_row_str(self):
+ cpu_intensive = '-'
+ cm_wakeup = '-'
+ mayday = '-'
+ rescued = '-'
+
+ if not self.unbound:
+ cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
+ cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP])
+
+ if self.mem_reclaim:
+ mayday = str(self.stats[PWQ_STAT_MAYDAY])
+ rescued = str(self.stats[PWQ_STAT_RESCUED])
+
+ out = f'{self.name[-24:]:24} ' \
+ f'{self.stats[PWQ_STAT_STARTED]:8} ' \
+ f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
+ f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \
+ f'{cpu_intensive:>7} ' \
+ f'{cm_wakeup:>7} ' \
+ f'{mayday:>7} ' \
+ f'{rescued:>7} '
+ return out.rstrip()
+
+exit_req = False
+
+def sigint_handler(signr, frame):
+ global exit_req
+ exit_req = True
+
+def main():
+ # handle args
+ table_fmt = not args.json
+ interval = args.interval
+
+ re_str = None
+ if args.workqueue:
+ for r in args.workqueue:
+ if re_str is None:
+ re_str = r
+ else:
+ re_str += '|' + r
+
+ filter_re = re.compile(re_str) if re_str else None
+
+ # monitoring loop
+ signal.signal(signal.SIGINT, sigint_handler)
+
+ while not exit_req:
+ now = time.time()
+
+ if table_fmt:
+ print()
+ print(WqStats.table_header_str())
+
+ for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'):
+ stats = WqStats(wq)
+ if filter_re and not filter_re.search(stats.name):
+ continue
+ if table_fmt:
+ print(stats.table_row_str())
+ else:
+ print(json.dumps(stats.dict(now)))
+
+ if interval == 0:
+ break
+ time.sleep(interval)
+
+if __name__ == "__main__":
+ main()