From e563d0a7cdc1890ff36bb177b5c8c2854d881e4d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 26 Jan 2024 11:55:50 -1000 Subject: workqueue: Break up enum definitions and give names to the types workqueue is collecting different sorts of enums into a single unnamed enum type which can increase confusion around enum width. Also, unnamed enums can't be accessed from BPF. Let's break up enum definitions according to their purposes and give them type names. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 2cc0a9606175..78047d0d9882 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -22,7 +22,7 @@ */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) -enum { +enum work_bits { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_INACTIVE_BIT= 1, /* work item is inactive */ WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ @@ -36,21 +36,6 @@ enum { WORK_STRUCT_COLOR_BITS = 4, - WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, - WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, - WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, - WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, -#ifdef CONFIG_DEBUG_OBJECTS_WORK - WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, -#else - WORK_STRUCT_STATIC = 0, -#endif - - WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), - - /* not bound to any CPU, prefer the local CPU */ - WORK_CPU_UNBOUND = NR_CPUS, - /* * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. * This makes pwqs aligned to 256 bytes and allows 16 workqueue @@ -74,6 +59,26 @@ enum { WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, +}; + +enum work_flags { + WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, + WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, + WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, + WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, +#ifdef CONFIG_DEBUG_OBJECTS_WORK + WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, +#else + WORK_STRUCT_STATIC = 0, +#endif +}; + +enum wq_misc_consts { + WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), + + /* not bound to any CPU, prefer the local CPU */ + WORK_CPU_UNBOUND = NR_CPUS, + /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, @@ -347,7 +352,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ -enum { +enum wq_flags { WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ @@ -387,7 +392,9 @@ enum { __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ +}; +enum wq_consts { WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, -- cgit v1.2.3 From 5797b1c18919cd9c289ded7954383e499f729ce0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:25 -1000 Subject: workqueue: Implement system-wide nr_active enforcement for unbound workqueues A pool_workqueue (pwq) represents the connection between a workqueue and a worker_pool. One of the roles that a pwq plays is enforcement of the max_active concurrency limit. Before 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues"), there was one pwq per each CPU for per-cpu workqueues and per each NUMA node for unbound workqueues, which was a natural result of per-cpu workqueues being served by per-cpu pools and unbound by per-NUMA pools. In terms of max_active enforcement, this was, while not perfect, workable. For per-cpu workqueues, it was fine. For unbound, it wasn't great in that NUMA machines would get max_active that's multiplied by the number of nodes but didn't cause huge problems because NUMA machines are relatively rare and the node count is usually pretty low. However, cache layouts are more complex now and sharing a worker pool across a whole node didn't really work well for unbound workqueues. Thus, a series of commits culminating on 8639ecebc9b1 ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") implemented more flexible affinity mechanism for unbound workqueues which enables using e.g. last-level-cache aligned pools. In the process, 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") made unbound workqueues use per-cpu pwqs like per-cpu workqueues. While the change was necessary to enable more flexible affinity scopes, this came with the side effect of blowing up the effective max_active for unbound workqueues. Before, the effective max_active for unbound workqueues was multiplied by the number of nodes. After, by the number of CPUs. 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") claims that this should generally be okay. It is okay for users which self-regulates concurrency level which are the vast majority; however, there are enough use cases which actually depend on max_active to prevent the level of concurrency from going bonkers including several IO handling workqueues that can issue a work item for each in-flight IO. With targeted benchmarks, the misbehavior can easily be exposed as reported in http://lkml.kernel.org/r/dbu6wiwu3sdhmhikb2w6lns7b27gbobfavhjj57kwi2quafgwl@htjcc5oikcr3. Unfortunately, there is no way to express what these use cases need using per-cpu max_active. A CPU may issue most of in-flight IOs, so we don't want to set max_active too low but as soon as we increase max_active a bit, we can end up with unreasonable number of in-flight work items when many CPUs issue IOs at the same time. ie. The acceptable lowest max_active is higher than the acceptable highest max_active. Ideally, max_active for an unbound workqueue should be system-wide so that the users can regulate the total level of concurrency regardless of node and cache layout. The reasons workqueue hasn't implemented that yet are: - One max_active enforcement decouples from pool boundaires, chaining execution after a work item finishes requires inter-pool operations which would require lock dancing, which is nasty. - Sharing a single nr_active count across the whole system can be pretty expensive on NUMA machines. - Per-pwq enforcement had been more or less okay while we were using per-node pools. It looks like we no longer can avoid decoupling max_active enforcement from pool boundaries. This patch implements system-wide nr_active mechanism with the following design characteristics: - To avoid sharing a single counter across multiple nodes, the configured max_active is split across nodes according to the proportion of each workqueue's online effective CPUs per node. e.g. A node with twice more online effective CPUs will get twice higher portion of max_active. - Workqueue used to be able to process a chain of interdependent work items which is as long as max_active. We can't do this anymore as max_active is distributed across the nodes. Instead, a new parameter min_active is introduced which determines the minimum level of concurrency within a node regardless of how max_active distribution comes out to be. It is set to the smaller of max_active and WQ_DFL_MIN_ACTIVE which is 8. This can lead to higher effective max_weight than configured and also deadlocks if a workqueue was depending on being able to handle chains of interdependent work items that are longer than 8. I believe these should be fine given that the number of CPUs in each NUMA node is usually higher than 8 and work item chain longer than 8 is pretty unlikely. However, if these assumptions turn out to be wrong, we'll need to add an interface to adjust min_active. - Each unbound wq has an array of struct wq_node_nr_active which tracks per-node nr_active. When its pwq wants to run a work item, it has to obtain the matching node's nr_active. If over the node's max_active, the pwq is queued on wq_node_nr_active->pending_pwqs. As work items finish, the completion path round-robins the pending pwqs activating the first inactive work item of each, which involves some pool lock dancing and kicking other pools. It's not the simplest code but doesn't look too bad. v4: - wq_adjust_max_active() updated to invoke wq_update_node_max_active(). - wq_adjust_max_active() is now protected by wq->mutex instead of wq_pool_mutex. v3: - wq_node_max_active() used to calculate per-node max_active on the fly based on system-wide CPU online states. Lai pointed out that this can lead to skewed distributions for workqueues with restricted cpumasks. Update the max_active distribution to use per-workqueue effective online CPU counts instead of system-wide and cache the calculation results in node_nr_active->max. v2: - wq->min/max_active now uses WRITE/READ_ONCE() as suggested by Lai. Signed-off-by: Tejun Heo Reported-by: Naohiro Aota Link: http://lkml.kernel.org/r/dbu6wiwu3sdhmhikb2w6lns7b27gbobfavhjj57kwi2quafgwl@htjcc5oikcr3 Fixes: 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 35 ++++- kernel/workqueue.c | 341 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 341 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 78047d0d9882..232baea90a1d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -398,6 +398,13 @@ enum wq_consts { WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, + + /* + * Per-node default cap on min_active. Unless explicitly set, min_active + * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see + * workqueue_struct->min_active definition. + */ + WQ_DFL_MIN_ACTIVE = 8, }; /* @@ -440,11 +447,33 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq; * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags - * @max_active: max in-flight work items per CPU, 0 for default + * @max_active: max in-flight work items, 0 for default * remaining args: args for @fmt * - * Allocate a workqueue with the specified parameters. For detailed - * information on WQ_* flags, please refer to + * For a per-cpu workqueue, @max_active limits the number of in-flight work + * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be + * executing at most one work item for the workqueue. + * + * For unbound workqueues, @max_active limits the number of in-flight work items + * for the whole system. e.g. @max_active of 16 indicates that that there can be + * at most 16 work items executing for the workqueue in the whole system. + * + * As sharing the same active counter for an unbound workqueue across multiple + * NUMA nodes can be expensive, @max_active is distributed to each NUMA node + * according to the proportion of the number of online CPUs and enforced + * independently. + * + * Depending on online CPU distribution, a node may end up with per-node + * max_active which is significantly lower than @max_active, which can lead to + * deadlocks if the per-node concurrency limit is lower than the maximum number + * of interdependent work items for the workqueue. + * + * To guarantee forward progress regardless of online CPU distribution, the + * concurrency limit on every node is guaranteed to be equal to or greater than + * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means + * that the sum of per-node max_active's may be larger than @max_active. + * + * For detailed information on %WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8d465478adb9..903be39bd2d1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -126,6 +126,9 @@ enum wq_internal_consts { * * L: pool->lock protected. Access with pool->lock held. * + * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for + * reads. + * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. @@ -247,17 +250,18 @@ struct pool_workqueue { * pwq->inactive_works instead of pool->worklist and marked with * WORK_STRUCT_INACTIVE. * - * All work items marked with WORK_STRUCT_INACTIVE do not participate - * in pwq->nr_active and all work items in pwq->inactive_works are - * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE - * work items are in pwq->inactive_works. Some of them are ready to - * run in pool->worklist or worker->scheduled. Those work itmes are - * only struct wq_barrier which is used for flush_work() and should - * not participate in pwq->nr_active. For non-barrier work item, it - * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. + * All work items marked with WORK_STRUCT_INACTIVE do not participate in + * nr_active and all work items in pwq->inactive_works are marked with + * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are + * in pwq->inactive_works. Some of them are ready to run in + * pool->worklist or worker->scheduled. Those work itmes are only struct + * wq_barrier which is used for flush_work() and should not participate + * in nr_active. For non-barrier work item, it is marked with + * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. */ int nr_active; /* L: nr of active works */ struct list_head inactive_works; /* L: inactive works */ + struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ @@ -289,9 +293,19 @@ struct wq_device; * on each CPU, in an unbound workqueue, max_active applies to the whole system. * As sharing a single nr_active across multiple sockets can be very expensive, * the counting and enforcement is per NUMA node. + * + * The following struct is used to enforce per-node max_active. When a pwq wants + * to start executing a work item, it should increment ->nr using + * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over + * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish + * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in + * round-robin order. */ struct wq_node_nr_active { - atomic_t nr; /* per-node nr_active count */ + int max; /* per-node max_active */ + atomic_t nr; /* per-node nr_active */ + raw_spinlock_t lock; /* nests inside pool locks */ + struct list_head pending_pwqs; /* LN: pwqs with inactive works */ }; /* @@ -314,8 +328,12 @@ struct workqueue_struct { struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ + + /* See alloc_workqueue() function comment for info on min/max_active */ int max_active; /* WO: max active works */ + int min_active; /* WO: min active works */ int saved_max_active; /* WQ: saved max_active */ + int saved_min_active; /* WQ: saved min_active */ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */ @@ -667,6 +685,19 @@ static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) lockdep_is_held(&wq->mutex)); } +/** + * unbound_effective_cpumask - effective cpumask of an unbound workqueue + * @wq: workqueue of interest + * + * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which + * is masked with wq_unbound_cpumask to determine the effective cpumask. The + * default pwq is always mapped to the pool with the current effective cpumask. + */ +static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq) +{ + return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask; +} + static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -1461,6 +1492,46 @@ static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, return wq->node_nr_active[node]; } +/** + * wq_update_node_max_active - Update per-node max_actives to use + * @wq: workqueue to update + * @off_cpu: CPU that's going down, -1 if a CPU is not going down + * + * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is + * distributed among nodes according to the proportions of numbers of online + * cpus. The result is always between @wq->min_active and max_active. + */ +static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) +{ + struct cpumask *effective = unbound_effective_cpumask(wq); + int min_active = READ_ONCE(wq->min_active); + int max_active = READ_ONCE(wq->max_active); + int total_cpus, node; + + lockdep_assert_held(&wq->mutex); + + if (!cpumask_test_cpu(off_cpu, effective)) + off_cpu = -1; + + total_cpus = cpumask_weight_and(effective, cpu_online_mask); + if (off_cpu >= 0) + total_cpus--; + + for_each_node(node) { + int node_cpus; + + node_cpus = cpumask_weight_and(effective, cpumask_of_node(node)); + if (off_cpu >= 0 && cpu_to_node(off_cpu) == node) + node_cpus--; + + wq_node_nr_active(wq, node)->max = + clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus), + min_active, max_active); + } + + wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active; +} + /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get @@ -1558,35 +1629,98 @@ static bool pwq_activate_work(struct pool_workqueue *pwq, return true; } +static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) +{ + int max = READ_ONCE(nna->max); + + while (true) { + int old, tmp; + + old = atomic_read(&nna->nr); + if (old >= max) + return false; + tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1); + if (tmp == old) + return true; + } +} + /** * pwq_tryinc_nr_active - Try to increment nr_active for a pwq * @pwq: pool_workqueue of interest + * @fill: max_active may have increased, try to increase concurrency level * * Try to increment nr_active for @pwq. Returns %true if an nr_active count is * successfully obtained. %false otherwise. */ -static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq) +static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill) { struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); - bool obtained; + bool obtained = false; lockdep_assert_held(&pool->lock); - obtained = pwq->nr_active < READ_ONCE(wq->max_active); + if (!nna) { + /* per-cpu workqueue, pwq->nr_active is sufficient */ + obtained = pwq->nr_active < READ_ONCE(wq->max_active); + goto out; + } + + /* + * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is + * already waiting on $nna, pwq_dec_nr_active() will maintain the + * concurrency level. Don't jump the line. + * + * We need to ignore the pending test after max_active has increased as + * pwq_dec_nr_active() can only maintain the concurrency level but not + * increase it. This is indicated by @fill. + */ + if (!list_empty(&pwq->pending_node) && likely(!fill)) + goto out; + + obtained = tryinc_node_nr_active(nna); + if (obtained) + goto out; + + /* + * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs + * and try again. The smp_mb() is paired with the implied memory barrier + * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either + * we see the decremented $nna->nr or they see non-empty + * $nna->pending_pwqs. + */ + raw_spin_lock(&nna->lock); + + if (list_empty(&pwq->pending_node)) + list_add_tail(&pwq->pending_node, &nna->pending_pwqs); + else if (likely(!fill)) + goto out_unlock; + + smp_mb(); + + obtained = tryinc_node_nr_active(nna); - if (obtained) { + /* + * If @fill, @pwq might have already been pending. Being spuriously + * pending in cold paths doesn't affect anything. Let's leave it be. + */ + if (obtained && likely(!fill)) + list_del_init(&pwq->pending_node); + +out_unlock: + raw_spin_unlock(&nna->lock); +out: + if (obtained) pwq->nr_active++; - if (nna) - atomic_inc(&nna->nr); - } return obtained; } /** * pwq_activate_first_inactive - Activate the first inactive work item on a pwq * @pwq: pool_workqueue of interest + * @fill: max_active may have increased, try to increase concurrency level * * Activate the first inactive work item of @pwq if available and allowed by * max_active limit. @@ -1594,13 +1728,13 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq) * Returns %true if an inactive work item has been activated. %false if no * inactive work item is found or max_active limit is reached. */ -static bool pwq_activate_first_inactive(struct pool_workqueue *pwq) +static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill) { struct work_struct *work = list_first_entry_or_null(&pwq->inactive_works, struct work_struct, entry); - if (work && pwq_tryinc_nr_active(pwq)) { + if (work && pwq_tryinc_nr_active(pwq, fill)) { __pwq_activate_work(pwq, work); return true; } else { @@ -1608,11 +1742,93 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq) } } +/** + * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active + * @nna: wq_node_nr_active to activate a pending pwq for + * @caller_pool: worker_pool the caller is locking + * + * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked. + * @caller_pool may be unlocked and relocked to lock other worker_pools. + */ +static void node_activate_pending_pwq(struct wq_node_nr_active *nna, + struct worker_pool *caller_pool) +{ + struct worker_pool *locked_pool = caller_pool; + struct pool_workqueue *pwq; + struct work_struct *work; + + lockdep_assert_held(&caller_pool->lock); + + raw_spin_lock(&nna->lock); +retry: + pwq = list_first_entry_or_null(&nna->pending_pwqs, + struct pool_workqueue, pending_node); + if (!pwq) + goto out_unlock; + + /* + * If @pwq is for a different pool than @locked_pool, we need to lock + * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock + * / lock dance. For that, we also need to release @nna->lock as it's + * nested inside pool locks. + */ + if (pwq->pool != locked_pool) { + raw_spin_unlock(&locked_pool->lock); + locked_pool = pwq->pool; + if (!raw_spin_trylock(&locked_pool->lock)) { + raw_spin_unlock(&nna->lock); + raw_spin_lock(&locked_pool->lock); + raw_spin_lock(&nna->lock); + goto retry; + } + } + + /* + * $pwq may not have any inactive work items due to e.g. cancellations. + * Drop it from pending_pwqs and see if there's another one. + */ + work = list_first_entry_or_null(&pwq->inactive_works, + struct work_struct, entry); + if (!work) { + list_del_init(&pwq->pending_node); + goto retry; + } + + /* + * Acquire an nr_active count and activate the inactive work item. If + * $pwq still has inactive work items, rotate it to the end of the + * pending_pwqs so that we round-robin through them. This means that + * inactive work items are not activated in queueing order which is fine + * given that there has never been any ordering across different pwqs. + */ + if (likely(tryinc_node_nr_active(nna))) { + pwq->nr_active++; + __pwq_activate_work(pwq, work); + + if (list_empty(&pwq->inactive_works)) + list_del_init(&pwq->pending_node); + else + list_move_tail(&pwq->pending_node, &nna->pending_pwqs); + + /* if activating a foreign pool, make sure it's running */ + if (pwq->pool != caller_pool) + kick_pool(pwq->pool); + } + +out_unlock: + raw_spin_unlock(&nna->lock); + if (locked_pool != caller_pool) { + raw_spin_unlock(&locked_pool->lock); + raw_spin_lock(&caller_pool->lock); + } +} + /** * pwq_dec_nr_active - Retire an active count * @pwq: pool_workqueue of interest * * Decrement @pwq's nr_active and try to activate the first inactive work item. + * For unbound workqueues, this function may temporarily drop @pwq->pool->lock. */ static void pwq_dec_nr_active(struct pool_workqueue *pwq) { @@ -1632,12 +1848,29 @@ static void pwq_dec_nr_active(struct pool_workqueue *pwq) * inactive work item on @pwq itself. */ if (!nna) { - pwq_activate_first_inactive(pwq); + pwq_activate_first_inactive(pwq, false); return; } - atomic_dec(&nna->nr); - pwq_activate_first_inactive(pwq); + /* + * If @pwq is for an unbound workqueue, it's more complicated because + * multiple pwqs and pools may be sharing the nr_active count. When a + * pwq needs to wait for an nr_active count, it puts itself on + * $nna->pending_pwqs. The following atomic_dec_return()'s implied + * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to + * guarantee that either we see non-empty pending_pwqs or they see + * decremented $nna->nr. + * + * $nna->max may change as CPUs come online/offline and @pwq->wq's + * max_active gets updated. However, it is guaranteed to be equal to or + * larger than @pwq->wq->min_active which is above zero unless freezing. + * This maintains the forward progress guarantee. + */ + if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max)) + return; + + if (!list_empty(&nna->pending_pwqs)) + node_activate_pending_pwq(nna, pool); } /** @@ -1965,7 +2198,7 @@ retry: * @work must also queue behind existing inactive work items to maintain * ordering when max_active changes. See wq_adjust_max_active(). */ - if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq)) { + if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { if (list_empty(&pool->worklist)) pool->watchdog_ts = jiffies; @@ -3200,7 +3433,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, barr->task = current; - /* The barrier work item does not participate in pwq->nr_active. */ + /* The barrier work item does not participate in nr_active. */ work_flags |= WORK_STRUCT_INACTIVE; /* @@ -4116,6 +4349,8 @@ static void free_node_nr_active(struct wq_node_nr_active **nna_ar) static void init_node_nr_active(struct wq_node_nr_active *nna) { atomic_set(&nna->nr, 0); + raw_spin_lock_init(&nna->lock); + INIT_LIST_HEAD(&nna->pending_pwqs); } /* @@ -4355,6 +4590,15 @@ static void pwq_release_workfn(struct kthread_work *work) mutex_unlock(&wq_pool_mutex); } + if (!list_empty(&pwq->pending_node)) { + struct wq_node_nr_active *nna = + wq_node_nr_active(pwq->wq, pwq->pool->node); + + raw_spin_lock_irq(&nna->lock); + list_del_init(&pwq->pending_node); + raw_spin_unlock_irq(&nna->lock); + } + call_rcu(&pwq->rcu, rcu_free_pwq); /* @@ -4380,6 +4624,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->inactive_works); + INIT_LIST_HEAD(&pwq->pending_node); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); kthread_init_work(&pwq->release_work, pwq_release_workfn); @@ -4587,6 +4832,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) ctx->pwq_tbl[cpu]); ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq); + /* update node_nr_active->max */ + wq_update_node_max_active(ctx->wq, -1); + mutex_unlock(&ctx->wq->mutex); } @@ -4850,24 +5098,35 @@ static int init_rescuer(struct workqueue_struct *wq) static void wq_adjust_max_active(struct workqueue_struct *wq) { bool activated; + int new_max, new_min; lockdep_assert_held(&wq->mutex); if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) { - WRITE_ONCE(wq->max_active, 0); - return; + new_max = 0; + new_min = 0; + } else { + new_max = wq->saved_max_active; + new_min = wq->saved_min_active; } - if (wq->max_active == wq->saved_max_active) + if (wq->max_active == new_max && wq->min_active == new_min) return; /* - * Update @wq->max_active and then kick inactive work items if more + * Update @wq->max/min_active and then kick inactive work items if more * active work items are allowed. This doesn't break work item ordering * because new work items are always queued behind existing inactive * work items if there are any. */ - WRITE_ONCE(wq->max_active, wq->saved_max_active); + WRITE_ONCE(wq->max_active, new_max); + WRITE_ONCE(wq->min_active, new_min); + + if (wq->flags & WQ_UNBOUND) + wq_update_node_max_active(wq, -1); + + if (new_max == 0) + return; /* * Round-robin through pwq's activating the first inactive work item @@ -4882,7 +5141,7 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) /* can be called during early boot w/ irq disabled */ raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq_activate_first_inactive(pwq)) { + if (pwq_activate_first_inactive(pwq, true)) { activated = true; kick_pool(pwq->pool); } @@ -4944,7 +5203,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, /* init wq */ wq->flags = flags; wq->max_active = max_active; - wq->saved_max_active = max_active; + wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE); + wq->saved_max_active = wq->max_active; + wq->saved_min_active = wq->min_active; mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); @@ -5110,7 +5371,8 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); * @wq: target workqueue * @max_active: new max_active value. * - * Set max_active of @wq to @max_active. + * Set max_active of @wq to @max_active. See the alloc_workqueue() function + * comment. * * CONTEXT: * Don't call from IRQ context. @@ -5127,6 +5389,9 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; + if (wq->flags & WQ_UNBOUND) + wq->saved_min_active = min(wq->saved_min_active, max_active); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); @@ -5808,6 +6073,10 @@ int workqueue_online_cpu(unsigned int cpu) for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, true); + + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, -1); + mutex_unlock(&wq->mutex); } } @@ -5836,6 +6105,10 @@ int workqueue_offline_cpu(unsigned int cpu) for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, false); + + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, cpu); + mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); @@ -7127,8 +7400,12 @@ void __init workqueue_init_topology(void) * combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) wq_update_pod(wq, cpu, cpu, true); + if (wq->flags & WQ_UNBOUND) { + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, -1); + mutex_unlock(&wq->mutex); } } -- cgit v1.2.3 From 4cb1ef64609f9b0254184b2947824f4b46ccab22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 Feb 2024 11:28:06 -1000 Subject: workqueue: Implement BH workqueues to eventually replace tasklets The only generic interface to execute asynchronously in the BH context is tasklet; however, it's marked deprecated and has some design flaws such as the execution code accessing the tasklet item after the execution is complete which can lead to subtle use-after-free in certain usage scenarios and less-developed flush and cancel mechanisms. This patch implements BH workqueues which share the same semantics and features of regular workqueues but execute their work items in the softirq context. As there is always only one BH execution context per CPU, none of the concurrency management mechanisms applies and a BH workqueue can be thought of as a convenience wrapper around softirq. Except for the inability to sleep while executing and lack of max_active adjustments, BH workqueues and work items should behave the same as regular workqueues and work items. Currently, the execution is hooked to tasklet[_hi]. However, the goal is to convert all tasklet users over to BH workqueues. Once the conversion is complete, tasklet can be removed and BH workqueues can directly take over the tasklet softirqs. system_bh[_highpri]_wq are added. As queue-wide flushing doesn't exist in tasklet, all existing tasklet users should be able to use the system BH workqueues without creating their own workqueues. v3: - Add missing interrupt.h include. v2: - Instead of using tasklets, hook directly into its softirq action functions - tasklet[_hi]_action(). This is slightly cheaper and closer to the eventual code structure we want to arrive at. Suggested by Lai. - Lai also pointed out several places which need NULL worker->task handling or can use clarification. Updated. Signed-off-by: Tejun Heo Suggested-by: Linus Torvalds Link: http://lkml.kernel.org/r/CAHk-=wjDW53w4-YcSmgKC5RruiRLHmJ1sXeYdp_ZgVoBw=5byA@mail.gmail.com Tested-by: Allen Pais Reviewed-by: Lai Jiangshan --- Documentation/core-api/workqueue.rst | 29 +++- include/linux/workqueue.h | 11 ++ kernel/softirq.c | 3 + kernel/workqueue.c | 291 ++++++++++++++++++++++++++++------- tools/workqueue/wq_dump.py | 11 +- 5 files changed, 285 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index 33c4539155d9..2d6af6c4665c 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -77,10 +77,12 @@ wants a function to be executed asynchronously it has to set up a work item pointing to that function and queue that work item on a workqueue. -Special purpose threads, called worker threads, execute the functions -off of the queue, one after the other. If no work is queued, the -worker threads become idle. These worker threads are managed in so -called worker-pools. +A work item can be executed in either a thread or the BH (softirq) context. + +For threaded workqueues, special purpose threads, called [k]workers, execute +the functions off of the queue, one after the other. If no work is queued, +the worker threads become idle. These worker threads are managed in +worker-pools. The cmwq design differentiates between the user-facing workqueues that subsystems and drivers queue work items on and the backend mechanism @@ -91,6 +93,12 @@ for high priority ones, for each possible CPU and some extra worker-pools to serve work items queued on unbound workqueues - the number of these backing pools is dynamic. +BH workqueues use the same framework. However, as there can only be one +concurrent execution context, there's no need to worry about concurrency. +Each per-CPU BH worker pool contains only one pseudo worker which represents +the BH execution context. A BH workqueue can be considered a convenience +interface to softirq. + Subsystems and drivers can create and queue work items through special workqueue API functions as they see fit. They can influence some aspects of the way the work items are executed by setting flags on the @@ -106,7 +114,7 @@ unless specifically overridden, a work item of a bound workqueue will be queued on the worklist of either normal or highpri worker-pool that is associated to the CPU the issuer is running on. -For any worker pool implementation, managing the concurrency level +For any thread pool implementation, managing the concurrency level (how many execution contexts are active) is an important issue. cmwq tries to keep the concurrency at a minimal but sufficient level. Minimal to save resources and sufficient in that the system is used at @@ -164,6 +172,17 @@ resources, scheduled and executed. ``flags`` --------- +``WQ_BH`` + BH workqueues can be considered a convenience interface to softirq. BH + workqueues are always per-CPU and all BH work items are executed in the + queueing CPU's softirq context in the queueing order. + + All BH workqueues must have 0 ``max_active`` and ``WQ_HIGHPRI`` is the + only allowed additional flag. + + BH work items cannot sleep. All other features such as delayed queueing, + flushing and canceling are supported. + ``WQ_UNBOUND`` Work items queued to an unbound wq are served by the special worker-pools which host workers which are not bound to any diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 232baea90a1d..283d7891b4c4 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -353,6 +353,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } * Documentation/core-api/workqueue.rst. */ enum wq_flags { + WQ_BH = 1 << 0, /* execute in bottom half (softirq) context */ WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ @@ -392,6 +393,9 @@ enum wq_flags { __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ + + /* BH wq only allows the following flags */ + __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, }; enum wq_consts { @@ -434,6 +438,9 @@ enum wq_consts { * they are same as their non-power-efficient counterparts - e.g. * system_power_efficient_wq is identical to system_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. + * + * system_bh[_highpri]_wq are convenience interface to softirq. BH work items + * are executed in the queueing CPU's BH context in the queueing order. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_highpri_wq; @@ -442,6 +449,10 @@ extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; +extern struct workqueue_struct *system_bh_wq; +extern struct workqueue_struct *system_bh_highpri_wq; + +void workqueue_softirq_action(bool highpri); /** * alloc_workqueue - allocate a workqueue diff --git a/kernel/softirq.c b/kernel/softirq.c index 210cf5f8d92c..547d282548a8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -802,11 +803,13 @@ static void tasklet_action_common(struct softirq_action *a, static __latent_entropy void tasklet_action(struct softirq_action *a) { + workqueue_softirq_action(false); tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); } static __latent_entropy void tasklet_hi_action(struct softirq_action *a) { + workqueue_softirq_action(true); tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 767971a29c7a..78b4b992e1a3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -72,8 +73,12 @@ enum worker_pool_flags { * Note that DISASSOCIATED should be flipped only while holding * wq_pool_attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. + * + * As there can only be one concurrent BH execution context per CPU, a + * BH pool is per-CPU and always DISASSOCIATED. */ - POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ + POOL_BH = 1 << 0, /* is a BH pool */ + POOL_MANAGER_ACTIVE = 1 << 1, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ }; @@ -115,6 +120,14 @@ enum wq_internal_consts { WQ_NAME_LEN = 32, }; +/* + * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and + * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because + * msecs_to_jiffies() can't be an initializer. + */ +#define BH_WORKER_JIFFIES msecs_to_jiffies(2) +#define BH_WORKER_RESTARTS 10 + /* * Structure fields follow one of the following exclusion rules. * @@ -443,8 +456,13 @@ static bool wq_debug_force_rr_cpu = false; #endif module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); +/* the BH worker pools */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + bh_worker_pools); + /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ @@ -478,6 +496,10 @@ struct workqueue_struct *system_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_power_efficient_wq); struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); +struct workqueue_struct *system_bh_wq; +EXPORT_SYMBOL_GPL(system_bh_wq); +struct workqueue_struct *system_bh_highpri_wq; +EXPORT_SYMBOL_GPL(system_bh_highpri_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); @@ -498,6 +520,11 @@ static void show_one_worker_pool(struct worker_pool *pool); !lockdep_is_held(&wq_pool_mutex), \ "RCU, wq->mutex or wq_pool_mutex should be held") +#define for_each_bh_worker_pool(pool, cpu) \ + for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \ + (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ + (pool)++) + #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -1186,6 +1213,14 @@ static bool kick_pool(struct worker_pool *pool) if (!need_more_worker(pool) || !worker) return false; + if (pool->flags & POOL_BH) { + if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) + raise_softirq_irqoff(HI_SOFTIRQ); + else + raise_softirq_irqoff(TASKLET_SOFTIRQ); + return true; + } + p = worker->task; #ifdef CONFIG_SMP @@ -1668,7 +1703,7 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill) lockdep_assert_held(&pool->lock); if (!nna) { - /* per-cpu workqueue, pwq->nr_active is sufficient */ + /* BH or per-cpu workqueue, pwq->nr_active is sufficient */ obtained = pwq->nr_active < READ_ONCE(wq->max_active); goto out; } @@ -2523,19 +2558,21 @@ static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) * cpu-[un]hotplugs. */ static void worker_attach_to_pool(struct worker *worker, - struct worker_pool *pool) + struct worker_pool *pool) { mutex_lock(&wq_pool_attach_mutex); /* - * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains - * stable across this function. See the comments above the flag - * definition for details. + * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable + * across this function. See the comments above the flag definition for + * details. BH workers are, while per-CPU, always DISASSOCIATED. */ - if (pool->flags & POOL_DISASSOCIATED) + if (pool->flags & POOL_DISASSOCIATED) { worker->flags |= WORKER_UNBOUND; - else + } else { + WARN_ON_ONCE(pool->flags & POOL_BH); kthread_set_per_cpu(worker->task, pool->cpu); + } if (worker->rescue_wq) set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); @@ -2559,6 +2596,9 @@ static void worker_detach_from_pool(struct worker *worker) struct worker_pool *pool = worker->pool; struct completion *detach_completion = NULL; + /* there is one permanent BH worker per CPU which should never detach */ + WARN_ON_ONCE(pool->flags & POOL_BH); + mutex_lock(&wq_pool_attach_mutex); kthread_set_per_cpu(worker->task, -1); @@ -2610,27 +2650,29 @@ static struct worker *create_worker(struct worker_pool *pool) worker->id = id; - if (pool->cpu >= 0) - snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, - pool->attrs->nice < 0 ? "H" : ""); - else - snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); - - worker->task = kthread_create_on_node(worker_thread, worker, pool->node, - "kworker/%s", id_buf); - if (IS_ERR(worker->task)) { - if (PTR_ERR(worker->task) == -EINTR) { - pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", - id_buf); - } else { - pr_err_once("workqueue: Failed to create a worker thread: %pe", - worker->task); + if (!(pool->flags & POOL_BH)) { + if (pool->cpu >= 0) + snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, + pool->attrs->nice < 0 ? "H" : ""); + else + snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); + + worker->task = kthread_create_on_node(worker_thread, worker, + pool->node, "kworker/%s", id_buf); + if (IS_ERR(worker->task)) { + if (PTR_ERR(worker->task) == -EINTR) { + pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", + id_buf); + } else { + pr_err_once("workqueue: Failed to create a worker thread: %pe", + worker->task); + } + goto fail; } - goto fail; - } - set_user_nice(worker->task, pool->attrs->nice); - kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); + set_user_nice(worker->task, pool->attrs->nice); + kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); + } /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); @@ -2646,7 +2688,8 @@ static struct worker *create_worker(struct worker_pool *pool) * check if not woken up soon. As kick_pool() is noop if @pool is empty, * wake it up explicitly. */ - wake_up_process(worker->task); + if (worker->task) + wake_up_process(worker->task); raw_spin_unlock_irq(&pool->lock); @@ -2988,7 +3031,8 @@ __acquires(&pool->lock) worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; - worker->current_at = worker->task->se.sum_exec_runtime; + if (worker->task) + worker->current_at = worker->task->se.sum_exec_runtime; work_data = *work_data_bits(work); worker->current_color = get_work_color(work_data); @@ -3086,7 +3130,8 @@ __acquires(&pool->lock) * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ - cond_resched(); + if (worker->task) + cond_resched(); raw_spin_lock_irq(&pool->lock); @@ -3369,6 +3414,61 @@ repeat: goto repeat; } +static void bh_worker(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + int nr_restarts = BH_WORKER_RESTARTS; + unsigned long end = jiffies + BH_WORKER_JIFFIES; + + raw_spin_lock_irq(&pool->lock); + worker_leave_idle(worker); + + /* + * This function follows the structure of worker_thread(). See there for + * explanations on each step. + */ + if (!need_more_worker(pool)) + goto done; + + WARN_ON_ONCE(!list_empty(&worker->scheduled)); + worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); + + do { + struct work_struct *work = + list_first_entry(&pool->worklist, + struct work_struct, entry); + + if (assign_work(work, worker, NULL)) + process_scheduled_works(worker); + } while (keep_working(pool) && + --nr_restarts && time_before(jiffies, end)); + + worker_set_flags(worker, WORKER_PREP); +done: + worker_enter_idle(worker); + kick_pool(pool); + raw_spin_unlock_irq(&pool->lock); +} + +/* + * TODO: Convert all tasklet users to workqueue and use softirq directly. + * + * This is currently called from tasklet[_hi]action() and thus is also called + * whenever there are tasklets to run. Let's do an early exit if there's nothing + * queued. Once conversion from tasklet is complete, the need_more_worker() test + * can be dropped. + * + * After full conversion, we'll add worker->softirq_action, directly use the + * softirq action and obtain the worker pointer from the softirq_action pointer. + */ +void workqueue_softirq_action(bool highpri) +{ + struct worker_pool *pool = + &per_cpu(bh_worker_pools, smp_processor_id())[highpri]; + if (need_more_worker(pool)) + bh_worker(list_first_entry(&pool->workers, struct worker, node)); +} + /** * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed @@ -3441,6 +3541,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { + static __maybe_unused struct lock_class_key bh_key, thr_key; unsigned int work_flags = 0; unsigned int work_color; struct list_head *head; @@ -3450,8 +3551,13 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. + * + * BH and threaded workqueues need separate lockdep keys to avoid + * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} + * usage". */ - INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); + INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func, + (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion_map(&barr->done, &target->lockdep_map); @@ -3557,15 +3663,31 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, static void touch_wq_lockdep_map(struct workqueue_struct *wq) { +#ifdef CONFIG_LOCKDEP + if (wq->flags & WQ_BH) + local_bh_disable(); + lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); + + if (wq->flags & WQ_BH) + local_bh_enable(); +#endif } static void touch_work_lockdep_map(struct work_struct *work, struct workqueue_struct *wq) { +#ifdef CONFIG_LOCKDEP + if (wq->flags & WQ_BH) + local_bh_disable(); + lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); + + if (wq->flags & WQ_BH) + local_bh_enable(); +#endif } /** @@ -5019,10 +5141,17 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) if (!(wq->flags & WQ_UNBOUND)) { for_each_possible_cpu(cpu) { - struct pool_workqueue **pwq_p = - per_cpu_ptr(wq->cpu_pwq, cpu); - struct worker_pool *pool = - &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]); + struct pool_workqueue **pwq_p; + struct worker_pool __percpu *pools; + struct worker_pool *pool; + + if (wq->flags & WQ_BH) + pools = bh_worker_pools; + else + pools = cpu_worker_pools; + + pool = &(per_cpu_ptr(pools, cpu)[highpri]); + pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); @@ -5197,6 +5326,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, size_t wq_size; int name_len; + if (flags & WQ_BH) { + if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS)) + return NULL; + if (WARN_ON_ONCE(max_active)) + return NULL; + } + /* * Unbound && max_active == 1 used to imply ordered, which is no longer * the case on many machines due to per-pod pools. While @@ -5234,8 +5370,16 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name); - max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, wq->name); + if (flags & WQ_BH) { + /* + * BH workqueues always share a single execution context per CPU + * and don't impose any max_active limit. + */ + max_active = INT_MAX; + } else { + max_active = max_active ?: WQ_DFL_ACTIVE; + max_active = wq_clamp_max_active(max_active, flags, wq->name); + } /* init wq */ wq->flags = flags; @@ -5416,6 +5560,9 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { + /* max_active doesn't mean anything for BH workqueues */ + if (WARN_ON(wq->flags & WQ_BH)) + return; /* disallow meddling with max_active for ordered workqueues */ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; @@ -5617,7 +5764,24 @@ static void pr_cont_pool_info(struct worker_pool *pool) pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); if (pool->node != NUMA_NO_NODE) pr_cont(" node=%d", pool->node); - pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); + pr_cont(" flags=0x%x", pool->flags); + if (pool->flags & POOL_BH) + pr_cont(" bh%s", + pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); + else + pr_cont(" nice=%d", pool->attrs->nice); +} + +static void pr_cont_worker_id(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (pool->flags & WQ_BH) + pr_cont("bh%s", + pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); + else + pr_cont("%d%s", task_pid_nr(worker->task), + worker->rescue_wq ? "(RESCUER)" : ""); } struct pr_cont_work_struct { @@ -5694,10 +5858,9 @@ static void show_pwq(struct pool_workqueue *pwq) if (worker->current_pwq != pwq) continue; - pr_cont("%s %d%s:%ps", comma ? "," : "", - task_pid_nr(worker->task), - worker->rescue_wq ? "(RESCUER)" : "", - worker->current_func); + pr_cont(" %s", comma ? "," : ""); + pr_cont_worker_id(worker); + pr_cont(":%ps", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work, &pcws); pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); @@ -5816,8 +5979,8 @@ static void show_one_worker_pool(struct worker_pool *pool) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); list_for_each_entry(worker, &pool->idle_list, entry) { - pr_cont(" %s%d", first ? "idle: " : "", - task_pid_nr(worker->task)); + pr_cont(" %s", first ? "idle: " : ""); + pr_cont_worker_id(worker); first = false; } pr_cont("\n"); @@ -6090,13 +6253,15 @@ int workqueue_online_cpu(unsigned int cpu) mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { - mutex_lock(&wq_pool_attach_mutex); + /* BH pools aren't affected by hotplug */ + if (pool->flags & POOL_BH) + continue; + mutex_lock(&wq_pool_attach_mutex); if (pool->cpu == cpu) rebind_workers(pool); else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&wq_pool_attach_mutex); } @@ -7053,7 +7218,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) /* did we stall? */ if (time_after(now, ts + thresh)) { lockup_detected = true; - if (pool->cpu >= 0) { + if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) { pool->cpu_stall = true; cpu_pool_stall = true; } @@ -7218,10 +7383,16 @@ void __init workqueue_init_early(void) pt->pod_node[0] = NUMA_NO_NODE; pt->cpu_pod[0] = 0; - /* initialize CPU pools */ + /* initialize BH and CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; + i = 0; + for_each_bh_worker_pool(pool, cpu) { + init_cpu_worker_pool(pool, cpu, std_nice[i++]); + pool->flags |= POOL_BH; + } + i = 0; for_each_cpu_worker_pool(pool, cpu) init_cpu_worker_pool(pool, cpu, std_nice[i++]); @@ -7257,10 +7428,14 @@ void __init workqueue_init_early(void) system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", WQ_FREEZABLE | WQ_POWER_EFFICIENT, 0); + system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0); + system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", + WQ_BH | WQ_HIGHPRI, 0); BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || - !system_freezable_power_efficient_wq); + !system_freezable_power_efficient_wq || + !system_bh_wq || !system_bh_highpri_wq); } static void __init wq_cpu_intensive_thresh_init(void) @@ -7326,9 +7501,10 @@ void __init workqueue_init(void) * up. Also, create a rescuer for workqueues that requested it. */ for_each_possible_cpu(cpu) { - for_each_cpu_worker_pool(pool, cpu) { + for_each_bh_worker_pool(pool, cpu) + pool->node = cpu_to_node(cpu); + for_each_cpu_worker_pool(pool, cpu) pool->node = cpu_to_node(cpu); - } } list_for_each_entry(wq, &workqueues, list) { @@ -7339,7 +7515,16 @@ void __init workqueue_init(void) mutex_unlock(&wq_pool_mutex); - /* create the initial workers */ + /* + * Create the initial workers. A BH pool has one pseudo worker that + * represents the shared BH execution context and thus doesn't get + * affected by hotplug events. Create the BH pseudo workers for all + * possible CPUs here. + */ + for_each_possible_cpu(cpu) + for_each_bh_worker_pool(pool, cpu) + BUG_ON(!create_worker(pool)); + for_each_online_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->flags &= ~POOL_DISASSOCIATED; diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py index bd381511bd9a..d29b918306b4 100644 --- a/tools/workqueue/wq_dump.py +++ b/tools/workqueue/wq_dump.py @@ -79,7 +79,9 @@ def cpumask_str(cpumask): wq_type_len = 9 def wq_type_str(wq): - if wq.flags & WQ_UNBOUND: + if wq.flags & WQ_BH: + return f'{"bh":{wq_type_len}}' + elif wq.flags & WQ_UNBOUND: if wq.flags & WQ_ORDERED: return f'{"ordered":{wq_type_len}}' else: @@ -97,6 +99,7 @@ wq_pod_types = prog['wq_pod_types'] wq_affn_dfl = prog['wq_affn_dfl'] wq_affn_names = prog['wq_affn_names'] +WQ_BH = prog['WQ_BH'] WQ_UNBOUND = prog['WQ_UNBOUND'] WQ_ORDERED = prog['__WQ_ORDERED'] WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM'] @@ -107,6 +110,8 @@ WQ_AFFN_CACHE = prog['WQ_AFFN_CACHE'] WQ_AFFN_NUMA = prog['WQ_AFFN_NUMA'] WQ_AFFN_SYSTEM = prog['WQ_AFFN_SYSTEM'] +POOL_BH = prog['POOL_BH'] + WQ_NAME_LEN = prog['WQ_NAME_LEN'].value_() cpumask_str_len = len(cpumask_str(wq_unbound_cpumask)) @@ -151,10 +156,12 @@ for pi, pool in idr_for_each(worker_pool_idr): for pi, pool in idr_for_each(worker_pool_idr): pool = drgn.Object(prog, 'struct worker_pool', address=pool) - print(f'pool[{pi:0{max_pool_id_len}}] ref={pool.refcnt.value_():{max_ref_len}} nice={pool.attrs.nice.value_():3} ', end='') + print(f'pool[{pi:0{max_pool_id_len}}] flags=0x{pool.flags.value_():02x} ref={pool.refcnt.value_():{max_ref_len}} nice={pool.attrs.nice.value_():3} ', end='') print(f'idle/workers={pool.nr_idle.value_():3}/{pool.nr_workers.value_():3} ', end='') if pool.cpu >= 0: print(f'cpu={pool.cpu.value_():3}', end='') + if pool.flags & POOL_BH: + print(' bh', end='') else: print(f'cpus={cpumask_str(pool.attrs.cpumask)}', end='') print(f' pod_cpus={cpumask_str(pool.attrs.__pod_cpumask)}', end='') -- cgit v1.2.3 From 3bc1e711c26bff01d41ad71145ecb8dcb4412576 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Feb 2024 14:19:10 -1000 Subject: workqueue: Don't implicitly make UNBOUND workqueues w/ @max_active==1 ordered 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") automoatically promoted UNBOUND workqueues w/ @max_active==1 to ordered workqueues because UNBOUND workqueues w/ @max_active==1 used to be the way to create ordered workqueues and the new NUMA support broke it. These problems can be subtle and the fact that they can only trigger on NUMA machines made them even more difficult to debug. However, overloading the UNBOUND allocation interface this way creates other issues. It's difficult to tell whether a given workqueue actually needs to be ordered and users that legitimately want a min concurrency level wq unexpectedly gets an ordered one instead. With planned UNBOUND workqueue udpates to improve execution locality and more prevalence of chiplet designs which can benefit from such improvements, this isn't a state we wanna be in forever. There aren't that many UNBOUND w/ @max_active==1 users in the tree and the preceding patches audited all and converted them to alloc_ordered_workqueue() as appropriate. This patch removes the implicit promotion of UNBOUND w/ @max_active==1 workqueues to ordered ones. v2: v1 patch incorrectly dropped !list_empty(&wq->pwqs) condition in apply_workqueue_attrs_locked() which spuriously triggers WARNING and fails workqueue creation. Fix it. Signed-off-by: Tejun Heo Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202304251050.45a5df1f-oliver.sang@intel.com --- Documentation/core-api/workqueue.rst | 14 +++++--------- include/linux/workqueue.h | 4 +--- kernel/workqueue.c | 22 ++++------------------ 3 files changed, 10 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index 2d6af6c4665c..9572609b5263 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -256,15 +256,11 @@ may queue at the same time. Unless there is a specific need for throttling the number of active work items, specifying '0' is recommended. -Some users depend on the strict execution ordering of ST wq. The -combination of ``@max_active`` of 1 and ``WQ_UNBOUND`` used to -achieve this behavior. Work items on such wq were always queued to the -unbound worker-pools and only one work item could be active at any given -time thus achieving the same ordering property as ST wq. - -In the current implementation the above configuration only guarantees -ST behavior within a given NUMA node. Instead ``alloc_ordered_workqueue()`` should -be used to achieve system-wide ST behavior. +Some users depend on strict execution ordering where only one work item +is in flight at any given time and the work items are processed in +queueing order. While the combination of ``@max_active`` of 1 and +``WQ_UNBOUND`` used to achieve this behavior, this is no longer the +case. Use ``alloc_ordered_queue()`` instead. Example Execution Scenarios diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 283d7891b4c4..4ba33cf07f11 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -392,7 +392,6 @@ enum wq_flags { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ - __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ /* BH wq only allows the following flags */ __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, @@ -507,8 +506,7 @@ alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ - alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \ - __WQ_ORDERED_EXPLICIT | (flags), 1, ##args) + alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 68c48489eab3..ecc775843bfa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5007,12 +5007,8 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ - if (!list_empty(&wq->pwqs)) { - if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) - return -EINVAL; - - wq->flags &= ~__WQ_ORDERED; - } + if (!list_empty(&wq->pwqs) && WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); if (IS_ERR(ctx)) @@ -5333,15 +5329,6 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, return NULL; } - /* - * Unbound && max_active == 1 used to imply ordered, which is no longer - * the case on many machines due to per-pod pools. While - * alloc_ordered_workqueue() is the right way to create an ordered - * workqueue, keep the previous behavior to avoid subtle breakages. - */ - if ((flags & WQ_UNBOUND) && max_active == 1) - flags |= __WQ_ORDERED; - /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; @@ -5564,14 +5551,13 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) if (WARN_ON(wq->flags & WQ_BH)) return; /* disallow meddling with max_active for ordered workqueues */ - if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + if (WARN_ON(wq->flags & __WQ_ORDERED)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); - wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; if (wq->flags & WQ_UNBOUND) wq->saved_min_active = min(wq->saved_min_active, max_active); @@ -7028,7 +7014,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ - if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + if (WARN_ON(wq->flags & __WQ_ORDERED)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); -- cgit v1.2.3 From 8f172181f24bb5df7675225d9b5b66d059613f50 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Feb 2024 14:11:56 -1000 Subject: workqueue: Implement workqueue_set_min_active() Since 5797b1c18919 ("workqueue: Implement system-wide nr_active enforcement for unbound workqueues"), unbound workqueues have separate min_active which sets the number of interdependent work items that can be handled. This value is currently initialized to WQ_DFL_MIN_ACTIVE which is 8. This isn't high enough for some users, let's add an interface to adjust the setting. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 ++ kernel/workqueue.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4ba33cf07f11..1565bab9edc8 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -553,6 +553,8 @@ extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); +extern void workqueue_set_min_active(struct workqueue_struct *wq, + int min_active); extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ddcdeb7b9f26..4950bfc2cdcc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5629,6 +5629,33 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) } EXPORT_SYMBOL_GPL(workqueue_set_max_active); +/** + * workqueue_set_min_active - adjust min_active of an unbound workqueue + * @wq: target unbound workqueue + * @min_active: new min_active value + * + * Set min_active of an unbound workqueue. Unlike other types of workqueues, an + * unbound workqueue is not guaranteed to be able to process max_active + * interdependent work items. Instead, an unbound workqueue is guaranteed to be + * able to process min_active number of interdependent work items which is + * %WQ_DFL_MIN_ACTIVE by default. + * + * Use this function to adjust the min_active value between 0 and the current + * max_active. + */ +void workqueue_set_min_active(struct workqueue_struct *wq, int min_active) +{ + /* min_active is only meaningful for non-ordered unbound workqueues */ + if (WARN_ON((wq->flags & (WQ_BH | WQ_UNBOUND | __WQ_ORDERED)) != + WQ_UNBOUND)) + return; + + mutex_lock(&wq->mutex); + wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active); + wq_adjust_max_active(wq); + mutex_unlock(&wq->mutex); +} + /** * current_work - retrieve %current task's work struct * -- cgit v1.2.3 From bf52b1ac6ab41a060511d56d0f2da12f3a2486db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Feb 2024 14:14:16 -1000 Subject: async: Use a dedicated unbound workqueue with raised min_active Async can schedule a number of interdependent work items. However, since 5797b1c18919 ("workqueue: Implement system-wide nr_active enforcement for unbound workqueues"), unbound workqueues have separate min_active which sets the number of interdependent work items that can be handled. This default value is 8 which isn't sufficient for async and can lead to stalls during resume from suspend in some cases. Let's use a dedicated unbound workqueue with raised min_active. Link: http://lkml.kernel.org/r/708a65cc-79ec-44a6-8454-a93d0f3114c3@samsung.com Reported-by: Marek Szyprowski Cc: Rafael J. Wysocki Tested-by: Marek Szyprowski Signed-off-by: Tejun Heo --- include/linux/async.h | 1 + init/main.c | 1 + kernel/async.c | 17 ++++++++++++++++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/async.h b/include/linux/async.h index 33c9ff4afb49..19b778d08600 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -120,4 +120,5 @@ extern void async_synchronize_cookie(async_cookie_t cookie); extern void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain); extern bool current_is_async(void); +extern void async_init(void); #endif diff --git a/init/main.c b/init/main.c index e24b0780fdff..685db36c999f 100644 --- a/init/main.c +++ b/init/main.c @@ -1545,6 +1545,7 @@ static noinline void __init kernel_init_freeable(void) sched_init_smp(); workqueue_init_topology(); + async_init(); padata_init(); page_alloc_init_late(); diff --git a/kernel/async.c b/kernel/async.c index 97f224a5257b..4c3e6a44595f 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -64,6 +64,7 @@ static async_cookie_t next_cookie = 1; static LIST_HEAD(async_global_pending); /* pending from all registered doms */ static ASYNC_DOMAIN(async_dfl_domain); static DEFINE_SPINLOCK(async_lock); +static struct workqueue_struct *async_wq; struct async_entry { struct list_head domain_list; @@ -174,7 +175,7 @@ static async_cookie_t __async_schedule_node_domain(async_func_t func, spin_unlock_irqrestore(&async_lock, flags); /* schedule for execution */ - queue_work_node(node, system_unbound_wq, &entry->work); + queue_work_node(node, async_wq, &entry->work); return newcookie; } @@ -345,3 +346,17 @@ bool current_is_async(void) return worker && worker->current_func == async_run_entry_fn; } EXPORT_SYMBOL_GPL(current_is_async); + +void __init async_init(void) +{ + /* + * Async can schedule a number of interdependent work items. However, + * unbound workqueues can handle only upto min_active interdependent + * work items. The default min_active of 8 isn't sufficient for async + * and can lead to stalls. Let's use a dedicated workqueue with raised + * min_active. + */ + async_wq = alloc_workqueue("async", WQ_UNBOUND, 0); + BUG_ON(!async_wq); + workqueue_set_min_active(async_wq, WQ_DFL_ACTIVE); +} -- cgit v1.2.3 From e9a8e01f9b133c145dd125021ec47c006d108af4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Feb 2024 19:36:14 -1000 Subject: workqueue: Clean up enum work_bits and related constants The bits of work->data are used for a few different purposes. How the bits are used is determined by enum work_bits. The planned disable/enable support will add another use, so let's clean it up a bit in preparation. - Let WORK_STRUCT_*_BIT's values be determined by enum definition order. - Deliminate different bit sections the same way using SHIFT and BITS values. - Rename __WORK_OFFQ_CANCELING to WORK_OFFQ_CANCELING_BIT for consistency. - Introduce WORK_STRUCT_PWQ_SHIFT and replace WORK_STRUCT_FLAG_MASK and WORK_STRUCT_WQ_DATA_MASK with WQ_STRUCT_PWQ_MASK for clarity. - Improve documentation. No functional changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 58 ++++++++++++++++++++++++++--------------------- kernel/workqueue.c | 8 +++---- 2 files changed, 36 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1565bab9edc8..0ad534fe6673 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -24,41 +24,49 @@ enum work_bits { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ - WORK_STRUCT_INACTIVE_BIT= 1, /* work item is inactive */ - WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ - WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */ + WORK_STRUCT_INACTIVE_BIT, /* work item is inactive */ + WORK_STRUCT_PWQ_BIT, /* data points to pwq */ + WORK_STRUCT_LINKED_BIT, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK - WORK_STRUCT_STATIC_BIT = 4, /* static initializer (debugobjects) */ - WORK_STRUCT_COLOR_SHIFT = 5, /* color for workqueue flushing */ -#else - WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ + WORK_STRUCT_STATIC_BIT, /* static initializer (debugobjects) */ #endif + WORK_STRUCT_FLAG_BITS, + /* color for workqueue flushing */ + WORK_STRUCT_COLOR_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_STRUCT_COLOR_BITS = 4, /* - * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. - * This makes pwqs aligned to 256 bytes and allows 16 workqueue - * flush colors. + * When WORK_STRUCT_PWQ is set, reserve 8 bits off of pwq pointer w/ + * debugobjects turned off. This makes pwqs aligned to 256 bytes (512 + * bytes w/ DEBUG_OBJECTS_WORK) and allows 16 workqueue flush colors. + * + * MSB + * [ pwq pointer ] [ flush color ] [ STRUCT flags ] + * 4 bits 4 or 5 bits */ - WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + - WORK_STRUCT_COLOR_BITS, + WORK_STRUCT_PWQ_SHIFT = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, - /* data contains off-queue information when !WORK_STRUCT_PWQ */ - WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, - - __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, + /* + * data contains off-queue information when !WORK_STRUCT_PWQ. + * + * MSB + * [ pool ID ] [ OFFQ flags ] [ STRUCT flags ] + * 1 bit 4 or 5 bits + */ + WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, + WORK_OFFQ_CANCELING_BIT = WORK_OFFQ_FLAG_SHIFT, + WORK_OFFQ_FLAG_END, + WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, /* - * When a work item is off queue, its high bits point to the last - * pool it was on. Cap at 31 bits and use the highest number to - * indicate that no pool is associated. + * When a work item is off queue, the high bits encode off-queue flags + * and the last pool it was on. Cap pool ID to 31 bits and use the + * highest number to indicate that no pool is associated. */ - WORK_OFFQ_FLAG_BITS = 1, - WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS, + WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, - }; enum work_flags { @@ -88,12 +96,10 @@ enum wq_misc_consts { }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ -#define WORK_OFFQ_CANCELING (1ul << __WORK_OFFQ_CANCELING) +#define WORK_OFFQ_CANCELING (1ul << WORK_OFFQ_CANCELING_BIT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) - -#define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1) -#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) +#define WORK_STRUCT_PWQ_MASK (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1)) #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 317c85f051b0..7c6915e23c5c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -247,7 +247,7 @@ enum pool_workqueue_stats { }; /* - * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS + * The per-pool workqueue. While queued, bits below WORK_PWQ_SHIFT * of work_struct->data are used for flags and the remaining high bits * point to the pwq; thus, pwqs need to be aligned at two's power of the * number of flag bits. @@ -294,7 +294,7 @@ struct pool_workqueue { */ struct kthread_work release_work; struct rcu_head rcu; -} __aligned(1 << WORK_STRUCT_FLAG_BITS); +} __aligned(1 << WORK_STRUCT_PWQ_SHIFT); /* * Structure used to wait for workqueue flush. @@ -843,7 +843,7 @@ static void clear_work_data(struct work_struct *work) static inline struct pool_workqueue *work_struct_pwq(unsigned long data) { - return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK); + return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK); } static struct pool_workqueue *get_work_pwq(struct work_struct *work) @@ -4851,7 +4851,7 @@ static void pwq_release_workfn(struct kthread_work *work) static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { - BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK); memset(pwq, 0, sizeof(*pwq)); -- cgit v1.2.3 From 60b2ebf48526567b53e0188dbd1a4df8e646bcc1 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Tue, 27 Feb 2024 19:10:37 +0000 Subject: workqueue: Introduce from_work() helper for cleaner callback declarations To streamline the transition from tasklets to worqueues, a new helper function, from_work(), is introduced. This helper, inspired by existing from_() patterns, utilizes container_of() and eliminates the redundancy of declaring variable types, leading to more concise and readable code. The modified code snippet demonstrates the enhanced clarity achieved with from_wq(): void callback(struct work_struct *w) { - struct some_data_structure *local = container_of(w, struct some_data_structure, work); + struct some_data_structure *local = from_work(local, w, work); This change aims to facilitate a smoother transition and uphold code quality standards. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git disable_work-v3 Signed-off-by: Allen Pais Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0ad534fe6673..64a60b9232d3 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -522,6 +522,9 @@ alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) +#define from_work(var, callback_work, work_fieldname) \ + container_of(callback_work, typeof(*var), work_fieldname) + extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(void); -- cgit v1.2.3 From 1acd92d95fa24edca8f0292b21870025da93e24f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Feb 2024 15:38:55 -1000 Subject: workqueue: Drain BH work items on hot-unplugged CPUs Boqun pointed out that workqueues aren't handling BH work items on offlined CPUs. Unlike tasklet which transfers out the pending tasks from CPUHP_SOFTIRQ_DEAD, BH workqueue would just leave them pending which is problematic. Note that this behavior is specific to BH workqueues as the non-BH per-CPU workers just become unbound when the CPU goes offline. This patch fixes the issue by draining the pending BH work items from an offlined CPU from CPUHP_SOFTIRQ_DEAD. Because work items carry more context, it's not as easy to transfer the pending work items from one pool to another. Instead, run BH work items which execute the offlined pools on an online CPU. Note that this assumes that no further BH work items will be queued on the offlined CPUs. This assumption is shared with tasklet and should be fine for conversions. However, this issue also exists for per-CPU workqueues which will just keep executing work items queued after CPU offline on unbound workers and workqueue should reject per-CPU and BH work items queued on offline CPUs. This will be addressed separately later. Signed-off-by: Tejun Heo Reported-and-reviewed-by: Boqun Feng Link: http://lkml.kernel.org/r/Zdvw0HdSXcU3JZ4g@boqun-archlinux --- include/linux/workqueue.h | 1 + kernel/softirq.c | 2 ++ kernel/workqueue.c | 91 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 91 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 64a60b9232d3..158784dd189a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -458,6 +458,7 @@ extern struct workqueue_struct *system_bh_wq; extern struct workqueue_struct *system_bh_highpri_wq; void workqueue_softirq_action(bool highpri); +void workqueue_softirq_dead(unsigned int cpu); /** * alloc_workqueue - allocate a workqueue diff --git a/kernel/softirq.c b/kernel/softirq.c index 547d282548a8..b315b21fb28c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -932,6 +932,8 @@ static void run_ksoftirqd(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU static int takeover_tasklets(unsigned int cpu) { + workqueue_softirq_dead(cpu); + /* CPU is dead, so no lock needed. */ local_irq_disable(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 38783e3a60bb..a60eb65955e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -81,6 +81,7 @@ enum worker_pool_flags { POOL_BH = 1 << 0, /* is a BH pool */ POOL_MANAGER_ACTIVE = 1 << 1, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + POOL_BH_DRAINING = 1 << 3, /* draining after CPU offline */ }; enum worker_flags { @@ -1218,7 +1219,9 @@ static struct irq_work *bh_pool_irq_work(struct worker_pool *pool) static void kick_bh_pool(struct worker_pool *pool) { #ifdef CONFIG_SMP - if (unlikely(pool->cpu != smp_processor_id())) { + /* see drain_dead_softirq_workfn() for BH_DRAINING */ + if (unlikely(pool->cpu != smp_processor_id() && + !(pool->flags & POOL_BH_DRAINING))) { irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu); return; } @@ -3155,6 +3158,7 @@ __acquires(&pool->lock) struct worker_pool *pool = worker->pool; unsigned long work_data; int lockdep_start_depth, rcu_start_depth; + bool bh_draining = pool->flags & POOL_BH_DRAINING; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from @@ -3220,7 +3224,9 @@ __acquires(&pool->lock) rcu_start_depth = rcu_preempt_depth(); lockdep_start_depth = lockdep_depth(current); - lock_map_acquire(&pwq->wq->lockdep_map); + /* see drain_dead_softirq_workfn() */ + if (!bh_draining) + lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* * Strictly speaking we should mark the invariant state without holding @@ -3253,7 +3259,8 @@ __acquires(&pool->lock) trace_workqueue_execute_end(work, worker->current_func); pwq->stats[PWQ_STAT_COMPLETED]++; lock_map_release(&lockdep_map); - lock_map_release(&pwq->wq->lockdep_map); + if (!bh_draining) + lock_map_release(&pwq->wq->lockdep_map); if (unlikely((worker->task && in_atomic()) || lockdep_depth(current) != lockdep_start_depth || @@ -3615,6 +3622,84 @@ void workqueue_softirq_action(bool highpri) bh_worker(list_first_entry(&pool->workers, struct worker, node)); } +struct wq_drain_dead_softirq_work { + struct work_struct work; + struct worker_pool *pool; + struct completion done; +}; + +static void drain_dead_softirq_workfn(struct work_struct *work) +{ + struct wq_drain_dead_softirq_work *dead_work = + container_of(work, struct wq_drain_dead_softirq_work, work); + struct worker_pool *pool = dead_work->pool; + bool repeat; + + /* + * @pool's CPU is dead and we want to execute its still pending work + * items from this BH work item which is running on a different CPU. As + * its CPU is dead, @pool can't be kicked and, as work execution path + * will be nested, a lockdep annotation needs to be suppressed. Mark + * @pool with %POOL_BH_DRAINING for the special treatments. + */ + raw_spin_lock_irq(&pool->lock); + pool->flags |= POOL_BH_DRAINING; + raw_spin_unlock_irq(&pool->lock); + + bh_worker(list_first_entry(&pool->workers, struct worker, node)); + + raw_spin_lock_irq(&pool->lock); + pool->flags &= ~POOL_BH_DRAINING; + repeat = need_more_worker(pool); + raw_spin_unlock_irq(&pool->lock); + + /* + * bh_worker() might hit consecutive execution limit and bail. If there + * still are pending work items, reschedule self and return so that we + * don't hog this CPU's BH. + */ + if (repeat) { + if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) + queue_work(system_bh_highpri_wq, work); + else + queue_work(system_bh_wq, work); + } else { + complete(&dead_work->done); + } +} + +/* + * @cpu is dead. Drain the remaining BH work items on the current CPU. It's + * possible to allocate dead_work per CPU and avoid flushing. However, then we + * have to worry about draining overlapping with CPU coming back online or + * nesting (one CPU's dead_work queued on another CPU which is also dead and so + * on). Let's keep it simple and drain them synchronously. These are BH work + * items which shouldn't be requeued on the same pool. Shouldn't take long. + */ +void workqueue_softirq_dead(unsigned int cpu) +{ + int i; + + for (i = 0; i < NR_STD_WORKER_POOLS; i++) { + struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i]; + struct wq_drain_dead_softirq_work dead_work; + + if (!need_more_worker(pool)) + continue; + + INIT_WORK(&dead_work.work, drain_dead_softirq_workfn); + dead_work.pool = pool; + init_completion(&dead_work.done); + + if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) + queue_work(system_bh_highpri_wq, &dead_work.work); + else + queue_work(system_bh_wq, &dead_work.work); + + wait_for_completion(&dead_work.done); + } +} + /** * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed -- cgit v1.2.3