1 files changed, 63 insertions, 52 deletions
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 683efe29fa69..1c1d06804d45 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -125,6 +125,17 @@ struct rcu_work {
 	struct workqueue_struct *wq;
 };
 
+enum wq_affn_scope {
+	WQ_AFFN_DFL,			/* use system default */
+	WQ_AFFN_CPU,			/* one pod per CPU */
+	WQ_AFFN_SMT,			/* one pod poer SMT */
+	WQ_AFFN_CACHE,			/* one pod per LLC */
+	WQ_AFFN_NUMA,			/* one pod per NUMA node */
+	WQ_AFFN_SYSTEM,			/* one pod across the whole system */
+
+	WQ_AFFN_NR_TYPES,
+};
+
 /**
  * struct workqueue_attrs - A struct for workqueue attributes.
  *
@@ -138,17 +149,58 @@ struct workqueue_attrs {
 
 	/**
 	 * @cpumask: allowed CPUs
+	 *
+	 * Work items in this workqueue are affine to these CPUs and not allowed
+	 * to execute on other CPUs. A pool serving a workqueue must have the
+	 * same @cpumask.
 	 */
 	cpumask_var_t cpumask;
 
 	/**
-	 * @no_numa: disable NUMA affinity
+	 * @__pod_cpumask: internal attribute used to create per-pod pools
+	 *
+	 * Internal use only.
+	 *
+	 * Per-pod unbound worker pools are used to improve locality. Always a
+	 * subset of ->cpumask. A workqueue can be associated with multiple
+	 * worker pools with disjoint @__pod_cpumask's. Whether the enforcement
+	 * of a pool's @__pod_cpumask is strict depends on @affn_strict.
+	 */
+	cpumask_var_t __pod_cpumask;
+
+	/**
+	 * @affn_strict: affinity scope is strict
+	 *
+	 * If clear, workqueue will make a best-effort attempt at starting the
+	 * worker inside @__pod_cpumask but the scheduler is free to migrate it
+	 * outside.
 	 *
-	 * Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It
-	 * only modifies how :c:func:`apply_workqueue_attrs` select pools and thus
-	 * doesn't participate in pool hash calculations or equality comparisons.
+	 * If set, workers are only allowed to run inside @__pod_cpumask.
+	 */
+	bool affn_strict;
+
+	/*
+	 * Below fields aren't properties of a worker_pool. They only modify how
+	 * :c:func:`apply_workqueue_attrs` select pools and thus don't
+	 * participate in pool hash calculations or equality comparisons.
+	 */
+
+	/**
+	 * @affn_scope: unbound CPU affinity scope
+	 *
+	 * CPU pods are used to improve execution locality of unbound work
+	 * items. There are multiple pod types, one for each wq_affn_scope, and
+	 * every CPU in the system belongs to one pod in every pod type. CPUs
+	 * that belong to the same pod share the worker pool. For example,
+	 * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker
+	 * pool for each NUMA node.
+	 */
+	enum wq_affn_scope affn_scope;
+
+	/**
+	 * @ordered: work items must be executed one by one in queueing order
 	 */
-	bool no_numa;
+	bool ordered;
 };
 
 static inline struct delayed_work *to_delayed_work(struct work_struct *work)
@@ -343,14 +395,10 @@ enum {
 	__WQ_ORDERED_EXPLICIT	= 1 << 19, /* internal: alloc_ordered_workqueue() */
 
 	WQ_MAX_ACTIVE		= 512,	  /* I like 512, better ideas? */
-	WQ_MAX_UNBOUND_PER_CPU	= 4,	  /* 4 * #cpus for unbound wq */
+	WQ_UNBOUND_MAX_ACTIVE	= WQ_MAX_ACTIVE,
 	WQ_DFL_ACTIVE		= WQ_MAX_ACTIVE / 2,
 };
 
-/* unbound wq's aren't per-cpu, scale max_active according to #cpus */
-#define WQ_UNBOUND_MAX_ACTIVE	\
-	max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU)
-
 /*
  * System-wide workqueues which are always present.
  *
@@ -391,7 +439,7 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq;
  * alloc_workqueue - allocate a workqueue
  * @fmt: printf format for the name of the workqueue
  * @flags: WQ_* flags
- * @max_active: max in-flight work items, 0 for default
+ * @max_active: max in-flight work items per CPU, 0 for default
  * remaining args: args for @fmt
  *
  * Allocate a workqueue with the specified parameters.  For detailed
@@ -569,6 +617,7 @@ static inline bool schedule_work(struct work_struct *work)
 
 /*
  * Detect attempt to flush system-wide workqueues at compile time when possible.
+ * Warn attempt to flush system-wide workqueues at runtime.
  *
  * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp
  * for reasons and steps for converting system-wide workqueues into local workqueues.
@@ -576,52 +625,13 @@ static inline bool schedule_work(struct work_struct *work)
 extern void __warn_flushing_systemwide_wq(void)
 	__compiletime_warning("Please avoid flushing system-wide workqueues.");
 
-/**
- * flush_scheduled_work - ensure that any scheduled work has run to completion.
- *
- * Forces execution of the kernel-global workqueue and blocks until its
- * completion.
- *
- * It's very easy to get into trouble if you don't take great care.
- * Either of the following situations will lead to deadlock:
- *
- *	One of the work items currently on the workqueue needs to acquire
- *	a lock held by your code or its caller.
- *
- *	Your code is running in the context of a work routine.
- *
- * They will be detected by lockdep when they occur, but the first might not
- * occur very often.  It depends on what work items are on the workqueue and
- * what locks they need, which you have no control over.
- *
- * In most situations flushing the entire workqueue is overkill; you merely
- * need to know that a particular work item isn't queued and isn't running.
- * In such cases you should use cancel_delayed_work_sync() or
- * cancel_work_sync() instead.
- *
- * Please stop calling this function! A conversion to stop flushing system-wide
- * workqueues is in progress. This function will be removed after all in-tree
- * users stopped calling this function.
- */
-/*
- * The background of commit 771c035372a036f8 ("deprecate the
- * '__deprecated' attribute warnings entirely and for good") is that,
- * since Linus builds all modules between every single pull he does,
- * the standard kernel build needs to be _clean_ in order to be able to
- * notice when new problems happen. Therefore, don't emit warning while
- * there are in-tree users.
- */
+/* Please stop using this function, for this function will be removed in near future. */
 #define flush_scheduled_work()						\
 ({									\
-	if (0)								\
-		__warn_flushing_systemwide_wq();			\
+	__warn_flushing_systemwide_wq();				\
 	__flush_workqueue(system_wq);					\
 })
 
-/*
- * Although there is no longer in-tree caller, for now just emit warning
- * in order to give out-of-tree callers time to update.
- */
 #define flush_workqueue(wq)						\
 ({									\
 	struct workqueue_struct *_wq = (wq);				\
@@ -714,5 +724,6 @@ int workqueue_offline_cpu(unsigned int cpu);
 
 void __init workqueue_init_early(void);
 void __init workqueue_init(void);
+void __init workqueue_init_topology(void);
 
 #endif