1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * kernel/workqueue.c - generic async execution with shared worker pool
4   *
5   * Copyright (C) 2002		Ingo Molnar
6   *
7   *   Derived from the taskqueue/keventd code by:
8   *     David Woodhouse <dwmw2@infradead.org>
9   *     Andrew Morton
10   *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
11   *     Theodore Ts'o <tytso@mit.edu>
12   *
13   * Made to use alloc_percpu by Christoph Lameter.
14   *
15   * Copyright (C) 2010		SUSE Linux Products GmbH
16   * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
17   *
18   * This is the generic async execution mechanism.  Work items are
19   * executed in process context.  The worker pool is shared and
20   * automatically managed.  There are two worker pools for each CPU (one for
21   * normal work items and the other for high priority ones) and some extra
22   * pools for workqueues which are not bound to any specific CPU - the
23   * number of these backing pools is dynamic.
24   *
25   * Please read Documentation/core-api/workqueue.rst for details.
26   */
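/*
 * Illustrative sketch only (not part of the implementation): a minimal user
 * of this API.  "foo_workfn" and "foo_work" are made-up names.
 *
 *	static void foo_workfn(struct work_struct *work)
 *	{
 *		// runs in process context on a shared kworker
 *	}
 *	static DECLARE_WORK(foo_work, foo_workfn);
 *
 *	// queueing is cheap and can be done from most contexts:
 *	schedule_work(&foo_work);			// per-cpu system_wq
 *	queue_work(system_unbound_wq, &foo_work);	// unbound pools
 */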
27  
28  #include <linux/export.h>
29  #include <linux/kernel.h>
30  #include <linux/sched.h>
31  #include <linux/init.h>
32  #include <linux/interrupt.h>
33  #include <linux/signal.h>
34  #include <linux/completion.h>
35  #include <linux/workqueue.h>
36  #include <linux/slab.h>
37  #include <linux/cpu.h>
38  #include <linux/notifier.h>
39  #include <linux/kthread.h>
40  #include <linux/hardirq.h>
41  #include <linux/mempolicy.h>
42  #include <linux/freezer.h>
43  #include <linux/debug_locks.h>
44  #include <linux/lockdep.h>
45  #include <linux/idr.h>
46  #include <linux/jhash.h>
47  #include <linux/hashtable.h>
48  #include <linux/rculist.h>
49  #include <linux/nodemask.h>
50  #include <linux/moduleparam.h>
51  #include <linux/uaccess.h>
52  #include <linux/sched/isolation.h>
53  #include <linux/sched/debug.h>
54  #include <linux/nmi.h>
55  #include <linux/kvm_para.h>
56  #include <linux/delay.h>
57  #include <linux/irq_work.h>
58  
59  #include "workqueue_internal.h"
60  
61  enum worker_pool_flags {
62  	/*
63  	 * worker_pool flags
64  	 *
65  	 * A bound pool is either associated or disassociated with its CPU.
66  	 * While associated (!DISASSOCIATED), all workers are bound to the
67  	 * CPU and none has %WORKER_UNBOUND set and concurrency management
68  	 * is in effect.
69  	 *
70  	 * While DISASSOCIATED, the cpu may be offline and all workers have
71  	 * %WORKER_UNBOUND set and concurrency management disabled, and may
72  	 * be executing on any CPU.  The pool behaves as an unbound one.
73  	 *
74  	 * Note that DISASSOCIATED should be flipped only while holding
75  	 * wq_pool_attach_mutex to avoid changing binding state while
76  	 * worker_attach_to_pool() is in progress.
77  	 *
78  	 * As there can only be one concurrent BH execution context per CPU, a
79  	 * BH pool is per-CPU and always DISASSOCIATED.
80  	 */
81  	POOL_BH			= 1 << 0,	/* is a BH pool */
82  	POOL_MANAGER_ACTIVE	= 1 << 1,	/* being managed */
83  	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
84  	POOL_BH_DRAINING	= 1 << 3,	/* draining after CPU offline */
85  };
86  
87  enum worker_flags {
88  	/* worker flags */
89  	WORKER_DIE		= 1 << 1,	/* die die die */
90  	WORKER_IDLE		= 1 << 2,	/* is idle */
91  	WORKER_PREP		= 1 << 3,	/* preparing to run works */
92  	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
93  	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
94  	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */
95  
96  	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
97  				  WORKER_UNBOUND | WORKER_REBOUND,
98  };
99  
100  enum work_cancel_flags {
101  	WORK_CANCEL_DELAYED	= 1 << 0,	/* canceling a delayed_work */
102  	WORK_CANCEL_DISABLE	= 1 << 1,	/* canceling to disable */
103  };
104  
105  enum wq_internal_consts {
106  	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */
107  
108  	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
109  	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
110  
111  	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
112  	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */
113  
114  	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
115  						/* call for help after 10ms
116  						   (min two ticks) */
117  	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
118  	CREATE_COOLDOWN		= HZ,		/* time to breathe after a failure */
119  
120  	/*
121  	 * Rescue workers are used only in emergencies and are shared by
122  	 * all CPUs.  Give them MIN_NICE.
123  	 */
124  	RESCUER_NICE_LEVEL	= MIN_NICE,
125  	HIGHPRI_NICE_LEVEL	= MIN_NICE,
126  
127  	WQ_NAME_LEN		= 32,
128  	WORKER_ID_LEN		= 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */
129  };
130  
131  /*
132   * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
133   * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
134   * msecs_to_jiffies() can't be an initializer.
135   */
136  #define BH_WORKER_JIFFIES	msecs_to_jiffies(2)
137  #define BH_WORKER_RESTARTS	10
138  
139  /*
140   * Structure fields follow one of the following exclusion rules.
141   *
142   * I: Modifiable by initialization/destruction paths and read-only for
143   *    everyone else.
144   *
145   * P: Preemption protected.  Disabling preemption is enough and should
146   *    only be modified and accessed from the local cpu.
147   *
148   * L: pool->lock protected.  Access with pool->lock held.
149   *
150   * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
151   *     reads.
152   *
153   * K: Only modified by worker while holding pool->lock. Can be safely read by
154   *    self, while holding pool->lock or from IRQ context if %current is the
155   *    kworker.
156   *
157   * S: Only modified by worker self.
158   *
159   * A: wq_pool_attach_mutex protected.
160   *
161   * PL: wq_pool_mutex protected.
162   *
163   * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
164   *
165   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
166   *
167   * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
168   *      RCU for reads.
169   *
170   * WQ: wq->mutex protected.
171   *
172   * WR: wq->mutex protected for writes.  RCU protected for reads.
173   *
174   * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
175   *     with READ_ONCE() without locking.
176   *
177   * MD: wq_mayday_lock protected.
178   *
179   * WD: Used internally by the watchdog.
180   */
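/*
 * For example, a field annotated "L:" below is only touched with the pool
 * lock held (illustrative sketch, not a specific code path in this file):
 *
 *	raw_spin_lock_irq(&pool->lock);
 *	pool->nr_workers++;		// "L:" protected field
 *	raw_spin_unlock_irq(&pool->lock);
 */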
181  
182  /* struct worker is defined in workqueue_internal.h */
183  
184  struct worker_pool {
185  	raw_spinlock_t		lock;		/* the pool lock */
186  	int			cpu;		/* I: the associated cpu */
187  	int			node;		/* I: the associated node ID */
188  	int			id;		/* I: pool ID */
189  	unsigned int		flags;		/* L: flags */
190  
191  	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
192  	bool			cpu_stall;	/* WD: stalled cpu bound pool */
193  
194  	/*
195  	 * The counter is incremented in a process context on the associated CPU
196  	 * w/ preemption disabled, and decremented or reset in the same context
197  	 * but w/ pool->lock held. Readers grab pool->lock and are thus
198  	 * guaranteed to see whether the counter has reached zero.
199  	 */
200  	int			nr_running;
201  
202  	struct list_head	worklist;	/* L: list of pending works */
203  
204  	int			nr_workers;	/* L: total number of workers */
205  	int			nr_idle;	/* L: currently idle workers */
206  
207  	struct list_head	idle_list;	/* L: list of idle workers */
208  	struct timer_list	idle_timer;	/* L: worker idle timeout */
209  	struct work_struct      idle_cull_work; /* L: worker idle cleanup */
210  
211  	struct timer_list	mayday_timer;	  /* L: SOS timer for workers */
212  
213  	/* a worker is either on busy_hash or idle_list, or is the manager */
214  	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
215  						/* L: hash of busy workers */
216  
217  	struct worker		*manager;	/* L: purely informational */
218  	struct list_head	workers;	/* A: attached workers */
219  
220  	struct ida		worker_ida;	/* worker IDs for task name */
221  
222  	struct workqueue_attrs	*attrs;		/* I: worker attributes */
223  	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
224  	int			refcnt;		/* PL: refcnt for unbound pools */
225  
226  	/*
227  	 * Destruction of pool is RCU protected to allow dereferences
228  	 * from get_work_pool().
229  	 */
230  	struct rcu_head		rcu;
231  };
232  
233  /*
234   * Per-pool_workqueue statistics. These can be monitored using
235   * tools/workqueue/wq_monitor.py.
236   */
237  enum pool_workqueue_stats {
238  	PWQ_STAT_STARTED,	/* work items started execution */
239  	PWQ_STAT_COMPLETED,	/* work items completed execution */
240  	PWQ_STAT_CPU_TIME,	/* total CPU time consumed */
241  	PWQ_STAT_CPU_INTENSIVE,	/* wq_cpu_intensive_thresh_us violations */
242  	PWQ_STAT_CM_WAKEUP,	/* concurrency-management worker wakeups */
243  	PWQ_STAT_REPATRIATED,	/* unbound workers brought back into scope */
244  	PWQ_STAT_MAYDAY,	/* maydays to rescuer */
245  	PWQ_STAT_RESCUED,	/* linked work items executed by rescuer */
246  
247  	PWQ_NR_STATS,
248  };
249  
250  /*
251   * The per-pool workqueue.  While queued, bits below WORK_STRUCT_PWQ_SHIFT
252   * of work_struct->data are used for flags and the remaining high bits
253   * point to the pwq; thus, pwqs need to be aligned to a power of two large
254   * enough to cover the flag bits, i.e. 1 << WORK_STRUCT_PWQ_SHIFT.
255   */
256  struct pool_workqueue {
257  	struct worker_pool	*pool;		/* I: the associated pool */
258  	struct workqueue_struct *wq;		/* I: the owning workqueue */
259  	int			work_color;	/* L: current color */
260  	int			flush_color;	/* L: flushing color */
261  	int			refcnt;		/* L: reference count */
262  	int			nr_in_flight[WORK_NR_COLORS];
263  						/* L: nr of in_flight works */
264  	bool			plugged;	/* L: execution suspended */
265  
266  	/*
267  	 * nr_active management and WORK_STRUCT_INACTIVE:
268  	 *
269  	 * When pwq->nr_active >= max_active, new work items are queued to
270  	 * pwq->inactive_works instead of pool->worklist and marked with
271  	 * WORK_STRUCT_INACTIVE.
272  	 *
273  	 * All work items marked with WORK_STRUCT_INACTIVE do not participate in
274  	 * nr_active and all work items in pwq->inactive_works are marked with
275  	 * WORK_STRUCT_INACTIVE. However, not all WORK_STRUCT_INACTIVE work items
276  	 * are in pwq->inactive_works; some are ready to run in pool->worklist or
277  	 * worker->scheduled. Those work items are struct wq_barriers, which are
278  	 * used for flush_work() and should not participate in nr_active. A
279  	 * non-barrier work item is marked WORK_STRUCT_INACTIVE iff it is in
280  	 * pwq->inactive_works. (See the illustrative sketch after this struct.)
281  	 */
282  	int			nr_active;	/* L: nr of active works */
283  	struct list_head	inactive_works;	/* L: inactive works */
284  	struct list_head	pending_node;	/* LN: node on wq_node_nr_active->pending_pwqs */
285  	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
286  	struct list_head	mayday_node;	/* MD: node on wq->maydays */
287  
288  	u64			stats[PWQ_NR_STATS];
289  
290  	/*
291  	 * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
292  	 * and pwq_release_workfn() for details. pool_workqueue itself is also
293  	 * RCU protected so that the first pwq can be determined without
294  	 * grabbing wq->mutex.
295  	 */
296  	struct kthread_work	release_work;
297  	struct rcu_head		rcu;
298  } __aligned(1 << WORK_STRUCT_PWQ_SHIFT);
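/*
 * A minimal sketch of the max_active behavior described above.  "frob_wq" and
 * "frob_work" are made-up names; max_active is 1 here for illustration:
 *
 *	struct workqueue_struct *frob_wq = alloc_workqueue("frob", 0, 1);
 *
 *	queue_work(frob_wq, &frob_work[0]);	// nr_active 0 -> 1, goes
 *						// straight to pool->worklist
 *	queue_work(frob_wq, &frob_work[1]);	// nr_active == max_active, so
 *						// it is marked
 *						// WORK_STRUCT_INACTIVE and
 *						// parked on inactive_works
 *
 * When frob_work[0] finishes, the pwq activates frob_work[1] from
 * inactive_works and it becomes eligible for execution.
 */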
299  
300  /*
301   * Structure used to wait for workqueue flush.
302   */
303  struct wq_flusher {
304  	struct list_head	list;		/* WQ: list of flushers */
305  	int			flush_color;	/* WQ: flush color waiting for */
306  	struct completion	done;		/* flush completion */
307  };
308  
309  struct wq_device;
310  
311  /*
312   * Unlike in a per-cpu workqueue where max_active limits its concurrency level
313   * on each CPU, in an unbound workqueue, max_active applies to the whole system.
314   * As sharing a single nr_active across multiple sockets can be very expensive,
315   * the counting and enforcement is per NUMA node.
316   *
317   * The following struct is used to enforce per-node max_active. When a pwq wants
318   * to start executing a work item, it should increment ->nr using
319   * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
320   * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
321   * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
322   * round-robin order.
323   */
324  struct wq_node_nr_active {
325  	int			max;		/* per-node max_active */
326  	atomic_t		nr;		/* per-node nr_active */
327  	raw_spinlock_t		lock;		/* nests inside pool locks */
328  	struct list_head	pending_pwqs;	/* LN: pwqs with inactive works */
329  };
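/*
 * Rough flow of the per-node enforcement described above (sketch only; the
 * helpers are defined further down in this file):
 *
 *	if (tryinc_node_nr_active(nna))		// nna->nr < nna->max
 *		// the work item goes to pool->worklist and runs
 *	else
 *		// the pwq is parked on nna->pending_pwqs; once in-flight
 *		// items finish and drop nna->nr, pending pwqs are activated
 *		// in round-robin order
 */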
330  
331  /*
332   * The externally visible workqueue.  It relays the issued work items to
333   * the appropriate worker_pool through its pool_workqueues.
334   */
335  struct workqueue_struct {
336  	struct list_head	pwqs;		/* WR: all pwqs of this wq */
337  	struct list_head	list;		/* PR: list of all workqueues */
338  
339  	struct mutex		mutex;		/* protects this wq */
340  	int			work_color;	/* WQ: current work color */
341  	int			flush_color;	/* WQ: current flush color */
342  	atomic_t		nr_pwqs_to_flush; /* flush in progress */
343  	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
344  	struct list_head	flusher_queue;	/* WQ: flush waiters */
345  	struct list_head	flusher_overflow; /* WQ: flush overflow list */
346  
347  	struct list_head	maydays;	/* MD: pwqs requesting rescue */
348  	struct worker		*rescuer;	/* MD: rescue worker */
349  
350  	int			nr_drainers;	/* WQ: drain in progress */
351  
352  	/* See alloc_workqueue() function comment for info on min/max_active */
353  	int			max_active;	/* WO: max active works */
354  	int			min_active;	/* WO: min active works */
355  	int			saved_max_active; /* WQ: saved max_active */
356  	int			saved_min_active; /* WQ: saved min_active */
357  
358  	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
359  	struct pool_workqueue __rcu *dfl_pwq;   /* PW: only for unbound wqs */
360  
361  #ifdef CONFIG_SYSFS
362  	struct wq_device	*wq_dev;	/* I: for sysfs interface */
363  #endif
364  #ifdef CONFIG_LOCKDEP
365  	char			*lock_name;
366  	struct lock_class_key	key;
367  	struct lockdep_map	__lockdep_map;
368  	struct lockdep_map	*lockdep_map;
369  #endif
370  	char			name[WQ_NAME_LEN]; /* I: workqueue name */
371  
372  	/*
373  	 * Destruction of workqueue_struct is RCU protected to allow walking
374  	 * the workqueues list without grabbing wq_pool_mutex.
375  	 * This is used to dump all workqueues from sysrq.
376  	 */
377  	struct rcu_head		rcu;
378  
379  	/* hot fields used during command issue, aligned to cacheline */
380  	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
381  	struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */
382  	struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
383  };
384  
385  /*
386   * Each pod type describes how CPUs should be grouped for unbound workqueues.
387   * See the comment above workqueue_attrs->affn_scope.
388   */
389  struct wq_pod_type {
390  	int			nr_pods;	/* number of pods */
391  	cpumask_var_t		*pod_cpus;	/* pod -> cpus */
392  	int			*pod_node;	/* pod -> node */
393  	int			*cpu_pod;	/* cpu -> pod */
394  };
395  
396  struct work_offq_data {
397  	u32			pool_id;
398  	u32			disable;
399  	u32			flags;
400  };
401  
402  static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
403  	[WQ_AFFN_DFL]		= "default",
404  	[WQ_AFFN_CPU]		= "cpu",
405  	[WQ_AFFN_SMT]		= "smt",
406  	[WQ_AFFN_CACHE]		= "cache",
407  	[WQ_AFFN_NUMA]		= "numa",
408  	[WQ_AFFN_SYSTEM]	= "system",
409  };
410  
411  /*
412   * Per-cpu work items which run for longer than the following threshold are
413   * automatically considered CPU intensive and excluded from concurrency
414   * management to prevent them from noticeably delaying other per-cpu work items.
415   * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
416   * The actual value is initialized in wq_cpu_intensive_thresh_init().
417   */
418  static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
419  module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
420  #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
421  static unsigned int wq_cpu_intensive_warning_thresh = 4;
422  module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644);
423  #endif
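/*
 * Both knobs above are regular module parameters of the built-in workqueue
 * code, so they can be set on the kernel command line, e.g. (values are
 * illustrative only):
 *
 *	workqueue.cpu_intensive_thresh_us=10000
 *	workqueue.cpu_intensive_warning_thresh=4
 *
 * or changed at runtime through /sys/module/workqueue/parameters/.
 */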
424  
425  /* see the comment above the definition of WQ_POWER_EFFICIENT */
426  static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
427  module_param_named(power_efficient, wq_power_efficient, bool, 0444);
428  
429  static bool wq_online;			/* can kworkers be created yet? */
430  static bool wq_topo_initialized __read_mostly = false;
431  
432  static struct kmem_cache *pwq_cache;
433  
434  static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
435  static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
436  
437  /* buf used when updating pwqs of unbound workqueues, protected by CPU hotplug exclusion */
438  static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;
439  
440  static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
441  static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
442  static DEFINE_RAW_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
443  /* wait for manager to go away */
444  static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
445  
446  static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
447  static bool workqueue_freezing;		/* PL: have wqs started freezing? */
448  
449  /* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */
450  static cpumask_var_t wq_online_cpumask;
451  
452  /* PL&A: allowable cpus for unbound wqs and work items */
453  static cpumask_var_t wq_unbound_cpumask;
454  
455  /* PL: user requested unbound cpumask via sysfs */
456  static cpumask_var_t wq_requested_unbound_cpumask;
457  
458  /* PL: isolated cpumask to be excluded from unbound cpumask */
459  static cpumask_var_t wq_isolated_cpumask;
460  
461  /* to further constrain wq_unbound_cpumask by the cmdline parameter */
462  static struct cpumask wq_cmdline_cpumask __initdata;
463  
464  /* CPU where unbound work was last round robin scheduled from this CPU */
465  static DEFINE_PER_CPU(int, wq_rr_cpu_last);
466  
467  /*
468   * Local execution of unbound work items is no longer guaranteed.  The
469   * following always forces round-robin CPU selection on unbound work items
470   * to uncover usages which depend on it.
471   */
472  #ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
473  static bool wq_debug_force_rr_cpu = true;
474  #else
475  static bool wq_debug_force_rr_cpu = false;
476  #endif
477  module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
478  
479  /* to raise softirq for the BH worker pools on other CPUs */
480  static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works);
481  
482  /* the BH worker pools */
483  static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools);
484  
485  /* the per-cpu worker pools */
486  static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
487  
488  static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */
489  
490  /* PL: hash of all unbound pools keyed by pool->attrs */
491  static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
492  
493  /* I: attributes used when instantiating standard unbound pools on demand */
494  static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
495  
496  /* I: attributes used when instantiating ordered pools on demand */
497  static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
498  
499  /*
500   * I: kthread_worker to release pwq's. pwq release needs process context but
501   * may be initiated while holding a pool lock, so it is bounced to a dedicated
502   * kthread worker to avoid A-A deadlocks.
503   */
504  static struct kthread_worker *pwq_release_worker __ro_after_init;
505  
506  struct workqueue_struct *system_wq __ro_after_init;
507  EXPORT_SYMBOL(system_wq);
508  struct workqueue_struct *system_highpri_wq __ro_after_init;
509  EXPORT_SYMBOL_GPL(system_highpri_wq);
510  struct workqueue_struct *system_long_wq __ro_after_init;
511  EXPORT_SYMBOL_GPL(system_long_wq);
512  struct workqueue_struct *system_unbound_wq __ro_after_init;
513  EXPORT_SYMBOL_GPL(system_unbound_wq);
514  struct workqueue_struct *system_freezable_wq __ro_after_init;
515  EXPORT_SYMBOL_GPL(system_freezable_wq);
516  struct workqueue_struct *system_power_efficient_wq __ro_after_init;
517  EXPORT_SYMBOL_GPL(system_power_efficient_wq);
518  struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
519  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
520  struct workqueue_struct *system_bh_wq;
521  EXPORT_SYMBOL_GPL(system_bh_wq);
522  struct workqueue_struct *system_bh_highpri_wq;
523  EXPORT_SYMBOL_GPL(system_bh_highpri_wq);
524  
525  static int worker_thread(void *__worker);
526  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
527  static void show_pwq(struct pool_workqueue *pwq);
528  static void show_one_worker_pool(struct worker_pool *pool);
529  
530  #define CREATE_TRACE_POINTS
531  #include <trace/events/workqueue.h>
532  
533  #define assert_rcu_or_pool_mutex()					\
534  	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&			\
535  			 !lockdep_is_held(&wq_pool_mutex),		\
536  			 "RCU or wq_pool_mutex should be held")
537  
538  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
539  	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&			\
540  			 !lockdep_is_held(&wq->mutex) &&		\
541  			 !lockdep_is_held(&wq_pool_mutex),		\
542  			 "RCU, wq->mutex or wq_pool_mutex should be held")
543  
544  #define for_each_bh_worker_pool(pool, cpu)				\
545  	for ((pool) = &per_cpu(bh_worker_pools, cpu)[0];		\
546  	     (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
547  	     (pool)++)
548  
549  #define for_each_cpu_worker_pool(pool, cpu)				\
550  	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
551  	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
552  	     (pool)++)
553  
554  /**
555   * for_each_pool - iterate through all worker_pools in the system
556   * @pool: iteration cursor
557   * @pi: integer used for iteration
558   *
559   * This must be called either with wq_pool_mutex held or RCU read
560   * locked.  If the pool needs to be used beyond the locking in effect, the
561   * caller is responsible for guaranteeing that the pool stays online.
562   *
563   * The if/else clause exists only for the lockdep assertion and can be
564   * ignored.
565   */
566  #define for_each_pool(pool, pi)						\
567  	idr_for_each_entry(&worker_pool_idr, pool, pi)			\
568  		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\
569  		else
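/*
 * Usage sketch (illustrative only): walking all pools under the RCU read
 * lock, as required by the comment above.
 *
 *	struct worker_pool *pool;
 *	int pi;
 *
 *	rcu_read_lock();
 *	for_each_pool(pool, pi)
 *		pr_info("pool %d for cpu %d\n", pool->id, pool->cpu);
 *	rcu_read_unlock();
 */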
570  
571  /**
572   * for_each_pool_worker - iterate through all workers of a worker_pool
573   * @worker: iteration cursor
574   * @pool: worker_pool to iterate workers of
575   *
576   * This must be called with wq_pool_attach_mutex held.
577   *
578   * The if/else clause exists only for the lockdep assertion and can be
579   * ignored.
580   */
581  #define for_each_pool_worker(worker, pool)				\
582  	list_for_each_entry((worker), &(pool)->workers, node)		\
583  		if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
584  		else
585  
586  /**
587   * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
588   * @pwq: iteration cursor
589   * @wq: the target workqueue
590   *
591   * This must be called either with wq->mutex held or RCU read locked.
592   * If the pwq needs to be used beyond the locking in effect, the caller is
593   * responsible for guaranteeing that the pwq stays online.
594   *
595   * The if/else clause exists only for the lockdep assertion and can be
596   * ignored.
597   */
598  #define for_each_pwq(pwq, wq)						\
599  	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,		\
600  				 lockdep_is_held(&(wq->mutex)))
601  
602  #ifdef CONFIG_DEBUG_OBJECTS_WORK
603  
604  static const struct debug_obj_descr work_debug_descr;
605  
606  static void *work_debug_hint(void *addr)
607  {
608  	return ((struct work_struct *) addr)->func;
609  }
610  
611  static bool work_is_static_object(void *addr)
612  {
613  	struct work_struct *work = addr;
614  
615  	return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
616  }
617  
618  /*
619   * fixup_init is called when:
620   * - an active object is initialized
621   */
622  static bool work_fixup_init(void *addr, enum debug_obj_state state)
623  {
624  	struct work_struct *work = addr;
625  
626  	switch (state) {
627  	case ODEBUG_STATE_ACTIVE:
628  		cancel_work_sync(work);
629  		debug_object_init(work, &work_debug_descr);
630  		return true;
631  	default:
632  		return false;
633  	}
634  }
635  
636  /*
637   * fixup_free is called when:
638   * - an active object is freed
639   */
640  static bool work_fixup_free(void *addr, enum debug_obj_state state)
641  {
642  	struct work_struct *work = addr;
643  
644  	switch (state) {
645  	case ODEBUG_STATE_ACTIVE:
646  		cancel_work_sync(work);
647  		debug_object_free(work, &work_debug_descr);
648  		return true;
649  	default:
650  		return false;
651  	}
652  }
653  
654  static const struct debug_obj_descr work_debug_descr = {
655  	.name		= "work_struct",
656  	.debug_hint	= work_debug_hint,
657  	.is_static_object = work_is_static_object,
658  	.fixup_init	= work_fixup_init,
659  	.fixup_free	= work_fixup_free,
660  };
661  
662  static inline void debug_work_activate(struct work_struct *work)
663  {
664  	debug_object_activate(work, &work_debug_descr);
665  }
666  
667  static inline void debug_work_deactivate(struct work_struct *work)
668  {
669  	debug_object_deactivate(work, &work_debug_descr);
670  }
671  
672  void __init_work(struct work_struct *work, int onstack)
673  {
674  	if (onstack)
675  		debug_object_init_on_stack(work, &work_debug_descr);
676  	else
677  		debug_object_init(work, &work_debug_descr);
678  }
679  EXPORT_SYMBOL_GPL(__init_work);
680  
681  void destroy_work_on_stack(struct work_struct *work)
682  {
683  	debug_object_free(work, &work_debug_descr);
684  }
685  EXPORT_SYMBOL_GPL(destroy_work_on_stack);
686  
687  void destroy_delayed_work_on_stack(struct delayed_work *work)
688  {
689  	destroy_timer_on_stack(&work->timer);
690  	debug_object_free(&work->work, &work_debug_descr);
691  }
692  EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
693  
694  #else
695  static inline void debug_work_activate(struct work_struct *work) { }
696  static inline void debug_work_deactivate(struct work_struct *work) { }
697  #endif
698  
699  /**
700   * worker_pool_assign_id - allocate ID and assign it to @pool
701   * @pool: the pool pointer of interest
702   *
703   * Returns 0 if an ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
704   * successfully, -errno on failure.
705   */
706  static int worker_pool_assign_id(struct worker_pool *pool)
707  {
708  	int ret;
709  
710  	lockdep_assert_held(&wq_pool_mutex);
711  
712  	ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
713  			GFP_KERNEL);
714  	if (ret >= 0) {
715  		pool->id = ret;
716  		return 0;
717  	}
718  	return ret;
719  }
720  
721  static struct pool_workqueue __rcu **
722  unbound_pwq_slot(struct workqueue_struct *wq, int cpu)
723  {
724         if (cpu >= 0)
725                 return per_cpu_ptr(wq->cpu_pwq, cpu);
726         else
727                 return &wq->dfl_pwq;
728  }
729  
730  /* @cpu < 0 for dfl_pwq */
731  static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
732  {
733  	return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
734  				     lockdep_is_held(&wq_pool_mutex) ||
735  				     lockdep_is_held(&wq->mutex));
736  }
737  
738  /**
739   * unbound_effective_cpumask - effective cpumask of an unbound workqueue
740   * @wq: workqueue of interest
741   *
742   * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
743   * is masked with wq_unbound_cpumask to determine the effective cpumask. The
744   * default pwq is always mapped to the pool with the current effective cpumask.
745   */
746  static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
747  {
748  	return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
749  }
750  
751  static unsigned int work_color_to_flags(int color)
752  {
753  	return color << WORK_STRUCT_COLOR_SHIFT;
754  }
755  
756  static int get_work_color(unsigned long work_data)
757  {
758  	return (work_data >> WORK_STRUCT_COLOR_SHIFT) &
759  		((1 << WORK_STRUCT_COLOR_BITS) - 1);
760  }
761  
762  static int work_next_color(int color)
763  {
764  	return (color + 1) % WORK_NR_COLORS;
765  }
766  
767  static unsigned long pool_offq_flags(struct worker_pool *pool)
768  {
769  	return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0;
770  }
771  
772  /*
773   * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
774   * contain the pointer to the queued pwq.  Once execution starts, the flag
775   * is cleared and the high bits contain OFFQ flags and pool ID.
776   *
777   * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling()
778   * can be used to set the pwq, pool or clear work->data. These functions should
779   * only be called while the work is owned - ie. while the PENDING bit is set.
780   *
781   * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
782   * corresponding to a work.  Pool is available once the work has been
783   * queued anywhere after initialization until it is sync canceled.  pwq is
784   * available only while the work item is queued.
785   */
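/*
 * Rough picture of the two encodings described above (bit positions are
 * defined by the WORK_STRUCT_* and WORK_OFFQ_* constants in
 * include/linux/workqueue.h; this is a sketch, not the exact layout):
 *
 *	queued:     high bits = pwq pointer, low bits = WORK_STRUCT_* flags
 *		    (PENDING, PWQ, color, ...)
 *	off queue:  high bits = pool ID, middle bits = disable count and OFFQ
 *		    flags, low bits = WORK_STRUCT_* flags
 *
 * get_work_pwq() is only meaningful while WORK_STRUCT_PWQ is set;
 * get_work_pool() falls back to the pool ID lookup otherwise.
 */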
786  static inline void set_work_data(struct work_struct *work, unsigned long data)
787  {
788  	WARN_ON_ONCE(!work_pending(work));
789  	atomic_long_set(&work->data, data | work_static(work));
790  }
791  
792  static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
793  			 unsigned long flags)
794  {
795  	set_work_data(work, (unsigned long)pwq | WORK_STRUCT_PENDING |
796  		      WORK_STRUCT_PWQ | flags);
797  }
798  
799  static void set_work_pool_and_keep_pending(struct work_struct *work,
800  					   int pool_id, unsigned long flags)
801  {
802  	set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) |
803  		      WORK_STRUCT_PENDING | flags);
804  }
805  
806  static void set_work_pool_and_clear_pending(struct work_struct *work,
807  					    int pool_id, unsigned long flags)
808  {
809  	/*
810  	 * The following wmb is paired with the implied mb in
811  	 * test_and_set_bit(PENDING) and ensures all updates to @work made
812  	 * here are visible to and precede any updates by the next PENDING
813  	 * owner.
814  	 */
815  	smp_wmb();
816  	set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) |
817  		      flags);
818  	/*
819  	 * The following mb guarantees that previous clear of a PENDING bit
820  	 * will not be reordered with any speculative LOADS or STORES from
821  	 * work->current_func, which is executed afterwards.  This possible
822  	 * reordering can lead to a missed execution on attempt to queue
823  	 * the same @work.  E.g. consider this case:
824  	 *
825  	 *   CPU#0                         CPU#1
826  	 *   ----------------------------  --------------------------------
827  	 *
828  	 * 1  STORE event_indicated
829  	 * 2  queue_work_on() {
830  	 * 3    test_and_set_bit(PENDING)
831  	 * 4 }                             set_..._and_clear_pending() {
832  	 * 5                                 set_work_data() # clear bit
833  	 * 6                                 smp_mb()
834  	 * 7                               work->current_func() {
835  	 * 8				      LOAD event_indicated
836  	 *				   }
837  	 *
838  	 * Without an explicit full barrier speculative LOAD on line 8 can
839  	 * be executed before CPU#0 does STORE on line 1.  If that happens,
840  	 * CPU#0 observes the PENDING bit is still set and new execution of
841  	 * a @work is not queued in a hope, that CPU#1 will eventually
842  	 * finish the queued @work.  Meanwhile CPU#1 does not see
843  	 * event_indicated is set, because speculative LOAD was executed
844  	 * before actual STORE.
845  	 */
846  	smp_mb();
847  }
848  
849  static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
850  {
851  	return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK);
852  }
853  
854  static struct pool_workqueue *get_work_pwq(struct work_struct *work)
855  {
856  	unsigned long data = atomic_long_read(&work->data);
857  
858  	if (data & WORK_STRUCT_PWQ)
859  		return work_struct_pwq(data);
860  	else
861  		return NULL;
862  }
863  
864  /**
865   * get_work_pool - return the worker_pool a given work was associated with
866   * @work: the work item of interest
867   *
868   * Pools are created and destroyed under wq_pool_mutex, and read access is
869   * allowed under the RCU read lock.  As such, this function should be
870   * called under wq_pool_mutex or inside of a rcu_read_lock() region.
871   *
872   * All fields of the returned pool are accessible as long as the above
873   * mentioned locking is in effect.  If the returned pool needs to be used
874   * beyond the critical section, the caller is responsible for ensuring the
875   * returned pool is and stays online.
876   *
877   * Return: The worker_pool @work was last associated with.  %NULL if none.
878   */
879  static struct worker_pool *get_work_pool(struct work_struct *work)
880  {
881  	unsigned long data = atomic_long_read(&work->data);
882  	int pool_id;
883  
884  	assert_rcu_or_pool_mutex();
885  
886  	if (data & WORK_STRUCT_PWQ)
887  		return work_struct_pwq(data)->pool;
888  
889  	pool_id = data >> WORK_OFFQ_POOL_SHIFT;
890  	if (pool_id == WORK_OFFQ_POOL_NONE)
891  		return NULL;
892  
893  	return idr_find(&worker_pool_idr, pool_id);
894  }
895  
896  static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits)
897  {
898  	return (v >> shift) & ((1U << bits) - 1);
899  }
900  
901  static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data)
902  {
903  	WARN_ON_ONCE(data & WORK_STRUCT_PWQ);
904  
905  	offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT,
906  					WORK_OFFQ_POOL_BITS);
907  	offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT,
908  					WORK_OFFQ_DISABLE_BITS);
909  	offqd->flags = data & WORK_OFFQ_FLAG_MASK;
910  }
911  
912  static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd)
913  {
914  	return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) |
915  		((unsigned long)offqd->flags);
916  }
917  
918  /*
919   * Policy functions.  These define the policies on how the global worker
920   * pools are managed.  Unless noted otherwise, these functions assume that
921   * they're being called with pool->lock held.
922   */
923  
924  /*
925   * Need to wake up a worker?  Called from anything but currently
926   * running workers.
927   *
928   * Note that, because unbound workers never contribute to nr_running, this
929   * function will always return %true for unbound pools as long as the
930   * worklist isn't empty.
931   */
932  static bool need_more_worker(struct worker_pool *pool)
933  {
934  	return !list_empty(&pool->worklist) && !pool->nr_running;
935  }
936  
937  /* Can I start working?  Called from busy but !running workers. */
938  static bool may_start_working(struct worker_pool *pool)
939  {
940  	return pool->nr_idle;
941  }
942  
943  /* Do I need to keep working?  Called from currently running workers. */
944  static bool keep_working(struct worker_pool *pool)
945  {
946  	return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
947  }
948  
949  /* Do we need a new worker?  Called from manager. */
950  static bool need_to_create_worker(struct worker_pool *pool)
951  {
952  	return need_more_worker(pool) && !may_start_working(pool);
953  }
954  
955  /* Do we have too many workers and should some go away? */
956  static bool too_many_workers(struct worker_pool *pool)
957  {
958  	bool managing = pool->flags & POOL_MANAGER_ACTIVE;
959  	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
960  	int nr_busy = pool->nr_workers - nr_idle;
961  
962  	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
963  }
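/*
 * Worked example for the ratio above: with MAX_IDLE_WORKERS_RATIO == 4 and 16
 * busy workers, too_many_workers() stays false until nr_idle reaches 6
 * (2 + 16 / 4), at which point the idle timer starts culling excess workers
 * after IDLE_WORKER_TIMEOUT.
 */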
964  
965  /**
966   * worker_set_flags - set worker flags and adjust nr_running accordingly
967   * @worker: self
968   * @flags: flags to set
969   *
970   * Set @flags in @worker->flags and adjust nr_running accordingly.
971   */
972  static inline void worker_set_flags(struct worker *worker, unsigned int flags)
973  {
974  	struct worker_pool *pool = worker->pool;
975  
976  	lockdep_assert_held(&pool->lock);
977  
978  	/* If transitioning into NOT_RUNNING, adjust nr_running. */
979  	if ((flags & WORKER_NOT_RUNNING) &&
980  	    !(worker->flags & WORKER_NOT_RUNNING)) {
981  		pool->nr_running--;
982  	}
983  
984  	worker->flags |= flags;
985  }
986  
987  /**
988   * worker_clr_flags - clear worker flags and adjust nr_running accordingly
989   * @worker: self
990   * @flags: flags to clear
991   *
992   * Clear @flags in @worker->flags and adjust nr_running accordingly.
993   */
994  static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
995  {
996  	struct worker_pool *pool = worker->pool;
997  	unsigned int oflags = worker->flags;
998  
999  	lockdep_assert_held(&pool->lock);
1000  
1001  	worker->flags &= ~flags;
1002  
1003  	/*
1004  	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
1005  	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is a mask
1006  	 * of multiple flags, not a single flag.
1007  	 */
1008  	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
1009  		if (!(worker->flags & WORKER_NOT_RUNNING))
1010  			pool->nr_running++;
1011  }
1012  
1013  /* Return the first idle worker.  Called with pool->lock held. */
1014  static struct worker *first_idle_worker(struct worker_pool *pool)
1015  {
1016  	if (unlikely(list_empty(&pool->idle_list)))
1017  		return NULL;
1018  
1019  	return list_first_entry(&pool->idle_list, struct worker, entry);
1020  }
1021  
1022  /**
1023   * worker_enter_idle - enter idle state
1024   * @worker: worker which is entering idle state
1025   *
1026   * @worker is entering idle state.  Update stats and idle timer if
1027   * necessary.
1028   *
1029   * LOCKING:
1030   * raw_spin_lock_irq(pool->lock).
1031   */
1032  static void worker_enter_idle(struct worker *worker)
1033  {
1034  	struct worker_pool *pool = worker->pool;
1035  
1036  	if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
1037  	    WARN_ON_ONCE(!list_empty(&worker->entry) &&
1038  			 (worker->hentry.next || worker->hentry.pprev)))
1039  		return;
1040  
1041  	/* can't use worker_set_flags(), also called from create_worker() */
1042  	worker->flags |= WORKER_IDLE;
1043  	pool->nr_idle++;
1044  	worker->last_active = jiffies;
1045  
1046  	/* idle_list is LIFO */
1047  	list_add(&worker->entry, &pool->idle_list);
1048  
1049  	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1050  		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1051  
1052  	/* Sanity check nr_running. */
1053  	WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
1054  }
1055  
1056  /**
1057   * worker_leave_idle - leave idle state
1058   * @worker: worker which is leaving idle state
1059   *
1060   * @worker is leaving idle state.  Update stats.
1061   *
1062   * LOCKING:
1063   * raw_spin_lock_irq(pool->lock).
1064   */
1065  static void worker_leave_idle(struct worker *worker)
1066  {
1067  	struct worker_pool *pool = worker->pool;
1068  
1069  	if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
1070  		return;
1071  	worker_clr_flags(worker, WORKER_IDLE);
1072  	pool->nr_idle--;
1073  	list_del_init(&worker->entry);
1074  }
1075  
1076  /**
1077   * find_worker_executing_work - find worker which is executing a work
1078   * @pool: pool of interest
1079   * @work: work to find worker for
1080   *
1081   * Find a worker which is executing @work on @pool by searching
1082   * @pool->busy_hash which is keyed by the address of @work.  For a worker
1083   * to match, its current execution should match the address of @work and
1084   * its work function.  This is to avoid unwanted dependency between
1085   * unrelated work executions through a work item being recycled while still
1086   * being executed.
1087   *
1088   * This is a bit tricky.  A work item may be freed once its execution
1089   * starts and nothing prevents the freed area from being recycled for
1090   * another work item.  If the same work item address ends up being reused
1091   * before the original execution finishes, workqueue will identify the
1092   * recycled work item as currently executing and make it wait until the
1093   * current execution finishes, introducing an unwanted dependency.
1094   *
1095   * This function checks the work item address and work function to avoid
1096   * false positives.  Note that this isn't complete as one may construct a
1097   * work function which can introduce dependency onto itself through a
1098   * recycled work item.  Well, if somebody wants to shoot oneself in the
1099   * foot that badly, there's only so much we can do, and if such deadlock
1100   * actually occurs, it should be easy to locate the culprit work function.
1101   *
1102   * CONTEXT:
1103   * raw_spin_lock_irq(pool->lock).
1104   *
1105   * Return:
1106   * Pointer to worker which is executing @work if found, %NULL
1107   * otherwise.
1108   */
1109  static struct worker *find_worker_executing_work(struct worker_pool *pool,
1110  						 struct work_struct *work)
1111  {
1112  	struct worker *worker;
1113  
1114  	hash_for_each_possible(pool->busy_hash, worker, hentry,
1115  			       (unsigned long)work)
1116  		if (worker->current_work == work &&
1117  		    worker->current_func == work->func)
1118  			return worker;
1119  
1120  	return NULL;
1121  }
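/*
 * Example of the hazard described above: work item A starts executing and its
 * containing object is freed; the allocator hands the same address out for a
 * new work item B.  If B is queued while A is still running, comparing only
 * the address would wrongly treat B as "already executing".  Comparing the
 * work function as well avoids that unless B happens to reuse A's function.
 */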
1122  
1123  /**
1124   * move_linked_works - move linked works to a list
1125   * @work: start of series of works to be scheduled
1126   * @head: target list to append @work to
1127   * @nextp: out parameter for nested worklist walking
1128   *
1129   * Schedule linked works starting from @work to @head. Work series to be
1130   * scheduled starts at @work and includes any consecutive work with
1131   * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
1132   * @nextp.
1133   *
1134   * CONTEXT:
1135   * raw_spin_lock_irq(pool->lock).
1136   */
1137  static void move_linked_works(struct work_struct *work, struct list_head *head,
1138  			      struct work_struct **nextp)
1139  {
1140  	struct work_struct *n;
1141  
1142  	/*
1143  	 * A linked worklist always ends before the end of the list,
1144  	 * so use NULL for the list head.
1145  	 */
1146  	list_for_each_entry_safe_from(work, n, NULL, entry) {
1147  		list_move_tail(&work->entry, head);
1148  		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1149  			break;
1150  	}
1151  
1152  	/*
1153  	 * If we're already inside safe list traversal and have moved
1154  	 * multiple works to the scheduled queue, the next position
1155  	 * needs to be updated.
1156  	 */
1157  	if (nextp)
1158  		*nextp = n;
1159  }
1160  
1161  /**
1162   * assign_work - assign a work item and its linked work items to a worker
1163   * @work: work to assign
1164   * @worker: worker to assign to
1165   * @nextp: out parameter for nested worklist walking
1166   *
1167   * Assign @work and its linked work items to @worker. If @work is already being
1168   * executed by another worker in the same pool, it'll be punted there.
1169   *
1170   * If @nextp is not NULL, it's updated to point to the next work of the last
1171   * scheduled work. This allows assign_work() to be nested inside
1172   * list_for_each_entry_safe().
1173   *
1174   * Returns %true if @work was successfully assigned to @worker. %false if @work
1175   * was punted to another worker already executing it.
1176   */
1177  static bool assign_work(struct work_struct *work, struct worker *worker,
1178  			struct work_struct **nextp)
1179  {
1180  	struct worker_pool *pool = worker->pool;
1181  	struct worker *collision;
1182  
1183  	lockdep_assert_held(&pool->lock);
1184  
1185  	/*
1186  	 * A single work shouldn't be executed concurrently by multiple workers.
1187  	 * __queue_work() ensures that @work doesn't jump to a different pool
1188  	 * while still running in the previous pool. Here, we should ensure that
1189  	 * @work is not executed concurrently by multiple workers from the same
1190  	 * pool. Check whether anyone is already processing the work. If so,
1191  	 * defer the work to the currently executing one.
1192  	 */
1193  	collision = find_worker_executing_work(pool, work);
1194  	if (unlikely(collision)) {
1195  		move_linked_works(work, &collision->scheduled, nextp);
1196  		return false;
1197  	}
1198  
1199  	move_linked_works(work, &worker->scheduled, nextp);
1200  	return true;
1201  }
1202  
1203  static struct irq_work *bh_pool_irq_work(struct worker_pool *pool)
1204  {
1205  	int high = pool->attrs->nice == HIGHPRI_NICE_LEVEL ? 1 : 0;
1206  
1207  	return &per_cpu(bh_pool_irq_works, pool->cpu)[high];
1208  }
1209  
1210  static void kick_bh_pool(struct worker_pool *pool)
1211  {
1212  #ifdef CONFIG_SMP
1213  	/* see drain_dead_softirq_workfn() for POOL_BH_DRAINING */
1214  	if (unlikely(pool->cpu != smp_processor_id() &&
1215  		     !(pool->flags & POOL_BH_DRAINING))) {
1216  		irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
1217  		return;
1218  	}
1219  #endif
1220  	if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
1221  		raise_softirq_irqoff(HI_SOFTIRQ);
1222  	else
1223  		raise_softirq_irqoff(TASKLET_SOFTIRQ);
1224  }
1225  
1226  /**
1227   * kick_pool - wake up an idle worker if necessary
1228   * @pool: pool to kick
1229   *
1230   * @pool may have pending work items. Wake up worker if necessary. Returns
1231   * whether a worker was woken up.
1232   */
1233  static bool kick_pool(struct worker_pool *pool)
1234  {
1235  	struct worker *worker = first_idle_worker(pool);
1236  	struct task_struct *p;
1237  
1238  	lockdep_assert_held(&pool->lock);
1239  
1240  	if (!need_more_worker(pool) || !worker)
1241  		return false;
1242  
1243  	if (pool->flags & POOL_BH) {
1244  		kick_bh_pool(pool);
1245  		return true;
1246  	}
1247  
1248  	p = worker->task;
1249  
1250  #ifdef CONFIG_SMP
1251  	/*
1252  	 * Idle @worker is about to execute @work and waking up provides an
1253  	 * opportunity to migrate @worker at a lower cost by setting the task's
1254  	 * wake_cpu field. Let's see if we want to move @worker to improve
1255  	 * execution locality.
1256  	 *
1257  	 * We're waking the worker that went idle the latest and there's some
1258  	 * chance that @worker is marked idle but hasn't gone off CPU yet. If
1259  	 * so, setting the wake_cpu won't do anything. As this is a best-effort
1260  	 * optimization and the race window is narrow, let's leave as-is for
1261  	 * now. If this becomes pronounced, we can skip over workers which are
1262  	 * still on cpu when picking an idle worker.
1263  	 *
1264  	 * If @pool has non-strict affinity, @worker might have ended up outside
1265  	 * its affinity scope. Repatriate.
1266  	 */
1267  	if (!pool->attrs->affn_strict &&
1268  	    !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
1269  		struct work_struct *work = list_first_entry(&pool->worklist,
1270  						struct work_struct, entry);
1271  		int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask,
1272  							  cpu_online_mask);
1273  		if (wake_cpu < nr_cpu_ids) {
1274  			p->wake_cpu = wake_cpu;
1275  			get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
1276  		}
1277  	}
1278  #endif
1279  	wake_up_process(p);
1280  	return true;
1281  }
1282  
1283  #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
1284  
1285  /*
1286   * Concurrency-managed per-cpu work items that hog CPU for longer than
1287   * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
1288   * which prevents them from stalling other concurrency-managed work items. If a
1289   * work function keeps triggering this mechanism, it's likely that the work item
1290   * should be using an unbound workqueue instead.
1291   *
1292   * wq_cpu_intensive_report() tracks work functions which trigger such conditions
1293   * and report them so that they can be examined and converted to use unbound
1294   * workqueues as appropriate. To avoid flooding the console, each violating work
1295   * function is tracked and reported with exponential backoff.
1296   */
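/*
 * Worked example of the backoff: with the default
 * wq_cpu_intensive_warning_thresh of 4, a report is printed on the 4th, 5th,
 * 7th, 11th, 19th, ... violation of a given function, i.e. whenever
 * cnt + 1 - thresh is a power of two.
 */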
1297  #define WCI_MAX_ENTS 128
1298  
1299  struct wci_ent {
1300  	work_func_t		func;
1301  	atomic64_t		cnt;
1302  	struct hlist_node	hash_node;
1303  };
1304  
1305  static struct wci_ent wci_ents[WCI_MAX_ENTS];
1306  static int wci_nr_ents;
1307  static DEFINE_RAW_SPINLOCK(wci_lock);
1308  static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));
1309  
1310  static struct wci_ent *wci_find_ent(work_func_t func)
1311  {
1312  	struct wci_ent *ent;
1313  
1314  	hash_for_each_possible_rcu(wci_hash, ent, hash_node,
1315  				   (unsigned long)func) {
1316  		if (ent->func == func)
1317  			return ent;
1318  	}
1319  	return NULL;
1320  }
1321  
1322  static void wq_cpu_intensive_report(work_func_t func)
1323  {
1324  	struct wci_ent *ent;
1325  
1326  restart:
1327  	ent = wci_find_ent(func);
1328  	if (ent) {
1329  		u64 cnt;
1330  
1331  		/*
1332  		 * Start reporting from the warning_thresh and back off
1333  		 * exponentially.
1334  		 */
1335  		cnt = atomic64_inc_return_relaxed(&ent->cnt);
1336  		if (wq_cpu_intensive_warning_thresh &&
1337  		    cnt >= wq_cpu_intensive_warning_thresh &&
1338  		    is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh))
1339  			printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
1340  					ent->func, wq_cpu_intensive_thresh_us,
1341  					atomic64_read(&ent->cnt));
1342  		return;
1343  	}
1344  
1345  	/*
1346  	 * @func is a new violation. Allocate a new entry for it. If wci_ents[]
1347  	 * is exhausted, something went really wrong and we probably made enough
1348  	 * noise already.
1349  	 */
1350  	if (wci_nr_ents >= WCI_MAX_ENTS)
1351  		return;
1352  
1353  	raw_spin_lock(&wci_lock);
1354  
1355  	if (wci_nr_ents >= WCI_MAX_ENTS) {
1356  		raw_spin_unlock(&wci_lock);
1357  		return;
1358  	}
1359  
1360  	if (wci_find_ent(func)) {
1361  		raw_spin_unlock(&wci_lock);
1362  		goto restart;
1363  	}
1364  
1365  	ent = &wci_ents[wci_nr_ents++];
1366  	ent->func = func;
1367  	atomic64_set(&ent->cnt, 0);
1368  	hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
1369  
1370  	raw_spin_unlock(&wci_lock);
1371  
1372  	goto restart;
1373  }
1374  
1375  #else	/* CONFIG_WQ_CPU_INTENSIVE_REPORT */
1376  static void wq_cpu_intensive_report(work_func_t func) {}
1377  #endif	/* CONFIG_WQ_CPU_INTENSIVE_REPORT */
1378  
1379  /**
1380   * wq_worker_running - a worker is running again
1381   * @task: task waking up
1382   *
1383   * This function is called when a worker returns from schedule()
1384   */
1385  void wq_worker_running(struct task_struct *task)
1386  {
1387  	struct worker *worker = kthread_data(task);
1388  
1389  	if (!READ_ONCE(worker->sleeping))
1390  		return;
1391  
1392  	/*
1393  	 * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
1394  	 * and the nr_running increment below, we may ruin the nr_running reset
1395  	 * and leave with an unexpected pool->nr_running == 1 on the newly unbound
1396  	 * pool. Protect against such race.
1397  	 */
1398  	preempt_disable();
1399  	if (!(worker->flags & WORKER_NOT_RUNNING))
1400  		worker->pool->nr_running++;
1401  	preempt_enable();
1402  
1403  	/*
1404  	 * CPU intensive auto-detection cares about how long a work item hogged
1405  	 * CPU without sleeping. Reset the starting timestamp on wakeup.
1406  	 */
1407  	worker->current_at = worker->task->se.sum_exec_runtime;
1408  
1409  	WRITE_ONCE(worker->sleeping, 0);
1410  }
1411  
1412  /**
1413   * wq_worker_sleeping - a worker is going to sleep
1414   * @task: task going to sleep
1415   *
1416   * This function is called from schedule() when a busy worker is
1417   * going to sleep.
1418   */
1419  void wq_worker_sleeping(struct task_struct *task)
1420  {
1421  	struct worker *worker = kthread_data(task);
1422  	struct worker_pool *pool;
1423  
1424  	/*
1425  	 * Rescuers, which may not have all the fields set up like normal
1426  	 * workers, also reach here; let's not access anything before
1427  	 * checking NOT_RUNNING.
1428  	 */
1429  	if (worker->flags & WORKER_NOT_RUNNING)
1430  		return;
1431  
1432  	pool = worker->pool;
1433  
1434  	/* Return if preempted before wq_worker_running() was reached */
1435  	if (READ_ONCE(worker->sleeping))
1436  		return;
1437  
1438  	WRITE_ONCE(worker->sleeping, 1);
1439  	raw_spin_lock_irq(&pool->lock);
1440  
1441  	/*
1442  	 * Recheck in case unbind_workers() preempted us. We don't
1443  	 * want to decrement nr_running after the worker is unbound
1444  	 * and nr_running has been reset.
1445  	 */
1446  	if (worker->flags & WORKER_NOT_RUNNING) {
1447  		raw_spin_unlock_irq(&pool->lock);
1448  		return;
1449  	}
1450  
1451  	pool->nr_running--;
1452  	if (kick_pool(pool))
1453  		worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
1454  
1455  	raw_spin_unlock_irq(&pool->lock);
1456  }
1457  
1458  /**
1459   * wq_worker_tick - a scheduler tick occurred while a kworker is running
1460   * @task: task currently running
1461   *
1462   * Called from sched_tick(). We're in the IRQ context and the current
1463   * worker's fields which follow the 'K' locking rule can be accessed safely.
1464   */
1465  void wq_worker_tick(struct task_struct *task)
1466  {
1467  	struct worker *worker = kthread_data(task);
1468  	struct pool_workqueue *pwq = worker->current_pwq;
1469  	struct worker_pool *pool = worker->pool;
1470  
1471  	if (!pwq)
1472  		return;
1473  
1474  	pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;
1475  
1476  	if (!wq_cpu_intensive_thresh_us)
1477  		return;
1478  
1479  	/*
1480  	 * If the current worker is concurrency managed and hogged the CPU for
1481  	 * longer than wq_cpu_intensive_thresh_us, it's automatically marked
1482  	 * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
1483  	 *
1484  	 * If @worker->sleeping is set, @worker is in the process of
1485  	 * switching out voluntarily and won't be contributing to
1486  	 * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
1487  	 * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
1488  	 * double decrements. The task is releasing the CPU anyway. Let's skip.
1489  	 * We probably want to make this prettier in the future.
1490  	 */
1491  	if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
1492  	    worker->task->se.sum_exec_runtime - worker->current_at <
1493  	    wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
1494  		return;
1495  
1496  	raw_spin_lock(&pool->lock);
1497  
1498  	worker_set_flags(worker, WORKER_CPU_INTENSIVE);
1499  	wq_cpu_intensive_report(worker->current_func);
1500  	pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
1501  
1502  	if (kick_pool(pool))
1503  		pwq->stats[PWQ_STAT_CM_WAKEUP]++;
1504  
1505  	raw_spin_unlock(&pool->lock);
1506  }
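/*
 * Editorial note (not in the original source): as a concrete illustration of
 * the check above, with wq_cpu_intensive_thresh_us set to 10000 a work item
 * that has accumulated 10ms of CPU time since @worker->current_at without
 * sleeping is marked CPU_INTENSIVE on the next tick, dropping it out of
 * concurrency management so other per-CPU work items can start running.
 */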
1507  
1508  /**
1509   * wq_worker_last_func - retrieve worker's last work function
1510   * @task: Task to retrieve last work function of.
1511   *
1512   * Determine the last function a worker executed. This is called from
1513   * the scheduler to get a worker's last known identity.
1514   *
1515   * CONTEXT:
1516   * raw_spin_lock_irq(rq->lock)
1517   *
1518   * This function is called during schedule() when a kworker is going
1519   * to sleep. It's used by psi to identify aggregation workers during
1520   * dequeuing, to allow periodic aggregation to shut off when that
1521   * worker is the last task in the system or cgroup to go to sleep.
1522   *
1523   * As this function doesn't involve any workqueue-related locking, it
1524   * only returns stable values when called from inside the scheduler's
1525   * queuing and dequeuing paths, when @task, which must be a kworker,
1526   * is guaranteed to not be processing any works.
1527   *
1528   * Return:
1529   * The last work function %current executed as a worker, NULL if it
1530   * hasn't executed any work yet.
1531   */
1532  work_func_t wq_worker_last_func(struct task_struct *task)
1533  {
1534  	struct worker *worker = kthread_data(task);
1535  
1536  	return worker->last_func;
1537  }
1538  
1539  /**
1540   * wq_node_nr_active - Determine wq_node_nr_active to use
1541   * @wq: workqueue of interest
1542   * @node: NUMA node, can be %NUMA_NO_NODE
1543   *
1544   * Determine wq_node_nr_active to use for @wq on @node. Returns:
1545   *
1546   * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
1547   *
1548   * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
1549   *
1550   * - Otherwise, node_nr_active[@node].
1551   */
1552  static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
1553  						   int node)
1554  {
1555  	if (!(wq->flags & WQ_UNBOUND))
1556  		return NULL;
1557  
1558  	if (node == NUMA_NO_NODE)
1559  		node = nr_node_ids;
1560  
1561  	return wq->node_nr_active[node];
1562  }
1563  
1564  /**
1565   * wq_update_node_max_active - Update per-node max_actives to use
1566   * @wq: workqueue to update
1567   * @off_cpu: CPU that's going down, -1 if a CPU is not going down
1568   *
1569   * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
1570   * distributed among nodes according to the proportions of numbers of online
1571   * cpus. The result is always between @wq->min_active and max_active.
1572   */
1573  static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
1574  {
1575  	struct cpumask *effective = unbound_effective_cpumask(wq);
1576  	int min_active = READ_ONCE(wq->min_active);
1577  	int max_active = READ_ONCE(wq->max_active);
1578  	int total_cpus, node;
1579  
1580  	lockdep_assert_held(&wq->mutex);
1581  
1582  	if (!wq_topo_initialized)
1583  		return;
1584  
1585  	if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
1586  		off_cpu = -1;
1587  
1588  	total_cpus = cpumask_weight_and(effective, cpu_online_mask);
1589  	if (off_cpu >= 0)
1590  		total_cpus--;
1591  
1592  	/* If all CPUs of the wq get offline, use the default values */
1593  	if (unlikely(!total_cpus)) {
1594  		for_each_node(node)
1595  			wq_node_nr_active(wq, node)->max = min_active;
1596  
1597  		wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
1598  		return;
1599  	}
1600  
1601  	for_each_node(node) {
1602  		int node_cpus;
1603  
1604  		node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));
1605  		if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
1606  			node_cpus--;
1607  
1608  		wq_node_nr_active(wq, node)->max =
1609  			clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),
1610  			      min_active, max_active);
1611  	}
1612  
1613  	wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
1614  }
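/*
 * Editorial example (assumed numbers, not in the original source): with
 * max_active = 16, min_active = 8 and two nodes holding 3 and 1 of the 4
 * online effective CPUs, the loop above computes
 *
 *	node0: clamp(DIV_ROUND_UP(16 * 3, 4), 8, 16) = 12
 *	node1: clamp(DIV_ROUND_UP(16 * 1, 4), 8, 16) = 8
 *
 * while the NUMA_NO_NODE slot keeps the full max_active of 16.
 */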
1615  
1616  /**
1617   * get_pwq - get an extra reference on the specified pool_workqueue
1618   * @pwq: pool_workqueue to get
1619   *
1620   * Obtain an extra reference on @pwq.  The caller should guarantee that
1621   * @pwq has positive refcnt and be holding the matching pool->lock.
1622   */
1623  static void get_pwq(struct pool_workqueue *pwq)
1624  {
1625  	lockdep_assert_held(&pwq->pool->lock);
1626  	WARN_ON_ONCE(pwq->refcnt <= 0);
1627  	pwq->refcnt++;
1628  }
1629  
1630  /**
1631   * put_pwq - put a pool_workqueue reference
1632   * @pwq: pool_workqueue to put
1633   *
1634   * Drop a reference of @pwq.  If its refcnt reaches zero, schedule its
1635   * destruction.  The caller should be holding the matching pool->lock.
1636   */
1637  static void put_pwq(struct pool_workqueue *pwq)
1638  {
1639  	lockdep_assert_held(&pwq->pool->lock);
1640  	if (likely(--pwq->refcnt))
1641  		return;
1642  	/*
1643  	 * @pwq can't be released under pool->lock, bounce to a dedicated
1644  	 * kthread_worker to avoid A-A deadlocks.
1645  	 */
1646  	kthread_queue_work(pwq_release_worker, &pwq->release_work);
1647  }
1648  
1649  /**
1650   * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
1651   * @pwq: pool_workqueue to put (can be %NULL)
1652   *
1653   * put_pwq() with locking.  This function also allows %NULL @pwq.
1654   */
1655  static void put_pwq_unlocked(struct pool_workqueue *pwq)
1656  {
1657  	if (pwq) {
1658  		/*
1659  		 * As both pwqs and pools are RCU protected, the
1660  		 * following lock operations are safe.
1661  		 */
1662  		raw_spin_lock_irq(&pwq->pool->lock);
1663  		put_pwq(pwq);
1664  		raw_spin_unlock_irq(&pwq->pool->lock);
1665  	}
1666  }
1667  
1668  static bool pwq_is_empty(struct pool_workqueue *pwq)
1669  {
1670  	return !pwq->nr_active && list_empty(&pwq->inactive_works);
1671  }
1672  
1673  static void __pwq_activate_work(struct pool_workqueue *pwq,
1674  				struct work_struct *work)
1675  {
1676  	unsigned long *wdb = work_data_bits(work);
1677  
1678  	WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
1679  	trace_workqueue_activate_work(work);
1680  	if (list_empty(&pwq->pool->worklist))
1681  		pwq->pool->watchdog_ts = jiffies;
1682  	move_linked_works(work, &pwq->pool->worklist, NULL);
1683  	__clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
1684  }
1685  
1686  static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
1687  {
1688  	int max = READ_ONCE(nna->max);
1689  
1690  	while (true) {
1691  		int old, tmp;
1692  
1693  		old = atomic_read(&nna->nr);
1694  		if (old >= max)
1695  			return false;
1696  		tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
1697  		if (tmp == old)
1698  			return true;
1699  	}
1700  }
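/*
 * Editorial sketch (not part of the original file): the open-coded cmpxchg
 * loop above can equivalently be written with atomic_try_cmpxchg_relaxed(),
 * which reloads @old on failure. Shown only to illustrate the bounded,
 * lockless increment pattern; it is not referenced anywhere.
 */
static inline bool __maybe_unused
tryinc_node_nr_active_sketch(struct wq_node_nr_active *nna)
{
	int max = READ_ONCE(nna->max);
	int old = atomic_read(&nna->nr);

	do {
		if (old >= max)
			return false;
	} while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1));

	return true;
}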
1701  
1702  /**
1703   * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
1704   * @pwq: pool_workqueue of interest
1705   * @fill: max_active may have increased, try to increase concurrency level
1706   *
1707   * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
1708   * successfully obtained. %false otherwise.
1709   */
1710  static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
1711  {
1712  	struct workqueue_struct *wq = pwq->wq;
1713  	struct worker_pool *pool = pwq->pool;
1714  	struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
1715  	bool obtained = false;
1716  
1717  	lockdep_assert_held(&pool->lock);
1718  
1719  	if (!nna) {
1720  		/* BH or per-cpu workqueue, pwq->nr_active is sufficient */
1721  		obtained = pwq->nr_active < READ_ONCE(wq->max_active);
1722  		goto out;
1723  	}
1724  
1725  	if (unlikely(pwq->plugged))
1726  		return false;
1727  
1728  	/*
1729  	 * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
1730  	 * already waiting on $nna, pwq_dec_nr_active() will maintain the
1731  	 * concurrency level. Don't jump the line.
1732  	 *
1733  	 * We need to ignore the pending test after max_active has increased as
1734  	 * pwq_dec_nr_active() can only maintain the concurrency level but not
1735  	 * increase it. This is indicated by @fill.
1736  	 */
1737  	if (!list_empty(&pwq->pending_node) && likely(!fill))
1738  		goto out;
1739  
1740  	obtained = tryinc_node_nr_active(nna);
1741  	if (obtained)
1742  		goto out;
1743  
1744  	/*
1745  	 * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
1746  	 * and try again. The smp_mb() is paired with the implied memory barrier
1747  	 * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
1748  	 * we see the decremented $nna->nr or they see non-empty
1749  	 * $nna->pending_pwqs.
1750  	 */
1751  	raw_spin_lock(&nna->lock);
1752  
1753  	if (list_empty(&pwq->pending_node))
1754  		list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
1755  	else if (likely(!fill))
1756  		goto out_unlock;
1757  
1758  	smp_mb();
1759  
1760  	obtained = tryinc_node_nr_active(nna);
1761  
1762  	/*
1763  	 * If @fill, @pwq might have already been pending. Being spuriously
1764  	 * pending in cold paths doesn't affect anything. Let's leave it be.
1765  	 */
1766  	if (obtained && likely(!fill))
1767  		list_del_init(&pwq->pending_node);
1768  
1769  out_unlock:
1770  	raw_spin_unlock(&nna->lock);
1771  out:
1772  	if (obtained)
1773  		pwq->nr_active++;
1774  	return obtained;
1775  }
1776  
1777  /**
1778   * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
1779   * @pwq: pool_workqueue of interest
1780   * @fill: max_active may have increased, try to increase concurrency level
1781   *
1782   * Activate the first inactive work item of @pwq if available and allowed by
1783   * max_active limit.
1784   *
1785   * Returns %true if an inactive work item has been activated. %false if no
1786   * inactive work item is found or max_active limit is reached.
1787   */
1788  static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
1789  {
1790  	struct work_struct *work =
1791  		list_first_entry_or_null(&pwq->inactive_works,
1792  					 struct work_struct, entry);
1793  
1794  	if (work && pwq_tryinc_nr_active(pwq, fill)) {
1795  		__pwq_activate_work(pwq, work);
1796  		return true;
1797  	} else {
1798  		return false;
1799  	}
1800  }
1801  
1802  /**
1803   * unplug_oldest_pwq - unplug the oldest pool_workqueue
1804   * @wq: workqueue_struct where its oldest pwq is to be unplugged
1805   *
1806   * This function should only be called for ordered workqueues where only the
1807   * oldest pwq is unplugged, the others are plugged to suspend execution to
1808   * ensure proper work item ordering::
1809   *
1810   *    dfl_pwq --------------+     [P] - plugged
1811   *                          |
1812   *                          v
1813   *    pwqs -> A -> B [P] -> C [P] (newest)
1814   *            |    |        |
1815   *            1    3        5
1816   *            |    |        |
1817   *            2    4        6
1818   *
1819   * When the oldest pwq is drained and removed, this function should be called
1820   * to unplug the next oldest one to start its work item execution. Note that
1821   * pwq's are linked into wq->pwqs with the oldest first, so the first one in
1822   * the list is the oldest.
1823   */
1824  static void unplug_oldest_pwq(struct workqueue_struct *wq)
1825  {
1826  	struct pool_workqueue *pwq;
1827  
1828  	lockdep_assert_held(&wq->mutex);
1829  
1830  	/* Caller should make sure that pwqs isn't empty before calling */
1831  	pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue,
1832  				       pwqs_node);
1833  	raw_spin_lock_irq(&pwq->pool->lock);
1834  	if (pwq->plugged) {
1835  		pwq->plugged = false;
1836  		if (pwq_activate_first_inactive(pwq, true))
1837  			kick_pool(pwq->pool);
1838  	}
1839  	raw_spin_unlock_irq(&pwq->pool->lock);
1840  }
1841  
1842  /**
1843   * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
1844   * @nna: wq_node_nr_active to activate a pending pwq for
1845   * @caller_pool: worker_pool the caller is locking
1846   *
1847   * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
1848   * @caller_pool may be unlocked and relocked to lock other worker_pools.
1849   */
1850  static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
1851  				      struct worker_pool *caller_pool)
1852  {
1853  	struct worker_pool *locked_pool = caller_pool;
1854  	struct pool_workqueue *pwq;
1855  	struct work_struct *work;
1856  
1857  	lockdep_assert_held(&caller_pool->lock);
1858  
1859  	raw_spin_lock(&nna->lock);
1860  retry:
1861  	pwq = list_first_entry_or_null(&nna->pending_pwqs,
1862  				       struct pool_workqueue, pending_node);
1863  	if (!pwq)
1864  		goto out_unlock;
1865  
1866  	/*
1867  	 * If @pwq is for a different pool than @locked_pool, we need to lock
1868  	 * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
1869  	 * / lock dance. For that, we also need to release @nna->lock as it's
1870  	 * nested inside pool locks.
1871  	 */
1872  	if (pwq->pool != locked_pool) {
1873  		raw_spin_unlock(&locked_pool->lock);
1874  		locked_pool = pwq->pool;
1875  		if (!raw_spin_trylock(&locked_pool->lock)) {
1876  			raw_spin_unlock(&nna->lock);
1877  			raw_spin_lock(&locked_pool->lock);
1878  			raw_spin_lock(&nna->lock);
1879  			goto retry;
1880  		}
1881  	}
1882  
1883  	/*
1884  	 * $pwq may not have any inactive work items due to e.g. cancellations.
1885  	 * Drop it from pending_pwqs and see if there's another one.
1886  	 */
1887  	work = list_first_entry_or_null(&pwq->inactive_works,
1888  					struct work_struct, entry);
1889  	if (!work) {
1890  		list_del_init(&pwq->pending_node);
1891  		goto retry;
1892  	}
1893  
1894  	/*
1895  	 * Acquire an nr_active count and activate the inactive work item. If
1896  	 * $pwq still has inactive work items, rotate it to the end of the
1897  	 * pending_pwqs so that we round-robin through them. This means that
1898  	 * inactive work items are not activated in queueing order which is fine
1899  	 * given that there has never been any ordering across different pwqs.
1900  	 */
1901  	if (likely(tryinc_node_nr_active(nna))) {
1902  		pwq->nr_active++;
1903  		__pwq_activate_work(pwq, work);
1904  
1905  		if (list_empty(&pwq->inactive_works))
1906  			list_del_init(&pwq->pending_node);
1907  		else
1908  			list_move_tail(&pwq->pending_node, &nna->pending_pwqs);
1909  
1910  		/* if activating a foreign pool, make sure it's running */
1911  		if (pwq->pool != caller_pool)
1912  			kick_pool(pwq->pool);
1913  	}
1914  
1915  out_unlock:
1916  	raw_spin_unlock(&nna->lock);
1917  	if (locked_pool != caller_pool) {
1918  		raw_spin_unlock(&locked_pool->lock);
1919  		raw_spin_lock(&caller_pool->lock);
1920  	}
1921  }
1922  
1923  /**
1924   * pwq_dec_nr_active - Retire an active count
1925   * @pwq: pool_workqueue of interest
1926   *
1927   * Decrement @pwq's nr_active and try to activate the first inactive work item.
1928   * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
1929   */
1930  static void pwq_dec_nr_active(struct pool_workqueue *pwq)
1931  {
1932  	struct worker_pool *pool = pwq->pool;
1933  	struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
1934  
1935  	lockdep_assert_held(&pool->lock);
1936  
1937  	/*
1938  	 * @pwq->nr_active should be decremented for both percpu and unbound
1939  	 * workqueues.
1940  	 */
1941  	pwq->nr_active--;
1942  
1943  	/*
1944  	 * For a percpu workqueue, it's simple. Just need to kick the first
1945  	 * inactive work item on @pwq itself.
1946  	 */
1947  	if (!nna) {
1948  		pwq_activate_first_inactive(pwq, false);
1949  		return;
1950  	}
1951  
1952  	/*
1953  	 * If @pwq is for an unbound workqueue, it's more complicated because
1954  	 * multiple pwqs and pools may be sharing the nr_active count. When a
1955  	 * pwq needs to wait for an nr_active count, it puts itself on
1956  	 * $nna->pending_pwqs. The following atomic_dec_return()'s implied
1957  	 * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
1958  	 * guarantee that either we see non-empty pending_pwqs or they see
1959  	 * decremented $nna->nr.
1960  	 *
1961  	 * $nna->max may change as CPUs come online/offline and @pwq->wq's
1962  	 * max_active gets updated. However, it is guaranteed to be equal to or
1963  	 * larger than @pwq->wq->min_active which is above zero unless freezing.
1964  	 * This maintains the forward progress guarantee.
1965  	 */
1966  	if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))
1967  		return;
1968  
1969  	if (!list_empty(&nna->pending_pwqs))
1970  		node_activate_pending_pwq(nna, pool);
1971  }
1972  
1973  /**
1974   * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
1975   * @pwq: pwq of interest
1976   * @work_data: work_data of work which left the queue
1977   *
1978   * A work either has completed or is removed from pending queue,
1979   * decrement nr_in_flight of its pwq and handle workqueue flushing.
1980   *
1981   * NOTE:
1982   * For unbound workqueues, this function may temporarily drop @pwq->pool->lock
1983   * and thus should be called after all other state updates for the in-flight
1984   * work item is complete.
1985   *
1986   * CONTEXT:
1987   * raw_spin_lock_irq(pool->lock).
1988   */
1989  static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
1990  {
1991  	int color = get_work_color(work_data);
1992  
1993  	if (!(work_data & WORK_STRUCT_INACTIVE))
1994  		pwq_dec_nr_active(pwq);
1995  
1996  	pwq->nr_in_flight[color]--;
1997  
1998  	/* is flush in progress and are we at the flushing tip? */
1999  	if (likely(pwq->flush_color != color))
2000  		goto out_put;
2001  
2002  	/* are there still in-flight works? */
2003  	if (pwq->nr_in_flight[color])
2004  		goto out_put;
2005  
2006  	/* this pwq is done, clear flush_color */
2007  	pwq->flush_color = -1;
2008  
2009  	/*
2010  	 * If this was the last pwq, wake up the first flusher.  It
2011  	 * will handle the rest.
2012  	 */
2013  	if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
2014  		complete(&pwq->wq->first_flusher->done);
2015  out_put:
2016  	put_pwq(pwq);
2017  }
2018  
2019  /**
2020   * try_to_grab_pending - steal work item from worklist and disable irq
2021   * @work: work item to steal
2022   * @cflags: %WORK_CANCEL_ flags
2023   * @irq_flags: place to store irq state
2024   *
2025   * Try to grab PENDING bit of @work.  This function can handle @work in any
2026   * stable state - idle, on timer or on worklist.
2027   *
2028   * Return:
2029   *
2030   *  ========	================================================================
2031   *  1		if @work was pending and we successfully stole PENDING
2032   *  0		if @work was idle and we claimed PENDING
2033   *  -EAGAIN	if PENDING couldn't be grabbed at the moment, safe to busy-retry
2034   *  ========	================================================================
2035   *
2036   * Note:
2037   * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
2038   * interrupted while holding PENDING and @work off queue, irq must be
2039   * disabled on entry.  This, combined with delayed_work->timer being
2040   * irqsafe, ensures that we return -EAGAIN for a finite, short period of time.
2041   *
2042   * On successful return, >= 0, irq is disabled and the caller is
2043   * responsible for releasing it using local_irq_restore(*@irq_flags).
2044   *
2045   * This function is safe to call from any context including IRQ handler.
2046   */
2047  static int try_to_grab_pending(struct work_struct *work, u32 cflags,
2048  			       unsigned long *irq_flags)
2049  {
2050  	struct worker_pool *pool;
2051  	struct pool_workqueue *pwq;
2052  
2053  	local_irq_save(*irq_flags);
2054  
2055  	/* try to steal the timer if it exists */
2056  	if (cflags & WORK_CANCEL_DELAYED) {
2057  		struct delayed_work *dwork = to_delayed_work(work);
2058  
2059  		/*
2060  		 * dwork->timer is irqsafe.  If del_timer() fails, it's
2061  		 * guaranteed that the timer is not queued anywhere and not
2062  		 * running on the local CPU.
2063  		 */
2064  		if (likely(del_timer(&dwork->timer)))
2065  			return 1;
2066  	}
2067  
2068  	/* try to claim PENDING the normal way */
2069  	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
2070  		return 0;
2071  
2072  	rcu_read_lock();
2073  	/*
2074  	 * The queueing is in progress, or it is already queued. Try to
2075  	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
2076  	 */
2077  	pool = get_work_pool(work);
2078  	if (!pool)
2079  		goto fail;
2080  
2081  	raw_spin_lock(&pool->lock);
2082  	/*
2083  	 * work->data is guaranteed to point to pwq only while the work
2084  	 * item is queued on pwq->wq, and both updating work->data to point
2085  	 * to pwq on queueing and to pool on dequeueing are done under
2086  	 * pwq->pool->lock.  This in turn guarantees that, if work->data
2087  	 * points to pwq which is associated with a locked pool, the work
2088  	 * item is currently queued on that pool.
2089  	 */
2090  	pwq = get_work_pwq(work);
2091  	if (pwq && pwq->pool == pool) {
2092  		unsigned long work_data = *work_data_bits(work);
2093  
2094  		debug_work_deactivate(work);
2095  
2096  		/*
2097  		 * A cancelable inactive work item must be in the
2098  		 * pwq->inactive_works since a queued barrier can't be
2099  		 * canceled (see the comments in insert_wq_barrier()).
2100  		 *
2101  		 * An inactive work item cannot be deleted directly because
2102  		 * it might have linked barrier work items which, if left
2103  		 * on the inactive_works list, will confuse pwq->nr_active
2104  		 * management later on and cause stall.  Move the linked
2105  		 * barrier work items to the worklist when deleting the grabbed
2106  		 * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that
2107  		 * it doesn't participate in nr_active management in later
2108  		 * pwq_dec_nr_in_flight().
2109  		 */
2110  		if (work_data & WORK_STRUCT_INACTIVE)
2111  			move_linked_works(work, &pwq->pool->worklist, NULL);
2112  
2113  		list_del_init(&work->entry);
2114  
2115  		/*
2116  		 * work->data points to pwq iff queued. Let's point to pool. As
2117  		 * this destroys work->data needed by the next step, stash it.
2118  		 */
2119  		set_work_pool_and_keep_pending(work, pool->id,
2120  					       pool_offq_flags(pool));
2121  
2122  		/* must be the last step, see the function comment */
2123  		pwq_dec_nr_in_flight(pwq, work_data);
2124  
2125  		raw_spin_unlock(&pool->lock);
2126  		rcu_read_unlock();
2127  		return 1;
2128  	}
2129  	raw_spin_unlock(&pool->lock);
2130  fail:
2131  	rcu_read_unlock();
2132  	local_irq_restore(*irq_flags);
2133  	return -EAGAIN;
2134  }
2135  
2136  /**
2137   * work_grab_pending - steal work item from worklist and disable irq
2138   * @work: work item to steal
2139   * @cflags: %WORK_CANCEL_ flags
2140   * @irq_flags: place to store IRQ state
2141   *
2142   * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer
2143   * or on worklist.
2144   *
2145   * Can be called from any context. IRQ is disabled on return with IRQ state
2146   * stored in *@irq_flags. The caller is responsible for re-enabling it using
2147   * local_irq_restore().
2148   *
2149   * Returns %true if @work was pending. %false if idle.
2150   */
2151  static bool work_grab_pending(struct work_struct *work, u32 cflags,
2152  			      unsigned long *irq_flags)
2153  {
2154  	int ret;
2155  
2156  	while (true) {
2157  		ret = try_to_grab_pending(work, cflags, irq_flags);
2158  		if (ret >= 0)
2159  			return ret;
2160  		cpu_relax();
2161  	}
2162  }
2163  
2164  /**
2165   * insert_work - insert a work into a pool
2166   * @pwq: pwq @work belongs to
2167   * @work: work to insert
2168   * @head: insertion point
2169   * @extra_flags: extra WORK_STRUCT_* flags to set
2170   *
2171   * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
2172   * work_struct flags.
2173   *
2174   * CONTEXT:
2175   * raw_spin_lock_irq(pool->lock).
2176   */
2177  static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
2178  			struct list_head *head, unsigned int extra_flags)
2179  {
2180  	debug_work_activate(work);
2181  
2182  	/* record the work call stack in order to print it in KASAN reports */
2183  	kasan_record_aux_stack_noalloc(work);
2184  
2185  	/* we own @work, set data and link */
2186  	set_work_pwq(work, pwq, extra_flags);
2187  	list_add_tail(&work->entry, head);
2188  	get_pwq(pwq);
2189  }
2190  
2191  /*
2192   * Test whether @work is being queued from another work executing on the
2193   * same workqueue.
2194   */
2195  static bool is_chained_work(struct workqueue_struct *wq)
2196  {
2197  	struct worker *worker;
2198  
2199  	worker = current_wq_worker();
2200  	/*
2201  	 * Return %true iff I'm a worker executing a work item on @wq.  If
2202  	 * I'm @worker, it's safe to dereference it without locking.
2203  	 */
2204  	return worker && worker->current_pwq->wq == wq;
2205  }
2206  
2207  /*
2208   * When queueing an unbound work item to a wq, prefer local CPU if allowed
2209   * by wq_unbound_cpumask.  Otherwise, round robin among the allowed ones to
2210   * avoid perturbing sensitive tasks.
2211   */
2212  static int wq_select_unbound_cpu(int cpu)
2213  {
2214  	int new_cpu;
2215  
2216  	if (likely(!wq_debug_force_rr_cpu)) {
2217  		if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
2218  			return cpu;
2219  	} else {
2220  		pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
2221  	}
2222  
2223  	new_cpu = __this_cpu_read(wq_rr_cpu_last);
2224  	new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
2225  	if (unlikely(new_cpu >= nr_cpu_ids)) {
2226  		new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
2227  		if (unlikely(new_cpu >= nr_cpu_ids))
2228  			return cpu;
2229  	}
2230  	__this_cpu_write(wq_rr_cpu_last, new_cpu);
2231  
2232  	return new_cpu;
2233  }
2234  
2235  static void __queue_work(int cpu, struct workqueue_struct *wq,
2236  			 struct work_struct *work)
2237  {
2238  	struct pool_workqueue *pwq;
2239  	struct worker_pool *last_pool, *pool;
2240  	unsigned int work_flags;
2241  	unsigned int req_cpu = cpu;
2242  
2243  	/*
2244  	 * While a work item is PENDING && off queue, a task trying to
2245  	 * steal the PENDING will busy-loop waiting for it to either get
2246  	 * queued or lose PENDING.  Grabbing PENDING and queueing should
2247  	 * happen with IRQ disabled.
2248  	 */
2249  	lockdep_assert_irqs_disabled();
2250  
2251  	/*
2252  	 * For a draining wq, only works from the same workqueue are
2253  	 * allowed. The __WQ_DESTROYING helps to spot the issue that
2254  	 * queues a new work item to a wq after destroy_workqueue(wq).
2255  	 */
2256  	if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
2257  		     WARN_ON_ONCE(!is_chained_work(wq))))
2258  		return;
2259  	rcu_read_lock();
2260  retry:
2261  	/* pwq which will be used unless @work is executing elsewhere */
2262  	if (req_cpu == WORK_CPU_UNBOUND) {
2263  		if (wq->flags & WQ_UNBOUND)
2264  			cpu = wq_select_unbound_cpu(raw_smp_processor_id());
2265  		else
2266  			cpu = raw_smp_processor_id();
2267  	}
2268  
2269  	pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
2270  	pool = pwq->pool;
2271  
2272  	/*
2273  	 * If @work was previously on a different pool, it might still be
2274  	 * running there, in which case the work needs to be queued on that
2275  	 * pool to guarantee non-reentrancy.
2276  	 *
2277  	 * For ordered workqueue, work items must be queued on the newest pwq
2278  	 * for accurate order management.  Guaranteed order also guarantees
2279  	 * non-reentrancy.  See the comments above unplug_oldest_pwq().
2280  	 */
2281  	last_pool = get_work_pool(work);
2282  	if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) {
2283  		struct worker *worker;
2284  
2285  		raw_spin_lock(&last_pool->lock);
2286  
2287  		worker = find_worker_executing_work(last_pool, work);
2288  
2289  		if (worker && worker->current_pwq->wq == wq) {
2290  			pwq = worker->current_pwq;
2291  			pool = pwq->pool;
2292  			WARN_ON_ONCE(pool != last_pool);
2293  		} else {
2294  			/* meh... not running there, queue here */
2295  			raw_spin_unlock(&last_pool->lock);
2296  			raw_spin_lock(&pool->lock);
2297  		}
2298  	} else {
2299  		raw_spin_lock(&pool->lock);
2300  	}
2301  
2302  	/*
2303  	 * pwq is determined and locked. For unbound pools, we could have raced
2304  	 * with pwq release and it could already be dead. If its refcnt is zero,
2305  	 * repeat pwq selection. Note that unbound pwqs never die without
2306  	 * another pwq replacing it in cpu_pwq or while work items are executing
2307  	 * on it, so the retrying is guaranteed to make forward-progress.
2308  	 */
2309  	if (unlikely(!pwq->refcnt)) {
2310  		if (wq->flags & WQ_UNBOUND) {
2311  			raw_spin_unlock(&pool->lock);
2312  			cpu_relax();
2313  			goto retry;
2314  		}
2315  		/* oops */
2316  		WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
2317  			  wq->name, cpu);
2318  	}
2319  
2320  	/* pwq determined, queue */
2321  	trace_workqueue_queue_work(req_cpu, pwq, work);
2322  
2323  	if (WARN_ON(!list_empty(&work->entry)))
2324  		goto out;
2325  
2326  	pwq->nr_in_flight[pwq->work_color]++;
2327  	work_flags = work_color_to_flags(pwq->work_color);
2328  
2329  	/*
2330  	 * Limit the number of concurrently active work items to max_active.
2331  	 * @work must also queue behind existing inactive work items to maintain
2332  	 * ordering when max_active changes. See wq_adjust_max_active().
2333  	 */
2334  	if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
2335  		if (list_empty(&pool->worklist))
2336  			pool->watchdog_ts = jiffies;
2337  
2338  		trace_workqueue_activate_work(work);
2339  		insert_work(pwq, work, &pool->worklist, work_flags);
2340  		kick_pool(pool);
2341  	} else {
2342  		work_flags |= WORK_STRUCT_INACTIVE;
2343  		insert_work(pwq, work, &pwq->inactive_works, work_flags);
2344  	}
2345  
2346  out:
2347  	raw_spin_unlock(&pool->lock);
2348  	rcu_read_unlock();
2349  }
2350  
2351  static bool clear_pending_if_disabled(struct work_struct *work)
2352  {
2353  	unsigned long data = *work_data_bits(work);
2354  	struct work_offq_data offqd;
2355  
2356  	if (likely((data & WORK_STRUCT_PWQ) ||
2357  		   !(data & WORK_OFFQ_DISABLE_MASK)))
2358  		return false;
2359  
2360  	work_offqd_unpack(&offqd, data);
2361  	set_work_pool_and_clear_pending(work, offqd.pool_id,
2362  					work_offqd_pack_flags(&offqd));
2363  	return true;
2364  }
2365  
2366  /**
2367   * queue_work_on - queue work on specific cpu
2368   * @cpu: CPU number to execute work on
2369   * @wq: workqueue to use
2370   * @work: work to queue
2371   *
2372   * We queue the work to a specific CPU; the caller must ensure it
2373   * can't go away.  Callers that fail to ensure that the specified
2374   * CPU cannot go away will execute on a randomly chosen CPU.
2375   * But note well that callers specifying a CPU that never has been
2376   * online will get a splat.
2377   *
2378   * Return: %false if @work was already on a queue, %true otherwise.
2379   */
2380  bool queue_work_on(int cpu, struct workqueue_struct *wq,
2381  		   struct work_struct *work)
2382  {
2383  	bool ret = false;
2384  	unsigned long irq_flags;
2385  
2386  	local_irq_save(irq_flags);
2387  
2388  	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
2389  	    !clear_pending_if_disabled(work)) {
2390  		__queue_work(cpu, wq, work);
2391  		ret = true;
2392  	}
2393  
2394  	local_irq_restore(irq_flags);
2395  	return ret;
2396  }
2397  EXPORT_SYMBOL(queue_work_on);
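/*
 * Editorial sketch (hypothetical names, not part of the original file): a
 * minimal caller exercising the queue_work_on() contract documented above.
 * The second call returns %false because PENDING is still set from the
 * first call until the work item actually runs.
 */
static void example_qwo_fn(struct work_struct *work)
{
	pr_info("example work on CPU%d\n", raw_smp_processor_id());
}

static __maybe_unused DECLARE_WORK(example_qwo_work, example_qwo_fn);

static void __maybe_unused example_queue_work_on(void)
{
	int cpu = raw_smp_processor_id();

	queue_work_on(cpu, system_wq, &example_qwo_work);	/* %true */
	queue_work_on(cpu, system_wq, &example_qwo_work);	/* %false */
}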
2398  
2399  /**
2400   * select_numa_node_cpu - Select a CPU based on NUMA node
2401   * @node: NUMA node ID that we want to select a CPU from
2402   *
2403   * This function will attempt to find a "random" cpu available on a given
2404   * node. If there are no CPUs available on the given node it will return
2405   * WORK_CPU_UNBOUND indicating that we should just schedule to any
2406   * available CPU if we need to schedule this work.
2407   */
2408  static int select_numa_node_cpu(int node)
2409  {
2410  	int cpu;
2411  
2412  	/* Delay binding to CPU if node is not valid or online */
2413  	if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
2414  		return WORK_CPU_UNBOUND;
2415  
2416  	/* Use local node/cpu if we are already there */
2417  	cpu = raw_smp_processor_id();
2418  	if (node == cpu_to_node(cpu))
2419  		return cpu;
2420  
2421  	/* Use "random" otherwise known as "first" online CPU of node */
2422  	cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
2423  
2424  	/* If CPU is valid return that, otherwise just defer */
2425  	return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;
2426  }
2427  
2428  /**
2429   * queue_work_node - queue work on a "random" cpu for a given NUMA node
2430   * @node: NUMA node that we are targeting the work for
2431   * @wq: workqueue to use
2432   * @work: work to queue
2433   *
2434   * We queue the work to a "random" CPU within a given NUMA node. The basic
2435   * idea here is to provide a way to somehow associate work with a given
2436   * NUMA node.
2437   *
2438   * This function will only make a best effort attempt at getting this onto
2439   * the right NUMA node. If no node is requested or the requested node is
2440   * offline then we just fall back to standard queue_work behavior.
2441   *
2442   * Currently the "random" CPU ends up being the first available CPU in the
2443   * intersection of cpu_online_mask and the cpumask of the node, unless we
2444   * are running on the node. In that case we just use the current CPU.
2445   *
2446   * Return: %false if @work was already on a queue, %true otherwise.
2447   */
2448  bool queue_work_node(int node, struct workqueue_struct *wq,
2449  		     struct work_struct *work)
2450  {
2451  	unsigned long irq_flags;
2452  	bool ret = false;
2453  
2454  	/*
2455  	 * This current implementation is specific to unbound workqueues.
2456  	 * Specifically we only return the first available CPU for a given
2457  	 * node instead of cycling through individual CPUs within the node.
2458  	 *
2459  	 * If this is used with a per-cpu workqueue then the logic in
2460  	 * workqueue_select_cpu_near would need to be updated to allow for
2461  	 * some round robin type logic.
2462  	 */
2463  	WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));
2464  
2465  	local_irq_save(irq_flags);
2466  
2467  	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
2468  	    !clear_pending_if_disabled(work)) {
2469  		int cpu = select_numa_node_cpu(node);
2470  
2471  		__queue_work(cpu, wq, work);
2472  		ret = true;
2473  	}
2474  
2475  	local_irq_restore(irq_flags);
2476  	return ret;
2477  }
2478  EXPORT_SYMBOL_GPL(queue_work_node);
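/*
 * Editorial sketch (hypothetical names, not part of the original file):
 * queueing node-affine work as described above. @nid is only a preference;
 * an invalid or CPU-less node falls back to WORK_CPU_UNBOUND placement.
 */
static void example_node_work_fn(struct work_struct *work)
{
	pr_info("node-preferred work ran on node %d\n",
		cpu_to_node(raw_smp_processor_id()));
}

static __maybe_unused DECLARE_WORK(example_node_work, example_node_work_fn);

static void __maybe_unused example_queue_work_node(int nid)
{
	/* system_unbound_wq satisfies the WQ_UNBOUND requirement above */
	queue_work_node(nid, system_unbound_wq, &example_node_work);
}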
2479  
2480  void delayed_work_timer_fn(struct timer_list *t)
2481  {
2482  	struct delayed_work *dwork = from_timer(dwork, t, timer);
2483  
2484  	/* should have been called from irqsafe timer with irq already off */
2485  	__queue_work(dwork->cpu, dwork->wq, &dwork->work);
2486  }
2487  EXPORT_SYMBOL(delayed_work_timer_fn);
2488  
2489  static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
2490  				struct delayed_work *dwork, unsigned long delay)
2491  {
2492  	struct timer_list *timer = &dwork->timer;
2493  	struct work_struct *work = &dwork->work;
2494  
2495  	WARN_ON_ONCE(!wq);
2496  	WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
2497  	WARN_ON_ONCE(timer_pending(timer));
2498  	WARN_ON_ONCE(!list_empty(&work->entry));
2499  
2500  	/*
2501  	 * If @delay is 0, queue @dwork->work immediately.  This is for
2502  	 * both optimization and correctness.  The earliest @timer can
2503  	 * expire is on the closest next tick and delayed_work users depend
2504  	 * on there being no such delay when @delay is 0.
2505  	 */
2506  	if (!delay) {
2507  		__queue_work(cpu, wq, &dwork->work);
2508  		return;
2509  	}
2510  
2511  	dwork->wq = wq;
2512  	dwork->cpu = cpu;
2513  	timer->expires = jiffies + delay;
2514  
2515  	if (housekeeping_enabled(HK_TYPE_TIMER)) {
2516  		/* If the current cpu is a housekeeping cpu, use it. */
2517  		cpu = smp_processor_id();
2518  		if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER))
2519  			cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
2520  		add_timer_on(timer, cpu);
2521  	} else {
2522  		if (likely(cpu == WORK_CPU_UNBOUND))
2523  			add_timer_global(timer);
2524  		else
2525  			add_timer_on(timer, cpu);
2526  	}
2527  }
2528  
2529  /**
2530   * queue_delayed_work_on - queue work on specific CPU after delay
2531   * @cpu: CPU number to execute work on
2532   * @wq: workqueue to use
2533   * @dwork: work to queue
2534   * @delay: number of jiffies to wait before queueing
2535   *
2536   * Return: %false if @work was already on a queue, %true otherwise.  If
2537   * @delay is zero and @dwork is idle, it will be scheduled for immediate
2538   * execution.
2539   */
2540  bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
2541  			   struct delayed_work *dwork, unsigned long delay)
2542  {
2543  	struct work_struct *work = &dwork->work;
2544  	bool ret = false;
2545  	unsigned long irq_flags;
2546  
2547  	/* read the comment in __queue_work() */
2548  	local_irq_save(irq_flags);
2549  
2550  	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
2551  	    !clear_pending_if_disabled(work)) {
2552  		__queue_delayed_work(cpu, wq, dwork, delay);
2553  		ret = true;
2554  	}
2555  
2556  	local_irq_restore(irq_flags);
2557  	return ret;
2558  }
2559  EXPORT_SYMBOL(queue_delayed_work_on);
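/*
 * Editorial sketch (hypothetical names, not part of the original file):
 * arming a delayed work item per the semantics documented above. A zero
 * @delay would bypass the timer and queue immediately.
 */
static void example_dwork_fn(struct work_struct *work)
{
	pr_info("delayed work fired\n");
}

static __maybe_unused DECLARE_DELAYED_WORK(example_dwork, example_dwork_fn);

static void __maybe_unused example_queue_delayed(void)
{
	/* queue on the local CPU roughly 100ms from now */
	queue_delayed_work_on(WORK_CPU_UNBOUND, system_wq, &example_dwork,
			      HZ / 10);
}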
2560  
2561  /**
2562   * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
2563   * @cpu: CPU number to execute work on
2564   * @wq: workqueue to use
2565   * @dwork: work to queue
2566   * @delay: number of jiffies to wait before queueing
2567   *
2568   * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
2569   * modify @dwork's timer so that it expires after @delay.  If @delay is
2570   * zero, @work is guaranteed to be scheduled immediately regardless of its
2571   * current state.
2572   *
2573   * Return: %false if @dwork was idle and queued, %true if @dwork was
2574   * pending and its timer was modified.
2575   *
2576   * This function is safe to call from any context including IRQ handler.
2577   * See try_to_grab_pending() for details.
2578   */
2579  bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
2580  			 struct delayed_work *dwork, unsigned long delay)
2581  {
2582  	unsigned long irq_flags;
2583  	bool ret;
2584  
2585  	ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags);
2586  
2587  	if (!clear_pending_if_disabled(&dwork->work))
2588  		__queue_delayed_work(cpu, wq, dwork, delay);
2589  
2590  	local_irq_restore(irq_flags);
2591  	return ret;
2592  }
2593  EXPORT_SYMBOL_GPL(mod_delayed_work_on);
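/*
 * Editorial sketch (hypothetical names, not part of the original file): the
 * common "debounce" pattern built on mod_delayed_work_on(). Every incoming
 * event pushes the expiry out again, so the handler runs once, HZ/2 after
 * the last event, no matter how many events arrived in between.
 */
static void example_debounce_fn(struct work_struct *work)
{
	pr_info("debounced event handled\n");
}

static __maybe_unused DECLARE_DELAYED_WORK(example_debounce_work,
					   example_debounce_fn);

static void __maybe_unused example_debounce_event(void)
{
	mod_delayed_work_on(WORK_CPU_UNBOUND, system_wq,
			    &example_debounce_work, HZ / 2);
}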
2594  
2595  static void rcu_work_rcufn(struct rcu_head *rcu)
2596  {
2597  	struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu);
2598  
2599  	/* read the comment in __queue_work() */
2600  	local_irq_disable();
2601  	__queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
2602  	local_irq_enable();
2603  }
2604  
2605  /**
2606   * queue_rcu_work - queue work after a RCU grace period
2607   * @wq: workqueue to use
2608   * @rwork: work to queue
2609   *
2610   * Return: %false if @rwork was already pending, %true otherwise.  Note
2611   * that a full RCU grace period is guaranteed only after a %true return.
2612   * While @rwork is guaranteed to be executed after a %false return, the
2613   * execution may happen before a full RCU grace period has passed.
2614   */
2615  bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
2616  {
2617  	struct work_struct *work = &rwork->work;
2618  
2619  	/*
2620  	 * rcu_work can't be canceled or disabled. Warn if the user reached
2621  	 * inside @rwork and disabled the inner work.
2622  	 */
2623  	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
2624  	    !WARN_ON_ONCE(clear_pending_if_disabled(work))) {
2625  		rwork->wq = wq;
2626  		call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);
2627  		return true;
2628  	}
2629  
2630  	return false;
2631  }
2632  EXPORT_SYMBOL(queue_rcu_work);
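/*
 * Editorial sketch (hypothetical names, not part of the original file):
 * deferring a kfree() until after an RCU grace period via queue_rcu_work(),
 * per the guarantee documented above (only a %true return implies a full
 * grace period before execution).
 */
struct example_rcu_obj {
	struct rcu_work rwork;
	/* ... payload ... */
};

static void example_rcu_free_fn(struct work_struct *work)
{
	struct example_rcu_obj *obj =
		container_of(to_rcu_work(work), struct example_rcu_obj, rwork);

	kfree(obj);
}

static void __maybe_unused example_defer_free(struct example_rcu_obj *obj)
{
	INIT_RCU_WORK(&obj->rwork, example_rcu_free_fn);
	queue_rcu_work(system_wq, &obj->rwork);
}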
2633  
2634  static struct worker *alloc_worker(int node)
2635  {
2636  	struct worker *worker;
2637  
2638  	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
2639  	if (worker) {
2640  		INIT_LIST_HEAD(&worker->entry);
2641  		INIT_LIST_HEAD(&worker->scheduled);
2642  		INIT_LIST_HEAD(&worker->node);
2643  		/* on creation a worker is in !idle && prep state */
2644  		worker->flags = WORKER_PREP;
2645  	}
2646  	return worker;
2647  }
2648  
2649  static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
2650  {
2651  	if (pool->cpu < 0 && pool->attrs->affn_strict)
2652  		return pool->attrs->__pod_cpumask;
2653  	else
2654  		return pool->attrs->cpumask;
2655  }
2656  
2657  /**
2658   * worker_attach_to_pool() - attach a worker to a pool
2659   * @worker: worker to be attached
2660   * @pool: the target pool
2661   *
2662   * Attach @worker to @pool.  Once attached, the %WORKER_UNBOUND flag and
2663   * cpu-binding of @worker are kept coordinated with the pool across
2664   * cpu-[un]hotplugs.
2665   */
2666  static void worker_attach_to_pool(struct worker *worker,
2667  				  struct worker_pool *pool)
2668  {
2669  	mutex_lock(&wq_pool_attach_mutex);
2670  
2671  	/*
2672  	 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable
2673  	 * across this function. See the comments above the flag definition for
2674  	 * details. BH workers are, while per-CPU, always DISASSOCIATED.
2675  	 */
2676  	if (pool->flags & POOL_DISASSOCIATED) {
2677  		worker->flags |= WORKER_UNBOUND;
2678  	} else {
2679  		WARN_ON_ONCE(pool->flags & POOL_BH);
2680  		kthread_set_per_cpu(worker->task, pool->cpu);
2681  	}
2682  
2683  	if (worker->rescue_wq)
2684  		set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));
2685  
2686  	list_add_tail(&worker->node, &pool->workers);
2687  	worker->pool = pool;
2688  
2689  	mutex_unlock(&wq_pool_attach_mutex);
2690  }
2691  
2692  static void unbind_worker(struct worker *worker)
2693  {
2694  	lockdep_assert_held(&wq_pool_attach_mutex);
2695  
2696  	kthread_set_per_cpu(worker->task, -1);
2697  	if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
2698  		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
2699  	else
2700  		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
2701  }
2702  
2703  
2704  static void detach_worker(struct worker *worker)
2705  {
2706  	lockdep_assert_held(&wq_pool_attach_mutex);
2707  
2708  	unbind_worker(worker);
2709  	list_del(&worker->node);
2710  }
2711  
2712  /**
2713   * worker_detach_from_pool() - detach a worker from its pool
2714   * @worker: worker which is attached to its pool
2715   *
2716   * Undo the attaching which had been done in worker_attach_to_pool().  The
2717   * caller worker shouldn't access the pool after detaching unless it holds
2718   * another reference to the pool.
2719   */
2720  static void worker_detach_from_pool(struct worker *worker)
2721  {
2722  	struct worker_pool *pool = worker->pool;
2723  
2724  	/* there is one permanent BH worker per CPU which should never detach */
2725  	WARN_ON_ONCE(pool->flags & POOL_BH);
2726  
2727  	mutex_lock(&wq_pool_attach_mutex);
2728  	detach_worker(worker);
2729  	worker->pool = NULL;
2730  	mutex_unlock(&wq_pool_attach_mutex);
2731  
2732  	/* clear leftover flags without pool->lock after it is detached */
2733  	worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
2734  }
2735  
2736  static int format_worker_id(char *buf, size_t size, struct worker *worker,
2737  			    struct worker_pool *pool)
2738  {
2739  	if (worker->rescue_wq)
2740  		return scnprintf(buf, size, "kworker/R-%s",
2741  				 worker->rescue_wq->name);
2742  
2743  	if (pool) {
2744  		if (pool->cpu >= 0)
2745  			return scnprintf(buf, size, "kworker/%d:%d%s",
2746  					 pool->cpu, worker->id,
2747  					 pool->attrs->nice < 0  ? "H" : "");
2748  		else
2749  			return scnprintf(buf, size, "kworker/u%d:%d",
2750  					 pool->id, worker->id);
2751  	} else {
2752  		return scnprintf(buf, size, "kworker/dying");
2753  	}
2754  }
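/*
 * Editorial note (not in the original source): the formats above yield the
 * familiar task names, e.g. "kworker/3:1" for a per-CPU worker, "kworker/3:1H"
 * for its high-priority sibling pool, "kworker/u8:2" for an unbound pool and
 * "kworker/R-" followed by the workqueue name for a rescuer.
 */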
2755  
2756  /**
2757   * create_worker - create a new workqueue worker
2758   * @pool: pool the new worker will belong to
2759   *
2760   * Create and start a new worker which is attached to @pool.
2761   *
2762   * CONTEXT:
2763   * Might sleep.  Does GFP_KERNEL allocations.
2764   *
2765   * Return:
2766   * Pointer to the newly created worker, or %NULL if worker creation failed.
2767   */
2768  static struct worker *create_worker(struct worker_pool *pool)
2769  {
2770  	struct worker *worker;
2771  	int id;
2772  
2773  	/* ID is needed to determine kthread name */
2774  	id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
2775  	if (id < 0) {
2776  		pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
2777  			    ERR_PTR(id));
2778  		return NULL;
2779  	}
2780  
2781  	worker = alloc_worker(pool->node);
2782  	if (!worker) {
2783  		pr_err_once("workqueue: Failed to allocate a worker\n");
2784  		goto fail;
2785  	}
2786  
2787  	worker->id = id;
2788  
2789  	if (!(pool->flags & POOL_BH)) {
2790  		char id_buf[WORKER_ID_LEN];
2791  
2792  		format_worker_id(id_buf, sizeof(id_buf), worker, pool);
2793  		worker->task = kthread_create_on_node(worker_thread, worker,
2794  						      pool->node, "%s", id_buf);
2795  		if (IS_ERR(worker->task)) {
2796  			if (PTR_ERR(worker->task) == -EINTR) {
2797  				pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n",
2798  				       id_buf);
2799  			} else {
2800  				pr_err_once("workqueue: Failed to create a worker thread: %pe",
2801  					    worker->task);
2802  			}
2803  			goto fail;
2804  		}
2805  
2806  		set_user_nice(worker->task, pool->attrs->nice);
2807  		kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
2808  	}
2809  
2810  	/* successful, attach the worker to the pool */
2811  	worker_attach_to_pool(worker, pool);
2812  
2813  	/* start the newly created worker */
2814  	raw_spin_lock_irq(&pool->lock);
2815  
2816  	worker->pool->nr_workers++;
2817  	worker_enter_idle(worker);
2818  
2819  	/*
2820  	 * @worker is waiting on a completion in kthread() and will trigger a hung
2821  	 * check if not woken up soon. As kick_pool() is a noop if @pool is empty,
2822  	 * wake it up explicitly.
2823  	 */
2824  	if (worker->task)
2825  		wake_up_process(worker->task);
2826  
2827  	raw_spin_unlock_irq(&pool->lock);
2828  
2829  	return worker;
2830  
2831  fail:
2832  	ida_free(&pool->worker_ida, id);
2833  	kfree(worker);
2834  	return NULL;
2835  }
2836  
2837  static void detach_dying_workers(struct list_head *cull_list)
2838  {
2839  	struct worker *worker;
2840  
2841  	list_for_each_entry(worker, cull_list, entry)
2842  		detach_worker(worker);
2843  }
2844  
2845  static void reap_dying_workers(struct list_head *cull_list)
2846  {
2847  	struct worker *worker, *tmp;
2848  
2849  	list_for_each_entry_safe(worker, tmp, cull_list, entry) {
2850  		list_del_init(&worker->entry);
2851  		kthread_stop_put(worker->task);
2852  		kfree(worker);
2853  	}
2854  }
2855  
2856  /**
2857   * set_worker_dying - Tag a worker for destruction
2858   * @worker: worker to be destroyed
2859   * @list: transfer worker away from its pool->idle_list and into list
2860   *
2861   * Tag @worker for destruction and adjust @pool stats accordingly.  The worker
2862   * should be idle.
2863   *
2864   * CONTEXT:
2865   * raw_spin_lock_irq(pool->lock).
2866   */
2867  static void set_worker_dying(struct worker *worker, struct list_head *list)
2868  {
2869  	struct worker_pool *pool = worker->pool;
2870  
2871  	lockdep_assert_held(&pool->lock);
2872  	lockdep_assert_held(&wq_pool_attach_mutex);
2873  
2874  	/* sanity check frenzy */
2875  	if (WARN_ON(worker->current_work) ||
2876  	    WARN_ON(!list_empty(&worker->scheduled)) ||
2877  	    WARN_ON(!(worker->flags & WORKER_IDLE)))
2878  		return;
2879  
2880  	pool->nr_workers--;
2881  	pool->nr_idle--;
2882  
2883  	worker->flags |= WORKER_DIE;
2884  
2885  	list_move(&worker->entry, list);
2886  
2887  	/* get an extra task struct reference for later kthread_stop_put() */
2888  	get_task_struct(worker->task);
2889  }
2890  
2891  /**
2892   * idle_worker_timeout - check if some idle workers can now be deleted.
2893   * @t: The pool's idle_timer that just expired
2894   *
2895   * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
2896   * worker_leave_idle(), as a worker flicking between idle and active while its
2897   * pool is at the too_many_workers() tipping point would cause too much timer
2898   * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
2899   * it expire and re-evaluate things from there.
2900   */
2901  static void idle_worker_timeout(struct timer_list *t)
2902  {
2903  	struct worker_pool *pool = from_timer(pool, t, idle_timer);
2904  	bool do_cull = false;
2905  
2906  	if (work_pending(&pool->idle_cull_work))
2907  		return;
2908  
2909  	raw_spin_lock_irq(&pool->lock);
2910  
2911  	if (too_many_workers(pool)) {
2912  		struct worker *worker;
2913  		unsigned long expires;
2914  
2915  		/* idle_list is kept in LIFO order, check the last one */
2916  		worker = list_last_entry(&pool->idle_list, struct worker, entry);
2917  		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2918  		do_cull = !time_before(jiffies, expires);
2919  
2920  		if (!do_cull)
2921  			mod_timer(&pool->idle_timer, expires);
2922  	}
2923  	raw_spin_unlock_irq(&pool->lock);
2924  
2925  	if (do_cull)
2926  		queue_work(system_unbound_wq, &pool->idle_cull_work);
2927  }
2928  
2929  /**
2930   * idle_cull_fn - cull workers that have been idle for too long.
2931   * @work: the pool's work for handling these idle workers
2932   *
2933   * This goes through a pool's idle workers and gets rid of those that have been
2934   * idle for at least IDLE_WORKER_TIMEOUT seconds.
2935   *
2936   * We don't want to disturb isolated CPUs because of a pcpu kworker being
2937   * culled, so this also resets worker affinity. This requires a sleepable
2938   * context, hence the split between timer callback and work item.
2939   */
2940  static void idle_cull_fn(struct work_struct *work)
2941  {
2942  	struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
2943  	LIST_HEAD(cull_list);
2944  
2945  	/*
2946  	 * Grabbing wq_pool_attach_mutex here ensures an already-running worker
2947  	 * cannot proceed beyond set_pf_worker() in its self-destruct path.
2948  	 * This is required as a previously-preempted worker could run after
2949  	 * set_worker_dying() has happened but before detach_dying_workers() did.
2950  	 */
2951  	mutex_lock(&wq_pool_attach_mutex);
2952  	raw_spin_lock_irq(&pool->lock);
2953  
2954  	while (too_many_workers(pool)) {
2955  		struct worker *worker;
2956  		unsigned long expires;
2957  
2958  		worker = list_last_entry(&pool->idle_list, struct worker, entry);
2959  		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2960  
2961  		if (time_before(jiffies, expires)) {
2962  			mod_timer(&pool->idle_timer, expires);
2963  			break;
2964  		}
2965  
2966  		set_worker_dying(worker, &cull_list);
2967  	}
2968  
2969  	raw_spin_unlock_irq(&pool->lock);
2970  	detach_dying_workers(&cull_list);
2971  	mutex_unlock(&wq_pool_attach_mutex);
2972  
2973  	reap_dying_workers(&cull_list);
2974  }
2975  
2976  static void send_mayday(struct work_struct *work)
2977  {
2978  	struct pool_workqueue *pwq = get_work_pwq(work);
2979  	struct workqueue_struct *wq = pwq->wq;
2980  
2981  	lockdep_assert_held(&wq_mayday_lock);
2982  
2983  	if (!wq->rescuer)
2984  		return;
2985  
2986  	/* mayday mayday mayday */
2987  	if (list_empty(&pwq->mayday_node)) {
2988  		/*
2989  		 * If @pwq is for an unbound wq, its base ref may be put at
2990  		 * any time due to an attribute change.  Pin @pwq until the
2991  		 * rescuer is done with it.
2992  		 */
2993  		get_pwq(pwq);
2994  		list_add_tail(&pwq->mayday_node, &wq->maydays);
2995  		wake_up_process(wq->rescuer->task);
2996  		pwq->stats[PWQ_STAT_MAYDAY]++;
2997  	}
2998  }
2999  
3000  static void pool_mayday_timeout(struct timer_list *t)
3001  {
3002  	struct worker_pool *pool = from_timer(pool, t, mayday_timer);
3003  	struct work_struct *work;
3004  
3005  	raw_spin_lock_irq(&pool->lock);
3006  	raw_spin_lock(&wq_mayday_lock);		/* for wq->maydays */
3007  
3008  	if (need_to_create_worker(pool)) {
3009  		/*
3010  		 * We've been trying to create a new worker but
3011  		 * haven't been successful.  We might be hitting an
3012  		 * allocation deadlock.  Send distress signals to
3013  		 * rescuers.
3014  		 */
3015  		list_for_each_entry(work, &pool->worklist, entry)
3016  			send_mayday(work);
3017  	}
3018  
3019  	raw_spin_unlock(&wq_mayday_lock);
3020  	raw_spin_unlock_irq(&pool->lock);
3021  
3022  	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
3023  }
3024  
3025  /**
3026   * maybe_create_worker - create a new worker if necessary
3027   * @pool: pool to create a new worker for
3028   *
3029   * Create a new worker for @pool if necessary.  @pool is guaranteed to
3030   * have at least one idle worker on return from this function.  If
3031   * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
3032   * sent to all rescuers with works scheduled on @pool to resolve
3033   * possible allocation deadlock.
3034   *
3035   * On return, need_to_create_worker() is guaranteed to be %false and
3036   * may_start_working() %true.
3037   *
3038   * LOCKING:
3039   * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
3040   * multiple times.  Does GFP_KERNEL allocations.  Called only from
3041   * manager.
3042   */
3043  static void maybe_create_worker(struct worker_pool *pool)
3044  __releases(&pool->lock)
3045  __acquires(&pool->lock)
3046  {
3047  restart:
3048  	raw_spin_unlock_irq(&pool->lock);
3049  
3050  	/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
3051  	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
3052  
3053  	while (true) {
3054  		if (create_worker(pool) || !need_to_create_worker(pool))
3055  			break;
3056  
3057  		schedule_timeout_interruptible(CREATE_COOLDOWN);
3058  
3059  		if (!need_to_create_worker(pool))
3060  			break;
3061  	}
3062  
3063  	del_timer_sync(&pool->mayday_timer);
3064  	raw_spin_lock_irq(&pool->lock);
3065  	/*
3066  	 * This is necessary even after a new worker was just successfully
3067  	 * created as @pool->lock was dropped and the new worker might have
3068  	 * already become busy.
3069  	 */
3070  	if (need_to_create_worker(pool))
3071  		goto restart;
3072  }
3073  
3074  /**
3075   * manage_workers - manage worker pool
3076   * @worker: self
3077   *
3078   * Assume the manager role and manage the worker pool @worker belongs
3079   * to.  At any given time, there can be only zero or one manager per
3080   * pool.  The exclusion is handled automatically by this function.
3081   *
3082   * The caller can safely start processing works on false return.  On
3083   * true return, it's guaranteed that need_to_create_worker() is false
3084   * and may_start_working() is true.
3085   *
3086   * CONTEXT:
3087   * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
3088   * multiple times.  Does GFP_KERNEL allocations.
3089   *
3090   * Return:
3091   * %false if the pool doesn't need management and the caller can safely
3092   * start processing works, %true if management function was performed and
3093   * the conditions that the caller verified before calling the function may
3094   * no longer be true.
3095   */
3096  static bool manage_workers(struct worker *worker)
3097  {
3098  	struct worker_pool *pool = worker->pool;
3099  
3100  	if (pool->flags & POOL_MANAGER_ACTIVE)
3101  		return false;
3102  
3103  	pool->flags |= POOL_MANAGER_ACTIVE;
3104  	pool->manager = worker;
3105  
3106  	maybe_create_worker(pool);
3107  
3108  	pool->manager = NULL;
3109  	pool->flags &= ~POOL_MANAGER_ACTIVE;
3110  	rcuwait_wake_up(&manager_wait);
3111  	return true;
3112  }
3113  
3114  /**
3115   * process_one_work - process single work
3116   * @worker: self
3117   * @work: work to process
3118   *
3119   * Process @work.  This function contains all the logic necessary to
3120   * process a single work including synchronization against and
3121   * interaction with other workers on the same cpu, queueing and
3122   * flushing.  As long as the context requirement is met, any worker can
3123   * call this function to process a work.
3124   *
3125   * CONTEXT:
3126   * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
3127   */
3128  static void process_one_work(struct worker *worker, struct work_struct *work)
3129  __releases(&pool->lock)
3130  __acquires(&pool->lock)
3131  {
3132  	struct pool_workqueue *pwq = get_work_pwq(work);
3133  	struct worker_pool *pool = worker->pool;
3134  	unsigned long work_data;
3135  	int lockdep_start_depth, rcu_start_depth;
3136  	bool bh_draining = pool->flags & POOL_BH_DRAINING;
3137  #ifdef CONFIG_LOCKDEP
3138  	/*
3139  	 * It is permissible to free the struct work_struct from
3140  	 * inside the function that is called from it; we need to take
3141  	 * this into account for lockdep too.  To avoid bogus "held
3142  	 * lock freed" warnings as well as problems when looking into
3143  	 * work->lockdep_map, make a copy and use that here.
3144  	 */
3145  	struct lockdep_map lockdep_map;
3146  
3147  	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
3148  #endif
3149  	/* ensure we're on the correct CPU */
3150  	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
3151  		     raw_smp_processor_id() != pool->cpu);
3152  
3153  	/* claim and dequeue */
3154  	debug_work_deactivate(work);
3155  	hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
3156  	worker->current_work = work;
3157  	worker->current_func = work->func;
3158  	worker->current_pwq = pwq;
3159  	if (worker->task)
3160  		worker->current_at = worker->task->se.sum_exec_runtime;
3161  	work_data = *work_data_bits(work);
3162  	worker->current_color = get_work_color(work_data);
3163  
3164  	/*
3165  	 * Record wq name for cmdline and debug reporting, may get
3166  	 * overridden through set_worker_desc().
3167  	 */
3168  	strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
3169  
3170  	list_del_init(&work->entry);
3171  
3172  	/*
3173  	 * CPU intensive works don't participate in concurrency management.
3174  	 * They're the scheduler's responsibility.  This takes @worker out
3175  	 * of concurrency management and the next code block will chain
3176  	 * execution of the pending work items.
3177  	 */
3178  	if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
3179  		worker_set_flags(worker, WORKER_CPU_INTENSIVE);
3180  
3181  	/*
3182  	 * Kick @pool if necessary. It's always a noop for per-cpu worker pools
3183  	 * since nr_running would always be >= 1 at this point. This is used to
3184  	 * chain execution of the pending work items for WORKER_NOT_RUNNING
3185  	 * workers such as the UNBOUND and CPU_INTENSIVE ones.
3186  	 */
3187  	kick_pool(pool);
3188  
3189  	/*
3190  	 * Record the last pool and clear PENDING which should be the last
3191  	 * update to @work.  Also, do this inside @pool->lock so that
3192  	 * PENDING and queued state changes happen together while IRQ is
3193  	 * disabled.
3194  	 */
3195  	set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool));
3196  
3197  	pwq->stats[PWQ_STAT_STARTED]++;
3198  	raw_spin_unlock_irq(&pool->lock);
3199  
3200  	rcu_start_depth = rcu_preempt_depth();
3201  	lockdep_start_depth = lockdep_depth(current);
3202  	/* see drain_dead_softirq_workfn() */
3203  	if (!bh_draining)
3204  		lock_map_acquire(pwq->wq->lockdep_map);
3205  	lock_map_acquire(&lockdep_map);
3206  	/*
3207  	 * Strictly speaking we should mark the invariant state without holding
3208  	 * any locks, that is, before these two lock_map_acquire()'s.
3209  	 *
3210  	 * However, that would result in:
3211  	 *
3212  	 *   A(W1)
3213  	 *   WFC(C)
3214  	 *		A(W1)
3215  	 *		C(C)
3216  	 *
3217  	 * Which would create W1->C->W1 dependencies, even though there is no
3218  	 * actual deadlock possible. There are two solutions, using a
3219  	 * read-recursive acquire on the work(queue) 'locks', but this will then
3220  	 * hit the lockdep limitation on recursive locks, or simply discard
3221  	 * these locks.
3222  	 *
3223  	 * AFAICT there is no possible deadlock scenario between the
3224  	 * flush_work() and complete() primitives (except for single-threaded
3225  	 * workqueues), so hiding them isn't a problem.
3226  	 */
3227  	lockdep_invariant_state(true);
3228  	trace_workqueue_execute_start(work);
3229  	worker->current_func(work);
3230  	/*
3231  	 * While we must be careful to not use "work" after this, the trace
3232  	 * point will only record its address.
3233  	 */
3234  	trace_workqueue_execute_end(work, worker->current_func);
3235  	pwq->stats[PWQ_STAT_COMPLETED]++;
3236  	lock_map_release(&lockdep_map);
3237  	if (!bh_draining)
3238  		lock_map_release(pwq->wq->lockdep_map);
3239  
3240  	if (unlikely((worker->task && in_atomic()) ||
3241  		     lockdep_depth(current) != lockdep_start_depth ||
3242  		     rcu_preempt_depth() != rcu_start_depth)) {
3243  		pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n"
3244  		       "     preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n",
3245  		       current->comm, task_pid_nr(current), preempt_count(),
3246  		       lockdep_start_depth, lockdep_depth(current),
3247  		       rcu_start_depth, rcu_preempt_depth(),
3248  		       worker->current_func);
3249  		debug_show_held_locks(current);
3250  		dump_stack();
3251  	}
3252  
3253  	/*
3254  	 * The following prevents a kworker from hogging CPU on !PREEMPTION
3255  	 * kernels, where a requeueing work item waiting for something to
3256  	 * happen could deadlock with stop_machine as such work item could
3257  	 * indefinitely requeue itself while all other CPUs are trapped in
3258  	 * stop_machine. At the same time, report a quiescent RCU state so
3259  	 * the same condition doesn't freeze RCU.
3260  	 */
3261  	if (worker->task)
3262  		cond_resched();
3263  
3264  	raw_spin_lock_irq(&pool->lock);
3265  
3266  	/*
3267  	 * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
3268  	 * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
3269  	 * wq_cpu_intensive_thresh_us. Clear it.
3270  	 */
3271  	worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
3272  
3273  	/* tag the worker for identification in schedule() */
3274  	worker->last_func = worker->current_func;
3275  
3276  	/* we're done with it, release */
3277  	hash_del(&worker->hentry);
3278  	worker->current_work = NULL;
3279  	worker->current_func = NULL;
3280  	worker->current_pwq = NULL;
3281  	worker->current_color = INT_MAX;
3282  
3283  	/* must be the last step, see the function comment */
3284  	pwq_dec_nr_in_flight(pwq, work_data);
3285  }
3286  
3287  /**
3288   * process_scheduled_works - process scheduled works
3289   * @worker: self
3290   *
3291   * Process all scheduled works.  Please note that the scheduled list
3292   * may change while processing a work, so this function repeatedly
3293   * fetches a work from the top and executes it.
3294   *
3295   * CONTEXT:
3296   * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
3297   * multiple times.
3298   */
3299  static void process_scheduled_works(struct worker *worker)
3300  {
3301  	struct work_struct *work;
3302  	bool first = true;
3303  
3304  	while ((work = list_first_entry_or_null(&worker->scheduled,
3305  						struct work_struct, entry))) {
3306  		if (first) {
3307  			worker->pool->watchdog_ts = jiffies;
3308  			first = false;
3309  		}
3310  		process_one_work(worker, work);
3311  	}
3312  }
3313  
3314  static void set_pf_worker(bool val)
3315  {
3316  	mutex_lock(&wq_pool_attach_mutex);
3317  	if (val)
3318  		current->flags |= PF_WQ_WORKER;
3319  	else
3320  		current->flags &= ~PF_WQ_WORKER;
3321  	mutex_unlock(&wq_pool_attach_mutex);
3322  }
3323  
3324  /**
3325   * worker_thread - the worker thread function
3326   * @__worker: self
3327   *
3328   * The worker thread function.  All workers belong to a worker_pool -
3329   * either a per-cpu one or dynamic unbound one.  These workers process all
3330   * work items regardless of their specific target workqueue.  The only
3331   * exception is work items which belong to workqueues with a rescuer which
3332   * will be explained in rescuer_thread().
3333   *
3334   * Return: 0
3335   */
3336  static int worker_thread(void *__worker)
3337  {
3338  	struct worker *worker = __worker;
3339  	struct worker_pool *pool = worker->pool;
3340  
3341  	/* tell the scheduler that this is a workqueue worker */
3342  	set_pf_worker(true);
3343  woke_up:
3344  	raw_spin_lock_irq(&pool->lock);
3345  
3346  	/* am I supposed to die? */
3347  	if (unlikely(worker->flags & WORKER_DIE)) {
3348  		raw_spin_unlock_irq(&pool->lock);
3349  		set_pf_worker(false);
3350  		/*
3351  		 * The worker is dead and PF_WQ_WORKER is cleared, so worker->pool
3352  		 * shouldn't be accessed anymore; reset it to NULL to catch any stray use.
3353  		 */
3354  		worker->pool = NULL;
3355  		ida_free(&pool->worker_ida, worker->id);
3356  		return 0;
3357  	}
3358  
3359  	worker_leave_idle(worker);
3360  recheck:
3361  	/* no more worker necessary? */
3362  	if (!need_more_worker(pool))
3363  		goto sleep;
3364  
3365  	/* do we need to manage? */
3366  	if (unlikely(!may_start_working(pool)) && manage_workers(worker))
3367  		goto recheck;
3368  
3369  	/*
3370  	 * ->scheduled list can only be filled while a worker is
3371  	 * preparing to process a work or actually processing it.
3372  	 * Make sure nobody diddled with it while I was sleeping.
3373  	 */
3374  	WARN_ON_ONCE(!list_empty(&worker->scheduled));
3375  
3376  	/*
3377  	 * Finish PREP stage.  We're guaranteed to have at least one idle
3378  	 * worker or that someone else has already assumed the manager
3379  	 * role.  This is where @worker starts participating in concurrency
3380  	 * management if applicable and concurrency management is restored
3381  	 * after being rebound.  See rebind_workers() for details.
3382  	 */
3383  	worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
3384  
3385  	do {
3386  		struct work_struct *work =
3387  			list_first_entry(&pool->worklist,
3388  					 struct work_struct, entry);
3389  
3390  		if (assign_work(work, worker, NULL))
3391  			process_scheduled_works(worker);
3392  	} while (keep_working(pool));
3393  
3394  	worker_set_flags(worker, WORKER_PREP);
3395  sleep:
3396  	/*
3397  	 * pool->lock is held and there's no work to process and no need to
3398  	 * manage, sleep.  Workers are woken up only while holding
3399  	 * pool->lock or from local cpu, so setting the current state
3400  	 * before releasing pool->lock is enough to prevent losing any
3401  	 * event.
3402  	 */
3403  	worker_enter_idle(worker);
3404  	__set_current_state(TASK_IDLE);
3405  	raw_spin_unlock_irq(&pool->lock);
3406  	schedule();
3407  	goto woke_up;
3408  }
3409  
3410  /**
3411   * rescuer_thread - the rescuer thread function
3412   * @__rescuer: self
3413   *
3414   * Workqueue rescuer thread function.  There's one rescuer for each
3415   * workqueue which has WQ_MEM_RECLAIM set.
3416   *
3417   * Regular work processing on a pool may block trying to create a new
3418   * worker, which uses a GFP_KERNEL allocation that has a slight chance of
3419   * developing into a deadlock if some works currently on the same queue
3420   * need to be processed to satisfy the GFP_KERNEL allocation.  This is
3421   * the problem rescuer solves.
3422   *
3423   * When such condition is possible, the pool summons rescuers of all
3424   * workqueues which have works queued on the pool and lets them process
3425   * those works so that forward progress can be guaranteed.
3426   *
3427   * This should happen rarely.
3428   *
3429   * Return: 0
3430   */
3431  static int rescuer_thread(void *__rescuer)
3432  {
3433  	struct worker *rescuer = __rescuer;
3434  	struct workqueue_struct *wq = rescuer->rescue_wq;
3435  	bool should_stop;
3436  
3437  	set_user_nice(current, RESCUER_NICE_LEVEL);
3438  
3439  	/*
3440  	 * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
3441  	 * doesn't participate in concurrency management.
3442  	 */
3443  	set_pf_worker(true);
3444  repeat:
3445  	set_current_state(TASK_IDLE);
3446  
3447  	/*
3448  	 * By the time the rescuer is requested to stop, the workqueue
3449  	 * shouldn't have any work pending, but @wq->maydays may still have
3450  	 * pwq(s) queued.  This can happen by non-rescuer workers consuming
3451  	 * all the work items before the rescuer got to them.  Go through
3452  	 * @wq->maydays processing before acting on should_stop so that the
3453  	 * list is always empty on exit.
3454  	 */
3455  	should_stop = kthread_should_stop();
3456  
3457  	/* see whether any pwq is asking for help */
3458  	raw_spin_lock_irq(&wq_mayday_lock);
3459  
3460  	while (!list_empty(&wq->maydays)) {
3461  		struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
3462  					struct pool_workqueue, mayday_node);
3463  		struct worker_pool *pool = pwq->pool;
3464  		struct work_struct *work, *n;
3465  
3466  		__set_current_state(TASK_RUNNING);
3467  		list_del_init(&pwq->mayday_node);
3468  
3469  		raw_spin_unlock_irq(&wq_mayday_lock);
3470  
3471  		worker_attach_to_pool(rescuer, pool);
3472  
3473  		raw_spin_lock_irq(&pool->lock);
3474  
3475  		/*
3476  		 * Slurp in all works issued via this workqueue and
3477  		 * process'em.
3478  		 */
3479  		WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
3480  		list_for_each_entry_safe(work, n, &pool->worklist, entry) {
3481  			if (get_work_pwq(work) == pwq &&
3482  			    assign_work(work, rescuer, &n))
3483  				pwq->stats[PWQ_STAT_RESCUED]++;
3484  		}
3485  
3486  		if (!list_empty(&rescuer->scheduled)) {
3487  			process_scheduled_works(rescuer);
3488  
3489  			/*
3490  			 * The above execution of rescued work items could
3491  			 * have created more to rescue through
3492  			 * pwq_activate_first_inactive() or chained
3493  			 * queueing.  Let's put @pwq back on mayday list so
3494  			 * that such back-to-back work items, which may be
3495  			 * being used to relieve memory pressure, don't
3496  			 * incur MAYDAY_INTERVAL delay in between.
3497  			 */
3498  			if (pwq->nr_active && need_to_create_worker(pool)) {
3499  				raw_spin_lock(&wq_mayday_lock);
3500  				/*
3501  				 * Queue iff we aren't racing destruction
3502  				 * and somebody else hasn't queued it already.
3503  				 */
3504  				if (wq->rescuer && list_empty(&pwq->mayday_node)) {
3505  					get_pwq(pwq);
3506  					list_add_tail(&pwq->mayday_node, &wq->maydays);
3507  				}
3508  				raw_spin_unlock(&wq_mayday_lock);
3509  			}
3510  		}
3511  
3512  		/*
3513  		 * Put the reference grabbed by send_mayday().  @pool won't
3514  		 * go away while we're still attached to it.
3515  		 */
3516  		put_pwq(pwq);
3517  
3518  		/*
3519  		 * Leave this pool. Notify regular workers; otherwise, we end up
3520  		 * with 0 concurrency and stalling the execution.
3521  		 */
3522  		kick_pool(pool);
3523  
3524  		raw_spin_unlock_irq(&pool->lock);
3525  
3526  		worker_detach_from_pool(rescuer);
3527  
3528  		raw_spin_lock_irq(&wq_mayday_lock);
3529  	}
3530  
3531  	raw_spin_unlock_irq(&wq_mayday_lock);
3532  
3533  	if (should_stop) {
3534  		__set_current_state(TASK_RUNNING);
3535  		set_pf_worker(false);
3536  		return 0;
3537  	}
3538  
3539  	/* rescuers should never participate in concurrency management */
3540  	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
3541  	schedule();
3542  	goto repeat;
3543  }
3544  
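/*
 * Illustrative sketch (not part of the kernel source proper): a subsystem in
 * the memory-reclaim path gets a rescuer like the one above by creating its
 * workqueue with WQ_MEM_RECLAIM.  xyz_reclaim_wq, xyz_writeback_work and
 * xyz_writeback_fn() are hypothetical names used only for the example.
 *
 *	static struct workqueue_struct *xyz_reclaim_wq;
 *	static DECLARE_WORK(xyz_writeback_work, xyz_writeback_fn);
 *
 *	static int __init xyz_init(void)
 *	{
 *		xyz_reclaim_wq = alloc_workqueue("xyz_reclaim", WQ_MEM_RECLAIM, 0);
 *		if (!xyz_reclaim_wq)
 *			return -ENOMEM;
 *		return 0;
 *	}
 *
 *	queue_work(xyz_reclaim_wq, &xyz_writeback_work);
 *
 * Queueing keeps making forward progress under memory pressure because the
 * rescuer can execute the item even when no new kworker can be created.
 */
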
3545  static void bh_worker(struct worker *worker)
3546  {
3547  	struct worker_pool *pool = worker->pool;
3548  	int nr_restarts = BH_WORKER_RESTARTS;
3549  	unsigned long end = jiffies + BH_WORKER_JIFFIES;
3550  
3551  	raw_spin_lock_irq(&pool->lock);
3552  	worker_leave_idle(worker);
3553  
3554  	/*
3555  	 * This function follows the structure of worker_thread(). See there for
3556  	 * explanations on each step.
3557  	 */
3558  	if (!need_more_worker(pool))
3559  		goto done;
3560  
3561  	WARN_ON_ONCE(!list_empty(&worker->scheduled));
3562  	worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
3563  
3564  	do {
3565  		struct work_struct *work =
3566  			list_first_entry(&pool->worklist,
3567  					 struct work_struct, entry);
3568  
3569  		if (assign_work(work, worker, NULL))
3570  			process_scheduled_works(worker);
3571  	} while (keep_working(pool) &&
3572  		 --nr_restarts && time_before(jiffies, end));
3573  
3574  	worker_set_flags(worker, WORKER_PREP);
3575  done:
3576  	worker_enter_idle(worker);
3577  	kick_pool(pool);
3578  	raw_spin_unlock_irq(&pool->lock);
3579  }
3580  
3581  /*
3582   * TODO: Convert all tasklet users to workqueue and use softirq directly.
3583   *
3584   * This is currently called from tasklet[_hi]action() and thus is also called
3585   * whenever there are tasklets to run. Let's do an early exit if there's nothing
3586   * queued. Once conversion from tasklet is complete, the need_more_worker() test
3587   * can be dropped.
3588   *
3589   * After full conversion, we'll add worker->softirq_action, directly use the
3590   * softirq action and obtain the worker pointer from the softirq_action pointer.
3591   */
3592  void workqueue_softirq_action(bool highpri)
3593  {
3594  	struct worker_pool *pool =
3595  		&per_cpu(bh_worker_pools, smp_processor_id())[highpri];
3596  	if (need_more_worker(pool))
3597  		bh_worker(list_first_entry(&pool->workers, struct worker, node));
3598  }
3599  
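/*
 * Illustrative sketch with hypothetical names (my_bh_fn, my_bh_work): work
 * queued on the system BH workqueues is executed from the softirq path above
 * by bh_worker() instead of a kthread, so the callback runs in BH context
 * and must not sleep.
 *
 *	static void my_bh_fn(struct work_struct *work)
 *	{
 *		handle_event(work);
 *	}
 *	static DECLARE_WORK(my_bh_work, my_bh_fn);
 *
 *	queue_work(system_bh_wq, &my_bh_work);
 *
 * handle_event() stands in for whatever non-sleeping processing the caller
 * needs done in softirq context.
 */
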
3600  struct wq_drain_dead_softirq_work {
3601  	struct work_struct	work;
3602  	struct worker_pool	*pool;
3603  	struct completion	done;
3604  };
3605  
3606  static void drain_dead_softirq_workfn(struct work_struct *work)
3607  {
3608  	struct wq_drain_dead_softirq_work *dead_work =
3609  		container_of(work, struct wq_drain_dead_softirq_work, work);
3610  	struct worker_pool *pool = dead_work->pool;
3611  	bool repeat;
3612  
3613  	/*
3614  	 * @pool's CPU is dead and we want to execute its still pending work
3615  	 * items from this BH work item which is running on a different CPU. As
3616  	 * its CPU is dead, @pool can't be kicked and, as work execution path
3617  	 * will be nested, a lockdep annotation needs to be suppressed. Mark
3618  	 * @pool with %POOL_BH_DRAINING for the special treatments.
3619  	 */
3620  	raw_spin_lock_irq(&pool->lock);
3621  	pool->flags |= POOL_BH_DRAINING;
3622  	raw_spin_unlock_irq(&pool->lock);
3623  
3624  	bh_worker(list_first_entry(&pool->workers, struct worker, node));
3625  
3626  	raw_spin_lock_irq(&pool->lock);
3627  	pool->flags &= ~POOL_BH_DRAINING;
3628  	repeat = need_more_worker(pool);
3629  	raw_spin_unlock_irq(&pool->lock);
3630  
3631  	/*
3632  	 * bh_worker() might hit consecutive execution limit and bail. If there
3633  	 * still are pending work items, reschedule self and return so that we
3634  	 * don't hog this CPU's BH.
3635  	 */
3636  	if (repeat) {
3637  		if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
3638  			queue_work(system_bh_highpri_wq, work);
3639  		else
3640  			queue_work(system_bh_wq, work);
3641  	} else {
3642  		complete(&dead_work->done);
3643  	}
3644  }
3645  
3646  /*
3647   * @cpu is dead. Drain the remaining BH work items on the current CPU. It's
3648   * possible to allocate dead_work per CPU and avoid flushing. However, then we
3649   * have to worry about draining overlapping with CPU coming back online or
3650   * nesting (one CPU's dead_work queued on another CPU which is also dead and so
3651   * on). Let's keep it simple and drain them synchronously. These are BH work
3652   * items which shouldn't be requeued on the same pool. Shouldn't take long.
3653   */
3654  void workqueue_softirq_dead(unsigned int cpu)
3655  {
3656  	int i;
3657  
3658  	for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
3659  		struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i];
3660  		struct wq_drain_dead_softirq_work dead_work;
3661  
3662  		if (!need_more_worker(pool))
3663  			continue;
3664  
3665  		INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn);
3666  		dead_work.pool = pool;
3667  		init_completion(&dead_work.done);
3668  
3669  		if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
3670  			queue_work(system_bh_highpri_wq, &dead_work.work);
3671  		else
3672  			queue_work(system_bh_wq, &dead_work.work);
3673  
3674  		wait_for_completion(&dead_work.done);
3675  		destroy_work_on_stack(&dead_work.work);
3676  	}
3677  }
3678  
3679  /**
3680   * check_flush_dependency - check for flush dependency sanity
3681   * @target_wq: workqueue being flushed
3682   * @target_work: work item being flushed (NULL for workqueue flushes)
3683   *
3684   * %current is trying to flush the whole @target_wq or @target_work on it.
3685   * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
3686   * reclaiming memory or running on a workqueue which doesn't have
3687   * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
3688   * a deadlock.
3689   */
3690  static void check_flush_dependency(struct workqueue_struct *target_wq,
3691  				   struct work_struct *target_work)
3692  {
3693  	work_func_t target_func = target_work ? target_work->func : NULL;
3694  	struct worker *worker;
3695  
3696  	if (target_wq->flags & WQ_MEM_RECLAIM)
3697  		return;
3698  
3699  	worker = current_wq_worker();
3700  
3701  	WARN_ONCE(current->flags & PF_MEMALLOC,
3702  		  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
3703  		  current->pid, current->comm, target_wq->name, target_func);
3704  	WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
3705  			      (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
3706  		  "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
3707  		  worker->current_pwq->wq->name, worker->current_func,
3708  		  target_wq->name, target_func);
3709  }
3710  
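/*
 * Illustrative sketch of the dependency the check above warns about, using
 * hypothetical names: a work item running on a WQ_MEM_RECLAIM workqueue
 * waits on one queued on a workqueue without WQ_MEM_RECLAIM, which has no
 * forward-progress guarantee under memory pressure.
 *
 *	plain_wq = alloc_workqueue("plain", 0, 0);
 *	reclaim_wq = alloc_workqueue("reclaim", WQ_MEM_RECLAIM, 0);
 *
 *	static DECLARE_WORK(plain_work, plain_fn);
 *
 *	static void reclaim_fn(struct work_struct *work)
 *	{
 *		flush_work(&plain_work);
 *	}
 *
 * With reclaim_fn queued on reclaim_wq and plain_work queued on plain_wq,
 * the flush_work() call trips the second WARN_ONCE() above.
 */
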
3711  struct wq_barrier {
3712  	struct work_struct	work;
3713  	struct completion	done;
3714  	struct task_struct	*task;	/* purely informational */
3715  };
3716  
3717  static void wq_barrier_func(struct work_struct *work)
3718  {
3719  	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
3720  	complete(&barr->done);
3721  }
3722  
3723  /**
3724   * insert_wq_barrier - insert a barrier work
3725   * @pwq: pwq to insert barrier into
3726   * @barr: wq_barrier to insert
3727   * @target: target work to attach @barr to
3728   * @worker: worker currently executing @target, NULL if @target is not executing
3729   *
3730   * @barr is linked to @target such that @barr is completed only after
3731   * @target finishes execution.  Please note that the ordering
3732   * guarantee is observed only with respect to @target and on the local
3733   * cpu.
3734   *
3735   * Currently, a queued barrier can't be canceled.  This is because
3736   * try_to_grab_pending() can't determine whether the work to be
3737   * grabbed is at the head of the queue and thus can't clear LINKED
3738   * flag of the previous work while there must be a valid next work
3739   * after a work with LINKED flag set.
3740   *
3741   * Note that when @worker is non-NULL, @target may be modified
3742   * underneath us, so we can't reliably determine pwq from @target.
3743   *
3744   * CONTEXT:
3745   * raw_spin_lock_irq(pool->lock).
3746   */
3747  static void insert_wq_barrier(struct pool_workqueue *pwq,
3748  			      struct wq_barrier *barr,
3749  			      struct work_struct *target, struct worker *worker)
3750  {
3751  	static __maybe_unused struct lock_class_key bh_key, thr_key;
3752  	unsigned int work_flags = 0;
3753  	unsigned int work_color;
3754  	struct list_head *head;
3755  
3756  	/*
3757  	 * debugobject calls are safe here even with pool->lock locked
3758  	 * as we know for sure that this will not trigger any of the
3759  	 * checks and call back into the fixup functions where we
3760  	 * might deadlock.
3761  	 *
3762  	 * BH and threaded workqueues need separate lockdep keys to avoid
3763  	 * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W}
3764  	 * usage".
3765  	 */
3766  	INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func,
3767  			      (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key);
3768  	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
3769  
3770  	init_completion_map(&barr->done, &target->lockdep_map);
3771  
3772  	barr->task = current;
3773  
3774  	/* The barrier work item does not participate in nr_active. */
3775  	work_flags |= WORK_STRUCT_INACTIVE;
3776  
3777  	/*
3778  	 * If @target is currently being executed, schedule the
3779  	 * barrier to the worker; otherwise, put it after @target.
3780  	 */
3781  	if (worker) {
3782  		head = worker->scheduled.next;
3783  		work_color = worker->current_color;
3784  	} else {
3785  		unsigned long *bits = work_data_bits(target);
3786  
3787  		head = target->entry.next;
3788  		/* there can already be other linked works, inherit and set */
3789  		work_flags |= *bits & WORK_STRUCT_LINKED;
3790  		work_color = get_work_color(*bits);
3791  		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
3792  	}
3793  
3794  	pwq->nr_in_flight[work_color]++;
3795  	work_flags |= work_color_to_flags(work_color);
3796  
3797  	insert_work(pwq, &barr->work, head, work_flags);
3798  }
3799  
3800  /**
3801   * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
3802   * @wq: workqueue being flushed
3803   * @flush_color: new flush color, < 0 for no-op
3804   * @work_color: new work color, < 0 for no-op
3805   *
3806   * Prepare pwqs for workqueue flushing.
3807   *
3808   * If @flush_color is non-negative, flush_color on all pwqs should be
3809   * -1.  If no pwq has in-flight commands at the specified color, all
3810   * pwq->flush_color's stay at -1 and %false is returned.  If any pwq
3811   * has in flight commands, its pwq->flush_color is set to
3812   * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
3813   * wakeup logic is armed and %true is returned.
3814   *
3815   * The caller should have initialized @wq->first_flusher prior to
3816   * calling this function with non-negative @flush_color.  If
3817   * @flush_color is negative, no flush color update is done and %false
3818   * is returned.
3819   *
3820   * If @work_color is non-negative, all pwqs should have the same
3821   * work_color which is previous to @work_color and all will be
3822   * advanced to @work_color.
3823   *
3824   * CONTEXT:
3825   * mutex_lock(wq->mutex).
3826   *
3827   * Return:
3828   * %true if @flush_color >= 0 and there's something to flush.  %false
3829   * otherwise.
3830   */
3831  static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
3832  				      int flush_color, int work_color)
3833  {
3834  	bool wait = false;
3835  	struct pool_workqueue *pwq;
3836  
3837  	if (flush_color >= 0) {
3838  		WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
3839  		atomic_set(&wq->nr_pwqs_to_flush, 1);
3840  	}
3841  
3842  	for_each_pwq(pwq, wq) {
3843  		struct worker_pool *pool = pwq->pool;
3844  
3845  		raw_spin_lock_irq(&pool->lock);
3846  
3847  		if (flush_color >= 0) {
3848  			WARN_ON_ONCE(pwq->flush_color != -1);
3849  
3850  			if (pwq->nr_in_flight[flush_color]) {
3851  				pwq->flush_color = flush_color;
3852  				atomic_inc(&wq->nr_pwqs_to_flush);
3853  				wait = true;
3854  			}
3855  		}
3856  
3857  		if (work_color >= 0) {
3858  			WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
3859  			pwq->work_color = work_color;
3860  		}
3861  
3862  		raw_spin_unlock_irq(&pool->lock);
3863  	}
3864  
3865  	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
3866  		complete(&wq->first_flusher->done);
3867  
3868  	return wait;
3869  }
3870  
3871  static void touch_wq_lockdep_map(struct workqueue_struct *wq)
3872  {
3873  #ifdef CONFIG_LOCKDEP
3874  	if (unlikely(!wq->lockdep_map))
3875  		return;
3876  
3877  	if (wq->flags & WQ_BH)
3878  		local_bh_disable();
3879  
3880  	lock_map_acquire(wq->lockdep_map);
3881  	lock_map_release(wq->lockdep_map);
3882  
3883  	if (wq->flags & WQ_BH)
3884  		local_bh_enable();
3885  #endif
3886  }
3887  
3888  static void touch_work_lockdep_map(struct work_struct *work,
3889  				   struct workqueue_struct *wq)
3890  {
3891  #ifdef CONFIG_LOCKDEP
3892  	if (wq->flags & WQ_BH)
3893  		local_bh_disable();
3894  
3895  	lock_map_acquire(&work->lockdep_map);
3896  	lock_map_release(&work->lockdep_map);
3897  
3898  	if (wq->flags & WQ_BH)
3899  		local_bh_enable();
3900  #endif
3901  }
3902  
3903  /**
3904   * __flush_workqueue - ensure that any scheduled work has run to completion.
3905   * @wq: workqueue to flush
3906   *
3907   * This function sleeps until all work items which were queued on entry
3908   * have finished execution, but it is not livelocked by new incoming ones.
3909   */
3910  void __flush_workqueue(struct workqueue_struct *wq)
3911  {
3912  	struct wq_flusher this_flusher = {
3913  		.list = LIST_HEAD_INIT(this_flusher.list),
3914  		.flush_color = -1,
3915  		.done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)),
3916  	};
3917  	int next_color;
3918  
3919  	if (WARN_ON(!wq_online))
3920  		return;
3921  
3922  	touch_wq_lockdep_map(wq);
3923  
3924  	mutex_lock(&wq->mutex);
3925  
3926  	/*
3927  	 * Start-to-wait phase
3928  	 */
3929  	next_color = work_next_color(wq->work_color);
3930  
3931  	if (next_color != wq->flush_color) {
3932  		/*
3933  		 * Color space is not full.  The current work_color
3934  		 * becomes our flush_color and work_color is advanced
3935  		 * by one.
3936  		 */
3937  		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
3938  		this_flusher.flush_color = wq->work_color;
3939  		wq->work_color = next_color;
3940  
3941  		if (!wq->first_flusher) {
3942  			/* no flush in progress, become the first flusher */
3943  			WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
3944  
3945  			wq->first_flusher = &this_flusher;
3946  
3947  			if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
3948  						       wq->work_color)) {
3949  				/* nothing to flush, done */
3950  				wq->flush_color = next_color;
3951  				wq->first_flusher = NULL;
3952  				goto out_unlock;
3953  			}
3954  		} else {
3955  			/* wait in queue */
3956  			WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
3957  			list_add_tail(&this_flusher.list, &wq->flusher_queue);
3958  			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
3959  		}
3960  	} else {
3961  		/*
3962  		 * Oops, color space is full, wait on overflow queue.
3963  		 * The next flush completion will assign us
3964  		 * flush_color and transfer to flusher_queue.
3965  		 */
3966  		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
3967  	}
3968  
3969  	check_flush_dependency(wq, NULL);
3970  
3971  	mutex_unlock(&wq->mutex);
3972  
3973  	wait_for_completion(&this_flusher.done);
3974  
3975  	/*
3976  	 * Wake-up-and-cascade phase
3977  	 *
3978  	 * First flushers are responsible for cascading flushes and
3979  	 * handling overflow.  Non-first flushers can simply return.
3980  	 */
3981  	if (READ_ONCE(wq->first_flusher) != &this_flusher)
3982  		return;
3983  
3984  	mutex_lock(&wq->mutex);
3985  
3986  	/* we might have raced, check again with mutex held */
3987  	if (wq->first_flusher != &this_flusher)
3988  		goto out_unlock;
3989  
3990  	WRITE_ONCE(wq->first_flusher, NULL);
3991  
3992  	WARN_ON_ONCE(!list_empty(&this_flusher.list));
3993  	WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
3994  
3995  	while (true) {
3996  		struct wq_flusher *next, *tmp;
3997  
3998  		/* complete all the flushers sharing the current flush color */
3999  		list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
4000  			if (next->flush_color != wq->flush_color)
4001  				break;
4002  			list_del_init(&next->list);
4003  			complete(&next->done);
4004  		}
4005  
4006  		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
4007  			     wq->flush_color != work_next_color(wq->work_color));
4008  
4009  		/* this flush_color is finished, advance by one */
4010  		wq->flush_color = work_next_color(wq->flush_color);
4011  
4012  		/* one color has been freed, handle overflow queue */
4013  		if (!list_empty(&wq->flusher_overflow)) {
4014  			/*
4015  			 * Assign the same color to all overflowed
4016  			 * flushers, advance work_color and append to
4017  			 * flusher_queue.  This is the start-to-wait
4018  			 * phase for these overflowed flushers.
4019  			 */
4020  			list_for_each_entry(tmp, &wq->flusher_overflow, list)
4021  				tmp->flush_color = wq->work_color;
4022  
4023  			wq->work_color = work_next_color(wq->work_color);
4024  
4025  			list_splice_tail_init(&wq->flusher_overflow,
4026  					      &wq->flusher_queue);
4027  			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
4028  		}
4029  
4030  		if (list_empty(&wq->flusher_queue)) {
4031  			WARN_ON_ONCE(wq->flush_color != wq->work_color);
4032  			break;
4033  		}
4034  
4035  		/*
4036  		 * Need to flush more colors.  Make the next flusher
4037  		 * the new first flusher and arm pwqs.
4038  		 */
4039  		WARN_ON_ONCE(wq->flush_color == wq->work_color);
4040  		WARN_ON_ONCE(wq->flush_color != next->flush_color);
4041  
4042  		list_del_init(&next->list);
4043  		wq->first_flusher = next;
4044  
4045  		if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
4046  			break;
4047  
4048  		/*
4049  		 * Meh... this color is already done, clear first
4050  		 * flusher and repeat cascading.
4051  		 */
4052  		wq->first_flusher = NULL;
4053  	}
4054  
4055  out_unlock:
4056  	mutex_unlock(&wq->mutex);
4057  }
4058  EXPORT_SYMBOL(__flush_workqueue);
4059  
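/*
 * Illustrative usage sketch with hypothetical names (my_wq, my_work),
 * assuming flush_workqueue() as the public wrapper around the function above:
 *
 *	queue_work(my_wq, &my_work);
 *	...
 *	flush_workqueue(my_wq);
 *
 * After flush_workqueue() returns, my_work and everything else that was
 * queued on my_wq before the call has finished executing; items queued
 * afterwards are not waited for.
 */
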
4060  /**
4061   * drain_workqueue - drain a workqueue
4062   * @wq: workqueue to drain
4063   *
4064   * Wait until the workqueue becomes empty.  While draining is in progress,
4065   * only chain queueing is allowed.  IOW, only currently pending or running
4066   * work items on @wq can queue further work items on it.  @wq is flushed
4067   * repeatedly until it becomes empty.  The number of flushes is determined
4068   * by the depth of chaining and should be relatively short.  Whine if it
4069   * takes too long.
4070   */
4071  void drain_workqueue(struct workqueue_struct *wq)
4072  {
4073  	unsigned int flush_cnt = 0;
4074  	struct pool_workqueue *pwq;
4075  
4076  	/*
4077  	 * __queue_work() needs to test whether there are drainers.  It is much
4078  	 * hotter than drain_workqueue() and already looks at @wq->flags.
4079  	 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
4080  	 */
4081  	mutex_lock(&wq->mutex);
4082  	if (!wq->nr_drainers++)
4083  		wq->flags |= __WQ_DRAINING;
4084  	mutex_unlock(&wq->mutex);
4085  reflush:
4086  	__flush_workqueue(wq);
4087  
4088  	mutex_lock(&wq->mutex);
4089  
4090  	for_each_pwq(pwq, wq) {
4091  		bool drained;
4092  
4093  		raw_spin_lock_irq(&pwq->pool->lock);
4094  		drained = pwq_is_empty(pwq);
4095  		raw_spin_unlock_irq(&pwq->pool->lock);
4096  
4097  		if (drained)
4098  			continue;
4099  
4100  		if (++flush_cnt == 10 ||
4101  		    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
4102  			pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
4103  				wq->name, __func__, flush_cnt);
4104  
4105  		mutex_unlock(&wq->mutex);
4106  		goto reflush;
4107  	}
4108  
4109  	if (!--wq->nr_drainers)
4110  		wq->flags &= ~__WQ_DRAINING;
4111  	mutex_unlock(&wq->mutex);
4112  }
4113  EXPORT_SYMBOL_GPL(drain_workqueue);
4114  
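/*
 * Illustrative sketch with hypothetical names: drain_workqueue() is useful
 * when work items on my_wq may requeue themselves, since only such chain
 * queueing is allowed while draining.
 *
 *	static void my_fn(struct work_struct *work)
 *	{
 *		if (more_to_do())
 *			queue_work(my_wq, work);
 *	}
 *
 *	drain_workqueue(my_wq);
 *
 * more_to_do() is a hypothetical predicate.  The requeue inside my_fn() is
 * permitted during the drain; drain_workqueue() returns once the chain dies
 * out and my_wq is empty.
 */
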
4115  static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
4116  			     bool from_cancel)
4117  {
4118  	struct worker *worker = NULL;
4119  	struct worker_pool *pool;
4120  	struct pool_workqueue *pwq;
4121  	struct workqueue_struct *wq;
4122  
4123  	rcu_read_lock();
4124  	pool = get_work_pool(work);
4125  	if (!pool) {
4126  		rcu_read_unlock();
4127  		return false;
4128  	}
4129  
4130  	raw_spin_lock_irq(&pool->lock);
4131  	/* see the comment in try_to_grab_pending() with the same code */
4132  	pwq = get_work_pwq(work);
4133  	if (pwq) {
4134  		if (unlikely(pwq->pool != pool))
4135  			goto already_gone;
4136  	} else {
4137  		worker = find_worker_executing_work(pool, work);
4138  		if (!worker)
4139  			goto already_gone;
4140  		pwq = worker->current_pwq;
4141  	}
4142  
4143  	wq = pwq->wq;
4144  	check_flush_dependency(wq, work);
4145  
4146  	insert_wq_barrier(pwq, barr, work, worker);
4147  	raw_spin_unlock_irq(&pool->lock);
4148  
4149  	touch_work_lockdep_map(work, wq);
4150  
4151  	/*
4152  	 * Force a lock recursion deadlock when using flush_work() inside a
4153  	 * single-threaded or rescuer equipped workqueue.
4154  	 *
4155  	 * For single threaded workqueues the deadlock happens when the flushed
4156  	 * work is queued after the work issuing the flush_work(). For rescuer equipped
4157  	 * workqueues the deadlock happens when the rescuer stalls, blocking
4158  	 * forward progress.
4159  	 */
4160  	if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer))
4161  		touch_wq_lockdep_map(wq);
4162  
4163  	rcu_read_unlock();
4164  	return true;
4165  already_gone:
4166  	raw_spin_unlock_irq(&pool->lock);
4167  	rcu_read_unlock();
4168  	return false;
4169  }
4170  
4171  static bool __flush_work(struct work_struct *work, bool from_cancel)
4172  {
4173  	struct wq_barrier barr;
4174  
4175  	if (WARN_ON(!wq_online))
4176  		return false;
4177  
4178  	if (WARN_ON(!work->func))
4179  		return false;
4180  
4181  	if (!start_flush_work(work, &barr, from_cancel))
4182  		return false;
4183  
4184  	/*
4185  	 * start_flush_work() returned %true. If @from_cancel is set, we know
4186  	 * that @work must have been executing during start_flush_work() and
4187  	 * can't currently be queued. Its data must contain OFFQ bits. If @work
4188  	 * was queued on a BH workqueue, we also know that it was running in the
4189  	 * BH context and thus can be busy-waited.
4190  	 */
4191  	if (from_cancel) {
4192  		unsigned long data = *work_data_bits(work);
4193  
4194  		if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) &&
4195  		    (data & WORK_OFFQ_BH)) {
4196  			/*
4197  			 * On RT, prevent a live lock when %current has preempted
4198  			 * soft interrupt processing or is preventing ksoftirqd from
4199  			 * running, by repeatedly flipping BH. If the BH work item
4200  			 * runs on a different CPU then this has no effect other
4201  			 * than doing the BH disable/enable dance for nothing.
4202  			 * This is copied from
4203  			 * kernel/softirq.c::tasklet_unlock_spin_wait().
4204  			 */
4205  			while (!try_wait_for_completion(&barr.done)) {
4206  				if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
4207  					local_bh_disable();
4208  					local_bh_enable();
4209  				} else {
4210  					cpu_relax();
4211  				}
4212  			}
4213  			goto out_destroy;
4214  		}
4215  	}
4216  
4217  	wait_for_completion(&barr.done);
4218  
4219  out_destroy:
4220  	destroy_work_on_stack(&barr.work);
4221  	return true;
4222  }
4223  
4224  /**
4225   * flush_work - wait for a work to finish executing the last queueing instance
4226   * @work: the work to flush
4227   *
4228   * Wait until @work has finished execution.  @work is guaranteed to be idle
4229   * on return if it hasn't been requeued since flush started.
4230   *
4231   * Return:
4232   * %true if flush_work() waited for the work to finish execution,
4233   * %false if it was already idle.
4234   */
4235  bool flush_work(struct work_struct *work)
4236  {
4237  	might_sleep();
4238  	return __flush_work(work, false);
4239  }
4240  EXPORT_SYMBOL_GPL(flush_work);
4241  
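/*
 * Illustrative usage sketch with a hypothetical my_work:
 *
 *	schedule_work(&my_work);
 *	...
 *	flush_work(&my_work);
 *
 * On return, the queueing instance seen by flush_work() has finished
 * executing; my_work may of course have been queued again in the meantime.
 */
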
4242  /**
4243   * flush_delayed_work - wait for a dwork to finish executing the last queueing
4244   * @dwork: the delayed work to flush
4245   *
4246   * Delayed timer is cancelled and the pending work is queued for
4247   * immediate execution.  Like flush_work(), this function only
4248   * considers the last queueing instance of @dwork.
4249   *
4250   * Return:
4251   * %true if flush_work() waited for the work to finish execution,
4252   * %false if it was already idle.
4253   */
4254  bool flush_delayed_work(struct delayed_work *dwork)
4255  {
4256  	local_irq_disable();
4257  	if (del_timer_sync(&dwork->timer))
4258  		__queue_work(dwork->cpu, dwork->wq, &dwork->work);
4259  	local_irq_enable();
4260  	return flush_work(&dwork->work);
4261  }
4262  EXPORT_SYMBOL(flush_delayed_work);
4263  
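/*
 * Illustrative usage sketch with a hypothetical my_dwork: push out work that
 * would otherwise run later, e.g. to write back buffered state right away.
 *
 *	schedule_delayed_work(&my_dwork, msecs_to_jiffies(1000));
 *	...
 *	flush_delayed_work(&my_dwork);
 *
 * The pending timer is cancelled, the work is queued immediately and
 * flush_delayed_work() waits for it to finish.
 */
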
4264  /**
4265   * flush_rcu_work - wait for a rwork to finish executing the last queueing
4266   * @rwork: the rcu work to flush
4267   *
4268   * Return:
4269   * %true if flush_rcu_work() waited for the work to finish execution,
4270   * %false if it was already idle.
4271   */
4272  bool flush_rcu_work(struct rcu_work *rwork)
4273  {
4274  	if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) {
4275  		rcu_barrier();
4276  		flush_work(&rwork->work);
4277  		return true;
4278  	} else {
4279  		return flush_work(&rwork->work);
4280  	}
4281  }
4282  EXPORT_SYMBOL(flush_rcu_work);
4283  
4284  static void work_offqd_disable(struct work_offq_data *offqd)
4285  {
4286  	const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1;
4287  
4288  	if (likely(offqd->disable < max))
4289  		offqd->disable++;
4290  	else
4291  		WARN_ONCE(true, "workqueue: work disable count overflowed\n");
4292  }
4293  
4294  static void work_offqd_enable(struct work_offq_data *offqd)
4295  {
4296  	if (likely(offqd->disable > 0))
4297  		offqd->disable--;
4298  	else
4299  		WARN_ONCE(true, "workqueue: work disable count underflowed\n");
4300  }
4301  
4302  static bool __cancel_work(struct work_struct *work, u32 cflags)
4303  {
4304  	struct work_offq_data offqd;
4305  	unsigned long irq_flags;
4306  	int ret;
4307  
4308  	ret = work_grab_pending(work, cflags, &irq_flags);
4309  
4310  	work_offqd_unpack(&offqd, *work_data_bits(work));
4311  
4312  	if (cflags & WORK_CANCEL_DISABLE)
4313  		work_offqd_disable(&offqd);
4314  
4315  	set_work_pool_and_clear_pending(work, offqd.pool_id,
4316  					work_offqd_pack_flags(&offqd));
4317  	local_irq_restore(irq_flags);
4318  	return ret;
4319  }
4320  
4321  static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
4322  {
4323  	bool ret;
4324  
4325  	ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE);
4326  
4327  	if (*work_data_bits(work) & WORK_OFFQ_BH)
4328  		WARN_ON_ONCE(in_hardirq());
4329  	else
4330  		might_sleep();
4331  
4332  	/*
4333  	 * Skip __flush_work() during early boot when we know that @work isn't
4334  	 * executing. This allows canceling during early boot.
4335  	 */
4336  	if (wq_online)
4337  		__flush_work(work, true);
4338  
4339  	if (!(cflags & WORK_CANCEL_DISABLE))
4340  		enable_work(work);
4341  
4342  	return ret;
4343  }
4344  
4345  /*
4346   * See cancel_delayed_work()
4347   */
4348  bool cancel_work(struct work_struct *work)
4349  {
4350  	return __cancel_work(work, 0);
4351  }
4352  EXPORT_SYMBOL(cancel_work);
4353  
4354  /**
4355   * cancel_work_sync - cancel a work and wait for it to finish
4356   * @work: the work to cancel
4357   *
4358   * Cancel @work and wait for its execution to finish. This function can be used
4359   * even if the work re-queues itself or migrates to another workqueue. On return
4360   * from this function, @work is guaranteed to be not pending or executing on any
4361   * CPU as long as there aren't racing enqueues.
4362   *
4363   * cancel_work_sync(&delayed_work->work) must not be used for delayed work items.
4364   * Use cancel_delayed_work_sync() instead.
4365   *
4366   * Must be called from a sleepable context if @work was last queued on a non-BH
4367   * workqueue. Can also be called from non-hardirq atomic contexts including BH
4368   * if @work was last queued on a BH workqueue.
4369   *
4370   * Returns %true if @work was pending, %false otherwise.
4371   */
4372  bool cancel_work_sync(struct work_struct *work)
4373  {
4374  	return __cancel_work_sync(work, 0);
4375  }
4376  EXPORT_SYMBOL_GPL(cancel_work_sync);
4377  
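/*
 * Illustrative sketch of a typical teardown path with hypothetical names
 * (my_device, event_work, event_buf):
 *
 *	static void my_device_remove(struct my_device *dev)
 *	{
 *		cancel_work_sync(&dev->event_work);
 *		kfree(dev->event_buf);
 *	}
 *
 * After cancel_work_sync() returns, dev->event_work is neither pending nor
 * running, so the buffer it operated on can be freed safely.
 */
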
4378  /**
4379   * cancel_delayed_work - cancel a delayed work
4380   * @dwork: delayed_work to cancel
4381   *
4382   * Kill off a pending delayed_work.
4383   *
4384   * Return: %true if @dwork was pending and canceled; %false if it wasn't
4385   * pending.
4386   *
4387   * Note:
4388   * The work callback function may still be running on return, unless
4389   * this function returns %true and the work doesn't re-arm itself.  Explicitly flush or
4390   * use cancel_delayed_work_sync() to wait on it.
4391   *
4392   * This function is safe to call from any context including IRQ handler.
4393   */
4394  bool cancel_delayed_work(struct delayed_work *dwork)
4395  {
4396  	return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED);
4397  }
4398  EXPORT_SYMBOL(cancel_delayed_work);
4399  
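/*
 * Illustrative sketch of a self-rearming poller and its cancellation, with
 * hypothetical names (poll_dwork, poll_hw()):
 *
 *	static void poll_fn(struct work_struct *work)
 *	{
 *		struct delayed_work *dwork = to_delayed_work(work);
 *
 *		poll_hw();
 *		schedule_delayed_work(dwork, HZ);
 *	}
 *
 *	cancel_delayed_work(&poll_dwork);
 *
 * poll_fn() may still be executing when cancel_delayed_work() returns; use
 * cancel_delayed_work_sync() when the caller also needs to wait for that.
 */
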
4400  /**
4401   * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
4402   * @dwork: the delayed work to cancel
4403   *
4404   * This is cancel_work_sync() for delayed works.
4405   *
4406   * Return:
4407   * %true if @dwork was pending, %false otherwise.
4408   */
4409  bool cancel_delayed_work_sync(struct delayed_work *dwork)
4410  {
4411  	return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED);
4412  }
4413  EXPORT_SYMBOL(cancel_delayed_work_sync);
4414  
4415  /**
4416   * disable_work - Disable and cancel a work item
4417   * @work: work item to disable
4418   *
4419   * Disable @work by incrementing its disable count and cancel it if currently
4420   * pending. As long as the disable count is non-zero, any attempt to queue @work
4421   * will fail and return %false. The maximum supported disable depth is 2 to the
4422   * power of %WORK_OFFQ_DISABLE_BITS, currently 65536.
4423   *
4424   * Can be called from any context. Returns %true if @work was pending, %false
4425   * otherwise.
4426   */
4427  bool disable_work(struct work_struct *work)
4428  {
4429  	return __cancel_work(work, WORK_CANCEL_DISABLE);
4430  }
4431  EXPORT_SYMBOL_GPL(disable_work);
4432  
4433  /**
4434   * disable_work_sync - Disable, cancel and drain a work item
4435   * @work: work item to disable
4436   *
4437   * Similar to disable_work() but also wait for @work to finish if currently
4438   * executing.
4439   *
4440   * Must be called from a sleepable context if @work was last queued on a non-BH
4441   * workqueue. Can also be called from non-hardirq atomic contexts including BH
4442   * if @work was last queued on a BH workqueue.
4443   *
4444   * Returns %true if @work was pending, %false otherwise.
4445   */
4446  bool disable_work_sync(struct work_struct *work)
4447  {
4448  	return __cancel_work_sync(work, WORK_CANCEL_DISABLE);
4449  }
4450  EXPORT_SYMBOL_GPL(disable_work_sync);
4451  
4452  /**
4453   * enable_work - Enable a work item
4454   * @work: work item to enable
4455   *
4456   * Undo disable_work[_sync]() by decrementing @work's disable count. @work can
4457   * only be queued if its disable count is 0.
4458   *
4459   * Can be called from any context. Returns %true if the disable count reached 0.
4460   * Otherwise, %false.
4461   */
4462  bool enable_work(struct work_struct *work)
4463  {
4464  	struct work_offq_data offqd;
4465  	unsigned long irq_flags;
4466  
4467  	work_grab_pending(work, 0, &irq_flags);
4468  
4469  	work_offqd_unpack(&offqd, *work_data_bits(work));
4470  	work_offqd_enable(&offqd);
4471  	set_work_pool_and_clear_pending(work, offqd.pool_id,
4472  					work_offqd_pack_flags(&offqd));
4473  	local_irq_restore(irq_flags);
4474  
4475  	return !offqd.disable;
4476  }
4477  EXPORT_SYMBOL_GPL(enable_work);
4478  
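/*
 * Illustrative sketch with hypothetical names: disable_work_sync() and
 * enable_work() bracket a window in which my_work must neither run nor be
 * queued, e.g. across a device reset.
 *
 *	disable_work_sync(&my_work);
 *	my_hw_reset(dev);
 *	enable_work(&my_work);
 *
 * While disabled, queueing my_work fails and returns %false; queueing works
 * again once the disable count drops back to zero.
 */
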
4479  /**
4480   * disable_delayed_work - Disable and cancel a delayed work item
4481   * @dwork: delayed work item to disable
4482   *
4483   * disable_work() for delayed work items.
4484   */
4485  bool disable_delayed_work(struct delayed_work *dwork)
4486  {
4487  	return __cancel_work(&dwork->work,
4488  			     WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);
4489  }
4490  EXPORT_SYMBOL_GPL(disable_delayed_work);
4491  
4492  /**
4493   * disable_delayed_work_sync - Disable, cancel and drain a delayed work item
4494   * @dwork: delayed work item to disable
4495   *
4496   * disable_work_sync() for delayed work items.
4497   */
4498  bool disable_delayed_work_sync(struct delayed_work *dwork)
4499  {
4500  	return __cancel_work_sync(&dwork->work,
4501  				  WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);
4502  }
4503  EXPORT_SYMBOL_GPL(disable_delayed_work_sync);
4504  
4505  /**
4506   * enable_delayed_work - Enable a delayed work item
4507   * @dwork: delayed work item to enable
4508   *
4509   * enable_work() for delayed work items.
4510   */
4511  bool enable_delayed_work(struct delayed_work *dwork)
4512  {
4513  	return enable_work(&dwork->work);
4514  }
4515  EXPORT_SYMBOL_GPL(enable_delayed_work);
4516  
4517  /**
4518   * schedule_on_each_cpu - execute a function synchronously on each online CPU
4519   * @func: the function to call
4520   *
4521   * schedule_on_each_cpu() executes @func on each online CPU using the
4522   * system workqueue and blocks until all CPUs have completed.
4523   * schedule_on_each_cpu() is very slow.
4524   *
4525   * Return:
4526   * 0 on success, -errno on failure.
4527   */
4528  int schedule_on_each_cpu(work_func_t func)
4529  {
4530  	int cpu;
4531  	struct work_struct __percpu *works;
4532  
4533  	works = alloc_percpu(struct work_struct);
4534  	if (!works)
4535  		return -ENOMEM;
4536  
4537  	cpus_read_lock();
4538  
4539  	for_each_online_cpu(cpu) {
4540  		struct work_struct *work = per_cpu_ptr(works, cpu);
4541  
4542  		INIT_WORK(work, func);
4543  		schedule_work_on(cpu, work);
4544  	}
4545  
4546  	for_each_online_cpu(cpu)
4547  		flush_work(per_cpu_ptr(works, cpu));
4548  
4549  	cpus_read_unlock();
4550  	free_percpu(works);
4551  	return 0;
4552  }
4553  
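/*
 * Illustrative usage sketch; my_counter is a hypothetical DEFINE_PER_CPU()
 * counter and bump_counter() a hypothetical callback:
 *
 *	static void bump_counter(struct work_struct *work)
 *	{
 *		this_cpu_inc(my_counter);
 *	}
 *
 *	ret = schedule_on_each_cpu(bump_counter);
 *
 * bump_counter() runs once on every online CPU and schedule_on_each_cpu()
 * only returns after all of them have completed.
 */
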
4554  /**
4555   * execute_in_process_context - reliably execute the routine with user context
4556   * @fn:		the function to execute
4557   * @ew:		guaranteed storage for the execute work structure (must
4558   *		be available when the work executes)
4559   *
4560   * Executes the function immediately if process context is available,
4561   * otherwise schedules the function for delayed execution.
4562   *
4563   * Return:	0 - function was executed
4564   *		1 - function was scheduled for execution
4565   */
4566  int execute_in_process_context(work_func_t fn, struct execute_work *ew)
4567  {
4568  	if (!in_interrupt()) {
4569  		fn(&ew->work);
4570  		return 0;
4571  	}
4572  
4573  	INIT_WORK(&ew->work, fn);
4574  	schedule_work(&ew->work);
4575  
4576  	return 1;
4577  }
4578  EXPORT_SYMBOL_GPL(execute_in_process_context);
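/*
 * Usage sketch: callers which may be in interrupt context embed a struct
 * execute_work in long-lived storage so the deferred path stays valid.
 * struct my_dev, dev and my_release_fn() are hypothetical.
 *
 *	struct my_dev {
 *		struct execute_work release_ew;
 *	};
 *
 *	static void my_release_fn(struct work_struct *work);
 *
 *	// 0: ran inline in this context, 1: deferred to the system workqueue
 *	ret = execute_in_process_context(my_release_fn, &dev->release_ew);
 */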
4579  
4580  /**
4581   * free_workqueue_attrs - free a workqueue_attrs
4582   * @attrs: workqueue_attrs to free
4583   *
4584   * Undo alloc_workqueue_attrs().
4585   */
4586  void free_workqueue_attrs(struct workqueue_attrs *attrs)
4587  {
4588  	if (attrs) {
4589  		free_cpumask_var(attrs->cpumask);
4590  		free_cpumask_var(attrs->__pod_cpumask);
4591  		kfree(attrs);
4592  	}
4593  }
4594  
4595  /**
4596   * alloc_workqueue_attrs - allocate a workqueue_attrs
4597   *
4598   * Allocate a new workqueue_attrs, initialize it with default settings and
4599   * return it.
4600   *
4601   * Return: The newly allocated workqueue_attrs on success. %NULL on failure.
4602   */
4603  struct workqueue_attrs *alloc_workqueue_attrs(void)
4604  {
4605  	struct workqueue_attrs *attrs;
4606  
4607  	attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
4608  	if (!attrs)
4609  		goto fail;
4610  	if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
4611  		goto fail;
4612  	if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL))
4613  		goto fail;
4614  
4615  	cpumask_copy(attrs->cpumask, cpu_possible_mask);
4616  	attrs->affn_scope = WQ_AFFN_DFL;
4617  	return attrs;
4618  fail:
4619  	free_workqueue_attrs(attrs);
4620  	return NULL;
4621  }
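/*
 * Usage sketch: the usual pattern is allocate, tweak, apply, free.  The
 * workqueue my_unbound_wq is hypothetical; see apply_workqueue_attrs()
 * further below for what applying the attrs does.
 *
 *	struct workqueue_attrs *attrs = alloc_workqueue_attrs();
 *
 *	if (!attrs)
 *		return -ENOMEM;
 *	attrs->nice = -10;
 *	cpumask_copy(attrs->cpumask, cpumask_of_node(0));
 *	ret = apply_workqueue_attrs(my_unbound_wq, attrs);
 *	free_workqueue_attrs(attrs);
 */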
4622  
4623  static void copy_workqueue_attrs(struct workqueue_attrs *to,
4624  				 const struct workqueue_attrs *from)
4625  {
4626  	to->nice = from->nice;
4627  	cpumask_copy(to->cpumask, from->cpumask);
4628  	cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);
4629  	to->affn_strict = from->affn_strict;
4630  
4631  	/*
4632  	 * Unlike hash and equality test, copying shouldn't ignore wq-only
4633  	 * fields as copying is used for both pool and wq attrs. Instead,
4634  	 * get_unbound_pool() explicitly clears the fields.
4635  	 */
4636  	to->affn_scope = from->affn_scope;
4637  	to->ordered = from->ordered;
4638  }
4639  
4640  /*
4641   * Some attrs fields are workqueue-only. Clear them for worker_pool attrs. See the
4642   * comments in 'struct workqueue_attrs' definition.
4643   */
4644  static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
4645  {
4646  	attrs->affn_scope = WQ_AFFN_NR_TYPES;
4647  	attrs->ordered = false;
4648  	if (attrs->affn_strict)
4649  		cpumask_copy(attrs->cpumask, cpu_possible_mask);
4650  }
4651  
4652  /* hash value of the content of @attr */
4653  static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
4654  {
4655  	u32 hash = 0;
4656  
4657  	hash = jhash_1word(attrs->nice, hash);
4658  	hash = jhash_1word(attrs->affn_strict, hash);
4659  	hash = jhash(cpumask_bits(attrs->__pod_cpumask),
4660  		     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
4661  	if (!attrs->affn_strict)
4662  		hash = jhash(cpumask_bits(attrs->cpumask),
4663  			     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
4664  	return hash;
4665  }
4666  
4667  /* content equality test */
4668  static bool wqattrs_equal(const struct workqueue_attrs *a,
4669  			  const struct workqueue_attrs *b)
4670  {
4671  	if (a->nice != b->nice)
4672  		return false;
4673  	if (a->affn_strict != b->affn_strict)
4674  		return false;
4675  	if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
4676  		return false;
4677  	if (!a->affn_strict && !cpumask_equal(a->cpumask, b->cpumask))
4678  		return false;
4679  	return true;
4680  }
4681  
4682  /* Update @attrs with actually available CPUs */
4683  static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
4684  				      const cpumask_t *unbound_cpumask)
4685  {
4686  	/*
4687  	 * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If
4688  	 * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to
4689  	 * @unbound_cpumask.
4690  	 */
4691  	cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask);
4692  	if (unlikely(cpumask_empty(attrs->cpumask)))
4693  		cpumask_copy(attrs->cpumask, unbound_cpumask);
4694  }
4695  
4696  /* find wq_pod_type to use for @attrs */
4697  static const struct wq_pod_type *
4698  wqattrs_pod_type(const struct workqueue_attrs *attrs)
4699  {
4700  	enum wq_affn_scope scope;
4701  	struct wq_pod_type *pt;
4702  
4703  	/* to synchronize access to wq_affn_dfl */
4704  	lockdep_assert_held(&wq_pool_mutex);
4705  
4706  	if (attrs->affn_scope == WQ_AFFN_DFL)
4707  		scope = wq_affn_dfl;
4708  	else
4709  		scope = attrs->affn_scope;
4710  
4711  	pt = &wq_pod_types[scope];
4712  
4713  	if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) &&
4714  	    likely(pt->nr_pods))
4715  		return pt;
4716  
4717  	/*
4718  	 * Before workqueue_init_topology(), only SYSTEM is available which is
4719  	 * initialized in workqueue_init_early().
4720  	 */
4721  	pt = &wq_pod_types[WQ_AFFN_SYSTEM];
4722  	BUG_ON(!pt->nr_pods);
4723  	return pt;
4724  }
4725  
4726  /**
4727   * init_worker_pool - initialize a newly zalloc'd worker_pool
4728   * @pool: worker_pool to initialize
4729   *
4730   * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
4731   *
4732   * Return: 0 on success, -errno on failure.  Even on failure, all fields
4733   * inside @pool proper are initialized and put_unbound_pool() can be called
4734   * on @pool safely to release it.
4735   */
4736  static int init_worker_pool(struct worker_pool *pool)
4737  {
4738  	raw_spin_lock_init(&pool->lock);
4739  	pool->id = -1;
4740  	pool->cpu = -1;
4741  	pool->node = NUMA_NO_NODE;
4742  	pool->flags |= POOL_DISASSOCIATED;
4743  	pool->watchdog_ts = jiffies;
4744  	INIT_LIST_HEAD(&pool->worklist);
4745  	INIT_LIST_HEAD(&pool->idle_list);
4746  	hash_init(pool->busy_hash);
4747  
4748  	timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
4749  	INIT_WORK(&pool->idle_cull_work, idle_cull_fn);
4750  
4751  	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
4752  
4753  	INIT_LIST_HEAD(&pool->workers);
4754  
4755  	ida_init(&pool->worker_ida);
4756  	INIT_HLIST_NODE(&pool->hash_node);
4757  	pool->refcnt = 1;
4758  
4759  	/* shouldn't fail above this point */
4760  	pool->attrs = alloc_workqueue_attrs();
4761  	if (!pool->attrs)
4762  		return -ENOMEM;
4763  
4764  	wqattrs_clear_for_pool(pool->attrs);
4765  
4766  	return 0;
4767  }
4768  
4769  #ifdef CONFIG_LOCKDEP
4770  static void wq_init_lockdep(struct workqueue_struct *wq)
4771  {
4772  	char *lock_name;
4773  
4774  	lockdep_register_key(&wq->key);
4775  	lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
4776  	if (!lock_name)
4777  		lock_name = wq->name;
4778  
4779  	wq->lock_name = lock_name;
4780  	wq->lockdep_map = &wq->__lockdep_map;
4781  	lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0);
4782  }
4783  
4784  static void wq_unregister_lockdep(struct workqueue_struct *wq)
4785  {
4786  	if (wq->lockdep_map != &wq->__lockdep_map)
4787  		return;
4788  
4789  	lockdep_unregister_key(&wq->key);
4790  }
4791  
4792  static void wq_free_lockdep(struct workqueue_struct *wq)
4793  {
4794  	if (wq->lockdep_map != &wq->__lockdep_map)
4795  		return;
4796  
4797  	if (wq->lock_name != wq->name)
4798  		kfree(wq->lock_name);
4799  }
4800  #else
4801  static void wq_init_lockdep(struct workqueue_struct *wq)
4802  {
4803  }
4804  
4805  static void wq_unregister_lockdep(struct workqueue_struct *wq)
4806  {
4807  }
4808  
4809  static void wq_free_lockdep(struct workqueue_struct *wq)
4810  {
4811  }
4812  #endif
4813  
4814  static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
4815  {
4816  	int node;
4817  
4818  	for_each_node(node) {
4819  		kfree(nna_ar[node]);
4820  		nna_ar[node] = NULL;
4821  	}
4822  
4823  	kfree(nna_ar[nr_node_ids]);
4824  	nna_ar[nr_node_ids] = NULL;
4825  }
4826  
4827  static void init_node_nr_active(struct wq_node_nr_active *nna)
4828  {
4829  	nna->max = WQ_DFL_MIN_ACTIVE;
4830  	atomic_set(&nna->nr, 0);
4831  	raw_spin_lock_init(&nna->lock);
4832  	INIT_LIST_HEAD(&nna->pending_pwqs);
4833  }
4834  
4835  /*
4836   * Each node's nr_active counter will be accessed mostly from its own node and
4837   * should be allocated on that node.
4838   */
4839  static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
4840  {
4841  	struct wq_node_nr_active *nna;
4842  	int node;
4843  
4844  	for_each_node(node) {
4845  		nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
4846  		if (!nna)
4847  			goto err_free;
4848  		init_node_nr_active(nna);
4849  		nna_ar[node] = nna;
4850  	}
4851  
4852  	/* [nr_node_ids] is used as the fallback */
4853  	nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
4854  	if (!nna)
4855  		goto err_free;
4856  	init_node_nr_active(nna);
4857  	nna_ar[nr_node_ids] = nna;
4858  
4859  	return 0;
4860  
4861  err_free:
4862  	free_node_nr_active(nna_ar);
4863  	return -ENOMEM;
4864  }
4865  
4866  static void rcu_free_wq(struct rcu_head *rcu)
4867  {
4868  	struct workqueue_struct *wq =
4869  		container_of(rcu, struct workqueue_struct, rcu);
4870  
4871  	if (wq->flags & WQ_UNBOUND)
4872  		free_node_nr_active(wq->node_nr_active);
4873  
4874  	wq_free_lockdep(wq);
4875  	free_percpu(wq->cpu_pwq);
4876  	free_workqueue_attrs(wq->unbound_attrs);
4877  	kfree(wq);
4878  }
4879  
4880  static void rcu_free_pool(struct rcu_head *rcu)
4881  {
4882  	struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
4883  
4884  	ida_destroy(&pool->worker_ida);
4885  	free_workqueue_attrs(pool->attrs);
4886  	kfree(pool);
4887  }
4888  
4889  /**
4890   * put_unbound_pool - put a worker_pool
4891   * @pool: worker_pool to put
4892   *
4893   * Put @pool.  If its refcnt reaches zero, it gets destroyed in an RCU-safe
4894   * manner.  get_unbound_pool() calls this function on its failure path and
4895   * this function must be able to release pools which went through
4896   * init_worker_pool(), whether successfully or not.
4897   *
4898   * Should be called with wq_pool_mutex held.
4899   */
4900  static void put_unbound_pool(struct worker_pool *pool)
4901  {
4902  	struct worker *worker;
4903  	LIST_HEAD(cull_list);
4904  
4905  	lockdep_assert_held(&wq_pool_mutex);
4906  
4907  	if (--pool->refcnt)
4908  		return;
4909  
4910  	/* sanity checks */
4911  	if (WARN_ON(!(pool->cpu < 0)) ||
4912  	    WARN_ON(!list_empty(&pool->worklist)))
4913  		return;
4914  
4915  	/* release id and unhash */
4916  	if (pool->id >= 0)
4917  		idr_remove(&worker_pool_idr, pool->id);
4918  	hash_del(&pool->hash_node);
4919  
4920  	/*
4921  	 * Become the manager and destroy all workers.  This prevents
4922  	 * @pool's workers from blocking on attach_mutex.  We're the last
4923  	 * manager and @pool gets freed with the flag set.
4924  	 *
4925  	 * Having a concurrent manager is quite unlikely to happen as we can
4926  	 * only get here with
4927  	 *   pwq->refcnt == pool->refcnt == 0
4928  	 * which implies no work queued to the pool, which implies no worker can
4929  	 * become the manager. However, a worker could have taken the role of
4930  	 * manager before the refcnts dropped to 0, since maybe_create_worker()
4931  	 * drops pool->lock.
4932  	 */
4933  	while (true) {
4934  		rcuwait_wait_event(&manager_wait,
4935  				   !(pool->flags & POOL_MANAGER_ACTIVE),
4936  				   TASK_UNINTERRUPTIBLE);
4937  
4938  		mutex_lock(&wq_pool_attach_mutex);
4939  		raw_spin_lock_irq(&pool->lock);
4940  		if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
4941  			pool->flags |= POOL_MANAGER_ACTIVE;
4942  			break;
4943  		}
4944  		raw_spin_unlock_irq(&pool->lock);
4945  		mutex_unlock(&wq_pool_attach_mutex);
4946  	}
4947  
4948  	while ((worker = first_idle_worker(pool)))
4949  		set_worker_dying(worker, &cull_list);
4950  	WARN_ON(pool->nr_workers || pool->nr_idle);
4951  	raw_spin_unlock_irq(&pool->lock);
4952  
4953  	detach_dying_workers(&cull_list);
4954  
4955  	mutex_unlock(&wq_pool_attach_mutex);
4956  
4957  	reap_dying_workers(&cull_list);
4958  
4959  	/* shut down the timers */
4960  	del_timer_sync(&pool->idle_timer);
4961  	cancel_work_sync(&pool->idle_cull_work);
4962  	del_timer_sync(&pool->mayday_timer);
4963  
4964  	/* RCU protected to allow dereferences from get_work_pool() */
4965  	call_rcu(&pool->rcu, rcu_free_pool);
4966  }
4967  
4968  /**
4969   * get_unbound_pool - get a worker_pool with the specified attributes
4970   * @attrs: the attributes of the worker_pool to get
4971   *
4972   * Obtain a worker_pool which has the same attributes as @attrs, bump the
4973   * reference count and return it.  If there already is a matching
4974   * worker_pool, it will be used; otherwise, this function attempts to
4975   * create a new one.
4976   *
4977   * Should be called with wq_pool_mutex held.
4978   *
4979   * Return: On success, a worker_pool with the same attributes as @attrs.
4980   * On failure, %NULL.
4981   */
4982  static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
4983  {
4984  	struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA];
4985  	u32 hash = wqattrs_hash(attrs);
4986  	struct worker_pool *pool;
4987  	int pod, node = NUMA_NO_NODE;
4988  
4989  	lockdep_assert_held(&wq_pool_mutex);
4990  
4991  	/* do we already have a matching pool? */
4992  	hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
4993  		if (wqattrs_equal(pool->attrs, attrs)) {
4994  			pool->refcnt++;
4995  			return pool;
4996  		}
4997  	}
4998  
4999  	/* If __pod_cpumask is contained inside a NUMA pod, that's our node */
5000  	for (pod = 0; pod < pt->nr_pods; pod++) {
5001  		if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {
5002  			node = pt->pod_node[pod];
5003  			break;
5004  		}
5005  	}
5006  
5007  	/* nope, create a new one */
5008  	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
5009  	if (!pool || init_worker_pool(pool) < 0)
5010  		goto fail;
5011  
5012  	pool->node = node;
5013  	copy_workqueue_attrs(pool->attrs, attrs);
5014  	wqattrs_clear_for_pool(pool->attrs);
5015  
5016  	if (worker_pool_assign_id(pool) < 0)
5017  		goto fail;
5018  
5019  	/* create and start the initial worker */
5020  	if (wq_online && !create_worker(pool))
5021  		goto fail;
5022  
5023  	/* install */
5024  	hash_add(unbound_pool_hash, &pool->hash_node, hash);
5025  
5026  	return pool;
5027  fail:
5028  	if (pool)
5029  		put_unbound_pool(pool);
5030  	return NULL;
5031  }
5032  
5033  /*
5034   * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
5035   * refcnt and needs to be destroyed.
5036   */
5037  static void pwq_release_workfn(struct kthread_work *work)
5038  {
5039  	struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
5040  						  release_work);
5041  	struct workqueue_struct *wq = pwq->wq;
5042  	struct worker_pool *pool = pwq->pool;
5043  	bool is_last = false;
5044  
5045  	/*
5046  	 * When @pwq is not linked, it doesn't hold any reference to the
5047  	 * @wq, and @wq is invalid to access.
5048  	 */
5049  	if (!list_empty(&pwq->pwqs_node)) {
5050  		mutex_lock(&wq->mutex);
5051  		list_del_rcu(&pwq->pwqs_node);
5052  		is_last = list_empty(&wq->pwqs);
5053  
5054  		/*
5055  		 * For ordered workqueue with a plugged dfl_pwq, restart it now.
5056  		 */
5057  		if (!is_last && (wq->flags & __WQ_ORDERED))
5058  			unplug_oldest_pwq(wq);
5059  
5060  		mutex_unlock(&wq->mutex);
5061  	}
5062  
5063  	if (wq->flags & WQ_UNBOUND) {
5064  		mutex_lock(&wq_pool_mutex);
5065  		put_unbound_pool(pool);
5066  		mutex_unlock(&wq_pool_mutex);
5067  	}
5068  
5069  	if (!list_empty(&pwq->pending_node)) {
5070  		struct wq_node_nr_active *nna =
5071  			wq_node_nr_active(pwq->wq, pwq->pool->node);
5072  
5073  		raw_spin_lock_irq(&nna->lock);
5074  		list_del_init(&pwq->pending_node);
5075  		raw_spin_unlock_irq(&nna->lock);
5076  	}
5077  
5078  	kfree_rcu(pwq, rcu);
5079  
5080  	/*
5081  	 * If we're the last pwq going away, @wq is already dead and no one
5082  	 * is gonna access it anymore.  Schedule RCU free.
5083  	 */
5084  	if (is_last) {
5085  		wq_unregister_lockdep(wq);
5086  		call_rcu(&wq->rcu, rcu_free_wq);
5087  	}
5088  }
5089  
5090  /* initialize newly allocated @pwq which is associated with @wq and @pool */
5091  static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
5092  		     struct worker_pool *pool)
5093  {
5094  	BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK);
5095  
5096  	memset(pwq, 0, sizeof(*pwq));
5097  
5098  	pwq->pool = pool;
5099  	pwq->wq = wq;
5100  	pwq->flush_color = -1;
5101  	pwq->refcnt = 1;
5102  	INIT_LIST_HEAD(&pwq->inactive_works);
5103  	INIT_LIST_HEAD(&pwq->pending_node);
5104  	INIT_LIST_HEAD(&pwq->pwqs_node);
5105  	INIT_LIST_HEAD(&pwq->mayday_node);
5106  	kthread_init_work(&pwq->release_work, pwq_release_workfn);
5107  }
5108  
5109  /* sync @pwq with the current state of its associated wq and link it */
5110  static void link_pwq(struct pool_workqueue *pwq)
5111  {
5112  	struct workqueue_struct *wq = pwq->wq;
5113  
5114  	lockdep_assert_held(&wq->mutex);
5115  
5116  	/* may be called multiple times, ignore if already linked */
5117  	if (!list_empty(&pwq->pwqs_node))
5118  		return;
5119  
5120  	/* set the matching work_color */
5121  	pwq->work_color = wq->work_color;
5122  
5123  	/* link in @pwq */
5124  	list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs);
5125  }
5126  
5127  /* obtain a pool matching @attr and create a pwq associating the pool and @wq */
5128  static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
5129  					const struct workqueue_attrs *attrs)
5130  {
5131  	struct worker_pool *pool;
5132  	struct pool_workqueue *pwq;
5133  
5134  	lockdep_assert_held(&wq_pool_mutex);
5135  
5136  	pool = get_unbound_pool(attrs);
5137  	if (!pool)
5138  		return NULL;
5139  
5140  	pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
5141  	if (!pwq) {
5142  		put_unbound_pool(pool);
5143  		return NULL;
5144  	}
5145  
5146  	init_pwq(pwq, wq, pool);
5147  	return pwq;
5148  }
5149  
5150  static void apply_wqattrs_lock(void)
5151  {
5152  	mutex_lock(&wq_pool_mutex);
5153  }
5154  
5155  static void apply_wqattrs_unlock(void)
5156  {
5157  	mutex_unlock(&wq_pool_mutex);
5158  }
5159  
5160  /**
5161   * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
5162   * @attrs: the wq_attrs of the default pwq of the target workqueue
5163   * @cpu: the target CPU
5164   *
5165   * Calculate the cpumask a workqueue with @attrs should use on the pod
5166   * containing @cpu.  The result is stored in @attrs->__pod_cpumask.
5167   *
5168   * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
5169   * and the pod has online CPUs requested by @attrs, the resulting cpumask is
5170   * the intersection of the pod's possible CPUs and @attrs->cpumask.
5171   *
5172   * The caller is responsible for ensuring that the pod's cpumask stays stable.
5173   */
5174  static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu)
5175  {
5176  	const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
5177  	int pod = pt->cpu_pod[cpu];
5178  
5179  	/* calculate possible CPUs in @pod that @attrs wants */
5180  	cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
5181  	/* does @pod have any online CPUs @attrs wants? */
5182  	if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) {
5183  		cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
5184  		return;
5185  	}
5186  }
5187  
5188  /* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
5189  static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
5190  					int cpu, struct pool_workqueue *pwq)
5191  {
5192  	struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);
5193  	struct pool_workqueue *old_pwq;
5194  
5195  	lockdep_assert_held(&wq_pool_mutex);
5196  	lockdep_assert_held(&wq->mutex);
5197  
5198  	/* link_pwq() can handle duplicate calls */
5199  	link_pwq(pwq);
5200  
5201  	old_pwq = rcu_access_pointer(*slot);
5202  	rcu_assign_pointer(*slot, pwq);
5203  	return old_pwq;
5204  }
5205  
5206  /* context to store the prepared attrs & pwqs before applying */
5207  struct apply_wqattrs_ctx {
5208  	struct workqueue_struct	*wq;		/* target workqueue */
5209  	struct workqueue_attrs	*attrs;		/* attrs to apply */
5210  	struct list_head	list;		/* queued for batching commit */
5211  	struct pool_workqueue	*dfl_pwq;
5212  	struct pool_workqueue	*pwq_tbl[];
5213  };
5214  
5215  /* free the resources after success or abort */
5216  static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
5217  {
5218  	if (ctx) {
5219  		int cpu;
5220  
5221  		for_each_possible_cpu(cpu)
5222  			put_pwq_unlocked(ctx->pwq_tbl[cpu]);
5223  		put_pwq_unlocked(ctx->dfl_pwq);
5224  
5225  		free_workqueue_attrs(ctx->attrs);
5226  
5227  		kfree(ctx);
5228  	}
5229  }
5230  
5231  /* allocate the attrs and pwqs for later installation */
5232  static struct apply_wqattrs_ctx *
5233  apply_wqattrs_prepare(struct workqueue_struct *wq,
5234  		      const struct workqueue_attrs *attrs,
5235  		      const cpumask_var_t unbound_cpumask)
5236  {
5237  	struct apply_wqattrs_ctx *ctx;
5238  	struct workqueue_attrs *new_attrs;
5239  	int cpu;
5240  
5241  	lockdep_assert_held(&wq_pool_mutex);
5242  
5243  	if (WARN_ON(attrs->affn_scope < 0 ||
5244  		    attrs->affn_scope >= WQ_AFFN_NR_TYPES))
5245  		return ERR_PTR(-EINVAL);
5246  
5247  	ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);
5248  
5249  	new_attrs = alloc_workqueue_attrs();
5250  	if (!ctx || !new_attrs)
5251  		goto out_free;
5252  
5253  	/*
5254  	 * If something goes wrong during CPU up/down, we'll fall back to
5255  	 * the default pwq covering whole @attrs->cpumask.  Always create
5256  	 * it even if we don't use it immediately.
5257  	 */
5258  	copy_workqueue_attrs(new_attrs, attrs);
5259  	wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
5260  	cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
5261  	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
5262  	if (!ctx->dfl_pwq)
5263  		goto out_free;
5264  
5265  	for_each_possible_cpu(cpu) {
5266  		if (new_attrs->ordered) {
5267  			ctx->dfl_pwq->refcnt++;
5268  			ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
5269  		} else {
5270  			wq_calc_pod_cpumask(new_attrs, cpu);
5271  			ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
5272  			if (!ctx->pwq_tbl[cpu])
5273  				goto out_free;
5274  		}
5275  	}
5276  
5277  	/* save the user configured attrs and sanitize it. */
5278  	copy_workqueue_attrs(new_attrs, attrs);
5279  	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
5280  	cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
5281  	ctx->attrs = new_attrs;
5282  
5283  	/*
5284  	 * For initialized ordered workqueues, there should only be one pwq
5285  	 * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution
5286  	 * of newly queued work items until execution of older work items in
5287  	 * the old pwq's have completed.
5288  	 */
5289  	if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))
5290  		ctx->dfl_pwq->plugged = true;
5291  
5292  	ctx->wq = wq;
5293  	return ctx;
5294  
5295  out_free:
5296  	free_workqueue_attrs(new_attrs);
5297  	apply_wqattrs_cleanup(ctx);
5298  	return ERR_PTR(-ENOMEM);
5299  }
5300  
5301  /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
5302  static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
5303  {
5304  	int cpu;
5305  
5306  	/* all pwqs have been created successfully, let's install'em */
5307  	mutex_lock(&ctx->wq->mutex);
5308  
5309  	copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
5310  
5311  	/* save the previous pwqs and install the new ones */
5312  	for_each_possible_cpu(cpu)
5313  		ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
5314  							ctx->pwq_tbl[cpu]);
5315  	ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);
5316  
5317  	/* update node_nr_active->max */
5318  	wq_update_node_max_active(ctx->wq, -1);
5319  
5320  	/* rescuer needs to respect wq cpumask changes */
5321  	if (ctx->wq->rescuer)
5322  		set_cpus_allowed_ptr(ctx->wq->rescuer->task,
5323  				     unbound_effective_cpumask(ctx->wq));
5324  
5325  	mutex_unlock(&ctx->wq->mutex);
5326  }
5327  
5328  static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
5329  					const struct workqueue_attrs *attrs)
5330  {
5331  	struct apply_wqattrs_ctx *ctx;
5332  
5333  	/* only unbound workqueues can change attributes */
5334  	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
5335  		return -EINVAL;
5336  
5337  	ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
5338  	if (IS_ERR(ctx))
5339  		return PTR_ERR(ctx);
5340  
5341  	/* the ctx has been prepared successfully, let's commit it */
5342  	apply_wqattrs_commit(ctx);
5343  	apply_wqattrs_cleanup(ctx);
5344  
5345  	return 0;
5346  }
5347  
5348  /**
5349   * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
5350   * @wq: the target workqueue
5351   * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
5352   *
5353   * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps
5354   * a separate pwq to each CPU pod with possible CPUs in @attrs->cpumask so that
5355   * work items are affine to the pod they were issued on. Older pwqs are released as
5356   * in-flight work items finish. Note that a work item which repeatedly requeues
5357   * itself back-to-back will stay on its current pwq.
5358   *
5359   * Performs GFP_KERNEL allocations.
5360   *
5361   * Return: 0 on success and -errno on failure.
5362   */
5363  int apply_workqueue_attrs(struct workqueue_struct *wq,
5364  			  const struct workqueue_attrs *attrs)
5365  {
5366  	int ret;
5367  
5368  	mutex_lock(&wq_pool_mutex);
5369  	ret = apply_workqueue_attrs_locked(wq, attrs);
5370  	mutex_unlock(&wq_pool_mutex);
5371  
5372  	return ret;
5373  }
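/*
 * Illustrative sketch: pin an unbound workqueue to one NUMA node with strict
 * per-pod affinity.  my_unbound_wq is hypothetical and must have been
 * allocated with WQ_UNBOUND.
 *
 *	struct workqueue_attrs *attrs = alloc_workqueue_attrs();
 *	int ret = -ENOMEM;
 *
 *	if (attrs) {
 *		attrs->affn_scope = WQ_AFFN_NUMA;
 *		attrs->affn_strict = true;
 *		cpumask_copy(attrs->cpumask, cpumask_of_node(1));
 *		ret = apply_workqueue_attrs(my_unbound_wq, attrs);
 *		free_workqueue_attrs(attrs);
 *	}
 */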
5374  
5375  /**
5376   * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug
5377   * @wq: the target workqueue
5378   * @cpu: the CPU to update the pwq slot for
5379   *
5380   * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
5381   * %CPU_DOWN_FAILED.  @cpu is in the same pod as the CPU being hot[un]plugged.
5383   *
5384   * If pod affinity can't be adjusted due to memory allocation failure, it falls
5385   * back to @wq->dfl_pwq which may not be optimal but is always correct.
5386   *
5387   * Note that when the last allowed CPU of a pod goes offline for a workqueue
5388   * with a cpumask spanning multiple pods, the workers which were already
5389   * executing the work items for the workqueue will lose their CPU affinity and
5390   * may execute on any CPU. This is similar to how per-cpu workqueues behave on
5391   * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
5392   * responsibility to flush the work item from CPU_DOWN_PREPARE.
5393   */
5394  static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu)
5395  {
5396  	struct pool_workqueue *old_pwq = NULL, *pwq;
5397  	struct workqueue_attrs *target_attrs;
5398  
5399  	lockdep_assert_held(&wq_pool_mutex);
5400  
5401  	if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered)
5402  		return;
5403  
5404  	/*
5405  	 * We don't wanna alloc/free wq_attrs for each wq for each CPU.
5406  	 * Let's use a preallocated one.  The following buf is protected by
5407  	 * CPU hotplug exclusion.
5408  	 */
5409  	target_attrs = unbound_wq_update_pwq_attrs_buf;
5410  
5411  	copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
5412  	wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);
5413  
5414  	/* nothing to do if the target cpumask matches the current pwq */
5415  	wq_calc_pod_cpumask(target_attrs, cpu);
5416  	if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
5417  		return;
5418  
5419  	/* create a new pwq */
5420  	pwq = alloc_unbound_pwq(wq, target_attrs);
5421  	if (!pwq) {
5422  		pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
5423  			wq->name);
5424  		goto use_dfl_pwq;
5425  	}
5426  
5427  	/* Install the new pwq. */
5428  	mutex_lock(&wq->mutex);
5429  	old_pwq = install_unbound_pwq(wq, cpu, pwq);
5430  	goto out_unlock;
5431  
5432  use_dfl_pwq:
5433  	mutex_lock(&wq->mutex);
5434  	pwq = unbound_pwq(wq, -1);
5435  	raw_spin_lock_irq(&pwq->pool->lock);
5436  	get_pwq(pwq);
5437  	raw_spin_unlock_irq(&pwq->pool->lock);
5438  	old_pwq = install_unbound_pwq(wq, cpu, pwq);
5439  out_unlock:
5440  	mutex_unlock(&wq->mutex);
5441  	put_pwq_unlocked(old_pwq);
5442  }
5443  
5444  static int alloc_and_link_pwqs(struct workqueue_struct *wq)
5445  {
5446  	bool highpri = wq->flags & WQ_HIGHPRI;
5447  	int cpu, ret;
5448  
5449  	lockdep_assert_held(&wq_pool_mutex);
5450  
5451  	wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
5452  	if (!wq->cpu_pwq)
5453  		goto enomem;
5454  
5455  	if (!(wq->flags & WQ_UNBOUND)) {
5456  		struct worker_pool __percpu *pools;
5457  
5458  		if (wq->flags & WQ_BH)
5459  			pools = bh_worker_pools;
5460  		else
5461  			pools = cpu_worker_pools;
5462  
5463  		for_each_possible_cpu(cpu) {
5464  			struct pool_workqueue **pwq_p;
5465  			struct worker_pool *pool;
5466  
5467  			pool = &(per_cpu_ptr(pools, cpu)[highpri]);
5468  			pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);
5469  
5470  			*pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
5471  						       pool->node);
5472  			if (!*pwq_p)
5473  				goto enomem;
5474  
5475  			init_pwq(*pwq_p, wq, pool);
5476  
5477  			mutex_lock(&wq->mutex);
5478  			link_pwq(*pwq_p);
5479  			mutex_unlock(&wq->mutex);
5480  		}
5481  		return 0;
5482  	}
5483  
5484  	if (wq->flags & __WQ_ORDERED) {
5485  		struct pool_workqueue *dfl_pwq;
5486  
5487  		ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]);
5488  		/* there should only be a single pwq for the ordering guarantee */
5489  		dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
5490  		WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
5491  			      wq->pwqs.prev != &dfl_pwq->pwqs_node),
5492  		     "ordering guarantee broken for workqueue %s\n", wq->name);
5493  	} else {
5494  		ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
5495  	}
5496  
5497  	return ret;
5498  
5499  enomem:
5500  	if (wq->cpu_pwq) {
5501  		for_each_possible_cpu(cpu) {
5502  			struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
5503  
5504  			if (pwq)
5505  				kmem_cache_free(pwq_cache, pwq);
5506  		}
5507  		free_percpu(wq->cpu_pwq);
5508  		wq->cpu_pwq = NULL;
5509  	}
5510  	return -ENOMEM;
5511  }
5512  
5513  static int wq_clamp_max_active(int max_active, unsigned int flags,
5514  			       const char *name)
5515  {
5516  	if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
5517  		pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
5518  			max_active, name, 1, WQ_MAX_ACTIVE);
5519  
5520  	return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
5521  }
5522  
5523  /*
5524   * Workqueues which may be used during memory reclaim should have a rescuer
5525   * to guarantee forward progress.
5526   */
5527  static int init_rescuer(struct workqueue_struct *wq)
5528  {
5529  	struct worker *rescuer;
5530  	char id_buf[WORKER_ID_LEN];
5531  	int ret;
5532  
5533  	lockdep_assert_held(&wq_pool_mutex);
5534  
5535  	if (!(wq->flags & WQ_MEM_RECLAIM))
5536  		return 0;
5537  
5538  	rescuer = alloc_worker(NUMA_NO_NODE);
5539  	if (!rescuer) {
5540  		pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
5541  		       wq->name);
5542  		return -ENOMEM;
5543  	}
5544  
5545  	rescuer->rescue_wq = wq;
5546  	format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL);
5547  
5548  	rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf);
5549  	if (IS_ERR(rescuer->task)) {
5550  		ret = PTR_ERR(rescuer->task);
5551  		pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",
5552  		       wq->name, ERR_PTR(ret));
5553  		kfree(rescuer);
5554  		return ret;
5555  	}
5556  
5557  	wq->rescuer = rescuer;
5558  	if (wq->flags & WQ_UNBOUND)
5559  		kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
5560  	else
5561  		kthread_bind_mask(rescuer->task, cpu_possible_mask);
5562  	wake_up_process(rescuer->task);
5563  
5564  	return 0;
5565  }
5566  
5567  /**
5568   * wq_adjust_max_active - update a wq's max_active to the current setting
5569   * @wq: target workqueue
5570   *
5571   * If @wq isn't freezing, set @wq->max_active to the saved_max_active and
5572   * activate inactive work items accordingly. If @wq is freezing, clear
5573   * @wq->max_active to zero.
5574   */
5575  static void wq_adjust_max_active(struct workqueue_struct *wq)
5576  {
5577  	bool activated;
5578  	int new_max, new_min;
5579  
5580  	lockdep_assert_held(&wq->mutex);
5581  
5582  	if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
5583  		new_max = 0;
5584  		new_min = 0;
5585  	} else {
5586  		new_max = wq->saved_max_active;
5587  		new_min = wq->saved_min_active;
5588  	}
5589  
5590  	if (wq->max_active == new_max && wq->min_active == new_min)
5591  		return;
5592  
5593  	/*
5594  	 * Update @wq->max/min_active and then kick inactive work items if more
5595  	 * active work items are allowed. This doesn't break work item ordering
5596  	 * because new work items are always queued behind existing inactive
5597  	 * work items if there are any.
5598  	 */
5599  	WRITE_ONCE(wq->max_active, new_max);
5600  	WRITE_ONCE(wq->min_active, new_min);
5601  
5602  	if (wq->flags & WQ_UNBOUND)
5603  		wq_update_node_max_active(wq, -1);
5604  
5605  	if (new_max == 0)
5606  		return;
5607  
5608  	/*
5609  	 * Round-robin through pwq's activating the first inactive work item
5610  	 * until max_active is filled.
5611  	 */
5612  	do {
5613  		struct pool_workqueue *pwq;
5614  
5615  		activated = false;
5616  		for_each_pwq(pwq, wq) {
5617  			unsigned long irq_flags;
5618  
5619  			/* can be called during early boot w/ irq disabled */
5620  			raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
5621  			if (pwq_activate_first_inactive(pwq, true)) {
5622  				activated = true;
5623  				kick_pool(pwq->pool);
5624  			}
5625  			raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
5626  		}
5627  	} while (activated);
5628  }
5629  
5630  static struct workqueue_struct *__alloc_workqueue(const char *fmt,
5631  						  unsigned int flags,
5632  						  int max_active, va_list args)
5633  {
5634  	struct workqueue_struct *wq;
5635  	size_t wq_size;
5636  	int name_len;
5637  
5638  	if (flags & WQ_BH) {
5639  		if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS))
5640  			return NULL;
5641  		if (WARN_ON_ONCE(max_active))
5642  			return NULL;
5643  	}
5644  
5645  	/* see the comment above the definition of WQ_POWER_EFFICIENT */
5646  	if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
5647  		flags |= WQ_UNBOUND;
5648  
5649  	/* allocate wq and format name */
5650  	if (flags & WQ_UNBOUND)
5651  		wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
5652  	else
5653  		wq_size = sizeof(*wq);
5654  
5655  	wq = kzalloc(wq_size, GFP_KERNEL);
5656  	if (!wq)
5657  		return NULL;
5658  
5659  	if (flags & WQ_UNBOUND) {
5660  		wq->unbound_attrs = alloc_workqueue_attrs();
5661  		if (!wq->unbound_attrs)
5662  			goto err_free_wq;
5663  	}
5664  
5665  	name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
5666  
5667  	if (name_len >= WQ_NAME_LEN)
5668  		pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
5669  			     wq->name);
5670  
5671  	if (flags & WQ_BH) {
5672  		/*
5673  		 * BH workqueues always share a single execution context per CPU
5674  		 * and don't impose any max_active limit.
5675  		 */
5676  		max_active = INT_MAX;
5677  	} else {
5678  		max_active = max_active ?: WQ_DFL_ACTIVE;
5679  		max_active = wq_clamp_max_active(max_active, flags, wq->name);
5680  	}
5681  
5682  	/* init wq */
5683  	wq->flags = flags;
5684  	wq->max_active = max_active;
5685  	wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
5686  	wq->saved_max_active = wq->max_active;
5687  	wq->saved_min_active = wq->min_active;
5688  	mutex_init(&wq->mutex);
5689  	atomic_set(&wq->nr_pwqs_to_flush, 0);
5690  	INIT_LIST_HEAD(&wq->pwqs);
5691  	INIT_LIST_HEAD(&wq->flusher_queue);
5692  	INIT_LIST_HEAD(&wq->flusher_overflow);
5693  	INIT_LIST_HEAD(&wq->maydays);
5694  
5695  	INIT_LIST_HEAD(&wq->list);
5696  
5697  	if (flags & WQ_UNBOUND) {
5698  		if (alloc_node_nr_active(wq->node_nr_active) < 0)
5699  			goto err_free_wq;
5700  	}
5701  
5702  	/*
5703  	 * wq_pool_mutex protects the workqueues list, allocations of PWQs,
5704  	 * and the global freeze state.
5705  	 */
5706  	apply_wqattrs_lock();
5707  
5708  	if (alloc_and_link_pwqs(wq) < 0)
5709  		goto err_unlock_free_node_nr_active;
5710  
5711  	mutex_lock(&wq->mutex);
5712  	wq_adjust_max_active(wq);
5713  	mutex_unlock(&wq->mutex);
5714  
5715  	list_add_tail_rcu(&wq->list, &workqueues);
5716  
5717  	if (wq_online && init_rescuer(wq) < 0)
5718  		goto err_unlock_destroy;
5719  
5720  	apply_wqattrs_unlock();
5721  
5722  	if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
5723  		goto err_destroy;
5724  
5725  	return wq;
5726  
5727  err_unlock_free_node_nr_active:
5728  	apply_wqattrs_unlock();
5729  	/*
5730  	 * Failed alloc_and_link_pwqs() may leave pending pwq->release_work;
5731  	 * flushing the pwq_release_worker ensures that pwq_release_workfn()
5732  	 * completes before calling kfree(wq).
5733  	 */
5734  	if (wq->flags & WQ_UNBOUND) {
5735  		kthread_flush_worker(pwq_release_worker);
5736  		free_node_nr_active(wq->node_nr_active);
5737  	}
5738  err_free_wq:
5739  	free_workqueue_attrs(wq->unbound_attrs);
5740  	kfree(wq);
5741  	return NULL;
5742  err_unlock_destroy:
5743  	apply_wqattrs_unlock();
5744  err_destroy:
5745  	destroy_workqueue(wq);
5746  	return NULL;
5747  }
5748  
5749  __printf(1, 4)
5750  struct workqueue_struct *alloc_workqueue(const char *fmt,
5751  					 unsigned int flags,
5752  					 int max_active, ...)
5753  {
5754  	struct workqueue_struct *wq;
5755  	va_list args;
5756  
5757  	va_start(args, max_active);
5758  	wq = __alloc_workqueue(fmt, flags, max_active, args);
5759  	va_end(args);
5760  	if (!wq)
5761  		return NULL;
5762  
5763  	wq_init_lockdep(wq);
5764  
5765  	return wq;
5766  }
5767  EXPORT_SYMBOL_GPL(alloc_workqueue);
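/*
 * Usage sketch: typical allocations.  The names are illustrative.
 *
 *	// per-cpu, default max_active, usable from memory reclaim paths
 *	wq = alloc_workqueue("my_reclaim_wq", WQ_MEM_RECLAIM, 0);
 *
 *	// unbound and freezable, at most 8 concurrent work items
 *	wq = alloc_workqueue("my_unbound_wq", WQ_UNBOUND | WQ_FREEZABLE, 8);
 *
 *	// BH workqueue executing in softirq context, max_active must be 0
 *	wq = alloc_workqueue("my_bh_wq", WQ_BH, 0);
 *
 *	if (!wq)
 *		return -ENOMEM;
 */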
5768  
5769  #ifdef CONFIG_LOCKDEP
5770  __printf(1, 5)
5771  struct workqueue_struct *
5772  alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags,
5773  			    int max_active, struct lockdep_map *lockdep_map, ...)
5774  {
5775  	struct workqueue_struct *wq;
5776  	va_list args;
5777  
5778  	va_start(args, lockdep_map);
5779  	wq = __alloc_workqueue(fmt, flags, max_active, args);
5780  	va_end(args);
5781  	if (!wq)
5782  		return NULL;
5783  
5784  	wq->lockdep_map = lockdep_map;
5785  
5786  	return wq;
5787  }
5788  EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map);
5789  #endif
5790  
5791  static bool pwq_busy(struct pool_workqueue *pwq)
5792  {
5793  	int i;
5794  
5795  	for (i = 0; i < WORK_NR_COLORS; i++)
5796  		if (pwq->nr_in_flight[i])
5797  			return true;
5798  
5799  	if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1))
5800  		return true;
5801  	if (!pwq_is_empty(pwq))
5802  		return true;
5803  
5804  	return false;
5805  }
5806  
5807  /**
5808   * destroy_workqueue - safely terminate a workqueue
5809   * @wq: target workqueue
5810   *
5811   * Safely destroy a workqueue. All work currently pending will be done first.
5812   */
5813  void destroy_workqueue(struct workqueue_struct *wq)
5814  {
5815  	struct pool_workqueue *pwq;
5816  	int cpu;
5817  
5818  	/*
5819  	 * Remove it from sysfs first so that sanity check failure doesn't
5820  	 * lead to sysfs name conflicts.
5821  	 */
5822  	workqueue_sysfs_unregister(wq);
5823  
5824  	/* mark that destruction of the workqueue is in progress */
5825  	mutex_lock(&wq->mutex);
5826  	wq->flags |= __WQ_DESTROYING;
5827  	mutex_unlock(&wq->mutex);
5828  
5829  	/* drain it before proceeding with destruction */
5830  	drain_workqueue(wq);
5831  
5832  	/* kill rescuer, if sanity checks fail, leave it w/o rescuer */
5833  	if (wq->rescuer) {
5834  		struct worker *rescuer = wq->rescuer;
5835  
5836  		/* this prevents new queueing */
5837  		raw_spin_lock_irq(&wq_mayday_lock);
5838  		wq->rescuer = NULL;
5839  		raw_spin_unlock_irq(&wq_mayday_lock);
5840  
5841  		/* rescuer will empty maydays list before exiting */
5842  		kthread_stop(rescuer->task);
5843  		kfree(rescuer);
5844  	}
5845  
5846  	/*
5847  	 * Sanity checks - grab all the locks so that we wait for all
5848  	 * in-flight operations which may do put_pwq().
5849  	 */
5850  	mutex_lock(&wq_pool_mutex);
5851  	mutex_lock(&wq->mutex);
5852  	for_each_pwq(pwq, wq) {
5853  		raw_spin_lock_irq(&pwq->pool->lock);
5854  		if (WARN_ON(pwq_busy(pwq))) {
5855  			pr_warn("%s: %s has the following busy pwq\n",
5856  				__func__, wq->name);
5857  			show_pwq(pwq);
5858  			raw_spin_unlock_irq(&pwq->pool->lock);
5859  			mutex_unlock(&wq->mutex);
5860  			mutex_unlock(&wq_pool_mutex);
5861  			show_one_workqueue(wq);
5862  			return;
5863  		}
5864  		raw_spin_unlock_irq(&pwq->pool->lock);
5865  	}
5866  	mutex_unlock(&wq->mutex);
5867  
5868  	/*
5869  	 * wq list is used to freeze wq, remove from list after
5870  	 * flushing is complete in case freeze races us.
5871  	 */
5872  	list_del_rcu(&wq->list);
5873  	mutex_unlock(&wq_pool_mutex);
5874  
5875  	/*
5876  	 * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq
5877  	 * to put the base refs. @wq will be auto-destroyed from the last
5878  	 * pwq_put. RCU read lock prevents @wq from going away from under us.
5879  	 */
5880  	rcu_read_lock();
5881  
5882  	for_each_possible_cpu(cpu) {
5883  		put_pwq_unlocked(unbound_pwq(wq, cpu));
5884  		RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
5885  	}
5886  
5887  	put_pwq_unlocked(unbound_pwq(wq, -1));
5888  	RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);
5889  
5890  	rcu_read_unlock();
5891  }
5892  EXPORT_SYMBOL_GPL(destroy_workqueue);
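/*
 * Usage sketch: the owner must stop queueing new work before teardown; work
 * still pending at that point is executed by drain_workqueue() inside
 * destroy_workqueue().  my_dwork and my_wq are hypothetical.
 *
 *	cancel_delayed_work_sync(&my_dwork);	// stop the self-requeueing timer
 *	destroy_workqueue(my_wq);
 *	my_wq = NULL;
 */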
5893  
5894  /**
5895   * workqueue_set_max_active - adjust max_active of a workqueue
5896   * @wq: target workqueue
5897   * @max_active: new max_active value.
5898   *
5899   * Set max_active of @wq to @max_active. See the alloc_workqueue() function
5900   * comment.
5901   *
5902   * CONTEXT:
5903   * Don't call from IRQ context.
5904   */
5905  void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
5906  {
5907  	/* max_active doesn't mean anything for BH workqueues */
5908  	if (WARN_ON(wq->flags & WQ_BH))
5909  		return;
5910  	/* disallow meddling with max_active for ordered workqueues */
5911  	if (WARN_ON(wq->flags & __WQ_ORDERED))
5912  		return;
5913  
5914  	max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
5915  
5916  	mutex_lock(&wq->mutex);
5917  
5918  	wq->saved_max_active = max_active;
5919  	if (wq->flags & WQ_UNBOUND)
5920  		wq->saved_min_active = min(wq->saved_min_active, max_active);
5921  
5922  	wq_adjust_max_active(wq);
5923  
5924  	mutex_unlock(&wq->mutex);
5925  }
5926  EXPORT_SYMBOL_GPL(workqueue_set_max_active);
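/*
 * Usage sketch: scale concurrency with the machine size.  my_wq is
 * hypothetical and must be neither ordered nor a BH workqueue; the value is
 * clamped to [1, WQ_MAX_ACTIVE].
 *
 *	workqueue_set_max_active(my_wq, 4 * num_online_cpus());
 */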
5927  
5928  /**
5929   * workqueue_set_min_active - adjust min_active of an unbound workqueue
5930   * @wq: target unbound workqueue
5931   * @min_active: new min_active value
5932   *
5933   * Set min_active of an unbound workqueue. Unlike other types of workqueues, an
5934   * unbound workqueue is not guaranteed to be able to process max_active
5935   * interdependent work items. Instead, an unbound workqueue is guaranteed to be
5936   * able to process min_active number of interdependent work items which is
5937   * %WQ_DFL_MIN_ACTIVE by default.
5938   *
5939   * Use this function to adjust the min_active value between 0 and the current
5940   * max_active.
5941   */
5942  void workqueue_set_min_active(struct workqueue_struct *wq, int min_active)
5943  {
5944  	/* min_active is only meaningful for non-ordered unbound workqueues */
5945  	if (WARN_ON((wq->flags & (WQ_BH | WQ_UNBOUND | __WQ_ORDERED)) !=
5946  		    WQ_UNBOUND))
5947  		return;
5948  
5949  	mutex_lock(&wq->mutex);
5950  	wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active);
5951  	wq_adjust_max_active(wq);
5952  	mutex_unlock(&wq->mutex);
5953  }
5954  
5955  /**
5956   * current_work - retrieve %current task's work struct
5957   *
5958   * Determine if %current task is a workqueue worker and what it's working on.
5959   * Useful to find out the context that the %current task is running in.
5960   *
5961   * Return: work struct if %current task is a workqueue worker, %NULL otherwise.
5962   */
5963  struct work_struct *current_work(void)
5964  {
5965  	struct worker *worker = current_wq_worker();
5966  
5967  	return worker ? worker->current_work : NULL;
5968  }
5969  EXPORT_SYMBOL(current_work);
5970  
5971  /**
5972   * current_is_workqueue_rescuer - is %current workqueue rescuer?
5973   *
5974   * Determine whether %current is a workqueue rescuer.  Can be used from
5975   * work functions to determine whether it's being run off the rescuer task.
5976   *
5977   * Return: %true if %current is a workqueue rescuer. %false otherwise.
5978   */
5979  bool current_is_workqueue_rescuer(void)
5980  {
5981  	struct worker *worker = current_wq_worker();
5982  
5983  	return worker && worker->rescue_wq;
5984  }
5985  
5986  /**
5987   * workqueue_congested - test whether a workqueue is congested
5988   * @cpu: CPU in question
5989   * @wq: target workqueue
5990   *
5991   * Test whether @wq's cpu workqueue for @cpu is congested.  There is
5992   * no synchronization around this function and the test result is
5993   * unreliable and only useful as advisory hints or for debugging.
5994   *
5995   * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
5996   *
5997   * With the exception of ordered workqueues, all workqueues have per-cpu
5998   * pool_workqueues, each with its own congested state. A workqueue being
5999   * congested on one CPU doesn't mean that the workqueue is congested on any
6000   * other CPUs.
6001   *
6002   * Return:
6003   * %true if congested, %false otherwise.
6004   */
6005  bool workqueue_congested(int cpu, struct workqueue_struct *wq)
6006  {
6007  	struct pool_workqueue *pwq;
6008  	bool ret;
6009  
6010  	rcu_read_lock();
6011  	preempt_disable();
6012  
6013  	if (cpu == WORK_CPU_UNBOUND)
6014  		cpu = smp_processor_id();
6015  
6016  	pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
6017  	ret = !list_empty(&pwq->inactive_works);
6018  
6019  	preempt_enable();
6020  	rcu_read_unlock();
6021  
6022  	return ret;
6023  }
6024  EXPORT_SYMBOL_GPL(workqueue_congested);
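/*
 * Usage sketch: an advisory back-off on a submission path.  The result is
 * racy, so use it only as a hint, never for correctness.  my_wq and my_work
 * are hypothetical.
 *
 *	if (workqueue_congested(WORK_CPU_UNBOUND, my_wq))
 *		return -EBUSY;	// hypothetical: shed low-priority requests
 *	queue_work(my_wq, &my_work);
 */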
6025  
6026  /**
6027   * work_busy - test whether a work is currently pending or running
6028   * @work: the work to be tested
6029   *
6030   * Test whether @work is currently pending or running.  There is no
6031   * synchronization around this function and the test result is
6032   * unreliable and only useful as advisory hints or for debugging.
6033   *
6034   * Return:
6035   * OR'd bitmask of WORK_BUSY_* bits.
6036   */
6037  unsigned int work_busy(struct work_struct *work)
6038  {
6039  	struct worker_pool *pool;
6040  	unsigned long irq_flags;
6041  	unsigned int ret = 0;
6042  
6043  	if (work_pending(work))
6044  		ret |= WORK_BUSY_PENDING;
6045  
6046  	rcu_read_lock();
6047  	pool = get_work_pool(work);
6048  	if (pool) {
6049  		raw_spin_lock_irqsave(&pool->lock, irq_flags);
6050  		if (find_worker_executing_work(pool, work))
6051  			ret |= WORK_BUSY_RUNNING;
6052  		raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
6053  	}
6054  	rcu_read_unlock();
6055  
6056  	return ret;
6057  }
6058  EXPORT_SYMBOL_GPL(work_busy);
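/*
 * Usage sketch: diagnostic code reporting the state of a work item.  The
 * snapshot may already be stale by the time it is printed.
 *
 *	unsigned int busy = work_busy(&my_work);
 *
 *	pr_info("my_work:%s%s\n",
 *		busy & WORK_BUSY_PENDING ? " pending" : "",
 *		busy & WORK_BUSY_RUNNING ? " running" : "");
 */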
6059  
6060  /**
6061   * set_worker_desc - set description for the current work item
6062   * @fmt: printf-style format string
6063   * @...: arguments for the format string
6064   *
6065   * This function can be called by a running work function to describe what
6066   * the work item is about.  If the worker task gets dumped, this
6067   * information will be printed out together to help debugging.  The
6068   * description can be at most WORKER_DESC_LEN including the trailing '\0'.
6069   */
6070  void set_worker_desc(const char *fmt, ...)
6071  {
6072  	struct worker *worker = current_wq_worker();
6073  	va_list args;
6074  
6075  	if (worker) {
6076  		va_start(args, fmt);
6077  		vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
6078  		va_end(args);
6079  	}
6080  }
6081  EXPORT_SYMBOL_GPL(set_worker_desc);
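/*
 * Usage sketch: a work function tagging itself with the object it operates
 * on so that a later worker dump is attributable.  struct my_ctx and its
 * fields are hypothetical.
 *
 *	static void my_flush_workfn(struct work_struct *work)
 *	{
 *		struct my_ctx *ctx = container_of(work, struct my_ctx, work);
 *
 *		set_worker_desc("flush-%lu", ctx->ino);
 *		// ... perform the flush ...
 *	}
 */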
6082  
6083  /**
6084   * print_worker_info - print out worker information and description
6085   * @log_lvl: the log level to use when printing
6086   * @task: target task
6087   *
6088   * If @task is a worker and currently executing a work item, print out the
6089   * name of the workqueue being serviced and worker description set with
6090   * set_worker_desc() by the currently executing work item.
6091   *
6092   * This function can be safely called on any task as long as the
6093   * task_struct itself is accessible.  While safe, this function isn't
6094   * synchronized and may print out mixed-up or garbled data of limited length.
6095   */
6096  void print_worker_info(const char *log_lvl, struct task_struct *task)
6097  {
6098  	work_func_t *fn = NULL;
6099  	char name[WQ_NAME_LEN] = { };
6100  	char desc[WORKER_DESC_LEN] = { };
6101  	struct pool_workqueue *pwq = NULL;
6102  	struct workqueue_struct *wq = NULL;
6103  	struct worker *worker;
6104  
6105  	if (!(task->flags & PF_WQ_WORKER))
6106  		return;
6107  
6108  	/*
6109  	 * This function is called without any synchronization and @task
6110  	 * could be in any state.  Be careful with dereferences.
6111  	 */
6112  	worker = kthread_probe_data(task);
6113  
6114  	/*
6115  	 * Carefully copy the associated workqueue's workfn, name and desc.
6116  	 * Keep the original last '\0' in case the original is garbage.
6117  	 */
6118  	copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));
6119  	copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
6120  	copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
6121  	copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);
6122  	copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);
6123  
6124  	if (fn || name[0] || desc[0]) {
6125  		printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
6126  		if (strcmp(name, desc))
6127  			pr_cont(" (%s)", desc);
6128  		pr_cont("\n");
6129  	}
6130  }
6131  
6132  static void pr_cont_pool_info(struct worker_pool *pool)
6133  {
6134  	pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
6135  	if (pool->node != NUMA_NO_NODE)
6136  		pr_cont(" node=%d", pool->node);
6137  	pr_cont(" flags=0x%x", pool->flags);
6138  	if (pool->flags & POOL_BH)
6139  		pr_cont(" bh%s",
6140  			pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
6141  	else
6142  		pr_cont(" nice=%d", pool->attrs->nice);
6143  }
6144  
6145  static void pr_cont_worker_id(struct worker *worker)
6146  {
6147  	struct worker_pool *pool = worker->pool;
6148  
6149  	if (pool->flags & POOL_BH)
6150  		pr_cont("bh%s",
6151  			pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
6152  	else
6153  		pr_cont("%d%s", task_pid_nr(worker->task),
6154  			worker->rescue_wq ? "(RESCUER)" : "");
6155  }
6156  
6157  struct pr_cont_work_struct {
6158  	bool comma;
6159  	work_func_t func;
6160  	long ctr;
6161  };
6162  
6163  static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
6164  {
6165  	if (!pcwsp->ctr)
6166  		goto out_record;
6167  	if (func == pcwsp->func) {
6168  		pcwsp->ctr++;
6169  		return;
6170  	}
6171  	if (pcwsp->ctr == 1)
6172  		pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
6173  	else
6174  		pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
6175  	pcwsp->ctr = 0;
6176  out_record:
6177  	if ((long)func == -1L)
6178  		return;
6179  	pcwsp->comma = comma;
6180  	pcwsp->func = func;
6181  	pcwsp->ctr = 1;
6182  }
6183  
6184  static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
6185  {
6186  	if (work->func == wq_barrier_func) {
6187  		struct wq_barrier *barr;
6188  
6189  		barr = container_of(work, struct wq_barrier, work);
6190  
6191  		pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
6192  		pr_cont("%s BAR(%d)", comma ? "," : "",
6193  			task_pid_nr(barr->task));
6194  	} else {
6195  		if (!comma)
6196  			pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
6197  		pr_cont_work_flush(comma, work->func, pcwsp);
6198  	}
6199  }
6200  
6201  static void show_pwq(struct pool_workqueue *pwq)
6202  {
6203  	struct pr_cont_work_struct pcws = { .ctr = 0, };
6204  	struct worker_pool *pool = pwq->pool;
6205  	struct work_struct *work;
6206  	struct worker *worker;
6207  	bool has_in_flight = false, has_pending = false;
6208  	int bkt;
6209  
6210  	pr_info("  pwq %d:", pool->id);
6211  	pr_cont_pool_info(pool);
6212  
6213  	pr_cont(" active=%d refcnt=%d%s\n",
6214  		pwq->nr_active, pwq->refcnt,
6215  		!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
6216  
6217  	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
6218  		if (worker->current_pwq == pwq) {
6219  			has_in_flight = true;
6220  			break;
6221  		}
6222  	}
6223  	if (has_in_flight) {
6224  		bool comma = false;
6225  
6226  		pr_info("    in-flight:");
6227  		hash_for_each(pool->busy_hash, bkt, worker, hentry) {
6228  			if (worker->current_pwq != pwq)
6229  				continue;
6230  
6231  			pr_cont(" %s", comma ? "," : "");
6232  			pr_cont_worker_id(worker);
6233  			pr_cont(":%ps", worker->current_func);
6234  			list_for_each_entry(work, &worker->scheduled, entry)
6235  				pr_cont_work(false, work, &pcws);
6236  			pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
6237  			comma = true;
6238  		}
6239  		pr_cont("\n");
6240  	}
6241  
6242  	list_for_each_entry(work, &pool->worklist, entry) {
6243  		if (get_work_pwq(work) == pwq) {
6244  			has_pending = true;
6245  			break;
6246  		}
6247  	}
6248  	if (has_pending) {
6249  		bool comma = false;
6250  
6251  		pr_info("    pending:");
6252  		list_for_each_entry(work, &pool->worklist, entry) {
6253  			if (get_work_pwq(work) != pwq)
6254  				continue;
6255  
6256  			pr_cont_work(comma, work, &pcws);
6257  			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
6258  		}
6259  		pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
6260  		pr_cont("\n");
6261  	}
6262  
6263  	if (!list_empty(&pwq->inactive_works)) {
6264  		bool comma = false;
6265  
6266  		pr_info("    inactive:");
6267  		list_for_each_entry(work, &pwq->inactive_works, entry) {
6268  			pr_cont_work(comma, work, &pcws);
6269  			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
6270  		}
6271  		pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
6272  		pr_cont("\n");
6273  	}
6274  }
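/*
 * A rough, hand-written illustration of the dump format produced above
 * (the pool ID, CPU, PID and function names are made up):
 *
 *   pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=2 refcnt=4
 *     in-flight: 1234:some_work_fn
 *     pending: some_work_fn, 3*other_work_fn
 *
 * Runs of identical pending work functions are compressed into the
 * "N*func" form by pr_cont_work_flush().
 */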
6275  
6276  /**
6277   * show_one_workqueue - dump state of specified workqueue
6278   * @wq: workqueue whose state will be printed
6279   */
6280  void show_one_workqueue(struct workqueue_struct *wq)
6281  {
6282  	struct pool_workqueue *pwq;
6283  	bool idle = true;
6284  	unsigned long irq_flags;
6285  
6286  	for_each_pwq(pwq, wq) {
6287  		if (!pwq_is_empty(pwq)) {
6288  			idle = false;
6289  			break;
6290  		}
6291  	}
6292  	if (idle) /* Nothing to print for idle workqueue */
6293  		return;
6294  
6295  	pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
6296  
6297  	for_each_pwq(pwq, wq) {
6298  		raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
6299  		if (!pwq_is_empty(pwq)) {
6300  			/*
6301  			 * Defer printing to avoid deadlocks in console
6302  			 * drivers that queue work while holding locks
6303  			 * also taken in their write paths.
6304  			 */
6305  			printk_deferred_enter();
6306  			show_pwq(pwq);
6307  			printk_deferred_exit();
6308  		}
6309  		raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
6310  		/*
6311  		 * We could be printing a lot from atomic context, e.g.
6312  		 * sysrq-t -> show_all_workqueues(). Avoid triggering
6313  		 * hard lockup.
6314  		 */
6315  		touch_nmi_watchdog();
6316  	}
6317  
6318  }
6319  
6320  /**
6321   * show_one_worker_pool - dump state of specified worker pool
6322   * @pool: worker pool whose state will be printed
6323   */
6324  static void show_one_worker_pool(struct worker_pool *pool)
6325  {
6326  	struct worker *worker;
6327  	bool first = true;
6328  	unsigned long irq_flags;
6329  	unsigned long hung = 0;
6330  
6331  	raw_spin_lock_irqsave(&pool->lock, irq_flags);
6332  	if (pool->nr_workers == pool->nr_idle)
6333  		goto next_pool;
6334  
6335  	/* How long the first pending work is waiting for a worker. */
6336  	if (!list_empty(&pool->worklist))
6337  		hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
6338  
6339  	/*
6340  	 * Defer printing to avoid deadlocks in console drivers that
6341  	 * queue work while holding locks also taken in their write
6342  	 * paths.
6343  	 */
6344  	printk_deferred_enter();
6345  	pr_info("pool %d:", pool->id);
6346  	pr_cont_pool_info(pool);
6347  	pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers);
6348  	if (pool->manager)
6349  		pr_cont(" manager: %d",
6350  			task_pid_nr(pool->manager->task));
6351  	list_for_each_entry(worker, &pool->idle_list, entry) {
6352  		pr_cont(" %s", first ? "idle: " : "");
6353  		pr_cont_worker_id(worker);
6354  		first = false;
6355  	}
6356  	pr_cont("\n");
6357  	printk_deferred_exit();
6358  next_pool:
6359  	raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
6360  	/*
6361  	 * We could be printing a lot from atomic context, e.g.
6362  	 * sysrq-t -> show_all_workqueues(). Avoid triggering
6363  	 * hard lockup.
6364  	 */
6365  	touch_nmi_watchdog();
6366  
6367  }
6368  
6369  /**
6370   * show_all_workqueues - dump workqueue state
6371   *
6372   * Called from a sysrq handler and prints out all busy workqueues and pools.
6373   */
6374  void show_all_workqueues(void)
6375  {
6376  	struct workqueue_struct *wq;
6377  	struct worker_pool *pool;
6378  	int pi;
6379  
6380  	rcu_read_lock();
6381  
6382  	pr_info("Showing busy workqueues and worker pools:\n");
6383  
6384  	list_for_each_entry_rcu(wq, &workqueues, list)
6385  		show_one_workqueue(wq);
6386  
6387  	for_each_pool(pool, pi)
6388  		show_one_worker_pool(pool);
6389  
6390  	rcu_read_unlock();
6391  }
6392  
6393  /**
6394   * show_freezable_workqueues - dump freezable workqueue state
6395   *
6396   * Called from try_to_freeze_tasks() and prints out all freezable workqueues
6397   * still busy.
6398   */
6399  void show_freezable_workqueues(void)
6400  {
6401  	struct workqueue_struct *wq;
6402  
6403  	rcu_read_lock();
6404  
6405  	pr_info("Showing freezable workqueues that are still busy:\n");
6406  
6407  	list_for_each_entry_rcu(wq, &workqueues, list) {
6408  		if (!(wq->flags & WQ_FREEZABLE))
6409  			continue;
6410  		show_one_workqueue(wq);
6411  	}
6412  
6413  	rcu_read_unlock();
6414  }
6415  
6416  /* used to show worker information through /proc/PID/{comm,stat,status} */
6417  void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
6418  {
6419  	/* stabilize PF_WQ_WORKER and worker pool association */
6420  	mutex_lock(&wq_pool_attach_mutex);
6421  
6422  	if (task->flags & PF_WQ_WORKER) {
6423  		struct worker *worker = kthread_data(task);
6424  		struct worker_pool *pool = worker->pool;
6425  		int off;
6426  
6427  		off = format_worker_id(buf, size, worker, pool);
6428  
6429  		if (pool) {
6430  			raw_spin_lock_irq(&pool->lock);
6431  			/*
6432  			 * ->desc tracks information (wq name or
6433  			 * set_worker_desc()) for the latest execution.  If
6434  			 * current, prepend '+', otherwise '-'.
6435  			 */
6436  			if (worker->desc[0] != '\0') {
6437  				if (worker->current_work)
6438  					scnprintf(buf + off, size - off, "+%s",
6439  						  worker->desc);
6440  				else
6441  					scnprintf(buf + off, size - off, "-%s",
6442  						  worker->desc);
6443  			}
6444  			raw_spin_unlock_irq(&pool->lock);
6445  		}
6446  	} else {
6447  		strscpy(buf, task->comm, size);
6448  	}
6449  
6450  	mutex_unlock(&wq_pool_attach_mutex);
6451  }
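/*
 * With the above, "ps" output for a busy worker typically reads something
 * like "kworker/u16:3+events_unbound" while a work item from the
 * "events_unbound" workqueue is running, and "kworker/u16:3-events_unbound"
 * afterwards (illustrative values; the prefix comes from format_worker_id()).
 */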
6452  
6453  #ifdef CONFIG_SMP
6454  
6455  /*
6456   * CPU hotplug.
6457   *
6458   * There are two challenges in supporting CPU hotplug.  Firstly, there
6459   * are a lot of assumptions on strong associations among work, pwq and
6460   * pool which make migrating pending and scheduled works very
6461   * difficult to implement without impacting hot paths.  Secondly,
6462   * worker pools serve a mix of short, long and very long running works,
6463   * making blocked draining impractical.
6464   *
6465   * This is solved by allowing the pools to be disassociated from the CPU,
6466   * running as unbound ones, and allowing them to be reattached later if the
6467   * CPU comes back online.
6468   */
6469  
6470  static void unbind_workers(int cpu)
6471  {
6472  	struct worker_pool *pool;
6473  	struct worker *worker;
6474  
6475  	for_each_cpu_worker_pool(pool, cpu) {
6476  		mutex_lock(&wq_pool_attach_mutex);
6477  		raw_spin_lock_irq(&pool->lock);
6478  
6479  		/*
6480  		 * We've blocked all attach/detach operations. Make all workers
6481  		 * unbound and set DISASSOCIATED.  Before this, all workers
6482  		 * must be on the cpu.  After this, they may become diasporas.
6483  		 * And the preemption-disabled sections in their sched callbacks
6484  		 * are guaranteed to see WORKER_UNBOUND since the code here
6485  		 * is on the same cpu.
6486  		 */
6487  		for_each_pool_worker(worker, pool)
6488  			worker->flags |= WORKER_UNBOUND;
6489  
6490  		pool->flags |= POOL_DISASSOCIATED;
6491  
6492  		/*
6493  		 * The handling of nr_running in sched callbacks is disabled
6494  		 * now.  Zap nr_running.  After this, nr_running stays zero and
6495  		 * need_more_worker() and keep_working() are always true as
6496  		 * long as the worklist is not empty.  This pool now behaves as
6497  		 * an unbound (in terms of concurrency management) pool which
6498  		 * is served by workers tied to the pool.
6499  		 */
6500  		pool->nr_running = 0;
6501  
6502  		/*
6503  		 * With concurrency management just turned off, a busy
6504  		 * worker blocking could lead to lengthy stalls.  Kick off
6505  		 * unbound chain execution of currently pending work items.
6506  		 */
6507  		kick_pool(pool);
6508  
6509  		raw_spin_unlock_irq(&pool->lock);
6510  
6511  		for_each_pool_worker(worker, pool)
6512  			unbind_worker(worker);
6513  
6514  		mutex_unlock(&wq_pool_attach_mutex);
6515  	}
6516  }
6517  
6518  /**
6519   * rebind_workers - rebind all workers of a pool to the associated CPU
6520   * @pool: pool of interest
6521   *
6522   * @pool->cpu is coming online.  Rebind all workers to the CPU.
6523   */
6524  static void rebind_workers(struct worker_pool *pool)
6525  {
6526  	struct worker *worker;
6527  
6528  	lockdep_assert_held(&wq_pool_attach_mutex);
6529  
6530  	/*
6531  	 * Restore CPU affinity of all workers.  As all idle workers should
6532  	 * be on the run-queue of the associated CPU before any local
6533  	 * wake-ups for concurrency management happen, restore CPU affinity
6534  	 * of all workers first and then clear UNBOUND.  As we're called
6535  	 * from CPU_ONLINE, the following shouldn't fail.
6536  	 */
6537  	for_each_pool_worker(worker, pool) {
6538  		kthread_set_per_cpu(worker->task, pool->cpu);
6539  		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
6540  						  pool_allowed_cpus(pool)) < 0);
6541  	}
6542  
6543  	raw_spin_lock_irq(&pool->lock);
6544  
6545  	pool->flags &= ~POOL_DISASSOCIATED;
6546  
6547  	for_each_pool_worker(worker, pool) {
6548  		unsigned int worker_flags = worker->flags;
6549  
6550  		/*
6551  		 * We want to clear UNBOUND but can't directly call
6552  		 * worker_clr_flags() or adjust nr_running.  Atomically
6553  		 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
6554  		 * @worker will clear REBOUND using worker_clr_flags() when
6555  		 * it initiates the next execution cycle thus restoring
6556  		 * concurrency management.  Note that when or whether
6557  		 * @worker clears REBOUND doesn't affect correctness.
6558  		 *
6559  		 * WRITE_ONCE() is necessary because @worker->flags may be
6560  		 * tested without holding any lock in
6561  		 * wq_worker_running().  Without it, NOT_RUNNING test may
6562  		 * fail incorrectly leading to premature concurrency
6563  		 * management operations.
6564  		 */
6565  		WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
6566  		worker_flags |= WORKER_REBOUND;
6567  		worker_flags &= ~WORKER_UNBOUND;
6568  		WRITE_ONCE(worker->flags, worker_flags);
6569  	}
6570  
6571  	raw_spin_unlock_irq(&pool->lock);
6572  }
6573  
6574  /**
6575   * restore_unbound_workers_cpumask - restore cpumask of unbound workers
6576   * @pool: unbound pool of interest
6577   * @cpu: the CPU which is coming up
6578   *
6579   * An unbound pool may end up with a cpumask which doesn't have any online
6580   * CPUs.  When a worker of such a pool gets scheduled, the scheduler resets
6581   * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
6582   * online CPU before, cpus_allowed of all its workers should be restored.
6583   */
6584  static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
6585  {
6586  	static cpumask_t cpumask;
6587  	struct worker *worker;
6588  
6589  	lockdep_assert_held(&wq_pool_attach_mutex);
6590  
6591  	/* is @cpu allowed for @pool? */
6592  	if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
6593  		return;
6594  
6595  	cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
6596  
6597  	/* as we're called from CPU_ONLINE, the following shouldn't fail */
6598  	for_each_pool_worker(worker, pool)
6599  		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
6600  }
6601  
6602  int workqueue_prepare_cpu(unsigned int cpu)
6603  {
6604  	struct worker_pool *pool;
6605  
6606  	for_each_cpu_worker_pool(pool, cpu) {
6607  		if (pool->nr_workers)
6608  			continue;
6609  		if (!create_worker(pool))
6610  			return -ENOMEM;
6611  	}
6612  	return 0;
6613  }
6614  
6615  int workqueue_online_cpu(unsigned int cpu)
6616  {
6617  	struct worker_pool *pool;
6618  	struct workqueue_struct *wq;
6619  	int pi;
6620  
6621  	mutex_lock(&wq_pool_mutex);
6622  
6623  	cpumask_set_cpu(cpu, wq_online_cpumask);
6624  
6625  	for_each_pool(pool, pi) {
6626  		/* BH pools aren't affected by hotplug */
6627  		if (pool->flags & POOL_BH)
6628  			continue;
6629  
6630  		mutex_lock(&wq_pool_attach_mutex);
6631  		if (pool->cpu == cpu)
6632  			rebind_workers(pool);
6633  		else if (pool->cpu < 0)
6634  			restore_unbound_workers_cpumask(pool, cpu);
6635  		mutex_unlock(&wq_pool_attach_mutex);
6636  	}
6637  
6638  	/* update pod affinity of unbound workqueues */
6639  	list_for_each_entry(wq, &workqueues, list) {
6640  		struct workqueue_attrs *attrs = wq->unbound_attrs;
6641  
6642  		if (attrs) {
6643  			const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
6644  			int tcpu;
6645  
6646  			for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
6647  				unbound_wq_update_pwq(wq, tcpu);
6648  
6649  			mutex_lock(&wq->mutex);
6650  			wq_update_node_max_active(wq, -1);
6651  			mutex_unlock(&wq->mutex);
6652  		}
6653  	}
6654  
6655  	mutex_unlock(&wq_pool_mutex);
6656  	return 0;
6657  }
6658  
6659  int workqueue_offline_cpu(unsigned int cpu)
6660  {
6661  	struct workqueue_struct *wq;
6662  
6663  	/* unbinding per-cpu workers should happen on the local CPU */
6664  	if (WARN_ON(cpu != smp_processor_id()))
6665  		return -1;
6666  
6667  	unbind_workers(cpu);
6668  
6669  	/* update pod affinity of unbound workqueues */
6670  	mutex_lock(&wq_pool_mutex);
6671  
6672  	cpumask_clear_cpu(cpu, wq_online_cpumask);
6673  
6674  	list_for_each_entry(wq, &workqueues, list) {
6675  		struct workqueue_attrs *attrs = wq->unbound_attrs;
6676  
6677  		if (attrs) {
6678  			const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
6679  			int tcpu;
6680  
6681  			for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
6682  				unbound_wq_update_pwq(wq, tcpu);
6683  
6684  			mutex_lock(&wq->mutex);
6685  			wq_update_node_max_active(wq, cpu);
6686  			mutex_unlock(&wq->mutex);
6687  		}
6688  	}
6689  	mutex_unlock(&wq_pool_mutex);
6690  
6691  	return 0;
6692  }
6693  
6694  struct work_for_cpu {
6695  	struct work_struct work;
6696  	long (*fn)(void *);
6697  	void *arg;
6698  	long ret;
6699  };
6700  
6701  static void work_for_cpu_fn(struct work_struct *work)
6702  {
6703  	struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
6704  
6705  	wfc->ret = wfc->fn(wfc->arg);
6706  }
6707  
6708  /**
6709   * work_on_cpu_key - run a function in thread context on a particular cpu
6710   * @cpu: the cpu to run on
6711   * @fn: the function to run
6712   * @arg: the function arg
6713   * @key: The lock class key for lock debugging purposes
6714   *
6715   * It is up to the caller to ensure that the cpu doesn't go offline.
6716   * The caller must not hold any locks which would prevent @fn from completing.
6717   *
6718   * Return: The value @fn returns.
6719   */
6720  long work_on_cpu_key(int cpu, long (*fn)(void *),
6721  		     void *arg, struct lock_class_key *key)
6722  {
6723  	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
6724  
6725  	INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
6726  	schedule_work_on(cpu, &wfc.work);
6727  	flush_work(&wfc.work);
6728  	destroy_work_on_stack(&wfc.work);
6729  	return wfc.ret;
6730  }
6731  EXPORT_SYMBOL_GPL(work_on_cpu_key);
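/*
 * A minimal usage sketch.  Callers normally go through the work_on_cpu()
 * wrapper in workqueue.h, which supplies the lock_class_key; the function,
 * types and argument below are placeholders, not part of this file:
 *
 *	static long read_counter_on_cpu(void *arg)
 *	{
 *		struct my_counter *c = arg;
 *
 *		return read_local_counter(c);
 *	}
 *
 *	ret = work_on_cpu(cpu, read_counter_on_cpu, &counter);
 *
 * If the caller cannot guarantee that @cpu stays online, use
 * work_on_cpu_safe_key() (or its work_on_cpu_safe() wrapper) below.
 */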
6732  
6733  /**
6734   * work_on_cpu_safe_key - run a function in thread context on a particular cpu
6735   * @cpu: the cpu to run on
6736   * @fn:  the function to run
6737   * @arg: the function argument
6738   * @key: The lock class key for lock debugging purposes
6739   *
6740   * Disables CPU hotplug and calls work_on_cpu_key(). The caller must not hold
6741   * any locks which would prevent @fn from completing.
6742   *
6743   * Return: The value @fn returns.
6744   */
6745  long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
6746  			  void *arg, struct lock_class_key *key)
6747  {
6748  	long ret = -ENODEV;
6749  
6750  	cpus_read_lock();
6751  	if (cpu_online(cpu))
6752  		ret = work_on_cpu_key(cpu, fn, arg, key);
6753  	cpus_read_unlock();
6754  	return ret;
6755  }
6756  EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
6757  #endif /* CONFIG_SMP */
6758  
6759  #ifdef CONFIG_FREEZER
6760  
6761  /**
6762   * freeze_workqueues_begin - begin freezing workqueues
6763   *
6764   * Start freezing workqueues.  After this function returns, all freezable
6765   * workqueues will queue new works to their inactive_works list instead of
6766   * pool->worklist.
6767   *
6768   * CONTEXT:
6769   * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
6770   */
6771  void freeze_workqueues_begin(void)
6772  {
6773  	struct workqueue_struct *wq;
6774  
6775  	mutex_lock(&wq_pool_mutex);
6776  
6777  	WARN_ON_ONCE(workqueue_freezing);
6778  	workqueue_freezing = true;
6779  
6780  	list_for_each_entry(wq, &workqueues, list) {
6781  		mutex_lock(&wq->mutex);
6782  		wq_adjust_max_active(wq);
6783  		mutex_unlock(&wq->mutex);
6784  	}
6785  
6786  	mutex_unlock(&wq_pool_mutex);
6787  }
6788  
6789  /**
6790   * freeze_workqueues_busy - are freezable workqueues still busy?
6791   *
6792   * Check whether freezing is complete.  This function must be called
6793   * between freeze_workqueues_begin() and thaw_workqueues().
6794   *
6795   * CONTEXT:
6796   * Grabs and releases wq_pool_mutex.
6797   *
6798   * Return:
6799   * %true if some freezable workqueues are still busy.  %false if freezing
6800   * is complete.
6801   */
6802  bool freeze_workqueues_busy(void)
6803  {
6804  	bool busy = false;
6805  	struct workqueue_struct *wq;
6806  	struct pool_workqueue *pwq;
6807  
6808  	mutex_lock(&wq_pool_mutex);
6809  
6810  	WARN_ON_ONCE(!workqueue_freezing);
6811  
6812  	list_for_each_entry(wq, &workqueues, list) {
6813  		if (!(wq->flags & WQ_FREEZABLE))
6814  			continue;
6815  		/*
6816  		 * nr_active is monotonically decreasing.  It's safe
6817  		 * to peek without lock.
6818  		 */
6819  		rcu_read_lock();
6820  		for_each_pwq(pwq, wq) {
6821  			WARN_ON_ONCE(pwq->nr_active < 0);
6822  			if (pwq->nr_active) {
6823  				busy = true;
6824  				rcu_read_unlock();
6825  				goto out_unlock;
6826  			}
6827  		}
6828  		rcu_read_unlock();
6829  	}
6830  out_unlock:
6831  	mutex_unlock(&wq_pool_mutex);
6832  	return busy;
6833  }
6834  
6835  /**
6836   * thaw_workqueues - thaw workqueues
6837   *
6838   * Thaw workqueues.  Normal queueing is restored and all collected
6839   * frozen works are transferred to their respective pool worklists.
6840   *
6841   * CONTEXT:
6842   * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
6843   */
6844  void thaw_workqueues(void)
6845  {
6846  	struct workqueue_struct *wq;
6847  
6848  	mutex_lock(&wq_pool_mutex);
6849  
6850  	if (!workqueue_freezing)
6851  		goto out_unlock;
6852  
6853  	workqueue_freezing = false;
6854  
6855  	/* restore max_active and repopulate worklist */
6856  	list_for_each_entry(wq, &workqueues, list) {
6857  		mutex_lock(&wq->mutex);
6858  		wq_adjust_max_active(wq);
6859  		mutex_unlock(&wq->mutex);
6860  	}
6861  
6862  out_unlock:
6863  	mutex_unlock(&wq_pool_mutex);
6864  }
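/*
 * A minimal sketch of how the freezer core is expected to drive the three
 * helpers above (the real callers live in kernel/power/process.c; the
 * polling loop below is illustrative only):
 *
 *	freeze_workqueues_begin();
 *	while (freeze_workqueues_busy())
 *		msleep(10);
 *	...do the frozen-state work...
 *	thaw_workqueues();
 */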
6865  #endif /* CONFIG_FREEZER */
6866  
6867  static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
6868  {
6869  	LIST_HEAD(ctxs);
6870  	int ret = 0;
6871  	struct workqueue_struct *wq;
6872  	struct apply_wqattrs_ctx *ctx, *n;
6873  
6874  	lockdep_assert_held(&wq_pool_mutex);
6875  
6876  	list_for_each_entry(wq, &workqueues, list) {
6877  		if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING))
6878  			continue;
6879  
6880  		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
6881  		if (IS_ERR(ctx)) {
6882  			ret = PTR_ERR(ctx);
6883  			break;
6884  		}
6885  
6886  		list_add_tail(&ctx->list, &ctxs);
6887  	}
6888  
6889  	list_for_each_entry_safe(ctx, n, &ctxs, list) {
6890  		if (!ret)
6891  			apply_wqattrs_commit(ctx);
6892  		apply_wqattrs_cleanup(ctx);
6893  	}
6894  
6895  	if (!ret) {
6896  		mutex_lock(&wq_pool_attach_mutex);
6897  		cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
6898  		mutex_unlock(&wq_pool_attach_mutex);
6899  	}
6900  	return ret;
6901  }
6902  
6903  /**
6904   * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
6905   * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
6906   *
6907   * This function can be called from cpuset code to provide a set of isolated
6908   * CPUs that should be excluded from wq_unbound_cpumask.
6909   */
6910  int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
6911  {
6912  	cpumask_var_t cpumask;
6913  	int ret = 0;
6914  
6915  	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
6916  		return -ENOMEM;
6917  
6918  	mutex_lock(&wq_pool_mutex);
6919  
6920  	/*
6921  	 * If the operation fails, it will fall back to
6922  	 * wq_requested_unbound_cpumask which is initially set to
6923  	 * the (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) housekeeping mask and rewritten
6924  	 * by any subsequent write to the workqueue/cpumask sysfs file.
6925  	 */
6926  	if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
6927  		cpumask_copy(cpumask, wq_requested_unbound_cpumask);
6928  	if (!cpumask_equal(cpumask, wq_unbound_cpumask))
6929  		ret = workqueue_apply_unbound_cpumask(cpumask);
6930  
6931  	/* Save the current isolated cpumask & export it via sysfs */
6932  	if (!ret)
6933  		cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
6934  
6935  	mutex_unlock(&wq_pool_mutex);
6936  	free_cpumask_var(cpumask);
6937  	return ret;
6938  }
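/*
 * Rough calling sketch for the cpuset side, which owns the set of isolated
 * CPUs (the surrounding code is illustrative, not lifted from
 * kernel/cgroup/cpuset.c):
 *
 *	cpumask_var_t isolated;
 *
 *	...populate @isolated with the CPUs of isolated partitions...
 *	ret = workqueue_unbound_exclude_cpumask(isolated);
 *
 * Passing an empty mask excludes nothing and effectively restores
 * wq_requested_unbound_cpumask as the unbound cpumask.
 */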
6939  
6940  static int parse_affn_scope(const char *val)
6941  {
6942  	int i;
6943  
6944  	for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) {
6945  		if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i])))
6946  			return i;
6947  	}
6948  	return -EINVAL;
6949  }
6950  
6951  static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
6952  {
6953  	struct workqueue_struct *wq;
6954  	int affn, cpu;
6955  
6956  	affn = parse_affn_scope(val);
6957  	if (affn < 0)
6958  		return affn;
6959  	if (affn == WQ_AFFN_DFL)
6960  		return -EINVAL;
6961  
6962  	cpus_read_lock();
6963  	mutex_lock(&wq_pool_mutex);
6964  
6965  	wq_affn_dfl = affn;
6966  
6967  	list_for_each_entry(wq, &workqueues, list) {
6968  		for_each_online_cpu(cpu)
6969  			unbound_wq_update_pwq(wq, cpu);
6970  	}
6971  
6972  	mutex_unlock(&wq_pool_mutex);
6973  	cpus_read_unlock();
6974  
6975  	return 0;
6976  }
6977  
6978  static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
6979  {
6980  	return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]);
6981  }
6982  
6983  static const struct kernel_param_ops wq_affn_dfl_ops = {
6984  	.set	= wq_affn_dfl_set,
6985  	.get	= wq_affn_dfl_get,
6986  };
6987  
6988  module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);
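/*
 * With the 0644 mode above, the default scope can be chosen on the kernel
 * command line, e.g. "workqueue.default_affinity_scope=cache", or changed at
 * runtime through /sys/module/workqueue/parameters/default_affinity_scope.
 * Accepted values are the wq_affn_names[] entries other than "default"
 * (e.g. "cpu", "smt", "cache", "numa", "system").
 */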
6989  
6990  #ifdef CONFIG_SYSFS
6991  /*
6992   * Workqueues with the WQ_SYSFS flag set are visible to userland via
6993   * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
6994   * following attributes.
6995   *
6996   *  per_cpu		RO bool	: whether the workqueue is per-cpu or unbound
6997   *  max_active		RW int	: maximum number of in-flight work items
6998   *
6999   * Unbound workqueues have the following extra attributes.
7000   *
7001   *  nice		RW int	: nice value of the workers
7002   *  cpumask		RW mask	: bitmask of allowed CPUs for the workers
7003   *  affinity_scope	RW str  : worker CPU affinity scope (cache, numa, none)
7004   *  affinity_strict	RW bool : worker CPU affinity is strict
7005   */
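/*
 * For a WQ_SYSFS workqueue named "foo" (placeholder name), these show up
 * as e.g. /sys/bus/workqueue/devices/foo/max_active and, for unbound
 * workqueues, /sys/bus/workqueue/devices/foo/cpumask.
 */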
7006  struct wq_device {
7007  	struct workqueue_struct		*wq;
7008  	struct device			dev;
7009  };
7010  
7011  static struct workqueue_struct *dev_to_wq(struct device *dev)
7012  {
7013  	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
7014  
7015  	return wq_dev->wq;
7016  }
7017  
7018  static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
7019  			    char *buf)
7020  {
7021  	struct workqueue_struct *wq = dev_to_wq(dev);
7022  
7023  	return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
7024  }
7025  static DEVICE_ATTR_RO(per_cpu);
7026  
7027  static ssize_t max_active_show(struct device *dev,
7028  			       struct device_attribute *attr, char *buf)
7029  {
7030  	struct workqueue_struct *wq = dev_to_wq(dev);
7031  
7032  	return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
7033  }
7034  
7035  static ssize_t max_active_store(struct device *dev,
7036  				struct device_attribute *attr, const char *buf,
7037  				size_t count)
7038  {
7039  	struct workqueue_struct *wq = dev_to_wq(dev);
7040  	int val;
7041  
7042  	if (sscanf(buf, "%d", &val) != 1 || val <= 0)
7043  		return -EINVAL;
7044  
7045  	workqueue_set_max_active(wq, val);
7046  	return count;
7047  }
7048  static DEVICE_ATTR_RW(max_active);
7049  
7050  static struct attribute *wq_sysfs_attrs[] = {
7051  	&dev_attr_per_cpu.attr,
7052  	&dev_attr_max_active.attr,
7053  	NULL,
7054  };
7055  ATTRIBUTE_GROUPS(wq_sysfs);
7056  
7057  static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
7058  			    char *buf)
7059  {
7060  	struct workqueue_struct *wq = dev_to_wq(dev);
7061  	int written;
7062  
7063  	mutex_lock(&wq->mutex);
7064  	written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
7065  	mutex_unlock(&wq->mutex);
7066  
7067  	return written;
7068  }
7069  
7070  /* prepare workqueue_attrs for sysfs store operations */
7071  static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
7072  {
7073  	struct workqueue_attrs *attrs;
7074  
7075  	lockdep_assert_held(&wq_pool_mutex);
7076  
7077  	attrs = alloc_workqueue_attrs();
7078  	if (!attrs)
7079  		return NULL;
7080  
7081  	copy_workqueue_attrs(attrs, wq->unbound_attrs);
7082  	return attrs;
7083  }
7084  
7085  static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
7086  			     const char *buf, size_t count)
7087  {
7088  	struct workqueue_struct *wq = dev_to_wq(dev);
7089  	struct workqueue_attrs *attrs;
7090  	int ret = -ENOMEM;
7091  
7092  	apply_wqattrs_lock();
7093  
7094  	attrs = wq_sysfs_prep_attrs(wq);
7095  	if (!attrs)
7096  		goto out_unlock;
7097  
7098  	if (sscanf(buf, "%d", &attrs->nice) == 1 &&
7099  	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
7100  		ret = apply_workqueue_attrs_locked(wq, attrs);
7101  	else
7102  		ret = -EINVAL;
7103  
7104  out_unlock:
7105  	apply_wqattrs_unlock();
7106  	free_workqueue_attrs(attrs);
7107  	return ret ?: count;
7108  }
7109  
7110  static ssize_t wq_cpumask_show(struct device *dev,
7111  			       struct device_attribute *attr, char *buf)
7112  {
7113  	struct workqueue_struct *wq = dev_to_wq(dev);
7114  	int written;
7115  
7116  	mutex_lock(&wq->mutex);
7117  	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
7118  			    cpumask_pr_args(wq->unbound_attrs->cpumask));
7119  	mutex_unlock(&wq->mutex);
7120  	return written;
7121  }
7122  
7123  static ssize_t wq_cpumask_store(struct device *dev,
7124  				struct device_attribute *attr,
7125  				const char *buf, size_t count)
7126  {
7127  	struct workqueue_struct *wq = dev_to_wq(dev);
7128  	struct workqueue_attrs *attrs;
7129  	int ret = -ENOMEM;
7130  
7131  	apply_wqattrs_lock();
7132  
7133  	attrs = wq_sysfs_prep_attrs(wq);
7134  	if (!attrs)
7135  		goto out_unlock;
7136  
7137  	ret = cpumask_parse(buf, attrs->cpumask);
7138  	if (!ret)
7139  		ret = apply_workqueue_attrs_locked(wq, attrs);
7140  
7141  out_unlock:
7142  	apply_wqattrs_unlock();
7143  	free_workqueue_attrs(attrs);
7144  	return ret ?: count;
7145  }
7146  
7147  static ssize_t wq_affn_scope_show(struct device *dev,
7148  				  struct device_attribute *attr, char *buf)
7149  {
7150  	struct workqueue_struct *wq = dev_to_wq(dev);
7151  	int written;
7152  
7153  	mutex_lock(&wq->mutex);
7154  	if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)
7155  		written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
7156  				    wq_affn_names[WQ_AFFN_DFL],
7157  				    wq_affn_names[wq_affn_dfl]);
7158  	else
7159  		written = scnprintf(buf, PAGE_SIZE, "%s\n",
7160  				    wq_affn_names[wq->unbound_attrs->affn_scope]);
7161  	mutex_unlock(&wq->mutex);
7162  
7163  	return written;
7164  }
7165  
7166  static ssize_t wq_affn_scope_store(struct device *dev,
7167  				   struct device_attribute *attr,
7168  				   const char *buf, size_t count)
7169  {
7170  	struct workqueue_struct *wq = dev_to_wq(dev);
7171  	struct workqueue_attrs *attrs;
7172  	int affn, ret = -ENOMEM;
7173  
7174  	affn = parse_affn_scope(buf);
7175  	if (affn < 0)
7176  		return affn;
7177  
7178  	apply_wqattrs_lock();
7179  	attrs = wq_sysfs_prep_attrs(wq);
7180  	if (attrs) {
7181  		attrs->affn_scope = affn;
7182  		ret = apply_workqueue_attrs_locked(wq, attrs);
7183  	}
7184  	apply_wqattrs_unlock();
7185  	free_workqueue_attrs(attrs);
7186  	return ret ?: count;
7187  }
7188  
7189  static ssize_t wq_affinity_strict_show(struct device *dev,
7190  				       struct device_attribute *attr, char *buf)
7191  {
7192  	struct workqueue_struct *wq = dev_to_wq(dev);
7193  
7194  	return scnprintf(buf, PAGE_SIZE, "%d\n",
7195  			 wq->unbound_attrs->affn_strict);
7196  }
7197  
7198  static ssize_t wq_affinity_strict_store(struct device *dev,
7199  					struct device_attribute *attr,
7200  					const char *buf, size_t count)
7201  {
7202  	struct workqueue_struct *wq = dev_to_wq(dev);
7203  	struct workqueue_attrs *attrs;
7204  	int v, ret = -ENOMEM;
7205  
7206  	if (sscanf(buf, "%d", &v) != 1)
7207  		return -EINVAL;
7208  
7209  	apply_wqattrs_lock();
7210  	attrs = wq_sysfs_prep_attrs(wq);
7211  	if (attrs) {
7212  		attrs->affn_strict = (bool)v;
7213  		ret = apply_workqueue_attrs_locked(wq, attrs);
7214  	}
7215  	apply_wqattrs_unlock();
7216  	free_workqueue_attrs(attrs);
7217  	return ret ?: count;
7218  }
7219  
7220  static struct device_attribute wq_sysfs_unbound_attrs[] = {
7221  	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
7222  	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
7223  	__ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
7224  	__ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),
7225  	__ATTR_NULL,
7226  };
7227  
7228  static const struct bus_type wq_subsys = {
7229  	.name				= "workqueue",
7230  	.dev_groups			= wq_sysfs_groups,
7231  };
7232  
7233  /**
7234   *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
7235   *  @cpumask: the cpumask to set
7236   *
7237   *  The low-level workqueues cpumask is a global cpumask that limits
7238   *  the affinity of all unbound workqueues.  This function checks @cpumask,
7239   *  applies it to all unbound workqueues and updates all of their pwqs.
7240   *
7241   *  Return:	0	- Success
7242   *		-EINVAL	- Invalid @cpumask
7243   *		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
7244   */
7245  static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
7246  {
7247  	int ret = -EINVAL;
7248  
7249  	/*
7250  	 * Not excluding isolated cpus on purpose.
7251  	 * If the user wishes to include them, we allow that.
7252  	 */
7253  	cpumask_and(cpumask, cpumask, cpu_possible_mask);
7254  	if (!cpumask_empty(cpumask)) {
7255  		ret = 0;
7256  		apply_wqattrs_lock();
7257  		if (!cpumask_equal(cpumask, wq_unbound_cpumask))
7258  			ret = workqueue_apply_unbound_cpumask(cpumask);
7259  		if (!ret)
7260  			cpumask_copy(wq_requested_unbound_cpumask, cpumask);
7261  		apply_wqattrs_unlock();
7262  	}
7263  
7264  	return ret;
7265  }
7266  
7267  static ssize_t __wq_cpumask_show(struct device *dev,
7268  		struct device_attribute *attr, char *buf, cpumask_var_t mask)
7269  {
7270  	int written;
7271  
7272  	mutex_lock(&wq_pool_mutex);
7273  	written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
7274  	mutex_unlock(&wq_pool_mutex);
7275  
7276  	return written;
7277  }
7278  
7279  static ssize_t cpumask_requested_show(struct device *dev,
7280  		struct device_attribute *attr, char *buf)
7281  {
7282  	return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
7283  }
7284  static DEVICE_ATTR_RO(cpumask_requested);
7285  
7286  static ssize_t cpumask_isolated_show(struct device *dev,
7287  		struct device_attribute *attr, char *buf)
7288  {
7289  	return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
7290  }
7291  static DEVICE_ATTR_RO(cpumask_isolated);
7292  
7293  static ssize_t cpumask_show(struct device *dev,
7294  		struct device_attribute *attr, char *buf)
7295  {
7296  	return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
7297  }
7298  
7299  static ssize_t cpumask_store(struct device *dev,
7300  		struct device_attribute *attr, const char *buf, size_t count)
7301  {
7302  	cpumask_var_t cpumask;
7303  	int ret;
7304  
7305  	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
7306  		return -ENOMEM;
7307  
7308  	ret = cpumask_parse(buf, cpumask);
7309  	if (!ret)
7310  		ret = workqueue_set_unbound_cpumask(cpumask);
7311  
7312  	free_cpumask_var(cpumask);
7313  	return ret ? ret : count;
7314  }
7315  static DEVICE_ATTR_RW(cpumask);
7316  
7317  static struct attribute *wq_sysfs_cpumask_attrs[] = {
7318  	&dev_attr_cpumask.attr,
7319  	&dev_attr_cpumask_requested.attr,
7320  	&dev_attr_cpumask_isolated.attr,
7321  	NULL,
7322  };
7323  ATTRIBUTE_GROUPS(wq_sysfs_cpumask);
7324  
7325  static int __init wq_sysfs_init(void)
7326  {
7327  	return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups);
7328  }
7329  core_initcall(wq_sysfs_init);
7330  
7331  static void wq_device_release(struct device *dev)
7332  {
7333  	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
7334  
7335  	kfree(wq_dev);
7336  }
7337  
7338  /**
7339   * workqueue_sysfs_register - make a workqueue visible in sysfs
7340   * @wq: the workqueue to register
7341   *
7342   * Expose @wq in sysfs under /sys/bus/workqueue/devices.
7343   * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
7344   * which is the preferred method.
7345   *
7346   * Workqueue user should use this function directly iff it wants to apply
7347   * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
7348   * apply_workqueue_attrs() may race against userland updating the
7349   * attributes.
7350   *
7351   * Return: 0 on success, -errno on failure.
7352   */
7353  int workqueue_sysfs_register(struct workqueue_struct *wq)
7354  {
7355  	struct wq_device *wq_dev;
7356  	int ret;
7357  
7358  	/*
7359  	 * Adjusting max_active breaks ordering guarantee.  Disallow exposing
7360  	 * ordered workqueues.
7361  	 */
7362  	if (WARN_ON(wq->flags & __WQ_ORDERED))
7363  		return -EINVAL;
7364  
7365  	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
7366  	if (!wq_dev)
7367  		return -ENOMEM;
7368  
7369  	wq_dev->wq = wq;
7370  	wq_dev->dev.bus = &wq_subsys;
7371  	wq_dev->dev.release = wq_device_release;
7372  	dev_set_name(&wq_dev->dev, "%s", wq->name);
7373  
7374  	/*
7375  	 * unbound_attrs are created separately.  Suppress uevent until
7376  	 * everything is ready.
7377  	 */
7378  	dev_set_uevent_suppress(&wq_dev->dev, true);
7379  
7380  	ret = device_register(&wq_dev->dev);
7381  	if (ret) {
7382  		put_device(&wq_dev->dev);
7383  		wq->wq_dev = NULL;
7384  		return ret;
7385  	}
7386  
7387  	if (wq->flags & WQ_UNBOUND) {
7388  		struct device_attribute *attr;
7389  
7390  		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
7391  			ret = device_create_file(&wq_dev->dev, attr);
7392  			if (ret) {
7393  				device_unregister(&wq_dev->dev);
7394  				wq->wq_dev = NULL;
7395  				return ret;
7396  			}
7397  		}
7398  	}
7399  
7400  	dev_set_uevent_suppress(&wq_dev->dev, false);
7401  	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
7402  	return 0;
7403  }
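/*
 * Most users never call this directly; passing WQ_SYSFS to alloc_workqueue()
 * registers the workqueue automatically, e.g.:
 *
 *	wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_SYSFS, 0);
 *
 * ("my_wq" is a placeholder name.)  Calling it by hand only makes sense when
 * attributes must be applied before the workqueue becomes visible, as noted
 * in the comment above.
 */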
7404  
7405  /**
7406   * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
7407   * @wq: the workqueue to unregister
7408   *
7409   * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
7410   */
7411  static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
7412  {
7413  	struct wq_device *wq_dev = wq->wq_dev;
7414  
7415  	if (!wq->wq_dev)
7416  		return;
7417  
7418  	wq->wq_dev = NULL;
7419  	device_unregister(&wq_dev->dev);
7420  }
7421  #else	/* CONFIG_SYSFS */
7422  static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
7423  #endif	/* CONFIG_SYSFS */
7424  
7425  /*
7426   * Workqueue watchdog.
7427   *
7428   * Stalls may be caused by various bugs - a missing WQ_MEM_RECLAIM, an illegal
7429   * flush dependency, or a concurrency-managed work item which stays RUNNING
7430   * indefinitely.  Workqueue stalls can be very difficult to debug as the
7431   * usual warning mechanisms don't trigger and internal workqueue state is
7432   * largely opaque.
7433   *
7434   * Workqueue watchdog monitors all worker pools periodically and dumps
7435   * state if some pools failed to make forward progress for a while where
7436   * forward progress is defined as the first item on ->worklist changing.
7437   *
7438   * This mechanism is controlled through the kernel parameter
7439   * "workqueue.watchdog_thresh" which can be updated at runtime through the
7440   * corresponding sysfs parameter file.
7441   */
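/*
 * For example, the threshold can be set at boot with
 * "workqueue.watchdog_thresh=60" or adjusted at runtime through
 * /sys/module/workqueue/parameters/watchdog_thresh; writing 0 disables the
 * watchdog.  The related wq_panic_on_stall knob below additionally crashes
 * the kernel (BUG) once the configured number of stalls has been detected.
 */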
7442  #ifdef CONFIG_WQ_WATCHDOG
7443  
7444  static unsigned long wq_watchdog_thresh = 30;
7445  static struct timer_list wq_watchdog_timer;
7446  
7447  static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
7448  static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
7449  
7450  static unsigned int wq_panic_on_stall;
7451  module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
7452  
7453  /*
7454   * Show workers that might prevent the processing of pending work items.
7455   * The only candidates are CPU-bound workers in the running state.
7456   * Pending work items should be handled by another idle worker
7457   * in all other situations.
7458   */
7459  static void show_cpu_pool_hog(struct worker_pool *pool)
7460  {
7461  	struct worker *worker;
7462  	unsigned long irq_flags;
7463  	int bkt;
7464  
7465  	raw_spin_lock_irqsave(&pool->lock, irq_flags);
7466  
7467  	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
7468  		if (task_is_running(worker->task)) {
7469  			/*
7470  			 * Defer printing to avoid deadlocks in console
7471  			 * drivers that queue work while holding locks
7472  			 * also taken in their write paths.
7473  			 */
7474  			printk_deferred_enter();
7475  
7476  			pr_info("pool %d:\n", pool->id);
7477  			sched_show_task(worker->task);
7478  
7479  			printk_deferred_exit();
7480  		}
7481  	}
7482  
7483  	raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
7484  }
7485  
7486  static void show_cpu_pools_hogs(void)
7487  {
7488  	struct worker_pool *pool;
7489  	int pi;
7490  
7491  	pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
7492  
7493  	rcu_read_lock();
7494  
7495  	for_each_pool(pool, pi) {
7496  		if (pool->cpu_stall)
7497  			show_cpu_pool_hog(pool);
7498  
7499  	}
7500  
7501  	rcu_read_unlock();
7502  }
7503  
7504  static void panic_on_wq_watchdog(void)
7505  {
7506  	static unsigned int wq_stall;
7507  
7508  	if (wq_panic_on_stall) {
7509  		wq_stall++;
7510  		BUG_ON(wq_stall >= wq_panic_on_stall);
7511  	}
7512  }
7513  
7514  static void wq_watchdog_reset_touched(void)
7515  {
7516  	int cpu;
7517  
7518  	wq_watchdog_touched = jiffies;
7519  	for_each_possible_cpu(cpu)
7520  		per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
7521  }
7522  
7523  static void wq_watchdog_timer_fn(struct timer_list *unused)
7524  {
7525  	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
7526  	bool lockup_detected = false;
7527  	bool cpu_pool_stall = false;
7528  	unsigned long now = jiffies;
7529  	struct worker_pool *pool;
7530  	int pi;
7531  
7532  	if (!thresh)
7533  		return;
7534  
7535  	rcu_read_lock();
7536  
7537  	for_each_pool(pool, pi) {
7538  		unsigned long pool_ts, touched, ts;
7539  
7540  		pool->cpu_stall = false;
7541  		if (list_empty(&pool->worklist))
7542  			continue;
7543  
7544  		/*
7545  		 * If a virtual machine is stopped by the host it can look to
7546  		 * the watchdog like a stall.
7547  		 */
7548  		kvm_check_and_clear_guest_paused();
7549  
7550  		/* get the latest of pool and touched timestamps */
7551  		if (pool->cpu >= 0)
7552  			touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
7553  		else
7554  			touched = READ_ONCE(wq_watchdog_touched);
7555  		pool_ts = READ_ONCE(pool->watchdog_ts);
7556  
7557  		if (time_after(pool_ts, touched))
7558  			ts = pool_ts;
7559  		else
7560  			ts = touched;
7561  
7562  		/* did we stall? */
7563  		if (time_after(now, ts + thresh)) {
7564  			lockup_detected = true;
7565  			if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
7566  				pool->cpu_stall = true;
7567  				cpu_pool_stall = true;
7568  			}
7569  			pr_emerg("BUG: workqueue lockup - pool");
7570  			pr_cont_pool_info(pool);
7571  			pr_cont(" stuck for %us!\n",
7572  				jiffies_to_msecs(now - pool_ts) / 1000);
7573  		}
7574  
7575  
7576  	}
7577  
7578  	rcu_read_unlock();
7579  
7580  	if (lockup_detected)
7581  		show_all_workqueues();
7582  
7583  	if (cpu_pool_stall)
7584  		show_cpu_pools_hogs();
7585  
7586  	if (lockup_detected)
7587  		panic_on_wq_watchdog();
7588  
7589  	wq_watchdog_reset_touched();
7590  	mod_timer(&wq_watchdog_timer, jiffies + thresh);
7591  }
7592  
7593  notrace void wq_watchdog_touch(int cpu)
7594  {
7595  	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
7596  	unsigned long touch_ts = READ_ONCE(wq_watchdog_touched);
7597  	unsigned long now = jiffies;
7598  
7599  	if (cpu >= 0)
7600  		per_cpu(wq_watchdog_touched_cpu, cpu) = now;
7601  	else
7602  		WARN_ONCE(1, "%s should be called with valid CPU", __func__);
7603  
7604  	/* Don't unnecessarily store to global cacheline */
7605  	if (time_after(now, touch_ts + thresh / 4))
7606  		WRITE_ONCE(wq_watchdog_touched, jiffies);
7607  }
7608  
7609  static void wq_watchdog_set_thresh(unsigned long thresh)
7610  {
7611  	wq_watchdog_thresh = 0;
7612  	del_timer_sync(&wq_watchdog_timer);
7613  
7614  	if (thresh) {
7615  		wq_watchdog_thresh = thresh;
7616  		wq_watchdog_reset_touched();
7617  		mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
7618  	}
7619  }
7620  
7621  static int wq_watchdog_param_set_thresh(const char *val,
7622  					const struct kernel_param *kp)
7623  {
7624  	unsigned long thresh;
7625  	int ret;
7626  
7627  	ret = kstrtoul(val, 0, &thresh);
7628  	if (ret)
7629  		return ret;
7630  
7631  	if (system_wq)
7632  		wq_watchdog_set_thresh(thresh);
7633  	else
7634  		wq_watchdog_thresh = thresh;
7635  
7636  	return 0;
7637  }
7638  
7639  static const struct kernel_param_ops wq_watchdog_thresh_ops = {
7640  	.set	= wq_watchdog_param_set_thresh,
7641  	.get	= param_get_ulong,
7642  };
7643  
7644  module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
7645  		0644);
7646  
7647  static void wq_watchdog_init(void)
7648  {
7649  	timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
7650  	wq_watchdog_set_thresh(wq_watchdog_thresh);
7651  }
7652  
7653  #else	/* CONFIG_WQ_WATCHDOG */
7654  
7655  static inline void wq_watchdog_init(void) { }
7656  
7657  #endif	/* CONFIG_WQ_WATCHDOG */
7658  
7659  static void bh_pool_kick_normal(struct irq_work *irq_work)
7660  {
7661  	raise_softirq_irqoff(TASKLET_SOFTIRQ);
7662  }
7663  
7664  static void bh_pool_kick_highpri(struct irq_work *irq_work)
7665  {
7666  	raise_softirq_irqoff(HI_SOFTIRQ);
7667  }
7668  
7669  static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
7670  {
7671  	if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
7672  		pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
7673  			cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
7674  		return;
7675  	}
7676  
7677  	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
7678  }
7679  
7680  static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice)
7681  {
7682  	BUG_ON(init_worker_pool(pool));
7683  	pool->cpu = cpu;
7684  	cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
7685  	cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
7686  	pool->attrs->nice = nice;
7687  	pool->attrs->affn_strict = true;
7688  	pool->node = cpu_to_node(cpu);
7689  
7690  	/* alloc pool ID */
7691  	mutex_lock(&wq_pool_mutex);
7692  	BUG_ON(worker_pool_assign_id(pool));
7693  	mutex_unlock(&wq_pool_mutex);
7694  }
7695  
7696  /**
7697   * workqueue_init_early - early init for workqueue subsystem
7698   *
7699   * This is the first step of the three-stage workqueue subsystem initialization and
7700   * is invoked as soon as the bare basics - memory allocation, cpumasks and idr - are
7701   * up. It sets up all the data structures and system workqueues and allows early
7702   * boot code to create workqueues and queue/cancel work items. Actual work item
7703   * execution starts only after kthreads can be created and scheduled right
7704   * before early initcalls.
7705   */
7706  void __init workqueue_init_early(void)
7707  {
7708  	struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
7709  	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
7710  	void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
7711  						       bh_pool_kick_highpri };
7712  	int i, cpu;
7713  
7714  	BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
7715  
7716  	BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL));
7717  	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
7718  	BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
7719  	BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
7720  
7721  	cpumask_copy(wq_online_cpumask, cpu_online_mask);
7722  	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
7723  	restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
7724  	restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
7725  	if (!cpumask_empty(&wq_cmdline_cpumask))
7726  		restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
7727  
7728  	cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
7729  
7730  	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
7731  
7732  	unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();
7733  	BUG_ON(!unbound_wq_update_pwq_attrs_buf);
7734  
7735  	/*
7736  	 * If nohz_full is enabled, treat power-efficient workqueues as unbound.
7737  	 * This allows their work items to be moved to housekeeping CPUs.
7738  	 */
7739  	if (housekeeping_enabled(HK_TYPE_TICK))
7740  		wq_power_efficient = true;
7741  
7742  	/* initialize WQ_AFFN_SYSTEM pods */
7743  	pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
7744  	pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
7745  	pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
7746  	BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);
7747  
7748  	BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
7749  
7750  	pt->nr_pods = 1;
7751  	cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
7752  	pt->pod_node[0] = NUMA_NO_NODE;
7753  	pt->cpu_pod[0] = 0;
7754  
7755  	/* initialize BH and CPU pools */
7756  	for_each_possible_cpu(cpu) {
7757  		struct worker_pool *pool;
7758  
7759  		i = 0;
7760  		for_each_bh_worker_pool(pool, cpu) {
7761  			init_cpu_worker_pool(pool, cpu, std_nice[i]);
7762  			pool->flags |= POOL_BH;
7763  			init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]);
7764  			i++;
7765  		}
7766  
7767  		i = 0;
7768  		for_each_cpu_worker_pool(pool, cpu)
7769  			init_cpu_worker_pool(pool, cpu, std_nice[i++]);
7770  	}
7771  
7772  	/* create default unbound and ordered wq attrs */
7773  	for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
7774  		struct workqueue_attrs *attrs;
7775  
7776  		BUG_ON(!(attrs = alloc_workqueue_attrs()));
7777  		attrs->nice = std_nice[i];
7778  		unbound_std_wq_attrs[i] = attrs;
7779  
7780  		/*
7781  		 * An ordered wq should have only one pwq as ordering is
7782  		 * guaranteed by max_active which is enforced by pwqs.
7783  		 */
7784  		BUG_ON(!(attrs = alloc_workqueue_attrs()));
7785  		attrs->nice = std_nice[i];
7786  		attrs->ordered = true;
7787  		ordered_wq_attrs[i] = attrs;
7788  	}
7789  
7790  	system_wq = alloc_workqueue("events", 0, 0);
7791  	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
7792  	system_long_wq = alloc_workqueue("events_long", 0, 0);
7793  	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
7794  					    WQ_MAX_ACTIVE);
7795  	system_freezable_wq = alloc_workqueue("events_freezable",
7796  					      WQ_FREEZABLE, 0);
7797  	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
7798  					      WQ_POWER_EFFICIENT, 0);
7799  	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
7800  					      WQ_FREEZABLE | WQ_POWER_EFFICIENT,
7801  					      0);
7802  	system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
7803  	system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
7804  					       WQ_BH | WQ_HIGHPRI, 0);
7805  	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
7806  	       !system_unbound_wq || !system_freezable_wq ||
7807  	       !system_power_efficient_wq ||
7808  	       !system_freezable_power_efficient_wq ||
7809  	       !system_bh_wq || !system_bh_highpri_wq);
7810  }
7811  
7812  static void __init wq_cpu_intensive_thresh_init(void)
7813  {
7814  	unsigned long thresh;
7815  	unsigned long bogo;
7816  
7817  	pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
7818  	BUG_ON(IS_ERR(pwq_release_worker));
7819  
7820  	/* if the user set it to a specific value, keep it */
7821  	if (wq_cpu_intensive_thresh_us != ULONG_MAX)
7822  		return;
7823  
7824  	/*
7825  	 * The default of 10ms is derived from the fact that most modern (as of
7826  	 * 2023) processors can do a lot in 10ms and that it's just below what
7827  	 * most consider human-perceivable. However, the kernel also runs on a
7828  	 * lot slower CPUs including microcontrollers where the threshold is way
7829  	 * too low.
7830  	 *
7831  	 * Let's scale the threshold up to 1 second if BogoMIPS is below 4000.
7832  	 * This is by no means accurate but it doesn't have to be. The mechanism
7833  	 * is still useful even when the threshold is fully scaled up. Also, as
7834  	 * the reports would usually be applicable to everyone, some machines
7835  	 * operating on longer thresholds won't significantly diminish their
7836  	 * usefulness.
7837  	 */
7838  	thresh = 10 * USEC_PER_MSEC;
7839  
7840  	/* see init/calibrate.c for lpj -> BogoMIPS calculation */
7841  	bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
7842  	if (bogo < 4000)
7843  		thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);
7844  
7845  	pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
7846  		 loops_per_jiffy, bogo, thresh);
7847  
7848  	wq_cpu_intensive_thresh_us = thresh;
7849  }
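/*
 * The auto-scaling above only applies when the administrator left
 * wq_cpu_intensive_thresh_us at its ULONG_MAX ("auto") default.  For example
 * (assuming the usual module parameter spelling), booting with
 * "workqueue.cpu_intensive_thresh_us=30000" pins the threshold at 30ms and
 * bypasses the BogoMIPS-based scaling.
 */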
7850  
7851  /**
7852   * workqueue_init - bring workqueue subsystem fully online
7853   *
7854   * This is the second step of three-staged workqueue subsystem initialization
7855   * and invoked as soon as kthreads can be created and scheduled. Workqueues have
7856   * been created and work items queued on them, but there are no kworkers
7857   * executing the work items yet. Populate the worker pools with the initial
7858   * workers and enable future kworker creations.
7859   */
void __init workqueue_init(void)
{
	struct workqueue_struct *wq;
	struct worker_pool *pool;
	int cpu, bkt;

	wq_cpu_intensive_thresh_init();

	mutex_lock(&wq_pool_mutex);

	/*
	 * Per-cpu pools created earlier could be missing the node hint. Fix
	 * them up. Also, create a rescuer for workqueues that requested it.
	 */
	for_each_possible_cpu(cpu) {
		for_each_bh_worker_pool(pool, cpu)
			pool->node = cpu_to_node(cpu);
		for_each_cpu_worker_pool(pool, cpu)
			pool->node = cpu_to_node(cpu);
	}

	list_for_each_entry(wq, &workqueues, list) {
		WARN(init_rescuer(wq),
		     "workqueue: failed to create early rescuer for %s",
		     wq->name);
	}

	mutex_unlock(&wq_pool_mutex);

	/*
	 * Create the initial workers. A BH pool has one pseudo worker that
	 * represents the shared BH execution context and thus doesn't get
	 * affected by hotplug events. Create the BH pseudo workers for all
	 * possible CPUs here.
	 */
	for_each_possible_cpu(cpu)
		for_each_bh_worker_pool(pool, cpu)
			BUG_ON(!create_worker(pool));

	for_each_online_cpu(cpu) {
		for_each_cpu_worker_pool(pool, cpu) {
			pool->flags &= ~POOL_DISASSOCIATED;
			BUG_ON(!create_worker(pool));
		}
	}

	hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
		BUG_ON(!create_worker(pool));

	wq_online = true;
	wq_watchdog_init();
}

/*
 * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to
 * @cpus_share_pod(). Each subset of CPUs that share a pod is assigned a unique
 * and consecutive pod ID. The rest of @pt is initialized accordingly.
 */
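/*
 * Worked example (illustrative): with four possible CPUs where CPUs 0-1 share
 * a pod and CPUs 2-3 share another, the loops below produce
 * cpu_pod[] = { 0, 0, 1, 1 } and nr_pods = 2, after which pod_cpus[0] spans
 * CPUs 0-1 and pod_cpus[1] spans CPUs 2-3.
 */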
static void __init init_pod_type(struct wq_pod_type *pt,
				 bool (*cpus_share_pod)(int, int))
{
	int cur, pre, cpu, pod;

	pt->nr_pods = 0;

	/* init @pt->cpu_pod[] according to @cpus_share_pod() */
	pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
	BUG_ON(!pt->cpu_pod);

	for_each_possible_cpu(cur) {
		for_each_possible_cpu(pre) {
			if (pre >= cur) {
				pt->cpu_pod[cur] = pt->nr_pods++;
				break;
			}
			if (cpus_share_pod(cur, pre)) {
				pt->cpu_pod[cur] = pt->cpu_pod[pre];
				break;
			}
		}
	}

	/* init the rest to match @pt->cpu_pod[] */
	pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
	pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);
	BUG_ON(!pt->pod_cpus || !pt->pod_node);

	for (pod = 0; pod < pt->nr_pods; pod++)
		BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));

	for_each_possible_cpu(cpu) {
		cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]);
		pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu);
	}
}

static bool __init cpus_dont_share(int cpu0, int cpu1)
{
	return false;
}

static bool __init cpus_share_smt(int cpu0, int cpu1)
{
#ifdef CONFIG_SCHED_SMT
	return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
#else
	return false;
#endif
}

static bool __init cpus_share_numa(int cpu0, int cpu1)
{
	return cpu_to_node(cpu0) == cpu_to_node(cpu1);
}

/**
 * workqueue_init_topology - initialize CPU pods for unbound workqueues
 *
 * This is the third step of three-staged workqueue subsystem initialization and
 * invoked after SMP and topology information are fully initialized. It
 * initializes the unbound CPU pods accordingly.
 */
void __init workqueue_init_topology(void)
{
	struct workqueue_struct *wq;
	int cpu;

	init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
	init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
	init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
	init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);

	wq_topo_initialized = true;

	mutex_lock(&wq_pool_mutex);

	/*
	 * Workqueues allocated earlier would have all CPUs sharing the default
	 * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue
	 * and CPU combinations to apply per-pod sharing.
	 */
	list_for_each_entry(wq, &workqueues, list) {
		for_each_online_cpu(cpu)
			unbound_wq_update_pwq(wq, cpu);
		if (wq->flags & WQ_UNBOUND) {
			mutex_lock(&wq->mutex);
			wq_update_node_max_active(wq, -1);
			mutex_unlock(&wq->mutex);
		}
	}

	mutex_unlock(&wq_pool_mutex);
}

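/*
 * Note (assumption): this helper is expected to be invoked from the
 * flush_scheduled_work() / flush_workqueue(system_wq) wrappers in
 * <linux/workqueue.h> rather than called directly, to flag flushes of the
 * system-wide workqueues set up in workqueue_init_early().
 */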
void __warn_flushing_systemwide_wq(void)
{
	pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in the near future.\n");
	dump_stack();
}
EXPORT_SYMBOL(__warn_flushing_systemwide_wq);

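/*
 * Illustrative example: booting with "workqueue.unbound_cpus=0-3" on the
 * kernel command line restricts unbound workqueue workers to CPUs 0-3 by
 * default; a malformed list is rejected with the warning below and the
 * default mask is kept.
 */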
static int __init workqueue_unbound_cpus_setup(char *str)
{
	if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) {
		cpumask_clear(&wq_cmdline_cpumask);
		pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");
	}

	return 1;
}
__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);