// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/percpu-refcount.h>

/*
 * Initially, a percpu refcount is just a set of percpu counters: we don't try
 * to detect the ref hitting 0, which means that get/put can simply increment
 * or decrement the local counter. Note that the counter on a particular cpu
 * can (and will) wrap - this is fine; when we go to shut down, the percpu
 * counters will all sum to the correct value.
 *
 * (More precisely: because modular arithmetic is commutative, the sum of all
 * the percpu_count vars will be equal to what it would have been if all the
 * gets and puts were done to a single integer, even if some of the percpu
 * integers overflow or underflow.  See the worked example below.)
 *
 * The real trick to implementing percpu refcounts is shutdown. We can't detect
 * the ref hitting 0 on every put - this would require global synchronization
 * and defeat the whole purpose of using percpu refs.
 *
 * What we do is require the user to keep track of the initial refcount; we
 * know the ref can't hit 0 before the user drops the initial ref, so as long
 * as we convert to non-percpu mode before the initial ref is dropped,
 * everything works.
 *
 * Converting to non-percpu mode is done with some RCUish stuff in
 * percpu_ref_kill(). Additionally, we need a bias value so that the
 * atomic_long_t can't hit 0 before we've added up all the percpu refs.
 */
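
/*
 * Worked example of the modular-arithmetic argument above (a sketch using
 * hypothetical 8-bit counters; the real counters are unsigned long):
 *
 *	u8 cpu0 = 0, cpu1 = 0;
 *	int i;
 *
 *	for (i = 0; i < 300; i++)
 *		cpu0++;			// 300 gets on cpu0: cpu0 == 44
 *	for (i = 0; i < 300; i++)
 *		cpu1--;			// 300 puts on cpu1: cpu1 == 212
 *
 *	// (u8)(cpu0 + cpu1) == 0, the same net count a single integer
 *	// would have recorded, even though both counters wrapped.
 */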

#define PERCPU_COUNT_BIAS	(1LU << (BITS_PER_LONG - 1))

static DEFINE_SPINLOCK(percpu_ref_switch_lock);
static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);

static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
{
	return (unsigned long __percpu *)
		(ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD);
}

/**
 * percpu_ref_init - initialize a percpu refcount
 * @ref: percpu_ref to initialize
 * @release: function which will be called when refcount hits 0
 * @flags: PERCPU_REF_INIT_* flags
 * @gfp: allocation mask to use
 *
 * Initializes @ref.  @ref starts out in percpu mode with a refcount of 1 unless
 * @flags contains PERCPU_REF_INIT_ATOMIC or PERCPU_REF_INIT_DEAD.  These flags
 * change the start state to atomic with the latter setting the initial refcount
 * to 0.  See the definitions of PERCPU_REF_INIT_* flags for flag behaviors.
 *
 * Note that @release must not sleep - it may potentially be called from RCU
 * callback context by percpu_ref_kill().
 */
int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
		    unsigned int flags, gfp_t gfp)
{
	size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS,
			     __alignof__(unsigned long));
	unsigned long start_count = 0;
	struct percpu_ref_data *data;

	ref->percpu_count_ptr = (unsigned long)
		__alloc_percpu_gfp(sizeof(unsigned long), align, gfp);
	if (!ref->percpu_count_ptr)
		return -ENOMEM;

	data = kzalloc(sizeof(*ref->data), gfp);
	if (!data) {
		free_percpu((void __percpu *)ref->percpu_count_ptr);
		ref->percpu_count_ptr = 0;
		return -ENOMEM;
	}

	data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC;
	data->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT;

	if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) {
		ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
		data->allow_reinit = true;
	} else {
		start_count += PERCPU_COUNT_BIAS;
	}

	if (flags & PERCPU_REF_INIT_DEAD)
		ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
	else
		start_count++;

	atomic_long_set(&data->count, start_count);

	data->release = release;
	data->confirm_switch = NULL;
	data->ref = ref;
	ref->data = data;
	return 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_init);
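
/*
 * Example (an illustrative sketch, not used by this file): a hypothetical
 * object embedding a percpu_ref could be set up as follows; "struct foo",
 * foo_release() and foo_create() are made-up names.
 *
 *	struct foo {
 *		struct percpu_ref ref;
 *	};
 *
 *	static void foo_release(struct percpu_ref *ref)
 *	{
 *		struct foo *foo = container_of(ref, struct foo, ref);
 *
 *		// must not sleep, may run from RCU callback context
 *		percpu_ref_exit(&foo->ref);
 *		kfree(foo);
 *	}
 *
 *	static struct foo *foo_create(void)
 *	{
 *		struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);
 *
 *		if (foo && percpu_ref_init(&foo->ref, foo_release, 0,
 *					   GFP_KERNEL)) {
 *			kfree(foo);
 *			foo = NULL;
 *		}
 *		return foo;	// starts in percpu mode with a count of 1
 *	}
 */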

static void __percpu_ref_exit(struct percpu_ref *ref)
{
	unsigned long __percpu *percpu_count = percpu_count_ptr(ref);

	if (percpu_count) {
		/* non-NULL confirm_switch indicates switching in progress */
		WARN_ON_ONCE(ref->data && ref->data->confirm_switch);
		free_percpu(percpu_count);
		ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
	}
}

/**
 * percpu_ref_exit - undo percpu_ref_init()
 * @ref: percpu_ref to exit
 *
 * This function exits @ref.  The caller is responsible for ensuring that
 * @ref is no longer in active use.  The usual places to invoke this
 * function from are the @ref->release() callback and the init failure path
 * where percpu_ref_init() succeeded but other parts of the initialization
 * of the embedding object failed.
 */
void percpu_ref_exit(struct percpu_ref *ref)
{
	struct percpu_ref_data *data = ref->data;
	unsigned long flags;

	__percpu_ref_exit(ref);

	if (!data)
		return;

	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
	ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) <<
		__PERCPU_REF_FLAG_BITS;
	ref->data = NULL;
	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);

	kfree(data);
}
EXPORT_SYMBOL_GPL(percpu_ref_exit);
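
/*
 * Example (hedged sketch; names are hypothetical): the init-failure path
 * mentioned above, where percpu_ref_init() succeeded but a later setup
 * step of the embedding object failed.
 *
 *	static int foo_setup(struct foo *foo)
 *	{
 *		int ret;
 *
 *		ret = percpu_ref_init(&foo->ref, foo_release, 0, GFP_KERNEL);
 *		if (ret)
 *			return ret;
 *
 *		ret = foo_setup_rest(foo);		// hypothetical later step
 *		if (ret)
 *			percpu_ref_exit(&foo->ref);	// undo the init
 *		return ret;
 *	}
 */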

static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
{
	struct percpu_ref_data *data = container_of(rcu,
			struct percpu_ref_data, rcu);
	struct percpu_ref *ref = data->ref;

	data->confirm_switch(ref);
	data->confirm_switch = NULL;
	wake_up_all(&percpu_ref_switch_waitq);

	if (!data->allow_reinit)
		__percpu_ref_exit(ref);

	/* drop ref from percpu_ref_switch_to_atomic() */
	percpu_ref_put(ref);
}

static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
{
	struct percpu_ref_data *data = container_of(rcu,
			struct percpu_ref_data, rcu);
	struct percpu_ref *ref = data->ref;
	unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
	static atomic_t underflows;
	unsigned long count = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		count += *per_cpu_ptr(percpu_count, cpu);

	pr_debug("global %lu percpu %lu\n",
		 atomic_long_read(&data->count), count);

	/*
	 * It's crucial that we sum the percpu counters _before_ adding the sum
	 * to &ref->count; since gets could be happening on one cpu while puts
	 * happen on another, adding a single cpu's count could cause
	 * @ref->count to hit 0 before we've got a consistent value - but the
	 * sum of all the counts will be consistent and correct.
	 *
	 * Subtracting the bias value then has to happen _after_ adding count to
	 * &ref->count; we need the bias value to prevent &ref->count from
	 * reaching 0 before we add the percpu counts. But doing it at the same
	 * time is equivalent and saves us atomic operations (a worked numeric
	 * example follows this function):
	 */
	atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count);

	if (WARN_ONCE(atomic_long_read(&data->count) <= 0,
		      "percpu ref (%ps) <= 0 (%ld) after switching to atomic",
		      data->release, atomic_long_read(&data->count)) &&
	    atomic_inc_return(&underflows) < 4) {
		pr_err("%s(): percpu_ref underflow", __func__);
		mem_dump_obj(data);
	}

	/* @ref is viewed as dead on all CPUs, send out switch confirmation */
	percpu_ref_call_confirm_rcu(rcu);
}
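
/*
 * Worked example of the ordering argument above (illustrative numbers, not
 * taken from a real trace): suppose the atomic counter holds BIAS + 1 (the
 * initial ref) and the percpu deltas are +130 on cpu0 and -128 on cpu1,
 * i.e. a true count of 3.  Folding in cpu1's -128 on its own would drop the
 * unbiased value to -127; only the bias keeps the atomic_long_t away from
 * zero while the percpu counts are being added.  Summing first gives
 * 130 - 128 = 2, and a single atomic_long_add(2 - BIAS) moves the counter
 * from BIAS + 1 straight to the correct value 3.
 */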

static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
{
}

static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
					  percpu_ref_func_t *confirm_switch)
{
	if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
		if (confirm_switch)
			confirm_switch(ref);
		return;
	}

	/* switching from percpu to atomic */
	ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;

	/*
	 * Non-NULL ->confirm_switch is used to indicate that switching is
	 * in progress.  Use the noop one if unspecified.
	 */
	ref->data->confirm_switch = confirm_switch ?:
		percpu_ref_noop_confirm_switch;

	percpu_ref_get(ref);	/* put after confirmation */
	call_rcu_hurry(&ref->data->rcu,
		       percpu_ref_switch_to_atomic_rcu);
}

static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
	unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
	int cpu;

	BUG_ON(!percpu_count);

	if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
		return;

	if (WARN_ON_ONCE(!ref->data->allow_reinit))
		return;

	atomic_long_add(PERCPU_COUNT_BIAS, &ref->data->count);

	/*
	 * Restore per-cpu operation.  smp_store_release() is paired
	 * with READ_ONCE() in __ref_is_percpu() and guarantees that the
	 * zeroing is visible to all percpu accesses which can see the
	 * following __PERCPU_REF_ATOMIC clearing.
	 */
	for_each_possible_cpu(cpu)
		*per_cpu_ptr(percpu_count, cpu) = 0;

	smp_store_release(&ref->percpu_count_ptr,
			  ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
}

static void __percpu_ref_switch_mode(struct percpu_ref *ref,
				     percpu_ref_func_t *confirm_switch)
{
	struct percpu_ref_data *data = ref->data;

	lockdep_assert_held(&percpu_ref_switch_lock);

	/*
	 * If the previous ATOMIC switching hasn't finished yet, wait for
	 * its completion.  If the caller ensures that ATOMIC switching
	 * isn't in progress, this function can be called from any context.
	 */
	wait_event_lock_irq(percpu_ref_switch_waitq, !data->confirm_switch,
			    percpu_ref_switch_lock);

	if (data->force_atomic || percpu_ref_is_dying(ref))
		__percpu_ref_switch_to_atomic(ref, confirm_switch);
	else
		__percpu_ref_switch_to_percpu(ref);
}
/**
 * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
 * @ref: percpu_ref to switch to atomic mode
 * @confirm_switch: optional confirmation callback
 *
 * There's no reason to use this function for the usual reference counting.
 * Use percpu_ref_kill[_and_confirm]().
 *
 * Schedule switching of @ref to atomic mode.  All its percpu counts will
 * be collected to the main atomic counter.  On completion, when all CPUs
 * are guaranteed to be in atomic mode, @confirm_switch, which may not
 * block, is invoked.  This function may be invoked concurrently with all
 * the get/put operations and can safely be mixed with kill and reinit
 * operations.  Note that @ref will stay in atomic mode across kill/reinit
 * cycles until percpu_ref_switch_to_percpu() is called.
 *
 * This function may block if @ref is in the process of switching to atomic
 * mode.  If the caller ensures that @ref is not in the process of
 * switching to atomic mode, this function can be called from any context.
 */
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
				 percpu_ref_func_t *confirm_switch)
{
	unsigned long flags;

	spin_lock_irqsave(&percpu_ref_switch_lock, flags);

	ref->data->force_atomic = true;
	__percpu_ref_switch_mode(ref, confirm_switch);

	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);
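
/*
 * Example (a minimal sketch with hypothetical names): forcing a ref into
 * atomic mode asynchronously and getting a notification once no CPU can
 * still be using the percpu counters.
 *
 *	static void foo_atomic_confirmed(struct percpu_ref *ref)
 *	{
 *		// runs from RCU callback context, must not block
 *		pr_debug("foo ref now counted atomically\n");
 *	}
 *
 *	percpu_ref_switch_to_atomic(&foo->ref, foo_atomic_confirmed);
 */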

/**
 * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode
 * @ref: percpu_ref to switch to atomic mode
 *
 * Schedule switching the ref to atomic mode, and wait for the
 * switch to complete.  Caller must ensure that no other thread
 * will switch back to percpu mode.
 */
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
{
	percpu_ref_switch_to_atomic(ref, NULL);
	wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);
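
/*
 * Example (illustrative only, hypothetical "foo"): a sleepable freeze-style
 * path can force atomic mode, do its work while gets/puts hit the shared
 * counter, and then let the ref go back to percpu mode.
 *
 *	percpu_ref_switch_to_atomic_sync(&foo->ref);	// may sleep
 *	foo_do_slow_accounting(foo);			// hypothetical step
 *	percpu_ref_switch_to_percpu(&foo->ref);
 */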

/**
 * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
 * @ref: percpu_ref to switch to percpu mode
 *
 * There's no reason to use this function for the usual reference counting.
 * To re-use an expired ref, use percpu_ref_reinit().
 *
 * Switch @ref to percpu mode.  This function may be invoked concurrently
 * with all the get/put operations and can safely be mixed with kill and
 * reinit operations.  This function reverses the sticky atomic state set
 * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic().  If @ref is
 * dying or dead, the actual switching takes place on the following
 * percpu_ref_reinit().
 *
 * This function may block if @ref is in the process of switching to atomic
 * mode.  If the caller ensures that @ref is not in the process of
 * switching to atomic mode, this function can be called from any context.
 */
void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
	unsigned long flags;

	spin_lock_irqsave(&percpu_ref_switch_lock, flags);

	ref->data->force_atomic = false;
	__percpu_ref_switch_mode(ref, NULL);

	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
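
/*
 * Example (sketch, hypothetical names): undoing a sticky atomic start.
 * A ref created with PERCPU_REF_INIT_ATOMIC stays in atomic mode until
 * this call.
 *
 *	percpu_ref_init(&foo->ref, foo_release,
 *			PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
 *	foo_finish_setup(foo);			// hypothetical slow start-up
 *	percpu_ref_switch_to_percpu(&foo->ref);	// fast path from now on
 */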

/**
 * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation
 * @ref: percpu_ref to kill
 * @confirm_kill: optional confirmation callback
 *
 * Equivalent to percpu_ref_kill() but also schedules kill confirmation if
 * @confirm_kill is not NULL.  @confirm_kill, which may not block, will be
 * called after @ref is seen as dead from all CPUs at which point all
 * further invocations of percpu_ref_tryget_live() will fail.  See
 * percpu_ref_tryget_live() for details.
 *
 * This function normally doesn't block and can be called from any context
 * but it may block if @confirm_kill is specified and @ref is in the
 * process of switching to atomic mode by percpu_ref_switch_to_atomic().
 *
 * There are no implied RCU grace periods between kill and release.
 */
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
				 percpu_ref_func_t *confirm_kill)
{
	unsigned long flags;

	spin_lock_irqsave(&percpu_ref_switch_lock, flags);

	WARN_ONCE(percpu_ref_is_dying(ref),
		  "%s called more than once on %ps!", __func__,
		  ref->data->release);

	ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
	__percpu_ref_switch_mode(ref, confirm_kill);
	percpu_ref_put(ref);

	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
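
/*
 * Example (hedged sketch, hypothetical names): the usual teardown pattern.
 * Kill the ref so percpu_ref_tryget_live() starts failing, let the
 * confirmation tell us when every CPU sees it as dead, and let the final
 * put invoke @release.
 *
 *	static void foo_confirm_kill(struct percpu_ref *ref)
 *	{
 *		struct foo *foo = container_of(ref, struct foo, ref);
 *
 *		wake_up(&foo->shutdown_waitq);	// hypothetical waitqueue
 *	}
 *
 *	percpu_ref_kill_and_confirm(&foo->ref, foo_confirm_kill);
 *	// or just percpu_ref_kill(&foo->ref) when no confirmation is needed
 */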

/**
 * percpu_ref_is_zero - test whether a percpu refcount reached zero
 * @ref: percpu_ref to test
 *
 * Returns %true if @ref reached zero.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
bool percpu_ref_is_zero(struct percpu_ref *ref)
{
	unsigned long __percpu *percpu_count;
	unsigned long count, flags;

	if (__ref_is_percpu(ref, &percpu_count))
		return false;

	/* protect us from being destroyed */
	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
	if (ref->data)
		count = atomic_long_read(&ref->data->count);
	else
		count = ref->percpu_count_ptr >> __PERCPU_REF_FLAG_BITS;
	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);

	return count == 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_is_zero);
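
/*
 * Example (illustrative sketch): waiting for all references to drain after
 * a kill, assuming a hypothetical "foo" whose release callback only wakes
 * foo->shutdown_waitq and does not exit or free @foo itself.
 *
 *	percpu_ref_kill(&foo->ref);
 *	wait_event(foo->shutdown_waitq, percpu_ref_is_zero(&foo->ref));
 *	percpu_ref_exit(&foo->ref);
 */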

/**
 * percpu_ref_reinit - re-initialize a percpu refcount
 * @ref: percpu_ref to re-initialize
 *
 * Re-initialize @ref so that it's in the same state as when it finished
 * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD.  @ref must have been
 * initialized successfully and reached 0 but not exited.
 *
 * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
 * this function is in progress.
 */
void percpu_ref_reinit(struct percpu_ref *ref)
{
	WARN_ON_ONCE(!percpu_ref_is_zero(ref));

	percpu_ref_resurrect(ref);
}
EXPORT_SYMBOL_GPL(percpu_ref_reinit);
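
/*
 * Example (sketch, hypothetical names): recycling a drained ref instead of
 * exiting it.  Requires the ref to have been created with
 * PERCPU_REF_ALLOW_REINIT (or to have started atomic/dead) so the percpu
 * counters were kept around, and a release callback that neither exits nor
 * frees the ref.
 *
 *	percpu_ref_kill(&foo->ref);
 *	wait_event(foo->shutdown_waitq, percpu_ref_is_zero(&foo->ref));
 *	// quiesced; later, bring the object back into service:
 *	percpu_ref_reinit(&foo->ref);	// live again with a count of 1
 */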

/**
 * percpu_ref_resurrect - modify a percpu refcount from dead to live
 * @ref: percpu_ref to resurrect
 *
 * Modify @ref so that it's in the same state as before percpu_ref_kill() was
 * called. @ref must be dead but must not yet have exited.
 *
 * If @ref->release() frees @ref then the caller is responsible for
 * guaranteeing that @ref->release() does not get called while this
 * function is in progress.
 *
 * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
 * this function is in progress.
 */
void percpu_ref_resurrect(struct percpu_ref *ref)
{
	unsigned long __percpu *percpu_count;
	unsigned long flags;

	spin_lock_irqsave(&percpu_ref_switch_lock, flags);

	WARN_ON_ONCE(!percpu_ref_is_dying(ref));
	WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count));

	ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
	percpu_ref_get(ref);
	__percpu_ref_switch_mode(ref, NULL);

	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_resurrect);
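
/*
 * Example (minimal sketch, hypothetical names): unlike percpu_ref_reinit(),
 * resurrecting does not require the count to have reached zero - it takes
 * back a kill while outstanding references may still exist.
 *
 *	percpu_ref_kill(&foo->ref);		// stop new tryget_live users
 *	if (foo_cancel_shutdown(foo)) {		// hypothetical change of mind
 *		// caller must guarantee @release has not run yet
 *		percpu_ref_resurrect(&foo->ref);
 *	}
 */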