// SPDX-License-Identifier: GPL-2.0
/*
 * Fast batching percpu counters.
 */

#include <linux/percpu_counter.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/debugobjects.h>

#ifdef CONFIG_HOTPLUG_CPU
static LIST_HEAD(percpu_counters);
static DEFINE_SPINLOCK(percpu_counters_lock);
#endif

#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER

static const struct debug_obj_descr percpu_counter_debug_descr;

static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
	struct percpu_counter *fbc = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		percpu_counter_destroy(fbc);
		debug_object_free(fbc, &percpu_counter_debug_descr);
		return true;
	default:
		return false;
	}
}

static const struct debug_obj_descr percpu_counter_debug_descr = {
	.name		= "percpu_counter",
	.fixup_free	= percpu_counter_fixup_free,
};

static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{
	debug_object_init(fbc, &percpu_counter_debug_descr);
	debug_object_activate(fbc, &percpu_counter_debug_descr);
}

static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{
	debug_object_deactivate(fbc, &percpu_counter_debug_descr);
	debug_object_free(fbc, &percpu_counter_debug_descr);
}

#else	/* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{ }
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{ }
#endif	/* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */

void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
	int cpu;
	unsigned long flags;

	raw_spin_lock_irqsave(&fbc->lock, flags);
	for_each_possible_cpu(cpu) {
		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
		*pcount = 0;
	}
	fbc->count = amount;
	raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
EXPORT_SYMBOL(percpu_counter_set);

/*
 * Add to a counter while respecting batch size.
 *
 * There are 2 implementations, both dealing with the following problem:
 *
 * The decision between the slow path and the fast path, and the actual
 * update, must be atomic.  Otherwise a call in process context could check
 * the current values and decide that the fast path can be used.  If an
 * interrupt then occurs before the this_cpu_add(), and the interrupt handler
 * updates this_cpu(*fbc->counters), the this_cpu_add() executed after the
 * interrupt has completed can produce values larger than "batch" or even
 * overflow.
 */
#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * Safety against interrupts is achieved in 2 ways:
 * 1. the fast path uses local cmpxchg (note: no lock prefix)
 * 2. the slow path operates with interrupts disabled
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
	s64 count;
	unsigned long flags;

	count = this_cpu_read(*fbc->counters);
	do {
		if (unlikely(abs(count + amount) >= batch)) {
			raw_spin_lock_irqsave(&fbc->lock, flags);
			/*
			 * Note: by now we might have migrated to another CPU
			 * or the value might have changed.
			 */
			count = __this_cpu_read(*fbc->counters);
			fbc->count += count + amount;
			__this_cpu_sub(*fbc->counters, count);
			raw_spin_unlock_irqrestore(&fbc->lock, flags);
			return;
		}
	} while (!this_cpu_try_cmpxchg(*fbc->counters, &count, count + amount));
}
#else
/*
 * local_irq_save() is used to make the function irq safe as a whole:
 * - the slow path alone would be ok, as it is protected by an irq-safe
 *   spinlock;
 * - this_cpu_add() alone would be ok, as it is irq-safe by definition;
 * but the read-check-update sequence must not be interrupted in between,
 * as explained above.
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
	s64 count;
	unsigned long flags;

	local_irq_save(flags);
	count = __this_cpu_read(*fbc->counters) + amount;
	if (abs(count) >= batch) {
		raw_spin_lock(&fbc->lock);
		fbc->count += count;
		__this_cpu_sub(*fbc->counters, count - amount);
		raw_spin_unlock(&fbc->lock);
	} else {
		this_cpu_add(*fbc->counters, amount);
	}
	local_irq_restore(flags);
}
#endif
EXPORT_SYMBOL(percpu_counter_add_batch);
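
/*
 * Illustrative usage sketch (not from this file): callers normally go
 * through percpu_counter_add()/percpu_counter_inc(), which pass the global
 * percpu_counter_batch; percpu_counter_add_batch() is for callers willing
 * to trade accuracy for fewer fbc->lock acquisitions by passing a larger
 * batch.  The counter and variable names below are made up for the example.
 *
 *	struct percpu_counter nr_dirty;
 *
 *	percpu_counter_init(&nr_dirty, 0, GFP_KERNEL);
 *	percpu_counter_add(&nr_dirty, 1);	// uses percpu_counter_batch
 *	percpu_counter_add_batch(&nr_dirty, nr_pages,
 *				 4 * percpu_counter_batch);
 *	percpu_counter_destroy(&nr_dirty);
 */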

/*
 * For a percpu_counter with a big batch, the deviation of its count can be
 * large.  When that deviation needs to be reduced, for example because the
 * counter's batch is decreased at runtime to get better accuracy, this sync
 * function can be run on each CPU to fold the local count back into
 * fbc->count.
 */
void percpu_counter_sync(struct percpu_counter *fbc)
{
	unsigned long flags;
	s64 count;

	raw_spin_lock_irqsave(&fbc->lock, flags);
	count = __this_cpu_read(*fbc->counters);
	fbc->count += count;
	__this_cpu_sub(*fbc->counters, count);
	raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
EXPORT_SYMBOL(percpu_counter_sync);
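
/*
 * Illustrative sketch (not from this file): percpu_counter_sync() only folds
 * the local count of the CPU it runs on, so reducing the deviation across
 * the whole machine means running it on every CPU, e.g. via on_each_cpu().
 * The helper and counter below are hypothetical, named only for the example.
 *
 *	static void sync_one(void *arg)
 *	{
 *		percpu_counter_sync(arg);
 *	}
 *
 *	on_each_cpu(sync_one, &some_counter, 1);
 */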

/*
 * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive().
 *
 * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
 * from CPUs that are in the process of being taken offline. Dying cpus have
 * been removed from the online mask, but may not have had the hotplug dead
 * notifier called to fold the percpu count back into the global counter sum.
 * By including dying CPUs in the iteration mask, we avoid this race condition
 * so __percpu_counter_sum() just does the right thing when CPUs are being taken
 * offline.
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
	s64 ret;
	int cpu;
	unsigned long flags;

	raw_spin_lock_irqsave(&fbc->lock, flags);
	ret = fbc->count;
	for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
		ret += *pcount;
	}
	raw_spin_unlock_irqrestore(&fbc->lock, flags);
	return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);
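
/*
 * Illustrative sketch (not from this file): percpu_counter_read() just
 * returns fbc->count and so can deviate from the true value by roughly
 * batch * num_online_cpus(), while percpu_counter_sum() takes fbc->lock and
 * walks the online (and dying) CPUs.  Prefer the cheap read on hot paths and
 * the precise sum only when it matters.  "some_counter" is a placeholder.
 *
 *	s64 approx = percpu_counter_read_positive(&some_counter);
 *	s64 exact  = percpu_counter_sum(&some_counter);
 */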

int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
			       gfp_t gfp, u32 nr_counters,
			       struct lock_class_key *key)
{
	unsigned long flags __maybe_unused;
	size_t counter_size;
	s32 __percpu *counters;
	u32 i;

	counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
	counters = __alloc_percpu_gfp(nr_counters * counter_size,
				      __alignof__(*counters), gfp);
	if (!counters) {
		fbc[0].counters = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_counters; i++) {
		raw_spin_lock_init(&fbc[i].lock);
		lockdep_set_class(&fbc[i].lock, key);
#ifdef CONFIG_HOTPLUG_CPU
		INIT_LIST_HEAD(&fbc[i].list);
#endif
		fbc[i].count = amount;
		fbc[i].counters = (void __percpu *)counters + i * counter_size;

		debug_percpu_counter_activate(&fbc[i]);
	}

#ifdef CONFIG_HOTPLUG_CPU
	spin_lock_irqsave(&percpu_counters_lock, flags);
	for (i = 0; i < nr_counters; i++)
		list_add(&fbc[i].list, &percpu_counters);
	spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
	return 0;
}
EXPORT_SYMBOL(__percpu_counter_init_many);

void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
{
	unsigned long flags __maybe_unused;
	u32 i;

	if (WARN_ON_ONCE(!fbc))
		return;

	if (!fbc[0].counters)
		return;

	for (i = 0; i < nr_counters; i++)
		debug_percpu_counter_deactivate(&fbc[i]);

#ifdef CONFIG_HOTPLUG_CPU
	spin_lock_irqsave(&percpu_counters_lock, flags);
	for (i = 0; i < nr_counters; i++)
		list_del(&fbc[i].list);
	spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif

	free_percpu(fbc[0].counters);

	for (i = 0; i < nr_counters; i++)
		fbc[i].counters = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy_many);
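
/*
 * Illustrative sketch (not from this file), assuming the init_many/
 * destroy_many wrappers from <linux/percpu_counter.h>: all nr_counters
 * counters share a single percpu allocation, so they must be initialized
 * and destroyed together as an array.
 *
 *	struct percpu_counter stats[4];
 *
 *	if (percpu_counter_init_many(stats, 0, GFP_KERNEL, ARRAY_SIZE(stats)))
 *		return -ENOMEM;
 *	...
 *	percpu_counter_destroy_many(stats, ARRAY_SIZE(stats));
 */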

int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);

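/*
 * percpu_counter_batch is the default batch used by percpu_counter_add() and
 * friends; compute_batch_value() below rescales it to max(32, 2 * nr_online)
 * on CPU hotplug events.  Each CPU may keep up to about +/- batch in its
 * local counter before folding it into fbc->count, so percpu_counter_read()
 * can deviate from the precise sum by roughly batch * num_online_cpus().
 */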
static int compute_batch_value(unsigned int cpu)
{
	int nr = num_online_cpus();

	percpu_counter_batch = max(32, nr*2);
	return 0;
}

static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	struct percpu_counter *fbc;

	compute_batch_value(cpu);

	spin_lock_irq(&percpu_counters_lock);
	list_for_each_entry(fbc, &percpu_counters, list) {
		s32 *pcount;

		raw_spin_lock(&fbc->lock);
		pcount = per_cpu_ptr(fbc->counters, cpu);
		fbc->count += *pcount;
		*pcount = 0;
		raw_spin_unlock(&fbc->lock);
	}
	spin_unlock_irq(&percpu_counters_lock);
#endif
	return 0;
}

/*
 * Compare the counter against the given value.
 * Return 1 if the counter is greater, 0 if equal, and -1 if less.
 */
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
	s64	count;

	count = percpu_counter_read(fbc);
	/* Check to see if rough count will be sufficient for comparison */
	if (abs(count - rhs) > (batch * num_online_cpus())) {
		if (count > rhs)
			return 1;
		else
			return -1;
	}
	/* Need to use precise count */
	count = percpu_counter_sum(fbc);
	if (count > rhs)
		return 1;
	else if (count < rhs)
		return -1;
	else
		return 0;
}
EXPORT_SYMBOL(__percpu_counter_compare);
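
/*
 * Illustrative sketch (not from this file): most callers use the
 * percpu_counter_compare() wrapper, which passes the default batch.  The
 * rough count can be off by roughly batch * num_online_cpus(), so only when
 * the value lies within that window is the expensive percpu_counter_sum()
 * needed.  The counter and variable names are made up for the example.
 *
 *	if (percpu_counter_compare(&free_blocks, nr_wanted) < 0)
 *		return -ENOSPC;
 */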

/*
 * Compare the counter, and add amount only if the resulting total stays
 * within limit: at most limit when amount is positive, or at least limit
 * when amount is negative.  Return true if amount was added, or false if
 * the total would go beyond the limit.
 *
 * A negative limit is allowed, but unusual.
 * When negative amounts (subtractions) are given to
 * percpu_counter_limited_add(), the limit would most naturally be 0, but
 * other limits are also allowed.
 *
 * Overflow is not handled: counter, limit and amount are all assumed to be
 * sane (far from S64_MIN and S64_MAX).
 */
bool __percpu_counter_limited_add(struct percpu_counter *fbc,
				  s64 limit, s64 amount, s32 batch)
{
	s64 count;
	s64 unknown;
	unsigned long flags;
	bool good = false;

	if (amount == 0)
		return true;

	local_irq_save(flags);
	unknown = batch * num_online_cpus();
	count = __this_cpu_read(*fbc->counters);

	/* Skip taking the lock when safe */
	if (abs(count + amount) <= batch &&
	    ((amount > 0 && fbc->count + unknown <= limit) ||
	     (amount < 0 && fbc->count - unknown >= limit))) {
		this_cpu_add(*fbc->counters, amount);
		local_irq_restore(flags);
		return true;
	}

	raw_spin_lock(&fbc->lock);
	count = fbc->count + amount;

	/* Skip percpu_counter_sum() when safe */
	if (amount > 0) {
		if (count - unknown > limit)
			goto out;
		if (count + unknown <= limit)
			good = true;
	} else {
		if (count + unknown < limit)
			goto out;
		if (count - unknown >= limit)
			good = true;
	}

	if (!good) {
		s32 *pcount;
		int cpu;

		for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
			pcount = per_cpu_ptr(fbc->counters, cpu);
			count += *pcount;
		}
		if (amount > 0) {
			if (count > limit)
				goto out;
		} else {
			if (count < limit)
				goto out;
		}
		good = true;
	}

	count = __this_cpu_read(*fbc->counters);
	fbc->count += count + amount;
	__this_cpu_sub(*fbc->counters, count);
out:
	raw_spin_unlock(&fbc->lock);
	local_irq_restore(flags);
	return good;
}
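
/*
 * Illustrative sketch (not from this file), assuming the
 * percpu_counter_limited_add() wrapper from <linux/percpu_counter.h> that
 * passes the default batch: enforce a hard cap on a counter without taking
 * fbc->lock on every change.  The names below are made up for the example.
 *
 *	if (!percpu_counter_limited_add(&used_blocks, max_blocks, nr_blocks))
 *		return -ENOSPC;
 *	...
 *	percpu_counter_sub(&used_blocks, nr_blocks);	// on release
 */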

static int __init percpu_counter_startup(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
				compute_batch_value, NULL);
	WARN_ON(ret < 0);
	ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
					"lib/percpu_cnt:dead", NULL,
					percpu_counter_cpu_dead);
	WARN_ON(ret < 0);
	return 0;
}
module_init(percpu_counter_startup);