1  /* SPDX-License-Identifier: GPL-2.0
2   *
3   * page_pool.c
4   *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
5   *	Copyright (C) 2016 Red Hat, Inc.
6   */
7  
8  #include <linux/error-injection.h>
9  #include <linux/types.h>
10  #include <linux/kernel.h>
11  #include <linux/slab.h>
12  #include <linux/device.h>
13  
14  #include <net/netdev_rx_queue.h>
15  #include <net/page_pool/helpers.h>
16  #include <net/xdp.h>
17  
18  #include <linux/dma-direction.h>
19  #include <linux/dma-mapping.h>
20  #include <linux/page-flags.h>
21  #include <linux/mm.h> /* for put_page() */
22  #include <linux/poison.h>
23  #include <linux/ethtool.h>
24  #include <linux/netdevice.h>
25  
26  #include <trace/events/page_pool.h>
27  
28  #include "mp_dmabuf_devmem.h"
29  #include "netmem_priv.h"
30  #include "page_pool_priv.h"
31  
32  DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
33  
34  #define DEFER_TIME (msecs_to_jiffies(1000))
35  #define DEFER_WARN_INTERVAL (60 * HZ)
36  
37  #define BIAS_MAX	(LONG_MAX >> 1)
38  
39  #ifdef CONFIG_PAGE_POOL_STATS
40  static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
41  
42  /* alloc_stat_inc is intended to be used in softirq context */
43  #define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
44  /* recycle_stat_inc is safe to use when preemption is possible. */
45  #define recycle_stat_inc(pool, __stat)							\
46  	do {										\
47  		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
48  		this_cpu_inc(s->__stat);						\
49  	} while (0)
50  
51  #define recycle_stat_add(pool, __stat, val)						\
52  	do {										\
53  		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
54  		this_cpu_add(s->__stat, val);						\
55  	} while (0)
56  
57  static const char pp_stats[][ETH_GSTRING_LEN] = {
58  	"rx_pp_alloc_fast",
59  	"rx_pp_alloc_slow",
60  	"rx_pp_alloc_slow_ho",
61  	"rx_pp_alloc_empty",
62  	"rx_pp_alloc_refill",
63  	"rx_pp_alloc_waive",
64  	"rx_pp_recycle_cached",
65  	"rx_pp_recycle_cache_full",
66  	"rx_pp_recycle_ring",
67  	"rx_pp_recycle_ring_full",
68  	"rx_pp_recycle_released_ref",
69  };
70  
71  /**
72   * page_pool_get_stats() - fetch page pool stats
73   * @pool:	pool from which page was allocated
74   * @stats:	struct page_pool_stats to fill in
75   *
76   * Retrieve statistics about the page_pool. This API is only available
77   * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
78   * A pointer to a caller allocated struct page_pool_stats structure
79   * is passed to this API which is filled in. The caller can then report
80   * those stats to the user (perhaps via ethtool, debugfs, etc.).
81   */
82  bool page_pool_get_stats(const struct page_pool *pool,
83  			 struct page_pool_stats *stats)
84  {
85  	int cpu = 0;
86  
87  	if (!stats)
88  		return false;
89  
90  	/* The caller is responsible for initializing stats. */
91  	stats->alloc_stats.fast += pool->alloc_stats.fast;
92  	stats->alloc_stats.slow += pool->alloc_stats.slow;
93  	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
94  	stats->alloc_stats.empty += pool->alloc_stats.empty;
95  	stats->alloc_stats.refill += pool->alloc_stats.refill;
96  	stats->alloc_stats.waive += pool->alloc_stats.waive;
97  
98  	for_each_possible_cpu(cpu) {
99  		const struct page_pool_recycle_stats *pcpu =
100  			per_cpu_ptr(pool->recycle_stats, cpu);
101  
102  		stats->recycle_stats.cached += pcpu->cached;
103  		stats->recycle_stats.cache_full += pcpu->cache_full;
104  		stats->recycle_stats.ring += pcpu->ring;
105  		stats->recycle_stats.ring_full += pcpu->ring_full;
106  		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
107  	}
108  
109  	return true;
110  }
111  EXPORT_SYMBOL(page_pool_get_stats);
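
/*
 * Example usage (illustrative sketch): a driver with several RX queues
 * can sum the stats of all its pools into one caller-zeroed
 * struct page_pool_stats, since this function only adds to the counters
 * passed in. The "priv"/"rxq" names are hypothetical driver state.
 *
 *	struct page_pool_stats stats = { };
 *	int i;
 *
 *	for (i = 0; i < priv->num_rx_queues; i++)
 *		page_pool_get_stats(priv->rxq[i].page_pool, &stats);
 *
 *	// stats now holds the aggregate across all RX page pools
 */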
112  
113  u8 *page_pool_ethtool_stats_get_strings(u8 *data)
114  {
115  	int i;
116  
117  	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
118  		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
119  		data += ETH_GSTRING_LEN;
120  	}
121  
122  	return data;
123  }
124  EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
125  
126  int page_pool_ethtool_stats_get_count(void)
127  {
128  	return ARRAY_SIZE(pp_stats);
129  }
130  EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
131  
132  u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
133  {
134  	const struct page_pool_stats *pool_stats = stats;
135  
136  	*data++ = pool_stats->alloc_stats.fast;
137  	*data++ = pool_stats->alloc_stats.slow;
138  	*data++ = pool_stats->alloc_stats.slow_high_order;
139  	*data++ = pool_stats->alloc_stats.empty;
140  	*data++ = pool_stats->alloc_stats.refill;
141  	*data++ = pool_stats->alloc_stats.waive;
142  	*data++ = pool_stats->recycle_stats.cached;
143  	*data++ = pool_stats->recycle_stats.cache_full;
144  	*data++ = pool_stats->recycle_stats.ring;
145  	*data++ = pool_stats->recycle_stats.ring_full;
146  	*data++ = pool_stats->recycle_stats.released_refcnt;
147  
148  	return data;
149  }
150  EXPORT_SYMBOL(page_pool_ethtool_stats_get);
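
/*
 * Example usage (illustrative sketch): the three ethtool helpers above
 * are meant to be chained from a driver's ethtool callbacks, after the
 * driver's own counters. MY_DRV_STATS_LEN, my_drv_fill_strings() and
 * my_drv_fill_stats() are hypothetical placeholders.
 *
 *	// .get_sset_count(ETH_SS_STATS)
 *	return MY_DRV_STATS_LEN + page_pool_ethtool_stats_get_count();
 *
 *	// .get_strings(ETH_SS_STATS)
 *	data = my_drv_fill_strings(data);
 *	data = page_pool_ethtool_stats_get_strings(data);
 *
 *	// .get_ethtool_stats()
 *	struct page_pool_stats pp_stats = { };
 *
 *	page_pool_get_stats(priv->page_pool, &pp_stats);
 *	data = my_drv_fill_stats(data);
 *	data = page_pool_ethtool_stats_get(data, &pp_stats);
 */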
151  
152  #else
153  #define alloc_stat_inc(pool, __stat)
154  #define recycle_stat_inc(pool, __stat)
155  #define recycle_stat_add(pool, __stat, val)
156  #endif
157  
158  static bool page_pool_producer_lock(struct page_pool *pool)
159  	__acquires(&pool->ring.producer_lock)
160  {
161  	bool in_softirq = in_softirq();
162  
163  	if (in_softirq)
164  		spin_lock(&pool->ring.producer_lock);
165  	else
166  		spin_lock_bh(&pool->ring.producer_lock);
167  
168  	return in_softirq;
169  }
170  
171  static void page_pool_producer_unlock(struct page_pool *pool,
172  				      bool in_softirq)
173  	__releases(&pool->ring.producer_lock)
174  {
175  	if (in_softirq)
176  		spin_unlock(&pool->ring.producer_lock);
177  	else
178  		spin_unlock_bh(&pool->ring.producer_lock);
179  }
180  
181  static void page_pool_struct_check(void)
182  {
183  	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
184  	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
185  	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
186  	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
187  				    PAGE_POOL_FRAG_GROUP_ALIGN);
188  }
189  
190  static int page_pool_init(struct page_pool *pool,
191  			  const struct page_pool_params *params,
192  			  int cpuid)
193  {
194  	unsigned int ring_qsize = 1024; /* Default */
195  	struct netdev_rx_queue *rxq;
196  	int err;
197  
198  	page_pool_struct_check();
199  
200  	memcpy(&pool->p, &params->fast, sizeof(pool->p));
201  	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
202  
203  	pool->cpuid = cpuid;
204  
205  	/* Validate only known flags were used */
206  	if (pool->slow.flags & ~PP_FLAG_ALL)
207  		return -EINVAL;
208  
209  	if (pool->p.pool_size)
210  		ring_qsize = pool->p.pool_size;
211  
212  	/* Sanity limit mem that can be pinned down */
213  	if (ring_qsize > 32768)
214  		return -E2BIG;
215  
216  	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
217  	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA transmit,
218  	 * which is the XDP_TX use-case.
219  	 */
220  	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
221  		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
222  		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
223  			return -EINVAL;
224  
225  		pool->dma_map = true;
226  	}
227  
228  	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
229  		/* In order to request DMA-sync-for-device the page
230  		 * needs to be mapped
231  		 */
232  		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
233  			return -EINVAL;
234  
235  		if (!pool->p.max_len)
236  			return -EINVAL;
237  
238  		pool->dma_sync = true;
239  
240  		/* pool->p.offset has to be set according to the address
241  		 * offset used by the DMA engine to start copying rx data
242  		 */
243  	}
244  
245  	pool->has_init_callback = !!pool->slow.init_callback;
246  
247  #ifdef CONFIG_PAGE_POOL_STATS
248  	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
249  		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
250  		if (!pool->recycle_stats)
251  			return -ENOMEM;
252  	} else {
253  		/* For system page pool instance we use a singular stats object
254  		 * instead of allocating a separate percpu variable for each
255  		 * (also percpu) page pool instance.
256  		 */
257  		pool->recycle_stats = &pp_system_recycle_stats;
258  		pool->system = true;
259  	}
260  #endif
261  
262  	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
263  #ifdef CONFIG_PAGE_POOL_STATS
264  		if (!pool->system)
265  			free_percpu(pool->recycle_stats);
266  #endif
267  		return -ENOMEM;
268  	}
269  
270  	atomic_set(&pool->pages_state_release_cnt, 0);
271  
272  	/* The driver calling page_pool_create() must also call page_pool_destroy() */
273  	refcount_set(&pool->user_cnt, 1);
274  
275  	if (pool->dma_map)
276  		get_device(pool->p.dev);
277  
278  	if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
279  		/* We rely on rtnl_lock()ing to make sure netdev_rx_queue
280  		 * configuration doesn't change while we're initializing
281  		 * the page_pool.
282  		 */
283  		ASSERT_RTNL();
284  		rxq = __netif_get_rx_queue(pool->slow.netdev,
285  					   pool->slow.queue_idx);
286  		pool->mp_priv = rxq->mp_params.mp_priv;
287  	}
288  
289  	if (pool->mp_priv) {
290  		err = mp_dmabuf_devmem_init(pool);
291  		if (err) {
292  			pr_warn("%s() mem-provider init failed %d\n", __func__,
293  				err);
294  			goto free_ptr_ring;
295  		}
296  
297  		static_branch_inc(&page_pool_mem_providers);
298  	}
299  
300  	return 0;
301  
302  free_ptr_ring:
303  	ptr_ring_cleanup(&pool->ring, NULL);
304  #ifdef CONFIG_PAGE_POOL_STATS
305  	if (!pool->system)
306  		free_percpu(pool->recycle_stats);
307  #endif
308  	return err;
309  }
310  
311  static void page_pool_uninit(struct page_pool *pool)
312  {
313  	ptr_ring_cleanup(&pool->ring, NULL);
314  
315  	if (pool->dma_map)
316  		put_device(pool->p.dev);
317  
318  #ifdef CONFIG_PAGE_POOL_STATS
319  	if (!pool->system)
320  		free_percpu(pool->recycle_stats);
321  #endif
322  }
323  
324  /**
325   * page_pool_create_percpu() - create a page pool for a given cpu.
326   * @params: parameters, see struct page_pool_params
327   * @cpuid: cpu identifier
328   */
329  struct page_pool *
330  page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
331  {
332  	struct page_pool *pool;
333  	int err;
334  
335  	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
336  	if (!pool)
337  		return ERR_PTR(-ENOMEM);
338  
339  	err = page_pool_init(pool, params, cpuid);
340  	if (err < 0)
341  		goto err_free;
342  
343  	err = page_pool_list(pool);
344  	if (err)
345  		goto err_uninit;
346  
347  	return pool;
348  
349  err_uninit:
350  	page_pool_uninit(pool);
351  err_free:
352  	pr_warn("%s() gave up with errno %d\n", __func__, err);
353  	kfree(pool);
354  	return ERR_PTR(err);
355  }
356  EXPORT_SYMBOL(page_pool_create_percpu);
357  
358  /**
359   * page_pool_create() - create a page pool
360   * @params: parameters, see struct page_pool_params
361   */
362  struct page *page_pool_create(const struct page_pool_params *params)
363  {
364  	return page_pool_create_percpu(params, -1);
365  }
366  EXPORT_SYMBOL(page_pool_create);
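
/*
 * Example usage (illustrative sketch): a typical RX setup lets the pool
 * handle DMA mapping and device syncs, matching the flag checks in
 * page_pool_init() above. The values are illustrative; "ring_size",
 * "netdev" and "napi" are hypothetical driver variables.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= ring_size,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= netdev->dev.parent,
 *		.napi		= napi,
 *		.dma_dir	= DMA_FROM_DEVICE,  // DMA_BIDIRECTIONAL for XDP_TX
 *		.max_len	= PAGE_SIZE,	    // max RX payload to sync for device
 *		.offset		= 0,		    // where the DMA engine starts writing RX data
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */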
367  
368  static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
369  
370  static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
371  {
372  	struct ptr_ring *r = &pool->ring;
373  	netmem_ref netmem;
374  	int pref_nid; /* preferred NUMA node */
375  
376  	/* Quicker fallback, avoid locks when ring is empty */
377  	if (__ptr_ring_empty(r)) {
378  		alloc_stat_inc(pool, empty);
379  		return 0;
380  	}
381  
382  	/* Softirq context guarantees the CPU, and thus the NUMA node, is stable.
383  	 * This assumes the CPU refilling the driver RX-ring also runs the RX-NAPI.
384  	 */
385  #ifdef CONFIG_NUMA
386  	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
387  #else
388  	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
389  	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
390  #endif
391  
392  	/* Refill alloc array, but only if NUMA match */
393  	do {
394  		netmem = (__force netmem_ref)__ptr_ring_consume(r);
395  		if (unlikely(!netmem))
396  			break;
397  
398  		if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
399  			pool->alloc.cache[pool->alloc.count++] = netmem;
400  		} else {
401  			/* NUMA mismatch;
402  			 * (1) release 1 page to page-allocator and
403  			 * (2) break out and fall through to alloc_pages_node.
404  			 * This limits stress on the page buddy allocator.
405  			 */
406  			page_pool_return_page(pool, netmem);
407  			alloc_stat_inc(pool, waive);
408  			netmem = 0;
409  			break;
410  		}
411  	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
412  
413  	/* Return last page */
414  	if (likely(pool->alloc.count > 0)) {
415  		netmem = pool->alloc.cache[--pool->alloc.count];
416  		alloc_stat_inc(pool, refill);
417  	}
418  
419  	return netmem;
420  }
421  
422  /* fast path */
423  static netmem_ref __page_pool_get_cached(struct page_pool *pool)
424  {
425  	netmem_ref netmem;
426  
427  	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
428  	if (likely(pool->alloc.count)) {
429  		/* Fast-path */
430  		netmem = pool->alloc.cache[--pool->alloc.count];
431  		alloc_stat_inc(pool, fast);
432  	} else {
433  		netmem = page_pool_refill_alloc_cache(pool);
434  	}
435  
436  	return netmem;
437  }
438  
439  static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
440  					    netmem_ref netmem,
441  					    u32 dma_sync_size)
442  {
443  #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
444  	dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
445  
446  	dma_sync_size = min(dma_sync_size, pool->p.max_len);
447  	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
448  				     dma_sync_size, pool->p.dma_dir);
449  #endif
450  }
451  
452  static __always_inline void
453  page_pool_dma_sync_for_device(const struct page_pool *pool,
454  			      netmem_ref netmem,
455  			      u32 dma_sync_size)
456  {
457  	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
458  		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
459  }
460  
461  static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
462  {
463  	dma_addr_t dma;
464  
465  	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
466  	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
467  	 * into the page private data (i.e. a 32-bit CPU with 64-bit DMA caps).
468  	 * This mapping is kept for lifetime of page, until leaving pool.
469  	 */
470  	dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
471  				 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
472  				 DMA_ATTR_SKIP_CPU_SYNC |
473  					 DMA_ATTR_WEAK_ORDERING);
474  	if (dma_mapping_error(pool->p.dev, dma))
475  		return false;
476  
477  	if (page_pool_set_dma_addr_netmem(netmem, dma))
478  		goto unmap_failed;
479  
480  	page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
481  
482  	return true;
483  
484  unmap_failed:
485  	WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
486  	dma_unmap_page_attrs(pool->p.dev, dma,
487  			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
488  			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
489  	return false;
490  }
491  
492  static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
493  						 gfp_t gfp)
494  {
495  	struct page *page;
496  
497  	gfp |= __GFP_COMP;
498  	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
499  	if (unlikely(!page))
500  		return NULL;
501  
502  	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
503  		put_page(page);
504  		return NULL;
505  	}
506  
507  	alloc_stat_inc(pool, slow_high_order);
508  	page_pool_set_pp_info(pool, page_to_netmem(page));
509  
510  	/* Track how many pages are held 'in-flight' */
511  	pool->pages_state_hold_cnt++;
512  	trace_page_pool_state_hold(pool, page_to_netmem(page),
513  				   pool->pages_state_hold_cnt);
514  	return page;
515  }
516  
517  /* slow path */
518  static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
519  							gfp_t gfp)
520  {
521  	const int bulk = PP_ALLOC_CACHE_REFILL;
522  	unsigned int pp_order = pool->p.order;
523  	bool dma_map = pool->dma_map;
524  	netmem_ref netmem;
525  	int i, nr_pages;
526  
527  	/* Don't support bulk alloc for high-order pages */
528  	if (unlikely(pp_order))
529  		return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
530  
531  	/* Unnecessary as alloc cache is empty, but guarantees zero count */
532  	if (unlikely(pool->alloc.count > 0))
533  		return pool->alloc.cache[--pool->alloc.count];
534  
535  	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
536  	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
537  
538  	nr_pages = alloc_pages_bulk_array_node(gfp,
539  					       pool->p.nid, bulk,
540  					       (struct page **)pool->alloc.cache);
541  	if (unlikely(!nr_pages))
542  		return 0;
543  
544  	/* Pages have been filled into the alloc.cache array, but the count is
545  	 * zero and the pages have not been DMA mapped yet (if required).
546  	 */
547  	for (i = 0; i < nr_pages; i++) {
548  		netmem = pool->alloc.cache[i];
549  		if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
550  			put_page(netmem_to_page(netmem));
551  			continue;
552  		}
553  
554  		page_pool_set_pp_info(pool, netmem);
555  		pool->alloc.cache[pool->alloc.count++] = netmem;
556  		/* Track how many pages are held 'in-flight' */
557  		pool->pages_state_hold_cnt++;
558  		trace_page_pool_state_hold(pool, netmem,
559  					   pool->pages_state_hold_cnt);
560  	}
561  
562  	/* Return last page */
563  	if (likely(pool->alloc.count > 0)) {
564  		netmem = pool->alloc.cache[--pool->alloc.count];
565  		alloc_stat_inc(pool, slow);
566  	} else {
567  		netmem = 0;
568  	}
569  
570  	/* A page that was just allocated should/must have refcnt 1. */
571  	return netmem;
572  }
573  
574  /* For using page_pool to replace alloc_pages() API calls; provides a
575   * synchronization guarantee for the allocation side.
576   */
577  netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
578  {
579  	netmem_ref netmem;
580  
581  	/* Fast-path: Get a page from cache */
582  	netmem = __page_pool_get_cached(pool);
583  	if (netmem)
584  		return netmem;
585  
586  	/* Slow-path: cache empty, do real allocation */
587  	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
588  		netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
589  	else
590  		netmem = __page_pool_alloc_pages_slow(pool, gfp);
591  	return netmem;
592  }
593  EXPORT_SYMBOL(page_pool_alloc_netmem);
594  
595  struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
596  {
597  	return netmem_to_page(page_pool_alloc_netmem(pool, gfp));
598  }
599  EXPORT_SYMBOL(page_pool_alloc_pages);
600  ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
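
/*
 * Example usage (illustrative sketch): the usual per-buffer pattern in
 * an RX/NAPI path is to allocate from the pool and recycle back into it
 * rather than using alloc_pages()/put_page(). The recycle helpers below
 * come from include/net/page_pool/helpers.h; error handling is
 * abbreviated.
 *
 *	struct page *page = page_pool_alloc_pages(pool, GFP_ATOMIC);
 *
 *	if (!page)
 *		return -ENOMEM;
 *	...
 *	// dropped inside the same NAPI context (e.g. XDP_DROP):
 *	page_pool_recycle_direct(pool, page);
 *	// or, from a less constrained context:
 *	page_pool_put_full_page(pool, page, false);
 */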
601  
602  /* Calculate distance between two u32 values, valid if distance is below 2^(31)
603   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
604   */
605  #define _distance(a, b)	(s32)((a) - (b))
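
/*
 * Worked example (illustrative): if hold_cnt has wrapped past U32_MAX to
 * 3 while release_cnt is still 0xfffffffe, the u32 subtraction
 * 3 - 0xfffffffe wraps to 5, and the cast to s32 keeps it positive, so
 * 5 pages are correctly reported as inflight despite the wrap.
 */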
606  
607  s32 page_pool_inflight(const struct page_pool *pool, bool strict)
608  {
609  	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
610  	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
611  	s32 inflight;
612  
613  	inflight = _distance(hold_cnt, release_cnt);
614  
615  	if (strict) {
616  		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
617  		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
618  		     inflight);
619  	} else {
620  		inflight = max(0, inflight);
621  	}
622  
623  	return inflight;
624  }
625  
626  void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
627  {
628  	netmem_set_pp(netmem, pool);
629  	netmem_or_pp_magic(netmem, PP_SIGNATURE);
630  
631  	/* Ensuring all pages have been split into one fragment initially:
632  	 * page_pool_set_pp_info() is only called once for every page when it
633  	 * is allocated from the page allocator and page_pool_fragment_page()
634  	 * is dirtying the same cache line as the page->pp_magic above, so
635  	 * the overhead is negligible.
636  	 */
637  	page_pool_fragment_netmem(netmem, 1);
638  	if (pool->has_init_callback)
639  		pool->slow.init_callback(netmem, pool->slow.init_arg);
640  }
641  
642  void page_pool_clear_pp_info(netmem_ref netmem)
643  {
644  	netmem_clear_pp_magic(netmem);
645  	netmem_set_pp(netmem, NULL);
646  }
647  
648  static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
649  							 netmem_ref netmem)
650  {
651  	dma_addr_t dma;
652  
653  	if (!pool->dma_map)
654  		/* Always account for inflight pages, even if we didn't
655  		 * map them
656  		 */
657  		return;
658  
659  	dma = page_pool_get_dma_addr_netmem(netmem);
660  
661  	/* When page is unmapped, it cannot be returned to our pool */
662  	dma_unmap_page_attrs(pool->p.dev, dma,
663  			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
664  			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
665  	page_pool_set_dma_addr_netmem(netmem, 0);
666  }
667  
668  /* Disconnects a page from a page_pool.  API users may need to
669   * disconnect a page from its page_pool, to allow it to be used as
670   * a regular page (that will eventually be returned to the normal
671   * page-allocator via put_page).
672   */
673  void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
674  {
675  	int count;
676  	bool put;
677  
678  	put = true;
679  	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
680  		put = mp_dmabuf_devmem_release_page(pool, netmem);
681  	else
682  		__page_pool_release_page_dma(pool, netmem);
683  
684  	/* This may be the last page returned, releasing the pool, so
685  	 * it is not safe to reference pool afterwards.
686  	 */
687  	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
688  	trace_page_pool_state_release(pool, netmem, count);
689  
690  	if (put) {
691  		page_pool_clear_pp_info(netmem);
692  		put_page(netmem_to_page(netmem));
693  	}
694  	/* An optimization would be to call __free_pages(page, pool->p.order)
695  	 * knowing page is not part of page-cache (thus avoiding a
696  	 * __page_cache_release() call).
697  	 */
698  }
699  
700  static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
701  {
702  	int ret;
703  	/* BH protection not needed if current is softirq */
704  	if (in_softirq())
705  		ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
706  	else
707  		ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
708  
709  	if (!ret) {
710  		recycle_stat_inc(pool, ring);
711  		return true;
712  	}
713  
714  	return false;
715  }
716  
717  /* Only allow direct recycling in special circumstances, into the
718   * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
719   *
720   * Caller must provide appropriate safe context.
721   */
722  static bool page_pool_recycle_in_cache(netmem_ref netmem,
723  				       struct page_pool *pool)
724  {
725  	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
726  		recycle_stat_inc(pool, cache_full);
727  		return false;
728  	}
729  
730  	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
731  	pool->alloc.cache[pool->alloc.count++] = netmem;
732  	recycle_stat_inc(pool, cached);
733  	return true;
734  }
735  
736  static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
737  {
738  	return netmem_is_net_iov(netmem) ||
739  	       (page_ref_count(netmem_to_page(netmem)) == 1 &&
740  		!page_is_pfmemalloc(netmem_to_page(netmem)));
741  }
742  
743  /* If the page refcnt == 1, this will try to recycle the page.
744   * If pool->dma_sync is set, we'll try to sync the DMA area for
745   * the configured size min(dma_sync_size, pool->max_len).
746   * If the page refcnt != 1, then the page will be returned to memory
747   * subsystem.
748   */
749  static __always_inline netmem_ref
750  __page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
751  		     unsigned int dma_sync_size, bool allow_direct)
752  {
753  	lockdep_assert_no_hardirq();
754  
755  	/* This allocator is optimized for the XDP mode that uses
756  	 * one-frame-per-page, but has fallbacks that act like the
757  	 * regular page allocator APIs.
758  	 *
759  	 * refcnt == 1 means page_pool owns page, and can recycle it.
760  	 *
761  	 * A page is NOT reusable when it was allocated while the system
762  	 * was under memory pressure (page_is_pfmemalloc).
763  	 */
764  	if (likely(__page_pool_page_can_be_recycled(netmem))) {
765  		/* Read barrier done in page_ref_count / READ_ONCE */
766  
767  		page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
768  
769  		if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
770  			return 0;
771  
772  		/* Page found as candidate for recycling */
773  		return netmem;
774  	}
775  
776  	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
777  	 *
778  	 * Many drivers split up the page into fragments, and some
779  	 * want to keep doing this to save memory and do refcnt based
780  	 * recycling. Support this use case too, to ease drivers
781  	 * switching between XDP/non-XDP.
782  	 *
783  	 * In case the page_pool maintains the DMA mapping, the API user must
784  	 * call page_pool_put_page() once.  In this elevated refcnt
785  	 * case, the DMA mapping is unmapped/released, as the driver is likely
786  	 * doing refcnt based recycle tricks, meaning another process
787  	 * will be invoking put_page.
788  	 */
789  	recycle_stat_inc(pool, released_refcnt);
790  	page_pool_return_page(pool, netmem);
791  
792  	return 0;
793  }
794  
795  static bool page_pool_napi_local(const struct page_pool *pool)
796  {
797  	const struct napi_struct *napi;
798  	u32 cpuid;
799  
800  	if (unlikely(!in_softirq()))
801  		return false;
802  
803  	/* Allow direct recycle if we have reasons to believe that we are
804  	 * in the same context as the consumer would run, so there's
805  	 * no possible race.
806  	 * __page_pool_put_page() makes sure we're not in hardirq context
807  	 * and interrupts are enabled prior to accessing the cache.
808  	 */
809  	cpuid = smp_processor_id();
810  	if (READ_ONCE(pool->cpuid) == cpuid)
811  		return true;
812  
813  	napi = READ_ONCE(pool->p.napi);
814  
815  	return napi && READ_ONCE(napi->list_owner) == cpuid;
816  }
817  
818  void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
819  				  unsigned int dma_sync_size, bool allow_direct)
820  {
821  	if (!allow_direct)
822  		allow_direct = page_pool_napi_local(pool);
823  
824  	netmem =
825  		__page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
826  	if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
827  		/* Cache full, fallback to free pages */
828  		recycle_stat_inc(pool, ring_full);
829  		page_pool_return_page(pool, netmem);
830  	}
831  }
832  EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
833  
834  void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
835  				unsigned int dma_sync_size, bool allow_direct)
836  {
837  	page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
838  				     allow_direct);
839  }
840  EXPORT_SYMBOL(page_pool_put_unrefed_page);
841  
842  /**
843   * page_pool_put_page_bulk() - release references on multiple pages
844   * @pool:	pool from which pages were allocated
845   * @data:	array holding page pointers
846   * @count:	number of pages in @data
847   *
848   * Tries to refill a number of pages into the ptr_ring cache while holding the
849   * ptr_ring producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
850   * will release leftover pages to the page allocator.
851   * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
852   * completion loop for the XDP_REDIRECT use case.
853   *
854   * Please note the caller must not use the data area after running
855   * page_pool_put_page_bulk(), as this function overwrites it.
856   */
857  void page_pool_put_page_bulk(struct page_pool *pool, void **data,
858  			     int count)
859  {
860  	int i, bulk_len = 0;
861  	bool allow_direct;
862  	bool in_softirq;
863  
864  	allow_direct = page_pool_napi_local(pool);
865  
866  	for (i = 0; i < count; i++) {
867  		netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));
868  
869  		/* It is not the last user for the page frag case */
870  		if (!page_pool_is_last_ref(netmem))
871  			continue;
872  
873  		netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
874  		/* Approved for bulk recycling in ptr_ring cache */
875  		if (netmem)
876  			data[bulk_len++] = (__force void *)netmem;
877  	}
878  
879  	if (!bulk_len)
880  		return;
881  
882  	/* Bulk producer into ptr_ring page_pool cache */
883  	in_softirq = page_pool_producer_lock(pool);
884  	for (i = 0; i < bulk_len; i++) {
885  		if (__ptr_ring_produce(&pool->ring, data[i])) {
886  			/* ring full */
887  			recycle_stat_inc(pool, ring_full);
888  			break;
889  		}
890  	}
891  	recycle_stat_add(pool, ring, i);
892  	page_pool_producer_unlock(pool, in_softirq);
893  
894  	/* Hopefully all pages were returned into the ptr_ring */
895  	if (likely(i == bulk_len))
896  		return;
897  
898  	/* ptr_ring cache full, free remaining pages outside producer lock
899  	 * since put_page() with refcnt == 1 can be an expensive operation
900  	 */
901  	for (; i < bulk_len; i++)
902  		page_pool_return_page(pool, (__force netmem_ref)data[i]);
903  }
904  EXPORT_SYMBOL(page_pool_put_page_bulk);
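
/*
 * Example usage (illustrative sketch): an XDP_REDIRECT TX-completion
 * loop can batch page virtual addresses and release them in one call
 * rather than one page_pool_put_page() per frame. "BULK_MAX" and
 * "frames[]" are hypothetical; in-tree callers typically go through
 * xdp_return_frame_bulk()/xdp_flush_frame_bulk().
 *
 *	void *data[BULK_MAX];
 *	int n = 0;
 *
 *	for (i = 0; i < completed; i++) {
 *		if (n == BULK_MAX) {
 *			page_pool_put_page_bulk(pool, data, n);
 *			n = 0;
 *		}
 *		data[n++] = frames[i]->data;	// virt addr backed by this pool
 *	}
 *	if (n)
 *		page_pool_put_page_bulk(pool, data, n);
 */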
905  
906  static netmem_ref page_pool_drain_frag(struct page_pool *pool,
907  				       netmem_ref netmem)
908  {
909  	long drain_count = BIAS_MAX - pool->frag_users;
910  
911  	/* Some user is still using the page frag */
912  	if (likely(page_pool_unref_netmem(netmem, drain_count)))
913  		return 0;
914  
915  	if (__page_pool_page_can_be_recycled(netmem)) {
916  		page_pool_dma_sync_for_device(pool, netmem, -1);
917  		return netmem;
918  	}
919  
920  	page_pool_return_page(pool, netmem);
921  	return 0;
922  }
923  
924  static void page_pool_free_frag(struct page_pool *pool)
925  {
926  	long drain_count = BIAS_MAX - pool->frag_users;
927  	netmem_ref netmem = pool->frag_page;
928  
929  	pool->frag_page = 0;
930  
931  	if (!netmem || page_pool_unref_netmem(netmem, drain_count))
932  		return;
933  
934  	page_pool_return_page(pool, netmem);
935  }
936  
937  netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
938  				       unsigned int *offset, unsigned int size,
939  				       gfp_t gfp)
940  {
941  	unsigned int max_size = PAGE_SIZE << pool->p.order;
942  	netmem_ref netmem = pool->frag_page;
943  
944  	if (WARN_ON(size > max_size))
945  		return 0;
946  
947  	size = ALIGN(size, dma_get_cache_alignment());
948  	*offset = pool->frag_offset;
949  
950  	if (netmem && *offset + size > max_size) {
951  		netmem = page_pool_drain_frag(pool, netmem);
952  		if (netmem) {
953  			alloc_stat_inc(pool, fast);
954  			goto frag_reset;
955  		}
956  	}
957  
958  	if (!netmem) {
959  		netmem = page_pool_alloc_netmem(pool, gfp);
960  		if (unlikely(!netmem)) {
961  			pool->frag_page = 0;
962  			return 0;
963  		}
964  
965  		pool->frag_page = netmem;
966  
967  frag_reset:
968  		pool->frag_users = 1;
969  		*offset = 0;
970  		pool->frag_offset = size;
971  		page_pool_fragment_netmem(netmem, BIAS_MAX);
972  		return netmem;
973  	}
974  
975  	pool->frag_users++;
976  	pool->frag_offset = *offset + size;
977  	alloc_stat_inc(pool, fast);
978  	return netmem;
979  }
980  EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
981  
982  struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
983  				  unsigned int size, gfp_t gfp)
984  {
985  	return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
986  							  gfp));
987  }
988  EXPORT_SYMBOL(page_pool_alloc_frag);
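
/*
 * Example usage (illustrative sketch): fragment allocation packs several
 * RX buffers into one (possibly high-order) page; the returned offset
 * plus page_pool_get_dma_addr() gives the DMA address to post to the
 * hardware. "buf_len" is a hypothetical per-buffer size.
 *
 *	unsigned int offset;
 *	struct page *page;
 *	dma_addr_t dma;
 *
 *	page = page_pool_alloc_frag(pool, &offset, buf_len, GFP_ATOMIC);
 *	if (!page)
 *		return -ENOMEM;
 *	dma = page_pool_get_dma_addr(page) + offset;
 */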
989  
990  static void page_pool_empty_ring(struct page_pool *pool)
991  {
992  	netmem_ref netmem;
993  
994  	/* Empty recycle ring */
995  	while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
996  		/* Verify the refcnt invariant of cached pages */
997  		if (!(netmem_ref_count(netmem) == 1))
998  			pr_crit("%s() page_pool refcnt %d violation\n",
999  				__func__, netmem_ref_count(netmem));
1000  
1001  		page_pool_return_page(pool, netmem);
1002  	}
1003  }
1004  
1005  static void __page_pool_destroy(struct page_pool *pool)
1006  {
1007  	if (pool->disconnect)
1008  		pool->disconnect(pool);
1009  
1010  	page_pool_unlist(pool);
1011  	page_pool_uninit(pool);
1012  
1013  	if (pool->mp_priv) {
1014  		mp_dmabuf_devmem_destroy(pool);
1015  		static_branch_dec(&page_pool_mem_providers);
1016  	}
1017  
1018  	kfree(pool);
1019  }
1020  
1021  static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
1022  {
1023  	netmem_ref netmem;
1024  
1025  	if (pool->destroy_cnt)
1026  		return;
1027  
1028  	/* Empty the alloc cache; assume the caller made sure it is
1029  	 * no longer in use, and that page_pool_alloc_pages() cannot be
1030  	 * called concurrently.
1031  	 */
1032  	while (pool->alloc.count) {
1033  		netmem = pool->alloc.cache[--pool->alloc.count];
1034  		page_pool_return_page(pool, netmem);
1035  	}
1036  }
1037  
1038  static void page_pool_scrub(struct page_pool *pool)
1039  {
1040  	page_pool_empty_alloc_cache_once(pool);
1041  	pool->destroy_cnt++;
1042  
1043  	/* No more consumers should exist, but producers could still
1044  	 * be in-flight.
1045  	 */
1046  	page_pool_empty_ring(pool);
1047  }
1048  
1049  static int page_pool_release(struct page_pool *pool)
1050  {
1051  	int inflight;
1052  
1053  	page_pool_scrub(pool);
1054  	inflight = page_pool_inflight(pool, true);
1055  	if (!inflight)
1056  		__page_pool_destroy(pool);
1057  
1058  	return inflight;
1059  }
1060  
1061  static void page_pool_release_retry(struct work_struct *wq)
1062  {
1063  	struct delayed_work *dwq = to_delayed_work(wq);
1064  	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
1065  	void *netdev;
1066  	int inflight;
1067  
1068  	inflight = page_pool_release(pool);
1069  	if (!inflight)
1070  		return;
1071  
1072  	/* Periodic warning for page pools the user can't see */
1073  	netdev = READ_ONCE(pool->slow.netdev);
1074  	if (time_after_eq(jiffies, pool->defer_warn) &&
1075  	    (!netdev || netdev == NET_PTR_POISON)) {
1076  		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
1077  
1078  		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
1079  			__func__, pool->user.id, inflight, sec);
1080  		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1081  	}
1082  
1083  	/* Still not ready to be disconnected, retry later */
1084  	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1085  }
1086  
1087  void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
1088  			   const struct xdp_mem_info *mem)
1089  {
1090  	refcount_inc(&pool->user_cnt);
1091  	pool->disconnect = disconnect;
1092  	pool->xdp_mem_id = mem->id;
1093  }
1094  
1095  void page_pool_disable_direct_recycling(struct page_pool *pool)
1096  {
1097  	/* Disable direct recycling based on pool->cpuid.
1098  	 * Paired with READ_ONCE() in page_pool_napi_local().
1099  	 */
1100  	WRITE_ONCE(pool->cpuid, -1);
1101  
1102  	if (!pool->p.napi)
1103  		return;
1104  
1105  	/* To avoid races with recycling and additional barriers make sure
1106  	 * pool and NAPI are unlinked when NAPI is disabled.
1107  	 */
1108  	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
1109  	WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
1110  
1111  	WRITE_ONCE(pool->p.napi, NULL);
1112  }
1113  EXPORT_SYMBOL(page_pool_disable_direct_recycling);
1114  
1115  void page_pool_destroy(struct page_pool *pool)
1116  {
1117  	if (!pool)
1118  		return;
1119  
1120  	if (!page_pool_put(pool))
1121  		return;
1122  
1123  	page_pool_disable_direct_recycling(pool);
1124  	page_pool_free_frag(pool);
1125  
1126  	if (!page_pool_release(pool))
1127  		return;
1128  
1129  	page_pool_detached(pool);
1130  	pool->defer_start = jiffies;
1131  	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
1132  
1133  	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
1134  	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1135  }
1136  EXPORT_SYMBOL(page_pool_destroy);
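
/*
 * Example teardown order (illustrative sketch): the driver first returns
 * every buffer it still owns, then destroys the pool. Pages still
 * inflight elsewhere (e.g. in sockets or remote TX queues) only delay
 * the final free via the deferred work above; the driver does not have
 * to chase them down. "rx_ring" and "ring_size" are hypothetical.
 *
 *	for (i = 0; i < ring_size; i++)
 *		if (rx_ring[i].page)
 *			page_pool_put_full_page(pool, rx_ring[i].page, false);
 *	page_pool_destroy(pool);
 */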
1137  
1138  /* Caller must provide appropriate safe context, e.g. NAPI. */
1139  void page_pool_update_nid(struct page_pool *pool, int new_nid)
1140  {
1141  	netmem_ref netmem;
1142  
1143  	trace_page_pool_update_nid(pool, new_nid);
1144  	pool->p.nid = new_nid;
1145  
1146  	/* Flush pool alloc cache, as refill will check NUMA node */
1147  	while (pool->alloc.count) {
1148  		netmem = pool->alloc.cache[--pool->alloc.count];
1149  		page_pool_return_page(pool, netmem);
1150  	}
1151  }
1152  EXPORT_SYMBOL(page_pool_update_nid);
1153