1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * net/sched/sch_generic.c	Generic packet scheduler routines.
4   *
5   * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6   *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
7   *              - Ingress support
8   */
9  
10  #include <linux/bitops.h>
11  #include <linux/module.h>
12  #include <linux/types.h>
13  #include <linux/kernel.h>
14  #include <linux/sched.h>
15  #include <linux/string.h>
16  #include <linux/errno.h>
17  #include <linux/netdevice.h>
18  #include <linux/skbuff.h>
19  #include <linux/rtnetlink.h>
20  #include <linux/init.h>
21  #include <linux/rcupdate.h>
22  #include <linux/list.h>
23  #include <linux/slab.h>
24  #include <linux/if_vlan.h>
25  #include <linux/skb_array.h>
26  #include <linux/if_macvlan.h>
27  #include <net/sch_generic.h>
28  #include <net/pkt_sched.h>
29  #include <net/dst.h>
30  #include <net/hotdata.h>
31  #include <trace/events/qdisc.h>
32  #include <trace/events/net.h>
33  #include <net/xfrm.h>
34  
35  /* Qdisc to use by default */
36  const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
37  EXPORT_SYMBOL(default_qdisc_ops);
38  
39  static void qdisc_maybe_clear_missed(struct Qdisc *q,
40  				     const struct netdev_queue *txq)
41  {
42  	clear_bit(__QDISC_STATE_MISSED, &q->state);
43  
44  	/* Make sure the below netif_xmit_frozen_or_stopped()
45  	 * checking happens after clearing STATE_MISSED.
46  	 */
47  	smp_mb__after_atomic();
48  
49  	/* Re-check netif_xmit_frozen_or_stopped() to make sure
50  	 * STATE_MISSED is set again in case the clear_bit() above
51  	 * wiped out a STATE_MISSED that netif_tx_wake_queue()'s
52  	 * rescheduling of net_tx_action() had just set.
53  	 */
54  	if (!netif_xmit_frozen_or_stopped(txq))
55  		set_bit(__QDISC_STATE_MISSED, &q->state);
56  	else
57  		set_bit(__QDISC_STATE_DRAINING, &q->state);
58  }
59  
60  /* Main transmission queue. */
61  
62  /* Modifications to data participating in scheduling must be protected with
63   * qdisc_lock(qdisc) spinlock.
64   *
65   * The idea is the following:
66   * - enqueue, dequeue are serialized via qdisc root lock
67   * - ingress filtering is also serialized via qdisc root lock
68   * - updates to tree and tree walking are only done under the rtnl mutex.
69   */
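
/* Illustrative sketch, not part of this file: configuration-side code is
 * expected to follow the rules above by running under the rtnl mutex and
 * taking the root lock around data-path state changes, roughly:
 *
 *	sch_tree_lock(sch);	// serializes against enqueue/dequeue
 *	// ... update limits, drop excess packets, fix backlog accounting ...
 *	sch_tree_unlock(sch);
 *
 * The exact helpers vary by caller; this only shows how the root-lock rule
 * is applied in practice.
 */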
70  
71  #define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)
72  
73  static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
74  {
75  	const struct netdev_queue *txq = q->dev_queue;
76  	spinlock_t *lock = NULL;
77  	struct sk_buff *skb;
78  
79  	if (q->flags & TCQ_F_NOLOCK) {
80  		lock = qdisc_lock(q);
81  		spin_lock(lock);
82  	}
83  
84  	skb = skb_peek(&q->skb_bad_txq);
85  	if (skb) {
86  		/* check the reason for the requeue without taking the tx lock first */
87  		txq = skb_get_tx_queue(txq->dev, skb);
88  		if (!netif_xmit_frozen_or_stopped(txq)) {
89  			skb = __skb_dequeue(&q->skb_bad_txq);
90  			if (qdisc_is_percpu_stats(q)) {
91  				qdisc_qstats_cpu_backlog_dec(q, skb);
92  				qdisc_qstats_cpu_qlen_dec(q);
93  			} else {
94  				qdisc_qstats_backlog_dec(q, skb);
95  				q->q.qlen--;
96  			}
97  		} else {
98  			skb = SKB_XOFF_MAGIC;
99  			qdisc_maybe_clear_missed(q, txq);
100  		}
101  	}
102  
103  	if (lock)
104  		spin_unlock(lock);
105  
106  	return skb;
107  }
108  
109  static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
110  {
111  	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);
112  
113  	if (unlikely(skb))
114  		skb = __skb_dequeue_bad_txq(q);
115  
116  	return skb;
117  }
118  
119  static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
120  					     struct sk_buff *skb)
121  {
122  	spinlock_t *lock = NULL;
123  
124  	if (q->flags & TCQ_F_NOLOCK) {
125  		lock = qdisc_lock(q);
126  		spin_lock(lock);
127  	}
128  
129  	__skb_queue_tail(&q->skb_bad_txq, skb);
130  
131  	if (qdisc_is_percpu_stats(q)) {
132  		qdisc_qstats_cpu_backlog_inc(q, skb);
133  		qdisc_qstats_cpu_qlen_inc(q);
134  	} else {
135  		qdisc_qstats_backlog_inc(q, skb);
136  		q->q.qlen++;
137  	}
138  
139  	if (lock)
140  		spin_unlock(lock);
141  }
142  
143  static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
144  {
145  	spinlock_t *lock = NULL;
146  
147  	if (q->flags & TCQ_F_NOLOCK) {
148  		lock = qdisc_lock(q);
149  		spin_lock(lock);
150  	}
151  
152  	while (skb) {
153  		struct sk_buff *next = skb->next;
154  
155  		__skb_queue_tail(&q->gso_skb, skb);
156  
157  		/* it's still part of the queue */
158  		if (qdisc_is_percpu_stats(q)) {
159  			qdisc_qstats_cpu_requeues_inc(q);
160  			qdisc_qstats_cpu_backlog_inc(q, skb);
161  			qdisc_qstats_cpu_qlen_inc(q);
162  		} else {
163  			q->qstats.requeues++;
164  			qdisc_qstats_backlog_inc(q, skb);
165  			q->q.qlen++;
166  		}
167  
168  		skb = next;
169  	}
170  
171  	if (lock) {
172  		spin_unlock(lock);
173  		set_bit(__QDISC_STATE_MISSED, &q->state);
174  	} else {
175  		__netif_schedule(q);
176  	}
177  }
178  
179  static void try_bulk_dequeue_skb(struct Qdisc *q,
180  				 struct sk_buff *skb,
181  				 const struct netdev_queue *txq,
182  				 int *packets)
183  {
184  	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
185  
186  	while (bytelimit > 0) {
187  		struct sk_buff *nskb = q->dequeue(q);
188  
189  		if (!nskb)
190  			break;
191  
192  		bytelimit -= nskb->len; /* covers GSO len */
193  		skb->next = nskb;
194  		skb = nskb;
195  		(*packets)++; /* GSO counts as one pkt */
196  	}
197  	skb_mark_not_on_list(skb);
198  }
199  
200  /* This variant of try_bulk_dequeue_skb() makes sure
201   * all skbs in the chain are for the same txq
202   */
203  static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
204  				      struct sk_buff *skb,
205  				      int *packets)
206  {
207  	int mapping = skb_get_queue_mapping(skb);
208  	struct sk_buff *nskb;
209  	int cnt = 0;
210  
211  	do {
212  		nskb = q->dequeue(q);
213  		if (!nskb)
214  			break;
215  		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
216  			qdisc_enqueue_skb_bad_txq(q, nskb);
217  			break;
218  		}
219  		skb->next = nskb;
220  		skb = nskb;
221  	} while (++cnt < 8);
222  	(*packets) += cnt;
223  	skb_mark_not_on_list(skb);
224  }
225  
226  /* Note that dequeue_skb() may return an skb list (via skb->next).
227   * A requeued skb (via q->gso_skb) can also be an skb list.
228   */
229  static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
230  				   int *packets)
231  {
232  	const struct netdev_queue *txq = q->dev_queue;
233  	struct sk_buff *skb = NULL;
234  
235  	*packets = 1;
236  	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
237  		spinlock_t *lock = NULL;
238  
239  		if (q->flags & TCQ_F_NOLOCK) {
240  			lock = qdisc_lock(q);
241  			spin_lock(lock);
242  		}
243  
244  		skb = skb_peek(&q->gso_skb);
245  
246  		/* skb may be null if another cpu pulls gso_skb off in between
247  		 * empty check and lock.
248  		 */
249  		if (!skb) {
250  			if (lock)
251  				spin_unlock(lock);
252  			goto validate;
253  		}
254  
255  		/* skbs in gso_skb were already validated */
256  		*validate = false;
257  		if (xfrm_offload(skb))
258  			*validate = true;
259  		/* check the reason for the requeue without taking the tx lock first */
260  		txq = skb_get_tx_queue(txq->dev, skb);
261  		if (!netif_xmit_frozen_or_stopped(txq)) {
262  			skb = __skb_dequeue(&q->gso_skb);
263  			if (qdisc_is_percpu_stats(q)) {
264  				qdisc_qstats_cpu_backlog_dec(q, skb);
265  				qdisc_qstats_cpu_qlen_dec(q);
266  			} else {
267  				qdisc_qstats_backlog_dec(q, skb);
268  				q->q.qlen--;
269  			}
270  		} else {
271  			skb = NULL;
272  			qdisc_maybe_clear_missed(q, txq);
273  		}
274  		if (lock)
275  			spin_unlock(lock);
276  		goto trace;
277  	}
278  validate:
279  	*validate = true;
280  
281  	if ((q->flags & TCQ_F_ONETXQUEUE) &&
282  	    netif_xmit_frozen_or_stopped(txq)) {
283  		qdisc_maybe_clear_missed(q, txq);
284  		return skb;
285  	}
286  
287  	skb = qdisc_dequeue_skb_bad_txq(q);
288  	if (unlikely(skb)) {
289  		if (skb == SKB_XOFF_MAGIC)
290  			return NULL;
291  		goto bulk;
292  	}
293  	skb = q->dequeue(q);
294  	if (skb) {
295  bulk:
296  		if (qdisc_may_bulk(q))
297  			try_bulk_dequeue_skb(q, skb, txq, packets);
298  		else
299  			try_bulk_dequeue_skb_slow(q, skb, packets);
300  	}
301  trace:
302  	trace_qdisc_dequeue(q, txq, *packets, skb);
303  	return skb;
304  }
305  
306  /*
307   * Transmit one or more skbs, and handle the return status as
308   * required. Owning the qdisc running bit guarantees that only one CPU
309   * can execute this function.
310   *
311   * Returns to the caller:
312   *				false  - hardware queue frozen; back off
313   *				true   - feel free to send more pkts
314   */
315  bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
316  		     struct net_device *dev, struct netdev_queue *txq,
317  		     spinlock_t *root_lock, bool validate)
318  {
319  	int ret = NETDEV_TX_BUSY;
320  	bool again = false;
321  
322  	/* And release qdisc */
323  	if (root_lock)
324  		spin_unlock(root_lock);
325  
326  	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
327  	if (validate)
328  		skb = validate_xmit_skb_list(skb, dev, &again);
329  
330  #ifdef CONFIG_XFRM_OFFLOAD
331  	if (unlikely(again)) {
332  		if (root_lock)
333  			spin_lock(root_lock);
334  
335  		dev_requeue_skb(skb, q);
336  		return false;
337  	}
338  #endif
339  
340  	if (likely(skb)) {
341  		HARD_TX_LOCK(dev, txq, smp_processor_id());
342  		if (!netif_xmit_frozen_or_stopped(txq))
343  			skb = dev_hard_start_xmit(skb, dev, txq, &ret);
344  		else
345  			qdisc_maybe_clear_missed(q, txq);
346  
347  		HARD_TX_UNLOCK(dev, txq);
348  	} else {
349  		if (root_lock)
350  			spin_lock(root_lock);
351  		return true;
352  	}
353  
354  	if (root_lock)
355  		spin_lock(root_lock);
356  
357  	if (!dev_xmit_complete(ret)) {
358  		/* Driver returned NETDEV_TX_BUSY - requeue skb */
359  		if (unlikely(ret != NETDEV_TX_BUSY))
360  			net_warn_ratelimited("BUG %s code %d qlen %d\n",
361  					     dev->name, ret, q->q.qlen);
362  
363  		dev_requeue_skb(skb, q);
364  		return false;
365  	}
366  
367  	return true;
368  }
369  
370  /*
371   * NOTE: Called under qdisc_lock(q) with locally disabled BH.
372   *
373   * running seqcount guarantees only one CPU can process
374   * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
375   * this queue.
376   *
377   *  netif_tx_lock serializes accesses to device driver.
378   *
379   *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
380   *  if one is grabbed, another must be free.
381   *
382   * Note that this procedure can be called by a watchdog timer.
383   *
384   * Returns to the caller:
385   *				false - queue is empty or throttled.
386   *				true  - queue is not empty.
387   *
388   */
389  static inline bool qdisc_restart(struct Qdisc *q, int *packets)
390  {
391  	spinlock_t *root_lock = NULL;
392  	struct netdev_queue *txq;
393  	struct net_device *dev;
394  	struct sk_buff *skb;
395  	bool validate;
396  
397  	/* Dequeue packet */
398  	skb = dequeue_skb(q, &validate, packets);
399  	if (unlikely(!skb))
400  		return false;
401  
402  	if (!(q->flags & TCQ_F_NOLOCK))
403  		root_lock = qdisc_lock(q);
404  
405  	dev = qdisc_dev(q);
406  	txq = skb_get_tx_queue(dev, skb);
407  
408  	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
409  }
410  
411  void __qdisc_run(struct Qdisc *q)
412  {
413  	int quota = READ_ONCE(net_hotdata.dev_tx_weight);
414  	int packets;
415  
416  	while (qdisc_restart(q, &packets)) {
417  		quota -= packets;
418  		if (quota <= 0) {
419  			if (q->flags & TCQ_F_NOLOCK)
420  				set_bit(__QDISC_STATE_MISSED, &q->state);
421  			else
422  				__netif_schedule(q);
423  
424  			break;
425  		}
426  	}
427  }
428  
429  unsigned long dev_trans_start(struct net_device *dev)
430  {
431  	unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start);
432  	unsigned long val;
433  	unsigned int i;
434  
435  	for (i = 1; i < dev->num_tx_queues; i++) {
436  		val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start);
437  		if (val && time_after(val, res))
438  			res = val;
439  	}
440  
441  	return res;
442  }
443  EXPORT_SYMBOL(dev_trans_start);
444  
445  static void netif_freeze_queues(struct net_device *dev)
446  {
447  	unsigned int i;
448  	int cpu;
449  
450  	cpu = smp_processor_id();
451  	for (i = 0; i < dev->num_tx_queues; i++) {
452  		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
453  
454  		/* We are the only thread of execution doing a
455  		 * freeze, but we have to grab the _xmit_lock in
456  		 * order to synchronize with threads which are in
457  		 * the ->hard_start_xmit() handler and already
458  		 * checked the frozen bit.
459  		 */
460  		__netif_tx_lock(txq, cpu);
461  		set_bit(__QUEUE_STATE_FROZEN, &txq->state);
462  		__netif_tx_unlock(txq);
463  	}
464  }
465  
466  void netif_tx_lock(struct net_device *dev)
467  {
468  	spin_lock(&dev->tx_global_lock);
469  	netif_freeze_queues(dev);
470  }
471  EXPORT_SYMBOL(netif_tx_lock);
472  
473  static void netif_unfreeze_queues(struct net_device *dev)
474  {
475  	unsigned int i;
476  
477  	for (i = 0; i < dev->num_tx_queues; i++) {
478  		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
479  
480  		/* No need to grab the _xmit_lock here.  If the
481  		 * queue is not stopped for another reason, we
482  		 * force a schedule.
483  		 */
484  		clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
485  		netif_schedule_queue(txq);
486  	}
487  }
488  
489  void netif_tx_unlock(struct net_device *dev)
490  {
491  	netif_unfreeze_queues(dev);
492  	spin_unlock(&dev->tx_global_lock);
493  }
494  EXPORT_SYMBOL(netif_tx_unlock);
495  
496  static void dev_watchdog(struct timer_list *t)
497  {
498  	struct net_device *dev = from_timer(dev, t, watchdog_timer);
499  	bool release = true;
500  
501  	spin_lock(&dev->tx_global_lock);
502  	if (!qdisc_tx_is_noop(dev)) {
503  		if (netif_device_present(dev) &&
504  		    netif_running(dev) &&
505  		    netif_carrier_ok(dev)) {
506  			unsigned int timedout_ms = 0;
507  			unsigned int i;
508  			unsigned long trans_start;
509  			unsigned long oldest_start = jiffies;
510  
511  			for (i = 0; i < dev->num_tx_queues; i++) {
512  				struct netdev_queue *txq;
513  
514  				txq = netdev_get_tx_queue(dev, i);
515  				if (!netif_xmit_stopped(txq))
516  					continue;
517  
518  				/* Paired with WRITE_ONCE() + smp_mb...() in
519  				 * netdev_tx_sent_queue() and netif_tx_stop_queue().
520  				 */
521  				smp_mb();
522  				trans_start = READ_ONCE(txq->trans_start);
523  
524  				if (time_after(jiffies, trans_start + dev->watchdog_timeo)) {
525  					timedout_ms = jiffies_to_msecs(jiffies - trans_start);
526  					atomic_long_inc(&txq->trans_timeout);
527  					break;
528  				}
529  				if (time_after(oldest_start, trans_start))
530  					oldest_start = trans_start;
531  			}
532  
533  			if (unlikely(timedout_ms)) {
534  				trace_net_dev_xmit_timeout(dev, i);
535  				netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n",
536  					    raw_smp_processor_id(),
537  					    i, timedout_ms);
538  				netif_freeze_queues(dev);
539  				dev->netdev_ops->ndo_tx_timeout(dev, i);
540  				netif_unfreeze_queues(dev);
541  			}
542  			if (!mod_timer(&dev->watchdog_timer,
543  				       round_jiffies(oldest_start +
544  						     dev->watchdog_timeo)))
545  				release = false;
546  		}
547  	}
548  	spin_unlock(&dev->tx_global_lock);
549  
550  	if (release)
551  		netdev_put(dev, &dev->watchdog_dev_tracker);
552  }
553  
554  void __netdev_watchdog_up(struct net_device *dev)
555  {
556  	if (dev->netdev_ops->ndo_tx_timeout) {
557  		if (dev->watchdog_timeo <= 0)
558  			dev->watchdog_timeo = 5*HZ;
559  		if (!mod_timer(&dev->watchdog_timer,
560  			       round_jiffies(jiffies + dev->watchdog_timeo)))
561  			netdev_hold(dev, &dev->watchdog_dev_tracker,
562  				    GFP_ATOMIC);
563  	}
564  }
565  EXPORT_SYMBOL_GPL(__netdev_watchdog_up);
566  
567  static void dev_watchdog_up(struct net_device *dev)
568  {
569  	__netdev_watchdog_up(dev);
570  }
571  
572  static void dev_watchdog_down(struct net_device *dev)
573  {
574  	netif_tx_lock_bh(dev);
575  	if (del_timer(&dev->watchdog_timer))
576  		netdev_put(dev, &dev->watchdog_dev_tracker);
577  	netif_tx_unlock_bh(dev);
578  }
579  
580  /**
581   *	netif_carrier_on - set carrier
582   *	@dev: network device
583   *
584   * Device has detected acquisition of carrier.
585   */
586  void netif_carrier_on(struct net_device *dev)
587  {
588  	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
589  		if (dev->reg_state == NETREG_UNINITIALIZED)
590  			return;
591  		atomic_inc(&dev->carrier_up_count);
592  		linkwatch_fire_event(dev);
593  		if (netif_running(dev))
594  			__netdev_watchdog_up(dev);
595  	}
596  }
597  EXPORT_SYMBOL(netif_carrier_on);
598  
599  /**
600   *	netif_carrier_off - clear carrier
601   *	@dev: network device
602   *
603   * Device has detected loss of carrier.
604   */
605  void netif_carrier_off(struct net_device *dev)
606  {
607  	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
608  		if (dev->reg_state == NETREG_UNINITIALIZED)
609  			return;
610  		atomic_inc(&dev->carrier_down_count);
611  		linkwatch_fire_event(dev);
612  	}
613  }
614  EXPORT_SYMBOL(netif_carrier_off);
615  
616  /**
617   *	netif_carrier_event - report carrier state event
618   *	@dev: network device
619   *
620   * Device has detected a carrier event but the carrier state wasn't changed.
621   * Use in drivers when querying carrier state asynchronously, to avoid missing
622   * events (link flaps) if link recovers before it's queried.
623   */
624  void netif_carrier_event(struct net_device *dev)
625  {
626  	if (dev->reg_state == NETREG_UNINITIALIZED)
627  		return;
628  	atomic_inc(&dev->carrier_up_count);
629  	atomic_inc(&dev->carrier_down_count);
630  	linkwatch_fire_event(dev);
631  }
632  EXPORT_SYMBOL_GPL(netif_carrier_event);
633  
634  /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
635     under all circumstances. It is difficult to invent anything faster or
636     cheaper.
637   */
638  
639  static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
640  			struct sk_buff **to_free)
641  {
642  	dev_core_stats_tx_dropped_inc(skb->dev);
643  	__qdisc_drop(skb, to_free);
644  	return NET_XMIT_CN;
645  }
646  
647  static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
648  {
649  	return NULL;
650  }
651  
652  struct Qdisc_ops noop_qdisc_ops __read_mostly = {
653  	.id		=	"noop",
654  	.priv_size	=	0,
655  	.enqueue	=	noop_enqueue,
656  	.dequeue	=	noop_dequeue,
657  	.peek		=	noop_dequeue,
658  	.owner		=	THIS_MODULE,
659  };
660  
661  static struct netdev_queue noop_netdev_queue = {
662  	RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
663  	RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc),
664  };
665  
666  struct Qdisc noop_qdisc = {
667  	.enqueue	=	noop_enqueue,
668  	.dequeue	=	noop_dequeue,
669  	.flags		=	TCQ_F_BUILTIN,
670  	.ops		=	&noop_qdisc_ops,
671  	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
672  	.dev_queue	=	&noop_netdev_queue,
673  	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
674  	.gso_skb = {
675  		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
676  		.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
677  		.qlen = 0,
678  		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
679  	},
680  	.skb_bad_txq = {
681  		.next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
682  		.prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
683  		.qlen = 0,
684  		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
685  	},
686  	.owner = -1,
687  };
688  EXPORT_SYMBOL(noop_qdisc);
689  
690  static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
691  			struct netlink_ext_ack *extack)
692  {
693  	/* register_qdisc() assigns a default of noop_enqueue if unset,
694  	 * but __dev_queue_xmit() treats noqueue only as such
695  	 * if this is NULL - so clear it here. */
696  	qdisc->enqueue = NULL;
697  	return 0;
698  }
699  
700  struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
701  	.id		=	"noqueue",
702  	.priv_size	=	0,
703  	.init		=	noqueue_init,
704  	.enqueue	=	noop_enqueue,
705  	.dequeue	=	noop_dequeue,
706  	.peek		=	noop_dequeue,
707  	.owner		=	THIS_MODULE,
708  };
709  
710  const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = {
711  	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
712  };
713  EXPORT_SYMBOL(sch_default_prio2band);
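
/* Worked example, reading the table above directly: an skb with
 * skb->priority == TC_PRIO_INTERACTIVE (6) indexes entry 6 and lands in
 * band 0, which pfifo_fast drains first; TC_PRIO_BESTEFFORT (0) maps to
 * band 1, and TC_PRIO_BULK (2) maps to band 2, the last band serviced.
 */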
714  
715  /* 3-band FIFO queue: old style, but should be a bit faster than
716     generic prio+fifo combination.
717   */
718  
719  #define PFIFO_FAST_BANDS 3
720  
721  /*
722   * Private data for a pfifo_fast scheduler containing:
723   *	- rings for priority bands
724   */
725  struct pfifo_fast_priv {
726  	struct skb_array q[PFIFO_FAST_BANDS];
727  };
728  
729  static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
730  					  int band)
731  {
732  	return &priv->q[band];
733  }
734  
735  static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
736  			      struct sk_buff **to_free)
737  {
738  	int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX];
739  	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
740  	struct skb_array *q = band2list(priv, band);
741  	unsigned int pkt_len = qdisc_pkt_len(skb);
742  	int err;
743  
744  	err = skb_array_produce(q, skb);
745  
746  	if (unlikely(err)) {
747  		if (qdisc_is_percpu_stats(qdisc))
748  			return qdisc_drop_cpu(skb, qdisc, to_free);
749  		else
750  			return qdisc_drop(skb, qdisc, to_free);
751  	}
752  
753  	qdisc_update_stats_at_enqueue(qdisc, pkt_len);
754  	return NET_XMIT_SUCCESS;
755  }
756  
757  static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
758  {
759  	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
760  	struct sk_buff *skb = NULL;
761  	bool need_retry = true;
762  	int band;
763  
764  retry:
765  	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
766  		struct skb_array *q = band2list(priv, band);
767  
768  		if (__skb_array_empty(q))
769  			continue;
770  
771  		skb = __skb_array_consume(q);
772  	}
773  	if (likely(skb)) {
774  		qdisc_update_stats_at_dequeue(qdisc, skb);
775  	} else if (need_retry &&
776  		   READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) {
777  		/* Delay clearing STATE_MISSED here to reduce
778  		 * the overhead of the second spin_trylock() in
779  		 * qdisc_run_begin() and of __netif_schedule() being
780  		 * called in qdisc_run_end().
781  		 */
782  		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
783  		clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
784  
785  		/* Make sure dequeuing happens after clearing
786  		 * STATE_MISSED.
787  		 */
788  		smp_mb__after_atomic();
789  
790  		need_retry = false;
791  
792  		goto retry;
793  	}
794  
795  	return skb;
796  }
797  
798  static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
799  {
800  	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
801  	struct sk_buff *skb = NULL;
802  	int band;
803  
804  	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
805  		struct skb_array *q = band2list(priv, band);
806  
807  		skb = __skb_array_peek(q);
808  	}
809  
810  	return skb;
811  }
812  
813  static void pfifo_fast_reset(struct Qdisc *qdisc)
814  {
815  	int i, band;
816  	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
817  
818  	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
819  		struct skb_array *q = band2list(priv, band);
820  		struct sk_buff *skb;
821  
822  		/* A NULL ring is possible if the destroy path is taken due to
823  		 * a failed skb_array_init() in pfifo_fast_init().
824  		 */
825  		if (!q->ring.queue)
826  			continue;
827  
828  		while ((skb = __skb_array_consume(q)) != NULL)
829  			kfree_skb(skb);
830  	}
831  
832  	if (qdisc_is_percpu_stats(qdisc)) {
833  		for_each_possible_cpu(i) {
834  			struct gnet_stats_queue *q;
835  
836  			q = per_cpu_ptr(qdisc->cpu_qstats, i);
837  			q->backlog = 0;
838  			q->qlen = 0;
839  		}
840  	}
841  }
842  
843  static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
844  {
845  	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
846  
847  	memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1);
848  	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
849  		goto nla_put_failure;
850  	return skb->len;
851  
852  nla_put_failure:
853  	return -1;
854  }
855  
856  static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
857  			   struct netlink_ext_ack *extack)
858  {
859  	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
860  	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
861  	int prio;
862  
863  	/* guard against zero length rings */
864  	if (!qlen)
865  		return -EINVAL;
866  
867  	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
868  		struct skb_array *q = band2list(priv, prio);
869  		int err;
870  
871  		err = skb_array_init(q, qlen, GFP_KERNEL);
872  		if (err)
873  			return -ENOMEM;
874  	}
875  
876  	/* Can by-pass the queue discipline */
877  	qdisc->flags |= TCQ_F_CAN_BYPASS;
878  	return 0;
879  }
880  
881  static void pfifo_fast_destroy(struct Qdisc *sch)
882  {
883  	struct pfifo_fast_priv *priv = qdisc_priv(sch);
884  	int prio;
885  
886  	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
887  		struct skb_array *q = band2list(priv, prio);
888  
889  		/* A NULL ring is possible if the destroy path is taken due to
890  		 * a failed skb_array_init() in pfifo_fast_init().
891  		 */
892  		if (!q->ring.queue)
893  			continue;
894  		/* Destroy ring but no need to kfree_skb because a call to
895  		 * pfifo_fast_reset() has already done that work.
896  		 */
897  		ptr_ring_cleanup(&q->ring, NULL);
898  	}
899  }
900  
901  static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
902  					  unsigned int new_len)
903  {
904  	struct pfifo_fast_priv *priv = qdisc_priv(sch);
905  	struct skb_array *bands[PFIFO_FAST_BANDS];
906  	int prio;
907  
908  	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
909  		struct skb_array *q = band2list(priv, prio);
910  
911  		bands[prio] = q;
912  	}
913  
914  	return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
915  					 GFP_KERNEL);
916  }
917  
918  struct Qdisc_ops pfifo_fast_ops __read_mostly = {
919  	.id		=	"pfifo_fast",
920  	.priv_size	=	sizeof(struct pfifo_fast_priv),
921  	.enqueue	=	pfifo_fast_enqueue,
922  	.dequeue	=	pfifo_fast_dequeue,
923  	.peek		=	pfifo_fast_peek,
924  	.init		=	pfifo_fast_init,
925  	.destroy	=	pfifo_fast_destroy,
926  	.reset		=	pfifo_fast_reset,
927  	.dump		=	pfifo_fast_dump,
928  	.change_tx_queue_len =  pfifo_fast_change_tx_queue_len,
929  	.owner		=	THIS_MODULE,
930  	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
931  };
932  EXPORT_SYMBOL(pfifo_fast_ops);
933  
934  static struct lock_class_key qdisc_tx_busylock;
935  
936  struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
937  			  const struct Qdisc_ops *ops,
938  			  struct netlink_ext_ack *extack)
939  {
940  	struct Qdisc *sch;
941  	unsigned int size = sizeof(*sch) + ops->priv_size;
942  	int err = -ENOBUFS;
943  	struct net_device *dev;
944  
945  	if (!dev_queue) {
946  		NL_SET_ERR_MSG(extack, "No device queue given");
947  		err = -EINVAL;
948  		goto errout;
949  	}
950  
951  	dev = dev_queue->dev;
952  	sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue));
953  
954  	if (!sch)
955  		goto errout;
956  	__skb_queue_head_init(&sch->gso_skb);
957  	__skb_queue_head_init(&sch->skb_bad_txq);
958  	gnet_stats_basic_sync_init(&sch->bstats);
959  	lockdep_register_key(&sch->root_lock_key);
960  	spin_lock_init(&sch->q.lock);
961  	lockdep_set_class(&sch->q.lock, &sch->root_lock_key);
962  
963  	if (ops->static_flags & TCQ_F_CPUSTATS) {
964  		sch->cpu_bstats =
965  			netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
966  		if (!sch->cpu_bstats)
967  			goto errout1;
968  
969  		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
970  		if (!sch->cpu_qstats) {
971  			free_percpu(sch->cpu_bstats);
972  			goto errout1;
973  		}
974  	}
975  
976  	spin_lock_init(&sch->busylock);
977  	lockdep_set_class(&sch->busylock,
978  			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
979  
980  	/* seqlock has the same scope as busylock, for NOLOCK qdisc */
981  	spin_lock_init(&sch->seqlock);
982  	lockdep_set_class(&sch->seqlock,
983  			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
984  
985  	sch->ops = ops;
986  	sch->flags = ops->static_flags;
987  	sch->enqueue = ops->enqueue;
988  	sch->dequeue = ops->dequeue;
989  	sch->dev_queue = dev_queue;
990  	sch->owner = -1;
991  	netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL);
992  	refcount_set(&sch->refcnt, 1);
993  
994  	return sch;
995  errout1:
996  	lockdep_unregister_key(&sch->root_lock_key);
997  	kfree(sch);
998  errout:
999  	return ERR_PTR(err);
1000  }
1001  
1002  struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
1003  				const struct Qdisc_ops *ops,
1004  				unsigned int parentid,
1005  				struct netlink_ext_ack *extack)
1006  {
1007  	struct Qdisc *sch;
1008  
1009  	if (!try_module_get(ops->owner)) {
1010  		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
1011  		return NULL;
1012  	}
1013  
1014  	sch = qdisc_alloc(dev_queue, ops, extack);
1015  	if (IS_ERR(sch)) {
1016  		module_put(ops->owner);
1017  		return NULL;
1018  	}
1019  	sch->parent = parentid;
1020  
1021  	if (!ops->init || ops->init(sch, NULL, extack) == 0) {
1022  		trace_qdisc_create(ops, dev_queue->dev, parentid);
1023  		return sch;
1024  	}
1025  
1026  	qdisc_put(sch);
1027  	return NULL;
1028  }
1029  EXPORT_SYMBOL(qdisc_create_dflt);
1030  
1031  /* Under qdisc_lock(qdisc) and BH! */
1032  
1033  void qdisc_reset(struct Qdisc *qdisc)
1034  {
1035  	const struct Qdisc_ops *ops = qdisc->ops;
1036  
1037  	trace_qdisc_reset(qdisc);
1038  
1039  	if (ops->reset)
1040  		ops->reset(qdisc);
1041  
1042  	__skb_queue_purge(&qdisc->gso_skb);
1043  	__skb_queue_purge(&qdisc->skb_bad_txq);
1044  
1045  	qdisc->q.qlen = 0;
1046  	qdisc->qstats.backlog = 0;
1047  }
1048  EXPORT_SYMBOL(qdisc_reset);
1049  
1050  void qdisc_free(struct Qdisc *qdisc)
1051  {
1052  	if (qdisc_is_percpu_stats(qdisc)) {
1053  		free_percpu(qdisc->cpu_bstats);
1054  		free_percpu(qdisc->cpu_qstats);
1055  	}
1056  
1057  	kfree(qdisc);
1058  }
1059  
1060  static void qdisc_free_cb(struct rcu_head *head)
1061  {
1062  	struct Qdisc *q = container_of(head, struct Qdisc, rcu);
1063  
1064  	qdisc_free(q);
1065  }
1066  
1067  static void __qdisc_destroy(struct Qdisc *qdisc)
1068  {
1069  	const struct Qdisc_ops  *ops = qdisc->ops;
1070  	struct net_device *dev = qdisc_dev(qdisc);
1071  
1072  #ifdef CONFIG_NET_SCHED
1073  	qdisc_hash_del(qdisc);
1074  
1075  	qdisc_put_stab(rtnl_dereference(qdisc->stab));
1076  #endif
1077  	gen_kill_estimator(&qdisc->rate_est);
1078  
1079  	qdisc_reset(qdisc);
1080  
1081  
1082  	if (ops->destroy)
1083  		ops->destroy(qdisc);
1084  
1085  	lockdep_unregister_key(&qdisc->root_lock_key);
1086  	module_put(ops->owner);
1087  	netdev_put(dev, &qdisc->dev_tracker);
1088  
1089  	trace_qdisc_destroy(qdisc);
1090  
1091  	call_rcu(&qdisc->rcu, qdisc_free_cb);
1092  }
1093  
1094  void qdisc_destroy(struct Qdisc *qdisc)
1095  {
1096  	if (qdisc->flags & TCQ_F_BUILTIN)
1097  		return;
1098  
1099  	__qdisc_destroy(qdisc);
1100  }
1101  
1102  void qdisc_put(struct Qdisc *qdisc)
1103  {
1104  	if (!qdisc)
1105  		return;
1106  
1107  	if (qdisc->flags & TCQ_F_BUILTIN ||
1108  	    !refcount_dec_and_test(&qdisc->refcnt))
1109  		return;
1110  
1111  	__qdisc_destroy(qdisc);
1112  }
1113  EXPORT_SYMBOL(qdisc_put);
1114  
1115  /* Version of qdisc_put() that is called with rtnl mutex unlocked.
1116   * Intended as an optimization, this function only takes the rtnl lock if
1117   * the qdisc reference counter has reached zero.
1118   */
1119  
1120  void qdisc_put_unlocked(struct Qdisc *qdisc)
1121  {
1122  	if (qdisc->flags & TCQ_F_BUILTIN ||
1123  	    !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
1124  		return;
1125  
1126  	__qdisc_destroy(qdisc);
1127  	rtnl_unlock();
1128  }
1129  EXPORT_SYMBOL(qdisc_put_unlocked);
1130  
1131  /* Attach toplevel qdisc to device queue. */
1132  struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
1133  			      struct Qdisc *qdisc)
1134  {
1135  	struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
1136  	spinlock_t *root_lock;
1137  
1138  	root_lock = qdisc_lock(oqdisc);
1139  	spin_lock_bh(root_lock);
1140  
1141  	/* ... and graft new one */
1142  	if (qdisc == NULL)
1143  		qdisc = &noop_qdisc;
1144  	rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
1145  	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
1146  
1147  	spin_unlock_bh(root_lock);
1148  
1149  	return oqdisc;
1150  }
1151  EXPORT_SYMBOL(dev_graft_qdisc);
1152  
1153  static void shutdown_scheduler_queue(struct net_device *dev,
1154  				     struct netdev_queue *dev_queue,
1155  				     void *_qdisc_default)
1156  {
1157  	struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
1158  	struct Qdisc *qdisc_default = _qdisc_default;
1159  
1160  	if (qdisc) {
1161  		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
1162  		rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default);
1163  
1164  		qdisc_put(qdisc);
1165  	}
1166  }
1167  
1168  static void attach_one_default_qdisc(struct net_device *dev,
1169  				     struct netdev_queue *dev_queue,
1170  				     void *_unused)
1171  {
1172  	struct Qdisc *qdisc;
1173  	const struct Qdisc_ops *ops = default_qdisc_ops;
1174  
1175  	if (dev->priv_flags & IFF_NO_QUEUE)
1176  		ops = &noqueue_qdisc_ops;
1177  	else if (dev->type == ARPHRD_CAN)
1178  		ops = &pfifo_fast_ops;
1179  
1180  	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
1181  	if (!qdisc)
1182  		return;
1183  
1184  	if (!netif_is_multiqueue(dev))
1185  		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1186  	rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
1187  }
1188  
1189  static void attach_default_qdiscs(struct net_device *dev)
1190  {
1191  	struct netdev_queue *txq;
1192  	struct Qdisc *qdisc;
1193  
1194  	txq = netdev_get_tx_queue(dev, 0);
1195  
1196  	if (!netif_is_multiqueue(dev) ||
1197  	    dev->priv_flags & IFF_NO_QUEUE) {
1198  		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
1199  		qdisc = rtnl_dereference(txq->qdisc_sleeping);
1200  		rcu_assign_pointer(dev->qdisc, qdisc);
1201  		qdisc_refcount_inc(qdisc);
1202  	} else {
1203  		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
1204  		if (qdisc) {
1205  			rcu_assign_pointer(dev->qdisc, qdisc);
1206  			qdisc->ops->attach(qdisc);
1207  		}
1208  	}
1209  	qdisc = rtnl_dereference(dev->qdisc);
1210  
1211  	/* Detect that default qdisc setup/init failed and fall back to "noqueue" */
1212  	if (qdisc == &noop_qdisc) {
1213  		netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
1214  			    default_qdisc_ops->id, noqueue_qdisc_ops.id);
1215  		netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
1216  		dev->priv_flags |= IFF_NO_QUEUE;
1217  		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
1218  		qdisc = rtnl_dereference(txq->qdisc_sleeping);
1219  		rcu_assign_pointer(dev->qdisc, qdisc);
1220  		qdisc_refcount_inc(qdisc);
1221  		dev->priv_flags ^= IFF_NO_QUEUE;
1222  	}
1223  
1224  #ifdef CONFIG_NET_SCHED
1225  	if (qdisc != &noop_qdisc)
1226  		qdisc_hash_add(qdisc, false);
1227  #endif
1228  }
1229  
1230  static void transition_one_qdisc(struct net_device *dev,
1231  				 struct netdev_queue *dev_queue,
1232  				 void *_need_watchdog)
1233  {
1234  	struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
1235  	int *need_watchdog_p = _need_watchdog;
1236  
1237  	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
1238  		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
1239  
1240  	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
1241  	if (need_watchdog_p) {
1242  		WRITE_ONCE(dev_queue->trans_start, 0);
1243  		*need_watchdog_p = 1;
1244  	}
1245  }
1246  
1247  void dev_activate(struct net_device *dev)
1248  {
1249  	int need_watchdog;
1250  
1251  	/* If no queueing discipline is attached to the device,
1252  	 * create a default one for devices that need queueing
1253  	 * and noqueue_qdisc for virtual interfaces.
1254  	 */
1255  
1256  	if (rtnl_dereference(dev->qdisc) == &noop_qdisc)
1257  		attach_default_qdiscs(dev);
1258  
1259  	if (!netif_carrier_ok(dev))
1260  		/* Delay activation until next carrier-on event */
1261  		return;
1262  
1263  	need_watchdog = 0;
1264  	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
1265  	if (dev_ingress_queue(dev))
1266  		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
1267  
1268  	if (need_watchdog) {
1269  		netif_trans_update(dev);
1270  		dev_watchdog_up(dev);
1271  	}
1272  }
1273  EXPORT_SYMBOL(dev_activate);
1274  
1275  static void qdisc_deactivate(struct Qdisc *qdisc)
1276  {
1277  	if (qdisc->flags & TCQ_F_BUILTIN)
1278  		return;
1279  
1280  	set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
1281  }
1282  
1283  static void dev_deactivate_queue(struct net_device *dev,
1284  				 struct netdev_queue *dev_queue,
1285  				 void *_qdisc_default)
1286  {
1287  	struct Qdisc *qdisc_default = _qdisc_default;
1288  	struct Qdisc *qdisc;
1289  
1290  	qdisc = rtnl_dereference(dev_queue->qdisc);
1291  	if (qdisc) {
1292  		qdisc_deactivate(qdisc);
1293  		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
1294  	}
1295  }
1296  
1297  static void dev_reset_queue(struct net_device *dev,
1298  			    struct netdev_queue *dev_queue,
1299  			    void *_unused)
1300  {
1301  	struct Qdisc *qdisc;
1302  	bool nolock;
1303  
1304  	qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
1305  	if (!qdisc)
1306  		return;
1307  
1308  	nolock = qdisc->flags & TCQ_F_NOLOCK;
1309  
1310  	if (nolock)
1311  		spin_lock_bh(&qdisc->seqlock);
1312  	spin_lock_bh(qdisc_lock(qdisc));
1313  
1314  	qdisc_reset(qdisc);
1315  
1316  	spin_unlock_bh(qdisc_lock(qdisc));
1317  	if (nolock) {
1318  		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
1319  		clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
1320  		spin_unlock_bh(&qdisc->seqlock);
1321  	}
1322  }
1323  
1324  static bool some_qdisc_is_busy(struct net_device *dev)
1325  {
1326  	unsigned int i;
1327  
1328  	for (i = 0; i < dev->num_tx_queues; i++) {
1329  		struct netdev_queue *dev_queue;
1330  		spinlock_t *root_lock;
1331  		struct Qdisc *q;
1332  		int val;
1333  
1334  		dev_queue = netdev_get_tx_queue(dev, i);
1335  		q = rtnl_dereference(dev_queue->qdisc_sleeping);
1336  
1337  		root_lock = qdisc_lock(q);
1338  		spin_lock_bh(root_lock);
1339  
1340  		val = (qdisc_is_running(q) ||
1341  		       test_bit(__QDISC_STATE_SCHED, &q->state));
1342  
1343  		spin_unlock_bh(root_lock);
1344  
1345  		if (val)
1346  			return true;
1347  	}
1348  	return false;
1349  }
1350  
1351  /**
1352   * 	dev_deactivate_many - deactivate transmissions on several devices
1353   * 	@head: list of devices to deactivate
1354   *
1355   *	This function returns only when all outstanding transmissions
1356   *	have completed, unless all devices are in dismantle phase.
1357   */
1358  void dev_deactivate_many(struct list_head *head)
1359  {
1360  	struct net_device *dev;
1361  
1362  	list_for_each_entry(dev, head, close_list) {
1363  		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
1364  					 &noop_qdisc);
1365  		if (dev_ingress_queue(dev))
1366  			dev_deactivate_queue(dev, dev_ingress_queue(dev),
1367  					     &noop_qdisc);
1368  
1369  		dev_watchdog_down(dev);
1370  	}
1371  
1372  	/* Wait for outstanding qdisc-less dev_queue_xmit calls or
1373  	 * outstanding qdisc enqueuing calls.
1374  	 * This is avoided if all devices are in the dismantle phase:
1375  	 * the caller will call synchronize_net() for us.
1376  	 */
1377  	synchronize_net();
1378  
1379  	list_for_each_entry(dev, head, close_list) {
1380  		netdev_for_each_tx_queue(dev, dev_reset_queue, NULL);
1381  
1382  		if (dev_ingress_queue(dev))
1383  			dev_reset_queue(dev, dev_ingress_queue(dev), NULL);
1384  	}
1385  
1386  	/* Wait for outstanding qdisc_run calls. */
1387  	list_for_each_entry(dev, head, close_list) {
1388  		while (some_qdisc_is_busy(dev)) {
1389  			/* wait_event() would avoid this sleep-loop but would
1390  			 * require expensive checks in the fast paths of packet
1391  			 * processing which isn't worth it.
1392  			 */
1393  			schedule_timeout_uninterruptible(1);
1394  		}
1395  	}
1396  }
1397  
1398  void dev_deactivate(struct net_device *dev)
1399  {
1400  	LIST_HEAD(single);
1401  
1402  	list_add(&dev->close_list, &single);
1403  	dev_deactivate_many(&single);
1404  	list_del(&single);
1405  }
1406  EXPORT_SYMBOL(dev_deactivate);
1407  
1408  static int qdisc_change_tx_queue_len(struct net_device *dev,
1409  				     struct netdev_queue *dev_queue)
1410  {
1411  	struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
1412  	const struct Qdisc_ops *ops = qdisc->ops;
1413  
1414  	if (ops->change_tx_queue_len)
1415  		return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
1416  	return 0;
1417  }
1418  
1419  void dev_qdisc_change_real_num_tx(struct net_device *dev,
1420  				  unsigned int new_real_tx)
1421  {
1422  	struct Qdisc *qdisc = rtnl_dereference(dev->qdisc);
1423  
1424  	if (qdisc->ops->change_real_num_tx)
1425  		qdisc->ops->change_real_num_tx(qdisc, new_real_tx);
1426  }
1427  
1428  void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx)
1429  {
1430  #ifdef CONFIG_NET_SCHED
1431  	struct net_device *dev = qdisc_dev(sch);
1432  	struct Qdisc *qdisc;
1433  	unsigned int i;
1434  
1435  	for (i = new_real_tx; i < dev->real_num_tx_queues; i++) {
1436  		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping);
1437  		/* Only update the default qdiscs we created;
1438  		 * qdiscs with handles are always hashed.
1439  		 */
1440  		if (qdisc != &noop_qdisc && !qdisc->handle)
1441  			qdisc_hash_del(qdisc);
1442  	}
1443  	for (i = dev->real_num_tx_queues; i < new_real_tx; i++) {
1444  		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping);
1445  		if (qdisc != &noop_qdisc && !qdisc->handle)
1446  			qdisc_hash_add(qdisc, false);
1447  	}
1448  #endif
1449  }
1450  EXPORT_SYMBOL(mq_change_real_num_tx);
1451  
1452  int dev_qdisc_change_tx_queue_len(struct net_device *dev)
1453  {
1454  	bool up = dev->flags & IFF_UP;
1455  	unsigned int i;
1456  	int ret = 0;
1457  
1458  	if (up)
1459  		dev_deactivate(dev);
1460  
1461  	for (i = 0; i < dev->num_tx_queues; i++) {
1462  		ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);
1463  
1464  		/* TODO: revert changes on a partial failure */
1465  		if (ret)
1466  			break;
1467  	}
1468  
1469  	if (up)
1470  		dev_activate(dev);
1471  	return ret;
1472  }
1473  
1474  static void dev_init_scheduler_queue(struct net_device *dev,
1475  				     struct netdev_queue *dev_queue,
1476  				     void *_qdisc)
1477  {
1478  	struct Qdisc *qdisc = _qdisc;
1479  
1480  	rcu_assign_pointer(dev_queue->qdisc, qdisc);
1481  	rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
1482  }
1483  
1484  void dev_init_scheduler(struct net_device *dev)
1485  {
1486  	rcu_assign_pointer(dev->qdisc, &noop_qdisc);
1487  	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
1488  	if (dev_ingress_queue(dev))
1489  		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
1490  
1491  	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
1492  }
1493  
1494  void dev_shutdown(struct net_device *dev)
1495  {
1496  	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
1497  	if (dev_ingress_queue(dev))
1498  		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
1499  	qdisc_put(rtnl_dereference(dev->qdisc));
1500  	rcu_assign_pointer(dev->qdisc, &noop_qdisc);
1501  
1502  	WARN_ON(timer_pending(&dev->watchdog_timer));
1503  }
1504  
1505  /**
1506   * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division
1507   * @rate:   Rate to compute reciprocal division values of
1508   * @mult:   Multiplier for reciprocal division
1509   * @shift:  Shift for reciprocal division
1510   *
1511   * The multiplier and shift for reciprocal division by rate are stored
1512   * in mult and shift.
1513   *
1514   * The deal here is to replace a divide with a reciprocal one
1515   * in the fast path (a reciprocal divide is a multiply and a shift).
1516   *
1517   * Normal formula would be :
1518   *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
1519   *
1520   * We compute mult/shift to use instead :
1521   *  time_in_ns = (len * mult) >> shift;
1522   *
1523   * We try to get the highest possible mult value for accuracy,
1524   * but have to make sure no overflows will ever happen.
1525   *
1526   * reciprocal_value() is not used here because it doesn't handle 64-bit values.
1527   */
1528  static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift)
1529  {
1530  	u64 factor = NSEC_PER_SEC;
1531  
1532  	*mult = 1;
1533  	*shift = 0;
1534  
1535  	if (rate <= 0)
1536  		return;
1537  
1538  	for (;;) {
1539  		*mult = div64_u64(factor, rate);
1540  		if (*mult & (1U << 31) || factor & (1ULL << 63))
1541  			break;
1542  		factor <<= 1;
1543  		(*shift)++;
1544  	}
1545  }
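
/* Worked example with illustrative numbers (not taken from elsewhere in
 * the tree): for rate = 125000000 bytes/sec (1 Gbit/s), NSEC_PER_SEC / rate
 * is 8, so the loop above ends with mult = 8 << 28 = 2147483648 (bit 31 set)
 * and shift = 28.  A 1500-byte packet then costs
 * (1500 * 2147483648) >> 28 = 12000 ns, matching
 * 1500 * NSEC_PER_SEC / rate exactly.
 */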
1546  
1547  void psched_ratecfg_precompute(struct psched_ratecfg *r,
1548  			       const struct tc_ratespec *conf,
1549  			       u64 rate64)
1550  {
1551  	memset(r, 0, sizeof(*r));
1552  	r->overhead = conf->overhead;
1553  	r->mpu = conf->mpu;
1554  	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
1555  	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
1556  	psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift);
1557  }
1558  EXPORT_SYMBOL(psched_ratecfg_precompute);
1559  
1560  void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64)
1561  {
1562  	r->rate_pkts_ps = pktrate64;
1563  	psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift);
1564  }
1565  EXPORT_SYMBOL(psched_ppscfg_precompute);
1566  
1567  void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
1568  			  struct tcf_proto *tp_head)
1569  {
1570  	/* Protected with chain0->filter_chain_lock.
1571  	 * Can't access chain directly because tp_head can be NULL.
1572  	 */
1573  	struct mini_Qdisc *miniq_old =
1574  		rcu_dereference_protected(*miniqp->p_miniq, 1);
1575  	struct mini_Qdisc *miniq;
1576  
1577  	if (!tp_head) {
1578  		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
1579  	} else {
1580  		miniq = miniq_old != &miniqp->miniq1 ?
1581  			&miniqp->miniq1 : &miniqp->miniq2;
1582  
1583  		/* We need to make sure that readers won't see the miniq
1584  		 * we are about to modify. So ensure that at least one RCU
1585  		 * grace period has elapsed since the miniq was made
1586  		 * inactive.
1587  		 */
1588  		if (IS_ENABLED(CONFIG_PREEMPT_RT))
1589  			cond_synchronize_rcu(miniq->rcu_state);
1590  		else if (!poll_state_synchronize_rcu(miniq->rcu_state))
1591  			synchronize_rcu_expedited();
1592  
1593  		miniq->filter_list = tp_head;
1594  		rcu_assign_pointer(*miniqp->p_miniq, miniq);
1595  	}
1596  
1597  	if (miniq_old)
1598  		/* This is the counterpart of the RCU sync above. We need to
1599  		 * block potential new users of miniq_old until no readers
1600  		 * can still see it.
1601  		 */
1602  		miniq_old->rcu_state = start_poll_synchronize_rcu();
1603  }
1604  EXPORT_SYMBOL(mini_qdisc_pair_swap);
1605  
1606  void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
1607  				struct tcf_block *block)
1608  {
1609  	miniqp->miniq1.block = block;
1610  	miniqp->miniq2.block = block;
1611  }
1612  EXPORT_SYMBOL(mini_qdisc_pair_block_init);
1613  
1614  void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
1615  			  struct mini_Qdisc __rcu **p_miniq)
1616  {
1617  	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
1618  	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
1619  	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
1620  	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
1621  	miniqp->miniq1.rcu_state = get_state_synchronize_rcu();
1622  	miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state;
1623  	miniqp->p_miniq = p_miniq;
1624  }
1625  EXPORT_SYMBOL(mini_qdisc_pair_init);
1626