1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   *	Linux INET6 implementation
4   *	Forwarding Information Database
5   *
6   *	Authors:
7   *	Pedro Roque		<roque@di.fc.ul.pt>
8   *
9   *	Changes:
10   *	Yuji SEKIYA @USAGI:	Support default route on router node;
11   *				remove ip6_null_entry from the top of
12   *				routing table.
13   *	Ville Nuorvala:		Fixed routing subtrees.
14   */
15  
16  #define pr_fmt(fmt) "IPv6: " fmt
17  
18  #include <linux/bpf.h>
19  #include <linux/errno.h>
20  #include <linux/types.h>
21  #include <linux/net.h>
22  #include <linux/route.h>
23  #include <linux/netdevice.h>
24  #include <linux/in6.h>
25  #include <linux/init.h>
26  #include <linux/list.h>
27  #include <linux/slab.h>
28  
29  #include <net/ip.h>
30  #include <net/ipv6.h>
31  #include <net/ndisc.h>
32  #include <net/addrconf.h>
33  #include <net/lwtunnel.h>
34  #include <net/fib_notifier.h>
35  
36  #include <net/ip_fib.h>
37  #include <net/ip6_fib.h>
38  #include <net/ip6_route.h>
39  
40  static struct kmem_cache *fib6_node_kmem __read_mostly;
41  
42  struct fib6_cleaner {
43  	struct fib6_walker w;
44  	struct net *net;
45  	int (*func)(struct fib6_info *, void *arg);
46  	int sernum;
47  	void *arg;
48  	bool skip_notify;
49  };
50  
51  #ifdef CONFIG_IPV6_SUBTREES
52  #define FWS_INIT FWS_S
53  #else
54  #define FWS_INIT FWS_L
55  #endif
56  
57  static struct fib6_info *fib6_find_prefix(struct net *net,
58  					 struct fib6_table *table,
59  					 struct fib6_node *fn);
60  static struct fib6_node *fib6_repair_tree(struct net *net,
61  					  struct fib6_table *table,
62  					  struct fib6_node *fn);
63  static int fib6_walk(struct net *net, struct fib6_walker *w);
64  static int fib6_walk_continue(struct fib6_walker *w);
65  
66  /*
67   *	A routing update causes an increase of the serial number on the
68   *	affected subtree. This allows for cached routes to be asynchronously
69   *	tested when modifications are made to the destination cache as a
70   *	result of redirects, path MTU changes, etc.
71   */
72  
73  static void fib6_gc_timer_cb(struct timer_list *t);
74  
75  #define FOR_WALKERS(net, w) \
76  	list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
77  
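/* Walkers are kept on a per-netns list so that tree surgery in
 * fib6_repair_tree() and fib6_del_route() can adjust any walker that is
 * currently positioned on a node being removed.
 */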
78  static void fib6_walker_link(struct net *net, struct fib6_walker *w)
79  {
80  	write_lock_bh(&net->ipv6.fib6_walker_lock);
81  	list_add(&w->lh, &net->ipv6.fib6_walkers);
82  	write_unlock_bh(&net->ipv6.fib6_walker_lock);
83  }
84  
85  static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
86  {
87  	write_lock_bh(&net->ipv6.fib6_walker_lock);
88  	list_del(&w->lh);
89  	write_unlock_bh(&net->ipv6.fib6_walker_lock);
90  }
91  
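/* Hand out a new serial number for the tree. Zero is reserved as
 * FIB6_NO_SERNUM_CHANGE, so the counter wraps from INT_MAX back to 1.
 */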
92  static int fib6_new_sernum(struct net *net)
93  {
94  	int new, old = atomic_read(&net->ipv6.fib6_sernum);
95  
96  	do {
97  		new = old < INT_MAX ? old + 1 : 1;
98  	} while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new));
99  
100  	return new;
101  }
102  
103  enum {
104  	FIB6_NO_SERNUM_CHANGE = 0,
105  };
106  
107  void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
108  {
109  	struct fib6_node *fn;
110  
111  	fn = rcu_dereference_protected(f6i->fib6_node,
112  			lockdep_is_held(&f6i->fib6_table->tb6_lock));
113  	if (fn)
114  		WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net));
115  }
116  
117  /*
118   *	Auxiliary address test functions for the radix tree.
119   *
120   *	These assume a 32-bit processor (although they also work on
121   *	64-bit processors).
122   */
123  
124  /*
125   *	test bit
126   */
127  #if defined(__LITTLE_ENDIAN)
128  # define BITOP_BE32_SWIZZLE	(0x1F & ~7)
129  #else
130  # define BITOP_BE32_SWIZZLE	0
131  #endif
132  
133  static __be32 addr_bit_set(const void *token, int fn_bit)
134  {
135  	const __be32 *addr = token;
136  	/*
137  	 * Here,
138  	 *	1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
139  	 * is optimized version of
140  	 *	htonl(1 << ((~fn_bit)&0x1F))
141  	 * See include/asm-generic/bitops/le.h.
142  	 */
143  	return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
144  	       addr[fn_bit >> 5];
145  }
146  
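/* Allocate a fib6_info, optionally with an embedded fib6_nh appended for
 * routes that do not use a shared nexthop object.
 */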
147  struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
148  {
149  	struct fib6_info *f6i;
150  	size_t sz = sizeof(*f6i);
151  
152  	if (with_fib6_nh)
153  		sz += sizeof(struct fib6_nh);
154  
155  	f6i = kzalloc(sz, gfp_flags);
156  	if (!f6i)
157  		return NULL;
158  
159  	/* fib6_siblings is a union with nh_list, so this initializes both */
160  	INIT_LIST_HEAD(&f6i->fib6_siblings);
161  	refcount_set(&f6i->fib6_ref, 1);
162  
163  	INIT_HLIST_NODE(&f6i->gc_link);
164  
165  	return f6i;
166  }
167  
168  void fib6_info_destroy_rcu(struct rcu_head *head)
169  {
170  	struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
171  
172  	WARN_ON(f6i->fib6_node);
173  
174  	if (f6i->nh)
175  		nexthop_put(f6i->nh);
176  	else
177  		fib6_nh_release(f6i->fib6_nh);
178  
179  	ip_fib_metrics_put(f6i->fib6_metrics);
180  	kfree(f6i);
181  }
182  EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
183  
184  static struct fib6_node *node_alloc(struct net *net)
185  {
186  	struct fib6_node *fn;
187  
188  	fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
189  	if (fn)
190  		net->ipv6.rt6_stats->fib_nodes++;
191  
192  	return fn;
193  }
194  
195  static void node_free_immediate(struct net *net, struct fib6_node *fn)
196  {
197  	kmem_cache_free(fib6_node_kmem, fn);
198  	net->ipv6.rt6_stats->fib_nodes--;
199  }
200  
201  static void node_free_rcu(struct rcu_head *head)
202  {
203  	struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
204  
205  	kmem_cache_free(fib6_node_kmem, fn);
206  }
207  
208  static void node_free(struct net *net, struct fib6_node *fn)
209  {
210  	call_rcu(&fn->rcu, node_free_rcu);
211  	net->ipv6.rt6_stats->fib_nodes--;
212  }
213  
214  static void fib6_free_table(struct fib6_table *table)
215  {
216  	inetpeer_invalidate_tree(&table->tb6_peers);
217  	kfree(table);
218  }
219  
220  static void fib6_link_table(struct net *net, struct fib6_table *tb)
221  {
222  	unsigned int h;
223  
224  	/*
225  	 * Initialize table lock at a single place to give lockdep a key,
226  	 * tables aren't visible prior to being linked to the list.
227  	 */
228  	spin_lock_init(&tb->tb6_lock);
229  	h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
230  
231  	/*
232  	 * No protection necessary, this is the only list mutation
233  	 * operation, tables never disappear once they exist.
234  	 */
235  	hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
236  }
237  
238  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
239  
240  static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
241  {
242  	struct fib6_table *table;
243  
244  	table = kzalloc(sizeof(*table), GFP_ATOMIC);
245  	if (table) {
246  		table->tb6_id = id;
247  		rcu_assign_pointer(table->tb6_root.leaf,
248  				   net->ipv6.fib6_null_entry);
249  		table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
250  		inet_peer_base_init(&table->tb6_peers);
251  		INIT_HLIST_HEAD(&table->tb6_gc_hlist);
252  	}
253  
254  	return table;
255  }
256  
257  struct fib6_table *fib6_new_table(struct net *net, u32 id)
258  {
259  	struct fib6_table *tb;
260  
261  	if (id == 0)
262  		id = RT6_TABLE_MAIN;
263  	tb = fib6_get_table(net, id);
264  	if (tb)
265  		return tb;
266  
267  	tb = fib6_alloc_table(net, id);
268  	if (tb)
269  		fib6_link_table(net, tb);
270  
271  	return tb;
272  }
273  EXPORT_SYMBOL_GPL(fib6_new_table);
274  
275  struct fib6_table *fib6_get_table(struct net *net, u32 id)
276  {
277  	struct fib6_table *tb;
278  	struct hlist_head *head;
279  	unsigned int h;
280  
281  	if (id == 0)
282  		id = RT6_TABLE_MAIN;
283  	h = id & (FIB6_TABLE_HASHSZ - 1);
284  	rcu_read_lock();
285  	head = &net->ipv6.fib_table_hash[h];
286  	hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
287  		if (tb->tb6_id == id) {
288  			rcu_read_unlock();
289  			return tb;
290  		}
291  	}
292  	rcu_read_unlock();
293  
294  	return NULL;
295  }
296  EXPORT_SYMBOL_GPL(fib6_get_table);
297  
298  static void __net_init fib6_tables_init(struct net *net)
299  {
300  	fib6_link_table(net, net->ipv6.fib6_main_tbl);
301  	fib6_link_table(net, net->ipv6.fib6_local_tbl);
302  }
303  #else
304  
305  struct fib6_table *fib6_new_table(struct net *net, u32 id)
306  {
307  	return fib6_get_table(net, id);
308  }
309  
310  struct fib6_table *fib6_get_table(struct net *net, u32 id)
311  {
312  	return net->ipv6.fib6_main_tbl;
313  }
314  
315  struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
316  				   const struct sk_buff *skb,
317  				   int flags, pol_lookup_t lookup)
318  {
319  	struct rt6_info *rt;
320  
321  	rt = pol_lookup_func(lookup,
322  			net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
323  	if (rt->dst.error == -EAGAIN) {
324  		ip6_rt_put_flags(rt, flags);
325  		rt = net->ipv6.ip6_null_entry;
326  		if (!(flags & RT6_LOOKUP_F_DST_NOREF))
327  			dst_hold(&rt->dst);
328  	}
329  
330  	return &rt->dst;
331  }
332  
333  /* called with rcu lock held; no reference taken on fib6_info */
334  int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
335  		struct fib6_result *res, int flags)
336  {
337  	return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
338  				 res, flags);
339  }
340  
341  static void __net_init fib6_tables_init(struct net *net)
342  {
343  	fib6_link_table(net, net->ipv6.fib6_main_tbl);
344  }
345  
346  #endif
347  
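/* Sum the per-table sequence counters; FIB notifier registration uses
 * this to detect route changes that race with an in-progress dump.
 */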
348  unsigned int fib6_tables_seq_read(struct net *net)
349  {
350  	unsigned int h, fib_seq = 0;
351  
352  	rcu_read_lock();
353  	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
354  		struct hlist_head *head = &net->ipv6.fib_table_hash[h];
355  		struct fib6_table *tb;
356  
357  		hlist_for_each_entry_rcu(tb, head, tb6_hlist)
358  			fib_seq += tb->fib_seq;
359  	}
360  	rcu_read_unlock();
361  
362  	return fib_seq;
363  }
364  
365  static int call_fib6_entry_notifier(struct notifier_block *nb,
366  				    enum fib_event_type event_type,
367  				    struct fib6_info *rt,
368  				    struct netlink_ext_ack *extack)
369  {
370  	struct fib6_entry_notifier_info info = {
371  		.info.extack = extack,
372  		.rt = rt,
373  	};
374  
375  	return call_fib6_notifier(nb, event_type, &info.info);
376  }
377  
378  static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
379  					      enum fib_event_type event_type,
380  					      struct fib6_info *rt,
381  					      unsigned int nsiblings,
382  					      struct netlink_ext_ack *extack)
383  {
384  	struct fib6_entry_notifier_info info = {
385  		.info.extack = extack,
386  		.rt = rt,
387  		.nsiblings = nsiblings,
388  	};
389  
390  	return call_fib6_notifier(nb, event_type, &info.info);
391  }
392  
393  int call_fib6_entry_notifiers(struct net *net,
394  			      enum fib_event_type event_type,
395  			      struct fib6_info *rt,
396  			      struct netlink_ext_ack *extack)
397  {
398  	struct fib6_entry_notifier_info info = {
399  		.info.extack = extack,
400  		.rt = rt,
401  	};
402  
403  	rt->fib6_table->fib_seq++;
404  	return call_fib6_notifiers(net, event_type, &info.info);
405  }
406  
407  int call_fib6_multipath_entry_notifiers(struct net *net,
408  					enum fib_event_type event_type,
409  					struct fib6_info *rt,
410  					unsigned int nsiblings,
411  					struct netlink_ext_ack *extack)
412  {
413  	struct fib6_entry_notifier_info info = {
414  		.info.extack = extack,
415  		.rt = rt,
416  		.nsiblings = nsiblings,
417  	};
418  
419  	rt->fib6_table->fib_seq++;
420  	return call_fib6_notifiers(net, event_type, &info.info);
421  }
422  
423  int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
424  {
425  	struct fib6_entry_notifier_info info = {
426  		.rt = rt,
427  		.nsiblings = rt->fib6_nsiblings,
428  	};
429  
430  	rt->fib6_table->fib_seq++;
431  	return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
432  }
433  
434  struct fib6_dump_arg {
435  	struct net *net;
436  	struct notifier_block *nb;
437  	struct netlink_ext_ack *extack;
438  };
439  
440  static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
441  {
442  	enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE;
443  	int err;
444  
445  	if (!rt || rt == arg->net->ipv6.fib6_null_entry)
446  		return 0;
447  
448  	if (rt->fib6_nsiblings)
449  		err = call_fib6_multipath_entry_notifier(arg->nb, fib_event,
450  							 rt,
451  							 rt->fib6_nsiblings,
452  							 arg->extack);
453  	else
454  		err = call_fib6_entry_notifier(arg->nb, fib_event, rt,
455  					       arg->extack);
456  
457  	return err;
458  }
459  
460  static int fib6_node_dump(struct fib6_walker *w)
461  {
462  	int err;
463  
464  	err = fib6_rt_dump(w->leaf, w->args);
465  	w->leaf = NULL;
466  	return err;
467  }
468  
469  static int fib6_table_dump(struct net *net, struct fib6_table *tb,
470  			   struct fib6_walker *w)
471  {
472  	int err;
473  
474  	w->root = &tb->tb6_root;
475  	spin_lock_bh(&tb->tb6_lock);
476  	err = fib6_walk(net, w);
477  	spin_unlock_bh(&tb->tb6_lock);
478  	return err;
479  }
480  
481  /* Called with rcu_read_lock() */
482  int fib6_tables_dump(struct net *net, struct notifier_block *nb,
483  		     struct netlink_ext_ack *extack)
484  {
485  	struct fib6_dump_arg arg;
486  	struct fib6_walker *w;
487  	unsigned int h;
488  	int err = 0;
489  
490  	w = kzalloc(sizeof(*w), GFP_ATOMIC);
491  	if (!w)
492  		return -ENOMEM;
493  
494  	w->func = fib6_node_dump;
495  	arg.net = net;
496  	arg.nb = nb;
497  	arg.extack = extack;
498  	w->args = &arg;
499  
500  	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
501  		struct hlist_head *head = &net->ipv6.fib_table_hash[h];
502  		struct fib6_table *tb;
503  
504  		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
505  			err = fib6_table_dump(net, tb, w);
506  			if (err)
507  				goto out;
508  		}
509  	}
510  
511  out:
512  	kfree(w);
513  
514  	/* The tree traversal function should never return a positive value. */
515  	return err > 0 ? -EINVAL : err;
516  }
517  
518  static int fib6_dump_node(struct fib6_walker *w)
519  {
520  	int res;
521  	struct fib6_info *rt;
522  
523  	for_each_fib6_walker_rt(w) {
524  		res = rt6_dump_route(rt, w->args, w->skip_in_node);
525  		if (res >= 0) {
526  			/* Frame is full, suspend walking */
527  			w->leaf = rt;
528  
529  			/* We'll restart from this node, so if some routes were
530  			 * already dumped, skip them next time.
531  			 */
532  			w->skip_in_node += res;
533  
534  			return 1;
535  		}
536  		w->skip_in_node = 0;
537  
538  		/* Multipath routes are dumped in one route with the
539  		 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
540  		 * last sibling of this route (no need to dump the
541  		 * sibling routes again)
542  		 */
543  		if (rt->fib6_nsiblings)
544  			rt = list_last_entry(&rt->fib6_siblings,
545  					     struct fib6_info,
546  					     fib6_siblings);
547  	}
548  	w->leaf = NULL;
549  	return 0;
550  }
551  
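/* Release the dump state stashed in cb->args: unlink the walker if it is
 * still registered, free it and restore the original ->done callback.
 */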
552  static void fib6_dump_end(struct netlink_callback *cb)
553  {
554  	struct net *net = sock_net(cb->skb->sk);
555  	struct fib6_walker *w = (void *)cb->args[2];
556  
557  	if (w) {
558  		if (cb->args[4]) {
559  			cb->args[4] = 0;
560  			fib6_walker_unlink(net, w);
561  		}
562  		cb->args[2] = 0;
563  		kfree(w);
564  	}
565  	cb->done = (void *)cb->args[3];
566  	cb->args[1] = 3;
567  }
568  
569  static int fib6_dump_done(struct netlink_callback *cb)
570  {
571  	fib6_dump_end(cb);
572  	return cb->done ? cb->done(cb) : 0;
573  }
574  
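/* Dump one table. cb->args[4] records whether a walk is already in
 * progress and cb->args[5] caches the root sernum so that the walk is
 * restarted from the top if the tree changed between dump rounds.
 */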
575  static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
576  			   struct netlink_callback *cb)
577  {
578  	struct net *net = sock_net(skb->sk);
579  	struct fib6_walker *w;
580  	int res;
581  
582  	w = (void *)cb->args[2];
583  	w->root = &table->tb6_root;
584  
585  	if (cb->args[4] == 0) {
586  		w->count = 0;
587  		w->skip = 0;
588  		w->skip_in_node = 0;
589  
590  		spin_lock_bh(&table->tb6_lock);
591  		res = fib6_walk(net, w);
592  		spin_unlock_bh(&table->tb6_lock);
593  		if (res > 0) {
594  			cb->args[4] = 1;
595  			cb->args[5] = READ_ONCE(w->root->fn_sernum);
596  		}
597  	} else {
598  		int sernum = READ_ONCE(w->root->fn_sernum);
599  		if (cb->args[5] != sernum) {
600  			/* Begin at the root if the tree changed */
601  			cb->args[5] = sernum;
602  			w->state = FWS_INIT;
603  			w->node = w->root;
604  			w->skip = w->count;
605  			w->skip_in_node = 0;
606  		} else
607  			w->skip = 0;
608  
609  		spin_lock_bh(&table->tb6_lock);
610  		res = fib6_walk_continue(w);
611  		spin_unlock_bh(&table->tb6_lock);
612  		if (res <= 0) {
613  			fib6_walker_unlink(net, w);
614  			cb->args[4] = 0;
615  		}
616  	}
617  
618  	return res;
619  }
620  
621  static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
622  {
623  	struct rt6_rtnl_dump_arg arg = {
624  		.filter.dump_exceptions = true,
625  		.filter.dump_routes = true,
626  		.filter.rtnl_held = false,
627  	};
628  	const struct nlmsghdr *nlh = cb->nlh;
629  	struct net *net = sock_net(skb->sk);
630  	unsigned int e = 0, s_e;
631  	struct hlist_head *head;
632  	struct fib6_walker *w;
633  	struct fib6_table *tb;
634  	unsigned int h, s_h;
635  	int err = 0;
636  
637  	rcu_read_lock();
638  	if (cb->strict_check) {
639  		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
640  		if (err < 0)
641  			goto unlock;
642  	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
643  		struct rtmsg *rtm = nlmsg_data(nlh);
644  
645  		if (rtm->rtm_flags & RTM_F_PREFIX)
646  			arg.filter.flags = RTM_F_PREFIX;
647  	}
648  
649  	w = (void *)cb->args[2];
650  	if (!w) {
651  		/* New dump:
652  		 *
653  		 * 1. allocate and initialize walker.
654  		 */
655  		w = kzalloc(sizeof(*w), GFP_ATOMIC);
656  		if (!w) {
657  			err = -ENOMEM;
658  			goto unlock;
659  		}
660  		w->func = fib6_dump_node;
661  		cb->args[2] = (long)w;
662  
663  		/* 2. hook callback destructor.
664  		 */
665  		cb->args[3] = (long)cb->done;
666  		cb->done = fib6_dump_done;
667  
668  	}
669  
670  	arg.skb = skb;
671  	arg.cb = cb;
672  	arg.net = net;
673  	w->args = &arg;
674  
675  	if (arg.filter.table_id) {
676  		tb = fib6_get_table(net, arg.filter.table_id);
677  		if (!tb) {
678  			if (rtnl_msg_family(cb->nlh) != PF_INET6)
679  				goto unlock;
680  
681  			NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
682  			err = -ENOENT;
683  			goto unlock;
684  		}
685  
686  		if (!cb->args[0]) {
687  			err = fib6_dump_table(tb, skb, cb);
688  			if (!err)
689  				cb->args[0] = 1;
690  		}
691  		goto unlock;
692  	}
693  
694  	s_h = cb->args[0];
695  	s_e = cb->args[1];
696  
697  	for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
698  		e = 0;
699  		head = &net->ipv6.fib_table_hash[h];
700  		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
701  			if (e < s_e)
702  				goto next;
703  			err = fib6_dump_table(tb, skb, cb);
704  			if (err != 0)
705  				goto out;
706  next:
707  			e++;
708  		}
709  	}
710  out:
711  	cb->args[1] = e;
712  	cb->args[0] = h;
713  
714  unlock:
715  	rcu_read_unlock();
716  	if (err <= 0)
717  		fib6_dump_end(cb);
718  	return err;
719  }
720  
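/* Set a single metric, detaching the route from the shared read-only
 * dst_default_metrics block on the first write.
 */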
721  void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
722  {
723  	if (!f6i)
724  		return;
725  
726  	if (f6i->fib6_metrics == &dst_default_metrics) {
727  		struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
728  
729  		if (!p)
730  			return;
731  
732  		refcount_set(&p->refcnt, 1);
733  		f6i->fib6_metrics = p;
734  	}
735  
736  	f6i->fib6_metrics->metrics[metric - 1] = val;
737  }
738  
739  /*
740   *	Routing Table
741   *
742   *	return the appropriate node for a routing tree "add" operation
743   *	by either creating and inserting or by returning an existing
744   *	node.
745   */
746  
747  static struct fib6_node *fib6_add_1(struct net *net,
748  				    struct fib6_table *table,
749  				    struct fib6_node *root,
750  				    struct in6_addr *addr, int plen,
751  				    int offset, int allow_create,
752  				    int replace_required,
753  				    struct netlink_ext_ack *extack)
754  {
755  	struct fib6_node *fn, *in, *ln;
756  	struct fib6_node *pn = NULL;
757  	struct rt6key *key;
758  	int	bit;
759  	__be32	dir = 0;
760  
761  	/* insert node in tree */
762  
763  	fn = root;
764  
765  	do {
766  		struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
767  					    lockdep_is_held(&table->tb6_lock));
768  		key = (struct rt6key *)((u8 *)leaf + offset);
769  
770  		/*
771  		 *	Prefix match
772  		 */
773  		if (plen < fn->fn_bit ||
774  		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
775  			if (!allow_create) {
776  				if (replace_required) {
777  					NL_SET_ERR_MSG(extack,
778  						       "Can not replace route - no match found");
779  					pr_warn("Can't replace route, no match found\n");
780  					return ERR_PTR(-ENOENT);
781  				}
782  				pr_warn("NLM_F_CREATE should be set when creating new route\n");
783  			}
784  			goto insert_above;
785  		}
786  
787  		/*
788  		 *	Exact match ?
789  		 */
790  
791  		if (plen == fn->fn_bit) {
792  			/* clean up an intermediate node */
793  			if (!(fn->fn_flags & RTN_RTINFO)) {
794  				RCU_INIT_POINTER(fn->leaf, NULL);
795  				fib6_info_release(leaf);
796  			/* remove null_entry in the root node */
797  			} else if (fn->fn_flags & RTN_TL_ROOT &&
798  				   rcu_access_pointer(fn->leaf) ==
799  				   net->ipv6.fib6_null_entry) {
800  				RCU_INIT_POINTER(fn->leaf, NULL);
801  			}
802  
803  			return fn;
804  		}
805  
806  		/*
807  		 *	We have more bits to go
808  		 */
809  
810  		/* Try to walk down on tree. */
811  		dir = addr_bit_set(addr, fn->fn_bit);
812  		pn = fn;
813  		fn = dir ?
814  		     rcu_dereference_protected(fn->right,
815  					lockdep_is_held(&table->tb6_lock)) :
816  		     rcu_dereference_protected(fn->left,
817  					lockdep_is_held(&table->tb6_lock));
818  	} while (fn);
819  
820  	if (!allow_create) {
821  		/* We should not create a new node because
822  		 * NLM_F_REPLACE was specified without NLM_F_CREATE.
823  		 * I assume it is safe to require NLM_F_CREATE when
824  		 * the REPLACE flag is used! Later we may want to remove the
825  		 * check for replace_required, because according
826  		 * to the netlink specification, NLM_F_CREATE
827  		 * MUST be specified if a new route is created.
828  		 * That would keep IPv6 consistent with IPv4.
829  		 */
830  		if (replace_required) {
831  			NL_SET_ERR_MSG(extack,
832  				       "Can not replace route - no match found");
833  			pr_warn("Can't replace route, no match found\n");
834  			return ERR_PTR(-ENOENT);
835  		}
836  		pr_warn("NLM_F_CREATE should be set when creating new route\n");
837  	}
838  	/*
839  	 *	We walked to the bottom of tree.
840  	 *	Create new leaf node without children.
841  	 */
842  
843  	ln = node_alloc(net);
844  
845  	if (!ln)
846  		return ERR_PTR(-ENOMEM);
847  	ln->fn_bit = plen;
848  	RCU_INIT_POINTER(ln->parent, pn);
849  
850  	if (dir)
851  		rcu_assign_pointer(pn->right, ln);
852  	else
853  		rcu_assign_pointer(pn->left, ln);
854  
855  	return ln;
856  
857  
858  insert_above:
859  	/*
860  	 * split since we don't have a common prefix anymore or
861  	 * we have a less significant route.
862  	 * we have to insert an intermediate node in the tree;
863  	 * this new node will point to the one we need to create
864  	 * and to the current one
865  	 */
866  
867  	pn = rcu_dereference_protected(fn->parent,
868  				       lockdep_is_held(&table->tb6_lock));
869  
870  	/* find 1st bit in difference between the 2 addrs.
871  
872  	   See comment in __ipv6_addr_diff: bit may be an invalid value,
873  	   but if it is >= plen, the value is ignored in any case.
874  	 */
875  
876  	bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
877  
878  	/*
879  	 *		(intermediate)[in]
880  	 *	          /	   \
881  	 *	(new leaf node)[ln] (old node)[fn]
882  	 */
883  	if (plen > bit) {
884  		in = node_alloc(net);
885  		ln = node_alloc(net);
886  
887  		if (!in || !ln) {
888  			if (in)
889  				node_free_immediate(net, in);
890  			if (ln)
891  				node_free_immediate(net, ln);
892  			return ERR_PTR(-ENOMEM);
893  		}
894  
895  		/*
896  		 * new intermediate node.
897  		 * RTN_RTINFO will
898  		 * be off since an address that chooses one of
899  		 * the branches would not match less specific routes
900  		 * in the other branch
901  		 */
902  
903  		in->fn_bit = bit;
904  
905  		RCU_INIT_POINTER(in->parent, pn);
906  		in->leaf = fn->leaf;
907  		fib6_info_hold(rcu_dereference_protected(in->leaf,
908  				lockdep_is_held(&table->tb6_lock)));
909  
910  		/* update parent pointer */
911  		if (dir)
912  			rcu_assign_pointer(pn->right, in);
913  		else
914  			rcu_assign_pointer(pn->left, in);
915  
916  		ln->fn_bit = plen;
917  
918  		RCU_INIT_POINTER(ln->parent, in);
919  		rcu_assign_pointer(fn->parent, in);
920  
921  		if (addr_bit_set(addr, bit)) {
922  			rcu_assign_pointer(in->right, ln);
923  			rcu_assign_pointer(in->left, fn);
924  		} else {
925  			rcu_assign_pointer(in->left, ln);
926  			rcu_assign_pointer(in->right, fn);
927  		}
928  	} else { /* plen <= bit */
929  
930  		/*
931  		 *		(new leaf node)[ln]
932  		 *	          /	   \
933  		 *	     (old node)[fn] NULL
934  		 */
935  
936  		ln = node_alloc(net);
937  
938  		if (!ln)
939  			return ERR_PTR(-ENOMEM);
940  
941  		ln->fn_bit = plen;
942  
943  		RCU_INIT_POINTER(ln->parent, pn);
944  
945  		if (addr_bit_set(&key->addr, plen))
946  			RCU_INIT_POINTER(ln->right, fn);
947  		else
948  			RCU_INIT_POINTER(ln->left, fn);
949  
950  		rcu_assign_pointer(fn->parent, ln);
951  
952  		if (dir)
953  			rcu_assign_pointer(pn->right, ln);
954  		else
955  			rcu_assign_pointer(pn->left, ln);
956  	}
957  	return ln;
958  }
959  
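/* Drop the 'from' reference that cached per-cpu routes hold on a
 * fib6_info that is being unlinked from the tree.
 */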
960  static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
961  				  const struct fib6_info *match,
962  				  const struct fib6_table *table)
963  {
964  	int cpu;
965  
966  	if (!fib6_nh->rt6i_pcpu)
967  		return;
968  
969  	rcu_read_lock();
970  	/* release the reference to this fib entry from
971  	 * all of its cached pcpu routes
972  	 */
973  	for_each_possible_cpu(cpu) {
974  		struct rt6_info **ppcpu_rt;
975  		struct rt6_info *pcpu_rt;
976  
977  		ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
978  
979  		/* Paired with xchg() in rt6_get_pcpu_route() */
980  		pcpu_rt = READ_ONCE(*ppcpu_rt);
981  
982  		/* only dropping the 'from' reference if the cached route
983  		 * is using 'match'. The cached pcpu_rt->from only changes
984  		 * from a fib6_info to NULL (ip6_dst_destroy); it can never
985  		 * change from one fib6_info reference to another
986  		 */
987  		if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
988  			struct fib6_info *from;
989  
990  			from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
991  			fib6_info_release(from);
992  		}
993  	}
994  	rcu_read_unlock();
995  }
996  
997  struct fib6_nh_pcpu_arg {
998  	struct fib6_info	*from;
999  	const struct fib6_table *table;
1000  };
1001  
1002  static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
1003  {
1004  	struct fib6_nh_pcpu_arg *arg = _arg;
1005  
1006  	__fib6_drop_pcpu_from(nh, arg->from, arg->table);
1007  	return 0;
1008  }
1009  
1010  static void fib6_drop_pcpu_from(struct fib6_info *f6i,
1011  				const struct fib6_table *table)
1012  {
1013  	/* Make sure rt6_make_pcpu_route() won't add other percpu routes
1014  	 * while we are cleaning them here.
1015  	 */
1016  	f6i->fib6_destroying = 1;
1017  	mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
1018  
1019  	if (f6i->nh) {
1020  		struct fib6_nh_pcpu_arg arg = {
1021  			.from = f6i,
1022  			.table = table
1023  		};
1024  
1025  		nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from,
1026  					 &arg);
1027  	} else {
1028  		struct fib6_nh *fib6_nh;
1029  
1030  		fib6_nh = f6i->fib6_nh;
1031  		__fib6_drop_pcpu_from(fib6_nh, f6i, table);
1032  	}
1033  }
1034  
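/* Release everything a route may still pin once it is unlinked from its
 * node: exception cache entries, per-cpu cached routes and any dummy
 * leaf references left in ancestor nodes.
 */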
1035  static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
1036  			  struct net *net)
1037  {
1038  	struct fib6_table *table = rt->fib6_table;
1039  
1040  	/* Flush all cached dst in exception table */
1041  	rt6_flush_exceptions(rt);
1042  	fib6_drop_pcpu_from(rt, table);
1043  
1044  	if (rt->nh && !list_empty(&rt->nh_list))
1045  		list_del_init(&rt->nh_list);
1046  
1047  	if (refcount_read(&rt->fib6_ref) != 1) {
1048  		/* This route is used as a dummy address holder in some split
1049  		 * nodes. It is not leaked, but it still holds other resources,
1050  		 * which must be released in time. So, scan ancestor nodes
1051  		 * and replace dummy references to this route with references
1052  		 * to still-alive ones.
1053  		 */
1054  		while (fn) {
1055  			struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
1056  					    lockdep_is_held(&table->tb6_lock));
1057  			struct fib6_info *new_leaf;
1058  			if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
1059  				new_leaf = fib6_find_prefix(net, table, fn);
1060  				fib6_info_hold(new_leaf);
1061  
1062  				rcu_assign_pointer(fn->leaf, new_leaf);
1063  				fib6_info_release(rt);
1064  			}
1065  			fn = rcu_dereference_protected(fn->parent,
1066  				    lockdep_is_held(&table->tb6_lock));
1067  		}
1068  	}
1069  
1070  	fib6_clean_expires(rt);
1071  	fib6_remove_gc_list(rt);
1072  }
1073  
1074  /*
1075   *	Insert routing information in a node.
1076   */
1077  
1078  static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
1079  			    struct nl_info *info,
1080  			    struct netlink_ext_ack *extack)
1081  {
1082  	struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
1083  				    lockdep_is_held(&rt->fib6_table->tb6_lock));
1084  	struct fib6_info *iter = NULL;
1085  	struct fib6_info __rcu **ins;
1086  	struct fib6_info __rcu **fallback_ins = NULL;
1087  	int replace = (info->nlh &&
1088  		       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
1089  	int add = (!info->nlh ||
1090  		   (info->nlh->nlmsg_flags & NLM_F_CREATE));
1091  	int found = 0;
1092  	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
1093  	bool notify_sibling_rt = false;
1094  	u16 nlflags = NLM_F_EXCL;
1095  	int err;
1096  
1097  	if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
1098  		nlflags |= NLM_F_APPEND;
1099  
1100  	ins = &fn->leaf;
1101  
1102  	for (iter = leaf; iter;
1103  	     iter = rcu_dereference_protected(iter->fib6_next,
1104  				lockdep_is_held(&rt->fib6_table->tb6_lock))) {
1105  		/*
1106  		 *	Search for duplicates
1107  		 */
1108  
1109  		if (iter->fib6_metric == rt->fib6_metric) {
1110  			/*
1111  			 *	Same priority level
1112  			 */
1113  			if (info->nlh &&
1114  			    (info->nlh->nlmsg_flags & NLM_F_EXCL))
1115  				return -EEXIST;
1116  
1117  			nlflags &= ~NLM_F_EXCL;
1118  			if (replace) {
1119  				if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
1120  					found++;
1121  					break;
1122  				}
1123  				fallback_ins = fallback_ins ?: ins;
1124  				goto next_iter;
1125  			}
1126  
1127  			if (rt6_duplicate_nexthop(iter, rt)) {
1128  				if (rt->fib6_nsiblings)
1129  					rt->fib6_nsiblings = 0;
1130  				if (!(iter->fib6_flags & RTF_EXPIRES))
1131  					return -EEXIST;
1132  				if (!(rt->fib6_flags & RTF_EXPIRES)) {
1133  					fib6_clean_expires(iter);
1134  					fib6_remove_gc_list(iter);
1135  				} else {
1136  					fib6_set_expires(iter, rt->expires);
1137  					fib6_add_gc_list(iter);
1138  				}
1139  
1140  				if (rt->fib6_pmtu)
1141  					fib6_metric_set(iter, RTAX_MTU,
1142  							rt->fib6_pmtu);
1143  				return -EEXIST;
1144  			}
1145  			/* If we have the same destination and the same metric,
1146  			 * but not the same gateway, then the route we try to
1147  			 * add is a sibling of this route; increment our counter
1148  			 * of siblings, and later we will add our route to the
1149  			 * list.
1150  			 * Only static routes (which don't have flag
1151  			 * RTF_EXPIRES) are used for ECMPv6.
1152  			 *
1153  			 * To avoid a long list, we only add siblings if the
1154  			 * route has a gateway.
1155  			 */
1156  			if (rt_can_ecmp &&
1157  			    rt6_qualify_for_ecmp(iter))
1158  				rt->fib6_nsiblings++;
1159  		}
1160  
1161  		if (iter->fib6_metric > rt->fib6_metric)
1162  			break;
1163  
1164  next_iter:
1165  		ins = &iter->fib6_next;
1166  	}
1167  
1168  	if (fallback_ins && !found) {
1169  		/* No matching route with same ecmp-able-ness found, replace
1170  		 * first matching route
1171  		 */
1172  		ins = fallback_ins;
1173  		iter = rcu_dereference_protected(*ins,
1174  				    lockdep_is_held(&rt->fib6_table->tb6_lock));
1175  		found++;
1176  	}
1177  
1178  	/* Reset round-robin state, if necessary */
1179  	if (ins == &fn->leaf)
1180  		fn->rr_ptr = NULL;
1181  
1182  	/* Link this route to its sibling routes (same prefix and metric). */
1183  	if (rt->fib6_nsiblings) {
1184  		unsigned int fib6_nsiblings;
1185  		struct fib6_info *sibling, *temp_sibling;
1186  
1187  		/* Find the first route that has the same metric */
1188  		sibling = leaf;
1189  		notify_sibling_rt = true;
1190  		while (sibling) {
1191  			if (sibling->fib6_metric == rt->fib6_metric &&
1192  			    rt6_qualify_for_ecmp(sibling)) {
1193  				list_add_tail(&rt->fib6_siblings,
1194  					      &sibling->fib6_siblings);
1195  				break;
1196  			}
1197  			sibling = rcu_dereference_protected(sibling->fib6_next,
1198  				    lockdep_is_held(&rt->fib6_table->tb6_lock));
1199  			notify_sibling_rt = false;
1200  		}
1201  		/* For each sibling in the list, increment the counter of
1202  		 * siblings. BUG() if the counters do not match; the list of siblings
1203  		 * is broken!
1204  		 */
1205  		fib6_nsiblings = 0;
1206  		list_for_each_entry_safe(sibling, temp_sibling,
1207  					 &rt->fib6_siblings, fib6_siblings) {
1208  			sibling->fib6_nsiblings++;
1209  			BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
1210  			fib6_nsiblings++;
1211  		}
1212  		BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
1213  		rt6_multipath_rebalance(temp_sibling);
1214  	}
1215  
1216  	/*
1217  	 *	insert node
1218  	 */
1219  	if (!replace) {
1220  		if (!add)
1221  			pr_warn("NLM_F_CREATE should be set when creating new route\n");
1222  
1223  add:
1224  		nlflags |= NLM_F_CREATE;
1225  
1226  		/* The route should only be notified if it is the first
1227  		 * route in the node or if it is added as a sibling
1228  		 * route to the first route in the node.
1229  		 */
1230  		if (!info->skip_notify_kernel &&
1231  		    (notify_sibling_rt || ins == &fn->leaf)) {
1232  			enum fib_event_type fib_event;
1233  
1234  			if (notify_sibling_rt)
1235  				fib_event = FIB_EVENT_ENTRY_APPEND;
1236  			else
1237  				fib_event = FIB_EVENT_ENTRY_REPLACE;
1238  			err = call_fib6_entry_notifiers(info->nl_net,
1239  							fib_event, rt,
1240  							extack);
1241  			if (err) {
1242  				struct fib6_info *sibling, *next_sibling;
1243  
1244  				/* If the route has siblings, then it first
1245  				 * needs to be unlinked from them.
1246  				 */
1247  				if (!rt->fib6_nsiblings)
1248  					return err;
1249  
1250  				list_for_each_entry_safe(sibling, next_sibling,
1251  							 &rt->fib6_siblings,
1252  							 fib6_siblings)
1253  					sibling->fib6_nsiblings--;
1254  				rt->fib6_nsiblings = 0;
1255  				list_del_init(&rt->fib6_siblings);
1256  				rt6_multipath_rebalance(next_sibling);
1257  				return err;
1258  			}
1259  		}
1260  
1261  		rcu_assign_pointer(rt->fib6_next, iter);
1262  		fib6_info_hold(rt);
1263  		rcu_assign_pointer(rt->fib6_node, fn);
1264  		rcu_assign_pointer(*ins, rt);
1265  		if (!info->skip_notify)
1266  			inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
1267  		info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
1268  
1269  		if (!(fn->fn_flags & RTN_RTINFO)) {
1270  			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1271  			fn->fn_flags |= RTN_RTINFO;
1272  		}
1273  
1274  	} else {
1275  		int nsiblings;
1276  
1277  		if (!found) {
1278  			if (add)
1279  				goto add;
1280  			pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
1281  			return -ENOENT;
1282  		}
1283  
1284  		if (!info->skip_notify_kernel && ins == &fn->leaf) {
1285  			err = call_fib6_entry_notifiers(info->nl_net,
1286  							FIB_EVENT_ENTRY_REPLACE,
1287  							rt, extack);
1288  			if (err)
1289  				return err;
1290  		}
1291  
1292  		fib6_info_hold(rt);
1293  		rcu_assign_pointer(rt->fib6_node, fn);
1294  		rt->fib6_next = iter->fib6_next;
1295  		rcu_assign_pointer(*ins, rt);
1296  		if (!info->skip_notify)
1297  			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
1298  		if (!(fn->fn_flags & RTN_RTINFO)) {
1299  			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1300  			fn->fn_flags |= RTN_RTINFO;
1301  		}
1302  		nsiblings = iter->fib6_nsiblings;
1303  		iter->fib6_node = NULL;
1304  		fib6_purge_rt(iter, fn, info->nl_net);
1305  		if (rcu_access_pointer(fn->rr_ptr) == iter)
1306  			fn->rr_ptr = NULL;
1307  		fib6_info_release(iter);
1308  
1309  		if (nsiblings) {
1310  			/* Replacing an ECMP route, remove all siblings */
1311  			ins = &rt->fib6_next;
1312  			iter = rcu_dereference_protected(*ins,
1313  				    lockdep_is_held(&rt->fib6_table->tb6_lock));
1314  			while (iter) {
1315  				if (iter->fib6_metric > rt->fib6_metric)
1316  					break;
1317  				if (rt6_qualify_for_ecmp(iter)) {
1318  					*ins = iter->fib6_next;
1319  					iter->fib6_node = NULL;
1320  					fib6_purge_rt(iter, fn, info->nl_net);
1321  					if (rcu_access_pointer(fn->rr_ptr) == iter)
1322  						fn->rr_ptr = NULL;
1323  					fib6_info_release(iter);
1324  					nsiblings--;
1325  					info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
1326  				} else {
1327  					ins = &iter->fib6_next;
1328  				}
1329  				iter = rcu_dereference_protected(*ins,
1330  					lockdep_is_held(&rt->fib6_table->tb6_lock));
1331  			}
1332  			WARN_ON(nsiblings != 0);
1333  		}
1334  	}
1335  
1336  	return 0;
1337  }
1338  
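/* Arm the per-netns GC timer when an expiring route is added and the
 * timer is not already pending.
 */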
1339  static void fib6_start_gc(struct net *net, struct fib6_info *rt)
1340  {
1341  	if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
1342  	    (rt->fib6_flags & RTF_EXPIRES))
1343  		mod_timer(&net->ipv6.ip6_fib_timer,
1344  			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1345  }
1346  
1347  void fib6_force_start_gc(struct net *net)
1348  {
1349  	if (!timer_pending(&net->ipv6.ip6_fib_timer))
1350  		mod_timer(&net->ipv6.ip6_fib_timer,
1351  			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1352  }
1353  
1354  static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
1355  					   int sernum)
1356  {
1357  	struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
1358  				lockdep_is_held(&rt->fib6_table->tb6_lock));
1359  
1360  	/* paired with smp_rmb() in fib6_get_cookie_safe() */
1361  	smp_wmb();
1362  	while (fn) {
1363  		WRITE_ONCE(fn->fn_sernum, sernum);
1364  		fn = rcu_dereference_protected(fn->parent,
1365  				lockdep_is_held(&rt->fib6_table->tb6_lock));
1366  	}
1367  }
1368  
1369  void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
1370  {
1371  	__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
1372  }
1373  
1374  /* allow ipv4 to update sernum via ipv6_stub */
1375  void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
1376  {
1377  	spin_lock_bh(&f6i->fib6_table->tb6_lock);
1378  	fib6_update_sernum_upto_root(net, f6i);
1379  	spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1380  }
1381  
1382  /*
1383   *	Add routing information to the routing tree.
1384   *	<destination addr>/<source addr>
1385   *	with source addr info in sub-trees
1386   *	Need to own table->tb6_lock
1387   */
1388  
1389  int fib6_add(struct fib6_node *root, struct fib6_info *rt,
1390  	     struct nl_info *info, struct netlink_ext_ack *extack)
1391  {
1392  	struct fib6_table *table = rt->fib6_table;
1393  	struct fib6_node *fn;
1394  #ifdef CONFIG_IPV6_SUBTREES
1395  	struct fib6_node *pn = NULL;
1396  #endif
1397  	int err = -ENOMEM;
1398  	int allow_create = 1;
1399  	int replace_required = 0;
1400  
1401  	if (info->nlh) {
1402  		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
1403  			allow_create = 0;
1404  		if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
1405  			replace_required = 1;
1406  	}
1407  	if (!allow_create && !replace_required)
1408  		pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
1409  
1410  	fn = fib6_add_1(info->nl_net, table, root,
1411  			&rt->fib6_dst.addr, rt->fib6_dst.plen,
1412  			offsetof(struct fib6_info, fib6_dst), allow_create,
1413  			replace_required, extack);
1414  	if (IS_ERR(fn)) {
1415  		err = PTR_ERR(fn);
1416  		fn = NULL;
1417  		goto out;
1418  	}
1419  
1420  #ifdef CONFIG_IPV6_SUBTREES
1421  	pn = fn;
1422  
1423  	if (rt->fib6_src.plen) {
1424  		struct fib6_node *sn;
1425  
1426  		if (!rcu_access_pointer(fn->subtree)) {
1427  			struct fib6_node *sfn;
1428  
1429  			/*
1430  			 * Create subtree.
1431  			 *
1432  			 *		fn[main tree]
1433  			 *		|
1434  			 *		sfn[subtree root]
1435  			 *		   \
1436  			 *		    sn[new leaf node]
1437  			 */
1438  
1439  			/* Create subtree root node */
1440  			sfn = node_alloc(info->nl_net);
1441  			if (!sfn)
1442  				goto failure;
1443  
1444  			fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
1445  			rcu_assign_pointer(sfn->leaf,
1446  					   info->nl_net->ipv6.fib6_null_entry);
1447  			sfn->fn_flags = RTN_ROOT;
1448  
1449  			/* Now add the first leaf node to new subtree */
1450  
1451  			sn = fib6_add_1(info->nl_net, table, sfn,
1452  					&rt->fib6_src.addr, rt->fib6_src.plen,
1453  					offsetof(struct fib6_info, fib6_src),
1454  					allow_create, replace_required, extack);
1455  
1456  			if (IS_ERR(sn)) {
1457  				/* If it failed, discard the just-allocated
1458  				   root, and then (at the failure label) the
1459  				   stale node in the main tree.
1460  				 */
1461  				node_free_immediate(info->nl_net, sfn);
1462  				err = PTR_ERR(sn);
1463  				goto failure;
1464  			}
1465  
1466  			/* Now link new subtree to main tree */
1467  			rcu_assign_pointer(sfn->parent, fn);
1468  			rcu_assign_pointer(fn->subtree, sfn);
1469  		} else {
1470  			sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
1471  					&rt->fib6_src.addr, rt->fib6_src.plen,
1472  					offsetof(struct fib6_info, fib6_src),
1473  					allow_create, replace_required, extack);
1474  
1475  			if (IS_ERR(sn)) {
1476  				err = PTR_ERR(sn);
1477  				goto failure;
1478  			}
1479  		}
1480  
1481  		if (!rcu_access_pointer(fn->leaf)) {
1482  			if (fn->fn_flags & RTN_TL_ROOT) {
1483  				/* put back null_entry for root node */
1484  				rcu_assign_pointer(fn->leaf,
1485  					    info->nl_net->ipv6.fib6_null_entry);
1486  			} else {
1487  				fib6_info_hold(rt);
1488  				rcu_assign_pointer(fn->leaf, rt);
1489  			}
1490  		}
1491  		fn = sn;
1492  	}
1493  #endif
1494  
1495  	err = fib6_add_rt2node(fn, rt, info, extack);
1496  	if (!err) {
1497  		if (rt->nh)
1498  			list_add(&rt->nh_list, &rt->nh->f6i_list);
1499  		__fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
1500  
1501  		if (rt->fib6_flags & RTF_EXPIRES)
1502  			fib6_add_gc_list(rt);
1503  
1504  		fib6_start_gc(info->nl_net, rt);
1505  	}
1506  
1507  out:
1508  	if (err) {
1509  #ifdef CONFIG_IPV6_SUBTREES
1510  		/*
1511  		 * If fib6_add_1 has cleared the old leaf pointer in the
1512  		 * super-tree leaf node we have to find a new one for it.
1513  		 */
1514  		if (pn != fn) {
1515  			struct fib6_info *pn_leaf =
1516  				rcu_dereference_protected(pn->leaf,
1517  				    lockdep_is_held(&table->tb6_lock));
1518  			if (pn_leaf == rt) {
1519  				pn_leaf = NULL;
1520  				RCU_INIT_POINTER(pn->leaf, NULL);
1521  				fib6_info_release(rt);
1522  			}
1523  			if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
1524  				pn_leaf = fib6_find_prefix(info->nl_net, table,
1525  							   pn);
1526  				if (!pn_leaf)
1527  					pn_leaf =
1528  					    info->nl_net->ipv6.fib6_null_entry;
1529  				fib6_info_hold(pn_leaf);
1530  				rcu_assign_pointer(pn->leaf, pn_leaf);
1531  			}
1532  		}
1533  #endif
1534  		goto failure;
1535  	} else if (fib6_requires_src(rt)) {
1536  		fib6_routes_require_src_inc(info->nl_net);
1537  	}
1538  	return err;
1539  
1540  failure:
1541  	/* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
1542  	 * 1. fn is an intermediate node and we failed to add the new
1543  	 * route to it, in both the subtree-creation failure case and the
1544  	 * fib6_add_rt2node() failure case.
1545  	 * 2. fn is the root node in the table and we fail to add the first
1546  	 * default route to it.
1547  	 */
1548  	if (fn &&
1549  	    (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
1550  	     (fn->fn_flags & RTN_TL_ROOT &&
1551  	      !rcu_access_pointer(fn->leaf))))
1552  		fib6_repair_tree(info->nl_net, table, fn);
1553  	return err;
1554  }
1555  
1556  /*
1557   *	Routing tree lookup
1558   *
1559   */
1560  
1561  struct lookup_args {
1562  	int			offset;		/* key offset on fib6_info */
1563  	const struct in6_addr	*addr;		/* search key			*/
1564  };
1565  
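/* Longest-prefix-match descent: follow the address bits down the tree,
 * then backtrack towards the root until a node carrying routing info
 * whose prefix covers the address is found.
 */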
1566  static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
1567  					    struct lookup_args *args)
1568  {
1569  	struct fib6_node *fn;
1570  	__be32 dir;
1571  
1572  	if (unlikely(args->offset == 0))
1573  		return NULL;
1574  
1575  	/*
1576  	 *	Descend on a tree
1577  	 */
1578  
1579  	fn = root;
1580  
1581  	for (;;) {
1582  		struct fib6_node *next;
1583  
1584  		dir = addr_bit_set(args->addr, fn->fn_bit);
1585  
1586  		next = dir ? rcu_dereference(fn->right) :
1587  			     rcu_dereference(fn->left);
1588  
1589  		if (next) {
1590  			fn = next;
1591  			continue;
1592  		}
1593  		break;
1594  	}
1595  
1596  	while (fn) {
1597  		struct fib6_node *subtree = FIB6_SUBTREE(fn);
1598  
1599  		if (subtree || fn->fn_flags & RTN_RTINFO) {
1600  			struct fib6_info *leaf = rcu_dereference(fn->leaf);
1601  			struct rt6key *key;
1602  
1603  			if (!leaf)
1604  				goto backtrack;
1605  
1606  			key = (struct rt6key *) ((u8 *)leaf + args->offset);
1607  
1608  			if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
1609  #ifdef CONFIG_IPV6_SUBTREES
1610  				if (subtree) {
1611  					struct fib6_node *sfn;
1612  					sfn = fib6_node_lookup_1(subtree,
1613  								 args + 1);
1614  					if (!sfn)
1615  						goto backtrack;
1616  					fn = sfn;
1617  				}
1618  #endif
1619  				if (fn->fn_flags & RTN_RTINFO)
1620  					return fn;
1621  			}
1622  		}
1623  backtrack:
1624  		if (fn->fn_flags & RTN_ROOT)
1625  			break;
1626  
1627  		fn = rcu_dereference(fn->parent);
1628  	}
1629  
1630  	return NULL;
1631  }
1632  
1633  /* called with rcu_read_lock() held
1634   */
1635  struct fib6_node *fib6_node_lookup(struct fib6_node *root,
1636  				   const struct in6_addr *daddr,
1637  				   const struct in6_addr *saddr)
1638  {
1639  	struct fib6_node *fn;
1640  	struct lookup_args args[] = {
1641  		{
1642  			.offset = offsetof(struct fib6_info, fib6_dst),
1643  			.addr = daddr,
1644  		},
1645  #ifdef CONFIG_IPV6_SUBTREES
1646  		{
1647  			.offset = offsetof(struct fib6_info, fib6_src),
1648  			.addr = saddr,
1649  		},
1650  #endif
1651  		{
1652  			.offset = 0,	/* sentinel */
1653  		}
1654  	};
1655  
1656  	fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
1657  	if (!fn || fn->fn_flags & RTN_TL_ROOT)
1658  		fn = root;
1659  
1660  	return fn;
1661  }
1662  
1663  /*
1664   *	Get node with specified destination prefix (and source prefix,
1665   *	if subtrees are used)
1666   *	exact_match == true means we try to find the fn with an exact match
1667   *	of the passed-in prefix addr
1668   *	exact_match == false means we try to find the fn with the longest
1669   *	prefix match of the passed-in prefix addr. This is useful for finding
1670   *	the fn for a cached route, as it will be stored in the exception
1671   *	table under the node with the longest prefix length.
1672   */
1673  
1674  
1675  static struct fib6_node *fib6_locate_1(struct fib6_node *root,
1676  				       const struct in6_addr *addr,
1677  				       int plen, int offset,
1678  				       bool exact_match)
1679  {
1680  	struct fib6_node *fn, *prev = NULL;
1681  
1682  	for (fn = root; fn ; ) {
1683  		struct fib6_info *leaf = rcu_dereference(fn->leaf);
1684  		struct rt6key *key;
1685  
1686  		/* This node is being deleted */
1687  		if (!leaf) {
1688  			if (plen <= fn->fn_bit)
1689  				goto out;
1690  			else
1691  				goto next;
1692  		}
1693  
1694  		key = (struct rt6key *)((u8 *)leaf + offset);
1695  
1696  		/*
1697  		 *	Prefix match
1698  		 */
1699  		if (plen < fn->fn_bit ||
1700  		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
1701  			goto out;
1702  
1703  		if (plen == fn->fn_bit)
1704  			return fn;
1705  
1706  		if (fn->fn_flags & RTN_RTINFO)
1707  			prev = fn;
1708  
1709  next:
1710  		/*
1711  		 *	We have more bits to go
1712  		 */
1713  		if (addr_bit_set(addr, fn->fn_bit))
1714  			fn = rcu_dereference(fn->right);
1715  		else
1716  			fn = rcu_dereference(fn->left);
1717  	}
1718  out:
1719  	if (exact_match)
1720  		return NULL;
1721  	else
1722  		return prev;
1723  }
1724  
1725  struct fib6_node *fib6_locate(struct fib6_node *root,
1726  			      const struct in6_addr *daddr, int dst_len,
1727  			      const struct in6_addr *saddr, int src_len,
1728  			      bool exact_match)
1729  {
1730  	struct fib6_node *fn;
1731  
1732  	fn = fib6_locate_1(root, daddr, dst_len,
1733  			   offsetof(struct fib6_info, fib6_dst),
1734  			   exact_match);
1735  
1736  #ifdef CONFIG_IPV6_SUBTREES
1737  	if (src_len) {
1738  		WARN_ON(saddr == NULL);
1739  		if (fn) {
1740  			struct fib6_node *subtree = FIB6_SUBTREE(fn);
1741  
1742  			if (subtree) {
1743  				fn = fib6_locate_1(subtree, saddr, src_len,
1744  					   offsetof(struct fib6_info, fib6_src),
1745  					   exact_match);
1746  			}
1747  		}
1748  	}
1749  #endif
1750  
1751  	if (fn && fn->fn_flags & RTN_RTINFO)
1752  		return fn;
1753  
1754  	return NULL;
1755  }
1756  
1757  
1758  /*
1759   *	Deletion
1760   *
1761   */
1762  
1763  static struct fib6_info *fib6_find_prefix(struct net *net,
1764  					 struct fib6_table *table,
1765  					 struct fib6_node *fn)
1766  {
1767  	struct fib6_node *child_left, *child_right;
1768  
1769  	if (fn->fn_flags & RTN_ROOT)
1770  		return net->ipv6.fib6_null_entry;
1771  
1772  	while (fn) {
1773  		child_left = rcu_dereference_protected(fn->left,
1774  				    lockdep_is_held(&table->tb6_lock));
1775  		child_right = rcu_dereference_protected(fn->right,
1776  				    lockdep_is_held(&table->tb6_lock));
1777  		if (child_left)
1778  			return rcu_dereference_protected(child_left->leaf,
1779  					lockdep_is_held(&table->tb6_lock));
1780  		if (child_right)
1781  			return rcu_dereference_protected(child_right->leaf,
1782  					lockdep_is_held(&table->tb6_lock));
1783  
1784  		fn = FIB6_SUBTREE(fn);
1785  	}
1786  	return NULL;
1787  }
1788  
1789  /*
1790   *	Called to trim the tree of intermediate nodes when possible. "fn"
1791   *	is the node we want to try to remove.
1792   *	Need to own table->tb6_lock
1793   */
1794  
1795  static struct fib6_node *fib6_repair_tree(struct net *net,
1796  					  struct fib6_table *table,
1797  					  struct fib6_node *fn)
1798  {
1799  	int children;
1800  	int nstate;
1801  	struct fib6_node *child;
1802  	struct fib6_walker *w;
1803  	int iter = 0;
1804  
1805  	/* Set fn->leaf to null_entry for root node. */
1806  	if (fn->fn_flags & RTN_TL_ROOT) {
1807  		rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
1808  		return fn;
1809  	}
1810  
1811  	for (;;) {
1812  		struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
1813  					    lockdep_is_held(&table->tb6_lock));
1814  		struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
1815  					    lockdep_is_held(&table->tb6_lock));
1816  		struct fib6_node *pn = rcu_dereference_protected(fn->parent,
1817  					    lockdep_is_held(&table->tb6_lock));
1818  		struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
1819  					    lockdep_is_held(&table->tb6_lock));
1820  		struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
1821  					    lockdep_is_held(&table->tb6_lock));
1822  		struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
1823  					    lockdep_is_held(&table->tb6_lock));
1824  		struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
1825  					    lockdep_is_held(&table->tb6_lock));
1826  		struct fib6_info *new_fn_leaf;
1827  
1828  		pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
1829  		iter++;
1830  
1831  		WARN_ON(fn->fn_flags & RTN_RTINFO);
1832  		WARN_ON(fn->fn_flags & RTN_TL_ROOT);
1833  		WARN_ON(fn_leaf);
1834  
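		/* Record which children exist: bit 0 for the right child,
		 * bit 1 for the left child, so children == 3 means both are
		 * present and this node cannot simply be unlinked.
		 */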
1835  		children = 0;
1836  		child = NULL;
1837  		if (fn_r) {
1838  			child = fn_r;
1839  			children |= 1;
1840  		}
1841  		if (fn_l) {
1842  			child = fn_l;
1843  			children |= 2;
1844  		}
1845  
1846  		if (children == 3 || FIB6_SUBTREE(fn)
1847  #ifdef CONFIG_IPV6_SUBTREES
1848  		    /* Subtree root (i.e. fn) may have one child */
1849  		    || (children && fn->fn_flags & RTN_ROOT)
1850  #endif
1851  		    ) {
1852  			new_fn_leaf = fib6_find_prefix(net, table, fn);
1853  #if RT6_DEBUG >= 2
1854  			if (!new_fn_leaf) {
1855  				WARN_ON(!new_fn_leaf);
1856  				new_fn_leaf = net->ipv6.fib6_null_entry;
1857  			}
1858  #endif
1859  			fib6_info_hold(new_fn_leaf);
1860  			rcu_assign_pointer(fn->leaf, new_fn_leaf);
1861  			return pn;
1862  		}
1863  
1864  #ifdef CONFIG_IPV6_SUBTREES
1865  		if (FIB6_SUBTREE(pn) == fn) {
1866  			WARN_ON(!(fn->fn_flags & RTN_ROOT));
1867  			RCU_INIT_POINTER(pn->subtree, NULL);
1868  			nstate = FWS_L;
1869  		} else {
1870  			WARN_ON(fn->fn_flags & RTN_ROOT);
1871  #endif
1872  			if (pn_r == fn)
1873  				rcu_assign_pointer(pn->right, child);
1874  			else if (pn_l == fn)
1875  				rcu_assign_pointer(pn->left, child);
1876  #if RT6_DEBUG >= 2
1877  			else
1878  				WARN_ON(1);
1879  #endif
1880  			if (child)
1881  				rcu_assign_pointer(child->parent, pn);
1882  			nstate = FWS_R;
1883  #ifdef CONFIG_IPV6_SUBTREES
1884  		}
1885  #endif
1886  
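		/* fn is about to be freed: move any walker parked on it to
		 * the parent (when there is no child) or to the surviving
		 * child, adjusting its state accordingly.
		 */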
1887  		read_lock(&net->ipv6.fib6_walker_lock);
1888  		FOR_WALKERS(net, w) {
1889  			if (!child) {
1890  				if (w->node == fn) {
1891  					pr_debug("W %p adjusted by delnode 1, s=%d/%d\n",
1892  						 w, w->state, nstate);
1893  					w->node = pn;
1894  					w->state = nstate;
1895  				}
1896  			} else {
1897  				if (w->node == fn) {
1898  					w->node = child;
1899  					if (children&2) {
1900  						pr_debug("W %p adjusted by delnode 2, s=%d\n",
1901  							 w, w->state);
1902  						w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
1903  					} else {
1904  						pr_debug("W %p adjusted by delnode 2, s=%d\n",
1905  							 w, w->state);
1906  						w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
1907  					}
1908  				}
1909  			}
1910  		}
1911  		read_unlock(&net->ipv6.fib6_walker_lock);
1912  
1913  		node_free(net, fn);
1914  		if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
1915  			return pn;
1916  
1917  		RCU_INIT_POINTER(pn->leaf, NULL);
1918  		fib6_info_release(pn_leaf);
1919  		fn = pn;
1920  	}
1921  }
1922  
1923  static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
1924  			   struct fib6_info __rcu **rtp, struct nl_info *info)
1925  {
1926  	struct fib6_info *leaf, *replace_rt = NULL;
1927  	struct fib6_walker *w;
1928  	struct fib6_info *rt = rcu_dereference_protected(*rtp,
1929  				    lockdep_is_held(&table->tb6_lock));
1930  	struct net *net = info->nl_net;
1931  	bool notify_del = false;
1932  
1933  	/* If the deleted route is the first in the node and it is not part of
1934  	 * a multipath route, then we need to replace it with the next route
1935  	 * in the node, if one exists.
1936  	 */
1937  	leaf = rcu_dereference_protected(fn->leaf,
1938  					 lockdep_is_held(&table->tb6_lock));
1939  	if (leaf == rt && !rt->fib6_nsiblings) {
1940  		if (rcu_access_pointer(rt->fib6_next))
1941  			replace_rt = rcu_dereference_protected(rt->fib6_next,
1942  					    lockdep_is_held(&table->tb6_lock));
1943  		else
1944  			notify_del = true;
1945  	}
1946  
1947  	/* Unlink it */
1948  	*rtp = rt->fib6_next;
1949  	rt->fib6_node = NULL;
1950  	net->ipv6.rt6_stats->fib_rt_entries--;
1951  	net->ipv6.rt6_stats->fib_discarded_routes++;
1952  
1953  	/* Reset round-robin state, if necessary */
1954  	if (rcu_access_pointer(fn->rr_ptr) == rt)
1955  		fn->rr_ptr = NULL;
1956  
1957  	/* Remove this entry from other siblings */
1958  	if (rt->fib6_nsiblings) {
1959  		struct fib6_info *sibling, *next_sibling;
1960  
1961  		/* The route is deleted from a multipath route. If this
1962  		 * multipath route is the first route in the node, then we need
1963  		 * to emit a delete notification. Otherwise, we need to skip
1964  		 * the notification.
1965  		 */
1966  		if (rt->fib6_metric == leaf->fib6_metric &&
1967  		    rt6_qualify_for_ecmp(leaf))
1968  			notify_del = true;
1969  		list_for_each_entry_safe(sibling, next_sibling,
1970  					 &rt->fib6_siblings, fib6_siblings)
1971  			sibling->fib6_nsiblings--;
1972  		rt->fib6_nsiblings = 0;
1973  		list_del_init(&rt->fib6_siblings);
1974  		rt6_multipath_rebalance(next_sibling);
1975  	}
1976  
1977  	/* Adjust walkers */
1978  	read_lock(&net->ipv6.fib6_walker_lock);
1979  	FOR_WALKERS(net, w) {
1980  		if (w->state == FWS_C && w->leaf == rt) {
1981  			pr_debug("walker %p adjusted by delroute\n", w);
1982  			w->leaf = rcu_dereference_protected(rt->fib6_next,
1983  					    lockdep_is_held(&table->tb6_lock));
1984  			if (!w->leaf)
1985  				w->state = FWS_U;
1986  		}
1987  	}
1988  	read_unlock(&net->ipv6.fib6_walker_lock);
1989  
1990  	/* If it was the last route, call fib6_repair_tree() to:
1991  	 * 1. For root node, put back null_entry as how the table was created.
1992  	 * 2. For other nodes, expunge its radix tree node.
1993  	 */
1994  	if (!rcu_access_pointer(fn->leaf)) {
1995  		if (!(fn->fn_flags & RTN_TL_ROOT)) {
1996  			fn->fn_flags &= ~RTN_RTINFO;
1997  			net->ipv6.rt6_stats->fib_route_nodes--;
1998  		}
1999  		fn = fib6_repair_tree(net, table, fn);
2000  	}
2001  
2002  	fib6_purge_rt(rt, fn, net);
2003  
2004  	if (!info->skip_notify_kernel) {
2005  		if (notify_del)
2006  			call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
2007  						  rt, NULL);
2008  		else if (replace_rt)
2009  			call_fib6_entry_notifiers_replace(net, replace_rt);
2010  	}
2011  	if (!info->skip_notify)
2012  		inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
2013  
2014  	fib6_info_release(rt);
2015  }
2016  
2017  /* Need to own table->tb6_lock */
2018  int fib6_del(struct fib6_info *rt, struct nl_info *info)
2019  {
2020  	struct net *net = info->nl_net;
2021  	struct fib6_info __rcu **rtp;
2022  	struct fib6_info __rcu **rtp_next;
2023  	struct fib6_table *table;
2024  	struct fib6_node *fn;
2025  
2026  	if (rt == net->ipv6.fib6_null_entry)
2027  		return -ENOENT;
2028  
2029  	table = rt->fib6_table;
2030  	fn = rcu_dereference_protected(rt->fib6_node,
2031  				       lockdep_is_held(&table->tb6_lock));
2032  	if (!fn)
2033  		return -ENOENT;
2034  
2035  	WARN_ON(!(fn->fn_flags & RTN_RTINFO));
2036  
2037  	/*
2038  	 *	Walk the leaf entries looking for this route
2039  	 */
2040  
2041  	for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
2042  		struct fib6_info *cur = rcu_dereference_protected(*rtp,
2043  					lockdep_is_held(&table->tb6_lock));
2044  		if (rt == cur) {
2045  			if (fib6_requires_src(cur))
2046  				fib6_routes_require_src_dec(info->nl_net);
2047  			fib6_del_route(table, fn, rtp, info);
2048  			return 0;
2049  		}
2050  		rtp_next = &cur->fib6_next;
2051  	}
2052  	return -ENOENT;
2053  }
2054  
2055  /*
2056   *	Tree traversal function.
2057   *
2058   *	It is not interrupt safe.
2059   *	However, it is reentrant with respect to itself and to fib6_add/fib6_del,
2060   *	which means the tree may be modified while it is being walked.
2061   *	This makes it usable for garbage collection, clone pruning,
2062   *	cleaning the tree when a device goes down, and so on.
2063   *
2064   *	It guarantees that every node will be traversed,
2065   *	and that it will be traversed only once.
2066   *
2067   *	Callback function w->func may return:
2068   *	0 -> continue walking.
2069   *	positive value -> walking is suspended (used by tree dumps,
2070   *	and possibly by gc, should it ever be split into several slices)
2071   *	negative value -> terminate walking.
2072   *
2073   *	The function itself returns:
2074   *	0   -> walk is complete.
2075   *	>0  -> walk is incomplete (i.e. suspended)
2076   *	<0  -> walk is terminated by an error.
2077   *
2078   *	This function is called with tb6_lock held.
2079   */
2080  
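/*
 *	Walker states used below: FWS_S descends into the node's subtree
 *	(CONFIG_IPV6_SUBTREES), FWS_L and FWS_R visit the left and right
 *	children, FWS_C runs the callback on the current node's routes,
 *	and FWS_U climbs back up to the parent.
 */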
2081  static int fib6_walk_continue(struct fib6_walker *w)
2082  {
2083  	struct fib6_node *fn, *pn, *left, *right;
2084  
2085  	/* w->root should always be table->tb6_root */
2086  	WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));
2087  
2088  	for (;;) {
2089  		fn = w->node;
2090  		if (!fn)
2091  			return 0;
2092  
2093  		switch (w->state) {
2094  #ifdef CONFIG_IPV6_SUBTREES
2095  		case FWS_S:
2096  			if (FIB6_SUBTREE(fn)) {
2097  				w->node = FIB6_SUBTREE(fn);
2098  				continue;
2099  			}
2100  			w->state = FWS_L;
2101  			fallthrough;
2102  #endif
2103  		case FWS_L:
2104  			left = rcu_dereference_protected(fn->left, 1);
2105  			if (left) {
2106  				w->node = left;
2107  				w->state = FWS_INIT;
2108  				continue;
2109  			}
2110  			w->state = FWS_R;
2111  			fallthrough;
2112  		case FWS_R:
2113  			right = rcu_dereference_protected(fn->right, 1);
2114  			if (right) {
2115  				w->node = right;
2116  				w->state = FWS_INIT;
2117  				continue;
2118  			}
2119  			w->state = FWS_C;
2120  			w->leaf = rcu_dereference_protected(fn->leaf, 1);
2121  			fallthrough;
2122  		case FWS_C:
2123  			if (w->leaf && fn->fn_flags & RTN_RTINFO) {
2124  				int err;
2125  
2126  				if (w->skip) {
2127  					w->skip--;
2128  					goto skip;
2129  				}
2130  
2131  				err = w->func(w);
2132  				if (err)
2133  					return err;
2134  
2135  				w->count++;
2136  				continue;
2137  			}
2138  skip:
2139  			w->state = FWS_U;
2140  			fallthrough;
2141  		case FWS_U:
2142  			if (fn == w->root)
2143  				return 0;
2144  			pn = rcu_dereference_protected(fn->parent, 1);
2145  			left = rcu_dereference_protected(pn->left, 1);
2146  			right = rcu_dereference_protected(pn->right, 1);
2147  			w->node = pn;
2148  #ifdef CONFIG_IPV6_SUBTREES
2149  			if (FIB6_SUBTREE(pn) == fn) {
2150  				WARN_ON(!(fn->fn_flags & RTN_ROOT));
2151  				w->state = FWS_L;
2152  				continue;
2153  			}
2154  #endif
2155  			if (left == fn) {
2156  				w->state = FWS_R;
2157  				continue;
2158  			}
2159  			if (right == fn) {
2160  				w->state = FWS_C;
2161  				w->leaf = rcu_dereference_protected(w->node->leaf, 1);
2162  				continue;
2163  			}
2164  #if RT6_DEBUG >= 2
2165  			WARN_ON(1);
2166  #endif
2167  		}
2168  	}
2169  }
2170  
2171  static int fib6_walk(struct net *net, struct fib6_walker *w)
2172  {
2173  	int res;
2174  
2175  	w->state = FWS_INIT;
2176  	w->node = w->root;
2177  
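	/* Link the walker so tree modifications (fib6_del_route(),
	 * fib6_repair_tree()) can re-point it while it runs or is
	 * suspended; it stays linked only if the walk is suspended
	 * (res > 0).
	 */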
2178  	fib6_walker_link(net, w);
2179  	res = fib6_walk_continue(w);
2180  	if (res <= 0)
2181  		fib6_walker_unlink(net, w);
2182  	return res;
2183  }
2184  
2185  static int fib6_clean_node(struct fib6_walker *w)
2186  {
2187  	int res;
2188  	struct fib6_info *rt;
2189  	struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
2190  	struct nl_info info = {
2191  		.nl_net = c->net,
2192  		.skip_notify = c->skip_notify,
2193  	};
2194  
2195  	if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
2196  	    READ_ONCE(w->node->fn_sernum) != c->sernum)
2197  		WRITE_ONCE(w->node->fn_sernum, c->sernum);
2198  
2199  	if (!c->func) {
2200  		WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
2201  		w->leaf = NULL;
2202  		return 0;
2203  	}
2204  
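	/* c->func may ask us to delete the route (-1), to skip the rest of
	 * a multipath group by jumping to its last sibling (-2), or simply
	 * to continue (0); see the comment above fib6_clean_tree().
	 */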
2205  	for_each_fib6_walker_rt(w) {
2206  		res = c->func(rt, c->arg);
2207  		if (res == -1) {
2208  			w->leaf = rt;
2209  			res = fib6_del(rt, &info);
2210  			if (res) {
2211  #if RT6_DEBUG >= 2
2212  				pr_debug("%s: del failed: rt=%p@%p err=%d\n",
2213  					 __func__, rt,
2214  					 rcu_access_pointer(rt->fib6_node),
2215  					 res);
2216  #endif
2217  				continue;
2218  			}
2219  			return 0;
2220  		} else if (res == -2) {
2221  			if (WARN_ON(!rt->fib6_nsiblings))
2222  				continue;
2223  			rt = list_last_entry(&rt->fib6_siblings,
2224  					     struct fib6_info, fib6_siblings);
2225  			continue;
2226  		}
2227  		WARN_ON(res != 0);
2228  	}
2229  	w->leaf = rt;
2230  	return 0;
2231  }
2232  
2233  /*
2234   *	Convenient frontend to tree walker.
2235   *
2236   *	func is called on each route.
2237   *		It may return -2 -> skip multipath route.
2238   *			      -1 -> delete this route.
2239   *			       0 -> continue walking.
2240   */
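/*
 *	Purely illustrative sketch (not part of this file): a callback
 *	obeying the contract above could look like
 *
 *		static int drop_expired(struct fib6_info *rt, void *arg)
 *		{
 *			unsigned long now = *(unsigned long *)arg;
 *
 *			if (rt->fib6_flags & RTF_EXPIRES && rt->expires &&
 *			    time_after(now, rt->expires))
 *				return -1;
 *			return 0;
 *		}
 *
 *	fib6_age() below applies essentially this test on the GC path.
 */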
2241  
2242  static void fib6_clean_tree(struct net *net, struct fib6_node *root,
2243  			    int (*func)(struct fib6_info *, void *arg),
2244  			    int sernum, void *arg, bool skip_notify)
2245  {
2246  	struct fib6_cleaner c;
2247  
2248  	c.w.root = root;
2249  	c.w.func = fib6_clean_node;
2250  	c.w.count = 0;
2251  	c.w.skip = 0;
2252  	c.w.skip_in_node = 0;
2253  	c.func = func;
2254  	c.sernum = sernum;
2255  	c.arg = arg;
2256  	c.net = net;
2257  	c.skip_notify = skip_notify;
2258  
2259  	fib6_walk(net, &c.w);
2260  }
2261  
2262  static void __fib6_clean_all(struct net *net,
2263  			     int (*func)(struct fib6_info *, void *),
2264  			     int sernum, void *arg, bool skip_notify)
2265  {
2266  	struct fib6_table *table;
2267  	struct hlist_head *head;
2268  	unsigned int h;
2269  
2270  	rcu_read_lock();
2271  	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2272  		head = &net->ipv6.fib_table_hash[h];
2273  		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2274  			spin_lock_bh(&table->tb6_lock);
2275  			fib6_clean_tree(net, &table->tb6_root,
2276  					func, sernum, arg, skip_notify);
2277  			spin_unlock_bh(&table->tb6_lock);
2278  		}
2279  	}
2280  	rcu_read_unlock();
2281  }
2282  
2283  void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
2284  		    void *arg)
2285  {
2286  	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
2287  }
2288  
2289  void fib6_clean_all_skip_notify(struct net *net,
2290  				int (*func)(struct fib6_info *, void *),
2291  				void *arg)
2292  {
2293  	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
2294  }
2295  
2296  static void fib6_flush_trees(struct net *net)
2297  {
2298  	int new_sernum = fib6_new_sernum(net);
2299  
2300  	__fib6_clean_all(net, NULL, new_sernum, NULL, false);
2301  }
2302  
2303  /*
2304   *	Garbage collection
2305   */
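/*
 *	GC flow: fib6_run_gc() takes fib6_gc_lock (or defers to the timer
 *	if it cannot), fib6_gc_all() walks every table under tb6_lock,
 *	fib6_gc_table() runs over the table's gc hlist, and fib6_age()
 *	decides whether an entry has expired (-1) or should stay (0).
 */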
2306  
2307  static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args)
2308  {
2309  	unsigned long now = jiffies;
2310  
2311  	/*
2312  	 *	check addrconf expiration here.
2313  	 *	Routes are expired even if they are in use.
2314  	 */
2315  
2316  	if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
2317  		if (time_after(now, rt->expires)) {
2318  			pr_debug("expiring %p\n", rt);
2319  			return -1;
2320  		}
2321  		gc_args->more++;
2322  	}
2323  
2324  	/*	Also age clones in the exception table.
2325  	 *	Note that clones are aged out
2326  	 *	only if they are not currently in use.
2327  	 */
2328  	rt6_age_exceptions(rt, gc_args, now);
2329  
2330  	return 0;
2331  }
2332  
2333  static void fib6_gc_table(struct net *net,
2334  			  struct fib6_table *tb6,
2335  			  struct fib6_gc_args *gc_args)
2336  {
2337  	struct fib6_info *rt;
2338  	struct hlist_node *n;
2339  	struct nl_info info = {
2340  		.nl_net = net,
2341  		.skip_notify = false,
2342  	};
2343  
2344  	hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link)
2345  		if (fib6_age(rt, gc_args) == -1)
2346  			fib6_del(rt, &info);
2347  }
2348  
2349  static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args)
2350  {
2351  	struct fib6_table *table;
2352  	struct hlist_head *head;
2353  	unsigned int h;
2354  
2355  	rcu_read_lock();
2356  	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2357  		head = &net->ipv6.fib_table_hash[h];
2358  		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2359  			spin_lock_bh(&table->tb6_lock);
2360  
2361  			fib6_gc_table(net, table, gc_args);
2362  
2363  			spin_unlock_bh(&table->tb6_lock);
2364  		}
2365  	}
2366  	rcu_read_unlock();
2367  }
2368  
2369  void fib6_run_gc(unsigned long expires, struct net *net, bool force)
2370  {
2371  	struct fib6_gc_args gc_args;
2372  	unsigned long now;
2373  
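	/* Forced callers wait for fib6_gc_lock; opportunistic callers that
	 * lose the trylock just re-arm the timer one second out and let
	 * the next run do the work.
	 */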
2374  	if (force) {
2375  		spin_lock_bh(&net->ipv6.fib6_gc_lock);
2376  	} else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
2377  		mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
2378  		return;
2379  	}
2380  	gc_args.timeout = expires ? (int)expires :
2381  			  net->ipv6.sysctl.ip6_rt_gc_interval;
2382  	gc_args.more = 0;
2383  
2384  	fib6_gc_all(net, &gc_args);
2385  	now = jiffies;
2386  	net->ipv6.ip6_rt_last_gc = now;
2387  
2388  	if (gc_args.more)
2389  		mod_timer(&net->ipv6.ip6_fib_timer,
2390  			  round_jiffies(now
2391  					+ net->ipv6.sysctl.ip6_rt_gc_interval));
2392  	else
2393  		del_timer(&net->ipv6.ip6_fib_timer);
2394  	spin_unlock_bh(&net->ipv6.fib6_gc_lock);
2395  }
2396  
2397  static void fib6_gc_timer_cb(struct timer_list *t)
2398  {
2399  	struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer);
2400  
2401  	fib6_run_gc(0, arg, true);
2402  }
2403  
2404  static int __net_init fib6_net_init(struct net *net)
2405  {
2406  	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
2407  	int err;
2408  
2409  	err = fib6_notifier_init(net);
2410  	if (err)
2411  		return err;
2412  
2413  	/* Default to 3-tuple */
2414  	net->ipv6.sysctl.multipath_hash_fields =
2415  		FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
2416  
2417  	spin_lock_init(&net->ipv6.fib6_gc_lock);
2418  	rwlock_init(&net->ipv6.fib6_walker_lock);
2419  	INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
2420  	timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);
2421  
2422  	net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
2423  	if (!net->ipv6.rt6_stats)
2424  		goto out_notifier;
2425  
2426  	/* Avoid false sharing: use at least a full cache line */
2427  	size = max_t(size_t, size, L1_CACHE_BYTES);
2428  
2429  	net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
2430  	if (!net->ipv6.fib_table_hash)
2431  		goto out_rt6_stats;
2432  
2433  	net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
2434  					  GFP_KERNEL);
2435  	if (!net->ipv6.fib6_main_tbl)
2436  		goto out_fib_table_hash;
2437  
2438  	net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
2439  	rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
2440  			   net->ipv6.fib6_null_entry);
2441  	net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
2442  		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2443  	inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
2444  	INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist);
2445  
2446  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2447  	net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
2448  					   GFP_KERNEL);
2449  	if (!net->ipv6.fib6_local_tbl)
2450  		goto out_fib6_main_tbl;
2451  	net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
2452  	rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
2453  			   net->ipv6.fib6_null_entry);
2454  	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
2455  		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2456  	inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
2457  	INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist);
2458  #endif
2459  	fib6_tables_init(net);
2460  
2461  	return 0;
2462  
2463  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2464  out_fib6_main_tbl:
2465  	kfree(net->ipv6.fib6_main_tbl);
2466  #endif
2467  out_fib_table_hash:
2468  	kfree(net->ipv6.fib_table_hash);
2469  out_rt6_stats:
2470  	kfree(net->ipv6.rt6_stats);
2471  out_notifier:
2472  	fib6_notifier_exit(net);
2473  	return -ENOMEM;
2474  }
2475  
2476  static void fib6_net_exit(struct net *net)
2477  {
2478  	unsigned int i;
2479  
2480  	del_timer_sync(&net->ipv6.ip6_fib_timer);
2481  
2482  	for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
2483  		struct hlist_head *head = &net->ipv6.fib_table_hash[i];
2484  		struct hlist_node *tmp;
2485  		struct fib6_table *tb;
2486  
2487  		hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
2488  			hlist_del(&tb->tb6_hlist);
2489  			fib6_free_table(tb);
2490  		}
2491  	}
2492  
2493  	kfree(net->ipv6.fib_table_hash);
2494  	kfree(net->ipv6.rt6_stats);
2495  	fib6_notifier_exit(net);
2496  }
2497  
2498  static struct pernet_operations fib6_net_ops = {
2499  	.init = fib6_net_init,
2500  	.exit = fib6_net_exit,
2501  };
2502  
2503  int __init fib6_init(void)
2504  {
2505  	int ret = -ENOMEM;
2506  
2507  	fib6_node_kmem = KMEM_CACHE(fib6_node,
2508  				    SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
2509  	if (!fib6_node_kmem)
2510  		goto out;
2511  
2512  	ret = register_pernet_subsys(&fib6_net_ops);
2513  	if (ret)
2514  		goto out_kmem_cache_create;
2515  
2516  	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
2517  				   inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED |
2518  				   RTNL_FLAG_DUMP_SPLIT_NLM_DONE);
2519  	if (ret)
2520  		goto out_unregister_subsys;
2521  
2522  	__fib6_flush_trees = fib6_flush_trees;
2523  out:
2524  	return ret;
2525  
2526  out_unregister_subsys:
2527  	unregister_pernet_subsys(&fib6_net_ops);
2528  out_kmem_cache_create:
2529  	kmem_cache_destroy(fib6_node_kmem);
2530  	goto out;
2531  }
2532  
2533  void fib6_gc_cleanup(void)
2534  {
2535  	unregister_pernet_subsys(&fib6_net_ops);
2536  	kmem_cache_destroy(fib6_node_kmem);
2537  }
2538  
2539  #ifdef CONFIG_PROC_FS
2540  static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
2541  {
2542  	struct fib6_info *rt = v;
2543  	struct ipv6_route_iter *iter = seq->private;
2544  	struct fib6_nh *fib6_nh = rt->fib6_nh;
2545  	unsigned int flags = rt->fib6_flags;
2546  	const struct net_device *dev;
2547  
2548  	if (rt->nh)
2549  		fib6_nh = nexthop_fib6_nh(rt->nh);
2550  
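	/* Emit one /proc/net/ipv6_route line: dst/plen, src/plen (or an
	 * all-zero source when subtrees are compiled out), gateway, metric,
	 * refcount, a constant 0, flags and the device name.
	 */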
2551  	seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
2552  
2553  #ifdef CONFIG_IPV6_SUBTREES
2554  	seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
2555  #else
2556  	seq_puts(seq, "00000000000000000000000000000000 00 ");
2557  #endif
2558  	if (fib6_nh->fib_nh_gw_family) {
2559  		flags |= RTF_GATEWAY;
2560  		seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
2561  	} else {
2562  		seq_puts(seq, "00000000000000000000000000000000");
2563  	}
2564  
2565  	dev = fib6_nh->fib_nh_dev;
2566  	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
2567  		   rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
2568  		   flags, dev ? dev->name : "");
2569  	iter->w.leaf = NULL;
2570  	return 0;
2571  }
2572  
2573  static int ipv6_route_yield(struct fib6_walker *w)
2574  {
2575  	struct ipv6_route_iter *iter = w->args;
2576  
2577  	if (!iter->skip)
2578  		return 1;
2579  
2580  	do {
2581  		iter->w.leaf = rcu_dereference_protected(
2582  				iter->w.leaf->fib6_next,
2583  				lockdep_is_held(&iter->tbl->tb6_lock));
2584  		iter->skip--;
2585  		if (!iter->skip && iter->w.leaf)
2586  			return 1;
2587  	} while (iter->w.leaf);
2588  
2589  	return 0;
2590  }
2591  
2592  static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
2593  				      struct net *net)
2594  {
2595  	memset(&iter->w, 0, sizeof(iter->w));
2596  	iter->w.func = ipv6_route_yield;
2597  	iter->w.root = &iter->tbl->tb6_root;
2598  	iter->w.state = FWS_INIT;
2599  	iter->w.node = iter->w.root;
2600  	iter->w.args = iter;
2601  	iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
2602  	INIT_LIST_HEAD(&iter->w.lh);
2603  	fib6_walker_link(net, &iter->w);
2604  }
2605  
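/*
 * Return the next table for the dump: continue within @tbl's hash
 * bucket, then fall through to the first entry of the next non-empty
 * bucket; NULL means every table has been visited.
 */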
2606  static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
2607  						    struct net *net)
2608  {
2609  	unsigned int h;
2610  	struct hlist_node *node;
2611  
2612  	if (tbl) {
2613  		h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
2614  		node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist));
2615  	} else {
2616  		h = 0;
2617  		node = NULL;
2618  	}
2619  
2620  	while (!node && h < FIB6_TABLE_HASHSZ) {
2621  		node = rcu_dereference(
2622  			hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
2623  	}
2624  	return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
2625  }
2626  
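/*
 * If the tree changed while the dump was suspended (the root's sernum
 * moved on), restart the walk from the root and skip as many entries as
 * were already emitted (w->count) so the dump can make progress.
 */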
2627  static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
2628  {
2629  	int sernum = READ_ONCE(iter->w.root->fn_sernum);
2630  
2631  	if (iter->sernum != sernum) {
2632  		iter->sernum = sernum;
2633  		iter->w.state = FWS_INIT;
2634  		iter->w.node = iter->w.root;
2635  		WARN_ON(iter->w.skip);
2636  		iter->w.skip = iter->w.count;
2637  	}
2638  }
2639  
2640  static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2641  {
2642  	int r;
2643  	struct fib6_info *n;
2644  	struct net *net = seq_file_net(seq);
2645  	struct ipv6_route_iter *iter = seq->private;
2646  
2647  	++(*pos);
2648  	if (!v)
2649  		goto iter_table;
2650  
2651  	n = rcu_dereference(((struct fib6_info *)v)->fib6_next);
2652  	if (n)
2653  		return n;
2654  
2655  iter_table:
2656  	ipv6_route_check_sernum(iter);
2657  	spin_lock_bh(&iter->tbl->tb6_lock);
2658  	r = fib6_walk_continue(&iter->w);
2659  	spin_unlock_bh(&iter->tbl->tb6_lock);
2660  	if (r > 0) {
2661  		return iter->w.leaf;
2662  	} else if (r < 0) {
2663  		fib6_walker_unlink(net, &iter->w);
2664  		return NULL;
2665  	}
2666  	fib6_walker_unlink(net, &iter->w);
2667  
2668  	iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
2669  	if (!iter->tbl)
2670  		return NULL;
2671  
2672  	ipv6_route_seq_setup_walk(iter, net);
2673  	goto iter_table;
2674  }
2675  
2676  static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
2677  	__acquires(RCU)
2678  {
2679  	struct net *net = seq_file_net(seq);
2680  	struct ipv6_route_iter *iter = seq->private;
2681  
2682  	rcu_read_lock();
2683  	iter->tbl = ipv6_route_seq_next_table(NULL, net);
2684  	iter->skip = *pos;
2685  
2686  	if (iter->tbl) {
2687  		loff_t p = 0;
2688  
2689  		ipv6_route_seq_setup_walk(iter, net);
2690  		return ipv6_route_seq_next(seq, NULL, &p);
2691  	} else {
2692  		return NULL;
2693  	}
2694  }
2695  
2696  static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
2697  {
2698  	struct fib6_walker *w = &iter->w;

2699  	return w->node && !(w->state == FWS_U && w->node == w->root);
2700  }
2701  
2702  static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
2703  	__releases(RCU)
2704  {
2705  	struct net *net = seq_file_net(seq);
2706  	struct ipv6_route_iter *iter = seq->private;
2707  
2708  	if (ipv6_route_iter_active(iter))
2709  		fib6_walker_unlink(net, &iter->w);
2710  
2711  	rcu_read_unlock();
2712  }
2713  
2714  #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
2715  static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
2716  				    struct bpf_iter_meta *meta,
2717  				    void *v)
2718  {
2719  	struct bpf_iter__ipv6_route ctx;
2720  
2721  	ctx.meta = meta;
2722  	ctx.rt = v;
2723  	return bpf_iter_run_prog(prog, &ctx);
2724  }
2725  
2726  static int ipv6_route_seq_show(struct seq_file *seq, void *v)
2727  {
2728  	struct ipv6_route_iter *iter = seq->private;
2729  	struct bpf_iter_meta meta;
2730  	struct bpf_prog *prog;
2731  	int ret;
2732  
2733  	meta.seq = seq;
2734  	prog = bpf_iter_get_info(&meta, false);
2735  	if (!prog)
2736  		return ipv6_route_native_seq_show(seq, v);
2737  
2738  	ret = ipv6_route_prog_seq_show(prog, &meta, v);
2739  	iter->w.leaf = NULL;
2740  
2741  	return ret;
2742  }
2743  
2744  static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2745  {
2746  	struct bpf_iter_meta meta;
2747  	struct bpf_prog *prog;
2748  
2749  	if (!v) {
2750  		meta.seq = seq;
2751  		prog = bpf_iter_get_info(&meta, true);
2752  		if (prog)
2753  			(void)ipv6_route_prog_seq_show(prog, &meta, v);
2754  	}
2755  
2756  	ipv6_route_native_seq_stop(seq, v);
2757  }
2758  #else
2759  static int ipv6_route_seq_show(struct seq_file *seq, void *v)
2760  {
2761  	return ipv6_route_native_seq_show(seq, v);
2762  }
2763  
2764  static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2765  {
2766  	ipv6_route_native_seq_stop(seq, v);
2767  }
2768  #endif
2769  
2770  const struct seq_operations ipv6_route_seq_ops = {
2771  	.start	= ipv6_route_seq_start,
2772  	.next	= ipv6_route_seq_next,
2773  	.stop	= ipv6_route_seq_stop,
2774  	.show	= ipv6_route_seq_show
2775  };
2776  #endif /* CONFIG_PROC_FS */
2777