// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>
#include <net/inet_dscp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

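/* Fold the tunnel key and remote address together so that insert and
 * lookup agree on a bucket; hash_32() reduces the result to
 * IP_TNL_HASH_BITS bits.
 */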
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
				const unsigned long *flags, __be32 key)
{
	if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
		return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);

	return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
}

/* Fallback tunnel: no source, no destination, no key, no options
 *
 * Tunnel hash table:
 * We require an exact key match, i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if it is not present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets, if not matched against a configured keyless
 * tunnel, will match the fallback tunnel.
 * Given src, dst and key, find the appropriate tunnel for an input packet.
 */
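/* For example (illustrative values only): with one keyless tunnel and
 * one tunnel keyed with i_key 42 configured, a packet carrying key 42
 * matches only the keyed tunnel, a packet carrying no key matches only
 * the keyless one, and a packet carrying key 7 matches neither and is
 * handed to the collect_md or fallback device, if any.
 */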
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, const unsigned long *flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

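	/* Pass 1: tunnels where both the local and the remote address
	 * match the packet exactly.
	 */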
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		cand = t;
	}

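	/* Pass 2: the remote address matches and the tunnel has no
	 * source address configured.
	 */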
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

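	/* Pass 3 (bucket for tunnels with no remote address): the tunnel
	 * is bound to the packet's local address with no destination set,
	 * or the local address is the tunnel's multicast group.
	 */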
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

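	/* Pass 4: wildcard tunnels with neither address configured,
	 * matched on the key alone (the key check is skipped when the
	 * caller indicated there is no key to compare).
	 */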
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
		     t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm_kern *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
	    test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

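/* Hash insertion and removal run under RTNL; ip_tunnel_lookup() walks
 * the table under RCU, hence the RCU list and pointer primitives here.
 */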
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm_kern *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	IP_TUNNEL_DECLARE_FLAGS(flags);
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	ip_tunnel_flags_copy(flags, parms->i_flags);

	hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == READ_ONCE(t->parms.link) &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm_kern *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strscpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    iph->tos & INET_DSCP_MASK, dev_net(dev),
				    tunnel->parms.link, tunnel->fwmark, 0, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm_kern *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct udphdr *udph;

	if (iph->protocol != IPPROTO_UDP)
		return;

	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
	info->encap.sport = udph->source;
	info->encap.dport = udph->dest;
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);

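/* Finish receiving a decapsulated packet: enforce the tunnel's checksum
 * and sequence-number policy, undo the outer ECN marking, account the
 * packet and hand it to GRO on the tunnel device.
 */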
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int nh, err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		DEV_STATS_INC(tunnel->dev, multicast);
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
	    test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

	if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
		if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}
	iph = (struct iphdr *)(skb->head + nh);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

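/* Example (sketch only; callback names as used by the FOU module): a
 * UDP encapsulation provider registers its ops in one of the
 * TUNNEL_ENCAP_* slots roughly like this:
 *
 *	static const struct ip_tunnel_encap_ops fou_iptun_ops = {
 *		.encap_hlen	= fou_encap_hlen,
 *		.build_header	= fou_build_header,
 *		.err_handler	= gue_err,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 */
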
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

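/* Check the packet about to be encapsulated against the route MTU and,
 * if it does not fit, propagate the reduced MTU to the inner dst and
 * signal the sender (ICMP_FRAG_NEEDED or ICMPV6_PKT_TOOBIG). Returns
 * -E2BIG when the packet must be rejected, 0 otherwise.
 */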
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
{
	/* we must cap headroom to some upper limit, else pskb_expand_head
	 * will overflow header offsets in skb_headers_offset_update().
	 */
	static const unsigned int max_allowed = 512;

	if (headroom > max_allowed)
		headroom = max_allowed;

	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);
}

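/* Transmit path for metadata-based (collect_md) tunnels: the addresses,
 * key, TOS and TTL all come from the per-skb struct ip_tunnel_info
 * rather than from the device configuration.
 */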
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id),
			    tos & INET_DSCP_MASK, dev_net(dev), 0, skb->mark,
			    skb_get_hash(skb), key->flow_flags);

	if (!tunnel_hlen)
		tunnel_hlen = ip_encap_hlen(&tun_info->encap);

	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (skb_cow_head(skb, headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	ip_tunnel_adj_headroom(dev, headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	goto kfree;
tx_dropped:
	DEV_STATS_INC(dev, tx_dropped);
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

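/* Transmit path for classically configured tunnels: addressing comes
 * from tnl_params (normally tunnel->parms.iph), with NBMA-style
 * destination resolution when no remote address is configured.
 */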
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;		/* Route to the other host */
	__be16 payload_protocol;
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);
	payload_protocol = skb_protocol(skb, true);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			DEV_STATS_INC(dev, tx_fifo_errors);
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (payload_protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (payload_protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (payload_protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, tos & INET_DSCP_MASK,
			    dev_net(dev), READ_ONCE(tunnel->parms.link),
			    tunnel->fwmark, skb_get_hash(skb), 0);

	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);

	if (skb_cow_head(skb, max_headroom)) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, tx_dropped);
		kfree_skb(skb);
		return;
	}

	ip_tunnel_adj_headroom(dev, max_headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm_kern *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		__dev_addr_set(dev, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		WRITE_ONCE(t->parms.link, p->link);
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			WRITE_ONCE(dev->mtu, mtu);
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

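/* Legacy ioctl backend (SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL,
 * SIOCDELTUNNEL), reached via ndo_tunnel_ctl with RTNL held.
 */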
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
		  int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
			if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
				p->i_key = 0;
			if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

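/* Convert between the legacy UAPI struct ip_tunnel_parm, whose flags
 * are __be16 bitmasks, and the kernel-internal ip_tunnel_parm_kern,
 * which carries a wide flags bitmap.
 */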
bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
			      const void __user *data)
{
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, data, sizeof(p)))
		return false;

	strscpy(kp->name, p.name);
	kp->link = p.link;
	ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
	ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
	kp->i_key = p.i_key;
	kp->o_key = p.o_key;
	memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));

	return true;
}
EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);

bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
{
	struct ip_tunnel_parm p;

	if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
	    !ip_tunnel_flags_is_be16_compat(kp->o_flags))
		return false;

	memset(&p, 0, sizeof(p));

	strscpy(p.name, kp->name);
	p.link = kp->link;
	p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
	p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
	p.i_key = kp->i_key;
	p.o_key = kp->o_key;
	memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));

	return !copy_to_user(data, &p, sizeof(p));
}
EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);

int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
			     void __user *data, int cmd)
{
	struct ip_tunnel_parm_kern p;
	int err;

	if (!ip_tunnel_parm_from_user(&p, data))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && !ip_tunnel_parm_to_user(data, &p))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	WRITE_ONCE(dev->mtu, new_mtu);
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return READ_ONCE(tunnel->net);
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	const struct ip_tunnel *tunnel = netdev_priv(dev);

	return READ_ONCE(tunnel->parms.link);
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm_kern parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strscpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->netns_local = true;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops,
			   struct list_head *dev_to_kill)
{
	struct ip_tunnel_net *itn;
	struct net *net;

	ASSERT_RTNL();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, dev_to_kill, ops);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm_kern *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm_kern *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err)
		return err;

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strscpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	netdev_lockdep_set_classes(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization here; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_DESCRIPTION("IPv4 tunnel implementation library");
MODULE_LICENSE("GPL");