1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * INET		An implementation of the TCP/IP protocol suite for the LINUX
4   *		operating system.  INET is implemented using the  BSD Socket
5   *		interface as the means of communication with the user level.
6   *
7   *		Implementation of the Transmission Control Protocol(TCP).
8   *
9   *		IPv4 specific functions
10   *
11   *		code split from:
12   *		linux/ipv4/tcp.c
13   *		linux/ipv4/tcp_input.c
14   *		linux/ipv4/tcp_output.c
15   *
16   *		See tcp.c for author information
17   */
18  
19  /*
20   * Changes:
21   *		David S. Miller	:	New socket lookup architecture.
22   *					This code is dedicated to John Dyson.
23   *		David S. Miller :	Change semantics of established hash,
24   *					half is devoted to TIME_WAIT sockets
25   *					and the rest go in the other half.
26   *		Andi Kleen :		Add support for syncookies and fixed
27   *					some bugs: ip options weren't passed to
28   *					the TCP layer, missed a check for an
29   *					ACK bit.
30   *		Andi Kleen :		Implemented fast path mtu discovery.
31   *	     				Fixed many serious bugs in the
32   *					request_sock handling and moved
33   *					most of it into the af independent code.
34   *					Added tail drop and some other bugfixes.
35   *					Added new listen semantics.
36   *		Mike McLagan	:	Routing by source
37   *	Juan Jose Ciarlante:		ip_dynaddr bits
38   *		Andi Kleen:		various fixes.
39   *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40   *					coma.
41   *	Andi Kleen		:	Fix new listen.
42   *	Andi Kleen		:	Fix accept error reporting.
43   *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44   *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45   *					a single port at the same time.
46   */
47  
48  #define pr_fmt(fmt) "TCP: " fmt
49  
50  #include <linux/bottom_half.h>
51  #include <linux/types.h>
52  #include <linux/fcntl.h>
53  #include <linux/module.h>
54  #include <linux/random.h>
55  #include <linux/cache.h>
56  #include <linux/jhash.h>
57  #include <linux/init.h>
58  #include <linux/times.h>
59  #include <linux/slab.h>
60  #include <linux/sched.h>
61  
62  #include <net/net_namespace.h>
63  #include <net/icmp.h>
64  #include <net/inet_hashtables.h>
65  #include <net/tcp.h>
66  #include <net/transp_v6.h>
67  #include <net/ipv6.h>
68  #include <net/inet_common.h>
69  #include <net/timewait_sock.h>
70  #include <net/xfrm.h>
71  #include <net/secure_seq.h>
72  #include <net/busy_poll.h>
73  #include <net/rstreason.h>
74  
75  #include <linux/inet.h>
76  #include <linux/ipv6.h>
77  #include <linux/stddef.h>
78  #include <linux/proc_fs.h>
79  #include <linux/seq_file.h>
80  #include <linux/inetdevice.h>
81  #include <linux/btf_ids.h>
82  #include <linux/skbuff_ref.h>
83  
84  #include <crypto/hash.h>
85  #include <linux/scatterlist.h>
86  
87  #include <trace/events/tcp.h>
88  
89  #ifdef CONFIG_TCP_MD5SIG
90  static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91  			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92  #endif
93  
94  struct inet_hashinfo tcp_hashinfo;
95  EXPORT_SYMBOL(tcp_hashinfo);
96  
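/* Per-CPU control socket, protected by a local BH lock; used by
 * tcp_v4_send_reset() and tcp_v4_send_ack() to send replies without
 * a full socket context.
 */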
97  static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
98  	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
99  };
100  
101  static DEFINE_MUTEX(tcp_exit_batch_mutex);
102  
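/* Derive the initial sequence number for a connection from the
 * address and port pairs found in the incoming segment.
 */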
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
104  {
105  	return secure_tcp_seq(ip_hdr(skb)->daddr,
106  			      ip_hdr(skb)->saddr,
107  			      tcp_hdr(skb)->dest,
108  			      tcp_hdr(skb)->source);
109  }
110  
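/* Per-connection timestamp offset, derived from the address pair. */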
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
112  {
113  	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
114  }
115  
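/* Decide whether a TIME-WAIT socket may be reused for a new outgoing
 * connection to the same destination.  Returns 1 (after taking a
 * reference on the timewait socket) if reuse is safe, 0 otherwise.
 */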
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117  {
118  	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
119  	const struct inet_timewait_sock *tw = inet_twsk(sktw);
120  	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
121  	struct tcp_sock *tp = tcp_sk(sk);
122  	int ts_recent_stamp;
123  
124  	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
125  		reuse = 0;
126  
127  	if (reuse == 2) {
128  		/* Still does not detect *everything* that goes through
129  		 * lo, since we require a loopback src or dst address
130  		 * or direct binding to 'lo' interface.
131  		 */
132  		bool loopback = false;
133  		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
134  			loopback = true;
135  #if IS_ENABLED(CONFIG_IPV6)
136  		if (tw->tw_family == AF_INET6) {
137  			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
138  			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
139  			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
140  			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
141  				loopback = true;
142  		} else
143  #endif
144  		{
145  			if (ipv4_is_loopback(tw->tw_daddr) ||
146  			    ipv4_is_loopback(tw->tw_rcv_saddr))
147  				loopback = true;
148  		}
149  		if (!loopback)
150  			reuse = 0;
151  	}
152  
153  	/* With PAWS, it is safe from the viewpoint
154  	   of data integrity. Even without PAWS it is safe provided sequence
155  	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
156  
157  	   Actually, the idea is close to VJ's one, only timestamp cache is
158  	   held not per host, but per port pair and TW bucket is used as state
159  	   holder.
160  
161  	   If TW bucket has been already destroyed we fall back to VJ's scheme
162  	   and use initial timestamp retrieved from peer table.
163  	 */
164  	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
165  	if (ts_recent_stamp &&
166  	    (!twp || (reuse && time_after32(ktime_get_seconds(),
167  					    ts_recent_stamp)))) {
168  		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
169  		 * and releasing the bucket lock.
170  		 */
171  		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
172  			return 0;
173  
174  		/* In case of repair and re-using TIME-WAIT sockets we still
175  		 * want to be sure that it is safe as above but honor the
176  		 * sequence numbers and time stamps set as part of the repair
177  		 * process.
178  		 *
179  		 * Without this check re-using a TIME-WAIT socket with TCP
180  		 * repair would accumulate a -1 on the repair assigned
181  		 * sequence number. The first time it is reused the sequence
182  		 * is -1, the second time -2, etc. This fixes that issue
183  		 * without appearing to create any others.
184  		 */
185  		if (likely(!tp->repair)) {
186  			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
187  
188  			if (!seq)
189  				seq = 1;
190  			WRITE_ONCE(tp->write_seq, seq);
191  			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
192  			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
193  		}
194  
195  		return 1;
196  	}
197  
198  	return 0;
199  }
200  EXPORT_SYMBOL_GPL(tcp_twsk_unique);
201  
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
204  {
205  	/* This check is replicated from tcp_v4_connect() and intended to
206  	 * prevent BPF program called below from accessing bytes that are out
207  	 * of the bound specified by user in addr_len.
208  	 */
209  	if (addr_len < sizeof(struct sockaddr_in))
210  		return -EINVAL;
211  
212  	sock_owned_by_me(sk);
213  
214  	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
215  }
216  
217  /* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
219  {
220  	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
221  	struct inet_timewait_death_row *tcp_death_row;
222  	struct inet_sock *inet = inet_sk(sk);
223  	struct tcp_sock *tp = tcp_sk(sk);
224  	struct ip_options_rcu *inet_opt;
225  	struct net *net = sock_net(sk);
226  	__be16 orig_sport, orig_dport;
227  	__be32 daddr, nexthop;
228  	struct flowi4 *fl4;
229  	struct rtable *rt;
230  	int err;
231  
232  	if (addr_len < sizeof(struct sockaddr_in))
233  		return -EINVAL;
234  
235  	if (usin->sin_family != AF_INET)
236  		return -EAFNOSUPPORT;
237  
238  	nexthop = daddr = usin->sin_addr.s_addr;
239  	inet_opt = rcu_dereference_protected(inet->inet_opt,
240  					     lockdep_sock_is_held(sk));
241  	if (inet_opt && inet_opt->opt.srr) {
242  		if (!daddr)
243  			return -EINVAL;
244  		nexthop = inet_opt->opt.faddr;
245  	}
246  
247  	orig_sport = inet->inet_sport;
248  	orig_dport = usin->sin_port;
249  	fl4 = &inet->cork.fl.u.ip4;
250  	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
251  			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
252  			      orig_dport, sk);
253  	if (IS_ERR(rt)) {
254  		err = PTR_ERR(rt);
255  		if (err == -ENETUNREACH)
256  			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
257  		return err;
258  	}
259  
260  	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
261  		ip_rt_put(rt);
262  		return -ENETUNREACH;
263  	}
264  
265  	if (!inet_opt || !inet_opt->opt.srr)
266  		daddr = fl4->daddr;
267  
268  	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
269  
270  	if (!inet->inet_saddr) {
271  		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
272  		if (err) {
273  			ip_rt_put(rt);
274  			return err;
275  		}
276  	} else {
277  		sk_rcv_saddr_set(sk, inet->inet_saddr);
278  	}
279  
280  	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
281  		/* Reset inherited state */
282  		tp->rx_opt.ts_recent	   = 0;
283  		tp->rx_opt.ts_recent_stamp = 0;
284  		if (likely(!tp->repair))
285  			WRITE_ONCE(tp->write_seq, 0);
286  	}
287  
288  	inet->inet_dport = usin->sin_port;
289  	sk_daddr_set(sk, daddr);
290  
291  	inet_csk(sk)->icsk_ext_hdr_len = 0;
292  	if (inet_opt)
293  		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
294  
295  	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
296  
	/* Socket identity is still unknown (sport may be zero).
	 * However, we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete initialization after this.
	 */
302  	tcp_set_state(sk, TCP_SYN_SENT);
303  	err = inet_hash_connect(tcp_death_row, sk);
304  	if (err)
305  		goto failure;
306  
307  	sk_set_txhash(sk);
308  
309  	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
310  			       inet->inet_sport, inet->inet_dport, sk);
311  	if (IS_ERR(rt)) {
312  		err = PTR_ERR(rt);
313  		rt = NULL;
314  		goto failure;
315  	}
316  	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
317  	/* OK, now commit destination to socket.  */
318  	sk->sk_gso_type = SKB_GSO_TCPV4;
319  	sk_setup_caps(sk, &rt->dst);
320  	rt = NULL;
321  
322  	if (likely(!tp->repair)) {
323  		if (!tp->write_seq)
324  			WRITE_ONCE(tp->write_seq,
325  				   secure_tcp_seq(inet->inet_saddr,
326  						  inet->inet_daddr,
327  						  inet->inet_sport,
328  						  usin->sin_port));
329  		WRITE_ONCE(tp->tsoffset,
330  			   secure_tcp_ts_off(net, inet->inet_saddr,
331  					     inet->inet_daddr));
332  	}
333  
334  	atomic_set(&inet->inet_id, get_random_u16());
335  
336  	if (tcp_fastopen_defer_connect(sk, &err))
337  		return err;
338  	if (err)
339  		goto failure;
340  
341  	err = tcp_connect(sk);
342  
343  	if (err)
344  		goto failure;
345  
346  	return 0;
347  
348  failure:
349  	/*
350  	 * This unhashes the socket and releases the local port,
351  	 * if necessary.
352  	 */
353  	tcp_set_state(sk, TCP_CLOSE);
354  	inet_bhash2_reset_saddr(sk);
355  	ip_rt_put(rt);
356  	sk->sk_route_caps = 0;
357  	inet->inet_dport = 0;
358  	return err;
359  }
360  EXPORT_SYMBOL(tcp_v4_connect);
361  
362  /*
363   * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
364   * It can be called through tcp_release_cb() if socket was owned by user
365   * at the time tcp_v4_err() was called to handle ICMP message.
366   */
void tcp_v4_mtu_reduced(struct sock *sk)
368  {
369  	struct inet_sock *inet = inet_sk(sk);
370  	struct dst_entry *dst;
371  	u32 mtu;
372  
373  	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
374  		return;
375  	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
376  	dst = inet_csk_update_pmtu(sk, mtu);
377  	if (!dst)
378  		return;
379  
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
383  	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
384  		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
385  
386  	mtu = dst_mtu(dst);
387  
388  	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
389  	    ip_sk_accept_pmtu(sk) &&
390  	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
391  		tcp_sync_mss(sk, mtu);
392  
393  		/* Resend the TCP packet because it's
394  		 * clear that the old packet has been
395  		 * dropped. This is the new "fast" path mtu
396  		 * discovery.
397  		 */
398  		tcp_simple_retransmit(sk);
399  	} /* else let the usual retransmit timer handle it */
400  }
401  EXPORT_SYMBOL(tcp_v4_mtu_reduced);
402  
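/* Propagate an ICMP redirect to the route attached to the socket. */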
static void do_redirect(struct sk_buff *skb, struct sock *sk)
404  {
405  	struct dst_entry *dst = __sk_dst_check(sk, 0);
406  
407  	if (dst)
408  		dst->ops->redirect(dst, sk, skb);
409  }
410  
411  
412  /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
414  {
415  	struct request_sock *req = inet_reqsk(sk);
416  	struct net *net = sock_net(sk);
417  
418  	/* ICMPs are not backlogged, hence we cannot get
419  	 * an established socket here.
420  	 */
421  	if (seq != tcp_rsk(req)->snt_isn) {
422  		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
423  	} else if (abort) {
424  		/*
425  		 * Still in SYN_RECV, just remove it silently.
426  		 * There is no good way to pass the error to the newly
427  		 * created socket, and POSIX does not want network
428  		 * errors returned from accept().
429  		 */
430  		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
431  		tcp_listendrop(req->rsk_listener);
432  	}
433  	reqsk_put(req);
434  }
435  EXPORT_SYMBOL(tcp_req_err);
436  
437  /* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
439  {
440  	struct inet_connection_sock *icsk = inet_csk(sk);
441  	struct tcp_sock *tp = tcp_sk(sk);
442  	struct sk_buff *skb;
443  	s32 remaining;
444  	u32 delta_us;
445  
446  	if (sock_owned_by_user(sk))
447  		return;
448  
449  	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
450  	    !icsk->icsk_backoff)
451  		return;
452  
453  	skb = tcp_rtx_queue_head(sk);
454  	if (WARN_ON_ONCE(!skb))
455  		return;
456  
457  	icsk->icsk_backoff--;
458  	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
459  	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
460  
461  	tcp_mstamp_refresh(tp);
462  	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
463  	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
464  
465  	if (remaining > 0) {
466  		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
467  					  remaining, TCP_RTO_MAX);
468  	} else {
469  		/* RTO revert clocked out retransmission.
470  		 * Will retransmit now.
471  		 */
472  		tcp_retransmit_timer(sk);
473  	}
474  }
475  EXPORT_SYMBOL(tcp_ld_RTO_revert);
476  
477  /*
478   * This routine is called by the ICMP module when it gets some
479   * sort of error condition.  If err < 0 then the socket should
480   * be closed and the error returned to the user.  If err > 0
481   * it's just the icmp type << 8 | icmp code.  After adjustment
482   * header points to the first 8 bytes of the tcp header.  We need
483   * to find the appropriate port.
484   *
485   * The locking strategy used here is very "optimistic". When
486   * someone else accesses the socket the ICMP is just dropped
487   * and for some paths there is no check at all.
488   * A more general error queue to queue errors for later handling
489   * is probably better.
490   *
491   */
492  
int tcp_v4_err(struct sk_buff *skb, u32 info)
494  {
495  	const struct iphdr *iph = (const struct iphdr *)skb->data;
496  	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
497  	struct tcp_sock *tp;
498  	const int type = icmp_hdr(skb)->type;
499  	const int code = icmp_hdr(skb)->code;
500  	struct sock *sk;
501  	struct request_sock *fastopen;
502  	u32 seq, snd_una;
503  	int err;
504  	struct net *net = dev_net(skb->dev);
505  
506  	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
507  				       iph->daddr, th->dest, iph->saddr,
508  				       ntohs(th->source), inet_iif(skb), 0);
509  	if (!sk) {
510  		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
511  		return -ENOENT;
512  	}
513  	if (sk->sk_state == TCP_TIME_WAIT) {
514  		/* To increase the counter of ignored icmps for TCP-AO */
515  		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
516  		inet_twsk_put(inet_twsk(sk));
517  		return 0;
518  	}
519  	seq = ntohl(th->seq);
520  	if (sk->sk_state == TCP_NEW_SYN_RECV) {
521  		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
522  				     type == ICMP_TIME_EXCEEDED ||
523  				     (type == ICMP_DEST_UNREACH &&
524  				      (code == ICMP_NET_UNREACH ||
525  				       code == ICMP_HOST_UNREACH)));
526  		return 0;
527  	}
528  
529  	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
530  		sock_put(sk);
531  		return 0;
532  	}
533  
534  	bh_lock_sock(sk);
535  	/* If too many ICMPs get dropped on busy
536  	 * servers this needs to be solved differently.
537  	 * We do take care of PMTU discovery (RFC1191) special case :
538  	 * we can receive locally generated ICMP messages while socket is held.
539  	 */
540  	if (sock_owned_by_user(sk)) {
541  		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
542  			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
543  	}
544  	if (sk->sk_state == TCP_CLOSE)
545  		goto out;
546  
547  	if (static_branch_unlikely(&ip4_min_ttl)) {
548  		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
549  		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
550  			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
551  			goto out;
552  		}
553  	}
554  
555  	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
557  	fastopen = rcu_dereference(tp->fastopen_rsk);
558  	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
559  	if (sk->sk_state != TCP_LISTEN &&
560  	    !between(seq, snd_una, tp->snd_nxt)) {
561  		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
562  		goto out;
563  	}
564  
565  	switch (type) {
566  	case ICMP_REDIRECT:
567  		if (!sock_owned_by_user(sk))
568  			do_redirect(skb, sk);
569  		goto out;
570  	case ICMP_SOURCE_QUENCH:
571  		/* Just silently ignore these. */
572  		goto out;
573  	case ICMP_PARAMETERPROB:
574  		err = EPROTO;
575  		break;
576  	case ICMP_DEST_UNREACH:
577  		if (code > NR_ICMP_UNREACH)
578  			goto out;
579  
580  		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			 */
585  			if (sk->sk_state == TCP_LISTEN)
586  				goto out;
587  
588  			WRITE_ONCE(tp->mtu_info, info);
589  			if (!sock_owned_by_user(sk)) {
590  				tcp_v4_mtu_reduced(sk);
591  			} else {
592  				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
593  					sock_hold(sk);
594  			}
595  			goto out;
596  		}
597  
598  		err = icmp_err_convert[code].errno;
599  		/* check if this ICMP message allows revert of backoff.
600  		 * (see RFC 6069)
601  		 */
602  		if (!fastopen &&
603  		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
604  			tcp_ld_RTO_revert(sk, seq);
605  		break;
606  	case ICMP_TIME_EXCEEDED:
607  		err = EHOSTUNREACH;
608  		break;
609  	default:
610  		goto out;
611  	}
612  
613  	switch (sk->sk_state) {
614  	case TCP_SYN_SENT:
615  	case TCP_SYN_RECV:
616  		/* Only in fast or simultaneous open. If a fast open socket is
617  		 * already accepted it is treated as a connected one below.
618  		 */
619  		if (fastopen && !fastopen->sk)
620  			break;
621  
622  		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
623  
624  		if (!sock_owned_by_user(sk))
625  			tcp_done_with_error(sk, err);
626  		else
627  			WRITE_ONCE(sk->sk_err_soft, err);
628  		goto out;
629  	}
630  
631  	/* If we've already connected we will keep trying
632  	 * until we time out, or the user gives up.
633  	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
642  	 *
643  	 * Now we are in compliance with RFCs.
644  	 *							--ANK (980905)
645  	 */
646  
647  	if (!sock_owned_by_user(sk) &&
648  	    inet_test_bit(RECVERR, sk)) {
649  		WRITE_ONCE(sk->sk_err, err);
650  		sk_error_report(sk);
651  	} else	{ /* Only an error on timeout */
652  		WRITE_ONCE(sk->sk_err_soft, err);
653  	}
654  
655  out:
656  	bh_unlock_sock(sk);
657  	sock_put(sk);
658  	return 0;
659  }
660  
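/* Prepare an outgoing skb for checksum completion: seed th->check with
 * the pseudo-header sum and point csum_start/csum_offset at the TCP
 * checksum field.
 */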
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
662  {
663  	struct tcphdr *th = tcp_hdr(skb);
664  
665  	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
666  	skb->csum_start = skb_transport_header(skb) - skb->head;
667  	skb->csum_offset = offsetof(struct tcphdr, check);
668  }
669  
670  /* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
672  {
673  	const struct inet_sock *inet = inet_sk(sk);
674  
675  	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
676  }
677  EXPORT_SYMBOL(tcp_v4_send_check);
678  
679  #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
680  
static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
685  {
686  #ifdef CONFIG_TCP_AO
687  	int sdif = tcp_v4_sdif(skb);
688  	int dif = inet_iif(skb);
689  	int l3index = sdif ? dif : 0;
690  	bool allocated_traffic_key;
691  	struct tcp_ao_key *key;
692  	char *traffic_key;
693  	bool drop = true;
694  	u32 ao_sne = 0;
695  	u8 keyid;
696  
697  	rcu_read_lock();
698  	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
699  				 &key, &traffic_key, &allocated_traffic_key,
700  				 &keyid, &ao_sne))
701  		goto out;
702  
703  	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
704  				 (aoh->rnext_keyid << 8) | keyid);
705  	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
706  	reply->doff = arg->iov[0].iov_len / 4;
707  
708  	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
709  			    key, traffic_key,
710  			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
711  			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
712  			    reply, ao_sne))
713  		goto out;
714  	drop = false;
715  out:
716  	rcu_read_unlock();
717  	if (allocated_traffic_key)
718  		kfree(traffic_key);
719  	return drop;
720  #else
721  	return true;
722  #endif
723  }
724  
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.  So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
737  
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
740  {
741  	const struct tcphdr *th = tcp_hdr(skb);
742  	struct {
743  		struct tcphdr th;
744  		__be32 opt[REPLY_OPTIONS_LEN];
745  	} rep;
746  	const __u8 *md5_hash_location = NULL;
747  	const struct tcp_ao_hdr *aoh;
748  	struct ip_reply_arg arg;
749  #ifdef CONFIG_TCP_MD5SIG
750  	struct tcp_md5sig_key *key = NULL;
751  	unsigned char newhash[16];
752  	struct sock *sk1 = NULL;
753  	int genhash;
754  #endif
755  	u64 transmit_time = 0;
756  	struct sock *ctl_sk;
757  	struct net *net;
758  	u32 txhash = 0;
759  
760  	/* Never send a reset in response to a reset. */
761  	if (th->rst)
762  		return;
763  
764  	/* If sk not NULL, it means we did a successful lookup and incoming
765  	 * route had to be correct. prequeue might have dropped our dst.
766  	 */
767  	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
768  		return;
769  
770  	/* Swap the send and the receive. */
771  	memset(&rep, 0, sizeof(rep));
772  	rep.th.dest   = th->source;
773  	rep.th.source = th->dest;
774  	rep.th.doff   = sizeof(struct tcphdr) / 4;
775  	rep.th.rst    = 1;
776  
777  	if (th->ack) {
778  		rep.th.seq = th->ack_seq;
779  	} else {
780  		rep.th.ack = 1;
781  		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
782  				       skb->len - (th->doff << 2));
783  	}
784  
785  	memset(&arg, 0, sizeof(arg));
786  	arg.iov[0].iov_base = (unsigned char *)&rep;
787  	arg.iov[0].iov_len  = sizeof(rep.th);
788  
789  	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
790  
791  	/* Invalid TCP option size or twice included auth */
792  	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
793  		return;
794  
795  	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
796  		return;
797  
798  #ifdef CONFIG_TCP_MD5SIG
799  	rcu_read_lock();
800  	if (sk && sk_fullsock(sk)) {
801  		const union tcp_md5_addr *addr;
802  		int l3index;
803  
804  		/* sdif set, means packet ingressed via a device
805  		 * in an L3 domain and inet_iif is set to it.
806  		 */
807  		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
808  		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
809  		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
810  	} else if (md5_hash_location) {
811  		const union tcp_md5_addr *addr;
812  		int sdif = tcp_v4_sdif(skb);
813  		int dif = inet_iif(skb);
814  		int l3index;
815  
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket.  We do not loosen security here:
		 * the incoming packet is checked against the md5 hash of the
		 * found key, and no RST is generated if the hash doesn't match.
		 */
823  		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
824  					     NULL, 0, ip_hdr(skb)->saddr,
825  					     th->source, ip_hdr(skb)->daddr,
826  					     ntohs(th->source), dif, sdif);
827  		/* don't send rst if it can't find key */
828  		if (!sk1)
829  			goto out;
830  
831  		/* sdif set, means packet ingressed via a device
832  		 * in an L3 domain and dif is set to it.
833  		 */
834  		l3index = sdif ? dif : 0;
835  		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
836  		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
837  		if (!key)
838  			goto out;
839  
840  
841  		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
842  		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
843  			goto out;
844  
845  	}
846  
847  	if (key) {
848  		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
849  				   (TCPOPT_NOP << 16) |
850  				   (TCPOPT_MD5SIG << 8) |
851  				   TCPOLEN_MD5SIG);
852  		/* Update length and the length the header thinks exists */
853  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
854  		rep.th.doff = arg.iov[0].iov_len / 4;
855  
856  		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
857  				     key, ip_hdr(skb)->saddr,
858  				     ip_hdr(skb)->daddr, &rep.th);
859  	}
860  #endif
861  	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
862  	if (rep.opt[0] == 0) {
863  		__be32 mrst = mptcp_reset_option(skb);
864  
865  		if (mrst) {
866  			rep.opt[0] = mrst;
867  			arg.iov[0].iov_len += sizeof(mrst);
868  			rep.th.doff = arg.iov[0].iov_len / 4;
869  		}
870  	}
871  
872  	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
873  				      ip_hdr(skb)->saddr, /* XXX */
874  				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
875  	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
876  	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
877  
	/* When the socket is gone, all binding information is lost and
	 * routing might fail.  No choice here: if we choose to force the
	 * input interface, we will misroute in case of an asymmetric route.
	 */
882  	if (sk)
883  		arg.bound_dev_if = sk->sk_bound_dev_if;
884  
885  	trace_tcp_send_reset(sk, skb, reason);
886  
887  	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
888  		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
889  
890  	arg.tos = ip_hdr(skb)->tos;
891  	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
892  	local_bh_disable();
893  	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
894  	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
895  
896  	sock_net_set(ctl_sk, net);
897  	if (sk) {
898  		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
899  				   inet_twsk(sk)->tw_mark : sk->sk_mark;
900  		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
901  				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
902  		transmit_time = tcp_transmit_time(sk);
903  		xfrm_sk_clone_policy(ctl_sk, sk);
904  		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
905  			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
906  	} else {
907  		ctl_sk->sk_mark = 0;
908  		ctl_sk->sk_priority = 0;
909  	}
910  	ip_send_unicast_reply(ctl_sk,
911  			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
912  			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
913  			      &arg, arg.iov[0].iov_len,
914  			      transmit_time, txhash);
915  
916  	xfrm_sk_free_policy(ctl_sk);
917  	sock_net_set(ctl_sk, &init_net);
918  	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
919  	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
920  	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
921  	local_bh_enable();
922  
923  #ifdef CONFIG_TCP_MD5SIG
924  out:
925  	rcu_read_unlock();
926  #endif
927  }
928  
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is certainly ugly. What can I do?
 */
932  
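/* Build and send a bare ACK (optionally carrying timestamps and an MD5
 * or TCP-AO signature) from the per-CPU control socket, outside of full
 * socket context.
 */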
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
938  {
939  	const struct tcphdr *th = tcp_hdr(skb);
940  	struct {
941  		struct tcphdr th;
942  		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
943  	} rep;
944  	struct net *net = sock_net(sk);
945  	struct ip_reply_arg arg;
946  	struct sock *ctl_sk;
947  	u64 transmit_time;
948  
949  	memset(&rep.th, 0, sizeof(struct tcphdr));
950  	memset(&arg, 0, sizeof(arg));
951  
952  	arg.iov[0].iov_base = (unsigned char *)&rep;
953  	arg.iov[0].iov_len  = sizeof(rep.th);
954  	if (tsecr) {
955  		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
956  				   (TCPOPT_TIMESTAMP << 8) |
957  				   TCPOLEN_TIMESTAMP);
958  		rep.opt[1] = htonl(tsval);
959  		rep.opt[2] = htonl(tsecr);
960  		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
961  	}
962  
963  	/* Swap the send and the receive. */
964  	rep.th.dest    = th->source;
965  	rep.th.source  = th->dest;
966  	rep.th.doff    = arg.iov[0].iov_len / 4;
967  	rep.th.seq     = htonl(seq);
968  	rep.th.ack_seq = htonl(ack);
969  	rep.th.ack     = 1;
970  	rep.th.window  = htons(win);
971  
972  #ifdef CONFIG_TCP_MD5SIG
973  	if (tcp_key_is_md5(key)) {
974  		int offset = (tsecr) ? 3 : 0;
975  
976  		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
977  					  (TCPOPT_NOP << 16) |
978  					  (TCPOPT_MD5SIG << 8) |
979  					  TCPOLEN_MD5SIG);
980  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
981  		rep.th.doff = arg.iov[0].iov_len/4;
982  
983  		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
984  				    key->md5_key, ip_hdr(skb)->saddr,
985  				    ip_hdr(skb)->daddr, &rep.th);
986  	}
987  #endif
988  #ifdef CONFIG_TCP_AO
989  	if (tcp_key_is_ao(key)) {
990  		int offset = (tsecr) ? 3 : 0;
991  
992  		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
993  					  (tcp_ao_len(key->ao_key) << 16) |
994  					  (key->ao_key->sndid << 8) |
995  					  key->rcv_next);
996  		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
997  		rep.th.doff = arg.iov[0].iov_len / 4;
998  
999  		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1000  				key->ao_key, key->traffic_key,
1001  				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1002  				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1003  				&rep.th, key->sne);
1004  	}
1005  #endif
1006  	arg.flags = reply_flags;
1007  	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1008  				      ip_hdr(skb)->saddr, /* XXX */
1009  				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1010  	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1011  	if (oif)
1012  		arg.bound_dev_if = oif;
1013  	arg.tos = tos;
1014  	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1015  	local_bh_disable();
1016  	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1017  	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1018  	sock_net_set(ctl_sk, net);
1019  	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1020  			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1021  	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1022  			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1023  	transmit_time = tcp_transmit_time(sk);
1024  	ip_send_unicast_reply(ctl_sk,
1025  			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1026  			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1027  			      &arg, arg.iov[0].iov_len,
1028  			      transmit_time, txhash);
1029  
1030  	sock_net_set(ctl_sk, &init_net);
1031  	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1032  	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1033  	local_bh_enable();
1034  }
1035  
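/* ACK a segment received for a TIME-WAIT socket, then drop the timewait
 * reference.
 */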
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1037  {
1038  	struct inet_timewait_sock *tw = inet_twsk(sk);
1039  	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1040  	struct tcp_key key = {};
1041  #ifdef CONFIG_TCP_AO
1042  	struct tcp_ao_info *ao_info;
1043  
1044  	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1045  		/* FIXME: the segment to-be-acked is not verified yet */
1046  		ao_info = rcu_dereference(tcptw->ao_info);
1047  		if (ao_info) {
1048  			const struct tcp_ao_hdr *aoh;
1049  
1050  			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1051  				inet_twsk_put(tw);
1052  				return;
1053  			}
1054  
1055  			if (aoh)
1056  				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1057  		}
1058  	}
1059  	if (key.ao_key) {
1060  		struct tcp_ao_key *rnext_key;
1061  
1062  		key.traffic_key = snd_other_key(key.ao_key);
1063  		key.sne = READ_ONCE(ao_info->snd_sne);
1064  		rnext_key = READ_ONCE(ao_info->rnext_key);
1065  		key.rcv_next = rnext_key->rcvid;
1066  		key.type = TCP_KEY_AO;
1067  #else
1068  	if (0) {
1069  #endif
1070  	} else if (static_branch_tcp_md5()) {
1071  		key.md5_key = tcp_twsk_md5_key(tcptw);
1072  		if (key.md5_key)
1073  			key.type = TCP_KEY_MD5;
1074  	}
1075  
1076  	tcp_v4_send_ack(sk, skb,
1077  			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1078  			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1079  			tcp_tw_tsval(tcptw),
1080  			READ_ONCE(tcptw->tw_ts_recent),
1081  			tw->tw_bound_dev_if, &key,
1082  			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1083  			tw->tw_tos,
1084  			tw->tw_txhash);
1085  
1086  	inet_twsk_put(tw);
1087  }
1088  
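/* ACK on behalf of a request socket (SYN-RECV), picking an MD5 or
 * TCP-AO key matching the peer when one is configured.
 */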
1089  static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1090  				  struct request_sock *req)
1091  {
1092  	struct tcp_key key = {};
1093  
1094  	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1095  	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1096  	 */
1097  	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1098  					     tcp_sk(sk)->snd_nxt;
1099  
1100  #ifdef CONFIG_TCP_AO
1101  	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1102  	    tcp_rsk_used_ao(req)) {
1103  		const union tcp_md5_addr *addr;
1104  		const struct tcp_ao_hdr *aoh;
1105  		int l3index;
1106  
1107  		/* Invalid TCP option size or twice included auth */
1108  		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1109  			return;
1110  		if (!aoh)
1111  			return;
1112  
1113  		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1114  		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1115  		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1116  					      aoh->rnext_keyid, -1);
1117  		if (unlikely(!key.ao_key)) {
1118  			/* Send ACK with any matching MKT for the peer */
1119  			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?);
			 * let the handshake time out.
			 */
1123  			if (!key.ao_key) {
1124  				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1125  						     addr,
1126  						     ntohs(tcp_hdr(skb)->source),
1127  						     &ip_hdr(skb)->daddr,
1128  						     ntohs(tcp_hdr(skb)->dest));
1129  				return;
1130  			}
1131  		}
1132  		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1133  		if (!key.traffic_key)
1134  			return;
1135  
1136  		key.type = TCP_KEY_AO;
1137  		key.rcv_next = aoh->keyid;
1138  		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1139  #else
1140  	if (0) {
1141  #endif
1142  	} else if (static_branch_tcp_md5()) {
1143  		const union tcp_md5_addr *addr;
1144  		int l3index;
1145  
1146  		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1147  		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1148  		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1149  		if (key.md5_key)
1150  			key.type = TCP_KEY_MD5;
1151  	}
1152  
1153  	tcp_v4_send_ack(sk, skb, seq,
1154  			tcp_rsk(req)->rcv_nxt,
1155  			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1156  			tcp_rsk_tsval(tcp_rsk(req)),
1157  			READ_ONCE(req->ts_recent),
1158  			0, &key,
1159  			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1160  			ip_hdr(skb)->tos,
1161  			READ_ONCE(tcp_rsk(req)->txhash));
1162  	if (tcp_key_is_ao(&key))
1163  		kfree(key.traffic_key);
1164  }
1165  
1166  /*
1167   *	Send a SYN-ACK after having received a SYN.
1168   *	This still operates on a request_sock only, not on a big
1169   *	socket.
1170   */
1171  static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1172  			      struct flowi *fl,
1173  			      struct request_sock *req,
1174  			      struct tcp_fastopen_cookie *foc,
1175  			      enum tcp_synack_type synack_type,
1176  			      struct sk_buff *syn_skb)
1177  {
1178  	const struct inet_request_sock *ireq = inet_rsk(req);
1179  	struct flowi4 fl4;
1180  	int err = -1;
1181  	struct sk_buff *skb;
1182  	u8 tos;
1183  
1184  	/* First, grab a route. */
1185  	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1186  		return -1;
1187  
1188  	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1189  
1190  	if (skb) {
1191  		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1192  
1193  		tos = READ_ONCE(inet_sk(sk)->tos);
1194  
1195  		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1196  			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1197  			      (tos & INET_ECN_MASK);
1198  
1199  		if (!INET_ECN_is_capable(tos) &&
1200  		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1201  			tos |= INET_ECN_ECT_0;
1202  
1203  		rcu_read_lock();
1204  		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1205  					    ireq->ir_rmt_addr,
1206  					    rcu_dereference(ireq->ireq_opt),
1207  					    tos);
1208  		rcu_read_unlock();
1209  		err = net_xmit_eval(err);
1210  	}
1211  
1212  	return err;
1213  }
1214  
1215  /*
1216   *	IPv4 request_sock destructor.
1217   */
1218  static void tcp_v4_reqsk_destructor(struct request_sock *req)
1219  {
1220  	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1221  }
1222  
1223  #ifdef CONFIG_TCP_MD5SIG
1224  /*
1225   * RFC2385 MD5 checksumming requires a mapping of
1226   * IP address->MD5 Key.
1227   * We need to maintain these in the sk structure.
1228   */
1229  
1230  DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1231  EXPORT_SYMBOL(tcp_md5_needed);
1232  
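/* Prefer keys bound to an L3 domain over unbound ones, then the key
 * with the longest matching prefix.
 */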
1233  static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1234  {
1235  	if (!old)
1236  		return true;
1237  
1238  	/* l3index always overrides non-l3index */
1239  	if (old->l3index && new->l3index == 0)
1240  		return false;
1241  	if (old->l3index == 0 && new->l3index)
1242  		return true;
1243  
1244  	return old->prefixlen < new->prefixlen;
1245  }
1246  
1247  /* Find the Key structure for an address.  */
1248  struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1249  					   const union tcp_md5_addr *addr,
1250  					   int family, bool any_l3index)
1251  {
1252  	const struct tcp_sock *tp = tcp_sk(sk);
1253  	struct tcp_md5sig_key *key;
1254  	const struct tcp_md5sig_info *md5sig;
1255  	__be32 mask;
1256  	struct tcp_md5sig_key *best_match = NULL;
1257  	bool match;
1258  
1259  	/* caller either holds rcu_read_lock() or socket lock */
1260  	md5sig = rcu_dereference_check(tp->md5sig_info,
1261  				       lockdep_sock_is_held(sk));
1262  	if (!md5sig)
1263  		return NULL;
1264  
1265  	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1266  				 lockdep_sock_is_held(sk)) {
1267  		if (key->family != family)
1268  			continue;
1269  		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1270  		    key->l3index != l3index)
1271  			continue;
1272  		if (family == AF_INET) {
1273  			mask = inet_make_mask(key->prefixlen);
1274  			match = (key->addr.a4.s_addr & mask) ==
1275  				(addr->a4.s_addr & mask);
1276  #if IS_ENABLED(CONFIG_IPV6)
1277  		} else if (family == AF_INET6) {
1278  			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1279  						  key->prefixlen);
1280  #endif
1281  		} else {
1282  			match = false;
1283  		}
1284  
1285  		if (match && better_md5_match(best_match, key))
1286  			best_match = key;
1287  	}
1288  	return best_match;
1289  }
1290  EXPORT_SYMBOL(__tcp_md5_do_lookup);
1291  
1292  static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1293  						      const union tcp_md5_addr *addr,
1294  						      int family, u8 prefixlen,
1295  						      int l3index, u8 flags)
1296  {
1297  	const struct tcp_sock *tp = tcp_sk(sk);
1298  	struct tcp_md5sig_key *key;
1299  	unsigned int size = sizeof(struct in_addr);
1300  	const struct tcp_md5sig_info *md5sig;
1301  
1302  	/* caller either holds rcu_read_lock() or socket lock */
1303  	md5sig = rcu_dereference_check(tp->md5sig_info,
1304  				       lockdep_sock_is_held(sk));
1305  	if (!md5sig)
1306  		return NULL;
1307  #if IS_ENABLED(CONFIG_IPV6)
1308  	if (family == AF_INET6)
1309  		size = sizeof(struct in6_addr);
1310  #endif
1311  	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1312  				 lockdep_sock_is_held(sk)) {
1313  		if (key->family != family)
1314  			continue;
1315  		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1316  			continue;
1317  		if (key->l3index != l3index)
1318  			continue;
1319  		if (!memcmp(&key->addr, addr, size) &&
1320  		    key->prefixlen == prefixlen)
1321  			return key;
1322  	}
1323  	return NULL;
1324  }
1325  
1326  struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1327  					 const struct sock *addr_sk)
1328  {
1329  	const union tcp_md5_addr *addr;
1330  	int l3index;
1331  
1332  	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1333  						 addr_sk->sk_bound_dev_if);
1334  	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1335  	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1336  }
1337  EXPORT_SYMBOL(tcp_v4_md5_lookup);
1338  
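/* Lazily allocate the per-socket MD5 key list.  Also disables GSO,
 * since segmentation offload cannot compute per-segment signatures.
 */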
1339  static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1340  {
1341  	struct tcp_sock *tp = tcp_sk(sk);
1342  	struct tcp_md5sig_info *md5sig;
1343  
1344  	md5sig = kmalloc(sizeof(*md5sig), gfp);
1345  	if (!md5sig)
1346  		return -ENOMEM;
1347  
1348  	sk_gso_disable(sk);
1349  	INIT_HLIST_HEAD(&md5sig->head);
1350  	rcu_assign_pointer(tp->md5sig_info, md5sig);
1351  	return 0;
1352  }
1353  
1354  /* This can be called on a newly created socket, from other files */
1355  static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1356  			    int family, u8 prefixlen, int l3index, u8 flags,
1357  			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1358  {
1359  	/* Add Key to the list */
1360  	struct tcp_md5sig_key *key;
1361  	struct tcp_sock *tp = tcp_sk(sk);
1362  	struct tcp_md5sig_info *md5sig;
1363  
1364  	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1365  	if (key) {
1366  		/* Pre-existing entry - just update that one.
1367  		 * Note that the key might be used concurrently.
1368  		 * data_race() is telling kcsan that we do not care of
1369  		 * key mismatches, since changing MD5 key on live flows
1370  		 * can lead to packet drops.
1371  		 */
1372  		data_race(memcpy(key->key, newkey, newkeylen));
1373  
1374  		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1375  		 * Also note that a reader could catch new key->keylen value
1376  		 * but old key->key[], this is the reason we use __GFP_ZERO
1377  		 * at sock_kmalloc() time below these lines.
1378  		 */
1379  		WRITE_ONCE(key->keylen, newkeylen);
1380  
1381  		return 0;
1382  	}
1383  
1384  	md5sig = rcu_dereference_protected(tp->md5sig_info,
1385  					   lockdep_sock_is_held(sk));
1386  
1387  	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1388  	if (!key)
1389  		return -ENOMEM;
1390  
1391  	memcpy(key->key, newkey, newkeylen);
1392  	key->keylen = newkeylen;
1393  	key->family = family;
1394  	key->prefixlen = prefixlen;
1395  	key->l3index = l3index;
1396  	key->flags = flags;
1397  	memcpy(&key->addr, addr,
1398  	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1399  								 sizeof(struct in_addr));
1400  	hlist_add_head_rcu(&key->node, &md5sig->head);
1401  	return 0;
1402  }
1403  
1404  int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1405  		   int family, u8 prefixlen, int l3index, u8 flags,
1406  		   const u8 *newkey, u8 newkeylen)
1407  {
1408  	struct tcp_sock *tp = tcp_sk(sk);
1409  
1410  	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1411  		if (tcp_md5_alloc_sigpool())
1412  			return -ENOMEM;
1413  
1414  		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1415  			tcp_md5_release_sigpool();
1416  			return -ENOMEM;
1417  		}
1418  
1419  		if (!static_branch_inc(&tcp_md5_needed.key)) {
1420  			struct tcp_md5sig_info *md5sig;
1421  
1422  			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1423  			rcu_assign_pointer(tp->md5sig_info, NULL);
1424  			kfree_rcu(md5sig, rcu);
1425  			tcp_md5_release_sigpool();
1426  			return -EUSERS;
1427  		}
1428  	}
1429  
1430  	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1431  				newkey, newkeylen, GFP_KERNEL);
1432  }
1433  EXPORT_SYMBOL(tcp_md5_do_add);
1434  
1435  int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1436  		     int family, u8 prefixlen, int l3index,
1437  		     struct tcp_md5sig_key *key)
1438  {
1439  	struct tcp_sock *tp = tcp_sk(sk);
1440  
1441  	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1442  		tcp_md5_add_sigpool();
1443  
1444  		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1445  			tcp_md5_release_sigpool();
1446  			return -ENOMEM;
1447  		}
1448  
1449  		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1450  			struct tcp_md5sig_info *md5sig;
1451  
1452  			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1453  			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1454  			rcu_assign_pointer(tp->md5sig_info, NULL);
1455  			kfree_rcu(md5sig, rcu);
1456  			tcp_md5_release_sigpool();
1457  			return -EUSERS;
1458  		}
1459  	}
1460  
1461  	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1462  				key->flags, key->key, key->keylen,
1463  				sk_gfp_mask(sk, GFP_ATOMIC));
1464  }
1465  EXPORT_SYMBOL(tcp_md5_key_copy);
1466  
1467  int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1468  		   u8 prefixlen, int l3index, u8 flags)
1469  {
1470  	struct tcp_md5sig_key *key;
1471  
1472  	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1473  	if (!key)
1474  		return -ENOENT;
1475  	hlist_del_rcu(&key->node);
1476  	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1477  	kfree_rcu(key, rcu);
1478  	return 0;
1479  }
1480  EXPORT_SYMBOL(tcp_md5_do_del);
1481  
1482  void tcp_clear_md5_list(struct sock *sk)
1483  {
1484  	struct tcp_sock *tp = tcp_sk(sk);
1485  	struct tcp_md5sig_key *key;
1486  	struct hlist_node *n;
1487  	struct tcp_md5sig_info *md5sig;
1488  
1489  	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1490  
1491  	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1492  		hlist_del_rcu(&key->node);
1493  		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1494  		kfree_rcu(key, rcu);
1495  	}
1496  }
1497  
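/* TCP_MD5SIG / TCP_MD5SIG_EXT setsockopt() handler: validate the user
 * request and add or delete the corresponding key.
 */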
1498  static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1499  				 sockptr_t optval, int optlen)
1500  {
1501  	struct tcp_md5sig cmd;
1502  	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1503  	const union tcp_md5_addr *addr;
1504  	u8 prefixlen = 32;
1505  	int l3index = 0;
1506  	bool l3flag;
1507  	u8 flags;
1508  
1509  	if (optlen < sizeof(cmd))
1510  		return -EINVAL;
1511  
1512  	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1513  		return -EFAULT;
1514  
1515  	if (sin->sin_family != AF_INET)
1516  		return -EINVAL;
1517  
1518  	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1519  	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1520  
1521  	if (optname == TCP_MD5SIG_EXT &&
1522  	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1523  		prefixlen = cmd.tcpm_prefixlen;
1524  		if (prefixlen > 32)
1525  			return -EINVAL;
1526  	}
1527  
1528  	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1529  	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1530  		struct net_device *dev;
1531  
1532  		rcu_read_lock();
1533  		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1534  		if (dev && netif_is_l3_master(dev))
1535  			l3index = dev->ifindex;
1536  
1537  		rcu_read_unlock();
1538  
1539  		/* ok to reference set/not set outside of rcu;
1540  		 * right now device MUST be an L3 master
1541  		 */
1542  		if (!dev || !l3index)
1543  			return -EINVAL;
1544  	}
1545  
1546  	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1547  
1548  	if (!cmd.tcpm_keylen)
1549  		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1550  
1551  	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1552  		return -EINVAL;
1553  
1554  	/* Don't allow keys for peers that have a matching TCP-AO key.
1555  	 * See the comment in tcp_ao_add_cmd()
1556  	 */
1557  	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1558  		return -EKEYREJECTED;
1559  
1560  	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1561  			      cmd.tcpm_key, cmd.tcpm_keylen);
1562  }
1563  
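/* Feed the IPv4 pseudo-header and the TCP header (with its checksum
 * field zeroed) into the hash, as RFC 2385 requires.
 */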
1564  static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1565  				   __be32 daddr, __be32 saddr,
1566  				   const struct tcphdr *th, int nbytes)
1567  {
1568  	struct tcp4_pseudohdr *bp;
1569  	struct scatterlist sg;
1570  	struct tcphdr *_th;
1571  
1572  	bp = hp->scratch;
1573  	bp->saddr = saddr;
1574  	bp->daddr = daddr;
1575  	bp->pad = 0;
1576  	bp->protocol = IPPROTO_TCP;
1577  	bp->len = cpu_to_be16(nbytes);
1578  
1579  	_th = (struct tcphdr *)(bp + 1);
1580  	memcpy(_th, th, sizeof(*th));
1581  	_th->check = 0;
1582  
1583  	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1584  	ahash_request_set_crypt(hp->req, &sg, NULL,
1585  				sizeof(*bp) + sizeof(*th));
1586  	return crypto_ahash_update(hp->req);
1587  }
1588  
1589  static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1590  			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1591  {
1592  	struct tcp_sigpool hp;
1593  
1594  	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1595  		goto clear_hash_nostart;
1596  
1597  	if (crypto_ahash_init(hp.req))
1598  		goto clear_hash;
1599  	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1600  		goto clear_hash;
1601  	if (tcp_md5_hash_key(&hp, key))
1602  		goto clear_hash;
1603  	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1604  	if (crypto_ahash_final(hp.req))
1605  		goto clear_hash;
1606  
1607  	tcp_sigpool_end(&hp);
1608  	return 0;
1609  
1610  clear_hash:
1611  	tcp_sigpool_end(&hp);
1612  clear_hash_nostart:
1613  	memset(md5_hash, 0, 16);
1614  	return 1;
1615  }
1616  
1617  int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1618  			const struct sock *sk,
1619  			const struct sk_buff *skb)
1620  {
1621  	const struct tcphdr *th = tcp_hdr(skb);
1622  	struct tcp_sigpool hp;
1623  	__be32 saddr, daddr;
1624  
1625  	if (sk) { /* valid for establish/request sockets */
1626  		saddr = sk->sk_rcv_saddr;
1627  		daddr = sk->sk_daddr;
1628  	} else {
1629  		const struct iphdr *iph = ip_hdr(skb);
1630  		saddr = iph->saddr;
1631  		daddr = iph->daddr;
1632  	}
1633  
1634  	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1635  		goto clear_hash_nostart;
1636  
1637  	if (crypto_ahash_init(hp.req))
1638  		goto clear_hash;
1639  
1640  	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1641  		goto clear_hash;
1642  	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1643  		goto clear_hash;
1644  	if (tcp_md5_hash_key(&hp, key))
1645  		goto clear_hash;
1646  	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1647  	if (crypto_ahash_final(hp.req))
1648  		goto clear_hash;
1649  
1650  	tcp_sigpool_end(&hp);
1651  	return 0;
1652  
1653  clear_hash:
1654  	tcp_sigpool_end(&hp);
1655  clear_hash_nostart:
1656  	memset(md5_hash, 0, 16);
1657  	return 1;
1658  }
1659  EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1660  
1661  #endif
1662  
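/* Fill in the IPv4-specific parts of a freshly minted request sock:
 * addresses swapped from the incoming SYN, plus any IP options.
 */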
1663  static void tcp_v4_init_req(struct request_sock *req,
1664  			    const struct sock *sk_listener,
1665  			    struct sk_buff *skb)
1666  {
1667  	struct inet_request_sock *ireq = inet_rsk(req);
1668  	struct net *net = sock_net(sk_listener);
1669  
1670  	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1671  	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1672  	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1673  }
1674  
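/* Initialise the request sock and look up a route for the SYN-ACK,
 * unless the security hook vetoes the connection.
 */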
1675  static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1676  					  struct sk_buff *skb,
1677  					  struct flowi *fl,
1678  					  struct request_sock *req,
1679  					  u32 tw_isn)
1680  {
1681  	tcp_v4_init_req(req, sk, skb);
1682  
1683  	if (security_inet_conn_request(sk, skb, req))
1684  		return NULL;
1685  
1686  	return inet_csk_route_req(sk, &fl->u.ip4, req);
1687  }
1688  
1689  struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1690  	.family		=	PF_INET,
1691  	.obj_size	=	sizeof(struct tcp_request_sock),
1692  	.rtx_syn_ack	=	tcp_rtx_synack,
1693  	.send_ack	=	tcp_v4_reqsk_send_ack,
1694  	.destructor	=	tcp_v4_reqsk_destructor,
1695  	.send_reset	=	tcp_v4_send_reset,
1696  	.syn_ack_timeout =	tcp_syn_ack_timeout,
1697  };
1698  
1699  const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1700  	.mss_clamp	=	TCP_MSS_DEFAULT,
1701  #ifdef CONFIG_TCP_MD5SIG
1702  	.req_md5_lookup	=	tcp_v4_md5_lookup,
1703  	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1704  #endif
1705  #ifdef CONFIG_TCP_AO
1706  	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1707  	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1708  	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1709  #endif
1710  #ifdef CONFIG_SYN_COOKIES
1711  	.cookie_init_seq =	cookie_v4_init_sequence,
1712  #endif
1713  	.route_req	=	tcp_v4_route_req,
1714  	.init_seq	=	tcp_v4_init_seq,
1715  	.init_ts_off	=	tcp_v4_init_ts_off,
1716  	.send_synack	=	tcp_v4_send_synack,
1717  };
1718  
1719  int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1720  {
	/* Never answer SYNs sent to broadcast or multicast addresses */
1722  	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1723  		goto drop;
1724  
1725  	return tcp_conn_request(&tcp_request_sock_ops,
1726  				&tcp_request_sock_ipv4_ops, sk, skb);
1727  
1728  drop:
1729  	tcp_listendrop(sk);
1730  	return 0;
1731  }
1732  EXPORT_SYMBOL(tcp_v4_conn_request);
1733  
1735  /*
1736   * The three way handshake has completed - we got a valid synack -
1737   * now create the new socket.
1738   */
1739  struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1740  				  struct request_sock *req,
1741  				  struct dst_entry *dst,
1742  				  struct request_sock *req_unhash,
1743  				  bool *own_req)
1744  {
1745  	struct inet_request_sock *ireq;
1746  	bool found_dup_sk = false;
1747  	struct inet_sock *newinet;
1748  	struct tcp_sock *newtp;
1749  	struct sock *newsk;
1750  #ifdef CONFIG_TCP_MD5SIG
1751  	const union tcp_md5_addr *addr;
1752  	struct tcp_md5sig_key *key;
1753  	int l3index;
1754  #endif
1755  	struct ip_options_rcu *inet_opt;
1756  
1757  	if (sk_acceptq_is_full(sk))
1758  		goto exit_overflow;
1759  
1760  	newsk = tcp_create_openreq_child(sk, req, skb);
1761  	if (!newsk)
1762  		goto exit_nonewsk;
1763  
1764  	newsk->sk_gso_type = SKB_GSO_TCPV4;
1765  	inet_sk_rx_dst_set(newsk, skb);
1766  
1767  	newtp		      = tcp_sk(newsk);
1768  	newinet		      = inet_sk(newsk);
1769  	ireq		      = inet_rsk(req);
1770  	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1771  	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1772  	newsk->sk_bound_dev_if = ireq->ir_iif;
1773  	newinet->inet_saddr   = ireq->ir_loc_addr;
1774  	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1775  	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1776  	newinet->mc_index     = inet_iif(skb);
1777  	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1778  	newinet->rcv_tos      = ip_hdr(skb)->tos;
1779  	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1780  	if (inet_opt)
1781  		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1782  	atomic_set(&newinet->inet_id, get_random_u16());
1783  
	/* Set ToS of the new socket based upon the value of the incoming SYN.
1785  	 * ECT bits are set later in tcp_init_transfer().
1786  	 */
1787  	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1788  		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1789  
1790  	if (!dst) {
1791  		dst = inet_csk_route_child_sock(sk, newsk, req);
1792  		if (!dst)
1793  			goto put_and_exit;
1794  	} else {
1795  		/* syncookie case : see end of cookie_v4_check() */
1796  	}
1797  	sk_setup_caps(newsk, dst);
1798  
1799  	tcp_ca_openreq_child(newsk, dst);
1800  
1801  	tcp_sync_mss(newsk, dst_mtu(dst));
1802  	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1803  
1804  	tcp_initialize_rcv_mss(newsk);
1805  
1806  #ifdef CONFIG_TCP_MD5SIG
1807  	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1808  	/* Copy over the MD5 key from the original socket */
1809  	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1810  	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1811  	if (key && !tcp_rsk_used_ao(req)) {
1812  		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1813  			goto put_and_exit;
1814  		sk_gso_disable(newsk);
1815  	}
1816  #endif
1817  #ifdef CONFIG_TCP_AO
1818  	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1819  		goto put_and_exit; /* OOM, release back memory */
1820  #endif
1821  
1822  	if (__inet_inherit_port(sk, newsk) < 0)
1823  		goto put_and_exit;
1824  	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1825  				       &found_dup_sk);
1826  	if (likely(*own_req)) {
1827  		tcp_move_syn(newtp, req);
1828  		ireq->ireq_opt = NULL;
1829  	} else {
1830  		newinet->inet_opt = NULL;
1831  
1832  		if (!req_unhash && found_dup_sk) {
1833  			/* This code path should only be executed in the
			 * syncookie case
1835  			 */
1836  			bh_unlock_sock(newsk);
1837  			sock_put(newsk);
1838  			newsk = NULL;
1839  		}
1840  	}
1841  	return newsk;
1842  
1843  exit_overflow:
1844  	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1845  exit_nonewsk:
1846  	dst_release(dst);
1847  exit:
1848  	tcp_listendrop(sk);
1849  	return NULL;
1850  put_and_exit:
1851  	newinet->inet_opt = NULL;
1852  	inet_csk_prepare_forced_close(newsk);
1853  	tcp_done(newsk);
1854  	goto exit;
1855  }
1856  EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1857  
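/* For a listener that has resorted to syncookies, a non-SYN segment may be
 * the ACK carrying a cookie.  cookie_v4_check() validates it and may return
 * a freshly created child socket; callers also cope with it returning the
 * unchanged listener or NULL.
 */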
1858  static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1859  {
1860  #ifdef CONFIG_SYN_COOKIES
1861  	const struct tcphdr *th = tcp_hdr(skb);
1862  
1863  	if (!th->syn)
1864  		sk = cookie_v4_check(sk, skb);
1865  #endif
1866  	return sk;
1867  }
1868  
1869  u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1870  			 struct tcphdr *th, u32 *cookie)
1871  {
1872  	u16 mss = 0;
1873  #ifdef CONFIG_SYN_COOKIES
1874  	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1875  				    &tcp_request_sock_ipv4_ops, sk, th);
1876  	if (mss) {
1877  		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1878  		tcp_synq_overflow(sk);
1879  	}
1880  #endif
1881  	return mss;
1882  }
1883  
1884  INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1885  							   u32));
/* The socket must have its spinlock held when we get
1887   * here, unless it is a TCP_LISTEN socket.
1888   *
1889   * We have a potential double-lock case here, so even when
1890   * doing backlog processing we use the BH locking scheme.
1891   * This is because we cannot sleep with the original spinlock
1892   * held.
1893   */
1894  int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1895  {
1896  	enum skb_drop_reason reason;
1897  	struct sock *rsk;
1898  
1899  	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1900  		struct dst_entry *dst;
1901  
1902  		dst = rcu_dereference_protected(sk->sk_rx_dst,
1903  						lockdep_sock_is_held(sk));
1904  
1905  		sock_rps_save_rxhash(sk, skb);
1906  		sk_mark_napi_id(sk, skb);
1907  		if (dst) {
1908  			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1909  			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1910  					     dst, 0)) {
1911  				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1912  				dst_release(dst);
1913  			}
1914  		}
1915  		tcp_rcv_established(sk, skb);
1916  		return 0;
1917  	}
1918  
1919  	if (tcp_checksum_complete(skb))
1920  		goto csum_err;
1921  
1922  	if (sk->sk_state == TCP_LISTEN) {
1923  		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1924  
1925  		if (!nsk)
1926  			return 0;
1927  		if (nsk != sk) {
1928  			reason = tcp_child_process(sk, nsk, skb);
1929  			if (reason) {
1930  				rsk = nsk;
1931  				goto reset;
1932  			}
1933  			return 0;
1934  		}
1935  	} else
1936  		sock_rps_save_rxhash(sk, skb);
1937  
1938  	reason = tcp_rcv_state_process(sk, skb);
1939  	if (reason) {
1940  		rsk = sk;
1941  		goto reset;
1942  	}
1943  	return 0;
1944  
1945  reset:
1946  	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1947  discard:
1948  	sk_skb_reason_drop(sk, skb, reason);
1949  	/* Be careful here. If this function gets more complicated and
1950  	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1951  	 * might be destroyed here. This current version compiles correctly,
1952  	 * but you have been warned.
1953  	 */
1954  	return 0;
1955  
1956  csum_err:
1957  	reason = SKB_DROP_REASON_TCP_CSUM;
1958  	trace_tcp_bad_csum(skb);
1959  	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1960  	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1961  	goto discard;
1962  }
1963  EXPORT_SYMBOL(tcp_v4_do_rcv);
1964  
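/* Early demux: look up an established socket before the routing decision so
 * that the rx dst cached on that socket can be reused, saving a route lookup
 * in the common case.  Purely an optimization; 0 is returned regardless.
 */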
1965  int tcp_v4_early_demux(struct sk_buff *skb)
1966  {
1967  	struct net *net = dev_net(skb->dev);
1968  	const struct iphdr *iph;
1969  	const struct tcphdr *th;
1970  	struct sock *sk;
1971  
1972  	if (skb->pkt_type != PACKET_HOST)
1973  		return 0;
1974  
1975  	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1976  		return 0;
1977  
1978  	iph = ip_hdr(skb);
1979  	th = tcp_hdr(skb);
1980  
1981  	if (th->doff < sizeof(struct tcphdr) / 4)
1982  		return 0;
1983  
1984  	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1985  				       iph->saddr, th->source,
1986  				       iph->daddr, ntohs(th->dest),
1987  				       skb->skb_iif, inet_sdif(skb));
1988  	if (sk) {
1989  		skb->sk = sk;
1990  		skb->destructor = sock_edemux;
1991  		if (sk_fullsock(sk)) {
1992  			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1993  
1994  			if (dst)
1995  				dst = dst_check(dst, 0);
1996  			if (dst &&
1997  			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1998  				skb_dst_set_noref(skb, dst);
1999  		}
2000  	}
2001  	return 0;
2002  }
2003  
2004  bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2005  		     enum skb_drop_reason *reason)
2006  {
2007  	u32 tail_gso_size, tail_gso_segs;
2008  	struct skb_shared_info *shinfo;
2009  	const struct tcphdr *th;
2010  	struct tcphdr *thtail;
2011  	struct sk_buff *tail;
2012  	unsigned int hdrlen;
2013  	bool fragstolen;
2014  	u32 gso_segs;
2015  	u32 gso_size;
2016  	u64 limit;
2017  	int delta;
2018  
2019  	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2020  	 * we can fix skb->truesize to its real value to avoid future drops.
2021  	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed that pure SACK packets were sometimes dropped
	 * (if cooked by drivers without the copybreak feature).
2024  	 */
2025  	skb_condense(skb);
2026  
2027  	skb_dst_drop(skb);
2028  
2029  	if (unlikely(tcp_checksum_complete(skb))) {
2030  		bh_unlock_sock(sk);
2031  		trace_tcp_bad_csum(skb);
2032  		*reason = SKB_DROP_REASON_TCP_CSUM;
2033  		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2034  		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2035  		return true;
2036  	}
2037  
2038  	/* Attempt coalescing to last skb in backlog, even if we are
2039  	 * above the limits.
2040  	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2041  	 */
2042  	th = (const struct tcphdr *)skb->data;
2043  	hdrlen = th->doff * 4;
2044  
2045  	tail = sk->sk_backlog.tail;
2046  	if (!tail)
2047  		goto no_coalesce;
2048  	thtail = (struct tcphdr *)tail->data;
2049  
2050  	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2051  	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2052  	    ((TCP_SKB_CB(tail)->tcp_flags |
2053  	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2054  	    !((TCP_SKB_CB(tail)->tcp_flags &
2055  	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2056  	    ((TCP_SKB_CB(tail)->tcp_flags ^
2057  	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2058  	    !tcp_skb_can_collapse_rx(tail, skb) ||
2059  	    thtail->doff != th->doff ||
2060  	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2061  		goto no_coalesce;
2062  
2063  	__skb_pull(skb, hdrlen);
2064  
2065  	shinfo = skb_shinfo(skb);
2066  	gso_size = shinfo->gso_size ?: skb->len;
2067  	gso_segs = shinfo->gso_segs ?: 1;
2068  
2069  	shinfo = skb_shinfo(tail);
2070  	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2071  	tail_gso_segs = shinfo->gso_segs ?: 1;
2072  
2073  	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2074  		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2075  
2076  		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2077  			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2078  			thtail->window = th->window;
2079  		}
2080  
2081  		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2082  		 * thtail->fin, so that the fast path in tcp_rcv_established()
2083  		 * is not entered if we append a packet with a FIN.
2084  		 * SYN, RST, URG are not present.
2085  		 * ACK is set on both packets.
2086  		 * PSH : we do not really care in TCP stack,
2087  		 *       at least for 'GRO' packets.
2088  		 */
2089  		thtail->fin |= th->fin;
2090  		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2091  
2092  		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2093  			TCP_SKB_CB(tail)->has_rxtstamp = true;
2094  			tail->tstamp = skb->tstamp;
2095  			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2096  		}
2097  
2098  		/* Not as strict as GRO. We only need to carry mss max value */
2099  		shinfo->gso_size = max(gso_size, tail_gso_size);
2100  		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2101  
2102  		sk->sk_backlog.len += delta;
2103  		__NET_INC_STATS(sock_net(sk),
2104  				LINUX_MIB_TCPBACKLOGCOALESCE);
2105  		kfree_skb_partial(skb, fragstolen);
2106  		return false;
2107  	}
2108  	__skb_push(skb, hdrlen);
2109  
2110  no_coalesce:
2111  	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2112  	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2113  	 * sk_rcvbuf in normal conditions.
2114  	 */
2115  	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2116  
2117  	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2118  
	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few socket backlogs are likely to be non-empty at the
	 * same time.
	 */
2123  	limit += 64 * 1024;
2124  
2125  	limit = min_t(u64, limit, UINT_MAX);
2126  
2127  	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2128  		bh_unlock_sock(sk);
2129  		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2130  		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2131  		return true;
2132  	}
2133  	return false;
2134  }
2135  EXPORT_SYMBOL(tcp_add_backlog);
2136  
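/* Run the socket filter on the segment, but never let it trim the skb below
 * the TCP header so that later header accesses stay valid.
 */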
2137  int tcp_filter(struct sock *sk, struct sk_buff *skb)
2138  {
2139  	struct tcphdr *th = (struct tcphdr *)skb->data;
2140  
2141  	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2142  }
2143  EXPORT_SYMBOL(tcp_filter);
2144  
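/* Undo tcp_v4_fill_cb(): move the saved inet control block back to the front
 * of skb->cb before the skb is handed to another socket or looked up again.
 */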
2145  static void tcp_v4_restore_cb(struct sk_buff *skb)
2146  {
2147  	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2148  		sizeof(struct inet_skb_parm));
2149  }
2150  
2151  static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2152  			   const struct tcphdr *th)
2153  {
	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
	 * barrier() makes sure the compiler won't play fool^Waliasing games.
	 */
2157  	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2158  		sizeof(struct inet_skb_parm));
2159  	barrier();
2160  
2161  	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2162  	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2163  				    skb->len - th->doff * 4);
2164  	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2165  	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2166  	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2167  	TCP_SKB_CB(skb)->sacked	 = 0;
2168  	TCP_SKB_CB(skb)->has_rxtstamp =
2169  			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2170  }
2171  
2172  /*
2173   *	From tcp_input.c
2174   */
2175  
2176  int tcp_v4_rcv(struct sk_buff *skb)
2177  {
2178  	struct net *net = dev_net(skb->dev);
2179  	enum skb_drop_reason drop_reason;
2180  	int sdif = inet_sdif(skb);
2181  	int dif = inet_iif(skb);
2182  	const struct iphdr *iph;
2183  	const struct tcphdr *th;
2184  	struct sock *sk = NULL;
2185  	bool refcounted;
2186  	int ret;
2187  	u32 isn;
2188  
2189  	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2190  	if (skb->pkt_type != PACKET_HOST)
2191  		goto discard_it;
2192  
2193  	/* Count it even if it's bad */
2194  	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2195  
2196  	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2197  		goto discard_it;
2198  
2199  	th = (const struct tcphdr *)skb->data;
2200  
2201  	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2202  		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2203  		goto bad_packet;
2204  	}
2205  	if (!pskb_may_pull(skb, th->doff * 4))
2206  		goto discard_it;
2207  
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks.
	 */
2212  
2213  	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2214  		goto csum_error;
2215  
2216  	th = (const struct tcphdr *)skb->data;
2217  	iph = ip_hdr(skb);
2218  lookup:
2219  	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2220  			       skb, __tcp_hdrlen(th), th->source,
2221  			       th->dest, sdif, &refcounted);
2222  	if (!sk)
2223  		goto no_tcp_socket;
2224  
2225  	if (sk->sk_state == TCP_TIME_WAIT)
2226  		goto do_time_wait;
2227  
2228  	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2229  		struct request_sock *req = inet_reqsk(sk);
2230  		bool req_stolen = false;
2231  		struct sock *nsk;
2232  
2233  		sk = req->rsk_listener;
2234  		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2235  			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2236  		else
2237  			drop_reason = tcp_inbound_hash(sk, req, skb,
2238  						       &iph->saddr, &iph->daddr,
2239  						       AF_INET, dif, sdif);
2240  		if (unlikely(drop_reason)) {
2241  			sk_drops_add(sk, skb);
2242  			reqsk_put(req);
2243  			goto discard_it;
2244  		}
2245  		if (tcp_checksum_complete(skb)) {
2246  			reqsk_put(req);
2247  			goto csum_error;
2248  		}
2249  		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2250  			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2251  			if (!nsk) {
2252  				inet_csk_reqsk_queue_drop_and_put(sk, req);
2253  				goto lookup;
2254  			}
2255  			sk = nsk;
2256  			/* reuseport_migrate_sock() has already held one sk_refcnt
2257  			 * before returning.
2258  			 */
2259  		} else {
2260  			/* We own a reference on the listener, increase it again
2261  			 * as we might lose it too soon.
2262  			 */
2263  			sock_hold(sk);
2264  		}
2265  		refcounted = true;
2266  		nsk = NULL;
2267  		if (!tcp_filter(sk, skb)) {
2268  			th = (const struct tcphdr *)skb->data;
2269  			iph = ip_hdr(skb);
2270  			tcp_v4_fill_cb(skb, iph, th);
2271  			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2272  		} else {
2273  			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2274  		}
2275  		if (!nsk) {
2276  			reqsk_put(req);
2277  			if (req_stolen) {
2278  				/* Another cpu got exclusive access to req
2279  				 * and created a full blown socket.
2280  				 * Try to feed this packet to this socket
2281  				 * instead of discarding it.
2282  				 */
2283  				tcp_v4_restore_cb(skb);
2284  				sock_put(sk);
2285  				goto lookup;
2286  			}
2287  			goto discard_and_relse;
2288  		}
2289  		nf_reset_ct(skb);
2290  		if (nsk == sk) {
2291  			reqsk_put(req);
2292  			tcp_v4_restore_cb(skb);
2293  		} else {
2294  			drop_reason = tcp_child_process(sk, nsk, skb);
2295  			if (drop_reason) {
2296  				enum sk_rst_reason rst_reason;
2297  
2298  				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2299  				tcp_v4_send_reset(nsk, skb, rst_reason);
2300  				goto discard_and_relse;
2301  			}
2302  			sock_put(sk);
2303  			return 0;
2304  		}
2305  	}
2306  
2307  process:
2308  	if (static_branch_unlikely(&ip4_min_ttl)) {
2309  		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2310  		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2311  			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2312  			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2313  			goto discard_and_relse;
2314  		}
2315  	}
2316  
2317  	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2318  		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2319  		goto discard_and_relse;
2320  	}
2321  
2322  	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2323  				       AF_INET, dif, sdif);
2324  	if (drop_reason)
2325  		goto discard_and_relse;
2326  
2327  	nf_reset_ct(skb);
2328  
2329  	if (tcp_filter(sk, skb)) {
2330  		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2331  		goto discard_and_relse;
2332  	}
2333  	th = (const struct tcphdr *)skb->data;
2334  	iph = ip_hdr(skb);
2335  	tcp_v4_fill_cb(skb, iph, th);
2336  
2337  	skb->dev = NULL;
2338  
2339  	if (sk->sk_state == TCP_LISTEN) {
2340  		ret = tcp_v4_do_rcv(sk, skb);
2341  		goto put_and_return;
2342  	}
2343  
2344  	sk_incoming_cpu_update(sk);
2345  
2346  	bh_lock_sock_nested(sk);
2347  	tcp_segs_in(tcp_sk(sk), skb);
2348  	ret = 0;
2349  	if (!sock_owned_by_user(sk)) {
2350  		ret = tcp_v4_do_rcv(sk, skb);
2351  	} else {
2352  		if (tcp_add_backlog(sk, skb, &drop_reason))
2353  			goto discard_and_relse;
2354  	}
2355  	bh_unlock_sock(sk);
2356  
2357  put_and_return:
2358  	if (refcounted)
2359  		sock_put(sk);
2360  
2361  	return ret;
2362  
2363  no_tcp_socket:
2364  	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2365  	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2366  		goto discard_it;
2367  
2368  	tcp_v4_fill_cb(skb, iph, th);
2369  
2370  	if (tcp_checksum_complete(skb)) {
2371  csum_error:
2372  		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2373  		trace_tcp_bad_csum(skb);
2374  		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2375  bad_packet:
2376  		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2377  	} else {
2378  		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2379  	}
2380  
2381  discard_it:
2382  	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2383  	/* Discard frame. */
2384  	sk_skb_reason_drop(sk, skb, drop_reason);
2385  	return 0;
2386  
2387  discard_and_relse:
2388  	sk_drops_add(sk, skb);
2389  	if (refcounted)
2390  		sock_put(sk);
2391  	goto discard_it;
2392  
2393  do_time_wait:
2394  	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2395  		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2396  		inet_twsk_put(inet_twsk(sk));
2397  		goto discard_it;
2398  	}
2399  
2400  	tcp_v4_fill_cb(skb, iph, th);
2401  
2402  	if (tcp_checksum_complete(skb)) {
2403  		inet_twsk_put(inet_twsk(sk));
2404  		goto csum_error;
2405  	}
2406  	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2407  	case TCP_TW_SYN: {
2408  		struct sock *sk2 = inet_lookup_listener(net,
2409  							net->ipv4.tcp_death_row.hashinfo,
2410  							skb, __tcp_hdrlen(th),
2411  							iph->saddr, th->source,
2412  							iph->daddr, th->dest,
2413  							inet_iif(skb),
2414  							sdif);
2415  		if (sk2) {
2416  			inet_twsk_deschedule_put(inet_twsk(sk));
2417  			sk = sk2;
2418  			tcp_v4_restore_cb(skb);
2419  			refcounted = false;
2420  			__this_cpu_write(tcp_tw_isn, isn);
2421  			goto process;
2422  		}
2423  	}
2424  		/* to ACK */
2425  		fallthrough;
2426  	case TCP_TW_ACK:
2427  		tcp_v4_timewait_ack(sk, skb);
2428  		break;
2429  	case TCP_TW_RST:
2430  		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2431  		inet_twsk_deschedule_put(inet_twsk(sk));
2432  		goto discard_it;
2433  	case TCP_TW_SUCCESS:;
2434  	}
2435  	goto discard_it;
2436  }
2437  
2438  static struct timewait_sock_ops tcp_timewait_sock_ops = {
2439  	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2440  	.twsk_destructor= tcp_twsk_destructor,
2441  };
2442  
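/* Cache the input route and incoming interface of this skb on the socket so
 * the established fast path in tcp_v4_do_rcv() can validate and reuse them.
 */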
2443  void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2444  {
2445  	struct dst_entry *dst = skb_dst(skb);
2446  
2447  	if (dst && dst_hold_safe(dst)) {
2448  		rcu_assign_pointer(sk->sk_rx_dst, dst);
2449  		sk->sk_rx_dst_ifindex = skb->skb_iif;
2450  	}
2451  }
2452  EXPORT_SYMBOL(inet_sk_rx_dst_set);
2453  
2454  const struct inet_connection_sock_af_ops ipv4_specific = {
2455  	.queue_xmit	   = ip_queue_xmit,
2456  	.send_check	   = tcp_v4_send_check,
2457  	.rebuild_header	   = inet_sk_rebuild_header,
2458  	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2459  	.conn_request	   = tcp_v4_conn_request,
2460  	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2461  	.net_header_len	   = sizeof(struct iphdr),
2462  	.setsockopt	   = ip_setsockopt,
2463  	.getsockopt	   = ip_getsockopt,
2464  	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2465  	.sockaddr_len	   = sizeof(struct sockaddr_in),
2466  	.mtu_reduced	   = tcp_v4_mtu_reduced,
2467  };
2468  EXPORT_SYMBOL(ipv4_specific);
2469  
2470  #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2471  static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2472  #ifdef CONFIG_TCP_MD5SIG
2473  	.md5_lookup		= tcp_v4_md5_lookup,
2474  	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2475  	.md5_parse		= tcp_v4_parse_md5_keys,
2476  #endif
2477  #ifdef CONFIG_TCP_AO
2478  	.ao_lookup		= tcp_v4_ao_lookup,
2479  	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2480  	.ao_parse		= tcp_v4_parse_ao,
2481  	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2482  #endif
2483  };
2484  #endif
2485  
/* NOTE: A lot of fields are set to zero explicitly by the call to
 *       sk_alloc(), so they need not be initialized here.
 */
2489  static int tcp_v4_init_sock(struct sock *sk)
2490  {
2491  	struct inet_connection_sock *icsk = inet_csk(sk);
2492  
2493  	tcp_init_sock(sk);
2494  
2495  	icsk->icsk_af_ops = &ipv4_specific;
2496  
2497  #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2498  	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2499  #endif
2500  
2501  	return 0;
2502  }
2503  
2504  #ifdef CONFIG_TCP_MD5SIG
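/* RCU callback freeing the md5sig_info container once all readers are gone;
 * it also drops the MD5 static key and the sigpool reference that were taken
 * when MD5 was first enabled on the socket.
 */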
2505  static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2506  {
2507  	struct tcp_md5sig_info *md5sig;
2508  
2509  	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2510  	kfree(md5sig);
2511  	static_branch_slow_dec_deferred(&tcp_md5_needed);
2512  	tcp_md5_release_sigpool();
2513  }
2514  #endif
2515  
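/* Drop any page-pool (netmem) references still tracked in sk->sk_user_frags
 * before the xarray itself is destroyed in tcp_v4_destroy_sock().
 */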
2516  static void tcp_release_user_frags(struct sock *sk)
2517  {
2518  #ifdef CONFIG_PAGE_POOL
2519  	unsigned long index;
2520  	void *netmem;
2521  
2522  	xa_for_each(&sk->sk_user_frags, index, netmem)
2523  		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2524  #endif
2525  }
2526  
2527  void tcp_v4_destroy_sock(struct sock *sk)
2528  {
2529  	struct tcp_sock *tp = tcp_sk(sk);
2530  
2531  	tcp_release_user_frags(sk);
2532  
2533  	xa_destroy(&sk->sk_user_frags);
2534  
2535  	trace_tcp_destroy_sock(sk);
2536  
2537  	tcp_clear_xmit_timers(sk);
2538  
2539  	tcp_cleanup_congestion_control(sk);
2540  
2541  	tcp_cleanup_ulp(sk);
2542  
	/* Clean up the write buffer. */
2544  	tcp_write_queue_purge(sk);
2545  
2546  	/* Check if we want to disable active TFO */
2547  	tcp_fastopen_active_disable_ofo_check(sk);
2548  
2549  	/* Cleans up our, hopefully empty, out_of_order_queue. */
2550  	skb_rbtree_purge(&tp->out_of_order_queue);
2551  
2552  #ifdef CONFIG_TCP_MD5SIG
2553  	/* Clean up the MD5 key list, if any */
2554  	if (tp->md5sig_info) {
2555  		struct tcp_md5sig_info *md5sig;
2556  
2557  		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2558  		tcp_clear_md5_list(sk);
2559  		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2560  		rcu_assign_pointer(tp->md5sig_info, NULL);
2561  	}
2562  #endif
2563  	tcp_ao_destroy_sock(sk, false);
2564  
2565  	/* Clean up a referenced TCP bind bucket. */
2566  	if (inet_csk(sk)->icsk_bind_hash)
2567  		inet_put_port(sk);
2568  
2569  	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2570  
2571  	/* If socket is aborted during connect operation */
2572  	tcp_free_fastopen_req(tp);
2573  	tcp_fastopen_destroy_cipher(sk);
2574  	tcp_saved_syn_free(tp);
2575  
2576  	sk_sockets_allocated_dec(sk);
2577  }
2578  EXPORT_SYMBOL(tcp_v4_destroy_sock);
2579  
2580  #ifdef CONFIG_PROC_FS
2581  /* Proc filesystem TCP sock list dumping. */
2582  
2583  static unsigned short seq_file_family(const struct seq_file *seq);
2584  
2585  static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2586  {
2587  	unsigned short family = seq_file_family(seq);
2588  
2589  	/* AF_UNSPEC is used as a match all */
2590  	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2591  		net_eq(sock_net(sk), seq_file_net(seq)));
2592  }
2593  
2594  /* Find a non empty bucket (starting from st->bucket)
2595   * and return the first sk from it.
2596   */
2597  static void *listening_get_first(struct seq_file *seq)
2598  {
2599  	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2600  	struct tcp_iter_state *st = seq->private;
2601  
2602  	st->offset = 0;
2603  	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2604  		struct inet_listen_hashbucket *ilb2;
2605  		struct hlist_nulls_node *node;
2606  		struct sock *sk;
2607  
2608  		ilb2 = &hinfo->lhash2[st->bucket];
2609  		if (hlist_nulls_empty(&ilb2->nulls_head))
2610  			continue;
2611  
2612  		spin_lock(&ilb2->lock);
2613  		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2614  			if (seq_sk_match(seq, sk))
2615  				return sk;
2616  		}
2617  		spin_unlock(&ilb2->lock);
2618  	}
2619  
2620  	return NULL;
2621  }
2622  
2623  /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2624   * If "cur" is the last one in the st->bucket,
2625   * call listening_get_first() to return the first sk of the next
2626   * non empty bucket.
2627   */
2628  static void *listening_get_next(struct seq_file *seq, void *cur)
2629  {
2630  	struct tcp_iter_state *st = seq->private;
2631  	struct inet_listen_hashbucket *ilb2;
2632  	struct hlist_nulls_node *node;
2633  	struct inet_hashinfo *hinfo;
2634  	struct sock *sk = cur;
2635  
2636  	++st->num;
2637  	++st->offset;
2638  
2639  	sk = sk_nulls_next(sk);
2640  	sk_nulls_for_each_from(sk, node) {
2641  		if (seq_sk_match(seq, sk))
2642  			return sk;
2643  	}
2644  
2645  	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2646  	ilb2 = &hinfo->lhash2[st->bucket];
2647  	spin_unlock(&ilb2->lock);
2648  	++st->bucket;
2649  	return listening_get_first(seq);
2650  }
2651  
2652  static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2653  {
2654  	struct tcp_iter_state *st = seq->private;
2655  	void *rc;
2656  
2657  	st->bucket = 0;
2658  	st->offset = 0;
2659  	rc = listening_get_first(seq);
2660  
2661  	while (rc && *pos) {
2662  		rc = listening_get_next(seq, rc);
2663  		--*pos;
2664  	}
2665  	return rc;
2666  }
2667  
2668  static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2669  				const struct tcp_iter_state *st)
2670  {
2671  	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2672  }
2673  
2674  /*
2675   * Get first established socket starting from bucket given in st->bucket.
2676   * If st->bucket is zero, the very first socket in the hash is returned.
2677   */
2678  static void *established_get_first(struct seq_file *seq)
2679  {
2680  	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2681  	struct tcp_iter_state *st = seq->private;
2682  
2683  	st->offset = 0;
2684  	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2685  		struct sock *sk;
2686  		struct hlist_nulls_node *node;
2687  		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2688  
2689  		cond_resched();
2690  
2691  		/* Lockless fast path for the common case of empty buckets */
2692  		if (empty_bucket(hinfo, st))
2693  			continue;
2694  
2695  		spin_lock_bh(lock);
2696  		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2697  			if (seq_sk_match(seq, sk))
2698  				return sk;
2699  		}
2700  		spin_unlock_bh(lock);
2701  	}
2702  
2703  	return NULL;
2704  }
2705  
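/* Find the next sk of "cur" within the same ehash bucket (i.e. st->bucket).
 * If "cur" is the last one in the bucket, move on to the first sk of the
 * next non-empty bucket via established_get_first().
 */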
2706  static void *established_get_next(struct seq_file *seq, void *cur)
2707  {
2708  	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2709  	struct tcp_iter_state *st = seq->private;
2710  	struct hlist_nulls_node *node;
2711  	struct sock *sk = cur;
2712  
2713  	++st->num;
2714  	++st->offset;
2715  
2716  	sk = sk_nulls_next(sk);
2717  
2718  	sk_nulls_for_each_from(sk, node) {
2719  		if (seq_sk_match(seq, sk))
2720  			return sk;
2721  	}
2722  
2723  	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2724  	++st->bucket;
2725  	return established_get_first(seq);
2726  }
2727  
2728  static void *established_get_idx(struct seq_file *seq, loff_t pos)
2729  {
2730  	struct tcp_iter_state *st = seq->private;
2731  	void *rc;
2732  
2733  	st->bucket = 0;
2734  	rc = established_get_first(seq);
2735  
2736  	while (rc && pos) {
2737  		rc = established_get_next(seq, rc);
2738  		--pos;
2739  	}
2740  	return rc;
2741  }
2742  
2743  static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2744  {
2745  	void *rc;
2746  	struct tcp_iter_state *st = seq->private;
2747  
2748  	st->state = TCP_SEQ_STATE_LISTENING;
2749  	rc	  = listening_get_idx(seq, &pos);
2750  
2751  	if (!rc) {
2752  		st->state = TCP_SEQ_STATE_ESTABLISHED;
2753  		rc	  = established_get_idx(seq, pos);
2754  	}
2755  
2756  	return rc;
2757  }
2758  
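/* Re-position the iterator at the bucket/offset recorded by the previous
 * pass, so a /proc read that spans several syscalls resumes where it left
 * off instead of rescanning from the beginning.
 */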
2759  static void *tcp_seek_last_pos(struct seq_file *seq)
2760  {
2761  	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2762  	struct tcp_iter_state *st = seq->private;
2763  	int bucket = st->bucket;
2764  	int offset = st->offset;
2765  	int orig_num = st->num;
2766  	void *rc = NULL;
2767  
2768  	switch (st->state) {
2769  	case TCP_SEQ_STATE_LISTENING:
2770  		if (st->bucket > hinfo->lhash2_mask)
2771  			break;
2772  		rc = listening_get_first(seq);
2773  		while (offset-- && rc && bucket == st->bucket)
2774  			rc = listening_get_next(seq, rc);
2775  		if (rc)
2776  			break;
2777  		st->bucket = 0;
2778  		st->state = TCP_SEQ_STATE_ESTABLISHED;
2779  		fallthrough;
2780  	case TCP_SEQ_STATE_ESTABLISHED:
2781  		if (st->bucket > hinfo->ehash_mask)
2782  			break;
2783  		rc = established_get_first(seq);
2784  		while (offset-- && rc && bucket == st->bucket)
2785  			rc = established_get_next(seq, rc);
2786  	}
2787  
2788  	st->num = orig_num;
2789  
2790  	return rc;
2791  }
2792  
2793  void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2794  {
2795  	struct tcp_iter_state *st = seq->private;
2796  	void *rc;
2797  
2798  	if (*pos && *pos == st->last_pos) {
2799  		rc = tcp_seek_last_pos(seq);
2800  		if (rc)
2801  			goto out;
2802  	}
2803  
2804  	st->state = TCP_SEQ_STATE_LISTENING;
2805  	st->num = 0;
2806  	st->bucket = 0;
2807  	st->offset = 0;
2808  	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2809  
2810  out:
2811  	st->last_pos = *pos;
2812  	return rc;
2813  }
2814  EXPORT_SYMBOL(tcp_seq_start);
2815  
2816  void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2817  {
2818  	struct tcp_iter_state *st = seq->private;
2819  	void *rc = NULL;
2820  
2821  	if (v == SEQ_START_TOKEN) {
2822  		rc = tcp_get_idx(seq, 0);
2823  		goto out;
2824  	}
2825  
2826  	switch (st->state) {
2827  	case TCP_SEQ_STATE_LISTENING:
2828  		rc = listening_get_next(seq, v);
2829  		if (!rc) {
2830  			st->state = TCP_SEQ_STATE_ESTABLISHED;
2831  			st->bucket = 0;
2832  			st->offset = 0;
2833  			rc	  = established_get_first(seq);
2834  		}
2835  		break;
2836  	case TCP_SEQ_STATE_ESTABLISHED:
2837  		rc = established_get_next(seq, v);
2838  		break;
2839  	}
2840  out:
2841  	++*pos;
2842  	st->last_pos = *pos;
2843  	return rc;
2844  }
2845  EXPORT_SYMBOL(tcp_seq_next);
2846  
2847  void tcp_seq_stop(struct seq_file *seq, void *v)
2848  {
2849  	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2850  	struct tcp_iter_state *st = seq->private;
2851  
2852  	switch (st->state) {
2853  	case TCP_SEQ_STATE_LISTENING:
2854  		if (v != SEQ_START_TOKEN)
2855  			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2856  		break;
2857  	case TCP_SEQ_STATE_ESTABLISHED:
2858  		if (v)
2859  			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2860  		break;
2861  	}
2862  }
2863  EXPORT_SYMBOL(tcp_seq_stop);
2864  
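/* Format one /proc/net/tcp line for a request socket (SYN_RECV). */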
2865  static void get_openreq4(const struct request_sock *req,
2866  			 struct seq_file *f, int i)
2867  {
2868  	const struct inet_request_sock *ireq = inet_rsk(req);
2869  	long delta = req->rsk_timer.expires - jiffies;
2870  
2871  	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2872  		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2873  		i,
2874  		ireq->ir_loc_addr,
2875  		ireq->ir_num,
2876  		ireq->ir_rmt_addr,
2877  		ntohs(ireq->ir_rmt_port),
2878  		TCP_SYN_RECV,
2879  		0, 0, /* could print option size, but that is af dependent. */
2880  		1,    /* timers active (only the expire timer) */
2881  		jiffies_delta_to_clock_t(delta),
2882  		req->num_timeout,
2883  		from_kuid_munged(seq_user_ns(f),
2884  				 sock_i_uid(req->rsk_listener)),
2885  		0,  /* non standard timer */
2886  		0, /* open_requests have no inode */
2887  		0,
2888  		req);
2889  }
2890  
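/* Format one /proc/net/tcp line for a full (listening or established) socket. */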
2891  static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2892  {
2893  	int timer_active;
2894  	unsigned long timer_expires;
2895  	const struct tcp_sock *tp = tcp_sk(sk);
2896  	const struct inet_connection_sock *icsk = inet_csk(sk);
2897  	const struct inet_sock *inet = inet_sk(sk);
2898  	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2899  	__be32 dest = inet->inet_daddr;
2900  	__be32 src = inet->inet_rcv_saddr;
2901  	__u16 destp = ntohs(inet->inet_dport);
2902  	__u16 srcp = ntohs(inet->inet_sport);
2903  	int rx_queue;
2904  	int state;
2905  
2906  	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2907  	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2908  	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2909  		timer_active	= 1;
2910  		timer_expires	= icsk->icsk_timeout;
2911  	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2912  		timer_active	= 4;
2913  		timer_expires	= icsk->icsk_timeout;
2914  	} else if (timer_pending(&sk->sk_timer)) {
2915  		timer_active	= 2;
2916  		timer_expires	= sk->sk_timer.expires;
2917  	} else {
2918  		timer_active	= 0;
2919  		timer_expires = jiffies;
2920  	}
2921  
2922  	state = inet_sk_state_load(sk);
2923  	if (state == TCP_LISTEN)
2924  		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2925  	else
2926  		/* Because we don't lock the socket,
2927  		 * we might find a transient negative value.
2928  		 */
2929  		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2930  				      READ_ONCE(tp->copied_seq), 0);
2931  
2932  	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2933  			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2934  		i, src, srcp, dest, destp, state,
2935  		READ_ONCE(tp->write_seq) - tp->snd_una,
2936  		rx_queue,
2937  		timer_active,
2938  		jiffies_delta_to_clock_t(timer_expires - jiffies),
2939  		icsk->icsk_retransmits,
2940  		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2941  		icsk->icsk_probes_out,
2942  		sock_i_ino(sk),
2943  		refcount_read(&sk->sk_refcnt), sk,
2944  		jiffies_to_clock_t(icsk->icsk_rto),
2945  		jiffies_to_clock_t(icsk->icsk_ack.ato),
2946  		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2947  		tcp_snd_cwnd(tp),
2948  		state == TCP_LISTEN ?
2949  		    fastopenq->max_qlen :
2950  		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2951  }
2952  
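/* Format one /proc/net/tcp line for a TIME_WAIT socket. */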
2953  static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2954  			       struct seq_file *f, int i)
2955  {
2956  	long delta = tw->tw_timer.expires - jiffies;
2957  	__be32 dest, src;
2958  	__u16 destp, srcp;
2959  
2960  	dest  = tw->tw_daddr;
2961  	src   = tw->tw_rcv_saddr;
2962  	destp = ntohs(tw->tw_dport);
2963  	srcp  = ntohs(tw->tw_sport);
2964  
2965  	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2966  		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2967  		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2968  		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2969  		refcount_read(&tw->tw_refcnt), tw);
2970  }
2971  
2972  #define TMPSZ 150
2973  
2974  static int tcp4_seq_show(struct seq_file *seq, void *v)
2975  {
2976  	struct tcp_iter_state *st;
2977  	struct sock *sk = v;
2978  
2979  	seq_setwidth(seq, TMPSZ - 1);
2980  	if (v == SEQ_START_TOKEN) {
2981  		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2982  			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2983  			   "inode");
2984  		goto out;
2985  	}
2986  	st = seq->private;
2987  
2988  	if (sk->sk_state == TCP_TIME_WAIT)
2989  		get_timewait4_sock(v, seq, st->num);
2990  	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2991  		get_openreq4(v, seq, st->num);
2992  	else
2993  		get_tcp4_sock(v, seq, st->num);
2994  out:
2995  	seq_pad(seq, '\n');
2996  	return 0;
2997  }
2998  
2999  #ifdef CONFIG_BPF_SYSCALL
3000  struct bpf_tcp_iter_state {
3001  	struct tcp_iter_state state;
3002  	unsigned int cur_sk;
3003  	unsigned int end_sk;
3004  	unsigned int max_sk;
3005  	struct sock **batch;
3006  	bool st_bucket_done;
3007  };
3008  
3009  struct bpf_iter__tcp {
3010  	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3011  	__bpf_md_ptr(struct sock_common *, sk_common);
3012  	uid_t uid __aligned(8);
3013  };
3014  
3015  static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3016  			     struct sock_common *sk_common, uid_t uid)
3017  {
3018  	struct bpf_iter__tcp ctx;
3019  
3020  	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3021  	ctx.meta = meta;
3022  	ctx.sk_common = sk_common;
3023  	ctx.uid = uid;
3024  	return bpf_iter_run_prog(prog, &ctx);
3025  }
3026  
3027  static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3028  {
3029  	while (iter->cur_sk < iter->end_sk)
3030  		sock_gen_put(iter->batch[iter->cur_sk++]);
3031  }
3032  
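/* Grow the batch array to new_batch_sz entries.  Sockets still held in the
 * old batch are released first, since the caller re-walks the bucket anyway.
 */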
3033  static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3034  				      unsigned int new_batch_sz)
3035  {
3036  	struct sock **new_batch;
3037  
3038  	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3039  			     GFP_USER | __GFP_NOWARN);
3040  	if (!new_batch)
3041  		return -ENOMEM;
3042  
3043  	bpf_iter_tcp_put_batch(iter);
3044  	kvfree(iter->batch);
3045  	iter->batch = new_batch;
3046  	iter->max_sk = new_batch_sz;
3047  
3048  	return 0;
3049  }
3050  
3051  static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3052  						 struct sock *start_sk)
3053  {
3054  	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3055  	struct bpf_tcp_iter_state *iter = seq->private;
3056  	struct tcp_iter_state *st = &iter->state;
3057  	struct hlist_nulls_node *node;
3058  	unsigned int expected = 1;
3059  	struct sock *sk;
3060  
3061  	sock_hold(start_sk);
3062  	iter->batch[iter->end_sk++] = start_sk;
3063  
3064  	sk = sk_nulls_next(start_sk);
3065  	sk_nulls_for_each_from(sk, node) {
3066  		if (seq_sk_match(seq, sk)) {
3067  			if (iter->end_sk < iter->max_sk) {
3068  				sock_hold(sk);
3069  				iter->batch[iter->end_sk++] = sk;
3070  			}
3071  			expected++;
3072  		}
3073  	}
3074  	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3075  
3076  	return expected;
3077  }
3078  
3079  static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3080  						   struct sock *start_sk)
3081  {
3082  	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3083  	struct bpf_tcp_iter_state *iter = seq->private;
3084  	struct tcp_iter_state *st = &iter->state;
3085  	struct hlist_nulls_node *node;
3086  	unsigned int expected = 1;
3087  	struct sock *sk;
3088  
3089  	sock_hold(start_sk);
3090  	iter->batch[iter->end_sk++] = start_sk;
3091  
3092  	sk = sk_nulls_next(start_sk);
3093  	sk_nulls_for_each_from(sk, node) {
3094  		if (seq_sk_match(seq, sk)) {
3095  			if (iter->end_sk < iter->max_sk) {
3096  				sock_hold(sk);
3097  				iter->batch[iter->end_sk++] = sk;
3098  			}
3099  			expected++;
3100  		}
3101  	}
3102  	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3103  
3104  	return expected;
3105  }
3106  
3107  static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3108  {
3109  	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3110  	struct bpf_tcp_iter_state *iter = seq->private;
3111  	struct tcp_iter_state *st = &iter->state;
3112  	unsigned int expected;
3113  	bool resized = false;
3114  	struct sock *sk;
3115  
	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of having tcp_seek_last_pos() skip entries
	 * one by one in the current bucket only to find out that it
	 * has to advance to the next bucket anyway.
	 */
3121  	if (iter->st_bucket_done) {
3122  		st->offset = 0;
3123  		st->bucket++;
3124  		if (st->state == TCP_SEQ_STATE_LISTENING &&
3125  		    st->bucket > hinfo->lhash2_mask) {
3126  			st->state = TCP_SEQ_STATE_ESTABLISHED;
3127  			st->bucket = 0;
3128  		}
3129  	}
3130  
3131  again:
3132  	/* Get a new batch */
3133  	iter->cur_sk = 0;
3134  	iter->end_sk = 0;
3135  	iter->st_bucket_done = false;
3136  
3137  	sk = tcp_seek_last_pos(seq);
3138  	if (!sk)
3139  		return NULL; /* Done */
3140  
3141  	if (st->state == TCP_SEQ_STATE_LISTENING)
3142  		expected = bpf_iter_tcp_listening_batch(seq, sk);
3143  	else
3144  		expected = bpf_iter_tcp_established_batch(seq, sk);
3145  
3146  	if (iter->end_sk == expected) {
3147  		iter->st_bucket_done = true;
3148  		return sk;
3149  	}
3150  
3151  	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3152  		resized = true;
3153  		goto again;
3154  	}
3155  
3156  	return sk;
3157  }
3158  
3159  static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3160  {
3161  	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
3163  	 */
3164  	if (*pos)
3165  		return bpf_iter_tcp_batch(seq);
3166  
3167  	return SEQ_START_TOKEN;
3168  }
3169  
3170  static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3171  {
3172  	struct bpf_tcp_iter_state *iter = seq->private;
3173  	struct tcp_iter_state *st = &iter->state;
3174  	struct sock *sk;
3175  
	/* Whenever seq_next() is called, the sk at iter->cur_sk has
	 * already been through seq_show(), so advance to the next sk
	 * in the batch.
	 */
3180  	if (iter->cur_sk < iter->end_sk) {
3181  		/* Keeping st->num consistent in tcp_iter_state.
3182  		 * bpf_iter_tcp does not use st->num.
3183  		 * meta.seq_num is used instead.
3184  		 */
3185  		st->num++;
3186  		/* Move st->offset to the next sk in the bucket such that
3187  		 * the future start() will resume at st->offset in
3188  		 * st->bucket.  See tcp_seek_last_pos().
3189  		 */
3190  		st->offset++;
3191  		sock_gen_put(iter->batch[iter->cur_sk++]);
3192  	}
3193  
3194  	if (iter->cur_sk < iter->end_sk)
3195  		sk = iter->batch[iter->cur_sk];
3196  	else
3197  		sk = bpf_iter_tcp_batch(seq);
3198  
3199  	++*pos;
3200  	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3202  	 */
3203  	st->last_pos = *pos;
3204  	return sk;
3205  }
3206  
3207  static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3208  {
3209  	struct bpf_iter_meta meta;
3210  	struct bpf_prog *prog;
3211  	struct sock *sk = v;
3212  	uid_t uid;
3213  	int ret;
3214  
3215  	if (v == SEQ_START_TOKEN)
3216  		return 0;
3217  
3218  	if (sk_fullsock(sk))
3219  		lock_sock(sk);
3220  
3221  	if (unlikely(sk_unhashed(sk))) {
3222  		ret = SEQ_SKIP;
3223  		goto unlock;
3224  	}
3225  
3226  	if (sk->sk_state == TCP_TIME_WAIT) {
3227  		uid = 0;
3228  	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3229  		const struct request_sock *req = v;
3230  
3231  		uid = from_kuid_munged(seq_user_ns(seq),
3232  				       sock_i_uid(req->rsk_listener));
3233  	} else {
3234  		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3235  	}
3236  
3237  	meta.seq = seq;
3238  	prog = bpf_iter_get_info(&meta, false);
3239  	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3240  
3241  unlock:
3242  	if (sk_fullsock(sk))
3243  		release_sock(sk);
3244  	return ret;
3246  }
3247  
3248  static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3249  {
3250  	struct bpf_tcp_iter_state *iter = seq->private;
3251  	struct bpf_iter_meta meta;
3252  	struct bpf_prog *prog;
3253  
3254  	if (!v) {
3255  		meta.seq = seq;
3256  		prog = bpf_iter_get_info(&meta, true);
3257  		if (prog)
3258  			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3259  	}
3260  
3261  	if (iter->cur_sk < iter->end_sk) {
3262  		bpf_iter_tcp_put_batch(iter);
3263  		iter->st_bucket_done = false;
3264  	}
3265  }
3266  
3267  static const struct seq_operations bpf_iter_tcp_seq_ops = {
3268  	.show		= bpf_iter_tcp_seq_show,
3269  	.start		= bpf_iter_tcp_seq_start,
3270  	.next		= bpf_iter_tcp_seq_next,
3271  	.stop		= bpf_iter_tcp_seq_stop,
3272  };
#endif

3274  static unsigned short seq_file_family(const struct seq_file *seq)
3275  {
3276  	const struct tcp_seq_afinfo *afinfo;
3277  
3278  #ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3280  	if (seq->op == &bpf_iter_tcp_seq_ops)
3281  		return AF_UNSPEC;
3282  #endif
3283  
3284  	/* Iterated from proc fs */
3285  	afinfo = pde_data(file_inode(seq->file));
3286  	return afinfo->family;
3287  }
3288  
3289  static const struct seq_operations tcp4_seq_ops = {
3290  	.show		= tcp4_seq_show,
3291  	.start		= tcp_seq_start,
3292  	.next		= tcp_seq_next,
3293  	.stop		= tcp_seq_stop,
3294  };
3295  
3296  static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3297  	.family		= AF_INET,
3298  };
3299  
3300  static int __net_init tcp4_proc_init_net(struct net *net)
3301  {
3302  	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3303  			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3304  		return -ENOMEM;
3305  	return 0;
3306  }
3307  
3308  static void __net_exit tcp4_proc_exit_net(struct net *net)
3309  {
3310  	remove_proc_entry("tcp", net->proc_net);
3311  }
3312  
3313  static struct pernet_operations tcp4_net_ops = {
3314  	.init = tcp4_proc_init_net,
3315  	.exit = tcp4_proc_exit_net,
3316  };
3317  
3318  int __init tcp4_proc_init(void)
3319  {
3320  	return register_pernet_subsys(&tcp4_net_ops);
3321  }
3322  
3323  void tcp4_proc_exit(void)
3324  {
3325  	unregister_pernet_subsys(&tcp4_net_ops);
3326  }
3327  #endif /* CONFIG_PROC_FS */
3328  
3329  /* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3331   * This mimics the strategy used in sock_def_write_space().
3332   */
3333  bool tcp_stream_memory_free(const struct sock *sk, int wake)
3334  {
3335  	const struct tcp_sock *tp = tcp_sk(sk);
3336  	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3337  			    READ_ONCE(tp->snd_nxt);
3338  
3339  	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3340  }
3341  EXPORT_SYMBOL(tcp_stream_memory_free);
3342  
3343  struct proto tcp_prot = {
3344  	.name			= "TCP",
3345  	.owner			= THIS_MODULE,
3346  	.close			= tcp_close,
3347  	.pre_connect		= tcp_v4_pre_connect,
3348  	.connect		= tcp_v4_connect,
3349  	.disconnect		= tcp_disconnect,
3350  	.accept			= inet_csk_accept,
3351  	.ioctl			= tcp_ioctl,
3352  	.init			= tcp_v4_init_sock,
3353  	.destroy		= tcp_v4_destroy_sock,
3354  	.shutdown		= tcp_shutdown,
3355  	.setsockopt		= tcp_setsockopt,
3356  	.getsockopt		= tcp_getsockopt,
3357  	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3358  	.keepalive		= tcp_set_keepalive,
3359  	.recvmsg		= tcp_recvmsg,
3360  	.sendmsg		= tcp_sendmsg,
3361  	.splice_eof		= tcp_splice_eof,
3362  	.backlog_rcv		= tcp_v4_do_rcv,
3363  	.release_cb		= tcp_release_cb,
3364  	.hash			= inet_hash,
3365  	.unhash			= inet_unhash,
3366  	.get_port		= inet_csk_get_port,
3367  	.put_port		= inet_put_port,
3368  #ifdef CONFIG_BPF_SYSCALL
3369  	.psock_update_sk_prot	= tcp_bpf_update_proto,
3370  #endif
3371  	.enter_memory_pressure	= tcp_enter_memory_pressure,
3372  	.leave_memory_pressure	= tcp_leave_memory_pressure,
3373  	.stream_memory_free	= tcp_stream_memory_free,
3374  	.sockets_allocated	= &tcp_sockets_allocated,
3375  	.orphan_count		= &tcp_orphan_count,
3376  
3377  	.memory_allocated	= &tcp_memory_allocated,
3378  	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3379  
3380  	.memory_pressure	= &tcp_memory_pressure,
3381  	.sysctl_mem		= sysctl_tcp_mem,
3382  	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3383  	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3384  	.max_header		= MAX_TCP_HEADER,
3385  	.obj_size		= sizeof(struct tcp_sock),
3386  	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3387  	.twsk_prot		= &tcp_timewait_sock_ops,
3388  	.rsk_prot		= &tcp_request_sock_ops,
3389  	.h.hashinfo		= NULL,
3390  	.no_autobind		= true,
3391  	.diag_destroy		= tcp_abort,
3392  };
3393  EXPORT_SYMBOL(tcp_prot);
3394  
3395  static void __net_exit tcp_sk_exit(struct net *net)
3396  {
3397  	if (net->ipv4.tcp_congestion_control)
3398  		bpf_module_put(net->ipv4.tcp_congestion_control,
3399  			       net->ipv4.tcp_congestion_control->owner);
3400  }
3401  
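/* Pick the ehash table for a new netns: either a private table sized by the
 * parent netns sysctl_tcp_child_ehash_entries, or the global tcp_hashinfo as
 * a fallback.  max_tw_buckets and max_syn_backlog are scaled from the final
 * table size.
 */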
3402  static void __net_init tcp_set_hashinfo(struct net *net)
3403  {
3404  	struct inet_hashinfo *hinfo;
3405  	unsigned int ehash_entries;
3406  	struct net *old_net;
3407  
3408  	if (net_eq(net, &init_net))
3409  		goto fallback;
3410  
3411  	old_net = current->nsproxy->net_ns;
3412  	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3413  	if (!ehash_entries)
3414  		goto fallback;
3415  
3416  	ehash_entries = roundup_pow_of_two(ehash_entries);
3417  	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3418  	if (!hinfo) {
3419  		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3420  			"for a netns, fallback to the global one\n",
3421  			ehash_entries);
3422  fallback:
3423  		hinfo = &tcp_hashinfo;
3424  		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3425  	}
3426  
3427  	net->ipv4.tcp_death_row.hashinfo = hinfo;
3428  	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3429  	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3430  }
3431  
3432  static int __net_init tcp_sk_init(struct net *net)
3433  {
3434  	net->ipv4.sysctl_tcp_ecn = 2;
3435  	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3436  
3437  	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3438  	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3439  	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3440  	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3441  	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3442  
3443  	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3444  	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3445  	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3446  
3447  	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3448  	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3449  	net->ipv4.sysctl_tcp_syncookies = 1;
3450  	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3451  	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3452  	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3453  	net->ipv4.sysctl_tcp_orphan_retries = 0;
3454  	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3455  	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3456  	net->ipv4.sysctl_tcp_tw_reuse = 2;
3457  	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3458  
3459  	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3460  	tcp_set_hashinfo(net);
3461  
3462  	net->ipv4.sysctl_tcp_sack = 1;
3463  	net->ipv4.sysctl_tcp_window_scaling = 1;
3464  	net->ipv4.sysctl_tcp_timestamps = 1;
3465  	net->ipv4.sysctl_tcp_early_retrans = 3;
3466  	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3467  	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3468  	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3469  	net->ipv4.sysctl_tcp_max_reordering = 300;
3470  	net->ipv4.sysctl_tcp_dsack = 1;
3471  	net->ipv4.sysctl_tcp_app_win = 31;
3472  	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3473  	net->ipv4.sysctl_tcp_frto = 2;
3474  	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3475  	/* This limits the fraction of the congestion window that a single
3476  	 * TSO frame may consume (cwnd / tcp_tso_win_divisor; 1/3 by default).
3477  	 * Building TSO frames which are too large can make TCP streams bursty.
3478  	 */
3479  	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3480  	/* Default TSQ limit of 16 TSO segments */
3481  	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3482  
3483  	/* RFC 5961 challenge ACK rate limiting, per net-ns, disabled by default. */
3484  	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3485  
3486  	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3487  	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3488  	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3489  	net->ipv4.sysctl_tcp_autocorking = 1;
3490  	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3491  	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3492  	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
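	/* A child netns starts from init_net's current (possibly admin-tuned)
	 * tcp_rmem/tcp_wmem limits rather than from the compile-time defaults.
	 */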
3493  	if (net != &init_net) {
3494  		memcpy(net->ipv4.sysctl_tcp_rmem,
3495  		       init_net.ipv4.sysctl_tcp_rmem,
3496  		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3497  		memcpy(net->ipv4.sysctl_tcp_wmem,
3498  		       init_net.ipv4.sysctl_tcp_wmem,
3499  		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3500  	}
3501  	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3502  	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3503  	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3504  	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3505  	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3506  	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3507  	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3508  
3509  	/* Set default values for PLB */
3510  	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3511  	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3512  	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3513  	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3514  	/* Default congestion threshold for PLB to mark a round as congested is 50% */
3515  	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3516  
3517  	/* Inherit init_net's congestion control when possible; Reno is always built in */
3518  	if (!net_eq(net, &init_net) &&
3519  	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3520  			       init_net.ipv4.tcp_congestion_control->owner))
3521  		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3522  	else
3523  		net->ipv4.tcp_congestion_control = &tcp_reno;
3524  
3525  	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3526  	net->ipv4.sysctl_tcp_shrink_window = 0;
3527  
3528  	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3529  	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3530  
3531  	return 0;
3532  }
3533  
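/* Batched per-netns exit: purge any TIME-WAIT sockets still tied to the
 * dying namespaces, then free each netns' private ehash and TCP fastopen
 * context.
 */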
3534  static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3535  {
3536  	struct net *net;
3537  
3538  	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3539  	 * and from the failed setup_net() error-unwinding path are serialized.
3540  	 *
3541  	 * Since tcp_twsk_purge() handles twsk in any dead netns, not just those
3542  	 * on net_exit_list, the thread that dismantles a particular twsk must
3543  	 * do so without another thread progressing to refcount_dec_and_test()
3544  	 * of tcp_death_row.tw_refcount.
3545  	 */
3546  	mutex_lock(&tcp_exit_batch_mutex);
3547  
3548  	tcp_twsk_purge(net_exit_list);
3549  
3550  	list_for_each_entry(net, net_exit_list, exit_list) {
3551  		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3552  		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3553  		tcp_fastopen_ctx_destroy(net);
3554  	}
3555  
3556  	mutex_unlock(&tcp_exit_batch_mutex);
3557  }
3558  
3559  static struct pernet_operations __net_initdata tcp_sk_ops = {
3560  	.init		= tcp_sk_init,
3561  	.exit		= tcp_sk_exit,
3562  	.exit_batch	= tcp_sk_exit_batch,
3563  };
3564  
3565  #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3566  DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3567  		     struct sock_common *sk_common, uid_t uid)
3568  
3569  #define INIT_BATCH_SZ 16
3570  
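/* Set up the seq_file private state for a TCP bpf_iter instance: the usual
 * per-netns seq state plus an initial socket batch of INIT_BATCH_SZ entries
 * (bpf_iter_tcp_realloc_batch() grows it later if a bucket holds more
 * sockets than currently fit).
 */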
3571  static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3572  {
3573  	struct bpf_tcp_iter_state *iter = priv_data;
3574  	int err;
3575  
3576  	err = bpf_iter_init_seq_net(priv_data, aux);
3577  	if (err)
3578  		return err;
3579  
3580  	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3581  	if (err) {
3582  		bpf_iter_fini_seq_net(priv_data);
3583  		return err;
3584  	}
3585  
3586  	return 0;
3587  }
3588  
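/* Tear down what bpf_iter_init_tcp() set up: the per-netns seq state and the
 * (possibly reallocated) socket batch.
 */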
3589  static void bpf_iter_fini_tcp(void *priv_data)
3590  {
3591  	struct bpf_tcp_iter_state *iter = priv_data;
3592  
3593  	bpf_iter_fini_seq_net(priv_data);
3594  	kvfree(iter->batch);
3595  }
3596  
3597  static const struct bpf_iter_seq_info tcp_seq_info = {
3598  	.seq_ops		= &bpf_iter_tcp_seq_ops,
3599  	.init_seq_private	= bpf_iter_init_tcp,
3600  	.fini_seq_private	= bpf_iter_fini_tcp,
3601  	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3602  };
3603  
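/* On top of the generic tracing helpers, TCP iterator programs may call
 * bpf_setsockopt()/bpf_getsockopt() on the sockets they visit; returning
 * NULL for everything else defers to the default helper resolution.
 */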
3604  static const struct bpf_func_proto *
3605  bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3606  			    const struct bpf_prog *prog)
3607  {
3608  	switch (func_id) {
3609  	case BPF_FUNC_setsockopt:
3610  		return &bpf_sk_setsockopt_proto;
3611  	case BPF_FUNC_getsockopt:
3612  		return &bpf_sk_getsockopt_proto;
3613  	default:
3614  		return NULL;
3615  	}
3616  }
3617  
3618  static struct bpf_iter_reg tcp_reg_info = {
3619  	.target			= "tcp",
3620  	.ctx_arg_info_size	= 1,
3621  	.ctx_arg_info		= {
3622  		{ offsetof(struct bpf_iter__tcp, sk_common),
3623  		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3624  	},
3625  	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3626  	.seq_info		= &tcp_seq_info,
3627  };
3628  
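/* The registration below exposes a "tcp" bpf_iter target whose context
 * carries a trusted-or-NULL struct sock_common pointer.  A minimal user
 * program (a sketch only, following the selftests/libbpf conventions;
 * dump_tcp is an illustrative name) looks roughly like:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *sk_common = ctx->sk_common;
 *
 *		if (!sk_common)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family %u\n",
 *			       sk_common->skc_family);
 *		return 0;
 *	}
 *
 * See tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c for a complete
 * in-tree example.
 */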
3629  static void __init bpf_iter_register(void)
3630  {
3631  	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3632  	if (bpf_iter_reg_target(&tcp_reg_info))
3633  		pr_warn("Warning: could not register bpf iterator tcp\n");
3634  }
3635  
3636  #endif
3637  
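/* Boot-time TCP/IPv4 setup: create one kernel control socket per possible
 * CPU (used to transmit RSTs and ACKs on behalf of sockets the stack does
 * not fully own, e.g. in SYN-RECV and TIME-WAIT handling), then register
 * the per-netns operations above.
 */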
3638  void __init tcp_v4_init(void)
3639  {
3640  	int cpu, res;
3641  
3642  	for_each_possible_cpu(cpu) {
3643  		struct sock *sk;
3644  
3645  		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3646  					   IPPROTO_TCP, &init_net);
3647  		if (res)
3648  			panic("Failed to create the TCP control socket.\n");
3649  		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3650  
3651  		/* Enforce IP_DF and IPID==0 for the RSTs and ACKs sent in
3652  		 * SYN-RECV and TIME-WAIT state.
3653  		 */
3654  		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3655  
3656  		sk->sk_clockid = CLOCK_MONOTONIC;
3657  
3658  		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3659  	}
3660  	if (register_pernet_subsys(&tcp_sk_ops))
3661  		panic("Failed to register the TCP pernet subsystem.\n");
3662  
3663  #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3664  	bpf_iter_register();
3665  #endif
3666  }
3667