// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

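/* RFC 793 style acceptability test: does the segment [seq, end_seq) overlap
 * the receive window [s_win, e_win)? A segment starting at the left edge is
 * always acceptable; a zero-length segment is also acceptable when it sits
 * exactly on the right edge of the window.
 */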
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return true;
	if (after(end_seq, s_win) && before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

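/* Out-of-window segment seen by a timewait socket: normally answered with an
 * ACK (TCP_TW_ACK; the tw reference is kept for the caller), but ACKs to the
 * same peer are rate-limited, in which case the tw reference is dropped and
 * the segment is silently consumed.
 */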
static enum tcp_tw_status
tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
				  const struct sk_buff *skb, int mib_idx)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

	if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
				  &tcptw->tw_last_oow_ack_time)) {
		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by caller.
		 */
		return TCP_TW_ACK;
	}

	/* We are rate-limiting, so just release the tw sock and drop skb. */
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}

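/* Advance tw_rcv_nxt to @seq. With TCP-AO, a wrap of the 32-bit receive
 * sequence space (new seq below the old rcv_nxt) must also bump the
 * receive-side Sequence Number Extension used in MAC computation.
 */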
static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq,
				u32 rcv_nxt)
{
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao;

	ao = rcu_dereference(tcptw->ao_info);
	if (unlikely(ao && seq < rcv_nxt))
		WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1);
#endif
	WRITE_ONCE(tcptw->tw_rcv_nxt, seq);
}

/*
 * * The main purpose of the TIME-WAIT state is to close a connection
 *   gracefully when one of the ends sits in LAST-ACK or CLOSING,
 *   retransmitting its FIN (and, probably, a tail of data) while one
 *   or more of our ACKs are lost.
 * * What is the TIME-WAIT timeout? It is associated with the maximal
 *   packet lifetime in the internet, which leads to the wrong conclusion
 *   that it is set to catch "old duplicate segments" wandering out of
 *   their path. That is not quite correct. This timeout is calculated
 *   so that it exceeds the maximal retransmission timeout by enough to
 *   allow for the loss of one (or more) segments sent by the peer and
 *   of our ACKs. This time may be calculated from the RTO.
 * * When a TIME-WAIT socket receives a RST, it means that the other end
 *   finally closed, and we are allowed to kill TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with these semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
 * When you compare it to the RFCs, please read the section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
 * is _not_ stateless. That means, strictly speaking, that we must
 * spinlock it. I do not want to! Well, the probability of misbehaviour
 * is ridiculously low and, it seems, we could use some mb() tricks
 * to avoid misreading sequence numbers, states etc.  --ANK
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
			   const struct tcphdr *th, u32 *tw_isn)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
	struct tcp_options_received tmp_opt;
	bool paws_reject = false;
	int ts_recent_stamp;

	tmp_opt.saw_tstamp = 0;
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) {
		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			if (tmp_opt.rcv_tsecr)
				tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
			tmp_opt.ts_recent	= READ_ONCE(tcptw->tw_ts_recent);
			tmp_opt.ts_recent_stamp	= ts_recent_stamp;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Out of window, send ACK */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   rcv_nxt,
				   rcv_nxt + tcptw->tw_rcv_wnd))
			return tcp_timewait_check_oow_rate_limit(
				tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);

		if (th->rst)
			goto kill;

		if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt))
			return TCP_TW_RST;

		/* Dup ACK? */
		if (!th->ack ||
		    !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrive after half-duplex close,
		 * reset.
		 */
		if (!th->fin ||
		    TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1)
			return TCP_TW_RST;

		/* FIN arrived, enter true time-wait state. */
		WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT);
		twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq,
				    rcv_nxt);

		if (tmp_opt.saw_tstamp) {
			WRITE_ONCE(tcptw->tw_ts_recent_stamp,
				  ktime_get_seconds());
			WRITE_ONCE(tcptw->tw_ts_recent,
				   tmp_opt.rcv_tsval);
		}

		inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 *	Now real TIME-WAIT state.
	 *
	 *	RFC 1122:
	 *	"When a connection is [...] on TIME-WAIT state [...]
	 *	[a TCP] MAY accept a new SYN from the remote TCP to
	 *	reopen the connection directly, if it:
	 *
	 *	(1)  assigns its initial sequence number for the new
	 *	connection to be larger than the largest sequence
	 *	number it used on the previous connection incarnation,
	 *	and
	 *
	 *	(2)  returns to TIME-WAIT state if the SYN turns out
	 *	to be an old duplicate".
	 */

	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == rcv_nxt &&
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* In-window segment: it can only be a reset or a bare ACK. */

		if (th->rst) {
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
			if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) {
kill:
				inet_twsk_deschedule_put(tw);
				return TCP_TW_SUCCESS;
			}
		} else {
			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
		}

		if (tmp_opt.saw_tstamp) {
			WRITE_ONCE(tcptw->tw_ts_recent,
				   tmp_opt.rcv_tsval);
			WRITE_ONCE(tcptw->tw_ts_recent_stamp,
				   ktime_get_seconds());
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.

	   All such segments are ACKed immediately.

	   The only exception is a new SYN. We accept it, if it is not an
	   old duplicate and we are not in danger of being killed by delayed
	   old duplicates. The RFC check - that it carries a newer sequence
	   number - works at rates <40Mbit/sec.
	   However, if PAWS works, it is reliable AND, even more, we can
	   relax the silly seq space cutoff.

	   RED-PEN: we violate the main RFC requirement: if this SYN turns out
	   to be an old duplicate (i.e. we receive a RST in reply to the
	   SYN-ACK), we must return the socket to the time-wait state. It is
	   not good, but not fatal yet.
	 */

	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) {
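		/* Pick an ISN comfortably above anything the previous
		 * incarnation can have sent: tw_snd_nxt plus the largest
		 * unscaled window, plus 2 (cf. the RFC 1122 rule quoted
		 * above). Zero is skipped so that a chosen TIME-WAIT ISN
		 * can be told apart from "none" by the caller.
		 */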
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
		if (isn == 0)
			isn++;
		*tw_isn = isn;
		return TCP_TW_SYN;
	}

	if (paws_reject)
		__NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

	if (!th->rst) {
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is an ACKless SYN it may be both an old duplicate
		 * and a new good SYN with a random sequence number < rcv_nxt.
		 * Do not reschedule in the latter case.
		 */
		if (paws_reject || th->ack)
			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);

		return tcp_timewait_check_oow_rate_limit(
			tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
	}
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}
EXPORT_SYMBOL(tcp_timewait_state_process);

static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
{
#ifdef CONFIG_TCP_MD5SIG
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;

	/*
	 * The timewait bucket does not have the key DB from the
	 * sock structure. We just make a quick copy of the
	 * md5 key being used (if indeed we are using one)
	 * so the timewait ack generating code has the key.
	 */
	tcptw->tw_md5_key = NULL;
	if (!static_branch_unlikely(&tcp_md5_needed.key))
		return;

	key = tp->af_specific->md5_lookup(sk, sk);
	if (key) {
		tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
		if (!tcptw->tw_md5_key)
			return;
		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
			goto out_free;
		tcp_md5_add_sigpool();
	}
	return;
out_free:
	WARN_ON_ONCE(1);
	kfree(tcptw->tw_md5_key);
	tcptw->tw_md5_key = NULL;
#endif
}

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct inet_timewait_sock *tw;

	tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state);

	if (tw) {
		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
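		/* 3.5 * RTO: enough headroom for the peer to retransmit its
		 * FIN (or tail data) and for a couple of our ACKs to be lost,
		 * as discussed in the TIME-WAIT comment above.
		 */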
		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);

		tw->tw_transparent	= inet_test_bit(TRANSPARENT, sk);
		tw->tw_mark		= sk->sk_mark;
		tw->tw_priority		= READ_ONCE(sk->sk_priority);
		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
		tcptw->tw_snd_nxt	= tp->snd_nxt;
		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
		tcptw->tw_ts_offset	= tp->tsoffset;
		tw->tw_usec_ts		= tp->tcp_usec_ts;
		tcptw->tw_last_oow_ack_time = 0;
		tcptw->tw_tx_delay	= tp->tcp_tx_delay;
		tw->tw_txhash		= sk->sk_txhash;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == PF_INET6) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			tw->tw_v6_daddr = sk->sk_v6_daddr;
			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
			tw->tw_tclass = np->tclass;
			tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
			tw->tw_ipv6only = sk->sk_ipv6only;
		}
#endif

		tcp_time_wait_init(sk, tcptw);
		tcp_ao_time_wait(tcptw, tp);

		/* Get the TIME_WAIT timeout firing. */
		if (timeo < rto)
			timeo = rto;

		if (state == TCP_TIME_WAIT)
			timeo = TCP_TIMEWAIT_LEN;

		/* Linkage updates.
		 * Note that access to tw after this point is illegal.
		 */
		inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo);
	} else {
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up.  We've got bigger problems than
		 * non-graceful socket closings.
		 */
		NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW);
	}

	tcp_update_metrics(sk);
	tcp_done(sk);
}
EXPORT_SYMBOL(tcp_time_wait);

#ifdef CONFIG_TCP_MD5SIG
static void tcp_md5_twsk_free_rcu(struct rcu_head *head)
{
	struct tcp_md5sig_key *key;

	key = container_of(head, struct tcp_md5sig_key, rcu);
	kfree(key);
	static_branch_slow_dec_deferred(&tcp_md5_needed);
	tcp_md5_release_sigpool();
}
#endif

void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
	if (static_branch_unlikely(&tcp_md5_needed.key)) {
		struct tcp_timewait_sock *twsk = tcp_twsk(sk);

		if (twsk->tw_md5_key)
			call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu);
	}
#endif
	tcp_ao_destroy_sock(sk, true);
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);

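/* Flush timewait and kernel request socks for every netns on the exit list:
 * from the per-netns ehash when one is in use, otherwise (only once) from
 * the global tcp_hashinfo.
 */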
void tcp_twsk_purge(struct list_head *net_exit_list)
{
	bool purged_once = false;
	struct net *net;

	list_for_each_entry(net, net_exit_list, exit_list) {
		if (net->ipv4.tcp_death_row.hashinfo->pernet) {
			/* Even if tw_refcount == 1, we must clean up kernel reqsk */
			inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo);
		} else if (!purged_once) {
			inet_twsk_purge(&tcp_hashinfo);
			purged_once = true;
		}
	}
}

/* Warning : This function is called without sk_listener being locked.
 * Be sure to read socket fields once, as their value could change under us.
 */
void tcp_openreq_init_rwin(struct request_sock *req,
			   const struct sock *sk_listener,
			   const struct dst_entry *dst)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	const struct tcp_sock *tp = tcp_sk(sk_listener);
	int full_space = tcp_full_space(sk_listener);
	u32 window_clamp;
	__u8 rcv_wscale;
	u32 rcv_wnd;
	int mss;

	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
	window_clamp = READ_ONCE(tp->window_clamp);
	/* Set this up on the first call only */
	req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);

	/* Limit the window selection if the user enforces a smaller rx buffer. */
	if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
	    (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
		req->rsk_window_clamp = full_space;

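	/* A BPF program may impose the initial receive window; if it asks for
	 * more than the buffer would normally allow, widen full_space so the
	 * request can actually be granted.
	 */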
	rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req);
	if (rcv_wnd == 0)
		rcv_wnd = dst_metric(dst, RTAX_INITRWND);
	else if (full_space < rcv_wnd * mss)
		full_space = rcv_wnd * mss;

	/* tcp_full_space because it is guaranteed to be the first packet */
	tcp_select_initial_window(sk_listener, full_space,
		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
		&req->rsk_rcv_wnd,
		&req->rsk_window_clamp,
		ireq->wscale_ok,
		&rcv_wscale,
		rcv_wnd);
	ireq->rcv_wscale = rcv_wscale;
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);

static void tcp_ecn_openreq_child(struct tcp_sock *tp,
				  const struct request_sock *req)
{
	tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}

void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
	bool ca_got_dst = false;

	if (ca_key != TCP_CA_UNSPEC) {
		const struct tcp_congestion_ops *ca;

		rcu_read_lock();
		ca = tcp_ca_find_key(ca_key);
		if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
			icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
			icsk->icsk_ca_ops = ca;
			ca_got_dst = true;
		}
		rcu_read_unlock();
	}

	/* If no valid choice made yet, assign current system default ca. */
	if (!ca_got_dst &&
	    (!icsk->icsk_ca_setsockopt ||
	     !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
		tcp_assign_congestion_control(sk);

	tcp_set_ca_state(sk, TCP_CA_Open);
}
EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);

static void smc_check_reset_syn_req(const struct tcp_sock *oldtp,
				    struct request_sock *req,
				    struct tcp_sock *newtp)
{
#if IS_ENABLED(CONFIG_SMC)
	struct inet_request_sock *ireq;

	if (static_branch_unlikely(&tcp_have_smc)) {
		ireq = inet_rsk(req);
		if (oldtp->syn_smc && !ireq->smc_ok)
			newtp->syn_smc = 0;
	}
#endif
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid lots of memory writes here. tp of the listening
 * socket contains all the necessary default parameters.
 */
struct sock *tcp_create_openreq_child(const struct sock *sk,
				      struct request_sock *req,
				      struct sk_buff *skb)
{
	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_request_sock *treq = tcp_rsk(req);
	struct inet_connection_sock *newicsk;
	const struct tcp_sock *oldtp;
	struct tcp_sock *newtp;
	u32 seq;

	if (!newsk)
		return NULL;

	newicsk = inet_csk(newsk);
	newtp = tcp_sk(newsk);
	oldtp = tcp_sk(sk);

	smc_check_reset_syn_req(oldtp, req, newtp);

	/* Now setup tcp_sock */
	newtp->pred_flags = 0;

	seq = treq->rcv_isn + 1;
	newtp->rcv_wup = seq;
	WRITE_ONCE(newtp->copied_seq, seq);
	WRITE_ONCE(newtp->rcv_nxt, seq);
	newtp->segs_in = 1;

	seq = treq->snt_isn + 1;
	newtp->snd_sml = newtp->snd_una = seq;
	WRITE_ONCE(newtp->snd_nxt, seq);
	newtp->snd_up = seq;

	INIT_LIST_HEAD(&newtp->tsq_node);
	INIT_LIST_HEAD(&newtp->tsorted_sent_queue);

	tcp_init_wl(newtp, treq->rcv_isn);

	minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
	newicsk->icsk_ack.lrcvtime = tcp_jiffies32;

	newtp->lsndtime = tcp_jiffies32;
	newsk->sk_txhash = READ_ONCE(treq->txhash);
	newtp->total_retrans = req->num_retrans;

	tcp_init_xmit_timers(newsk);
	WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);

	if (sock_flag(newsk, SOCK_KEEPOPEN))
		inet_csk_reset_keepalive_timer(newsk,
					       keepalive_time_when(newtp));

	newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
	newtp->rx_opt.sack_ok = ireq->sack_ok;
	newtp->window_clamp = req->rsk_window_clamp;
	newtp->rcv_ssthresh = req->rsk_rcv_wnd;
	newtp->rcv_wnd = req->rsk_rcv_wnd;
	newtp->rx_opt.wscale_ok = ireq->wscale_ok;
	if (newtp->rx_opt.wscale_ok) {
		newtp->rx_opt.snd_wscale = ireq->snd_wscale;
		newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
	} else {
		newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
		newtp->window_clamp = min(newtp->window_clamp, 65535U);
	}
	newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
	newtp->max_window = newtp->snd_wnd;

	if (newtp->rx_opt.tstamp_ok) {
		newtp->tcp_usec_ts = treq->req_usec_ts;
		newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent);
		newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
	} else {
		newtp->tcp_usec_ts = 0;
		newtp->rx_opt.ts_recent_stamp = 0;
		newtp->tcp_header_len = sizeof(struct tcphdr);
	}
	if (req->num_timeout) {
		newtp->total_rto = req->num_timeout;
		newtp->undo_marker = treq->snt_isn;
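		/* treq->snt_synack is kept in microseconds; store
		 * retrans_stamp in the socket's timestamp clock units:
		 * raw us when usec timestamps are in use, otherwise
		 * TCP_TS_HZ (ms) ticks.
		 */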
		if (newtp->tcp_usec_ts) {
			newtp->retrans_stamp = treq->snt_synack;
			newtp->total_rto_time = (u32)(tcp_clock_us() -
						      newtp->retrans_stamp) / USEC_PER_MSEC;
		} else {
			newtp->retrans_stamp = div_u64(treq->snt_synack,
						       USEC_PER_SEC / TCP_TS_HZ);
			newtp->total_rto_time = tcp_clock_ms() -
						newtp->retrans_stamp;
		}
		newtp->total_rto_recoveries = 1;
	}
	newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
	newtp->md5sig_info = NULL;	/*XXX*/
#endif
#ifdef CONFIG_TCP_AO
	newtp->ao_info = NULL;

	if (tcp_rsk_used_ao(req)) {
		struct tcp_ao_key *ao_key;

		ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1);
		if (ao_key)
			newtp->tcp_header_len += tcp_ao_len_aligned(ao_key);
	}
#endif
	if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
		newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
	newtp->rx_opt.mss_clamp = req->mss;
	tcp_ecn_openreq_child(newtp, req);
	newtp->fastopen_req = NULL;
	RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);

	newtp->bpf_chg_cc_inprogress = 0;
	tcp_bpf_clone(sk, newsk);

	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);

	xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);

	return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);

/*
 * Process an incoming packet for SYN_RECV sockets represented as a
 * request_sock. Normally sk is the listener socket but for TFO it
 * points to the child socket.
 *
 * XXX (TFO) - The current impl contains a special check for ack
 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 *
 * Note: If @fastopen is true, this can be called from process context.
 *       Otherwise, this is from BH context.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   bool fastopen, bool *req_stolen)
{
	struct tcp_options_received tmp_opt;
	struct sock *child;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	bool paws_reject = false;
	bool own_req;

	tmp_opt.saw_tstamp = 0;
	if (th->doff > (sizeof(struct tcphdr)>>2)) {
		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = READ_ONCE(req->ts_recent);
			if (tmp_opt.rcv_tsecr)
				tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
			/* We do not store the true stamp, but it is not
			 * required; it can be estimated (approximately)
			 * from other data.
			 */
			tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	/* Check for pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but the formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send an ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 *  describe the SYN-RECV state. All the description
		 *  is wrong, we cannot believe it and should
		 *  rely only on common sense and implementation
		 *  experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122.
		 *
		 * Note that even if there is new data in the SYN packet
		 * it will be thrown away too.
		 *
		 * Reset the timer after retransmitting the SYNACK, similar to
		 * the idea of fast retransmit in recovery.
		 */
		if (!tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time) &&

		    !inet_rtx_syn_ack(sk, req)) {
			unsigned long expires = jiffies;

			expires += reqsk_timeout(req, TCP_RTO_MAX);
			if (!fastopen)
				mod_timer_pending(&req->rsk_timer, expires);
			else
				req->rsk_timer.expires = expires;
		}
		return NULL;
	}
	/* Further reproduces the section "SEGMENT ARRIVES"
	   for state SYN-RECEIVED of RFC793.
	   It is broken, however: it fails only
	   when SYNs are crossed.

	   You would think that SYN crossing is impossible here, since
	   we should have a SYN_SENT socket (from connect()) on our end,
	   but this is not true if the crossed SYNs were sent to both
	   ends by a malicious third party.  We must defend against this,
	   and to do that we first verify the ACK (as per RFC793, page
	   36) and reset if it is invalid.  Is this a true full defense?
	   To convince ourselves, let us consider a way in which the ACK
	   test can still pass in this 'malicious crossed SYNs' case.
	   Malicious sender sends identical SYNs (and thus identical sequence
	   numbers) to both A and B:

		A: gets SYN, seq=7
		B: gets SYN, seq=7

	   By our good fortune, both A and B select the same initial
	   send sequence number of seven :-)

		A: sends SYN|ACK, seq=7, ack_seq=8
		B: sends SYN|ACK, seq=7, ack_seq=8

	   So we are now A eating this SYN|ACK, ACK test passes.  So
	   does sequence test, SYN is truncated, and thus we consider
	   it a bare ACK.

	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
	   bare ACK.  Otherwise, we create an established connection.  Both
	   ends (listening sockets) accept the new incoming connection and try
	   to talk to each other. 8-)

	   Note: This case is both harmless, and rare.  The probability is about
	   the same as us discovering intelligent life on another planet tomorrow.

	   But in general, we should (the RFC lies!) accept the ACK of our
	   SYNACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not, hence we do not either.

	   Note that the case is absolutely generic:
	   we cannot optimize anything here without
	   violating protocol. All the checks must be made
	   before an attempt to create the socket.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 *                  and the incoming segment acknowledges something not yet
	 *                  sent (the segment carries an unacceptable ACK) ...
	 *                  a reset is sent."
	 *
	 * Invalid ACK: reset will be sent by listening socket.
	 * Note that the ACK validity check for a Fast Open socket is done
	 * elsewhere and is checked directly against the child socket rather
	 * than req because user data may have been sent out.
	 */
	if ((flg & TCP_FLAG_ACK) && !fastopen &&
	    (TCP_SKB_CB(skb)->ack_seq !=
	     tcp_rsk(req)->snt_isn + 1))
		return sk;

	/* Also, it would not be such a bad idea to check rcv_tsecr, which
	 * is essentially an ACK extension; too early or too late values
	 * should cause a reset in unsynchronized states.
	 */

	/* RFC793: "first check sequence number". */

	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  tcp_rsk(req)->rcv_nxt,
					  tcp_rsk(req)->rcv_nxt +
					  tcp_synack_window(req))) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST) &&
		    !tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time))
			req->rsk_ops->send_ack(sk, skb, req);
		if (paws_reject)
			NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	/* TODO: We probably should defer ts_recent change once
	 * we take ownership of @req.
	 */
	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
		WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval);

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at tcp_rsk(req)->rcv_isn + 1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
		goto embryonic_reset;
	}

	/* ACK sequence verified above, just make sure ACK is
	 * set.  If ACK not set, just silently drop the packet.
	 *
	 * XXX (TFO) - if we ever allow "data after SYN", the
	 * following check needs to be removed.
	 */
	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	/* For Fast Open no more processing is needed (sk is the
	 * child socket).
	 */
	if (fastopen)
		return sk;

	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
	if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		inet_rsk(req)->acked = 1;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
		return NULL;
	}

	/* OK, the ACK is valid: create the big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE THE SOCKET TO
	 * ESTABLISHED STATE. If it is dropped after the
	 * socket is created, expect trouble.
	 */
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
							 req, &own_req);
	if (!child)
		goto listen_overflow;

	if (own_req && rsk_drop_req(req)) {
		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
		inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
		return child;
	}

	sock_rps_save_rxhash(child, skb);
	tcp_synack_rtt_meas(child, req);
	*req_stolen = !own_req;
	return inet_csk_complete_hashdance(sk, child, req, own_req);

listen_overflow:
	if (sk != req->rsk_listener)
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);

	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	if (!(flg & TCP_FLAG_RST)) {
		/* Received a bad SYN pkt - for TFO we try not to reset
		 * the local connection unless it's really necessary, to
		 * avoid becoming vulnerable to outside attacks aiming at
		 * resetting legit local connections.
		 */
		req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN);
	} else if (fastopen) { /* received a valid RST pkt */
		reqsk_fastopen_remove(sk, req, true);
		tcp_reset(sk, skb);
	}
	if (!fastopen) {
		bool unlinked = inet_csk_reqsk_queue_drop(sk, req);

		if (unlinked)
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
		*req_stolen = !unlinked;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_check_req);

/*
 * Queue the segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 *
 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
 * when entering. But other states are possible due to a race condition
 * where, after __inet_lookup_established() fails but before the listener
 * lock is obtained, other packets cause the same connection to
 * be created.
 */
enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
				       struct sk_buff *skb)
	__releases(&((child)->sk_lock.slock))
{
	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
	int state = child->sk_state;

	/* record sk_napi_id and sk_rx_queue_mapping of child. */
	sk_mark_napi_id_set(child, skb);

	tcp_segs_in(tcp_sk(child), skb);
	if (!sock_owned_by_user(child)) {
		reason = tcp_rcv_state_process(child, skb);
		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent);
	} else {
		/* Alas, it is possible again, because we do the lookup
		 * in the main socket hash table and the lock on the
		 * listening socket does not protect us any further.
		 */
		__sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return reason;
}
EXPORT_SYMBOL(tcp_child_process);