// SPDX-License-Identifier: GPL-2.0
/*
 * To speed up listener socket lookup, create an array to store all sockets
 * listening on the same port.  This allows a decision to be made after finding
 * the first socket.  An optional BPF program can also be configured for
 * selecting the socket index from the array of available sockets.
 */
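
/* Illustrative userspace sketch (not part of this kernel file): each
 * listener opts into the group with SO_REUSEPORT before bind(), and one
 * of them may attach a selector program with SO_ATTACH_REUSEPORT_EBPF
 * (or SO_ATTACH_REUSEPORT_CBPF).  Error handling omitted; addr and
 * prog_fd are assumed to be set up elsewhere.
 *
 *	int one = 1;
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
 *	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *	listen(fd, 128);
 *
 *	(optional) setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
 *			      &prog_fd, sizeof(prog_fd));
 */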

#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
#include <linux/filter.h>
#include <linux/rcupdate.h>

#define INIT_SOCKS 128

DEFINE_SPINLOCK(reuseport_lock);

static DEFINE_IDA(reuseport_ida);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
			       struct sock_reuseport *reuse, bool bind_inany);

void reuseport_has_conns_set(struct sock *sk)
{
	struct sock_reuseport *reuse;

	if (!rcu_access_pointer(sk->sk_reuseport_cb))
		return;

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	if (likely(reuse))
		reuse->has_conns = 1;
	spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_has_conns_set);

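/* reuse->incoming_cpu counts how many sockets in the group currently
 * have sk_incoming_cpu set (>= 0).  While it is zero, the hash-based
 * lookup in reuseport_select_sock_by_hash() can skip the per-socket
 * CPU comparison entirely.
 */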
static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse)
{
	/* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
	WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1);
}

static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse)
{
	/* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
	WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1);
}

static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
{
	if (sk->sk_incoming_cpu >= 0)
		__reuseport_get_incoming_cpu(reuse);
}

static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
{
	if (sk->sk_incoming_cpu >= 0)
		__reuseport_put_incoming_cpu(reuse);
}

void reuseport_update_incoming_cpu(struct sock *sk, int val)
{
	struct sock_reuseport *reuse;
	int old_sk_incoming_cpu;

	if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) {
		/* Paired with READ_ONCE() in sk_incoming_cpu_update()
		 * and compute_score().
		 */
		WRITE_ONCE(sk->sk_incoming_cpu, val);
		return;
	}

	spin_lock_bh(&reuseport_lock);

	/* This must be done under reuseport_lock to avoid a race with
	 * reuseport_grow(), which accesses sk->sk_incoming_cpu without
	 * lock_sock() when detaching a shutdown()ed sk.
	 *
	 * Paired with READ_ONCE() in reuseport_select_sock_by_hash().
	 */
	old_sk_incoming_cpu = sk->sk_incoming_cpu;
	WRITE_ONCE(sk->sk_incoming_cpu, val);

	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuseport_grow() has detached a closed sk. */
	if (!reuse)
		goto out;

	if (old_sk_incoming_cpu < 0 && val >= 0)
		__reuseport_get_incoming_cpu(reuse);
	else if (old_sk_incoming_cpu >= 0 && val < 0)
		__reuseport_put_incoming_cpu(reuse);

out:
	spin_unlock_bh(&reuseport_lock);
}

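/* socks[] keeps the listening sockets packed at the front of the array
 * ([0, num_socks)) and the closed (shutdown()ed) sockets packed at the
 * back ([max_socks - num_closed_socks, max_socks)).  Return sk's slot
 * in the requested section, or -1 if it is not there.
 */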
static int reuseport_sock_index(struct sock *sk,
				const struct sock_reuseport *reuse,
				bool closed)
{
	int left, right;

	if (!closed) {
		left = 0;
		right = reuse->num_socks;
	} else {
		left = reuse->max_socks - reuse->num_closed_socks;
		right = reuse->max_socks;
	}

	for (; left < right; left++)
		if (reuse->socks[left] == sk)
			return left;
	return -1;
}

static void __reuseport_add_sock(struct sock *sk,
				 struct sock_reuseport *reuse)
{
	reuse->socks[reuse->num_socks] = sk;
	/* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
	smp_wmb();
	reuse->num_socks++;
	reuseport_get_incoming_cpu(sk, reuse);
}

static bool __reuseport_detach_sock(struct sock *sk,
				    struct sock_reuseport *reuse)
{
	int i = reuseport_sock_index(sk, reuse, false);

	if (i == -1)
		return false;

	reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
	reuse->num_socks--;
	reuseport_put_incoming_cpu(sk, reuse);

	return true;
}

static void __reuseport_add_closed_sock(struct sock *sk,
					struct sock_reuseport *reuse)
{
	reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
	reuseport_get_incoming_cpu(sk, reuse);
}

static bool __reuseport_detach_closed_sock(struct sock *sk,
					   struct sock_reuseport *reuse)
{
	int i = reuseport_sock_index(sk, reuse, true);

	if (i == -1)
		return false;

	reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
	/* paired with READ_ONCE() in inet_csk_bind_conflict() */
	WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
	reuseport_put_incoming_cpu(sk, reuse);

	return true;
}

static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
	struct sock_reuseport *reuse;

	reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC);
	if (!reuse)
		return NULL;

	reuse->max_socks = max_socks;

	RCU_INIT_POINTER(reuse->prog, NULL);
	return reuse;
}

int reuseport_alloc(struct sock *sk, bool bind_inany)
{
	struct sock_reuseport *reuse;
	int id, ret = 0;

	/* bh lock used since this function call may precede hlist lock in
	 * soft irq of receive path or setsockopt from process context
	 */
	spin_lock_bh(&reuseport_lock);

	/* Allocation attempts can occur concurrently via the setsockopt path
	 * and the bind/hash path.  Nothing to do when we lose the race.
	 */
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	if (reuse) {
		if (reuse->num_closed_socks) {
			/* sk was shutdown()ed before */
			ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
			goto out;
		}

		/* Only set reuse->bind_inany if bind_inany is true.
		 * Otherwise, it would overwrite the reuse->bind_inany
		 * that was set by the bind/hash path.
		 */
		if (bind_inany)
			reuse->bind_inany = bind_inany;
		goto out;
	}

	reuse = __reuseport_alloc(INIT_SOCKS);
	if (!reuse) {
		ret = -ENOMEM;
		goto out;
	}

	id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
	if (id < 0) {
		kfree(reuse);
		ret = id;
		goto out;
	}

	reuse->reuseport_id = id;
	reuse->bind_inany = bind_inany;
	reuse->socks[0] = sk;
	reuse->num_socks = 1;
	reuseport_get_incoming_cpu(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

out:
	spin_unlock_bh(&reuseport_lock);

	return ret;
}
EXPORT_SYMBOL(reuseport_alloc);

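/* Double the capacity of socks[].  If the array is already at the
 * U16_MAX limit, try to make room by dropping one closed socket
 * instead; if even that is not possible, return NULL and let the
 * caller fail with -ENOMEM.
 */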
static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
	struct sock_reuseport *more_reuse;
	u32 more_socks_size, i;

	more_socks_size = reuse->max_socks * 2U;
	if (more_socks_size > U16_MAX) {
		if (reuse->num_closed_socks) {
			/* Make room by removing a closed sk.
			 * The child has already been migrated.
			 * Only reqsk left at this point.
			 */
			struct sock *sk;

			sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
			RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
			__reuseport_detach_closed_sock(sk, reuse);

			return reuse;
		}

		return NULL;
	}

	more_reuse = __reuseport_alloc(more_socks_size);
	if (!more_reuse)
		return NULL;

	more_reuse->num_socks = reuse->num_socks;
	more_reuse->num_closed_socks = reuse->num_closed_socks;
	more_reuse->prog = reuse->prog;
	more_reuse->reuseport_id = reuse->reuseport_id;
	more_reuse->bind_inany = reuse->bind_inany;
	more_reuse->has_conns = reuse->has_conns;
	more_reuse->incoming_cpu = reuse->incoming_cpu;

	memcpy(more_reuse->socks, reuse->socks,
	       reuse->num_socks * sizeof(struct sock *));
	memcpy(more_reuse->socks +
	       (more_reuse->max_socks - more_reuse->num_closed_socks),
	       reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
	       reuse->num_closed_socks * sizeof(struct sock *));
	more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);

	for (i = 0; i < reuse->max_socks; ++i)
		rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
				   more_reuse);

	/* Note: we use kfree_rcu here instead of reuseport_free_rcu so
	 * that reuse and more_reuse can temporarily share a reference
	 * to prog.
	 */
	kfree_rcu(reuse, rcu);
	return more_reuse;
}

static void reuseport_free_rcu(struct rcu_head *head)
{
	struct sock_reuseport *reuse;

	reuse = container_of(head, struct sock_reuseport, rcu);
	sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
	ida_free(&reuseport_ida, reuse->reuseport_id);
	kfree(reuse);
}

/**
 *  reuseport_add_sock - Add a socket to the reuseport group of another.
 *  @sk:  New socket to add to the group.
 *  @sk2: Socket belonging to the existing reuseport group.
 *  @bind_inany: Whether or not the group is bound to a local INANY address.
 *
 *  May return ENOMEM and not add socket to group under memory pressure.
 */
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
	struct sock_reuseport *old_reuse, *reuse;

	if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
		int err = reuseport_alloc(sk2, bind_inany);

		if (err)
			return err;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					      lockdep_is_held(&reuseport_lock));
	if (old_reuse && old_reuse->num_closed_socks) {
		/* sk was shutdown()ed before */
		int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);

		spin_unlock_bh(&reuseport_lock);
		return err;
	}

	if (old_reuse && old_reuse->num_socks != 1) {
		spin_unlock_bh(&reuseport_lock);
		return -EBUSY;
	}

	if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
		reuse = reuseport_grow(reuse);
		if (!reuse) {
			spin_unlock_bh(&reuseport_lock);
			return -ENOMEM;
		}
	}

	__reuseport_add_sock(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	spin_unlock_bh(&reuseport_lock);

	if (old_reuse)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);
	return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);

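/* Re-attach a shutdown()ed sk that is sitting in the closed section of
 * @old_reuse: either move it back into the listening section of the
 * same group, or migrate it into @reuse, allocating a fresh group when
 * @reuse is NULL.
 */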
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
			       struct sock_reuseport *reuse, bool bind_inany)
{
	if (old_reuse == reuse) {
		/* If sk was in the same reuseport group, just pop sk out of
		 * the closed section and push sk into the listening section.
		 */
		__reuseport_detach_closed_sock(sk, old_reuse);
		__reuseport_add_sock(sk, old_reuse);
		return 0;
	}

	if (!reuse) {
		/* In bind()/listen() path, we cannot carry over the eBPF prog
		 * for the shutdown()ed socket. In setsockopt() path, we should
		 * not change the eBPF prog of listening sockets by attaching a
		 * prog to the shutdown()ed socket. Thus, we will allocate a new
		 * reuseport group and detach sk from the old group.
		 */
		int id;

		reuse = __reuseport_alloc(INIT_SOCKS);
		if (!reuse)
			return -ENOMEM;

		id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
		if (id < 0) {
			kfree(reuse);
			return id;
		}

		reuse->reuseport_id = id;
		reuse->bind_inany = bind_inany;
	} else {
		/* Move sk from the old group to the new one if
		 * - all the other listeners in the old group were close()d or
		 *   shutdown()ed, and then sk2 has listen()ed on the same port
		 * OR
		 * - sk listen()ed without bind() (or with autobind), was
		 *   shutdown()ed, and then listen()s on another port which
		 *   sk2 listen()s on.
		 */
		if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
			reuse = reuseport_grow(reuse);
			if (!reuse)
				return -ENOMEM;
		}
	}

	__reuseport_detach_closed_sock(sk, old_reuse);
	__reuseport_add_sock(sk, reuse);
	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);

	if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);

	return 0;
}

void reuseport_detach_sock(struct sock *sk)
{
	struct sock_reuseport *reuse;

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuseport_grow() has detached a closed sk */
	if (!reuse)
		goto out;

	/* Notify the bpf side. The sk may be added to a sockarray
	 * map. If so, sockarray logic will remove it from the map.
	 *
	 * Other bpf map types that work with reuseport, like sockmap,
	 * don't need an explicit callback from here. They override sk
	 * unhash/close ops to remove the sk from the map before we
	 * get to this point.
	 */
	bpf_sk_reuseport_detach(sk);

	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);

	if (!__reuseport_detach_closed_sock(sk, reuse))
		__reuseport_detach_sock(sk, reuse);

	if (reuse->num_socks + reuse->num_closed_socks == 0)
		call_rcu(&reuse->rcu, reuseport_free_rcu);

out:
	spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);

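/* Called when a TCP listener stops listening.  If request migration is
 * possible (the net.ipv4.tcp_migrate_req sysctl is enabled or a
 * BPF_SK_REUSEPORT_SELECT_OR_MIGRATE prog is attached), keep sk in the
 * group's closed section so that its child sockets can still be
 * migrated; otherwise, detach it immediately.
 */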
void reuseport_stop_listen_sock(struct sock *sk)
{
	if (sk->sk_protocol == IPPROTO_TCP) {
		struct sock_reuseport *reuse;
		struct bpf_prog *prog;

		spin_lock_bh(&reuseport_lock);

		reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
						  lockdep_is_held(&reuseport_lock));
		prog = rcu_dereference_protected(reuse->prog,
						 lockdep_is_held(&reuseport_lock));

		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) ||
		    (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
			/* Migration capable, move sk from the listening section
			 * to the closed section.
			 */
			bpf_sk_reuseport_detach(sk);

			__reuseport_detach_sock(sk, reuse);
			__reuseport_add_closed_sock(sk, reuse);

			spin_unlock_bh(&reuseport_lock);
			return;
		}

		spin_unlock_bh(&reuseport_lock);
	}

	/* Not capable of migration, detach immediately */
	reuseport_detach_sock(sk);
}
EXPORT_SYMBOL(reuseport_stop_listen_sock);

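/* Run a non-BPF_PROG_TYPE_SK_REUSEPORT filter (e.g. one attached with
 * SO_ATTACH_REUSEPORT_CBPF): the program sees the skb with hdr_len
 * bytes of header temporarily pulled, and its return value is used as
 * an index into socks[]; an out-of-range index selects no socket.
 */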
static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
				   struct bpf_prog *prog, struct sk_buff *skb,
				   int hdr_len)
{
	struct sk_buff *nskb = NULL;
	u32 index;

	if (skb_shared(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return NULL;
		skb = nskb;
	}

	/* temporarily advance data past protocol header */
	if (!pskb_pull(skb, hdr_len)) {
		kfree_skb(nskb);
		return NULL;
	}
	index = bpf_prog_run_save_cb(prog, skb);
	__skb_push(skb, hdr_len);

	consume_skb(nskb);

	if (index >= socks)
		return NULL;

	return reuse->socks[index];
}

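/* Hash-based fallback: start at a hash-derived slot and walk the
 * listening section, skipping connected (TCP_ESTABLISHED) sockets.
 * When some socket in the group has CPU affinity set
 * (reuse->incoming_cpu != 0), prefer one whose sk_incoming_cpu matches
 * the current CPU; otherwise return the first eligible socket found.
 */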
static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
						  u32 hash, u16 num_socks)
{
	struct sock *first_valid_sk = NULL;
	int i, j;

	i = j = reciprocal_scale(hash, num_socks);
	do {
		struct sock *sk = reuse->socks[i];

		if (sk->sk_state != TCP_ESTABLISHED) {
			/* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */
			if (!READ_ONCE(reuse->incoming_cpu))
				return sk;

			/* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */
			if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
				return sk;

			if (!first_valid_sk)
				first_valid_sk = sk;
		}

		i++;
		if (i >= num_socks)
			i = 0;
	} while (i != j);

	return first_valid_sk;
}

/**
 *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: First socket in the group.
 *  @hash: When no BPF filter is available, use this hash to select.
 *  @skb: skb to run through BPF filter.
 *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
 *    the skb does not yet point at the payload, this parameter represents
 *    how far the pointer needs to advance to reach the payload.
 *  Returns a socket that should receive the packet (or NULL on error).
 */
struct sock *reuseport_select_sock(struct sock *sk,
				   u32 hash,
				   struct sk_buff *skb,
				   int hdr_len)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *prog;
	struct sock *sk2 = NULL;
	u16 socks;

	rcu_read_lock();
	reuse = rcu_dereference(sk->sk_reuseport_cb);

	/* if memory allocation failed or add call is not yet complete */
	if (!reuse)
		goto out;

	prog = rcu_dereference(reuse->prog);
	socks = READ_ONCE(reuse->num_socks);
	if (likely(socks)) {
		/* paired with smp_wmb() in __reuseport_add_sock() */
		smp_rmb();

		if (!prog || !skb)
			goto select_by_hash;

		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
		else
			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);

select_by_hash:
		/* no bpf or invalid bpf result: fall back to hash usage */
		if (!sk2)
			sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
	}

out:
	rcu_read_unlock();
	return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);

/**
 *  reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
 *  @sk: close()ed or shutdown()ed socket in the group.
 *  @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
 *    NEW_SYN_RECV request socket during 3WHS.
 *  @skb: skb to run through BPF filter.
 *  Returns a socket (with sk_refcnt +1) that should accept the child socket
 *  (or NULL on error).
 */
struct sock *reuseport_migrate_sock(struct sock *sk,
				    struct sock *migrating_sk,
				    struct sk_buff *skb)
{
	struct sock_reuseport *reuse;
	struct sock *nsk = NULL;
	bool allocated = false;
	struct bpf_prog *prog;
	u16 socks;
	u32 hash;

	rcu_read_lock();

	reuse = rcu_dereference(sk->sk_reuseport_cb);
	if (!reuse)
		goto out;

	socks = READ_ONCE(reuse->num_socks);
	if (unlikely(!socks))
		goto failure;

	/* paired with smp_wmb() in __reuseport_add_sock() */
	smp_rmb();

	hash = migrating_sk->sk_hash;
	prog = rcu_dereference(reuse->prog);
	if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req))
			goto select_by_hash;
		goto failure;
	}

	if (!skb) {
		skb = alloc_skb(0, GFP_ATOMIC);
		if (!skb)
			goto failure;
		allocated = true;
	}

	nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);

	if (allocated)
		kfree_skb(skb);

select_by_hash:
	if (!nsk)
		nsk = reuseport_select_sock_by_hash(reuse, hash, socks);

	if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
		nsk = NULL;
		goto failure;
	}

out:
	rcu_read_unlock();
	return nsk;

failure:
	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
	goto out;
}
EXPORT_SYMBOL(reuseport_migrate_sock);

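/* Attach @prog as the group's selector, replacing (and freeing) any
 * previously attached program.  An sk that is not yet hashed must at
 * least have SO_REUSEPORT set so that a group can be allocated for it
 * here.
 */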
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	if (sk_unhashed(sk)) {
		int err;

		if (!sk->sk_reuseport)
			return -EINVAL;

		err = reuseport_alloc(sk, false);
		if (err)
			return err;
	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		/* The socket wasn't bound with SO_REUSEPORT */
		return -EINVAL;
	}

	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	old_prog = rcu_dereference_protected(reuse->prog,
					     lockdep_is_held(&reuseport_lock));
	rcu_assign_pointer(reuse->prog, prog);
	spin_unlock_bh(&reuseport_lock);

	sk_reuseport_prog_free(old_prog);
	return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);

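/* Remove the group's selector program, if any.  Returns -ENOENT when
 * there is no program to remove, and -EINVAL when sk is not in a group
 * and does not have SO_REUSEPORT set.
 */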
int reuseport_detach_prog(struct sock *sk)
{
	struct sock_reuseport *reuse;
	struct bpf_prog *old_prog;

	old_prog = NULL;
	spin_lock_bh(&reuseport_lock);
	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));

	/* reuse must be checked after acquiring the reuseport_lock
	 * because reuseport_grow() can detach a closed sk.
	 */
	if (!reuse) {
		spin_unlock_bh(&reuseport_lock);
		return sk->sk_reuseport ? -ENOENT : -EINVAL;
	}

	if (sk_unhashed(sk) && reuse->num_closed_socks) {
		spin_unlock_bh(&reuseport_lock);
		return -ENOENT;
	}

	old_prog = rcu_replace_pointer(reuse->prog, old_prog,
				       lockdep_is_held(&reuseport_lock));
	spin_unlock_bh(&reuseport_lock);

	if (!old_prog)
		return -ENOENT;

	sk_reuseport_prog_free(old_prog);
	return 0;
}
EXPORT_SYMBOL(reuseport_detach_prog);