1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * INET		An implementation of the TCP/IP protocol suite for the LINUX
4   *		operating system.  INET is implemented using the  BSD Socket
5   *		interface as the means of communication with the user level.
6   *
7   *		Generic socket support routines. Memory allocators, socket lock/release
8   *		handler for protocols to use and generic option handler.
9   *
10   * Authors:	Ross Biro
11   *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12   *		Florian La Roche, <flla@stud.uni-sb.de>
13   *		Alan Cox, <A.Cox@swansea.ac.uk>
14   *
15   * Fixes:
16   *		Alan Cox	: 	Numerous verify_area() problems
17   *		Alan Cox	:	Connecting on a connecting socket
18   *					now returns an error for tcp.
19   *		Alan Cox	:	sock->protocol is set correctly.
20   *					and is not sometimes left as 0.
21   *		Alan Cox	:	connect handles icmp errors on a
22   *					connect properly. Unfortunately there
23   *					is a restart syscall nasty there. I
24   *					can't match BSD without hacking the C
25   *					library. Ideas urgently sought!
26   *		Alan Cox	:	Disallow bind() to addresses that are
27   *					not ours - especially broadcast ones!!
28   *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29   *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30   *					instead they leave that for the DESTROY timer.
31   *		Alan Cox	:	Clean up error flag in accept
32   *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33   *					was buggy. Put a remove_sock() in the handler
34   *					for memory when we hit 0. Also altered the timer
35   *					code. The ACK stuff can wait and needs major
36   *					TCP layer surgery.
37   *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38   *					and fixed timer/inet_bh race.
39   *		Alan Cox	:	Added zapped flag for TCP
40   *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41   *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42   *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43   *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44   *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45   *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46   *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47   *	Pauline Middelink	:	identd support
48   *		Alan Cox	:	Fixed connect() taking signals I think.
49   *		Alan Cox	:	SO_LINGER supported
50   *		Alan Cox	:	Error reporting fixes
51   *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52   *		Alan Cox	:	inet sockets don't set sk->type!
53   *		Alan Cox	:	Split socket option code
54   *		Alan Cox	:	Callbacks
55   *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56   *		Alex		:	Removed restriction on inet fioctl
57   *		Alan Cox	:	Splitting INET from NET core
58   *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59   *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60   *		Alan Cox	:	Split IP from generic code
61   *		Alan Cox	:	New kfree_skbmem()
62   *		Alan Cox	:	Make SO_DEBUG superuser only.
63   *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64   *					(compatibility fix)
65   *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66   *		Alan Cox	:	Allocator for a socket is settable.
67   *		Alan Cox	:	SO_ERROR includes soft errors.
68   *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69   *		Alan Cox	: 	Generic socket allocation to make hooks
70   *					easier (suggested by Craig Metz).
71   *		Michael Pall	:	SO_ERROR returns positive errno again
72   *              Steve Whitehouse:       Added default destructor to free
73   *                                      protocol private data.
74   *              Steve Whitehouse:       Added various other default routines
75   *                                      common to several socket families.
76   *              Chris Evans     :       Call suser() check last on F_SETOWN
77   *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78   *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79   *		Andi Kleen	:	Fix write_space callback
80   *		Chris Evans	:	Security fixes - signedness again
81   *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82   *
83   * To Fix:
84   */
85  
86  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87  
88  #include <linux/unaligned.h>
89  #include <linux/capability.h>
90  #include <linux/errno.h>
91  #include <linux/errqueue.h>
92  #include <linux/types.h>
93  #include <linux/socket.h>
94  #include <linux/in.h>
95  #include <linux/kernel.h>
96  #include <linux/module.h>
97  #include <linux/proc_fs.h>
98  #include <linux/seq_file.h>
99  #include <linux/sched.h>
100  #include <linux/sched/mm.h>
101  #include <linux/timer.h>
102  #include <linux/string.h>
103  #include <linux/sockios.h>
104  #include <linux/net.h>
105  #include <linux/mm.h>
106  #include <linux/slab.h>
107  #include <linux/interrupt.h>
108  #include <linux/poll.h>
109  #include <linux/tcp.h>
110  #include <linux/udp.h>
111  #include <linux/init.h>
112  #include <linux/highmem.h>
113  #include <linux/user_namespace.h>
114  #include <linux/static_key.h>
115  #include <linux/memcontrol.h>
116  #include <linux/prefetch.h>
117  #include <linux/compat.h>
118  #include <linux/mroute.h>
119  #include <linux/mroute6.h>
120  #include <linux/icmpv6.h>
121  
122  #include <linux/uaccess.h>
123  
124  #include <linux/netdevice.h>
125  #include <net/protocol.h>
126  #include <linux/skbuff.h>
127  #include <linux/skbuff_ref.h>
128  #include <net/net_namespace.h>
129  #include <net/request_sock.h>
130  #include <net/sock.h>
131  #include <net/proto_memory.h>
132  #include <linux/net_tstamp.h>
133  #include <net/xfrm.h>
134  #include <linux/ipsec.h>
135  #include <net/cls_cgroup.h>
136  #include <net/netprio_cgroup.h>
137  #include <linux/sock_diag.h>
138  
139  #include <linux/filter.h>
140  #include <net/sock_reuseport.h>
141  #include <net/bpf_sk_storage.h>
142  
143  #include <trace/events/sock.h>
144  
145  #include <net/tcp.h>
146  #include <net/busy_poll.h>
147  #include <net/phonet/phonet.h>
148  
149  #include <linux/ethtool.h>
150  
151  #include "dev.h"
152  
153  static DEFINE_MUTEX(proto_list_mutex);
154  static LIST_HEAD(proto_list);
155  
156  static void sock_def_write_space_wfree(struct sock *sk);
157  static void sock_def_write_space(struct sock *sk);
158  
159  /**
160   * sk_ns_capable - General socket capability test
161   * @sk: Socket to use a capability on or through
162   * @user_ns: The user namespace of the capability to use
163   * @cap: The capability to use
164   *
165   * Test to see if the opener of the socket had the capability @cap when
166   * the socket was created and if the current process has the capability
167   * @cap in the user namespace @user_ns.
168   */
169  bool sk_ns_capable(const struct sock *sk,
170  		   struct user_namespace *user_ns, int cap)
171  {
172  	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173  		ns_capable(user_ns, cap);
174  }
175  EXPORT_SYMBOL(sk_ns_capable);
176  
177  /**
178   * sk_capable - Socket global capability test
179   * @sk: Socket to use a capability on or through
180   * @cap: The global capability to use
181   *
182   * Test to see if the opener of the socket had the capability @cap when
183   * the socket was created and if the current process has the capability
184   * @cap in all user namespaces.
185   */
186  bool sk_capable(const struct sock *sk, int cap)
187  {
188  	return sk_ns_capable(sk, &init_user_ns, cap);
189  }
190  EXPORT_SYMBOL(sk_capable);
191  
192  /**
193   * sk_net_capable - Network namespace socket capability test
194   * @sk: Socket to use a capability on or through
195   * @cap: The capability to use
196   *
197   * Test to see if the opener of the socket had the capability @cap when the
198   * socket was created and if the current process has the capability @cap over
199   * the network namespace the socket is a member of.
200   */
201  bool sk_net_capable(const struct sock *sk, int cap)
202  {
203  	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
204  }
205  EXPORT_SYMBOL(sk_net_capable);
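
/*
 * Example (illustrative sketch, not an in-tree caller): a protocol's
 * privileged option handler would typically gate on the socket opener's
 * credentials like this; example_set_debug() is a hypothetical helper.
 *
 *	static int example_set_debug(struct sock *sk, bool on)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sock_valbool_flag(sk, SOCK_DBG, on);
 *		return 0;
 *	}
 *
 * sk_capable() is the stricter variant (capability in the initial user
 * namespace); sk_ns_capable() lets the caller choose the namespace.
 */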
206  
207  /*
208   * Each address family might have different locking rules, so we have
209   * one slock key per address family and separate keys for internal and
210   * userspace sockets.
211   */
212  static struct lock_class_key af_family_keys[AF_MAX];
213  static struct lock_class_key af_family_kern_keys[AF_MAX];
214  static struct lock_class_key af_family_slock_keys[AF_MAX];
215  static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
216  
217  /*
218   * Make lock validator output more readable. (we pre-construct these
219   * strings at build time, so that runtime initialization of socket
220   * locks is fast):
221   */
222  
223  #define _sock_locks(x)						  \
224    x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
225    x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
226    x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
227    x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
228    x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
229    x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
230    x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
231    x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
232    x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
233    x "27"       ,	x "28"          ,	x "AF_CAN"      , \
234    x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
235    x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
236    x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
237    x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
238    x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
239    x "AF_MCTP"  , \
240    x "AF_MAX"
241  
242  static const char *const af_family_key_strings[AF_MAX+1] = {
243  	_sock_locks("sk_lock-")
244  };
245  static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246  	_sock_locks("slock-")
247  };
248  static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249  	_sock_locks("clock-")
250  };
251  
252  static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253  	_sock_locks("k-sk_lock-")
254  };
255  static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256  	_sock_locks("k-slock-")
257  };
258  static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259  	_sock_locks("k-clock-")
260  };
261  static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262  	_sock_locks("rlock-")
263  };
264  static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265  	_sock_locks("wlock-")
266  };
267  static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268  	_sock_locks("elock-")
269  };
270  
271  /*
272   * sk_callback_lock and sk queues locking rules are per-address-family,
273   * so split the lock classes by using a per-AF key:
274   */
275  static struct lock_class_key af_callback_keys[AF_MAX];
276  static struct lock_class_key af_rlock_keys[AF_MAX];
277  static struct lock_class_key af_wlock_keys[AF_MAX];
278  static struct lock_class_key af_elock_keys[AF_MAX];
279  static struct lock_class_key af_kern_callback_keys[AF_MAX];
280  
281  /* Run time adjustable parameters. */
282  __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283  EXPORT_SYMBOL(sysctl_wmem_max);
284  __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285  EXPORT_SYMBOL(sysctl_rmem_max);
286  __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287  __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288  
289  int sysctl_tstamp_allow_data __read_mostly = 1;
290  
291  DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
292  EXPORT_SYMBOL_GPL(memalloc_socks_key);
293  
294  /**
295   * sk_set_memalloc - sets %SOCK_MEMALLOC
296   * @sk: socket to set it on
297   *
298   * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
299   * It's the responsibility of the admin to adjust min_free_kbytes
300   * to meet the requirements.
301   */
302  void sk_set_memalloc(struct sock *sk)
303  {
304  	sock_set_flag(sk, SOCK_MEMALLOC);
305  	sk->sk_allocation |= __GFP_MEMALLOC;
306  	static_branch_inc(&memalloc_socks_key);
307  }
308  EXPORT_SYMBOL_GPL(sk_set_memalloc);
309  
310  void sk_clear_memalloc(struct sock *sk)
311  {
312  	sock_reset_flag(sk, SOCK_MEMALLOC);
313  	sk->sk_allocation &= ~__GFP_MEMALLOC;
314  	static_branch_dec(&memalloc_socks_key);
315  
316  	/*
317  	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
318  	 * progress of swapping. SOCK_MEMALLOC may be cleared while
319  	 * it has rmem allocations due to the last swapfile being deactivated
320  	 * but there is a risk that the socket is unusable due to exceeding
321  	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
322  	 */
323  	sk_mem_reclaim(sk);
324  }
325  EXPORT_SYMBOL_GPL(sk_clear_memalloc);
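
/*
 * Example (illustrative sketch): a swap-over-network style user marks its
 * kernel socket with SOCK_MEMALLOC while a swapfile is active and clears
 * it again on deactivation; example_swap_activate() is hypothetical.
 *
 *	static void example_swap_activate(struct sock *sk, bool active)
 *	{
 *		if (active)
 *			sk_set_memalloc(sk);
 *		else
 *			sk_clear_memalloc(sk);
 *	}
 */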
326  
327  int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328  {
329  	int ret;
330  	unsigned int noreclaim_flag;
331  
332  	/* these should have been dropped before queueing */
333  	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
334  
335  	noreclaim_flag = memalloc_noreclaim_save();
336  	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
337  				 tcp_v6_do_rcv,
338  				 tcp_v4_do_rcv,
339  				 sk, skb);
340  	memalloc_noreclaim_restore(noreclaim_flag);
341  
342  	return ret;
343  }
344  EXPORT_SYMBOL(__sk_backlog_rcv);
345  
346  void sk_error_report(struct sock *sk)
347  {
348  	sk->sk_error_report(sk);
349  
350  	switch (sk->sk_family) {
351  	case AF_INET:
352  		fallthrough;
353  	case AF_INET6:
354  		trace_inet_sk_error_report(sk);
355  		break;
356  	default:
357  		break;
358  	}
359  }
360  EXPORT_SYMBOL(sk_error_report);
361  
362  int sock_get_timeout(long timeo, void *optval, bool old_timeval)
363  {
364  	struct __kernel_sock_timeval tv;
365  
366  	if (timeo == MAX_SCHEDULE_TIMEOUT) {
367  		tv.tv_sec = 0;
368  		tv.tv_usec = 0;
369  	} else {
370  		tv.tv_sec = timeo / HZ;
371  		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372  	}
373  
374  	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
375  		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
376  		*(struct old_timeval32 *)optval = tv32;
377  		return sizeof(tv32);
378  	}
379  
380  	if (old_timeval) {
381  		struct __kernel_old_timeval old_tv;
382  		old_tv.tv_sec = tv.tv_sec;
383  		old_tv.tv_usec = tv.tv_usec;
384  		*(struct __kernel_old_timeval *)optval = old_tv;
385  		return sizeof(old_tv);
386  	}
387  
388  	*(struct __kernel_sock_timeval *)optval = tv;
389  	return sizeof(tv);
390  }
391  EXPORT_SYMBOL(sock_get_timeout);
392  
393  int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
394  			   sockptr_t optval, int optlen, bool old_timeval)
395  {
396  	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
397  		struct old_timeval32 tv32;
398  
399  		if (optlen < sizeof(tv32))
400  			return -EINVAL;
401  
402  		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
403  			return -EFAULT;
404  		tv->tv_sec = tv32.tv_sec;
405  		tv->tv_usec = tv32.tv_usec;
406  	} else if (old_timeval) {
407  		struct __kernel_old_timeval old_tv;
408  
409  		if (optlen < sizeof(old_tv))
410  			return -EINVAL;
411  		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
412  			return -EFAULT;
413  		tv->tv_sec = old_tv.tv_sec;
414  		tv->tv_usec = old_tv.tv_usec;
415  	} else {
416  		if (optlen < sizeof(*tv))
417  			return -EINVAL;
418  		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
419  			return -EFAULT;
420  	}
421  
422  	return 0;
423  }
424  EXPORT_SYMBOL(sock_copy_user_timeval);
425  
426  static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427  			    bool old_timeval)
428  {
429  	struct __kernel_sock_timeval tv;
430  	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
431  	long val;
432  
433  	if (err)
434  		return err;
435  
436  	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437  		return -EDOM;
438  
439  	if (tv.tv_sec < 0) {
440  		static int warned __read_mostly;
441  
442  		WRITE_ONCE(*timeo_p, 0);
443  		if (warned < 10 && net_ratelimit()) {
444  			warned++;
445  			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446  				__func__, current->comm, task_pid_nr(current));
447  		}
448  		return 0;
449  	}
450  	val = MAX_SCHEDULE_TIMEOUT;
451  	if ((tv.tv_sec || tv.tv_usec) &&
452  	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
453  		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
454  						    USEC_PER_SEC / HZ);
455  	WRITE_ONCE(*timeo_p, val);
456  	return 0;
457  }
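
/*
 * Worked example (assuming HZ == 1000): a timeout of
 * { .tv_sec = 2, .tv_usec = 500000 } is converted by sock_set_timeout()
 * to 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 2500 jiffies,
 * while { 0, 0 } maps to MAX_SCHEDULE_TIMEOUT (wait forever).
 */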
458  
459  static bool sock_needs_netstamp(const struct sock *sk)
460  {
461  	switch (sk->sk_family) {
462  	case AF_UNSPEC:
463  	case AF_UNIX:
464  		return false;
465  	default:
466  		return true;
467  	}
468  }
469  
470  static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
471  {
472  	if (sk->sk_flags & flags) {
473  		sk->sk_flags &= ~flags;
474  		if (sock_needs_netstamp(sk) &&
475  		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
476  			net_disable_timestamp();
477  	}
478  }
479  
480  
481  int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
482  {
483  	unsigned long flags;
484  	struct sk_buff_head *list = &sk->sk_receive_queue;
485  
486  	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
487  		atomic_inc(&sk->sk_drops);
488  		trace_sock_rcvqueue_full(sk, skb);
489  		return -ENOMEM;
490  	}
491  
492  	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
493  		atomic_inc(&sk->sk_drops);
494  		return -ENOBUFS;
495  	}
496  
497  	skb->dev = NULL;
498  	skb_set_owner_r(skb, sk);
499  
500  	/* we escape from the RCU-protected region, make sure we don't leak
501  	 * a non-refcounted dst
502  	 */
503  	skb_dst_force(skb);
504  
505  	spin_lock_irqsave(&list->lock, flags);
506  	sock_skb_set_dropcount(sk, skb);
507  	__skb_queue_tail(list, skb);
508  	spin_unlock_irqrestore(&list->lock, flags);
509  
510  	if (!sock_flag(sk, SOCK_DEAD))
511  		sk->sk_data_ready(sk);
512  	return 0;
513  }
514  EXPORT_SYMBOL(__sock_queue_rcv_skb);
515  
516  int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
517  			      enum skb_drop_reason *reason)
518  {
519  	enum skb_drop_reason drop_reason;
520  	int err;
521  
522  	err = sk_filter(sk, skb);
523  	if (err) {
524  		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
525  		goto out;
526  	}
527  	err = __sock_queue_rcv_skb(sk, skb);
528  	switch (err) {
529  	case -ENOMEM:
530  		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
531  		break;
532  	case -ENOBUFS:
533  		drop_reason = SKB_DROP_REASON_PROTO_MEM;
534  		break;
535  	default:
536  		drop_reason = SKB_NOT_DROPPED_YET;
537  		break;
538  	}
539  out:
540  	if (reason)
541  		*reason = drop_reason;
542  	return err;
543  }
544  EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
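
/*
 * Example (illustrative sketch of a protocol receive path; the function
 * name is hypothetical): queue the skb to the owning socket and report a
 * precise drop reason when that fails.
 *
 *	static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		enum skb_drop_reason reason;
 *
 *		if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
 *			kfree_skb_reason(skb, reason);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */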
545  
546  int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
547  		     const int nested, unsigned int trim_cap, bool refcounted)
548  {
549  	int rc = NET_RX_SUCCESS;
550  
551  	if (sk_filter_trim_cap(sk, skb, trim_cap))
552  		goto discard_and_relse;
553  
554  	skb->dev = NULL;
555  
556  	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
557  		atomic_inc(&sk->sk_drops);
558  		goto discard_and_relse;
559  	}
560  	if (nested)
561  		bh_lock_sock_nested(sk);
562  	else
563  		bh_lock_sock(sk);
564  	if (!sock_owned_by_user(sk)) {
565  		/*
566  		 * trylock + unlock semantics:
567  		 */
568  		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
569  
570  		rc = sk_backlog_rcv(sk, skb);
571  
572  		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
573  	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
574  		bh_unlock_sock(sk);
575  		atomic_inc(&sk->sk_drops);
576  		goto discard_and_relse;
577  	}
578  
579  	bh_unlock_sock(sk);
580  out:
581  	if (refcounted)
582  		sock_put(sk);
583  	return rc;
584  discard_and_relse:
585  	kfree_skb(skb);
586  	goto out;
587  }
588  EXPORT_SYMBOL(__sk_receive_skb);
589  
590  INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
591  							  u32));
592  INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
593  							   u32));
594  struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
595  {
596  	struct dst_entry *dst = __sk_dst_get(sk);
597  
598  	if (dst && dst->obsolete &&
599  	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
600  			       dst, cookie) == NULL) {
601  		sk_tx_queue_clear(sk);
602  		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
603  		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
604  		dst_release(dst);
605  		return NULL;
606  	}
607  
608  	return dst;
609  }
610  EXPORT_SYMBOL(__sk_dst_check);
611  
612  struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
613  {
614  	struct dst_entry *dst = sk_dst_get(sk);
615  
616  	if (dst && dst->obsolete &&
617  	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
618  			       dst, cookie) == NULL) {
619  		sk_dst_reset(sk);
620  		dst_release(dst);
621  		return NULL;
622  	}
623  
624  	return dst;
625  }
626  EXPORT_SYMBOL(sk_dst_check);
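
/*
 * Example (illustrative sketch): a transmit path revalidates its cached
 * route before use; a NULL return means the dst was obsolete and the
 * caller must do a fresh lookup and cache the result again, e.g. with
 * sk_dst_set(). example_route_and_cache() is a stand-in for the
 * protocol's own routing call.
 *
 *	struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *	if (!dst)
 *		dst = example_route_and_cache(sk);
 */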
627  
628  static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
629  {
630  	int ret = -ENOPROTOOPT;
631  #ifdef CONFIG_NETDEVICES
632  	struct net *net = sock_net(sk);
633  
634  	/* Sorry... */
635  	ret = -EPERM;
636  	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
637  		goto out;
638  
639  	ret = -EINVAL;
640  	if (ifindex < 0)
641  		goto out;
642  
643  	/* Paired with all READ_ONCE() done locklessly. */
644  	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
645  
646  	if (sk->sk_prot->rehash)
647  		sk->sk_prot->rehash(sk);
648  	sk_dst_reset(sk);
649  
650  	ret = 0;
651  
652  out:
653  #endif
654  
655  	return ret;
656  }
657  
658  int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
659  {
660  	int ret;
661  
662  	if (lock_sk)
663  		lock_sock(sk);
664  	ret = sock_bindtoindex_locked(sk, ifindex);
665  	if (lock_sk)
666  		release_sock(sk);
667  
668  	return ret;
669  }
670  EXPORT_SYMBOL(sock_bindtoindex);
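
/*
 * Example (illustrative sketch): a kernel socket user pins its socket to
 * one interface by ifindex; passing lock_sk == true lets the helper take
 * and release the socket lock itself.
 *
 *	err = sock_bindtoindex(sock->sk, ifindex, true);
 *	if (err)
 *		goto out_release;
 */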
671  
672  static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
673  {
674  	int ret = -ENOPROTOOPT;
675  #ifdef CONFIG_NETDEVICES
676  	struct net *net = sock_net(sk);
677  	char devname[IFNAMSIZ];
678  	int index;
679  
680  	ret = -EINVAL;
681  	if (optlen < 0)
682  		goto out;
683  
684  	/* Bind this socket to a particular device like "eth0",
685  	 * as specified in the passed interface name. If the
686  	 * name is "" or the option length is zero the socket
687  	 * is not bound.
688  	 */
689  	if (optlen > IFNAMSIZ - 1)
690  		optlen = IFNAMSIZ - 1;
691  	memset(devname, 0, sizeof(devname));
692  
693  	ret = -EFAULT;
694  	if (copy_from_sockptr(devname, optval, optlen))
695  		goto out;
696  
697  	index = 0;
698  	if (devname[0] != '\0') {
699  		struct net_device *dev;
700  
701  		rcu_read_lock();
702  		dev = dev_get_by_name_rcu(net, devname);
703  		if (dev)
704  			index = dev->ifindex;
705  		rcu_read_unlock();
706  		ret = -ENODEV;
707  		if (!dev)
708  			goto out;
709  	}
710  
711  	sockopt_lock_sock(sk);
712  	ret = sock_bindtoindex_locked(sk, index);
713  	sockopt_release_sock(sk);
714  out:
715  #endif
716  
717  	return ret;
718  }
719  
720  static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
721  				sockptr_t optlen, int len)
722  {
723  	int ret = -ENOPROTOOPT;
724  #ifdef CONFIG_NETDEVICES
725  	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
726  	struct net *net = sock_net(sk);
727  	char devname[IFNAMSIZ];
728  
729  	if (bound_dev_if == 0) {
730  		len = 0;
731  		goto zero;
732  	}
733  
734  	ret = -EINVAL;
735  	if (len < IFNAMSIZ)
736  		goto out;
737  
738  	ret = netdev_get_name(net, devname, bound_dev_if);
739  	if (ret)
740  		goto out;
741  
742  	len = strlen(devname) + 1;
743  
744  	ret = -EFAULT;
745  	if (copy_to_sockptr(optval, devname, len))
746  		goto out;
747  
748  zero:
749  	ret = -EFAULT;
750  	if (copy_to_sockptr(optlen, &len, sizeof(int)))
751  		goto out;
752  
753  	ret = 0;
754  
755  out:
756  #endif
757  
758  	return ret;
759  }
760  
761  bool sk_mc_loop(const struct sock *sk)
762  {
763  	if (dev_recursion_level())
764  		return false;
765  	if (!sk)
766  		return true;
767  	/* IPV6_ADDRFORM can change sk->sk_family under us. */
768  	switch (READ_ONCE(sk->sk_family)) {
769  	case AF_INET:
770  		return inet_test_bit(MC_LOOP, sk);
771  #if IS_ENABLED(CONFIG_IPV6)
772  	case AF_INET6:
773  		return inet6_test_bit(MC6_LOOP, sk);
774  #endif
775  	}
776  	WARN_ON_ONCE(1);
777  	return true;
778  }
779  EXPORT_SYMBOL(sk_mc_loop);
780  
781  void sock_set_reuseaddr(struct sock *sk)
782  {
783  	lock_sock(sk);
784  	sk->sk_reuse = SK_CAN_REUSE;
785  	release_sock(sk);
786  }
787  EXPORT_SYMBOL(sock_set_reuseaddr);
788  
789  void sock_set_reuseport(struct sock *sk)
790  {
791  	lock_sock(sk);
792  	sk->sk_reuseport = true;
793  	release_sock(sk);
794  }
795  EXPORT_SYMBOL(sock_set_reuseport);
796  
797  void sock_no_linger(struct sock *sk)
798  {
799  	lock_sock(sk);
800  	WRITE_ONCE(sk->sk_lingertime, 0);
801  	sock_set_flag(sk, SOCK_LINGER);
802  	release_sock(sk);
803  }
804  EXPORT_SYMBOL(sock_no_linger);
805  
806  void sock_set_priority(struct sock *sk, u32 priority)
807  {
808  	WRITE_ONCE(sk->sk_priority, priority);
809  }
810  EXPORT_SYMBOL(sock_set_priority);
811  
812  void sock_set_sndtimeo(struct sock *sk, s64 secs)
813  {
814  	lock_sock(sk);
815  	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
816  		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
817  	else
818  		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
819  	release_sock(sk);
820  }
821  EXPORT_SYMBOL(sock_set_sndtimeo);
822  
823  static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
824  {
825  	if (val)  {
826  		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
827  		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
828  		sock_set_flag(sk, SOCK_RCVTSTAMP);
829  		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
830  	} else {
831  		sock_reset_flag(sk, SOCK_RCVTSTAMP);
832  		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
833  	}
834  }
835  
836  void sock_enable_timestamps(struct sock *sk)
837  {
838  	lock_sock(sk);
839  	__sock_set_timestamps(sk, true, false, true);
840  	release_sock(sk);
841  }
842  EXPORT_SYMBOL(sock_enable_timestamps);
843  
844  void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
845  {
846  	switch (optname) {
847  	case SO_TIMESTAMP_OLD:
848  		__sock_set_timestamps(sk, valbool, false, false);
849  		break;
850  	case SO_TIMESTAMP_NEW:
851  		__sock_set_timestamps(sk, valbool, true, false);
852  		break;
853  	case SO_TIMESTAMPNS_OLD:
854  		__sock_set_timestamps(sk, valbool, false, true);
855  		break;
856  	case SO_TIMESTAMPNS_NEW:
857  		__sock_set_timestamps(sk, valbool, true, true);
858  		break;
859  	}
860  }
861  
862  static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
863  {
864  	struct net *net = sock_net(sk);
865  	struct net_device *dev = NULL;
866  	bool match = false;
867  	int *vclock_index;
868  	int i, num;
869  
870  	if (sk->sk_bound_dev_if)
871  		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
872  
873  	if (!dev) {
874  		pr_err("%s: socket is not bound to a device\n", __func__);
875  		return -EOPNOTSUPP;
876  	}
877  
878  	num = ethtool_get_phc_vclocks(dev, &vclock_index);
879  	dev_put(dev);
880  
881  	for (i = 0; i < num; i++) {
882  		if (*(vclock_index + i) == phc_index) {
883  			match = true;
884  			break;
885  		}
886  	}
887  
888  	if (num > 0)
889  		kfree(vclock_index);
890  
891  	if (!match)
892  		return -EINVAL;
893  
894  	WRITE_ONCE(sk->sk_bind_phc, phc_index);
895  
896  	return 0;
897  }
898  
899  int sock_set_timestamping(struct sock *sk, int optname,
900  			  struct so_timestamping timestamping)
901  {
902  	int val = timestamping.flags;
903  	int ret;
904  
905  	if (val & ~SOF_TIMESTAMPING_MASK)
906  		return -EINVAL;
907  
908  	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
909  	    !(val & SOF_TIMESTAMPING_OPT_ID))
910  		return -EINVAL;
911  
912  	if (val & SOF_TIMESTAMPING_OPT_ID &&
913  	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
914  		if (sk_is_tcp(sk)) {
915  			if ((1 << sk->sk_state) &
916  			    (TCPF_CLOSE | TCPF_LISTEN))
917  				return -EINVAL;
918  			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
919  				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
920  			else
921  				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
922  		} else {
923  			atomic_set(&sk->sk_tskey, 0);
924  		}
925  	}
926  
927  	if (val & SOF_TIMESTAMPING_OPT_STATS &&
928  	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
929  		return -EINVAL;
930  
931  	if (val & SOF_TIMESTAMPING_BIND_PHC) {
932  		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
933  		if (ret)
934  			return ret;
935  	}
936  
937  	WRITE_ONCE(sk->sk_tsflags, val);
938  	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
939  
940  	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
941  		sock_enable_timestamp(sk,
942  				      SOCK_TIMESTAMPING_RX_SOFTWARE);
943  	else
944  		sock_disable_timestamp(sk,
945  				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
946  	return 0;
947  }
948  
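/*
 * Example (userspace sketch of the corresponding setsockopt(); assumes fd
 * is a connected TCP socket): request software TX timestamps keyed by
 * byte offset via OPT_ID.
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_SOFTWARE |
 *			 SOF_TIMESTAMPING_OPT_ID,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 */
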
949  void sock_set_keepalive(struct sock *sk)
950  {
951  	lock_sock(sk);
952  	if (sk->sk_prot->keepalive)
953  		sk->sk_prot->keepalive(sk, true);
954  	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
955  	release_sock(sk);
956  }
957  EXPORT_SYMBOL(sock_set_keepalive);
958  
959  static void __sock_set_rcvbuf(struct sock *sk, int val)
960  {
961  	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
962  	 * as a negative value.
963  	 */
964  	val = min_t(int, val, INT_MAX / 2);
965  	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
966  
967  	/* We double it on the way in to account for "struct sk_buff" etc.
968  	 * overhead.   Applications assume that the SO_RCVBUF setting they make
969  	 * will allow that much actual data to be received on that socket.
970  	 *
971  	 * Applications are unaware that "struct sk_buff" and other overheads
972  	 * allocate from the receive buffer during socket buffer allocation.
973  	 *
974  	 * And after considering the possible alternatives, returning the value
975  	 * we actually used in getsockopt is the most desirable behavior.
976  	 */
977  	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
978  }
979  
980  void sock_set_rcvbuf(struct sock *sk, int val)
981  {
982  	lock_sock(sk);
983  	__sock_set_rcvbuf(sk, val);
984  	release_sock(sk);
985  }
986  EXPORT_SYMBOL(sock_set_rcvbuf);
987  
988  static void __sock_set_mark(struct sock *sk, u32 val)
989  {
990  	if (val != sk->sk_mark) {
991  		WRITE_ONCE(sk->sk_mark, val);
992  		sk_dst_reset(sk);
993  	}
994  }
995  
996  void sock_set_mark(struct sock *sk, u32 val)
997  {
998  	lock_sock(sk);
999  	__sock_set_mark(sk, val);
1000  	release_sock(sk);
1001  }
1002  EXPORT_SYMBOL(sock_set_mark);
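
/*
 * Example (illustrative sketch): in-kernel socket users (storage and
 * filesystem transports, for instance) tune their sockets with the
 * lock-taking helpers above instead of going through sock_setsockopt():
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_priority(sock->sk, prio);
 *	sock_set_rcvbuf(sock->sk, rcvbuf_bytes);
 */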
1003  
1004  static void sock_release_reserved_memory(struct sock *sk, int bytes)
1005  {
1006  	/* Round down bytes to multiple of pages */
1007  	bytes = round_down(bytes, PAGE_SIZE);
1008  
1009  	WARN_ON(bytes > sk->sk_reserved_mem);
1010  	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1011  	sk_mem_reclaim(sk);
1012  }
1013  
1014  static int sock_reserve_memory(struct sock *sk, int bytes)
1015  {
1016  	long allocated;
1017  	bool charged;
1018  	int pages;
1019  
1020  	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1021  		return -EOPNOTSUPP;
1022  
1023  	if (!bytes)
1024  		return 0;
1025  
1026  	pages = sk_mem_pages(bytes);
1027  
1028  	/* pre-charge to memcg */
1029  	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1030  					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1031  	if (!charged)
1032  		return -ENOMEM;
1033  
1034  	/* pre-charge to forward_alloc */
1035  	sk_memory_allocated_add(sk, pages);
1036  	allocated = sk_memory_allocated(sk);
1037  	/* If the system goes into memory pressure with this
1038  	 * precharge, give up and return an error.
1039  	 */
1040  	if (allocated > sk_prot_mem_limits(sk, 1)) {
1041  		sk_memory_allocated_sub(sk, pages);
1042  		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1043  		return -ENOMEM;
1044  	}
1045  	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1046  
1047  	WRITE_ONCE(sk->sk_reserved_mem,
1048  		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1049  
1050  	return 0;
1051  }
1052  
1053  #ifdef CONFIG_PAGE_POOL
1054  
1055  /* This is the maximum number of tokens and frags that the user can
1056   * SO_DEVMEM_DONTNEED in one syscall. The limit exists to bound the amount of
1057   * memory the kernel allocates to copy these tokens, and to prevent looping
1058   * over the frags for too long.
1059   */
1060  #define MAX_DONTNEED_TOKENS 128
1061  #define MAX_DONTNEED_FRAGS 1024
1062  
1063  static noinline_for_stack int
1064  sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1065  {
1066  	unsigned int num_tokens, i, j, k, netmem_num = 0;
1067  	struct dmabuf_token *tokens;
1068  	int ret = 0, num_frags = 0;
1069  	netmem_ref netmems[16];
1070  
1071  	if (!sk_is_tcp(sk))
1072  		return -EBADF;
1073  
1074  	if (optlen % sizeof(*tokens) ||
1075  	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1076  		return -EINVAL;
1077  
1078  	num_tokens = optlen / sizeof(*tokens);
1079  	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1080  	if (!tokens)
1081  		return -ENOMEM;
1082  
1083  	if (copy_from_sockptr(tokens, optval, optlen)) {
1084  		kvfree(tokens);
1085  		return -EFAULT;
1086  	}
1087  
1088  	xa_lock_bh(&sk->sk_user_frags);
1089  	for (i = 0; i < num_tokens; i++) {
1090  		for (j = 0; j < tokens[i].token_count; j++) {
1091  			if (++num_frags > MAX_DONTNEED_FRAGS)
1092  				goto frag_limit_reached;
1093  
1094  			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1095  				&sk->sk_user_frags, tokens[i].token_start + j);
1096  
1097  			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1098  				continue;
1099  
1100  			netmems[netmem_num++] = netmem;
1101  			if (netmem_num == ARRAY_SIZE(netmems)) {
1102  				xa_unlock_bh(&sk->sk_user_frags);
1103  				for (k = 0; k < netmem_num; k++)
1104  					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1105  				netmem_num = 0;
1106  				xa_lock_bh(&sk->sk_user_frags);
1107  			}
1108  			ret++;
1109  		}
1110  	}
1111  
1112  frag_limit_reached:
1113  	xa_unlock_bh(&sk->sk_user_frags);
1114  	for (k = 0; k < netmem_num; k++)
1115  		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1116  
1117  	kvfree(tokens);
1118  	return ret;
1119  }
1120  #endif
1121  
1122  void sockopt_lock_sock(struct sock *sk)
1123  {
1124  	/* When current->bpf_ctx is set, the setsockopt is called from
1125  	 * a bpf prog.  bpf has ensured the sk lock has been
1126  	 * acquired before calling setsockopt().
1127  	 */
1128  	if (has_current_bpf_ctx())
1129  		return;
1130  
1131  	lock_sock(sk);
1132  }
1133  EXPORT_SYMBOL(sockopt_lock_sock);
1134  
1135  void sockopt_release_sock(struct sock *sk)
1136  {
1137  	if (has_current_bpf_ctx())
1138  		return;
1139  
1140  	release_sock(sk);
1141  }
1142  EXPORT_SYMBOL(sockopt_release_sock);
1143  
1144  bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1145  {
1146  	return has_current_bpf_ctx() || ns_capable(ns, cap);
1147  }
1148  EXPORT_SYMBOL(sockopt_ns_capable);
1149  
1150  bool sockopt_capable(int cap)
1151  {
1152  	return has_current_bpf_ctx() || capable(cap);
1153  }
1154  EXPORT_SYMBOL(sockopt_capable);
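
/*
 * Example (illustrative sketch): a setsockopt handler reachable from both
 * the syscall path and BPF uses these wrappers so it neither deadlocks on
 * a lock BPF already holds nor rejects a capable BPF caller;
 * example_apply_option() is a hypothetical helper.
 *
 *	sockopt_lock_sock(sk);
 *	if (!sockopt_capable(CAP_NET_ADMIN))
 *		err = -EPERM;
 *	else
 *		err = example_apply_option(sk, val);
 *	sockopt_release_sock(sk);
 */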
1155  
1156  static int sockopt_validate_clockid(__kernel_clockid_t value)
1157  {
1158  	switch (value) {
1159  	case CLOCK_REALTIME:
1160  	case CLOCK_MONOTONIC:
1161  	case CLOCK_TAI:
1162  		return 0;
1163  	}
1164  	return -EINVAL;
1165  }
1166  
1167  /*
1168   *	This is meant for all protocols to use and covers goings on
1169   *	at the socket level. Everything here is generic.
1170   */
1171  
1172  int sk_setsockopt(struct sock *sk, int level, int optname,
1173  		  sockptr_t optval, unsigned int optlen)
1174  {
1175  	struct so_timestamping timestamping;
1176  	struct socket *sock = sk->sk_socket;
1177  	struct sock_txtime sk_txtime;
1178  	int val;
1179  	int valbool;
1180  	struct linger ling;
1181  	int ret = 0;
1182  
1183  	/*
1184  	 *	Options without arguments
1185  	 */
1186  
1187  	if (optname == SO_BINDTODEVICE)
1188  		return sock_setbindtodevice(sk, optval, optlen);
1189  
1190  	if (optlen < sizeof(int))
1191  		return -EINVAL;
1192  
1193  	if (copy_from_sockptr(&val, optval, sizeof(val)))
1194  		return -EFAULT;
1195  
1196  	valbool = val ? 1 : 0;
1197  
1198  	/* handle options which do not require locking the socket. */
1199  	switch (optname) {
1200  	case SO_PRIORITY:
1201  		if ((val >= 0 && val <= 6) ||
1202  		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1203  		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1204  			sock_set_priority(sk, val);
1205  			return 0;
1206  		}
1207  		return -EPERM;
1208  	case SO_PASSSEC:
1209  		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1210  		return 0;
1211  	case SO_PASSCRED:
1212  		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1213  		return 0;
1214  	case SO_PASSPIDFD:
1215  		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1216  		return 0;
1217  	case SO_TYPE:
1218  	case SO_PROTOCOL:
1219  	case SO_DOMAIN:
1220  	case SO_ERROR:
1221  		return -ENOPROTOOPT;
1222  #ifdef CONFIG_NET_RX_BUSY_POLL
1223  	case SO_BUSY_POLL:
1224  		if (val < 0)
1225  			return -EINVAL;
1226  		WRITE_ONCE(sk->sk_ll_usec, val);
1227  		return 0;
1228  	case SO_PREFER_BUSY_POLL:
1229  		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1230  			return -EPERM;
1231  		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1232  		return 0;
1233  	case SO_BUSY_POLL_BUDGET:
1234  		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1235  		    !sockopt_capable(CAP_NET_ADMIN))
1236  			return -EPERM;
1237  		if (val < 0 || val > U16_MAX)
1238  			return -EINVAL;
1239  		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1240  		return 0;
1241  #endif
1242  	case SO_MAX_PACING_RATE:
1243  		{
1244  		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1245  		unsigned long pacing_rate;
1246  
1247  		if (sizeof(ulval) != sizeof(val) &&
1248  		    optlen >= sizeof(ulval) &&
1249  		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1250  			return -EFAULT;
1251  		}
1252  		if (ulval != ~0UL)
1253  			cmpxchg(&sk->sk_pacing_status,
1254  				SK_PACING_NONE,
1255  				SK_PACING_NEEDED);
1256  		/* Pairs with READ_ONCE() from sk_getsockopt() */
1257  		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1258  		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1259  		if (ulval < pacing_rate)
1260  			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1261  		return 0;
1262  		}
1263  	case SO_TXREHASH:
1264  		if (val < -1 || val > 1)
1265  			return -EINVAL;
1266  		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1267  			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1268  		/* Paired with READ_ONCE() in tcp_rtx_synack()
1269  		 * and sk_getsockopt().
1270  		 */
1271  		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1272  		return 0;
1273  	case SO_PEEK_OFF:
1274  		{
1275  		int (*set_peek_off)(struct sock *sk, int val);
1276  
1277  		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1278  		if (set_peek_off)
1279  			ret = set_peek_off(sk, val);
1280  		else
1281  			ret = -EOPNOTSUPP;
1282  		return ret;
1283  		}
1284  #ifdef CONFIG_PAGE_POOL
1285  	case SO_DEVMEM_DONTNEED:
1286  		return sock_devmem_dontneed(sk, optval, optlen);
1287  #endif
1288  	}
1289  
1290  	sockopt_lock_sock(sk);
1291  
1292  	switch (optname) {
1293  	case SO_DEBUG:
1294  		if (val && !sockopt_capable(CAP_NET_ADMIN))
1295  			ret = -EACCES;
1296  		else
1297  			sock_valbool_flag(sk, SOCK_DBG, valbool);
1298  		break;
1299  	case SO_REUSEADDR:
1300  		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1301  		break;
1302  	case SO_REUSEPORT:
1303  		sk->sk_reuseport = valbool;
1304  		break;
1305  	case SO_DONTROUTE:
1306  		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1307  		sk_dst_reset(sk);
1308  		break;
1309  	case SO_BROADCAST:
1310  		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1311  		break;
1312  	case SO_SNDBUF:
1313  		/* Don't return an error here; BSD doesn't, and if you think
1314  		 * about it, this is right. Otherwise apps have to play
1315  		 * 'guess the biggest size' games. RCVBUF/SNDBUF are treated
1316  		 * in BSD as hints.
1317  		 */
1318  		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1319  set_sndbuf:
1320  		/* Ensure val * 2 fits into an int, to prevent max_t()
1321  		 * from treating it as a negative value.
1322  		 */
1323  		val = min_t(int, val, INT_MAX / 2);
1324  		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1325  		WRITE_ONCE(sk->sk_sndbuf,
1326  			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1327  		/* Wake up sending tasks if we upped the value. */
1328  		sk->sk_write_space(sk);
1329  		break;
1330  
1331  	case SO_SNDBUFFORCE:
1332  		if (!sockopt_capable(CAP_NET_ADMIN)) {
1333  			ret = -EPERM;
1334  			break;
1335  		}
1336  
1337  		/* No negative values (to prevent underflow, as val will be
1338  		 * multiplied by 2).
1339  		 */
1340  		if (val < 0)
1341  			val = 0;
1342  		goto set_sndbuf;
1343  
1344  	case SO_RCVBUF:
1345  		/* Don't return an error here; BSD doesn't, and if you think
1346  		 * about it, this is right. Otherwise apps have to play
1347  		 * 'guess the biggest size' games. RCVBUF/SNDBUF are treated
1348  		 * in BSD as hints.
1349  		 */
1350  		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1351  		break;
1352  
1353  	case SO_RCVBUFFORCE:
1354  		if (!sockopt_capable(CAP_NET_ADMIN)) {
1355  			ret = -EPERM;
1356  			break;
1357  		}
1358  
1359  		/* No negative values (to prevent underflow, as val will be
1360  		 * multiplied by 2).
1361  		 */
1362  		__sock_set_rcvbuf(sk, max(val, 0));
1363  		break;
1364  
1365  	case SO_KEEPALIVE:
1366  		if (sk->sk_prot->keepalive)
1367  			sk->sk_prot->keepalive(sk, valbool);
1368  		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1369  		break;
1370  
1371  	case SO_OOBINLINE:
1372  		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1373  		break;
1374  
1375  	case SO_NO_CHECK:
1376  		sk->sk_no_check_tx = valbool;
1377  		break;
1378  
1379  	case SO_LINGER:
1380  		if (optlen < sizeof(ling)) {
1381  			ret = -EINVAL;	/* 1003.1g */
1382  			break;
1383  		}
1384  		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1385  			ret = -EFAULT;
1386  			break;
1387  		}
1388  		if (!ling.l_onoff) {
1389  			sock_reset_flag(sk, SOCK_LINGER);
1390  		} else {
1391  			unsigned long t_sec = ling.l_linger;
1392  
1393  			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1394  				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1395  			else
1396  				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1397  			sock_set_flag(sk, SOCK_LINGER);
1398  		}
1399  		break;
1400  
1401  	case SO_BSDCOMPAT:
1402  		break;
1403  
1404  	case SO_TIMESTAMP_OLD:
1405  	case SO_TIMESTAMP_NEW:
1406  	case SO_TIMESTAMPNS_OLD:
1407  	case SO_TIMESTAMPNS_NEW:
1408  		sock_set_timestamp(sk, optname, valbool);
1409  		break;
1410  
1411  	case SO_TIMESTAMPING_NEW:
1412  	case SO_TIMESTAMPING_OLD:
1413  		if (optlen == sizeof(timestamping)) {
1414  			if (copy_from_sockptr(&timestamping, optval,
1415  					      sizeof(timestamping))) {
1416  				ret = -EFAULT;
1417  				break;
1418  			}
1419  		} else {
1420  			memset(&timestamping, 0, sizeof(timestamping));
1421  			timestamping.flags = val;
1422  		}
1423  		ret = sock_set_timestamping(sk, optname, timestamping);
1424  		break;
1425  
1426  	case SO_RCVLOWAT:
1427  		{
1428  		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1429  
1430  		if (val < 0)
1431  			val = INT_MAX;
1432  		if (sock)
1433  			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1434  		if (set_rcvlowat)
1435  			ret = set_rcvlowat(sk, val);
1436  		else
1437  			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1438  		break;
1439  		}
1440  	case SO_RCVTIMEO_OLD:
1441  	case SO_RCVTIMEO_NEW:
1442  		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1443  				       optlen, optname == SO_RCVTIMEO_OLD);
1444  		break;
1445  
1446  	case SO_SNDTIMEO_OLD:
1447  	case SO_SNDTIMEO_NEW:
1448  		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1449  				       optlen, optname == SO_SNDTIMEO_OLD);
1450  		break;
1451  
1452  	case SO_ATTACH_FILTER: {
1453  		struct sock_fprog fprog;
1454  
1455  		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1456  		if (!ret)
1457  			ret = sk_attach_filter(&fprog, sk);
1458  		break;
1459  	}
1460  	case SO_ATTACH_BPF:
1461  		ret = -EINVAL;
1462  		if (optlen == sizeof(u32)) {
1463  			u32 ufd;
1464  
1465  			ret = -EFAULT;
1466  			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1467  				break;
1468  
1469  			ret = sk_attach_bpf(ufd, sk);
1470  		}
1471  		break;
1472  
1473  	case SO_ATTACH_REUSEPORT_CBPF: {
1474  		struct sock_fprog fprog;
1475  
1476  		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1477  		if (!ret)
1478  			ret = sk_reuseport_attach_filter(&fprog, sk);
1479  		break;
1480  	}
1481  	case SO_ATTACH_REUSEPORT_EBPF:
1482  		ret = -EINVAL;
1483  		if (optlen == sizeof(u32)) {
1484  			u32 ufd;
1485  
1486  			ret = -EFAULT;
1487  			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1488  				break;
1489  
1490  			ret = sk_reuseport_attach_bpf(ufd, sk);
1491  		}
1492  		break;
1493  
1494  	case SO_DETACH_REUSEPORT_BPF:
1495  		ret = reuseport_detach_prog(sk);
1496  		break;
1497  
1498  	case SO_DETACH_FILTER:
1499  		ret = sk_detach_filter(sk);
1500  		break;
1501  
1502  	case SO_LOCK_FILTER:
1503  		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1504  			ret = -EPERM;
1505  		else
1506  			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1507  		break;
1508  
1509  	case SO_MARK:
1510  		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1511  		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1512  			ret = -EPERM;
1513  			break;
1514  		}
1515  
1516  		__sock_set_mark(sk, val);
1517  		break;
1518  	case SO_RCVMARK:
1519  		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1520  		break;
1521  
1522  	case SO_RXQ_OVFL:
1523  		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1524  		break;
1525  
1526  	case SO_WIFI_STATUS:
1527  		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1528  		break;
1529  
1530  	case SO_NOFCS:
1531  		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1532  		break;
1533  
1534  	case SO_SELECT_ERR_QUEUE:
1535  		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1536  		break;
1537  
1538  
1539  	case SO_INCOMING_CPU:
1540  		reuseport_update_incoming_cpu(sk, val);
1541  		break;
1542  
1543  	case SO_CNX_ADVICE:
1544  		if (val == 1)
1545  			dst_negative_advice(sk);
1546  		break;
1547  
1548  	case SO_ZEROCOPY:
1549  		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1550  			if (!(sk_is_tcp(sk) ||
1551  			      (sk->sk_type == SOCK_DGRAM &&
1552  			       sk->sk_protocol == IPPROTO_UDP)))
1553  				ret = -EOPNOTSUPP;
1554  		} else if (sk->sk_family != PF_RDS) {
1555  			ret = -EOPNOTSUPP;
1556  		}
1557  		if (!ret) {
1558  			if (val < 0 || val > 1)
1559  				ret = -EINVAL;
1560  			else
1561  				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1562  		}
1563  		break;
1564  
1565  	case SO_TXTIME:
1566  		if (optlen != sizeof(struct sock_txtime)) {
1567  			ret = -EINVAL;
1568  			break;
1569  		} else if (copy_from_sockptr(&sk_txtime, optval,
1570  			   sizeof(struct sock_txtime))) {
1571  			ret = -EFAULT;
1572  			break;
1573  		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1574  			ret = -EINVAL;
1575  			break;
1576  		}
1577  		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1578  		 * scheduler has enough safeguards.
1579  		 */
1580  		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1581  		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1582  			ret = -EPERM;
1583  			break;
1584  		}
1585  
1586  		ret = sockopt_validate_clockid(sk_txtime.clockid);
1587  		if (ret)
1588  			break;
1589  
1590  		sock_valbool_flag(sk, SOCK_TXTIME, true);
1591  		sk->sk_clockid = sk_txtime.clockid;
1592  		sk->sk_txtime_deadline_mode =
1593  			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1594  		sk->sk_txtime_report_errors =
1595  			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1596  		break;
1597  
1598  	case SO_BINDTOIFINDEX:
1599  		ret = sock_bindtoindex_locked(sk, val);
1600  		break;
1601  
1602  	case SO_BUF_LOCK:
1603  		if (val & ~SOCK_BUF_LOCK_MASK) {
1604  			ret = -EINVAL;
1605  			break;
1606  		}
1607  		sk->sk_userlocks = val | (sk->sk_userlocks &
1608  					  ~SOCK_BUF_LOCK_MASK);
1609  		break;
1610  
1611  	case SO_RESERVE_MEM:
1612  	{
1613  		int delta;
1614  
1615  		if (val < 0) {
1616  			ret = -EINVAL;
1617  			break;
1618  		}
1619  
1620  		delta = val - sk->sk_reserved_mem;
1621  		if (delta < 0)
1622  			sock_release_reserved_memory(sk, -delta);
1623  		else
1624  			ret = sock_reserve_memory(sk, delta);
1625  		break;
1626  	}
1627  
1628  	default:
1629  		ret = -ENOPROTOOPT;
1630  		break;
1631  	}
1632  	sockopt_release_sock(sk);
1633  	return ret;
1634  }
1635  
1636  int sock_setsockopt(struct socket *sock, int level, int optname,
1637  		    sockptr_t optval, unsigned int optlen)
1638  {
1639  	return sk_setsockopt(sock->sk, level, optname,
1640  			     optval, optlen);
1641  }
1642  EXPORT_SYMBOL(sock_setsockopt);
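
/*
 * Example (illustrative sketch): a kernel caller that still wants the
 * generic option parser wraps its kernel buffer in a sockptr_t:
 *
 *	int one = 1;
 *	int err;
 *
 *	err = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
 *			      KERNEL_SOCKPTR(&one), sizeof(one));
 */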
1643  
1644  static const struct cred *sk_get_peer_cred(struct sock *sk)
1645  {
1646  	const struct cred *cred;
1647  
1648  	spin_lock(&sk->sk_peer_lock);
1649  	cred = get_cred(sk->sk_peer_cred);
1650  	spin_unlock(&sk->sk_peer_lock);
1651  
1652  	return cred;
1653  }
1654  
1655  static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1656  			  struct ucred *ucred)
1657  {
1658  	ucred->pid = pid_vnr(pid);
1659  	ucred->uid = ucred->gid = -1;
1660  	if (cred) {
1661  		struct user_namespace *current_ns = current_user_ns();
1662  
1663  		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1664  		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1665  	}
1666  }
1667  
1668  static int groups_to_user(sockptr_t dst, const struct group_info *src)
1669  {
1670  	struct user_namespace *user_ns = current_user_ns();
1671  	int i;
1672  
1673  	for (i = 0; i < src->ngroups; i++) {
1674  		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1675  
1676  		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1677  			return -EFAULT;
1678  	}
1679  
1680  	return 0;
1681  }
1682  
1683  int sk_getsockopt(struct sock *sk, int level, int optname,
1684  		  sockptr_t optval, sockptr_t optlen)
1685  {
1686  	struct socket *sock = sk->sk_socket;
1687  
1688  	union {
1689  		int val;
1690  		u64 val64;
1691  		unsigned long ulval;
1692  		struct linger ling;
1693  		struct old_timeval32 tm32;
1694  		struct __kernel_old_timeval tm;
1695  		struct  __kernel_sock_timeval stm;
1696  		struct sock_txtime txtime;
1697  		struct so_timestamping timestamping;
1698  	} v;
1699  
1700  	int lv = sizeof(int);
1701  	int len;
1702  
1703  	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1704  		return -EFAULT;
1705  	if (len < 0)
1706  		return -EINVAL;
1707  
1708  	memset(&v, 0, sizeof(v));
1709  
1710  	switch (optname) {
1711  	case SO_DEBUG:
1712  		v.val = sock_flag(sk, SOCK_DBG);
1713  		break;
1714  
1715  	case SO_DONTROUTE:
1716  		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1717  		break;
1718  
1719  	case SO_BROADCAST:
1720  		v.val = sock_flag(sk, SOCK_BROADCAST);
1721  		break;
1722  
1723  	case SO_SNDBUF:
1724  		v.val = READ_ONCE(sk->sk_sndbuf);
1725  		break;
1726  
1727  	case SO_RCVBUF:
1728  		v.val = READ_ONCE(sk->sk_rcvbuf);
1729  		break;
1730  
1731  	case SO_REUSEADDR:
1732  		v.val = sk->sk_reuse;
1733  		break;
1734  
1735  	case SO_REUSEPORT:
1736  		v.val = sk->sk_reuseport;
1737  		break;
1738  
1739  	case SO_KEEPALIVE:
1740  		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1741  		break;
1742  
1743  	case SO_TYPE:
1744  		v.val = sk->sk_type;
1745  		break;
1746  
1747  	case SO_PROTOCOL:
1748  		v.val = sk->sk_protocol;
1749  		break;
1750  
1751  	case SO_DOMAIN:
1752  		v.val = sk->sk_family;
1753  		break;
1754  
1755  	case SO_ERROR:
1756  		v.val = -sock_error(sk);
1757  		if (v.val == 0)
1758  			v.val = xchg(&sk->sk_err_soft, 0);
1759  		break;
1760  
1761  	case SO_OOBINLINE:
1762  		v.val = sock_flag(sk, SOCK_URGINLINE);
1763  		break;
1764  
1765  	case SO_NO_CHECK:
1766  		v.val = sk->sk_no_check_tx;
1767  		break;
1768  
1769  	case SO_PRIORITY:
1770  		v.val = READ_ONCE(sk->sk_priority);
1771  		break;
1772  
1773  	case SO_LINGER:
1774  		lv		= sizeof(v.ling);
1775  		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1776  		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1777  		break;
1778  
1779  	case SO_BSDCOMPAT:
1780  		break;
1781  
1782  	case SO_TIMESTAMP_OLD:
1783  		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1784  				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1785  				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1786  		break;
1787  
1788  	case SO_TIMESTAMPNS_OLD:
1789  		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1790  		break;
1791  
1792  	case SO_TIMESTAMP_NEW:
1793  		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1794  		break;
1795  
1796  	case SO_TIMESTAMPNS_NEW:
1797  		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1798  		break;
1799  
1800  	case SO_TIMESTAMPING_OLD:
1801  	case SO_TIMESTAMPING_NEW:
1802  		lv = sizeof(v.timestamping);
1803  		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1804  		 * returning the flags when they were set through the same option.
1805  	 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1806  		 */
1807  		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1808  			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1809  			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1810  		}
1811  		break;
1812  
1813  	case SO_RCVTIMEO_OLD:
1814  	case SO_RCVTIMEO_NEW:
1815  		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1816  				      SO_RCVTIMEO_OLD == optname);
1817  		break;
1818  
1819  	case SO_SNDTIMEO_OLD:
1820  	case SO_SNDTIMEO_NEW:
1821  		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1822  				      SO_SNDTIMEO_OLD == optname);
1823  		break;
1824  
1825  	case SO_RCVLOWAT:
1826  		v.val = READ_ONCE(sk->sk_rcvlowat);
1827  		break;
1828  
1829  	case SO_SNDLOWAT:
1830  		v.val = 1;
1831  		break;
1832  
1833  	case SO_PASSCRED:
1834  		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1835  		break;
1836  
1837  	case SO_PASSPIDFD:
1838  		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1839  		break;
1840  
1841  	case SO_PEERCRED:
1842  	{
1843  		struct ucred peercred;
1844  		if (len > sizeof(peercred))
1845  			len = sizeof(peercred);
1846  
1847  		spin_lock(&sk->sk_peer_lock);
1848  		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1849  		spin_unlock(&sk->sk_peer_lock);
1850  
1851  		if (copy_to_sockptr(optval, &peercred, len))
1852  			return -EFAULT;
1853  		goto lenout;
1854  	}
1855  
1856  	case SO_PEERPIDFD:
1857  	{
1858  		struct pid *peer_pid;
1859  		struct file *pidfd_file = NULL;
1860  		int pidfd;
1861  
1862  		if (len > sizeof(pidfd))
1863  			len = sizeof(pidfd);
1864  
1865  		spin_lock(&sk->sk_peer_lock);
1866  		peer_pid = get_pid(sk->sk_peer_pid);
1867  		spin_unlock(&sk->sk_peer_lock);
1868  
1869  		if (!peer_pid)
1870  			return -ENODATA;
1871  
1872  		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1873  		put_pid(peer_pid);
1874  		if (pidfd < 0)
1875  			return pidfd;
1876  
1877  		if (copy_to_sockptr(optval, &pidfd, len) ||
1878  		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1879  			put_unused_fd(pidfd);
1880  			fput(pidfd_file);
1881  
1882  			return -EFAULT;
1883  		}
1884  
1885  		fd_install(pidfd, pidfd_file);
1886  		return 0;
1887  	}
1888  
1889  	case SO_PEERGROUPS:
1890  	{
1891  		const struct cred *cred;
1892  		int ret, n;
1893  
1894  		cred = sk_get_peer_cred(sk);
1895  		if (!cred)
1896  			return -ENODATA;
1897  
1898  		n = cred->group_info->ngroups;
1899  		if (len < n * sizeof(gid_t)) {
1900  			len = n * sizeof(gid_t);
1901  			put_cred(cred);
1902  			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1903  		}
1904  		len = n * sizeof(gid_t);
1905  
1906  		ret = groups_to_user(optval, cred->group_info);
1907  		put_cred(cred);
1908  		if (ret)
1909  			return ret;
1910  		goto lenout;
1911  	}
1912  
1913  	case SO_PEERNAME:
1914  	{
1915  		struct sockaddr_storage address;
1916  
1917  		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1918  		if (lv < 0)
1919  			return -ENOTCONN;
1920  		if (lv < len)
1921  			return -EINVAL;
1922  		if (copy_to_sockptr(optval, &address, len))
1923  			return -EFAULT;
1924  		goto lenout;
1925  	}
1926  
1927  	/* Dubious BSD thing... Probably nobody even uses it, but
1928  	 * the UNIX standard wants it for whatever reason... -DaveM
1929  	 */
1930  	case SO_ACCEPTCONN:
1931  		v.val = sk->sk_state == TCP_LISTEN;
1932  		break;
1933  
1934  	case SO_PASSSEC:
1935  		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1936  		break;
1937  
1938  	case SO_PEERSEC:
1939  		return security_socket_getpeersec_stream(sock,
1940  							 optval, optlen, len);
1941  
1942  	case SO_MARK:
1943  		v.val = READ_ONCE(sk->sk_mark);
1944  		break;
1945  
1946  	case SO_RCVMARK:
1947  		v.val = sock_flag(sk, SOCK_RCVMARK);
1948  		break;
1949  
1950  	case SO_RXQ_OVFL:
1951  		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1952  		break;
1953  
1954  	case SO_WIFI_STATUS:
1955  		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1956  		break;
1957  
1958  	case SO_PEEK_OFF:
1959  		if (!READ_ONCE(sock->ops)->set_peek_off)
1960  			return -EOPNOTSUPP;
1961  
1962  		v.val = READ_ONCE(sk->sk_peek_off);
1963  		break;
1964  	case SO_NOFCS:
1965  		v.val = sock_flag(sk, SOCK_NOFCS);
1966  		break;
1967  
1968  	case SO_BINDTODEVICE:
1969  		return sock_getbindtodevice(sk, optval, optlen, len);
1970  
1971  	case SO_GET_FILTER:
1972  		len = sk_get_filter(sk, optval, len);
1973  		if (len < 0)
1974  			return len;
1975  
1976  		goto lenout;
1977  
1978  	case SO_LOCK_FILTER:
1979  		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1980  		break;
1981  
1982  	case SO_BPF_EXTENSIONS:
1983  		v.val = bpf_tell_extensions();
1984  		break;
1985  
1986  	case SO_SELECT_ERR_QUEUE:
1987  		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1988  		break;
1989  
1990  #ifdef CONFIG_NET_RX_BUSY_POLL
1991  	case SO_BUSY_POLL:
1992  		v.val = READ_ONCE(sk->sk_ll_usec);
1993  		break;
1994  	case SO_PREFER_BUSY_POLL:
1995  		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1996  		break;
1997  #endif
1998  
1999  	case SO_MAX_PACING_RATE:
2000  		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
2001  		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2002  			lv = sizeof(v.ulval);
2003  			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2004  		} else {
2005  			/* 32bit version */
2006  			v.val = min_t(unsigned long, ~0U,
2007  				      READ_ONCE(sk->sk_max_pacing_rate));
2008  		}
2009  		break;
2010  
2011  	case SO_INCOMING_CPU:
2012  		v.val = READ_ONCE(sk->sk_incoming_cpu);
2013  		break;
2014  
2015  	case SO_MEMINFO:
2016  	{
2017  		u32 meminfo[SK_MEMINFO_VARS];
2018  
2019  		sk_get_meminfo(sk, meminfo);
2020  
2021  		len = min_t(unsigned int, len, sizeof(meminfo));
2022  		if (copy_to_sockptr(optval, &meminfo, len))
2023  			return -EFAULT;
2024  
2025  		goto lenout;
2026  	}
2027  
2028  #ifdef CONFIG_NET_RX_BUSY_POLL
2029  	case SO_INCOMING_NAPI_ID:
2030  		v.val = READ_ONCE(sk->sk_napi_id);
2031  
2032  		/* aggregate non-NAPI IDs down to 0 */
2033  		if (v.val < MIN_NAPI_ID)
2034  			v.val = 0;
2035  
2036  		break;
2037  #endif
2038  
2039  	case SO_COOKIE:
2040  		lv = sizeof(u64);
2041  		if (len < lv)
2042  			return -EINVAL;
2043  		v.val64 = sock_gen_cookie(sk);
2044  		break;
2045  
2046  	case SO_ZEROCOPY:
2047  		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2048  		break;
2049  
2050  	case SO_TXTIME:
2051  		lv = sizeof(v.txtime);
2052  		v.txtime.clockid = sk->sk_clockid;
2053  		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2054  				  SOF_TXTIME_DEADLINE_MODE : 0;
2055  		v.txtime.flags |= sk->sk_txtime_report_errors ?
2056  				  SOF_TXTIME_REPORT_ERRORS : 0;
2057  		break;
2058  
2059  	case SO_BINDTOIFINDEX:
2060  		v.val = READ_ONCE(sk->sk_bound_dev_if);
2061  		break;
2062  
2063  	case SO_NETNS_COOKIE:
2064  		lv = sizeof(u64);
2065  		if (len != lv)
2066  			return -EINVAL;
2067  		v.val64 = sock_net(sk)->net_cookie;
2068  		break;
2069  
2070  	case SO_BUF_LOCK:
2071  		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2072  		break;
2073  
2074  	case SO_RESERVE_MEM:
2075  		v.val = READ_ONCE(sk->sk_reserved_mem);
2076  		break;
2077  
2078  	case SO_TXREHASH:
2079  		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2080  		v.val = READ_ONCE(sk->sk_txrehash);
2081  		break;
2082  
2083  	default:
2084  		/* We implement the SO_SNDLOWAT etc to not be settable
2085  		 * (1003.1g 7).
2086  		 */
2087  		return -ENOPROTOOPT;
2088  	}
2089  
2090  	if (len > lv)
2091  		len = lv;
2092  	if (copy_to_sockptr(optval, &v, len))
2093  		return -EFAULT;
2094  lenout:
2095  	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2096  		return -EFAULT;
2097  	return 0;
2098  }
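
/* A minimal userspace sketch of how the options handled above are queried
 * through getsockopt(SOL_SOCKET, ...); error handling is omitted, and the
 * SK_MEMINFO_VARS-sized array mirrors the SO_MEMINFO case:
 *
 *	uint64_t cookie;
 *	uint32_t meminfo[SK_MEMINFO_VARS];
 *	socklen_t len = sizeof(cookie);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &len);
 *	len = sizeof(meminfo);
 *	getsockopt(fd, SOL_SOCKET, SO_MEMINFO, meminfo, &len);
 */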
2099  
2100  /*
2101   * Initialize an sk_lock.
2102   *
2103   * (We also register the sk_lock with the lock validator.)
2104   */
2105  static inline void sock_lock_init(struct sock *sk)
2106  {
2107  	if (sk->sk_kern_sock)
2108  		sock_lock_init_class_and_name(
2109  			sk,
2110  			af_family_kern_slock_key_strings[sk->sk_family],
2111  			af_family_kern_slock_keys + sk->sk_family,
2112  			af_family_kern_key_strings[sk->sk_family],
2113  			af_family_kern_keys + sk->sk_family);
2114  	else
2115  		sock_lock_init_class_and_name(
2116  			sk,
2117  			af_family_slock_key_strings[sk->sk_family],
2118  			af_family_slock_keys + sk->sk_family,
2119  			af_family_key_strings[sk->sk_family],
2120  			af_family_keys + sk->sk_family);
2121  }
2122  
2123  /*
2124   * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2125   * even temporarily, because of RCU lookups. sk_node should also be left as is.
2126   * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2127   */
2128  static void sock_copy(struct sock *nsk, const struct sock *osk)
2129  {
2130  	const struct proto *prot = READ_ONCE(osk->sk_prot);
2131  #ifdef CONFIG_SECURITY_NETWORK
2132  	void *sptr = nsk->sk_security;
2133  #endif
2134  
2135  	/* If we move sk_tx_queue_mapping out of the private section,
2136  	 * we must check if sk_tx_queue_clear() is called after
2137  	 * sock_copy() in sk_clone_lock().
2138  	 */
2139  	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2140  		     offsetof(struct sock, sk_dontcopy_begin) ||
2141  		     offsetof(struct sock, sk_tx_queue_mapping) >=
2142  		     offsetof(struct sock, sk_dontcopy_end));
2143  
2144  	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2145  
2146  	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2147  		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2148  		      /* alloc is larger than struct, see sk_prot_alloc() */);
2149  
2150  #ifdef CONFIG_SECURITY_NETWORK
2151  	nsk->sk_security = sptr;
2152  	security_sk_clone(osk, nsk);
2153  #endif
2154  }
2155  
2156  static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2157  		int family)
2158  {
2159  	struct sock *sk;
2160  	struct kmem_cache *slab;
2161  
2162  	slab = prot->slab;
2163  	if (slab != NULL) {
2164  		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2165  		if (!sk)
2166  			return sk;
2167  		if (want_init_on_alloc(priority))
2168  			sk_prot_clear_nulls(sk, prot->obj_size);
2169  	} else
2170  		sk = kmalloc(prot->obj_size, priority);
2171  
2172  	if (sk != NULL) {
2173  		if (security_sk_alloc(sk, family, priority))
2174  			goto out_free;
2175  
2176  		if (!try_module_get(prot->owner))
2177  			goto out_free_sec;
2178  	}
2179  
2180  	return sk;
2181  
2182  out_free_sec:
2183  	security_sk_free(sk);
2184  out_free:
2185  	if (slab != NULL)
2186  		kmem_cache_free(slab, sk);
2187  	else
2188  		kfree(sk);
2189  	return NULL;
2190  }
2191  
2192  static void sk_prot_free(struct proto *prot, struct sock *sk)
2193  {
2194  	struct kmem_cache *slab;
2195  	struct module *owner;
2196  
2197  	owner = prot->owner;
2198  	slab = prot->slab;
2199  
2200  	cgroup_sk_free(&sk->sk_cgrp_data);
2201  	mem_cgroup_sk_free(sk);
2202  	security_sk_free(sk);
2203  	if (slab != NULL)
2204  		kmem_cache_free(slab, sk);
2205  	else
2206  		kfree(sk);
2207  	module_put(owner);
2208  }
2209  
2210  /**
2211   *	sk_alloc - All socket objects are allocated here
2212   *	@net: the applicable net namespace
2213   *	@family: protocol family
2214   *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2215   *	@prot: struct proto associated with this new sock instance
2216   *	@kern: is this to be a kernel socket?
2217   */
2218  struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2219  		      struct proto *prot, int kern)
2220  {
2221  	struct sock *sk;
2222  
2223  	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2224  	if (sk) {
2225  		sk->sk_family = family;
2226  		/*
2227  		 * See comment in struct sock definition to understand
2228  		 * why we need sk_prot_creator -acme
2229  		 */
2230  		sk->sk_prot = sk->sk_prot_creator = prot;
2231  		sk->sk_kern_sock = kern;
2232  		sock_lock_init(sk);
2233  		sk->sk_net_refcnt = kern ? 0 : 1;
2234  		if (likely(sk->sk_net_refcnt)) {
2235  			get_net_track(net, &sk->ns_tracker, priority);
2236  			sock_inuse_add(net, 1);
2237  		} else {
2238  			__netns_tracker_alloc(net, &sk->ns_tracker,
2239  					      false, priority);
2240  		}
2241  
2242  		sock_net_set(sk, net);
2243  		refcount_set(&sk->sk_wmem_alloc, 1);
2244  
2245  		mem_cgroup_sk_alloc(sk);
2246  		cgroup_sk_alloc(&sk->sk_cgrp_data);
2247  		sock_update_classid(&sk->sk_cgrp_data);
2248  		sock_update_netprioidx(&sk->sk_cgrp_data);
2249  		sk_tx_queue_clear(sk);
2250  	}
2251  
2252  	return sk;
2253  }
2254  EXPORT_SYMBOL(sk_alloc);
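
/* A minimal sketch of the expected sk_alloc()/sk_free() pairing from a
 * protocol's create() handler; "my_proto" is a hypothetical struct proto:
 *
 *	struct sock *sk;
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	...
 *	sk_free(sk);	(drops the initial sk_wmem_alloc reference)
 */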
2255  
2256  /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2257   * grace period. This is the case for UDP sockets and TCP listeners.
2258   */
2259  static void __sk_destruct(struct rcu_head *head)
2260  {
2261  	struct sock *sk = container_of(head, struct sock, sk_rcu);
2262  	struct sk_filter *filter;
2263  
2264  	if (sk->sk_destruct)
2265  		sk->sk_destruct(sk);
2266  
2267  	filter = rcu_dereference_check(sk->sk_filter,
2268  				       refcount_read(&sk->sk_wmem_alloc) == 0);
2269  	if (filter) {
2270  		sk_filter_uncharge(sk, filter);
2271  		RCU_INIT_POINTER(sk->sk_filter, NULL);
2272  	}
2273  
2274  	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2275  
2276  #ifdef CONFIG_BPF_SYSCALL
2277  	bpf_sk_storage_free(sk);
2278  #endif
2279  
2280  	if (atomic_read(&sk->sk_omem_alloc))
2281  		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2282  			 __func__, atomic_read(&sk->sk_omem_alloc));
2283  
2284  	if (sk->sk_frag.page) {
2285  		put_page(sk->sk_frag.page);
2286  		sk->sk_frag.page = NULL;
2287  	}
2288  
2289  	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2290  	put_cred(sk->sk_peer_cred);
2291  	put_pid(sk->sk_peer_pid);
2292  
2293  	if (likely(sk->sk_net_refcnt))
2294  		put_net_track(sock_net(sk), &sk->ns_tracker);
2295  	else
2296  		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2297  
2298  	sk_prot_free(sk->sk_prot_creator, sk);
2299  }
2300  
2301  void sk_destruct(struct sock *sk)
2302  {
2303  	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2304  
2305  	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2306  		reuseport_detach_sock(sk);
2307  		use_call_rcu = true;
2308  	}
2309  
2310  	if (use_call_rcu)
2311  		call_rcu(&sk->sk_rcu, __sk_destruct);
2312  	else
2313  		__sk_destruct(&sk->sk_rcu);
2314  }
2315  
2316  static void __sk_free(struct sock *sk)
2317  {
2318  	if (likely(sk->sk_net_refcnt))
2319  		sock_inuse_add(sock_net(sk), -1);
2320  
2321  	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2322  		sock_diag_broadcast_destroy(sk);
2323  	else
2324  		sk_destruct(sk);
2325  }
2326  
2327  void sk_free(struct sock *sk)
2328  {
2329  	/*
2330  	 * We subtract one from sk_wmem_alloc and can know if
2331  	 * some packets are still in some tx queue.
2332  	 * If not null, sock_wfree() will call __sk_free(sk) later
2333  	 */
2334  	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2335  		__sk_free(sk);
2336  }
2337  EXPORT_SYMBOL(sk_free);
2338  
2339  static void sk_init_common(struct sock *sk)
2340  {
2341  	skb_queue_head_init(&sk->sk_receive_queue);
2342  	skb_queue_head_init(&sk->sk_write_queue);
2343  	skb_queue_head_init(&sk->sk_error_queue);
2344  
2345  	rwlock_init(&sk->sk_callback_lock);
2346  	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2347  			af_rlock_keys + sk->sk_family,
2348  			af_family_rlock_key_strings[sk->sk_family]);
2349  	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2350  			af_wlock_keys + sk->sk_family,
2351  			af_family_wlock_key_strings[sk->sk_family]);
2352  	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2353  			af_elock_keys + sk->sk_family,
2354  			af_family_elock_key_strings[sk->sk_family]);
2355  	if (sk->sk_kern_sock)
2356  		lockdep_set_class_and_name(&sk->sk_callback_lock,
2357  			af_kern_callback_keys + sk->sk_family,
2358  			af_family_kern_clock_key_strings[sk->sk_family]);
2359  	else
2360  		lockdep_set_class_and_name(&sk->sk_callback_lock,
2361  			af_callback_keys + sk->sk_family,
2362  			af_family_clock_key_strings[sk->sk_family]);
2363  }
2364  
2365  /**
2366   *	sk_clone_lock - clone a socket, and lock its clone
2367   *	@sk: the socket to clone
2368   *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2369   *
2370   *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2371   */
2372  struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2373  {
2374  	struct proto *prot = READ_ONCE(sk->sk_prot);
2375  	struct sk_filter *filter;
2376  	bool is_charged = true;
2377  	struct sock *newsk;
2378  
2379  	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2380  	if (!newsk)
2381  		goto out;
2382  
2383  	sock_copy(newsk, sk);
2384  
2385  	newsk->sk_prot_creator = prot;
2386  
2387  	/* SANITY */
2388  	if (likely(newsk->sk_net_refcnt)) {
2389  		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2390  		sock_inuse_add(sock_net(newsk), 1);
2391  	} else {
2392  		/* Kernel sockets are not elevating the struct net refcount.
2393  		 * Instead, use a tracker to more easily detect if a layer
2394  		 * is not properly dismantling its kernel sockets at netns
2395  		 * destroy time.
2396  		 */
2397  		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2398  				      false, priority);
2399  	}
2400  	sk_node_init(&newsk->sk_node);
2401  	sock_lock_init(newsk);
2402  	bh_lock_sock(newsk);
2403  	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2404  	newsk->sk_backlog.len = 0;
2405  
2406  	atomic_set(&newsk->sk_rmem_alloc, 0);
2407  
2408  	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2409  	refcount_set(&newsk->sk_wmem_alloc, 1);
2410  
2411  	atomic_set(&newsk->sk_omem_alloc, 0);
2412  	sk_init_common(newsk);
2413  
2414  	newsk->sk_dst_cache	= NULL;
2415  	newsk->sk_dst_pending_confirm = 0;
2416  	newsk->sk_wmem_queued	= 0;
2417  	newsk->sk_forward_alloc = 0;
2418  	newsk->sk_reserved_mem  = 0;
2419  	atomic_set(&newsk->sk_drops, 0);
2420  	newsk->sk_send_head	= NULL;
2421  	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2422  	atomic_set(&newsk->sk_zckey, 0);
2423  
2424  	sock_reset_flag(newsk, SOCK_DONE);
2425  
2426  	/* sk->sk_memcg will be populated at accept() time */
2427  	newsk->sk_memcg = NULL;
2428  
2429  	cgroup_sk_clone(&newsk->sk_cgrp_data);
2430  
2431  	rcu_read_lock();
2432  	filter = rcu_dereference(sk->sk_filter);
2433  	if (filter != NULL)
2434  		/* though it's an empty new sock, the charging may fail
2435  		 * if sysctl_optmem_max was changed between creation of
2436  		 * original socket and cloning
2437  		 */
2438  		is_charged = sk_filter_charge(newsk, filter);
2439  	RCU_INIT_POINTER(newsk->sk_filter, filter);
2440  	rcu_read_unlock();
2441  
2442  	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2443  		/* We need to make sure that we don't uncharge the new
2444  		 * socket if we couldn't charge it in the first place
2445  		 * as otherwise we uncharge the parent's filter.
2446  		 */
2447  		if (!is_charged)
2448  			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2449  		sk_free_unlock_clone(newsk);
2450  		newsk = NULL;
2451  		goto out;
2452  	}
2453  	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2454  
2455  	if (bpf_sk_storage_clone(sk, newsk)) {
2456  		sk_free_unlock_clone(newsk);
2457  		newsk = NULL;
2458  		goto out;
2459  	}
2460  
2461  	/* Clear sk_user_data if parent had the pointer tagged
2462  	 * as not suitable for copying when cloning.
2463  	 */
2464  	if (sk_user_data_is_nocopy(newsk))
2465  		newsk->sk_user_data = NULL;
2466  
2467  	newsk->sk_err	   = 0;
2468  	newsk->sk_err_soft = 0;
2469  	newsk->sk_priority = 0;
2470  	newsk->sk_incoming_cpu = raw_smp_processor_id();
2471  
2472  	/* Before updating sk_refcnt, we must commit prior changes to memory
2473  	 * (Documentation/RCU/rculist_nulls.rst for details)
2474  	 */
2475  	smp_wmb();
2476  	refcount_set(&newsk->sk_refcnt, 2);
2477  
2478  	sk_set_socket(newsk, NULL);
2479  	sk_tx_queue_clear(newsk);
2480  	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2481  
2482  	if (newsk->sk_prot->sockets_allocated)
2483  		sk_sockets_allocated_inc(newsk);
2484  
2485  	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2486  		net_enable_timestamp();
2487  out:
2488  	return newsk;
2489  }
2490  EXPORT_SYMBOL_GPL(sk_clone_lock);
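
/* A sketch of the calling convention documented above: the clone is
 * returned locked by bh_lock_sock(), so the caller must unlock it on
 * every path, including errors (my_init() is a hypothetical helper):
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (!newsk)
 *		return NULL;
 *	if (my_init(newsk)) {
 *		sk_free_unlock_clone(newsk);
 *		return NULL;
 *	}
 *	...
 *	bh_unlock_sock(newsk);
 */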
2491  
2492  void sk_free_unlock_clone(struct sock *sk)
2493  {
2494  	/* It is still a raw copy of the parent, so invalidate
2495  	 * the destructor and do a plain sk_free() */
2496  	sk->sk_destruct = NULL;
2497  	bh_unlock_sock(sk);
2498  	sk_free(sk);
2499  }
2500  EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2501  
2502  static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2503  {
2504  	bool is_ipv6 = false;
2505  	u32 max_size;
2506  
2507  #if IS_ENABLED(CONFIG_IPV6)
2508  	is_ipv6 = (sk->sk_family == AF_INET6 &&
2509  		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2510  #endif
2511  	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2512  	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2513  			READ_ONCE(dst->dev->gso_ipv4_max_size);
2514  	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2515  		max_size = GSO_LEGACY_MAX_SIZE;
2516  
2517  	return max_size - (MAX_TCP_HEADER + 1);
2518  }
2519  
2520  void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2521  {
2522  	u32 max_segs = 1;
2523  
2524  	sk->sk_route_caps = dst->dev->features;
2525  	if (sk_is_tcp(sk))
2526  		sk->sk_route_caps |= NETIF_F_GSO;
2527  	if (sk->sk_route_caps & NETIF_F_GSO)
2528  		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2529  	if (unlikely(sk->sk_gso_disabled))
2530  		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2531  	if (sk_can_gso(sk)) {
2532  		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2533  			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2534  		} else {
2535  			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2536  			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2537  			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2538  			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2539  		}
2540  	}
2541  	sk->sk_gso_max_segs = max_segs;
2542  	sk_dst_set(sk, dst);
2543  }
2544  EXPORT_SYMBOL_GPL(sk_setup_caps);
2545  
2546  /*
2547   *	Simple resource managers for sockets.
2548   */
2549  
2550  
2551  /*
2552   * Write buffer destructor automatically called from kfree_skb.
2553   */
2554  void sock_wfree(struct sk_buff *skb)
2555  {
2556  	struct sock *sk = skb->sk;
2557  	unsigned int len = skb->truesize;
2558  	bool free;
2559  
2560  	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2561  		if (sock_flag(sk, SOCK_RCU_FREE) &&
2562  		    sk->sk_write_space == sock_def_write_space) {
2563  			rcu_read_lock();
2564  			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2565  			sock_def_write_space_wfree(sk);
2566  			rcu_read_unlock();
2567  			if (unlikely(free))
2568  				__sk_free(sk);
2569  			return;
2570  		}
2571  
2572  		/*
2573  		 * Keep a reference on sk_wmem_alloc, this will be released
2574  		 * after sk_write_space() call
2575  		 */
2576  		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2577  		sk->sk_write_space(sk);
2578  		len = 1;
2579  	}
2580  	/*
2581  	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2582  	 * could not do because of in-flight packets
2583  	 */
2584  	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2585  		__sk_free(sk);
2586  }
2587  EXPORT_SYMBOL(sock_wfree);
2588  
2589  /* This variant of sock_wfree() is used by TCP,
2590   * since it sets SOCK_USE_WRITE_QUEUE.
2591   */
2592  void __sock_wfree(struct sk_buff *skb)
2593  {
2594  	struct sock *sk = skb->sk;
2595  
2596  	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2597  		__sk_free(sk);
2598  }
2599  
2600  void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2601  {
2602  	skb_orphan(skb);
2603  	skb->sk = sk;
2604  #ifdef CONFIG_INET
2605  	if (unlikely(!sk_fullsock(sk))) {
2606  		skb->destructor = sock_edemux;
2607  		sock_hold(sk);
2608  		return;
2609  	}
2610  #endif
2611  	skb->destructor = sock_wfree;
2612  	skb_set_hash_from_sk(skb, sk);
2613  	/*
2614  	 * We used to take a refcount on sk, but the following operation
2615  	 * is enough to guarantee sk_free() won't free this sock until
2616  	 * all in-flight packets are completed
2617  	 */
2618  	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2619  }
2620  EXPORT_SYMBOL(skb_set_owner_w);
2621  
2622  static bool can_skb_orphan_partial(const struct sk_buff *skb)
2623  {
2624  	/* Drivers depend on in-order delivery for crypto offload,
2625  	 * partial orphan breaks out-of-order-OK logic.
2626  	 */
2627  	if (skb_is_decrypted(skb))
2628  		return false;
2629  
2630  	return (skb->destructor == sock_wfree ||
2631  		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2632  }
2633  
2634  /* This helper is used by netem, as it can hold packets in its
2635   * delay queue. We want to allow the owner socket to send more
2636   * packets, as if they were already TX completed by a typical driver.
2637   * But we also want to keep skb->sk set because some packet schedulers
2638   * rely on it (sch_fq for example).
2639   */
2640  void skb_orphan_partial(struct sk_buff *skb)
2641  {
2642  	if (skb_is_tcp_pure_ack(skb))
2643  		return;
2644  
2645  	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2646  		return;
2647  
2648  	skb_orphan(skb);
2649  }
2650  EXPORT_SYMBOL(skb_orphan_partial);
2651  
2652  /*
2653   * Read buffer destructor automatically called from kfree_skb.
2654   */
2655  void sock_rfree(struct sk_buff *skb)
2656  {
2657  	struct sock *sk = skb->sk;
2658  	unsigned int len = skb->truesize;
2659  
2660  	atomic_sub(len, &sk->sk_rmem_alloc);
2661  	sk_mem_uncharge(sk, len);
2662  }
2663  EXPORT_SYMBOL(sock_rfree);
2664  
2665  /*
2666   * Buffer destructor for skbs that are not used directly in read or write
2667   * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2668   */
2669  void sock_efree(struct sk_buff *skb)
2670  {
2671  	sock_put(skb->sk);
2672  }
2673  EXPORT_SYMBOL(sock_efree);
2674  
2675  /* Buffer destructor for prefetch/receive path where reference count may
2676   * not be held, e.g. for listen sockets.
2677   */
2678  #ifdef CONFIG_INET
2679  void sock_pfree(struct sk_buff *skb)
2680  {
2681  	struct sock *sk = skb->sk;
2682  
2683  	if (!sk_is_refcounted(sk))
2684  		return;
2685  
2686  	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2687  		inet_reqsk(sk)->rsk_listener = NULL;
2688  		reqsk_free(inet_reqsk(sk));
2689  		return;
2690  	}
2691  
2692  	sock_gen_put(sk);
2693  }
2694  EXPORT_SYMBOL(sock_pfree);
2695  #endif /* CONFIG_INET */
2696  
2697  kuid_t sock_i_uid(struct sock *sk)
2698  {
2699  	kuid_t uid;
2700  
2701  	read_lock_bh(&sk->sk_callback_lock);
2702  	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2703  	read_unlock_bh(&sk->sk_callback_lock);
2704  	return uid;
2705  }
2706  EXPORT_SYMBOL(sock_i_uid);
2707  
2708  unsigned long __sock_i_ino(struct sock *sk)
2709  {
2710  	unsigned long ino;
2711  
2712  	read_lock(&sk->sk_callback_lock);
2713  	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2714  	read_unlock(&sk->sk_callback_lock);
2715  	return ino;
2716  }
2717  EXPORT_SYMBOL(__sock_i_ino);
2718  
2719  unsigned long sock_i_ino(struct sock *sk)
2720  {
2721  	unsigned long ino;
2722  
2723  	local_bh_disable();
2724  	ino = __sock_i_ino(sk);
2725  	local_bh_enable();
2726  	return ino;
2727  }
2728  EXPORT_SYMBOL(sock_i_ino);
2729  
2730  /*
2731   * Allocate a skb from the socket's send buffer.
2732   */
2733  struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2734  			     gfp_t priority)
2735  {
2736  	if (force ||
2737  	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2738  		struct sk_buff *skb = alloc_skb(size, priority);
2739  
2740  		if (skb) {
2741  			skb_set_owner_w(skb, sk);
2742  			return skb;
2743  		}
2744  	}
2745  	return NULL;
2746  }
2747  EXPORT_SYMBOL(sock_wmalloc);
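
/* A small sketch: sock_wmalloc() merely bounds alloc_skb() by sk_sndbuf and
 * charges the result to the socket via skb_set_owner_w(); a caller that
 * needs the same write-buffer accounting for an skb it allocated itself
 * can do the equivalent by hand:
 *
 *	skb = alloc_skb(len, GFP_KERNEL);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);	(charges skb->truesize to sk_wmem_alloc)
 */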
2748  
2749  static void sock_ofree(struct sk_buff *skb)
2750  {
2751  	struct sock *sk = skb->sk;
2752  
2753  	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2754  }
2755  
2756  struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2757  			     gfp_t priority)
2758  {
2759  	struct sk_buff *skb;
2760  
2761  	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2762  	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2763  	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2764  		return NULL;
2765  
2766  	skb = alloc_skb(size, priority);
2767  	if (!skb)
2768  		return NULL;
2769  
2770  	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2771  	skb->sk = sk;
2772  	skb->destructor = sock_ofree;
2773  	return skb;
2774  }
2775  
2776  /*
2777   * Allocate a memory block from the socket's option memory buffer.
2778   */
2779  void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2780  {
2781  	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2782  
2783  	if ((unsigned int)size <= optmem_max &&
2784  	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2785  		void *mem;
2786  		/* First do the add, to avoid the race if kmalloc
2787  		 * might sleep.
2788  		 */
2789  		atomic_add(size, &sk->sk_omem_alloc);
2790  		mem = kmalloc(size, priority);
2791  		if (mem)
2792  			return mem;
2793  		atomic_sub(size, &sk->sk_omem_alloc);
2794  	}
2795  	return NULL;
2796  }
2797  EXPORT_SYMBOL(sock_kmalloc);
2798  
2799  /* Free an option memory block. Note, we actually want the inline
2800   * here as this allows gcc to detect the nullify and fold away the
2801   * condition entirely.
2802   */
2803  static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2804  				  const bool nullify)
2805  {
2806  	if (WARN_ON_ONCE(!mem))
2807  		return;
2808  	if (nullify)
2809  		kfree_sensitive(mem);
2810  	else
2811  		kfree(mem);
2812  	atomic_sub(size, &sk->sk_omem_alloc);
2813  }
2814  
2815  void sock_kfree_s(struct sock *sk, void *mem, int size)
2816  {
2817  	__sock_kfree_s(sk, mem, size, false);
2818  }
2819  EXPORT_SYMBOL(sock_kfree_s);
2820  
2821  void sock_kzfree_s(struct sock *sk, void *mem, int size)
2822  {
2823  	__sock_kfree_s(sk, mem, size, true);
2824  }
2825  EXPORT_SYMBOL(sock_kzfree_s);
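
/* A minimal sketch of the option-memory API above: allocations are charged
 * to sk_omem_alloc and must be released with the same size; the kzfree
 * variant is preferred when the buffer may hold sensitive data:
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 */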
2826  
2827  /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2828     I think these locks should be removed for datagram sockets.
2829   */
2830  static long sock_wait_for_wmem(struct sock *sk, long timeo)
2831  {
2832  	DEFINE_WAIT(wait);
2833  
2834  	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2835  	for (;;) {
2836  		if (!timeo)
2837  			break;
2838  		if (signal_pending(current))
2839  			break;
2840  		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2841  		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2842  		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2843  			break;
2844  		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2845  			break;
2846  		if (READ_ONCE(sk->sk_err))
2847  			break;
2848  		timeo = schedule_timeout(timeo);
2849  	}
2850  	finish_wait(sk_sleep(sk), &wait);
2851  	return timeo;
2852  }
2853  
2854  
2855  /*
2856   *	Generic send/receive buffer handlers
2857   */
2858  
2859  struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2860  				     unsigned long data_len, int noblock,
2861  				     int *errcode, int max_page_order)
2862  {
2863  	struct sk_buff *skb;
2864  	long timeo;
2865  	int err;
2866  
2867  	timeo = sock_sndtimeo(sk, noblock);
2868  	for (;;) {
2869  		err = sock_error(sk);
2870  		if (err != 0)
2871  			goto failure;
2872  
2873  		err = -EPIPE;
2874  		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2875  			goto failure;
2876  
2877  		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2878  			break;
2879  
2880  		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2881  		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2882  		err = -EAGAIN;
2883  		if (!timeo)
2884  			goto failure;
2885  		if (signal_pending(current))
2886  			goto interrupted;
2887  		timeo = sock_wait_for_wmem(sk, timeo);
2888  	}
2889  	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2890  				   errcode, sk->sk_allocation);
2891  	if (skb)
2892  		skb_set_owner_w(skb, sk);
2893  	return skb;
2894  
2895  interrupted:
2896  	err = sock_intr_errno(timeo);
2897  failure:
2898  	*errcode = err;
2899  	return NULL;
2900  }
2901  EXPORT_SYMBOL(sock_alloc_send_pskb);
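
/* A sketch of the typical datagram sendmsg use of the helper above: block
 * up to the socket's send timeout for write space, then let the skb be
 * charged to the socket (the hlen/dlen split is protocol specific):
 *
 *	skb = sock_alloc_send_pskb(sk, hlen, dlen,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		return err;
 */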
2902  
2903  int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2904  		     struct sockcm_cookie *sockc)
2905  {
2906  	u32 tsflags;
2907  
2908  	switch (cmsg->cmsg_type) {
2909  	case SO_MARK:
2910  		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2911  		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2912  			return -EPERM;
2913  		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2914  			return -EINVAL;
2915  		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2916  		break;
2917  	case SO_TIMESTAMPING_OLD:
2918  	case SO_TIMESTAMPING_NEW:
2919  		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2920  			return -EINVAL;
2921  
2922  		tsflags = *(u32 *)CMSG_DATA(cmsg);
2923  		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2924  			return -EINVAL;
2925  
2926  		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2927  		sockc->tsflags |= tsflags;
2928  		break;
2929  	case SCM_TXTIME:
2930  		if (!sock_flag(sk, SOCK_TXTIME))
2931  			return -EINVAL;
2932  		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2933  			return -EINVAL;
2934  		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2935  		break;
2936  	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2937  	case SCM_RIGHTS:
2938  	case SCM_CREDENTIALS:
2939  		break;
2940  	default:
2941  		return -EINVAL;
2942  	}
2943  	return 0;
2944  }
2945  EXPORT_SYMBOL(__sock_cmsg_send);
2946  
2947  int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2948  		   struct sockcm_cookie *sockc)
2949  {
2950  	struct cmsghdr *cmsg;
2951  	int ret;
2952  
2953  	for_each_cmsghdr(cmsg, msg) {
2954  		if (!CMSG_OK(msg, cmsg))
2955  			return -EINVAL;
2956  		if (cmsg->cmsg_level != SOL_SOCKET)
2957  			continue;
2958  		ret = __sock_cmsg_send(sk, cmsg, sockc);
2959  		if (ret)
2960  			return ret;
2961  	}
2962  	return 0;
2963  }
2964  EXPORT_SYMBOL(sock_cmsg_send);
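
/* A sketch of how a sendmsg() implementation is expected to consume the
 * SOL_SOCKET control messages parsed above, seeding the cookie from the
 * socket's defaults before overriding it from the cmsg data:
 *
 *	struct sockcm_cookie sockc = { .tsflags = READ_ONCE(sk->sk_tsflags) };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */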
2965  
2966  static void sk_enter_memory_pressure(struct sock *sk)
2967  {
2968  	if (!sk->sk_prot->enter_memory_pressure)
2969  		return;
2970  
2971  	sk->sk_prot->enter_memory_pressure(sk);
2972  }
2973  
2974  static void sk_leave_memory_pressure(struct sock *sk)
2975  {
2976  	if (sk->sk_prot->leave_memory_pressure) {
2977  		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2978  				     tcp_leave_memory_pressure, sk);
2979  	} else {
2980  		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2981  
2982  		if (memory_pressure && READ_ONCE(*memory_pressure))
2983  			WRITE_ONCE(*memory_pressure, 0);
2984  	}
2985  }
2986  
2987  DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2988  
2989  /**
2990   * skb_page_frag_refill - check that a page_frag contains enough room
2991   * @sz: minimum size of the fragment we want to get
2992   * @pfrag: pointer to page_frag
2993   * @gfp: priority for memory allocation
2994   *
2995   * Note: While this allocator tries to use high order pages, there is
2996   * no guarantee that allocations succeed. Therefore, @sz MUST be
2997   * less than or equal to PAGE_SIZE.
2998   */
2999  bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3000  {
3001  	if (pfrag->page) {
3002  		if (page_ref_count(pfrag->page) == 1) {
3003  			pfrag->offset = 0;
3004  			return true;
3005  		}
3006  		if (pfrag->offset + sz <= pfrag->size)
3007  			return true;
3008  		put_page(pfrag->page);
3009  	}
3010  
3011  	pfrag->offset = 0;
3012  	if (SKB_FRAG_PAGE_ORDER &&
3013  	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3014  		/* Avoid direct reclaim but allow kswapd to wake */
3015  		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3016  					  __GFP_COMP | __GFP_NOWARN |
3017  					  __GFP_NORETRY,
3018  					  SKB_FRAG_PAGE_ORDER);
3019  		if (likely(pfrag->page)) {
3020  			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3021  			return true;
3022  		}
3023  	}
3024  	pfrag->page = alloc_page(gfp);
3025  	if (likely(pfrag->page)) {
3026  		pfrag->size = PAGE_SIZE;
3027  		return true;
3028  	}
3029  	return false;
3030  }
3031  EXPORT_SYMBOL(skb_page_frag_refill);
3032  
3033  bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3034  {
3035  	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3036  		return true;
3037  
3038  	sk_enter_memory_pressure(sk);
3039  	sk_stream_moderate_sndbuf(sk);
3040  	return false;
3041  }
3042  EXPORT_SYMBOL(sk_page_frag_refill);
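
/* A sketch of the copy loop the refill helpers above serve: reserve space
 * in the per-socket page_frag, copy payload into it, then advance the
 * offset (memory accounting and skb frag setup are omitted):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	... copy user data into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 */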
3043  
3044  void __lock_sock(struct sock *sk)
3045  	__releases(&sk->sk_lock.slock)
3046  	__acquires(&sk->sk_lock.slock)
3047  {
3048  	DEFINE_WAIT(wait);
3049  
3050  	for (;;) {
3051  		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3052  					TASK_UNINTERRUPTIBLE);
3053  		spin_unlock_bh(&sk->sk_lock.slock);
3054  		schedule();
3055  		spin_lock_bh(&sk->sk_lock.slock);
3056  		if (!sock_owned_by_user(sk))
3057  			break;
3058  	}
3059  	finish_wait(&sk->sk_lock.wq, &wait);
3060  }
3061  
3062  void __release_sock(struct sock *sk)
3063  	__releases(&sk->sk_lock.slock)
3064  	__acquires(&sk->sk_lock.slock)
3065  {
3066  	struct sk_buff *skb, *next;
3067  
3068  	while ((skb = sk->sk_backlog.head) != NULL) {
3069  		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3070  
3071  		spin_unlock_bh(&sk->sk_lock.slock);
3072  
3073  		do {
3074  			next = skb->next;
3075  			prefetch(next);
3076  			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3077  			skb_mark_not_on_list(skb);
3078  			sk_backlog_rcv(sk, skb);
3079  
3080  			cond_resched();
3081  
3082  			skb = next;
3083  		} while (skb != NULL);
3084  
3085  		spin_lock_bh(&sk->sk_lock.slock);
3086  	}
3087  
3088  	/*
3089  	 * Doing the zeroing here guarantees we cannot loop forever
3090  	 * while a wild producer attempts to flood us.
3091  	 */
3092  	sk->sk_backlog.len = 0;
3093  }
3094  
3095  void __sk_flush_backlog(struct sock *sk)
3096  {
3097  	spin_lock_bh(&sk->sk_lock.slock);
3098  	__release_sock(sk);
3099  
3100  	if (sk->sk_prot->release_cb)
3101  		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3102  				     tcp_release_cb, sk);
3103  
3104  	spin_unlock_bh(&sk->sk_lock.slock);
3105  }
3106  EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3107  
3108  /**
3109   * sk_wait_data - wait for data to arrive at sk_receive_queue
3110   * @sk:    sock to wait on
3111   * @timeo: for how long
3112   * @skb:   last skb seen on sk_receive_queue
3113   *
3114   * Now socket state including sk->sk_err is changed only under lock,
3115   * hence we may omit checks after joining wait queue.
3116   * We check receive queue before schedule() only as optimization;
3117   * it is very likely that release_sock() added new data.
3118   */
3119  int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3120  {
3121  	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3122  	int rc;
3123  
3124  	add_wait_queue(sk_sleep(sk), &wait);
3125  	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3126  	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3127  	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3128  	remove_wait_queue(sk_sleep(sk), &wait);
3129  	return rc;
3130  }
3131  EXPORT_SYMBOL(sk_wait_data);
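
/* A sketch of the canonical blocking recvmsg() loop built on the helper
 * above; the socket lock is held around the queue check and is dropped by
 * sk_wait_event() while sleeping:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */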
3132  
3133  /**
3134   *	__sk_mem_raise_allocated - increase memory_allocated
3135   *	@sk: socket
3136   *	@size: memory size to allocate
3137   *	@amt: pages to allocate
3138   *	@kind: allocation type
3139   *
3140   *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3141   *
3142   *	Unlike the globally shared limits among the sockets under same protocol,
3143   *	consuming the budget of a memcg won't have direct effect on other ones.
3144   *	So be optimistic about memcg's tolerance, and leave the callers to decide
3145   *	whether or not to raise allocated through sk_under_memory_pressure() or
3146   *	its variants.
3147   */
3148  int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3149  {
3150  	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3151  	struct proto *prot = sk->sk_prot;
3152  	bool charged = false;
3153  	long allocated;
3154  
3155  	sk_memory_allocated_add(sk, amt);
3156  	allocated = sk_memory_allocated(sk);
3157  
3158  	if (memcg) {
3159  		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3160  			goto suppress_allocation;
3161  		charged = true;
3162  	}
3163  
3164  	/* Under limit. */
3165  	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3166  		sk_leave_memory_pressure(sk);
3167  		return 1;
3168  	}
3169  
3170  	/* Under pressure. */
3171  	if (allocated > sk_prot_mem_limits(sk, 1))
3172  		sk_enter_memory_pressure(sk);
3173  
3174  	/* Over hard limit. */
3175  	if (allocated > sk_prot_mem_limits(sk, 2))
3176  		goto suppress_allocation;
3177  
3178  	/* Guarantee minimum buffer size under pressure (either global
3179  	 * or memcg) to make sure features described in RFC 7323 (TCP
3180  	 * Extensions for High Performance) work properly.
3181  	 *
3182  	 * This rule does NOT stand when usage exceeds the global or memcg
3183  	 * hard limit, or else a DoS attack could take place by spawning
3184  	 * lots of sockets whose usage is under the minimum buffer size.
3185  	 */
3186  	if (kind == SK_MEM_RECV) {
3187  		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3188  			return 1;
3189  
3190  	} else { /* SK_MEM_SEND */
3191  		int wmem0 = sk_get_wmem0(sk, prot);
3192  
3193  		if (sk->sk_type == SOCK_STREAM) {
3194  			if (sk->sk_wmem_queued < wmem0)
3195  				return 1;
3196  		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3197  				return 1;
3198  		}
3199  	}
3200  
3201  	if (sk_has_memory_pressure(sk)) {
3202  		u64 alloc;
3203  
3204  		/* The following 'average' heuristic is within the
3205  		 * scope of global accounting, so it only makes
3206  		 * sense for global memory pressure.
3207  		 */
3208  		if (!sk_under_global_memory_pressure(sk))
3209  			return 1;
3210  
3211  		/* Try to be fair among all the sockets under global
3212  		 * pressure by allowing the ones whose usage is below
3213  		 * average to raise it.
3214  		 */
3215  		alloc = sk_sockets_allocated_read_positive(sk);
3216  		if (sk_prot_mem_limits(sk, 2) > alloc *
3217  		    sk_mem_pages(sk->sk_wmem_queued +
3218  				 atomic_read(&sk->sk_rmem_alloc) +
3219  				 sk->sk_forward_alloc))
3220  			return 1;
3221  	}
3222  
3223  suppress_allocation:
3224  
3225  	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3226  		sk_stream_moderate_sndbuf(sk);
3227  
3228  		/* Fail only if socket is _under_ its sndbuf.
3229  		 * In this case we cannot block, so that we have to fail.
3230  		 */
3231  		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3232  			/* Force charge with __GFP_NOFAIL */
3233  			if (memcg && !charged) {
3234  				mem_cgroup_charge_skmem(memcg, amt,
3235  					gfp_memcg_charge() | __GFP_NOFAIL);
3236  			}
3237  			return 1;
3238  		}
3239  	}
3240  
3241  	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3242  		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3243  
3244  	sk_memory_allocated_sub(sk, amt);
3245  
3246  	if (charged)
3247  		mem_cgroup_uncharge_skmem(memcg, amt);
3248  
3249  	return 0;
3250  }
3251  
3252  /**
3253   *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3254   *	@sk: socket
3255   *	@size: memory size to allocate
3256   *	@kind: allocation type
3257   *
3258   *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3259   *	rmem allocation. This function assumes that protocols which have
3260   *	memory_pressure use sk_wmem_queued as write buffer accounting.
3261   */
3262  int __sk_mem_schedule(struct sock *sk, int size, int kind)
3263  {
3264  	int ret, amt = sk_mem_pages(size);
3265  
3266  	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3267  	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3268  	if (!ret)
3269  		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3270  	return ret;
3271  }
3272  EXPORT_SYMBOL(__sk_mem_schedule);
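
/* A sketch of charging receive memory before queueing an skb, calling the
 * helper above directly (protocols normally reach it through the
 * sk_rmem_schedule()/sk_wmem_schedule() wrappers in net/sock.h):
 *
 *	if (!__sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV))
 *		goto drop;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */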
3273  
3274  /**
3275   *	__sk_mem_reduce_allocated - reclaim memory_allocated
3276   *	@sk: socket
3277   *	@amount: number of quanta
3278   *
3279   *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3280   */
3281  void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3282  {
3283  	sk_memory_allocated_sub(sk, amount);
3284  
3285  	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3286  		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3287  
3288  	if (sk_under_global_memory_pressure(sk) &&
3289  	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3290  		sk_leave_memory_pressure(sk);
3291  }
3292  
3293  /**
3294   *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3295   *	@sk: socket
3296   *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3297   */
3298  void __sk_mem_reclaim(struct sock *sk, int amount)
3299  {
3300  	amount >>= PAGE_SHIFT;
3301  	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3302  	__sk_mem_reduce_allocated(sk, amount);
3303  }
3304  EXPORT_SYMBOL(__sk_mem_reclaim);
3305  
3306  int sk_set_peek_off(struct sock *sk, int val)
3307  {
3308  	WRITE_ONCE(sk->sk_peek_off, val);
3309  	return 0;
3310  }
3311  EXPORT_SYMBOL_GPL(sk_set_peek_off);
3312  
3313  /*
3314   * Set of default routines for initialising struct proto_ops when
3315   * the protocol does not support a particular function. In certain
3316   * cases where it makes no sense for a protocol to have a "do nothing"
3317   * function, some default processing is provided.
3318   */
3319  
3320  int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3321  {
3322  	return -EOPNOTSUPP;
3323  }
3324  EXPORT_SYMBOL(sock_no_bind);
3325  
3326  int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3327  		    int len, int flags)
3328  {
3329  	return -EOPNOTSUPP;
3330  }
3331  EXPORT_SYMBOL(sock_no_connect);
3332  
3333  int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3334  {
3335  	return -EOPNOTSUPP;
3336  }
3337  EXPORT_SYMBOL(sock_no_socketpair);
3338  
3339  int sock_no_accept(struct socket *sock, struct socket *newsock,
3340  		   struct proto_accept_arg *arg)
3341  {
3342  	return -EOPNOTSUPP;
3343  }
3344  EXPORT_SYMBOL(sock_no_accept);
3345  
3346  int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3347  		    int peer)
3348  {
3349  	return -EOPNOTSUPP;
3350  }
3351  EXPORT_SYMBOL(sock_no_getname);
3352  
3353  int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3354  {
3355  	return -EOPNOTSUPP;
3356  }
3357  EXPORT_SYMBOL(sock_no_ioctl);
3358  
3359  int sock_no_listen(struct socket *sock, int backlog)
3360  {
3361  	return -EOPNOTSUPP;
3362  }
3363  EXPORT_SYMBOL(sock_no_listen);
3364  
3365  int sock_no_shutdown(struct socket *sock, int how)
3366  {
3367  	return -EOPNOTSUPP;
3368  }
3369  EXPORT_SYMBOL(sock_no_shutdown);
3370  
3371  int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3372  {
3373  	return -EOPNOTSUPP;
3374  }
3375  EXPORT_SYMBOL(sock_no_sendmsg);
3376  
3377  int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3378  {
3379  	return -EOPNOTSUPP;
3380  }
3381  EXPORT_SYMBOL(sock_no_sendmsg_locked);
3382  
3383  int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3384  		    int flags)
3385  {
3386  	return -EOPNOTSUPP;
3387  }
3388  EXPORT_SYMBOL(sock_no_recvmsg);
3389  
3390  int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3391  {
3392  	/* Mirror missing mmap method error code */
3393  	return -ENODEV;
3394  }
3395  EXPORT_SYMBOL(sock_no_mmap);
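
/* A sketch of how a minimal protocol wires these stubs into its proto_ops
 * table, overriding only what it actually implements; the "my_" names and
 * PF_MYPROTO are hypothetical:
 *
 *	static const struct proto_ops my_dgram_ops = {
 *		.family		= PF_MYPROTO,
 *		.owner		= THIS_MODULE,
 *		.release	= my_release,
 *		.bind		= my_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.sendmsg	= my_sendmsg,
 *		.recvmsg	= my_recvmsg,
 *		.mmap		= sock_no_mmap,
 *	};
 */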
3396  
3397  /*
3398   * When a file is received (via SCM_RIGHTS, etc), we must bump the
3399   * various sock-based usage counts.
3400   */
3401  void __receive_sock(struct file *file)
3402  {
3403  	struct socket *sock;
3404  
3405  	sock = sock_from_file(file);
3406  	if (sock) {
3407  		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3408  		sock_update_classid(&sock->sk->sk_cgrp_data);
3409  	}
3410  }
3411  
3412  /*
3413   *	Default Socket Callbacks
3414   */
3415  
3416  static void sock_def_wakeup(struct sock *sk)
3417  {
3418  	struct socket_wq *wq;
3419  
3420  	rcu_read_lock();
3421  	wq = rcu_dereference(sk->sk_wq);
3422  	if (skwq_has_sleeper(wq))
3423  		wake_up_interruptible_all(&wq->wait);
3424  	rcu_read_unlock();
3425  }
3426  
3427  static void sock_def_error_report(struct sock *sk)
3428  {
3429  	struct socket_wq *wq;
3430  
3431  	rcu_read_lock();
3432  	wq = rcu_dereference(sk->sk_wq);
3433  	if (skwq_has_sleeper(wq))
3434  		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3435  	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3436  	rcu_read_unlock();
3437  }
3438  
3439  void sock_def_readable(struct sock *sk)
3440  {
3441  	struct socket_wq *wq;
3442  
3443  	trace_sk_data_ready(sk);
3444  
3445  	rcu_read_lock();
3446  	wq = rcu_dereference(sk->sk_wq);
3447  	if (skwq_has_sleeper(wq))
3448  		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3449  						EPOLLRDNORM | EPOLLRDBAND);
3450  	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3451  	rcu_read_unlock();
3452  }
3453  
3454  static void sock_def_write_space(struct sock *sk)
3455  {
3456  	struct socket_wq *wq;
3457  
3458  	rcu_read_lock();
3459  
3460  	/* Do not wake up a writer until he can make "significant"
3461  	 * progress.  --DaveM
3462  	 */
3463  	if (sock_writeable(sk)) {
3464  		wq = rcu_dereference(sk->sk_wq);
3465  		if (skwq_has_sleeper(wq))
3466  			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3467  						EPOLLWRNORM | EPOLLWRBAND);
3468  
3469  		/* Should agree with poll, otherwise some programs break */
3470  		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3471  	}
3472  
3473  	rcu_read_unlock();
3474  }
3475  
3476  /* An optimised version of sock_def_write_space(), should only be called
3477   * for SOCK_RCU_FREE sockets under RCU read section and after putting
3478   * ->sk_wmem_alloc.
3479   */
3480  static void sock_def_write_space_wfree(struct sock *sk)
3481  {
3482  	/* Do not wake up a writer until he can make "significant"
3483  	 * progress.  --DaveM
3484  	 */
3485  	if (sock_writeable(sk)) {
3486  		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3487  
3488  		/* rely on refcount_sub from sock_wfree() */
3489  		smp_mb__after_atomic();
3490  		if (wq && waitqueue_active(&wq->wait))
3491  			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3492  						EPOLLWRNORM | EPOLLWRBAND);
3493  
3494  		/* Should agree with poll, otherwise some programs break */
3495  		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3496  	}
3497  }
3498  
3499  static void sock_def_destruct(struct sock *sk)
3500  {
3501  }
3502  
3503  void sk_send_sigurg(struct sock *sk)
3504  {
3505  	if (sk->sk_socket && sk->sk_socket->file)
3506  		if (send_sigurg(sk->sk_socket->file))
3507  			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3508  }
3509  EXPORT_SYMBOL(sk_send_sigurg);
3510  
3511  void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3512  		    unsigned long expires)
3513  {
3514  	if (!mod_timer(timer, expires))
3515  		sock_hold(sk);
3516  }
3517  EXPORT_SYMBOL(sk_reset_timer);
3518  
3519  void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3520  {
3521  	if (del_timer(timer))
3522  		__sock_put(sk);
3523  }
3524  EXPORT_SYMBOL(sk_stop_timer);
3525  
3526  void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3527  {
3528  	if (del_timer_sync(timer))
3529  		__sock_put(sk);
3530  }
3531  EXPORT_SYMBOL(sk_stop_timer_sync);
3532  
3533  void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3534  {
3535  	sk_init_common(sk);
3536  	sk->sk_send_head	=	NULL;
3537  
3538  	timer_setup(&sk->sk_timer, NULL, 0);
3539  
3540  	sk->sk_allocation	=	GFP_KERNEL;
3541  	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3542  	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3543  	sk->sk_state		=	TCP_CLOSE;
3544  	sk->sk_use_task_frag	=	true;
3545  	sk_set_socket(sk, sock);
3546  
3547  	sock_set_flag(sk, SOCK_ZAPPED);
3548  
3549  	if (sock) {
3550  		sk->sk_type	=	sock->type;
3551  		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3552  		sock->sk	=	sk;
3553  	} else {
3554  		RCU_INIT_POINTER(sk->sk_wq, NULL);
3555  	}
3556  	sk->sk_uid	=	uid;
3557  
3558  	sk->sk_state_change	=	sock_def_wakeup;
3559  	sk->sk_data_ready	=	sock_def_readable;
3560  	sk->sk_write_space	=	sock_def_write_space;
3561  	sk->sk_error_report	=	sock_def_error_report;
3562  	sk->sk_destruct		=	sock_def_destruct;
3563  
3564  	sk->sk_frag.page	=	NULL;
3565  	sk->sk_frag.offset	=	0;
3566  	sk->sk_peek_off		=	-1;
3567  
3568  	sk->sk_peer_pid 	=	NULL;
3569  	sk->sk_peer_cred	=	NULL;
3570  	spin_lock_init(&sk->sk_peer_lock);
3571  
3572  	sk->sk_write_pending	=	0;
3573  	sk->sk_rcvlowat		=	1;
3574  	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3575  	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3576  
3577  	sk->sk_stamp = SK_DEFAULT_STAMP;
3578  #if BITS_PER_LONG==32
3579  	seqlock_init(&sk->sk_stamp_seq);
3580  #endif
3581  	atomic_set(&sk->sk_zckey, 0);
3582  
3583  #ifdef CONFIG_NET_RX_BUSY_POLL
3584  	sk->sk_napi_id		=	0;
3585  	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3586  #endif
3587  
3588  	sk->sk_max_pacing_rate = ~0UL;
3589  	sk->sk_pacing_rate = ~0UL;
3590  	WRITE_ONCE(sk->sk_pacing_shift, 10);
3591  	sk->sk_incoming_cpu = -1;
3592  
3593  	sk_rx_queue_clear(sk);
3594  	/*
3595  	 * Before updating sk_refcnt, we must commit prior changes to memory
3596  	 * (Documentation/RCU/rculist_nulls.rst for details)
3597  	 */
3598  	smp_wmb();
3599  	refcount_set(&sk->sk_refcnt, 1);
3600  	atomic_set(&sk->sk_drops, 0);
3601  }
3602  EXPORT_SYMBOL(sock_init_data_uid);
3603  
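/* sock_init_data() is the common wrapper around sock_init_data_uid(): the
 * owning uid is taken from the socket inode when @sock is given, otherwise
 * root in the socket's network namespace is used.
 */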
3604  void sock_init_data(struct socket *sock, struct sock *sk)
3605  {
3606  	kuid_t uid = sock ?
3607  		SOCK_INODE(sock)->i_uid :
3608  		make_kuid(sock_net(sk)->user_ns, 0);
3609  
3610  	sock_init_data_uid(sock, sk, uid);
3611  }
3612  EXPORT_SYMBOL(sock_init_data);
3613  
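/* The "socket mutex". lock_sock_nested() only holds sk_lock.slock briefly to
 * mark the lock as owned; packets delivered from softirq context while the
 * lock is owned are queued on the backlog and processed later by
 * release_sock().
 *
 * Minimal usage sketch from process context (lock_sock() in
 * include/net/sock.h expands to lock_sock_nested(sk, 0)):
 *
 *	lock_sock(sk);
 *	... update protocol state ...
 *	release_sock(sk);
 */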
3614  void lock_sock_nested(struct sock *sk, int subclass)
3615  {
3616  	/* The sk_lock has mutex_lock() semantics here. */
3617  	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3618  
3619  	might_sleep();
3620  	spin_lock_bh(&sk->sk_lock.slock);
3621  	if (sock_owned_by_user_nocheck(sk))
3622  		__lock_sock(sk);
3623  	sk->sk_lock.owned = 1;
3624  	spin_unlock_bh(&sk->sk_lock.slock);
3625  }
3626  EXPORT_SYMBOL(lock_sock_nested);
3627  
3628  void release_sock(struct sock *sk)
3629  {
3630  	spin_lock_bh(&sk->sk_lock.slock);
3631  	if (sk->sk_backlog.tail)
3632  		__release_sock(sk);
3633  
3634  	if (sk->sk_prot->release_cb)
3635  		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3636  				     tcp_release_cb, sk);
3637  
3638  	sock_release_ownership(sk);
3639  	if (waitqueue_active(&sk->sk_lock.wq))
3640  		wake_up(&sk->sk_lock.wq);
3641  	spin_unlock_bh(&sk->sk_lock.slock);
3642  }
3643  EXPORT_SYMBOL(release_sock);
3644  
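/* Fast variant of the socket lock: returns false when the fast path was taken
 * (the "mutex" was uncontended, bottom halves stay disabled and slock stays
 * held) and true after falling back to the owned-lock slow path. Callers
 * normally use the lock_sock_fast()/unlock_sock_fast() wrappers from
 * include/net/sock.h, roughly:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short critical section ...
 *	unlock_sock_fast(sk, slow);
 */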
3645  bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3646  {
3647  	might_sleep();
3648  	spin_lock_bh(&sk->sk_lock.slock);
3649  
3650  	if (!sock_owned_by_user_nocheck(sk)) {
3651  		/*
3652  		 * Fast path return with bottom halves disabled and
3653  		 * sock::sk_lock.slock held.
3654  		 *
3655  		 * The 'mutex' is not contended and holding
3656  		 * sock::sk_lock.slock prevents all other lockers from
3657  		 * proceeding, so the corresponding unlock_sock_fast() can
3658  		 * avoid the slow path of release_sock() completely and
3659  		 * just release slock.
3660  		 *
3661  		 * From a semantic point of view this is equivalent to
3662  		 * 'acquiring' the 'mutex', hence the corresponding lockdep
3663  		 * mutex_release() has to happen in the fast path of
3664  		 * unlock_sock_fast().
3665  		 */
3666  		return false;
3667  	}
3668  
3669  	__lock_sock(sk);
3670  	sk->sk_lock.owned = 1;
3671  	__acquire(&sk->sk_lock.slock);
3672  	spin_unlock_bh(&sk->sk_lock.slock);
3673  	return true;
3674  }
3675  EXPORT_SYMBOL(__lock_sock_fast);
3676  
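/* Report the timestamp of the last received packet to userspace; this backs
 * the SIOCGSTAMP-style ioctls. Timestamping is enabled lazily on first use,
 * and a zero timestamp is replaced by the current time so the first query
 * still returns something sensible.
 */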
3677  int sock_gettstamp(struct socket *sock, void __user *userstamp,
3678  		   bool timeval, bool time32)
3679  {
3680  	struct sock *sk = sock->sk;
3681  	struct timespec64 ts;
3682  
3683  	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3684  	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3685  	if (ts.tv_sec == -1)
3686  		return -ENOENT;
3687  	if (ts.tv_sec == 0) {
3688  		ktime_t kt = ktime_get_real();
3689  		sock_write_timestamp(sk, kt);
3690  		ts = ktime_to_timespec64(kt);
3691  	}
3692  
3693  	if (timeval)
3694  		ts.tv_nsec /= 1000;
3695  
3696  #ifdef CONFIG_COMPAT_32BIT_TIME
3697  	if (time32)
3698  		return put_old_timespec32(&ts, userstamp);
3699  #endif
3700  #ifdef CONFIG_SPARC64
3701  	/* beware of padding in sparc64 timeval */
3702  	if (timeval && !in_compat_syscall()) {
3703  		struct __kernel_old_timeval __user tv = {
3704  			.tv_sec = ts.tv_sec,
3705  			.tv_usec = ts.tv_nsec,
3706  		};
3707  		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3708  			return -EFAULT;
3709  		return 0;
3710  	}
3711  #endif
3712  	return put_timespec64(&ts, userstamp);
3713  }
3714  EXPORT_SYMBOL(sock_gettstamp);
3715  
3716  void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3717  {
3718  	if (!sock_flag(sk, flag)) {
3719  		unsigned long previous_flags = sk->sk_flags;
3720  
3721  		sock_set_flag(sk, flag);
3722  		/*
3723  		 * We just set one of the two flags which require net
3724  		 * time stamping, but time stamping might already have been
3725  		 * enabled because of the other one.
3726  		 */
3727  		if (sock_needs_netstamp(sk) &&
3728  		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3729  			net_enable_timestamp();
3730  	}
3731  }
3732  
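/* Dequeue one skb from the socket error queue and deliver it as an
 * MSG_ERRQUEUE read: the payload is copied into @msg, the extended error
 * information is attached as a cmsg of the given @level/@type, and
 * MSG_ERRQUEUE is set in msg_flags. Returns the number of bytes copied, or
 * -EAGAIN when the error queue is empty.
 */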
3733  int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3734  		       int level, int type)
3735  {
3736  	struct sock_exterr_skb *serr;
3737  	struct sk_buff *skb;
3738  	int copied, err;
3739  
3740  	err = -EAGAIN;
3741  	skb = sock_dequeue_err_skb(sk);
3742  	if (skb == NULL)
3743  		goto out;
3744  
3745  	copied = skb->len;
3746  	if (copied > len) {
3747  		msg->msg_flags |= MSG_TRUNC;
3748  		copied = len;
3749  	}
3750  	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3751  	if (err)
3752  		goto out_free_skb;
3753  
3754  	sock_recv_timestamp(msg, sk, skb);
3755  
3756  	serr = SKB_EXT_ERR(skb);
3757  	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3758  
3759  	msg->msg_flags |= MSG_ERRQUEUE;
3760  	err = copied;
3761  
3762  out_free_skb:
3763  	kfree_skb(skb);
3764  out:
3765  	return err;
3766  }
3767  EXPORT_SYMBOL(sock_recv_errqueue);
3768  
3769  /*
3770   *	Get a socket option on a socket.
3771   *
3772   *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3773   *	asynchronous errors should be reported by getsockopt. We assume
3774   *	this means if you specify SO_ERROR (otherwise what is the point of it).
3775   */
3776  int sock_common_getsockopt(struct socket *sock, int level, int optname,
3777  			   char __user *optval, int __user *optlen)
3778  {
3779  	struct sock *sk = sock->sk;
3780  
3781  	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3782  	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3783  }
3784  EXPORT_SYMBOL(sock_common_getsockopt);
3785  
3786  int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3787  			int flags)
3788  {
3789  	struct sock *sk = sock->sk;
3790  	int addr_len = 0;
3791  	int err;
3792  
3793  	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3794  	if (err >= 0)
3795  		msg->msg_namelen = addr_len;
3796  	return err;
3797  }
3798  EXPORT_SYMBOL(sock_common_recvmsg);
3799  
3800  /*
3801   *	Set socket options on an inet socket.
3802   */
3803  int sock_common_setsockopt(struct socket *sock, int level, int optname,
3804  			   sockptr_t optval, unsigned int optlen)
3805  {
3806  	struct sock *sk = sock->sk;
3807  
3808  	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3809  	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3810  }
3811  EXPORT_SYMBOL(sock_common_setsockopt);
3812  
3813  void sk_common_release(struct sock *sk)
3814  {
3815  	if (sk->sk_prot->destroy)
3816  		sk->sk_prot->destroy(sk);
3817  
3818  	/*
3819  	 * Observation: when sk_common_release() is called, processes no
3820  	 * longer have access to the socket, but the network stack still does.
3821  	 * Step one, detach it from networking:
3822  	 *
3823  	 * A. Remove it from the hash tables.
3824  	 */
3825  
3826  	sk->sk_prot->unhash(sk);
3827  
3828  	if (sk->sk_socket)
3829  		sk->sk_socket->sk = NULL;
3830  
3831  	/*
3832  	 * At this point the socket cannot receive new packets, but some may
3833  	 * still be in flight because another CPU ran the receive path and did
3834  	 * the hash table lookup before we unhashed the socket. They will reach
3835  	 * the receive queue and be purged by the socket destructor.
3836  	 *
3837  	 * We also still have packets pending on the receive queue and, probably,
3838  	 * our own packets waiting in device queues. The destructor will drain
3839  	 * the receive queue, but transmitted packets will delay socket
3840  	 * destruction until the last reference is released.
3841  	 */
3842  
3843  	sock_orphan(sk);
3844  
3845  	xfrm_sk_free_policy(sk);
3846  
3847  	sock_put(sk);
3848  }
3849  EXPORT_SYMBOL(sk_common_release);
3850  
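/* Fill the SK_MEMINFO_* array with a snapshot of the socket's memory
 * accounting; this is what the sock_diag interface (e.g. "ss -m") exposes to
 * userspace.
 */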
3851  void sk_get_meminfo(const struct sock *sk, u32 *mem)
3852  {
3853  	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3854  
3855  	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3856  	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3857  	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3858  	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3859  	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3860  	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3861  	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3862  	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3863  	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3864  }
3865  
3866  #ifdef CONFIG_PROC_FS
3867  static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3868  
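/* The per-protocol socket counters are per-cpu and may be decremented on a
 * different CPU than the one that incremented them, so individual per-cpu
 * values (and transiently their sum) can be negative; the reported total is
 * therefore clamped to zero.
 */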
3869  int sock_prot_inuse_get(struct net *net, struct proto *prot)
3870  {
3871  	int cpu, idx = prot->inuse_idx;
3872  	int res = 0;
3873  
3874  	for_each_possible_cpu(cpu)
3875  		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3876  
3877  	return res >= 0 ? res : 0;
3878  }
3879  EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3880  
3881  int sock_inuse_get(struct net *net)
3882  {
3883  	int cpu, res = 0;
3884  
3885  	for_each_possible_cpu(cpu)
3886  		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3887  
3888  	return res;
3889  }
3891  EXPORT_SYMBOL_GPL(sock_inuse_get);
3892  
3893  static int __net_init sock_inuse_init_net(struct net *net)
3894  {
3895  	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3896  	if (net->core.prot_inuse == NULL)
3897  		return -ENOMEM;
3898  	return 0;
3899  }
3900  
3901  static void __net_exit sock_inuse_exit_net(struct net *net)
3902  {
3903  	free_percpu(net->core.prot_inuse);
3904  }
3905  
3906  static struct pernet_operations net_inuse_ops = {
3907  	.init = sock_inuse_init_net,
3908  	.exit = sock_inuse_exit_net,
3909  };
3910  
3911  static __init int net_inuse_init(void)
3912  {
3913  	if (register_pernet_subsys(&net_inuse_ops))
3914  		panic("Cannot initialize net inuse counters");
3915  
3916  	return 0;
3917  }
3918  
3919  core_initcall(net_inuse_init);
3920  
3921  static int assign_proto_idx(struct proto *prot)
3922  {
3923  	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3924  
3925  	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3926  		pr_err("PROTO_INUSE_NR exhausted\n");
3927  		return -ENOSPC;
3928  	}
3929  
3930  	set_bit(prot->inuse_idx, proto_inuse_idx);
3931  	return 0;
3932  }
3933  
3934  static void release_proto_idx(struct proto *prot)
3935  {
3936  	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3937  		clear_bit(prot->inuse_idx, proto_inuse_idx);
3938  }
3939  #else
3940  static inline int assign_proto_idx(struct proto *prot)
3941  {
3942  	return 0;
3943  }
3944  
3945  static inline void release_proto_idx(struct proto *prot)
3946  {
3947  }
3948  
3949  #endif
3950  
3951  static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3952  {
3953  	if (!twsk_prot)
3954  		return;
3955  	kfree(twsk_prot->twsk_slab_name);
3956  	twsk_prot->twsk_slab_name = NULL;
3957  	kmem_cache_destroy(twsk_prot->twsk_slab);
3958  	twsk_prot->twsk_slab = NULL;
3959  }
3960  
3961  static int tw_prot_init(const struct proto *prot)
3962  {
3963  	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3964  
3965  	if (!twsk_prot)
3966  		return 0;
3967  
3968  	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3969  					      prot->name);
3970  	if (!twsk_prot->twsk_slab_name)
3971  		return -ENOMEM;
3972  
3973  	twsk_prot->twsk_slab =
3974  		kmem_cache_create(twsk_prot->twsk_slab_name,
3975  				  twsk_prot->twsk_obj_size, 0,
3976  				  SLAB_ACCOUNT | prot->slab_flags,
3977  				  NULL);
3978  	if (!twsk_prot->twsk_slab) {
3979  		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3980  			prot->name);
3981  		return -ENOMEM;
3982  	}
3983  
3984  	return 0;
3985  }
3986  
3987  static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3988  {
3989  	if (!rsk_prot)
3990  		return;
3991  	kfree(rsk_prot->slab_name);
3992  	rsk_prot->slab_name = NULL;
3993  	kmem_cache_destroy(rsk_prot->slab);
3994  	rsk_prot->slab = NULL;
3995  }
3996  
3997  static int req_prot_init(const struct proto *prot)
3998  {
3999  	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4000  
4001  	if (!rsk_prot)
4002  		return 0;
4003  
4004  	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4005  					prot->name);
4006  	if (!rsk_prot->slab_name)
4007  		return -ENOMEM;
4008  
4009  	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4010  					   rsk_prot->obj_size, 0,
4011  					   SLAB_ACCOUNT | prot->slab_flags,
4012  					   NULL);
4013  
4014  	if (!rsk_prot->slab) {
4015  		pr_crit("%s: Can't create request sock SLAB cache!\n",
4016  			prot->name);
4017  		return -ENOMEM;
4018  	}
4019  	return 0;
4020  }
4021  
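/* Register a protocol with the socket core. With @alloc_slab set, this
 * creates the kmem cache used to allocate the protocol's sockets and, if the
 * proto provides them, its request_sock and timewait_sock caches, before the
 * proto is added to proto_list (and thus becomes visible in
 * /proc/net/protocols).
 *
 * Typical module usage sketch (illustrative; "my_proto" is a placeholder):
 *
 *	static int __init my_proto_init(void)
 *	{
 *		return proto_register(&my_proto, 1);
 *	}
 *
 *	static void __exit my_proto_exit(void)
 *	{
 *		proto_unregister(&my_proto);
 *	}
 */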
4022  int proto_register(struct proto *prot, int alloc_slab)
4023  {
4024  	int ret = -ENOBUFS;
4025  
4026  	if (prot->memory_allocated && !prot->sysctl_mem) {
4027  		pr_err("%s: missing sysctl_mem\n", prot->name);
4028  		return -EINVAL;
4029  	}
4030  	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4031  		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4032  		return -EINVAL;
4033  	}
4034  	if (alloc_slab) {
4035  		prot->slab = kmem_cache_create_usercopy(prot->name,
4036  					prot->obj_size, 0,
4037  					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4038  					prot->slab_flags,
4039  					prot->useroffset, prot->usersize,
4040  					NULL);
4041  
4042  		if (prot->slab == NULL) {
4043  			pr_crit("%s: Can't create sock SLAB cache!\n",
4044  				prot->name);
4045  			goto out;
4046  		}
4047  
4048  		if (req_prot_init(prot))
4049  			goto out_free_request_sock_slab;
4050  
4051  		if (tw_prot_init(prot))
4052  			goto out_free_timewait_sock_slab;
4053  	}
4054  
4055  	mutex_lock(&proto_list_mutex);
4056  	ret = assign_proto_idx(prot);
4057  	if (ret) {
4058  		mutex_unlock(&proto_list_mutex);
4059  		goto out_free_timewait_sock_slab;
4060  	}
4061  	list_add(&prot->node, &proto_list);
4062  	mutex_unlock(&proto_list_mutex);
4063  	return ret;
4064  
4065  out_free_timewait_sock_slab:
4066  	if (alloc_slab)
4067  		tw_prot_cleanup(prot->twsk_prot);
4068  out_free_request_sock_slab:
4069  	if (alloc_slab) {
4070  		req_prot_cleanup(prot->rsk_prot);
4071  
4072  		kmem_cache_destroy(prot->slab);
4073  		prot->slab = NULL;
4074  	}
4075  out:
4076  	return ret;
4077  }
4078  EXPORT_SYMBOL(proto_register);
4079  
4080  void proto_unregister(struct proto *prot)
4081  {
4082  	mutex_lock(&proto_list_mutex);
4083  	release_proto_idx(prot);
4084  	list_del(&prot->node);
4085  	mutex_unlock(&proto_list_mutex);
4086  
4087  	kmem_cache_destroy(prot->slab);
4088  	prot->slab = NULL;
4089  
4090  	req_prot_cleanup(prot->rsk_prot);
4091  	tw_prot_cleanup(prot->twsk_prot);
4092  }
4093  EXPORT_SYMBOL(proto_unregister);
4094  
4095  int sock_load_diag_module(int family, int protocol)
4096  {
4097  	if (!protocol) {
4098  		if (!sock_is_registered(family))
4099  			return -ENOENT;
4100  
4101  		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4102  				      NETLINK_SOCK_DIAG, family);
4103  	}
4104  
4105  #ifdef CONFIG_INET
4106  	if (family == AF_INET &&
4107  	    protocol != IPPROTO_RAW &&
4108  	    protocol < MAX_INET_PROTOS &&
4109  	    !rcu_access_pointer(inet_protos[protocol]))
4110  		return -ENOENT;
4111  #endif
4112  
4113  	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4114  			      NETLINK_SOCK_DIAG, family, protocol);
4115  }
4116  EXPORT_SYMBOL(sock_load_diag_module);
4117  
4118  #ifdef CONFIG_PROC_FS
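/* seq_file machinery behind /proc/net/protocols: one header line followed by
 * one line per registered protocol, all printed under proto_list_mutex.
 */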
4119  static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4120  	__acquires(proto_list_mutex)
4121  {
4122  	mutex_lock(&proto_list_mutex);
4123  	return seq_list_start_head(&proto_list, *pos);
4124  }
4125  
4126  static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4127  {
4128  	return seq_list_next(v, &proto_list, pos);
4129  }
4130  
4131  static void proto_seq_stop(struct seq_file *seq, void *v)
4132  	__releases(proto_list_mutex)
4133  {
4134  	mutex_unlock(&proto_list_mutex);
4135  }
4136  
4137  static char proto_method_implemented(const void *method)
4138  {
4139  	return method == NULL ? 'n' : 'y';
4140  }

4141  static long sock_prot_memory_allocated(struct proto *proto)
4142  {
4143  	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4144  }
4145  
4146  static const char *sock_prot_memory_pressure(struct proto *proto)
4147  {
4148  	return proto->memory_pressure != NULL ?
4149  	       (proto_memory_pressure(proto) ? "yes" : "no") : "NI";
4150  }
4151  
4152  static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4153  {
4154  
4155  	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4156  			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4157  		   proto->name,
4158  		   proto->obj_size,
4159  		   sock_prot_inuse_get(seq_file_net(seq), proto),
4160  		   sock_prot_memory_allocated(proto),
4161  		   sock_prot_memory_pressure(proto),
4162  		   proto->max_header,
4163  		   proto->slab == NULL ? "no" : "yes",
4164  		   module_name(proto->owner),
4165  		   proto_method_implemented(proto->close),
4166  		   proto_method_implemented(proto->connect),
4167  		   proto_method_implemented(proto->disconnect),
4168  		   proto_method_implemented(proto->accept),
4169  		   proto_method_implemented(proto->ioctl),
4170  		   proto_method_implemented(proto->init),
4171  		   proto_method_implemented(proto->destroy),
4172  		   proto_method_implemented(proto->shutdown),
4173  		   proto_method_implemented(proto->setsockopt),
4174  		   proto_method_implemented(proto->getsockopt),
4175  		   proto_method_implemented(proto->sendmsg),
4176  		   proto_method_implemented(proto->recvmsg),
4177  		   proto_method_implemented(proto->bind),
4178  		   proto_method_implemented(proto->backlog_rcv),
4179  		   proto_method_implemented(proto->hash),
4180  		   proto_method_implemented(proto->unhash),
4181  		   proto_method_implemented(proto->get_port),
4182  		   proto_method_implemented(proto->enter_memory_pressure));
4183  }
4184  
4185  static int proto_seq_show(struct seq_file *seq, void *v)
4186  {
4187  	if (v == &proto_list)
4188  		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4189  			   "protocol",
4190  			   "size",
4191  			   "sockets",
4192  			   "memory",
4193  			   "press",
4194  			   "maxhdr",
4195  			   "slab",
4196  			   "module",
4197  			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4198  	else
4199  		proto_seq_printf(seq, list_entry(v, struct proto, node));
4200  	return 0;
4201  }
4202  
4203  static const struct seq_operations proto_seq_ops = {
4204  	.start  = proto_seq_start,
4205  	.next   = proto_seq_next,
4206  	.stop   = proto_seq_stop,
4207  	.show   = proto_seq_show,
4208  };
4209  
4210  static __net_init int proto_init_net(struct net *net)
4211  {
4212  	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4213  			sizeof(struct seq_net_private)))
4214  		return -ENOMEM;
4215  
4216  	return 0;
4217  }
4218  
4219  static __net_exit void proto_exit_net(struct net *net)
4220  {
4221  	remove_proc_entry("protocols", net->proc_net);
4222  }
4223  
4225  static __net_initdata struct pernet_operations proto_net_ops = {
4226  	.init = proto_init_net,
4227  	.exit = proto_exit_net,
4228  };
4229  
4230  static int __init proto_init(void)
4231  {
4232  	return register_pernet_subsys(&proto_net_ops);
4233  }
4234  
4235  subsys_initcall(proto_init);
4236  
4237  #endif /* PROC_FS */
4238  
4239  #ifdef CONFIG_NET_RX_BUSY_POLL
4240  bool sk_busy_loop_end(void *p, unsigned long start_time)
4241  {
4242  	struct sock *sk = p;
4243  
4244  	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4245  		return true;
4246  
4247  	if (sk_is_udp(sk) &&
4248  	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4249  		return true;
4250  
4251  	return sk_busy_loop_timeout(sk, start_time);
4252  }
4253  EXPORT_SYMBOL(sk_busy_loop_end);
4254  #endif /* CONFIG_NET_RX_BUSY_POLL */
4255  
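/* Bind an additional address to an already bound socket, for protocols (such
 * as SCTP) that implement ->bind_add; everyone else gets -EOPNOTSUPP.
 */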
4256  int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4257  {
4258  	if (!sk->sk_prot->bind_add)
4259  		return -EOPNOTSUPP;
4260  	return sk->sk_prot->bind_add(sk, addr, addr_len);
4261  }
4262  EXPORT_SYMBOL(sock_bind_add);
4263  
4264  /* Copy 'size' bytes from userspace, run the ioctl, and copy 'size' bytes of the result back to userspace. */
4265  int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4266  		     void __user *arg, void *karg, size_t size)
4267  {
4268  	int ret;
4269  
4270  	if (copy_from_user(karg, arg, size))
4271  		return -EFAULT;
4272  
4273  	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4274  	if (ret)
4275  		return ret;
4276  
4277  	if (copy_to_user(arg, karg, size))
4278  		return -EFAULT;
4279  
4280  	return 0;
4281  }
4282  EXPORT_SYMBOL(sock_ioctl_inout);
4283  
4284  /* This is the most common ioctl prep function: the result (4 bytes) is
4285   * copied back to userspace if the ioctl() returns successfully, and no
4286   * input is copied from userspace.
4287   */
4288  static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4289  {
4290  	int ret, karg = 0;
4291  
4292  	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4293  	if (ret)
4294  		return ret;
4295  
4296  	return put_user(karg, (int __user *)arg);
4297  }
4298  
4299  /* A wrapper around sock ioctls, which copies the data from userspace
4300   * (depending on the protocol/ioctl) and copies the result back to userspace.
4301   * The main motivation for this function is to pass kernel memory to the
4302   * protocol ioctl callbacks, instead of userspace memory.
4303   */
4304  int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4305  {
4306  	int rc = 1;
4307  
4308  	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4309  		rc = ipmr_sk_ioctl(sk, cmd, arg);
4310  	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4311  		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4312  	else if (sk_is_phonet(sk))
4313  		rc = phonet_sk_ioctl(sk, cmd, arg);
4314  
4315  	/* If the ioctl was processed, return its value */
4316  	if (rc <= 0)
4317  		return rc;
4318  
4319  	/* Otherwise call the default handler */
4320  	return sock_ioctl_out(sk, cmd, arg);
4321  }
4322  EXPORT_SYMBOL(sk_ioctl);
4323  
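/* Sanity checks that the listed struct sock fields really live in the
 * cacheline groups they are annotated with in include/net/sock.h; if these
 * assertions fire, the field layout and the group annotations have drifted
 * apart.
 */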
4324  static int __init sock_struct_check(void)
4325  {
4326  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4327  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4328  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4329  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4330  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4331  
4332  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4333  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4334  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4335  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4336  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4337  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4338  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4339  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4340  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4341  
4342  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4343  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4344  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4345  
4346  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4347  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4348  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4349  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4350  
4351  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4353  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4354  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4355  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4356  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4357  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4358  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4359  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4360  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4361  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4362  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4363  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4364  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4365  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4366  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4367  
4368  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4369  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4370  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4371  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4372  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4373  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4374  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4375  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4376  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4377  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4378  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4379  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4380  	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4381  	return 0;
4382  }
4383  
4384  core_initcall(sock_struct_check);
4385