
/net/core/sock.c

http://github.com/mirrors/linux
C | 3627 lines | 2606 code | 537 blank | 484 comment | 441 complexity | MD5 6f71689e654581312f9fd9a8aa4beffa


   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		Generic socket support routines. Memory allocators, socket lock/release
   8 *		handler for protocols to use and generic option handler.
   9 *
  10 * Authors:	Ross Biro
  11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *		Florian La Roche, <flla@stud.uni-sb.de>
  13 *		Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *		Alan Cox	: 	Numerous verify_area() problems
  17 *		Alan Cox	:	Connecting on a connecting socket
  18 *					now returns an error for tcp.
  19 *		Alan Cox	:	sock->protocol is set correctly.
  20 *					and is not sometimes left as 0.
  21 *		Alan Cox	:	connect handles icmp errors on a
  22 *					connect properly. Unfortunately there
  23 *					is a restart syscall nasty there. I
  24 *					can't match BSD without hacking the C
  25 *					library. Ideas urgently sought!
  26 *		Alan Cox	:	Disallow bind() to addresses that are
  27 *					not ours - especially broadcast ones!!
  28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
  29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
  30 *					instead they leave that for the DESTROY timer.
  31 *		Alan Cox	:	Clean up error flag in accept
  32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
  33 *					was buggy. Put a remove_sock() in the handler
  34 *					for memory when we hit 0. Also altered the timer
  35 *					code. The ACK stuff can wait and needs major
  36 *					TCP layer surgery.
  37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
  38 *					and fixed timer/inet_bh race.
  39 *		Alan Cox	:	Added zapped flag for TCP
  40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
  41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
  46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
  47 *	Pauline Middelink	:	identd support
  48 *		Alan Cox	:	Fixed connect() taking signals I think.
  49 *		Alan Cox	:	SO_LINGER supported
  50 *		Alan Cox	:	Error reporting fixes
  51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
  52 *		Alan Cox	:	inet sockets don't set sk->type!
  53 *		Alan Cox	:	Split socket option code
  54 *		Alan Cox	:	Callbacks
  55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
  56 *		Alex		:	Removed restriction on inet fioctl
  57 *		Alan Cox	:	Splitting INET from NET core
  58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
  59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *		Alan Cox	:	Split IP from generic code
  61 *		Alan Cox	:	New kfree_skbmem()
  62 *		Alan Cox	:	Make SO_DEBUG superuser only.
  63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
  64 *					(compatibility fix)
  65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
  66 *		Alan Cox	:	Allocator for a socket is settable.
  67 *		Alan Cox	:	SO_ERROR includes soft errors.
  68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
  69 *		Alan Cox	: 	Generic socket allocation to make hooks
  70 *					easier (suggested by Craig Metz).
  71 *		Michael Pall	:	SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
  79 *		Andi Kleen	:	Fix write_space callback
  80 *		Chris Evans	:	Security fixes - signedness again
  81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 */
  85
  86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88#include <asm/unaligned.h>
  89#include <linux/capability.h>
  90#include <linux/errno.h>
  91#include <linux/errqueue.h>
  92#include <linux/types.h>
  93#include <linux/socket.h>
  94#include <linux/in.h>
  95#include <linux/kernel.h>
  96#include <linux/module.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/sched.h>
 100#include <linux/sched/mm.h>
 101#include <linux/timer.h>
 102#include <linux/string.h>
 103#include <linux/sockios.h>
 104#include <linux/net.h>
 105#include <linux/mm.h>
 106#include <linux/slab.h>
 107#include <linux/interrupt.h>
 108#include <linux/poll.h>
 109#include <linux/tcp.h>
 110#include <linux/init.h>
 111#include <linux/highmem.h>
 112#include <linux/user_namespace.h>
 113#include <linux/static_key.h>
 114#include <linux/memcontrol.h>
 115#include <linux/prefetch.h>
 116
 117#include <linux/uaccess.h>
 118
 119#include <linux/netdevice.h>
 120#include <net/protocol.h>
 121#include <linux/skbuff.h>
 122#include <net/net_namespace.h>
 123#include <net/request_sock.h>
 124#include <net/sock.h>
 125#include <linux/net_tstamp.h>
 126#include <net/xfrm.h>
 127#include <linux/ipsec.h>
 128#include <net/cls_cgroup.h>
 129#include <net/netprio_cgroup.h>
 130#include <linux/sock_diag.h>
 131
 132#include <linux/filter.h>
 133#include <net/sock_reuseport.h>
 134#include <net/bpf_sk_storage.h>
 135
 136#include <trace/events/sock.h>
 137
 138#include <net/tcp.h>
 139#include <net/busy_poll.h>
 140
 141static DEFINE_MUTEX(proto_list_mutex);
 142static LIST_HEAD(proto_list);
 143
 144static void sock_inuse_add(struct net *net, int val);
 145
 146/**
 147 * sk_ns_capable - General socket capability test
 148 * @sk: Socket to use a capability on or through
 149 * @user_ns: The user namespace of the capability to use
 150 * @cap: The capability to use
 151 *
 152 * Test to see if the opener of the socket had the capability @cap when
 153 * the socket was created and if the current process has @cap in the user
 154 * namespace @user_ns.
 155 */
 156bool sk_ns_capable(const struct sock *sk,
 157		   struct user_namespace *user_ns, int cap)
 158{
 159	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 160		ns_capable(user_ns, cap);
 161}
 162EXPORT_SYMBOL(sk_ns_capable);
 163
 164/**
 165 * sk_capable - Socket global capability test
 166 * @sk: Socket to use a capability on or through
 167 * @cap: The global capability to use
 168 *
 169 * Test to see if the opener of the socket had the capability @cap when
 170 * the socket was created and if the current process has @cap in all user
 171 * namespaces.
 172 */
 173bool sk_capable(const struct sock *sk, int cap)
 174{
 175	return sk_ns_capable(sk, &init_user_ns, cap);
 176}
 177EXPORT_SYMBOL(sk_capable);
 178
 179/**
 180 * sk_net_capable - Network namespace socket capability test
 181 * @sk: Socket to use a capability on or through
 182 * @cap: The capability to use
 183 *
 184 * Test to see if the opener of the socket had the capability @cap when the
 185 * socket was created and if the current process has @cap over the network
 186 * namespace the socket is a member of.
 187 */
 188bool sk_net_capable(const struct sock *sk, int cap)
 189{
 190	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 191}
 192EXPORT_SYMBOL(sk_net_capable);
 193
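
[Editor's note: the three helpers above combine a check against the credentials captured when the socket was opened (file_ns_capable()) with a check on the current task (ns_capable()). A minimal, hypothetical sketch of how a protocol option handler might use the namespace-aware variant; the function name and the option it guards are invented for illustration and are not part of this file.]

/* Editor's sketch -- hypothetical handler, not part of sock.c. */
static int example_set_privileged_opt(struct sock *sk, int val)
{
	/* require CAP_NET_ADMIN relative to the socket's netns owner */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	sk->sk_priority = val;	/* stand-in for a privileged field */
	return 0;
}
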
 194/*
 195 * Each address family might have different locking rules, so we have
 196 * one slock key per address family and separate keys for internal and
 197 * userspace sockets.
 198 */
 199static struct lock_class_key af_family_keys[AF_MAX];
 200static struct lock_class_key af_family_kern_keys[AF_MAX];
 201static struct lock_class_key af_family_slock_keys[AF_MAX];
 202static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 203
 204/*
 205 * Make lock validator output more readable. (we pre-construct these
 206 * strings build-time, so that runtime initialization of socket
 207 * locks is fast):
 208 */
 209
 210#define _sock_locks(x)						  \
 211  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
 212  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
 213  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
 214  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
 215  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
 216  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
 217  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
 218  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
 219  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
 220  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
 221  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
 222  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
 223  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
 224  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
 225  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
 226  x "AF_MAX"
 227
 228static const char *const af_family_key_strings[AF_MAX+1] = {
 229	_sock_locks("sk_lock-")
 230};
 231static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 232	_sock_locks("slock-")
 233};
 234static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 235	_sock_locks("clock-")
 236};
 237
 238static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 239	_sock_locks("k-sk_lock-")
 240};
 241static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 242	_sock_locks("k-slock-")
 243};
 244static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 245	_sock_locks("k-clock-")
 246};
 247static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 248	_sock_locks("rlock-")
 249};
 250static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 251	_sock_locks("wlock-")
 252};
 253static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 254	_sock_locks("elock-")
 255};
 256
 257/*
 258 * sk_callback_lock and sk queues locking rules are per-address-family,
 259 * so split the lock classes by using a per-AF key:
 260 */
 261static struct lock_class_key af_callback_keys[AF_MAX];
 262static struct lock_class_key af_rlock_keys[AF_MAX];
 263static struct lock_class_key af_wlock_keys[AF_MAX];
 264static struct lock_class_key af_elock_keys[AF_MAX];
 265static struct lock_class_key af_kern_callback_keys[AF_MAX];
 266
 267/* Run time adjustable parameters. */
 268__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 269EXPORT_SYMBOL(sysctl_wmem_max);
 270__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 271EXPORT_SYMBOL(sysctl_rmem_max);
 272__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 273__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 274
 275/* Maximal space eaten by iovec or ancillary data plus some space */
 276int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 277EXPORT_SYMBOL(sysctl_optmem_max);
 278
 279int sysctl_tstamp_allow_data __read_mostly = 1;
 280
 281DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 282EXPORT_SYMBOL_GPL(memalloc_socks_key);
 283
 284/**
 285 * sk_set_memalloc - sets %SOCK_MEMALLOC
 286 * @sk: socket to set it on
 287 *
 288 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 289 * It's the responsibility of the admin to adjust min_free_kbytes
 290 * to meet the requirements
 291 */
 292void sk_set_memalloc(struct sock *sk)
 293{
 294	sock_set_flag(sk, SOCK_MEMALLOC);
 295	sk->sk_allocation |= __GFP_MEMALLOC;
 296	static_branch_inc(&memalloc_socks_key);
 297}
 298EXPORT_SYMBOL_GPL(sk_set_memalloc);
 299
 300void sk_clear_memalloc(struct sock *sk)
 301{
 302	sock_reset_flag(sk, SOCK_MEMALLOC);
 303	sk->sk_allocation &= ~__GFP_MEMALLOC;
 304	static_branch_dec(&memalloc_socks_key);
 305
 306	/*
 307	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 308	 * progress of swapping. SOCK_MEMALLOC may be cleared while
 309	 * it has rmem allocations due to the last swapfile being deactivated
 310	 * but there is a risk that the socket is unusable due to exceeding
 311	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
 312	 */
 313	sk_mem_reclaim(sk);
 314}
 315EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 316
 317int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 318{
 319	int ret;
 320	unsigned int noreclaim_flag;
 321
 322	/* these should have been dropped before queueing */
 323	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 324
 325	noreclaim_flag = memalloc_noreclaim_save();
 326	ret = sk->sk_backlog_rcv(sk, skb);
 327	memalloc_noreclaim_restore(noreclaim_flag);
 328
 329	return ret;
 330}
 331EXPORT_SYMBOL(__sk_backlog_rcv);
 332
 333static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 334{
 335	struct __kernel_sock_timeval tv;
 336
 337	if (timeo == MAX_SCHEDULE_TIMEOUT) {
 338		tv.tv_sec = 0;
 339		tv.tv_usec = 0;
 340	} else {
 341		tv.tv_sec = timeo / HZ;
 342		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 343	}
 344
 345	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 346		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 347		*(struct old_timeval32 *)optval = tv32;
 348		return sizeof(tv32);
 349	}
 350
 351	if (old_timeval) {
 352		struct __kernel_old_timeval old_tv;
 353		old_tv.tv_sec = tv.tv_sec;
 354		old_tv.tv_usec = tv.tv_usec;
 355		*(struct __kernel_old_timeval *)optval = old_tv;
 356		return sizeof(old_tv);
 357	}
 358
 359	*(struct __kernel_sock_timeval *)optval = tv;
 360	return sizeof(tv);
 361}
 362
 363static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
 364{
 365	struct __kernel_sock_timeval tv;
 366
 367	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 368		struct old_timeval32 tv32;
 369
 370		if (optlen < sizeof(tv32))
 371			return -EINVAL;
 372
 373		if (copy_from_user(&tv32, optval, sizeof(tv32)))
 374			return -EFAULT;
 375		tv.tv_sec = tv32.tv_sec;
 376		tv.tv_usec = tv32.tv_usec;
 377	} else if (old_timeval) {
 378		struct __kernel_old_timeval old_tv;
 379
 380		if (optlen < sizeof(old_tv))
 381			return -EINVAL;
 382		if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
 383			return -EFAULT;
 384		tv.tv_sec = old_tv.tv_sec;
 385		tv.tv_usec = old_tv.tv_usec;
 386	} else {
 387		if (optlen < sizeof(tv))
 388			return -EINVAL;
 389		if (copy_from_user(&tv, optval, sizeof(tv)))
 390			return -EFAULT;
 391	}
 392	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 393		return -EDOM;
 394
 395	if (tv.tv_sec < 0) {
 396		static int warned __read_mostly;
 397
 398		*timeo_p = 0;
 399		if (warned < 10 && net_ratelimit()) {
 400			warned++;
 401			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 402				__func__, current->comm, task_pid_nr(current));
 403		}
 404		return 0;
 405	}
 406	*timeo_p = MAX_SCHEDULE_TIMEOUT;
 407	if (tv.tv_sec == 0 && tv.tv_usec == 0)
 408		return 0;
 409	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 410		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 411	return 0;
 412}
 413
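
[Editor's note: sock_set_timeout() converts the user-supplied timeval (in any of its three layouts) into jiffies for sk_rcvtimeo/sk_sndtimeo, and sock_get_timeout() converts back. From userspace the round trip is just the standard option calls; a small sketch using only the standard Linux socket API, nothing specific to this file.]

#include <stdio.h>
#include <sys/socket.h>
#include <sys/time.h>

/* Editor's sketch: set a 2.5 s receive timeout and read it back.
 * The kernel stores the value in jiffies, so what comes back may be
 * rounded to the scheduler tick (HZ) granularity.
 */
static int set_rcv_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
	socklen_t len = sizeof(tv);

	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
		return -1;
	if (getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len) < 0)
		return -1;
	printf("timeout: %ld.%06ld s\n", (long)tv.tv_sec, (long)tv.tv_usec);
	return 0;
}
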
 414static void sock_warn_obsolete_bsdism(const char *name)
 415{
 416	static int warned;
 417	static char warncomm[TASK_COMM_LEN];
 418	if (strcmp(warncomm, current->comm) && warned < 5) {
 419		strcpy(warncomm,  current->comm);
 420		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 421			warncomm, name);
 422		warned++;
 423	}
 424}
 425
 426static bool sock_needs_netstamp(const struct sock *sk)
 427{
 428	switch (sk->sk_family) {
 429	case AF_UNSPEC:
 430	case AF_UNIX:
 431		return false;
 432	default:
 433		return true;
 434	}
 435}
 436
 437static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 438{
 439	if (sk->sk_flags & flags) {
 440		sk->sk_flags &= ~flags;
 441		if (sock_needs_netstamp(sk) &&
 442		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 443			net_disable_timestamp();
 444	}
 445}
 446
 447
 448int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 449{
 450	unsigned long flags;
 451	struct sk_buff_head *list = &sk->sk_receive_queue;
 452
 453	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 454		atomic_inc(&sk->sk_drops);
 455		trace_sock_rcvqueue_full(sk, skb);
 456		return -ENOMEM;
 457	}
 458
 459	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 460		atomic_inc(&sk->sk_drops);
 461		return -ENOBUFS;
 462	}
 463
 464	skb->dev = NULL;
 465	skb_set_owner_r(skb, sk);
 466
 467	/* We escape from the RCU protected region, make sure we don't leak
 468	 * a non-refcounted dst
 469	 */
 470	skb_dst_force(skb);
 471
 472	spin_lock_irqsave(&list->lock, flags);
 473	sock_skb_set_dropcount(sk, skb);
 474	__skb_queue_tail(list, skb);
 475	spin_unlock_irqrestore(&list->lock, flags);
 476
 477	if (!sock_flag(sk, SOCK_DEAD))
 478		sk->sk_data_ready(sk);
 479	return 0;
 480}
 481EXPORT_SYMBOL(__sock_queue_rcv_skb);
 482
 483int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 484{
 485	int err;
 486
 487	err = sk_filter(sk, skb);
 488	if (err)
 489		return err;
 490
 491	return __sock_queue_rcv_skb(sk, skb);
 492}
 493EXPORT_SYMBOL(sock_queue_rcv_skb);
 494
 495int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 496		     const int nested, unsigned int trim_cap, bool refcounted)
 497{
 498	int rc = NET_RX_SUCCESS;
 499
 500	if (sk_filter_trim_cap(sk, skb, trim_cap))
 501		goto discard_and_relse;
 502
 503	skb->dev = NULL;
 504
 505	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 506		atomic_inc(&sk->sk_drops);
 507		goto discard_and_relse;
 508	}
 509	if (nested)
 510		bh_lock_sock_nested(sk);
 511	else
 512		bh_lock_sock(sk);
 513	if (!sock_owned_by_user(sk)) {
 514		/*
 515		 * trylock + unlock semantics:
 516		 */
 517		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 518
 519		rc = sk_backlog_rcv(sk, skb);
 520
 521		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 522	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 523		bh_unlock_sock(sk);
 524		atomic_inc(&sk->sk_drops);
 525		goto discard_and_relse;
 526	}
 527
 528	bh_unlock_sock(sk);
 529out:
 530	if (refcounted)
 531		sock_put(sk);
 532	return rc;
 533discard_and_relse:
 534	kfree_skb(skb);
 535	goto out;
 536}
 537EXPORT_SYMBOL(__sk_receive_skb);
 538
 539struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 540{
 541	struct dst_entry *dst = __sk_dst_get(sk);
 542
 543	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 544		sk_tx_queue_clear(sk);
 545		sk->sk_dst_pending_confirm = 0;
 546		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 547		dst_release(dst);
 548		return NULL;
 549	}
 550
 551	return dst;
 552}
 553EXPORT_SYMBOL(__sk_dst_check);
 554
 555struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 556{
 557	struct dst_entry *dst = sk_dst_get(sk);
 558
 559	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 560		sk_dst_reset(sk);
 561		dst_release(dst);
 562		return NULL;
 563	}
 564
 565	return dst;
 566}
 567EXPORT_SYMBOL(sk_dst_check);
 568
 569static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
 570{
 571	int ret = -ENOPROTOOPT;
 572#ifdef CONFIG_NETDEVICES
 573	struct net *net = sock_net(sk);
 574
 575	/* Sorry... */
 576	ret = -EPERM;
 577	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 578		goto out;
 579
 580	ret = -EINVAL;
 581	if (ifindex < 0)
 582		goto out;
 583
 584	sk->sk_bound_dev_if = ifindex;
 585	if (sk->sk_prot->rehash)
 586		sk->sk_prot->rehash(sk);
 587	sk_dst_reset(sk);
 588
 589	ret = 0;
 590
 591out:
 592#endif
 593
 594	return ret;
 595}
 596
 597static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 598				int optlen)
 599{
 600	int ret = -ENOPROTOOPT;
 601#ifdef CONFIG_NETDEVICES
 602	struct net *net = sock_net(sk);
 603	char devname[IFNAMSIZ];
 604	int index;
 605
 606	ret = -EINVAL;
 607	if (optlen < 0)
 608		goto out;
 609
 610	/* Bind this socket to a particular device like "eth0",
 611	 * as specified in the passed interface name. If the
 612	 * name is "" or the option length is zero the socket
 613	 * is not bound.
 614	 */
 615	if (optlen > IFNAMSIZ - 1)
 616		optlen = IFNAMSIZ - 1;
 617	memset(devname, 0, sizeof(devname));
 618
 619	ret = -EFAULT;
 620	if (copy_from_user(devname, optval, optlen))
 621		goto out;
 622
 623	index = 0;
 624	if (devname[0] != '\0') {
 625		struct net_device *dev;
 626
 627		rcu_read_lock();
 628		dev = dev_get_by_name_rcu(net, devname);
 629		if (dev)
 630			index = dev->ifindex;
 631		rcu_read_unlock();
 632		ret = -ENODEV;
 633		if (!dev)
 634			goto out;
 635	}
 636
 637	lock_sock(sk);
 638	ret = sock_setbindtodevice_locked(sk, index);
 639	release_sock(sk);
 640
 641out:
 642#endif
 643
 644	return ret;
 645}
 646
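
[Editor's note: sock_setbindtodevice() resolves the interface name to an ifindex under rcu_read_lock() and then applies it under lock_sock(); changing an existing binding requires CAP_NET_RAW. A userspace sketch of this Linux-specific option; an empty name clears the binding, as the comment above notes.]

#include <string.h>
#include <sys/socket.h>

/* Editor's sketch: bind the socket's traffic to "eth0". */
static int bind_to_eth0(int fd)
{
	static const char ifname[] = "eth0";

	return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
			  ifname, strlen(ifname));
}
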
 647static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 648				int __user *optlen, int len)
 649{
 650	int ret = -ENOPROTOOPT;
 651#ifdef CONFIG_NETDEVICES
 652	struct net *net = sock_net(sk);
 653	char devname[IFNAMSIZ];
 654
 655	if (sk->sk_bound_dev_if == 0) {
 656		len = 0;
 657		goto zero;
 658	}
 659
 660	ret = -EINVAL;
 661	if (len < IFNAMSIZ)
 662		goto out;
 663
 664	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 665	if (ret)
 666		goto out;
 667
 668	len = strlen(devname) + 1;
 669
 670	ret = -EFAULT;
 671	if (copy_to_user(optval, devname, len))
 672		goto out;
 673
 674zero:
 675	ret = -EFAULT;
 676	if (put_user(len, optlen))
 677		goto out;
 678
 679	ret = 0;
 680
 681out:
 682#endif
 683
 684	return ret;
 685}
 686
 687static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
 688				     int valbool)
 689{
 690	if (valbool)
 691		sock_set_flag(sk, bit);
 692	else
 693		sock_reset_flag(sk, bit);
 694}
 695
 696bool sk_mc_loop(struct sock *sk)
 697{
 698	if (dev_recursion_level())
 699		return false;
 700	if (!sk)
 701		return true;
 702	switch (sk->sk_family) {
 703	case AF_INET:
 704		return inet_sk(sk)->mc_loop;
 705#if IS_ENABLED(CONFIG_IPV6)
 706	case AF_INET6:
 707		return inet6_sk(sk)->mc_loop;
 708#endif
 709	}
 710	WARN_ON(1);
 711	return true;
 712}
 713EXPORT_SYMBOL(sk_mc_loop);
 714
 715/*
 716 *	This is meant for all protocols to use and covers goings on
 717 *	at the socket level. Everything here is generic.
 718 */
 719
 720int sock_setsockopt(struct socket *sock, int level, int optname,
 721		    char __user *optval, unsigned int optlen)
 722{
 723	struct sock_txtime sk_txtime;
 724	struct sock *sk = sock->sk;
 725	int val;
 726	int valbool;
 727	struct linger ling;
 728	int ret = 0;
 729
 730	/*
 731	 *	Options without arguments
 732	 */
 733
 734	if (optname == SO_BINDTODEVICE)
 735		return sock_setbindtodevice(sk, optval, optlen);
 736
 737	if (optlen < sizeof(int))
 738		return -EINVAL;
 739
 740	if (get_user(val, (int __user *)optval))
 741		return -EFAULT;
 742
 743	valbool = val ? 1 : 0;
 744
 745	lock_sock(sk);
 746
 747	switch (optname) {
 748	case SO_DEBUG:
 749		if (val && !capable(CAP_NET_ADMIN))
 750			ret = -EACCES;
 751		else
 752			sock_valbool_flag(sk, SOCK_DBG, valbool);
 753		break;
 754	case SO_REUSEADDR:
 755		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 756		break;
 757	case SO_REUSEPORT:
 758		sk->sk_reuseport = valbool;
 759		break;
 760	case SO_TYPE:
 761	case SO_PROTOCOL:
 762	case SO_DOMAIN:
 763	case SO_ERROR:
 764		ret = -ENOPROTOOPT;
 765		break;
 766	case SO_DONTROUTE:
 767		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 768		sk_dst_reset(sk);
 769		break;
 770	case SO_BROADCAST:
 771		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 772		break;
 773	case SO_SNDBUF:
 774		/* Don't error on this; BSD doesn't, and if you think
 775		 * about it, this is right. Otherwise apps have to
 776		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 777		 * are treated in BSD as hints.
 778		 */
 779		val = min_t(u32, val, sysctl_wmem_max);
 780set_sndbuf:
 781		/* Ensure val * 2 fits into an int, to prevent max_t()
 782		 * from treating it as a negative value.
 783		 */
 784		val = min_t(int, val, INT_MAX / 2);
 785		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 786		WRITE_ONCE(sk->sk_sndbuf,
 787			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
 788		/* Wake up sending tasks if we upped the value. */
 789		sk->sk_write_space(sk);
 790		break;
 791
 792	case SO_SNDBUFFORCE:
 793		if (!capable(CAP_NET_ADMIN)) {
 794			ret = -EPERM;
 795			break;
 796		}
 797
 798		/* No negative values (to prevent underflow, as val will be
 799		 * multiplied by 2).
 800		 */
 801		if (val < 0)
 802			val = 0;
 803		goto set_sndbuf;
 804
 805	case SO_RCVBUF:
 806		/* Don't error on this; BSD doesn't, and if you think
 807		 * about it, this is right. Otherwise apps have to
 808		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 809		 * are treated in BSD as hints.
 810		 */
 811		val = min_t(u32, val, sysctl_rmem_max);
 812set_rcvbuf:
 813		/* Ensure val * 2 fits into an int, to prevent max_t()
 814		 * from treating it as a negative value.
 815		 */
 816		val = min_t(int, val, INT_MAX / 2);
 817		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 818		/*
 819		 * We double it on the way in to account for
 820		 * "struct sk_buff" etc. overhead.   Applications
 821		 * assume that the SO_RCVBUF setting they make will
 822		 * allow that much actual data to be received on that
 823		 * socket.
 824		 *
 825		 * Applications are unaware that "struct sk_buff" and
 826		 * other overheads allocate from the receive buffer
 827		 * during socket buffer allocation.
 828		 *
 829		 * And after considering the possible alternatives,
 830		 * returning the value we actually used in getsockopt
 831		 * is the most desirable behavior.
 832		 */
 833		WRITE_ONCE(sk->sk_rcvbuf,
 834			   max_t(int, val * 2, SOCK_MIN_RCVBUF));
 835		break;
 836
 837	case SO_RCVBUFFORCE:
 838		if (!capable(CAP_NET_ADMIN)) {
 839			ret = -EPERM;
 840			break;
 841		}
 842
 843		/* No negative values (to prevent underflow, as val will be
 844		 * multiplied by 2).
 845		 */
 846		if (val < 0)
 847			val = 0;
 848		goto set_rcvbuf;
 849
 850	case SO_KEEPALIVE:
 851		if (sk->sk_prot->keepalive)
 852			sk->sk_prot->keepalive(sk, valbool);
 853		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 854		break;
 855
 856	case SO_OOBINLINE:
 857		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 858		break;
 859
 860	case SO_NO_CHECK:
 861		sk->sk_no_check_tx = valbool;
 862		break;
 863
 864	case SO_PRIORITY:
 865		if ((val >= 0 && val <= 6) ||
 866		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 867			sk->sk_priority = val;
 868		else
 869			ret = -EPERM;
 870		break;
 871
 872	case SO_LINGER:
 873		if (optlen < sizeof(ling)) {
 874			ret = -EINVAL;	/* 1003.1g */
 875			break;
 876		}
 877		if (copy_from_user(&ling, optval, sizeof(ling))) {
 878			ret = -EFAULT;
 879			break;
 880		}
 881		if (!ling.l_onoff)
 882			sock_reset_flag(sk, SOCK_LINGER);
 883		else {
 884#if (BITS_PER_LONG == 32)
 885			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 886				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 887			else
 888#endif
 889				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 890			sock_set_flag(sk, SOCK_LINGER);
 891		}
 892		break;
 893
 894	case SO_BSDCOMPAT:
 895		sock_warn_obsolete_bsdism("setsockopt");
 896		break;
 897
 898	case SO_PASSCRED:
 899		if (valbool)
 900			set_bit(SOCK_PASSCRED, &sock->flags);
 901		else
 902			clear_bit(SOCK_PASSCRED, &sock->flags);
 903		break;
 904
 905	case SO_TIMESTAMP_OLD:
 906	case SO_TIMESTAMP_NEW:
 907	case SO_TIMESTAMPNS_OLD:
 908	case SO_TIMESTAMPNS_NEW:
 909		if (valbool)  {
 910			if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
 911				sock_set_flag(sk, SOCK_TSTAMP_NEW);
 912			else
 913				sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 914
 915			if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
 916				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 917			else
 918				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 919			sock_set_flag(sk, SOCK_RCVTSTAMP);
 920			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 921		} else {
 922			sock_reset_flag(sk, SOCK_RCVTSTAMP);
 923			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 924			sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 925		}
 926		break;
 927
 928	case SO_TIMESTAMPING_NEW:
 929		sock_set_flag(sk, SOCK_TSTAMP_NEW);
 930		/* fall through */
 931	case SO_TIMESTAMPING_OLD:
 932		if (val & ~SOF_TIMESTAMPING_MASK) {
 933			ret = -EINVAL;
 934			break;
 935		}
 936
 937		if (val & SOF_TIMESTAMPING_OPT_ID &&
 938		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 939			if (sk->sk_protocol == IPPROTO_TCP &&
 940			    sk->sk_type == SOCK_STREAM) {
 941				if ((1 << sk->sk_state) &
 942				    (TCPF_CLOSE | TCPF_LISTEN)) {
 943					ret = -EINVAL;
 944					break;
 945				}
 946				sk->sk_tskey = tcp_sk(sk)->snd_una;
 947			} else {
 948				sk->sk_tskey = 0;
 949			}
 950		}
 951
 952		if (val & SOF_TIMESTAMPING_OPT_STATS &&
 953		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 954			ret = -EINVAL;
 955			break;
 956		}
 957
 958		sk->sk_tsflags = val;
 959		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 960			sock_enable_timestamp(sk,
 961					      SOCK_TIMESTAMPING_RX_SOFTWARE);
 962		else {
 963			if (optname == SO_TIMESTAMPING_NEW)
 964				sock_reset_flag(sk, SOCK_TSTAMP_NEW);
 965
 966			sock_disable_timestamp(sk,
 967					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 968		}
 969		break;
 970
 971	case SO_RCVLOWAT:
 972		if (val < 0)
 973			val = INT_MAX;
 974		if (sock->ops->set_rcvlowat)
 975			ret = sock->ops->set_rcvlowat(sk, val);
 976		else
 977			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
 978		break;
 979
 980	case SO_RCVTIMEO_OLD:
 981	case SO_RCVTIMEO_NEW:
 982		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
 983		break;
 984
 985	case SO_SNDTIMEO_OLD:
 986	case SO_SNDTIMEO_NEW:
 987		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
 988		break;
 989
 990	case SO_ATTACH_FILTER:
 991		ret = -EINVAL;
 992		if (optlen == sizeof(struct sock_fprog)) {
 993			struct sock_fprog fprog;
 994
 995			ret = -EFAULT;
 996			if (copy_from_user(&fprog, optval, sizeof(fprog)))
 997				break;
 998
 999			ret = sk_attach_filter(&fprog, sk);
1000		}
1001		break;
1002
1003	case SO_ATTACH_BPF:
1004		ret = -EINVAL;
1005		if (optlen == sizeof(u32)) {
1006			u32 ufd;
1007
1008			ret = -EFAULT;
1009			if (copy_from_user(&ufd, optval, sizeof(ufd)))
1010				break;
1011
1012			ret = sk_attach_bpf(ufd, sk);
1013		}
1014		break;
1015
1016	case SO_ATTACH_REUSEPORT_CBPF:
1017		ret = -EINVAL;
1018		if (optlen == sizeof(struct sock_fprog)) {
1019			struct sock_fprog fprog;
1020
1021			ret = -EFAULT;
1022			if (copy_from_user(&fprog, optval, sizeof(fprog)))
1023				break;
1024
1025			ret = sk_reuseport_attach_filter(&fprog, sk);
1026		}
1027		break;
1028
1029	case SO_ATTACH_REUSEPORT_EBPF:
1030		ret = -EINVAL;
1031		if (optlen == sizeof(u32)) {
1032			u32 ufd;
1033
1034			ret = -EFAULT;
1035			if (copy_from_user(&ufd, optval, sizeof(ufd)))
1036				break;
1037
1038			ret = sk_reuseport_attach_bpf(ufd, sk);
1039		}
1040		break;
1041
1042	case SO_DETACH_REUSEPORT_BPF:
1043		ret = reuseport_detach_prog(sk);
1044		break;
1045
1046	case SO_DETACH_FILTER:
1047		ret = sk_detach_filter(sk);
1048		break;
1049
1050	case SO_LOCK_FILTER:
1051		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1052			ret = -EPERM;
1053		else
1054			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1055		break;
1056
1057	case SO_PASSSEC:
1058		if (valbool)
1059			set_bit(SOCK_PASSSEC, &sock->flags);
1060		else
1061			clear_bit(SOCK_PASSSEC, &sock->flags);
1062		break;
1063	case SO_MARK:
1064		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1065			ret = -EPERM;
1066		} else if (val != sk->sk_mark) {
1067			sk->sk_mark = val;
1068			sk_dst_reset(sk);
1069		}
1070		break;
1071
1072	case SO_RXQ_OVFL:
1073		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1074		break;
1075
1076	case SO_WIFI_STATUS:
1077		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1078		break;
1079
1080	case SO_PEEK_OFF:
1081		if (sock->ops->set_peek_off)
1082			ret = sock->ops->set_peek_off(sk, val);
1083		else
1084			ret = -EOPNOTSUPP;
1085		break;
1086
1087	case SO_NOFCS:
1088		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1089		break;
1090
1091	case SO_SELECT_ERR_QUEUE:
1092		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1093		break;
1094
1095#ifdef CONFIG_NET_RX_BUSY_POLL
1096	case SO_BUSY_POLL:
1097		/* allow unprivileged users to decrease the value */
1098		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1099			ret = -EPERM;
1100		else {
1101			if (val < 0)
1102				ret = -EINVAL;
1103			else
1104				sk->sk_ll_usec = val;
1105		}
1106		break;
1107#endif
1108
1109	case SO_MAX_PACING_RATE:
1110		{
1111		unsigned long ulval = (val == ~0U) ? ~0UL : val;
1112
1113		if (sizeof(ulval) != sizeof(val) &&
1114		    optlen >= sizeof(ulval) &&
1115		    get_user(ulval, (unsigned long __user *)optval)) {
1116			ret = -EFAULT;
1117			break;
1118		}
1119		if (ulval != ~0UL)
1120			cmpxchg(&sk->sk_pacing_status,
1121				SK_PACING_NONE,
1122				SK_PACING_NEEDED);
1123		sk->sk_max_pacing_rate = ulval;
1124		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1125		break;
1126		}
1127	case SO_INCOMING_CPU:
1128		WRITE_ONCE(sk->sk_incoming_cpu, val);
1129		break;
1130
1131	case SO_CNX_ADVICE:
1132		if (val == 1)
1133			dst_negative_advice(sk);
1134		break;
1135
1136	case SO_ZEROCOPY:
1137		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1138			if (!((sk->sk_type == SOCK_STREAM &&
1139			       sk->sk_protocol == IPPROTO_TCP) ||
1140			      (sk->sk_type == SOCK_DGRAM &&
1141			       sk->sk_protocol == IPPROTO_UDP)))
1142				ret = -ENOTSUPP;
1143		} else if (sk->sk_family != PF_RDS) {
1144			ret = -ENOTSUPP;
1145		}
1146		if (!ret) {
1147			if (val < 0 || val > 1)
1148				ret = -EINVAL;
1149			else
1150				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1151		}
1152		break;
1153
1154	case SO_TXTIME:
1155		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1156			ret = -EPERM;
1157		} else if (optlen != sizeof(struct sock_txtime)) {
1158			ret = -EINVAL;
1159		} else if (copy_from_user(&sk_txtime, optval,
1160			   sizeof(struct sock_txtime))) {
1161			ret = -EFAULT;
1162		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1163			ret = -EINVAL;
1164		} else {
1165			sock_valbool_flag(sk, SOCK_TXTIME, true);
1166			sk->sk_clockid = sk_txtime.clockid;
1167			sk->sk_txtime_deadline_mode =
1168				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1169			sk->sk_txtime_report_errors =
1170				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1171		}
1172		break;
1173
1174	case SO_BINDTOIFINDEX:
1175		ret = sock_setbindtodevice_locked(sk, val);
1176		break;
1177
1178	default:
1179		ret = -ENOPROTOOPT;
1180		break;
1181	}
1182	release_sock(sk);
1183	return ret;
1184}
1185EXPORT_SYMBOL(sock_setsockopt);
1186
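
[Editor's note: as the SO_SNDBUF/SO_RCVBUF comments inside sock_setsockopt() explain, the kernel stores double the requested size (to cover struct sk_buff overhead) and getsockopt() reports that doubled value. A quick userspace check of this behaviour, as a sketch.]

#include <stdio.h>
#include <sys/socket.h>

/* Editor's sketch: request 64 KiB of receive buffer; the kernel stores
 * val * 2 (clamped by sysctl_rmem_max unless SO_RCVBUFFORCE is used),
 * so getsockopt() typically reports about 128 KiB.
 */
static void show_rcvbuf_doubling(int fd)
{
	int req = 64 * 1024, got = 0;
	socklen_t len = sizeof(got);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
	printf("requested %d, kernel reports %d\n", req, got);
}
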
1187
1188static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1189			  struct ucred *ucred)
1190{
1191	ucred->pid = pid_vnr(pid);
1192	ucred->uid = ucred->gid = -1;
1193	if (cred) {
1194		struct user_namespace *current_ns = current_user_ns();
1195
1196		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1197		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1198	}
1199}
1200
1201static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1202{
1203	struct user_namespace *user_ns = current_user_ns();
1204	int i;
1205
1206	for (i = 0; i < src->ngroups; i++)
1207		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1208			return -EFAULT;
1209
1210	return 0;
1211}
1212
1213int sock_getsockopt(struct socket *sock, int level, int optname,
1214		    char __user *optval, int __user *optlen)
1215{
1216	struct sock *sk = sock->sk;
1217
1218	union {
1219		int val;
1220		u64 val64;
1221		unsigned long ulval;
1222		struct linger ling;
1223		struct old_timeval32 tm32;
1224		struct __kernel_old_timeval tm;
1225		struct  __kernel_sock_timeval stm;
1226		struct sock_txtime txtime;
1227	} v;
1228
1229	int lv = sizeof(int);
1230	int len;
1231
1232	if (get_user(len, optlen))
1233		return -EFAULT;
1234	if (len < 0)
1235		return -EINVAL;
1236
1237	memset(&v, 0, sizeof(v));
1238
1239	switch (optname) {
1240	case SO_DEBUG:
1241		v.val = sock_flag(sk, SOCK_DBG);
1242		break;
1243
1244	case SO_DONTROUTE:
1245		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1246		break;
1247
1248	case SO_BROADCAST:
1249		v.val = sock_flag(sk, SOCK_BROADCAST);
1250		break;
1251
1252	case SO_SNDBUF:
1253		v.val = sk->sk_sndbuf;
1254		break;
1255
1256	case SO_RCVBUF:
1257		v.val = sk->sk_rcvbuf;
1258		break;
1259
1260	case SO_REUSEADDR:
1261		v.val = sk->sk_reuse;
1262		break;
1263
1264	case SO_REUSEPORT:
1265		v.val = sk->sk_reuseport;
1266		break;
1267
1268	case SO_KEEPALIVE:
1269		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1270		break;
1271
1272	case SO_TYPE:
1273		v.val = sk->sk_type;
1274		break;
1275
1276	case SO_PROTOCOL:
1277		v.val = sk->sk_protocol;
1278		break;
1279
1280	case SO_DOMAIN:
1281		v.val = sk->sk_family;
1282		break;
1283
1284	case SO_ERROR:
1285		v.val = -sock_error(sk);
1286		if (v.val == 0)
1287			v.val = xchg(&sk->sk_err_soft, 0);
1288		break;
1289
1290	case SO_OOBINLINE:
1291		v.val = sock_flag(sk, SOCK_URGINLINE);
1292		break;
1293
1294	case SO_NO_CHECK:
1295		v.val = sk->sk_no_check_tx;
1296		break;
1297
1298	case SO_PRIORITY:
1299		v.val = sk->sk_priority;
1300		break;
1301
1302	case SO_LINGER:
1303		lv		= sizeof(v.ling);
1304		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1305		v.ling.l_linger	= sk->sk_lingertime / HZ;
1306		break;
1307
1308	case SO_BSDCOMPAT:
1309		sock_warn_obsolete_bsdism("getsockopt");
1310		break;
1311
1312	case SO_TIMESTAMP_OLD:
1313		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1314				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1315				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1316		break;
1317
1318	case SO_TIMESTAMPNS_OLD:
1319		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1320		break;
1321
1322	case SO_TIMESTAMP_NEW:
1323		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1324		break;
1325
1326	case SO_TIMESTAMPNS_NEW:
1327		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1328		break;
1329
1330	case SO_TIMESTAMPING_OLD:
1331		v.val = sk->sk_tsflags;
1332		break;
1333
1334	case SO_RCVTIMEO_OLD:
1335	case SO_RCVTIMEO_NEW:
1336		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1337		break;
1338
1339	case SO_SNDTIMEO_OLD:
1340	case SO_SNDTIMEO_NEW:
1341		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1342		break;
1343
1344	case SO_RCVLOWAT:
1345		v.val = sk->sk_rcvlowat;
1346		break;
1347
1348	case SO_SNDLOWAT:
1349		v.val = 1;
1350		break;
1351
1352	case SO_PASSCRED:
1353		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1354		break;
1355
1356	case SO_PEERCRED:
1357	{
1358		struct ucred peercred;
1359		if (len > sizeof(peercred))
1360			len = sizeof(peercred);
1361		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1362		if (copy_to_user(optval, &peercred, len))
1363			return -EFAULT;
1364		goto lenout;
1365	}
1366
1367	case SO_PEERGROUPS:
1368	{
1369		int ret, n;
1370
1371		if (!sk->sk_peer_cred)
1372			return -ENODATA;
1373
1374		n = sk->sk_peer_cred->group_info->ngroups;
1375		if (len < n * sizeof(gid_t)) {
1376			len = n * sizeof(gid_t);
1377			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1378		}
1379		len = n * sizeof(gid_t);
1380
1381		ret = groups_to_user((gid_t __user *)optval,
1382				     sk->sk_peer_cred->group_info);
1383		if (ret)
1384			return ret;
1385		goto lenout;
1386	}
1387
1388	case SO_PEERNAME:
1389	{
1390		char address[128];
1391
1392		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1393		if (lv < 0)
1394			return -ENOTCONN;
1395		if (lv < len)
1396			return -EINVAL;
1397		if (copy_to_user(optval, address, len))
1398			return -EFAULT;
1399		goto lenout;
1400	}
1401
1402	/* Dubious BSD thing... Probably nobody even uses it, but
1403	 * the UNIX standard wants it for whatever reason... -DaveM
1404	 */
1405	case SO_ACCEPTCONN:
1406		v.val = sk->sk_state == TCP_LISTEN;
1407		break;
1408
1409	case SO_PASSSEC:
1410		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1411		break;
1412
1413	case SO_PEERSEC:
1414		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1415
1416	case SO_MARK:
1417		v.val = sk->sk_mark;
1418		break;
1419
1420	case SO_RXQ_OVFL:
1421		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1422		break;
1423
1424	case SO_WIFI_STATUS:
1425		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1426		break;
1427
1428	case SO_PEEK_OFF:
1429		if (!sock->ops->set_peek_off)
1430			return -EOPNOTSUPP;
1431
1432		v.val = sk->sk_peek_off;
1433		break;
1434	case SO_NOFCS:
1435		v.val = sock_flag(sk, SOCK_NOFCS);
1436		break;
1437
1438	case SO_BINDTODEVICE:
1439		return sock_getbindtodevice(sk, optval, optlen, len);
1440
1441	case SO_GET_FILTER:
1442		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1443		if (len < 0)
1444			return len;
1445
1446		goto lenout;
1447
1448	case SO_LOCK_FILTER:
1449		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1450		break;
1451
1452	case SO_BPF_EXTENSIONS:
1453		v.val = bpf_tell_extensions();
1454		break;
1455
1456	case SO_SELECT_ERR_QUEUE:
1457		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1458		break;
1459
1460#ifdef CONFIG_NET_RX_BUSY_POLL
1461	case SO_BUSY_POLL:
1462		v.val = sk->sk_ll_usec;
1463		break;
1464#endif
1465
1466	case SO_MAX_PACING_RATE:
1467		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1468			lv = sizeof(v.ulval);
1469			v.ulval = sk->sk_max_pacing_rate;
1470		} else {
1471			/* 32bit version */
1472			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1473		}
1474		break;
1475
1476	case SO_INCOMING_CPU:
1477		v.val = READ_ONCE(sk->sk_incoming_cpu);
1478		break;
1479
1480	case SO_MEMINFO:
1481	{
1482		u32 meminfo[SK_MEMINFO_VARS];
1483
1484		sk_get_meminfo(sk, meminfo);
1485
1486		len = min_t(unsigned int, len, sizeof(meminfo));
1487		if (copy_to_user(optval, &meminfo, len))
1488			return -EFAULT;
1489
1490		goto lenout;
1491	}
1492
1493#ifdef CONFIG_NET_RX_BUSY_POLL
1494	case SO_INCOMING_NAPI_ID:
1495		v.val = READ_ONCE(sk->sk_napi_id);
1496
1497		/* aggregate non-NAPI IDs down to 0 */
1498		if (v.val < MIN_NAPI_ID)
1499			v.val = 0;
1500
1501		break;
1502#endif
1503
1504	case SO_COOKIE:
1505		lv = sizeof(u64);
1506		if (len < lv)
1507			return -EINVAL;
1508		v.val64 = sock_gen_cookie(sk);
1509		break;
1510
1511	case SO_ZEROCOPY:
1512		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1513		break;
1514
1515	case SO_TXTIME:
1516		lv = sizeof(v.txtime);
1517		v.txtime.clockid = sk->sk_clockid;
1518		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1519				  SOF_TXTIME_DEADLINE_MODE : 0;
1520		v.txtime.flags |= sk->sk_txtime_report_errors ?
1521				  SOF_TXTIME_REPORT_ERRORS : 0;
1522		break;
1523
1524	case SO_BINDTOIFINDEX:
1525		v.val = sk->sk_bound_dev_if;
1526		break;
1527
1528	default:
1529		/* We implement the SO_SNDLOWAT etc to not be settable
1530		 * (1003.1g 7).
1531		 */
1532		return -ENOPROTOOPT;
1533	}
1534
1535	if (len > lv)
1536		len = lv;
1537	if (copy_to_user(optval, &v, len))
1538		return -EFAULT;
1539lenout:
1540	if (put_user(len, optlen))
1541		return -EFAULT;
1542	return 0;
1543}
1544
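
[Editor's note: sock_getsockopt() clamps the copied length to the computed value length (lv) and writes the final length back through optlen. SO_PEERCRED is a typical struct-valued example: cred_to_ucred() above translates the peer credentials recorded at connect()/socketpair() time into the caller's user namespace. A userspace sketch for a connected AF_UNIX socket.]

#define _GNU_SOURCE		/* for struct ucred */
#include <stdio.h>
#include <sys/socket.h>

/* Editor's sketch: read the peer's pid/uid/gid on an AF_UNIX socket. */
static int print_peer_cred(int unix_fd)
{
	struct ucred cred;
	socklen_t len = sizeof(cred);

	if (getsockopt(unix_fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) < 0)
		return -1;
	printf("pid=%d uid=%u gid=%u\n", cred.pid, cred.uid, cred.gid);
	return 0;
}
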
1545/*
1546 * Initialize an sk_lock.
1547 *
1548 * (We also register the sk_lock with the lock validator.)
1549 */
1550static inline void sock_lock_init(struct sock *sk)
1551{
1552	if (sk->sk_kern_sock)
1553		sock_lock_init_class_and_name(
1554			sk,
1555			af_family_kern_slock_key_strings[sk->sk_family],
1556			af_family_kern_slock_keys + sk->sk_family,
1557			af_family_kern_key_strings[sk->sk_family],
1558			af_family_kern_keys + sk->sk_family);
1559	else
1560		sock_lock_init_class_and_name(
1561			sk,
1562			af_family_slock_key_strings[sk->sk_family],
1563			af_family_slock_keys + sk->sk_family,
1564			af_family_key_strings[sk->sk_family],
1565			af_family_keys + sk->sk_family);
1566}
1567
1568/*
1569 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1570 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1571 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1572 */
1573static void sock_copy(struct sock *nsk, const struct sock *osk)
1574{
1575	const struct proto *prot = READ_ONCE(osk->sk_prot);
1576#ifdef CONFIG_SECURITY_NETWORK
1577	void *sptr = nsk->sk_security;
1578#endif
1579	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1580
1581	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1582	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1583
1584#ifdef CONFIG_SECURITY_NETWORK
1585	nsk->sk_security = sptr;
1586	security_sk_clone(osk, nsk);
1587#endif
1588}
1589
1590static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1591		int family)
1592{
1593	struct sock *sk;
1594	struct kmem_cache *slab;
1595
1596	slab = prot->slab;
1597	if (slab != NULL) {
1598		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1599		if (!sk)
1600			return sk;
1601		if (want_init_on_alloc(priority))
1602			sk_prot_clear_nulls(sk, prot->obj_size);
1603	} else
1604		sk = kmalloc(prot->obj_size, priority);
1605
1606	if (sk != NULL) {
1607		if (security_sk_alloc(sk, family, priority))
1608			goto out_free;
1609
1610		if (!try_module_get(prot->owner))
1611			goto out_free_sec;
1612		sk_tx_queue_clear(sk);
1613	}
1614
1615	return sk;
1616
1617out_free_sec:
1618	security_sk_free(sk);
1619out_free:
1620	if (slab != NULL)
1621		kmem_cache_free(slab, sk);
1622	else
1623		kfree(sk);
1624	return NULL;
1625}
1626
1627static void sk_prot_free(struct proto *prot, struct sock *sk)
1628{
1629	struct kmem_cache *slab;
1630	struct module *owner;
1631
1632	owner = prot->owner;
1633	slab = prot->slab;
1634
1635	cgroup_sk_free(&sk->sk_cgrp_data);
1636	mem_cgroup_sk_free(sk);
1637	security_sk_free(sk);
1638	if (slab != NULL)
1639		kmem_cache_free(slab, sk);
1640	else
1641		kfree(sk);
1642	module_put(owner);
1643}
1644
1645/**
1646 *	sk_alloc - All socket objects are allocated here
1647 *	@net: the applicable net namespace
1648 *	@family: protocol family
1649 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1650 *	@prot: struct proto associated with this new sock instance
1651 *	@kern: is this to be a kernel socket?
1652 */
1653struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1654		      struct proto *prot, int kern)
1655{
1656	struct sock *sk;
1657
1658	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1659	if (sk) {
1660		sk->sk_family = family;
1661		/*
1662		 * See comment in struct sock definition to understand
1663		 * why we need sk_prot_creator -acme
1664		 */
1665		sk->sk_prot = sk->sk_prot_creator = prot;
1666		sk->sk_kern_sock = kern;
1667		sock_lock_init(sk);
1668		sk->sk_net_refcnt = kern ? 0 : 1;
1669		if (likely(sk->sk_net_refcnt)) {
1670			get_net(net);
1671			sock_inuse_add(net, 1);
1672		}
1673
1674		sock_net_set(sk, net);
1675		refcount_set(&sk->sk_wmem_alloc, 1);
1676
1677		mem_cgroup_sk_alloc(sk);
1678		cgroup_sk_alloc(&sk->sk_cgrp_data);
1679		sock_update_classid(&sk->sk_cgrp_data);
1680		sock_update_netprioidx(&sk->sk_cgrp_data);
1681	}
1682
1683	return sk;
1684}
1685EXPORT_SYMBOL(sk_alloc);
1686
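
[Editor's note: sk_alloc() is the common allocation path for every address family's ->create() handler: it draws from the protocol's slab (or kmalloc), seeds sk_wmem_alloc to 1, and takes a namespace reference for non-kernel sockets. A condensed, hypothetical sketch of how a family create function typically calls it, loosely modeled on inet_create() with error handling trimmed.]

/* Editor's sketch -- hypothetical ->create() body, not part of sock.c. */
static int example_af_create(struct net *net, struct socket *sock,
			     struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_INET, GFP_KERNEL, prot, kern);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);	/* attach sk to sock, set defaults */
	sk->sk_protocol = IPPROTO_TCP;	/* illustrative value */
	return 0;
}
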
1687/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1688 * grace period. This is the case for UDP sockets and TCP listeners.
1689 */
1690static void __sk_destruct(struct rcu_head *head)
1691{
1692	struct sock *sk = container_of(head, struct sock, sk_rcu);
1693	struct sk_filter *filter;
1694
1695	if (sk->sk_destruct)
1696		sk->sk_destruct(sk);
1697
1698	filter = rcu_dereference_check(sk->sk_filter,
1699				       refcount_read(&sk->sk_wmem_alloc) == 0);
1700	if (filter) {
1701		sk_filter_uncharge(sk, filter);
1702		RCU_INIT_POINTER(sk->sk_filter, NULL);
1703	}
1704
1705	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1706
1707#ifdef CONFIG_BPF_SYSCALL
1708	bpf_sk_storage_free(sk);
1709#endif
1710
1711	if (atomic_read(&sk->sk_omem_alloc))
1712		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1713			 __func__, atomic_read(&sk->sk_omem_alloc));
1714
1715	if (sk->sk_frag.page) {
1716		put_page(sk->sk_frag.page);
1717		sk->sk_frag.page = NULL;
1718	}
1719
1720	if (sk->sk_peer_cred)
1721		put_cred(sk->sk_peer_cred);
1722	put_pid(sk->sk_peer_pid);
1723	if (likely(sk->sk_net_refcnt))
1724		put_net(sock_net(sk));
1725	sk_prot_free(sk->sk_prot_creator, sk);
1726}
1727
1728void sk_destruct(struct sock *sk)
1729{
1730	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1731
1732	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1733		reuseport_detach_sock(sk);
1734		use_call_rcu = true;
1735	}
1736
1737	if (use_call_rcu)
1738		call_rcu(&sk->sk_rcu, __sk_destruct);
1739	else
1740		__sk_destruct(&sk->sk_rcu);
1741}
1742
1743static void __sk_free(struct sock *sk)
1744{
1745	if (likely(sk->sk_net_refcnt))
1746		sock_inuse_add(sock_net(sk), -1);
1747
1748	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1749		sock_diag_broadcast_destroy(sk);
1750	else
1751		sk_destruct(sk);
1752}
1753
1754void sk_free(struct sock *sk)
1755{
1756	/*
1757	 * We subtract one from sk_wmem_alloc; if the result is not zero,
1758	 * some packets are still sitting in a tx queue, and sock_wfree()
1759	 * will call __sk_free(sk) later, once they are all freed.
1760	 */
1761	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1762		__sk_free(sk);
1763}
1764EXPORT_SYMBOL(sk_free);
1765
1766static void sk_init_common(struct sock *sk)
1767{
1768	skb_queue_head_init(&sk->sk_receive_queue);
1769	skb_queue_head_init(&sk->sk_write_queue);
1770	skb_queue_head_init(&sk->sk_error_queue);
1771
1772	rwlock_init(&sk->sk_callback_lock);
1773	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1774			af_rlock_keys + sk->sk_family,
1775			af_family_rlock_key_strings[sk->sk_family]);
1776	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1777			af_wlock_keys + sk->sk_family,
1778			af_family_wlock_key_strings[sk->sk_family]);
1779	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1780			af_elock_keys + sk->sk_family,
1781			af_family_elock_key_strings[sk->sk_family]);
1782	lockdep_set_class_and_name(&sk->sk_callback_lock,
1783			af_callback_keys + sk->sk_family,
1784			af_family_clock_key_strings[sk->sk_family]);
1785}
1786
1787/**
1788 *	sk_clone_lock - clone a socket, and lock its clone
1789 *	@sk: the socket to clone
1790 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1791 *
1792 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1793 */
1794struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1795{
1796	struct proto *prot = READ_ONCE(sk->sk_prot);
1797	struct sock *newsk;
1798	bool is_charged = true;
1799
1800	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1801	if (newsk != NULL) {
1802		struct sk_filter *filter;
1803
1804		sock_copy(newsk, sk);
1805
1806		newsk->sk_prot_creator = prot;
1807
1808		/* SANITY */
1809		if (likely(newsk->sk_net_refcnt))
1810			get_net(sock_net(newsk));
1811		sk_node_init(&newsk->sk_node);
1812		sock_lock_init(newsk);
1813		bh_lock_sock(newsk);
1814		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1815		newsk->sk_backlog.len = 0;
1816
1817		atomic_set(&newsk->sk_rmem_alloc, 0);
1818		/*
1819		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1820		 */
1821		refcount_set(&newsk->sk_wmem_alloc, 1);
1822		atomic_set(&newsk->sk_omem_alloc, 0);
1823		sk_init_common(newsk);
1824
1825		newsk->sk_dst_cache	= NULL;
1826		newsk->sk_dst_pending_confirm = 0;
1827		newsk->sk_wmem_queued	= 0;
1828		newsk->sk_forward_alloc = 0;
1829		atomic_set(&newsk->sk_drops, 0);
1830		newsk->sk_send_head	= NULL;
1831		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1832		atomic_set(&newsk->sk_zckey, 0);
1833
1834		sock_reset_flag(newsk, SOCK_DONE);
1835
1836		/* sk->sk_memcg will be populated at accept() time */
1837		newsk->sk_memcg = NULL;
1838
1839		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1840
1841		rcu_read_lock();
1842		filter = rcu_dereference(sk->sk_filter);
1843		if (filter != NULL)
1844			/* though it's an empty new sock, the charging may fail
1845			 * if sysctl_optmem_max was changed between creation of
1846			 * original socket and cloning
1847			 */
1848			is_charged = sk_filter_charge(newsk, filter);
1849		RCU_INIT_POINTER(newsk->sk_filter, filter);
1850		rcu_read_unlock();
1851
1852		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1853			/* We need to make sure that we don't uncharge the new
1854			 * socket if we couldn't charge it in the first place
1855			 * as otherwise we uncharge the parent's filter.
1856			 */
1857			if (!is_charged)
1858				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1859			sk_free_unlock_clone(newsk);
1860			newsk = NULL;
1861			goto out;
1862		}
1863		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1864
1865		if (bpf_sk_storage_clone(sk, newsk)) {
1866			sk_free_unlock_clone(newsk);
1867			newsk = NULL;
1868			goto out;
1869		}
1870
1871		/* Clear sk_user_data if parent had the pointer tagged
1872		 * as not suitable for copying when cloning.
1873		 */
1874		if (sk_user_data_is_nocopy(newsk))
1875			newsk->sk_user_data = NULL;
1876
1877		newsk->sk_err	   = 0;
1878		newsk->sk_err_soft = 0;
1879		newsk->sk_priority = 0;
1880		newsk->sk_incoming_cpu = raw_smp_processor_id();
1881		if (likely(newsk->sk_net_refcnt))
1882			sock_inuse_add(sock_net(newsk), 1);
1883
1884		/*
1885		 * Before updating sk_refcnt, we must commit prior changes to memory
1886		 * (Documentation/RCU/rculist_nulls.txt for details)
1887		 */
1888		smp_wmb();
1889		refcount_set(&newsk->sk_refcnt, 2);
1890
1891		/*
1892		 * Increment the counter in the same struct proto as the master
1893		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1894		 * is the same as sk->sk_prot->socks, as this field was copied
1895		 * with memcpy).
1896		 *
1897		 * This _changes_ the previous behaviour, where
1898		 * tcp_create_openreq_child always was incrementing the
1899	 *		equivalent to tcp_prot->socks (inet_sock_nr), so this has
1900		 * to be taken into account in all callers. -acme
1901		 */
1902		sk_refcnt_debug_inc(newsk);
1903		sk_set_socket(newsk, NULL);
1904		RCU_INIT_POINTER(newsk->sk_wq, NULL);
1905
1906		if (newsk->sk_prot->sockets_allocated)
1907			sk_sockets_allocated_inc(newsk);
1908
1909		if (sock_needs_netstamp(sk) &&
1910		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1911			net_enable_timestamp();
1912	}
1913out:
1914	return newsk;
1915}
1916EXPORT_SYMBOL_GPL(sk_clone_lock);
1917
1918void sk_free_unlock_clone(struct sock *sk)
1919{
1920	/* It is still a raw copy of the parent, so invalidate
1921	 * its destructor and do a plain sk_free() */
1922	sk->sk_destruct = NULL;
1923	bh_unlock_sock(sk);
1924	sk_free(sk);
1925}
1926EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1927
1928void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1929{
1930	u32 max_segs = 1;
1931
1932	sk_dst_set(sk, dst);
1933	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1934	if (sk->sk_route_caps & NETIF_F_GSO)
1935		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1936	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1937	if (sk_can_gso(sk)) {
1938		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1939			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1940		} else {
1941			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1942			sk->sk_gso_max_size = dst->dev->gso_max_size;
1943			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1944		}
1945	}
1946	sk->sk_gso_max_segs = max_segs;
1947}
1948EXPORT_SYMBOL_GPL(sk_setup_caps);
1949
1950/*
1951 *	Simple resource managers for sockets.
1952 */
1953
1954
1955/*
1956 * Write buffer destructor automatically called from kfree_skb.
1957 */
1958void sock_wfree(struct sk_buff *skb)
1959{
1960	struct sock *sk = skb->sk;
1961	unsigned int len = skb->truesize;
1962
1963	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1964		/*
1965		 * Keep a reference on sk_wmem_alloc, this will be released
1966		 * after sk_write_space() call
1967		 */
1968		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1969		sk->sk_write_space(sk);
1970		len = 1;
1971	}
1972	/*
1973	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1974	 * could not do because of in-flight packets
1975	 */
1976	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1977		__sk_free(sk);
1978}
1979EXPORT_SYMBOL(sock_wfree);
1980
1981/* This variant of sock_wfree() is used by TCP,
1982 * since it sets SOCK_USE_WRITE_QUEUE.
1983 */
1984void __sock_wfree(struct sk_buff *skb)
1985{
1986	struct sock *sk = skb->sk;
1987
1988	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1989		__sk_free(sk);
1990}
1991
1992void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1993{
1994	skb_orphan(skb);
1995	skb->sk = sk;
1996#ifdef CONFIG_INET
1997	if (unlikely(!sk_fullsock(sk))) {
1998		skb->destructor = sock_edemux;
1999		sock_hold(sk);
2000		return;
2001	}
2002#endif
2003	skb->destructor = sock_wfree;
2004	skb_set_hash_from_sk(skb, sk);
2005	/*
2006	 * We used to take a refcount on sk, but the following operation
2007	 * is enough to guarantee sk_free() won't free this sock until
2008	 * all in-flight packets are completed
2009	 */
2010	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2011}
2012EXPORT_SYMBOL(skb_set_owner_w);
2013
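
[Editor's note: skb_set_owner_w() and sock_wfree() work as a pair: the former charges skb->truesize to sk_wmem_alloc and installs sock_wfree as the destructor, the latter returns the charge on kfree_skb() and, if the count drops to zero, finishes a deferred __sk_free(). A minimal, hypothetical sketch of that pattern; kfree_skb() stands in for actually transmitting the buffer.]

/* Editor's sketch -- hypothetical helper, not part of sock.c. */
static int example_charge_and_free(struct sock *sk, unsigned int size)
{
	struct sk_buff *skb = alloc_skb(size, sk->sk_allocation);

	if (!skb)
		return -ENOBUFS;

	skb_set_owner_w(skb, sk);	/* charges skb->truesize to sk_wmem_alloc */
	kfree_skb(skb);			/* sock_wfree() releases the charge */
	return 0;
}
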
2014static bool can_skb_orphan_partial(const struct sk_buff *skb)
2015{
2016#ifdef CONFIG_TLS_DEVICE
2017	/* Drivers depend on in-order delivery for crypto offload,
2018	 * partial orphan breaks out-of-order-OK logic.
2019	 */
2020	if (skb->decrypted)
2021		return false;
2022#endif
2023	return (skb->destructor == sock_wfree ||
2024		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2025}
2026
2027/* This helper is used by netem, as it can hold packets i…

Large files are truncated; the full file is available at the repository linked above.