PageRenderTime 9ms CodeModel.GetById 16ms app.highlight 96ms RepoModel.GetById 1ms app.codeStats 1ms

/net/socket.c

https://bitbucket.org/bradfa/linux
C | 3456 lines | 2500 code | 495 blank | 461 comment | 395 complexity | 4f7ba4b21274e7f1c301fe32993b3f84 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/*
   2 * NET		An implementation of the SOCKET network access protocol.
   3 *
   4 * Version:	@(#)socket.c	1.1.93	18/02/95
   5 *
   6 * Authors:	Orest Zborowski, <obz@Kodak.COM>
   7 *		Ross Biro
   8 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   9 *
  10 * Fixes:
  11 *		Anonymous	:	NOTSOCK/BADF cleanup. Error fix in
  12 *					shutdown()
  13 *		Alan Cox	:	verify_area() fixes
  14 *		Alan Cox	:	Removed DDI
  15 *		Jonathan Kamens	:	SOCK_DGRAM reconnect bug
  16 *		Alan Cox	:	Moved a load of checks to the very
  17 *					top level.
  18 *		Alan Cox	:	Move address structures to/from user
  19 *					mode above the protocol layers.
  20 *		Rob Janssen	:	Allow 0 length sends.
  21 *		Alan Cox	:	Asynchronous I/O support (cribbed from the
  22 *					tty drivers).
  23 *		Niibe Yutaka	:	Asynchronous I/O for writes (4.4BSD style)
  24 *		Jeff Uphoff	:	Made max number of sockets command-line
  25 *					configurable.
  26 *		Matti Aarnio	:	Made the number of sockets dynamic,
  27 *					to be allocated when needed, and mr.
  28 *					Uphoff's max is used as max to be
  29 *					allowed to allocate.
  30 *		Linus		:	Argh. removed all the socket allocation
  31 *					altogether: it's in the inode now.
  32 *		Alan Cox	:	Made sock_alloc()/sock_release() public
  33 *					for NetROM and future kernel nfsd type
  34 *					stuff.
  35 *		Alan Cox	:	sendmsg/recvmsg basics.
  36 *		Tom Dyas	:	Export net symbols.
  37 *		Marcin Dalecki	:	Fixed problems with CONFIG_NET="n".
  38 *		Alan Cox	:	Added thread locking to sys_* calls
  39 *					for sockets. May have errors at the
  40 *					moment.
  41 *		Kevin Buhr	:	Fixed the dumb errors in the above.
  42 *		Andi Kleen	:	Some small cleanups, optimizations,
  43 *					and fixed a copy_from_user() bug.
  44 *		Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
  45 *		Tigran Aivazian	:	Made listen(2) backlog sanity checks
  46 *					protocol-independent
  47 *
  48 *
  49 *		This program is free software; you can redistribute it and/or
  50 *		modify it under the terms of the GNU General Public License
  51 *		as published by the Free Software Foundation; either version
  52 *		2 of the License, or (at your option) any later version.
  53 *
  54 *
  55 *	This module is effectively the top level interface to the BSD socket
  56 *	paradigm.
  57 *
  58 *	Based upon Swansea University Computer Society NET3.039
  59 */
  60
  61#include <linux/mm.h>
  62#include <linux/socket.h>
  63#include <linux/file.h>
  64#include <linux/net.h>
  65#include <linux/interrupt.h>
  66#include <linux/thread_info.h>
  67#include <linux/rcupdate.h>
  68#include <linux/netdevice.h>
  69#include <linux/proc_fs.h>
  70#include <linux/seq_file.h>
  71#include <linux/mutex.h>
  72#include <linux/if_bridge.h>
  73#include <linux/if_frad.h>
  74#include <linux/if_vlan.h>
  75#include <linux/init.h>
  76#include <linux/poll.h>
  77#include <linux/cache.h>
  78#include <linux/module.h>
  79#include <linux/highmem.h>
  80#include <linux/mount.h>
  81#include <linux/security.h>
  82#include <linux/syscalls.h>
  83#include <linux/compat.h>
  84#include <linux/kmod.h>
  85#include <linux/audit.h>
  86#include <linux/wireless.h>
  87#include <linux/nsproxy.h>
  88#include <linux/magic.h>
  89#include <linux/slab.h>
  90#include <linux/xattr.h>
  91
  92#include <asm/uaccess.h>
  93#include <asm/unistd.h>
  94
  95#include <net/compat.h>
  96#include <net/wext.h>
  97#include <net/cls_cgroup.h>
  98
  99#include <net/sock.h>
 100#include <linux/netfilter.h>
 101
 102#include <linux/if_tun.h>
 103#include <linux/ipv6_route.h>
 104#include <linux/route.h>
 105#include <linux/sockios.h>
 106#include <linux/atalk.h>
 107
/*
 * Forward declarations of the static handlers that are referenced by the
 * socket_file_ops table below before their definitions are seen.
 */
static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
			 unsigned long nr_segs, loff_t pos);
static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
			  unsigned long nr_segs, loff_t pos);
static int sock_mmap(struct file *file, struct vm_area_struct *vma);

static int sock_close(struct inode *inode, struct file *file);
static unsigned int sock_poll(struct file *file,
			      struct poll_table_struct *wait);
static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* The compat ioctl entry point only exists on CONFIG_COMPAT builds. */
#ifdef CONFIG_COMPAT
static long compat_sock_ioctl(struct file *file,
			      unsigned int cmd, unsigned long arg);
#endif
static int sock_fasync(int fd, struct file *filp, int on);
static ssize_t sock_sendpage(struct file *file, struct page *page,
			     int offset, size_t size, loff_t *ppos, int more);
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
				struct pipe_inode_info *pipe, size_t len,
				unsigned int flags);
 129
 130/*
 131 *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 132 *	in the operation structures but are done directly via the socketcall() multiplexor.
 133 */
 134
/*
 * File operations attached to every socket file: sock_alloc_file() installs
 * this table both as the inode's i_fop and as the f_op of the new file, and
 * sock_from_file() recognises a socket by comparing f_op against it.
 */
static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.aio_read =	sock_aio_read,
	.aio_write =	sock_aio_write,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
	.mmap =		sock_mmap,
	.open =		sock_no_open,	/* special open code to disallow open via /proc */
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
};
 153
 154/*
 155 *	The protocol list. Each protocol is registered in here.
 156 */
 157
/* NOTE(review): presumably taken by the code that updates net_families[]; confirm against the register/unregister paths. */
static DEFINE_SPINLOCK(net_family_lock);
/* NPROTO-entry table of protocol family pointers; the entries are __rcu annotated. */
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
 160
 161/*
 162 *	Statistics counters of the socket lists
 163 */
 164
/* Per-CPU counter: sock_alloc() adds one, sock_release() subtracts one. */
static DEFINE_PER_CPU(int, sockets_in_use);
 166
 167/*
 168 * Support routines.
 169 * Move socket addresses back and forth across the kernel/user
 170 * divide and look after the messy bits.
 171 */
 172
 173/**
 174 *	move_addr_to_kernel	-	copy a socket address into kernel space
 175 *	@uaddr: Address in user space
 176 *	@kaddr: Address in kernel space
 177 *	@ulen: Length in user space
 178 *
 179 *	The address is copied into kernel space. If the provided address is
 180 *	too long an error code of -EINVAL is returned. If the copy gives
 181 *	invalid addresses -EFAULT is returned. On a success 0 is returned.
 182 */
 183
 184int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
 185{
 186	if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
 187		return -EINVAL;
 188	if (ulen == 0)
 189		return 0;
 190	if (copy_from_user(kaddr, uaddr, ulen))
 191		return -EFAULT;
 192	return audit_sockaddr(ulen, kaddr);
 193}
 194
 195/**
 196 *	move_addr_to_user	-	copy an address to user space
 197 *	@kaddr: kernel space address
 198 *	@klen: length of address in kernel
 199 *	@uaddr: user space address
 200 *	@ulen: pointer to user length field
 201 *
 202 *	The value pointed to by ulen on entry is the buffer length available.
 203 *	This is overwritten with the buffer space used. -EINVAL is returned
 204 *	if an overlong buffer is specified or a negative buffer size. -EFAULT
 205 *	is returned if either the buffer or the length field are not
 206 *	accessible.
 207 *	After copying the data up to the limit the user specifies, the true
 208 *	length of the data is written over the length limit the user
 209 *	specified. Zero is returned for a success.
 210 */
 211
 212static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
 213			     void __user *uaddr, int __user *ulen)
 214{
 215	int err;
 216	int len;
 217
 218	err = get_user(len, ulen);
 219	if (err)
 220		return err;
 221	if (len > klen)
 222		len = klen;
 223	if (len < 0 || len > sizeof(struct sockaddr_storage))
 224		return -EINVAL;
 225	if (len) {
 226		if (audit_sockaddr(klen, kaddr))
 227			return -ENOMEM;
 228		if (copy_to_user(uaddr, kaddr, len))
 229			return -EFAULT;
 230	}
 231	/*
 232	 *      "fromlen shall refer to the value before truncation.."
 233	 *                      1003.1g
 234	 */
 235	return __put_user(klen, ulen);
 236}
 237
/* Slab cache of struct socket_alloc objects; created by init_inodecache(). */
static struct kmem_cache *sock_inode_cachep __read_mostly;
 239
 240static struct inode *sock_alloc_inode(struct super_block *sb)
 241{
 242	struct socket_alloc *ei;
 243	struct socket_wq *wq;
 244
 245	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
 246	if (!ei)
 247		return NULL;
 248	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
 249	if (!wq) {
 250		kmem_cache_free(sock_inode_cachep, ei);
 251		return NULL;
 252	}
 253	init_waitqueue_head(&wq->wait);
 254	wq->fasync_list = NULL;
 255	RCU_INIT_POINTER(ei->socket.wq, wq);
 256
 257	ei->socket.state = SS_UNCONNECTED;
 258	ei->socket.flags = 0;
 259	ei->socket.ops = NULL;
 260	ei->socket.sk = NULL;
 261	ei->socket.file = NULL;
 262
 263	return &ei->vfs_inode;
 264}
 265
 266static void sock_destroy_inode(struct inode *inode)
 267{
 268	struct socket_alloc *ei;
 269	struct socket_wq *wq;
 270
 271	ei = container_of(inode, struct socket_alloc, vfs_inode);
 272	wq = rcu_dereference_protected(ei->socket.wq, 1);
 273	kfree_rcu(wq, rcu);
 274	kmem_cache_free(sock_inode_cachep, ei);
 275}
 276
 277static void init_once(void *foo)
 278{
 279	struct socket_alloc *ei = (struct socket_alloc *)foo;
 280
 281	inode_init_once(&ei->vfs_inode);
 282}
 283
 284static int init_inodecache(void)
 285{
 286	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
 287					      sizeof(struct socket_alloc),
 288					      0,
 289					      (SLAB_HWCACHE_ALIGN |
 290					       SLAB_RECLAIM_ACCOUNT |
 291					       SLAB_MEM_SPREAD),
 292					      init_once);
 293	if (sock_inode_cachep == NULL)
 294		return -ENOMEM;
 295	return 0;
 296}
 297
/* Superblock operations for sockfs; handed to mount_pseudo() by sockfs_mount(). */
static const struct super_operations sockfs_ops = {
	.alloc_inode	= sock_alloc_inode,
	.destroy_inode	= sock_destroy_inode,
	.statfs		= simple_statfs,
};
 303
 304/*
 305 * sockfs_dname() is called from d_path().
 306 */
 307static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
 308{
 309	return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
 310				dentry->d_inode->i_ino);
 311}
 312
/* Dentry operations for sockfs: only the dynamic name generator is provided. */
static const struct dentry_operations sockfs_dentry_operations = {
	.d_dname  = sockfs_dname,
};
 316
 317static struct dentry *sockfs_mount(struct file_system_type *fs_type,
 318			 int flags, const char *dev_name, void *data)
 319{
 320	return mount_pseudo(fs_type, "socket:", &sockfs_ops,
 321		&sockfs_dentry_operations, SOCKFS_MAGIC);
 322}
 323
/* The sockfs mount; sock_alloc() and sock_alloc_file() take its superblock from here. */
static struct vfsmount *sock_mnt __read_mostly;
 325
/* Filesystem type descriptor for the "sockfs" pseudo filesystem. */
static struct file_system_type sock_fs_type = {
	.name =		"sockfs",
	.mount =	sockfs_mount,
	.kill_sb =	kill_anon_super,
};
 331
 332/*
 333 *	Obtains the first available file descriptor and sets it up for use.
 334 *
  335 *	These functions create file structures and map them to fd space
 336 *	of the current process. On success it returns file descriptor
 337 *	and file struct implicitly stored in sock->file.
 338 *	Note that another thread may close file descriptor before we return
 339 *	from this function. We use the fact that now we do not refer
 340 *	to socket after mapping. If one day we will need it, this
 341 *	function will increment ref. count on file by 1.
 342 *
 343 *	In any case returned fd MAY BE not valid!
 344 *	This race condition is unavoidable
 345 *	with shared fd spaces, we cannot solve it inside kernel,
 346 *	but we take care of internal coherence yet.
 347 */
 348
 349struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
 350{
 351	struct qstr name = { .name = "" };
 352	struct path path;
 353	struct file *file;
 354
 355	if (dname) {
 356		name.name = dname;
 357		name.len = strlen(name.name);
 358	} else if (sock->sk) {
 359		name.name = sock->sk->sk_prot_creator->name;
 360		name.len = strlen(name.name);
 361	}
 362	path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
 363	if (unlikely(!path.dentry))
 364		return ERR_PTR(-ENOMEM);
 365	path.mnt = mntget(sock_mnt);
 366
 367	d_instantiate(path.dentry, SOCK_INODE(sock));
 368	SOCK_INODE(sock)->i_fop = &socket_file_ops;
 369
 370	file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
 371		  &socket_file_ops);
 372	if (unlikely(IS_ERR(file))) {
 373		/* drop dentry, keep inode */
 374		ihold(path.dentry->d_inode);
 375		path_put(&path);
 376		return file;
 377	}
 378
 379	sock->file = file;
 380	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
 381	file->private_data = sock;
 382	return file;
 383}
 384EXPORT_SYMBOL(sock_alloc_file);
 385
 386static int sock_map_fd(struct socket *sock, int flags)
 387{
 388	struct file *newfile;
 389	int fd = get_unused_fd_flags(flags);
 390	if (unlikely(fd < 0))
 391		return fd;
 392
 393	newfile = sock_alloc_file(sock, flags, NULL);
 394	if (likely(!IS_ERR(newfile))) {
 395		fd_install(fd, newfile);
 396		return fd;
 397	}
 398
 399	put_unused_fd(fd);
 400	return PTR_ERR(newfile);
 401}
 402
 403struct socket *sock_from_file(struct file *file, int *err)
 404{
 405	if (file->f_op == &socket_file_ops)
 406		return file->private_data;	/* set in sock_map_fd */
 407
 408	*err = -ENOTSOCK;
 409	return NULL;
 410}
 411EXPORT_SYMBOL(sock_from_file);
 412
 413/**
 414 *	sockfd_lookup - Go from a file number to its socket slot
 415 *	@fd: file handle
 416 *	@err: pointer to an error code return
 417 *
 418 *	The file handle passed in is locked and the socket it is bound
 419 *	too is returned. If an error occurs the err pointer is overwritten
 420 *	with a negative errno code and NULL is returned. The function checks
 421 *	for both invalid handles and passing a handle which is not a socket.
 422 *
 423 *	On a success the socket object pointer is returned.
 424 */
 425
 426struct socket *sockfd_lookup(int fd, int *err)
 427{
 428	struct file *file;
 429	struct socket *sock;
 430
 431	file = fget(fd);
 432	if (!file) {
 433		*err = -EBADF;
 434		return NULL;
 435	}
 436
 437	sock = sock_from_file(file, err);
 438	if (!sock)
 439		fput(file);
 440	return sock;
 441}
 442EXPORT_SYMBOL(sockfd_lookup);
 443
 444static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
 445{
 446	struct file *file;
 447	struct socket *sock;
 448
 449	*err = -EBADF;
 450	file = fget_light(fd, fput_needed);
 451	if (file) {
 452		sock = sock_from_file(file, err);
 453		if (sock)
 454			return sock;
 455		fput_light(file, *fput_needed);
 456	}
 457	return NULL;
 458}
 459
 460#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
 461#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
 462#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)
 463static ssize_t sockfs_getxattr(struct dentry *dentry,
 464			       const char *name, void *value, size_t size)
 465{
 466	const char *proto_name;
 467	size_t proto_size;
 468	int error;
 469
 470	error = -ENODATA;
 471	if (!strncmp(name, XATTR_NAME_SOCKPROTONAME, XATTR_NAME_SOCKPROTONAME_LEN)) {
 472		proto_name = dentry->d_name.name;
 473		proto_size = strlen(proto_name);
 474
 475		if (value) {
 476			error = -ERANGE;
 477			if (proto_size + 1 > size)
 478				goto out;
 479
 480			strncpy(value, proto_name, proto_size + 1);
 481		}
 482		error = proto_size + 1;
 483	}
 484
 485out:
 486	return error;
 487}
 488
 489static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
 490				size_t size)
 491{
 492	ssize_t len;
 493	ssize_t used = 0;
 494
 495	len = security_inode_listsecurity(dentry->d_inode, buffer, size);
 496	if (len < 0)
 497		return len;
 498	used += len;
 499	if (buffer) {
 500		if (size < used)
 501			return -ERANGE;
 502		buffer += len;
 503	}
 504
 505	len = (XATTR_NAME_SOCKPROTONAME_LEN + 1);
 506	used += len;
 507	if (buffer) {
 508		if (size < used)
 509			return -ERANGE;
 510		memcpy(buffer, XATTR_NAME_SOCKPROTONAME, len);
 511		buffer += len;
 512	}
 513
 514	return used;
 515}
 516
/* Inode operations for socket inodes (xattr access only); installed by sock_alloc(). */
static const struct inode_operations sockfs_inode_ops = {
	.getxattr = sockfs_getxattr,
	.listxattr = sockfs_listxattr,
};
 521
 522/**
 523 *	sock_alloc	-	allocate a socket
 524 *
 525 *	Allocate a new inode and socket object. The two are bound together
 526 *	and initialised. The socket is then returned. If we are out of inodes
 527 *	NULL is returned.
 528 */
 529
 530static struct socket *sock_alloc(void)
 531{
 532	struct inode *inode;
 533	struct socket *sock;
 534
 535	inode = new_inode_pseudo(sock_mnt->mnt_sb);
 536	if (!inode)
 537		return NULL;
 538
 539	sock = SOCKET_I(inode);
 540
 541	kmemcheck_annotate_bitfield(sock, type);
 542	inode->i_ino = get_next_ino();
 543	inode->i_mode = S_IFSOCK | S_IRWXUGO;
 544	inode->i_uid = current_fsuid();
 545	inode->i_gid = current_fsgid();
 546	inode->i_op = &sockfs_inode_ops;
 547
 548	this_cpu_add(sockets_in_use, 1);
 549	return sock;
 550}
 551
/*
 *	In theory you can't get an open on this inode, but /proc provides
 *	a back door. Remember to keep it shut otherwise you'll let the
 *	creepy crawlies in.
 *
 *	Used as the .open handler of both socket_file_ops and bad_sock_fops.
 *	Both arguments are ignored; the call always fails with -ENXIO.
 */

static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
{
	return -ENXIO;
}
 562
/*
 * Minimal file operations whose open always fails (sock_no_open) and whose
 * llseek is a no-op.
 * NOTE(review): presumably the default i_fop for socket inodes before a file
 * is attached - confirm against the users of this symbol.
 */
const struct file_operations bad_sock_fops = {
	.owner = THIS_MODULE,
	.open = sock_no_open,
	.llseek = noop_llseek,
};
 568
 569/**
 570 *	sock_release	-	close a socket
 571 *	@sock: socket to close
 572 *
 573 *	The socket is released from the protocol stack if it has a release
 574 *	callback, and the inode is then released if the socket is bound to
 575 *	an inode not a file.
 576 */
 577
 578void sock_release(struct socket *sock)
 579{
 580	if (sock->ops) {
 581		struct module *owner = sock->ops->owner;
 582
 583		sock->ops->release(sock);
 584		sock->ops = NULL;
 585		module_put(owner);
 586	}
 587
 588	if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
 589		printk(KERN_ERR "sock_release: fasync list not empty!\n");
 590
 591	if (test_bit(SOCK_EXTERNALLY_ALLOCATED, &sock->flags))
 592		return;
 593
 594	this_cpu_sub(sockets_in_use, 1);
 595	if (!sock->file) {
 596		iput(SOCK_INODE(sock));
 597		return;
 598	}
 599	sock->file = NULL;
 600}
 601EXPORT_SYMBOL(sock_release);
 602
 603int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags)
 604{
 605	*tx_flags = 0;
 606	if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
 607		*tx_flags |= SKBTX_HW_TSTAMP;
 608	if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 609		*tx_flags |= SKBTX_SW_TSTAMP;
 610	if (sock_flag(sk, SOCK_WIFI_STATUS))
 611		*tx_flags |= SKBTX_WIFI_STATUS;
 612	return 0;
 613}
 614EXPORT_SYMBOL(sock_tx_timestamp);
 615
 616static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock,
 617				       struct msghdr *msg, size_t size)
 618{
 619	struct sock_iocb *si = kiocb_to_siocb(iocb);
 620
 621	si->sock = sock;
 622	si->scm = NULL;
 623	si->msg = msg;
 624	si->size = size;
 625
 626	return sock->ops->sendmsg(iocb, sock, msg, size);
 627}
 628
 629static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 630				 struct msghdr *msg, size_t size)
 631{
 632	int err = security_socket_sendmsg(sock, msg, size);
 633
 634	return err ?: __sock_sendmsg_nosec(iocb, sock, msg, size);
 635}
 636
 637int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 638{
 639	struct kiocb iocb;
 640	struct sock_iocb siocb;
 641	int ret;
 642
 643	init_sync_kiocb(&iocb, NULL);
 644	iocb.private = &siocb;
 645	ret = __sock_sendmsg(&iocb, sock, msg, size);
 646	if (-EIOCBQUEUED == ret)
 647		ret = wait_on_sync_kiocb(&iocb);
 648	return ret;
 649}
 650EXPORT_SYMBOL(sock_sendmsg);
 651
 652static int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, size_t size)
 653{
 654	struct kiocb iocb;
 655	struct sock_iocb siocb;
 656	int ret;
 657
 658	init_sync_kiocb(&iocb, NULL);
 659	iocb.private = &siocb;
 660	ret = __sock_sendmsg_nosec(&iocb, sock, msg, size);
 661	if (-EIOCBQUEUED == ret)
 662		ret = wait_on_sync_kiocb(&iocb);
 663	return ret;
 664}
 665
 666int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
 667		   struct kvec *vec, size_t num, size_t size)
 668{
 669	mm_segment_t oldfs = get_fs();
 670	int result;
 671
 672	set_fs(KERNEL_DS);
 673	/*
 674	 * the following is safe, since for compiler definitions of kvec and
 675	 * iovec are identical, yielding the same in-core layout and alignment
 676	 */
 677	msg->msg_iov = (struct iovec *)vec;
 678	msg->msg_iovlen = num;
 679	result = sock_sendmsg(sock, msg, size);
 680	set_fs(oldfs);
 681	return result;
 682}
 683EXPORT_SYMBOL(kernel_sendmsg);
 684
 685static int ktime2ts(ktime_t kt, struct timespec *ts)
 686{
 687	if (kt.tv64) {
 688		*ts = ktime_to_timespec(kt);
 689		return 1;
 690	} else {
 691		return 0;
 692	}
 693}
 694
/*
 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
 *
 * Attaches receive timestamps for @skb to @msg as control messages:
 *  - with SOCK_RCVTSTAMP set, one software timestamp, as SCM_TIMESTAMP
 *    (struct timeval) or, when SOCK_RCVTSTAMPNS is also set, as
 *    SCM_TIMESTAMPNS (struct timespec);
 *  - an SCM_TIMESTAMPING message carrying three timespecs (software,
 *    hardware converted to system time, raw hardware), emitted only when
 *    at least one of them was filled in.
 */
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
	int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
	struct timespec ts[3];
	int empty = 1;
	struct skb_shared_hwtstamps *shhwtstamps =
		skb_hwtstamps(skb);

	/* Race occurred between timestamp enabling and packet
	   receiving.  Fill in the current time for now. */
	if (need_software_tstamp && skb->tstamp.tv64 == 0)
		__net_timestamp(skb);

	if (need_software_tstamp) {
		if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
			struct timeval tv;
			skb_get_timestamp(skb, &tv);
			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
				 sizeof(tv), &tv);
		} else {
			/* ts[0] is only scratch space here; it is cleared again below. */
			skb_get_timestampns(skb, &ts[0]);
			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
				 sizeof(ts[0]), &ts[0]);
		}
	}


	/* Slots that are not filled in below are reported as zero. */
	memset(ts, 0, sizeof(ts));
	if (skb->tstamp.tv64 &&
	    sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) {
		skb_get_timestampns(skb, ts + 0);
		empty = 0;
	}
	if (shhwtstamps) {
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE) &&
		    ktime2ts(shhwtstamps->syststamp, ts + 1))
			empty = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE) &&
		    ktime2ts(shhwtstamps->hwtstamp, ts + 2))
			empty = 0;
	}
	if (!empty)
		put_cmsg(msg, SOL_SOCKET,
			 SCM_TIMESTAMPING, sizeof(ts), &ts);
}
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
 745
 746void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
 747	struct sk_buff *skb)
 748{
 749	int ack;
 750
 751	if (!sock_flag(sk, SOCK_WIFI_STATUS))
 752		return;
 753	if (!skb->wifi_acked_valid)
 754		return;
 755
 756	ack = skb->wifi_acked;
 757
 758	put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack);
 759}
 760EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);
 761
 762static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
 763				   struct sk_buff *skb)
 764{
 765	if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount)
 766		put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
 767			sizeof(__u32), &skb->dropcount);
 768}
 769
/*
 * Attach both the receive timestamp and the drop-count control messages
 * for @skb to @msg, in that order.
 */
void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
	sock_recv_timestamp(msg, sk, skb);
	sock_recv_drops(msg, sk, skb);
}
EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
 777
 778static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
 779				       struct msghdr *msg, size_t size, int flags)
 780{
 781	struct sock_iocb *si = kiocb_to_siocb(iocb);
 782
 783	si->sock = sock;
 784	si->scm = NULL;
 785	si->msg = msg;
 786	si->size = size;
 787	si->flags = flags;
 788
 789	return sock->ops->recvmsg(iocb, sock, msg, size, flags);
 790}
 791
 792static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 793				 struct msghdr *msg, size_t size, int flags)
 794{
 795	int err = security_socket_recvmsg(sock, msg, size, flags);
 796
 797	return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
 798}
 799
 800int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 801		 size_t size, int flags)
 802{
 803	struct kiocb iocb;
 804	struct sock_iocb siocb;
 805	int ret;
 806
 807	init_sync_kiocb(&iocb, NULL);
 808	iocb.private = &siocb;
 809	ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
 810	if (-EIOCBQUEUED == ret)
 811		ret = wait_on_sync_kiocb(&iocb);
 812	return ret;
 813}
 814EXPORT_SYMBOL(sock_recvmsg);
 815
 816static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
 817			      size_t size, int flags)
 818{
 819	struct kiocb iocb;
 820	struct sock_iocb siocb;
 821	int ret;
 822
 823	init_sync_kiocb(&iocb, NULL);
 824	iocb.private = &siocb;
 825	ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
 826	if (-EIOCBQUEUED == ret)
 827		ret = wait_on_sync_kiocb(&iocb);
 828	return ret;
 829}
 830
 831/**
 832 * kernel_recvmsg - Receive a message from a socket (kernel space)
 833 * @sock:       The socket to receive the message from
 834 * @msg:        Received message
 835 * @vec:        Input s/g array for message data
 836 * @num:        Size of input s/g array
 837 * @size:       Number of bytes to read
 838 * @flags:      Message flags (MSG_DONTWAIT, etc...)
 839 *
 840 * On return the msg structure contains the scatter/gather array passed in the
 841 * vec argument. The array is modified so that it consists of the unfilled
 842 * portion of the original array.
 843 *
 844 * The returned value is the total number of bytes received, or an error.
 845 */
 846int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 847		   struct kvec *vec, size_t num, size_t size, int flags)
 848{
 849	mm_segment_t oldfs = get_fs();
 850	int result;
 851
 852	set_fs(KERNEL_DS);
 853	/*
 854	 * the following is safe, since for compiler definitions of kvec and
 855	 * iovec are identical, yielding the same in-core layout and alignment
 856	 */
 857	msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num;
 858	result = sock_recvmsg(sock, msg, size, flags);
 859	set_fs(oldfs);
 860	return result;
 861}
 862EXPORT_SYMBOL(kernel_recvmsg);
 863
 864static void sock_aio_dtor(struct kiocb *iocb)
 865{
 866	kfree(iocb->private);
 867}
 868
 869static ssize_t sock_sendpage(struct file *file, struct page *page,
 870			     int offset, size_t size, loff_t *ppos, int more)
 871{
 872	struct socket *sock;
 873	int flags;
 874
 875	sock = file->private_data;
 876
 877	flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 878	/* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
 879	flags |= more;
 880
 881	return kernel_sendpage(sock, page, offset, size, flags);
 882}
 883
 884static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
 885				struct pipe_inode_info *pipe, size_t len,
 886				unsigned int flags)
 887{
 888	struct socket *sock = file->private_data;
 889
 890	if (unlikely(!sock->ops->splice_read))
 891		return -EINVAL;
 892
 893	return sock->ops->splice_read(sock, ppos, pipe, len, flags);
 894}
 895
 896static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
 897					 struct sock_iocb *siocb)
 898{
 899	if (!is_sync_kiocb(iocb)) {
 900		siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
 901		if (!siocb)
 902			return NULL;
 903		iocb->ki_dtor = sock_aio_dtor;
 904	}
 905
 906	siocb->kiocb = iocb;
 907	iocb->private = siocb;
 908	return siocb;
 909}
 910
 911static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
 912		struct file *file, const struct iovec *iov,
 913		unsigned long nr_segs)
 914{
 915	struct socket *sock = file->private_data;
 916	size_t size = 0;
 917	int i;
 918
 919	for (i = 0; i < nr_segs; i++)
 920		size += iov[i].iov_len;
 921
 922	msg->msg_name = NULL;
 923	msg->msg_namelen = 0;
 924	msg->msg_control = NULL;
 925	msg->msg_controllen = 0;
 926	msg->msg_iov = (struct iovec *)iov;
 927	msg->msg_iovlen = nr_segs;
 928	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 929
 930	return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
 931}
 932
 933static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 934				unsigned long nr_segs, loff_t pos)
 935{
 936	struct sock_iocb siocb, *x;
 937
 938	if (pos != 0)
 939		return -ESPIPE;
 940
 941	if (iocb->ki_left == 0)	/* Match SYS5 behaviour */
 942		return 0;
 943
 944
 945	x = alloc_sock_iocb(iocb, &siocb);
 946	if (!x)
 947		return -ENOMEM;
 948	return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
 949}
 950
 951static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
 952			struct file *file, const struct iovec *iov,
 953			unsigned long nr_segs)
 954{
 955	struct socket *sock = file->private_data;
 956	size_t size = 0;
 957	int i;
 958
 959	for (i = 0; i < nr_segs; i++)
 960		size += iov[i].iov_len;
 961
 962	msg->msg_name = NULL;
 963	msg->msg_namelen = 0;
 964	msg->msg_control = NULL;
 965	msg->msg_controllen = 0;
 966	msg->msg_iov = (struct iovec *)iov;
 967	msg->msg_iovlen = nr_segs;
 968	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 969	if (sock->type == SOCK_SEQPACKET)
 970		msg->msg_flags |= MSG_EOR;
 971
 972	return __sock_sendmsg(iocb, sock, msg, size);
 973}
 974
 975static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
 976			  unsigned long nr_segs, loff_t pos)
 977{
 978	struct sock_iocb siocb, *x;
 979
 980	if (pos != 0)
 981		return -ESPIPE;
 982
 983	x = alloc_sock_iocb(iocb, &siocb);
 984	if (!x)
 985		return -ENOMEM;
 986
 987	return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
 988}
 989
 990/*
 991 * Atomic setting of ioctl hooks to avoid race
 992 * with module unload.
 993 */
 994
/* Bridge ioctl hook; sock_ioctl() invokes it with br_ioctl_mutex held. */
static DEFINE_MUTEX(br_ioctl_mutex);
static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);

/*
 * brioctl_set - install (or replace) the bridge ioctl hook
 * @hook: new handler, stored under br_ioctl_mutex
 */
void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
{
	mutex_lock(&br_ioctl_mutex);
	br_ioctl_hook = hook;
	mutex_unlock(&br_ioctl_mutex);
}
EXPORT_SYMBOL(brioctl_set);
1005
/* VLAN ioctl hook; sock_ioctl() invokes it with vlan_ioctl_mutex held. */
static DEFINE_MUTEX(vlan_ioctl_mutex);
static int (*vlan_ioctl_hook) (struct net *, void __user *arg);

/*
 * vlan_ioctl_set - install (or replace) the VLAN ioctl hook
 * @hook: new handler, stored under vlan_ioctl_mutex
 */
void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
{
	mutex_lock(&vlan_ioctl_mutex);
	vlan_ioctl_hook = hook;
	mutex_unlock(&vlan_ioctl_mutex);
}
EXPORT_SYMBOL(vlan_ioctl_set);
1016
/* DLCI ioctl hook; sock_ioctl() invokes it with dlci_ioctl_mutex held. */
static DEFINE_MUTEX(dlci_ioctl_mutex);
static int (*dlci_ioctl_hook) (unsigned int, void __user *);

/*
 * dlci_ioctl_set - install (or replace) the DLCI ioctl hook
 * @hook: new handler, stored under dlci_ioctl_mutex
 */
void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
{
	mutex_lock(&dlci_ioctl_mutex);
	dlci_ioctl_hook = hook;
	mutex_unlock(&dlci_ioctl_mutex);
}
EXPORT_SYMBOL(dlci_ioctl_set);
1027
1028static long sock_do_ioctl(struct net *net, struct socket *sock,
1029				 unsigned int cmd, unsigned long arg)
1030{
1031	int err;
1032	void __user *argp = (void __user *)arg;
1033
1034	err = sock->ops->ioctl(sock, cmd, arg);
1035
1036	/*
1037	 * If this ioctl is unknown try to hand it down
1038	 * to the NIC driver.
1039	 */
1040	if (err == -ENOIOCTLCMD)
1041		err = dev_ioctl(net, cmd, argp);
1042
1043	return err;
1044}
1045
/*
 *	With an ioctl, arg may well be a user mode pointer, but we don't know
 *	what to do with it - that's up to the protocol still.
 *
 *	Dispatch order: device-private and (with CONFIG_WEXT_CORE) wireless
 *	extension command ranges go straight to dev_ioctl(); owner get/set,
 *	bridge, VLAN and DLCI commands are handled here; everything else is
 *	passed to sock_do_ioctl().
 */

static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct socket *sock;
	struct sock *sk;
	void __user *argp = (void __user *)arg;
	int pid, err;
	struct net *net;

	sock = file->private_data;
	sk = sock->sk;
	net = sock_net(sk);
	/* The 16 device-private commands are forwarded to dev_ioctl(). */
	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
		err = dev_ioctl(net, cmd, argp);
	} else
#ifdef CONFIG_WEXT_CORE
	/* So is the wireless extensions command range. */
	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
		err = dev_ioctl(net, cmd, argp);
	} else
#endif
		switch (cmd) {
		case FIOSETOWN:
		case SIOCSPGRP:
			/* Set the owner of the socket's file from a user-supplied int. */
			err = -EFAULT;
			if (get_user(pid, (int __user *)argp))
				break;
			err = f_setown(sock->file, pid, 1);
			break;
		case FIOGETOWN:
		case SIOCGPGRP:
			/* Report the current owner back to user space. */
			err = put_user(f_getown(sock->file),
				       (int __user *)argp);
			break;
		case SIOCGIFBR:
		case SIOCSIFBR:
		case SIOCBRADDBR:
		case SIOCBRDELBR:
			/*
			 * Bridge commands: try to load the module if no hook is
			 * registered, then call the hook under its mutex. err
			 * stays -ENOPKG when the hook is still missing.
			 */
			err = -ENOPKG;
			if (!br_ioctl_hook)
				request_module("bridge");

			mutex_lock(&br_ioctl_mutex);
			if (br_ioctl_hook)
				err = br_ioctl_hook(net, cmd, argp);
			mutex_unlock(&br_ioctl_mutex);
			break;
		case SIOCGIFVLAN:
		case SIOCSIFVLAN:
			/* VLAN commands: same pattern with the 8021q module. */
			err = -ENOPKG;
			if (!vlan_ioctl_hook)
				request_module("8021q");

			mutex_lock(&vlan_ioctl_mutex);
			if (vlan_ioctl_hook)
				err = vlan_ioctl_hook(net, argp);
			mutex_unlock(&vlan_ioctl_mutex);
			break;
		case SIOCADDDLCI:
		case SIOCDELDLCI:
			/* DLCI commands: same pattern with the dlci module. */
			err = -ENOPKG;
			if (!dlci_ioctl_hook)
				request_module("dlci");

			mutex_lock(&dlci_ioctl_mutex);
			if (dlci_ioctl_hook)
				err = dlci_ioctl_hook(cmd, argp);
			mutex_unlock(&dlci_ioctl_mutex);
			break;
		default:
			/* Protocol first, then the device layer as a fallback. */
			err = sock_do_ioctl(net, sock, cmd, arg);
			break;
		}
	return err;
}
1124
1125int sock_create_lite(int family, int type, int protocol, struct socket **res)
1126{
1127	int err;
1128	struct socket *sock = NULL;
1129
1130	err = security_socket_create(family, type, protocol, 1);
1131	if (err)
1132		goto out;
1133
1134	sock = sock_alloc();
1135	if (!sock) {
1136		err = -ENOMEM;
1137		goto out;
1138	}
1139
1140	sock->type = type;
1141	err = security_socket_post_create(sock, family, type, protocol, 1);
1142	if (err)
1143		goto out_release;
1144
1145out:
1146	*res = sock;
1147	return err;
1148out_release:
1149	sock_release(sock);
1150	sock = NULL;
1151	goto out;
1152}
1153EXPORT_SYMBOL(sock_create_lite);
1154
1155/* No kernel lock held - perfect */
1156static unsigned int sock_poll(struct file *file, poll_table *wait)
1157{
1158	struct socket *sock;
1159
1160	/*
1161	 *      We can't return errors to poll, so it's either yes or no.
1162	 */
1163	sock = file->private_data;
1164	return sock->ops->poll(file, sock, wait);
1165}
1166
1167static int sock_mmap(struct file *file, struct vm_area_struct *vma)
1168{
1169	struct socket *sock = file->private_data;
1170
1171	return sock->ops->mmap(file, sock, vma);
1172}
1173
1174static int sock_close(struct inode *inode, struct file *filp)
1175{
1176	/*
1177	 *      It was possible the inode is NULL we were
1178	 *      closing an unfinished socket.
1179	 */
1180
1181	if (!inode) {
1182		printk(KERN_DEBUG "sock_close: NULL inode\n");
1183		return 0;
1184	}
1185	sock_release(SOCKET_I(inode));
1186	return 0;
1187}
1188
1189/*
1190 *	Update the socket async list
1191 *
1192 *	Fasync_list locking strategy.
1193 *
1194 *	1. fasync_list is modified only under process context socket lock
1195 *	   i.e. under semaphore.
1196 *	2. fasync_list is used under read_lock(&sk->sk_callback_lock)
1197 *	   or under socket lock
1198 */
1199
1200static int sock_fasync(int fd, struct file *filp, int on)
1201{
1202	struct socket *sock = filp->private_data;
1203	struct sock *sk = sock->sk;
1204	struct socket_wq *wq;
1205
1206	if (sk == NULL)
1207		return -EINVAL;
1208
1209	lock_sock(sk);
1210	wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));
1211	fasync_helper(fd, filp, on, &wq->fasync_list);
1212
1213	if (!wq->fasync_list)
1214		sock_reset_flag(sk, SOCK_FASYNC);
1215	else
1216		sock_set_flag(sk, SOCK_FASYNC);
1217
1218	release_sock(sk);
1219	return 0;
1220}
1221
1222/* This function may be called only under socket lock or callback_lock or rcu_lock */
1223
1224int sock_wake_async(struct socket *sock, int how, int band)
1225{
1226	struct socket_wq *wq;
1227
1228	if (!sock)
1229		return -1;
1230	rcu_read_lock();
1231	wq = rcu_dereference(sock->wq);
1232	if (!wq || !wq->fasync_list) {
1233		rcu_read_unlock();
1234		return -1;
1235	}
1236	switch (how) {
1237	case SOCK_WAKE_WAITD:
1238		if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
1239			break;
1240		goto call_kill;
1241	case SOCK_WAKE_SPACE:
1242		if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
1243			break;
1244		/* fall through */
1245	case SOCK_WAKE_IO:
1246call_kill:
1247		kill_fasync(&wq->fasync_list, SIGIO, band);
1248		break;
1249	case SOCK_WAKE_URG:
1250		kill_fasync(&wq->fasync_list, SIGURG, band);
1251	}
1252	rcu_read_unlock();
1253	return 0;
1254}
1255EXPORT_SYMBOL(sock_wake_async);
1256
1257int __sock_create(struct net *net, int family, int type, int protocol,
1258			 struct socket **res, int kern)
1259{
1260	int err;
1261	struct socket *sock;
1262	const struct net_proto_family *pf;
1263
1264	/*
1265	 *      Check protocol is in range
1266	 */
1267	if (family < 0 || family >= NPROTO)
1268		return -EAFNOSUPPORT;
1269	if (type < 0 || type >= SOCK_MAX)
1270		return -EINVAL;
1271
1272	/* Compatibility.
1273
1274	   This uglymoron is moved from INET layer to here to avoid
1275	   deadlock in module load.
1276	 */
1277	if (family == PF_INET && type == SOCK_PACKET) {
1278		static int warned;
1279		if (!warned) {
1280			warned = 1;
1281			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
1282			       current->comm);
1283		}
1284		family = PF_PACKET;
1285	}
1286
1287	err = security_socket_create(family, type, protocol, kern);
1288	if (err)
1289		return err;
1290
1291	/*
1292	 *	Allocate the socket and allow the family to set things up. if
1293	 *	the protocol is 0, the family is instructed to select an appropriate
1294	 *	default.
1295	 */
1296	sock = sock_alloc();
1297	if (!sock) {
1298		net_warn_ratelimited("socket: no more sockets\n");
1299		return -ENFILE;	/* Not exactly a match, but its the
1300				   closest posix thing */
1301	}
1302
1303	sock->type = type;
1304
1305#ifdef CONFIG_MODULES
1306	/* Attempt to load a protocol module if the find failed.
1307	 *
1308	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
1309	 * requested real, full-featured networking support upon configuration.
1310	 * Otherwise module support will break!
1311	 */
1312	if (rcu_access_pointer(net_families[family]) == NULL)
1313		request_module("net-pf-%d", family);
1314#endif
1315
1316	rcu_read_lock();
1317	pf = rcu_dereference(net_families[family]);
1318	err = -EAFNOSUPPORT;
1319	if (!pf)
1320		goto out_release;
1321
1322	/*
1323	 * We will call the ->create function, that possibly is in a loadable
1324	 * module, so we have to bump that loadable module refcnt first.
1325	 */
1326	if (!try_module_get(pf->owner))
1327		goto out_release;
1328
1329	/* Now protected by module ref count */
1330	rcu_read_unlock();
1331
1332	err = pf->create(net, sock, protocol, kern);
1333	if (err < 0)
1334		goto out_module_put;
1335
1336	/*
1337	 * Now to bump the refcnt of the [loadable] module that owns this
1338	 * socket at sock_release time we decrement its refcnt.
1339	 */
1340	if (!try_module_get(sock->ops->owner))
1341		goto out_module_busy;
1342
1343	/*
1344	 * Now that we're done with the ->create function, the [loadable]
1345	 * module can have its refcnt decremented
1346	 */
1347	module_put(pf->owner);
1348	err = security_socket_post_create(sock, family, type, protocol, kern);
1349	if (err)
1350		goto out_sock_release;
1351	*res = sock;
1352
1353	return 0;
1354
1355out_module_busy:
1356	err = -EAFNOSUPPORT;
1357out_module_put:
1358	sock->ops = NULL;
1359	module_put(pf->owner);
1360out_sock_release:
1361	sock_release(sock);
1362	return err;
1363
1364out_release:
1365	rcu_read_unlock();
1366	goto out_sock_release;
1367}
1368EXPORT_SYMBOL(__sock_create);
1369
1370int sock_create(int family, int type, int protocol, struct socket **res)
1371{
1372	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
1373}
1374EXPORT_SYMBOL(sock_create);
1375
1376int sock_create_kern(int family, int type, int protocol, struct socket **res)
1377{
1378	return __sock_create(&init_net, family, type, protocol, res, 1);
1379}
1380EXPORT_SYMBOL(sock_create_kern);
1381
1382SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
1383{
1384	int retval;
1385	struct socket *sock;
1386	int flags;
1387
1388	/* Check the SOCK_* constants for consistency.  */
1389	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
1390	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
1391	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
1392	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
1393
1394	flags = type & ~SOCK_TYPE_MASK;
1395	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1396		return -EINVAL;
1397	type &= SOCK_TYPE_MASK;
1398
1399	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
1400		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1401
1402	retval = sock_create(family, type, protocol, &sock);
1403	if (retval < 0)
1404		goto out;
1405
1406	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
1407	if (retval < 0)
1408		goto out_release;
1409
1410out:
1411	/* It may be already another descriptor 8) Not kernel problem. */
1412	return retval;
1413
1414out_release:
1415	sock_release(sock);
1416	return retval;
1417}
1418
1419/*
1420 *	Create a pair of connected sockets.
1421 */
1422
1423SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
1424		int __user *, usockvec)
1425{
1426	struct socket *sock1, *sock2;
1427	int fd1, fd2, err;
1428	struct file *newfile1, *newfile2;
1429	int flags;
1430
1431	flags = type & ~SOCK_TYPE_MASK;
1432	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1433		return -EINVAL;
1434	type &= SOCK_TYPE_MASK;
1435
1436	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
1437		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1438
1439	/*
1440	 * Obtain the first socket and check if the underlying protocol
1441	 * supports the socketpair call.
1442	 */
1443
1444	err = sock_create(family, type, protocol, &sock1);
1445	if (err < 0)
1446		goto out;
1447
1448	err = sock_create(family, type, protocol, &sock2);
1449	if (err < 0)
1450		goto out_release_1;
1451
1452	err = sock1->ops->socketpair(sock1, sock2);
1453	if (err < 0)
1454		goto out_release_both;
1455
1456	fd1 = get_unused_fd_flags(flags);
1457	if (unlikely(fd1 < 0)) {
1458		err = fd1;
1459		goto out_release_both;
1460	}
1461	fd2 = get_unused_fd_flags(flags);
1462	if (unlikely(fd2 < 0)) {
1463		err = fd2;
1464		put_unused_fd(fd1);
1465		goto out_release_both;
1466	}
1467
1468	newfile1 = sock_alloc_file(sock1, flags, NULL);
1469	if (unlikely(IS_ERR(newfile1))) {
1470		err = PTR_ERR(newfile1);
1471		put_unused_fd(fd1);
1472		put_unused_fd(fd2);
1473		goto out_release_both;
1474	}
1475
1476	newfile2 = sock_alloc_file(sock2, flags, NULL);
1477	if (IS_ERR(newfile2)) {
1478		err = PTR_ERR(newfile2);
1479		fput(newfile1);
1480		put_unused_fd(fd1);
1481		put_unused_fd(fd2);
1482		sock_release(sock2);
1483		goto out;
1484	}
1485
1486	audit_fd_pair(fd1, fd2);
1487	fd_install(fd1, newfile1);
1488	fd_install(fd2, newfile2);
1489	/* fd1 and fd2 may be already another descriptors.
1490	 * Not kernel problem.
1491	 */
1492
1493	err = put_user(fd1, &usockvec[0]);
1494	if (!err)
1495		err = put_user(fd2, &usockvec[1]);
1496	if (!err)
1497		return 0;
1498
1499	sys_close(fd2);
1500	sys_close(fd1);
1501	return err;
1502
1503out_release_both:
1504	sock_release(sock2);
1505out_release_1:
1506	sock_release(sock1);
1507out:
1508	return err;
1509}
1510
1511/*
1512 *	Bind a name to a socket. Nothing much to do here since it's
1513 *	the protocol's responsibility to handle the local address.
1514 *
1515 *	We move the socket address to kernel space before we call
1516 *	the protocol layer (having also checked the address is ok).
1517 */
1518
1519SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
1520{
1521	struct socket *sock;
1522	struct sockaddr_storage address;
1523	int err, fput_needed;
1524
1525	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1526	if (sock) {
1527		err = move_addr_to_kernel(umyaddr, addrlen, &address);
1528		if (err >= 0) {
1529			err = security_socket_bind(sock,
1530						   (struct sockaddr *)&address,
1531						   addrlen);
1532			if (!err)
1533				err = sock->ops->bind(sock,
1534						      (struct sockaddr *)
1535						      &address, addrlen);
1536		}
1537		fput_light(sock->file, fput_needed);
1538	}
1539	return err;
1540}
1541
1542/*
1543 *	Perform a listen. Basically, we allow the protocol to do anything
1544 *	necessary for a listen, and if that works, we mark the socket as
1545 *	ready for listening.
1546 */
1547
1548SYSCALL_DEFINE2(listen, int, fd, int, backlog)
1549{
1550	struct socket *sock;
1551	int err, fput_needed;
1552	int somaxconn;
1553
1554	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1555	if (sock) {
1556		somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
1557		if ((unsigned int)backlog > somaxconn)
1558			backlog = somaxconn;
1559
1560		err = security_socket_listen(sock, backlog);
1561		if (!err)
1562			err = sock->ops->listen(sock, backlog);
1563
1564		fput_light(sock->file, fput_needed);
1565	}
1566	return err;
1567}
1568
1569/*
1570 *	For accept, we attempt to create a new socket, set up the link
1571 *	with the client, wake up the client, then return the new
1572 *	connected fd. We collect the address of the connector in kernel
1573 *	space and move it to user at the very end. This is unclean because
1574 *	we open the socket then return an error.
1575 *
1576 *	1003.1g adds the ability to recvmsg() to query connection pending
1577 *	status to recvmsg. We need to add that support in a way thats
1578 *	clean when we restucture accept also.
1579 */
1580
1581SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
1582		int __user *, upeer_addrlen, int, flags)
1583{
1584	struct socket *sock, *newsock;
1585	struct file *newfile;
1586	int err, len, newfd, fput_needed;
1587	struct sockaddr_storage address;
1588
1589	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1590		return -EINVAL;
1591
1592	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
1593		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1594
1595	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1596	if (!sock)
1597		goto out;
1598
1599	err = -ENFILE;
1600	newsock = sock_alloc();
1601	if (!newsock)
1602		goto out_put;
1603
1604	newsock->type = sock->type;
1605	newsock->ops = sock->ops;
1606
1607	/*
1608	 * We don't need try_module_get here, as the listening socket (sock)
1609	 * has the protocol module (sock->ops->owner) held.
1610	 */
1611	__module_get(newsock->ops->owner);
1612
1613	newfd = get_unused_fd_flags(flags);
1614	if (unlikely(newfd < 0)) {
1615		err = newfd;
1616		sock_release(newsock);
1617		goto out_put;
1618	}
1619	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
1620	if (unlikely(IS_ERR(newfile))) {
1621		err = PTR_ERR(newfile);
1622		put_unused_fd(newfd);
1623		sock_release(newsock);
1624		goto out_put;
1625	}
1626
1627	err = security_socket_accept(sock, newsock);
1628	if (err)
1629		goto out_fd;
1630
1631	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
1632	if (err < 0)
1633		goto out_fd;
1634
1635	if (upeer_sockaddr) {
1636		if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
1637					  &len, 2) < 0) {
1638			err = -ECONNABORTED;
1639			goto out_fd;
1640		}
1641		err = move_addr_to_user(&address,
1642					len, upeer_sockaddr, upeer_addrlen);
1643		if (err < 0)
1644			goto out_fd;
1645	}
1646
1647	/* File flags are not inherited via accept() unlike another OSes. */
1648
1649	fd_install(newfd, newfile);
1650	err = newfd;
1651
1652out_put:
1653	fput_light(sock->file, fput_needed);
1654out:
1655	return err;
1656out_fd:
1657	fput(newfile);
1658	put_unused_fd(newfd);
1659	goto out_put;
1660}
1661
1662SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
1663		int __user *, upeer_addrlen)
1664{
1665	return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
1666}
1667
1668/*
1669 *	Attempt to connect to a socket with the server address.  The address
1670 *	is in user space so we verify it is OK and move it to kernel space.
1671 *
1672 *	For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
1673 *	break bindings
1674 *
1675 *	NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
1676 *	other SEQPACKET protocols that take time to connect() as it doesn't
1677 *	include the -EINPROGRESS status for such sockets.
1678 */
1679
1680SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
1681		int, addrlen)
1682{
1683	struct socket *sock;
1684	struct sockaddr_storage address;
1685	int err, fput_needed;
1686
1687	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1688	if (!sock)
1689		goto out;
1690	err = move_addr_to_kernel(uservaddr, addrlen, &address);
1691	if (err < 0)
1692		goto out_put;
1693
1694	err =
1695	    security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
1696	if (err)
1697		goto out_put;
1698
1699	err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
1700				 sock->file->f_flags);
1701out_put:
1702	fput_light(sock->file, fput_needed);
1703out:
1704	return err;
1705}
1706
1707/*
1708 *	Get the local address ('name') of a socket object. Move the obtained
1709 *	name to user space.
1710 */
1711
1712SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
1713		int __user *, usockaddr_len)
1714{
1715	struct socket *sock;
1716	struct sockaddr_storage address;
1717	int len, err, fput_needed;
1718
1719	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1720	if (!sock)
1721		goto out;
1722
1723	err = security_socket_getsockname(sock);
1724	if (err)
1725		goto out_put;
1726
1727	err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0);
1728	if (err)
1729		goto out_put;
1730	err = move_addr_to_user(&address, len, usockaddr, usockaddr_len);
1731
1732out_put:
1733	fput_light(sock->file, fput_needed);
1734out:
1735	return err;
1736}
1737
1738/*
1739 *	Get the remote address ('name') of a socket object. Move the obtained
1740 *	name to user space.
1741 */
1742
1743SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
1744		int __user *, usockaddr_len)
1745{
1746	struct socket *sock;
1747	struct sockaddr_storage address;
1748	int len, err, fput_needed;
1749
1750	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1751	if (sock != NULL) {
1752		err = security_socket_getpeername(sock);
1753		if (err) {
1754			fput_light(sock->file, fput_needed);
1755			return err;
1756		}
1757
1758		err =
1759		    sock->ops->getname(sock, (struct sockaddr *)&address, &len,
1760				       1);
1761		if (!err)
1762			err = move_addr_to_user(&address, len, usockaddr,
1763						usockaddr_len);
1764		fput_light(sock->file, fput_needed);
1765	}
1766	return err;
1767}
1768
1769/*
1770 *	Send a datagram to a given address. We move the address into kernel
1771 *	space and check the user space data area is readable before invoking
1772 *	the protocol.
1773 */
1774
1775SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
1776		unsigned int, flags, struct sockaddr __user *, addr,
1777		int, addr_len)
1778{
1779	struct socket *sock;
1780	struct sockaddr_storage address;
1781	int err;
1782	struct msghdr msg;
1783	struct iovec iov;
1784	int fput_needed;
1785
1786	if (len > INT_MAX)
1787		len = INT_MAX;
1788	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1789	if (!sock)
1790		goto out;
1791
1792	iov.iov_base = buff;
1793	iov.iov_len = len;
1794	msg.msg_name = NULL;
1795	msg.msg_iov = &iov;
1796	msg.msg_iovlen = 1;
1797	msg.msg_control = NULL;
1798	msg.msg_controllen = 0;
1799	msg.msg_namelen = 0;
1800	if (addr) {
1801		err = move_addr_to_kernel(addr, addr_len, &address);
1802		if (err < 0)
1803			goto out_put;
1804		msg.msg_name = (struct sockaddr *)&address;
1805		msg.msg_namelen = addr_len;
1806	}
1807	if (sock->file->f_flags & O_NONBLOCK)
1808		flags |= MSG_DONTWAIT;
1809	msg.msg_flags = flags;
1810	err = sock_sendmsg(sock, &msg, len);
1811
1812out_put:
1813	fput_light(sock->file, fput_needed);
1814out:
1815	return err;
1816}
1817
1818/*
1819 *	Send a datagram down a socket.
1820 */
1821
1822SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
1823		unsigned int, flags)
1824{
1825	return sys_sendto(fd, buff, len, flags, NULL, 0);
1826}
1827
1828/*
1829 *	Receive a frame from the socket and optionally record the address of the
1830 *	sender. We verify the buffers are writable and if needed move the
1831 *	sender address from kernel to user space.
1832 */
1833
1834SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
1835		unsigned int, flags, struct sockaddr __user *, addr,
1836		int __user *, addr_len)
1837{
1838	struct socket *sock;
1839	struct iovec iov;
1840	struct msghdr msg;
1841	struct sockaddr_storage address;
1842	int err, err2;
1843	int fput_needed;
1844
1845	if (size > INT_MAX)
1846		size = INT_MAX;
1847	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1848	if (!sock)
1849		goto out;
1850
1851	msg.msg_control = NULL;
1852	msg.msg_controllen = 0;
1853	msg.msg_iovlen = 1;
1854	msg.msg_iov = &iov;
1855	iov.iov_len = size;
1856	iov.iov_base = ubuf;
1857	msg.msg_name = (struct sockaddr *)&address;
1858	msg.msg_namelen = sizeof(address);
1859	if (sock->file->f_flags & O_NONBLOCK)
1860		flags |= MSG_DONTWAIT;
1861	err = sock_recvmsg(sock, &msg, size, flags);
1862
1863	if (err >= 0 && addr != NULL) {
1864		err2 = move_addr_to_user(&address,
1865					 msg.msg_namelen, addr, addr_len);
1866		if (err2 < 0)
1867			err = err2;
1868	}
1869
1870	fput_light(sock->file, fput_needed);
1871out:
1872	return err;
1873}
1874
1875/*
1876 *	Receive a datagram from a socket.
1877 */
1878
1879asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size,
1880			 unsigned int flags)
1881{
1882	return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
1883}
1884
1885/*
1886 *	Set a socket option. Because we don't know the option lengths we have
1887 *	to pass the user mode parameter for the protocols to sort out.
1888 */
1889
1890SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
1891		char __user *, optval, int, optlen)
1892{
1893	int err, fput_needed;
1894	struct socket *sock;
1895
1896	if (optlen < 0)
1897		return -EINVAL;
1898
1899	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1900	if (sock != NULL) {
1901		err = security_socket_setsockopt(sock, level, optname);
1902		if (err)
1903			goto out_put;
1904
1905		if (level == SOL_SOCKET)
1906			err =
1907			    sock_setsockopt(sock, level, optname, optval,
1908					    optlen);
1909		else
1910			err =
1911			    sock->ops->setsockopt(sock, level, optname, optval,
1912						  optlen);
1913out_put:
1914		fput_light(sock->file, fput_needed);
1915	}
1916	return err;
1917}
1918
1919/*
1920 *	Get a socket option. Because we don't know the option lengths we have
1921 *	to pass a user mode parameter for the protocols to sort out.
1922 */
1923
1924SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
1925		char __user *, optval, int __user *, optlen)
1926{
1927	int err, fput_needed;
1928	struct socket *sock;
1929
1930	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1931	if (sock != NULL) {
1932		err = security_socket_getsockopt(sock, level, optname);
1933		if (err)
1934			goto out_put;
1935
1936		if (level == SOL_SOCKET)
1937			err =
1938			    sock_getsockopt(sock, level, optname, optval,
1939					    optlen);
1940		else
1941			err =
1942			    sock->ops->getsockopt(sock, level, optname, optval,
1943						  optlen);
1944out_put:
1945		fput_light(sock->file, fput_needed);
1946	}
1947	return err;
1948}
1949
1950/*
1951 *	Shutdown a socket.
1952 */
1953
1954SYSCALL_DEFINE2(shutdown, int, fd, int, how)
1955{
1956	int err, fput_needed;
1957	struct socket *sock;
1958
1959	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1960	if (sock != NULL) {
1961		err = security_socket_shutdown(sock, how);
1962		if (!err)
1963			err = sock->ops->shutdown(sock, how);
1964		fput_light(sock->file, fput_needed);
1965	}
1966	return err;
1967}
1968
/*
 * Helper macros yielding the address of a msghdr field in either the
 * native or the compat layout; the fields concerned have the same type
 * (int / unsigned) in both on our platforms.
 *
 * These macros are not self-contained: the expansion reads a variable
 * named `flags` and, for an argument `msg`, a second variable named
 * `msg_compat`, both of which must be in scope at the point of use.
 */
#define COMPAT_MSG(msg, member) \
	((flags & MSG_CMSG_COMPAT) ? &msg##_compat->member : &msg->member)
#define COMPAT_NAMELEN(msg)	COMPAT_MSG(msg, msg_namelen)
#define COMPAT_FLAGS(msg)	COMPAT_MSG(msg, msg_flags)
1975
1976struct used_address {	/* a socket address plus its length; passed by pointer to __sys_sendmsg() */
1977	struct sockaddr_storage name;	/* storage large enough for any socket address */
1978	unsigned int name_len;	/* presumably the number of valid bytes in name - TODO confirm against users */
1979};
1980
1981static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
1982			 struct msghdr *msg_sys, unsigned int flags,
1983			 struct used_address *used_address)
1984{
1985	struct compat_msghdr __user *msg_compat =
1986	    (struct compat_msghdr __user *)msg;
1987	struct sockaddr_storage address;
1988	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
1989	unsigned char ctl[sizeof(struct cmsghdr) + 20]
1990	    __attribute__ ((aligned(sizeof(__kernel_size_t))));
1991	/* 20 is size of ipv6_pktinfo */
1992	unsigned char *ctl_buf = ctl;
1993	int err, ctl_len, total_len;
1994
1995	err = -EFAULT;
1996	if (MSG_CMSG_COMPAT & flags) {
1997		if (get_compat_msghdr(msg_sys, msg_compat))
1998			return -EFAULT;
1999	} else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
2000		return -EFAULT;
2001
2002	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
2003		err = -EMSGSIZE;
2004		if (msg_sys->msg_iovlen > UIO_MAXIOV)
2005			goto out;
2006		err = -ENOMEM;
2007		iov = kmalloc(msg_sys->msg_iovlen * sizeof(struct iovec),
2008			      GFP_KERNEL);
2009		if (!iov)
2010			goto out;
2011	}
2012
2013	/* This will also move the address data into kernel space */
2014	if (MSG_CMSG_COMPAT & flags) {
2015		err = verify_compat_iovec(msg_sys, iov, &address, VERIFY_READ);
2016	} else
2017		err = verify_iovec(msg_sys, iov, &address, VERIFY_READ);
2018	if (err < 0)
2019		goto out_freeiov;
2020	total_len = err;
2021
2022	err = -ENOBUFS;
2023
2024	if (msg_sys->msg_controllen > INT_MAX)
2025		goto out_

Large files are truncated, but you can click here to view the full file