PageRenderTime 134ms CodeModel.GetById 3ms app.highlight 116ms RepoModel.GetById 0ms app.codeStats 1ms

/net/socket.c

https://bitbucket.org/zossso/android-kernel-2.6.34-motus
C | 3152 lines | 2260 code | 462 blank | 430 comment | 356 complexity | 02555c78d8719a4950120d217d286e42 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/*
   2 * NET		An implementation of the SOCKET network access protocol.
   3 *
   4 * Version:	@(#)socket.c	1.1.93	18/02/95
   5 *
   6 * Authors:	Orest Zborowski, <obz@Kodak.COM>
   7 *		Ross Biro
   8 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   9 *
  10 * Fixes:
  11 *		Anonymous	:	NOTSOCK/BADF cleanup. Error fix in
  12 *					shutdown()
  13 *		Alan Cox	:	verify_area() fixes
  14 *		Alan Cox	:	Removed DDI
  15 *		Jonathan Kamens	:	SOCK_DGRAM reconnect bug
  16 *		Alan Cox	:	Moved a load of checks to the very
  17 *					top level.
  18 *		Alan Cox	:	Move address structures to/from user
  19 *					mode above the protocol layers.
  20 *		Rob Janssen	:	Allow 0 length sends.
  21 *		Alan Cox	:	Asynchronous I/O support (cribbed from the
  22 *					tty drivers).
  23 *		Niibe Yutaka	:	Asynchronous I/O for writes (4.4BSD style)
  24 *		Jeff Uphoff	:	Made max number of sockets command-line
  25 *					configurable.
  26 *		Matti Aarnio	:	Made the number of sockets dynamic,
  27 *					to be allocated when needed, and mr.
  28 *					Uphoff's max is used as max to be
  29 *					allowed to allocate.
  30 *		Linus		:	Argh. removed all the socket allocation
  31 *					altogether: it's in the inode now.
  32 *		Alan Cox	:	Made sock_alloc()/sock_release() public
  33 *					for NetROM and future kernel nfsd type
  34 *					stuff.
  35 *		Alan Cox	:	sendmsg/recvmsg basics.
  36 *		Tom Dyas	:	Export net symbols.
  37 *		Marcin Dalecki	:	Fixed problems with CONFIG_NET="n".
  38 *		Alan Cox	:	Added thread locking to sys_* calls
  39 *					for sockets. May have errors at the
  40 *					moment.
  41 *		Kevin Buhr	:	Fixed the dumb errors in the above.
  42 *		Andi Kleen	:	Some small cleanups, optimizations,
  43 *					and fixed a copy_from_user() bug.
  44 *		Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
  45 *		Tigran Aivazian	:	Made listen(2) backlog sanity checks
  46 *					protocol-independent
  47 *
  48 *
  49 *		This program is free software; you can redistribute it and/or
  50 *		modify it under the terms of the GNU General Public License
  51 *		as published by the Free Software Foundation; either version
  52 *		2 of the License, or (at your option) any later version.
  53 *
  54 *
  55 *	This module is effectively the top level interface to the BSD socket
  56 *	paradigm.
  57 *
  58 *	Based upon Swansea University Computer Society NET3.039
  59 */
  60
  61#include <linux/mm.h>
  62#include <linux/socket.h>
  63#include <linux/file.h>
  64#include <linux/net.h>
  65#include <linux/interrupt.h>
  66#include <linux/thread_info.h>
  67#include <linux/rcupdate.h>
  68#include <linux/netdevice.h>
  69#include <linux/proc_fs.h>
  70#include <linux/seq_file.h>
  71#include <linux/mutex.h>
  72#include <linux/wanrouter.h>
  73#include <linux/if_bridge.h>
  74#include <linux/if_frad.h>
  75#include <linux/if_vlan.h>
  76#include <linux/init.h>
  77#include <linux/poll.h>
  78#include <linux/cache.h>
  79#include <linux/module.h>
  80#include <linux/highmem.h>
  81#include <linux/mount.h>
  82#include <linux/security.h>
  83#include <linux/syscalls.h>
  84#include <linux/compat.h>
  85#include <linux/kmod.h>
  86#include <linux/audit.h>
  87#include <linux/wireless.h>
  88#include <linux/nsproxy.h>
  89#include <linux/magic.h>
  90#include <linux/slab.h>
  91
  92#include <asm/uaccess.h>
  93#include <asm/unistd.h>
  94
  95#include <net/compat.h>
  96#include <net/wext.h>
  97
  98#include <net/sock.h>
  99#include <linux/netfilter.h>
 100
 101#include <linux/if_tun.h>
 102#include <linux/ipv6_route.h>
 103#include <linux/route.h>
 104#include <linux/sockios.h>
 105#include <linux/atalk.h>
 106
 107#ifdef CONFIG_UID_STAT
 108#include <linux/uid_stat.h>
 109#endif
 110
 111static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 112static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 113			 unsigned long nr_segs, loff_t pos);
 114static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
 115			  unsigned long nr_segs, loff_t pos);
 116static int sock_mmap(struct file *file, struct vm_area_struct *vma);
 117
 118static int sock_close(struct inode *inode, struct file *file);
 119static unsigned int sock_poll(struct file *file,
 120			      struct poll_table_struct *wait);
 121static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 122#ifdef CONFIG_COMPAT
 123static long compat_sock_ioctl(struct file *file,
 124			      unsigned int cmd, unsigned long arg);
 125#endif
 126static int sock_fasync(int fd, struct file *filp, int on);
 127static ssize_t sock_sendpage(struct file *file, struct page *page,
 128			     int offset, size_t size, loff_t *ppos, int more);
 129static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
 130			        struct pipe_inode_info *pipe, size_t len,
 131				unsigned int flags);
 132
 133/*
 134 *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 135 *	in the operation structures but are done directly via the socketcall() multiplexor.
 136 */
 137
 138static const struct file_operations socket_file_ops = {
 139	.owner =	THIS_MODULE,
 140	.llseek =	no_llseek,
 141	.aio_read =	sock_aio_read,
 142	.aio_write =	sock_aio_write,
 143	.poll =		sock_poll,
 144	.unlocked_ioctl = sock_ioctl,
 145#ifdef CONFIG_COMPAT
 146	.compat_ioctl = compat_sock_ioctl,
 147#endif
 148	.mmap =		sock_mmap,
 149	.open =		sock_no_open,	/* special open code to disallow open via /proc */
 150	.release =	sock_close,
 151	.fasync =	sock_fasync,
 152	.sendpage =	sock_sendpage,
 153	.splice_write = generic_splice_sendpage,
 154	.splice_read =	sock_splice_read,
 155};
 156
 157/*
 158 *	The protocol list. Each protocol is registered in here.
 159 */
 160
 161static DEFINE_SPINLOCK(net_family_lock);
 162static const struct net_proto_family *net_families[NPROTO] __read_mostly;
 163
 164/*
 165 *	Statistics counters of the socket lists
 166 */
 167
 168static DEFINE_PER_CPU(int, sockets_in_use) = 0;
 169
 170/*
 171 * Support routines.
 172 * Move socket addresses back and forth across the kernel/user
 173 * divide and look after the messy bits.
 174 */
 175
 176#define MAX_SOCK_ADDR	128		/* 108 for Unix domain -
 177					   16 for IP, 16 for IPX,
 178					   24 for IPv6,
 179					   about 80 for AX.25
 180					   must be at least one bigger than
 181					   the AF_UNIX size (see net/unix/af_unix.c
 182					   :unix_mkname()).
 183					 */
 184
 185/**
 186 *	move_addr_to_kernel	-	copy a socket address into kernel space
 187 *	@uaddr: Address in user space
 188 *	@kaddr: Address in kernel space
 189 *	@ulen: Length in user space
 190 *
 191 *	The address is copied into kernel space. If the provided address is
 192 *	too long an error code of -EINVAL is returned. If the copy gives
 193 *	invalid addresses -EFAULT is returned. On a success 0 is returned.
 194 */
 195
 196int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr)
 197{
 198	if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
 199		return -EINVAL;
 200	if (ulen == 0)
 201		return 0;
 202	if (copy_from_user(kaddr, uaddr, ulen))
 203		return -EFAULT;
 204	return audit_sockaddr(ulen, kaddr);
 205}
 206
 207/**
 208 *	move_addr_to_user	-	copy an address to user space
 209 *	@kaddr: kernel space address
 210 *	@klen: length of address in kernel
 211 *	@uaddr: user space address
 212 *	@ulen: pointer to user length field
 213 *
 214 *	The value pointed to by ulen on entry is the buffer length available.
 215 *	This is overwritten with the buffer space used. -EINVAL is returned
 216 *	if an overlong buffer is specified or a negative buffer size. -EFAULT
 217 *	is returned if either the buffer or the length field are not
 218 *	accessible.
 219 *	After copying the data up to the limit the user specifies, the true
 220 *	length of the data is written over the length limit the user
 221 *	specified. Zero is returned for a success.
 222 */
 223
 224int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr,
 225		      int __user *ulen)
 226{
 227	int err;
 228	int len;
 229
 230	err = get_user(len, ulen);
 231	if (err)
 232		return err;
 233	if (len > klen)
 234		len = klen;
 235	if (len < 0 || len > sizeof(struct sockaddr_storage))
 236		return -EINVAL;
 237	if (len) {
 238		if (audit_sockaddr(klen, kaddr))
 239			return -ENOMEM;
 240		if (copy_to_user(uaddr, kaddr, len))
 241			return -EFAULT;
 242	}
 243	/*
 244	 *      "fromlen shall refer to the value before truncation.."
 245	 *                      1003.1g
 246	 */
 247	return __put_user(klen, ulen);
 248}
 249
 250static struct kmem_cache *sock_inode_cachep __read_mostly;
 251
 252static struct inode *sock_alloc_inode(struct super_block *sb)
 253{
 254	struct socket_alloc *ei;
 255
 256	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
 257	if (!ei)
 258		return NULL;
 259	init_waitqueue_head(&ei->socket.wait);
 260
 261	ei->socket.fasync_list = NULL;
 262	ei->socket.state = SS_UNCONNECTED;
 263	ei->socket.flags = 0;
 264	ei->socket.ops = NULL;
 265	ei->socket.sk = NULL;
 266	ei->socket.file = NULL;
 267
 268	return &ei->vfs_inode;
 269}
 270
 271static void sock_destroy_inode(struct inode *inode)
 272{
 273	kmem_cache_free(sock_inode_cachep,
 274			container_of(inode, struct socket_alloc, vfs_inode));
 275}
 276
 277static void init_once(void *foo)
 278{
 279	struct socket_alloc *ei = (struct socket_alloc *)foo;
 280
 281	inode_init_once(&ei->vfs_inode);
 282}
 283
 284static int init_inodecache(void)
 285{
 286	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
 287					      sizeof(struct socket_alloc),
 288					      0,
 289					      (SLAB_HWCACHE_ALIGN |
 290					       SLAB_RECLAIM_ACCOUNT |
 291					       SLAB_MEM_SPREAD),
 292					      init_once);
 293	if (sock_inode_cachep == NULL)
 294		return -ENOMEM;
 295	return 0;
 296}
 297
 298static const struct super_operations sockfs_ops = {
 299	.alloc_inode =	sock_alloc_inode,
 300	.destroy_inode =sock_destroy_inode,
 301	.statfs =	simple_statfs,
 302};
 303
 304static int sockfs_get_sb(struct file_system_type *fs_type,
 305			 int flags, const char *dev_name, void *data,
 306			 struct vfsmount *mnt)
 307{
 308	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
 309			     mnt);
 310}
 311
 312static struct vfsmount *sock_mnt __read_mostly;
 313
 314static struct file_system_type sock_fs_type = {
 315	.name =		"sockfs",
 316	.get_sb =	sockfs_get_sb,
 317	.kill_sb =	kill_anon_super,
 318};
 319
 320/*
 321 * sockfs_dname() is called from d_path().
 322 */
 323static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
 324{
 325	return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
 326				dentry->d_inode->i_ino);
 327}
 328
 329static const struct dentry_operations sockfs_dentry_operations = {
 330	.d_dname  = sockfs_dname,
 331};
 332
 333/*
 334 *	Obtains the first available file descriptor and sets it up for use.
 335 *
 336 *	These functions create file structures and maps them to fd space
 337 *	of the current process. On success it returns file descriptor
 338 *	and file struct implicitly stored in sock->file.
 339 *	Note that another thread may close file descriptor before we return
 340 *	from this function. We use the fact that now we do not refer
 341 *	to socket after mapping. If one day we will need it, this
 342 *	function will increment ref. count on file by 1.
 343 *
 344 *	In any case returned fd MAY BE not valid!
 345 *	This race condition is unavoidable
 346 *	with shared fd spaces, we cannot solve it inside kernel,
 347 *	but we take care of internal coherence yet.
 348 */
 349
 350static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
 351{
 352	struct qstr name = { .name = "" };
 353	struct path path;
 354	struct file *file;
 355	int fd;
 356
 357	fd = get_unused_fd_flags(flags);
 358	if (unlikely(fd < 0))
 359		return fd;
 360
 361	path.dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
 362	if (unlikely(!path.dentry)) {
 363		put_unused_fd(fd);
 364		return -ENOMEM;
 365	}
 366	path.mnt = mntget(sock_mnt);
 367
 368	path.dentry->d_op = &sockfs_dentry_operations;
 369	d_instantiate(path.dentry, SOCK_INODE(sock));
 370	SOCK_INODE(sock)->i_fop = &socket_file_ops;
 371
 372	file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
 373		  &socket_file_ops);
 374	if (unlikely(!file)) {
 375		/* drop dentry, keep inode */
 376		atomic_inc(&path.dentry->d_inode->i_count);
 377		path_put(&path);
 378		put_unused_fd(fd);
 379		return -ENFILE;
 380	}
 381
 382	sock->file = file;
 383	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
 384	file->f_pos = 0;
 385	file->private_data = sock;
 386
 387	*f = file;
 388	return fd;
 389}
 390
 391int sock_map_fd(struct socket *sock, int flags)
 392{
 393	struct file *newfile;
 394	int fd = sock_alloc_file(sock, &newfile, flags);
 395
 396	if (likely(fd >= 0))
 397		fd_install(fd, newfile);
 398
 399	return fd;
 400}
 401
 402static struct socket *sock_from_file(struct file *file, int *err)
 403{
 404	if (file->f_op == &socket_file_ops)
 405		return file->private_data;	/* set in sock_map_fd */
 406
 407	*err = -ENOTSOCK;
 408	return NULL;
 409}
 410
 411/**
 412 *	sockfd_lookup	- 	Go from a file number to its socket slot
 413 *	@fd: file handle
 414 *	@err: pointer to an error code return
 415 *
 416 *	The file handle passed in is locked and the socket it is bound
 417 *	too is returned. If an error occurs the err pointer is overwritten
 418 *	with a negative errno code and NULL is returned. The function checks
 419 *	for both invalid handles and passing a handle which is not a socket.
 420 *
 421 *	On a success the socket object pointer is returned.
 422 */
 423
 424struct socket *sockfd_lookup(int fd, int *err)
 425{
 426	struct file *file;
 427	struct socket *sock;
 428
 429	file = fget(fd);
 430	if (!file) {
 431		*err = -EBADF;
 432		return NULL;
 433	}
 434
 435	sock = sock_from_file(file, err);
 436	if (!sock)
 437		fput(file);
 438	return sock;
 439}
 440
 441static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
 442{
 443	struct file *file;
 444	struct socket *sock;
 445
 446	*err = -EBADF;
 447	file = fget_light(fd, fput_needed);
 448	if (file) {
 449		sock = sock_from_file(file, err);
 450		if (sock)
 451			return sock;
 452		fput_light(file, *fput_needed);
 453	}
 454	return NULL;
 455}
 456
 457/**
 458 *	sock_alloc	-	allocate a socket
 459 *
 460 *	Allocate a new inode and socket object. The two are bound together
 461 *	and initialised. The socket is then returned. If we are out of inodes
 462 *	NULL is returned.
 463 */
 464
 465static struct socket *sock_alloc(void)
 466{
 467	struct inode *inode;
 468	struct socket *sock;
 469
 470	inode = new_inode(sock_mnt->mnt_sb);
 471	if (!inode)
 472		return NULL;
 473
 474	sock = SOCKET_I(inode);
 475
 476	kmemcheck_annotate_bitfield(sock, type);
 477	inode->i_mode = S_IFSOCK | S_IRWXUGO;
 478	inode->i_uid = current_fsuid();
 479	inode->i_gid = current_fsgid();
 480
 481	percpu_add(sockets_in_use, 1);
 482	return sock;
 483}
 484
 485/*
 486 *	In theory you can't get an open on this inode, but /proc provides
 487 *	a back door. Remember to keep it shut otherwise you'll let the
 488 *	creepy crawlies in.
 489 */
 490
 491static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
 492{
 493	return -ENXIO;
 494}
 495
 496const struct file_operations bad_sock_fops = {
 497	.owner = THIS_MODULE,
 498	.open = sock_no_open,
 499};
 500
 501/**
 502 *	sock_release	-	close a socket
 503 *	@sock: socket to close
 504 *
 505 *	The socket is released from the protocol stack if it has a release
 506 *	callback, and the inode is then released if the socket is bound to
 507 *	an inode not a file.
 508 */
 509
 510void sock_release(struct socket *sock)
 511{
 512	if (sock->ops) {
 513		struct module *owner = sock->ops->owner;
 514
 515		sock->ops->release(sock);
 516		sock->ops = NULL;
 517		module_put(owner);
 518	}
 519
 520	if (sock->fasync_list)
 521		printk(KERN_ERR "sock_release: fasync list not empty!\n");
 522
 523	percpu_sub(sockets_in_use, 1);
 524	if (!sock->file) {
 525		iput(SOCK_INODE(sock));
 526		return;
 527	}
 528	sock->file = NULL;
 529}
 530
 531int sock_tx_timestamp(struct msghdr *msg, struct sock *sk,
 532		      union skb_shared_tx *shtx)
 533{
 534	shtx->flags = 0;
 535	if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
 536		shtx->hardware = 1;
 537	if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 538		shtx->software = 1;
 539	return 0;
 540}
 541EXPORT_SYMBOL(sock_tx_timestamp);
 542
 543static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 544				 struct msghdr *msg, size_t size)
 545{
 546	struct sock_iocb *si = kiocb_to_siocb(iocb);
 547	int err;
 548
 549	si->sock = sock;
 550	si->scm = NULL;
 551	si->msg = msg;
 552	si->size = size;
 553
 554	err = security_socket_sendmsg(sock, msg, size);
 555	if (err)
 556		return err;
 557
 558	err = sock->ops->sendmsg(iocb, sock, msg, size);
 559	return err;
 560}
 561
 562int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 563{
 564	struct kiocb iocb;
 565	struct sock_iocb siocb;
 566	int ret;
 567
 568	init_sync_kiocb(&iocb, NULL);
 569	iocb.private = &siocb;
 570	ret = __sock_sendmsg(&iocb, sock, msg, size);
 571	if (-EIOCBQUEUED == ret)
 572		ret = wait_on_sync_kiocb(&iocb);
 573	return ret;
 574}
 575
 576int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
 577		   struct kvec *vec, size_t num, size_t size)
 578{
 579	mm_segment_t oldfs = get_fs();
 580	int result;
 581
 582	set_fs(KERNEL_DS);
 583	/*
 584	 * the following is safe, since for compiler definitions of kvec and
 585	 * iovec are identical, yielding the same in-core layout and alignment
 586	 */
 587	msg->msg_iov = (struct iovec *)vec;
 588	msg->msg_iovlen = num;
 589	result = sock_sendmsg(sock, msg, size);
 590	set_fs(oldfs);
 591	return result;
 592}
 593
 594static int ktime2ts(ktime_t kt, struct timespec *ts)
 595{
 596	if (kt.tv64) {
 597		*ts = ktime_to_timespec(kt);
 598		return 1;
 599	} else {
 600		return 0;
 601	}
 602}
 603
 604/*
 605 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
 606 */
 607void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 608	struct sk_buff *skb)
 609{
 610	int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
 611	struct timespec ts[3];
 612	int empty = 1;
 613	struct skb_shared_hwtstamps *shhwtstamps =
 614		skb_hwtstamps(skb);
 615
 616	/* Race occurred between timestamp enabling and packet
 617	   receiving.  Fill in the current time for now. */
 618	if (need_software_tstamp && skb->tstamp.tv64 == 0)
 619		__net_timestamp(skb);
 620
 621	if (need_software_tstamp) {
 622		if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
 623			struct timeval tv;
 624			skb_get_timestamp(skb, &tv);
 625			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
 626				 sizeof(tv), &tv);
 627		} else {
 628			struct timespec ts;
 629			skb_get_timestampns(skb, &ts);
 630			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
 631				 sizeof(ts), &ts);
 632		}
 633	}
 634
 635
 636	memset(ts, 0, sizeof(ts));
 637	if (skb->tstamp.tv64 &&
 638	    sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) {
 639		skb_get_timestampns(skb, ts + 0);
 640		empty = 0;
 641	}
 642	if (shhwtstamps) {
 643		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE) &&
 644		    ktime2ts(shhwtstamps->syststamp, ts + 1))
 645			empty = 0;
 646		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE) &&
 647		    ktime2ts(shhwtstamps->hwtstamp, ts + 2))
 648			empty = 0;
 649	}
 650	if (!empty)
 651		put_cmsg(msg, SOL_SOCKET,
 652			 SCM_TIMESTAMPING, sizeof(ts), &ts);
 653}
 654
 655EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
 656
 657inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 658{
 659	if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount)
 660		put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
 661			sizeof(__u32), &skb->dropcount);
 662}
 663
 664void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
 665	struct sk_buff *skb)
 666{
 667	sock_recv_timestamp(msg, sk, skb);
 668	sock_recv_drops(msg, sk, skb);
 669}
 670EXPORT_SYMBOL_GPL(sock_recv_ts_and_drops);
 671
 672static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
 673				       struct msghdr *msg, size_t size, int flags)
 674{
 675	int err;
 676	struct sock_iocb *si = kiocb_to_siocb(iocb);
 677
 678	si->sock = sock;
 679	si->scm = NULL;
 680	si->msg = msg;
 681	si->size = size;
 682	si->flags = flags;
 683
 684	err = sock->ops->recvmsg(iocb, sock, msg, size, flags);
 685	return err;
 686}
 687
 688static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 689				 struct msghdr *msg, size_t size, int flags)
 690{
 691	int err = security_socket_recvmsg(sock, msg, size, flags);
 692
 693	return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
 694}
 695
 696int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 697		 size_t size, int flags)
 698{
 699	struct kiocb iocb;
 700	struct sock_iocb siocb;
 701	int ret;
 702
 703	init_sync_kiocb(&iocb, NULL);
 704	iocb.private = &siocb;
 705	ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
 706	if (-EIOCBQUEUED == ret)
 707		ret = wait_on_sync_kiocb(&iocb);
 708	return ret;
 709}
 710
 711static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
 712			      size_t size, int flags)
 713{
 714	struct kiocb iocb;
 715	struct sock_iocb siocb;
 716	int ret;
 717
 718	init_sync_kiocb(&iocb, NULL);
 719	iocb.private = &siocb;
 720	ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
 721	if (-EIOCBQUEUED == ret)
 722		ret = wait_on_sync_kiocb(&iocb);
 723	return ret;
 724}
 725
 726int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 727		   struct kvec *vec, size_t num, size_t size, int flags)
 728{
 729	mm_segment_t oldfs = get_fs();
 730	int result;
 731
 732	set_fs(KERNEL_DS);
 733	/*
 734	 * the following is safe, since for compiler definitions of kvec and
 735	 * iovec are identical, yielding the same in-core layout and alignment
 736	 */
 737	msg->msg_iov = (struct iovec *)vec, msg->msg_iovlen = num;
 738	result = sock_recvmsg(sock, msg, size, flags);
 739	set_fs(oldfs);
 740	return result;
 741}
 742
 743static void sock_aio_dtor(struct kiocb *iocb)
 744{
 745	kfree(iocb->private);
 746}
 747
 748static ssize_t sock_sendpage(struct file *file, struct page *page,
 749			     int offset, size_t size, loff_t *ppos, int more)
 750{
 751	struct socket *sock;
 752	int flags;
 753
 754	sock = file->private_data;
 755
 756	flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
 757	if (more)
 758		flags |= MSG_MORE;
 759
 760	return kernel_sendpage(sock, page, offset, size, flags);
 761}
 762
 763static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
 764			        struct pipe_inode_info *pipe, size_t len,
 765				unsigned int flags)
 766{
 767	struct socket *sock = file->private_data;
 768
 769	if (unlikely(!sock->ops->splice_read))
 770		return -EINVAL;
 771
 772	return sock->ops->splice_read(sock, ppos, pipe, len, flags);
 773}
 774
 775static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
 776					 struct sock_iocb *siocb)
 777{
 778	if (!is_sync_kiocb(iocb)) {
 779		siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
 780		if (!siocb)
 781			return NULL;
 782		iocb->ki_dtor = sock_aio_dtor;
 783	}
 784
 785	siocb->kiocb = iocb;
 786	iocb->private = siocb;
 787	return siocb;
 788}
 789
 790static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
 791		struct file *file, const struct iovec *iov,
 792		unsigned long nr_segs)
 793{
 794	struct socket *sock = file->private_data;
 795	size_t size = 0;
 796	int i;
 797
 798	for (i = 0; i < nr_segs; i++)
 799		size += iov[i].iov_len;
 800
 801	msg->msg_name = NULL;
 802	msg->msg_namelen = 0;
 803	msg->msg_control = NULL;
 804	msg->msg_controllen = 0;
 805	msg->msg_iov = (struct iovec *)iov;
 806	msg->msg_iovlen = nr_segs;
 807	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 808
 809	return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
 810}
 811
 812static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 813				unsigned long nr_segs, loff_t pos)
 814{
 815	struct sock_iocb siocb, *x;
 816
 817	if (pos != 0)
 818		return -ESPIPE;
 819
 820	if (iocb->ki_left == 0)	/* Match SYS5 behaviour */
 821		return 0;
 822
 823
 824	x = alloc_sock_iocb(iocb, &siocb);
 825	if (!x)
 826		return -ENOMEM;
 827	return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
 828}
 829
 830static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
 831			struct file *file, const struct iovec *iov,
 832			unsigned long nr_segs)
 833{
 834	struct socket *sock = file->private_data;
 835	size_t size = 0;
 836	int i;
 837
 838	for (i = 0; i < nr_segs; i++)
 839		size += iov[i].iov_len;
 840
 841	msg->msg_name = NULL;
 842	msg->msg_namelen = 0;
 843	msg->msg_control = NULL;
 844	msg->msg_controllen = 0;
 845	msg->msg_iov = (struct iovec *)iov;
 846	msg->msg_iovlen = nr_segs;
 847	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 848	if (sock->type == SOCK_SEQPACKET)
 849		msg->msg_flags |= MSG_EOR;
 850
 851	return __sock_sendmsg(iocb, sock, msg, size);
 852}
 853
 854static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
 855			  unsigned long nr_segs, loff_t pos)
 856{
 857	struct sock_iocb siocb, *x;
 858
 859	if (pos != 0)
 860		return -ESPIPE;
 861
 862	x = alloc_sock_iocb(iocb, &siocb);
 863	if (!x)
 864		return -ENOMEM;
 865
 866	return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
 867}
 868
 869/*
 870 * Atomic setting of ioctl hooks to avoid race
 871 * with module unload.
 872 */
 873
 874static DEFINE_MUTEX(br_ioctl_mutex);
 875static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg) = NULL;
 876
 877void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
 878{
 879	mutex_lock(&br_ioctl_mutex);
 880	br_ioctl_hook = hook;
 881	mutex_unlock(&br_ioctl_mutex);
 882}
 883
 884EXPORT_SYMBOL(brioctl_set);
 885
 886static DEFINE_MUTEX(vlan_ioctl_mutex);
 887static int (*vlan_ioctl_hook) (struct net *, void __user *arg);
 888
 889void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
 890{
 891	mutex_lock(&vlan_ioctl_mutex);
 892	vlan_ioctl_hook = hook;
 893	mutex_unlock(&vlan_ioctl_mutex);
 894}
 895
 896EXPORT_SYMBOL(vlan_ioctl_set);
 897
 898static DEFINE_MUTEX(dlci_ioctl_mutex);
 899static int (*dlci_ioctl_hook) (unsigned int, void __user *);
 900
 901void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
 902{
 903	mutex_lock(&dlci_ioctl_mutex);
 904	dlci_ioctl_hook = hook;
 905	mutex_unlock(&dlci_ioctl_mutex);
 906}
 907
 908EXPORT_SYMBOL(dlci_ioctl_set);
 909
 910static long sock_do_ioctl(struct net *net, struct socket *sock,
 911				 unsigned int cmd, unsigned long arg)
 912{
 913	int err;
 914	void __user *argp = (void __user *)arg;
 915
 916	err = sock->ops->ioctl(sock, cmd, arg);
 917
 918	/*
 919	 * If this ioctl is unknown try to hand it down
 920	 * to the NIC driver.
 921	 */
 922	if (err == -ENOIOCTLCMD)
 923		err = dev_ioctl(net, cmd, argp);
 924
 925	return err;
 926}
 927
 928/*
 929 *	With an ioctl, arg may well be a user mode pointer, but we don't know
 930 *	what to do with it - that's up to the protocol still.
 931 */
 932
 933static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 934{
 935	struct socket *sock;
 936	struct sock *sk;
 937	void __user *argp = (void __user *)arg;
 938	int pid, err;
 939	struct net *net;
 940
 941	sock = file->private_data;
 942	sk = sock->sk;
 943	net = sock_net(sk);
 944	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
 945		err = dev_ioctl(net, cmd, argp);
 946	} else
 947#ifdef CONFIG_WEXT_CORE
 948	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
 949		err = dev_ioctl(net, cmd, argp);
 950	} else
 951#endif
 952		switch (cmd) {
 953		case FIOSETOWN:
 954		case SIOCSPGRP:
 955			err = -EFAULT;
 956			if (get_user(pid, (int __user *)argp))
 957				break;
 958			err = f_setown(sock->file, pid, 1);
 959			break;
 960		case FIOGETOWN:
 961		case SIOCGPGRP:
 962			err = put_user(f_getown(sock->file),
 963				       (int __user *)argp);
 964			break;
 965		case SIOCGIFBR:
 966		case SIOCSIFBR:
 967		case SIOCBRADDBR:
 968		case SIOCBRDELBR:
 969			err = -ENOPKG;
 970			if (!br_ioctl_hook)
 971				request_module("bridge");
 972
 973			mutex_lock(&br_ioctl_mutex);
 974			if (br_ioctl_hook)
 975				err = br_ioctl_hook(net, cmd, argp);
 976			mutex_unlock(&br_ioctl_mutex);
 977			break;
 978		case SIOCGIFVLAN:
 979		case SIOCSIFVLAN:
 980			err = -ENOPKG;
 981			if (!vlan_ioctl_hook)
 982				request_module("8021q");
 983
 984			mutex_lock(&vlan_ioctl_mutex);
 985			if (vlan_ioctl_hook)
 986				err = vlan_ioctl_hook(net, argp);
 987			mutex_unlock(&vlan_ioctl_mutex);
 988			break;
 989		case SIOCADDDLCI:
 990		case SIOCDELDLCI:
 991			err = -ENOPKG;
 992			if (!dlci_ioctl_hook)
 993				request_module("dlci");
 994
 995			mutex_lock(&dlci_ioctl_mutex);
 996			if (dlci_ioctl_hook)
 997				err = dlci_ioctl_hook(cmd, argp);
 998			mutex_unlock(&dlci_ioctl_mutex);
 999			break;
1000		default:
1001			err = sock_do_ioctl(net, sock, cmd, arg);
1002			break;
1003		}
1004	return err;
1005}
1006
1007int sock_create_lite(int family, int type, int protocol, struct socket **res)
1008{
1009	int err;
1010	struct socket *sock = NULL;
1011
1012	err = security_socket_create(family, type, protocol, 1);
1013	if (err)
1014		goto out;
1015
1016	sock = sock_alloc();
1017	if (!sock) {
1018		err = -ENOMEM;
1019		goto out;
1020	}
1021
1022	sock->type = type;
1023	err = security_socket_post_create(sock, family, type, protocol, 1);
1024	if (err)
1025		goto out_release;
1026
1027out:
1028	*res = sock;
1029	return err;
1030out_release:
1031	sock_release(sock);
1032	sock = NULL;
1033	goto out;
1034}
1035
1036/* No kernel lock held - perfect */
1037static unsigned int sock_poll(struct file *file, poll_table *wait)
1038{
1039	struct socket *sock;
1040
1041	/*
1042	 *      We can't return errors to poll, so it's either yes or no.
1043	 */
1044	sock = file->private_data;
1045	return sock->ops->poll(file, sock, wait);
1046}
1047
1048static int sock_mmap(struct file *file, struct vm_area_struct *vma)
1049{
1050	struct socket *sock = file->private_data;
1051
1052	return sock->ops->mmap(file, sock, vma);
1053}
1054
1055static int sock_close(struct inode *inode, struct file *filp)
1056{
1057	/*
1058	 *      It was possible the inode is NULL we were
1059	 *      closing an unfinished socket.
1060	 */
1061
1062	if (!inode) {
1063		printk(KERN_DEBUG "sock_close: NULL inode\n");
1064		return 0;
1065	}
1066	sock_release(SOCKET_I(inode));
1067	return 0;
1068}
1069
1070/*
1071 *	Update the socket async list
1072 *
1073 *	Fasync_list locking strategy.
1074 *
1075 *	1. fasync_list is modified only under process context socket lock
1076 *	   i.e. under semaphore.
1077 *	2. fasync_list is used under read_lock(&sk->sk_callback_lock)
1078 *	   or under socket lock.
1079 *	3. fasync_list can be used from softirq context, so that
1080 *	   modification under socket lock have to be enhanced with
1081 *	   write_lock_bh(&sk->sk_callback_lock).
1082 *							--ANK (990710)
1083 */
1084
1085static int sock_fasync(int fd, struct file *filp, int on)
1086{
1087	struct fasync_struct *fa, *fna = NULL, **prev;
1088	struct socket *sock;
1089	struct sock *sk;
1090
1091	if (on) {
1092		fna = kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);
1093		if (fna == NULL)
1094			return -ENOMEM;
1095	}
1096
1097	sock = filp->private_data;
1098
1099	sk = sock->sk;
1100	if (sk == NULL) {
1101		kfree(fna);
1102		return -EINVAL;
1103	}
1104
1105	lock_sock(sk);
1106
1107	spin_lock(&filp->f_lock);
1108	if (on)
1109		filp->f_flags |= FASYNC;
1110	else
1111		filp->f_flags &= ~FASYNC;
1112	spin_unlock(&filp->f_lock);
1113
1114	prev = &(sock->fasync_list);
1115
1116	for (fa = *prev; fa != NULL; prev = &fa->fa_next, fa = *prev)
1117		if (fa->fa_file == filp)
1118			break;
1119
1120	if (on) {
1121		if (fa != NULL) {
1122			write_lock_bh(&sk->sk_callback_lock);
1123			fa->fa_fd = fd;
1124			write_unlock_bh(&sk->sk_callback_lock);
1125
1126			kfree(fna);
1127			goto out;
1128		}
1129		fna->fa_file = filp;
1130		fna->fa_fd = fd;
1131		fna->magic = FASYNC_MAGIC;
1132		fna->fa_next = sock->fasync_list;
1133		write_lock_bh(&sk->sk_callback_lock);
1134		sock->fasync_list = fna;
1135		sock_set_flag(sk, SOCK_FASYNC);
1136		write_unlock_bh(&sk->sk_callback_lock);
1137	} else {
1138		if (fa != NULL) {
1139			write_lock_bh(&sk->sk_callback_lock);
1140			*prev = fa->fa_next;
1141			if (!sock->fasync_list)
1142				sock_reset_flag(sk, SOCK_FASYNC);
1143			write_unlock_bh(&sk->sk_callback_lock);
1144			kfree(fa);
1145		}
1146	}
1147
1148out:
1149	release_sock(sock->sk);
1150	return 0;
1151}
1152
1153/* This function may be called only under socket lock or callback_lock */
1154
1155int sock_wake_async(struct socket *sock, int how, int band)
1156{
1157	if (!sock || !sock->fasync_list)
1158		return -1;
1159	switch (how) {
1160	case SOCK_WAKE_WAITD:
1161		if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
1162			break;
1163		goto call_kill;
1164	case SOCK_WAKE_SPACE:
1165		if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
1166			break;
1167		/* fall through */
1168	case SOCK_WAKE_IO:
1169call_kill:
1170		__kill_fasync(sock->fasync_list, SIGIO, band);
1171		break;
1172	case SOCK_WAKE_URG:
1173		__kill_fasync(sock->fasync_list, SIGURG, band);
1174	}
1175	return 0;
1176}
1177
/*
 * __sock_create - allocate a socket and let its address family set it up.
 * @net:      handed unchanged to the family's ->create handler
 * @family:   address family; rejected unless 0 <= family < NPROTO
 * @type:     socket type; rejected unless 0 <= type < SOCK_MAX
 * @protocol: handed to ->create (0 asks the family for its default)
 * @res:      receives the new socket; written only on success
 * @kern:     forwarded to the security hooks and to ->create
 *
 * Returns 0 on success or a negative errno: -EAFNOSUPPORT for an
 * out-of-range or unregistered family (or when a module reference cannot
 * be taken), -EINVAL for an out-of-range type, -ENFILE when no socket can
 * be allocated, or whatever the security hooks / ->create report.
 */
1178static int __sock_create(struct net *net, int family, int type, int protocol,
1179			 struct socket **res, int kern)
1180{
1181	int err;
1182	struct socket *sock;
1183	const struct net_proto_family *pf;
1184
1185	/*
1186	 *      Check protocol is in range
1187	 */
1188	if (family < 0 || family >= NPROTO)
1189		return -EAFNOSUPPORT;
1190	if (type < 0 || type >= SOCK_MAX)
1191		return -EINVAL;
1192
1193	/* Compatibility.
1194
1195	   This uglymoron is moved from INET layer to here to avoid
1196	   deadlock in module load.
1197	 */
	/* (PF_INET, SOCK_PACKET) is rewritten to PF_PACKET; warn only once. */
1198	if (family == PF_INET && type == SOCK_PACKET) {
1199		static int warned;
1200		if (!warned) {
1201			warned = 1;
1202			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
1203			       current->comm);
1204		}
1205		family = PF_PACKET;
1206	}
1207
	/* Nothing has been allocated yet, so a refusal can return directly. */
1208	err = security_socket_create(family, type, protocol, kern);
1209	if (err)
1210		return err;
1211
1212	/*
1213	 *	Allocate the socket and allow the family to set things up. if
1214	 *	the protocol is 0, the family is instructed to select an appropriate
1215	 *	default.
1216	 */
1217	sock = sock_alloc();
1218	if (!sock) {
1219		if (net_ratelimit())
1220			printk(KERN_WARNING "socket: no more sockets\n");
1221		return -ENFILE;	/* Not exactly a match, but its the
1222				   closest posix thing */
1223	}
1224
1225	sock->type = type;
1226
1227#ifdef CONFIG_MODULES
1228	/* Attempt to load a protocol module if the find failed.
1229	 *
1230	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
1231	 * requested real, full-featured networking support upon configuration.
1232	 * Otherwise module support will break!
1233	 */
1234	if (net_families[family] == NULL)
1235		request_module("net-pf-%d", family);
1236#endif
1237
	/*
	 * Look the family up under RCU.  From here until rcu_read_unlock()
	 * every failure must leave through out_release, which drops the
	 * read lock before releasing the socket.
	 */
1238	rcu_read_lock();
1239	pf = rcu_dereference(net_families[family]);
1240	err = -EAFNOSUPPORT;
1241	if (!pf)
1242		goto out_release;
1243
1244	/*
1245	 * We will call the ->create function, that possibly is in a loadable
1246	 * module, so we have to bump that loadable module refcnt first.
1247	 */
1248	if (!try_module_get(pf->owner))
1249		goto out_release;
1250
1251	/* Now protected by module ref count */
1252	rcu_read_unlock();
1253
1254	err = pf->create(net, sock, protocol, kern);
1255	if (err < 0)
1256		goto out_module_put;
1257
1258	/*
1259	 * Now to bump the refcnt of the [loadable] module that owns this
1260	 * socket at sock_release time we decrement its refcnt.
1261	 */
1262	if (!try_module_get(sock->ops->owner))
1263		goto out_module_busy;
1264
1265	/*
1266	 * Now that we're done with the ->create function, the [loadable]
1267	 * module can have its refcnt decremented
1268	 */
1269	module_put(pf->owner);
1270	err = security_socket_post_create(sock, family, type, protocol, kern);
1271	if (err)
1272		goto out_sock_release;
1273	*res = sock;
1274
1275	return 0;
1276
	/*
	 * Error unwinding.  The labels fall through into one another, so
	 * each entry point performs its own step plus everything below it.
	 */
1277out_module_busy:
1278	err = -EAFNOSUPPORT;
1279out_module_put:
	/*
	 * NOTE(review): ops is cleared before sock_release(), presumably so
	 * the release path does not call into (or drop a reference on) an
	 * ops owner that was never pinned here - confirm against
	 * sock_release().
	 */
1280	sock->ops = NULL;
1281	module_put(pf->owner);
1282out_sock_release:
1283	sock_release(sock);
1284	return err;
1285
1286out_release:
1287	rcu_read_unlock();
1288	goto out_sock_release;
1289}
1290
1291int sock_create(int family, int type, int protocol, struct socket **res)
1292{
1293	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
1294}
1295
1296int sock_create_kern(int family, int type, int protocol, struct socket **res)
1297{
1298	return __sock_create(&init_net, family, type, protocol, res, 1);
1299}
1300
1301SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
1302{
1303	int retval;
1304	struct socket *sock;
1305	int flags;
1306
1307	/* Check the SOCK_* constants for consistency.  */
1308	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
1309	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
1310	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
1311	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
1312
1313	flags = type & ~SOCK_TYPE_MASK;
1314	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1315		return -EINVAL;
1316	type &= SOCK_TYPE_MASK;
1317
1318	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
1319		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1320
1321	retval = sock_create(family, type, protocol, &sock);
1322	if (retval < 0)
1323		goto out;
1324
1325	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
1326	if (retval < 0)
1327		goto out_release;
1328
1329out:
1330	/* It may be already another descriptor 8) Not kernel problem. */
1331	return retval;
1332
1333out_release:
1334	sock_release(sock);
1335	return retval;
1336}
1337
1338/*
1339 *	Create a pair of connected sockets.
1340 */
1341
/*
 * socketpair(2).
 *
 * The bits of @type outside SOCK_TYPE_MASK carry SOCK_CLOEXEC and
 * SOCK_NONBLOCK; any other bit there yields -EINVAL.  On success the two
 * new descriptors are stored in usockvec[0] and usockvec[1] and 0 is
 * returned; otherwise a negative errno is returned.
 */
1342SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
1343		int __user *, usockvec)
1344{
1345	struct socket *sock1, *sock2;
1346	int fd1, fd2, err;
1347	struct file *newfile1, *newfile2;
1348	int flags;
1349
	/* Split the combined argument into creation flags and the real type. */
1350	flags = type & ~SOCK_TYPE_MASK;
1351	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1352		return -EINVAL;
1353	type &= SOCK_TYPE_MASK;
1354
	/* Translate SOCK_NONBLOCK where its value differs from O_NONBLOCK. */
1355	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
1356		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1357
1358	/*
1359	 * Obtain the first socket and check if the underlying protocol
1360	 * supports the socketpair call.
1361	 */
1362
1363	err = sock_create(family, type, protocol, &sock1);
1364	if (err < 0)
1365		goto out;
1366
1367	err = sock_create(family, type, protocol, &sock2);
1368	if (err < 0)
1369		goto out_release_1;
1370
1371	err = sock1->ops->socketpair(sock1, sock2);
1372	if (err < 0)
1373		goto out_release_both;
1374
	/* Neither socket has a file yet, so both are released directly. */
1375	fd1 = sock_alloc_file(sock1, &newfile1, flags);
1376	if (unlikely(fd1 < 0)) {
1377		err = fd1;
1378		goto out_release_both;
1379	}
1380
	/*
	 * NOTE(review): on this failure sock1 is not released explicitly;
	 * presumably fput(newfile1) releases it through the file's release
	 * hook, which is why only sock2 gets sock_release() - confirm
	 * against sock_alloc_file().
	 */
1381	fd2 = sock_alloc_file(sock2, &newfile2, flags);
1382	if (unlikely(fd2 < 0)) {
1383		err = fd2;
1384		fput(newfile1);
1385		put_unused_fd(fd1);
1386		sock_release(sock2);
1387		goto out;
1388	}
1389
1390	audit_fd_pair(fd1, fd2);
1391	fd_install(fd1, newfile1);
1392	fd_install(fd2, newfile2);
1393	/* fd1 and fd2 may be already another descriptors.
1394	 * Not kernel problem.
1395	 */
1396
1397	err = put_user(fd1, &usockvec[0]);
1398	if (!err)
1399		err = put_user(fd2, &usockvec[1]);
1400	if (!err)
1401		return 0;
1402
	/* The descriptors are already installed, so undo via sys_close(). */
1403	sys_close(fd2);
1404	sys_close(fd1);
1405	return err;
1406
1407out_release_both:
1408	sock_release(sock2);
1409out_release_1:
1410	sock_release(sock1);
1411out:
1412	return err;
1413}
1414
1415/*
1416 *	Bind a name to a socket. Nothing much to do here since it's
1417 *	the protocol's responsibility to handle the local address.
1418 *
1419 *	We move the socket address to kernel space before we call
1420 *	the protocol layer (having also checked the address is ok).
1421 */
1422
1423SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
1424{
1425	struct socket *sock;
1426	struct sockaddr_storage address;
1427	int err, fput_needed;
1428
1429	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1430	if (sock) {
1431		err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
1432		if (err >= 0) {
1433			err = security_socket_bind(sock,
1434						   (struct sockaddr *)&address,
1435						   addrlen);
1436			if (!err)
1437				err = sock->ops->bind(sock,
1438						      (struct sockaddr *)
1439						      &address, addrlen);
1440		}
1441		fput_light(sock->file, fput_needed);
1442	}
1443	return err;
1444}
1445
1446/*
1447 *	Perform a listen. Basically, we allow the protocol to do anything
1448 *	necessary for a listen, and if that works, we mark the socket as
1449 *	ready for listening.
1450 */
1451
1452SYSCALL_DEFINE2(listen, int, fd, int, backlog)
1453{
1454	struct socket *sock;
1455	int err, fput_needed;
1456	int somaxconn;
1457
1458	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1459	if (sock) {
1460		somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
1461		if ((unsigned)backlog > somaxconn)
1462			backlog = somaxconn;
1463
1464		err = security_socket_listen(sock, backlog);
1465		if (!err)
1466			err = sock->ops->listen(sock, backlog);
1467
1468		fput_light(sock->file, fput_needed);
1469	}
1470	return err;
1471}
1472
1473/*
1474 *	For accept, we attempt to create a new socket, set up the link
1475 *	with the client, wake up the client, then return the new
1476 *	connected fd. We collect the address of the connector in kernel
1477 *	space and move it to user at the very end. This is unclean because
1478 *	we open the socket then return an error.
1479 *
1480 *	1003.1g adds the ability to recvmsg() to query connection pending
1481 *	status to recvmsg. We need to add that support in a way thats
1482 *	clean when we restucture accept also.
1483 */
1484
/*
 * accept4(2).
 *
 * @flags may contain only SOCK_CLOEXEC and SOCK_NONBLOCK (-EINVAL
 * otherwise).  When @upeer_sockaddr is non-NULL the peer address is copied
 * out through @upeer_sockaddr/@upeer_addrlen.  Returns the new descriptor
 * or a negative errno.
 */
1485SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
1486		int __user *, upeer_addrlen, int, flags)
1487{
1488	struct socket *sock, *newsock;
1489	struct file *newfile;
1490	int err, len, newfd, fput_needed;
1491	struct sockaddr_storage address;
1492
1493	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1494		return -EINVAL;
1495
	/* Translate SOCK_NONBLOCK where its value differs from O_NONBLOCK. */
1496	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
1497		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1498
1499	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1500	if (!sock)
1501		goto out;
1502
1503	err = -ENFILE;
1504	if (!(newsock = sock_alloc()))
1505		goto out_put;
1506
	/* The new socket inherits type and ops from the listening socket. */
1507	newsock->type = sock->type;
1508	newsock->ops = sock->ops;
1509
1510	/*
1511	 * We don't need try_module_get here, as the listening socket (sock)
1512	 * has the protocol module (sock->ops->owner) held.
1513	 */
1514	__module_get(newsock->ops->owner);
1515
	/* No file exists for newsock yet, so release the socket directly. */
1516	newfd = sock_alloc_file(newsock, &newfile, flags);
1517	if (unlikely(newfd < 0)) {
1518		err = newfd;
1519		sock_release(newsock);
1520		goto out_put;
1521	}
1522
	/*
	 * From here on, failures leave through out_fd, which drops the new
	 * file and returns the reserved descriptor number.
	 */
1523	err = security_socket_accept(sock, newsock);
1524	if (err)
1525		goto out_fd;
1526
1527	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
1528	if (err < 0)
1529		goto out_fd;
1530
1531	if (upeer_sockaddr) {
		/* Any getname failure is reported as -ECONNABORTED. */
1532		if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
1533					  &len, 2) < 0) {
1534			err = -ECONNABORTED;
1535			goto out_fd;
1536		}
1537		err = move_addr_to_user((struct sockaddr *)&address,
1538					len, upeer_sockaddr, upeer_addrlen);
1539		if (err < 0)
1540			goto out_fd;
1541	}
1542
1543	/* File flags are not inherited via accept() unlike another OSes. */
1544
	/* The descriptor is installed only after every fallible step. */
1545	fd_install(newfd, newfile);
1546	err = newfd;
1547
1548out_put:
1549	fput_light(sock->file, fput_needed);
1550out:
1551	return err;
1552out_fd:
	/*
	 * NOTE(review): newsock is not released explicitly here; presumably
	 * fput(newfile) does so through the file's release hook - confirm.
	 */
1553	fput(newfile);
1554	put_unused_fd(newfd);
1555	goto out_put;
1556}
1557
1558SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
1559		int __user *, upeer_addrlen)
1560{
1561	return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
1562}
1563
1564/*
1565 *	Attempt to connect to a socket with the server address.  The address
1566 *	is in user space so we verify it is OK and move it to kernel space.
1567 *
1568 *	For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
1569 *	break bindings
1570 *
1571 *	NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
1572 *	other SEQPACKET protocols that take time to connect() as it doesn't
1573 *	include the -EINPROGRESS status for such sockets.
1574 */
1575
1576SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
1577		int, addrlen)
1578{
1579	struct socket *sock;
1580	struct sockaddr_storage address;
1581	int err, fput_needed;
1582
1583	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1584	if (!sock)
1585		goto out;
1586	err = move_addr_to_kernel(uservaddr, addrlen, (struct sockaddr *)&address);
1587	if (err < 0)
1588		goto out_put;
1589
1590	err =
1591	    security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
1592	if (err)
1593		goto out_put;
1594
1595	err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
1596				 sock->file->f_flags);
1597out_put:
1598	fput_light(sock->file, fput_needed);
1599out:
1600	return err;
1601}
1602
1603/*
1604 *	Get the local address ('name') of a socket object. Move the obtained
1605 *	name to user space.
1606 */
1607
1608SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
1609		int __user *, usockaddr_len)
1610{
1611	struct socket *sock;
1612	struct sockaddr_storage address;
1613	int len, err, fput_needed;
1614
1615	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1616	if (!sock)
1617		goto out;
1618
1619	err = security_socket_getsockname(sock);
1620	if (err)
1621		goto out_put;
1622
1623	err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0);
1624	if (err)
1625		goto out_put;
1626	err = move_addr_to_user((struct sockaddr *)&address, len, usockaddr, usockaddr_len);
1627
1628out_put:
1629	fput_light(sock->file, fput_needed);
1630out:
1631	return err;
1632}
1633
1634/*
1635 *	Get the remote address ('name') of a socket object. Move the obtained
1636 *	name to user space.
1637 */
1638
1639SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
1640		int __user *, usockaddr_len)
1641{
1642	struct socket *sock;
1643	struct sockaddr_storage address;
1644	int len, err, fput_needed;
1645
1646	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1647	if (sock != NULL) {
1648		err = security_socket_getpeername(sock);
1649		if (err) {
1650			fput_light(sock->file, fput_needed);
1651			return err;
1652		}
1653
1654		err =
1655		    sock->ops->getname(sock, (struct sockaddr *)&address, &len,
1656				       1);
1657		if (!err)
1658			err = move_addr_to_user((struct sockaddr *)&address, len, usockaddr,
1659						usockaddr_len);
1660		fput_light(sock->file, fput_needed);
1661	}
1662	return err;
1663}
1664
1665/*
1666 *	Send a datagram to a given address. We move the address into kernel
1667 *	space and check the user space data area is readable before invoking
1668 *	the protocol.
1669 */
1670
1671SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
1672		unsigned, flags, struct sockaddr __user *, addr,
1673		int, addr_len)
1674{
1675	struct socket *sock;
1676	struct sockaddr_storage address;
1677	int err;
1678	struct msghdr msg;
1679	struct iovec iov;
1680	int fput_needed;
1681
1682	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1683	if (!sock)
1684		goto out;
1685
1686	iov.iov_base = buff;
1687	iov.iov_len = len;
1688	msg.msg_name = NULL;
1689	msg.msg_iov = &iov;
1690	msg.msg_iovlen = 1;
1691	msg.msg_control = NULL;
1692	msg.msg_controllen = 0;
1693	msg.msg_namelen = 0;
1694	if (addr) {
1695		err = move_addr_to_kernel(addr, addr_len, (struct sockaddr *)&address);
1696		if (err < 0)
1697			goto out_put;
1698		msg.msg_name = (struct sockaddr *)&address;
1699		msg.msg_namelen = addr_len;
1700	}
1701	if (sock->file->f_flags & O_NONBLOCK)
1702		flags |= MSG_DONTWAIT;
1703	msg.msg_flags = flags;
1704	err = sock_sendmsg(sock, &msg, len);
1705
1706out_put:
1707	fput_light(sock->file, fput_needed);
1708out:
1709	return err;
1710}
1711
1712/*
1713 *	Send a datagram down a socket.
1714 */
1715
1716SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
1717		unsigned, flags)
1718{
1719	return sys_sendto(fd, buff, len, flags, NULL, 0);
1720}
1721
1722/*
1723 *	Receive a frame from the socket and optionally record the address of the
1724 *	sender. We verify the buffers are writable and if needed move the
1725 *	sender address from kernel to user space.
1726 */
1727
1728SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
1729		unsigned, flags, struct sockaddr __user *, addr,
1730		int __user *, addr_len)
1731{
1732	struct socket *sock;
1733	struct iovec iov;
1734	struct msghdr msg;
1735	struct sockaddr_storage address;
1736	int err, err2;
1737	int fput_needed;
1738
1739	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1740	if (!sock)
1741		goto out;
1742
1743	msg.msg_control = NULL;
1744	msg.msg_controllen = 0;
1745	msg.msg_iovlen = 1;
1746	msg.msg_iov = &iov;
1747	iov.iov_len = size;
1748	iov.iov_base = ubuf;
1749	msg.msg_name = (struct sockaddr *)&address;
1750	msg.msg_namelen = sizeof(address);
1751	if (sock->file->f_flags & O_NONBLOCK)
1752		flags |= MSG_DONTWAIT;
1753	err = sock_recvmsg(sock, &msg, size, flags);
1754
1755	if (err >= 0 && addr != NULL) {
1756		err2 = move_addr_to_user((struct sockaddr *)&address,
1757					 msg.msg_namelen, addr, addr_len);
1758		if (err2 < 0)
1759			err = err2;
1760	}
1761
1762	fput_light(sock->file, fput_needed);
1763out:
1764	return err;
1765}
1766
1767/*
1768 *	Receive a datagram from a socket.
1769 */
1770
1771asmlinkage long sys_recv(int fd, void __user *ubuf, size_t size,
1772			 unsigned flags)
1773{
1774	return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
1775}
1776
1777/*
1778 *	Set a socket option. Because we don't know the option lengths we have
1779 *	to pass the user mode parameter for the protocols to sort out.
1780 */
1781
1782SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
1783		char __user *, optval, int, optlen)
1784{
1785	int err, fput_needed;
1786	struct socket *sock;
1787
1788	if (optlen < 0)
1789		return -EINVAL;
1790
1791	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1792	if (sock != NULL) {
1793		err = security_socket_setsockopt(sock, level, optname);
1794		if (err)
1795			goto out_put;
1796
1797		if (level == SOL_SOCKET)
1798			err =
1799			    sock_setsockopt(sock, level, optname, optval,
1800					    optlen);
1801		else
1802			err =
1803			    sock->ops->setsockopt(sock, level, optname, optval,
1804						  optlen);
1805out_put:
1806		fput_light(sock->file, fput_needed);
1807	}
1808	return err;
1809}
1810
1811/*
1812 *	Get a socket option. Because we don't know the option lengths we have
1813 *	to pass a user mode parameter for the protocols to sort out.
1814 */
1815
1816SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
1817		char __user *, optval, int __user *, optlen)
1818{
1819	int err, fput_needed;
1820	struct socket *sock;
1821
1822	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1823	if (sock != NULL) {
1824		err = security_socket_getsockopt(sock, level, optname);
1825		if (err)
1826			goto out_put;
1827
1828		if (level == SOL_SOCKET)
1829			err =
1830			    sock_getsockopt(sock, level, optname, optval,
1831					    optlen);
1832		else
1833			err =
1834			    sock->ops->getsockopt(sock, level, optname, optval,
1835						  optlen);
1836out_put:
1837		fput_light(sock->file, fput_needed);
1838	}
1839	return err;
1840}
1841
1842/*
1843 *	Shutdown a socket.
1844 */
1845
1846SYSCALL_DEFINE2(shutdown, int, fd, int, how)
1847{
1848	int err, fput_needed;
1849	struct socket *sock;
1850
1851	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1852	if (sock != NULL) {
1853		err = security_socket_shutdown(sock, how);
1854		if (!err)
1855			err = sock->ops->shutdown(sock, how);
1856		fput_light(sock->file, fput_needed);
1857	}
1858	return err;
1859}
1860
1861/* A couple of helpful macros for getting the address of the 32/64 bit
1862 * fields which are the same type (int / unsigned) on our platforms.
1863 */
/*
 * COMPAT_MSG(msg, member) yields the address of @member in either the
 * native or the compat message header.  It relies on two names being in
 * scope where it is expanded: a variable called `flags` (tested for
 * MSG_CMSG_COMPAT) and a pointer called `<msg>_compat`, formed by token
 * pasting, alongside `<msg>` itself.
 */
1864#define COMPAT_MSG(msg, member)	((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member)
/* Address of the msg_namelen field of whichever header layout is in use. */
1865#define COMPAT_NAMELEN(msg)	COMPAT_MSG(msg, msg_namelen)
/* Address of the msg_flags field of whichever header layout is in use. */
1866#define COMPAT_FLAGS(msg)	COMPAT_MSG(msg, msg_flags)
1867
1868/*
1869 *	BSD sendmsg interface
1870 */
1871
/*
 * sendmsg(2).
 *
 * Copies the user msghdr (compat layout when MSG_CMSG_COMPAT is set in
 * @flags), brings the iovec array, destination address and control data
 * into kernel memory, then calls sock_sendmsg().
 *
 * Returns the result of sock_sendmsg() or a negative errno: -EFAULT for
 * unreadable user memory, -EMSGSIZE when msg_iovlen exceeds UIO_MAXIOV,
 * -ENOMEM when the iovec array cannot be allocated, -ENOBUFS when the
 * control length exceeds INT_MAX or its buffer cannot be allocated.
 */
1872SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags)
1873{
1874	struct compat_msghdr __user *msg_compat =
1875	    (struct compat_msghdr __user *)msg;
1876	struct socket *sock;
1877	struct sockaddr_storage address;
1878	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
1879	unsigned char ctl[sizeof(struct cmsghdr) + 20]
1880	    __attribute__ ((aligned(sizeof(__kernel_size_t))));
1881	/* 20 is size of ipv6_pktinfo */
1882	unsigned char *ctl_buf = ctl;
1883	struct msghdr msg_sys;
1884	int err, ctl_len, iov_size, total_len;
1885	int fput_needed;
1886
	/* Snapshot the user header before any socket reference is taken. */
1887	err = -EFAULT;
1888	if (MSG_CMSG_COMPAT & flags) {
1889		if (get_compat_msghdr(&msg_sys, msg_compat))
1890			return -EFAULT;
1891	}
1892	else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
1893		return -EFAULT;
1894
1895	sock = sockfd_lookup_light(fd, &err, &fput_needed);
1896	if (!sock)
1897		goto out;
1898
1899	/* do not move before msg_sys is valid */
1900	err = -EMSGSIZE;
1901	if (msg_sys.msg_iovlen > UIO_MAXIOV)
1902		goto out_put;
1903
1904	/* Check whether to allocate the iovec area */
	/* Up to UIO_FASTIOV entries use the on-stack array. */
1905	err = -ENOMEM;
1906	iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
1907	if (msg_sys.msg_iovlen > UIO_FASTIOV) {
1908		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
1909		if (!iov)
1910			goto out_put;
1911	}
1912
1913	/* This will also move the address data into kernel space */
1914	if (MSG_CMSG_COMPAT & flags) {
1915		err = verify_compat_iovec(&msg_sys, iov,
1916					  (struct sockaddr *)&address,
1917					  VERIFY_READ);
1918	} else
1919		err = verify_iovec(&msg_sys, iov,
1920				   (struct sockaddr *)&address,
1921				   VERIFY_READ);
1922	if (err < 0)
1923		goto out_freeiov;
	/* A non-negative result is used as the total payload length. */
1924	total_len = err;
1925
	/* -ENOBUFS covers the length check and both allocation failures below. */
1926	err = -ENOBUFS;
1927
1928	if (msg_sys.msg_controllen > INT_MAX)
1929		goto out_freeiov;
1930	ctl_len = msg_sys.msg_controllen;
1931	if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
1932		err =
1933		    cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl,
1934						     sizeof(ctl));
1935		if (err)
1936			goto out_freeiov;
		/*
		 * NOTE(review): buffer and length are re-read from msg_sys,
		 * presumably because the helper may substitute its own buffer
		 * for `ctl` - confirm against the helper.
		 */
1937		ctl_buf = msg_sys.msg_control;
1938		ctl_len = msg_sys.msg_controllen;
1939	} else if (ctl_len) {
		/* Control data that does not fit in `ctl` gets a heap buffer. */
1940		if (ctl_len > sizeof(ctl)) {
1941			ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
1942			if (ctl_buf == NULL)
1943				goto out_freeiov;
1944		}
1945		err = -EFAULT;
1946		/*
1947		 * Careful! Before this, msg_sys.msg_control contains a user pointer.
1948		 * Afterwards, it will be a kernel pointer. Thus the compiler-assisted
1949		 * checking falls down on this.
1950		 */
1951		if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control,
1952				   ctl_len))
1953			goto out_freectl;
1954		msg_sys.msg_control = ctl_buf;
1955	}
1956	msg_sys.msg_flags = flags;
1957
1958	if (sock->file->f_flags & O_NONBLOCK)
1959		msg_sys.msg_flags |= MSG_DONTWAIT;
1960	err = sock_sendmsg(sock, &msg_sys, total_len);
1961
	/* Only heap buffers are freed; the on-stack defaults are skipped. */
1962out_freectl:
1963	if (ctl_buf != ctl)
1964		sock_kfree_s(sock->sk, ctl_buf, ctl_len);
1965out_freeiov:
1966	if (iov != iovstack)
1967		sock_kfree_s(sock->sk, iov, iov_size);
1968out_put:
1969	fput_light(sock->file, fput_needed);
1970out:
1971	return err;
1972}
1973
1974static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
1975			 struct msghdr *msg_sys, unsigned flags, int nosec)
1976{
1977	struct compat_msghdr __user *msg_compat =
1978	    (struct compat_msghdr __user *)msg;
1979	struct iovec iovstack[UIO_FASTIOV];
1980	struct iovec *iov = iovstack;
1981	unsigned long cmsg_ptr;
1982	int err, iov_size, total_len, len;
1983
1984	/* kernel mode address */
1985	struct sockaddr_storage addr;
1986
1987	/* user mode address pointers */
1988	struct sockaddr __user *uaddr;
1989	int __user *uaddr_len;
1990
1991	if (MSG_CMSG_COMPAT & flags) {
1992		if (get_compat_msghdr(msg_sys, msg_compat))
1993			return -EFAULT;
1994	}
1995	else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
1996		return -EFAULT;
1997
1998	err = -EMSGSIZE;
1999	if (msg_sys->msg_iovlen > UIO_MAXIOV)
2000		goto out;
2001
2002	/* Check whether to allocate the iovec area */
2003	err = -ENOMEM;
2004	iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
2005	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
2006		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
2007		if (!iov)
2008			goto out;
2009	}
2010
2011	/*
2012	 *      Save the user-mode address (verify_iovec will change the
2013	 *      kernel msghdr to use the kernel address space)
2014	 */
2015
2016	uaddr = (__force void __user *)msg_sys->msg_name;
2017	uaddr_len = COMPAT_NAMELEN(msg);
2018	if (MSG_CMSG_COMPAT & flags) {
2019		err = verify_compat_iovec(msg_sys, iov,
2020					  (struct sockaddr *)&addr,
2021					  VERIFY_WRITE);
2022	} else
2023		err = verify_iovec(msg_sys, iov,
2024				   (struct sockaddr *)&addr,
2025				   VERIFY_WRITE);
2026	if (err < 0)
2027		goto out_freeiov;
2028	total_len = err;
2029
2030	cmsg_ptr = (unsigned long)msg_sys->msg_control;
2031	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
2032
2033	if (sock->file->f_flags & O_NONBLOCK)
2034		flags |= MSG_DONTWAIT;
2035	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sy

Large files files are truncated, but you can click here to view the full file