
/kern/uipc_socket.c

https://bitbucket.org/brucec/sctpdrv


   1/*-
   2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
   3 *	The Regents of the University of California.
   4 * Copyright (c) 2004 The FreeBSD Foundation
   5 * Copyright (c) 2004-2008 Robert N. M. Watson
   6 * All rights reserved.
   7 *
   8 * Redistribution and use in source and binary forms, with or without
   9 * modification, are permitted provided that the following conditions
  10 * are met:
  11 * 1. Redistributions of source code must retain the above copyright
  12 *    notice, this list of conditions and the following disclaimer.
  13 * 2. Redistributions in binary form must reproduce the above copyright
  14 *    notice, this list of conditions and the following disclaimer in the
  15 *    documentation and/or other materials provided with the distribution.
  16 * 4. Neither the name of the University nor the names of its contributors
  17 *    may be used to endorse or promote products derived from this software
  18 *    without specific prior written permission.
  19 *
  20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30 * SUCH DAMAGE.
  31 *
  32 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
  33 */
  34
  35/*
  36 * Comments on the socket life cycle:
  37 *
   38 * soalloc() sets up socket layer state for a socket, called only by
  39 * socreate() and sonewconn().  Socket layer private.
  40 *
  41 * sodealloc() tears down socket layer state for a socket, called only by
  42 * sofree() and sonewconn().  Socket layer private.
  43 *
  44 * pru_attach() associates protocol layer state with an allocated socket;
  45 * called only once, may fail, aborting socket allocation.  This is called
  46 * from socreate() and sonewconn().  Socket layer private.
  47 *
  48 * pru_detach() disassociates protocol layer state from an attached socket,
  49 * and will be called exactly once for sockets in which pru_attach() has
  50 * been successfully called.  If pru_attach() returned an error,
  51 * pru_detach() will not be called.  Socket layer private.
  52 *
  53 * pru_abort() and pru_close() notify the protocol layer that the last
  54 * consumer of a socket is starting to tear down the socket, and that the
  55 * protocol should terminate the connection.  Historically, pru_abort() also
  56 * detached protocol state from the socket state, but this is no longer the
  57 * case.
  58 *
  59 * socreate() creates a socket and attaches protocol state.  This is a public
  60 * interface that may be used by socket layer consumers to create new
  61 * sockets.
  62 *
  63 * sonewconn() creates a socket and attaches protocol state.  This is a
  64 * public interface  that may be used by protocols to create new sockets when
  65 * a new connection is received and will be available for accept() on a
  66 * listen socket.
  67 *
  68 * soclose() destroys a socket after possibly waiting for it to disconnect.
  69 * This is a public interface that socket consumers should use to close and
  70 * release a socket when done with it.
  71 *
  72 * soabort() destroys a socket without waiting for it to disconnect (used
  73 * only for incoming connections that are already partially or fully
  74 * connected).  This is used internally by the socket layer when clearing
  75 * listen socket queues (due to overflow or close on the listen socket), but
  76 * is also a public interface protocols may use to abort connections in
  77 * their incomplete listen queues should they no longer be required.  Sockets
  78 * placed in completed connection listen queues should not be aborted for
  79 * reasons described in the comment above the soclose() implementation.  This
  80 * is not a general purpose close routine, and except in the specific
  81 * circumstances described here, should not be used.
  82 *
  83 * sofree() will free a socket and its protocol state if all references on
  84 * the socket have been released, and is the public interface to attempt to
  85 * free a socket when a reference is removed.  This is a socket layer private
  86 * interface.
  87 *
  88 * NOTE: In addition to socreate() and soclose(), which provide a single
  89 * socket reference to the consumer to be managed as required, there are two
  90 * calls to explicitly manage socket references, soref(), and sorele().
  91 * Currently, these are generally required only when transitioning a socket
  92 * from a listen queue to a file descriptor, in order to prevent garbage
  93 * collection of the socket at an untimely moment.  For a number of reasons,
  94 * these interfaces are not preferred, and should be avoided.
  95 */
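     /*
      * Illustrative sketch (not part of the original file): a minimal consumer
      * sequence using the public interfaces described above.  The address
      * setup is hypothetical and error handling is omitted.
      *
      *	struct socket *so;
      *	int error;
      *
      *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_SCTP, NULL, NULL);
      *	if (error == 0) {
      *		error = sobind(so, (struct sockaddr *)&addr, NULL);
      *		...
      *		soclose(so);	// drops the single reference from socreate()
      *	}
      */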
  96
  97#include <sys/cdefs.h>
  98__FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.317 2008/10/01 19:14:05 jhb Exp $");
  99
 100#include <ntifs.h>
 101#include <sys/param.h>
 102#include <sys/systm.h>
 103#include <sys/malloc.h>
 104#include <sys/lock.h>
 105#include <sys/spinlock.h>
 106#include <sys/mbuf.h>
 107#include <sys/domain.h>
 108#include <sys/poll.h>
 109#include <sys/protosw.h>
 110#include <sys/socket.h>
 111#include <sys/socketvar.h>
 112#include <sys/sysctl.h>
 113#include <sys/uio.h>
 114
 115#include <netinet/sctp_os.h>
 116
 117
 118static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
 119		    int flags);
 120
 121#if 0
 122static void	filt_sordetach(struct knote *kn);
 123static int	filt_soread(struct knote *kn, long hint);
 124static void	filt_sowdetach(struct knote *kn);
 125static int	filt_sowrite(struct knote *kn, long hint);
 126static int	filt_solisten(struct knote *kn, long hint);
 127
 128static struct filterops solisten_filtops =
 129	{ 1, NULL, filt_sordetach, filt_solisten };
 130static struct filterops soread_filtops =
 131	{ 1, NULL, filt_sordetach, filt_soread };
 132static struct filterops sowrite_filtops =
 133	{ 1, NULL, filt_sowdetach, filt_sowrite };
 134#endif
 135
 136NPAGED_LOOKASIDE_LIST socket_zone;
 137so_gen_t	so_gencnt;	/* generation count for sockets */
 138
 139int	maxsockets;
 140
 141MALLOC_DEFINE(M_SONAME, 'km01', "soname", "socket name");
 142MALLOC_DEFINE(M_PCB, 'km02', "pcb", "protocol control block");
 143
 144static int somaxconn = SOMAXCONN;
 145#if 0
 146static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
  147/* XXX: we don't have SYSCTL_USHORT */
 148SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
 149    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
 150    "queue size");
 151#endif
 152static int numopensockets;
 153#if 0
 154SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
 155    &numopensockets, 0, "Number of open sockets");
 156#endif
 157#ifdef ZERO_COPY_SOCKETS
 158/* These aren't static because they're used in other files. */
 159int so_zero_copy_send = 1;
 160int so_zero_copy_receive = 1;
 161SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
 162    "Zero copy controls");
 163SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
 164    &so_zero_copy_receive, 0, "Enable zero copy receive");
 165SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
 166    &so_zero_copy_send, 0, "Enable zero copy send");
 167#endif /* ZERO_COPY_SOCKETS */
 168
 169/*
 170 * accept_mtx locks down per-socket fields relating to accept queues.  See
 171 * socketvar.h for an annotation of the protected fields of struct socket.
 172 */
 173struct spinlock accept_lock;
 174
 175
 176/*
 177 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 178 * so_gencnt field.
 179 */
 180struct spinlock so_global_lock;
 181
 182
 183#if 0
 184/*
 185 * General IPC sysctl name space, used by sockets and a variety of other IPC
 186 * types.
 187 */
 188SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
 189
 190/*
 191 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 192 * of the change so that they can update their dependent limits as required.
 193 */
 194static int
 195sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 196{
 197	int error, newmaxsockets;
 198
 199	newmaxsockets = maxsockets;
 200	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 201	if (error == 0 && req->newptr) {
 202		if (newmaxsockets > maxsockets) {
 203			maxsockets = newmaxsockets;
 204			if (maxsockets > ((maxfiles / 4) * 3)) {
 205				maxfiles = (maxsockets * 5) / 4;
 206				maxfilesperproc = (maxfiles * 9) / 10;
 207			}
 208			EVENTHANDLER_INVOKE(maxsockets_change);
 209		} else
 210			error = EINVAL;
 211	}
 212	return (error);
 213}
 214
 215SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
 216    &maxsockets, 0, sysctl_maxsockets, "IU",
 217    "Maximum number of sockets avaliable");
 218
 219/*
 220 * Initialise maxsockets.
 221 */
 222static void init_maxsockets(void *ignored)
 223{
 224	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 225	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
 226}
 227SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 228#endif
 229
 230/*
 231 * Socket operation routines.  These routines are called by the routines in
 232 * sys_socket.c or from a system process, and implement the semantics of
 233 * socket operations by switching out to the protocol specific routines.
 234 */
 235
 236/*
 237 * Get a socket structure from our zone, and initialize it.  Note that it
 238 * would probably be better to allocate socket and PCB at the same time, but
 239 * I'm not convinced that all the protocols can be easily modified to do
 240 * this.
 241 *
 242 * soalloc() returns a socket with a ref count of 0.
 243 */
 244static struct socket *
 245soalloc(void)
 246{
 247	struct socket *so;
 248
 249	so = (struct socket *)ExAllocateFromNPagedLookasideList(&socket_zone);
 250	if (so == NULL)
 251		return (NULL);
 252	RtlZeroMemory(so, sizeof(*so));
 253#ifdef MAC
 254	if (mac_socket_init(so, M_NOWAIT) != 0) {
  256		ExFreeToNPagedLookasideList(&socket_zone, so);
 256		return (NULL);
 257	}
 258#endif
 259	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
 260	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
 261	SOCKEVENT_LOCK_INIT(&so->so_event, "so_event");
 262
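     	/*
     	 * Initialize a cancel-safe IRP queue (IO_CSQ) on each socket buffer
     	 * so that pending asynchronous I/O requests can be completed or
     	 * cancelled safely.  Windows-specific; there is no BSD equivalent.
     	 */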
 263	InitializeListHead(&so->so_snd.sb_csq.irpList);
 264	spinlock_init(&so->so_snd.sb_csq.lock, "sb_csq", "sb_csq", 0);
 265	IoCsqInitialize((PIO_CSQ)&so->so_snd.sb_csq,
 266	    AioCsqInsertIrp, AioCsqRemoveIrp, AioCsqPeekNextIrp,
 267	    AioCsqAcquireLock, AioCsqReleaseLock, AioCsqCompleteCanceledIrp);
 268	InitializeListHead(&so->so_rcv.sb_csq.irpList);
 269	spinlock_init(&so->so_rcv.sb_csq.lock, "sb_csq", "sb_csq", 0);
 270	IoCsqInitialize((PIO_CSQ)&so->so_rcv.sb_csq,
 271	    AioCsqInsertIrp, AioCsqRemoveIrp, AioCsqPeekNextIrp,
 272	    AioCsqAcquireLock, AioCsqReleaseLock, AioCsqCompleteCanceledIrp);
 273
 274	spinlock_acquire(&so_global_lock);
 275	so->so_gencnt = ++so_gencnt;
 276	++numopensockets;
 277	spinlock_release(&so_global_lock);
 278	return (so);
 279}
 280
 281/*
 282 * Free the storage associated with a socket at the socket layer, tear down
 283 * locks, labels, etc.  All protocol state is assumed already to have been
 284 * torn down (and possibly never set up) by the caller.
 285 */
 286static void
 287sodealloc(struct socket *so)
 288{
 289	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 290	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 291
 292	spinlock_acquire(&so_global_lock);
 293	so->so_gencnt = ++so_gencnt;
 294	--numopensockets;	/* Could be below, but faster here. */
 295	spinlock_release(&so_global_lock);
 296#if 0
 297	if (so->so_rcv.sb_hiwat)
 298		(void)chgsbsize(so->so_cred->cr_uidinfo,
 299		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 300	if (so->so_snd.sb_hiwat)
 301		(void)chgsbsize(so->so_cred->cr_uidinfo,
 302		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 303#ifdef INET
  304	/* remove accept filter if one is present. */
 305	if (so->so_accf != NULL)
 306		do_setopt_accept_filter(so, NULL);
 307#endif
 308#endif
 309#ifdef MAC
 310	mac_socket_destroy(so);
 311#endif
 312	//crfree(so->so_cred);
 313	SOCKBUF_LOCK_DESTROY(&so->so_snd);
 314	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
 315
 316	SOCKEVENT_LOCK(&so->so_event);
 317	if (so->so_event.se_Event != NULL) {
 318		ObDereferenceObject(so->so_event.se_Event);
 319		so->so_event.se_Event = NULL;
 320	}
 321	SOCKEVENT_LOCK_DESTROY(&so->so_event);
 322
 323	ExFreeToNPagedLookasideList(&socket_zone, so);
 324}
 325
 326/*
 327 * socreate returns a socket with a ref count of 1.  The socket should be
 328 * closed with soclose().
 329 */
 330int
 331socreate(int dom, struct socket **aso, int type, int proto,
 332    struct ucred *cred, PKTHREAD td)
 333{
 334	struct protosw *prp;
 335	struct socket *so;
 336	int error;
 337
 338	if (proto)
 339		prp = pffindproto(dom, proto, type);
 340	else
 341		prp = pffindtype(dom, type);
 342
 343	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
 344	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
 345		return (EPROTONOSUPPORT);
 346
 347	if (prp->pr_type != type)
 348		return (EPROTOTYPE);
 349	so = soalloc();
 350	if (so == NULL)
 351		return (ENOBUFS);
 352
 353	TAILQ_INIT(&so->so_incomp);
 354	TAILQ_INIT(&so->so_comp);
 355	so->so_type = type;
 356#if 0
 357	so->so_cred = crhold(cred);
 358	if ((prp->pr_domain->dom_family == PF_INET) ||
 359	    (prp->pr_domain->dom_family == PF_ROUTE))
 360		so->so_fibnum = td->td_proc->p_fibnum;
 361	else
 362#endif
 363		so->so_fibnum = 0;
 364	so->so_proto = prp;
 365#ifdef MAC
 366	mac_create_socket(cred, so);
 367#endif
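     	/*
     	 * NT kernel event objects used by this port for blocking waits,
     	 * select-style notification, and socket buffer lock arbitration
     	 * (in place of the BSD sleep/wakeup machinery).
     	 */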
 368	KeInitializeEvent(&so->so_waitEvent, NotificationEvent, FALSE);
 369	KeInitializeEvent(&so->so_waitSyncEvent, SynchronizationEvent, FALSE);
 370	KeInitializeEvent(&so->so_rcv.sb_waitEvent, NotificationEvent, FALSE);
 371	KeInitializeEvent(&so->so_snd.sb_waitEvent, NotificationEvent, FALSE);
 372	KeInitializeEvent(&so->so_rcv.sb_selEvent, SynchronizationEvent, FALSE);
 373	KeInitializeEvent(&so->so_snd.sb_selEvent, SynchronizationEvent, FALSE);
 374	KeInitializeEvent(&so->so_rcv.sb_lockEvent, SynchronizationEvent, FALSE);
 375	KeInitializeEvent(&so->so_snd.sb_lockEvent, SynchronizationEvent, FALSE);
 376	so->so_count = 1;
 377	/*
 378	 * Auto-sizing of socket buffers is managed by the protocols and
 379	 * the appropriate flags must be set in the pru_attach function.
 380	 */
 381	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 382	if (error) {
 383		KASSERT(so->so_count == 1, ("socreate: so_count %d",
 384		    so->so_count));
 385		so->so_count = 0;
 386		sodealloc(so);
 387		return (error);
 388	}
 389	*aso = so;
 390	return (0);
 391}
 392
 393#ifdef REGRESSION
 394static int regression_sonewconn_earlytest = 1;
 395SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
 396    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
 397#endif
 398
 399/*
 400 * When an attempt at a new connection is noted on a socket which accepts
 401 * connections, sonewconn is called.  If the connection is possible (subject
  402 * to space constraints, etc.) then we allocate a new structure, properly
 403 * linked into the data structure of the original socket, and return this.
 404 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 405 *
 406 * Note: the ref count on the socket is 0 on return.
 407 */
 408struct socket *
 409sonewconn(struct socket *head, int connstatus)
 410{
 411	struct socket *so;
 412	int over;
 413
 414	ACCEPT_LOCK();
 415	over = (head->so_qlen > 3 * head->so_qlimit / 2);
 416	ACCEPT_UNLOCK();
 417#ifdef REGRESSION
 418	if (regression_sonewconn_earlytest && over)
 419#else
 420	if (over)
 421#endif
 422		return (NULL);
 423	so = soalloc();
 424	if (so == NULL)
 425		return (NULL);
 426	if ((head->so_options & SO_ACCEPTFILTER) != 0)
 427		connstatus = 0;
 428	so->so_head = head;
 429	so->so_type = head->so_type;
 430	so->so_options = head->so_options &~ SO_ACCEPTCONN;
 431	so->so_linger = head->so_linger;
 432	so->so_state = head->so_state | SS_NOFDREF;
 433	so->so_proto = head->so_proto;
 434	//so->so_cred = crhold(head->so_cred);
 435#ifdef MAC
 436	SOCK_LOCK(head);
 437	mac_socket_newconn(head, so);
 438	SOCK_UNLOCK(head);
 439#endif
 440	KeInitializeEvent(&so->so_waitEvent, NotificationEvent, FALSE);
 441	KeInitializeEvent(&so->so_waitSyncEvent, SynchronizationEvent, FALSE);
 442	KeInitializeEvent(&so->so_rcv.sb_waitEvent, NotificationEvent, FALSE);
 443	KeInitializeEvent(&so->so_snd.sb_waitEvent, NotificationEvent, FALSE);
 444	KeInitializeEvent(&so->so_rcv.sb_selEvent, SynchronizationEvent, FALSE);
 445	KeInitializeEvent(&so->so_snd.sb_selEvent, SynchronizationEvent, FALSE);
 446	KeInitializeEvent(&so->so_rcv.sb_lockEvent, SynchronizationEvent, FALSE);
 447	KeInitializeEvent(&so->so_snd.sb_lockEvent, SynchronizationEvent, FALSE);
 448	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
 449	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
 450		sodealloc(so);
 451		return (NULL);
 452	}
 453	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 454	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 455	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 456	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
 457	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 458	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 459	so->so_state |= connstatus;
 460	SOCKEVENT_LOCK(&head->so_event);
 461	if (head->so_event.se_Event != NULL) {
 462		ObReferenceObject(head->so_event.se_Event);
 463		so->so_event.se_Event = head->so_event.se_Event;
 464	}
 465	SOCKEVENT_UNLOCK(&head->so_event);
 466	ACCEPT_LOCK();
 467	if (connstatus) {
 468		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
 469		so->so_qstate |= SQ_COMP;
 470		head->so_qlen++;
 471	} else {
 472		/*
 473		 * Keep removing sockets from the head until there's room for
 474		 * us to insert on the tail.  In pre-locking revisions, this
 475		 * was a simple if(), but as we could be racing with other
 476		 * threads and soabort() requires dropping locks, we must
 477		 * loop waiting for the condition to be true.
 478		 */
 479		while (head->so_incqlen > head->so_qlimit) {
 480			struct socket *sp;
 481			sp = TAILQ_FIRST(&head->so_incomp);
 482			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
 483			head->so_incqlen--;
 484			sp->so_qstate &= ~SQ_INCOMP;
 485			sp->so_head = NULL;
 486			ACCEPT_UNLOCK();
 487			soabort(sp);
 488			ACCEPT_LOCK();
 489		}
 490		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
 491		so->so_qstate |= SQ_INCOMP;
 492		head->so_incqlen++;
 493	}
 494	ACCEPT_UNLOCK();
 495	if (connstatus) {
 496		SOCKEVENT_LOCK(&head->so_event);
 497		if (head->so_event.se_Event != NULL && (head->so_event.se_Events & FD_ACCEPT) != 0) {
 498			head->so_event.se_EventsRet.lNetworkEvents |= FD_ACCEPT;
 499			KeSetEvent(head->so_event.se_Event, 0, FALSE);
 500		}
 501		SOCKEVENT_UNLOCK(&head->so_event);
 502		sorwakeup(head);
 503		KeSetEvent(&head->so_waitSyncEvent, 0, FALSE);
 504	}
 505	return (so);
 506}
 507
 508int
 509sobind(struct socket *so, struct sockaddr *nam, PKTHREAD td)
 510{
 511
 512	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
 513}
 514
 515/*
 516 * solisten() transitions a socket from a non-listening state to a listening
 517 * state, but can also be used to update the listen queue depth on an
 518 * existing listen socket.  The protocol will call back into the sockets
 519 * layer using solisten_proto_check() and solisten_proto() to check and set
 520 * socket-layer listen state.  Call backs are used so that the protocol can
 521 * acquire both protocol and socket layer locks in whatever order is required
 522 * by the protocol.
 523 *
 524 * Protocol implementors are advised to hold the socket lock across the
 525 * socket-layer test and set to avoid races at the socket layer.
 526 */
 527int
 528solisten(struct socket *so, int backlog, PKTHREAD td)
 529{
 530
 531	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
 532}
 533
 534int
 535solisten_proto_check(struct socket *so)
 536{
 537
 538	SOCK_LOCK_ASSERT(so);
 539
 540	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 541	    SS_ISDISCONNECTING))
 542		return (EINVAL);
 543	return (0);
 544}
 545
 546void
 547solisten_proto(struct socket *so, int backlog)
 548{
 549
 550	SOCK_LOCK_ASSERT(so);
 551
 552	if (backlog < 0 || backlog > somaxconn)
 553		backlog = somaxconn;
 554	so->so_qlimit = backlog;
 555	so->so_options |= SO_ACCEPTCONN;
 556}
 557
 558/*
 559 * Attempt to free a socket.  This should really be sotryfree().
 560 *
 561 * sofree() will succeed if:
 562 *
 563 * - There are no outstanding file descriptor references or related consumers
 564 *   (so_count == 0).
 565 *
 566 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 567 *
 568 * - The protocol does not have an outstanding strong reference on the socket
 569 *   (SS_PROTOREF).
 570 *
 571 * - The socket is not in a completed connection queue, so a process has been
 572 *   notified that it is present.  If it is removed, the user process may
 573 *   block in accept() despite select() saying the socket was ready.
 574 *
 575 * Otherwise, it will quietly abort so that a future call to sofree(), when
 576 * conditions are right, can succeed.
 577 */
 578void
 579sofree(struct socket *so)
 580{
 581	struct socket *head;
 582
 583	ACCEPT_LOCK_ASSERT();
 584	SOCK_LOCK_ASSERT(so);
 585
 586	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
 587	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
 588		SOCK_UNLOCK(so);
 589		ACCEPT_UNLOCK();
 590		return;
 591	}
 592
 593	head = so->so_head;
 594	if (head != NULL) {
 595		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
 596		    (so->so_qstate & SQ_INCOMP) != 0,
 597		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
 598		    "SQ_INCOMP"));
 599		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
 600		    (so->so_qstate & SQ_INCOMP) == 0,
 601		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
 602		TAILQ_REMOVE(&head->so_incomp, so, so_list);
 603		head->so_incqlen--;
 604		so->so_qstate &= ~SQ_INCOMP;
 605		so->so_head = NULL;
 606	}
 607	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
 608	    (so->so_qstate & SQ_INCOMP) == 0,
 609	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
 610	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
 611	if (so->so_options & SO_ACCEPTCONN) {
 612		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
 613		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_comp populated"));
 614	}
 615	SOCK_UNLOCK(so);
 616	ACCEPT_UNLOCK();
 617
 618#if 0
 619	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
 620		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
 621#endif
 622	if (so->so_proto->pr_usrreqs->pru_detach != NULL)
 623		(*so->so_proto->pr_usrreqs->pru_detach)(so);
 624
 625	/*
 626	 * From this point on, we assume that no other references to this
 627	 * socket exist anywhere else in the stack.  Therefore, no locks need
 628	 * to be acquired or held.
 629	 *
 630	 * We used to do a lot of socket buffer and socket locking here, as
 631	 * well as invoke sorflush() and perform wakeups.  The direct call to
 632	 * dom_dispose() and sbrelease_internal() are an inlining of what was
 633	 * necessary from sorflush().
 634	 *
 635	 * Notice that the socket buffer and kqueue state are torn down
  636	 * before calling pru_detach.  This means that protocols should not
 637	 * assume they can perform socket wakeups, etc, in their detach code.
 638	 */
 639	sbdestroy(&so->so_snd, so);
 640	sbdestroy(&so->so_rcv, so);
 641	sodealloc(so);
 642}
 643
 644/*
 645 * Close a socket on last file table reference removal.  Initiate disconnect
 646 * if connected.  Free socket when disconnect complete.
 647 *
 648 * This function will sorele() the socket.  Note that soclose() may be called
 649 * prior to the ref count reaching zero.  The actual socket structure will
 650 * not be freed until the ref count reaches zero.
 651 */
 652int
 653soclose(struct socket *so)
 654{
 655	int error = 0;
 656	NTSTATUS status = STATUS_SUCCESS;
 657	LARGE_INTEGER timeout;
 658	KIRQL oldIrql;
 659
 660	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
 661
 662	//funsetown(&so->so_sigio);
 663	if (so->so_state & SS_ISCONNECTED) {
 664		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
 665			error = sodisconnect(so);
 666			if (error)
 667				goto drop;
 668		}
 669		if (so->so_options & SO_LINGER) {
 670			if ((so->so_state & SS_ISDISCONNECTING) &&
 671			    (so->so_state & SS_NBIO))
 672				goto drop;
 673
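     			/*
     			 * Relative timeout in 100-nanosecond units: wait up to
     			 * so_linger seconds (a negative QuadPart is a relative
     			 * time to KeWaitForSingleObject()).
     			 */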
 674			timeout.QuadPart = -10000000 * so->so_linger;
 675			SOCK_LOCK(so);
 676			if (so->so_state & SS_ISCONNECTED) {
 677				SOCK_UNLOCK(so);
 678				KeClearEvent(&so->so_waitEvent);
 679				status = KeWaitForSingleObject(&so->so_waitEvent, UserRequest,
 680				    UserMode, FALSE, so->so_linger > 0 ? &timeout : NULL);
 681			} else {
 682				SOCK_UNLOCK(so);
 683			}
 684		}
 685	}
 686
 687drop:
 688	if (so->so_proto->pr_usrreqs->pru_close != NULL)
 689		(*so->so_proto->pr_usrreqs->pru_close)(so);
 690	if (so->so_options & SO_ACCEPTCONN) {
 691		struct socket *sp;
 692		ACCEPT_LOCK();
 693		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
 694			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
 695			so->so_incqlen--;
 696			sp->so_qstate &= ~SQ_INCOMP;
 697			sp->so_head = NULL;
 698			ACCEPT_UNLOCK();
 699			soabort(sp);
 700			ACCEPT_LOCK();
 701		}
 702		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
 703			TAILQ_REMOVE(&so->so_comp, sp, so_list);
 704			so->so_qlen--;
 705			sp->so_qstate &= ~SQ_COMP;
 706			sp->so_head = NULL;
 707			ACCEPT_UNLOCK();
 708			soabort(sp);
 709			ACCEPT_LOCK();
 710		}
 711		ACCEPT_UNLOCK();
 712	} else {
 713		SOCKEVENT_LOCK(&so->so_event);
 714		if (so->so_event.se_Event != NULL && (so->so_event.se_Events & FD_CLOSE) != 0) {
 715			so->so_event.se_EventsRet.lNetworkEvents |= FD_CLOSE;
 716			KeSetEvent(so->so_event.se_Event, 0, FALSE);
 717		}
 718		SOCKEVENT_UNLOCK(&so->so_event);
 719	}
 720	ACCEPT_LOCK();
 721	SOCK_LOCK(so);
 722	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
 723	so->so_state |= SS_NOFDREF;
 724	sorele(so);
 725	return (error);
 726}
 727
 728/*
 729 * soabort() is used to abruptly tear down a connection, such as when a
 730 * resource limit is reached (listen queue depth exceeded), or if a listen
 731 * socket is closed while there are sockets waiting to be accepted.
 732 *
 733 * This interface is tricky, because it is called on an unreferenced socket,
 734 * and must be called only by a thread that has actually removed the socket
 735 * from the listen queue it was on, or races with other threads are risked.
 736 *
 737 * This interface will call into the protocol code, so must not be called
 738 * with any socket locks held.  Protocols do call it while holding their own
 739 * recursible protocol mutexes, but this is something that should be subject
 740 * to review in the future.
 741 */
 742void
 743soabort(struct socket *so)
 744{
 745
 746	/*
 747	 * In as much as is possible, assert that no references to this
 748	 * socket are held.  This is not quite the same as asserting that the
 749	 * current thread is responsible for arranging for no references, but
 750	 * is as close as we can get for now.
 751	 */
 752	KASSERT(so->so_count == 0, ("soabort: so_count"));
 753	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
 754	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
  755	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
  756	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
 757
 758	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
 759		(*so->so_proto->pr_usrreqs->pru_abort)(so);
 760	ACCEPT_LOCK();
 761	SOCK_LOCK(so);
 762	sofree(so);
 763}
 764
 765int
 766soaccept(struct socket *so, struct sockaddr **nam)
 767{
 768	int error;
 769
 770	SOCK_LOCK(so);
 771	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
 772	so->so_state &= ~SS_NOFDREF;
 773	SOCK_UNLOCK(so);
 774	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
 775	return (error);
 776}
 777
 778int
 779soconnect(struct socket *so, struct sockaddr *nam, PKTHREAD td)
 780{
 781	int error;
 782
 783	if (so->so_options & SO_ACCEPTCONN)
 784		return (EOPNOTSUPP);
 785	/*
 786	 * If protocol is connection-based, can only connect once.
 787	 * Otherwise, if connected, try to disconnect first.  This allows
 788	 * user to disconnect by connecting to, e.g., a null address.
 789	 */
 790	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
 791	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
 792	    (error = sodisconnect(so)))) {
 793		error = EISCONN;
 794	} else {
 795		/*
 796		 * Prevent accumulated error from previous connection from
 797		 * biting us.
 798		 */
 799		so->so_error = 0;
 800		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
 801	}
 802
 803	return (error);
 804}
 805
 806int
 807soconnect2(struct socket *so1, struct socket *so2)
 808{
 809
 810	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
 811}
 812
 813int
 814sodisconnect(struct socket *so)
 815{
 816	int error;
 817
 818	if ((so->so_state & SS_ISCONNECTED) == 0)
 819		return (ENOTCONN);
 820	if (so->so_state & SS_ISDISCONNECTING)
 821		return (EALREADY);
 822	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
 823	return (error);
 824}
 825
 826#ifdef ZERO_COPY_SOCKETS
 827struct so_zerocopy_stats{
 828	int size_ok;
 829	int align_ok;
 830	int found_ifp;
 831};
 832struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
 833#include <netinet/in.h>
 834#include <net/route.h>
 835#include <netinet/in_pcb.h>
 836#include <vm/vm.h>
 837#include <vm/vm_page.h>
 838#include <vm/vm_object.h>
 839
 840/*
 841 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
 842 * sosend_dgram() and sosend_generic() use m_uiotombuf().
 843 * 
 844 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 845 * all of the data referenced by the uio.  If desired, it uses zero-copy.
 846 * *space will be updated to reflect data copied in.
 847 *
 848 * NB: If atomic I/O is requested, the caller must already have checked that
 849 * space can hold resid bytes.
 850 *
 851 * NB: In the event of an error, the caller may need to free the partial
 852 * chain pointed to by *mpp.  The contents of both *uio and *space may be
 853 * modified even in the case of an error.
 854 */
 855static int
 856sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
 857    int flags)
 858{
 859	struct mbuf *m, **mp, *top;
 860	long len, resid;
 861	int error;
 862#ifdef ZERO_COPY_SOCKETS
 863	int cow_send;
 864#endif
 865
 866	*retmp = top = NULL;
 867	mp = &top;
 868	len = 0;
 869	resid = uio->uio_resid;
 870	error = 0;
 871	do {
 872#ifdef ZERO_COPY_SOCKETS
 873		cow_send = 0;
 874#endif /* ZERO_COPY_SOCKETS */
 875		if (resid >= MINCLSIZE) {
 876#ifdef ZERO_COPY_SOCKETS
 877			if (top == NULL) {
 878				m = m_gethdr(M_WAITOK, MT_DATA);
 879				m->m_pkthdr.len = 0;
 880				m->m_pkthdr.rcvif = NULL;
 881			} else
 882				m = m_get(M_WAITOK, MT_DATA);
 883			if (so_zero_copy_send &&
 884			    resid>=PAGE_SIZE &&
 885			    *space>=PAGE_SIZE &&
 886			    uio->uio_iov->iov_len>=PAGE_SIZE) {
 887				so_zerocp_stats.size_ok++;
 888				so_zerocp_stats.align_ok++;
 889				cow_send = socow_setup(m, uio);
 890				len = cow_send;
 891			}
 892			if (!cow_send) {
 893				m_clget(m, M_WAITOK);
 894				len = min(min(MCLBYTES, resid), *space);
 895			}
 896#else /* ZERO_COPY_SOCKETS */
 897			if (top == NULL) {
 898				m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
 899				m->m_pkthdr.len = 0;
 900				m->m_pkthdr.rcvif = NULL;
 901			} else
 902				m = m_getcl(M_WAIT, MT_DATA, 0);
 903			len = min(min(MCLBYTES, resid), *space);
 904#endif /* ZERO_COPY_SOCKETS */
 905		} else {
 906			if (top == NULL) {
 907				m = m_gethdr(M_WAIT, MT_DATA);
 908				m->m_pkthdr.len = 0;
 909				m->m_pkthdr.rcvif = NULL;
 910
 911				len = min(min(MHLEN, resid), *space);
 912				/*
 913				 * For datagram protocols, leave room
 914				 * for protocol headers in first mbuf.
 915				 */
 916				if (atomic && m && len < MHLEN)
 917					MH_ALIGN(m, len);
 918			} else {
 919				m = m_get(M_WAIT, MT_DATA);
 920				len = min(min(MLEN, resid), *space);
 921			}
 922		}
 923		if (m == NULL) {
 924			error = ENOBUFS;
 925			goto out;
 926		}
 927
 928		*space -= len;
 929#ifdef ZERO_COPY_SOCKETS
 930		if (cow_send)
 931			error = 0;
 932		else
 933#endif /* ZERO_COPY_SOCKETS */
 934		error = uiomove(mtod(m, void *), (int)len, uio);
 935		resid = uio->uio_resid;
 936		m->m_len = len;
 937		*mp = m;
 938		top->m_pkthdr.len += len;
 939		if (error)
 940			goto out;
 941		mp = &m->m_next;
 942		if (resid <= 0) {
 943			if (flags & MSG_EOR)
 944				top->m_flags |= M_EOR;
 945			break;
 946		}
 947	} while (*space > 0 && atomic);
 948out:
 949	*retmp = top;
 950	return (error);
 951}
 952#endif /*ZERO_COPY_SOCKETS*/
 953
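     /* Map MSG_DONTWAIT to a non-blocking sblock() acquisition. */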
 954#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
 955
 956#if 0
 957int
 958sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 959    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 960{
 961	long space, resid;
 962	int clen = 0, error, dontroute;
 963#ifdef ZERO_COPY_SOCKETS
 964	int atomic = sosendallatonce(so) || top;
 965#endif
 966
 967	KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
 968	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
 969	    ("sodgram_send: !PR_ATOMIC"));
 970
 971	if (uio != NULL)
 972		resid = uio->uio_resid;
 973	else
 974		resid = top->m_pkthdr.len;
 975	/*
 976	 * In theory resid should be unsigned.  However, space must be
 977	 * signed, as it might be less than 0 if we over-committed, and we
 978	 * must use a signed comparison of space and resid.  On the other
 979	 * hand, a negative resid causes us to loop sending 0-length
 980	 * segments to the protocol.
 981	 *
 982	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
 983	 * type sockets since that's an error.
 984	 */
 985	if (resid < 0) {
 986		error = EINVAL;
 987		goto out;
 988	}
 989
 990	dontroute =
 991	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
 992	if (td != NULL)
 993		td->td_ru.ru_msgsnd++;
 994	if (control != NULL)
 995		clen = control->m_len;
 996
 997	SOCKBUF_LOCK(&so->so_snd);
 998	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 999		SOCKBUF_UNLOCK(&so->so_snd);
1000		error = EPIPE;
1001		goto out;
1002	}
1003	if (so->so_error) {
1004		error = so->so_error;
1005		so->so_error = 0;
1006		SOCKBUF_UNLOCK(&so->so_snd);
1007		goto out;
1008	}
1009	if ((so->so_state & SS_ISCONNECTED) == 0) {
1010		/*
 1011		 * `sendto' and `sendmsg' are allowed on a connection-based
1012		 * socket if it supports implied connect.  Return ENOTCONN if
1013		 * not connected and no address is supplied.
1014		 */
1015		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1016		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1017			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1018			    !(resid == 0 && clen != 0)) {
1019				SOCKBUF_UNLOCK(&so->so_snd);
1020				error = ENOTCONN;
1021				goto out;
1022			}
1023		} else if (addr == NULL) {
1024			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1025				error = ENOTCONN;
1026			else
1027				error = EDESTADDRREQ;
1028			SOCKBUF_UNLOCK(&so->so_snd);
1029			goto out;
1030		}
1031	}
1032
1033	/*
1034	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
1035	 * problem and need fixing.
1036	 */
1037	space = sbspace(&so->so_snd);
1038	if (flags & MSG_OOB)
1039		space += 1024;
1040	space -= clen;
1041	SOCKBUF_UNLOCK(&so->so_snd);
1042	if (resid > space) {
1043		error = EMSGSIZE;
1044		goto out;
1045	}
1046	if (uio == NULL) {
1047		resid = 0;
1048		if (flags & MSG_EOR)
1049			top->m_flags |= M_EOR;
1050	} else {
1051#ifdef ZERO_COPY_SOCKETS
1052		error = sosend_copyin(uio, &top, atomic, &space, flags);
1053		if (error)
1054			goto out;
1055#else
1056		/*
1057		 * Copy the data from userland into a mbuf chain.
1058		 * If no data is to be copied in, a single empty mbuf
1059		 * is returned.
1060		 */
1061		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
1062		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
1063		if (top == NULL) {
1064			error = EFAULT;	/* only possible error */
1065			goto out;
1066		}
1067		space -= resid - uio->uio_resid;
1068#endif
1069		resid = uio->uio_resid;
1070	}
1071	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
1072	/*
1073	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
1074	 * than with.
1075	 */
1076	if (dontroute) {
1077		SOCK_LOCK(so);
1078		so->so_options |= SO_DONTROUTE;
1079		SOCK_UNLOCK(so);
1080	}
1081	/*
1082	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
 1083	 * of date.  We could have received a reset packet in an interrupt or
1084	 * maybe we slept while doing page faults in uiomove() etc.  We could
1085	 * probably recheck again inside the locking protection here, but
1086	 * there are probably other places that this also happens.  We must
1087	 * rethink this.
1088	 */
1089	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1090	    (flags & MSG_OOB) ? PRUS_OOB :
1091	/*
 1092	 * If the user set MSG_EOF, the protocol understands this flag, and there
 1093	 * is nothing left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1094	 */
1095	    ((flags & MSG_EOF) &&
1096	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1097	     (resid <= 0)) ?
1098		PRUS_EOF :
1099		/* If there is more to send set PRUS_MORETOCOME */
1100		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1101		top, addr, control, td);
1102	if (dontroute) {
1103		SOCK_LOCK(so);
1104		so->so_options &= ~SO_DONTROUTE;
1105		SOCK_UNLOCK(so);
1106	}
1107	clen = 0;
1108	control = NULL;
1109	top = NULL;
1110out:
1111	if (top != NULL)
1112		m_freem(top);
1113	if (control != NULL)
1114		m_freem(control);
1115	return (error);
1116}
1117
1118/*
1119 * Send on a socket.  If send must go all at once and message is larger than
1120 * send buffering, then hard error.  Lock against other senders.  If must go
1121 * all at once and not enough room now, then inform user that this would
1122 * block and do nothing.  Otherwise, if nonblocking, send as much as
1123 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1124 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1125 * in mbuf chain must be small enough to send all at once.
1126 *
1127 * Returns nonzero on error, timeout or signal; callers must check for short
1128 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1129 * on return.
1130 */
1131int
1132sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
1133    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1134{
1135	long space, resid;
1136	int clen = 0, error, dontroute;
1137	int atomic = sosendallatonce(so) || top;
1138
1139	if (uio != NULL)
1140		resid = uio->uio_resid;
1141	else
1142		resid = top->m_pkthdr.len;
1143	/*
1144	 * In theory resid should be unsigned.  However, space must be
1145	 * signed, as it might be less than 0 if we over-committed, and we
1146	 * must use a signed comparison of space and resid.  On the other
1147	 * hand, a negative resid causes us to loop sending 0-length
1148	 * segments to the protocol.
1149	 *
1150	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1151	 * type sockets since that's an error.
1152	 */
1153	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1154		error = EINVAL;
1155		goto out;
1156	}
1157
1158	dontroute =
1159	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1160	    (so->so_proto->pr_flags & PR_ATOMIC);
1161	if (td != NULL)
1162		td->td_ru.ru_msgsnd++;
1163	if (control != NULL)
1164		clen = control->m_len;
1165
1166	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1167	if (error)
1168		goto out;
1169
1170restart:
1171	do {
1172		SOCKBUF_LOCK(&so->so_snd);
1173		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1174			SOCKBUF_UNLOCK(&so->so_snd);
1175			error = EPIPE;
1176			goto release;
1177		}
1178		if (so->so_error) {
1179			error = so->so_error;
1180			so->so_error = 0;
1181			SOCKBUF_UNLOCK(&so->so_snd);
1182			goto release;
1183		}
1184		if ((so->so_state & SS_ISCONNECTED) == 0) {
1185			/*
 1186			 * `sendto' and `sendmsg' are allowed on a connection-
1187			 * based socket if it supports implied connect.
1188			 * Return ENOTCONN if not connected and no address is
1189			 * supplied.
1190			 */
1191			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
1192			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
1193				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1194				    !(resid == 0 && clen != 0)) {
1195					SOCKBUF_UNLOCK(&so->so_snd);
1196					error = ENOTCONN;
1197					goto release;
1198				}
1199			} else if (addr == NULL) {
1200				SOCKBUF_UNLOCK(&so->so_snd);
1201				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1202					error = ENOTCONN;
1203				else
1204					error = EDESTADDRREQ;
1205				goto release;
1206			}
1207		}
1208		space = sbspace(&so->so_snd);
1209		if (flags & MSG_OOB)
1210			space += 1024;
1211		if ((atomic && resid > so->so_snd.sb_hiwat) ||
1212		    clen > so->so_snd.sb_hiwat) {
1213			SOCKBUF_UNLOCK(&so->so_snd);
1214			error = EMSGSIZE;
1215			goto release;
1216		}
1217		if (space < resid + clen &&
1218		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
1219			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
1220				SOCKBUF_UNLOCK(&so->so_snd);
1221				error = EWOULDBLOCK;
1222				goto release;
1223			}
1224			error = sbwait(&so->so_snd);
1225			SOCKBUF_UNLOCK(&so->so_snd);
1226			if (error)
1227				goto release;
1228			goto restart;
1229		}
1230		SOCKBUF_UNLOCK(&so->so_snd);
1231		space -= clen;
1232		do {
1233			if (uio == NULL) {
1234				resid = 0;
1235				if (flags & MSG_EOR)
1236					top->m_flags |= M_EOR;
1237			} else {
1238#ifdef ZERO_COPY_SOCKETS
1239				error = sosend_copyin(uio, &top, atomic,
1240				    &space, flags);
1241				if (error != 0)
1242					goto release;
1243#else
1244				/*
1245				 * Copy the data from userland into a mbuf
1246				 * chain.  If no data is to be copied in,
1247				 * a single empty mbuf is returned.
1248				 */
1249				top = m_uiotombuf(uio, M_WAITOK, space,
1250				    (atomic ? max_hdr : 0),
1251				    (atomic ? M_PKTHDR : 0) |
1252				    ((flags & MSG_EOR) ? M_EOR : 0));
1253				if (top == NULL) {
1254					error = EFAULT; /* only possible error */
1255					goto release;
1256				}
1257				space -= resid - uio->uio_resid;
1258#endif
1259				resid = uio->uio_resid;
1260			}
1261			if (dontroute) {
1262				SOCK_LOCK(so);
1263				so->so_options |= SO_DONTROUTE;
1264				SOCK_UNLOCK(so);
1265			}
1266			/*
1267			 * XXX all the SBS_CANTSENDMORE checks previously
 1268			 * done could be out of date.  We could have received
1269			 * a reset packet in an interrupt or maybe we slept
1270			 * while doing page faults in uiomove() etc.  We
1271			 * could probably recheck again inside the locking
1272			 * protection here, but there are probably other
1273			 * places that this also happens.  We must rethink
1274			 * this.
1275			 */
1276			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1277			    (flags & MSG_OOB) ? PRUS_OOB :
1278			/*
 1279			 * If the user set MSG_EOF, the protocol understands
 1280			 * this flag, and there is nothing left to send, then
 1281			 * use PRU_SEND_EOF instead of PRU_SEND.
1282			 */
1283			    ((flags & MSG_EOF) &&
1284			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1285			     (resid <= 0)) ?
1286				PRUS_EOF :
1287			/* If there is more to send set PRUS_MORETOCOME. */
1288			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
1289			    top, addr, control, td);
1290			if (dontroute) {
1291				SOCK_LOCK(so);
1292				so->so_options &= ~SO_DONTROUTE;
1293				SOCK_UNLOCK(so);
1294			}
1295			clen = 0;
1296			control = NULL;
1297			top = NULL;
1298			if (error)
1299				goto release;
1300		} while (resid && space > 0);
1301	} while (resid);
1302
1303release:
1304	sbunlock(&so->so_snd);
1305out:
1306	if (top != NULL)
1307		m_freem(top);
1308	if (control != NULL)
1309		m_freem(control);
1310	return (error);
1311}
1312#endif
1313
1314int
1315sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1316    struct mbuf *top, struct mbuf *control, int flags, PKTHREAD td)
1317{
1318
1319	/* XXXRW: Temporary debugging. */
1320	KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
1321	    ("sosend: protocol calls sosend"));
1322
1323	return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
1324	    control, flags, td));
1325}
1326
1327/*
1328 * The part of soreceive() that implements reading non-inline out-of-band
1329 * data from a socket.  For more complete comments, see soreceive(), from
1330 * which this code originated.
1331 *
1332 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1333 * unable to return an mbuf chain to the caller.
1334 */
1335static int
1336soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1337{
1338	struct protosw *pr = so->so_proto;
1339	struct mbuf *m;
1340	int error;
1341
1342	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1343
1344	m = m_get(M_WAIT, MT_DATA);
1345	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1346	if (error)
1347		goto bad;
1348	do {
1349#ifdef ZERO_COPY_SOCKETS
1350		if (so_zero_copy_receive) {
1351			int disposable;
1352
1353			if ((m->m_flags & M_EXT)
1354			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
1355				disposable = 1;
1356			else
1357				disposable = 0;
1358
1359			error = uiomoveco(mtod(m, void *),
1360					  min(uio->uio_resid, m->m_len),
1361					  uio, disposable);
1362		} else
1363#endif /* ZERO_COPY_SOCKETS */
1364		error = uiomove(mtod(m, void *),
1365		    (int) min(uio->uio_resid, m->m_len), uio);
1366		m = m_free(m);
1367	} while (uio->uio_resid && error == 0 && m);
1368bad:
1369	if (m != NULL)
1370		m_freem(m);
1371	return (error);
1372}
1373
1374/*
1375 * Following replacement or removal of the first mbuf on the first mbuf chain
1376 * of a socket buffer, push necessary state changes back into the socket
1377 * buffer so that other consumers see the values consistently.  'nextrecord'
1378 * is the callers locally stored value of the original value of
1379 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1380 * NOTE: 'nextrecord' may be NULL.
1381 */
1382static __inline void
1383sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1384{
1385
1386	SOCKBUF_LOCK_ASSERT(sb);
1387	/*
1388	 * First, update for the new value of nextrecord.  If necessary, make
1389	 * it the first record.
1390	 */
1391	if (sb->sb_mb != NULL)
1392		sb->sb_mb->m_nextpkt = nextrecord;
1393	else
1394		sb->sb_mb = nextrecord;
1395
1396        /*
1397         * Now update any dependent socket buffer fields to reflect the new
1398         * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
1399	 * addition of a second clause that takes care of the case where
1400	 * sb_mb has been updated, but remains the last record.
1401         */
1402        if (sb->sb_mb == NULL) {
1403                sb->sb_mbtail = NULL;
1404                sb->sb_lastrecord = NULL;
1405        } else if (sb->sb_mb->m_nextpkt == NULL)
1406                sb->sb_lastrecord = sb->sb_mb;
1407}
1408
1409
1410/*
1411 * Implement receive operations on a socket.  We depend on the way that
1412 * records are added to the sockbuf by sbappend.  In particular, each record
1413 * (mbufs linked through m_next) must begin with an address if the protocol
1414 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1415 * data, and then zero or more mbufs of data.  In order to allow parallelism
1416 * between network receive and copying to user space, as well as avoid
1417 * sleeping with a mutex held, we release the socket buffer mutex during the
1418 * user space copy.  Although the sockbuf is locked, new data may still be
1419 * appended, and thus we must maintain consistency of the sockbuf during that
1420 * time.
1421 *
1422 * The caller may receive the data as a single mbuf chain by supplying an
1423 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
1424 * the count in uio_resid.
1425 */
1426int
1427soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1428    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1429{
1430	struct mbuf *m, **mp;
1431	int flags, len, error, offset;
1432	struct protosw *pr = so->so_proto;
1433	struct mbuf *nextrecord;
1434	int moff, type = 0;
1435	int orig_resid = uio->uio_resid;
1436
1437	mp = mp0;
1438	if (psa != NULL)
1439		*psa = NULL;
1440	if (controlp != NULL)
1441		*controlp = NULL;
1442	if (flagsp != NULL)
1443		flags = *flagsp &~ MSG_EOR;
1444	else
1445		flags = 0;
1446	if (flags & MSG_OOB)
1447		return (soreceive_rcvoob(so, uio, flags));
1448	if (mp != NULL)
1449		*mp = NULL;
1450	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1451	    && uio->uio_resid)
1452		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
1453
1454	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
1455	if (error)
1456		return (error);
1457
1458restart:
1459	SOCKBUF_LOCK(&so->so_rcv);
1460	m = so->so_rcv.sb_mb;
1461	/*
1462	 * If we have less data than requested, block awaiting more (subject
1463	 * to any timeout) if:
1464	 *   1. the current count is less than the low water mark, or
1465	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1466	 *	receive operation at once if we block (resid <= hiwat).
1467	 *   3. MSG_DONTWAIT is not set
1468	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1469	 * we have to do the receive in sections, and thus risk returning a
1470	 * short count if a timeout or signal occurs after we start.
1471	 */
1472	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1473	    so->so_rcv.sb_cc < uio->uio_resid) &&
1474	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1475	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1476	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1477		KASSERT(m != NULL || !so->so_rcv.sb_cc,
1478		    ("receive: m == %p so->so_rcv.sb_cc == %u",
1479		    m, so->so_rcv.sb_cc));
1480		if (so->so_error) {
1481			if (m != NULL)
1482				goto dontblock;
1483			error = so->so_error;
1484			if ((flags & MSG_PEEK) == 0)
1485				so->so_error = 0;
1486			SOCKBUF_UNLOCK(&so->so_rcv);
1487			goto release;
1488		}
1489		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1490		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1491			if (m == NULL) {
1492				SOCKBUF_UNLOCK(&so->so_rcv);
1493				goto release;
1494			} else
1495				goto dontblock;
1496		}
1497		for (; m != NULL; m = m->m_next)
1498			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1499				m = so->so_rcv.sb_mb;
1500				goto dontblock;
1501			}
1502		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1503		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1504			SOCKBUF_UNLOCK(&so->so_rcv);
1505			error = ENOTCONN;
1506			goto release;
1507		}
1508		if (uio->uio_resid == 0) {
1509			SOCKBUF_UNLOCK(&so->so_rcv);
1510			goto release;
1511		}
1512		if ((so->so_state & SS_NBIO) ||
1513		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1514			SOCKBUF_UNLOCK(&so->so_rcv);
1515			error = EWOULDBLOCK;
1516			goto release;
1517		}
1518		SBLASTRECORDCHK(&so->so_rcv);
1519		SBLASTMBUFCHK(&so->so_rcv);
1520		error = sbwait(&so->so_rcv);
1521		SOCKBUF_UNLOCK(&so->so_rcv);
1522		if (error)
1523			goto release;
1524		goto restart;
1525	}
1526dontblock:
1527	/*
1528	 * From this point onward, we maintain 'nextrecord' as a cache of the
1529	 * pointer to the next record in the socket buffer.  We must keep the
1530	 * various socket buffer pointers and local stack versions of the
1531	 * pointers in sync, pushing out modifications before dropping the
1532	 * socket buffer mutex, and re-reading them when picking it up.
1533	 *
1534	 * Otherwise, we will race with the network stack appending new data
1535	 * or records onto the socket buffer by using inconsistent/stale
1536	 * versions of the field, possibly resulting in socket buffer
1537	 * corruption.
1538	 *
1539	 * By holding the high-level sblock(), we prevent simultaneous
1540	 * readers from pulling off the front of the socket buffer.
1541	 */
1542	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1543#if 0
1544	if (uio->uio_td)
1545		uio->uio_td->td_ru.ru_msgrcv++;
1546#endif
1547	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
1548	SBLASTRECORDCHK(&so->so_rcv);
1549	SBLASTMBUFCHK(&so->so_rcv);
1550	nextrecord = m->m_nextpkt;
1551	if (pr->pr_flags & PR_ADDR) {
1552		KASSERT(m->m_type == MT_SONAME,
1553		    ("m->m_type == %d", m->m_type));
1554		orig_resid = 0;
1555		if (psa != NULL)
1556			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
1557			    M_NOWAIT);
1558		if (flags & MSG_PEEK) {
1559			m = m->m_next;
1560		} else {
1561			sbfree(&so->so_rcv, m);
1562			so->so_rcv.sb_mb = m_free(m);
1563			m = so->so_rcv.sb_mb;
1564			sockbuf_pushsync(&so->so_rcv, nextrecord);
1565		}
1566	}
1567
1568	/*
1569	 * Process one or more MT_CONTROL mbufs present before any data mbufs
1570	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1571	 * just copy the data; if !MSG_PEEK, we call into the protocol to
1572	 * perform externalization (or freeing if controlp == NULL).
1573	 */
1574	if (m != NULL && m->m_type == MT_CONTROL) {
1575		struct mbuf *cm = NULL, *cmn;
1576		struct mbuf **cme = &cm;
1577
1578		do {
1579			if (flags & MSG_PEEK) {
1580				if (controlp != NULL) {
1581					*controlp = m_copy(m, 0, m->m_len);
1582					controlp = &(*controlp)->m_next;
1583				}
1584				m = m->m_next;
1585			} else {
1586				sbfree(&so->so_rcv, m);
1587		

[Listing truncated; see the repository link above for the full file.]