/sys/kern/uipc_socket.c
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used to attempt to
 * free a socket when a reference is removed.  This is a socket layer private
 * interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref(), and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * NOTE: With regard to VNETs the general rule is that callers do not set
 * curvnet.  Exceptions to this rule include soabort(), sodisconnect(),
 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 * and sorflush(), which are usually called from a pre-set VNET context.
 * sopoll() currently does not need a VNET context to be set.
 */
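
/*
 * Illustrative sketch (hypothetical consumer, not code from this file): the
 * usual pairing of the public interfaces above is socreate() followed by the
 * other so*() calls and finally soclose(), roughly:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0, td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	error = sobind(so, sa, td);	// dispatches to the protocol's pru_bind
 *	...
 *	soclose(so);			// drops the reference from socreate()
 *
 * Here protocol 0 selects the domain's default protocol for the requested
 * type, and 'sa' and 'td' stand for a caller-supplied address and thread.
 */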

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: release/9.1.0/sys/kern/uipc_socket.c 233353 2012-03-23 11:26:54Z kib $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_zero.h"
#include "opt_compat.h"
#include "opt_passiveinet.h"
#include "opt_promiscinet.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <net/vnet.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif /* MAC */

#ifdef PROMISCUOUS_INET
#include <netinet/in_promisc.h>
#endif

#include <vm/uma.h>

#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/sysent.h>
#include <compat/freebsd32/freebsd32.h>
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_solisten,
};
static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};
static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};

uma_zone_t socket_zone;
so_gen_t so_gencnt;	/* generation count for sockets */

int	maxsockets;

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

static int somaxconn = SOMAXCONN;
static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
/* XXX: we don't have SYSCTL_USHORT */
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
    "queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 * of the change so that they can update their dependent limits as required.
 */
static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
	if (error == 0 && req->newptr) {
		if (newmaxsockets > maxsockets) {
			maxsockets = newmaxsockets;
			if (maxsockets > ((maxfiles / 4) * 3)) {
				maxfiles = (maxsockets * 5) / 4;
				maxfilesperproc = (maxfiles * 9) / 10;
			}
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}

SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");

/*
 * Initialise maxsockets.  This SYSINIT must be run after
 * tunable_mbinit().
 */
static void
init_maxsockets(void *ignored)
{

	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);

/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
static struct socket *
soalloc(struct vnet *vnet)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_socket_init(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
#ifdef PROMISCUOUS_INET
	if (in_promisc_socket_init(so, M_NOWAIT) != 0) {
#ifdef MAC
		mac_socket_destroy(so);
#endif
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif /* PROMISCUOUS_INET */
	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
	TAILQ_INIT(&so->so_aiojobq);
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
#ifdef VIMAGE
	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	vnet->vnet_sockcnt++;
	so->so_vnet = vnet;
#endif
	mtx_unlock(&so_global_mtx);
	return (so);
}

/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
#ifdef VIMAGE
	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet->vnet_sockcnt--;
#endif
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef PROMISCUOUS_INET
	in_promisc_socket_destroy(so);
#endif
#ifdef MAC
	mac_socket_destroy(so);
#endif
	crfree(so->so_cred);
	sx_destroy(&so->so_snd.sb_sx);
	sx_destroy(&so->so_rcv.sb_sx);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	uma_zfree(socket_zone, so);
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_INET6) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	so->so_count = 1;
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	CURVNET_SET(so->so_vnet);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	CURVNET_RESTORE();
	if (error) {
		KASSERT(so->so_count == 1, ("socreate: so_count %d",
		    so->so_count));
		so->so_count = 0;
		sodealloc(so);
		return (error);
	}
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called.  If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int over;

	ACCEPT_LOCK();
	over = (head->so_qlen > 3 * head->so_qlimit / 2);
	ACCEPT_UNLOCK();
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over)
#else
	if (over)
#endif
		return (NULL);
	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
	    __func__, __LINE__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL)
		return (NULL);
	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
#ifdef PROMISCUOUS_INET
	in_promisc_socket_newconn(head, so);
#endif
	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
	so->so_state |= connstatus;
	ACCEPT_LOCK();
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_qstate |= SQ_COMP;
		head->so_qlen++;
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->so_incqlen > head->so_qlimit) {
			struct socket *sp;
			sp = TAILQ_FIRST(&head->so_incomp);
			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
			head->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_qstate |= SQ_INCOMP;
		head->so_incqlen++;
	}
	ACCEPT_UNLOCK();
	if (connstatus) {
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	}
	return (so);
}

#ifdef PASSIVE_INET
/*
 * When a new connection is completed on a listening socket that is
 * configured for passive reassembly, sonewconn_passive_client is called to
 * create a socket representing the client side of the reassembled
 * connection.  We allocate a new structure, inherit configuration from the
 * listening socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTING.
 *
 * Note: the ref count on the socket is 1 on return.
 */
struct socket *
sonewconn_passive_client(struct socket *head, int connstatus)
{
	struct socket *so;

	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
	    __func__, __LINE__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL)
		return (NULL);
	so->so_head = NULL; /* just inheriting from head, not otherwise associating */
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
#ifdef PROMISCUOUS_INET
	in_promisc_socket_newconn(head, so);
#endif
	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
	so->so_state |= connstatus;

	return (so);
}
#endif /* PASSIVE_INET */

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
	CURVNET_RESTORE();
	return error;
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Call backs are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
	CURVNET_RESTORE();
	return error;
}

int
solisten_proto_check(struct socket *so)
{

	SOCK_LOCK_ASSERT(so);

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	return (0);
}

void
solisten_proto(struct socket *so, int backlog)
{

	SOCK_LOCK_ASSERT(so);

	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	so->so_options |= SO_ACCEPTCONN;
}
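
/*
 * Illustrative sketch (hypothetical, not a function in this file): a
 * protocol's pru_listen method typically uses the two callbacks above while
 * holding the socket lock, roughly:
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0) {
 *		// protocol-specific work to enter the listening state ...
 *		solisten_proto(so, backlog);
 *	}
 *	SOCK_UNLOCK(so);
 *
 * Real protocols also take their own locks (e.g. a PCB lock) around this
 * sequence, in whatever order the protocol requires.
 */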

/*
 * Evaluate the reference count and named references on a socket; if no
 * references remain, free it.  This should be called whenever a reference is
 * released, such as in sorele(), but also when named reference flags are
 * cleared in socket or protocol code.
 *
 * sofree() will free the socket if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, so a process has been
 *   notified that it is present.  If it is removed, the user process may
 *   block in accept() despite select() saying the socket was ready.
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	if (so->so_options & SO_ACCEPTCONN) {
		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
	}
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	VNET_SO_ASSERT(so);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct call to
	 * dom_dispose() and sbrelease_internal() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 */
	sbdestroy(&so->so_snd, so);
	sbdestroy(&so->so_rcv, so);
	seldrain(&so->so_snd.sb_sel);
	seldrain(&so->so_rcv.sb_sel);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
 */
int
soclose(struct socket *so)
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	CURVNET_RESTORE();
	return (error);
}

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on, or races with other threads are risked.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held.  Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 */
void
soabort(struct socket *so)
{

	/*
	 * In as much as is possible, assert that no references to this
	 * socket are held.  This is not quite the same as asserting that the
	 * current thread is responsible for arranging for no references, but
	 * is as close as we can get for now.
	 */
	KASSERT(so->so_count == 0, ("soabort: so_count"));
	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
	KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
	KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
	VNET_SO_ASSERT(so);

	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
		(*so->so_proto->pr_usrreqs->pru_abort)(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sofree(so);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	CURVNET_RESTORE();
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);

	CURVNET_SET(so->so_vnet);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection from
		 * biting us.
		 */
		so->so_error = 0;
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	}
	CURVNET_RESTORE();

	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	CURVNET_SET(so1->so_vnet);
	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
	CURVNET_RESTORE();
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}
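
/*
 * Illustrative sketch (hypothetical caller, not code from this file): since
 * soconnect() only initiates the connection, callers that need to wait for
 * establishment sleep on so_timeo, in the style of the kernel's connect(2)
 * path:
 *
 *	error = soconnect(so, sa, td);
 *	if (error == 0) {
 *		SOCK_LOCK(so);
 *		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0)
 *			error = msleep(&so->so_timeo, SOCK_MTX(so),
 *			    PSOCK | PCATCH, "connec", 0);
 *		if (error == 0) {
 *			error = so->so_error;
 *			so->so_error = 0;
 *		}
 *		SOCK_UNLOCK(so);
 *	}
 *
 * The error handling shown is simplified relative to the real connect(2)
 * path.
 */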

#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

/*
 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
 * sosend_dgram() and sosend_generic() use m_uiotombuf().
 *
 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 * all of the data referenced by the uio.  If desired, it uses zero-copy.
 * *space will be updated to reflect data copied in.
 *
 * NB: If atomic I/O is requested, the caller must already have checked that
 * space can hold resid bytes.
 *
 * NB: In the event of an error, the caller may need to free the partial
 * chain pointed to by *retmp.  The contents of both *uio and *space may be
 * modified even in the case of an error.
 */
static int
sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
    int flags)
{
	struct mbuf *m, **mp, *top;
	long len;
	ssize_t resid;
	int error;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif

	*retmp = top = NULL;
	mp = &top;
	len = 0;
	resid = uio->uio_resid;
	error = 0;
	do {
#ifdef ZERO_COPY_SOCKETS
		cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
		if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
			if (top == NULL) {
				m = m_gethdr(M_WAITOK, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_get(M_WAITOK, MT_DATA);
			if (so_zero_copy_send &&
			    resid >= PAGE_SIZE &&
			    *space >= PAGE_SIZE &&
			    uio->uio_iov->iov_len >= PAGE_SIZE) {
				so_zerocp_stats.size_ok++;
				so_zerocp_stats.align_ok++;
				cow_send = socow_setup(m, uio);
				len = cow_send;
			}
			if (!cow_send) {
				m_clget(m, M_WAITOK);
				len = min(min(MCLBYTES, resid), *space);
			}
#else /* ZERO_COPY_SOCKETS */
			if (top == NULL) {
				m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_getcl(M_WAIT, MT_DATA, 0);
			len = min(min(MCLBYTES, resid), *space);
#endif /* ZERO_COPY_SOCKETS */
		} else {
			if (top == NULL) {
				m = m_gethdr(M_WAIT, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;

				len = min(min(MHLEN, resid), *space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && m && len < MHLEN)
					MH_ALIGN(m, len);
			} else {
				m = m_get(M_WAIT, MT_DATA);
				len = min(min(MLEN, resid), *space);
			}
		}
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}

		*space -= len;
#ifdef ZERO_COPY_SOCKETS
		if (cow_send)
			error = 0;
		else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *), (int)len, uio);
		resid = uio->uio_resid;
		m->m_len = len;
		*mp = m;
		top->m_pkthdr.len += len;
		if (error)
			goto out;
		mp = &m->m_next;
		if (resid <= 0) {
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
			break;
		}
	} while (*space > 0 && atomic);
out:
	*retmp = top;
	return (error);
}
#endif /*ZERO_COPY_SOCKETS*/

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
#ifdef ZERO_COPY_SOCKETS
	int atomic = sosendallatonce(so) || top;
#endif

	KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sodgram_send: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
#ifdef ZERO_COPY_SOCKETS
		error = sosend_copyin(uio, &top, atomic, &space, flags);
		if (error)
			goto out;
#else
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
#endif
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	    /*
	     * If the user set MSG_EOF, the protocol understands this flag and
	     * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
	     */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
	    /* If there is more to send set PRUS_MORETOCOME */
	    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				if (so->so_upcallprep.soup_send) {
					so->so_upcallprep.soup_send(so,
					    so->so_upcallprep.soup_send_arg,
					    resid);
				}
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
#ifdef ZERO_COPY_SOCKETS
				error = sosend_copyin(uio, &top, atomic,
				    &space, flags);
				if (error != 0)
					goto release;
#else
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				top = m_uiotombuf(uio, M_WAITOK, space,
				    (atomic ? max_hdr : 0),
				    (atomic ? M_PKTHDR : 0) |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT; /* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
#endif
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			VNET_SO_ASSERT(so);
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * If the user set MSG_EOF, the protocol understands
			     * this flag and nothing left to send then use
			     * PRU_SEND_EOF instead of PRU_SEND.
			     */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
	    control, flags, td);
	CURVNET_RESTORE();
	return (error);
}
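
/*
 * Illustrative sketch (hypothetical caller, not code from this file): a
 * kernel consumer may hand sosend() either a uio describing the data or a
 * pre-built packet-header mbuf chain, e.g.:
 *
 *	// 'm' is an M_PKTHDR mbuf chain with m_pkthdr.len set
 *	error = sosend(so, NULL, NULL, m, NULL, 0, td);
 *
 * As noted above sosend_generic(), the data and control mbufs are freed on
 * return regardless of the outcome, so the caller must not free 'm' again.
 */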

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
	VNET_SO_ASSERT(so);

	m = m_get(M_WAIT, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT)
			    && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len),
			    uio, disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}


/*
 * Implement receive operations on a socket.  We depend on the way that
 * records are added to the sockbuf by sbappend.  In particular, each record
 * (mbufs linked through m_next) must begin with an address if the protocol
 * so specifies, followed by an optional mbuf or mbufs containing ancillary
 * data, and then zero or more mbufs of data.  In order to allow parallelism
 * between network receive and copying to user space, as well as avoid
 * sleeping with a mutex held, we release the socket buffer mutex during the
 * user space copy.  Although the sockbuf is locked, new data may still be
 * appended, and thus we must maintain consistency of the sockbuf during that
 * time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
 * the count in uio_resid.
 */
int
soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, error, offset;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0, last_m_flags, hole_break = 0;
	ssize_t orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL) {
		hole_break = *flagsp & MSG_HOLE_BREAK;
		*flagsp &= ~MSG_HOLE_BREAK;
		flags = *flagsp &~ MSG_EOR;
	} else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
	    && uio->uio_resid) {
		VNET_SO_ASSERT(so);
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);

restart:
	SOCKBUF_LOCK(&so->so_rcv);
	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning a
	 * short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m == NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto release;
			} else
				goto dontblock;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			if (so->so_upcallprep.soup_receive != NULL) {
				so->so_upcallprep.soup_receive(so,
				    so->so_upcallprep.soup_receive_arg,
				    orig_resid - uio->uio_resid, uio->uio_resid);
			}
			SOCKBUF_UNLOCK(&so->so_rcv);
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		SOCKBUF_UNLOCK(&so->so_rcv);
		if (error)
			goto release;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->u…