
/sys/kern/uipc_socket.c

https://github.com/jwg286/freebsd_usb
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *      The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)uipc_socket.c   8.3 (Berkeley) 4/15/94
 */
/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn(). Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn(). Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation. This is called
 * from socreate() and sonewconn(). Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called. If pru_attach() returned an error,
 * pru_detach() will not be called. Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection. Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state. This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state. This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected). This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required. Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation. This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used to attempt to
 * free a socket when a reference is removed. This is a socket layer private
 * interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref(), and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment. For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 */
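
/*
 * Illustrative sketch (an assumption, not part of the original file): the
 * life cycle above, as driven by an in-kernel consumer of the public
 * interfaces. The credential and sockaddr arguments are placeholders and
 * error handling is abbreviated.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0, cred, td);
 *	if (error)
 *		return (error);
 *	error = soconnect(so, nam, td);
 *	...
 *	soclose(so);	(releases the single reference from socreate())
 */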
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.348 2010/05/27 15:27:31 rwatson Exp $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_zero.h"
#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <net/vnet.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/sysent.h>
#include <compat/freebsd32/freebsd32.h>
#endif
static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_solisten,
};
static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};
static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};
uma_zone_t socket_zone;
so_gen_t so_gencnt;	/* generation count for sockets */
int maxsockets;

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

static int somaxconn = SOMAXCONN;
static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
/* XXX: we don't have SYSCTL_USHORT */
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
    "queue size");

static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");

#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */
/*
 * accept_mtx locks down per-socket fields relating to accept queues. See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * Sysctl to get and set the maximum global sockets limit. Notify protocols
 * of the change so that they can update their dependent limits as required.
 */
static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
	if (error == 0 && req->newptr) {
		if (newmaxsockets > maxsockets) {
			maxsockets = newmaxsockets;
			if (maxsockets > ((maxfiles / 4) * 3)) {
				maxfiles = (maxsockets * 5) / 4;
				maxfilesperproc = (maxfiles * 9) / 10;
			}
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");
/*
 * Initialise maxsockets. This SYSINIT must be run after
 * tunable_mbinit().
 */
static void
init_maxsockets(void *ignored)
{

	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
/*
 * Socket operation routines. These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it. Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
static struct socket *
soalloc(struct vnet *vnet)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_socket_init(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
	TAILQ_INIT(&so->so_aiojobq);
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
#ifdef VIMAGE
	vnet->vnet_sockcnt++;
	so->so_vnet = vnet;
#endif
	mtx_unlock(&so_global_mtx);
	return (so);
}
/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc. All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
#ifdef VIMAGE
	so->so_vnet->vnet_sockcnt--;
#endif
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_socket_destroy(so);
#endif
	crfree(so->so_cred);
	sx_destroy(&so->so_snd.sb_sx);
	sx_destroy(&so->so_rcv.sb_sx);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	uma_zfree(socket_zone, so);
}
/*
 * socreate returns a socket with a ref count of 1. The socket should be
 * closed with soclose().
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	so->so_count = 1;
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	CURVNET_SET(so->so_vnet);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	CURVNET_RESTORE();
	if (error) {
		KASSERT(so->so_count == 1, ("socreate: so_count %d",
		    so->so_count));
		so->so_count = 0;
		sodealloc(so);
		return (error);
	}
	*aso = so;
	return (0);
}
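
/*
 * Illustrative sketch (an assumption, not part of the original file): an
 * in-kernel consumer creating a UDP socket in the style used by kernel
 * clients such as NFS; td is the calling thread, and the protocol triple
 * mirrors a userland socket(2) call.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
 *	    td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	(... sobind()/soconnect()/sosend() as required, then soclose() ...)
 */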
#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif
/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called. If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int over;

	ACCEPT_LOCK();
	over = (head->so_qlen > 3 * head->so_qlimit / 2);
	ACCEPT_UNLOCK();
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over)
#else
	if (over)
#endif
		return (NULL);
	VNET_ASSERT(head->so_vnet);
	so = soalloc(head->so_vnet);
	if (so == NULL)
		return (NULL);
	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
	so->so_state |= connstatus;
	ACCEPT_LOCK();
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_qstate |= SQ_COMP;
		head->so_qlen++;
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail. In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->so_incqlen > head->so_qlimit) {
			struct socket *sp;

			sp = TAILQ_FIRST(&head->so_incomp);
			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
			head->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_qstate |= SQ_INCOMP;
		head->so_incqlen++;
	}
	ACCEPT_UNLOCK();
	if (connstatus) {
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	}
	return (so);
}
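
/*
 * Illustrative note (an assumption, not part of the original file): a
 * protocol typically calls sonewconn() against the listen socket once a
 * connection is fully established; e.g. the TCP syncache does roughly:
 *
 *	so = sonewconn(lso, SS_ISCONNECTED);
 *	if (so == NULL)
 *		goto drop;	(queue overflow or allocation failure)
 *
 * A NULL return must be treated as "refuse the connection".
 */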
int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket. The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state. Callbacks are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(struct socket *so, int backlog, struct thread *td)
{

	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
}

int
solisten_proto_check(struct socket *so)
{

	SOCK_LOCK_ASSERT(so);

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	return (0);
}

void
solisten_proto(struct socket *so, int backlog)
{

	SOCK_LOCK_ASSERT(so);

	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	so->so_options |= SO_ACCEPTCONN;
}
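
/*
 * Illustrative note (an assumption, not part of the original file): the
 * clamp in solisten_proto() is what bounds a userland listen(2) backlog;
 * listen(s, 4096) ends up with so_qlimit = somaxconn whenever 4096 exceeds
 * kern.ipc.somaxconn, and a negative backlog is clamped the same way.
 */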
/*
 * Attempt to free a socket. This should really be sotryfree().
 *
 * sofree() will succeed if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, where a process may
 *   already have been notified that it is present. If it were removed, the
 *   user process could block in accept() despite select() saying the socket
 *   was ready.
 *
 * Otherwise, it will quietly abort so that a future call to sofree(), when
 * conditions are right, can succeed.
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	if (so->so_options & SO_ACCEPTCONN) {
		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
	}
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack. Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups. The direct call to
	 * dom_dispose() and sbrelease_internal() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach. This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 */
	sbdestroy(&so->so_snd, so);
	sbdestroy(&so->so_rcv, so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}
/*
 * Close a socket on last file table reference removal. Initiate disconnect
 * if connected. Free socket when disconnect complete.
 *
 * This function will sorele() the socket. Note that soclose() may be called
 * prior to the ref count reaching zero. The actual socket structure will
 * not be freed until the ref count reaches zero.
 */
int
soclose(struct socket *so)
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;

		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	CURVNET_RESTORE();
	return (error);
}
/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on, or races with other threads are risked.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held. Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 */
void
soabort(struct socket *so)
{

	/*
	 * In as much as is possible, assert that no references to this
	 * socket are held. This is not quite the same as asserting that the
	 * current thread is responsible for arranging for no references, but
	 * is as close as we can get for now.
	 */
	KASSERT(so->so_count == 0, ("soabort: so_count"));
	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));

	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
		(*so->so_proto->pr_usrreqs->pru_abort)(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sofree(so);
}
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);

	CURVNET_SET(so->so_vnet);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first. This allows
	 * user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection from
		 * biting us.
		 */
		so->so_error = 0;
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	}
	CURVNET_RESTORE();

	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}
#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats {
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0, 0, 0};

#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

/*
 * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
 * sosend_dgram() and sosend_generic() use m_uiotombuf().
 *
 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 * all of the data referenced by the uio. If desired, it uses zero-copy.
 * *space will be updated to reflect data copied in.
 *
 * NB: If atomic I/O is requested, the caller must already have checked that
 * space can hold resid bytes.
 *
 * NB: In the event of an error, the caller may need to free the partial
 * chain pointed to by *retmp. The contents of both *uio and *space may be
 * modified even in the case of an error.
 */
static int
sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
    int flags)
{
	struct mbuf *m, **mp, *top;
	long len, resid;
	int error;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif

	*retmp = top = NULL;
	mp = &top;
	len = 0;
	resid = uio->uio_resid;
	error = 0;
	do {
#ifdef ZERO_COPY_SOCKETS
		cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
		if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
			if (top == NULL) {
				m = m_gethdr(M_WAITOK, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_get(M_WAITOK, MT_DATA);
			if (so_zero_copy_send &&
			    resid >= PAGE_SIZE &&
			    *space >= PAGE_SIZE &&
			    uio->uio_iov->iov_len >= PAGE_SIZE) {
				so_zerocp_stats.size_ok++;
				so_zerocp_stats.align_ok++;
				cow_send = socow_setup(m, uio);
				len = cow_send;
			}
			if (!cow_send) {
				m_clget(m, M_WAITOK);
				len = min(min(MCLBYTES, resid), *space);
			}
#else /* ZERO_COPY_SOCKETS */
			if (top == NULL) {
				m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_getcl(M_WAIT, MT_DATA, 0);
			len = min(min(MCLBYTES, resid), *space);
#endif /* ZERO_COPY_SOCKETS */
		} else {
			if (top == NULL) {
				m = m_gethdr(M_WAIT, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;

				len = min(min(MHLEN, resid), *space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && m && len < MHLEN)
					MH_ALIGN(m, len);
			} else {
				m = m_get(M_WAIT, MT_DATA);
				len = min(min(MLEN, resid), *space);
			}
		}
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}

		*space -= len;
#ifdef ZERO_COPY_SOCKETS
		if (cow_send)
			error = 0;
		else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *), (int)len, uio);
		resid = uio->uio_resid;
		m->m_len = len;
		*mp = m;
		top->m_pkthdr.len += len;
		if (error)
			goto out;
		mp = &m->m_next;
		if (resid <= 0) {
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
			break;
		}
	} while (*space > 0 && atomic);
out:
	*retmp = top;
	return (error);
}
#endif /* ZERO_COPY_SOCKETS */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space, resid;
	int clen = 0, error, dontroute;
#ifdef ZERO_COPY_SOCKETS
	int atomic = sosendallatonce(so) || top;
#endif

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned. However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid. On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
		 * socket if it supports implied connect. Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
#ifdef ZERO_COPY_SOCKETS
		error = sosend_copyin(uio, &top, atomic, &space, flags);
		if (error)
			goto out;
#else
		/*
		 * Copy the data from userland into an mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
#endif
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date. We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc. We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens. We must
	 * rethink this.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	    /*
	     * If the user set MSG_EOF, the protocol understands this flag,
	     * and nothing is left to send, then use PRU_SEND_EOF instead
	     * of PRU_SEND.
	     */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
	    PRUS_EOF :
	    /* If there is more to send set PRUS_MORETOCOME */
	    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
/*
 * Send on a socket. If send must go all at once and message is larger than
 * send buffering, then hard error. Lock against other senders. If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing. Otherwise, if nonblocking, send as much as
 * possible. The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not). Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned. Data and control buffers are freed
 * on return.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned. However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid. On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
#ifdef ZERO_COPY_SOCKETS
				error = sosend_copyin(uio, &top, atomic,
				    &space, flags);
				if (error != 0)
					goto release;
#else
				/*
				 * Copy the data from userland into an mbuf
				 * chain. If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				top = m_uiotombuf(uio, M_WAITOK, space,
				    (atomic ? max_hdr : 0),
				    (atomic ? M_PKTHDR : 0) |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT;	/* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
#endif
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date. We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc. We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens. We must rethink
			 * this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * If the user set MSG_EOF, the protocol
			     * understands this flag, and nothing is left
			     * to send, then use PRU_SEND_EOF instead of
			     * PRU_SEND.
			     */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
			    PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
	    control, flags, td);
	CURVNET_RESTORE();
	return (error);
}
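
/*
 * Illustrative sketch (an assumption, not part of the original file): an
 * in-kernel consumer sending a kernel-space buffer through sosend() by
 * describing it with a uio, in the style of kernel RPC clients. buf, len,
 * so and td are placeholders.
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = len;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = len;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 */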
/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket. For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAIT, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT)
			    && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len),
			    uio, disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *),
			    (int)min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently. 'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord. If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket. We depend on the way that
 * records are added to the sockbuf by sbappend. In particular, each record
 * (mbufs linked through m_next) must begin with an address if the protocol
 * so specifies, followed by an optional mbuf or mbufs containing ancillary
 * data, and then zero or more mbufs of data. In order to allow parallelism
 * between network receive and copying to user space, as well as avoid
 * sleeping with a mutex held, we release the socket buffer mutex during the
 * user space copy. Although the sockbuf is locked, new data may still be
 * appended, and thus we must maintain consistency of the sockbuf during that
 * time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain. The uio is then used only for
 * the count in uio_resid.
 */
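
/*
 * Illustrative diagram (an assumption, not part of the original file) of
 * the record layout that soreceive_generic() depends on; records chain
 * through m_nextpkt, and the mbufs within a record through m_next:
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL] -> [MT_DATA] -> [MT_DATA]
 *	              |                          (m_next links)
 *	          m_nextpkt
 *	              v
 *	         [MT_SONAME] -> [MT_DATA] -> ...  (next record)
 *
 * The MT_SONAME and MT_CONTROL mbufs are optional, per the protocol's
 * PR_ADDR flag and whether ancillary data was queued.
 */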
  1317. int
  1318. soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
  1319. struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
  1320. {
  1321. struct mbuf *m, **mp;
  1322. int flags, len, error, offset;
  1323. struct protosw *pr = so->so_proto;
  1324. struct mbuf *nextrecord;
  1325. int moff, type = 0;
  1326. int orig_resid = uio->uio_resid;
  1327. mp = mp0;
  1328. if (psa != NULL)
  1329. *psa = NULL;
  1330. if (controlp != NULL)
  1331. *controlp = NULL;
  1332. if (flagsp != NULL)
  1333. flags = *flagsp &~ MSG_EOR;
  1334. else
  1335. flags = 0;
  1336. if (flags & MSG_OOB)
  1337. return (soreceive_rcvoob(so, uio, flags));
  1338. if (mp != NULL)
  1339. *mp = NULL;
  1340. if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
  1341. && uio->uio_resid)
  1342. (*pr->pr_usrreqs->pru_rcvd)(so, 0);
  1343. error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
  1344. if (error)
  1345. return (error);
  1346. restart:
  1347. SOCKBUF_LOCK(&so->so_rcv);
  1348. m = so->so_rcv.sb_mb;
  1349. /*
  1350. * If we have less data than requested, block awaiting more (subject
  1351. * to any timeout) if:
  1352. * 1. the current count is less than the low water mark, or
  1353. * 2. MSG_WAITALL is set, and it is possible to do the entire
  1354. * receive operation at once if we block (resid <= hiwat).
  1355. * 3. MSG_DONTWAIT is not set
  1356. * If MSG_WAITALL is set but resid is larger than the receive buffer,
  1357. * we have to do the receive in sections, and thus risk returning a
  1358. * short count if a timeout or signal occurs after we start.
  1359. */
  1360. if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
  1361. so->so_rcv.sb_cc < uio->uio_resid) &&
  1362. (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
  1363. ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
  1364. m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
  1365. KASSERT(m != NULL || !so->so_rcv.sb_cc,
  1366. ("receive: m == %p so->so_rcv.sb_cc == %u",
  1367. m, so->so_rcv.sb_cc));
  1368. if (so->so_error) {
  1369. if (m != NULL)
  1370. goto dontblock;
  1371. error = so->so_error;
  1372. if ((flags & MSG_PEEK) == 0)
  1373. so->so_error = 0;
  1374. SOCKBUF_UNLOCK(&so->so_rcv);
  1375. goto release;
  1376. }
  1377. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1378. if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
  1379. if (m == NULL) {
  1380. SOCKBUF_UNLOCK(&so->so_rcv);
  1381. goto release;
  1382. } else
  1383. goto dontblock;
  1384. }
  1385. for (; m != NULL; m = m->m_next)
  1386. if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
  1387. m = so->so_rcv.sb_mb;
  1388. goto dontblock;
  1389. }
  1390. if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
  1391. (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
  1392. SOCKBUF_UNLOCK(&so->so_rcv);
  1393. error = ENOTCONN;
  1394. goto release;
  1395. }
  1396. if (uio->uio_resid == 0) {
  1397. SOCKBUF_UNLOCK(&so->so_rcv);
  1398. goto release;
  1399. }
  1400. if ((so->so_state & SS_NBIO) ||
  1401. (flags & (MSG_DONTWAIT|MSG_NBIO))) {
  1402. SOCKBUF_UNLOCK(&so->so_rcv);
  1403. error = EWOULDBLOCK;
  1404. goto release;
  1405. }
  1406. SBLASTRECORDCHK(&so->so_rcv);
  1407. SBLASTMBUFCHK(&so->so_rcv);
  1408. error = sbwait(&so->so_rcv);
  1409. SOCKBUF_UNLOCK(&so->so_rcv);
  1410. if (error)
  1411. goto release;
  1412. goto restart;
  1413. }
  1414. dontblock:
  1415. /*
  1416. * From this point onward, we maintain 'nextrecord' as a cache of the
  1417. * pointer to the next record in the socket buffer. We must keep the
  1418. * various socket buffer pointers and local stack versions of the
  1419. * pointers in sync, pushing out modifications before dropping the
  1420. * socket buffer mutex, and re-reading them when picking it up.
  1421. *
  1422. * Otherwise, we will race with the network stack appending new data
  1423. * or records onto the socket buffer by using inconsistent/stale
  1424. * versions of the field, possibly resulting in socket buffer
  1425. * corruption.
  1426. *
  1427. * By holding the high-level sblock(), we prevent simultaneous
  1428. * readers from pulling off the front of the socket buffer.
  1429. */
  1430. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1431. if (uio->uio_td)
  1432. uio->uio_td->td_ru.ru_msgrcv++;
  1433. KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
  1434. SBLASTRECORDCHK(&so->so_rcv);
  1435. SBLASTMBUFCHK(&so->so_rcv);
  1436. nextrecord = m->m_nextpkt;
  1437. if (pr->pr_flags & PR_ADDR) {
  1438. KASSERT(m->m_type == MT_SONAME,
  1439. ("m->m_type == %d", m->m_type));
  1440. orig_resid = 0;
  1441. if (psa != NULL)
  1442. *psa = sodupsockaddr(mtod(m, struct sockaddr *),
  1443. M_NOWAIT);
  1444. if (flags & MSG_PEEK) {
  1445. m = m->m_next;
  1446. } else {
  1447. sbfree(&so->so_rcv, m);
  1448. so->so_rcv.sb_mb = m_free(m);
  1449. m = so->so_rcv.sb_mb;
  1450. sockbuf_pushsync(&so->so_rcv, nextrecord);
  1451. }
  1452. }
  1453. /*
  1454. * Process one or more MT_CONTROL mbufs present before any data mbufs
  1455. * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
  1456. * just copy the data; if !MSG_PEEK, we call into the protocol to
  1457. * perform externalization (or freeing if controlp == NULL).
  1458. */
  1459. if (m != NULL && m->m_type == MT_CONTROL) {
  1460. struct mbuf *cm = NULL, *cmn;
  1461. struct mbuf **cme = &cm;
  1462. do {
  1463. if (flags & MSG_PEEK) {
  1464. if (controlp != NULL) {
  1465. *controlp = m_copy(m, 0, m->m_len);
  1466. controlp = &(*controlp)->m_next;
  1467. }
  1468. m = m->m_next;
  1469. } else {
  1470. sbfree(&so->so_rcv, m);
  1471. so->so_rcv.sb_mb = m->m_next;
  1472. m->m_next = NULL;
  1473. *cme = m;
  1474. cme = &(*cme)->m_next;
  1475. m = so->so_rcv.sb_mb;
  1476. }
  1477. } while (m != NULL && m->m_type == MT_CONTROL);
  1478. if ((flags & MSG_PEEK) == 0)
  1479. sockbuf_pushsync(&so->so_rcv, nextrecord);
  1480. while (cm != NULL) {
  1481. cmn = cm->m_next;
  1482. cm->m_next = NULL;
  1483. if (pr->pr_domain->dom_externalize != NULL) {
  1484. SOCKBUF_UNLOCK(&so->so_rcv);
  1485. error = (*pr->pr_domain->dom_externalize)
  1486. (cm, controlp);
  1487. SOCKBUF_LOCK(&so->so_rcv);
  1488. } else if (controlp != NULL)
  1489. *controlp = cm;
  1490. else
  1491. m_freem(cm);
  1492. if (controlp != NULL) {
  1493. orig_resid = 0;
  1494. while (*controlp != NULL)
  1495. controlp = &(*controlp)->m_next;
  1496. }
  1497. cm = cmn;
  1498. }
  1499. if (m != NULL)
  1500. nextrecord = so->so_rcv.sb_mb->m_nextpkt;
  1501. else
  1502. nextrecord = so->so_rcv.sb_mb;
  1503. orig_resid = 0;
  1504. }
  1505. if (m != NULL) {
  1506. if ((flags & MSG_PEEK) == 0) {
  1507. KASSERT(m->m_nextpkt == nextrecord,
  1508. ("soreceive: post-control, nextrecord !sync"));
  1509. if (nextrecord == NULL) {
  1510. KASSERT(so->so_rcv.sb_mb == m,
  1511. ("soreceive: post-control, sb_mb!=m"));
  1512. KASSERT(so->so_rcv.sb_lastrecord == m,
  1513. ("soreceive: post-control, lastrecord!=m"));
  1514. }
  1515. }
  1516. type = m->m_type;
  1517. if (type == MT_OOBDATA)
  1518. flags |= MSG_OOB;
  1519. } else {
  1520. if ((flags & MSG_PEEK) == 0) {
  1521. KASSERT(so->so_rcv.sb_mb == nextrecord,
  1522. ("soreceive: sb_mb != nextrecord"));
  1523. if (so->so_rcv.sb_mb == NULL) {
  1524. KASSERT(so->so_rcv.sb_lastrecord == NULL,
  1525. ("soreceive: sb_lastercord != NULL"));
  1526. }
  1527. }
  1528. }
  1529. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1530. SBLASTRECORDCHK(&so->so_rcv);
  1531. SBLASTMBUFCHK(&so->so_rcv);
  1532. /*
  1533. * Now continue to read any data mbufs off of the head of the socket
  1534. * buffer until the read request is satisfied. Note that 'type' is
  1535. * used to store the type of any mbuf reads that have happened so far
  1536. * such that soreceive() can stop reading if the type changes, which
1537. * causes soreceive() to return either regular data or inline
1538. * out-of-band data in a single receive operation, but not both.
  1539. */
  1540. moff = 0;
  1541. offset = 0;
  1542. while (m != NULL && uio->uio_resid > 0 && error == 0) {
  1543. /*
  1544. * If the type of mbuf has changed since the last mbuf
  1545. * examined ('type'), end the receive operation.
  1546. */
  1547. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1548. if (m->m_type == MT_OOBDATA) {
  1549. if (type != MT_OOBDATA)
  1550. break;
  1551. } else if (type == MT_OOBDATA)
  1552. break;
  1553. else
  1554. KASSERT(m->m_type == MT_DATA,
  1555. ("m->m_type == %d", m->m_type));
  1556. so->so_rcv.sb_state &= ~SBS_RCVATMARK;
  1557. len = uio->uio_resid;
  1558. if (so->so_oobmark && len > so->so_oobmark - offset)
  1559. len = so->so_oobmark - offset;
  1560. if (len > m->m_len - moff)
  1561. len = m->m_len - moff;
  1562. /*
  1563. * If mp is set, just pass back the mbufs. Otherwise copy
1564. * them out via the uio, then free. The sockbuf must be
1565. * consistent here (pointing to the current mbuf and to the
1566. * next record) when we drop the lock; we must note any
1567. * additions to the sockbuf made while it was unlocked.
  1568. */
  1569. if (mp == NULL) {
  1570. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1571. SBLASTRECORDCHK(&so->so_rcv);
  1572. SBLASTMBUFCHK(&so->so_rcv);
  1573. SOCKBUF_UNLOCK(&so->so_rcv);
  1574. #ifdef ZERO_COPY_SOCKETS
  1575. if (so_zero_copy_receive) {
  1576. int disposable;
  1577. if ((m->m_flags & M_EXT)
  1578. && (m->m_ext.ext_type == EXT_DISPOSABLE))
  1579. disposable = 1;
  1580. else
  1581. disposable = 0;
  1582. error = uiomoveco(mtod(m, char *) + moff,
  1583. (int)len, uio,
  1584. disposable);
  1585. } else
  1586. #endif /* ZERO_COPY_SOCKETS */
  1587. error = uiomove(mtod(m, char *) + moff, (int)len, uio);
  1588. SOCKBUF_LOCK(&so->so_rcv);
  1589. if (error) {
  1590. /*
  1591. * The MT_SONAME mbuf has already been removed
  1592. * from the record, so it is necessary to
  1593. * remove the data mbufs, if any, to preserve
  1594. * the invariant in the case of PR_ADDR that
  1595. * requires MT_SONAME mbufs at the head of
  1596. * each record.
  1597. */
  1598. if (m && pr->pr_flags & PR_ATOMIC &&
  1599. ((flags & MSG_PEEK) == 0))
  1600. (void)sbdroprecord_locked(&so->so_rcv);
  1601. SOCKBUF_UNLOCK(&so->so_rcv);
  1602. goto release;
  1603. }
  1604. } else
  1605. uio->uio_resid -= len;
  1606. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1607. if (len == m->m_len - moff) {
  1608. if (m->m_flags & M_EOR)
  1609. flags |= MSG_EOR;
  1610. if (flags & MSG_PEEK) {
  1611. m = m->m_next;
  1612. moff = 0;
  1613. } else {
  1614. nextrecord = m->m_nextpkt;
  1615. sbfree(&so->so_rcv, m);
  1616. if (mp != NULL) {
  1617. *mp = m;
  1618. mp = &m->m_next;
  1619. so->so_rcv.sb_mb = m = m->m_next;
  1620. *mp = NULL;
  1621. } else {
  1622. so->so_rcv.sb_mb = m_free(m);
  1623. m = so->so_rcv.sb_mb;
  1624. }
  1625. sockbuf_pushsync(&so->so_rcv, nextrecord);
  1626. SBLASTRECORDCHK(&so->so_rcv);
  1627. SBLASTMBUFCHK(&so->so_rcv);
  1628. }
  1629. } else {
  1630. if (flags & MSG_PEEK)
  1631. moff += len;
  1632. else {
  1633. if (mp != NULL) {
  1634. int copy_flag;
  1635. if (flags & MSG_DONTWAIT)
  1636. copy_flag = M_DONTWAIT;
  1637. else
  1638. copy_flag = M_WAIT;
  1639. if (copy_flag == M_WAIT)
  1640. SOCKBUF_UNLOCK(&so->so_rcv);
  1641. *mp = m_copym(m, 0, len, copy_flag);
  1642. if (copy_flag == M_WAIT)
  1643. SOCKBUF_LOCK(&so->so_rcv);
  1644. if (*mp == NULL) {
  1645. /*
  1646. * m_copym() couldn't
  1647. * allocate an mbuf. Adjust
  1648. * uio_resid back (it was
  1649. * adjusted down by len
  1650. * bytes, which we didn't end
  1651. * up "copying" over).
  1652. */
  1653. uio->uio_resid += len;
  1654. break;
  1655. }
  1656. }
  1657. m->m_data += len;
  1658. m->m_len -= len;
  1659. so->so_rcv.sb_cc -= len;
  1660. }
  1661. }
  1662. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1663. if (so->so_oobmark) {
  1664. if ((flags & MSG_PEEK) == 0) {
  1665. so->so_oobmark -= len;
  1666. if (so->so_oobmark == 0) {
  1667. so->so_rcv.sb_state |= SBS_RCVATMARK;
  1668. break;
  1669. }
  1670. } else {
  1671. offset += len;
  1672. if (offset == so->so_oobmark)
  1673. break;
  1674. }
  1675. }
  1676. if (flags & MSG_EOR)
  1677. break;
  1678. /*
1679. * If the MSG_WAITALL flag is set (for a non-atomic socket), we
  1680. * must not quit until "uio->uio_resid == 0" or an error
  1681. * termination. If a signal/timeout occurs, return with a
  1682. * short count but without error. Keep sockbuf locked
  1683. * against other readers.
  1684. */
  1685. while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
  1686. !sosendallatonce(so) && nextrecord == NULL) {
  1687. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1688. if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
  1689. break;
  1690. /*
  1691. * Notify the protocol that some data has been
  1692. * drained before blocking.
  1693. */
  1694. if (pr->pr_flags & PR_WANTRCVD) {
  1695. SOCKBUF_UNLOCK(&so->so_rcv);
  1696. (*pr->pr_usrreqs->pru_rcvd)(so, flags);
  1697. SOCKBUF_LOCK(&so->so_rcv);
  1698. }
  1699. SBLASTRECORDCHK(&so->so_rcv);
  1700. SBLASTMBUFCHK(&so->so_rcv);
  1701. error = sbwait(&so->so_rcv);
  1702. if (error) {
  1703. SOCKBUF_UNLOCK(&so->so_rcv);
  1704. goto release;
  1705. }
  1706. m = so->so_rcv.sb_mb;
  1707. if (m != NULL)
  1708. nextrecord = m->m_nextpkt;
  1709. }
  1710. }
  1711. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1712. if (m != NULL && pr->pr_flags & PR_ATOMIC) {
  1713. flags |= MSG_TRUNC;
  1714. if ((flags & MSG_PEEK) == 0)
  1715. (void) sbdroprecord_locked(&so->so_rcv);
  1716. }
  1717. if ((flags & MSG_PEEK) == 0) {
  1718. if (m == NULL) {
  1719. /*
  1720. * First part is an inline SB_EMPTY_FIXUP(). Second
  1721. * part makes sure sb_lastrecord is up-to-date if
  1722. * there is still data in the socket buffer.
  1723. */
  1724. so->so_rcv.sb_mb = nextrecord;
  1725. if (so->so_rcv.sb_mb == NULL) {
  1726. so->so_rcv.sb_mbtail = NULL;
  1727. so->so_rcv.sb_lastrecord = NULL;
  1728. } else if (nextrecord->m_nextpkt == NULL)
  1729. so->so_rcv.sb_lastrecord = nextrecord;
  1730. }
  1731. SBLASTRECORDCHK(&so->so_rcv);
  1732. SBLASTMBUFCHK(&so->so_rcv);
  1733. /*
  1734. * If soreceive() is being done from the socket callback,
1735. * then we don't need to generate an ACK to the peer to update
1736. * the window, since the ACK will be generated on return to TCP.
  1737. */
  1738. if (!(flags & MSG_SOCALLBCK) &&
  1739. (pr->pr_flags & PR_WANTRCVD)) {
  1740. SOCKBUF_UNLOCK(&so->so_rcv);
  1741. (*pr->pr_usrreqs->pru_rcvd)(so, flags);
  1742. SOCKBUF_LOCK(&so->so_rcv);
  1743. }
  1744. }
  1745. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1746. if (orig_resid == uio->uio_resid && orig_resid &&
  1747. (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
  1748. SOCKBUF_UNLOCK(&so->so_rcv);
  1749. goto restart;
  1750. }
  1751. SOCKBUF_UNLOCK(&so->so_rcv);
  1752. if (flagsp != NULL)
  1753. *flagsp |= flags;
  1754. release:
  1755. sbunlock(&so->so_rcv);
  1756. return (error);
  1757. }
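/*
 * A minimal userland sketch (not compiled in) of two behaviours
 * implemented by soreceive_generic() above: MSG_PEEK leaves the data
 * queued for a later read, and MSG_WAITALL keeps a stream read blocked
 * until the full request, an error, or EOF arrives.  "s" is assumed to
 * be a connected stream socket.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t
peek_then_read(int s, void *buf, size_t len)
{
	ssize_t n;

	/* Inspect the data without consuming it from the sockbuf. */
	n = recv(s, buf, len, MSG_PEEK);
	if (n <= 0)
		return (n);
	/* Block until the whole request is satisfied (short on EOF). */
	return (recv(s, buf, len, MSG_WAITALL));
}
#endif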
  1758. /*
  1759. * Optimized version of soreceive() for stream (TCP) sockets.
  1760. */
  1761. #ifdef TCP_SORECEIVE_STREAM
  1762. int
  1763. soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
  1764. struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
  1765. {
  1766. int len = 0, error = 0, flags, oresid;
  1767. struct sockbuf *sb;
  1768. struct mbuf *m, *n = NULL;
  1769. /* We only do stream sockets. */
  1770. if (so->so_type != SOCK_STREAM)
  1771. return (EINVAL);
  1772. if (psa != NULL)
  1773. *psa = NULL;
  1774. if (controlp != NULL)
  1775. return (EINVAL);
  1776. if (flagsp != NULL)
  1777. flags = *flagsp &~ MSG_EOR;
  1778. else
  1779. flags = 0;
  1780. if (flags & MSG_OOB)
  1781. return (soreceive_rcvoob(so, uio, flags));
  1782. if (mp0 != NULL)
  1783. *mp0 = NULL;
  1784. sb = &so->so_rcv;
  1785. /* Prevent other readers from entering the socket. */
  1786. error = sblock(sb, SBLOCKWAIT(flags));
  1787. if (error)
  1788. goto out;
  1789. SOCKBUF_LOCK(sb);
  1790. /* Easy one, no space to copyout anything. */
  1791. if (uio->uio_resid == 0) {
  1792. error = EINVAL;
  1793. goto out;
  1794. }
  1795. oresid = uio->uio_resid;
  1796. /* We will never ever get anything unless we are connected. */
  1797. if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1798. /* When disconnecting there may still be some data left. */
  1799. if (sb->sb_cc > 0)
  1800. goto deliver;
  1801. if (!(so->so_state & SS_ISDISCONNECTED))
  1802. error = ENOTCONN;
  1803. goto out;
  1804. }
  1805. /* Socket buffer is empty and we shall not block. */
  1806. if (sb->sb_cc == 0 &&
  1807. ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
  1808. error = EAGAIN;
  1809. goto out;
  1810. }
  1811. restart:
  1812. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1813. /* Abort if socket has reported problems. */
  1814. if (so->so_error) {
  1815. if (sb->sb_cc > 0)
  1816. goto deliver;
  1817. if (oresid > uio->uio_resid)
  1818. goto out;
  1819. error = so->so_error;
  1820. if (!(flags & MSG_PEEK))
  1821. so->so_error = 0;
  1822. goto out;
  1823. }
  1824. /* Door is closed. Deliver what is left, if any. */
  1825. if (sb->sb_state & SBS_CANTRCVMORE) {
  1826. if (sb->sb_cc > 0)
  1827. goto deliver;
  1828. else
  1829. goto out;
  1830. }
  1831. /* Socket buffer got some data that we shall deliver now. */
  1832. if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
  1833. ((sb->sb_flags & SS_NBIO) ||
  1834. (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
  1835. sb->sb_cc >= sb->sb_lowat ||
  1836. sb->sb_cc >= uio->uio_resid ||
1837. sb->sb_cc >= sb->sb_hiwat)) {
  1838. goto deliver;
  1839. }
1840. /* On MSG_WAITALL we must wait until all data or an error arrives. */
  1841. if ((flags & MSG_WAITALL) &&
  1842. (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
  1843. goto deliver;
  1844. /*
  1845. * Wait and block until (more) data comes in.
  1846. * NB: Drops the sockbuf lock during wait.
  1847. */
  1848. error = sbwait(sb);
  1849. if (error)
  1850. goto out;
  1851. goto restart;
  1852. deliver:
  1853. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  1854. KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
  1855. KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
  1856. /* Statistics. */
  1857. if (uio->uio_td)
  1858. uio->uio_td->td_ru.ru_msgrcv++;
  1859. /* Fill uio until full or current end of socket buffer is reached. */
  1860. len = min(uio->uio_resid, sb->sb_cc);
  1861. if (mp0 != NULL) {
  1862. /* Dequeue as many mbufs as possible. */
  1863. if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
  1864. for (*mp0 = m = sb->sb_mb;
  1865. m != NULL && m->m_len <= len;
  1866. m = m->m_next) {
  1867. len -= m->m_len;
  1868. uio->uio_resid -= m->m_len;
  1869. sbfree(sb, m);
  1870. n = m;
  1871. }
  1872. sb->sb_mb = m;
  1873. if (sb->sb_mb == NULL)
  1874. SB_EMPTY_FIXUP(sb);
  1875. n->m_next = NULL;
  1876. }
  1877. /* Copy the remainder. */
  1878. if (len > 0) {
  1879. KASSERT(sb->sb_mb != NULL,
  1880. ("%s: len > 0 && sb->sb_mb empty", __func__));
  1881. m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
  1882. if (m == NULL)
  1883. len = 0; /* Don't flush data from sockbuf. */
  1884. else
  1885. uio->uio_resid -= m->m_len;
  1886. if (*mp0 != NULL)
  1887. n->m_next = m;
  1888. else
  1889. *mp0 = m;
  1890. if (*mp0 == NULL) {
  1891. error = ENOBUFS;
  1892. goto out;
  1893. }
  1894. }
  1895. } else {
  1896. /* NB: Must unlock socket buffer as uiomove may sleep. */
  1897. SOCKBUF_UNLOCK(sb);
  1898. error = m_mbuftouio(uio, sb->sb_mb, len);
  1899. SOCKBUF_LOCK(sb);
  1900. if (error)
  1901. goto out;
  1902. }
  1903. SBLASTRECORDCHK(sb);
  1904. SBLASTMBUFCHK(sb);
  1905. /*
  1906. * Remove the delivered data from the socket buffer unless we
  1907. * were only peeking.
  1908. */
  1909. if (!(flags & MSG_PEEK)) {
  1910. if (len > 0)
  1911. sbdrop_locked(sb, len);
  1912. /* Notify protocol that we drained some data. */
  1913. if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
  1914. (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
  1915. !(flags & MSG_SOCALLBCK))) {
  1916. SOCKBUF_UNLOCK(sb);
  1917. (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
  1918. SOCKBUF_LOCK(sb);
  1919. }
  1920. }
  1921. /*
  1922. * For MSG_WAITALL we may have to loop again and wait for
  1923. * more data to come in.
  1924. */
  1925. if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
  1926. goto restart;
  1927. out:
  1928. SOCKBUF_LOCK_ASSERT(sb);
  1929. SBLASTRECORDCHK(sb);
  1930. SBLASTMBUFCHK(sb);
  1931. SOCKBUF_UNLOCK(sb);
  1932. sbunlock(sb);
  1933. return (error);
  1934. }
  1935. #endif /* TCP_SORECEIVE_STREAM */
  1936. /*
  1937. * Optimized version of soreceive() for simple datagram cases from userspace.
  1938. * Unlike in the stream case, we're able to drop a datagram if copyout()
  1939. * fails, and because we handle datagrams atomically, we don't need to use a
  1940. * sleep lock to prevent I/O interlacing.
  1941. */
  1942. int
  1943. soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
  1944. struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
  1945. {
  1946. struct mbuf *m, *m2;
  1947. int flags, len, error;
  1948. struct protosw *pr = so->so_proto;
  1949. struct mbuf *nextrecord;
  1950. if (psa != NULL)
  1951. *psa = NULL;
  1952. if (controlp != NULL)
  1953. *controlp = NULL;
  1954. if (flagsp != NULL)
  1955. flags = *flagsp &~ MSG_EOR;
  1956. else
  1957. flags = 0;
  1958. /*
  1959. * For any complicated cases, fall back to the full
  1960. * soreceive_generic().
  1961. */
  1962. if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
  1963. return (soreceive_generic(so, psa, uio, mp0, controlp,
  1964. flagsp));
  1965. /*
  1966. * Enforce restrictions on use.
  1967. */
  1968. KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
  1969. ("soreceive_dgram: wantrcvd"));
  1970. KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
  1971. KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
  1972. ("soreceive_dgram: SBS_RCVATMARK"));
  1973. KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
  1974. ("soreceive_dgram: P_CONNREQUIRED"));
  1975. /*
  1976. * Loop blocking while waiting for a datagram.
  1977. */
  1978. SOCKBUF_LOCK(&so->so_rcv);
  1979. while ((m = so->so_rcv.sb_mb) == NULL) {
  1980. KASSERT(so->so_rcv.sb_cc == 0,
  1981. ("soreceive_dgram: sb_mb NULL but sb_cc %u",
  1982. so->so_rcv.sb_cc));
  1983. if (so->so_error) {
  1984. error = so->so_error;
  1985. so->so_error = 0;
  1986. SOCKBUF_UNLOCK(&so->so_rcv);
  1987. return (error);
  1988. }
  1989. if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
  1990. uio->uio_resid == 0) {
  1991. SOCKBUF_UNLOCK(&so->so_rcv);
  1992. return (0);
  1993. }
  1994. if ((so->so_state & SS_NBIO) ||
  1995. (flags & (MSG_DONTWAIT|MSG_NBIO))) {
  1996. SOCKBUF_UNLOCK(&so->so_rcv);
  1997. return (EWOULDBLOCK);
  1998. }
  1999. SBLASTRECORDCHK(&so->so_rcv);
  2000. SBLASTMBUFCHK(&so->so_rcv);
  2001. error = sbwait(&so->so_rcv);
  2002. if (error) {
  2003. SOCKBUF_UNLOCK(&so->so_rcv);
  2004. return (error);
  2005. }
  2006. }
  2007. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  2008. if (uio->uio_td)
  2009. uio->uio_td->td_ru.ru_msgrcv++;
  2010. SBLASTRECORDCHK(&so->so_rcv);
  2011. SBLASTMBUFCHK(&so->so_rcv);
  2012. nextrecord = m->m_nextpkt;
  2013. if (nextrecord == NULL) {
  2014. KASSERT(so->so_rcv.sb_lastrecord == m,
  2015. ("soreceive_dgram: lastrecord != m"));
  2016. }
  2017. KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
  2018. ("soreceive_dgram: m_nextpkt != nextrecord"));
  2019. /*
  2020. * Pull 'm' and its chain off the front of the packet queue.
  2021. */
  2022. so->so_rcv.sb_mb = NULL;
  2023. sockbuf_pushsync(&so->so_rcv, nextrecord);
  2024. /*
  2025. * Walk 'm's chain and free that many bytes from the socket buffer.
  2026. */
  2027. for (m2 = m; m2 != NULL; m2 = m2->m_next)
  2028. sbfree(&so->so_rcv, m2);
  2029. /*
  2030. * Do a few last checks before we let go of the lock.
  2031. */
  2032. SBLASTRECORDCHK(&so->so_rcv);
  2033. SBLASTMBUFCHK(&so->so_rcv);
  2034. SOCKBUF_UNLOCK(&so->so_rcv);
  2035. if (pr->pr_flags & PR_ADDR) {
  2036. KASSERT(m->m_type == MT_SONAME,
  2037. ("m->m_type == %d", m->m_type));
  2038. if (psa != NULL)
  2039. *psa = sodupsockaddr(mtod(m, struct sockaddr *),
  2040. M_NOWAIT);
  2041. m = m_free(m);
  2042. }
  2043. if (m == NULL) {
  2044. /* XXXRW: Can this happen? */
  2045. return (0);
  2046. }
  2047. /*
  2048. * Packet to copyout() is now in 'm' and it is disconnected from the
  2049. * queue.
  2050. *
  2051. * Process one or more MT_CONTROL mbufs present before any data mbufs
  2052. * in the first mbuf chain on the socket buffer. We call into the
  2053. * protocol to perform externalization (or freeing if controlp ==
  2054. * NULL).
  2055. */
  2056. if (m->m_type == MT_CONTROL) {
  2057. struct mbuf *cm = NULL, *cmn;
  2058. struct mbuf **cme = &cm;
  2059. do {
  2060. m2 = m->m_next;
  2061. m->m_next = NULL;
  2062. *cme = m;
  2063. cme = &(*cme)->m_next;
  2064. m = m2;
  2065. } while (m != NULL && m->m_type == MT_CONTROL);
  2066. while (cm != NULL) {
  2067. cmn = cm->m_next;
  2068. cm->m_next = NULL;
  2069. if (pr->pr_domain->dom_externalize != NULL) {
  2070. error = (*pr->pr_domain->dom_externalize)
  2071. (cm, controlp);
  2072. } else if (controlp != NULL)
  2073. *controlp = cm;
  2074. else
  2075. m_freem(cm);
  2076. if (controlp != NULL) {
  2077. while (*controlp != NULL)
  2078. controlp = &(*controlp)->m_next;
  2079. }
  2080. cm = cmn;
  2081. }
  2082. }
  2083. KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
  2084. while (m != NULL && uio->uio_resid > 0) {
  2085. len = uio->uio_resid;
  2086. if (len > m->m_len)
  2087. len = m->m_len;
  2088. error = uiomove(mtod(m, char *), (int)len, uio);
  2089. if (error) {
  2090. m_freem(m);
  2091. return (error);
  2092. }
  2093. m = m_free(m);
  2094. }
  2095. if (m != NULL)
  2096. flags |= MSG_TRUNC;
  2097. m_freem(m);
  2098. if (flagsp != NULL)
  2099. *flagsp |= flags;
  2100. return (0);
  2101. }
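/*
 * Sketch (not compiled in) of the datagram semantics above as seen
 * from userland: when the supplied buffer is smaller than the
 * datagram, the kernel discards the tail and reports the truncation by
 * setting MSG_TRUNC in msg_flags.  "s" is assumed to be a bound
 * datagram socket.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
dgram_was_truncated(int s, void *buf, size_t len)
{
	struct msghdr msg;
	struct iovec iov;

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = buf;
	iov.iov_len = len;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	if (recvmsg(s, &msg, 0) < 0)
		return (-1);
	return ((msg.msg_flags & MSG_TRUNC) != 0);
}
#endif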
  2102. int
  2103. soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
  2104. struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
  2105. {
  2106. return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
  2107. controlp, flagsp));
  2108. }
  2109. int
  2110. soshutdown(struct socket *so, int how)
  2111. {
  2112. struct protosw *pr = so->so_proto;
  2113. int error;
  2114. if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
  2115. return (EINVAL);
  2116. if (pr->pr_usrreqs->pru_flush != NULL) {
  2117. (*pr->pr_usrreqs->pru_flush)(so, how);
  2118. }
  2119. if (how != SHUT_WR)
  2120. sorflush(so);
  2121. if (how != SHUT_RD) {
  2122. CURVNET_SET(so->so_vnet);
  2123. error = (*pr->pr_usrreqs->pru_shutdown)(so);
  2124. CURVNET_RESTORE();
  2125. return (error);
  2126. }
  2127. return (0);
  2128. }
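/*
 * Userland mapping onto soshutdown() above (sketch, not compiled in):
 * SHUT_RD flushes the receive side via sorflush(), SHUT_WR hands the
 * request to the protocol via pru_shutdown(), and SHUT_RDWR does both.
 * "s" is assumed to be a connected socket.
 */
#if 0
#include <sys/socket.h>

static int
half_close_send_side(int s)
{
	/* The peer sees EOF; we can still read any data in flight. */
	return (shutdown(s, SHUT_WR));
}
#endif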
  2129. void
  2130. sorflush(struct socket *so)
  2131. {
  2132. struct sockbuf *sb = &so->so_rcv;
  2133. struct protosw *pr = so->so_proto;
  2134. struct sockbuf asb;
  2135. /*
  2136. * In order to avoid calling dom_dispose with the socket buffer mutex
  2137. * held, and in order to generally avoid holding the lock for a long
  2138. * time, we make a copy of the socket buffer and clear the original
  2139. * (except locks, state). The new socket buffer copy won't have
  2140. * initialized locks so we can only call routines that won't use or
  2141. * assert those locks.
  2142. *
  2143. * Dislodge threads currently blocked in receive and wait to acquire
  2144. * a lock against other simultaneous readers before clearing the
  2145. * socket buffer. Don't let our acquire be interrupted by a signal
2146. * despite any existing socket disposition on interruptible waiting.
  2147. */
  2148. CURVNET_SET(so->so_vnet);
  2149. socantrcvmore(so);
  2150. (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
  2151. /*
  2152. * Invalidate/clear most of the sockbuf structure, but leave selinfo
  2153. * and mutex data unchanged.
  2154. */
  2155. SOCKBUF_LOCK(sb);
  2156. bzero(&asb, offsetof(struct sockbuf, sb_startzero));
  2157. bcopy(&sb->sb_startzero, &asb.sb_startzero,
  2158. sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
  2159. bzero(&sb->sb_startzero,
  2160. sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
  2161. SOCKBUF_UNLOCK(sb);
  2162. sbunlock(sb);
  2163. /*
  2164. * Dispose of special rights and flush the socket buffer. Don't call
  2165. * any unsafe routines (that rely on locks being initialized) on asb.
  2166. */
  2167. if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
  2168. (*pr->pr_domain->dom_dispose)(asb.sb_mb);
  2169. sbrelease_internal(&asb, so);
  2170. CURVNET_RESTORE();
  2171. }
  2172. /*
  2173. * Perhaps this routine, and sooptcopyout(), below, ought to come in an
  2174. * additional variant to handle the case where the option value needs to be
  2175. * some kind of integer, but not a specific size. In addition to their use
  2176. * here, these functions are also called by the protocol-level pr_ctloutput()
  2177. * routines.
  2178. */
  2179. int
  2180. sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
  2181. {
  2182. size_t valsize;
  2183. /*
  2184. * If the user gives us more than we wanted, we ignore it, but if we
  2185. * don't get the minimum length the caller wants, we return EINVAL.
  2186. * On success, sopt->sopt_valsize is set to however much we actually
  2187. * retrieved.
  2188. */
  2189. if ((valsize = sopt->sopt_valsize) < minlen)
  2190. return EINVAL;
  2191. if (valsize > len)
  2192. sopt->sopt_valsize = valsize = len;
  2193. if (sopt->sopt_td != NULL)
  2194. return (copyin(sopt->sopt_val, buf, valsize));
  2195. bcopy(sopt->sopt_val, buf, valsize);
  2196. return (0);
  2197. }
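/*
 * Typical sooptcopyin() consumer (hypothetical handler, sketch only,
 * not compiled in): a protocol pr_ctloutput() SOPT_SET case pulling a
 * fixed-size integer out of the sockopt.  Short values are rejected
 * with EINVAL; any excess the caller supplied is ignored.
 */
#if 0
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	/* ... validate optval and apply it to protocol state ... */
	return (0);
}
#endif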
  2198. /*
  2199. * Kernel version of setsockopt(2).
  2200. *
  2201. * XXX: optlen is size_t, not socklen_t
  2202. */
  2203. int
  2204. so_setsockopt(struct socket *so, int level, int optname, void *optval,
  2205. size_t optlen)
  2206. {
  2207. struct sockopt sopt;
  2208. sopt.sopt_level = level;
  2209. sopt.sopt_name = optname;
  2210. sopt.sopt_dir = SOPT_SET;
  2211. sopt.sopt_val = optval;
  2212. sopt.sopt_valsize = optlen;
  2213. sopt.sopt_td = NULL;
  2214. return (sosetopt(so, &sopt));
  2215. }
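/*
 * In-kernel consumer sketch for so_setsockopt() (not compiled in):
 * enable SO_REUSEADDR on a kernel-created socket without building a
 * struct sockopt by hand.
 */
#if 0
static int
example_enable_reuseaddr(struct socket *so)
{
	int one = 1;

	return (so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &one,
	    sizeof(one)));
}
#endif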
  2216. int
  2217. sosetopt(struct socket *so, struct sockopt *sopt)
  2218. {
  2219. int error, optval;
  2220. struct linger l;
  2221. struct timeval tv;
  2222. u_long val;
  2223. #ifdef MAC
  2224. struct mac extmac;
  2225. #endif
  2226. error = 0;
  2227. if (sopt->sopt_level != SOL_SOCKET) {
  2228. if (so->so_proto && so->so_proto->pr_ctloutput)
  2229. return ((*so->so_proto->pr_ctloutput)
  2230. (so, sopt));
  2231. error = ENOPROTOOPT;
  2232. } else {
  2233. switch (sopt->sopt_name) {
  2234. #ifdef INET
  2235. case SO_ACCEPTFILTER:
  2236. error = do_setopt_accept_filter(so, sopt);
  2237. if (error)
  2238. goto bad;
  2239. break;
  2240. #endif
  2241. case SO_LINGER:
  2242. error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
  2243. if (error)
  2244. goto bad;
  2245. SOCK_LOCK(so);
  2246. so->so_linger = l.l_linger;
  2247. if (l.l_onoff)
  2248. so->so_options |= SO_LINGER;
  2249. else
  2250. so->so_options &= ~SO_LINGER;
  2251. SOCK_UNLOCK(so);
  2252. break;
  2253. case SO_DEBUG:
  2254. case SO_KEEPALIVE:
  2255. case SO_DONTROUTE:
  2256. case SO_USELOOPBACK:
  2257. case SO_BROADCAST:
  2258. case SO_REUSEADDR:
  2259. case SO_REUSEPORT:
  2260. case SO_OOBINLINE:
  2261. case SO_TIMESTAMP:
  2262. case SO_BINTIME:
  2263. case SO_NOSIGPIPE:
  2264. case SO_NO_DDP:
  2265. case SO_NO_OFFLOAD:
  2266. error = sooptcopyin(sopt, &optval, sizeof optval,
  2267. sizeof optval);
  2268. if (error)
  2269. goto bad;
  2270. SOCK_LOCK(so);
  2271. if (optval)
  2272. so->so_options |= sopt->sopt_name;
  2273. else
  2274. so->so_options &= ~sopt->sopt_name;
  2275. SOCK_UNLOCK(so);
  2276. break;
  2277. case SO_SETFIB:
  2278. error = sooptcopyin(sopt, &optval, sizeof optval,
2279. sizeof optval);
if (error)
goto bad;
2280. if (optval < 1 || optval > rt_numfibs) {
  2281. error = EINVAL;
  2282. goto bad;
  2283. }
  2284. if ((so->so_proto->pr_domain->dom_family == PF_INET) ||
  2285. (so->so_proto->pr_domain->dom_family == PF_ROUTE)) {
  2286. so->so_fibnum = optval;
  2287. /* Note: ignore error */
  2288. if (so->so_proto && so->so_proto->pr_ctloutput)
  2289. (*so->so_proto->pr_ctloutput)(so, sopt);
  2290. } else {
  2291. so->so_fibnum = 0;
  2292. }
  2293. break;
  2294. case SO_SNDBUF:
  2295. case SO_RCVBUF:
  2296. case SO_SNDLOWAT:
  2297. case SO_RCVLOWAT:
  2298. error = sooptcopyin(sopt, &optval, sizeof optval,
  2299. sizeof optval);
  2300. if (error)
  2301. goto bad;
  2302. /*
  2303. * Values < 1 make no sense for any of these options,
  2304. * so disallow them.
  2305. */
  2306. if (optval < 1) {
  2307. error = EINVAL;
  2308. goto bad;
  2309. }
  2310. switch (sopt->sopt_name) {
  2311. case SO_SNDBUF:
  2312. case SO_RCVBUF:
  2313. if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
  2314. &so->so_snd : &so->so_rcv, (u_long)optval,
  2315. so, curthread) == 0) {
  2316. error = ENOBUFS;
  2317. goto bad;
  2318. }
  2319. (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
  2320. &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
  2321. break;
  2322. /*
  2323. * Make sure the low-water is never greater than the
  2324. * high-water.
  2325. */
  2326. case SO_SNDLOWAT:
  2327. SOCKBUF_LOCK(&so->so_snd);
  2328. so->so_snd.sb_lowat =
  2329. (optval > so->so_snd.sb_hiwat) ?
  2330. so->so_snd.sb_hiwat : optval;
  2331. SOCKBUF_UNLOCK(&so->so_snd);
  2332. break;
  2333. case SO_RCVLOWAT:
  2334. SOCKBUF_LOCK(&so->so_rcv);
  2335. so->so_rcv.sb_lowat =
  2336. (optval > so->so_rcv.sb_hiwat) ?
  2337. so->so_rcv.sb_hiwat : optval;
  2338. SOCKBUF_UNLOCK(&so->so_rcv);
  2339. break;
  2340. }
  2341. break;
  2342. case SO_SNDTIMEO:
  2343. case SO_RCVTIMEO:
  2344. #ifdef COMPAT_FREEBSD32
  2345. if (SV_CURPROC_FLAG(SV_ILP32)) {
  2346. struct timeval32 tv32;
  2347. error = sooptcopyin(sopt, &tv32, sizeof tv32,
  2348. sizeof tv32);
  2349. CP(tv32, tv, tv_sec);
  2350. CP(tv32, tv, tv_usec);
  2351. } else
  2352. #endif
  2353. error = sooptcopyin(sopt, &tv, sizeof tv,
  2354. sizeof tv);
  2355. if (error)
  2356. goto bad;
  2357. /* assert(hz > 0); */
  2358. if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
  2359. tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
  2360. error = EDOM;
  2361. goto bad;
  2362. }
  2363. /* assert(tick > 0); */
  2364. /* assert(ULONG_MAX - INT_MAX >= 1000000); */
  2365. val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
  2366. if (val > INT_MAX) {
  2367. error = EDOM;
  2368. goto bad;
  2369. }
  2370. if (val == 0 && tv.tv_usec != 0)
  2371. val = 1;
  2372. switch (sopt->sopt_name) {
  2373. case SO_SNDTIMEO:
  2374. so->so_snd.sb_timeo = val;
  2375. break;
  2376. case SO_RCVTIMEO:
  2377. so->so_rcv.sb_timeo = val;
  2378. break;
  2379. }
  2380. break;
  2381. case SO_LABEL:
  2382. #ifdef MAC
  2383. error = sooptcopyin(sopt, &extmac, sizeof extmac,
  2384. sizeof extmac);
  2385. if (error)
  2386. goto bad;
  2387. error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
  2388. so, &extmac);
  2389. #else
  2390. error = EOPNOTSUPP;
  2391. #endif
  2392. break;
  2393. default:
  2394. error = ENOPROTOOPT;
  2395. break;
  2396. }
  2397. if (error == 0 && so->so_proto != NULL &&
  2398. so->so_proto->pr_ctloutput != NULL) {
  2399. (void) ((*so->so_proto->pr_ctloutput)
  2400. (so, sopt));
  2401. }
  2402. }
  2403. bad:
  2404. return (error);
  2405. }
  2406. /*
  2407. * Helper routine for getsockopt.
  2408. */
  2409. int
  2410. sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
  2411. {
  2412. int error;
  2413. size_t valsize;
  2414. error = 0;
  2415. /*
  2416. * Documented get behavior is that we always return a value, possibly
  2417. * truncated to fit in the user's buffer. Traditional behavior is
  2418. * that we always tell the user precisely how much we copied, rather
  2419. * than something useful like the total amount we had available for
  2420. * her. Note that this interface is not idempotent; the entire
2421. * answer must be generated ahead of time.
  2422. */
  2423. valsize = min(len, sopt->sopt_valsize);
  2424. sopt->sopt_valsize = valsize;
  2425. if (sopt->sopt_val != NULL) {
  2426. if (sopt->sopt_td != NULL)
  2427. error = copyout(buf, sopt->sopt_val, valsize);
  2428. else
  2429. bcopy(buf, sopt->sopt_val, valsize);
  2430. }
  2431. return (error);
  2432. }
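/*
 * Counterpart to the sooptcopyin() sketch above (hypothetical handler,
 * not compiled in): a pr_ctloutput() SOPT_GET case returning an
 * integer via sooptcopyout(), which truncates the answer to fit the
 * caller's buffer, as documented.
 */
#if 0
static int
example_ctloutput_get(struct sockopt *sopt, int curval)
{
	return (sooptcopyout(sopt, &curval, sizeof(curval)));
}
#endif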
  2433. int
  2434. sogetopt(struct socket *so, struct sockopt *sopt)
  2435. {
  2436. int error, optval;
  2437. struct linger l;
  2438. struct timeval tv;
  2439. #ifdef MAC
  2440. struct mac extmac;
  2441. #endif
  2442. error = 0;
  2443. if (sopt->sopt_level != SOL_SOCKET) {
  2444. if (so->so_proto && so->so_proto->pr_ctloutput) {
  2445. return ((*so->so_proto->pr_ctloutput)
  2446. (so, sopt));
  2447. } else
  2448. return (ENOPROTOOPT);
  2449. } else {
  2450. switch (sopt->sopt_name) {
  2451. #ifdef INET
  2452. case SO_ACCEPTFILTER:
  2453. error = do_getopt_accept_filter(so, sopt);
  2454. break;
  2455. #endif
  2456. case SO_LINGER:
  2457. SOCK_LOCK(so);
  2458. l.l_onoff = so->so_options & SO_LINGER;
  2459. l.l_linger = so->so_linger;
  2460. SOCK_UNLOCK(so);
  2461. error = sooptcopyout(sopt, &l, sizeof l);
  2462. break;
  2463. case SO_USELOOPBACK:
  2464. case SO_DONTROUTE:
  2465. case SO_DEBUG:
  2466. case SO_KEEPALIVE:
  2467. case SO_REUSEADDR:
  2468. case SO_REUSEPORT:
  2469. case SO_BROADCAST:
  2470. case SO_OOBINLINE:
  2471. case SO_ACCEPTCONN:
  2472. case SO_TIMESTAMP:
  2473. case SO_BINTIME:
  2474. case SO_NOSIGPIPE:
  2475. optval = so->so_options & sopt->sopt_name;
  2476. integer:
  2477. error = sooptcopyout(sopt, &optval, sizeof optval);
  2478. break;
  2479. case SO_TYPE:
  2480. optval = so->so_type;
  2481. goto integer;
  2482. case SO_ERROR:
  2483. SOCK_LOCK(so);
  2484. optval = so->so_error;
  2485. so->so_error = 0;
  2486. SOCK_UNLOCK(so);
  2487. goto integer;
  2488. case SO_SNDBUF:
  2489. optval = so->so_snd.sb_hiwat;
  2490. goto integer;
  2491. case SO_RCVBUF:
  2492. optval = so->so_rcv.sb_hiwat;
  2493. goto integer;
  2494. case SO_SNDLOWAT:
  2495. optval = so->so_snd.sb_lowat;
  2496. goto integer;
  2497. case SO_RCVLOWAT:
  2498. optval = so->so_rcv.sb_lowat;
  2499. goto integer;
  2500. case SO_SNDTIMEO:
  2501. case SO_RCVTIMEO:
  2502. optval = (sopt->sopt_name == SO_SNDTIMEO ?
  2503. so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
  2504. tv.tv_sec = optval / hz;
  2505. tv.tv_usec = (optval % hz) * tick;
  2506. #ifdef COMPAT_FREEBSD32
  2507. if (SV_CURPROC_FLAG(SV_ILP32)) {
  2508. struct timeval32 tv32;
  2509. CP(tv, tv32, tv_sec);
  2510. CP(tv, tv32, tv_usec);
  2511. error = sooptcopyout(sopt, &tv32, sizeof tv32);
  2512. } else
  2513. #endif
  2514. error = sooptcopyout(sopt, &tv, sizeof tv);
  2515. break;
  2516. case SO_LABEL:
  2517. #ifdef MAC
  2518. error = sooptcopyin(sopt, &extmac, sizeof(extmac),
  2519. sizeof(extmac));
  2520. if (error)
  2521. return (error);
  2522. error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
  2523. so, &extmac);
  2524. if (error)
  2525. return (error);
  2526. error = sooptcopyout(sopt, &extmac, sizeof extmac);
  2527. #else
  2528. error = EOPNOTSUPP;
  2529. #endif
  2530. break;
  2531. case SO_PEERLABEL:
  2532. #ifdef MAC
  2533. error = sooptcopyin(sopt, &extmac, sizeof(extmac),
  2534. sizeof(extmac));
  2535. if (error)
  2536. return (error);
  2537. error = mac_getsockopt_peerlabel(
  2538. sopt->sopt_td->td_ucred, so, &extmac);
  2539. if (error)
  2540. return (error);
  2541. error = sooptcopyout(sopt, &extmac, sizeof extmac);
  2542. #else
  2543. error = EOPNOTSUPP;
  2544. #endif
  2545. break;
  2546. case SO_LISTENQLIMIT:
  2547. optval = so->so_qlimit;
  2548. goto integer;
  2549. case SO_LISTENQLEN:
  2550. optval = so->so_qlen;
  2551. goto integer;
  2552. case SO_LISTENINCQLEN:
  2553. optval = so->so_incqlen;
  2554. goto integer;
  2555. default:
  2556. error = ENOPROTOOPT;
  2557. break;
  2558. }
  2559. return (error);
  2560. }
  2561. }
2562. /* XXX: prepare an mbuf chain for (__FreeBSD__ < 3) routines. */
  2563. int
  2564. soopt_getm(struct sockopt *sopt, struct mbuf **mp)
  2565. {
  2566. struct mbuf *m, *m_prev;
  2567. int sopt_size = sopt->sopt_valsize;
  2568. MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
  2569. if (m == NULL)
  2570. return ENOBUFS;
  2571. if (sopt_size > MLEN) {
  2572. MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
  2573. if ((m->m_flags & M_EXT) == 0) {
  2574. m_free(m);
  2575. return ENOBUFS;
  2576. }
  2577. m->m_len = min(MCLBYTES, sopt_size);
  2578. } else {
  2579. m->m_len = min(MLEN, sopt_size);
  2580. }
  2581. sopt_size -= m->m_len;
  2582. *mp = m;
  2583. m_prev = m;
  2584. while (sopt_size) {
  2585. MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
  2586. if (m == NULL) {
  2587. m_freem(*mp);
  2588. return ENOBUFS;
  2589. }
  2590. if (sopt_size > MLEN) {
  2591. MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
  2592. M_DONTWAIT);
  2593. if ((m->m_flags & M_EXT) == 0) {
  2594. m_freem(m);
  2595. m_freem(*mp);
  2596. return ENOBUFS;
  2597. }
  2598. m->m_len = min(MCLBYTES, sopt_size);
  2599. } else {
  2600. m->m_len = min(MLEN, sopt_size);
  2601. }
  2602. sopt_size -= m->m_len;
  2603. m_prev->m_next = m;
  2604. m_prev = m;
  2605. }
  2606. return (0);
  2607. }
2608. /* XXX: copyin sopt data into an mbuf chain for (__FreeBSD__ < 3) routines. */
  2609. int
  2610. soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
  2611. {
  2612. struct mbuf *m0 = m;
  2613. if (sopt->sopt_val == NULL)
  2614. return (0);
  2615. while (m != NULL && sopt->sopt_valsize >= m->m_len) {
  2616. if (sopt->sopt_td != NULL) {
  2617. int error;
  2618. error = copyin(sopt->sopt_val, mtod(m, char *),
  2619. m->m_len);
  2620. if (error != 0) {
  2621. m_freem(m0);
  2622. return(error);
  2623. }
  2624. } else
  2625. bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
  2626. sopt->sopt_valsize -= m->m_len;
  2627. sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
  2628. m = m->m_next;
  2629. }
2630. if (m != NULL) /* chain should have been sized by soopt_getm() */
2631. panic("soopt_mcopyin");
  2632. return (0);
  2633. }
2634. /* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
  2635. int
  2636. soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
  2637. {
  2638. struct mbuf *m0 = m;
  2639. size_t valsize = 0;
  2640. if (sopt->sopt_val == NULL)
  2641. return (0);
  2642. while (m != NULL && sopt->sopt_valsize >= m->m_len) {
  2643. if (sopt->sopt_td != NULL) {
  2644. int error;
  2645. error = copyout(mtod(m, char *), sopt->sopt_val,
  2646. m->m_len);
  2647. if (error != 0) {
  2648. m_freem(m0);
  2649. return(error);
  2650. }
  2651. } else
  2652. bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
  2653. sopt->sopt_valsize -= m->m_len;
  2654. sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
  2655. valsize += m->m_len;
  2656. m = m->m_next;
  2657. }
  2658. if (m != NULL) {
2659. /* a large enough sockopt buffer should have been provided by userland */
  2660. m_freem(m0);
  2661. return(EINVAL);
  2662. }
  2663. sopt->sopt_valsize = valsize;
  2664. return (0);
  2665. }
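/*
 * How the mbuf-based helpers above fit together on the set side
 * (sketch only, not compiled in): soopt_getm() sizes an mbuf chain
 * from sopt_valsize and soopt_mcopyin() fills it from the caller,
 * advancing sopt_val/sopt_valsize as it copies and freeing the chain
 * itself on a copyin failure.
 */
#if 0
static int
example_sockopt_to_mbufs(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	error = soopt_getm(sopt, mp);
	if (error)
		return (error);
	/* On error the chain has already been freed. */
	return (soopt_mcopyin(sopt, *mp));
}
#endif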
  2666. /*
  2667. * sohasoutofband(): protocol notifies socket layer of the arrival of new
  2668. * out-of-band data, which will then notify socket consumers.
  2669. */
  2670. void
  2671. sohasoutofband(struct socket *so)
  2672. {
  2673. if (so->so_sigio != NULL)
  2674. pgsigio(&so->so_sigio, SIGURG, 0);
  2675. selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
  2676. }
  2677. int
  2678. sopoll(struct socket *so, int events, struct ucred *active_cred,
  2679. struct thread *td)
  2680. {
  2681. return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
  2682. td));
  2683. }
  2684. int
  2685. sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
  2686. struct thread *td)
  2687. {
  2688. int revents = 0;
  2689. SOCKBUF_LOCK(&so->so_snd);
  2690. SOCKBUF_LOCK(&so->so_rcv);
  2691. if (events & (POLLIN | POLLRDNORM))
  2692. if (soreadabledata(so))
  2693. revents |= events & (POLLIN | POLLRDNORM);
  2694. if (events & (POLLOUT | POLLWRNORM))
  2695. if (sowriteable(so))
  2696. revents |= events & (POLLOUT | POLLWRNORM);
  2697. if (events & (POLLPRI | POLLRDBAND))
  2698. if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
  2699. revents |= events & (POLLPRI | POLLRDBAND);
  2700. if ((events & POLLINIGNEOF) == 0) {
  2701. if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
  2702. revents |= events & (POLLIN | POLLRDNORM);
  2703. if (so->so_snd.sb_state & SBS_CANTSENDMORE)
  2704. revents |= POLLHUP;
  2705. }
  2706. }
  2707. if (revents == 0) {
  2708. if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
  2709. selrecord(td, &so->so_rcv.sb_sel);
  2710. so->so_rcv.sb_flags |= SB_SEL;
  2711. }
  2712. if (events & (POLLOUT | POLLWRNORM)) {
  2713. selrecord(td, &so->so_snd.sb_sel);
  2714. so->so_snd.sb_flags |= SB_SEL;
  2715. }
  2716. }
  2717. SOCKBUF_UNLOCK(&so->so_rcv);
  2718. SOCKBUF_UNLOCK(&so->so_snd);
  2719. return (revents);
  2720. }
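/*
 * Userland view of sopoll_generic() above (sketch, not compiled in):
 * once the peer closes, POLLIN is still reported at EOF (unless the
 * FreeBSD-specific POLLINIGNEOF is requested), and POLLHUP once both
 * directions are shut down.  "s" is assumed to be a connected socket.
 */
#if 0
#include <poll.h>

static int
wait_readable(int s, int timeout_ms)
{
	struct pollfd pfd;

	pfd.fd = s;
	pfd.events = POLLIN;
	if (poll(&pfd, 1, timeout_ms) <= 0)
		return (0);
	return ((pfd.revents & (POLLIN | POLLHUP)) != 0);
}
#endif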
  2721. int
  2722. soo_kqfilter(struct file *fp, struct knote *kn)
  2723. {
  2724. struct socket *so = kn->kn_fp->f_data;
  2725. struct sockbuf *sb;
  2726. switch (kn->kn_filter) {
  2727. case EVFILT_READ:
  2728. if (so->so_options & SO_ACCEPTCONN)
  2729. kn->kn_fop = &solisten_filtops;
  2730. else
  2731. kn->kn_fop = &soread_filtops;
  2732. sb = &so->so_rcv;
  2733. break;
  2734. case EVFILT_WRITE:
  2735. kn->kn_fop = &sowrite_filtops;
  2736. sb = &so->so_snd;
  2737. break;
  2738. default:
  2739. return (EINVAL);
  2740. }
  2741. SOCKBUF_LOCK(sb);
  2742. knlist_add(&sb->sb_sel.si_note, kn, 1);
  2743. sb->sb_flags |= SB_KNOTE;
  2744. SOCKBUF_UNLOCK(sb);
  2745. return (0);
  2746. }
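/*
 * Userland counterpart to soo_kqfilter()/filt_soread() (sketch, not
 * compiled in): an EVFILT_READ knote with NOTE_LOWAT fires only once
 * kn_data (bytes available) reaches the requested low-water mark.
 * "kq" is an assumed kqueue(2) descriptor and "s" a connected socket.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
wait_for_bytes(int kq, int s, int lowat)
{
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
		return (-1);
	/* Blocks until at least "lowat" bytes are readable on "s". */
	return (kevent(kq, NULL, 0, &kev, 1, NULL));
}
#endif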
  2747. /*
  2748. * Some routines that return EOPNOTSUPP for entry points that are not
  2749. * supported by a protocol. Fill in as needed.
  2750. */
  2751. int
  2752. pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
  2753. {
  2754. return EOPNOTSUPP;
  2755. }
  2756. int
  2757. pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
  2758. {
  2759. return EOPNOTSUPP;
  2760. }
  2761. int
  2762. pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
  2763. {
  2764. return EOPNOTSUPP;
  2765. }
  2766. int
  2767. pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
  2768. {
  2769. return EOPNOTSUPP;
  2770. }
  2771. int
  2772. pru_connect2_notsupp(struct socket *so1, struct socket *so2)
  2773. {
  2774. return EOPNOTSUPP;
  2775. }
  2776. int
  2777. pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
  2778. struct ifnet *ifp, struct thread *td)
  2779. {
  2780. return EOPNOTSUPP;
  2781. }
  2782. int
  2783. pru_disconnect_notsupp(struct socket *so)
  2784. {
  2785. return EOPNOTSUPP;
  2786. }
  2787. int
  2788. pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
  2789. {
  2790. return EOPNOTSUPP;
  2791. }
  2792. int
  2793. pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
  2794. {
  2795. return EOPNOTSUPP;
  2796. }
  2797. int
  2798. pru_rcvd_notsupp(struct socket *so, int flags)
  2799. {
  2800. return EOPNOTSUPP;
  2801. }
  2802. int
  2803. pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
  2804. {
  2805. return EOPNOTSUPP;
  2806. }
  2807. int
  2808. pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
  2809. struct sockaddr *addr, struct mbuf *control, struct thread *td)
  2810. {
  2811. return EOPNOTSUPP;
  2812. }
  2813. /*
  2814. * This isn't really a ``null'' operation, but it's the default one and
  2815. * doesn't do anything destructive.
  2816. */
  2817. int
  2818. pru_sense_null(struct socket *so, struct stat *sb)
  2819. {
  2820. sb->st_blksize = so->so_snd.sb_hiwat;
  2821. return 0;
  2822. }
  2823. int
  2824. pru_shutdown_notsupp(struct socket *so)
  2825. {
  2826. return EOPNOTSUPP;
  2827. }
  2828. int
  2829. pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
  2830. {
  2831. return EOPNOTSUPP;
  2832. }
  2833. int
  2834. pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
  2835. struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
  2836. {
  2837. return EOPNOTSUPP;
  2838. }
  2839. int
  2840. pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
  2841. struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
  2842. {
  2843. return EOPNOTSUPP;
  2844. }
  2845. int
  2846. pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
  2847. struct thread *td)
  2848. {
  2849. return EOPNOTSUPP;
  2850. }
  2851. static void
  2852. filt_sordetach(struct knote *kn)
  2853. {
  2854. struct socket *so = kn->kn_fp->f_data;
  2855. SOCKBUF_LOCK(&so->so_rcv);
  2856. knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
  2857. if (knlist_empty(&so->so_rcv.sb_sel.si_note))
  2858. so->so_rcv.sb_flags &= ~SB_KNOTE;
  2859. SOCKBUF_UNLOCK(&so->so_rcv);
  2860. }
  2861. /*ARGSUSED*/
  2862. static int
  2863. filt_soread(struct knote *kn, long hint)
  2864. {
  2865. struct socket *so;
  2866. so = kn->kn_fp->f_data;
  2867. SOCKBUF_LOCK_ASSERT(&so->so_rcv);
  2868. kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
  2869. if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
  2870. kn->kn_flags |= EV_EOF;
  2871. kn->kn_fflags = so->so_error;
  2872. return (1);
  2873. } else if (so->so_error) /* temporary udp error */
  2874. return (1);
  2875. else if (kn->kn_sfflags & NOTE_LOWAT)
  2876. return (kn->kn_data >= kn->kn_sdata);
  2877. else
  2878. return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
  2879. }
  2880. static void
  2881. filt_sowdetach(struct knote *kn)
  2882. {
  2883. struct socket *so = kn->kn_fp->f_data;
  2884. SOCKBUF_LOCK(&so->so_snd);
  2885. knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
  2886. if (knlist_empty(&so->so_snd.sb_sel.si_note))
  2887. so->so_snd.sb_flags &= ~SB_KNOTE;
  2888. SOCKBUF_UNLOCK(&so->so_snd);
  2889. }
  2890. /*ARGSUSED*/
  2891. static int
  2892. filt_sowrite(struct knote *kn, long hint)
  2893. {
  2894. struct socket *so;
  2895. so = kn->kn_fp->f_data;
  2896. SOCKBUF_LOCK_ASSERT(&so->so_snd);
  2897. kn->kn_data = sbspace(&so->so_snd);
  2898. if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
  2899. kn->kn_flags |= EV_EOF;
  2900. kn->kn_fflags = so->so_error;
  2901. return (1);
  2902. } else if (so->so_error) /* temporary udp error */
  2903. return (1);
  2904. else if (((so->so_state & SS_ISCONNECTED) == 0) &&
  2905. (so->so_proto->pr_flags & PR_CONNREQUIRED))
  2906. return (0);
  2907. else if (kn->kn_sfflags & NOTE_LOWAT)
  2908. return (kn->kn_data >= kn->kn_sdata);
  2909. else
  2910. return (kn->kn_data >= so->so_snd.sb_lowat);
  2911. }
  2912. /*ARGSUSED*/
  2913. static int
  2914. filt_solisten(struct knote *kn, long hint)
  2915. {
  2916. struct socket *so = kn->kn_fp->f_data;
  2917. kn->kn_data = so->so_qlen;
  2918. return (! TAILQ_EMPTY(&so->so_comp));
  2919. }
  2920. int
  2921. socheckuid(struct socket *so, uid_t uid)
  2922. {
  2923. if (so == NULL)
  2924. return (EPERM);
  2925. if (so->so_cred->cr_uid != uid)
  2926. return (EPERM);
  2927. return (0);
  2928. }
  2929. static int
  2930. sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
  2931. {
  2932. int error;
  2933. int val;
  2934. val = somaxconn;
  2935. error = sysctl_handle_int(oidp, &val, 0, req);
2936. if (error || !req->newptr)
  2937. return (error);
  2938. if (val < 1 || val > USHRT_MAX)
  2939. return (EINVAL);
  2940. somaxconn = val;
  2941. return (0);
  2942. }
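/*
 * The handler above is assumed to back the kern.ipc.somaxconn sysctl,
 * as in stock FreeBSD; a userland sketch of reading it through
 * sysctlbyname(3) (not compiled in).
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

static int
read_somaxconn(void)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname("kern.ipc.somaxconn", &val, &len, NULL, 0) < 0)
		return (-1);
	return (val);
}
#endif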
  2943. /*
  2944. * These functions are used by protocols to notify the socket layer (and its
  2945. * consumers) of state changes in the sockets driven by protocol-side events.
  2946. */
  2947. /*
  2948. * Procedures to manipulate state flags of socket and do appropriate wakeups.
  2949. *
  2950. * Normal sequence from the active (originating) side is that
  2951. * soisconnecting() is called during processing of connect() call, resulting
  2952. * in an eventual call to soisconnected() if/when the connection is
  2953. * established. When the connection is torn down soisdisconnecting() is
  2954. * called during processing of disconnect() call, and soisdisconnected() is
  2955. * called when the connection to the peer is totally severed. The semantics
  2956. * of these routines are such that connectionless protocols can call
  2957. * soisconnected() and soisdisconnected() only, bypassing the in-progress
  2958. * calls when setting up a ``connection'' takes no time.
  2959. *
  2960. * From the passive side, a socket is created with two queues of sockets:
  2961. * so_incomp for connections in progress and so_comp for connections already
  2962. * made and awaiting user acceptance. As a protocol is preparing incoming
  2963. * connections, it creates a socket structure queued on so_incomp by calling
  2964. * sonewconn(). When the connection is established, soisconnected() is
  2965. * called, and transfers the socket structure to so_comp, making it available
  2966. * to accept().
  2967. *
  2968. * If a socket is closed with sockets on either so_incomp or so_comp, these
  2969. * sockets are dropped.
  2970. *
  2971. * If higher-level protocols are implemented in the kernel, the wakeups done
  2972. * here will sometimes cause software-interrupt process scheduling.
  2973. */
  2974. void
  2975. soisconnecting(struct socket *so)
  2976. {
  2977. SOCK_LOCK(so);
  2978. so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
  2979. so->so_state |= SS_ISCONNECTING;
  2980. SOCK_UNLOCK(so);
  2981. }
  2982. void
  2983. soisconnected(struct socket *so)
  2984. {
  2985. struct socket *head;
  2986. int ret;
  2987. restart:
  2988. ACCEPT_LOCK();
  2989. SOCK_LOCK(so);
  2990. so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
  2991. so->so_state |= SS_ISCONNECTED;
  2992. head = so->so_head;
  2993. if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
  2994. if ((so->so_options & SO_ACCEPTFILTER) == 0) {
  2995. SOCK_UNLOCK(so);
  2996. TAILQ_REMOVE(&head->so_incomp, so, so_list);
  2997. head->so_incqlen--;
  2998. so->so_qstate &= ~SQ_INCOMP;
  2999. TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
  3000. head->so_qlen++;
  3001. so->so_qstate |= SQ_COMP;
  3002. ACCEPT_UNLOCK();
  3003. sorwakeup(head);
  3004. wakeup_one(&head->so_timeo);
  3005. } else {
  3006. ACCEPT_UNLOCK();
  3007. soupcall_set(so, SO_RCV,
  3008. head->so_accf->so_accept_filter->accf_callback,
  3009. head->so_accf->so_accept_filter_arg);
  3010. so->so_options &= ~SO_ACCEPTFILTER;
  3011. ret = head->so_accf->so_accept_filter->accf_callback(so,
  3012. head->so_accf->so_accept_filter_arg, M_DONTWAIT);
  3013. if (ret == SU_ISCONNECTED)
  3014. soupcall_clear(so, SO_RCV);
  3015. SOCK_UNLOCK(so);
  3016. if (ret == SU_ISCONNECTED)
  3017. goto restart;
  3018. }
  3019. return;
  3020. }
  3021. SOCK_UNLOCK(so);
  3022. ACCEPT_UNLOCK();
  3023. wakeup(&so->so_timeo);
  3024. sorwakeup(so);
  3025. sowwakeup(so);
  3026. }
  3027. void
  3028. soisdisconnecting(struct socket *so)
  3029. {
  3030. /*
  3031. * Note: This code assumes that SOCK_LOCK(so) and
  3032. * SOCKBUF_LOCK(&so->so_rcv) are the same.
  3033. */
  3034. SOCKBUF_LOCK(&so->so_rcv);
  3035. so->so_state &= ~SS_ISCONNECTING;
  3036. so->so_state |= SS_ISDISCONNECTING;
  3037. so->so_rcv.sb_state |= SBS_CANTRCVMORE;
  3038. sorwakeup_locked(so);
  3039. SOCKBUF_LOCK(&so->so_snd);
  3040. so->so_snd.sb_state |= SBS_CANTSENDMORE;
  3041. sowwakeup_locked(so);
  3042. wakeup(&so->so_timeo);
  3043. }
  3044. void
  3045. soisdisconnected(struct socket *so)
  3046. {
  3047. /*
  3048. * Note: This code assumes that SOCK_LOCK(so) and
  3049. * SOCKBUF_LOCK(&so->so_rcv) are the same.
  3050. */
  3051. SOCKBUF_LOCK(&so->so_rcv);
  3052. so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
  3053. so->so_state |= SS_ISDISCONNECTED;
  3054. so->so_rcv.sb_state |= SBS_CANTRCVMORE;
  3055. sorwakeup_locked(so);
  3056. SOCKBUF_LOCK(&so->so_snd);
  3057. so->so_snd.sb_state |= SBS_CANTSENDMORE;
  3058. sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
  3059. sowwakeup_locked(so);
  3060. wakeup(&so->so_timeo);
  3061. }
  3062. /*
  3063. * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
  3064. */
  3065. struct sockaddr *
  3066. sodupsockaddr(const struct sockaddr *sa, int mflags)
  3067. {
  3068. struct sockaddr *sa2;
  3069. sa2 = malloc(sa->sa_len, M_SONAME, mflags);
  3070. if (sa2)
  3071. bcopy(sa, sa2, sa->sa_len);
  3072. return sa2;
  3073. }
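/*
 * Usage sketch for sodupsockaddr() (not compiled in): the copy comes
 * from the M_SONAME malloc type, so the consumer releases it with
 * free(9) and that same type.  M_NOWAIT allocations may fail.
 */
#if 0
static void
example_copy_and_release(const struct sockaddr *sa)
{
	struct sockaddr *copy;

	copy = sodupsockaddr(sa, M_NOWAIT);
	if (copy == NULL)
		return;
	/* ... use the private copy ... */
	free(copy, M_SONAME);
}
#endif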
  3074. /*
  3075. * Register per-socket buffer upcalls.
  3076. */
  3077. void
  3078. soupcall_set(struct socket *so, int which,
  3079. int (*func)(struct socket *, void *, int), void *arg)
  3080. {
  3081. struct sockbuf *sb;
  3082. switch (which) {
  3083. case SO_RCV:
  3084. sb = &so->so_rcv;
  3085. break;
  3086. case SO_SND:
  3087. sb = &so->so_snd;
  3088. break;
  3089. default:
  3090. panic("soupcall_set: bad which");
  3091. }
  3092. SOCKBUF_LOCK_ASSERT(sb);
  3093. #if 0
  3094. /* XXX: accf_http actually wants to do this on purpose. */
  3095. KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
  3096. #endif
  3097. sb->sb_upcall = func;
  3098. sb->sb_upcallarg = arg;
  3099. sb->sb_flags |= SB_UPCALL;
  3100. }
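/*
 * Registration sketch for the upcall interface above (hypothetical
 * callback, not compiled in): the sockbuf lock must be held around
 * soupcall_set(), as its assertion requires, and the callback returns
 * SU_OK to stay registered.
 */
#if 0
static int
example_rcv_upcall(struct socket *so, void *arg, int waitflag)
{
	/* ... inspect so->so_rcv and kick a worker thread ... */
	return (SU_OK);
}

static void
example_register(struct socket *so)
{
	SOCKBUF_LOCK(&so->so_rcv);
	soupcall_set(so, SO_RCV, example_rcv_upcall, NULL);
	SOCKBUF_UNLOCK(&so->so_rcv);
}
#endif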
  3101. void
  3102. soupcall_clear(struct socket *so, int which)
  3103. {
  3104. struct sockbuf *sb;
  3105. switch (which) {
  3106. case SO_RCV:
  3107. sb = &so->so_rcv;
  3108. break;
  3109. case SO_SND:
  3110. sb = &so->so_snd;
  3111. break;
  3112. default:
  3113. panic("soupcall_clear: bad which");
  3114. }
  3115. SOCKBUF_LOCK_ASSERT(sb);
  3116. KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
  3117. sb->sb_upcall = NULL;
  3118. sb->sb_upcallarg = NULL;
  3119. sb->sb_flags &= ~SB_UPCALL;
  3120. }
  3121. /*
  3122. * Create an external-format (``xsocket'') structure using the information in
  3123. * the kernel-format socket structure pointed to by so. This is done to
  3124. * reduce the spew of irrelevant information over this interface, to isolate
  3125. * user code from changes in the kernel structure, and potentially to provide
  3126. * information-hiding if we decide that some of this information should be
  3127. * hidden from users.
  3128. */
  3129. void
  3130. sotoxsocket(struct socket *so, struct xsocket *xso)
  3131. {
  3132. xso->xso_len = sizeof *xso;
  3133. xso->xso_so = so;
  3134. xso->so_type = so->so_type;
  3135. xso->so_options = so->so_options;
  3136. xso->so_linger = so->so_linger;
  3137. xso->so_state = so->so_state;
  3138. xso->so_pcb = so->so_pcb;
  3139. xso->xso_protocol = so->so_proto->pr_protocol;
  3140. xso->xso_family = so->so_proto->pr_domain->dom_family;
  3141. xso->so_qlen = so->so_qlen;
  3142. xso->so_incqlen = so->so_incqlen;
  3143. xso->so_qlimit = so->so_qlimit;
  3144. xso->so_timeo = so->so_timeo;
  3145. xso->so_error = so->so_error;
  3146. xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
  3147. xso->so_oobmark = so->so_oobmark;
  3148. sbtoxsockbuf(&so->so_snd, &xso->so_snd);
  3149. sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
  3150. xso->so_uid = so->so_cred->cr_uid;
  3151. }
  3152. /*
  3153. * Socket accessor functions to provide external consumers with
3154. * a safe interface to socket state.
  3155. *
  3156. */
  3157. void
  3158. so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
  3159. {
  3160. TAILQ_FOREACH(so, &so->so_comp, so_list)
  3161. func(so, arg);
  3162. }
  3163. struct sockbuf *
  3164. so_sockbuf_rcv(struct socket *so)
  3165. {
  3166. return (&so->so_rcv);
  3167. }
  3168. struct sockbuf *
  3169. so_sockbuf_snd(struct socket *so)
  3170. {
  3171. return (&so->so_snd);
  3172. }
  3173. int
  3174. so_state_get(const struct socket *so)
  3175. {
  3176. return (so->so_state);
  3177. }
  3178. void
  3179. so_state_set(struct socket *so, int val)
  3180. {
  3181. so->so_state = val;
  3182. }
  3183. int
  3184. so_options_get(const struct socket *so)
  3185. {
  3186. return (so->so_options);
  3187. }
  3188. void
  3189. so_options_set(struct socket *so, int val)
  3190. {
  3191. so->so_options = val;
  3192. }
  3193. int
  3194. so_error_get(const struct socket *so)
  3195. {
  3196. return (so->so_error);
  3197. }
  3198. void
  3199. so_error_set(struct socket *so, int val)
  3200. {
  3201. so->so_error = val;
  3202. }
  3203. int
  3204. so_linger_get(const struct socket *so)
  3205. {
  3206. return (so->so_linger);
  3207. }
  3208. void
  3209. so_linger_set(struct socket *so, int val)
  3210. {
  3211. so->so_linger = val;
  3212. }
  3213. struct protosw *
  3214. so_protosw_get(const struct socket *so)
  3215. {
  3216. return (so->so_proto);
  3217. }
  3218. void
  3219. so_protosw_set(struct socket *so, struct protosw *val)
  3220. {
  3221. so->so_proto = val;
  3222. }
  3223. void
  3224. so_sorwakeup(struct socket *so)
  3225. {
  3226. sorwakeup(so);
  3227. }
  3228. void
  3229. so_sowwakeup(struct socket *so)
  3230. {
  3231. sowwakeup(so);
  3232. }
  3233. void
  3234. so_sorwakeup_locked(struct socket *so)
  3235. {
  3236. sorwakeup_locked(so);
  3237. }
  3238. void
  3239. so_sowwakeup_locked(struct socket *so)
  3240. {
  3241. sowwakeup_locked(so);
  3242. }
  3243. void
  3244. so_lock(struct socket *so)
  3245. {
  3246. SOCK_LOCK(so);
  3247. }
  3248. void
  3249. so_unlock(struct socket *so)
  3250. {
  3251. SOCK_UNLOCK(so);
  3252. }