
/bsd/sys/kern/uipc_socket.cc

https://gitlab.com/jforge/osv
C++ | 3582 lines | 2512 code | 293 blank | 777 comment | 821 complexity
Possible License(s): BSD-3-Clause, 0BSD, MPL-2.0-no-copyleft-exception

Large files are truncated; view the full file in the repository.

  1. /*-
  2. * Copyright (c) 1982, 1986, 1988, 1990, 1993
  3. * The Regents of the University of California.
  4. * Copyright (c) 2004 The FreeBSD Foundation
  5. * Copyright (c) 2004-2008 Robert N. M. Watson
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. * 4. Neither the name of the University nor the names of its contributors
  17. * may be used to endorse or promote products derived from this software
  18. * without specific prior written permission.
  19. *
  20. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30. * SUCH DAMAGE.
  31. *
  32. * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
  33. */
  34. /*
  35. * Comments on the socket life cycle:
  36. *
  37. * soalloc() sets up socket layer state for a socket, called only by
  38. * socreate() and sonewconn(). Socket layer private.
  39. *
  40. * sodealloc() tears down socket layer state for a socket, called only by
  41. * sofree() and sonewconn(). Socket layer private.
  42. *
  43. * pru_attach() associates protocol layer state with an allocated socket;
  44. * called only once, may fail, aborting socket allocation. This is called
  45. * from socreate() and sonewconn(). Socket layer private.
  46. *
  47. * pru_detach() disassociates protocol layer state from an attached socket,
  48. * and will be called exactly once for sockets in which pru_attach() has
  49. * been successfully called. If pru_attach() returned an error,
  50. * pru_detach() will not be called. Socket layer private.
  51. *
  52. * pru_abort() and pru_close() notify the protocol layer that the last
  53. * consumer of a socket is starting to tear down the socket, and that the
  54. * protocol should terminate the connection. Historically, pru_abort() also
  55. * detached protocol state from the socket state, but this is no longer the
  56. * case.
  57. *
  58. * socreate() creates a socket and attaches protocol state. This is a public
  59. * interface that may be used by socket layer consumers to create new
  60. * sockets.
  61. *
  62. * sonewconn() creates a socket and attaches protocol state. This is a
  63. * public interface that may be used by protocols to create new sockets when
  64. * a new connection is received and will be available for accept() on a
  65. * listen socket.
  66. *
  67. * soclose() destroys a socket after possibly waiting for it to disconnect.
  68. * This is a public interface that socket consumers should use to close and
  69. * release a socket when done with it.
  70. *
  71. * soabort() destroys a socket without waiting for it to disconnect (used
  72. * only for incoming connections that are already partially or fully
  73. * connected). This is used internally by the socket layer when clearing
  74. * listen socket queues (due to overflow or close on the listen socket), but
  75. * is also a public interface protocols may use to abort connections in
  76. * their incomplete listen queues should they no longer be required. Sockets
  77. * placed in completed connection listen queues should not be aborted for
  78. * reasons described in the comment above the soclose() implementation. This
  79. * is not a general purpose close routine, and except in the specific
  80. * circumstances described here, should not be used.
  81. *
  82. * sofree() will free a socket and its protocol state if all references on
  83. * the socket have been released, and is the public interface to attempt to
  84. * free a socket when a reference is removed. This is a socket layer private
  85. * interface.
  86. *
  87. * NOTE: In addition to socreate() and soclose(), which provide a single
  88. * socket reference to the consumer to be managed as required, there are two
  89. * calls to explicitly manage socket references: soref() and sorele().
  90. * Currently, these are generally required only when transitioning a socket
  91. * from a listen queue to a file descriptor, in order to prevent garbage
  92. * collection of the socket at an untimely moment. For a number of reasons,
  93. * these interfaces are not preferred, and should be avoided.
  94. *
  95. * NOTE: With regard to VNETs the general rule is that callers do not set
  96. * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
  97. * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
  98. * and sorflush(), which are usually called from a pre-set VNET context.
  99. * sopoll() currently does not need a VNET context to be set.
  100. */
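
A minimal, hypothetical consumer of the life cycle described above (a sketch only: passing NULL for cred/td and the bsd_sockaddr_in layout are assumptions based on OSv's BSD port, not code from this file):

    /* Sketch: create, bind, and listen on a TCP socket, mirroring the
     * socreate() -> sobind() -> solisten() -> soclose() cycle above. */
    static int example_listen_socket(struct socket **out)
    {
        struct socket *so;
        int error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
                             NULL /* cred */, NULL /* td */);
        if (error)
            return (error);
        struct bsd_sockaddr_in sin = {};
        sin.sin_len = sizeof(sin);
        sin.sin_family = AF_INET;
        sin.sin_port = htons(8080);
        error = sobind(so, (struct bsd_sockaddr *)&sin, NULL);
        if (error == 0)
            error = solisten(so, 128, NULL);
        if (error) {
            soclose(so);  /* releases the single reference from socreate() */
            return (error);
        }
        *out = so;
        return (0);
    }
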
  101. #include <sys/cdefs.h>
  102. #include <stddef.h>
  103. #include <osv/poll.h>
  104. #include <sys/epoll.h>
  105. #include <osv/debug.h>
  106. #include <cinttypes>
  107. #include <bsd/porting/netport.h>
  108. #include <bsd/porting/uma_stub.h>
  109. #include <bsd/porting/sync_stub.h>
  110. #include <bsd/porting/synch.h>
  111. #include <bsd/sys/sys/libkern.h>
  112. #include <bsd/sys/sys/param.h>
  113. #include <bsd/sys/sys/limits.h>
  114. #include <bsd/sys/sys/mbuf.h>
  115. #include <bsd/sys/sys/domain.h>
  116. #include <bsd/sys/sys/eventhandler.h>
  117. #include <bsd/sys/sys/protosw.h>
  118. #include <bsd/sys/sys/socket.h>
  119. #include <bsd/sys/sys/socketvar.h>
  120. #include <bsd/sys/net/route.h>
  121. #include <bsd/sys/net/vnet.h>
  122. #include <osv/zcopy.hh>
  123. #define uipc_d(...) tprintf_d("uipc_socket", __VA_ARGS__)
  124. static int soreceive_rcvoob(struct socket *so, struct uio *uio,
  125. int flags);
  126. so_gen_t so_gencnt; /* generation count for sockets */
  127. int maxsockets = 256;
  128. MALLOC_DEFINE(M_SONAME, "soname", "socket name");
  129. MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
  130. #define VNET_SO_ASSERT(so) \
  131. VNET_ASSERT(curvnet != NULL, \
  132. ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
  133. static int somaxconn = SOMAXCONN;
  134. #if 0
  135. static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
  136. /* XXX: we don't have SYSCTL_USHORT */
  137. SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
  138. 0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
  139. "queue size");
  140. #endif
  141. static int numopensockets;
  142. SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
  143. &numopensockets, 0, "Number of open sockets");
  144. /*
  145. * accept_mtx locks down per-socket fields relating to accept queues. See
  146. * socketvar.h for an annotation of the protected fields of struct socket.
  147. */
  148. struct mtx accept_mtx;
  149. /*
  150. * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
  151. * so_gencnt field.
  152. */
  153. static struct mtx so_global_mtx;
  154. /*
  155. * General IPC sysctl name space, used by sockets and a variety of other IPC
  156. * types.
  157. */
  158. SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
  159. #if 0
  160. /*
  161. * Sysctl to get and set the maximum global sockets limit. Notify protocols
  162. * of the change so that they can update their dependent limits as required.
  163. */
  164. static int
  165. sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
  166. {
  167. int error, newmaxsockets;
  168. newmaxsockets = maxsockets;
  169. error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
  170. if (error == 0 && req->newptr) {
  171. if (newmaxsockets > maxsockets) {
  172. maxsockets = newmaxsockets;
  173. if (maxsockets > ((maxfiles / 4) * 3)) {
  174. maxfiles = (maxsockets * 5) / 4;
  175. maxfilesperproc = (maxfiles * 9) / 10;
  176. }
  177. EVENTHANDLER_INVOKE(maxsockets_change);
  178. } else
  179. error = EINVAL;
  180. }
  181. return (error);
  182. }
  183. SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
  184. &maxsockets, 0, sysctl_maxsockets, "IU",
  185. "Maximum number of sockets avaliable");
  186. #endif
  187. /*
  188. * Initialise maxsockets. This SYSINIT must be run after
  189. * tunable_mbinit().
  190. */
  191. void
  192. init_maxsockets(void *ignored)
  193. {
  194. mtx_init(&accept_mtx, "accept", NULL, MTX_DEF);
  195. mtx_init(&so_global_mtx, "so_global", NULL, MTX_DEF);
  196. TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
  197. maxsockets = 0x2000;
  198. }
  199. SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
  200. /*
  201. * Socket operation routines. These routines are called by the routines in
  202. * sys_socket.c or from a system process, and implement the semantics of
  203. * socket operations by switching out to the protocol specific routines.
  204. */
  205. /*
  206. * Get a socket structure from our zone, and initialize it. Note that it
  207. * would probably be better to allocate socket and PCB at the same time, but
  208. * I'm not convinced that all the protocols can be easily modified to do
  209. * this.
  210. *
  211. * soalloc() returns a socket with a ref count of 0.
  212. */
  213. static struct socket *
  214. soalloc(struct vnet *vnet)
  215. {
  216. struct socket *so;
  217. so = new socket{};
  218. if (so == NULL)
  219. return (NULL);
  220. uipc_d("soalloc() so=%" PRIx64, (uint64_t)so);
  221. TAILQ_INIT(&so->so_aiojobq);
  222. mtx_lock(&so_global_mtx);
  223. so->so_gencnt = ++so_gencnt;
  224. ++numopensockets;
  225. mtx_unlock(&so_global_mtx);
  226. return (so);
  227. }
  228. /*
  229. * Free the storage associated with a socket at the socket layer, tear down
  230. * locks, labels, etc. All protocol state is assumed already to have been
  231. * torn down (and possibly never set up) by the caller.
  232. */
  233. static void
  234. sodealloc(struct socket *so)
  235. {
  236. uipc_d("sodealloc() so=%" PRIx64, (uint64_t)so);
  237. KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
  238. KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
  239. mtx_lock(&so_global_mtx);
  240. so->so_gencnt = ++so_gencnt;
  241. --numopensockets; /* Could be below, but faster here. */
  242. mtx_unlock(&so_global_mtx);
  243. so->so_rcv.sb_hiwat = 0;
  244. so->so_snd.sb_hiwat = 0;
  245. #ifdef INET
  246. /* FIXME: OSv - should this be supported? */
  247. # if 0
  248. /* remove accept filter if one is present. */
  249. if (so->so_accf != NULL)
  250. do_setopt_accept_filter(so, NULL);
  251. # endif
  252. #endif
  253. delete so;
  254. }
  255. /*
  256. * socreate returns a socket with a ref count of 1. The socket should be
  257. * closed with soclose().
  258. */
  259. int
  260. socreate(int dom, struct socket **aso, int type, int proto,
  261. struct ucred *cred, struct thread *td)
  262. {
  263. struct protosw *prp;
  264. struct socket *so;
  265. int error;
  266. if (proto)
  267. prp = pffindproto(dom, proto, type);
  268. else
  269. prp = pffindtype(dom, type);
  270. if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
  271. prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
  272. return (EPROTONOSUPPORT);
  273. if (prp->pr_type != type)
  274. return (EPROTOTYPE);
  275. so = soalloc(CRED_TO_VNET(cred));
  276. if (so == NULL)
  277. return (ENOBUFS);
  278. TAILQ_INIT(&so->so_incomp);
  279. TAILQ_INIT(&so->so_comp);
  280. so->so_type = type;
  281. if ((prp->pr_domain->dom_family == PF_INET) ||
  282. (prp->pr_domain->dom_family == PF_INET6) ||
  283. (prp->pr_domain->dom_family == PF_ROUTE))
  284. so->so_fibnum = RT_DEFAULT_FIB;
  285. else
  286. so->so_fibnum = 0;
  287. so->so_proto = prp;
  288. so->so_count = 1;
  289. /*
  290. * Auto-sizing of socket buffers is managed by the protocols and
  291. * the appropriate flags must be set in the pru_attach function.
  292. */
  293. CURVNET_SET(so->so_vnet);
  294. error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
  295. CURVNET_RESTORE();
  296. if (error) {
  297. KASSERT(so->so_count == 1, ("socreate: so_count %d",
  298. so->so_count));
  299. so->so_count = 0;
  300. sodealloc(so);
  301. return (error);
  302. }
  303. *aso = so;
  304. return (0);
  305. }
  306. #ifdef REGRESSION
  307. static int regression_sonewconn_earlytest = 1;
  308. SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
  309. &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
  310. #endif
  311. /*
  312. * When an attempt at a new connection is noted on a socket which accepts
  313. * connections, sonewconn is called. If the connection is possible (subject
  314. * to space constraints, etc.) then we allocate a new structure, properly
  315. * linked into the data structure of the original socket, and return this.
  316. * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
  317. *
  318. * Note: the ref count on the socket is 0 on return.
  319. */
  320. struct socket *
  321. sonewconn(struct socket *head, int connstatus)
  322. {
  323. struct socket *so;
  324. int over;
  325. uipc_d("sonewconn() head=%" PRIx64, (uint64_t)head);
  326. ACCEPT_LOCK();
  327. over = (head->so_qlen > 3 * head->so_qlimit / 2);
  328. ACCEPT_UNLOCK();
  329. #ifdef REGRESSION
  330. if (regression_sonewconn_earlytest && over)
  331. #else
  332. if (over)
  333. #endif
  334. return (NULL);
  335. VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
  336. __func__, __LINE__, head));
  337. so = soalloc(head->so_vnet);
  338. if (so == NULL)
  339. return (NULL);
  340. if ((head->so_options & SO_ACCEPTFILTER) != 0)
  341. connstatus = 0;
  342. so->so_head = head;
  343. so->so_type = head->so_type;
  344. so->so_options = head->so_options &~ SO_ACCEPTCONN;
  345. so->so_linger = head->so_linger;
  346. so->so_state = head->so_state | SS_NOFDREF;
  347. so->so_fibnum = head->so_fibnum;
  348. so->so_proto = head->so_proto;
  349. VNET_SO_ASSERT(head);
  350. if (soreserve_internal(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
  351. (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
  352. sodealloc(so);
  353. return (NULL);
  354. }
  355. so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
  356. so->so_snd.sb_lowat = head->so_snd.sb_lowat;
  357. so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
  358. so->so_snd.sb_timeo = head->so_snd.sb_timeo;
  359. so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
  360. so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
  361. so->so_state |= connstatus;
  362. ACCEPT_LOCK();
  363. if (connstatus) {
  364. TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
  365. so->so_qstate |= SQ_COMP;
  366. head->so_qlen++;
  367. } else {
  368. /*
  369. * Keep removing sockets from the head until there's room for
  370. * us to insert on the tail. In pre-locking revisions, this
  371. * was a simple if(), but as we could be racing with other
  372. * threads and soabort() requires dropping locks, we must
  373. * loop waiting for the condition to be true.
  374. */
  375. while (head->so_incqlen > head->so_qlimit) {
  376. struct socket *sp;
  377. sp = TAILQ_FIRST(&head->so_incomp);
  378. TAILQ_REMOVE(&head->so_incomp, sp, so_list);
  379. head->so_incqlen--;
  380. sp->so_qstate &= ~SQ_INCOMP;
  381. sp->so_head = NULL;
  382. ACCEPT_UNLOCK();
  383. soabort(sp);
  384. ACCEPT_LOCK();
  385. }
  386. TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
  387. so->so_qstate |= SQ_INCOMP;
  388. head->so_incqlen++;
  389. }
  390. ACCEPT_UNLOCK();
  391. if (connstatus) {
  392. sorwakeup(head);
  393. wakeup_one(&head->so_timeo);
  394. }
  395. return (so);
  396. }
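
A sketch of the protocol-side usage described in the comment above sonewconn() (hypothetical; soisconnected() belongs to the wider BSD stack, not this file):

    /* When a connection request arrives on listening socket `head`, the
     * protocol allocates a child socket on the incomplete queue, then
     * promotes it once the handshake finishes. */
    static struct socket *example_incoming(struct socket *head)
    {
        struct socket *so = sonewconn(head, 0);  /* queued on so_incomp */
        if (so == NULL)
            return (NULL);                       /* queue full: drop */
        /* ... protocol handshake completes ... */
        soisconnected(so);                       /* moves so to so_comp */
        return (so);
    }
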
  397. int
  398. sobind(struct socket *so, struct bsd_sockaddr *nam, struct thread *td)
  399. {
  400. int error;
  401. CURVNET_SET(so->so_vnet);
  402. error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
  403. CURVNET_RESTORE();
  404. return error;
  405. }
  406. /*
  407. * solisten() transitions a socket from a non-listening state to a listening
  408. * state, but can also be used to update the listen queue depth on an
  409. * existing listen socket. The protocol will call back into the sockets
  410. * layer using solisten_proto_check() and solisten_proto() to check and set
  411. * socket-layer listen state. Call backs are used so that the protocol can
  412. * acquire both protocol and socket layer locks in whatever order is required
  413. * by the protocol.
  414. *
  415. * Protocol implementors are advised to hold the socket lock across the
  416. * socket-layer test and set to avoid races at the socket layer.
  417. */
  418. int
  419. solisten(struct socket *so, int backlog, struct thread *td)
  420. {
  421. int error;
  422. CURVNET_SET(so->so_vnet);
  423. error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
  424. CURVNET_RESTORE();
  425. return error;
  426. }
  427. int
  428. solisten_proto_check(struct socket *so)
  429. {
  430. SOCK_LOCK_ASSERT(so);
  431. if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
  432. SS_ISDISCONNECTING))
  433. return (EINVAL);
  434. return (0);
  435. }
  436. void
  437. solisten_proto(struct socket *so, int backlog)
  438. {
  439. SOCK_LOCK_ASSERT(so);
  440. if (backlog < 0 || backlog > somaxconn) backlog = somaxconn;
  441. so->so_qlimit = backlog;
  442. so->so_options |= SO_ACCEPTCONN;
  443. }
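
The check/set callback pattern that the solisten() comment describes, written out as a hypothetical pru_listen implementation (a sketch; a real protocol such as TCP also updates its own PCB state under its own locks):

    static int example_pru_listen(struct socket *so, int backlog,
        struct thread *td)
    {
        int error;
        SOCK_LOCK(so);
        error = solisten_proto_check(so);  /* EINVAL if (dis)connecting */
        if (error == 0)
            solisten_proto(so, backlog);   /* sets SO_ACCEPTCONN, so_qlimit */
        SOCK_UNLOCK(so);
        return (error);
    }
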
  444. /*
  445. * Evaluate the reference count and named references on a socket; if no
  446. * references remain, free it. This should be called whenever a reference is
  447. * released, such as in sorele(), but also when named reference flags are
  448. * cleared in socket or protocol code.
  449. *
  450. * sofree() will free the socket if:
  451. *
  452. * - There are no outstanding file descriptor references or related consumers
  453. * (so_count == 0).
  454. *
  455. * - The socket has been closed by user space, if ever open (SS_NOFDREF).
  456. *
  457. * - The protocol does not have an outstanding strong reference on the socket
  458. * (SS_PROTOREF).
  459. *
  460. * - The socket is not in a completed connection queue, so a process has been
  461. * notified that it is present. If it is removed, the user process may
  462. * block in accept() despite select() saying the socket was ready.
  463. */
  464. void
  465. sofree(struct socket *so)
  466. {
  467. uipc_d("sofree() so=%" PRIx64, (uint64_t)so);
  468. struct protosw *pr = so->so_proto;
  469. struct socket *head;
  470. ACCEPT_LOCK_ASSERT();
  471. SOCK_LOCK_ASSERT(so);
  472. if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
  473. (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
  474. SOCK_UNLOCK(so);
  475. ACCEPT_UNLOCK();
  476. return;
  477. }
  478. head = so->so_head;
  479. if (head != NULL) {
  480. KASSERT((so->so_qstate & SQ_COMP) != 0 ||
  481. (so->so_qstate & SQ_INCOMP) != 0,
  482. ("sofree: so_head != NULL, but neither SQ_COMP nor "
  483. "SQ_INCOMP"));
  484. KASSERT((so->so_qstate & SQ_COMP) == 0 ||
  485. (so->so_qstate & SQ_INCOMP) == 0,
  486. ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
  487. TAILQ_REMOVE(&head->so_incomp, so, so_list);
  488. head->so_incqlen--;
  489. so->so_qstate &= ~SQ_INCOMP;
  490. so->so_head = NULL;
  491. }
  492. KASSERT((so->so_qstate & SQ_COMP) == 0 &&
  493. (so->so_qstate & SQ_INCOMP) == 0,
  494. ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
  495. so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
  496. if (so->so_options & SO_ACCEPTCONN) {
  497. KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
  498. KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
  499. }
  500. SOCK_UNLOCK(so);
  501. ACCEPT_UNLOCK();
  502. VNET_SO_ASSERT(so);
  503. if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
  504. (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
  505. if (pr->pr_usrreqs->pru_detach != NULL)
  506. (*pr->pr_usrreqs->pru_detach)(so);
  507. so->so_mtx = nullptr;
  508. /*
  509. * From this point on, we assume that no other references to this
  510. * socket exist anywhere else in the stack. Therefore, no locks need
  511. * to be acquired or held.
  512. *
  513. * We used to do a lot of socket buffer and socket locking here, as
  514. * well as invoke sorflush() and perform wakeups. The direct call to
  515. * dom_dispose() and sbrelease_internal() are an inlining of what was
  516. * necessary from sorflush().
  517. *
  518. * Notice that the socket buffer and kqueue state are torn down
  519. * before calling pru_detach. This means that protocols should not
  520. * assume they can perform socket wakeups, etc, in their detach code.
  521. */
  522. sbdestroy(&so->so_snd, so);
  523. sbdestroy(&so->so_rcv, so);
  524. sodealloc(so);
  525. }
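
The locking contract around sorele()/sofree() mirrors what soclose() does below: both ACCEPT_LOCK and the socket lock are taken, and sorele() either frees the socket via sofree() or drops the locks itself. A minimal sketch of that pattern, assuming the FreeBSD-style sorele() macro:

    static void example_release(struct socket *so)
    {
        ACCEPT_LOCK();
        SOCK_LOCK(so);
        sorele(so);  /* drops so_count; calls sofree() on the last
                      * reference, otherwise unlocks both locks */
    }
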
  526. static void flush_net_channel(struct socket *so)
  527. {
  528. if (so->so_nc) {
  529. so->so_nc->process_queue();
  530. }
  531. }
  532. /*
  533. * Close a socket on last file table reference removal. Initiate disconnect
  534. * if connected. Free socket when disconnect complete.
  535. *
  536. * This function will sorele() the socket. Note that soclose() may be called
  537. * prior to the ref count reaching zero. The actual socket structure will
  538. * not be freed until the ref count reaches zero.
  539. */
  540. int
  541. soclose(struct socket *so)
  542. {
  543. int error = 0;
  544. uipc_d("soclose() so=%" PRIx64, (uint64_t)so);
  545. KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
  546. CURVNET_SET(so->so_vnet);
  547. if (so->so_state & SS_ISCONNECTED) {
  548. if ((so->so_state & SS_ISDISCONNECTING) == 0) {
  549. error = sodisconnect(so);
  550. if (error) {
  551. if (error == ENOTCONN)
  552. error = 0;
  553. goto drop;
  554. }
  555. }
  556. if (so->so_options & SO_LINGER) {
  557. if ((so->so_state & SS_ISDISCONNECTING) &&
  558. (so->so_state & SS_NBIO))
  559. goto drop;
  560. while (so->so_state & SS_ISCONNECTED) {
  561. error = tsleep(&so->so_timeo,
  562. 0, "soclos", so->so_linger * hz);
  563. if (error)
  564. break;
  565. }
  566. }
  567. }
  568. drop:
  569. if (so->so_proto->pr_usrreqs->pru_close != NULL)
  570. (*so->so_proto->pr_usrreqs->pru_close)(so);
  571. if (so->so_options & SO_ACCEPTCONN) {
  572. struct socket *sp;
  573. ACCEPT_LOCK();
  574. while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
  575. TAILQ_REMOVE(&so->so_incomp, sp, so_list);
  576. so->so_incqlen--;
  577. sp->so_qstate &= ~SQ_INCOMP;
  578. sp->so_head = NULL;
  579. ACCEPT_UNLOCK();
  580. soabort(sp);
  581. ACCEPT_LOCK();
  582. }
  583. while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
  584. TAILQ_REMOVE(&so->so_comp, sp, so_list);
  585. so->so_qlen--;
  586. sp->so_qstate &= ~SQ_COMP;
  587. sp->so_head = NULL;
  588. ACCEPT_UNLOCK();
  589. soabort(sp);
  590. ACCEPT_LOCK();
  591. }
  592. ACCEPT_UNLOCK();
  593. }
  594. ACCEPT_LOCK();
  595. SOCK_LOCK(so);
  596. flush_net_channel(so);
  597. KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
  598. so->so_state |= SS_NOFDREF;
  599. so->fp = NULL;
  600. sorele(so);
  601. CURVNET_RESTORE();
  602. return (error);
  603. }
  604. /*
  605. * soabort() is used to abruptly tear down a connection, such as when a
  606. * resource limit is reached (listen queue depth exceeded), or if a listen
  607. * socket is closed while there are sockets waiting to be accepted.
  608. *
  609. * This interface is tricky, because it is called on an unreferenced socket,
  610. * and must be called only by a thread that has actually removed the socket
  611. * from the listen queue it was on, or races with other threads are risked.
  612. *
  613. * This interface will call into the protocol code, so must not be called
  614. * with any socket locks held. Protocols do call it while holding their own
  615. * recursible protocol mutexes, but this is something that should be subject
  616. * to review in the future.
  617. */
  618. void
  619. soabort(struct socket *so)
  620. {
  621. uipc_d("soabort() so=%" PRIx64, (uint64_t)so);
  622. /*
  623. * In as much as is possible, assert that no references to this
  624. * socket are held. This is not quite the same as asserting that the
  625. * current thread is responsible for arranging for no references, but
  626. * is as close as we can get for now.
  627. */
  628. KASSERT(so->so_count == 0, ("soabort: so_count"));
  629. KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
  630. KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
  631. KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
  632. KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
  633. VNET_SO_ASSERT(so);
  634. if (so->so_proto->pr_usrreqs->pru_abort != NULL)
  635. (*so->so_proto->pr_usrreqs->pru_abort)(so);
  636. ACCEPT_LOCK();
  637. SOCK_LOCK(so);
  638. sofree(so);
  639. }
  640. int
  641. soaccept(struct socket *so, struct bsd_sockaddr **nam)
  642. {
  643. int error;
  644. SOCK_LOCK(so);
  645. KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
  646. so->so_state &= ~SS_NOFDREF;
  647. SOCK_UNLOCK(so);
  648. CURVNET_SET(so->so_vnet);
  649. error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
  650. CURVNET_RESTORE();
  651. return (error);
  652. }
  653. int
  654. soconnect(struct socket *so, struct bsd_sockaddr *nam, struct thread *td)
  655. {
  656. int error;
  657. if (so->so_options & SO_ACCEPTCONN)
  658. return (EOPNOTSUPP);
  659. CURVNET_SET(so->so_vnet);
  660. /*
  661. * If protocol is connection-based, can only connect once.
  662. * Otherwise, if connected, try to disconnect first. This allows
  663. * user to disconnect by connecting to, e.g., a null address.
  664. */
  665. if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
  666. ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
  667. (error = sodisconnect(so)))) {
  668. error = EISCONN;
  669. } else {
  670. /*
  671. * Prevent accumulated error from previous connection from
  672. * biting us.
  673. */
  674. so->so_error = 0;
  675. error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
  676. }
  677. CURVNET_RESTORE();
  678. return (error);
  679. }
  680. int
  681. soconnect2(struct socket *so1, struct socket *so2)
  682. {
  683. int error;
  684. CURVNET_SET(so1->so_vnet);
  685. error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
  686. CURVNET_RESTORE();
  687. return (error);
  688. }
  689. int
  690. sodisconnect(struct socket *so)
  691. {
  692. int error;
  693. if ((so->so_state & SS_ISCONNECTED) == 0)
  694. return (ENOTCONN);
  695. if (so->so_state & SS_ISDISCONNECTING)
  696. return (EALREADY);
  697. VNET_SO_ASSERT(so);
  698. error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
  699. return (error);
  700. }
  701. #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
  702. int
  703. sosend_dgram(struct socket *so, struct bsd_sockaddr *addr, struct uio *uio,
  704. struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
  705. {
  706. long space;
  707. ssize_t resid;
  708. int clen = 0, error, dontroute;
  709. KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
  710. KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
  711. ("sodgram_send: !PR_ATOMIC"));
  712. if (uio != NULL)
  713. resid = uio->uio_resid;
  714. else
  715. resid = top->M_dat.MH.MH_pkthdr.len;
  716. /*
  717. * In theory resid should be unsigned. However, space must be
  718. * signed, as it might be less than 0 if we over-committed, and we
  719. * must use a signed comparison of space and resid. On the other
  720. * hand, a negative resid causes us to loop sending 0-length
  721. * segments to the protocol.
  722. */
  723. if (resid < 0) {
  724. error = EINVAL;
  725. goto out;
  726. }
  727. dontroute =
  728. (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
  729. if (control != NULL)
  730. clen = control->m_hdr.mh_len;
  731. SOCK_LOCK(so);
  732. if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
  733. SOCK_UNLOCK(so);
  734. error = EPIPE;
  735. goto out;
  736. }
  737. if (so->so_error) {
  738. error = so->so_error;
  739. so->so_error = 0;
  740. SOCK_UNLOCK(so);
  741. goto out;
  742. }
  743. if ((so->so_state & SS_ISCONNECTED) == 0) {
  744. /*
  745. * `sendto' and `sendmsg' are allowed on a connection-based
  746. * socket if it supports implied connect. Return ENOTCONN if
  747. * not connected and no address is supplied.
  748. */
  749. if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
  750. (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
  751. if ((so->so_state & SS_ISCONFIRMING) == 0 &&
  752. !(resid == 0 && clen != 0)) {
  753. SOCK_UNLOCK(so);
  754. error = ENOTCONN;
  755. goto out;
  756. }
  757. } else if (addr == NULL) {
  758. if (so->so_proto->pr_flags & PR_CONNREQUIRED)
  759. error = ENOTCONN;
  760. else
  761. error = EDESTADDRREQ;
  762. SOCK_UNLOCK(so);
  763. goto out;
  764. }
  765. }
  766. /*
  767. * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
  768. * problem and need fixing.
  769. */
  770. space = sbspace(&so->so_snd);
  771. if (flags & MSG_OOB)
  772. space += 1024;
  773. space -= clen;
  774. SOCK_UNLOCK(so);
  775. if (resid > space) {
  776. error = EMSGSIZE;
  777. goto out;
  778. }
  779. if (uio == NULL) {
  780. resid = 0;
  781. if (flags & MSG_EOR)
  782. top->m_hdr.mh_flags |= M_EOR;
  783. } else {
  784. /*
  785. * Copy the data from userland into a mbuf chain.
  786. * If no data is to be copied in, a single empty mbuf
  787. * is returned.
  788. */
  789. top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 1,
  790. (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
  791. if (top == NULL) {
  792. error = EFAULT; /* only possible error */
  793. goto out;
  794. }
  795. space -= resid - uio->uio_resid;
  796. resid = uio->uio_resid;
  797. }
  798. KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
  799. /*
  800. * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
  801. * than with.
  802. */
  803. if (dontroute) {
  804. SOCK_LOCK(so);
  805. so->so_options |= SO_DONTROUTE;
  806. SOCK_UNLOCK(so);
  807. }
  808. /*
  809. * XXX all the SBS_CANTSENDMORE checks previously done could be out
  810. * of date. We could have received a reset packet in an interrupt or
  811. * maybe we slept while doing page faults in uiomove() etc. We could
  812. * probably recheck again inside the locking protection here, but
  813. * there are probably other places that this also happens. We must
  814. * rethink this.
  815. */
  816. VNET_SO_ASSERT(so);
  817. error = (*so->so_proto->pr_usrreqs->pru_send)(so,
  818. (flags & MSG_OOB) ? PRUS_OOB :
  819. /*
  820. * If the user set MSG_EOF, the protocol understands this flag and
  821. * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
  822. */
  823. ((flags & MSG_EOF) &&
  824. (so->so_proto->pr_flags & PR_IMPLOPCL) &&
  825. (resid <= 0)) ?
  826. PRUS_EOF :
  827. /* If there is more to send set PRUS_MORETOCOME */
  828. (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
  829. top, addr, control, td);
  830. if (dontroute) {
  831. SOCK_LOCK(so);
  832. so->so_options &= ~SO_DONTROUTE;
  833. SOCK_UNLOCK(so);
  834. }
  835. clen = 0;
  836. control = NULL;
  837. top = NULL;
  838. out:
  839. if (top != NULL)
  840. m_freem(top);
  841. if (control != NULL)
  842. m_freem(control);
  843. return (error);
  844. }
  845. /*
  846. * Send on a socket. If send must go all at once and message is larger than
  847. * send buffering, then hard error. Lock against other senders. If must go
  848. * all at once and not enough room now, then inform user that this would
  849. * block and do nothing. Otherwise, if nonblocking, send as much as
  850. * possible. The data to be sent is described by "uio" if nonzero, otherwise
  851. * by the mbuf chain "top" (which must be null if uio is not). Data provided
  852. * in mbuf chain must be small enough to send all at once.
  853. *
  854. * Returns nonzero on error, timeout or signal; callers must check for short
  855. * counts if EINTR/ERESTART are returned. Data and control buffers are freed
  856. * on return.
  857. */
  858. int
  859. sosend_generic(struct socket *so, struct bsd_sockaddr *addr, struct uio *uio,
  860. struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
  861. {
  862. long space;
  863. ssize_t resid;
  864. int clen = 0, error, dontroute;
  865. int atomic = sosendallatonce(so) || top;
  866. if (uio != NULL)
  867. resid = uio->uio_resid;
  868. else
  869. resid = top->M_dat.MH.MH_pkthdr.len;
  870. /*
  871. * In theory resid should be unsigned. However, space must be
  872. * signed, as it might be less than 0 if we over-committed, and we
  873. * must use a signed comparison of space and resid. On the other
  874. * hand, a negative resid causes us to loop sending 0-length
  875. * segments to the protocol.
  876. *
  877. * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
  878. * type sockets since that's an error.
  879. */
  880. if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
  881. error = EINVAL;
  882. goto out_unlocked;
  883. }
  884. dontroute =
  885. (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
  886. (so->so_proto->pr_flags & PR_ATOMIC);
  887. if (control != NULL)
  888. clen = control->m_hdr.mh_len;
  889. SOCK_LOCK(so);
  890. error = sblock(so, &so->so_snd, SBLOCKWAIT(flags));
  891. if (error)
  892. goto out;
  893. restart:
  894. flush_net_channel(so);
  895. do {
  896. if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
  897. error = EPIPE;
  898. goto release;
  899. }
  900. if (so->so_error) {
  901. error = so->so_error;
  902. so->so_error = 0;
  903. goto release;
  904. }
  905. if ((so->so_state & SS_ISCONNECTED) == 0) {
  906. /*
  907. * `sendto' and `sendmsg' are allowed on a connection-
  908. * based socket if it supports implied connect.
  909. * Return ENOTCONN if not connected and no address is
  910. * supplied.
  911. */
  912. if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
  913. (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
  914. if ((so->so_state & SS_ISCONFIRMING) == 0 &&
  915. !(resid == 0 && clen != 0)) {
  916. error = ENOTCONN;
  917. goto release;
  918. }
  919. } else if (addr == NULL) {
  920. if (so->so_proto->pr_flags & PR_CONNREQUIRED)
  921. error = ENOTCONN;
  922. else
  923. error = EDESTADDRREQ;
  924. goto release;
  925. }
  926. }
  927. space = sbspace(&so->so_snd);
  928. if (flags & MSG_OOB)
  929. space += 1024;
  930. if ((atomic && resid > so->so_snd.sb_hiwat) ||
  931. (u_int)clen > so->so_snd.sb_hiwat) {
  932. error = EMSGSIZE;
  933. goto release;
  934. }
  935. if (space < resid + clen &&
  936. (atomic || space < so->so_snd.sb_lowat || space < clen)) {
  937. if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
  938. error = EWOULDBLOCK;
  939. goto release;
  940. }
  941. error = sbwait(so, &so->so_snd);
  942. if (error)
  943. goto release;
  944. goto restart;
  945. }
  946. space -= clen;
  947. do {
  948. if (uio == NULL) {
  949. resid = 0;
  950. if (flags & MSG_EOR)
  951. top->m_hdr.mh_flags |= M_EOR;
  952. } else {
  953. /*
  954. * Copy the data from userland into a mbuf
  955. * chain. If no data is to be copied in,
  956. * a single empty mbuf is returned.
  957. */
  958. top = m_uiotombuf(uio, M_WAITOK, space,
  959. (atomic ? max_hdr : 0), MCLBYTES,
  960. (atomic ? M_PKTHDR : 0) |
  961. ((flags & MSG_EOR) ? M_EOR : 0));
  962. if (top == NULL) {
  963. error = EFAULT; /* only possible error */
  964. goto release;
  965. }
  966. space -= resid - uio->uio_resid;
  967. resid = uio->uio_resid;
  968. }
  969. if (dontroute) {
  970. so->so_options |= SO_DONTROUTE;
  971. }
  972. /*
  973. * XXX all the SBS_CANTSENDMORE checks previously
  974. * done could be out of date. We could have received
  975. * a reset packet in an interrupt or maybe we slept
  976. * while doing page faults in uiomove() etc. We
  977. * could probably recheck again inside the locking
  978. * protection here, but there are probably other
  979. * places that this also happens. We must rethink
  980. * this.
  981. */
  982. VNET_SO_ASSERT(so);
  983. SOCK_UNLOCK(so);
  984. error = (*so->so_proto->pr_usrreqs->pru_send)(so,
  985. (flags & MSG_OOB) ? PRUS_OOB :
  986. /*
  987. * If the user set MSG_EOF, the protocol understands
  988. * this flag and nothing left to send then use
  989. * PRU_SEND_EOF instead of PRU_SEND.
  990. */
  991. ((flags & MSG_EOF) &&
  992. (so->so_proto->pr_flags & PR_IMPLOPCL) &&
  993. (resid <= 0)) ?
  994. PRUS_EOF :
  995. /* If there is more to send set PRUS_MORETOCOME. */
  996. (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
  997. top, addr, control, td);
  998. SOCK_LOCK(so);
  999. if (dontroute) {
  1000. so->so_options &= ~SO_DONTROUTE;
  1001. }
  1002. clen = 0;
  1003. control = NULL;
  1004. top = NULL;
  1005. if (error)
  1006. goto release;
  1007. } while (resid && space > 0);
  1008. } while (resid);
  1009. release:
  1010. sbunlock(so, &so->so_snd);
  1011. out:
  1012. SOCK_UNLOCK(so);
  1013. out_unlocked:
  1014. if (top != NULL)
  1015. m_freem(top);
  1016. if (control != NULL)
  1017. m_freem(control);
  1018. return (error);
  1019. }
  1020. int
  1021. zsend(struct socket *so, struct uio *uio, struct zmsghdr *zm, int flags)
  1022. {
  1023. long space;
  1024. ssize_t resid;
  1025. int clen = 0, error, dontroute;
  1026. struct mbuf *top = NULL;
  1027. int atomic = sosendallatonce(so) || top;
  1028. if (uio != NULL)
  1029. resid = uio->uio_resid;
  1030. else
  1031. resid = top->M_dat.MH.MH_pkthdr.len;
  1032. KASSERT(uio->uio_iov, ("iov is null on MSG_ZCOPY"));
  1033. /*
  1034. * In theory resid should be unsigned. However, space must be
  1035. * signed, as it might be less than 0 if we over-committed, and we
  1036. * must use a signed comparison of space and resid. On the other
  1037. * hand, a negative resid causes us to loop sending 0-length
  1038. * segments to the protocol.
  1039. *
  1040. * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
  1041. * type sockets since that's an error.
  1042. */
  1043. if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
  1044. error = EINVAL;
  1045. goto out_unlocked;
  1046. }
  1047. dontroute =
  1048. (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
  1049. (so->so_proto->pr_flags & PR_ATOMIC);
  1050. SOCK_LOCK(so);
  1051. error = sblock(so, &so->so_snd, SBLOCKWAIT(flags));
  1052. if (error)
  1053. goto out;
  1054. restart:
  1055. flush_net_channel(so);
  1056. do {
  1057. if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
  1058. error = EPIPE;
  1059. goto release;
  1060. }
  1061. if (so->so_error) {
  1062. error = so->so_error;
  1063. so->so_error = 0;
  1064. goto release;
  1065. }
  1066. if ((so->so_state & SS_ISCONNECTED) == 0) {
  1067. /*
  1068. * `sendto' and `sendmsg' are allowed on a connection-
  1069. * based socket if it supports implied connect.
  1070. * Return ENOTCONN if not connected and no address is
  1071. * supplied.
  1072. */
  1073. if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
  1074. (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
  1075. if ((so->so_state & SS_ISCONFIRMING) == 0 &&
  1076. !(resid == 0 && clen != 0)) {
  1077. error = ENOTCONN;
  1078. goto release;
  1079. }
  1080. }
  1081. if (so->so_proto->pr_flags & PR_CONNREQUIRED)
  1082. error = ENOTCONN;
  1083. else
  1084. error = EDESTADDRREQ;
  1085. goto release;
  1086. }
  1087. space = sbspace(&so->so_snd);
  1088. if (flags & MSG_OOB)
  1089. space += 1024;
  1090. if ((atomic && resid > so->so_snd.sb_hiwat) ||
  1091. (u_int)clen > so->so_snd.sb_hiwat) {
  1092. error = EMSGSIZE;
  1093. goto release;
  1094. }
  1095. if (space < resid + clen &&
  1096. (atomic || space < so->so_snd.sb_lowat || space < clen)) {
  1097. if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
  1098. error = EWOULDBLOCK;
  1099. goto release;
  1100. }
  1101. error = sbwait(so, &so->so_snd);
  1102. if (error)
  1103. goto release;
  1104. goto restart;
  1105. }
  1106. space -= clen;
  1107. do {
  1108. if (uio == NULL) {
  1109. resid = 0;
  1110. if (flags & MSG_EOR)
  1111. top->m_hdr.mh_flags |= M_EOR;
  1112. } else {
  1113. /*
  1114. * Copy the data from userland into a mbuf
  1115. * chain. If no data is to be copied in,
  1116. * a single empty mbuf is returned.
  1117. */
  1118. top = m_uiotombuf_zcopy(uio, M_WAITOK, space,
  1119. (atomic ? max_hdr : 0), MCLBYTES,
  1120. (atomic ? M_PKTHDR : 0) |
  1121. ((flags & MSG_EOR) ? M_EOR : 0),
  1122. zm);
  1123. if (top == NULL) {
  1124. error = EFAULT; /* only possible error */
  1125. goto release;
  1126. }
  1127. space -= resid - uio->uio_resid;
  1128. resid = uio->uio_resid;
  1129. }
  1130. if (dontroute) {
  1131. so->so_options |= SO_DONTROUTE;
  1132. }
  1133. /*
  1134. * XXX all the SBS_CANTSENDMORE checks previously
  1135. * done could be out of date. We could have received
  1136. * a reset packet in an interrupt or maybe we slept
  1137. * while doing page faults in uiomove() etc. We
  1138. * could probably recheck again inside the locking
  1139. * protection here, but there are probably other
  1140. * places that this also happens. We must rethink
  1141. * this.
  1142. */
  1143. VNET_SO_ASSERT(so);
  1144. SOCK_UNLOCK(so);
  1145. error = (*so->so_proto->pr_usrreqs->pru_send)(so,
  1146. (flags & MSG_OOB) ? PRUS_OOB :
  1147. /*
  1148. * If the user set MSG_EOF, the protocol understands
  1149. * this flag and nothing left to send then use
  1150. * PRU_SEND_EOF instead of PRU_SEND.
  1151. */
  1152. ((flags & MSG_EOF) &&
  1153. (so->so_proto->pr_flags & PR_IMPLOPCL) &&
  1154. (resid <= 0)) ?
  1155. PRUS_EOF :
  1156. /* If there is more to send set PRUS_MORETOCOME. */
  1157. (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
  1158. top, NULL, NULL, NULL);
  1159. SOCK_LOCK(so);
  1160. if (dontroute) {
  1161. so->so_options &= ~SO_DONTROUTE;
  1162. }
  1163. clen = 0;
  1164. top = NULL;
  1165. if (error)
  1166. goto release;
  1167. } while (resid && space > 0);
  1168. } while (resid);
  1169. release:
  1170. sbunlock(so, &so->so_snd);
  1171. out:
  1172. SOCK_UNLOCK(so);
  1173. out_unlocked:
  1174. if (top != NULL)
  1175. m_freem(top);
  1176. return (error);
  1177. }
  1178. int
  1179. sosend(struct socket *so, struct bsd_sockaddr *addr, struct uio *uio,
  1180. struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
  1181. {
  1182. int error;
  1183. CURVNET_SET(so->so_vnet);
  1184. error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
  1185. control, flags, td);
  1186. CURVNET_RESTORE();
  1187. return (error);
  1188. }
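
A sketch of driving sosend() with a flat kernel buffer wrapped in a uio (field names follow the classic BSD struct uio; OSv's port may differ slightly, and a connected, blocking stream socket is assumed):

    static int example_send_buf(struct socket *so, const void *buf, size_t len)
    {
        struct iovec iov;
        struct uio uio = {};
        iov.iov_base = const_cast<void *>(buf);
        iov.iov_len = len;
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_resid = len;
        uio.uio_rw = UIO_WRITE;
        /* addr, top and control are NULL: all data comes from the uio */
        return sosend(so, NULL, &uio, NULL, NULL, 0 /* flags */, NULL);
    }
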
  1189. /*
  1190. * The part of soreceive() that implements reading non-inline out-of-band
  1191. * data from a socket. For more complete comments, see soreceive(), from
  1192. * which this code originated.
  1193. *
  1194. * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
  1195. * unable to return an mbuf chain to the caller.
  1196. */
  1197. static int
  1198. soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
  1199. {
  1200. struct protosw *pr = so->so_proto;
  1201. struct mbuf *m;
  1202. int error;
  1203. KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
  1204. VNET_SO_ASSERT(so);
  1205. m = m_get(M_WAIT, MT_DATA);
  1206. error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
  1207. if (error)
  1208. goto bad;
  1209. do {
  1210. error = uiomove(mtod(m, void *),
  1211. (int) bsd_min(uio->uio_resid, m->m_hdr.mh_len), uio);
  1212. m = m_free(m);
  1213. } while (uio->uio_resid && error == 0 && m);
  1214. bad:
  1215. if (m != NULL)
  1216. m_freem(m);
  1217. return (error);
  1218. }
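
soreceive_generic() below dispatches to soreceive_rcvoob() whenever MSG_OOB is set, so reading one byte of non-inline out-of-band data can be sketched as follows (uio field names assumed as in the classic BSD layout):

    static int example_read_oob(struct socket *so, char *out)
    {
        struct iovec iov = { out, 1 };
        struct uio uio = {};
        int flags = MSG_OOB;            /* routes into soreceive_rcvoob() */
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_resid = 1;
        uio.uio_rw = UIO_READ;
        return soreceive_generic(so, NULL, &uio, NULL, NULL, &flags);
    }
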
  1219. /*
  1220. * Following replacement or removal of the first mbuf on the first mbuf chain
  1221. * of a socket buffer, push necessary state changes back into the socket
  1222. * buffer so that other consumers see the values consistently. 'nextrecord'
  1223. * is the callers locally stored value of the original value of
  1224. * sb->sb_mb->m_hdr.mh_nextpkt which must be restored when the lead mbuf changes.
  1225. * NOTE: 'nextrecord' may be NULL.
  1226. */
  1227. static __inline void
  1228. sockbuf_pushsync(socket* so, struct sockbuf *sb, struct mbuf *nextrecord)
  1229. {
  1230. SOCK_LOCK_ASSERT(so);
  1231. /*
  1232. * First, update for the new value of nextrecord. If necessary, make
  1233. * it the first record.
  1234. */
  1235. if (sb->sb_mb != NULL)
  1236. sb->sb_mb->m_hdr.mh_nextpkt = nextrecord;
  1237. else
  1238. sb->sb_mb = nextrecord;
  1239. /*
  1240. * Now update any dependent socket buffer fields to reflect the new
  1241. * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
  1242. * addition of a second clause that takes care of the case where
  1243. * sb_mb has been updated, but remains the last record.
  1244. */
  1245. if (sb->sb_mb == NULL) {
  1246. sb->sb_mbtail = NULL;
  1247. sb->sb_lastrecord = NULL;
  1248. } else if (sb->sb_mb->m_hdr.mh_nextpkt == NULL)
  1249. sb->sb_lastrecord = sb->sb_mb;
  1250. }
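
The calling convention for sockbuf_pushsync() can be seen later in this file; the pattern is always: cache the old m_hdr.mh_nextpkt, replace or free the lead mbuf, then push the cached value back:

    /* Excerpted pattern (see the PR_ADDR handling in soreceive_generic()): */
    nextrecord = m->m_hdr.mh_nextpkt;   /* cache before mutating sb_mb */
    sbfree(&so->so_rcv, m);
    so->so_rcv.sb_mb = m_free(m);       /* lead mbuf changes */
    m = so->so_rcv.sb_mb;
    sockbuf_pushsync(so, &so->so_rcv, nextrecord);
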
  1251. /*
  1252. * Implement receive operations on a socket. We depend on the way that
  1253. * records are added to the sockbuf by sbappend. In particular, each record
  1254. * (mbufs linked through m_hdr.mh_next) must begin with an address if the protocol
  1255. * so specifies, followed by an optional mbuf or mbufs containing ancillary
  1256. * data, and then zero or more mbufs of data. In order to allow parallelism
  1257. * between network receive and copying to user space, as well as avoid
  1258. * sleeping with a mutex held, we release the socket buffer mutex during the
  1259. * user space copy. Although the sockbuf is locked, new data may still be
  1260. * appended, and thus we must maintain consistency of the sockbuf during that
  1261. * time.
  1262. *
  1263. * The caller may receive the data as a single mbuf chain by supplying an
  1264. * mbuf **mp0 for use in returning the chain. The uio is then used only for
  1265. * the count in uio_resid.
  1266. */
  1267. int
  1268. soreceive_generic(struct socket *so, struct bsd_sockaddr **psa, struct uio *uio,
  1269. struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
  1270. {
  1271. struct mbuf *m, **mp;
  1272. int flags, error, offset;
  1273. ssize_t len;
  1274. struct protosw *pr = so->so_proto;
  1275. struct mbuf *nextrecord;
  1276. int moff, type = 0;
  1277. ssize_t orig_resid = uio->uio_resid;
  1278. mp = mp0;
  1279. if (psa != NULL)
  1280. *psa = NULL;
  1281. if (controlp != NULL)
  1282. *controlp = NULL;
  1283. if (flagsp != NULL)
  1284. flags = *flagsp &~ MSG_EOR;
  1285. else
  1286. flags = 0;
  1287. if (flags & MSG_OOB)
  1288. return (soreceive_rcvoob(so, uio, flags));
  1289. if (mp != NULL)
  1290. *mp = NULL;
  1291. if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
  1292. && uio->uio_resid) {
  1293. VNET_SO_ASSERT(so);
  1294. (*pr->pr_usrreqs->pru_rcvd)(so, 0);
  1295. }
  1296. SOCK_LOCK(so);
  1297. error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags));
  1298. if (error)
  1299. goto out;
  1300. restart:
  1301. flush_net_channel(so);
  1302. m = so->so_rcv.sb_mb;
  1303. /*
  1304. * If we have less data than requested, block awaiting more (subject
  1305. * to any timeout) if:
  1306. * 1. the current count is less than the low water mark, or
  1307. * 2. MSG_DONTWAIT is not set
  1308. */
  1309. if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
  1310. so->so_rcv.sb_cc < uio->uio_resid) &&
  1311. so->so_rcv.sb_cc < (u_int)so->so_rcv.sb_lowat &&
  1312. m->m_hdr.mh_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
  1313. KASSERT(m != NULL || !so->so_rcv.sb_cc,
  1314. ("receive: m == %p so->so_rcv.sb_cc == %u",
  1315. m, so->so_rcv.sb_cc));
  1316. if (so->so_error) {
  1317. if (m != NULL)
  1318. goto dontblock;
  1319. error = so->so_error;
  1320. if ((flags & MSG_PEEK) == 0)
  1321. so->so_error = 0;
  1322. goto release;
  1323. }
  1324. SOCK_LOCK_ASSERT(so);
  1325. if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
  1326. if (m == NULL) {
  1327. goto release;
  1328. } else
  1329. goto dontblock;
  1330. }
  1331. for (; m != NULL; m = m->m_hdr.mh_next)
  1332. if (m->m_hdr.mh_type == MT_OOBDATA || (m->m_hdr.mh_flags & M_EOR)) {
  1333. m = so->so_rcv.sb_mb;
  1334. goto dontblock;
  1335. }
  1336. if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
  1337. (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
  1338. error = ENOTCONN;
  1339. goto release;
  1340. }
  1341. if (uio->uio_resid == 0) {
  1342. goto release;
  1343. }
  1344. if ((so->so_state & SS_NBIO) ||
  1345. (flags & (MSG_DONTWAIT|MSG_NBIO))) {
  1346. error = EWOULDBLOCK;
  1347. goto release;
  1348. }
  1349. SBLASTRECORDCHK(&so->so_rcv);
  1350. SBLASTMBUFCHK(&so->so_rcv);
  1351. error = sbwait(so, &so->so_rcv);
  1352. if (error)
  1353. goto release;
  1354. goto restart;
  1355. }
  1356. dontblock:
  1357. /*
  1358. * From this point onward, we maintain 'nextrecord' as a cache of the
  1359. * pointer to the next record in the socket buffer. We must keep the
  1360. * various socket buffer pointers and local stack versions of the
  1361. * pointers in sync, pushing out modifications before dropping the
  1362. * socket buffer mutex, and re-reading them when picking it up.
  1363. *
  1364. * Otherwise, we will race with the network stack appending new data
  1365. * or records onto the socket buffer by using inconsistent/stale
  1366. * versions of the field, possibly resulting in socket buffer
  1367. * corruption.
  1368. *
  1369. * By holding the high-level sblock(), we prevent simultaneous
  1370. * readers from pulling off the front of the socket buffer.
  1371. */
  1372. SOCK_LOCK_ASSERT(so);
  1373. KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
  1374. SBLASTRECORDCHK(&so->so_rcv);
  1375. SBLASTMBUFCHK(&so->so_rcv);
  1376. nextrecord = m->m_hdr.mh_nextpkt;
  1377. if (pr->pr_flags & PR_ADDR) {
  1378. KASSERT(m->m_hdr.mh_type == MT_SONAME,
  1379. ("m->m_hdr.mh_type == %d", m->m_hdr.mh_type));
  1380. orig_resid = 0;
  1381. if (psa != NULL)
  1382. *psa = sodupbsd_sockaddr(mtod(m, struct bsd_sockaddr *),
  1383. M_NOWAIT);
  1384. if (flags & MSG_PEEK) {
  1385. m = m->m_hdr.mh_next;
  1386. } else {
  1387. sbfree(&so->so_rcv, m);
  1388. so->so_rcv.sb_mb = m_free(m);
  1389. m = so->so_rcv.sb_mb;
  1390. sockbuf_pushsync(so, &so->so_rcv, nextrecord);
  1391. }
  1392. }
  1393. /*
  1394. * Process one or more MT_CONTROL mbufs present before any data mbufs
  1395. * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
  1396. * just copy the data; if !MSG_PEEK, we call into the protocol to
  1397. * perform externalization (or freeing if controlp == NULL).
  1398. */
  1399. if (m != NULL && m->m_hdr.mh_type == MT_CONTROL) {
  1400. struct mbuf *cm = NULL, *cmn;
  1401. struct mbuf **cme = &cm;
  1402. do {
  1403. if (flags & MSG_PEEK) {
  1404. if (controlp != NULL) {
  1405. *controlp = m_copy(m, 0, m->m_hdr.mh_len);
  1406. controlp = &(*controlp)->m_hdr.mh_next;
  1407. }
  1408. m = m->m_hdr.mh_next;
  1409. } else {
  1410. sbfree(&so->so_rcv, m);
  1411. so->so_rcv.sb_mb = m->m_hdr.mh_next;
  1412. m->m_hdr.mh_next = NULL;
  1413. *cme = m;
  1414. cme = &(*cme)->m_hdr.mh_next;
  1415. m = so->so_rcv.sb_mb;
  1416. }
  1417. } while (m != NULL && m->m_hdr.mh_type == MT_CONTROL);
  1418. if ((flags & MSG_PEEK) == 0)
  1419. sockbuf_pushsync(so, &so->so_rcv, nextrecord);
  1420. while (cm != NULL) {
  1421. cmn = cm->m_hdr.mh_next;
  1422. cm->m_hdr.mh_next = NULL;
  1423. if (pr->pr_domain->dom_externalize != NULL) {
  1424. SOCK_UNLOCK(so);
  1425. VNET_SO_ASSERT(so);
  1426. error = (*pr->pr_domain->dom_externalize)
  1427. (cm, controlp);
  1428. SOCK_LOCK(so);
  1429. } else if (controlp != NULL)
  1430. *controlp = cm;
  1431. else
  1432. m_freem(cm);
  1433. if (controlp != NULL) {
  1434. orig_resid = 0;
  1435. while (*controlp != NULL)
  1436. controlp = &(*controlp)->m_hdr.mh_next;
  1437. }
  1438. cm = cmn;
  1439. }
  1440. if (m != NULL)
  1441. nextrecord = so->so_rcv.sb_mb->m_hdr.mh_nextpkt;
  1442. else
  1443. nextrecord = so->so_rcv.sb_mb;
  1444. orig_resid = 0;
  1445. }
  1446. if (m != NULL) {
  1447. if ((flags & MSG_PEEK) == 0) {
  1448. KASSERT(m->m_hdr.mh_nextpkt == nextrecord,
  1449. ("soreceive: post-control, nextrecord !sync"));
  1450. if (nextrecord == NULL) {
  1451. KASSERT(so->so_rcv.sb_mb == m,
  1452. ("soreceive: post-control, sb_mb!=m"));
  1453. KASSERT(so->so_rcv.sb_lastrecord == m,
  1454. ("soreceive: post-control, lastrecord!=m"));
  1455. }
  1456. }
  1457. type = m->m_hdr.mh_type;
  1458. if (type == MT_OOBDATA)
  1459. flags |= MSG_OOB;
  1460. } else {
  1461. if ((flags & MSG_PEEK) == 0) {
  1462. KASSERT(so->so_rcv.sb_mb == nextrecord,
  1463. ("soreceive: sb_mb != nextrecord"));
  1464. if (so->so_rcv.sb_mb == NULL) {
  1465. KASSERT(so->so_rcv.sb_lastrecord == NULL,
  1466. ("soreceive: sb_lastercord != NULL"));
  1467. }
  1468. }
  1469. }
  1470. SOCK_LOCK_ASSERT(so);
  1471. SBLASTRECORDCHK(&so->so_rcv);
  1472. SBLASTMBUFCHK(&so->so_rcv);
  1473. /*
  1474. * Now continue to read any data mbufs off of the head of the socket
  1475. * buffer until the read request is satisfied. Note that 'type' is
  1476. * used to store the type of any mbuf reads that have happened so far
  1477. * such that soreceive() can stop reading if the type changes, which
  1478. * causes soreceive() to return only one of regular data and inline
  1479. * out-of-band data in a single socket receive operation.
  1480. */
  1481. moff = 0;
  1482. offset = 0;
  1483. while (m != NULL && uio->uio_resid > 0 && error == 0) {
  1484. /*
  1485. * If the type of mbuf has changed since the last mbuf
  1486. * examined ('type'), end the receive operation.
  1487. */
  1488. SOCK_LOCK_ASSERT(so);
  1489. if (m->m_hdr.mh_type == MT_OOBDATA || m->m_hdr.mh_type == MT_CONTROL) {
  1490. if (type != m->m_hdr.mh_type)
  1491. break;
  1492. } else if (type == MT_OOBDATA)
  1493. break;
  1494. else
  1495. KASSERT(m->m_hdr.mh_type == MT_DATA,
  1496. ("m->m_hdr.mh_type == %d", m->m_hdr.mh_type));
  1497. so->so_rcv.sb_state &= ~SBS_RCVATMARK;
  1498. len = uio->uio_resid;
  1499. if (so->so_oobmark && len > so->so_oobmark - offset)
  1500. len = so->so_oobmark - offset;
  1501. if (len > m->m_hdr.mh_len - moff)
  1502. len = m->m_hdr.mh_len - moff;
  1503. /*
  1504. * If mp is set, just pass back the mbufs. Otherwise copy
  1505. * them out via the uio, then free. Sockbuf must be
  1506. * consistent here (points to current mbuf, it points to next
  1507. * record) when we drop priority; we must note any additions
  1508. * to the sockbuf when we block interrupts again.
  1509. */
  1510. if (mp == NULL) {
  1511. SOCK_LOCK_ASSERT(so);
  1512. SBLASTRECORDCHK(&so->so_rcv);
  1513. SBLASTMBUFCHK(&so->so_rcv);
  1514. error = uiomove(mtod(m, char *) + moff, (int)len, uio);
  1515. if (error) {
  1516. /*
  1517. * The MT_SONAME mbuf has already been removed
  1518. * from the record, so it is necessary to
  1519. * remove the data mbufs, if any, to preserve
  1520. * the invariant in the case of PR_ADDR that
  1521. * requires MT_SONAME mbufs at the head of
  1522. * each record.
  1523. */
  1524. if (m && pr->pr_flags & PR_ATOMIC &&
  1525. ((flags & MSG_PEEK) == 0))
  1526. (void)sbdroprecord_locked(so, &so->so_rcv);
  1527. goto release;
  1528. }
  1529. } else
  1530. uio->uio_resid -= len;
  1531. SOCK_LOCK_ASSERT(so);
  1532. if (len == m->m_hdr.mh_len - moff) {
  1533. if (m->m_hdr.mh_flags & M_EOR)
  1534. flags |= MSG_EOR;
  1535. if (flags & MSG_PEEK) {
  1536. m = m->m_hdr.mh_next;
  1537. moff = 0;
  1538. } else {
  1539. nextrecord = m->m_hdr.mh_nextpkt;
  1540. sbfree(&so->so_rcv, m);
  1541. if (mp != NULL) {
  1542. *mp = m;
  1543. mp = &m->m_hdr.mh_next;
  1544. so->so_rcv.sb_mb = m = m->m_hdr.mh_next;
  1545. *mp = NULL;
  1546. } else {
  1547. so->so_rcv.sb_mb = m_free(m);
  1548. m = so->so_rcv.sb_mb;
  1549. }
  1550. sockbuf_pushsync(so, &so->so_rcv, nextrecord);
  1551. SBLASTRECORDCHK(&so->so_rcv);
  1552. SBLASTMBUFCHK(&so->so_rcv);
  1553. }
  1554. } else {
  1555. if (flags & MSG_PEEK)
  1556. moff += len;
  1557. else {
  1558. if (mp != NULL) {
  1559. int copy_flag;
  1560. if (flags & MSG_DONTWAIT)
  1561. copy_flag = M_DONTWAIT;
  1562. else
  1563. copy_flag = M_WAIT;
  1564. if (copy_flag ==

Large files are truncated; this listing ends here. View the full file in the repository.