
/bsd/sys/kern/uipc_sockbuf.cc

https://gitlab.com/jforge/osv
C++ | 1119 lines | 730 code | 139 blank | 250 comment
Possible License(s): BSD-3-Clause, 0BSD, MPL-2.0-no-copyleft-exception
  1. /*-
  2. * Copyright (c) 1982, 1986, 1988, 1990, 1993
  3. * The Regents of the University of California. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. * 4. Neither the name of the University nor the names of its contributors
  14. * may be used to endorse or promote products derived from this software
  15. * without specific prior written permission.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. *
  29. * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
  30. */
  31. #include <sys/cdefs.h>
  32. #include <osv/poll.h>
  33. #include <osv/clock.hh>
  34. #include <osv/signal.hh>
  35. #include <bsd/porting/netport.h>
  36. #include <bsd/porting/rwlock.h>
  37. #include <bsd/porting/synch.h>
  38. #include <bsd/sys/sys/param.h>
  39. #include <bsd/sys/sys/mbuf.h>
  40. #include <bsd/sys/sys/protosw.h>
  41. #include <bsd/sys/sys/socket.h>
  42. #include <bsd/sys/sys/socketvar.h>
  43. #include <bsd/sys/sys/libkern.h>
  44. /*
  45. * Function pointer set by the AIO routines so that the socket buffer code
  46. * can call back into the AIO module if it is loaded.
  47. */
  48. void (*aio_swake)(struct socket *, struct sockbuf *);
  49. /*
  50. * Primitive routines for operating on socket buffers
  51. */
  52. u_long sb_max = SB_MAX;
  53. u_long sb_max_adj =
  54. (quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
  55. static u_long sb_efficiency = 8; /* parameter for sbreserve() */
  56. static void sbdrop_internal(struct sockbuf *sb, int len);
  57. static void sbflush_internal(struct sockbuf *sb);
  58. /*
  59. * Socantsendmore indicates that no more data will be sent on the socket; it
  60. * is normally applied to a socket by the protocol code (in the PRU_SHUTDOWN
  61. * case) when the user informs the system that no more data is to be sent.
  62. * Socantrcvmore indicates that no more data will be
  63. * received, and will normally be applied to the socket by a protocol when it
  64. * detects that the peer will send no more data. Data queued for reading in
  65. * the socket may yet be read.
  66. */
  67. void
  68. socantsendmore_locked(struct socket *so)
  69. {
  70. SOCK_LOCK_ASSERT(so);
  71. so->so_snd.sb_state |= SBS_CANTSENDMORE;
  72. sowwakeup_locked(so);
  73. }
  74. void
  75. socantsendmore(struct socket *so)
  76. {
  77. SOCK_LOCK(so);
  78. socantsendmore_locked(so);
  79. SOCK_UNLOCK(so);
  80. SOCK_UNLOCK_ASSERT(so);
  81. }
  82. void
  83. socantrcvmore_locked(struct socket *so)
  84. {
  85. SOCK_LOCK_ASSERT(so);
  86. so->so_rcv.sb_state |= SBS_CANTRCVMORE;
  87. sorwakeup_locked(so);
  88. }
  89. void
  90. socantrcvmore(struct socket *so)
  91. {
  92. SOCK_LOCK(so);
  93. socantrcvmore_locked(so);
  94. SOCK_UNLOCK(so);
  95. SOCK_UNLOCK_ASSERT(so);
  96. }
  97. void sockbuf_iolock::lock(mutex& mtx)
  98. {
  99. while (_owner) {
  100. _wq.wait(mtx);
  101. }
  102. _owner = sched::thread::current();
  103. }
  104. bool sockbuf_iolock::try_lock(mutex& mtx)
  105. {
  106. if (!_owner) {
  107. _owner = sched::thread::current();
  108. return true;
  109. } else {
  110. return false;
  111. }
  112. }
  113. void sockbuf_iolock::unlock(mutex& mtx)
  114. {
  115. _owner = nullptr;
  116. _wq.wake_all(mtx);
  117. }
  118. template<typename Clock>
  119. int sbwait_tmo(socket* so, struct sockbuf *sb, boost::optional<std::chrono::time_point<Clock>> timeout)
  120. {
  121. SOCK_LOCK_ASSERT(so);
  122. sb->sb_flags |= SB_WAIT;
  123. sched::timer tmr(*sched::thread::current());
  124. if (timeout) {
  125. tmr.set(*timeout);
  126. }
  127. signal_catcher sc;
  128. if (so->so_nc && !so->so_nc_busy) {
  129. so->so_nc_busy = true;
  130. sched::thread::wait_for(SOCK_MTX_REF(so), *so->so_nc, sb->sb_cc_wq, tmr, sc);
  131. so->so_nc_busy = false;
  132. so->so_nc_wq.wake_all(SOCK_MTX_REF(so));
  133. } else {
  134. sched::thread::wait_for(SOCK_MTX_REF(so), so->so_nc_wq, sb->sb_cc_wq, tmr, sc);
  135. }
  136. if (sc.interrupted()) {
  137. return EINTR;
  138. }
  139. if (tmr.expired()) {
  140. return EWOULDBLOCK;
  141. }
  142. if (so->so_nc) {
  143. so->so_nc->process_queue();
  144. }
  145. return 0;
  146. }
  147. template<typename Clock>
  148. static inline boost::optional<std::chrono::time_point<Clock>> parse_timeout(int timeout)
  149. {
  150. if (timeout == 0) {
  151. return boost::optional<std::chrono::time_point<Clock>>();
  152. }
  153. return boost::optional<std::chrono::time_point<Clock>>(
  154. Clock::now() + std::chrono::nanoseconds(ticks2ns(timeout)));
  155. }
  156. /*
  157. * Wait for data to arrive at/drain from a socket buffer.
  158. */
  159. int
  160. sbwait(socket* so, struct sockbuf *sb)
  161. {
  162. return sbwait_tmo(so, sb, parse_timeout<osv::clock::uptime>(sb->sb_timeo));
  163. }
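A minimal caller-side sketch (not part of this file) of how the wait primitive above is typically used: block until the receive buffer has data or is shut down. The helper name is hypothetical; the fields and return values are the ones used by sbwait() above.

    static int wait_for_rcv_data(struct socket *so)
    {
        int error = 0;
        SOCK_LOCK(so);
        while (so->so_rcv.sb_cc == 0 &&
               (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
            /* Sleeps on sb_cc_wq; sb_timeo (0 = forever) bounds the wait.
             * Returns EINTR on a signal or EWOULDBLOCK on timeout. */
            error = sbwait(so, &so->so_rcv);
            if (error)
                break;
        }
        SOCK_UNLOCK(so);
        return error;
    }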
  164. int
  165. sblock(socket* so, struct sockbuf *sb, int flags)
  166. {
  167. SOCK_LOCK_ASSERT(so);
  168. KASSERT((flags & SBL_VALID) == flags,
  169. ("sblock: flags invalid (0x%x)", flags));
  170. if (flags & SBL_WAIT) {
  171. sb->sb_iolock.lock(SOCK_MTX_REF(so));
  172. return (0);
  173. } else {
  174. if (!sb->sb_iolock.try_lock(SOCK_MTX_REF(so)))
  175. return (EWOULDBLOCK);
  176. return (0);
  177. }
  178. }
  179. void
  180. sbunlock(socket* so, struct sockbuf *sb)
  181. {
  182. SOCK_LOCK_ASSERT(so);
  183. sb->sb_iolock.unlock(SOCK_MTX_REF(so));
  184. }
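sblock()/sbunlock() serialize whole I/O operations on one buffer while sbwait() may drop and retake the socket mutex in the middle. A sketch of the usual pattern for a blocking receive path (illustrative only; omit SBL_WAIT to get the non-blocking try-lock that fails with EWOULDBLOCK):

    SOCK_LOCK(so);
    error = sblock(so, &so->so_rcv, SBL_WAIT);
    if (error == 0) {
        while (error == 0 && so->so_rcv.sb_cc < so->so_rcv.sb_lowat)
            error = sbwait(so, &so->so_rcv);   /* may sleep; mutex released while waiting */
        /* ... copy data out of so->so_rcv ... */
        sbunlock(so, &so->so_rcv);
    }
    SOCK_UNLOCK(so);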
  185. void so_wake_poll(struct socket *so, struct sockbuf *sb)
  186. {
  187. /* Read */
  188. if (&so->so_rcv == sb) {
  189. if (soreadable(so)) {
  190. poll_wake(so->fp, (POLLIN | POLLRDNORM));
  191. sb->sb_flags &= ~SB_SEL;
  192. }
  193. }
  194. /* Write */
  195. if (&so->so_snd == sb) {
  196. if (sowriteable(so)) {
  197. poll_wake(so->fp, (POLLOUT | POLLWRNORM));
  198. sb->sb_flags &= ~SB_SEL;
  199. }
  200. }
  201. }
  202. /*
  203. * Wakeup processes waiting on a socket buffer. Do asynchronous notification
  204. * via SIGIO if the socket has the SS_ASYNC flag set.
  205. *
  206. * Called with the socket buffer lock held; we currently hold the lock
  207. * through calls out to other subsystems (with the exception of kqueue), and
  208. * then release it to avoid lock order issues. It's not clear that's
  209. * correct.
  210. */
  211. void
  212. sowakeup(struct socket *so, struct sockbuf *sb)
  213. {
  214. int ret = 0;
  215. SOCK_LOCK_ASSERT(so);
  216. so_wake_poll(so, sb);
  217. if (sb->sb_flags & SB_WAIT) {
  218. sb->sb_flags &= ~SB_WAIT;
  219. sb->sb_cc_wq.wake_all(SOCK_MTX_REF(so));
  220. }
  221. if (sb->sb_upcall != NULL) {
  222. ret = sb->sb_upcall(so, sb->sb_upcallarg, M_DONTWAIT);
  223. if (ret == SU_ISCONNECTED) {
  224. KASSERT(sb == &so->so_rcv,
  225. ("SO_SND upcall returned SU_ISCONNECTED"));
  226. soupcall_clear(so, SO_RCV);
  227. }
  228. } else
  229. ret = SU_OK;
  230. if (ret == SU_ISCONNECTED) {
  231. soisconnected(so);
  232. }
  233. }
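The upcall hook lets other code run directly from the wakeup path; returning SU_ISCONNECTED from a receive-buffer upcall makes sowakeup() clear it and promote the socket via soisconnected(). A hedged sketch of the shape such an upcall takes (registration through soupcall_set() happens elsewhere in the stack and is shown only as a comment):

    /* Called as sb->sb_upcall(so, sb->sb_upcallarg, M_DONTWAIT), socket lock held. */
    static int example_rcv_upcall(struct socket *so, void *arg, int waitflag)
    {
        /* Inspect so->so_rcv, hand off work, etc. -- but do not sleep here. */
        return SU_OK;          /* or SU_ISCONNECTED to complete an accept */
    }
    /* Illustrative registration: soupcall_set(so, SO_RCV, example_rcv_upcall, arg); */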
  234. /*
  235. * Socket buffer (struct sockbuf) utility routines.
  236. *
  237. * Each socket contains two socket buffers: one for sending data and one for
  238. * receiving data. Each buffer contains a queue of mbufs, information about
  239. * the number of mbufs and amount of data in the queue, and other fields
  240. * allowing select() statements and notification on data availability to be
  241. * implemented.
  242. *
  243. * Data stored in a socket buffer is maintained as a list of records. Each
  244. * record is a list of mbufs chained together with the m_hdr.mh_next field. Records
  245. * are chained together with the m_hdr.mh_nextpkt field. The upper level routine
  246. * soreceive() expects the following conventions to be observed when placing
  247. * information in the receive buffer:
  248. *
  249. * 1. If the protocol requires each message be preceded by the sender's name,
  250. * then a record containing that name must be present before any
  251. * associated data (mbuf's must be of type MT_SONAME).
  252. * 2. If the protocol supports the exchange of ``access rights'' (really just
  253. * additional data associated with the message), and there are ``rights''
  254. * to be received, then a record containing this data should be present
  255. * (mbuf's must be of type MT_RIGHTS).
  256. * 3. If a name or rights record exists, then it must be followed by a data
  257. * record, perhaps of zero length.
  258. *
  259. * Before using a new socket structure it is first necessary to reserve
  260. * buffer space to the socket, by calling sbreserve(). This should commit
  261. * some of the available buffer space in the system buffer pool for the
  262. * socket (currently, it does nothing but enforce limits). The space should
  263. * be released by calling sbrelease() when the socket is destroyed.
  264. *
  265. * Used during construction, so we can't assert() the mutex is locked -
  266. * it doesn't exist yet.
  267. */
  268. int
  269. soreserve_internal(struct socket *so, u_long sndcc, u_long rcvcc)
  270. {
  271. struct thread *td = NULL;
  272. if (sbreserve_internal(&so->so_snd, sndcc, so, td) == 0)
  273. goto bad;
  274. if (sbreserve_internal(&so->so_rcv, rcvcc, so, td) == 0)
  275. goto bad2;
  276. if (so->so_rcv.sb_lowat == 0)
  277. so->so_rcv.sb_lowat = 1;
  278. if (so->so_snd.sb_lowat == 0)
  279. so->so_snd.sb_lowat = MCLBYTES;
  280. if ((u_int)so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
  281. so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
  282. return (0);
  283. bad2:
  284. sbrelease_internal(&so->so_snd, so);
  285. bad:
  286. return (ENOBUFS);
  287. }
  288. int
  289. soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
  290. {
  291. SOCK_LOCK(so);
  292. auto error = soreserve_internal(so, sndcc, rcvcc);
  293. SOCK_UNLOCK(so);
  294. return error;
  295. }
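soreserve() only enforces limits and seeds the watermarks (receive low-water mark defaults to 1, send low-water mark to MCLBYTES); no memory is preallocated. A sketch of how a protocol attach/detach pair might use it, with hypothetical size constants:

    static const u_long example_sendspace = 32 * 1024;
    static const u_long example_recvspace = 64 * 1024;

    int example_attach(struct socket *so)
    {
        int error = soreserve(so, example_sendspace, example_recvspace);
        if (error)
            return error;      /* ENOBUFS if a request exceeds sb_max_adj */
        /* ... protocol-specific setup ... */
        return 0;
    }

    void example_detach(struct socket *so)
    {
        sbrelease(&so->so_rcv, so);
        sbrelease(&so->so_snd, so);
    }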
  296. #if 0
  297. static int
  298. sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
  299. {
  300. int error = 0;
  301. u_long tmp_sb_max = sb_max;
  302. error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
  303. if (error || !req->newptr)
  304. return (error);
  305. if (tmp_sb_max < MSIZE + MCLBYTES)
  306. return (EINVAL);
  307. sb_max = tmp_sb_max;
  308. sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
  309. return (0);
  310. }
  311. #endif
  312. /*
  313. * Allot mbufs to a sockbuf. Attempt to scale mbmax so that mbcnt doesn't
  314. * become limiting if buffering efficiency is near the normal case.
  315. */
  316. int
  317. sbreserve_internal(struct sockbuf *sb, u_long cc, struct socket *so,
  318. struct thread *td)
  319. {
  320. /*
  321. * When a thread is passed, we take into account the thread's socket
  322. * buffer size limit. The caller will generally pass curthread, but
  323. * in the TCP input path, NULL will be passed to indicate that no
  324. * appropriate thread resource limits are available. In that case,
  325. * we don't apply a process limit.
  326. */
  327. if (cc > sb_max_adj)
  328. return (0);
  329. sb->sb_hiwat = cc;
  330. sb->sb_mbmax = bsd_min(cc * sb_efficiency, sb_max);
  331. if ((u_int)sb->sb_lowat > sb->sb_hiwat)
  332. sb->sb_lowat = sb->sb_hiwat;
  333. return (1);
  334. }
  335. int
  336. sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
  337. struct thread *td)
  338. {
  339. SOCK_LOCK_ASSERT(so);
  340. return sbreserve_internal(sb, cc, so, td);
  341. }
  342. int
  343. sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
  344. struct thread *td)
  345. {
  346. int error;
  347. SOCK_LOCK(so);
  348. error = sbreserve_internal(sb, cc, so, td);
  349. SOCK_UNLOCK(so);
  350. return (error);
  351. }
  352. /*
  353. * Free mbufs held by a socket, and reserved mbuf space.
  354. */
  355. void
  356. sbrelease_internal(struct sockbuf *sb, struct socket *so)
  357. {
  358. sbflush_internal(sb);
  359. sb->sb_hiwat = 0;
  360. sb->sb_mbmax = 0;
  361. }
  362. void
  363. sbrelease_locked(struct sockbuf *sb, struct socket *so)
  364. {
  365. SOCK_LOCK_ASSERT(so);
  366. sbrelease_internal(sb, so);
  367. }
  368. void
  369. sbrelease(struct sockbuf *sb, struct socket *so)
  370. {
  371. SOCK_LOCK(so);
  372. sbrelease_locked(sb, so);
  373. SOCK_UNLOCK(so);
  374. }
  375. void
  376. sbdestroy(struct sockbuf *sb, struct socket *so)
  377. {
  378. sbrelease_internal(sb, so);
  379. }
  380. /*
  381. * Routines to add and remove data from an mbuf queue.
  382. *
  383. * The routines sbappend() or sbappendrecord() are normally called to append
  384. * new mbufs to a socket buffer, after checking that adequate space is
  385. * available, comparing the function sbspace() with the amount of data to be
  386. * added. sbappendrecord() differs from sbappend() in that data supplied is
  387. * treated as the beginning of a new record. To place a sender's address,
  388. * optional access rights, and data in a socket receive buffer,
  389. * sbappendaddr() should be used. To place access rights and data in a
  390. * socket receive buffer, sbappendrights() should be used. In either case,
  391. * the new data begins a new record. Note that unlike sbappend() and
  392. * sbappendrecord(), these routines check for the caller that there will be
  393. * enough space to store the data. Each fails if there is not enough space,
  394. * or if it cannot find mbufs to store additional information in.
  395. *
  396. * Reliable protocols may use the socket send buffer to hold data awaiting
  397. * acknowledgement. Data is normally copied from a socket send buffer in a
  398. * protocol with m_copy for output to a peer, and then removed from the socket
  399. * buffer with sbdrop() or sbdroprecord() when the data is
  400. * acknowledged by the peer.
  401. */
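As a concrete illustration of the datagram case described above, a protocol's input routine checks the return value of sbappendaddr_locked() and frees the chain itself on failure (a sketch, not taken from this file; 'from' stands for whatever bsd_sockaddr the protocol filled in):

    SOCK_LOCK(so);
    if (sbappendaddr_locked(so, &so->so_rcv,
                            (struct bsd_sockaddr *)&from, m, control) == 0) {
        SOCK_UNLOCK(so);
        m_freem(m);            /* no space, or no mbuf for the address record */
        if (control)
            m_freem(control);
    } else {
        sorwakeup_locked(so);  /* wake sbwait()ers and poll() */
        SOCK_UNLOCK(so);
    }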
  402. #ifdef SOCKBUF_DEBUG
  403. void
  404. sblastrecordchk(struct sockbuf *sb, const char *file, int line)
  405. {
  406. struct mbuf *m = sb->sb_mb;
  407. SOCKBUF_LOCK_ASSERT(sb);
  408. while (m && m->m_hdr.mh_nextpkt)
  409. m = m->m_hdr.mh_nextpkt;
  410. if (m != sb->sb_lastrecord) {
  411. printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
  412. __func__, sb->sb_mb, sb->sb_lastrecord, m);
  413. printf("packet chain:\n");
  414. for (m = sb->sb_mb; m != NULL; m = m->m_hdr.mh_nextpkt)
  415. printf("\t%p\n", m);
  416. panic("%s from %s:%u", __func__, file, line);
  417. }
  418. }
  419. void
  420. sblastmbufchk(struct sockbuf *sb, const char *file, int line)
  421. {
  422. struct mbuf *m = sb->sb_mb;
  423. struct mbuf *n;
  424. SOCKBUF_LOCK_ASSERT(sb);
  425. while (m && m->m_hdr.mh_nextpkt)
  426. m = m->m_hdr.mh_nextpkt;
  427. while (m && m->m_hdr.mh_next)
  428. m = m->m_hdr.mh_next;
  429. if (m != sb->sb_mbtail) {
  430. printf("%s: sb_mb %p sb_mbtail %p last %p\n",
  431. __func__, sb->sb_mb, sb->sb_mbtail, m);
  432. printf("packet tree:\n");
  433. for (m = sb->sb_mb; m != NULL; m = m->m_hdr.mh_nextpkt) {
  434. printf("\t");
  435. for (n = m; n != NULL; n = n->m_hdr.mh_next)
  436. printf("%p ", n);
  437. printf("\n");
  438. }
  439. panic("%s from %s:%u", __func__, file, line);
  440. }
  441. }
  442. #endif /* SOCKBUF_DEBUG */
  443. #define SBLINKRECORD(so, sb, m0) do { \
  444. SOCK_LOCK_ASSERT(so); \
  445. if ((sb)->sb_lastrecord != NULL) \
  446. (sb)->sb_lastrecord->m_hdr.mh_nextpkt = (m0); \
  447. else \
  448. (sb)->sb_mb = (m0); \
  449. (sb)->sb_lastrecord = (m0); \
  450. } while (/*CONSTCOND*/0)
  451. /*
  452. * Append mbuf chain m to the last record in the socket buffer sb. The
  453. * additional space associated with the mbuf chain is recorded in sb. Empty mbufs
  454. * are discarded and mbufs are compacted where possible.
  455. */
  456. void
  457. sbappend_locked(socket* so, struct sockbuf *sb, struct mbuf *m)
  458. {
  459. struct mbuf *n;
  460. SOCK_LOCK_ASSERT(so);
  461. if (m == 0)
  462. return;
  463. SBLASTRECORDCHK(sb);
  464. n = sb->sb_mb;
  465. if (n) {
  466. while (n->m_hdr.mh_nextpkt)
  467. n = n->m_hdr.mh_nextpkt;
  468. do {
  469. if (n->m_hdr.mh_flags & M_EOR) {
  470. sbappendrecord_locked(so, sb, m); /* XXXXXX!!!! */
  471. return;
  472. }
  473. } while (n->m_hdr.mh_next && (n = n->m_hdr.mh_next));
  474. } else {
  475. /*
  476. * XXX Would like to simply use sb_mbtail here, but
  477. * XXX I need to verify that I won't miss an EOR that
  478. * XXX way.
  479. */
  480. if ((n = sb->sb_lastrecord) != NULL) {
  481. do {
  482. if (n->m_hdr.mh_flags & M_EOR) {
  483. sbappendrecord_locked(so, sb, m); /* XXXXXX!!!! */
  484. return;
  485. }
  486. } while (n->m_hdr.mh_next && (n = n->m_hdr.mh_next));
  487. } else {
  488. /*
  489. * If this is the first record in the socket buffer,
  490. * it's also the last record.
  491. */
  492. sb->sb_lastrecord = m;
  493. }
  494. }
  495. sbcompress(so, sb, m, n);
  496. SBLASTRECORDCHK(sb);
  497. }
  498. /*
  499. * Append mbuf chain m to the last record in the socket buffer sb. The
  500. * additional space associated with the mbuf chain is recorded in sb. Empty mbufs
  501. * are discarded and mbufs are compacted where possible.
  502. */
  503. void
  504. sbappend(socket* so, struct sockbuf *sb, struct mbuf *m)
  505. {
  506. SOCK_LOCK(so);
  507. sbappend_locked(so, sb, m);
  508. SOCK_UNLOCK(so);
  509. }
  510. /*
  511. * This version of sbappend() should only be used when the caller absolutely
  512. * knows that there will never be more than one record in the socket buffer,
  513. * that is, a stream protocol (such as TCP).
  514. */
  515. void
  516. sbappendstream_locked(socket* so, struct sockbuf *sb, struct mbuf *m)
  517. {
  518. SOCK_LOCK_ASSERT(so);
  519. KASSERT(m->m_hdr.mh_nextpkt == NULL,("sbappendstream 0"));
  520. KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));
  521. SBLASTMBUFCHK(sb);
  522. sbcompress(so, sb, m, sb->sb_mbtail);
  523. sb->sb_lastrecord = sb->sb_mb;
  524. SBLASTRECORDCHK(sb);
  525. }
  526. /*
  527. * This version of sbappend() should only be used when the caller absolutely
  528. * knows that there will never be more than one record in the socket buffer,
  529. * that is, a stream protocol (such as TCP).
  530. */
  531. void
  532. sbappendstream(socket* so, struct sockbuf *sb, struct mbuf *m)
  533. {
  534. SOCK_LOCK(so);
  535. sbappendstream_locked(so, sb, m);
  536. SOCK_UNLOCK(so);
  537. }
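For the stream case a protocol's input path typically appends in-order data and wakes the reader in one locked section; a minimal sketch under the locking conventions used in this file:

    /* Hypothetical delivery of an in-order segment held in mbuf chain m. */
    SOCK_LOCK(so);
    if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
        m_freem(m);                        /* receiver already shut down */
    } else {
        sbappendstream_locked(so, &so->so_rcv, m);
        sorwakeup_locked(so);
    }
    SOCK_UNLOCK(so);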
  538. #ifdef SOCKBUF_DEBUG
  539. void
  540. sbcheck(struct sockbuf *sb)
  541. {
  542. struct mbuf *m;
  543. struct mbuf *n = 0;
  544. u_long len = 0, mbcnt = 0;
  545. SOCKBUF_LOCK_ASSERT(sb);
  546. for (m = sb->sb_mb; m; m = n) {
  547. n = m->m_hdr.mh_nextpkt;
  548. for (; m; m = m->m_hdr.mh_next) {
  549. len += m->m_hdr.mh_len;
  550. mbcnt += MSIZE;
  551. if (m->m_hdr.mh_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
  552. mbcnt += m->M_dat.MH.MH_dat.MH_ext.ext_size;
  553. }
  554. }
  555. if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
  556. printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
  557. mbcnt, sb->sb_mbcnt);
  558. panic("sbcheck");
  559. }
  560. }
  561. #endif
  562. /*
  563. * As above, except the mbuf chain begins a new record.
  564. */
  565. void
  566. sbappendrecord_locked(socket* so, struct sockbuf *sb, struct mbuf *m0)
  567. {
  568. struct mbuf *m;
  569. SOCK_LOCK_ASSERT(so);
  570. if (m0 == 0)
  571. return;
  572. /*
  573. * Put the first mbuf on the queue. Note this permits zero length
  574. * records.
  575. */
  576. sballoc(sb, m0);
  577. SBLASTRECORDCHK(sb);
  578. SBLINKRECORD(so, sb, m0);
  579. sb->sb_mbtail = m0;
  580. m = m0->m_hdr.mh_next;
  581. m0->m_hdr.mh_next = 0;
  582. if (m && (m0->m_hdr.mh_flags & M_EOR)) {
  583. m0->m_hdr.mh_flags &= ~M_EOR;
  584. m->m_hdr.mh_flags |= M_EOR;
  585. }
  586. /* always call sbcompress() so it can do SBLASTMBUFCHK() */
  587. sbcompress(so, sb, m, m0);
  588. }
  589. /*
  590. * As above, except the mbuf chain begins a new record.
  591. */
  592. void
  593. sbappendrecord(socket* so, struct sockbuf *sb, struct mbuf *m0)
  594. {
  595. SOCK_LOCK(so);
  596. sbappendrecord_locked(so, sb, m0);
  597. SOCK_UNLOCK(so);
  598. }
  599. /*
  600. * Append address and data, and optionally, control (ancillary) data to the
  601. * receive queue of a socket. If present, m0 must include a packet header
  602. * with total length. Returns 0 if no space in sockbuf or insufficient
  603. * mbufs.
  604. */
  605. int
  606. sbappendaddr_locked(socket* so, struct sockbuf *sb, const struct bsd_sockaddr *asa,
  607. struct mbuf *m0, struct mbuf *control)
  608. {
  609. struct mbuf *m, *n, *nlast;
  610. int space = asa->sa_len;
  611. SOCK_LOCK_ASSERT(so);
  612. if (m0 && (m0->m_hdr.mh_flags & M_PKTHDR) == 0)
  613. panic("sbappendaddr_locked");
  614. if (m0)
  615. space += m0->M_dat.MH.MH_pkthdr.len;
  616. space += m_length(control, &n);
  617. if (space > sbspace(sb))
  618. return (0);
  619. #if MSIZE <= 256
  620. if (asa->sa_len > MLEN)
  621. return (0);
  622. #endif
  623. MGET(m, M_DONTWAIT, MT_SONAME);
  624. if (m == 0)
  625. return (0);
  626. m->m_hdr.mh_len = asa->sa_len;
  627. bcopy(asa, mtod(m, caddr_t), asa->sa_len);
  628. if (n)
  629. n->m_hdr.mh_next = m0; /* concatenate data to control */
  630. else
  631. control = m0;
  632. m->m_hdr.mh_next = control;
  633. for (n = m; n->m_hdr.mh_next != NULL; n = n->m_hdr.mh_next)
  634. sballoc(sb, n);
  635. sballoc(sb, n);
  636. nlast = n;
  637. SBLINKRECORD(so, sb, m);
  638. sb->sb_mbtail = nlast;
  639. SBLASTMBUFCHK(sb);
  640. SBLASTRECORDCHK(sb);
  641. return (1);
  642. }
  643. /*
  644. * Append address and data, and optionally, control (ancillary) data to the
  645. * receive queue of a socket. If present, m0 must include a packet header
  646. * with total length. Returns 0 if no space in sockbuf or insufficient
  647. * mbufs.
  648. */
  649. int
  650. sbappendaddr(socket* so, struct sockbuf *sb, const struct bsd_sockaddr *asa,
  651. struct mbuf *m0, struct mbuf *control)
  652. {
  653. int retval;
  654. SOCK_LOCK(so);
  655. retval = sbappendaddr_locked(so, sb, asa, m0, control);
  656. SOCK_UNLOCK(so);
  657. return (retval);
  658. }
  659. int
  660. sbappendcontrol_locked(socket* so, struct sockbuf *sb, struct mbuf *m0,
  661. struct mbuf *control)
  662. {
  663. struct mbuf *m, *n, *mlast;
  664. int space;
  665. SOCK_LOCK_ASSERT(so);
  666. if (control == 0)
  667. panic("sbappendcontrol_locked");
  668. space = m_length(control, &n) + m_length(m0, NULL);
  669. if (space > sbspace(sb))
  670. return (0);
  671. n->m_hdr.mh_next = m0; /* concatenate data to control */
  672. SBLASTRECORDCHK(sb);
  673. for (m = control; m->m_hdr.mh_next; m = m->m_hdr.mh_next)
  674. sballoc(sb, m);
  675. sballoc(sb, m);
  676. mlast = m;
  677. SBLINKRECORD(so, sb, control);
  678. sb->sb_mbtail = mlast;
  679. SBLASTMBUFCHK(sb);
  680. SBLASTRECORDCHK(sb);
  681. return (1);
  682. }
  683. int
  684. sbappendcontrol(socket* so, struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
  685. {
  686. int retval;
  687. SOCK_LOCK(so);
  688. retval = sbappendcontrol_locked(so, sb, m0, control);
  689. SOCK_UNLOCK(so);
  690. return (retval);
  691. }
  692. /*
  693. * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
  694. * (n). If (n) is NULL, the buffer is presumed empty.
  695. *
  696. * When the data is compressed, mbufs in the chain may be handled in one of
  697. * three ways:
  698. *
  699. * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
  700. * record boundary, and no change in data type).
  701. *
  702. * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
  703. * an mbuf already in the socket buffer. This can occur if an
  704. * appropriate mbuf exists, there is room, and no merging of data types
  705. * will occur.
  706. *
  707. * (3) The mbuf may be appended to the end of the existing mbuf chain.
  708. *
  709. * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
  710. * end-of-record.
  711. */
  712. void
  713. sbcompress(socket* so, struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
  714. {
  715. int eor = 0;
  716. struct mbuf *o;
  717. SOCK_LOCK_ASSERT(so);
  718. while (m) {
  719. eor |= m->m_hdr.mh_flags & M_EOR;
  720. if (m->m_hdr.mh_len == 0 &&
  721. (eor == 0 ||
  722. (((o = m->m_hdr.mh_next) || (o = n)) &&
  723. o->m_hdr.mh_type == m->m_hdr.mh_type))) {
  724. if (sb->sb_lastrecord == m)
  725. sb->sb_lastrecord = m->m_hdr.mh_next;
  726. m = m_free(m);
  727. continue;
  728. }
  729. if (n && (n->m_hdr.mh_flags & M_EOR) == 0 &&
  730. M_WRITABLE(n) &&
  731. ((sb->sb_flags & SB_NOCOALESCE) == 0) &&
  732. m->m_hdr.mh_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
  733. m->m_hdr.mh_len <= M_TRAILINGSPACE(n) &&
  734. n->m_hdr.mh_type == m->m_hdr.mh_type) {
  735. bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_hdr.mh_len,
  736. (unsigned)m->m_hdr.mh_len);
  737. n->m_hdr.mh_len += m->m_hdr.mh_len;
  738. sb->sb_cc += m->m_hdr.mh_len;
  739. if (m->m_hdr.mh_type != MT_DATA && m->m_hdr.mh_type != MT_OOBDATA)
  740. /* XXX: Probably don't need.*/
  741. sb->sb_ctl += m->m_hdr.mh_len;
  742. m = m_free(m);
  743. continue;
  744. }
  745. if (n)
  746. n->m_hdr.mh_next = m;
  747. else
  748. sb->sb_mb = m;
  749. sb->sb_mbtail = m;
  750. sballoc(sb, m);
  751. n = m;
  752. m->m_hdr.mh_flags &= ~M_EOR;
  753. m = m->m_hdr.mh_next;
  754. n->m_hdr.mh_next = 0;
  755. }
  756. if (eor) {
  757. KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
  758. n->m_hdr.mh_flags |= eor;
  759. }
  760. SBLASTMBUFCHK(sb);
  761. }
  762. /*
  763. * Free all mbufs in a sockbuf. Check that all resources are reclaimed.
  764. */
  765. static void
  766. sbflush_internal(struct sockbuf *sb)
  767. {
  768. while (sb->sb_mbcnt) {
  769. /*
  770. * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
  771. * we would loop forever. Panic instead.
  772. */
  773. if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_hdr.mh_len))
  774. break;
  775. sbdrop_internal(sb, (int)sb->sb_cc);
  776. }
  777. if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
  778. panic("sbflush_internal: cc %u || mb %p || mbcnt %u",
  779. sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
  780. }
  781. void
  782. sbflush_locked(socket* so, struct sockbuf *sb)
  783. {
  784. SOCK_LOCK_ASSERT(so);
  785. sbflush_internal(sb);
  786. }
  787. void
  788. sbflush(socket* so, struct sockbuf *sb)
  789. {
  790. SOCK_LOCK(so);
  791. sbflush_locked(so, sb);
  792. SOCK_UNLOCK(so);
  793. }
  794. /*
  795. * Drop data from (the front of) a sockbuf.
  796. */
  797. static void
  798. sbdrop_internal(struct sockbuf *sb, int len)
  799. {
  800. struct mbuf *m;
  801. struct mbuf *next;
  802. next = (m = sb->sb_mb) ? m->m_hdr.mh_nextpkt : 0;
  803. while (len > 0) {
  804. if (m == 0) {
  805. if (next == 0)
  806. panic("sbdrop");
  807. m = next;
  808. next = m->m_hdr.mh_nextpkt;
  809. continue;
  810. }
  811. if (m->m_hdr.mh_len > len) {
  812. m->m_hdr.mh_len -= len;
  813. m->m_hdr.mh_data += len;
  814. sb->sb_cc -= len;
  815. if (sb->sb_sndptroff != 0)
  816. sb->sb_sndptroff -= len;
  817. if (m->m_hdr.mh_type != MT_DATA && m->m_hdr.mh_type != MT_OOBDATA)
  818. sb->sb_ctl -= len;
  819. break;
  820. }
  821. len -= m->m_hdr.mh_len;
  822. sbfree(sb, m);
  823. m = m_free(m);
  824. }
  825. while (m && m->m_hdr.mh_len == 0) {
  826. sbfree(sb, m);
  827. m = m_free(m);
  828. }
  829. if (m) {
  830. sb->sb_mb = m;
  831. m->m_hdr.mh_nextpkt = next;
  832. } else
  833. sb->sb_mb = next;
  834. /*
  835. * First part is an inline SB_EMPTY_FIXUP(). Second part makes sure
  836. * sb_lastrecord is up-to-date if we dropped part of the last record.
  837. */
  838. m = sb->sb_mb;
  839. if (m == NULL) {
  840. sb->sb_mbtail = NULL;
  841. sb->sb_lastrecord = NULL;
  842. } else if (m->m_hdr.mh_nextpkt == NULL) {
  843. sb->sb_lastrecord = m;
  844. }
  845. }
  846. /*
  847. * Drop data from (the front of) a sockbuf.
  848. */
  849. void
  850. sbdrop_locked(socket* so, struct sockbuf *sb, int len)
  851. {
  852. SOCK_LOCK_ASSERT(so);
  853. sbdrop_internal(sb, len);
  854. }
  855. void
  856. sbdrop(socket* so, struct sockbuf *sb, int len)
  857. {
  858. SOCK_LOCK(so);
  859. sbdrop_locked(so, sb, len);
  860. SOCK_UNLOCK(so);
  861. }
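The acknowledgement-driven pattern mentioned earlier (keep sent data in so_snd until the peer acks it) reduces to dropping the acked prefix and waking any blocked writers; a sketch with a hypothetical 'acked' byte count:

    SOCK_LOCK(so);
    if (acked > 0) {
        sbdrop_locked(so, &so->so_snd, (int)acked);   /* free acknowledged mbufs */
        sowwakeup_locked(so);                         /* senders may have space now */
    }
    SOCK_UNLOCK(so);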
  862. /*
  863. * Maintain a pointer and offset pair into the socket buffer mbuf chain to
  864. * avoid traversal of the entire socket buffer for larger offsets.
  865. */
  866. struct mbuf *
  867. sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
  868. {
  869. struct mbuf *m, *ret;
  870. KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
  871. KASSERT(off + len <= sb->sb_cc, ("%s: beyond sb", __func__));
  872. KASSERT(sb->sb_sndptroff <= sb->sb_cc, ("%s: sndptroff broken", __func__));
  873. /*
  874. * Is off below stored offset? Happens on retransmits.
  875. * Just return, we can't help here.
  876. */
  877. if (sb->sb_sndptroff > off) {
  878. *moff = off;
  879. return (sb->sb_mb);
  880. }
  881. /* Return closest mbuf in chain for current offset. */
  882. *moff = off - sb->sb_sndptroff;
  883. m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb;
  884. /* Advance by len to be as close as possible for the next transmit. */
  885. for (off = off - sb->sb_sndptroff + len - 1;
  886. off > 0 && m != NULL && off >= (u_int)m->m_hdr.mh_len;
  887. m = m->m_hdr.mh_next) {
  888. sb->sb_sndptroff += m->m_hdr.mh_len;
  889. off -= m->m_hdr.mh_len;
  890. }
  891. if (off > 0 && m == NULL)
  892. panic("%s: sockbuf %p and mbuf %p clashing", __func__, sb, ret);
  893. sb->sb_sndptr = m;
  894. return (ret);
  895. }
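sbsndptr() exists for the (re)transmit path: it caches how far previous transmits walked the chain so the next lookup does not start from sb_mb. A sketch of typical use together with m_copym(), which is defined elsewhere in the mbuf code (offsets and lengths here are hypothetical):

    u_int moff;
    struct mbuf *start = sbsndptr(&so->so_snd, off, len, &moff);
    struct mbuf *chain = m_copym(start, moff, len, M_DONTWAIT);
    if (chain == NULL)
        return ENOBUFS;        /* could not allocate the copy */
    /* ... hand 'chain' to the output routine ... */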
  896. /*
  897. * Drop a record off the front of a sockbuf and move the next record to the
  898. * front.
  899. */
  900. void
  901. sbdroprecord_locked(socket* so, struct sockbuf *sb)
  902. {
  903. struct mbuf *m;
  904. SOCK_LOCK_ASSERT(so);
  905. m = sb->sb_mb;
  906. if (m) {
  907. sb->sb_mb = m->m_hdr.mh_nextpkt;
  908. do {
  909. sbfree(sb, m);
  910. m = m_free(m);
  911. } while (m);
  912. }
  913. SB_EMPTY_FIXUP(sb);
  914. }
  915. /*
  916. * Drop a record off the front of a sockbuf and move the next record to the
  917. * front.
  918. */
  919. void
  920. sbdroprecord(socket* so, struct sockbuf *sb)
  921. {
  922. SOCK_LOCK(so);
  923. sbdroprecord_locked(so, sb);
  924. SOCK_UNLOCK(so);
  925. }
  926. /*
  927. * Create a "control" mbuf containing the specified data with the specified
  928. * type for presentation on a socket buffer.
  929. */
  930. struct mbuf *
  931. sbcreatecontrol(caddr_t p, int size, int type, int level)
  932. {
  933. struct cmsghdr *cp;
  934. struct mbuf *m;
  935. if (CMSG_SPACE((u_int)size) > MCLBYTES)
  936. return ((struct mbuf *) NULL);
  937. if (CMSG_SPACE((u_int)size) > MLEN)
  938. m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
  939. else
  940. m = m_get(M_DONTWAIT, MT_CONTROL);
  941. if (m == NULL)
  942. return ((struct mbuf *) NULL);
  943. cp = mtod(m, struct cmsghdr *);
  944. m->m_hdr.mh_len = 0;
  945. KASSERT(CMSG_SPACE((u_int)size) <= (u_int)M_TRAILINGSPACE(m),
  946. ("sbcreatecontrol: short mbuf"));
  947. if (p != NULL)
  948. (void)memcpy(CMSG_DATA(cp), p, size);
  949. m->m_hdr.mh_len = CMSG_SPACE(size);
  950. cp->cmsg_len = CMSG_LEN(size);
  951. cp->cmsg_level = level;
  952. cp->cmsg_type = type;
  953. return (m);
  954. }
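A typical use is building ancillary data for the receive queue and passing it as the 'control' argument of sbappendaddr(); a sketch, assuming the usual SCM_TIMESTAMP/SOL_SOCKET constants and microtime() from the surrounding BSD headers:

    struct timeval tv;
    microtime(&tv);
    struct mbuf *control = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
                                           SCM_TIMESTAMP, SOL_SOCKET);
    if (control == NULL) {
        /* not fatal: the datagram can still be queued without ancillary data */
    }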
  955. /*
  956. * This does the same for socket buffers that sotoxsocket does for sockets:
  957. * generate a user-format data structure describing the socket buffer. Note
  958. * that the xsockbuf structure, since it is always embedded in a socket, does
  959. * not include a self pointer nor a length. We make this entry point public
  960. * in case some other mechanism needs it.
  961. */
  962. void
  963. sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
  964. {
  965. xsb->sb_cc = sb->sb_cc;
  966. xsb->sb_hiwat = sb->sb_hiwat;
  967. xsb->sb_mbcnt = sb->sb_mbcnt;
  968. xsb->sb_mcnt = sb->sb_mcnt;
  969. xsb->sb_ccnt = sb->sb_ccnt;
  970. xsb->sb_mbmax = sb->sb_mbmax;
  971. xsb->sb_lowat = sb->sb_lowat;
  972. xsb->sb_flags = sb->sb_flags;
  973. xsb->sb_timeo = sb->sb_timeo;
  974. }
  975. /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
  976. SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
  977. SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
  978. &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
  979. SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
  980. &sb_efficiency, 0, "");