PageRenderTime 244ms CodeModel.GetById 109ms RepoModel.GetById 0ms app.codeStats 1ms

/bsd/sys/netinet/tcp_input.cc

https://gitlab.com/jforge/osv
C++ | 3287 lines | 1953 code | 237 blank | 1097 comment | 548 complexity | 0d391bf1f1401a73c33885330ac4d916 MD5 | raw file
Possible License(s): BSD-3-Clause, 0BSD, MPL-2.0-no-copyleft-exception
  1. /*-
  2. * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  3. * The Regents of the University of California. All rights reserved.
  4. * Copyright (c) 2007-2008,2010
  5. * Swinburne University of Technology, Melbourne, Australia.
  6. * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  7. * Copyright (c) 2010 The FreeBSD Foundation
  8. * Copyright (c) 2010-2011 Juniper Networks, Inc.
  9. * All rights reserved.
  10. *
  11. * Portions of this software were developed at the Centre for Advanced Internet
  12. * Architectures, Swinburne University of Technology, by Lawrence Stewart,
  13. * James Healy and David Hayes, made possible in part by a grant from the Cisco
  14. * University Research Program Fund at Community Foundation Silicon Valley.
  15. *
  16. * Portions of this software were developed at the Centre for Advanced
  17. * Internet Architectures, Swinburne University of Technology, Melbourne,
  18. * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  19. *
  20. * Portions of this software were developed by Robert N. M. Watson under
  21. * contract to Juniper Networks, Inc.
  22. *
  23. * Redistribution and use in source and binary forms, with or without
  24. * modification, are permitted provided that the following conditions
  25. * are met:
  26. * 1. Redistributions of source code must retain the above copyright
  27. * notice, this list of conditions and the following disclaimer.
  28. * 2. Redistributions in binary form must reproduce the above copyright
  29. * notice, this list of conditions and the following disclaimer in the
  30. * documentation and/or other materials provided with the distribution.
  31. * 4. Neither the name of the University nor the names of its contributors
  32. * may be used to endorse or promote products derived from this software
  33. * without specific prior written permission.
  34. *
  35. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  36. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  37. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  38. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  39. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  40. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  41. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  42. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  43. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  44. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  45. * SUCH DAMAGE.
  46. *
  47. * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
  48. */
  49. #include <sys/cdefs.h>
  50. #include <bsd/porting/netport.h>
  51. #include <bsd/porting/sync_stub.h>
  52. #include <bsd/porting/uma_stub.h>
  53. #include <bsd/sys/sys/libkern.h>
  54. #include <bsd/sys/sys/param.h>
  55. #include <bsd/sys/sys/mbuf.h>
  56. #include <bsd/sys/sys/protosw.h>
  57. #include <bsd/sys/sys/socket.h>
  58. #include <bsd/sys/sys/socketvar.h>
  59. #include <bsd/sys/net/if.h>
  60. #include <bsd/sys/net/route.h>
  61. #include <bsd/sys/net/vnet.h>
  62. #define TCPSTATES /* for logging */
  63. #include <bsd/sys/netinet/cc.h>
  64. #include <bsd/sys/netinet/in.h>
  65. #include <bsd/sys/netinet/in_pcb.h>
  66. #include <bsd/sys/netinet/in_systm.h>
  67. #include <bsd/sys/netinet/in_var.h>
  68. #include <bsd/sys/netinet/ip.h>
  69. #include <bsd/sys/netinet/ip_icmp.h> /* required for icmp_var.h */
  70. #include <bsd/sys/netinet/icmp_var.h> /* for ICMP_BANDLIM */
  71. #include <bsd/sys/netinet/ip_var.h>
  72. #include <bsd/sys/netinet/ip_options.h>
  73. #include <bsd/sys/netinet/tcp_fsm.h>
  74. #include <bsd/sys/netinet/tcp_seq.h>
  75. #include <bsd/sys/netinet/tcp_timer.h>
  76. #include <bsd/sys/netinet/tcp_var.h>
  77. #include <bsd/sys/netinet/tcpip.h>
  78. #include <bsd/sys/netinet/tcp_syncache.h>
  79. #ifdef TCPDEBUG
  80. #include <netinet/tcp_debug.h>
  81. #endif /* TCPDEBUG */
  82. #include <machine/in_cksum.h>
  83. #include <osv/poll.h>
  84. #include <osv/net_trace.hh>
/* Trace point fired from the input path when an ACK is processed. */
TRACEPOINT(trace_tcp_input_ack, "%p: We've got ACK: %u", void*, unsigned int);

/* Duplicate-ACK threshold (presumably the RFC 5681 fast-retransmit trigger). */
const int tcprexmtthresh = 3;

/* Global TCP statistics, exported via the net.inet.tcp.stats sysctl. */
VNET_DEFINE(struct tcpstat, tcpstat);
SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
    &VNET_NAME(tcpstat), tcpstat,
    "TCP statistics (struct tcpstat, netinet/tcp_var.h)");

/* 1: log SYNs to closed ports; 2: log every segment to a closed port. */
int tcp_log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
    &tcp_log_in_vain, 0,
    "Log all incoming TCP segments to closed ports");

/* 1: swallow SYNs to closed ports; 2: swallow any segment to a closed port. */
VNET_DEFINE(int, blackhole) = 0;
#define V_blackhole VNET(blackhole)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
    &VNET_NAME(blackhole), 0,
    "Do not send RST on segments to closed ports");

/* Delayed-ACK enable; consulted by the DELAY_ACK() macro below. */
VNET_DEFINE(int, tcp_delack_enabled) = 1;
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
    &VNET_NAME(tcp_delack_enabled), 0,
    "Delay ACK to try and piggyback it onto a data packet");

/* When set, segments carrying both SYN and FIN are dropped on listen sockets. */
VNET_DEFINE(int, drop_synfin) = 0;
#define V_drop_synfin VNET(drop_synfin)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
    &VNET_NAME(drop_synfin), 0,
    "Drop TCP packets with SYN+FIN set");

/* RFC 3042 Limited Transmit. */
VNET_DEFINE(int, tcp_do_rfc3042) = 1;
#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3042), 0,
    "Enable RFC 3042 (Limited Transmit)");

/* RFC 3390 larger initial congestion window; used by cc_conn_init(). */
VNET_DEFINE(int, tcp_do_rfc3390) = 1;
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3390), 0,
    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");

/* RFC 3465 Appropriate Byte Counting; the ABC math lives in cc_ack_received(). */
VNET_DEFINE(int, tcp_do_rfc3465) = 1;
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3465), 0,
    "Enable RFC 3465 (Appropriate Byte Counting)");

/* ABC slow-start cap, in segments (the "L" parameter of RFC 3465). */
VNET_DEFINE(int, tcp_abc_l_var) = 2;
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
    &VNET_NAME(tcp_abc_l_var), 2,
    "Cap the max cwnd increment during slow-start to this number of segments");

/* net.inet.tcp.ecn.* subtree. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");

VNET_DEFINE(int, tcp_do_ecn) = 0;
SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(tcp_do_ecn), 0,
    "TCP ECN support");

VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
    &VNET_NAME(tcp_ecn_maxretries), 0,
    "Max retries before giving up on ECN");

/* Accept RSTs by the pre-RFC5961 rules (any in-window RST). */
VNET_DEFINE(int, tcp_insecure_rst) = 0;
#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_rst), 0,
    "Follow the old (insecure) criteria for accepting RST packets");

/* Automatic receive-buffer scaling knobs. */
VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
    &VNET_NAME(tcp_do_autorcvbuf), 0,
    "Enable automatic receive buffer sizing");

VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024;
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
    &VNET_NAME(tcp_autorcvbuf_inc), 0,
    "Incrementor step size of automatic receive buffer");

VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024;
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
    &VNET_NAME(tcp_autorcvbuf_max), 0,
    "Max size of automatic receive buffer");

/* Head of the global list of TCP inpcbs, and its lookup info. */
VNET_DEFINE(struct inpcbhead, tcb);
#define tcb6 tcb  /* for KAME src sync over BSD*'s */
VNET_DEFINE(struct inpcbinfo, tcbinfo);

/* Forward declarations for the static helpers defined later in this file. */
static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
static void tcp_do_segment(struct mbuf *, struct tcphdr *,
    struct socket *, struct tcpcb *, int, int, uint8_t,
    int, bool& want_close);
static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
    struct tcpcb *, int, int);
static void tcp_pulloutofband(struct socket *,
    struct tcphdr *, struct mbuf *, int);
static void tcp_xmit_timer(struct tcpcb *, int);
static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
static void inline tcp_fields_to_host(struct tcphdr *);
#ifdef TCP_SIGNATURE
static void inline tcp_fields_to_net(struct tcphdr *);
static int inline tcp_signature_verify_input(struct mbuf *, int, int,
    int, struct tcpopt *, struct tcphdr *, u_int);
#endif
static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
    uint16_t type);
static void inline cc_conn_init(struct tcpcb *tp);
static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
  178. /*
  179. * Kernel module interface for updating tcpstat. The argument is an index
  180. * into tcpstat treated as an array of u_long. While this encodes the
  181. * general layout of tcpstat into the caller, it doesn't encode its location,
  182. * so that future changes to add, for example, per-CPU stats support won't
  183. * cause binary compatibility problems for kernel modules.
  184. */
  185. void
  186. kmod_tcpstat_inc(int statnum)
  187. {
  188. (*((u_long *)&V_tcpstat + statnum))++;
  189. }
/*
 * CC wrapper hook functions
 */
/*
 * Called for each ACK that is eligible to grow the congestion window.
 * Performs the common bookkeeping (bytes newly acked, whether cwnd is the
 * limiting factor, RFC 3465 Appropriate Byte Counting) and then hands the
 * event to the pluggable congestion-control algorithm.
 */
static void inline
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
{
	INP_LOCK_ASSERT(tp->t_inpcb);

	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
	/* cwnd (not the peer's window) is what limits transmission. */
	if (tp->snd_cwnd == bsd_min(tp->snd_cwnd, tp->snd_wnd))
		tp->ccv->flags |= CCF_CWND_LIMITED;
	else
		tp->ccv->flags &= ~CCF_CWND_LIMITED;

	if (type == CC_ACK) {
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			/*
			 * Congestion avoidance: count acked bytes, capped at
			 * abc_l_var segments per ACK (RFC 3465 "L" limit).
			 */
			tp->t_bytes_acked += bsd_min(tp->ccv->bytes_this_ack,
			    V_tcp_abc_l_var * tp->t_maxseg);
			/* A full cwnd's worth of data has been acked. */
			if (tp->t_bytes_acked >= tp->snd_cwnd) {
				tp->t_bytes_acked -= tp->snd_cwnd;
				tp->ccv->flags |= CCF_ABC_SENTAWND;
			}
		} else {
			/* Slow start: ABC window accounting does not apply. */
			tp->ccv->flags &= ~CCF_ABC_SENTAWND;
			tp->t_bytes_acked = 0;
		}
	}

	/* Let the congestion-control algorithm react to the ACK. */
	if (CC_ALGO(tp)->ack_received != NULL) {
		/* XXXLAS: Find a way to live without this */
		tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->ack_received(tp->ccv, type);
	}
}
/*
 * Initialize congestion-control state for a newly established connection:
 * seed srtt/rttvar/ssthresh from the hostcache when available, pick the
 * initial congestion window, and invoke the algorithm's conn_init hook.
 */
static void inline
cc_conn_init(struct tcpcb *tp)
{
	struct hc_metrics_lite metrics;
	struct inpcb *inp = tp->t_inpcb;
	int rtt;

	INP_LOCK_ASSERT(tp->t_inpcb);

	tcp_hc_get(&inp->inp_inc, &metrics);

	/* Seed the RTT estimator from the hostcache if we have no sample yet. */
	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
		tp->t_srtt = rtt;
		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
		TCPSTAT_INC(tcps_usedrtt);
		if (metrics.rmx_rttvar) {
			tp->t_rttvar = metrics.rmx_rttvar;
			TCPSTAT_INC(tcps_usedrttvar);
		} else {
			/* default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
		}
		/* Derive the retransmit timeout, clamped to [t_rttmin, REXMTMAX]. */
		TCPT_RANGESET(tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
		    tp->t_rttmin, TCPTV_REXMTMAX);
	}
	if (metrics.rmx_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * the slow start threshhold, but set the
		 * threshold to no less than 2*mss.
		 */
		tp->snd_ssthresh = bsd_max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
		TCPSTAT_INC(tcps_usedssthresh);
	}

	/*
	 * Set the slow-start flight size depending on whether this
	 * is a local network or not.
	 *
	 * Extend this so we cache the cwnd too and retrieve it here.
	 * Make cwnd even bigger than RFC3390 suggests but only if we
	 * have previous experience with the remote host. Be careful
	 * not make cwnd bigger than remote receive window or our own
	 * send socket buffer. Maybe put some additional upper bound
	 * on the retrieved cwnd. Should do incremental updates to
	 * hostcache when cwnd collapses so next connection doesn't
	 * overloads the path again.
	 *
	 * XXXAO: Initializing the CWND from the hostcache is broken
	 * and in its current form not RFC conformant. It is disabled
	 * until fixed or removed entirely.
	 *
	 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
	 * We currently check only in syncache_socket for that.
	 */
/* #define TCP_METRICS_CWND */
#ifdef TCP_METRICS_CWND
	if (metrics.rmx_cwnd)
		tp->snd_cwnd = bsd_max(tp->t_maxseg, bsd_min(metrics.rmx_cwnd / 2,
		    bsd_min(tp->snd_wnd, so->so_snd.sb_hiwat)));
	else
#endif
	/* NB: the #ifdef above chains an `else` onto this `if` when enabled. */
	if (V_tcp_do_rfc3390)
		/* RFC 3390 initial window: min(4*MSS, max(2*MSS, 4380 bytes)). */
		tp->snd_cwnd = bsd_min(4 * tp->t_maxseg,
		    bsd_max(2 * tp->t_maxseg, 4380));
	else if (in_localaddr(inp->inp_faddr))
		tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local;
	else
		tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz;

	if (CC_ALGO(tp)->conn_init != NULL)
		CC_ALGO(tp)->conn_init(tp->ccv);
}
/*
 * Deliver a congestion signal to the connection: update the generic state
 * for the given event and then notify the congestion-control algorithm.
 * `th' may be NULL (e.g. for timer-driven signals such as CC_RTO).
 */
void
cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
	INP_LOCK_ASSERT(tp->t_inpcb);

	switch(type) {
	case CC_NDUPACK:
		/* Duplicate-ACK threshold reached; arm fast recovery. */
		if (!IN_FASTRECOVERY(tp->t_flags)) {
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags & TF_ECN_PERMIT)
				tp->t_flags |= TF_ECN_SND_CWR;
		}
		break;
	case CC_ECN:
		/* ECN congestion-experienced mark from the network. */
		if (!IN_CONGRECOVERY(tp->t_flags)) {
			TCPSTAT_INC(tcps_ecn_rcwnd);
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags & TF_ECN_PERMIT)
				tp->t_flags |= TF_ECN_SND_CWR;
		}
		break;
	case CC_RTO:
		/* Retransmit timeout: restart from one segment of cwnd. */
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		EXIT_RECOVERY(tp->t_flags);
		/* ssthresh = half the flight size, floor of 2 segments. */
		tp->snd_ssthresh = bsd_max(2, bsd_min(tp->snd_wnd, tp->snd_cwnd) / 2 /
		    tp->t_maxseg) * tp->t_maxseg;
		tp->snd_cwnd = tp->t_maxseg;
		break;
	case CC_RTO_ERR:
		TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp->t_flags);
		if (tp->t_flags & TF_WASCRECOVERY)
			ENTER_CONGRECOVERY(tp->t_flags);
		tp->snd_nxt = tp->snd_max;
		tp->t_flags &= ~TF_PREVVALID;
		tp->t_badrxtwin = 0;
		break;
	}

	/* Forward the event to the pluggable congestion-control algorithm. */
	if (CC_ALGO(tp)->cong_signal != NULL) {
		if (th != NULL)
			tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->cong_signal(tp->ccv, type);
	}
}
  341. static void inline
  342. cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
  343. {
  344. INP_LOCK_ASSERT(tp->t_inpcb);
  345. /* XXXLAS: KASSERT that we're in recovery? */
  346. if (CC_ALGO(tp)->post_recovery != NULL) {
  347. tp->ccv->curack = th->th_ack;
  348. CC_ALGO(tp)->post_recovery(tp->ccv);
  349. }
  350. /* XXXLAS: EXIT_RECOVERY ? */
  351. tp->t_bytes_acked = 0;
  352. }
  353. static inline void
  354. tcp_fields_to_host(struct tcphdr *th)
  355. {
  356. th->th_seq = ntohl(th->th_seq);
  357. th->th_ack = ntohl(th->th_ack);
  358. th->th_win = ntohs(th->th_win);
  359. th->th_urp = ntohs(th->th_urp);
  360. }
/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
/* Expands to nothing in this port. */
#define ND6_HINT(tp)

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 *	- there is no delayed ack timer in progress and
 *	- our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window and
 *		- delayed acks are enabled or
 *		- this is a half-synchronized T/TCP connection.
 */
#define DELAY_ACK(tp) \
	((!tcp_timer_active(tp, TT_DELACK) && \
	    (tp->t_flags & TF_RXWIN0SENT) == 0) && \
	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
  375. void
  376. tcp_input(struct mbuf *m, int off0)
  377. {
  378. struct tcphdr *th = NULL;
  379. struct ip *ip = NULL;
  380. struct ipovly *ipov;
  381. struct inpcb *inp = NULL;
  382. struct tcpcb *tp = NULL;
  383. struct socket *so = NULL;
  384. u_char *optp = NULL;
  385. int optlen = 0;
  386. int len;
  387. int tlen = 0, off;
  388. int drop_hdrlen;
  389. int thflags;
  390. int rstreason = 0; /* For badport_bandlim accounting purposes */
  391. uint8_t iptos = 0;
  392. const void *ip6 = NULL;
  393. struct tcpopt to; /* options in this segment */
  394. char *s = NULL; /* address and port logging */
  395. int ti_locked;
  396. #define TI_UNLOCKED 1
  397. #define TI_WLOCKED 2
  398. #ifdef TCPDEBUG
  399. /*
  400. * The size of tcp_saveipgen must be the size of the max ip header,
  401. * now IPv6.
  402. */
  403. u_char tcp_saveipgen[IP6_HDR_LEN];
  404. struct tcphdr tcp_savetcp;
  405. short ostate = 0;
  406. #endif
  407. to.to_flags = 0;
  408. TCPSTAT_INC(tcps_rcvtotal);
  409. /*
  410. * Get IP and TCP header together in first mbuf.
  411. * Note: IP leaves IP header in first mbuf.
  412. */
  413. if (off0 > sizeof (struct ip)) {
  414. ip_stripoptions(m, (struct mbuf *)0);
  415. off0 = sizeof(struct ip);
  416. }
  417. if (m->m_hdr.mh_len < sizeof (struct tcpiphdr)) {
  418. if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
  419. == NULL) {
  420. TCPSTAT_INC(tcps_rcvshort);
  421. return;
  422. }
  423. }
  424. ip = mtod(m, struct ip *);
  425. ipov = (struct ipovly *)ip;
  426. th = (struct tcphdr *)((caddr_t)ip + off0);
  427. tlen = ip->ip_len;
  428. if (m->M_dat.MH.MH_pkthdr.csum_flags & CSUM_DATA_VALID) {
  429. if (m->M_dat.MH.MH_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
  430. th->th_sum = m->M_dat.MH.MH_pkthdr.csum_data;
  431. else
  432. th->th_sum = in_pseudo(ip->ip_src.s_addr,
  433. ip->ip_dst.s_addr,
  434. htonl(m->M_dat.MH.MH_pkthdr.csum_data +
  435. ip->ip_len +
  436. IPPROTO_TCP));
  437. th->th_sum ^= 0xffff;
  438. #ifdef TCPDEBUG
  439. ipov->ih_len = (u_short)tlen;
  440. ipov->ih_len = htons(ipov->ih_len);
  441. #endif
  442. } else {
  443. /*
  444. * Checksum extended TCP header and data.
  445. */
  446. len = sizeof (struct ip) + tlen;
  447. bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
  448. ipov->ih_len = (u_short)tlen;
  449. ipov->ih_len = htons(ipov->ih_len);
  450. th->th_sum = in_cksum(m, len);
  451. }
  452. if (th->th_sum) {
  453. TCPSTAT_INC(tcps_rcvbadsum);
  454. goto drop;
  455. }
  456. /* Re-initialization for later version check */
  457. ip->ip_v = IPVERSION;
  458. iptos = ip->ip_tos;
  459. /*
  460. * Check that TCP offset makes sense,
  461. * pull out TCP options and adjust length. XXX
  462. */
  463. off = th->th_off << 2;
  464. if (off < sizeof (struct tcphdr) || off > tlen) {
  465. TCPSTAT_INC(tcps_rcvbadoff);
  466. goto drop;
  467. }
  468. tlen -= off; /* tlen is used instead of ti->ti_len */
  469. if (off > sizeof (struct tcphdr)) {
  470. if (m->m_hdr.mh_len < sizeof(struct ip) + off) {
  471. if ((m = m_pullup(m, sizeof (struct ip) + off))
  472. == NULL) {
  473. TCPSTAT_INC(tcps_rcvshort);
  474. return;
  475. }
  476. ip = mtod(m, struct ip *);
  477. ipov = (struct ipovly *)ip;
  478. th = (struct tcphdr *)((caddr_t)ip + off0);
  479. }
  480. optlen = off - sizeof (struct tcphdr);
  481. optp = (u_char *)(th + 1);
  482. }
  483. thflags = th->th_flags;
  484. /*
  485. * Convert TCP protocol specific fields to host format.
  486. */
  487. tcp_fields_to_host(th);
  488. /*
  489. * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
  490. */
  491. drop_hdrlen = off0 + off;
  492. /*
  493. * Locate pcb for segment; if we're likely to add or remove a
  494. * connection then first acquire pcbinfo lock. There are two cases
  495. * where we might discover later we need a write lock despite the
  496. * flags: ACKs moving a connection out of the syncache, and ACKs for
  497. * a connection in TIMEWAIT.
  498. */
  499. if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) {
  500. INP_INFO_WLOCK(&V_tcbinfo);
  501. ti_locked = TI_WLOCKED;
  502. } else
  503. ti_locked = TI_UNLOCKED;
  504. findpcb:
  505. #ifdef INVARIANTS
  506. if (ti_locked == TI_WLOCKED) {
  507. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  508. } else {
  509. INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  510. }
  511. #endif
  512. inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
  513. th->th_sport, ip->ip_dst, th->th_dport,
  514. INPLOOKUP_WILDCARD | INPLOOKUP_LOCKPCB,
  515. m->M_dat.MH.MH_pkthdr.rcvif, m);
  516. /*
  517. * If the INPCB does not exist then all data in the incoming
  518. * segment is discarded and an appropriate RST is sent back.
  519. * XXX MRT Send RST using which routing table?
  520. */
  521. if (inp == NULL) {
  522. /*
  523. * Log communication attempts to ports that are not
  524. * in use.
  525. */
  526. if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
  527. tcp_log_in_vain == 2) {
  528. if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6)))
  529. bsd_log(LOG_INFO, "%s; %s: Connection attempt "
  530. "to closed port\n", s, __func__);
  531. }
  532. /*
  533. * When blackholing do not respond with a RST but
  534. * completely ignore the segment and drop it.
  535. */
  536. if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
  537. V_blackhole == 2)
  538. goto dropunlock;
  539. rstreason = BANDLIM_RST_CLOSEDPORT;
  540. goto dropwithreset;
  541. }
  542. INP_LOCK_ASSERT(inp);
  543. if (!(inp->inp_flags & INP_HW_FLOWID)
  544. && (m->m_hdr.mh_flags & M_FLOWID)
  545. && ((inp->inp_socket == NULL)
  546. || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) {
  547. inp->inp_flags |= INP_HW_FLOWID;
  548. inp->inp_flags &= ~INP_SW_FLOWID;
  549. inp->inp_flowid = m->M_dat.MH.MH_pkthdr.flowid;
  550. }
  551. /*
  552. * Check the minimum TTL for socket.
  553. */
  554. if (inp->inp_ip_minttl != 0) {
  555. if (inp->inp_ip_minttl > ip->ip_ttl)
  556. goto dropunlock;
  557. }
  558. /*
  559. * A previous connection in TIMEWAIT state is supposed to catch stray
  560. * or duplicate segments arriving late. If this segment was a
  561. * legitimate new connection attempt the old INPCB gets removed and
  562. * we can try again to find a listening socket.
  563. *
  564. * At this point, due to earlier optimism, we may hold only an inpcb
  565. * lock, and not the inpcbinfo write lock. If so, we need to try to
  566. * acquire it, or if that fails, acquire a reference on the inpcb,
  567. * drop all locks, acquire a global write lock, and then re-acquire
  568. * the inpcb lock. We may at that point discover that another thread
  569. * has tried to free the inpcb, in which case we need to loop back
  570. * and try to find a new inpcb to deliver to.
  571. *
  572. * XXXRW: It may be time to rethink timewait locking.
  573. */
  574. relocked:
  575. if (inp->inp_flags & INP_TIMEWAIT) {
  576. if (ti_locked == TI_UNLOCKED) {
  577. if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
  578. in_pcbref(inp);
  579. INP_UNLOCK(inp);
  580. INP_INFO_WLOCK(&V_tcbinfo);
  581. ti_locked = TI_WLOCKED;
  582. INP_LOCK(inp);
  583. if (in_pcbrele_locked(inp)) {
  584. inp = NULL;
  585. goto findpcb;
  586. }
  587. } else
  588. ti_locked = TI_WLOCKED;
  589. }
  590. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  591. if (thflags & TH_SYN)
  592. tcp_dooptions(&to, optp, optlen, TO_SYN);
  593. /*
  594. * NB: tcp_twcheck unlocks the INP and frees the mbuf.
  595. */
  596. if (tcp_twcheck(inp, &to, th, m, tlen))
  597. goto findpcb;
  598. INP_INFO_WUNLOCK(&V_tcbinfo);
  599. return;
  600. }
  601. /*
  602. * The TCPCB may no longer exist if the connection is winding
  603. * down or it is in the CLOSED state. Either way we drop the
  604. * segment and send an appropriate response.
  605. */
  606. tp = intotcpcb(inp);
  607. if (tp == NULL || tp->get_state() == TCPS_CLOSED) {
  608. rstreason = BANDLIM_RST_CLOSEDPORT;
  609. goto dropwithreset;
  610. }
  611. // We may be processing a FIN here, process all preceding
  612. // normal packets first.
  613. tcp_flush_net_channel(tp);
  614. /*
  615. * We've identified a valid inpcb, but it could be that we need an
  616. * inpcbinfo write lock but don't hold it. In this case, attempt to
  617. * acquire using the same strategy as the TIMEWAIT case above. If we
  618. * relock, we have to jump back to 'relocked' as the connection might
  619. * now be in TIMEWAIT.
  620. */
  621. #ifdef INVARIANTS
  622. if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0)
  623. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  624. #endif
  625. if (tp->get_state() != TCPS_ESTABLISHED) {
  626. if (ti_locked == TI_UNLOCKED) {
  627. if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
  628. in_pcbref(inp);
  629. INP_UNLOCK(inp);
  630. INP_INFO_WLOCK(&V_tcbinfo);
  631. ti_locked = TI_WLOCKED;
  632. INP_LOCK(inp);
  633. if (in_pcbrele_locked(inp)) {
  634. inp = NULL;
  635. goto findpcb;
  636. }
  637. goto relocked;
  638. } else
  639. ti_locked = TI_WLOCKED;
  640. }
  641. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  642. }
  643. so = inp->inp_socket;
  644. KASSERT(so != NULL, ("%s: so == NULL", __func__));
  645. #ifdef TCPDEBUG
  646. if (so->so_options & SO_DEBUG) {
  647. ostate = tp->get_state();
  648. bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
  649. tcp_savetcp = *th;
  650. }
  651. #endif /* TCPDEBUG */
  652. /*
  653. * When the socket is accepting connections (the INPCB is in LISTEN
  654. * state) we look into the SYN cache if this is a new connection
  655. * attempt or the completion of a previous one. Because listen
  656. * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be
  657. * held in this case.
  658. */
  659. if (so->so_options & SO_ACCEPTCONN) {
  660. struct in_conninfo inc;
  661. KASSERT(tp->get_state() == TCPS_LISTEN, ("%s: so accepting but "
  662. "tp not listening", __func__));
  663. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  664. bzero(&inc, sizeof(inc));
  665. {
  666. inc.inc_faddr = ip->ip_src;
  667. inc.inc_laddr = ip->ip_dst;
  668. }
  669. inc.inc_fport = th->th_sport;
  670. inc.inc_lport = th->th_dport;
  671. inc.inc_fibnum = so->so_fibnum;
  672. /*
  673. * Check for an existing connection attempt in syncache if
  674. * the flag is only ACK. A successful lookup creates a new
  675. * socket appended to the listen queue in SYN_RECEIVED state.
  676. */
  677. if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
  678. /*
  679. * Parse the TCP options here because
  680. * syncookies need access to the reflected
  681. * timestamp.
  682. */
  683. tcp_dooptions(&to, optp, optlen, 0);
  684. /*
  685. * NB: syncache_expand() doesn't unlock
  686. * inp and tcpinfo locks.
  687. */
  688. if (!syncache_expand(&inc, &to, th, &so, m)) {
  689. /*
  690. * No syncache entry or ACK was not
  691. * for our SYN/ACK. Send a RST.
  692. * NB: syncache did its own logging
  693. * of the failure cause.
  694. */
  695. rstreason = BANDLIM_RST_OPENPORT;
  696. goto dropwithreset;
  697. }
  698. if (so == NULL) {
  699. /*
  700. * We completed the 3-way handshake
  701. * but could not allocate a socket
  702. * either due to memory shortage,
  703. * listen queue length limits or
  704. * global socket limits. Send RST
  705. * or wait and have the remote end
  706. * retransmit the ACK for another
  707. * try.
  708. */
  709. if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  710. bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
  711. "Socket allocation failed due to "
  712. "limits or memory shortage, %s\n",
  713. s, __func__,
  714. V_tcp_sc_rst_sock_fail ?
  715. "sending RST" : "try again");
  716. if (V_tcp_sc_rst_sock_fail) {
  717. rstreason = BANDLIM_UNLIMITED;
  718. goto dropwithreset;
  719. } else
  720. goto dropunlock;
  721. }
  722. /*
  723. * Socket is created in state SYN_RECEIVED.
  724. * Unlock the listen socket, lock the newly
  725. * created socket and update the tp variable.
  726. */
  727. INP_UNLOCK(inp); /* listen socket */
  728. inp = sotoinpcb(so);
  729. INP_LOCK(inp); /* new connection */
  730. tp = intotcpcb(inp);
  731. KASSERT(tp->get_state() == TCPS_SYN_RECEIVED,
  732. ("%s: ", __func__));
  733. /*
  734. * Process the segment and the data it
  735. * contains. tcp_do_segment() consumes
  736. * the mbuf chain.
  737. */
  738. bool want_close;
  739. tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
  740. iptos, ti_locked, want_close);
  741. INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  742. // if tcp_close() indeed closes, it also unlocks
  743. if (!want_close || tcp_close(tp)) {
  744. INP_UNLOCK(inp);
  745. }
  746. return;
  747. }
  748. /*
  749. * Segment flag validation for new connection attempts:
  750. *
  751. * Our (SYN|ACK) response was rejected.
  752. * Check with syncache and remove entry to prevent
  753. * retransmits.
  754. *
  755. * NB: syncache_chkrst does its own logging of failure
  756. * causes.
  757. */
  758. if (thflags & TH_RST) {
  759. syncache_chkrst(&inc, th);
  760. goto dropunlock;
  761. }
  762. /*
  763. * We can't do anything without SYN.
  764. */
  765. if ((thflags & TH_SYN) == 0) {
  766. if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  767. bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
  768. "SYN is missing, segment ignored\n",
  769. s, __func__);
  770. TCPSTAT_INC(tcps_badsyn);
  771. goto dropunlock;
  772. }
  773. /*
  774. * (SYN|ACK) is bogus on a listen socket.
  775. */
  776. if (thflags & TH_ACK) {
  777. if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  778. bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
  779. "SYN|ACK invalid, segment rejected\n",
  780. s, __func__);
  781. syncache_badack(&inc); /* XXX: Not needed! */
  782. TCPSTAT_INC(tcps_badsyn);
  783. rstreason = BANDLIM_RST_OPENPORT;
  784. goto dropwithreset;
  785. }
  786. /*
  787. * If the drop_synfin option is enabled, drop all
  788. * segments with both the SYN and FIN bits set.
  789. * This prevents e.g. nmap from identifying the
  790. * TCP/IP stack.
  791. * XXX: Poor reasoning. nmap has other methods
  792. * and is constantly refining its stack detection
  793. * strategies.
  794. * XXX: This is a violation of the TCP specification
  795. * and was used by RFC1644.
  796. */
  797. if ((thflags & TH_FIN) && V_drop_synfin) {
  798. if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  799. bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
  800. "SYN|FIN segment ignored (based on "
  801. "sysctl setting)\n", s, __func__);
  802. TCPSTAT_INC(tcps_badsyn);
  803. goto dropunlock;
  804. }
  805. /*
  806. * Segment's flags are (SYN) or (SYN|FIN).
  807. *
  808. * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
  809. * as they do not affect the state of the TCP FSM.
  810. * The data pointed to by TH_URG and th_urp is ignored.
  811. */
  812. KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
  813. ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
  814. KASSERT(thflags & (TH_SYN),
  815. ("%s: Listen socket: TH_SYN not set", __func__));
  816. /*
  817. * Basic sanity checks on incoming SYN requests:
  818. * Don't respond if the destination is a link layer
  819. * broadcast according to RFC1122 4.2.3.10, p. 104.
  820. * If it is from this socket it must be forged.
  821. * Don't respond if the source or destination is a
  822. * global or subnet broad- or multicast address.
  823. * Note that it is quite possible to receive unicast
  824. * link-layer packets with a broadcast IP address. Use
  825. * in_broadcast() to find them.
  826. */
  827. if (m->m_hdr.mh_flags & (M_BCAST|M_MCAST)) {
  828. if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  829. bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
  830. "Connection attempt from broad- or multicast "
  831. "link layer address ignored\n", s, __func__);
  832. goto dropunlock;
  833. }
  834. if (th->th_dport == th->th_sport &&
  835. ip->ip_dst.s_addr == ip->ip_src.s_addr) {
  836. if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  837. bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
  838. "Connection attempt from/to self "
  839. "ignored\n", s, __func__);
  840. goto dropunlock;
  841. }
  842. if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
  843. IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
  844. ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
  845. in_broadcast(ip->ip_dst, m->M_dat.MH.MH_pkthdr.rcvif)) {
  846. if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  847. bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
  848. "Connection attempt from/to broad- "
  849. "or multicast address ignored\n",
  850. s, __func__);
  851. goto dropunlock;
  852. }
  853. /*
  854. * SYN appears to be valid. Create compressed TCP state
  855. * for syncache.
  856. */
  857. #ifdef TCPDEBUG
  858. if (so->so_options & SO_DEBUG)
  859. tcp_trace(TA_INPUT, ostate, tp,
  860. (void *)tcp_saveipgen, &tcp_savetcp, 0);
  861. #endif
  862. tcp_dooptions(&to, optp, optlen, TO_SYN);
  863. syncache_add(&inc, &to, th, inp, &so, m);
  864. /*
  865. * Entry added to syncache and mbuf consumed.
  866. * Everything already unlocked by syncache_add().
  867. */
  868. INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  869. return;
  870. }
  871. /*
  872. * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
  873. * state. tcp_do_segment() always consumes the mbuf chain and unlocks pcbinfo.
  874. */
  875. bool want_close;
  876. tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked, want_close);
  877. INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  878. // if tcp_close() indeed closes, it also unlocks
  879. if (!want_close || tcp_close(tp)) {
  880. INP_UNLOCK(inp);
  881. }
  882. return;
  883. dropwithreset:
  884. if (ti_locked == TI_WLOCKED) {
  885. INP_INFO_WUNLOCK(&V_tcbinfo);
  886. ti_locked = TI_UNLOCKED;
  887. }
  888. #ifdef INVARIANTS
  889. else {
  890. KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset "
  891. "ti_locked: %d", __func__, ti_locked));
  892. INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  893. }
  894. #endif
  895. if (inp != NULL) {
  896. tcp_dropwithreset(m, th, tp, tlen, rstreason);
  897. INP_UNLOCK(inp);
  898. } else
  899. tcp_dropwithreset(m, th, NULL, tlen, rstreason);
  900. m = NULL; /* mbuf chain got consumed. */
  901. goto drop;
  902. dropunlock:
  903. if (ti_locked == TI_WLOCKED) {
  904. INP_INFO_WUNLOCK(&V_tcbinfo);
  905. ti_locked = TI_UNLOCKED;
  906. }
  907. #ifdef INVARIANTS
  908. else {
  909. KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock "
  910. "ti_locked: %d", __func__, ti_locked));
  911. INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  912. }
  913. #endif
  914. if (inp != NULL)
  915. INP_UNLOCK(inp);
  916. drop:
  917. INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  918. if (s != NULL)
  919. free(s);
  920. if (m != NULL)
  921. m_freem(m);
  922. }
  923. static void
  924. tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
  925. struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
  926. int ti_locked, bool& want_close)
  927. {
  928. int thflags, acked, ourfinisacked, needoutput = 0;
  929. int rstreason, todrop, win;
  930. u_long tiwin;
  931. struct tcpopt to;
  932. auto inp = tp->t_inpcb;
  933. want_close = false;
  934. #ifdef TCPDEBUG
  935. /*
  936. * The size of tcp_saveipgen must be the size of the max ip header,
  937. * now IPv6.
  938. */
  939. u_char tcp_saveipgen[IP6_HDR_LEN];
  940. struct tcphdr tcp_savetcp;
  941. short ostate = 0;
  942. #endif
  943. thflags = th->th_flags;
  944. tp->sackhint.last_sack_ack = tcp_seq(0);
  945. /*
  946. * If this is either a state-changing packet or current state isn't
  947. * established, we require a write lock on tcbinfo. Otherwise, we
  948. * allow either a read lock or a write lock, as we may have acquired
  949. * a write lock due to a race.
  950. *
  951. * Require a global write lock for SYN/FIN/RST segments or
  952. * non-established connections; otherwise accept either a read or
  953. * write lock, as we may have conservatively acquired a write lock in
  954. * certain cases in tcp_input() (is this still true?). Currently we
  955. * will never enter with no lock, so we try to drop it quickly in the
  956. * common pure ack/pure data cases.
  957. *
  958. * net channels process packets without the lock, so try to acquire it.
  959. * if we fail, drop the packet. FIXME: invert the lock order so we don't
  960. * have to drop packets.
  961. */
  962. if (tp->get_state() != TCPS_ESTABLISHED && ti_locked == TI_UNLOCKED) {
  963. if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
  964. ti_locked = TI_WLOCKED;
  965. } else {
  966. goto drop;
  967. }
  968. }
  969. if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
  970. tp->get_state() != TCPS_ESTABLISHED) {
  971. KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
  972. "SYN/FIN/RST/!EST", __func__, ti_locked));
  973. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  974. } else {
  975. #ifdef INVARIANTS
  976. if (ti_locked == TI_WLOCKED)
  977. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  978. else {
  979. KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
  980. "ti_locked: %d", __func__, ti_locked));
  981. INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  982. }
  983. #endif
  984. }
  985. INP_LOCK_ASSERT(tp->t_inpcb);
  986. KASSERT(tp->get_state() > TCPS_LISTEN, ("%s: TCPS_LISTEN",
  987. __func__));
  988. KASSERT(tp->get_state() != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
  989. __func__));
  990. /*
  991. * Segment received on connection.
  992. * Reset idle time and keep-alive timer.
  993. * XXX: This should be done after segment
  994. * validation to ignore broken/spoofed segs.
  995. */
  996. tp->t_rcvtime = bsd_ticks;
  997. if (TCPS_HAVEESTABLISHED(tp->get_state()))
  998. tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
  999. /*
  1000. * Unscale the window into a 32-bit value.
  1001. * For the SYN_SENT state the scale is zero.
  1002. */
  1003. tiwin = th->th_win << tp->snd_scale;
  1004. /*
  1005. * TCP ECN processing.
  1006. */
  1007. if (tp->t_flags & TF_ECN_PERMIT) {
  1008. if (thflags & TH_CWR)
  1009. tp->t_flags &= ~TF_ECN_SND_ECE;
  1010. switch (iptos & IPTOS_ECN_MASK) {
  1011. case IPTOS_ECN_CE:
  1012. tp->t_flags |= TF_ECN_SND_ECE;
  1013. TCPSTAT_INC(tcps_ecn_ce);
  1014. break;
  1015. case IPTOS_ECN_ECT0:
  1016. TCPSTAT_INC(tcps_ecn_ect0);
  1017. break;
  1018. case IPTOS_ECN_ECT1:
  1019. TCPSTAT_INC(tcps_ecn_ect1);
  1020. break;
  1021. }
  1022. /* Congestion experienced. */
  1023. if (thflags & TH_ECE) {
  1024. cc_cong_signal(tp, th, CC_ECN);
  1025. }
  1026. }
  1027. /*
  1028. * Parse options on any incoming segment.
  1029. */
  1030. tcp_dooptions(&to, (u_char *)(th + 1),
  1031. (th->th_off << 2) - sizeof(struct tcphdr),
  1032. (thflags & TH_SYN) ? TO_SYN : 0);
  1033. /*
  1034. * If echoed timestamp is later than the current time,
  1035. * fall back to non RFC1323 RTT calculation. Normalize
  1036. * timestamp if syncookies were used when this connection
  1037. * was established.
  1038. */
  1039. if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
  1040. to.to_tsecr -= tp->ts_offset;
  1041. if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
  1042. to.to_tsecr = 0;
  1043. }
  1044. /*
  1045. * Process options only when we get SYN/ACK back. The SYN case
  1046. * for incoming connections is handled in tcp_syncache.
  1047. * According to RFC1323 the window field in a SYN (i.e., a <SYN>
  1048. * or <SYN,ACK>) segment itself is never scaled.
  1049. * XXX this is traditional behavior, may need to be cleaned up.
  1050. */
  1051. if (tp->get_state() == TCPS_SYN_SENT && (thflags & TH_SYN)) {
  1052. if ((to.to_flags & TOF_SCALE) &&
  1053. (tp->t_flags & TF_REQ_SCALE)) {
  1054. tp->t_flags |= TF_RCVD_SCALE;
  1055. tp->snd_scale = to.to_wscale;
  1056. }
  1057. /*
  1058. * Initial send window. It will be updated with
  1059. * the next incoming segment to the scaled value.
  1060. */
  1061. tp->snd_wnd = th->th_win;
  1062. if (to.to_flags & TOF_TS) {
  1063. tp->t_flags |= TF_RCVD_TSTMP;
  1064. tp->ts_recent = to.to_tsval;
  1065. tp->ts_recent_age = tcp_ts_getticks();
  1066. }
  1067. if (to.to_flags & TOF_MSS)
  1068. tcp_mss(tp, to.to_mss);
  1069. if ((tp->t_flags & TF_SACK_PERMIT) &&
  1070. (to.to_flags & TOF_SACKPERM) == 0)
  1071. tp->t_flags &= ~TF_SACK_PERMIT;
  1072. }
  1073. /*
  1074. * Header prediction: check for the two common cases
  1075. * of a uni-directional data xfer. If the packet has
  1076. * no control flags, is in-sequence, the window didn't
  1077. * change and we're not retransmitting, it's a
  1078. * candidate. If the length is zero and the ack moved
  1079. * forward, we're the sender side of the xfer. Just
  1080. * free the data acked & wake any higher level process
  1081. * that was blocked waiting for space. If the length
  1082. * is non-zero and the ack didn't move, we're the
  1083. * receiver side. If we're getting packets in-order
  1084. * (the reassembly queue is empty), add the data to
  1085. * the socket buffer and note that we need a delayed ack.
  1086. * Make sure that the hidden state-flags are also off.
  1087. * Since we check for TCPS_ESTABLISHED first, it can only
  1088. * be TH_NEEDSYN.
  1089. */
  1090. if (tp->get_state() == TCPS_ESTABLISHED &&
  1091. th->th_seq == tp->rcv_nxt &&
  1092. (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
  1093. tp->snd_nxt == tp->snd_max &&
  1094. tiwin && tiwin == tp->snd_wnd &&
  1095. ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
  1096. LIST_EMPTY(&tp->t_segq) &&
  1097. ((to.to_flags & TOF_TS) == 0 ||
  1098. TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
  1099. /*
  1100. * If last ACK falls within this segment's sequence numbers,
  1101. * record the timestamp.
  1102. * NOTE that the test is modified according to the latest
  1103. * proposal of the tcplw@cray.com list (Braden 1993/04/26).
  1104. */
  1105. if ((to.to_flags & TOF_TS) != 0 &&
  1106. th->th_seq <= tp->last_ack_sent) {
  1107. tp->ts_recent_age = tcp_ts_getticks();
  1108. tp->ts_recent = to.to_tsval;
  1109. }
  1110. if (tlen == 0) {
  1111. if (th->th_ack > tp->snd_una &&
  1112. th->th_ack <= tp->snd_max &&
  1113. !IN_RECOVERY(tp->t_flags) &&
  1114. (to.to_flags & TOF_SACK) == 0 &&
  1115. TAILQ_EMPTY(&tp->snd_holes)) {
  1116. /*
  1117. * This is a pure ack for outstanding data.
  1118. */
  1119. if (ti_locked == TI_WLOCKED)
  1120. INP_INFO_WUNLOCK(&V_tcbinfo);
  1121. ti_locked = TI_UNLOCKED;
  1122. TCPSTAT_INC(tcps_predack);
  1123. /*
  1124. * "bad retransmit" recovery.
  1125. */
  1126. if (tp->t_rxtshift == 1 &&
  1127. tp->t_flags & TF_PREVVALID &&
  1128. (int)(bsd_ticks - tp->t_badrxtwin) < 0) {
  1129. cc_cong_signal(tp, th, CC_RTO_ERR);
  1130. }
  1131. /*
  1132. * Recalculate the transmit timer / rtt.
  1133. *
  1134. * Some boxes send broken timestamp replies
  1135. * during the SYN+ACK phase, ignore
  1136. * timestamps of 0 or we could calculate a
  1137. * huge RTT and blow up the retransmit timer.
  1138. */
  1139. if ((to.to_flags & TOF_TS) != 0 &&
  1140. to.to_tsecr) {
  1141. u_int t;
  1142. t = tcp_ts_getticks() - to.to_tsecr;
  1143. if (!tp->t_rttlow || tp->t_rttlow > t)
  1144. tp->t_rttlow = t;
  1145. tcp_xmit_timer(tp,
  1146. TCP_TS_TO_TICKS(t) + 1);
  1147. } else if (tp->t_rtttime &&
  1148. th->th_ack > tp->t_rtseq) {
  1149. if (!tp->t_rttlow ||
  1150. tp->t_rttlow > bsd_ticks - tp->t_rtttime)
  1151. tp->t_rttlow = bsd_ticks - tp->t_rtttime;
  1152. tcp_xmit_timer(tp,
  1153. bsd_ticks - tp->t_rtttime);
  1154. }
  1155. acked = BYTES_THIS_ACK(tp, th);
  1156. TCPSTAT_INC(tcps_rcvackpack);
  1157. TCPSTAT_ADD(tcps_rcvackbyte, acked);
  1158. sbdrop_locked(so, &so->so_snd, acked);
  1159. if (tp->snd_una > tp->snd_recover &&
  1160. th->th_ack <= tp->snd_recover)
  1161. tp->snd_recover = th->th_ack - 1;
  1162. /*
  1163. * Let the congestion control algorithm update
  1164. * congestion control related information. This
  1165. * typically means increasing the congestion
  1166. * window.
  1167. */
  1168. cc_ack_received(tp, th, CC_ACK);
  1169. tp->snd_una = th->th_ack;
  1170. /*
  1171. * Pull snd_wl2 up to prevent seq wrap relative
  1172. * to th_ack.
  1173. */
  1174. tp->snd_wl2 = th->th_ack;
  1175. tp->t_dupacks = 0;
  1176. m_freem(m);
  1177. ND6_HINT(tp); /* Some progress has been made. */
  1178. /*
  1179. * If all outstanding data are acked, stop
  1180. * retransmit timer, otherwise restart timer
  1181. * using current (possibly backed-off) value.
  1182. * If process is waiting for space,
  1183. * wakeup/selwakeup/signal. If data
  1184. * are ready to send, let tcp_output
  1185. * decide between more output or persist.
  1186. */
  1187. #ifdef TCPDEBUG
  1188. if (so->so_options & SO_DEBUG)
  1189. tcp_trace(TA_INPUT, ostate, tp,
  1190. (void *)tcp_saveipgen,
  1191. &tcp_savetcp, 0);
  1192. #endif
  1193. if (tp->snd_una == tp->snd_max)
  1194. tcp_timer_activate(tp, TT_REXMT, 0);
  1195. else if (!tcp_timer_active(tp, TT_PERSIST))
  1196. tcp_timer_activate(tp, TT_REXMT,
  1197. tp->t_rxtcur);
  1198. sowwakeup_locked(so);
  1199. if (so->so_snd.sb_cc)
  1200. (void) tcp_output(tp);
  1201. goto check_delack;
  1202. }
  1203. } else if (th->th_ack == tp->snd_una &&
  1204. tlen <= sbspace(&so->so_rcv)) {
  1205. int newsize = 0; /* automatic sockbuf scaling */
  1206. /*
  1207. * This is a pure, in-sequence data packet with
  1208. * nothing on the reassembly queue and we have enough
  1209. * buffer space to take it.
  1210. */
  1211. if (ti_locked == TI_WLOCKED)
  1212. INP_INFO_WUNLOCK(&V_tcbinfo);
  1213. ti_locked = TI_UNLOCKED;
  1214. /* Clean receiver SACK report if present */
  1215. if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
  1216. tcp_clean_sackreport(tp);
  1217. TCPSTAT_INC(tcps_preddat);
  1218. tp->rcv_nxt += tlen;
  1219. /*
  1220. * Pull snd_wl1 up to prevent seq wrap relative to
  1221. * th_seq.
  1222. */
  1223. tp->snd_wl1 = th->th_seq;
  1224. /*
  1225. * Pull rcv_up up to prevent seq wrap relative to
  1226. * rcv_nxt.
  1227. */
  1228. tp->rcv_up = tp->rcv_nxt;
  1229. TCPSTAT_INC(tcps_rcvpack);
  1230. TCPSTAT_ADD(tcps_rcvbyte, tlen);
  1231. ND6_HINT(tp); /* Some progress has been made */
  1232. #ifdef TCPDEBUG
  1233. if (so->so_options & SO_DEBUG)
  1234. tcp_trace(TA_INPUT, ostate, tp,
  1235. (void *)tcp_saveipgen, &tcp_savetcp, 0);
  1236. #endif
  1237. /*
  1238. * Automatic sizing of receive socket buffer. Often the send
  1239. * buffer size is not optimally adjusted to the actual network
  1240. * conditions at hand (delay bandwidth product). Setting the
  1241. * buffer size too small limits throughput on links with high
  1242. * bandwidth and high delay (eg. trans-continental/oceanic links).
  1243. *
  1244. * On the receive side the socket buffer memory is only rarely
  1245. * used to any significant extent. This allows us to be much
  1246. * more aggressive in scaling the receive socket buffer. For
  1247. * the case that the buffer space is actually used to a large
  1248. * extent and we run out of kernel memory we can simply drop
  1249. * the new segments; TCP on the sender will just retransmit it
  1250. * later. Setting the buffer size too big may only consume too
  1251. * much kernel memory if the application doesn't read() from
  1252. * the socket or packet loss or reordering makes use of the
  1253. * reassembly queue.
  1254. *
  1255. * The criteria to step up the receive buffer one notch are:
  1256. * 1. the number of bytes received during the time it takes
  1257. * one timestamp to be reflected back to us (the RTT);
  1258. * 2. received bytes per RTT is within seven eighth of the
  1259. * current socket buffer size;
  1260. * 3. receive buffer size has not hit maximal automatic size;
  1261. *
  1262. * This algorithm does one step per RTT at most and only if
  1263. * we receive a bulk stream w/o packet losses or reorderings.
  1264. * Shrinking the buffer during idle times is not necessary as
  1265. * it doesn't consume any memory when idle.
  1266. *
  1267. * TODO: Only step up if the application is actually serving
  1268. * the buffer to better manage the socket buffer resources.
  1269. */
  1270. if (V_tcp_do_autorcvbuf &&
  1271. to.to_tsecr &&
  1272. (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
  1273. if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
  1274. to.to_tsecr - tp->rfbuf_ts < hz) {
  1275. if (tp->rfbuf_cnt >
  1276. (so->so_rcv.sb_hiwat / 8 * 7) &&
  1277. so->so_rcv.sb_hiwat <
  1278. V_tcp_autorcvbuf_max) {
  1279. newsize =
  1280. bsd_min(so->so_rcv.sb_hiwat +
  1281. V_tcp_autorcvbuf_inc,
  1282. V_tcp_autorcvbuf_max);
  1283. }
  1284. /* Start over with next RTT. */
  1285. tp->rfbuf_ts = 0;
  1286. tp->rfbuf_cnt = 0;
  1287. } else
  1288. tp->rfbuf_cnt += tlen; /* add up */
  1289. }
  1290. /* Add data to socket buffer. */
  1291. SOCK_LOCK_ASSERT(so);
  1292. if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
  1293. m_freem(m);
  1294. } else {
  1295. /*
  1296. * Set new socket buffer size.
  1297. * Give up when limit is reached.
  1298. */
  1299. if (newsize)
  1300. if (!sbreserve_locked(&so->so_rcv,
  1301. newsize, so, NULL))
  1302. so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
  1303. m_adj(m, drop_hdrlen); /* delayed header drop */
  1304. sbappendstream_locked(so, &so->so_rcv, m);
  1305. }
  1306. sorwakeup_locked(so);
  1307. if (DELAY_ACK(tp)) {
  1308. tp->t_flags |= TF_DELACK;
  1309. } else {
  1310. tp->t_flags |= TF_ACKNOW;
  1311. tcp_output(tp);
  1312. }
  1313. goto check_delack;
  1314. }
  1315. }
  1316. /*
  1317. * Calculate amount of space in receive window,
  1318. * and then do TCP input processing.
  1319. * Receive window is amount of space in rcv queue,
  1320. * but not less than advertised window.
  1321. */
  1322. win = sbspace(&so->so_rcv);
  1323. if (win < 0)
  1324. win = 0;
  1325. tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
  1326. /* Reset receive buffer auto scaling when not in bulk receive mode. */
  1327. tp->rfbuf_ts = 0;
  1328. tp->rfbuf_cnt = 0;
  1329. switch (tp->get_state()) {
  1330. /*
  1331. * If the state is SYN_RECEIVED:
  1332. * if seg contains an ACK, but not for our SYN/ACK, send a RST.
  1333. */
  1334. case TCPS_SYN_RECEIVED:
  1335. if ((thflags & TH_ACK) &&
  1336. (th->th_ack <= tp->snd_una ||
  1337. th->th_ack > tp->snd_max)) {
  1338. rstreason = BANDLIM_RST_OPENPORT;
  1339. goto dropwithreset;
  1340. }
  1341. break;
  1342. /*
  1343. * If the state is SYN_SENT:
  1344. * if seg contains an ACK, but not for our SYN, drop the input.
  1345. * if seg contains a RST, then drop the connection.
  1346. * if seg does not contain SYN, then drop it.
  1347. * Otherwise this is an acceptable SYN segment
  1348. * initialize tp->rcv_nxt and tp->irs
  1349. * if seg contains ack then advance tp->snd_una
  1350. * if seg contains an ECE and ECN support is enabled, the stream
  1351. * is ECN capable.
  1352. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
  1353. * arrange for segment to be acked (eventually)
  1354. * continue processing rest of data/controls, beginning with URG
  1355. */
  1356. case TCPS_SYN_SENT:
  1357. if ((thflags & TH_ACK) &&
  1358. (th->th_ack <= tp->iss ||
  1359. th->th_ack > tp->snd_max)) {
  1360. rstreason = BANDLIM_UNLIMITED;
  1361. goto dropwithreset;
  1362. }
  1363. if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
  1364. tcp_drop_noclose(tp, ECONNREFUSED);
  1365. want_close = true;
  1366. }
  1367. if (thflags & TH_RST)
  1368. goto drop;
  1369. if (!(thflags & TH_SYN))
  1370. goto drop;
  1371. tp->irs = th->th_seq;
  1372. tcp_rcvseqinit(tp);
  1373. if (thflags & TH_ACK) {
  1374. TCPSTAT_INC(tcps_connects);
  1375. soisconnected(so);
  1376. /* Do window scaling on this connection? */
  1377. if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
  1378. (TF_RCVD_SCALE|TF_REQ_SCALE)) {
  1379. tp->rcv_scale = tp->request_r_scale;
  1380. }
  1381. tp->rcv_adv += imin(tp->rcv_wnd,
  1382. TCP_MAXWIN << tp->rcv_scale);
  1383. tp->snd_una++; /* SYN is acked */
  1384. /*
  1385. * If there's data, delay ACK; if there's also a FIN
  1386. * ACKNOW will be turned on later.
  1387. */
  1388. if (DELAY_ACK(tp) && tlen != 0)
  1389. tcp_timer_activate(tp, TT_DELACK,
  1390. tcp_delacktime);
  1391. else
  1392. tp->t_flags |= TF_ACKNOW;
  1393. if ((thflags & TH_ECE) && V_tcp_do_ecn) {
  1394. tp->t_flags |= TF_ECN_PERMIT;
  1395. TCPSTAT_INC(tcps_ecn_shs);
  1396. }
  1397. /*
  1398. * Received <SYN,ACK> in SYN_SENT[*] state.
  1399. * Transitions:
  1400. * SYN_SENT --> ESTABLISHED
  1401. * SYN_SENT* --> FIN_WAIT_1
  1402. */
  1403. tp->t_starttime = bsd_ticks;
  1404. if (tp->t_flags & TF_NEEDFIN) {
  1405. tp->set_state(TCPS_FIN_WAIT_1);
  1406. tp->t_flags &= ~TF_NEEDFIN;
  1407. thflags &= ~TH_SYN;
  1408. } else {
  1409. tp->set_state(TCPS_ESTABLISHED);
  1410. tcp_setup_net_channel(tp, m->M_dat.MH.MH_pkthdr.rcvif);
  1411. cc_conn_init(tp);
  1412. tcp_timer_activate(tp, TT_KEEP,
  1413. TP_KEEPIDLE(tp));
  1414. }
  1415. } else {
  1416. /*
  1417. * Received initial SYN in SYN-SENT[*] state =>
  1418. * simultaneous open. If segment contains CC option
  1419. * and there is a cached CC, apply TAO test.
   1420. * If it succeeds, the connection is half-synchronized.
  1421. * Otherwise, do 3-way handshake:
  1422. * SYN-SENT -> SYN-RECEIVED
  1423. * SYN-SENT* -> SYN-RECEIVED*
  1424. * If there was no CC option, clear cached CC value.
  1425. */
  1426. tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
  1427. tcp_timer_activate(tp, TT_REXMT, 0);
  1428. tp->set_state(TCPS_SYN_RECEIVED);
  1429. }
  1430. KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
  1431. "ti_locked %d", __func__, ti_locked));
  1432. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  1433. INP_LOCK_ASSERT(tp->t_inpcb);
  1434. /*
  1435. * Advance th->th_seq to correspond to first data byte.
  1436. * If data, trim to stay within window,
  1437. * dropping FIN if necessary.
  1438. */
  1439. th->th_seq++;
  1440. if (tlen > tp->rcv_wnd) {
  1441. todrop = tlen - tp->rcv_wnd;
  1442. m_adj(m, -todrop);
  1443. tlen = tp->rcv_wnd;
  1444. thflags &= ~TH_FIN;
  1445. TCPSTAT_INC(tcps_rcvpackafterwin);
  1446. TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
  1447. }
  1448. tp->snd_wl1 = th->th_seq - 1;
  1449. tp->rcv_up = th->th_seq;
  1450. /*
  1451. * Client side of transaction: already sent SYN and data.
  1452. * If the remote host used T/TCP to validate the SYN,
  1453. * our data will be ACK'd; if so, enter normal data segment
  1454. * processing in the middle of step 5, ack processing.
  1455. * Otherwise, goto step 6.
  1456. */
  1457. if (thflags & TH_ACK)
  1458. goto process_ACK;
  1459. goto step6;
  1460. /*
  1461. * If the state is LAST_ACK or CLOSING or TIME_WAIT:
  1462. * do normal processing.
  1463. *
  1464. * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
  1465. */
  1466. case TCPS_LAST_ACK:
  1467. case TCPS_CLOSING:
  1468. break; /* continue normal processing */
  1469. }
  1470. /*
  1471. * States other than LISTEN or SYN_SENT.
  1472. * First check the RST flag and sequence number since reset segments
  1473. * are exempt from the timestamp and connection count tests. This
  1474. * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
  1475. * below which allowed reset segments in half the sequence space
   1476. * to fall through and be processed (which gives forged reset
  1477. * segments with a random sequence number a 50 percent chance of
  1478. * killing a connection).
  1479. * Then check timestamp, if present.
  1480. * Then check the connection count, if present.
  1481. * Then check that at least some bytes of segment are within
  1482. * receive window. If segment begins before rcv_nxt,
  1483. * drop leading data (and SYN); if nothing left, just ack.
  1484. *
  1485. *
  1486. * If the RST bit is set, check the sequence number to see
  1487. * if this is a valid reset segment.
  1488. * RFC 793 page 37:
  1489. * In all states except SYN-SENT, all reset (RST) segments
  1490. * are validated by checking their SEQ-fields. A reset is
  1491. * valid if its sequence number is in the window.
  1492. * Note: this does not take into account delayed ACKs, so
  1493. * we should test against last_ack_sent instead of rcv_nxt.
  1494. * The sequence number in the reset segment is normally an
   1495. * echo of our outgoing acknowledgement numbers, but some hosts
  1496. * send a reset with the sequence number at the rightmost edge
  1497. * of our receive window, and we have to handle this case.
  1498. * Note 2: Paul Watson's paper "Slipping in the Window" has shown
  1499. * that brute force RST attacks are possible. To combat this,
  1500. * we use a much stricter check while in the ESTABLISHED state,
  1501. * only accepting RSTs where the sequence number is equal to
  1502. * last_ack_sent. In all other states (the states in which a
  1503. * RST is more likely), the more permissive check is used.
  1504. * If we have multiple segments in flight, the initial reset
  1505. * segment sequence numbers will be to the left of last_ack_sent,
  1506. * but they will eventually catch up.
  1507. * In any case, it never made sense to trim reset segments to
  1508. * fit the receive window since RFC 1122 says:
  1509. * 4.2.2.12 RST Segment: RFC-793 Section 3.4
  1510. *
  1511. * A TCP SHOULD allow a received RST segment to include data.
  1512. *
  1513. * DISCUSSION
  1514. * It has been suggested that a RST segment could contain
  1515. * ASCII text that encoded and explained the cause of the
  1516. * RST. No standard has yet been established for such
  1517. * data.
  1518. *
  1519. * If the reset segment passes the sequence number test examine
  1520. * the state:
  1521. * SYN_RECEIVED STATE:
  1522. * If passive open, return to LISTEN state.
  1523. * If active open, inform user that connection was refused.
  1524. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
  1525. * Inform user that connection was reset, and close tcb.
  1526. * CLOSING, LAST_ACK STATES:
  1527. * Close the tcb.
  1528. * TIME_WAIT STATE:
  1529. * Drop the segment - see Stevens, vol. 2, p. 964 and
  1530. * RFC 1337.
  1531. */
  1532. if (thflags & TH_RST) {
  1533. if (th->th_seq >= tp->last_ack_sent - 1 &&
  1534. th->th_seq <= tp->last_ack_sent + tp->rcv_wnd) {
  1535. switch (tp->get_state()) {
  1536. case TCPS_SYN_RECEIVED:
  1537. so->so_error = ECONNREFUSED;
  1538. goto close;
  1539. case TCPS_ESTABLISHED:
  1540. if (V_tcp_insecure_rst == 0 &&
  1541. !(th->th_seq >= tp->rcv_nxt - 1) &&
  1542. th->th_seq <= tp->rcv_nxt + 1 &&
  1543. !(th->th_seq >= tp->last_ack_sent - 1) &&
  1544. th->th_seq <= tp->last_ack_sent + 1) {
  1545. TCPSTAT_INC(tcps_badrst);
  1546. goto drop;
  1547. }
  1548. /* FALLTHROUGH */
  1549. case TCPS_FIN_WAIT_1:
  1550. case TCPS_FIN_WAIT_2:
  1551. case TCPS_CLOSE_WAIT:
  1552. so->so_error = ECONNRESET;
  1553. close:
  1554. KASSERT(ti_locked == TI_WLOCKED,
  1555. ("tcp_do_segment: TH_RST 1 ti_locked %d",
  1556. ti_locked));
  1557. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  1558. tp->set_state(TCPS_CLOSED);
  1559. TCPSTAT_INC(tcps_drops);
  1560. want_close = true;
  1561. break;
  1562. case TCPS_CLOSING:
  1563. case TCPS_LAST_ACK:
  1564. KASSERT(ti_locked == TI_WLOCKED,
  1565. ("tcp_do_segment: TH_RST 2 ti_locked %d",
  1566. ti_locked));
  1567. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  1568. want_close = true;
  1569. break;
  1570. }
  1571. }
  1572. goto drop;
  1573. }
  1574. /*
  1575. * RFC 1323 PAWS: If we have a timestamp reply on this segment
  1576. * and it's less than ts_recent, drop it.
  1577. */
  1578. if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
  1579. TSTMP_LT(to.to_tsval, tp->ts_recent)) {
  1580. /* Check to see if ts_recent is over 24 days old. */
  1581. if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
  1582. /*
  1583. * Invalidate ts_recent. If this segment updates
  1584. * ts_recent, the age will be reset later and ts_recent
  1585. * will get a valid value. If it does not, setting
  1586. * ts_recent to zero will at least satisfy the
  1587. * requirement that zero be placed in the timestamp
  1588. * echo reply when ts_recent isn't valid. The
  1589. * age isn't reset until we get a valid ts_recent
  1590. * because we don't want out-of-order segments to be
  1591. * dropped when ts_recent is old.
  1592. */
  1593. tp->ts_recent = 0;
  1594. } else {
  1595. TCPSTAT_INC(tcps_rcvduppack);
  1596. TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
  1597. TCPSTAT_INC(tcps_pawsdrop);
  1598. if (tlen)
  1599. goto dropafterack;
  1600. goto drop;
  1601. }
  1602. }
  1603. /*
  1604. * In the SYN-RECEIVED state, validate that the packet belongs to
  1605. * this connection before trimming the data to fit the receive
  1606. * window. Check the sequence number versus IRS since we know
  1607. * the sequence numbers haven't wrapped. This is a partial fix
  1608. * for the "LAND" DoS attack.
  1609. */
  1610. if (tp->get_state() == TCPS_SYN_RECEIVED && th->th_seq < tp->irs) {
  1611. rstreason = BANDLIM_RST_OPENPORT;
  1612. goto dropwithreset;
  1613. }
  1614. todrop = tp->rcv_nxt - th->th_seq;
  1615. if (todrop > 0) {
  1616. /*
  1617. * If this is a duplicate SYN for our current connection,
   1618. * advance over it and pretend it's not a SYN.
  1619. */
  1620. if (thflags & TH_SYN && th->th_seq == tp->irs) {
  1621. thflags &= ~TH_SYN;
  1622. th->th_seq++;
  1623. if (th->th_urp > 1)
  1624. th->th_urp--;
  1625. else
  1626. thflags &= ~TH_URG;
  1627. todrop--;
  1628. }
  1629. /*
  1630. * Following if statement from Stevens, vol. 2, p. 960.
  1631. */
  1632. if (todrop > tlen
  1633. || (todrop == tlen && (thflags & TH_FIN) == 0)) {
  1634. /*
  1635. * Any valid FIN must be to the left of the window.
  1636. * At this point the FIN must be a duplicate or out
  1637. * of sequence; drop it.
  1638. */
  1639. thflags &= ~TH_FIN;
  1640. /*
  1641. * Send an ACK to resynchronize and drop any data.
  1642. * But keep on processing for RST or ACK.
  1643. */
  1644. tp->t_flags |= TF_ACKNOW;
  1645. todrop = tlen;
  1646. TCPSTAT_INC(tcps_rcvduppack);
  1647. TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
  1648. } else {
  1649. TCPSTAT_INC(tcps_rcvpartduppack);
  1650. TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
  1651. }
  1652. drop_hdrlen += todrop; /* drop from the top afterwards */
  1653. th->th_seq += todrop;
  1654. tlen -= todrop;
  1655. if (th->th_urp > todrop)
  1656. th->th_urp -= todrop;
  1657. else {
  1658. thflags &= ~TH_URG;
  1659. th->th_urp = 0;
  1660. }
  1661. }
  1662. /*
  1663. * If new data are received on a connection after the
  1664. * user processes are gone, then RST the other end.
  1665. */
  1666. if ((so->so_state & SS_NOFDREF) &&
  1667. tp->get_state() > TCPS_CLOSE_WAIT && tlen) {
  1668. char *s;
  1669. KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
  1670. "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
  1671. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  1672. if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
  1673. bsd_log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket "
  1674. "was closed, sending RST and removing tcpcb\n",
  1675. s, __func__, tcpstates[tp->get_state()], tlen);
  1676. free(s);
  1677. }
  1678. want_close = true;
  1679. TCPSTAT_INC(tcps_rcvafterclose);
  1680. rstreason = BANDLIM_UNLIMITED;
  1681. goto dropwithreset;
  1682. }
  1683. /*
  1684. * If segment ends after window, drop trailing data
  1685. * (and PUSH and FIN); if nothing left, just ACK.
  1686. */
  1687. todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
  1688. if (todrop > 0) {
  1689. TCPSTAT_INC(tcps_rcvpackafterwin);
  1690. if (todrop >= tlen) {
  1691. TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
  1692. /*
  1693. * If window is closed can only take segments at
  1694. * window edge, and have to drop data and PUSH from
  1695. * incoming segments. Continue processing, but
  1696. * remember to ack. Otherwise, drop segment
  1697. * and ack.
  1698. */
  1699. if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
  1700. tp->t_flags |= TF_ACKNOW;
  1701. TCPSTAT_INC(tcps_rcvwinprobe);
  1702. } else
  1703. goto dropafterack;
  1704. } else
  1705. TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
  1706. m_adj(m, -todrop);
  1707. tlen -= todrop;
  1708. thflags &= ~(TH_PUSH|TH_FIN);
  1709. }
  1710. /*
  1711. * If last ACK falls within this segment's sequence numbers,
  1712. * record its timestamp.
  1713. * NOTE:
  1714. * 1) That the test incorporates suggestions from the latest
  1715. * proposal of the tcplw@cray.com list (Braden 1993/04/26).
  1716. * 2) That updating only on newer timestamps interferes with
  1717. * our earlier PAWS tests, so this check should be solely
  1718. * predicated on the sequence space of this segment.
  1719. * 3) That we modify the segment boundary check to be
  1720. * Last.ACK.Sent <= SEG.SEQ + SEG.Len
  1721. * instead of RFC1323's
  1722. * Last.ACK.Sent < SEG.SEQ + SEG.Len,
  1723. * This modified check allows us to overcome RFC1323's
  1724. * limitations as described in Stevens TCP/IP Illustrated
  1725. * Vol. 2 p.869. In such cases, we can still calculate the
  1726. * RTT correctly when RCV.NXT == Last.ACK.Sent.
  1727. */
  1728. if ((to.to_flags & TOF_TS) != 0 &&
  1729. th->th_seq <= tp->last_ack_sent &&
  1730. tp->last_ack_sent <= th->th_seq + tlen +
  1731. ((thflags & (TH_SYN|TH_FIN)) != 0)) {
  1732. tp->ts_recent_age = tcp_ts_getticks();
  1733. tp->ts_recent = to.to_tsval;
  1734. }
  1735. /*
  1736. * If a SYN is in the window, then this is an
  1737. * error and we send an RST and drop the connection.
  1738. */
  1739. if (thflags & TH_SYN) {
  1740. KASSERT(ti_locked == TI_WLOCKED,
  1741. ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
  1742. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  1743. tcp_drop_noclose(tp, ECONNRESET);
  1744. want_close = true;
  1745. rstreason = BANDLIM_UNLIMITED;
  1746. goto drop;
  1747. }
  1748. /*
  1749. * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
  1750. * flag is on (half-synchronized state), then queue data for
  1751. * later processing; else drop segment and return.
  1752. */
  1753. if ((thflags & TH_ACK) == 0) {
  1754. if (tp->get_state() == TCPS_SYN_RECEIVED ||
  1755. (tp->t_flags & TF_NEEDSYN))
  1756. goto step6;
  1757. else if (tp->t_flags & TF_ACKNOW)
  1758. goto dropafterack;
  1759. else
  1760. goto drop;
  1761. }
  1762. /*
  1763. * Ack processing.
  1764. */
  1765. switch (tp->get_state()) {
  1766. /*
  1767. * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
  1768. * ESTABLISHED state and continue processing.
  1769. * The ACK was checked above.
  1770. */
  1771. case TCPS_SYN_RECEIVED:
  1772. TCPSTAT_INC(tcps_connects);
  1773. soisconnected(so);
  1774. /* Do window scaling? */
  1775. if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
  1776. (TF_RCVD_SCALE|TF_REQ_SCALE)) {
  1777. tp->rcv_scale = tp->request_r_scale;
  1778. tp->snd_wnd = tiwin;
  1779. }
  1780. /*
  1781. * Make transitions:
  1782. * SYN-RECEIVED -> ESTABLISHED
  1783. * SYN-RECEIVED* -> FIN-WAIT-1
  1784. */
  1785. tp->t_starttime = bsd_ticks;
  1786. if (tp->t_flags & TF_NEEDFIN) {
  1787. tp->set_state(TCPS_FIN_WAIT_1);
  1788. tp->t_flags &= ~TF_NEEDFIN;
  1789. } else {
  1790. tp->set_state(TCPS_ESTABLISHED);
  1791. tcp_setup_net_channel(tp, m->M_dat.MH.MH_pkthdr.rcvif);
  1792. cc_conn_init(tp);
  1793. tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
  1794. }
  1795. /*
  1796. * If segment contains data or ACK, will call tcp_reass()
  1797. * later; if not, do so now to pass queued data to user.
  1798. */
  1799. if (tlen == 0 && (thflags & TH_FIN) == 0)
  1800. (void) tcp_reass(tp, (struct tcphdr *)0, 0,
  1801. (struct mbuf *)0);
  1802. tp->snd_wl1 = th->th_seq - 1;
  1803. /* FALLTHROUGH */
  1804. /*
  1805. * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
  1806. * ACKs. If the ack is in the range
  1807. * tp->snd_una < th->th_ack <= tp->snd_max
  1808. * then advance tp->snd_una to th->th_ack and drop
  1809. * data from the retransmission queue. If this ACK reflects
  1810. * more up to date window information we update our window information.
  1811. */
  1812. case TCPS_ESTABLISHED:
  1813. case TCPS_FIN_WAIT_1:
  1814. case TCPS_FIN_WAIT_2:
  1815. case TCPS_CLOSE_WAIT:
  1816. case TCPS_CLOSING:
  1817. case TCPS_LAST_ACK:
  1818. if (th->th_ack > tp->snd_max) {
  1819. TCPSTAT_INC(tcps_rcvacktoomuch);
  1820. goto dropafterack;
  1821. }
  1822. if ((tp->t_flags & TF_SACK_PERMIT) &&
  1823. ((to.to_flags & TOF_SACK) ||
  1824. !TAILQ_EMPTY(&tp->snd_holes)))
  1825. tcp_sack_doack(tp, &to, th->th_ack);
  1826. if (th->th_ack <= tp->snd_una) {
  1827. if (tlen == 0 && tiwin == tp->snd_wnd) {
  1828. TCPSTAT_INC(tcps_rcvdupack);
  1829. /*
  1830. * If we have outstanding data (other than
  1831. * a window probe), this is a completely
  1832. * duplicate ack (ie, window info didn't
  1833. * change), the ack is the biggest we've
  1834. * seen and we've seen exactly our rexmt
  1835. * threshhold of them, assume a packet
  1836. * has been dropped and retransmit it.
  1837. * Kludge snd_nxt & the congestion
  1838. * window so we send only this one
  1839. * packet.
  1840. *
  1841. * We know we're losing at the current
  1842. * window size so do congestion avoidance
  1843. * (set ssthresh to half the current window
  1844. * and pull our congestion window back to
  1845. * the new ssthresh).
  1846. *
  1847. * Dup acks mean that packets have left the
  1848. * network (they're now cached at the receiver)
  1849. * so bump cwnd by the amount in the receiver
  1850. * to keep a constant cwnd packets in the
  1851. * network.
  1852. *
  1853. * When using TCP ECN, notify the peer that
  1854. * we reduced the cwnd.
  1855. */
  1856. if (!tcp_timer_active(tp, TT_REXMT) ||
  1857. th->th_ack != tp->snd_una)
  1858. tp->t_dupacks = 0;
  1859. else if (++tp->t_dupacks > tcprexmtthresh ||
  1860. IN_FASTRECOVERY(tp->t_flags)) {
  1861. cc_ack_received(tp, th, CC_DUPACK);
  1862. if ((tp->t_flags & TF_SACK_PERMIT) &&
  1863. IN_FASTRECOVERY(tp->t_flags)) {
  1864. int awnd;
  1865. /*
  1866. * Compute the amount of data in flight first.
  1867. * We can inject new data into the pipe iff
  1868. * we have less than 1/2 the original window's
  1869. * worth of data in flight.
  1870. */
  1871. awnd = (tp->snd_nxt - tp->snd_fack) +
  1872. tp->sackhint.sack_bytes_rexmit;
  1873. if (awnd < tp->snd_ssthresh) {
  1874. tp->snd_cwnd += tp->t_maxseg;
  1875. if (tp->snd_cwnd > tp->snd_ssthresh)
  1876. tp->snd_cwnd = tp->snd_ssthresh;
  1877. }
  1878. } else
  1879. tp->snd_cwnd += tp->t_maxseg;
  1880. (void) tcp_output(tp);
  1881. goto drop;
  1882. } else if (tp->t_dupacks == tcprexmtthresh) {
  1883. tcp_seq onxt = tp->snd_nxt;
  1884. /*
  1885. * If we're doing sack, check to
  1886. * see if we're already in sack
  1887. * recovery. If we're not doing sack,
  1888. * check to see if we're in newreno
  1889. * recovery.
  1890. */
  1891. if (tp->t_flags & TF_SACK_PERMIT) {
  1892. if (IN_FASTRECOVERY(tp->t_flags)) {
  1893. tp->t_dupacks = 0;
  1894. break;
  1895. }
  1896. } else {
  1897. if (th->th_ack <= tp->snd_recover) {
  1898. tp->t_dupacks = 0;
  1899. break;
  1900. }
  1901. }
  1902. /* Congestion signal before ack. */
  1903. cc_cong_signal(tp, th, CC_NDUPACK);
  1904. cc_ack_received(tp, th, CC_DUPACK);
  1905. tcp_timer_activate(tp, TT_REXMT, 0);
  1906. tp->t_rtttime = 0;
  1907. if (tp->t_flags & TF_SACK_PERMIT) {
  1908. TCPSTAT_INC(
  1909. tcps_sack_recovery_episode);
  1910. tp->sack_newdata = tp->snd_nxt;
  1911. tp->snd_cwnd = tp->t_maxseg;
  1912. (void) tcp_output(tp);
  1913. goto drop;
  1914. }
  1915. tp->snd_nxt = th->th_ack;
  1916. tp->snd_cwnd = tp->t_maxseg;
  1917. (void) tcp_output(tp);
  1918. KASSERT(tp->snd_limited <= 2,
  1919. ("%s: tp->snd_limited too big",
  1920. __func__));
  1921. tp->snd_cwnd = tp->snd_ssthresh +
  1922. tp->t_maxseg *
  1923. (tp->t_dupacks - tp->snd_limited);
  1924. if (onxt > tp->snd_nxt)
  1925. tp->snd_nxt = onxt;
  1926. goto drop;
  1927. } else if (V_tcp_do_rfc3042) {
  1928. cc_ack_received(tp, th, CC_DUPACK);
  1929. u_long oldcwnd = tp->snd_cwnd;
  1930. tcp_seq oldsndmax = tp->snd_max;
  1931. u_int sent;
  1932. KASSERT(tp->t_dupacks == 1 ||
  1933. tp->t_dupacks == 2,
  1934. ("%s: dupacks not 1 or 2",
  1935. __func__));
  1936. if (tp->t_dupacks == 1)
  1937. tp->snd_limited = 0;
  1938. tp->snd_cwnd =
  1939. (tp->snd_nxt - tp->snd_una) +
  1940. (tp->t_dupacks - tp->snd_limited) *
  1941. tp->t_maxseg;
  1942. (void) tcp_output(tp);
  1943. sent = tp->snd_max - oldsndmax;
  1944. if (sent > tp->t_maxseg) {
  1945. KASSERT((tp->t_dupacks == 2 &&
  1946. tp->snd_limited == 0) ||
  1947. (sent == tp->t_maxseg + 1 &&
  1948. tp->t_flags & TF_SENTFIN),
  1949. ("%s: sent too much",
  1950. __func__));
  1951. tp->snd_limited = 2;
  1952. } else if (sent > 0)
  1953. ++tp->snd_limited;
  1954. tp->snd_cwnd = oldcwnd;
  1955. goto drop;
  1956. }
  1957. } else
  1958. tp->t_dupacks = 0;
  1959. break;
  1960. }
  1961. KASSERT(th->th_ack > tp->snd_una,
  1962. ("%s: th_ack <= snd_una", __func__));
  1963. /*
  1964. * If the congestion window was inflated to account
  1965. * for the other side's cached packets, retract it.
  1966. */
  1967. if (IN_FASTRECOVERY(tp->t_flags)) {
  1968. if (th->th_ack < tp->snd_recover) {
  1969. if (tp->t_flags & TF_SACK_PERMIT)
  1970. tcp_sack_partialack(tp, th);
  1971. else
  1972. tcp_newreno_partial_ack(tp, th);
  1973. } else
  1974. cc_post_recovery(tp, th);
  1975. }
  1976. tp->t_dupacks = 0;
  1977. /*
  1978. * If we reach this point, ACK is not a duplicate,
  1979. * i.e., it ACKs something we sent.
  1980. */
  1981. if (tp->t_flags & TF_NEEDSYN) {
  1982. /*
  1983. * T/TCP: Connection was half-synchronized, and our
  1984. * SYN has been ACK'd (so connection is now fully
  1985. * synchronized). Go to non-starred state,
  1986. * increment snd_una for ACK of SYN, and check if
  1987. * we can do window scaling.
  1988. */
  1989. tp->t_flags &= ~TF_NEEDSYN;
  1990. tp->snd_una++;
  1991. /* Do window scaling? */
  1992. if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
  1993. (TF_RCVD_SCALE|TF_REQ_SCALE)) {
  1994. tp->rcv_scale = tp->request_r_scale;
  1995. /* Send window already scaled. */
  1996. }
  1997. }
  1998. process_ACK:
  1999. INP_LOCK_ASSERT(tp->t_inpcb);
  2000. acked = BYTES_THIS_ACK(tp, th);
  2001. TCPSTAT_INC(tcps_rcvackpack);
  2002. TCPSTAT_ADD(tcps_rcvackbyte, acked);
  2003. /*
  2004. * If we just performed our first retransmit, and the ACK
  2005. * arrives within our recovery window, then it was a mistake
  2006. * to do the retransmit in the first place. Recover our
  2007. * original cwnd and ssthresh, and proceed to transmit where
  2008. * we left off.
  2009. */
  2010. if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
  2011. (int)(bsd_ticks - tp->t_badrxtwin) < 0)
  2012. cc_cong_signal(tp, th, CC_RTO_ERR);
  2013. /*
  2014. * If we have a timestamp reply, update smoothed
  2015. * round trip time. If no timestamp is present but
  2016. * transmit timer is running and timed sequence
  2017. * number was acked, update smoothed round trip time.
  2018. * Since we now have an rtt measurement, cancel the
  2019. * timer backoff (cf., Phil Karn's retransmit alg.).
  2020. * Recompute the initial retransmit timer.
  2021. *
  2022. * Some boxes send broken timestamp replies
  2023. * during the SYN+ACK phase, ignore
  2024. * timestamps of 0 or we could calculate a
  2025. * huge RTT and blow up the retransmit timer.
  2026. */
  2027. if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
  2028. u_int t;
  2029. t = tcp_ts_getticks() - to.to_tsecr;
  2030. if (!tp->t_rttlow || tp->t_rttlow > t)
  2031. tp->t_rttlow = t;
  2032. tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
  2033. } else if (tp->t_rtttime && th->th_ack > tp->t_rtseq) {
  2034. if (!tp->t_rttlow || tp->t_rttlow > bsd_ticks - tp->t_rtttime)
  2035. tp->t_rttlow = bsd_ticks - tp->t_rtttime;
  2036. tcp_xmit_timer(tp, bsd_ticks - tp->t_rtttime);
  2037. }
  2038. /*
  2039. * If all outstanding data is acked, stop retransmit
  2040. * timer and remember to restart (more output or persist).
  2041. * If there is more data to be acked, restart retransmit
  2042. * timer, using current (possibly backed-off) value.
  2043. */
  2044. if (th->th_ack == tp->snd_max) {
  2045. tcp_timer_activate(tp, TT_REXMT, 0);
  2046. needoutput = 1;
  2047. } else if (!tcp_timer_active(tp, TT_PERSIST))
  2048. tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
  2049. /*
  2050. * If no data (only SYN) was ACK'd,
  2051. * skip rest of ACK processing.
  2052. */
  2053. if (acked == 0)
  2054. goto step6;
  2055. /*
  2056. * Let the congestion control algorithm update congestion
  2057. * control related information. This typically means increasing
  2058. * the congestion window.
  2059. */
  2060. cc_ack_received(tp, th, CC_ACK);
  2061. if (acked > so->so_snd.sb_cc) {
  2062. tp->snd_wnd -= so->so_snd.sb_cc;
  2063. sbdrop_locked(so, &so->so_snd, (int)so->so_snd.sb_cc);
  2064. ourfinisacked = 1;
  2065. } else {
  2066. sbdrop_locked(so, &so->so_snd, acked);
  2067. tp->snd_wnd -= acked;
  2068. ourfinisacked = 0;
  2069. }
  2070. sowwakeup_locked(so);
  2071. /* Detect una wraparound. */
  2072. if (!IN_RECOVERY(tp->t_flags) &&
  2073. tp->snd_una > tp->snd_recover &&
  2074. th->th_ack <= tp->snd_recover)
  2075. tp->snd_recover = th->th_ack - 1;
  2076. /* XXXLAS: Can this be moved up into cc_post_recovery? */
  2077. if (IN_RECOVERY(tp->t_flags) &&
  2078. th->th_ack >= tp->snd_recover) {
  2079. EXIT_RECOVERY(tp->t_flags);
  2080. }
  2081. tp->snd_una = th->th_ack;
  2082. if (tp->t_flags & TF_SACK_PERMIT) {
  2083. if (tp->snd_una > tp->snd_recover)
  2084. tp->snd_recover = tp->snd_una;
  2085. }
  2086. if (tp->snd_nxt < tp->snd_una)
  2087. tp->snd_nxt = tp->snd_una;
  2088. switch (tp->get_state()) {
  2089. /*
  2090. * In FIN_WAIT_1 STATE in addition to the processing
  2091. * for the ESTABLISHED state if our FIN is now acknowledged
  2092. * then enter FIN_WAIT_2.
  2093. */
  2094. case TCPS_FIN_WAIT_1:
  2095. if (ourfinisacked) {
  2096. /*
  2097. * If we can't receive any more
  2098. * data, then closing user can proceed.
  2099. * Starting the timer is contrary to the
  2100. * specification, but if we don't get a FIN
  2101. * we'll hang forever.
  2102. *
  2103. * XXXjl:
  2104. * we should release the tp also, and use a
  2105. * compressed state.
  2106. */
  2107. if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
  2108. soisdisconnected(so);
  2109. tcp_timer_activate(tp, TT_2MSL,
  2110. (tcp_fast_finwait2_recycle ?
  2111. tcp_finwait2_timeout :
  2112. TP_MAXIDLE(tp)));
  2113. }
  2114. tp->set_state(TCPS_FIN_WAIT_2);
  2115. }
  2116. break;
  2117. /*
  2118. * In CLOSING STATE in addition to the processing for
  2119. * the ESTABLISHED state if the ACK acknowledges our FIN
  2120. * then enter the TIME-WAIT state, otherwise ignore
  2121. * the segment.
  2122. */
  2123. case TCPS_CLOSING:
  2124. if (ourfinisacked) {
  2125. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  2126. tcp_twstart(tp);
  2127. INP_INFO_WUNLOCK(&V_tcbinfo);
  2128. m_freem(m);
  2129. INP_LOCK(inp);
  2130. return;
  2131. }
  2132. break;
  2133. /*
  2134. * In LAST_ACK, we may still be waiting for data to drain
  2135. * and/or to be acked, as well as for the ack of our FIN.
  2136. * If our FIN is now acknowledged, delete the TCB,
  2137. * enter the closed state and return.
  2138. */
  2139. case TCPS_LAST_ACK:
  2140. if (ourfinisacked) {
  2141. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  2142. want_close = true;
  2143. goto drop;
  2144. }
  2145. break;
  2146. }
  2147. }
  2148. step6:
  2149. INP_LOCK_ASSERT(tp->t_inpcb);
  2150. /*
  2151. * Update window information.
  2152. * Don't look at window if no ACK: TAC's send garbage on first SYN.
  2153. */
  2154. if ((thflags & TH_ACK) &&
  2155. (tp->snd_wl1 < th->th_seq ||
  2156. (tp->snd_wl1 == th->th_seq && (tp->snd_wl2 < th->th_ack ||
  2157. (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
  2158. /* keep track of pure window updates */
  2159. if (tlen == 0 &&
  2160. tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
  2161. TCPSTAT_INC(tcps_rcvwinupd);
  2162. tp->snd_wnd = tiwin;
  2163. tp->snd_wl1 = th->th_seq;
  2164. tp->snd_wl2 = th->th_ack;
  2165. if (tp->snd_wnd > tp->max_sndwnd)
  2166. tp->max_sndwnd = tp->snd_wnd;
  2167. needoutput = 1;
  2168. }
  2169. /*
  2170. * Process segments with URG.
  2171. */
  2172. if ((thflags & TH_URG) && th->th_urp &&
  2173. TCPS_HAVERCVDFIN(tp->get_state()) == 0) {
  2174. /*
  2175. * This is a kludge, but if we receive and accept
  2176. * random urgent pointers, we'll crash in
  2177. * soreceive. It's hard to imagine someone
  2178. * actually wanting to send this much urgent data.
  2179. */
  2180. SOCK_LOCK_ASSERT(so);
  2181. if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
  2182. th->th_urp = 0; /* XXX */
  2183. thflags &= ~TH_URG; /* XXX */
  2184. goto dodata; /* XXX */
  2185. }
  2186. /*
  2187. * If this segment advances the known urgent pointer,
  2188. * then mark the data stream. This should not happen
  2189. * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
  2190. * a FIN has been received from the remote side.
  2191. * In these states we ignore the URG.
  2192. *
  2193. * According to RFC961 (Assigned Protocols),
  2194. * the urgent pointer points to the last octet
  2195. * of urgent data. We continue, however,
  2196. * to consider it to indicate the first octet
  2197. * of data past the urgent section as the original
  2198. * spec states (in one of two places).
  2199. */
  2200. if (th->th_seq+th->th_urp > tp->rcv_up) {
  2201. tp->rcv_up = th->th_seq + th->th_urp;
  2202. so->so_oobmark = so->so_rcv.sb_cc +
  2203. (tp->rcv_up - tp->rcv_nxt) - 1;
  2204. if (so->so_oobmark == 0)
  2205. so->so_rcv.sb_state |= SBS_RCVATMARK;
  2206. sohasoutofband(so);
  2207. tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
  2208. }
  2209. /*
  2210. * Remove out of band data so doesn't get presented to user.
  2211. * This can happen independent of advancing the URG pointer,
  2212. * but if two URG's are pending at once, some out-of-band
  2213. * data may creep in... ick.
  2214. */
  2215. if (th->th_urp <= (u_long)tlen &&
  2216. !(so->so_options & SO_OOBINLINE)) {
  2217. /* hdr drop is delayed */
  2218. tcp_pulloutofband(so, th, m, drop_hdrlen);
  2219. }
  2220. } else {
  2221. /*
  2222. * If no out of band data is expected,
  2223. * pull receive urgent pointer along
  2224. * with the receive window.
  2225. */
  2226. if (tp->rcv_nxt > tp->rcv_up)
  2227. tp->rcv_up = tp->rcv_nxt;
  2228. }
  2229. dodata: /* XXX */
  2230. INP_LOCK_ASSERT(tp->t_inpcb);
  2231. /*
  2232. * Process the segment text, merging it into the TCP sequencing queue,
  2233. * and arranging for acknowledgment of receipt if necessary.
  2234. * This process logically involves adjusting tp->rcv_wnd as data
  2235. * is presented to the user (this happens in tcp_usrreq.c,
  2236. * case PRU_RCVD). If a FIN has already been received on this
  2237. * connection then we just ignore the text.
  2238. */
  2239. if ((tlen || (thflags & TH_FIN)) &&
  2240. TCPS_HAVERCVDFIN(tp->get_state()) == 0) {
  2241. tcp_seq save_start = th->th_seq;
  2242. m_adj(m, drop_hdrlen); /* delayed header drop */
  2243. /*
  2244. * Insert segment which includes th into TCP reassembly queue
  2245. * with control block tp. Set thflags to whether reassembly now
  2246. * includes a segment with FIN. This handles the common case
  2247. * inline (segment is the next to be received on an established
  2248. * connection, and the queue is empty), avoiding linkage into
  2249. * and removal from the queue and repetition of various
  2250. * conversions.
  2251. * Set DELACK for segments received in order, but ack
  2252. * immediately when segments are out of order (so
  2253. * fast retransmit can work).
  2254. */
  2255. if (th->th_seq == tp->rcv_nxt &&
  2256. LIST_EMPTY(&tp->t_segq) &&
  2257. TCPS_HAVEESTABLISHED(tp->get_state())) {
  2258. if (DELAY_ACK(tp))
  2259. tp->t_flags |= TF_DELACK;
  2260. else
  2261. tp->t_flags |= TF_ACKNOW;
  2262. tp->rcv_nxt += tlen;
  2263. thflags = th->th_flags & TH_FIN;
  2264. TCPSTAT_INC(tcps_rcvpack);
  2265. TCPSTAT_ADD(tcps_rcvbyte, tlen);
  2266. ND6_HINT(tp);
  2267. if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
  2268. m_freem(m);
  2269. else
  2270. sbappendstream_locked(so, &so->so_rcv, m);
  2271. sorwakeup_locked(so);
  2272. } else {
  2273. /*
  2274. * XXX: Due to the header drop above "th" is
  2275. * theoretically invalid by now. Fortunately
  2276. * m_adj() doesn't actually frees any mbufs
  2277. * when trimming from the head.
  2278. */
  2279. thflags = tcp_reass(tp, th, &tlen, m);
  2280. tp->t_flags |= TF_ACKNOW;
  2281. }
  2282. if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
  2283. tcp_update_sack_list(tp, save_start, save_start + tlen);
  2284. } else {
  2285. m_freem(m);
  2286. thflags &= ~TH_FIN;
  2287. }
  2288. /*
  2289. * If FIN is received ACK the FIN and let the user know
  2290. * that the connection is closing.
  2291. */
  2292. if (thflags & TH_FIN) {
  2293. if (TCPS_HAVERCVDFIN(tp->get_state()) == 0) {
  2294. socantrcvmore_locked(so);
  2295. /*
  2296. * If connection is half-synchronized
  2297. * (ie NEEDSYN flag on) then delay ACK,
  2298. * so it may be piggybacked when SYN is sent.
  2299. * Otherwise, since we received a FIN then no
  2300. * more input can be expected, send ACK now.
  2301. */
  2302. if (tp->t_flags & TF_NEEDSYN)
  2303. tp->t_flags |= TF_DELACK;
  2304. else
  2305. tp->t_flags |= TF_ACKNOW;
  2306. tp->rcv_nxt++;
  2307. }
  2308. switch (tp->get_state()) {
  2309. /*
  2310. * In SYN_RECEIVED and ESTABLISHED STATES
  2311. * enter the CLOSE_WAIT state.
  2312. */
  2313. case TCPS_SYN_RECEIVED:
  2314. tp->t_starttime = bsd_ticks;
  2315. /* FALLTHROUGH */
  2316. case TCPS_ESTABLISHED:
  2317. tcp_teardown_net_channel(tp);
  2318. tp->set_state(TCPS_CLOSE_WAIT);
  2319. break;
  2320. /*
  2321. * If still in FIN_WAIT_1 STATE FIN has not been acked so
  2322. * enter the CLOSING state.
  2323. */
  2324. case TCPS_FIN_WAIT_1:
  2325. tp->set_state(TCPS_CLOSING);
  2326. break;
  2327. /*
  2328. * In FIN_WAIT_2 state enter the TIME_WAIT state,
  2329. * starting the time-wait timer, turning off the other
  2330. * standard timers.
  2331. */
  2332. case TCPS_FIN_WAIT_2:
  2333. INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  2334. KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
  2335. "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
  2336. ti_locked));
  2337. tcp_twstart(tp);
  2338. INP_INFO_WUNLOCK(&V_tcbinfo);
  2339. INP_LOCK(inp);
  2340. return;
  2341. }
  2342. }
  2343. if (ti_locked == TI_WLOCKED)
  2344. INP_INFO_WUNLOCK(&V_tcbinfo);
  2345. ti_locked = TI_UNLOCKED;
  2346. #ifdef TCPDEBUG
  2347. if (so->so_options & SO_DEBUG)
  2348. tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
  2349. &tcp_savetcp, 0);
  2350. #endif
  2351. /*
  2352. * Return any desired output.
  2353. */
  2354. if (needoutput || (tp->t_flags & TF_ACKNOW))
  2355. (void) tcp_output(tp);
  2356. check_delack:
  2357. KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
  2358. __func__, ti_locked));
  2359. INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  2360. INP_LOCK_ASSERT(tp->t_inpcb);
  2361. if (tp->t_flags & TF_DELACK) {
  2362. tp->t_flags &= ~TF_DELACK;
  2363. tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
  2364. }
  2365. return;
  2366. dropafterack:
  2367. /*
  2368. * Generate an ACK dropping incoming segment if it occupies
  2369. * sequence space, where the ACK reflects our state.
  2370. *
  2371. * We can now skip the test for the RST flag since all
  2372. * paths to this code happen after packets containing
  2373. * RST have been dropped.
  2374. *
  2375. * In the SYN-RECEIVED state, don't send an ACK unless the
  2376. * segment we received passes the SYN-RECEIVED ACK test.
  2377. * If it fails send a RST. This breaks the loop in the
  2378. * "LAND" DoS attack, and also prevents an ACK storm
  2379. * between two listening ports that have been sent forged
  2380. * SYN segments, each with the source address of the other.
  2381. */
  2382. if (tp->get_state() == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
  2383. (tp->snd_una > th->th_ack ||
  2384. th->th_ack > tp->snd_max) ) {
  2385. rstreason = BANDLIM_RST_OPENPORT;
  2386. goto dropwithreset;
  2387. }
  2388. #ifdef TCPDEBUG
  2389. if (so->so_options & SO_DEBUG)
  2390. tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
  2391. &tcp_savetcp, 0);
  2392. #endif
  2393. if (ti_locked == TI_WLOCKED)
  2394. INP_INFO_WUNLOCK(&V_tcbinfo);
  2395. ti_locked = TI_UNLOCKED;
  2396. tp->t_flags |= TF_ACKNOW;
  2397. (void) tcp_output(tp);
  2398. m_freem(m);
  2399. return;
  2400. dropwithreset:
  2401. if (ti_locked == TI_WLOCKED)
  2402. INP_INFO_WUNLOCK(&V_tcbinfo);
  2403. ti_locked = TI_UNLOCKED;
  2404. tcp_dropwithreset(m, th, !want_close ? tp : nullptr, tlen, rstreason);
  2405. return;
  2406. drop:
  2407. if (ti_locked == TI_WLOCKED) {
  2408. INP_INFO_WUNLOCK(&V_tcbinfo);
  2409. ti_locked = TI_UNLOCKED;
  2410. }
  2411. #ifdef INVARIANTS
  2412. else
  2413. INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  2414. #endif
  2415. /*
  2416. * Drop space held by incoming segment and return.
  2417. */
  2418. #ifdef TCPDEBUG
  2419. if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
  2420. tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
  2421. &tcp_savetcp, 0);
  2422. #endif
  2423. m_freem(m);
  2424. }
  2425. /*
  2426. * Issue RST and make ACK acceptable to originator of segment.
  2427. * The mbuf must still include the original packet header.
  2428. * tp may be NULL.
  2429. */
  2430. static void
  2431. tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
  2432. int tlen, int rstreason)
  2433. {
  2434. struct ip *ip;
  2435. if (tp != NULL) {
  2436. INP_LOCK_ASSERT(tp->t_inpcb);
  2437. }
  2438. /* Don't bother if destination was broadcast/multicast. */
  2439. if ((th->th_flags & TH_RST) || m->m_hdr.mh_flags & (M_BCAST|M_MCAST))
  2440. goto drop;
  2441. {
  2442. ip = mtod(m, struct ip *);
  2443. if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
  2444. IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
  2445. ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
  2446. in_broadcast(ip->ip_dst, m->M_dat.MH.MH_pkthdr.rcvif))
  2447. goto drop;
  2448. }
  2449. /* Perform bandwidth limiting. */
  2450. if (badport_bandlim(rstreason) < 0)
  2451. goto drop;
  2452. /* tcp_respond consumes the mbuf chain. */
  2453. if (th->th_flags & TH_ACK) {
  2454. tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
  2455. th->th_ack, TH_RST);
  2456. } else {
  2457. if (th->th_flags & TH_SYN)
  2458. tlen++;
  2459. tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
  2460. (tcp_seq)0, TH_RST|TH_ACK);
  2461. }
  2462. return;
  2463. drop:
  2464. m_freem(m);
  2465. }
  2466. /*
  2467. * Parse TCP options and place in tcpopt.
  2468. */
  2469. static void
  2470. tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
  2471. {
  2472. int opt, optlen;
  2473. to->to_flags = 0;
  2474. for (; cnt > 0; cnt -= optlen, cp += optlen) {
  2475. opt = cp[0];
  2476. if (opt == TCPOPT_EOL)
  2477. break;
  2478. if (opt == TCPOPT_NOP)
  2479. optlen = 1;
  2480. else {
  2481. if (cnt < 2)
  2482. break;
  2483. optlen = cp[1];
  2484. if (optlen < 2 || optlen > cnt)
  2485. break;
  2486. }
  2487. switch (opt) {
  2488. case TCPOPT_MAXSEG:
  2489. if (optlen != TCPOLEN_MAXSEG)
  2490. continue;
  2491. if (!(flags & TO_SYN))
  2492. continue;
  2493. to->to_flags |= TOF_MSS;
  2494. bcopy((char *)cp + 2,
  2495. (char *)&to->to_mss, sizeof(to->to_mss));
  2496. to->to_mss = ntohs(to->to_mss);
  2497. break;
  2498. case TCPOPT_WINDOW:
  2499. if (optlen != TCPOLEN_WINDOW)
  2500. continue;
  2501. if (!(flags & TO_SYN))
  2502. continue;
  2503. to->to_flags |= TOF_SCALE;
  2504. to->to_wscale = bsd_min(cp[2], TCP_MAX_WINSHIFT);
  2505. break;
  2506. case TCPOPT_TIMESTAMP:
  2507. if (optlen != TCPOLEN_TIMESTAMP)
  2508. continue;
  2509. to->to_flags |= TOF_TS;
  2510. bcopy((char *)cp + 2,
  2511. (char *)&to->to_tsval, sizeof(to->to_tsval));
  2512. to->to_tsval = ntohl(to->to_tsval);
  2513. bcopy((char *)cp + 6,
  2514. (char *)&to->to_tsecr, sizeof(to->to_tsecr));
  2515. to->to_tsecr = ntohl(to->to_tsecr);
  2516. break;
  2517. case TCPOPT_SACK_PERMITTED:
  2518. if (optlen != TCPOLEN_SACK_PERMITTED)
  2519. continue;
  2520. if (!(flags & TO_SYN))
  2521. continue;
  2522. if (!V_tcp_do_sack)
  2523. continue;
  2524. to->to_flags |= TOF_SACKPERM;
  2525. break;
  2526. case TCPOPT_SACK:
  2527. if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
  2528. continue;
  2529. if (flags & TO_SYN)
  2530. continue;
  2531. to->to_flags |= TOF_SACK;
  2532. to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
  2533. to->to_sacks = cp + 2;
  2534. TCPSTAT_INC(tcps_sack_rcv_blocks);
  2535. break;
  2536. default:
  2537. continue;
  2538. }
  2539. }
  2540. }
  2541. /*
  2542. * Pull out of band byte out of a segment so
  2543. * it doesn't appear in the user's data queue.
  2544. * It is still reflected in the segment length for
  2545. * sequencing purposes.
  2546. */
  2547. static void
  2548. tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
  2549. int off)
  2550. {
  2551. int cnt = off + th->th_urp - 1;
  2552. while (cnt >= 0) {
  2553. if (m->m_hdr.mh_len > cnt) {
  2554. char *cp = mtod(m, caddr_t) + cnt;
  2555. struct tcpcb *tp = sototcpcb(so);
  2556. INP_LOCK_ASSERT(tp->t_inpcb);
  2557. tp->t_iobc = *cp;
  2558. tp->t_oobflags |= TCPOOB_HAVEDATA;
  2559. bcopy(cp+1, cp, (unsigned)(m->m_hdr.mh_len - cnt - 1));
  2560. m->m_hdr.mh_len--;
  2561. if (m->m_hdr.mh_flags & M_PKTHDR)
  2562. m->M_dat.MH.MH_pkthdr.len--;
  2563. return;
  2564. }
  2565. cnt -= m->m_hdr.mh_len;
  2566. m = m->m_hdr.mh_next;
  2567. if (m == NULL)
  2568. break;
  2569. }
  2570. panic("tcp_pulloutofband");
  2571. }
  2572. /*
  2573. * Collect new round-trip time estimate
  2574. * and update averages and current timeout.
  2575. */
/*
 * Fold one new round-trip time sample (in ticks) into the smoothed
 * estimators t_srtt/t_rttvar and recompute the retransmit timeout
 * t_rxtcur.  Caller must hold the inpcb lock.
 */
static void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
	int delta;

	INP_LOCK_ASSERT(tp->t_inpcb);

	TCPSTAT_INC(tcps_rttupdated);
	tp->t_rttupdated++;
	if (tp->t_srtt != 0) {
		/*
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 8).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 */
		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));

		/* Clamp at 1: a zero srtt would be treated as "no sample". */
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1;

		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1;
		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << TCP_RTT_SHIFT;
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	}
	/* A fresh sample ends any in-progress timing and backoff. */
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 bsd_ticks).
	 */
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
		      bsd_max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}
  2646. /*
  2647. * Determine a reasonable value for maxseg size.
  2648. * If the route is known, check route for mtu.
  2649. * If none, use an mss that can be handled on the outgoing
  2650. * interface without forcing IP to fragment; if bigger than
  2651. * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
  2652. * to utilize large mbufs. If no route is found, route has no mtu,
  2653. * or the destination isn't local, use a default, hopefully conservative
  2654. * size (usually 512 or the default IP max size, but no more than the mtu
  2655. * of the interface), as we can't discover anything about intervening
  2656. * gateways or networks. We also initialize the congestion/slow start
  2657. * window to be a single segment if the destination isn't local.
  2658. * While looking at the routing entry, we also initialize other path-dependent
  2659. * parameters from pre-set or cached values in the routing entry.
  2660. *
  2661. * Also take into account the space needed for options that we
  2662. * send regularly. Make maxseg shorter by that amount to assure
  2663. * that we can send maxseg amount of data even when the options
  2664. * are present. Store the upper limit of the length of options plus
  2665. * data in maxopd.
  2666. *
  2667. * NOTE that this routine is only called when we process an incoming
  2668. * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
  2669. * settings are handled in tcp_mssopt().
  2670. */
void
tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
    struct hc_metrics_lite *metricptr, int *mtuflags)
{
	int mss = 0;
	u_long maxmtu = 0;
	struct inpcb *inp = tp->t_inpcb;
	struct hc_metrics_lite metrics;
	int origoffer;
#ifdef INET6
	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	/* Fixed protocol header overhead (IP + TCP, no options). */
	size_t min_protoh = isipv6 ?
	    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
	    sizeof (struct tcpiphdr);
#else
	const size_t min_protoh = sizeof(struct tcpiphdr);
#endif

	INP_LOCK_ASSERT(tp->t_inpcb);

	/*
	 * An ICMP need-fragmentation caller passes a full MTU in mtuoffer;
	 * convert it to an MSS-style offer.  offer and mtuoffer are
	 * mutually exclusive.
	 */
	if (mtuoffer != -1) {
		KASSERT(offer == -1, ("%s: conflict", __func__));
		offer = mtuoffer - min_protoh;
	}
	origoffer = offer;

	/* Initialize. */
#ifdef INET6
	if (isipv6) {
		maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags);
		tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags);
		tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
	}
#endif

	/*
	 * No route to sender, stay with default mss and return.
	 */
	if (maxmtu == 0) {
		/*
		 * In case we return early we need to initialize metrics
		 * to a defined state as tcp_hc_get() would do for us
		 * if there was no cache hit.
		 */
		if (metricptr != NULL)
			bzero(metricptr, sizeof(struct hc_metrics_lite));
		return;
	}

	/* What have we got? */
	switch (offer) {
	case 0:
		/*
		 * Offer == 0 means that there was no MSS on the SYN
		 * segment, in this case we use tcp_mssdflt as
		 * already assigned to t_maxopd above.
		 */
		offer = tp->t_maxopd;
		break;

	case -1:
		/*
		 * Offer == -1 means that we didn't receive SYN yet.
		 */
		/* FALLTHROUGH */

	default:
		/*
		 * Prevent DoS attack with too small MSS. Round up
		 * to at least minmss.
		 */
		offer = bsd_max(offer, V_tcp_minmss);
	}

	/*
	 * rmx information is now retrieved from tcp_hostcache.
	 */
	tcp_hc_get(&inp->inp_inc, &metrics);
	if (metricptr != NULL)
		bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));

	/*
	 * If there's a discovered mtu int tcp hostcache, use it
	 * else, use the link mtu.
	 */
	if (metrics.rmx_mtu)
		mss = bsd_min(metrics.rmx_mtu, maxmtu) - min_protoh;
	else {
#ifdef INET6
		if (isipv6) {
			mss = maxmtu - min_protoh;
			/* Non-local peers without PMTUD fall back to the default. */
			if (!V_path_mtu_discovery &&
			    !in6_localaddr(&inp->in6p_faddr))
				mss = bsd_min(mss, V_tcp_v6mssdflt);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			mss = maxmtu - min_protoh;
			if (!V_path_mtu_discovery &&
			    !in_localaddr(inp->inp_faddr))
				mss = bsd_min(mss, V_tcp_mssdflt);
		}
#endif
		/*
		 * XXX - The above conditional (mss = maxmtu - min_protoh)
		 * probably violates the TCP spec.
		 * The problem is that, since we don't know the
		 * other end's MSS, we are supposed to use a conservative
		 * default.  But, if we do that, then MTU discovery will
		 * never actually take place, because the conservative
		 * default is much less than the MTUs typically seen
		 * on the Internet today.  For the moment, we'll sweep
		 * this under the carpet.
		 *
		 * The conservative default might not actually be a problem
		 * if the only case this occurs is when sending an initial
		 * SYN with options and data to a host we've never talked
		 * to before.  Then, they will reply with an MSS value which
		 * will get recorded and the new parameters should get
		 * recomputed.  For Further Study.
		 */
	}
	/* Never exceed what the peer offered. */
	mss = bsd_min(mss, offer);

	/*
	 * Sanity check: make sure that maxopd will be large
	 * enough to allow some data on segments even if the
	 * all the option space is used (40bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 */
	mss = bsd_max(mss, 64);

	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	/*
	 * origoffer==-1 indicates that no segments were received yet.
	 * In this case we just guess.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (origoffer == -1 ||
	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
		mss -= TCPOLEN_TSTAMP_APPA;

	/* Round MSS down to a multiple of MCLBYTES for efficient mbuf use. */
#if (MCLBYTES & (MCLBYTES - 1)) == 0
	if (mss > MCLBYTES)
		mss &= ~(MCLBYTES-1);
#else
	if (mss > MCLBYTES)
		mss = mss / MCLBYTES * MCLBYTES;
#endif
	tp->t_maxseg = mss;
}
/*
 * Compute the MSS for this connection (via tcp_mss_update()) and size
 * the socket's send and receive buffers from hostcache pipe metrics.
 */
void
tcp_mss(struct tcpcb *tp, int offer)
{
	int mss;
	u_long bufsize;
	struct inpcb *inp;
	struct socket *so;
	struct hc_metrics_lite metrics;
	int mtuflags = 0;

	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));

	tcp_mss_update(tp, offer, -1, &metrics, &mtuflags);

	mss = tp->t_maxseg;
	inp = tp->t_inpcb;

	/*
	 * If there's a pipesize, change the socket buffer to that size,
	 * don't change if sb_hiwat is different than default (then it
	 * has been changed on purpose with setsockopt).
	 * Make the socket buffers an integral number of mss units;
	 * if the mss is larger than the socket buffer, decrease the mss.
	 */
	so = inp->inp_socket;
	SOCK_LOCK(so);
	if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
		bufsize = metrics.rmx_sendpipe;
	else
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;	/* send buffer smaller than one MSS: shrink MSS */
	else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		/* Only ever grow the buffer; shrinking is left to the user. */
		if (bufsize > so->so_snd.sb_hiwat)
			(void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
	}
	tp->t_maxseg = mss;

	/* Same policy for the receive buffer, but never shrink the MSS. */
	if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
		bufsize = metrics.rmx_recvpipe;
	else
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_rcv.sb_hiwat)
			(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
	}
	SOCK_UNLOCK(so);

	/* Check the interface for TSO capabilities. */
	if (mtuflags & CSUM_TSO)
		tp->t_flags |= TF_TSO;
}
  2881. /*
  2882. * Determine the MSS option to send on an outgoing SYN.
  2883. */
int
tcp_mssopt(struct in_conninfo *inc)
{
	int mss = 0;		/* result; starts at the per-family default */
	u_long maxmtu = 0;	/* MTU of the outgoing interface/route */
	u_long thcmtu = 0;	/* MTU cached in the TCP hostcache */
	size_t min_protoh;	/* IP + TCP header overhead to subtract */

	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		mss = V_tcp_v6mssdflt;
		maxmtu = tcp_maxmtu6(inc, NULL);
		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		mss = V_tcp_mssdflt;
		maxmtu = tcp_maxmtu(inc, NULL);
		min_protoh = sizeof(struct tcpiphdr);
	}
#endif
#if defined(INET6) || defined(INET)
	thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
#endif

	/*
	 * With both an interface MTU and a cached path MTU, be conservative
	 * and honor the smaller; with only one known, use it.  If neither
	 * is known, the per-family default set above stands.
	 */
	if (maxmtu && thcmtu)
		mss = bsd_min(maxmtu, thcmtu) - min_protoh;
	else if (maxmtu || thcmtu)
		mss = bsd_max(maxmtu, thcmtu) - min_protoh;

	return (mss);
}
  2918. /*
  2919. * On a partial ack arrives, force the retransmission of the
  2920. * next unacknowledged segment. Do not clear tp->t_dupacks.
  2921. * By setting snd_nxt to ti_ack, this forces retransmission timer to
  2922. * be started again.
  2923. */
static void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
	tcp_seq onxt = tp->snd_nxt;	/* saved to resume normal sending */
	u_long ocwnd = tp->snd_cwnd;	/* saved; temporarily clamped below */

	INP_LOCK_ASSERT(tp->t_inpcb);

	/* Stop the rexmt timer; tcp_output() below restarts it. */
	tcp_timer_activate(tp, TT_REXMT, 0);
	tp->t_rtttime = 0;
	/* Force retransmission of the first unacknowledged segment. */
	tp->snd_nxt = th->th_ack;
	/*
	 * Set snd_cwnd to one segment beyond acknowledged offset.
	 * (tp->snd_una has not yet been updated when this function is called.)
	 */
	tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	/* Restore cwnd and the original send point. */
	tp->snd_cwnd = ocwnd;
	if (onxt > tp->snd_nxt)
		tp->snd_nxt = onxt;
	/*
	 * Partial window deflation.  Relies on fact that tp->snd_una
	 * not updated yet.
	 */
	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
	else
		tp->snd_cwnd = 0;
	tp->snd_cwnd += tp->t_maxseg;
}
  2953. #include <bsd/sys/net/ethernet.h>
  2954. #include <bsd/sys/net/netisr.h>
  2955. // INP_LOCK held
  2956. static void
  2957. tcp_net_channel_packet(tcpcb* tp, mbuf* m)
  2958. {
  2959. log_packet_handling(m, NETISR_ETHER);
  2960. caddr_t start = m->m_hdr.mh_data;
  2961. auto h = start;
  2962. h += ETHER_HDR_LEN;
  2963. auto ip_hdr = reinterpret_cast<ip*>(h);
  2964. unsigned ip_size = ip_hdr->ip_hl << 2;
  2965. h += ip_size;
  2966. auto th = reinterpret_cast<tcphdr*>(h);
  2967. h += th->th_off << 2;
  2968. auto drop_hdrlen = h - start;
  2969. tcp_fields_to_host(th);
  2970. trace_tcp_input_ack(tp, th->th_ack.raw());
  2971. auto so = tp->t_inpcb->inp_socket;
  2972. auto ip_len = ntohs(ip_hdr->ip_len);
  2973. auto tlen = ip_len - (ip_size + (th->th_off << 2));
  2974. auto iptos = ip_hdr->ip_tos;
  2975. SOCK_LOCK_ASSERT(so);
  2976. bool want_close;
  2977. m_trim(m, ETHER_HDR_LEN + ip_len);
  2978. tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, TI_UNLOCKED, want_close);
  2979. // since a socket is still attached, we should not be closing
  2980. assert(!want_close);
  2981. }
  2982. static ipv4_tcp_conn_id tcp_connection_id(tcpcb* tp)
  2983. {
  2984. auto& conn = tp->t_inpcb->inp_inc.inc_ie;
  2985. return {
  2986. conn.ie_dependfaddr.ie46_foreign.ia46_addr4,
  2987. conn.ie_dependladdr.ie46_local.ia46_addr4,
  2988. ntohs(conn.ie_fport),
  2989. ntohs(conn.ie_lport)
  2990. };
  2991. }
/*
 * Attach a net channel to this connection: packets matching the
 * connection's 4-tuple on `intf` bypass the normal input path and are
 * delivered directly to tcp_net_channel_packet().
 */
void
tcp_setup_net_channel(tcpcb* tp, struct ifnet* intf)
{
	// The handler lambda captures tp by value; the channel must be torn
	// down (tcp_free_net_channel) before tp is destroyed.
	auto nc = new net_channel([=] (mbuf *m) { tcp_net_channel_packet(tp, m); });
	tp->nc = nc;
	tp->nc_intf = intf;
	intf->add_net_channel(nc, tcp_connection_id(tp));
	auto so = tp->t_inpcb->inp_socket;
	so->so_nc = nc;
	if (so->fp) {
		WITH_LOCK(so->fp->f_lock) {
			// Register pollers/epolls already waiting on the file
			// so the new channel wakes them too.
			for (auto&& pl : so->fp->f_poll_list) {
				so->so_nc->add_poller(*pl._req);
			}
			if (so->fp->f_epolls) {
				for (auto&& ep : *so->fp->f_epolls) {
					so->so_nc->add_epoll(ep);
				}
			}
		}
	}
}
  3014. void tcp_teardown_net_channel(tcpcb *tp)
  3015. {
  3016. if (!tp->nc_intf) {
  3017. return;
  3018. }
  3019. tp->nc_intf->del_net_channel(tcp_connection_id(tp));
  3020. tp->nc_intf = nullptr;
  3021. // keep tp->nc around since it might still contain packets
  3022. }
/*
 * Fully release the connection's net channel: detach it from the
 * interface classifier, drop the socket's reference and its pollers,
 * and RCU-dispose of the channel object.
 */
void
tcp_free_net_channel(tcpcb* tp)
{
	if (!tp->nc) {
		return;		// no channel was ever set up
	}
	tcp_teardown_net_channel(tp);
	auto so = tp->t_inpcb->inp_socket;
	if (so && so->fp) {
		// NOTE(review): f_poll_list is iterated here without taking
		// fp->f_lock, unlike tcp_setup_net_channel() — confirm the
		// caller's locking makes this safe.
		for (auto&& pl : so->fp->f_poll_list) {
			so->so_nc->del_poller(*pl._req);
		}
		so->so_nc = nullptr;
	}
	// NOTE(review): tcp_teardown_net_channel() above already cleared
	// nc_intf (or it was already null), so this branch looks
	// unreachable — confirm before removing.
	if (tp->nc_intf) {
		tp->nc_intf->del_net_channel(tcp_connection_id(tp));
	}
	// Defer destruction past any concurrent RCU readers of the channel.
	osv::rcu_dispose(tp->nc);
	tp->nc = nullptr;
}
  3043. void
  3044. tcp_flush_net_channel(tcpcb *tp)
  3045. {
  3046. auto nc = tp->nc;
  3047. if (nc) {
  3048. nc->process_queue();
  3049. }
  3050. }