/bsd/sys/netinet/tcp_input.cc
C++ | 3287 lines | 1953 code | 237 blank | 1097 comment | 548 complexity | 0d391bf1f1401a73c33885330ac4d916 MD5 | raw file
Possible License(s): BSD-3-Clause, 0BSD, MPL-2.0-no-copyleft-exception
Large files are truncated, but you can click here to view the full file
- /*-
- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
- * The Regents of the University of California. All rights reserved.
- * Copyright (c) 2007-2008,2010
- * Swinburne University of Technology, Melbourne, Australia.
- * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
- * Copyright (c) 2010 The FreeBSD Foundation
- * Copyright (c) 2010-2011 Juniper Networks, Inc.
- * All rights reserved.
- *
- * Portions of this software were developed at the Centre for Advanced Internet
- * Architectures, Swinburne University of Technology, by Lawrence Stewart,
- * James Healy and David Hayes, made possible in part by a grant from the Cisco
- * University Research Program Fund at Community Foundation Silicon Valley.
- *
- * Portions of this software were developed at the Centre for Advanced
- * Internet Architectures, Swinburne University of Technology, Melbourne,
- * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
- *
- * Portions of this software were developed by Robert N. M. Watson under
- * contract to Juniper Networks, Inc.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
- */
- #include <sys/cdefs.h>
- #include <bsd/porting/netport.h>
- #include <bsd/porting/sync_stub.h>
- #include <bsd/porting/uma_stub.h>
- #include <bsd/sys/sys/libkern.h>
- #include <bsd/sys/sys/param.h>
- #include <bsd/sys/sys/mbuf.h>
- #include <bsd/sys/sys/protosw.h>
- #include <bsd/sys/sys/socket.h>
- #include <bsd/sys/sys/socketvar.h>
- #include <bsd/sys/net/if.h>
- #include <bsd/sys/net/route.h>
- #include <bsd/sys/net/vnet.h>
- #define TCPSTATES /* for logging */
- #include <bsd/sys/netinet/cc.h>
- #include <bsd/sys/netinet/in.h>
- #include <bsd/sys/netinet/in_pcb.h>
- #include <bsd/sys/netinet/in_systm.h>
- #include <bsd/sys/netinet/in_var.h>
- #include <bsd/sys/netinet/ip.h>
- #include <bsd/sys/netinet/ip_icmp.h> /* required for icmp_var.h */
- #include <bsd/sys/netinet/icmp_var.h> /* for ICMP_BANDLIM */
- #include <bsd/sys/netinet/ip_var.h>
- #include <bsd/sys/netinet/ip_options.h>
- #include <bsd/sys/netinet/tcp_fsm.h>
- #include <bsd/sys/netinet/tcp_seq.h>
- #include <bsd/sys/netinet/tcp_timer.h>
- #include <bsd/sys/netinet/tcp_var.h>
- #include <bsd/sys/netinet/tcpip.h>
- #include <bsd/sys/netinet/tcp_syncache.h>
- #ifdef TCPDEBUG
- #include <netinet/tcp_debug.h>
- #endif /* TCPDEBUG */
- #include <machine/in_cksum.h>
- #include <osv/poll.h>
- #include <osv/net_trace.hh>
- TRACEPOINT(trace_tcp_input_ack, "%p: We've got ACK: %u", void*, unsigned int); /* tracepoint taking a pointer and an unsigned ACK value */
- const int tcprexmtthresh = 3; /* presumably the duplicate-ACK count that triggers fast retransmit; TODO confirm against tcp_do_segment() */
- VNET_DEFINE(struct tcpstat, tcpstat);
- SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
- &VNET_NAME(tcpstat), tcpstat,
- "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
- int tcp_log_in_vain = 0; /* tcp_input(): 1 logs SYN segments that match no PCB, 2 logs every unmatched segment */
- SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
- &tcp_log_in_vain, 0,
- "Log all incoming TCP segments to closed ports");
- VNET_DEFINE(int, blackhole) = 0; /* tcp_input(): 1 drops unmatched SYN segments without a RST, 2 drops every unmatched segment */
- #define V_blackhole VNET(blackhole)
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
- &VNET_NAME(blackhole), 0,
- "Do not send RST on segments to closed ports");
- VNET_DEFINE(int, tcp_delack_enabled) = 1;
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
- &VNET_NAME(tcp_delack_enabled), 0,
- "Delay ACK to try and piggyback it onto a data packet");
- VNET_DEFINE(int, drop_synfin) = 0; /* when nonzero, tcp_input() ignores SYN|FIN segments arriving on a listen socket */
- #define V_drop_synfin VNET(drop_synfin)
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
- &VNET_NAME(drop_synfin), 0,
- "Drop TCP packets with SYN+FIN set");
- VNET_DEFINE(int, tcp_do_rfc3042) = 1;
- #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
- &VNET_NAME(tcp_do_rfc3042), 0,
- "Enable RFC 3042 (Limited Transmit)");
- VNET_DEFINE(int, tcp_do_rfc3390) = 1; /* when nonzero, cc_conn_init() starts cwnd at min(4 * maxseg, max(2 * maxseg, 4380)) */
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
- &VNET_NAME(tcp_do_rfc3390), 0,
- "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
- VNET_DEFINE(int, tcp_do_rfc3465) = 1;
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
- &VNET_NAME(tcp_do_rfc3465), 0,
- "Enable RFC 3465 (Appropriate Byte Counting)");
- VNET_DEFINE(int, tcp_abc_l_var) = 2; /* cc_ack_received() credits at most this many segments' worth of bytes per ACK */
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
- &VNET_NAME(tcp_abc_l_var), 2, /* NOTE(review): the sibling SYSCTL_VNET_INT entries pass 0 in this position; confirm the 2 is intended */
- "Cap the max cwnd increment during slow-start to this number of segments");
- SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); /* parent node for the two _net_inet_tcp_ecn entries below */
- VNET_DEFINE(int, tcp_do_ecn) = 0;
- SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
- &VNET_NAME(tcp_do_ecn), 0,
- "TCP ECN support");
- VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
- SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
- &VNET_NAME(tcp_ecn_maxretries), 0,
- "Max retries before giving up on ECN");
- VNET_DEFINE(int, tcp_insecure_rst) = 0;
- #define V_tcp_insecure_rst VNET(tcp_insecure_rst)
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
- &VNET_NAME(tcp_insecure_rst), 0,
- "Follow the old (insecure) criteria for accepting RST packets");
- VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
- #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
- &VNET_NAME(tcp_do_autorcvbuf), 0,
- "Enable automatic receive buffer sizing");
- VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; /* 16 KiB */
- #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
- &VNET_NAME(tcp_autorcvbuf_inc), 0,
- "Incrementor step size of automatic receive buffer");
- VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; /* 2 MiB */
- #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
- SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
- &VNET_NAME(tcp_autorcvbuf_max), 0,
- "Max size of automatic receive buffer");
- VNET_DEFINE(struct inpcbhead, tcb);
- #define tcb6 tcb /* for KAME src sync over BSD*'s */
- VNET_DEFINE(struct inpcbinfo, tcbinfo); /* accessed as V_tcbinfo; tcp_input() takes its write lock and uses it for the PCB lookup */
- static void tcp_dooptions(struct tcpopt *, u_char *, int, int); /* forward declarations of file-local helpers start here */
- static void tcp_do_segment(struct mbuf *, struct tcphdr *,
- struct socket *, struct tcpcb *, int, int, uint8_t,
- int, bool& want_close); /* want_close is an out-parameter: tcp_input() calls tcp_close() when it comes back true */
- static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
- struct tcpcb *, int, int);
- static void tcp_pulloutofband(struct socket *,
- struct tcphdr *, struct mbuf *, int);
- static void tcp_xmit_timer(struct tcpcb *, int);
- static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
- static void inline tcp_fields_to_host(struct tcphdr *); /* defined below, where it is spelled "static inline void" */
- #ifdef TCP_SIGNATURE /* these two prototypes exist only when TCP_SIGNATURE is defined */
- static void inline tcp_fields_to_net(struct tcphdr *);
- static int inline tcp_signature_verify_input(struct mbuf *, int, int,
- int, struct tcpopt *, struct tcphdr *, u_int);
- #endif
- static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
- uint16_t type); /* the cc_* helpers wrap the hooks reached through CC_ALGO(tp) */
- static void inline cc_conn_init(struct tcpcb *tp);
- static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
- /*
-  * Kernel-module entry point for bumping a TCP statistics counter.
-  *
-  * tcpstat is viewed as a flat array of u_long and statnum selects the slot
-  * to increment.  The caller therefore has to know the layout of tcpstat,
-  * but not where it lives, so a later change of storage (per-CPU stats,
-  * for example) does not break binary compatibility for kernel modules.
-  *
-  * statnum is not range-checked here; callers must pass a valid index.
-  */
- void
- kmod_tcpstat_inc(int statnum)
- {
-     u_long *counters = (u_long *)&V_tcpstat;
-     counters[statnum] += 1;
- }
- /*
- * CC wrapper hook functions
- */
- /*
-  * Feed an incoming ACK to the congestion-control module.
-  *
-  * tp   - connection the ACK belongs to; its inpcb lock must be held.
-  * th   - TCP header of the ACK (th_ack is copied into ccv->curack).
-  * type - kind of ACK; the byte-counting state below is only touched
-  *        when it equals CC_ACK.
-  *
-  * Records BYTES_THIS_ACK in the cc state, refreshes CCF_CWND_LIMITED,
-  * maintains t_bytes_acked / CCF_ABC_SENTAWND, and then invokes the
-  * algorithm's ack_received hook if it has one.
-  */
- static void inline
- cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
- {
-     INP_LOCK_ASSERT(tp->t_inpcb);
-     auto ccv = tp->ccv;
-     ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
-     /* min(cwnd, wnd) == cwnd means cwnd is the binding send limit. */
-     if (tp->snd_cwnd != bsd_min(tp->snd_cwnd, tp->snd_wnd))
-         ccv->flags &= ~CCF_CWND_LIMITED;
-     else
-         ccv->flags |= CCF_CWND_LIMITED;
-     if (type == CC_ACK) {
-         const bool above_ssthresh = tp->snd_cwnd > tp->snd_ssthresh;
-         if (!above_ssthresh) {
-             /* cwnd not above ssthresh: byte counting starts over. */
-             ccv->flags &= ~CCF_ABC_SENTAWND;
-             tp->t_bytes_acked = 0;
-         } else {
-             /* Credit this ACK, capped at abc_l_var segments. */
-             tp->t_bytes_acked += bsd_min(ccv->bytes_this_ack,
-                 V_tcp_abc_l_var * tp->t_maxseg);
-             /* A full cwnd has been acknowledged: flag it, keep the rest. */
-             if (tp->t_bytes_acked >= tp->snd_cwnd) {
-                 tp->t_bytes_acked -= tp->snd_cwnd;
-                 ccv->flags |= CCF_ABC_SENTAWND;
-             }
-         }
-     }
-     auto algo = CC_ALGO(tp);
-     if (algo->ack_received != NULL) {
-         /* XXXLAS: Find a way to live without this */
-         ccv->curack = th->th_ack;
-         algo->ack_received(ccv, type);
-     }
- }
- /*
-  * Set up congestion-control state when a connection is initialised.
-  *
-  * tp - connection being initialised; its inpcb lock must be held.
-  *
-  * Looks up the host-cache entry for the peer (tcp_hc_get) and uses it to
-  * seed the RTT estimators (only while tp->t_srtt is still 0 and a cached
-  * RTT exists) and the slow-start threshold (when one is cached).  Then
-  * picks the initial congestion window and calls the congestion-control
-  * algorithm's conn_init hook if it has one.
-  */
- static void inline
- cc_conn_init(struct tcpcb *tp)
- {
-     struct hc_metrics_lite metrics;
-     struct inpcb *inp = tp->t_inpcb;
-     int rtt;
-     INP_LOCK_ASSERT(tp->t_inpcb);
-     tcp_hc_get(&inp->inp_inc, &metrics);
-     if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
-         tp->t_srtt = rtt;
-         tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
-         TCPSTAT_INC(tcps_usedrtt);
-         if (metrics.rmx_rttvar) {
-             tp->t_rttvar = metrics.rmx_rttvar;
-             TCPSTAT_INC(tcps_usedrttvar);
-         } else {
-             /* default variation is +- 1 rtt */
-             tp->t_rttvar =
-                 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
-         }
-         /* Derive the retransmit timeout and clamp it to [t_rttmin, TCPTV_REXMTMAX]. */
-         TCPT_RANGESET(tp->t_rxtcur,
-             ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
-             tp->t_rttmin, TCPTV_REXMTMAX);
-     }
-     if (metrics.rmx_ssthresh) {
-         /*
-          * There's some sort of gateway or interface
-          * buffer limit on the path. Use this to set
-          * the slow start threshold, but set the
-          * threshold to no less than 2*mss.
-          */
-         tp->snd_ssthresh = bsd_max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
-         TCPSTAT_INC(tcps_usedssthresh);
-     }
-     /*
-      * Set the slow-start flight size depending on whether this
-      * is a local network or not.
-      *
-      * Extend this so we cache the cwnd too and retrieve it here.
-      * Make cwnd even bigger than RFC3390 suggests but only if we
-      * have previous experience with the remote host. Be careful
-      * not to make cwnd bigger than the remote receive window or our
-      * own send socket buffer. Maybe put some additional upper bound
-      * on the retrieved cwnd. Should do incremental updates to
-      * hostcache when cwnd collapses so the next connection doesn't
-      * overload the path again.
-      *
-      * XXXAO: Initializing the CWND from the hostcache is broken
-      * and in its current form not RFC conformant. It is disabled
-      * until fixed or removed entirely.
-      *
-      * RFC3390 says only do this if SYN or SYN/ACK didn't get lost.
-      * We currently check only in syncache_socket for that.
-      */
- /* #define TCP_METRICS_CWND */
- #ifdef TCP_METRICS_CWND
-     /*
-      * Disabled branch.  The socket is reached through inp->inp_socket;
-      * this function has no local socket pointer.
-      */
-     if (metrics.rmx_cwnd)
-         tp->snd_cwnd = bsd_max(tp->t_maxseg, bsd_min(metrics.rmx_cwnd / 2,
-             bsd_min(tp->snd_wnd, inp->inp_socket->so_snd.sb_hiwat)));
-     else
- #endif
-     if (V_tcp_do_rfc3390)
-         tp->snd_cwnd = bsd_min(4 * tp->t_maxseg,
-             bsd_max(2 * tp->t_maxseg, 4380));
-     else if (in_localaddr(inp->inp_faddr))
-         tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local;
-     else
-         tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz;
-     if (CC_ALGO(tp)->conn_init != NULL)
-         CC_ALGO(tp)->conn_init(tp->ccv);
- }
- /*
-  * Apply a congestion event to the connection and pass it on to the
-  * congestion-control module.
-  *
-  * tp   - connection affected; its inpcb lock must be held.
-  * th   - header of the segment that triggered the event, or NULL; when
-  *        non-NULL its th_ack is stored in ccv->curack before the hook runs.
-  * type - CC_NDUPACK, CC_ECN, CC_RTO or CC_RTO_ERR; any other value skips
-  *        the local bookkeeping and goes straight to the hook.
-  */
- void
- cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
- {
-     INP_LOCK_ASSERT(tp->t_inpcb);
-     switch (type) {
-     case CC_NDUPACK:
-         /* Already in fast recovery: nothing to record. */
-         if (IN_FASTRECOVERY(tp->t_flags))
-             break;
-         tp->snd_recover = tp->snd_max;
-         if (tp->t_flags & TF_ECN_PERMIT)
-             tp->t_flags |= TF_ECN_SND_CWR;
-         break;
-     case CC_ECN:
-         /* Already in congestion recovery: nothing to record. */
-         if (IN_CONGRECOVERY(tp->t_flags))
-             break;
-         TCPSTAT_INC(tcps_ecn_rcwnd);
-         tp->snd_recover = tp->snd_max;
-         if (tp->t_flags & TF_ECN_PERMIT)
-             tp->t_flags |= TF_ECN_SND_CWR;
-         break;
-     case CC_RTO: {
-         tp->t_dupacks = 0;
-         tp->t_bytes_acked = 0;
-         EXIT_RECOVERY(tp->t_flags);
-         /*
-          * ssthresh becomes half of the smaller of the two windows,
-          * counted in whole segments and never fewer than two.
-          */
-         auto half_window_segs =
-             bsd_min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
-         tp->snd_ssthresh = bsd_max(2, half_window_segs) * tp->t_maxseg;
-         /* cwnd restarts at one segment. */
-         tp->snd_cwnd = tp->t_maxseg;
-         break;
-     }
-     case CC_RTO_ERR:
-         TCPSTAT_INC(tcps_sndrexmitbad);
-         /* RTO was unnecessary, so reset everything. */
-         tp->snd_cwnd = tp->snd_cwnd_prev;
-         tp->snd_ssthresh = tp->snd_ssthresh_prev;
-         tp->snd_recover = tp->snd_recover_prev;
-         if (tp->t_flags & TF_WASFRECOVERY)
-             ENTER_FASTRECOVERY(tp->t_flags);
-         if (tp->t_flags & TF_WASCRECOVERY)
-             ENTER_CONGRECOVERY(tp->t_flags);
-         tp->snd_nxt = tp->snd_max;
-         tp->t_flags &= ~TF_PREVVALID;
-         tp->t_badrxtwin = 0;
-         break;
-     default:
-         break;
-     }
-     auto algo = CC_ALGO(tp);
-     if (algo->cong_signal == NULL)
-         return;
-     if (th != NULL)
-         tp->ccv->curack = th->th_ack;
-     algo->cong_signal(tp->ccv, type);
- }
- /*
-  * Tell the congestion-control module that recovery has finished.
-  *
-  * tp - connection leaving recovery; its inpcb lock must be held.
-  * th - header of the ACK involved; th_ack is stored in ccv->curack
-  *      before the algorithm's post_recovery hook (if any) is called.
-  *
-  * t_bytes_acked is cleared whether or not a hook exists.
-  */
- static void inline
- cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
- {
-     INP_LOCK_ASSERT(tp->t_inpcb);
-     /* XXXLAS: KASSERT that we're in recovery? */
-     auto algo = CC_ALGO(tp);
-     if (algo->post_recovery != NULL) {
-         auto ccv = tp->ccv;
-         ccv->curack = th->th_ack;
-         algo->post_recovery(ccv);
-     }
-     /* XXXLAS: EXIT_RECOVERY ? */
-     tp->t_bytes_acked = 0;
- }
- /*
-  * Convert a TCP header's multi-byte fields from network to host byte
-  * order, in place.  The window and urgent pointer go through ntohs(),
-  * the sequence and acknowledgement numbers through ntohl().
-  */
- static inline void
- tcp_fields_to_host(struct tcphdr *th)
- {
-     /* fields converted with ntohs() */
-     th->th_urp = ntohs(th->th_urp);
-     th->th_win = ntohs(th->th_win);
-     /* fields converted with ntohl() */
-     th->th_ack = ntohl(th->th_ack);
-     th->th_seq = ntohl(th->th_seq);
- }
- /*
-  * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
-  * Expands to nothing in this file.
-  */
- #define ND6_HINT(tp)
- /*
-  * Indicate whether this ack should be delayed. We can delay the ack if
-  * - there is no delayed ack timer in progress and
-  * - our last ack wasn't a 0-sized window (we never want to delay
-  *   the ack that opens up a 0-sized window) and
-  * - either delayed acks are enabled
-  *   or this is a half-synchronized T/TCP connection (TF_NEEDSYN).
-  *
-  * The argument is evaluated more than once and is parenthesized at every
-  * use so that any pointer-valued expression can be passed safely.
-  */
- #define DELAY_ACK(tp) \
-     ((!tcp_timer_active((tp), TT_DELACK) && \
-     ((tp)->t_flags & TF_RXWIN0SENT) == 0) && \
-     (V_tcp_delack_enabled || ((tp)->t_flags & TF_NEEDSYN)))
- void
- tcp_input(struct mbuf *m, int off0) /*
- * Process one received TCP segment.
- *
- * m    - mbuf chain holding the IP header followed by the TCP segment.
- * off0 - offset from the start of the IP header to the TCP header; when
- *        it exceeds sizeof(struct ip) the IP options are stripped and
- *        off0 is reset to sizeof(struct ip).
- *
- * Steps, in order: pull the IP and TCP headers into the first mbuf,
- * verify the checksum, validate th_off and locate any TCP options,
- * convert the header to host order, look up the inpcb, then dispatch:
- * TIMEWAIT pcbs go to tcp_twcheck(), listening sockets go through the
- * syncache, everything else goes to tcp_do_segment().
- *
- * Nothing is returned.  Paths that reach the "drop" label free any mbuf
- * still owned here with m_freem().  The other return paths either follow
- * a failed m_pullup() (m is NULL by then) or come after the segment was
- * passed to tcp_twcheck(), tcp_do_segment() or syncache_add().
- *
- * Locking: the V_tcbinfo write lock is taken up front when the segment
- * carries SYN, FIN or RST, and is otherwise acquired on demand for
- * TIMEWAIT or non-ESTABLISHED connections; ti_locked records whether this
- * function currently holds it.
- */
- {
- struct tcphdr *th = NULL;
- struct ip *ip = NULL;
- struct ipovly *ipov;
- struct inpcb *inp = NULL;
- struct tcpcb *tp = NULL;
- struct socket *so = NULL;
- u_char *optp = NULL;
- int optlen = 0;
- int len;
- int tlen = 0, off;
- int drop_hdrlen;
- int thflags;
- int rstreason = 0; /* For badport_bandlim accounting purposes */
- uint8_t iptos = 0;
- const void *ip6 = NULL;
- struct tcpopt to; /* options in this segment */
- char *s = NULL; /* address and port logging */
- int ti_locked; /* TI_UNLOCKED or TI_WLOCKED: whether we hold the V_tcbinfo write lock */
- #define TI_UNLOCKED 1
- #define TI_WLOCKED 2
- #ifdef TCPDEBUG
- /*
- * The size of tcp_saveipgen must be the size of the max ip header,
- * now IPv6.
- */
- u_char tcp_saveipgen[IP6_HDR_LEN];
- struct tcphdr tcp_savetcp;
- short ostate = 0;
- #endif
- to.to_flags = 0;
- TCPSTAT_INC(tcps_rcvtotal);
- /*
- * Get IP and TCP header together in first mbuf.
- * Note: IP leaves IP header in first mbuf.
- */
- if (off0 > sizeof (struct ip)) {
- ip_stripoptions(m, (struct mbuf *)0);
- off0 = sizeof(struct ip);
- }
- if (m->m_hdr.mh_len < sizeof (struct tcpiphdr)) {
- if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
- == NULL) {
- TCPSTAT_INC(tcps_rcvshort);
- return;
- }
- }
- ip = mtod(m, struct ip *);
- ipov = (struct ipovly *)ip;
- th = (struct tcphdr *)((caddr_t)ip + off0);
- tlen = ip->ip_len; /* NOTE(review): used without ntohs(); presumably the IP layer left a host-order length that excludes the IP header - confirm */
- if (m->M_dat.MH.MH_pkthdr.csum_flags & CSUM_DATA_VALID) {
- if (m->M_dat.MH.MH_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
- th->th_sum = m->M_dat.MH.MH_pkthdr.csum_data;
- else
- th->th_sum = in_pseudo(ip->ip_src.s_addr,
- ip->ip_dst.s_addr,
- htonl(m->M_dat.MH.MH_pkthdr.csum_data +
- ip->ip_len +
- IPPROTO_TCP));
- th->th_sum ^= 0xffff; /* flip the low 16 bits; the check below treats any nonzero th_sum as a bad checksum */
- #ifdef TCPDEBUG
- ipov->ih_len = (u_short)tlen;
- ipov->ih_len = htons(ipov->ih_len);
- #endif
- } else {
- /*
- * Checksum extended TCP header and data.
- */
- len = sizeof (struct ip) + tlen;
- bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
- ipov->ih_len = (u_short)tlen;
- ipov->ih_len = htons(ipov->ih_len);
- th->th_sum = in_cksum(m, len);
- }
- if (th->th_sum) {
- TCPSTAT_INC(tcps_rcvbadsum);
- goto drop;
- }
- /* Re-initialization for later version check */
- ip->ip_v = IPVERSION;
- iptos = ip->ip_tos;
- /*
- * Check that TCP offset makes sense,
- * pull out TCP options and adjust length. XXX
- */
- off = th->th_off << 2;
- if (off < sizeof (struct tcphdr) || off > tlen) {
- TCPSTAT_INC(tcps_rcvbadoff);
- goto drop;
- }
- tlen -= off; /* tlen is used instead of ti->ti_len */
- if (off > sizeof (struct tcphdr)) {
- if (m->m_hdr.mh_len < sizeof(struct ip) + off) {
- if ((m = m_pullup(m, sizeof (struct ip) + off))
- == NULL) {
- TCPSTAT_INC(tcps_rcvshort);
- return;
- }
- ip = mtod(m, struct ip *); /* m may have changed: recompute every pointer into it */
- ipov = (struct ipovly *)ip;
- th = (struct tcphdr *)((caddr_t)ip + off0);
- }
- optlen = off - sizeof (struct tcphdr);
- optp = (u_char *)(th + 1);
- }
- thflags = th->th_flags;
- /*
- * Convert TCP protocol specific fields to host format.
- */
- tcp_fields_to_host(th);
- /*
- * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
- */
- drop_hdrlen = off0 + off;
- /*
- * Locate pcb for segment; if we're likely to add or remove a
- * connection then first acquire pcbinfo lock. There are two cases
- * where we might discover later we need a write lock despite the
- * flags: ACKs moving a connection out of the syncache, and ACKs for
- * a connection in TIMEWAIT.
- */
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) {
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
- } else
- ti_locked = TI_UNLOCKED;
- findpcb:
- #ifdef INVARIANTS
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- } else {
- INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- }
- #endif
- inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
- th->th_sport, ip->ip_dst, th->th_dport,
- INPLOOKUP_WILDCARD | INPLOOKUP_LOCKPCB,
- m->M_dat.MH.MH_pkthdr.rcvif, m);
- /*
- * If the INPCB does not exist then all data in the incoming
- * segment is discarded and an appropriate RST is sent back.
- * XXX MRT Send RST using which routing table?
- */
- if (inp == NULL) {
- /*
- * Log communication attempts to ports that are not
- * in use.
- */
- if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
- tcp_log_in_vain == 2) {
- if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6)))
- bsd_log(LOG_INFO, "%s; %s: Connection attempt "
- "to closed port\n", s, __func__);
- }
- /*
- * When blackholing do not respond with a RST but
- * completely ignore the segment and drop it.
- */
- if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
- V_blackhole == 2)
- goto dropunlock;
- rstreason = BANDLIM_RST_CLOSEDPORT;
- goto dropwithreset;
- }
- INP_LOCK_ASSERT(inp);
- if (!(inp->inp_flags & INP_HW_FLOWID)
- && (m->m_hdr.mh_flags & M_FLOWID)
- && ((inp->inp_socket == NULL)
- || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) {
- inp->inp_flags |= INP_HW_FLOWID; /* adopt the flow id carried by the mbuf, for pcbs that are not listening sockets */
- inp->inp_flags &= ~INP_SW_FLOWID;
- inp->inp_flowid = m->M_dat.MH.MH_pkthdr.flowid;
- }
- /*
- * Check the minimum TTL for socket.
- */
- if (inp->inp_ip_minttl != 0) {
- if (inp->inp_ip_minttl > ip->ip_ttl)
- goto dropunlock;
- }
- /*
- * A previous connection in TIMEWAIT state is supposed to catch stray
- * or duplicate segments arriving late. If this segment was a
- * legitimate new connection attempt the old INPCB gets removed and
- * we can try again to find a listening socket.
- *
- * At this point, due to earlier optimism, we may hold only an inpcb
- * lock, and not the inpcbinfo write lock. If so, we need to try to
- * acquire it, or if that fails, acquire a reference on the inpcb,
- * drop all locks, acquire a global write lock, and then re-acquire
- * the inpcb lock. We may at that point discover that another thread
- * has tried to free the inpcb, in which case we need to loop back
- * and try to find a new inpcb to deliver to.
- *
- * XXXRW: It may be time to rethink timewait locking.
- */
- relocked:
- if (inp->inp_flags & INP_TIMEWAIT) {
- if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { /* trylock failed: take a ref, drop the inp lock, block on the write lock, relock inp */
- in_pcbref(inp);
- INP_UNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
- INP_LOCK(inp);
- if (in_pcbrele_locked(inp)) { /* nonzero: inp is treated as gone, so redo the lookup */
- inp = NULL;
- goto findpcb;
- }
- } else
- ti_locked = TI_WLOCKED;
- }
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- if (thflags & TH_SYN)
- tcp_dooptions(&to, optp, optlen, TO_SYN);
- /*
- * NB: tcp_twcheck unlocks the INP and frees the mbuf.
- */
- if (tcp_twcheck(inp, &to, th, m, tlen))
- goto findpcb;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- return;
- }
- /*
- * The TCPCB may no longer exist if the connection is winding
- * down or it is in the CLOSED state. Either way we drop the
- * segment and send an appropriate response.
- */
- tp = intotcpcb(inp);
- if (tp == NULL || tp->get_state() == TCPS_CLOSED) {
- rstreason = BANDLIM_RST_CLOSEDPORT;
- goto dropwithreset;
- }
- // We may be processing a FIN here, process all preceding
- // normal packets first.
- tcp_flush_net_channel(tp);
- /*
- * We've identified a valid inpcb, but it could be that we need an
- * inpcbinfo write lock but don't hold it. In this case, attempt to
- * acquire using the same strategy as the TIMEWAIT case above. If we
- * relock, we have to jump back to 'relocked' as the connection might
- * now be in TIMEWAIT.
- */
- #ifdef INVARIANTS
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0)
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- #endif
- if (tp->get_state() != TCPS_ESTABLISHED) {
- if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { /* same fallback sequence as in the TIMEWAIT case above */
- in_pcbref(inp);
- INP_UNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
- INP_LOCK(inp);
- if (in_pcbrele_locked(inp)) {
- inp = NULL;
- goto findpcb;
- }
- goto relocked;
- } else
- ti_locked = TI_WLOCKED;
- }
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- }
- so = inp->inp_socket;
- KASSERT(so != NULL, ("%s: so == NULL", __func__));
- #ifdef TCPDEBUG
- if (so->so_options & SO_DEBUG) {
- ostate = tp->get_state();
- bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
- tcp_savetcp = *th;
- }
- #endif /* TCPDEBUG */
- /*
- * When the socket is accepting connections (the INPCB is in LISTEN
- * state) we look into the SYN cache if this is a new connection
- * attempt or the completion of a previous one. Because listen
- * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be
- * held in this case.
- */
- if (so->so_options & SO_ACCEPTCONN) {
- struct in_conninfo inc;
- KASSERT(tp->get_state() == TCPS_LISTEN, ("%s: so accepting but "
- "tp not listening", __func__));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- bzero(&inc, sizeof(inc));
- {
- inc.inc_faddr = ip->ip_src;
- inc.inc_laddr = ip->ip_dst;
- }
- inc.inc_fport = th->th_sport;
- inc.inc_lport = th->th_dport;
- inc.inc_fibnum = so->so_fibnum;
- /*
- * Check for an existing connection attempt in syncache if
- * the flag is only ACK. A successful lookup creates a new
- * socket appended to the listen queue in SYN_RECEIVED state.
- */
- if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
- /*
- * Parse the TCP options here because
- * syncookies need access to the reflected
- * timestamp.
- */
- tcp_dooptions(&to, optp, optlen, 0);
- /*
- * NB: syncache_expand() doesn't unlock
- * inp and tcpinfo locks.
- */
- if (!syncache_expand(&inc, &to, th, &so, m)) {
- /*
- * No syncache entry or ACK was not
- * for our SYN/ACK. Send a RST.
- * NB: syncache did its own logging
- * of the failure cause.
- */
- rstreason = BANDLIM_RST_OPENPORT;
- goto dropwithreset;
- }
- if (so == NULL) {
- /*
- * We completed the 3-way handshake
- * but could not allocate a socket
- * either due to memory shortage,
- * listen queue length limits or
- * global socket limits. Send RST
- * or wait and have the remote end
- * retransmit the ACK for another
- * try.
- */
- if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
- bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
- "Socket allocation failed due to "
- "limits or memory shortage, %s\n",
- s, __func__,
- V_tcp_sc_rst_sock_fail ?
- "sending RST" : "try again");
- if (V_tcp_sc_rst_sock_fail) {
- rstreason = BANDLIM_UNLIMITED;
- goto dropwithreset;
- } else
- goto dropunlock;
- }
- /*
- * Socket is created in state SYN_RECEIVED.
- * Unlock the listen socket, lock the newly
- * created socket and update the tp variable.
- */
- INP_UNLOCK(inp); /* listen socket */
- inp = sotoinpcb(so);
- INP_LOCK(inp); /* new connection */
- tp = intotcpcb(inp);
- KASSERT(tp->get_state() == TCPS_SYN_RECEIVED,
- ("%s: ", __func__));
- /*
- * Process the segment and the data it
- * contains. tcp_do_segment() consumes
- * the mbuf chain.
- */
- bool want_close; /* out-parameter: tcp_do_segment() assigns it at entry */
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
- iptos, ti_locked, want_close);
- INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- // if tcp_close() indeed closes, it also unlocks
- if (!want_close || tcp_close(tp)) {
- INP_UNLOCK(inp);
- }
- return;
- }
- /*
- * Segment flag validation for new connection attempts:
- *
- * Our (SYN|ACK) response was rejected.
- * Check with syncache and remove entry to prevent
- * retransmits.
- *
- * NB: syncache_chkrst does its own logging of failure
- * causes.
- */
- if (thflags & TH_RST) {
- syncache_chkrst(&inc, th);
- goto dropunlock;
- }
- /*
- * We can't do anything without SYN.
- */
- if ((thflags & TH_SYN) == 0) {
- if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
- bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
- "SYN is missing, segment ignored\n",
- s, __func__);
- TCPSTAT_INC(tcps_badsyn);
- goto dropunlock;
- }
- /*
- * (SYN|ACK) is bogus on a listen socket.
- */
- if (thflags & TH_ACK) {
- if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
- bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
- "SYN|ACK invalid, segment rejected\n",
- s, __func__);
- syncache_badack(&inc); /* XXX: Not needed! */
- TCPSTAT_INC(tcps_badsyn);
- rstreason = BANDLIM_RST_OPENPORT;
- goto dropwithreset;
- }
- /*
- * If the drop_synfin option is enabled, drop all
- * segments with both the SYN and FIN bits set.
- * This prevents e.g. nmap from identifying the
- * TCP/IP stack.
- * XXX: Poor reasoning. nmap has other methods
- * and is constantly refining its stack detection
- * strategies.
- * XXX: This is a violation of the TCP specification
- * and was used by RFC1644.
- */
- if ((thflags & TH_FIN) && V_drop_synfin) {
- if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
- bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
- "SYN|FIN segment ignored (based on "
- "sysctl setting)\n", s, __func__);
- TCPSTAT_INC(tcps_badsyn);
- goto dropunlock;
- }
- /*
- * Segment's flags are (SYN) or (SYN|FIN).
- *
- * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
- * as they do not affect the state of the TCP FSM.
- * The data pointed to by TH_URG and th_urp is ignored.
- */
- KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
- ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
- KASSERT(thflags & (TH_SYN),
- ("%s: Listen socket: TH_SYN not set", __func__));
- /*
- * Basic sanity checks on incoming SYN requests:
- * Don't respond if the destination is a link layer
- * broadcast according to RFC1122 4.2.3.10, p. 104.
- * If it is from this socket it must be forged.
- * Don't respond if the source or destination is a
- * global or subnet broad- or multicast address.
- * Note that it is quite possible to receive unicast
- * link-layer packets with a broadcast IP address. Use
- * in_broadcast() to find them.
- */
- if (m->m_hdr.mh_flags & (M_BCAST|M_MCAST)) {
- if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
- bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
- "Connection attempt from broad- or multicast "
- "link layer address ignored\n", s, __func__);
- goto dropunlock;
- }
- if (th->th_dport == th->th_sport &&
- ip->ip_dst.s_addr == ip->ip_src.s_addr) {
- if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
- bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
- "Connection attempt from/to self "
- "ignored\n", s, __func__);
- goto dropunlock;
- }
- if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
- IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
- ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
- in_broadcast(ip->ip_dst, m->M_dat.MH.MH_pkthdr.rcvif)) {
- if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
- bsd_log(LOG_DEBUG, "%s; %s: Listen socket: "
- "Connection attempt from/to broad- "
- "or multicast address ignored\n",
- s, __func__);
- goto dropunlock;
- }
- /*
- * SYN appears to be valid. Create compressed TCP state
- * for syncache.
- */
- #ifdef TCPDEBUG
- if (so->so_options & SO_DEBUG)
- tcp_trace(TA_INPUT, ostate, tp,
- (void *)tcp_saveipgen, &tcp_savetcp, 0);
- #endif
- tcp_dooptions(&to, optp, optlen, TO_SYN);
- syncache_add(&inc, &to, th, inp, &so, m);
- /*
- * Entry added to syncache and mbuf consumed.
- * Everything already unlocked by syncache_add().
- */
- INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- return;
- }
- /*
- * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
- * state. tcp_do_segment() always consumes the mbuf chain and unlocks pcbinfo.
- */
- bool want_close; /* out-parameter: tcp_do_segment() assigns it at entry */
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked, want_close);
- INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- // if tcp_close() indeed closes, it also unlocks
- if (!want_close || tcp_close(tp)) {
- INP_UNLOCK(inp);
- }
- return;
- dropwithreset: /* release tcbinfo if held, answer with a RST, unlock inp, then run the common cleanup */
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
- ti_locked = TI_UNLOCKED;
- }
- #ifdef INVARIANTS
- else {
- KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset "
- "ti_locked: %d", __func__, ti_locked));
- INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- }
- #endif
- if (inp != NULL) {
- tcp_dropwithreset(m, th, tp, tlen, rstreason);
- INP_UNLOCK(inp);
- } else
- tcp_dropwithreset(m, th, NULL, tlen, rstreason);
- m = NULL; /* mbuf chain got consumed. */
- goto drop;
- dropunlock: /* release tcbinfo if held and unlock inp; no reply is sent */
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
- ti_locked = TI_UNLOCKED;
- }
- #ifdef INVARIANTS
- else {
- KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock "
- "ti_locked: %d", __func__, ti_locked));
- INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- }
- #endif
- if (inp != NULL)
- INP_UNLOCK(inp);
- drop: /* common exit: free the log string and any mbuf still owned here */
- INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- if (s != NULL)
- free(s);
- if (m != NULL)
- m_freem(m);
- }
- static void
- tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
- struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
- int ti_locked, bool& want_close)
- {
- int thflags, acked, ourfinisacked, needoutput = 0;
- int rstreason, todrop, win;
- u_long tiwin;
- struct tcpopt to;
- auto inp = tp->t_inpcb;
- want_close = false;
- #ifdef TCPDEBUG
- /*
- * The size of tcp_saveipgen must be the size of the max ip header,
- * now IPv6.
- */
- u_char tcp_saveipgen[IP6_HDR_LEN];
- struct tcphdr tcp_savetcp;
- short ostate = 0;
- #endif
- thflags = th->th_flags;
- tp->sackhint.last_sack_ack = tcp_seq(0);
- /*
- * If this is either a state-changing packet or current state isn't
- * established, we require a write lock on tcbinfo. Otherwise, we
- * allow either a read lock or a write lock, as we may have acquired
- * a write lock due to a race.
- *
- * Require a global write lock for SYN/FIN/RST segments or
- * non-established connections; otherwise accept either a read or
- * write lock, as we may have conservatively acquired a write lock in
- * certain cases in tcp_input() (is this still true?). Currently we
- * will never enter with no lock, so we try to drop it quickly in the
- * common pure ack/pure data cases.
- *
- * net channels process packets without the lock, so try to acquire it.
- * if we fail, drop the packet. FIXME: invert the lock order so we don't
- * have to drop packets.
- */
- if (tp->get_state() != TCPS_ESTABLISHED && ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
- ti_locked = TI_WLOCKED;
- } else {
- goto drop;
- }
- }
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
- tp->get_state() != TCPS_ESTABLISHED) {
- KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
- "SYN/FIN/RST/!EST", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- } else {
- #ifdef INVARIANTS
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- else {
- KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
- "ti_locked: %d", __func__, ti_locked));
- INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- }
- #endif
- }
- INP_LOCK_ASSERT(tp->t_inpcb);
- KASSERT(tp->get_state() > TCPS_LISTEN, ("%s: TCPS_LISTEN",
- __func__));
- KASSERT(tp->get_state() != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
- __func__));
- /*
- * Segment received on connection.
- * Reset idle time and keep-alive timer.
- * XXX: This should be done after segment
- * validation to ignore broken/spoofed segs.
- */
- tp->t_rcvtime = bsd_ticks;
- if (TCPS_HAVEESTABLISHED(tp->get_state()))
- tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
- /*
- * Unscale the window into a 32-bit value.
- * For the SYN_SENT state the scale is zero.
- */
- tiwin = th->th_win << tp->snd_scale;
- /*
- * TCP ECN processing.
- */
- if (tp->t_flags & TF_ECN_PERMIT) {
- if (thflags & TH_CWR)
- tp->t_flags &= ~TF_ECN_SND_ECE;
- switch (iptos & IPTOS_ECN_MASK) {
- case IPTOS_ECN_CE:
- tp->t_flags |= TF_ECN_SND_ECE;
- TCPSTAT_INC(tcps_ecn_ce);
- break;
- case IPTOS_ECN_ECT0:
- TCPSTAT_INC(tcps_ecn_ect0);
- break;
- case IPTOS_ECN_ECT1:
- TCPSTAT_INC(tcps_ecn_ect1);
- break;
- }
- /* Congestion experienced. */
- if (thflags & TH_ECE) {
- cc_cong_signal(tp, th, CC_ECN);
- }
- }
- /*
- * Parse options on any incoming segment.
- */
- tcp_dooptions(&to, (u_char *)(th + 1),
- (th->th_off << 2) - sizeof(struct tcphdr),
- (thflags & TH_SYN) ? TO_SYN : 0);
- /*
- * If echoed timestamp is later than the current time,
- * fall back to non RFC1323 RTT calculation. Normalize
- * timestamp if syncookies were used when this connection
- * was established.
- */
- if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
- to.to_tsecr -= tp->ts_offset;
- if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
- to.to_tsecr = 0;
- }
- /*
- * Process options only when we get SYN/ACK back. The SYN case
- * for incoming connections is handled in tcp_syncache.
- * According to RFC1323 the window field in a SYN (i.e., a <SYN>
- * or <SYN,ACK>) segment itself is never scaled.
- * XXX this is traditional behavior, may need to be cleaned up.
- */
- if (tp->get_state() == TCPS_SYN_SENT && (thflags & TH_SYN)) {
- if ((to.to_flags & TOF_SCALE) &&
- (tp->t_flags & TF_REQ_SCALE)) {
- tp->t_flags |= TF_RCVD_SCALE;
- tp->snd_scale = to.to_wscale;
- }
- /*
- * Initial send window. It will be updated with
- * the next incoming segment to the scaled value.
- */
- tp->snd_wnd = th->th_win;
- if (to.to_flags & TOF_TS) {
- tp->t_flags |= TF_RCVD_TSTMP;
- tp->ts_recent = to.to_tsval;
- tp->ts_recent_age = tcp_ts_getticks();
- }
- if (to.to_flags & TOF_MSS)
- tcp_mss(tp, to.to_mss);
- if ((tp->t_flags & TF_SACK_PERMIT) &&
- (to.to_flags & TOF_SACKPERM) == 0)
- tp->t_flags &= ~TF_SACK_PERMIT;
- }
- /*
- * Header prediction: check for the two common cases
- * of a uni-directional data xfer. If the packet has
- * no control flags, is in-sequence, the window didn't
- * change and we're not retransmitting, it's a
- * candidate. If the length is zero and the ack moved
- * forward, we're the sender side of the xfer. Just
- * free the data acked & wake any higher level process
- * that was blocked waiting for space. If the length
- * is non-zero and the ack didn't move, we're the
- * receiver side. If we're getting packets in-order
- * (the reassembly queue is empty), add the data to
- * the socket buffer and note that we need a delayed ack.
- * Make sure that the hidden state-flags are also off.
- * Since we check for TCPS_ESTABLISHED first, it can only
- * be TH_NEEDSYN.
- */
- if (tp->get_state() == TCPS_ESTABLISHED &&
- th->th_seq == tp->rcv_nxt &&
- (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
- tp->snd_nxt == tp->snd_max &&
- tiwin && tiwin == tp->snd_wnd &&
- ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
- LIST_EMPTY(&tp->t_segq) &&
- ((to.to_flags & TOF_TS) == 0 ||
- TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
- /*
- * If last ACK falls within this segment's sequence numbers,
- * record the timestamp.
- * NOTE that the test is modified according to the latest
- * proposal of the tcplw@cray.com list (Braden 1993/04/26).
- */
- if ((to.to_flags & TOF_TS) != 0 &&
- th->th_seq <= tp->last_ack_sent) {
- tp->ts_recent_age = tcp_ts_getticks();
- tp->ts_recent = to.to_tsval;
- }
- if (tlen == 0) {
- if (th->th_ack > tp->snd_una &&
- th->th_ack <= tp->snd_max &&
- !IN_RECOVERY(tp->t_flags) &&
- (to.to_flags & TOF_SACK) == 0 &&
- TAILQ_EMPTY(&tp->snd_holes)) {
- /*
- * This is a pure ack for outstanding data.
- */
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
- ti_locked = TI_UNLOCKED;
- TCPSTAT_INC(tcps_predack);
- /*
- * "bad retransmit" recovery.
- */
- if (tp->t_rxtshift == 1 &&
- tp->t_flags & TF_PREVVALID &&
- (int)(bsd_ticks - tp->t_badrxtwin) < 0) {
- cc_cong_signal(tp, th, CC_RTO_ERR);
- }
- /*
- * Recalculate the transmit timer / rtt.
- *
- * Some boxes send broken timestamp replies
- * during the SYN+ACK phase, ignore
- * timestamps of 0 or we could calculate a
- * huge RTT and blow up the retransmit timer.
- */
- if ((to.to_flags & TOF_TS) != 0 &&
- to.to_tsecr) {
- u_int t;
- t = tcp_ts_getticks() - to.to_tsecr;
- if (!tp->t_rttlow || tp->t_rttlow > t)
- tp->t_rttlow = t;
- tcp_xmit_timer(tp,
- TCP_TS_TO_TICKS(t) + 1);
- } else if (tp->t_rtttime &&
- th->th_ack > tp->t_rtseq) {
- if (!tp->t_rttlow ||
- tp->t_rttlow > bsd_ticks - tp->t_rtttime)
- tp->t_rttlow = bsd_ticks - tp->t_rtttime;
- tcp_xmit_timer(tp,
- bsd_ticks - tp->t_rtttime);
- }
- acked = BYTES_THIS_ACK(tp, th);
- TCPSTAT_INC(tcps_rcvackpack);
- TCPSTAT_ADD(tcps_rcvackbyte, acked);
- sbdrop_locked(so, &so->so_snd, acked);
- if (tp->snd_una > tp->snd_recover &&
- th->th_ack <= tp->snd_recover)
- tp->snd_recover = th->th_ack - 1;
-
- /*
- * Let the congestion control algorithm update
- * congestion control related information. This
- * typically means increasing the congestion
- * window.
- */
- cc_ack_received(tp, th, CC_ACK);
- tp->snd_una = th->th_ack;
- /*
- * Pull snd_wl2 up to prevent seq wrap relative
- * to th_ack.
- */
- tp->snd_wl2 = th->th_ack;
- tp->t_dupacks = 0;
- m_freem(m);
- ND6_HINT(tp); /* Some progress has been made. */
- /*
- * If all outstanding data are acked, stop
- * retransmit timer, otherwise restart timer
- * using current (possibly backed-off) value.
- * If process is waiting for space,
- * wakeup/selwakeup/signal. If data
- * are ready to send, let tcp_output
- * decide between more output or persist.
- */
- #ifdef TCPDEBUG
- if (so->so_options & SO_DEBUG)
- tcp_trace(TA_INPUT, ostate, tp,
- (void *)tcp_saveipgen,
- &tcp_savetcp, 0);
- #endif
- if (tp->snd_una == tp->snd_max)
- tcp_timer_activate(tp, TT_REXMT, 0);
- else if (!tcp_timer_active(tp, TT_PERSIST))
- tcp_timer_activate(tp, TT_REXMT,
- tp->t_rxtcur);
- sowwakeup_locked(so);
- if (so->so_snd.sb_cc)
- (void) tcp_output(tp);
- goto check_delack;
- }
- } else if (th->th_ack == tp->snd_una &&
- tlen <= sbspace(&so->so_rcv)) {
- int newsize = 0; /* automatic sockbuf scaling */
- /*
- * This is a pure, in-sequence data packet with
- * nothing on the reassembly queue and we have enough
- * buffer space to take it.
- */
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
- ti_locked = TI_UNLOCKED;
- /* Clean receiver SACK report if present */
- if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
- tcp_clean_sackreport(tp);
- TCPSTAT_INC(tcps_preddat);
- tp->rcv_nxt += tlen;
- /*
- * Pull snd_wl1 up to prevent seq wrap relative to
- * th_seq.
- */
- tp->snd_wl1 = th->th_seq;
- /*
- * Pull rcv_up up to prevent seq wrap relative to
- * rcv_nxt.
- */
- tp->rcv_up = tp->rcv_nxt;
- TCPSTAT_INC(tcps_rcvpack);
- TCPSTAT_ADD(tcps_rcvbyte, tlen);
- ND6_HINT(tp); /* Some progress has been made */
- #ifdef TCPDEBUG
- if (so->so_options & SO_DEBUG)
- tcp_trace(TA_INPUT, ostate, tp,
- (void *)tcp_saveipgen, &tcp_savetcp, 0);
- #endif
- /*
- * Automatic sizing of receive socket buffer. Often the send
- * buffer size is not optimally adjusted to the actual network
- * conditions at hand (delay bandwidth product). Setting the
- * buffer size too small limits throughput on links with high
- * bandwidth and high delay (eg. trans-continental/oceanic links).
- *
- * On the receive side the socket buffer memory is only rarely
- * used to any significant extent. This allows us to be much
- * more aggressive in scaling the receive socket buffer. For
- * the case that the buffer space is actually used to a large
- * extent and we run out of kernel memory we can simply drop
- * the new segments; TCP on the sender will just retransmit it
- * later. Setting the buffer size too big may only consume too
- * much kernel memory if the application doesn't read() from
- * the socket or packet loss or reordering makes use of the
- * reassembly queue.
- *
- * The criteria to step up the receive buffer one notch are:
- * 1. the number of bytes received during the time it takes
- * one timestamp to be reflected back to us (the RTT);
- * 2. received bytes per RTT is within seven eighths of the
- * current socket buffer size;
- * 3. receive buffer size has not hit maximal automatic size;
- *
- * This algorithm does one step per RTT at most and only if
- * we receive a bulk stream w/o packet losses or reorderings.
- * Shrinking the buffer during idle times is not necessary as
- * it doesn't consume any memory when idle.
- *
- * TODO: Only step up if the application is actually serving
- * the buffer to better manage the socket buffer resources.
- */
- if (V_tcp_do_autorcvbuf &&
- to.to_tsecr &&
- (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
- if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
- to.to_tsecr - tp->rfbuf_ts < hz) {
- if (tp->rfbuf_cnt >
- (so->so_rcv.sb_hiwat / 8 * 7) &&
- so->so_rcv.sb_hiwat <
- V_tcp_autorcvbuf_max) {
- newsize =
- bsd_min(so->so_rcv.sb_hiwat +
- V_tcp_autorcvbuf_inc,
- V_tcp_autorcvbuf_max);
- }
- /* Start over with next RTT. */
- tp->rfbuf_ts = 0;
- tp->rfbuf_cnt = 0;
- } else
- tp->rfbuf_cnt += tlen; /* add up */
- }
- /* Add data to socket buffer. */
- SOCK_LOCK_ASSERT(so);
- if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
- m_freem(m);
- } else {
- /*
- * Set new socket buffer size.
- * Give up when limit is reached.
- */
- if (newsize)
- if (!sbreserve_locked(&so->so_rcv,
- newsize, so, NULL))
- so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
- m_adj(m, drop_hdrlen); /* delayed header drop */
- sbappendstream_locked(so, &so->so_rcv, m);
- }
- sorwakeup_locked(so);
- if (DELAY_ACK(tp)) {
- tp->t_flags |= TF_DELACK;
- } else {
- tp->t_flags |= TF_ACKNOW;
- tcp_output(tp);
- }
- goto check_delack;
- }
- }
- /*
- * Calculate amount of space in receive window,
- * and then do TCP input processing.
- * Receive window is amount of space in rcv queue,
- * but not less than advertised window.
- */
- win = sbspace(&so->so_rcv);
- if (win < 0)
- win = 0;
- tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
- /* Reset receive buffer auto scaling when not in bulk receive mode. */
- tp->rfbuf_ts = 0;
- tp->rfbuf_cnt = 0;
- switch (tp->get_state()) {
- /*
- * If the state is SYN_RECEIVED:
- * if seg contains an ACK, but not for our SYN/ACK, send a RST.
- */
- case TCPS_SYN_RECEIVED:
- if ((thflags & TH_ACK) &&
- (th->th_ack <= tp->snd_una ||
- th->th_ack > tp->snd_max)) {
- rstreason = BANDLIM_RST_OPENPORT;
- goto dropwithreset;
- }
- break;
- /*
- * If the state is SYN_SENT:
- * if seg contains an ACK, but not for our SYN, drop the input.
- * if seg contains a RST, then drop the connection.
- * if seg does not contain SYN, then drop it.
- * Otherwise this is an acceptable SYN segment
- * initialize tp->rcv_nxt and tp->irs
- * if seg contains ack then advance tp->snd_una
- * if seg contains an ECE and ECN support is enabled, the stream
- * is ECN capable.
- * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
- * arrange for segment to be acked (eventually)
- * continue processing rest of data/controls, beginning with URG
- */
- case TCPS_SYN_SENT:
- if ((thflags & TH_ACK) &&
- (th->th_ack <= tp->iss ||
- th->th_ack > tp->snd_max)) {
- rstreason = BANDLIM_UNLIMITED;
- goto dropwithreset;
- }
- if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
- tcp_drop_noclose(tp, ECONNREFUSED);
- want_close = true;
- }
- if (thflags & TH_RST)
- goto drop;
- if (!(thflags & TH_SYN))
- goto drop;
- tp->irs = th->th_seq;
- tcp_rcvseqinit(tp);
- if (thflags & TH_ACK) {
- TCPSTAT_INC(tcps_connects);
- soisconnected(so);
- /* Do window scaling on this connection? */
- if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
- (TF_RCVD_SCALE|TF_REQ_SCALE)) {
- tp->rcv_scale = tp->request_r_scale;
- }
- tp->rcv_adv += imin(tp->rcv_wnd,
- TCP_MAXWIN << tp->rcv_scale);
- tp->snd_una++; /* SYN is acked */
- /*
- * If there's data, delay ACK; if there's also a FIN
- * ACKNOW will be turned on later.
- */
- if (DELAY_ACK(tp) && tlen != 0)
- tcp_timer_activate(tp, TT_DELACK,
- tcp_delacktime);
- else
- tp->t_flags |= TF_ACKNOW;
- if ((thflags & TH_ECE) && V_tcp_do_ecn) {
- tp->t_flags |= TF_ECN_PERMIT;
- TCPSTAT_INC(tcps_ecn_shs);
- }
-
- /*
- * Received <SYN,ACK> in SYN_SENT[*] state.
- * Transitions:
- * SYN_SENT --> ESTABLISHED
- * SYN_SENT* --> FIN_WAIT_1
- */
- tp->t_starttime = bsd_ticks;
- if (tp->t_flags & TF_NEEDFIN) {
- tp->set_state(TCPS_FIN_WAIT_1);
- tp->t_flags &= ~TF_NEEDFIN;
- thflags &= ~TH_SYN;
- } else {
- tp->set_state(TCPS_ESTABLISHED);
- tcp_setup_net_channel(tp, m->M_dat.MH.MH_pkthdr.rcvif);
- cc_conn_init(tp);
- tcp_timer_activate(tp, TT_KEEP,
- TP_KEEPIDLE(tp));
- }
- } else {
- /*
- * Received initial SYN in SYN-SENT[*] state =>
- * simultaneous open. If segment contains CC option
- * and there is a cached CC, apply TAO test.
- * If it succeeds, connection is half-synchronized.
- * Otherwise, do 3-way handshake:
- * SYN-SENT -> SYN-RECEIVED
- * SYN-SENT* -> SYN-RECEIVED*
- * If there was no CC option, clear cached CC value.
- */
- tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
- tcp_timer_activate(tp, TT_REXMT, 0);
- tp->set_state(TCPS_SYN_RECEIVED);
- }
- KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
- "ti_locked %d", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- INP_LOCK_ASSERT(tp->t_inpcb);
- /*
- * Advance th->th_seq to correspond to first data byte.
- * If data, trim to stay within window,
- * dropping FIN if necessary.
- */
- th->th_seq++;
- if (tlen > tp->rcv_wnd) {
- todrop = tlen - tp->rcv_wnd;
- m_adj(m, -todrop);
- tlen = tp->rcv_wnd;
- thflags &= ~TH_FIN;
- TCPSTAT_INC(tcps_rcvpackafterwin);
- TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
- }
- tp->snd_wl1 = th->th_seq - 1;
- tp->rcv_up = th->th_seq;
- /*
- * Client side of transaction: already sent SYN and data.
- * If the remote host used T/TCP to validate the SYN,
- * our data will be ACK'd; if so, enter normal data segment
- * processing in the middle of step 5, ack processing.
- * Otherwise, goto step 6.
- */
- if (thflags & TH_ACK)
- goto process_ACK;
- goto step6;
- /*
- * If the state is LAST_ACK or CLOSING or TIME_WAIT:
- * do normal processing.
- *
- * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
- */
- case TCPS_LAST_ACK:
- case TCPS_CLOSING:
- break; /* continue normal processing */
- }
- /*
- * States other than LISTEN or SYN_SENT.
- * First check the RST flag and sequence number since reset segments
- * are exempt from the timestamp and connection count tests. This
- * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
- * below which allowed reset segments in half the sequence space
- * to fall though and be processed (which gives forged reset
- * segments with a random sequence number a 50 percent chance of
- * killing a connection).
- * Then check timestamp, if present.
- * Then check the connection count, if present.
- * Then check that at least some bytes of segment are within
- * receive window. If segment begins before rcv_nxt,
- * drop leading data (and SYN); if nothing left, just ack.
- *
- *
- * If the RST bit is set, check the sequence number to see
- * if this is a valid reset segment.
- * RFC 793 page 37:
- * In all states except SYN-SENT, all reset (RST) segments
- * are validated by checking their SEQ-fields. A reset is
- * valid if its sequence number is in the window.
- * Note: this does not …
Large files are truncated, but you can click here to view the full file