1292309Srrs/*- 2292309Srrs * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 3292309Srrs * The Regents of the University of California. All rights reserved. 4292309Srrs * Copyright (c) 2007-2008,2010 5292309Srrs * Swinburne University of Technology, Melbourne, Australia. 6292309Srrs * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> 7292309Srrs * Copyright (c) 2010 The FreeBSD Foundation 8292309Srrs * Copyright (c) 2010-2011 Juniper Networks, Inc. 9292309Srrs * Copyright (c) 2015 Netflix Inc. 10292309Srrs * All rights reserved. 11292309Srrs * 12292309Srrs * Portions of this software were developed at the Centre for Advanced Internet 13292309Srrs * Architectures, Swinburne University of Technology, by Lawrence Stewart, 14292309Srrs * James Healy and David Hayes, made possible in part by a grant from the Cisco 15292309Srrs * University Research Program Fund at Community Foundation Silicon Valley. 16292309Srrs * 17292309Srrs * Portions of this software were developed at the Centre for Advanced 18292309Srrs * Internet Architectures, Swinburne University of Technology, Melbourne, 19292309Srrs * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 20292309Srrs * 21292309Srrs * Portions of this software were developed by Robert N. M. Watson under 22292309Srrs * contract to Juniper Networks, Inc. 23292309Srrs * 24292309Srrs * Portions of this software were developed by Randall R. Stewart while 25292309Srrs * working for Netflix Inc. 26292309Srrs * 27292309Srrs * Redistribution and use in source and binary forms, with or without 28292309Srrs * modification, are permitted provided that the following conditions 29292309Srrs * are met: 30292309Srrs * 1. Redistributions of source code must retain the above copyright 31292309Srrs * notice, this list of conditions and the following disclaimer. 32292309Srrs * 2. 
Redistributions in binary form must reproduce the above copyright 33292309Srrs * notice, this list of conditions and the following disclaimer in the 34292309Srrs * documentation and/or other materials provided with the distribution. 35292309Srrs * 4. Neither the name of the University nor the names of its contributors 36292309Srrs * may be used to endorse or promote products derived from this software 37292309Srrs * without specific prior written permission. 38292309Srrs * 39292309Srrs * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 40292309Srrs * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 41292309Srrs * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 42292309Srrs * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 43292309Srrs * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 44292309Srrs * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 45292309Srrs * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 46292309Srrs * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 47292309Srrs * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 48292309Srrs * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 49292309Srrs * SUCH DAMAGE. 
50292309Srrs * 51292309Srrs * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 52292309Srrs */ 53292309Srrs 54292309Srrs#include <sys/cdefs.h> 55292309Srrs__FBSDID("$FreeBSD: stable/11/sys/netinet/tcp_stacks/fastpath.c 344511 2019-02-25 10:38:37Z tuexen $"); 56292309Srrs 57292309Srrs#include "opt_inet.h" 58292309Srrs#include "opt_inet6.h" 59292309Srrs#include "opt_tcpdebug.h" 60292309Srrs 61292309Srrs#include <sys/param.h> 62292309Srrs#include <sys/module.h> 63292309Srrs#include <sys/kernel.h> 64292309Srrs#include <sys/hhook.h> 65292309Srrs#include <sys/malloc.h> 66292309Srrs#include <sys/mbuf.h> 67292309Srrs#include <sys/proc.h> /* for proc0 declaration */ 68292309Srrs#include <sys/protosw.h> 69292309Srrs#include <sys/sdt.h> 70292309Srrs#include <sys/signalvar.h> 71292309Srrs#include <sys/socket.h> 72292309Srrs#include <sys/socketvar.h> 73292309Srrs#include <sys/sysctl.h> 74292309Srrs#include <sys/syslog.h> 75292309Srrs#include <sys/systm.h> 76292309Srrs 77292309Srrs#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 78292309Srrs 79292309Srrs#include <vm/uma.h> 80292309Srrs 81292309Srrs#include <net/route.h> 82292309Srrs#include <net/vnet.h> 83292309Srrs 84292309Srrs#define TCPSTATES /* for logging */ 85292309Srrs 86292309Srrs#include <netinet/in.h> 87292309Srrs#include <netinet/in_kdtrace.h> 88292309Srrs#include <netinet/in_pcb.h> 89292309Srrs#include <netinet/in_systm.h> 90292309Srrs#include <netinet/ip.h> 91292309Srrs#include <netinet/ip_icmp.h> /* required for icmp_var.h */ 92292309Srrs#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 93292309Srrs#include <netinet/ip_var.h> 94292309Srrs#include <netinet/ip_options.h> 95292309Srrs#include <netinet/ip6.h> 96292309Srrs#include <netinet/icmp6.h> 97292309Srrs#include <netinet6/in6_pcb.h> 98292309Srrs#include <netinet6/ip6_var.h> 99294535Sglebius#include <netinet/tcp.h> 100292309Srrs#include <netinet/tcp_fsm.h> 101292309Srrs#include <netinet/tcp_seq.h> 102292309Srrs#include <netinet/tcp_timer.h> 
103292309Srrs#include <netinet/tcp_var.h> 104292309Srrs#include <netinet6/tcp6_var.h> 105292309Srrs#include <netinet/tcpip.h> 106292309Srrs#include <netinet/tcp_syncache.h> 107294931Sglebius#include <netinet/cc/cc.h> 108292309Srrs#ifdef TCPDEBUG 109292309Srrs#include <netinet/tcp_debug.h> 110292309Srrs#endif /* TCPDEBUG */ 111292309Srrs#ifdef TCP_OFFLOAD 112292309Srrs#include <netinet/tcp_offload.h> 113292309Srrs#endif 114292309Srrs 115292309Srrs#include <machine/in_cksum.h> 116292309Srrs 117292309Srrs#include <security/mac/mac_framework.h> 118292309Srrs 119292309SrrsVNET_DECLARE(int, tcp_autorcvbuf_inc); 120292309Srrs#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 121292309SrrsVNET_DECLARE(int, tcp_autorcvbuf_max); 122292309Srrs#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 123292309SrrsVNET_DECLARE(int, tcp_do_rfc3042); 124292309Srrs#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) 125292309SrrsVNET_DECLARE(int, tcp_do_autorcvbuf); 126292309Srrs#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 127292309SrrsVNET_DECLARE(int, tcp_insecure_rst); 128292309Srrs#define V_tcp_insecure_rst VNET(tcp_insecure_rst) 129292309SrrsVNET_DECLARE(int, tcp_insecure_syn); 130292309Srrs#define V_tcp_insecure_syn VNET(tcp_insecure_syn) 131319399StuexenVNET_DECLARE(int, drop_synfin); 132319399Stuexen#define V_drop_synfin VNET(drop_synfin) 133292309Srrs 134292309Srrsstatic void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *, 135292309Srrs struct socket *, struct tcpcb *, int, int, uint8_t, 136292309Srrs int); 137292309Srrs 138292309Srrsstatic void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *, 139292309Srrs struct socket *, struct tcpcb *, int, int, uint8_t, 140292309Srrs int); 141292309Srrs 142292309Srrs/* 143292309Srrs * Indicate whether this ack should be delayed. We can delay the ack if 144292309Srrs * following conditions are met: 145292309Srrs * - There is no delayed ack timer in progress. 146292309Srrs * - Our last ack wasn't a 0-sized window. 
We never want to delay
 * the ack that opens up a 0-sized window.
 * - LRO wasn't used for this segment. We make sure by checking that the
 * segment size is not larger than the MSS.
 */
#define DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active(tp, TT_DELACK) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (tlen <= tp->t_maxseg) &&					\
	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))

/*
 * So how is this faster than the normal fast ack?
 * It basically allows us to also stay in the fastpath
 * when a window-update ack also arrives. In testing
 * we saw only 25-30% of connections doing fastpath
 * due to the fact that along with moving forward
 * in sequence the window was also updated.
 *
 * NOTE(review): this function consumes the mbuf 'm' on every path
 * (m_freem() in both the pure-ack and window-update-only branches).
 * On return it has dropped the tcbinfo read lock (if 'ti_locked'
 * said it was held on entry) and has released the inpcb write lock.
 * Presumably it is only invoked for in-sequence pure ACKs from the
 * header-prediction check in the callers — confirm against the
 * tcp_do_segment_* entry points (not visible in this chunk).
 */
static void
tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
    int ti_locked, u_long tiwin)
{
	int acked;
	int winup_only=0;	/* set when this ack only grows the send window */
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif
	/*
	 * The following if statement will be true if
	 * we are doing the win_up_in_fp <and>
	 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
	 * - No more new data, but we have an ack for new data
	 * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
	 * - No more new data, the same ack point but the window grew
	 * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd)
	 */
	if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
			winup_only = 1;
			TCPSTAT_INC(tcps_rcvwinupd);
		}
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
	}
	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record the timestamp.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	/*
	 * This is a pure ack for outstanding data.
	 *
	 * The global tcbinfo read lock is no longer needed from here on;
	 * the rest of this function runs under the inpcb write lock only
	 * (asserted below before the delack handling).
	 */
	if (ti_locked == TI_RLOCKED) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
	}
	ti_locked = TI_UNLOCKED;

	TCPSTAT_INC(tcps_predack);

	/*
	 * "bad retransmit" recovery: the retransmit fired once but this
	 * ack arrived inside the bad-retransmit window, so undo the
	 * congestion response via CC_RTO_ERR.
	 */
	if (tp->t_rxtshift == 1 &&
	    tp->t_flags & TF_PREVVALID &&
	    (int)(ticks - tp->t_badrxtwin) < 0) {
		cc_cong_signal(tp, th, CC_RTO_ERR);
	}

	/*
	 * Recalculate the transmit timer / rtt.
	 *
	 * Some boxes send broken timestamp replies
	 * during the SYN+ACK phase, ignore
	 * timestamps of 0 or we could calculate a
	 * huge RTT and blow up the retransmit timer.
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    to->to_tsecr) {
		u_int t;

		t = tcp_ts_getticks() - to->to_tsecr;
		if (!tp->t_rttlow || tp->t_rttlow > t)
			tp->t_rttlow = t;
		tcp_xmit_timer(tp,
		    TCP_TS_TO_TICKS(t) + 1);
	} else if (tp->t_rtttime &&
	    SEQ_GT(th->th_ack, tp->t_rtseq)) {
		/* No usable timestamp echo: fall back to the timed segment. */
		if (!tp->t_rttlow ||
		    tp->t_rttlow > ticks - tp->t_rtttime)
			tp->t_rttlow = ticks - tp->t_rtttime;
		tcp_xmit_timer(tp,
		    ticks - tp->t_rtttime);
	}
	if (winup_only == 0) {
		acked = BYTES_THIS_ACK(tp, th);

		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
		hhook_run_tcp_est_in(tp, th, to);

		TCPSTAT_ADD(tcps_rcvackbyte, acked);
		sbdrop(&so->so_snd, acked);
		if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
		    SEQ_LEQ(th->th_ack, tp->snd_recover))
			tp->snd_recover = th->th_ack - 1;

		/*
		 * Let the congestion control algorithm update
		 * congestion control related information. This
		 * typically means increasing the congestion
		 * window.
		 */
		cc_ack_received(tp, th, CC_ACK);

		tp->snd_una = th->th_ack;
		/*
		 * Pull snd_wl2 up to prevent seq wrap relative
		 * to th_ack.
		 */
		tp->snd_wl2 = th->th_ack;
		tp->t_dupacks = 0;

		/*
		 * If all outstanding data are acked, stop
		 * retransmit timer, otherwise restart timer
		 * using current (possibly backed-off) value.
		 * If process is waiting for space,
		 * wakeup/selwakeup/signal. If data
		 * are ready to send, let tcp_output
		 * decide between more output or persist.
		 */
#ifdef TCPDEBUG
		if (so->so_options & SO_DEBUG)
			tcp_trace(TA_INPUT, ostate, tp,
			    (void *)tcp_saveipgen,
			    &tcp_savetcp, 0);
#endif
		TCP_PROBE3(debug__input, tp, th, m);
		m_freem(m);
		if (tp->snd_una == tp->snd_max)
			tcp_timer_activate(tp, TT_REXMT, 0);
		else if (!tcp_timer_active(tp, TT_PERSIST))
			tcp_timer_activate(tp, TT_REXMT,
			    tp->t_rxtcur);
	} else {
		/*
		 * Window update only, just free the mbufs and
		 * send out whatever we can.
		 */
		m_freem(m);
	}
	sowwakeup(so);
	if (sbavail(&so->so_snd))
		(void) tcp_output(tp);
	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
	    __func__, ti_locked));
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* Arm the delayed-ack timer if something above requested it. */
	if (tp->t_flags & TF_DELACK) {
		tp->t_flags &= ~TF_DELACK;
		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
	}
	INP_WUNLOCK(tp->t_inpcb);
}

/*
 * Here nothing is really faster, its just that we
 * have broken out the fast-data path also just like
 * the fast-ack.
 *
 * NOTE(review): handles a pure, in-sequence data segment. The mbuf is
 * consumed: it is either freed (receive side already shut down) or
 * appended to the socket receive buffer. Drops the tcbinfo read lock
 * (if held per 'ti_locked') and releases the inpcb write lock before
 * returning. 'tiwin' is unused here; it appears to be kept only for
 * signature parity with tcp_do_fastack — confirm with the callers.
 */
static void
tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
    int ti_locked, u_long tiwin)
{
	int newsize = 0;	/* automatic sockbuf scaling */
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif
	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record the timestamp.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}

	/*
	 * This is a pure, in-sequence data packet with
	 * nothing on the reassembly queue and we have enough
	 * buffer space to take it.
	 */
	if (ti_locked == TI_RLOCKED) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
	}
	ti_locked = TI_UNLOCKED;

	/* Clean receiver SACK report if present */
	if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
		tcp_clean_sackreport(tp);
	TCPSTAT_INC(tcps_preddat);
	tp->rcv_nxt += tlen;
	/*
	 * Pull snd_wl1 up to prevent seq wrap relative to
	 * th_seq.
	 */
	tp->snd_wl1 = th->th_seq;
	/*
	 * Pull rcv_up up to prevent seq wrap relative to
	 * rcv_nxt.
	 */
	tp->rcv_up = tp->rcv_nxt;
	TCPSTAT_ADD(tcps_rcvbyte, tlen);
#ifdef TCPDEBUG
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_INPUT, ostate, tp,
		    (void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
	TCP_PROBE3(debug__input, tp, th, m);

	/* Ask for a grown receive buffer size (0 = leave as-is). */
	newsize = tcp_autorcvbuf(m, th, so, tp, tlen);

	/* Add data to socket buffer. */
	SOCKBUF_LOCK(&so->so_rcv);
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		m_freem(m);
	} else {
		/*
		 * Set new socket buffer size.
		 * Give up when limit is reached.
		 */
		if (newsize)
			if (!sbreserve_locked(&so->so_rcv,
			    newsize, so, NULL))
				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
		m_adj(m, drop_hdrlen);	/* delayed header drop */
		sbappendstream_locked(&so->so_rcv, m, 0);
	}
	/* NB: sorwakeup_locked() does an implicit unlock. */
	sorwakeup_locked(so);
	if (DELAY_ACK(tp, tlen)) {
		tp->t_flags |= TF_DELACK;
	} else {
		tp->t_flags |= TF_ACKNOW;
		tcp_output(tp);
	}
	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
	    __func__, ti_locked));
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* Arm the delayed-ack timer if requested above. */
	if (tp->t_flags & TF_DELACK) {
		tp->t_flags &= ~TF_DELACK;
		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
	}
	INP_WUNLOCK(tp->t_inpcb);
}

/*
 * The slow-path is the clone of the long long part
 * of tcp_do_segment past all the fast-path stuff. We
 * use it here by two different callers, the fast/slow and
 * the fastack only.
 */
static void
tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
    int ti_locked, u_long tiwin, int thflags)
{
	int acked, ourfinisacked, needoutput = 0;
	int rstreason, todrop, win;
	char *s;
	struct in_conninfo *inc;
	struct mbuf *mfree = NULL;
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
454292309Srrs */ 455292309Srrs u_char tcp_saveipgen[IP6_HDR_LEN]; 456292309Srrs struct tcphdr tcp_savetcp; 457292309Srrs short ostate = 0; 458292309Srrs#endif 459292309Srrs /* 460292309Srrs * Calculate amount of space in receive window, 461292309Srrs * and then do TCP input processing. 462292309Srrs * Receive window is amount of space in rcv queue, 463292309Srrs * but not less than advertised window. 464292309Srrs */ 465292309Srrs inc = &tp->t_inpcb->inp_inc; 466292309Srrs win = sbspace(&so->so_rcv); 467292309Srrs if (win < 0) 468292309Srrs win = 0; 469292309Srrs tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 470292309Srrs 471292309Srrs switch (tp->t_state) { 472292309Srrs 473292309Srrs /* 474292309Srrs * If the state is SYN_RECEIVED: 475292309Srrs * if seg contains an ACK, but not for our SYN/ACK, send a RST. 476292309Srrs */ 477292309Srrs case TCPS_SYN_RECEIVED: 478292309Srrs if ((thflags & TH_ACK) && 479292309Srrs (SEQ_LEQ(th->th_ack, tp->snd_una) || 480292309Srrs SEQ_GT(th->th_ack, tp->snd_max))) { 481292309Srrs rstreason = BANDLIM_RST_OPENPORT; 482292309Srrs goto dropwithreset; 483292309Srrs } 484292309Srrs break; 485292309Srrs 486292309Srrs /* 487292309Srrs * If the state is SYN_SENT: 488292309Srrs * if seg contains a RST, then drop the connection. 489292309Srrs * if seg does not contain SYN, then drop it. 490292309Srrs * Otherwise this is an acceptable SYN segment 491292309Srrs * initialize tp->rcv_nxt and tp->irs 492292309Srrs * if seg contains ack then advance tp->snd_una 493292309Srrs * if seg contains an ECE and ECN support is enabled, the stream 494292309Srrs * is ECN capable. 
495292309Srrs * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 496292309Srrs * arrange for segment to be acked (eventually) 497292309Srrs * continue processing rest of data/controls, beginning with URG 498292309Srrs */ 499292309Srrs case TCPS_SYN_SENT: 500292309Srrs if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { 501316208Sgnn TCP_PROBE5(connect__refused, NULL, tp, m, tp, th); 502292309Srrs tp = tcp_drop(tp, ECONNREFUSED); 503292309Srrs } 504292309Srrs if (thflags & TH_RST) 505292309Srrs goto drop; 506292309Srrs if (!(thflags & TH_SYN)) 507292309Srrs goto drop; 508292309Srrs 509292309Srrs tp->irs = th->th_seq; 510292309Srrs tcp_rcvseqinit(tp); 511292309Srrs if (thflags & TH_ACK) { 512292309Srrs TCPSTAT_INC(tcps_connects); 513292309Srrs soisconnected(so); 514292309Srrs#ifdef MAC 515292309Srrs mac_socketpeer_set_from_mbuf(m, so); 516292309Srrs#endif 517292309Srrs /* Do window scaling on this connection? */ 518292309Srrs if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 519292309Srrs (TF_RCVD_SCALE|TF_REQ_SCALE)) { 520292309Srrs tp->rcv_scale = tp->request_r_scale; 521292309Srrs } 522292309Srrs tp->rcv_adv += imin(tp->rcv_wnd, 523292309Srrs TCP_MAXWIN << tp->rcv_scale); 524292309Srrs tp->snd_una++; /* SYN is acked */ 525292309Srrs /* 526292309Srrs * If there's data, delay ACK; if there's also a FIN 527292309Srrs * ACKNOW will be turned on later. 528292309Srrs */ 529292309Srrs if (DELAY_ACK(tp, tlen) && tlen != 0) 530292309Srrs tcp_timer_activate(tp, TT_DELACK, 531292309Srrs tcp_delacktime); 532292309Srrs else 533292309Srrs tp->t_flags |= TF_ACKNOW; 534292309Srrs 535292309Srrs if ((thflags & TH_ECE) && V_tcp_do_ecn) { 536292309Srrs tp->t_flags |= TF_ECN_PERMIT; 537292309Srrs TCPSTAT_INC(tcps_ecn_shs); 538292309Srrs } 539292309Srrs 540292309Srrs /* 541292309Srrs * Received <SYN,ACK> in SYN_SENT[*] state. 
542292309Srrs * Transitions: 543292309Srrs * SYN_SENT --> ESTABLISHED 544292309Srrs * SYN_SENT* --> FIN_WAIT_1 545292309Srrs */ 546292309Srrs tp->t_starttime = ticks; 547292309Srrs if (tp->t_flags & TF_NEEDFIN) { 548292309Srrs tcp_state_change(tp, TCPS_FIN_WAIT_1); 549292309Srrs tp->t_flags &= ~TF_NEEDFIN; 550292309Srrs thflags &= ~TH_SYN; 551292309Srrs } else { 552292309Srrs tcp_state_change(tp, TCPS_ESTABLISHED); 553292309Srrs TCP_PROBE5(connect__established, NULL, tp, 554316208Sgnn m, tp, th); 555292309Srrs cc_conn_init(tp); 556292309Srrs tcp_timer_activate(tp, TT_KEEP, 557292309Srrs TP_KEEPIDLE(tp)); 558292309Srrs } 559292309Srrs } else { 560292309Srrs /* 561292309Srrs * Received initial SYN in SYN-SENT[*] state => 562292309Srrs * simultaneous open. 563292309Srrs * If it succeeds, connection is * half-synchronized. 564292309Srrs * Otherwise, do 3-way handshake: 565292309Srrs * SYN-SENT -> SYN-RECEIVED 566292309Srrs * SYN-SENT* -> SYN-RECEIVED* 567292309Srrs */ 568292309Srrs tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 569292309Srrs tcp_timer_activate(tp, TT_REXMT, 0); 570292309Srrs tcp_state_change(tp, TCPS_SYN_RECEIVED); 571292309Srrs } 572292309Srrs 573292309Srrs KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " 574292309Srrs "ti_locked %d", __func__, ti_locked)); 575292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 576292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 577292309Srrs 578292309Srrs /* 579292309Srrs * Advance th->th_seq to correspond to first data byte. 580292309Srrs * If data, trim to stay within window, 581292309Srrs * dropping FIN if necessary. 
582292309Srrs */ 583292309Srrs th->th_seq++; 584292309Srrs if (tlen > tp->rcv_wnd) { 585292309Srrs todrop = tlen - tp->rcv_wnd; 586292309Srrs m_adj(m, -todrop); 587292309Srrs tlen = tp->rcv_wnd; 588292309Srrs thflags &= ~TH_FIN; 589292309Srrs TCPSTAT_INC(tcps_rcvpackafterwin); 590292309Srrs TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 591292309Srrs } 592292309Srrs tp->snd_wl1 = th->th_seq - 1; 593292309Srrs tp->rcv_up = th->th_seq; 594292309Srrs /* 595292309Srrs * Client side of transaction: already sent SYN and data. 596292309Srrs * If the remote host used T/TCP to validate the SYN, 597292309Srrs * our data will be ACK'd; if so, enter normal data segment 598292309Srrs * processing in the middle of step 5, ack processing. 599292309Srrs * Otherwise, goto step 6. 600292309Srrs */ 601292309Srrs if (thflags & TH_ACK) 602292309Srrs goto process_ACK; 603292309Srrs 604292309Srrs goto step6; 605292309Srrs 606292309Srrs /* 607292309Srrs * If the state is LAST_ACK or CLOSING or TIME_WAIT: 608292309Srrs * do normal processing. 609292309Srrs * 610292309Srrs * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 611292309Srrs */ 612292309Srrs case TCPS_LAST_ACK: 613292309Srrs case TCPS_CLOSING: 614292309Srrs break; /* continue normal processing */ 615292309Srrs } 616292309Srrs 617292309Srrs /* 618292309Srrs * States other than LISTEN or SYN_SENT. 619292309Srrs * First check the RST flag and sequence number since reset segments 620292309Srrs * are exempt from the timestamp and connection count tests. This 621292309Srrs * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 622292309Srrs * below which allowed reset segments in half the sequence space 623292309Srrs * to fall though and be processed (which gives forged reset 624292309Srrs * segments with a random sequence number a 50 percent chance of 625292309Srrs * killing a connection). 626292309Srrs * Then check timestamp, if present. 627292309Srrs * Then check the connection count, if present. 
628292309Srrs * Then check that at least some bytes of segment are within 629292309Srrs * receive window. If segment begins before rcv_nxt, 630292309Srrs * drop leading data (and SYN); if nothing left, just ack. 631292309Srrs */ 632292309Srrs if (thflags & TH_RST) { 633292309Srrs /* 634292309Srrs * RFC5961 Section 3.2 635292309Srrs * 636292309Srrs * - RST drops connection only if SEG.SEQ == RCV.NXT. 637292309Srrs * - If RST is in window, we send challenge ACK. 638292309Srrs * 639292309Srrs * Note: to take into account delayed ACKs, we should 640292309Srrs * test against last_ack_sent instead of rcv_nxt. 641292309Srrs * Note 2: we handle special case of closed window, not 642292309Srrs * covered by the RFC. 643292309Srrs */ 644292309Srrs if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 645292309Srrs SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 646292309Srrs (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 647292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 648292309Srrs KASSERT(ti_locked == TI_RLOCKED, 649292309Srrs ("%s: TH_RST ti_locked %d, th %p tp %p", 650292309Srrs __func__, ti_locked, th, tp)); 651292309Srrs KASSERT(tp->t_state != TCPS_SYN_SENT, 652292309Srrs ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 653292309Srrs __func__, th, tp)); 654292309Srrs 655292309Srrs if (V_tcp_insecure_rst || 656292309Srrs tp->last_ack_sent == th->th_seq) { 657292309Srrs TCPSTAT_INC(tcps_drops); 658292309Srrs /* Drop the connection. 
*/ 659292309Srrs switch (tp->t_state) { 660292309Srrs case TCPS_SYN_RECEIVED: 661292309Srrs so->so_error = ECONNREFUSED; 662292309Srrs goto close; 663292309Srrs case TCPS_ESTABLISHED: 664292309Srrs case TCPS_FIN_WAIT_1: 665292309Srrs case TCPS_FIN_WAIT_2: 666292309Srrs case TCPS_CLOSE_WAIT: 667310211Stuexen case TCPS_CLOSING: 668310211Stuexen case TCPS_LAST_ACK: 669292309Srrs so->so_error = ECONNRESET; 670292309Srrs close: 671292309Srrs /* FALLTHROUGH */ 672292309Srrs default: 673292309Srrs tp = tcp_close(tp); 674292309Srrs } 675292309Srrs } else { 676292309Srrs TCPSTAT_INC(tcps_badrst); 677292309Srrs /* Send challenge ACK. */ 678292309Srrs tcp_respond(tp, mtod(m, void *), th, m, 679292309Srrs tp->rcv_nxt, tp->snd_nxt, TH_ACK); 680292309Srrs tp->last_ack_sent = tp->rcv_nxt; 681292309Srrs m = NULL; 682292309Srrs } 683292309Srrs } 684292309Srrs goto drop; 685292309Srrs } 686292309Srrs 687292309Srrs /* 688292309Srrs * RFC5961 Section 4.2 689292309Srrs * Send challenge ACK for any SYN in synchronized state. 690292309Srrs */ 691292309Srrs if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { 692292309Srrs KASSERT(ti_locked == TI_RLOCKED, 693292309Srrs ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); 694292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 695292309Srrs 696292309Srrs TCPSTAT_INC(tcps_badsyn); 697292309Srrs if (V_tcp_insecure_syn && 698292309Srrs SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 699292309Srrs SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 700292309Srrs tp = tcp_drop(tp, ECONNRESET); 701292309Srrs rstreason = BANDLIM_UNLIMITED; 702292309Srrs } else { 703292309Srrs /* Send challenge ACK. 
*/ 704292309Srrs tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 705292309Srrs tp->snd_nxt, TH_ACK); 706292309Srrs tp->last_ack_sent = tp->rcv_nxt; 707292309Srrs m = NULL; 708292309Srrs } 709292309Srrs goto drop; 710292309Srrs } 711292309Srrs 712292309Srrs /* 713292309Srrs * RFC 1323 PAWS: If we have a timestamp reply on this segment 714292309Srrs * and it's less than ts_recent, drop it. 715292309Srrs */ 716292309Srrs if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 717292309Srrs TSTMP_LT(to->to_tsval, tp->ts_recent)) { 718292309Srrs 719292309Srrs /* Check to see if ts_recent is over 24 days old. */ 720292309Srrs if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 721292309Srrs /* 722292309Srrs * Invalidate ts_recent. If this segment updates 723292309Srrs * ts_recent, the age will be reset later and ts_recent 724292309Srrs * will get a valid value. If it does not, setting 725292309Srrs * ts_recent to zero will at least satisfy the 726292309Srrs * requirement that zero be placed in the timestamp 727292309Srrs * echo reply when ts_recent isn't valid. The 728292309Srrs * age isn't reset until we get a valid ts_recent 729292309Srrs * because we don't want out-of-order segments to be 730292309Srrs * dropped when ts_recent is old. 731292309Srrs */ 732292309Srrs tp->ts_recent = 0; 733292309Srrs } else { 734292309Srrs TCPSTAT_INC(tcps_rcvduppack); 735292309Srrs TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 736292309Srrs TCPSTAT_INC(tcps_pawsdrop); 737292309Srrs if (tlen) 738292309Srrs goto dropafterack; 739292309Srrs goto drop; 740292309Srrs } 741292309Srrs } 742292309Srrs 743292309Srrs /* 744292309Srrs * In the SYN-RECEIVED state, validate that the packet belongs to 745292309Srrs * this connection before trimming the data to fit the receive 746292309Srrs * window. Check the sequence number versus IRS since we know 747292309Srrs * the sequence numbers haven't wrapped. This is a partial fix 748292309Srrs * for the "LAND" DoS attack. 
749292309Srrs */ 750292309Srrs if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 751292309Srrs rstreason = BANDLIM_RST_OPENPORT; 752292309Srrs goto dropwithreset; 753292309Srrs } 754292309Srrs 755292309Srrs todrop = tp->rcv_nxt - th->th_seq; 756292309Srrs if (todrop > 0) { 757292309Srrs if (thflags & TH_SYN) { 758292309Srrs thflags &= ~TH_SYN; 759292309Srrs th->th_seq++; 760292309Srrs if (th->th_urp > 1) 761292309Srrs th->th_urp--; 762292309Srrs else 763292309Srrs thflags &= ~TH_URG; 764292309Srrs todrop--; 765292309Srrs } 766292309Srrs /* 767292309Srrs * Following if statement from Stevens, vol. 2, p. 960. 768292309Srrs */ 769292309Srrs if (todrop > tlen 770292309Srrs || (todrop == tlen && (thflags & TH_FIN) == 0)) { 771292309Srrs /* 772292309Srrs * Any valid FIN must be to the left of the window. 773292309Srrs * At this point the FIN must be a duplicate or out 774292309Srrs * of sequence; drop it. 775292309Srrs */ 776292309Srrs thflags &= ~TH_FIN; 777292309Srrs 778292309Srrs /* 779292309Srrs * Send an ACK to resynchronize and drop any data. 780292309Srrs * But keep on processing for RST or ACK. 781292309Srrs */ 782292309Srrs tp->t_flags |= TF_ACKNOW; 783292309Srrs todrop = tlen; 784292309Srrs TCPSTAT_INC(tcps_rcvduppack); 785292309Srrs TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 786292309Srrs } else { 787292309Srrs TCPSTAT_INC(tcps_rcvpartduppack); 788292309Srrs TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 789292309Srrs } 790292309Srrs drop_hdrlen += todrop; /* drop from the top afterwards */ 791292309Srrs th->th_seq += todrop; 792292309Srrs tlen -= todrop; 793292309Srrs if (th->th_urp > todrop) 794292309Srrs th->th_urp -= todrop; 795292309Srrs else { 796292309Srrs thflags &= ~TH_URG; 797292309Srrs th->th_urp = 0; 798292309Srrs } 799292309Srrs } 800292309Srrs 801292309Srrs /* 802292309Srrs * If new data are received on a connection after the 803292309Srrs * user processes are gone, then RST the other end. 
804292309Srrs */ 805292309Srrs if ((so->so_state & SS_NOFDREF) && 806292309Srrs tp->t_state > TCPS_CLOSE_WAIT && tlen) { 807292309Srrs KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " 808292309Srrs "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); 809292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 810292309Srrs 811292309Srrs if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 812292309Srrs log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " 813292309Srrs "after socket was closed, " 814292309Srrs "sending RST and removing tcpcb\n", 815292309Srrs s, __func__, tcpstates[tp->t_state], tlen); 816292309Srrs free(s, M_TCPLOG); 817292309Srrs } 818292309Srrs tp = tcp_close(tp); 819292309Srrs TCPSTAT_INC(tcps_rcvafterclose); 820292309Srrs rstreason = BANDLIM_UNLIMITED; 821292309Srrs goto dropwithreset; 822292309Srrs } 823292309Srrs 824292309Srrs /* 825292309Srrs * If segment ends after window, drop trailing data 826292309Srrs * (and PUSH and FIN); if nothing left, just ACK. 827292309Srrs */ 828292309Srrs todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 829292309Srrs if (todrop > 0) { 830292309Srrs TCPSTAT_INC(tcps_rcvpackafterwin); 831292309Srrs if (todrop >= tlen) { 832292309Srrs TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 833292309Srrs /* 834292309Srrs * If window is closed can only take segments at 835292309Srrs * window edge, and have to drop data and PUSH from 836292309Srrs * incoming segments. Continue processing, but 837292309Srrs * remember to ack. Otherwise, drop segment 838292309Srrs * and ack. 
839292309Srrs */ 840292309Srrs if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 841292309Srrs tp->t_flags |= TF_ACKNOW; 842292309Srrs TCPSTAT_INC(tcps_rcvwinprobe); 843292309Srrs } else 844292309Srrs goto dropafterack; 845292309Srrs } else 846292309Srrs TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 847292309Srrs m_adj(m, -todrop); 848292309Srrs tlen -= todrop; 849292309Srrs thflags &= ~(TH_PUSH|TH_FIN); 850292309Srrs } 851292309Srrs 852292309Srrs /* 853292309Srrs * If last ACK falls within this segment's sequence numbers, 854292309Srrs * record its timestamp. 855292309Srrs * NOTE: 856292309Srrs * 1) That the test incorporates suggestions from the latest 857292309Srrs * proposal of the tcplw@cray.com list (Braden 1993/04/26). 858292309Srrs * 2) That updating only on newer timestamps interferes with 859292309Srrs * our earlier PAWS tests, so this check should be solely 860292309Srrs * predicated on the sequence space of this segment. 861292309Srrs * 3) That we modify the segment boundary check to be 862292309Srrs * Last.ACK.Sent <= SEG.SEQ + SEG.Len 863292309Srrs * instead of RFC1323's 864292309Srrs * Last.ACK.Sent < SEG.SEQ + SEG.Len, 865292309Srrs * This modified check allows us to overcome RFC1323's 866292309Srrs * limitations as described in Stevens TCP/IP Illustrated 867292309Srrs * Vol. 2 p.869. In such cases, we can still calculate the 868292309Srrs * RTT correctly when RCV.NXT == Last.ACK.Sent. 
869292309Srrs */ 870292309Srrs if ((to->to_flags & TOF_TS) != 0 && 871292309Srrs SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 872292309Srrs SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 873292309Srrs ((thflags & (TH_SYN|TH_FIN)) != 0))) { 874292309Srrs tp->ts_recent_age = tcp_ts_getticks(); 875292309Srrs tp->ts_recent = to->to_tsval; 876292309Srrs } 877292309Srrs 878292309Srrs /* 879292309Srrs * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 880292309Srrs * flag is on (half-synchronized state), then queue data for 881292309Srrs * later processing; else drop segment and return. 882292309Srrs */ 883292309Srrs if ((thflags & TH_ACK) == 0) { 884292309Srrs if (tp->t_state == TCPS_SYN_RECEIVED || 885292309Srrs (tp->t_flags & TF_NEEDSYN)) 886292309Srrs goto step6; 887292309Srrs else if (tp->t_flags & TF_ACKNOW) 888292309Srrs goto dropafterack; 889292309Srrs else 890292309Srrs goto drop; 891292309Srrs } 892292309Srrs 893292309Srrs /* 894292309Srrs * Ack processing. 895292309Srrs */ 896292309Srrs switch (tp->t_state) { 897292309Srrs 898292309Srrs /* 899292309Srrs * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 900292309Srrs * ESTABLISHED state and continue processing. 901292309Srrs * The ACK was checked above. 902292309Srrs */ 903292309Srrs case TCPS_SYN_RECEIVED: 904292309Srrs 905292309Srrs TCPSTAT_INC(tcps_connects); 906292309Srrs soisconnected(so); 907292309Srrs /* Do window scaling? 
*/ 908292309Srrs if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 909292309Srrs (TF_RCVD_SCALE|TF_REQ_SCALE)) { 910292309Srrs tp->rcv_scale = tp->request_r_scale; 911292309Srrs tp->snd_wnd = tiwin; 912292309Srrs } 913292309Srrs /* 914292309Srrs * Make transitions: 915292309Srrs * SYN-RECEIVED -> ESTABLISHED 916292309Srrs * SYN-RECEIVED* -> FIN-WAIT-1 917292309Srrs */ 918292309Srrs tp->t_starttime = ticks; 919292309Srrs if (tp->t_flags & TF_NEEDFIN) { 920292309Srrs tcp_state_change(tp, TCPS_FIN_WAIT_1); 921292309Srrs tp->t_flags &= ~TF_NEEDFIN; 922292309Srrs } else { 923292309Srrs tcp_state_change(tp, TCPS_ESTABLISHED); 924292309Srrs TCP_PROBE5(accept__established, NULL, tp, 925316208Sgnn m, tp, th); 926292309Srrs cc_conn_init(tp); 927292309Srrs tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 928292309Srrs } 929292309Srrs /* 930292309Srrs * If segment contains data or ACK, will call tcp_reass() 931292309Srrs * later; if not, do so now to pass queued data to user. 932292309Srrs */ 933292309Srrs if (tlen == 0 && (thflags & TH_FIN) == 0) 934344511Stuexen (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 935292309Srrs (struct mbuf *)0); 936292309Srrs tp->snd_wl1 = th->th_seq - 1; 937292309Srrs /* FALLTHROUGH */ 938292309Srrs 939292309Srrs /* 940292309Srrs * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 941292309Srrs * ACKs. If the ack is in the range 942292309Srrs * tp->snd_una < th->th_ack <= tp->snd_max 943292309Srrs * then advance tp->snd_una to th->th_ack and drop 944292309Srrs * data from the retransmission queue. If this ACK reflects 945292309Srrs * more up to date window information we update our window information. 
946292309Srrs */ 947292309Srrs case TCPS_ESTABLISHED: 948292309Srrs case TCPS_FIN_WAIT_1: 949292309Srrs case TCPS_FIN_WAIT_2: 950292309Srrs case TCPS_CLOSE_WAIT: 951292309Srrs case TCPS_CLOSING: 952292309Srrs case TCPS_LAST_ACK: 953292309Srrs if (SEQ_GT(th->th_ack, tp->snd_max)) { 954292309Srrs TCPSTAT_INC(tcps_rcvacktoomuch); 955292309Srrs goto dropafterack; 956292309Srrs } 957292309Srrs if ((tp->t_flags & TF_SACK_PERMIT) && 958292309Srrs ((to->to_flags & TOF_SACK) || 959292309Srrs !TAILQ_EMPTY(&tp->snd_holes))) 960292309Srrs tcp_sack_doack(tp, to, th->th_ack); 961292309Srrs else 962292309Srrs /* 963292309Srrs * Reset the value so that previous (valid) value 964292309Srrs * from the last ack with SACK doesn't get used. 965292309Srrs */ 966292309Srrs tp->sackhint.sacked_bytes = 0; 967292309Srrs 968292309Srrs /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 969292309Srrs hhook_run_tcp_est_in(tp, th, to); 970292309Srrs 971292309Srrs if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 972292309Srrs if (tlen == 0 && tiwin == tp->snd_wnd) { 973292309Srrs /* 974292309Srrs * If this is the first time we've seen a 975292309Srrs * FIN from the remote, this is not a 976292309Srrs * duplicate and it needs to be processed 977292309Srrs * normally. This happens during a 978292309Srrs * simultaneous close. 979292309Srrs */ 980292309Srrs if ((thflags & TH_FIN) && 981292309Srrs (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { 982292309Srrs tp->t_dupacks = 0; 983292309Srrs break; 984292309Srrs } 985292309Srrs TCPSTAT_INC(tcps_rcvdupack); 986292309Srrs /* 987292309Srrs * If we have outstanding data (other than 988292309Srrs * a window probe), this is a completely 989292309Srrs * duplicate ack (ie, window info didn't 990292309Srrs * change and FIN isn't set), 991292309Srrs * the ack is the biggest we've 992292309Srrs * seen and we've seen exactly our rexmt 993298995Spfg * threshold of them, assume a packet 994292309Srrs * has been dropped and retransmit it. 
995292309Srrs * Kludge snd_nxt & the congestion 996292309Srrs * window so we send only this one 997292309Srrs * packet. 998292309Srrs * 999292309Srrs * We know we're losing at the current 1000292309Srrs * window size so do congestion avoidance 1001292309Srrs * (set ssthresh to half the current window 1002292309Srrs * and pull our congestion window back to 1003292309Srrs * the new ssthresh). 1004292309Srrs * 1005292309Srrs * Dup acks mean that packets have left the 1006292309Srrs * network (they're now cached at the receiver) 1007292309Srrs * so bump cwnd by the amount in the receiver 1008292309Srrs * to keep a constant cwnd packets in the 1009292309Srrs * network. 1010292309Srrs * 1011292309Srrs * When using TCP ECN, notify the peer that 1012292309Srrs * we reduced the cwnd. 1013292309Srrs */ 1014292309Srrs if (!tcp_timer_active(tp, TT_REXMT) || 1015292309Srrs th->th_ack != tp->snd_una) 1016292309Srrs tp->t_dupacks = 0; 1017292309Srrs else if (++tp->t_dupacks > tcprexmtthresh || 1018292309Srrs IN_FASTRECOVERY(tp->t_flags)) { 1019292309Srrs cc_ack_received(tp, th, CC_DUPACK); 1020292309Srrs if ((tp->t_flags & TF_SACK_PERMIT) && 1021292309Srrs IN_FASTRECOVERY(tp->t_flags)) { 1022292309Srrs int awnd; 1023292309Srrs 1024292309Srrs /* 1025292309Srrs * Compute the amount of data in flight first. 1026292309Srrs * We can inject new data into the pipe iff 1027292309Srrs * we have less than 1/2 the original window's 1028292309Srrs * worth of data in flight. 
1029292309Srrs */ 1030292309Srrs if (V_tcp_do_rfc6675_pipe) 1031292309Srrs awnd = tcp_compute_pipe(tp); 1032292309Srrs else 1033292309Srrs awnd = (tp->snd_nxt - tp->snd_fack) + 1034292309Srrs tp->sackhint.sack_bytes_rexmit; 1035292309Srrs 1036292309Srrs if (awnd < tp->snd_ssthresh) { 1037292309Srrs tp->snd_cwnd += tp->t_maxseg; 1038292309Srrs if (tp->snd_cwnd > tp->snd_ssthresh) 1039292309Srrs tp->snd_cwnd = tp->snd_ssthresh; 1040292309Srrs } 1041292309Srrs } else 1042292309Srrs tp->snd_cwnd += tp->t_maxseg; 1043292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1044292309Srrs goto drop; 1045292309Srrs } else if (tp->t_dupacks == tcprexmtthresh) { 1046292309Srrs tcp_seq onxt = tp->snd_nxt; 1047292309Srrs 1048292309Srrs /* 1049292309Srrs * If we're doing sack, check to 1050292309Srrs * see if we're already in sack 1051292309Srrs * recovery. If we're not doing sack, 1052292309Srrs * check to see if we're in newreno 1053292309Srrs * recovery. 1054292309Srrs */ 1055292309Srrs if (tp->t_flags & TF_SACK_PERMIT) { 1056292309Srrs if (IN_FASTRECOVERY(tp->t_flags)) { 1057292309Srrs tp->t_dupacks = 0; 1058292309Srrs break; 1059292309Srrs } 1060292309Srrs } else { 1061292309Srrs if (SEQ_LEQ(th->th_ack, 1062292309Srrs tp->snd_recover)) { 1063292309Srrs tp->t_dupacks = 0; 1064292309Srrs break; 1065292309Srrs } 1066292309Srrs } 1067292309Srrs /* Congestion signal before ack. 
*/ 1068292309Srrs cc_cong_signal(tp, th, CC_NDUPACK); 1069292309Srrs cc_ack_received(tp, th, CC_DUPACK); 1070292309Srrs tcp_timer_activate(tp, TT_REXMT, 0); 1071292309Srrs tp->t_rtttime = 0; 1072292309Srrs if (tp->t_flags & TF_SACK_PERMIT) { 1073292309Srrs TCPSTAT_INC( 1074292309Srrs tcps_sack_recovery_episode); 1075292309Srrs tp->sack_newdata = tp->snd_nxt; 1076330516Seadler tp->snd_cwnd = tp->t_maxseg; 1077292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1078292309Srrs goto drop; 1079292309Srrs } 1080292309Srrs tp->snd_nxt = th->th_ack; 1081330516Seadler tp->snd_cwnd = tp->t_maxseg; 1082292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1083292309Srrs KASSERT(tp->snd_limited <= 2, 1084292309Srrs ("%s: tp->snd_limited too big", 1085292309Srrs __func__)); 1086330516Seadler tp->snd_cwnd = tp->snd_ssthresh + 1087330516Seadler tp->t_maxseg * 1088330516Seadler (tp->t_dupacks - tp->snd_limited); 1089292309Srrs if (SEQ_GT(onxt, tp->snd_nxt)) 1090292309Srrs tp->snd_nxt = onxt; 1091292309Srrs goto drop; 1092292309Srrs } else if (V_tcp_do_rfc3042) { 1093292309Srrs /* 1094292309Srrs * Process first and second duplicate 1095292309Srrs * ACKs. Each indicates a segment 1096292309Srrs * leaving the network, creating room 1097292309Srrs * for more. Make sure we can send a 1098292309Srrs * packet on reception of each duplicate 1099292309Srrs * ACK by increasing snd_cwnd by one 1100292309Srrs * segment. Restore the original 1101292309Srrs * snd_cwnd after packet transmission. 
1102292309Srrs */ 1103292309Srrs cc_ack_received(tp, th, CC_DUPACK); 1104292309Srrs u_long oldcwnd = tp->snd_cwnd; 1105292309Srrs tcp_seq oldsndmax = tp->snd_max; 1106292309Srrs u_int sent; 1107292309Srrs int avail; 1108292309Srrs 1109292309Srrs KASSERT(tp->t_dupacks == 1 || 1110292309Srrs tp->t_dupacks == 2, 1111292309Srrs ("%s: dupacks not 1 or 2", 1112292309Srrs __func__)); 1113292309Srrs if (tp->t_dupacks == 1) 1114292309Srrs tp->snd_limited = 0; 1115292309Srrs tp->snd_cwnd = 1116292309Srrs (tp->snd_nxt - tp->snd_una) + 1117292309Srrs (tp->t_dupacks - tp->snd_limited) * 1118292309Srrs tp->t_maxseg; 1119292309Srrs /* 1120292309Srrs * Only call tcp_output when there 1121292309Srrs * is new data available to be sent. 1122292309Srrs * Otherwise we would send pure ACKs. 1123292309Srrs */ 1124292309Srrs SOCKBUF_LOCK(&so->so_snd); 1125292309Srrs avail = sbavail(&so->so_snd) - 1126292309Srrs (tp->snd_nxt - tp->snd_una); 1127292309Srrs SOCKBUF_UNLOCK(&so->so_snd); 1128292309Srrs if (avail > 0) 1129292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1130292309Srrs sent = tp->snd_max - oldsndmax; 1131292309Srrs if (sent > tp->t_maxseg) { 1132292309Srrs KASSERT((tp->t_dupacks == 2 && 1133292309Srrs tp->snd_limited == 0) || 1134292309Srrs (sent == tp->t_maxseg + 1 && 1135292309Srrs tp->t_flags & TF_SENTFIN), 1136292309Srrs ("%s: sent too much", 1137292309Srrs __func__)); 1138292309Srrs tp->snd_limited = 2; 1139292309Srrs } else if (sent > 0) 1140292309Srrs ++tp->snd_limited; 1141292309Srrs tp->snd_cwnd = oldcwnd; 1142292309Srrs goto drop; 1143292309Srrs } 1144292309Srrs } else 1145292309Srrs tp->t_dupacks = 0; 1146292309Srrs break; 1147292309Srrs } 1148292309Srrs 1149292309Srrs KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 1150292309Srrs ("%s: th_ack <= snd_una", __func__)); 1151292309Srrs 1152292309Srrs /* 1153292309Srrs * If the congestion window was inflated to account 1154292309Srrs * for the other side's cached packets, retract it. 
1155292309Srrs */ 1156292309Srrs if (IN_FASTRECOVERY(tp->t_flags)) { 1157292309Srrs if (SEQ_LT(th->th_ack, tp->snd_recover)) { 1158292309Srrs if (tp->t_flags & TF_SACK_PERMIT) 1159292309Srrs tcp_sack_partialack(tp, th); 1160292309Srrs else 1161292309Srrs tcp_newreno_partial_ack(tp, th); 1162292309Srrs } else 1163292309Srrs cc_post_recovery(tp, th); 1164292309Srrs } 1165292309Srrs tp->t_dupacks = 0; 1166292309Srrs /* 1167292309Srrs * If we reach this point, ACK is not a duplicate, 1168292309Srrs * i.e., it ACKs something we sent. 1169292309Srrs */ 1170292309Srrs if (tp->t_flags & TF_NEEDSYN) { 1171292309Srrs /* 1172292309Srrs * T/TCP: Connection was half-synchronized, and our 1173292309Srrs * SYN has been ACK'd (so connection is now fully 1174292309Srrs * synchronized). Go to non-starred state, 1175292309Srrs * increment snd_una for ACK of SYN, and check if 1176292309Srrs * we can do window scaling. 1177292309Srrs */ 1178292309Srrs tp->t_flags &= ~TF_NEEDSYN; 1179292309Srrs tp->snd_una++; 1180292309Srrs /* Do window scaling? */ 1181292309Srrs if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1182292309Srrs (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1183292309Srrs tp->rcv_scale = tp->request_r_scale; 1184292309Srrs /* Send window already scaled. */ 1185292309Srrs } 1186292309Srrs } 1187292309Srrs 1188292309Srrsprocess_ACK: 1189292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 1190292309Srrs 1191292309Srrs acked = BYTES_THIS_ACK(tp, th); 1192292309Srrs TCPSTAT_INC(tcps_rcvackpack); 1193292309Srrs TCPSTAT_ADD(tcps_rcvackbyte, acked); 1194292309Srrs 1195292309Srrs /* 1196292309Srrs * If we just performed our first retransmit, and the ACK 1197292309Srrs * arrives within our recovery window, then it was a mistake 1198292309Srrs * to do the retransmit in the first place. Recover our 1199292309Srrs * original cwnd and ssthresh, and proceed to transmit where 1200292309Srrs * we left off. 
1201292309Srrs */ 1202292309Srrs if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && 1203292309Srrs (int)(ticks - tp->t_badrxtwin) < 0) 1204292309Srrs cc_cong_signal(tp, th, CC_RTO_ERR); 1205292309Srrs 1206292309Srrs /* 1207292309Srrs * If we have a timestamp reply, update smoothed 1208292309Srrs * round trip time. If no timestamp is present but 1209292309Srrs * transmit timer is running and timed sequence 1210292309Srrs * number was acked, update smoothed round trip time. 1211292309Srrs * Since we now have an rtt measurement, cancel the 1212292309Srrs * timer backoff (cf., Phil Karn's retransmit alg.). 1213292309Srrs * Recompute the initial retransmit timer. 1214292309Srrs * 1215292309Srrs * Some boxes send broken timestamp replies 1216292309Srrs * during the SYN+ACK phase, ignore 1217292309Srrs * timestamps of 0 or we could calculate a 1218292309Srrs * huge RTT and blow up the retransmit timer. 1219292309Srrs */ 1220292309Srrs if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 1221292309Srrs u_int t; 1222292309Srrs 1223292309Srrs t = tcp_ts_getticks() - to->to_tsecr; 1224292309Srrs if (!tp->t_rttlow || tp->t_rttlow > t) 1225292309Srrs tp->t_rttlow = t; 1226292309Srrs tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); 1227292309Srrs } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 1228292309Srrs if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 1229292309Srrs tp->t_rttlow = ticks - tp->t_rtttime; 1230292309Srrs tcp_xmit_timer(tp, ticks - tp->t_rtttime); 1231292309Srrs } 1232292309Srrs 1233292309Srrs /* 1234292309Srrs * If all outstanding data is acked, stop retransmit 1235292309Srrs * timer and remember to restart (more output or persist). 1236292309Srrs * If there is more data to be acked, restart retransmit 1237292309Srrs * timer, using current (possibly backed-off) value. 
1238292309Srrs */ 1239292309Srrs if (th->th_ack == tp->snd_max) { 1240292309Srrs tcp_timer_activate(tp, TT_REXMT, 0); 1241292309Srrs needoutput = 1; 1242292309Srrs } else if (!tcp_timer_active(tp, TT_PERSIST)) 1243292309Srrs tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 1244292309Srrs 1245292309Srrs /* 1246292309Srrs * If no data (only SYN) was ACK'd, 1247292309Srrs * skip rest of ACK processing. 1248292309Srrs */ 1249292309Srrs if (acked == 0) 1250292309Srrs goto step6; 1251292309Srrs 1252292309Srrs /* 1253292309Srrs * Let the congestion control algorithm update congestion 1254292309Srrs * control related information. This typically means increasing 1255292309Srrs * the congestion window. 1256292309Srrs */ 1257292309Srrs cc_ack_received(tp, th, CC_ACK); 1258292309Srrs 1259292309Srrs SOCKBUF_LOCK(&so->so_snd); 1260292309Srrs if (acked > sbavail(&so->so_snd)) { 1261292309Srrs tp->snd_wnd -= sbavail(&so->so_snd); 1262292309Srrs mfree = sbcut_locked(&so->so_snd, 1263292309Srrs (int)sbavail(&so->so_snd)); 1264292309Srrs ourfinisacked = 1; 1265292309Srrs } else { 1266292309Srrs mfree = sbcut_locked(&so->so_snd, acked); 1267292309Srrs tp->snd_wnd -= acked; 1268292309Srrs ourfinisacked = 0; 1269292309Srrs } 1270292309Srrs /* NB: sowwakeup_locked() does an implicit unlock. */ 1271292309Srrs sowwakeup_locked(so); 1272292309Srrs m_freem(mfree); 1273292309Srrs /* Detect una wraparound. */ 1274292309Srrs if (!IN_RECOVERY(tp->t_flags) && 1275292309Srrs SEQ_GT(tp->snd_una, tp->snd_recover) && 1276292309Srrs SEQ_LEQ(th->th_ack, tp->snd_recover)) 1277292309Srrs tp->snd_recover = th->th_ack - 1; 1278292309Srrs /* XXXLAS: Can this be moved up into cc_post_recovery? 
*/ 1279292309Srrs if (IN_RECOVERY(tp->t_flags) && 1280292309Srrs SEQ_GEQ(th->th_ack, tp->snd_recover)) { 1281292309Srrs EXIT_RECOVERY(tp->t_flags); 1282292309Srrs } 1283292309Srrs tp->snd_una = th->th_ack; 1284292309Srrs if (tp->t_flags & TF_SACK_PERMIT) { 1285292309Srrs if (SEQ_GT(tp->snd_una, tp->snd_recover)) 1286292309Srrs tp->snd_recover = tp->snd_una; 1287292309Srrs } 1288292309Srrs if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1289292309Srrs tp->snd_nxt = tp->snd_una; 1290292309Srrs 1291292309Srrs switch (tp->t_state) { 1292292309Srrs 1293292309Srrs /* 1294292309Srrs * In FIN_WAIT_1 STATE in addition to the processing 1295292309Srrs * for the ESTABLISHED state if our FIN is now acknowledged 1296292309Srrs * then enter FIN_WAIT_2. 1297292309Srrs */ 1298292309Srrs case TCPS_FIN_WAIT_1: 1299292309Srrs if (ourfinisacked) { 1300292309Srrs /* 1301292309Srrs * If we can't receive any more 1302292309Srrs * data, then closing user can proceed. 1303292309Srrs * Starting the timer is contrary to the 1304292309Srrs * specification, but if we don't get a FIN 1305292309Srrs * we'll hang forever. 1306292309Srrs * 1307292309Srrs * XXXjl: 1308292309Srrs * we should release the tp also, and use a 1309292309Srrs * compressed state. 1310292309Srrs */ 1311292309Srrs if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1312292309Srrs soisdisconnected(so); 1313292309Srrs tcp_timer_activate(tp, TT_2MSL, 1314292309Srrs (tcp_fast_finwait2_recycle ? 1315292309Srrs tcp_finwait2_timeout : 1316292309Srrs TP_MAXIDLE(tp))); 1317292309Srrs } 1318292309Srrs tcp_state_change(tp, TCPS_FIN_WAIT_2); 1319292309Srrs } 1320292309Srrs break; 1321292309Srrs 1322292309Srrs /* 1323292309Srrs * In CLOSING STATE in addition to the processing for 1324292309Srrs * the ESTABLISHED state if the ACK acknowledges our FIN 1325292309Srrs * then enter the TIME-WAIT state, otherwise ignore 1326292309Srrs * the segment. 
1327292309Srrs */ 1328292309Srrs case TCPS_CLOSING: 1329292309Srrs if (ourfinisacked) { 1330292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1331292309Srrs tcp_twstart(tp); 1332292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1333292309Srrs m_freem(m); 1334292309Srrs return; 1335292309Srrs } 1336292309Srrs break; 1337292309Srrs 1338292309Srrs /* 1339292309Srrs * In LAST_ACK, we may still be waiting for data to drain 1340292309Srrs * and/or to be acked, as well as for the ack of our FIN. 1341292309Srrs * If our FIN is now acknowledged, delete the TCB, 1342292309Srrs * enter the closed state and return. 1343292309Srrs */ 1344292309Srrs case TCPS_LAST_ACK: 1345292309Srrs if (ourfinisacked) { 1346292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1347292309Srrs tp = tcp_close(tp); 1348292309Srrs goto drop; 1349292309Srrs } 1350292309Srrs break; 1351292309Srrs } 1352292309Srrs } 1353292309Srrs 1354292309Srrsstep6: 1355292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 1356292309Srrs 1357292309Srrs /* 1358292309Srrs * Update window information. 1359292309Srrs * Don't look at window if no ACK: TAC's send garbage on first SYN. 1360292309Srrs */ 1361292309Srrs if ((thflags & TH_ACK) && 1362292309Srrs (SEQ_LT(tp->snd_wl1, th->th_seq) || 1363292309Srrs (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 1364292309Srrs (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1365292309Srrs /* keep track of pure window updates */ 1366292309Srrs if (tlen == 0 && 1367292309Srrs tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1368292309Srrs TCPSTAT_INC(tcps_rcvwinupd); 1369292309Srrs tp->snd_wnd = tiwin; 1370292309Srrs tp->snd_wl1 = th->th_seq; 1371292309Srrs tp->snd_wl2 = th->th_ack; 1372292309Srrs if (tp->snd_wnd > tp->max_sndwnd) 1373292309Srrs tp->max_sndwnd = tp->snd_wnd; 1374292309Srrs needoutput = 1; 1375292309Srrs } 1376292309Srrs 1377292309Srrs /* 1378292309Srrs * Process segments with URG. 
1379292309Srrs */ 1380292309Srrs if ((thflags & TH_URG) && th->th_urp && 1381292309Srrs TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1382292309Srrs /* 1383292309Srrs * This is a kludge, but if we receive and accept 1384292309Srrs * random urgent pointers, we'll crash in 1385292309Srrs * soreceive. It's hard to imagine someone 1386292309Srrs * actually wanting to send this much urgent data. 1387292309Srrs */ 1388292309Srrs SOCKBUF_LOCK(&so->so_rcv); 1389292309Srrs if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 1390292309Srrs th->th_urp = 0; /* XXX */ 1391292309Srrs thflags &= ~TH_URG; /* XXX */ 1392292309Srrs SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 1393292309Srrs goto dodata; /* XXX */ 1394292309Srrs } 1395292309Srrs /* 1396292309Srrs * If this segment advances the known urgent pointer, 1397292309Srrs * then mark the data stream. This should not happen 1398292309Srrs * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1399292309Srrs * a FIN has been received from the remote side. 1400292309Srrs * In these states we ignore the URG. 1401292309Srrs * 1402292309Srrs * According to RFC961 (Assigned Protocols), 1403292309Srrs * the urgent pointer points to the last octet 1404292309Srrs * of urgent data. We continue, however, 1405292309Srrs * to consider it to indicate the first octet 1406292309Srrs * of data past the urgent section as the original 1407292309Srrs * spec states (in one of two places). 
1408292309Srrs */ 1409292309Srrs if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1410292309Srrs tp->rcv_up = th->th_seq + th->th_urp; 1411292309Srrs so->so_oobmark = sbavail(&so->so_rcv) + 1412292309Srrs (tp->rcv_up - tp->rcv_nxt) - 1; 1413292309Srrs if (so->so_oobmark == 0) 1414292309Srrs so->so_rcv.sb_state |= SBS_RCVATMARK; 1415292309Srrs sohasoutofband(so); 1416292309Srrs tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1417292309Srrs } 1418292309Srrs SOCKBUF_UNLOCK(&so->so_rcv); 1419292309Srrs /* 1420292309Srrs * Remove out of band data so doesn't get presented to user. 1421292309Srrs * This can happen independent of advancing the URG pointer, 1422292309Srrs * but if two URG's are pending at once, some out-of-band 1423292309Srrs * data may creep in... ick. 1424292309Srrs */ 1425292309Srrs if (th->th_urp <= (u_long)tlen && 1426292309Srrs !(so->so_options & SO_OOBINLINE)) { 1427292309Srrs /* hdr drop is delayed */ 1428292309Srrs tcp_pulloutofband(so, th, m, drop_hdrlen); 1429292309Srrs } 1430292309Srrs } else { 1431292309Srrs /* 1432292309Srrs * If no out of band data is expected, 1433292309Srrs * pull receive urgent pointer along 1434292309Srrs * with the receive window. 1435292309Srrs */ 1436292309Srrs if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1437292309Srrs tp->rcv_up = tp->rcv_nxt; 1438292309Srrs } 1439292309Srrsdodata: /* XXX */ 1440292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 1441292309Srrs 1442292309Srrs /* 1443292309Srrs * Process the segment text, merging it into the TCP sequencing queue, 1444292309Srrs * and arranging for acknowledgment of receipt if necessary. 1445292309Srrs * This process logically involves adjusting tp->rcv_wnd as data 1446292309Srrs * is presented to the user (this happens in tcp_usrreq.c, 1447292309Srrs * case PRU_RCVD). If a FIN has already been received on this 1448292309Srrs * connection then we just ignore the text. 
1449292309Srrs */ 1450292309Srrs if ((tlen || (thflags & TH_FIN)) && 1451292309Srrs TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1452292309Srrs tcp_seq save_start = th->th_seq; 1453292309Srrs m_adj(m, drop_hdrlen); /* delayed header drop */ 1454292309Srrs /* 1455292309Srrs * Insert segment which includes th into TCP reassembly queue 1456292309Srrs * with control block tp. Set thflags to whether reassembly now 1457292309Srrs * includes a segment with FIN. This handles the common case 1458292309Srrs * inline (segment is the next to be received on an established 1459292309Srrs * connection, and the queue is empty), avoiding linkage into 1460292309Srrs * and removal from the queue and repetition of various 1461292309Srrs * conversions. 1462292309Srrs * Set DELACK for segments received in order, but ack 1463292309Srrs * immediately when segments are out of order (so 1464292309Srrs * fast retransmit can work). 1465292309Srrs */ 1466292309Srrs if (th->th_seq == tp->rcv_nxt && 1467344511Stuexen SEGQ_EMPTY(tp) && 1468292309Srrs TCPS_HAVEESTABLISHED(tp->t_state)) { 1469292309Srrs if (DELAY_ACK(tp, tlen)) 1470292309Srrs tp->t_flags |= TF_DELACK; 1471292309Srrs else 1472292309Srrs tp->t_flags |= TF_ACKNOW; 1473292309Srrs tp->rcv_nxt += tlen; 1474292309Srrs thflags = th->th_flags & TH_FIN; 1475292309Srrs TCPSTAT_INC(tcps_rcvpack); 1476292309Srrs TCPSTAT_ADD(tcps_rcvbyte, tlen); 1477292309Srrs SOCKBUF_LOCK(&so->so_rcv); 1478292309Srrs if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1479292309Srrs m_freem(m); 1480292309Srrs else 1481292309Srrs sbappendstream_locked(&so->so_rcv, m, 0); 1482292309Srrs /* NB: sorwakeup_locked() does an implicit unlock. */ 1483292309Srrs sorwakeup_locked(so); 1484292309Srrs } else { 1485292309Srrs /* 1486292309Srrs * XXX: Due to the header drop above "th" is 1487292309Srrs * theoretically invalid by now. Fortunately 1488292309Srrs * m_adj() doesn't actually frees any mbufs 1489292309Srrs * when trimming from the head. 
1490292309Srrs */ 1491344511Stuexen thflags = tcp_reass(tp, th, &save_start, &tlen, m); 1492292309Srrs tp->t_flags |= TF_ACKNOW; 1493292309Srrs } 1494292309Srrs if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 1495292309Srrs tcp_update_sack_list(tp, save_start, save_start + tlen); 1496292309Srrs#if 0 1497292309Srrs /* 1498292309Srrs * Note the amount of data that peer has sent into 1499292309Srrs * our window, in order to estimate the sender's 1500292309Srrs * buffer size. 1501292309Srrs * XXX: Unused. 1502292309Srrs */ 1503292309Srrs if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) 1504292309Srrs len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1505292309Srrs else 1506292309Srrs len = so->so_rcv.sb_hiwat; 1507292309Srrs#endif 1508292309Srrs } else { 1509292309Srrs m_freem(m); 1510292309Srrs thflags &= ~TH_FIN; 1511292309Srrs } 1512292309Srrs 1513292309Srrs /* 1514292309Srrs * If FIN is received ACK the FIN and let the user know 1515292309Srrs * that the connection is closing. 1516292309Srrs */ 1517292309Srrs if (thflags & TH_FIN) { 1518292309Srrs if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1519292309Srrs socantrcvmore(so); 1520292309Srrs /* 1521292309Srrs * If connection is half-synchronized 1522292309Srrs * (ie NEEDSYN flag on) then delay ACK, 1523292309Srrs * so it may be piggybacked when SYN is sent. 1524292309Srrs * Otherwise, since we received a FIN then no 1525292309Srrs * more input can be expected, send ACK now. 1526292309Srrs */ 1527292309Srrs if (tp->t_flags & TF_NEEDSYN) 1528292309Srrs tp->t_flags |= TF_DELACK; 1529292309Srrs else 1530292309Srrs tp->t_flags |= TF_ACKNOW; 1531292309Srrs tp->rcv_nxt++; 1532292309Srrs } 1533292309Srrs switch (tp->t_state) { 1534292309Srrs 1535292309Srrs /* 1536292309Srrs * In SYN_RECEIVED and ESTABLISHED STATES 1537292309Srrs * enter the CLOSE_WAIT state. 
1538292309Srrs */ 1539292309Srrs case TCPS_SYN_RECEIVED: 1540292309Srrs tp->t_starttime = ticks; 1541292309Srrs /* FALLTHROUGH */ 1542292309Srrs case TCPS_ESTABLISHED: 1543292309Srrs tcp_state_change(tp, TCPS_CLOSE_WAIT); 1544292309Srrs break; 1545292309Srrs 1546292309Srrs /* 1547292309Srrs * If still in FIN_WAIT_1 STATE FIN has not been acked so 1548292309Srrs * enter the CLOSING state. 1549292309Srrs */ 1550292309Srrs case TCPS_FIN_WAIT_1: 1551292309Srrs tcp_state_change(tp, TCPS_CLOSING); 1552292309Srrs break; 1553292309Srrs 1554292309Srrs /* 1555292309Srrs * In FIN_WAIT_2 state enter the TIME_WAIT state, 1556292309Srrs * starting the time-wait timer, turning off the other 1557292309Srrs * standard timers. 1558292309Srrs */ 1559292309Srrs case TCPS_FIN_WAIT_2: 1560292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1561292309Srrs KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " 1562292309Srrs "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 1563292309Srrs ti_locked)); 1564292309Srrs 1565292309Srrs tcp_twstart(tp); 1566292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1567292309Srrs return; 1568292309Srrs } 1569292309Srrs } 1570292309Srrs if (ti_locked == TI_RLOCKED) { 1571292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1572292309Srrs } 1573292309Srrs ti_locked = TI_UNLOCKED; 1574292309Srrs 1575292309Srrs#ifdef TCPDEBUG 1576292309Srrs if (so->so_options & SO_DEBUG) 1577292309Srrs tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 1578292309Srrs &tcp_savetcp, 0); 1579292309Srrs#endif 1580316208Sgnn TCP_PROBE3(debug__input, tp, th, m); 1581292309Srrs 1582292309Srrs /* 1583292309Srrs * Return any desired output. 
1584292309Srrs */ 1585292309Srrs if (needoutput || (tp->t_flags & TF_ACKNOW)) 1586292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1587292309Srrs 1588292309Srrs KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 1589292309Srrs __func__, ti_locked)); 1590292309Srrs INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1591292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 1592292309Srrs 1593292309Srrs if (tp->t_flags & TF_DELACK) { 1594292309Srrs tp->t_flags &= ~TF_DELACK; 1595292309Srrs tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 1596292309Srrs } 1597292309Srrs INP_WUNLOCK(tp->t_inpcb); 1598292309Srrs return; 1599292309Srrs 1600292309Srrsdropafterack: 1601292309Srrs /* 1602292309Srrs * Generate an ACK dropping incoming segment if it occupies 1603292309Srrs * sequence space, where the ACK reflects our state. 1604292309Srrs * 1605292309Srrs * We can now skip the test for the RST flag since all 1606292309Srrs * paths to this code happen after packets containing 1607292309Srrs * RST have been dropped. 1608292309Srrs * 1609292309Srrs * In the SYN-RECEIVED state, don't send an ACK unless the 1610292309Srrs * segment we received passes the SYN-RECEIVED ACK test. 1611292309Srrs * If it fails send a RST. This breaks the loop in the 1612292309Srrs * "LAND" DoS attack, and also prevents an ACK storm 1613292309Srrs * between two listening ports that have been sent forged 1614292309Srrs * SYN segments, each with the source address of the other. 
1615292309Srrs */ 1616292309Srrs if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 1617292309Srrs (SEQ_GT(tp->snd_una, th->th_ack) || 1618292309Srrs SEQ_GT(th->th_ack, tp->snd_max)) ) { 1619292309Srrs rstreason = BANDLIM_RST_OPENPORT; 1620292309Srrs goto dropwithreset; 1621292309Srrs } 1622292309Srrs#ifdef TCPDEBUG 1623292309Srrs if (so->so_options & SO_DEBUG) 1624292309Srrs tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 1625292309Srrs &tcp_savetcp, 0); 1626292309Srrs#endif 1627316208Sgnn TCP_PROBE3(debug__drop, tp, th, m); 1628292309Srrs if (ti_locked == TI_RLOCKED) { 1629292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1630292309Srrs } 1631292309Srrs ti_locked = TI_UNLOCKED; 1632292309Srrs 1633292309Srrs tp->t_flags |= TF_ACKNOW; 1634292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1635292309Srrs INP_WUNLOCK(tp->t_inpcb); 1636292309Srrs m_freem(m); 1637292309Srrs return; 1638292309Srrs 1639292309Srrsdropwithreset: 1640292309Srrs if (ti_locked == TI_RLOCKED) { 1641292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1642292309Srrs } 1643292309Srrs ti_locked = TI_UNLOCKED; 1644292309Srrs 1645292309Srrs if (tp != NULL) { 1646292309Srrs tcp_dropwithreset(m, th, tp, tlen, rstreason); 1647292309Srrs INP_WUNLOCK(tp->t_inpcb); 1648292309Srrs } else 1649292309Srrs tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1650292309Srrs return; 1651292309Srrs 1652292309Srrsdrop: 1653292309Srrs if (ti_locked == TI_RLOCKED) { 1654292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1655292309Srrs ti_locked = TI_UNLOCKED; 1656292309Srrs } 1657292309Srrs#ifdef INVARIANTS 1658292309Srrs else 1659292309Srrs INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1660292309Srrs#endif 1661292309Srrs 1662292309Srrs /* 1663292309Srrs * Drop space held by incoming segment and return. 
 */
#ifdef TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
			  &tcp_savetcp, 0);
#endif
	TCP_PROBE3(debug__drop, tp, th, m);
	/* tp may be NULL on the "drop" path; only unlock the inpcb if valid. */
	if (tp != NULL)
		INP_WUNLOCK(tp->t_inpcb);
	m_freem(m);
}


/*
 * Do fast slow is a combination of the original
 * tcp_dosegment and a split fastpath, one function
 * for the fast-ack which also includes allowing fastpath
 * for window advanced in sequence acks. And also a
 * sub-function that handles the insequence data.
 *
 * Parameters:
 *   m           - received segment; ownership passes to whichever path is
 *                 taken (freed here on the early-drop paths, otherwise
 *                 handed to tcp_do_fastack()/tcp_do_fastnewdata()/
 *                 tcp_do_slowpath(), which are presumed to consume it --
 *                 they are defined elsewhere).
 *   th          - TCP header inside m.
 *   so, tp      - socket and TCP control block; tp->t_inpcb is write-locked
 *                 on entry (asserted below) and released by the called
 *                 helpers or on the early-drop paths here.
 *   drop_hdrlen - length of link/IP/TCP headers preceding the payload.
 *   tlen        - TCP payload length.
 *   iptos       - IP TOS byte, consulted for ECN codepoints.
 *   ti_locked   - caller's V_tcbinfo lock state (TI_RLOCKED/TI_UNLOCKED),
 *                 threaded through to the helpers.
 */
void
tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
    int ti_locked)
{
	int thflags;
	u_long tiwin;
	char *s;
	int can_enter;
	struct in_conninfo *inc;
	struct tcpopt to;

	thflags = th->th_flags;
	inc = &tp->t_inpcb->inp_inc;
	/*
	 * If this is either a state-changing packet or current state isn't
	 * established, we require a read lock on tcbinfo.  Otherwise, we
	 * allow the tcbinfo to be in either locked or unlocked state, as the
	 * caller may have unnecessarily acquired a lock due to a race.
	 */
	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
	    tp->t_state != TCPS_ESTABLISHED) {
		KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
		    "SYN/FIN/RST/!EST", __func__, ti_locked));
		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	} else {
#ifdef INVARIANTS
		if (ti_locked == TI_RLOCKED) {
			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
		} else {
			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
			    "ti_locked: %d", __func__, ti_locked));
			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
		}
#endif
	}
	INP_WLOCK_ASSERT(tp->t_inpcb);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

	/* Discard forged SYN|FIN segments when the sysctl asks for it. */
	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: "
			    "SYN|FIN segment ignored (based on "
			    "sysctl setting)\n", s, __func__);
			free(s, M_TCPLOG);
		}
		if (ti_locked == TI_RLOCKED) {
			INP_INFO_RUNLOCK(&V_tcbinfo);
		}
		INP_WUNLOCK(tp->t_inpcb);
		m_freem(m);
		return;
	}

	/*
	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
	 * check SEQ.ACK first.
	 */
	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
		tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED);
		if (ti_locked == TI_RLOCKED) {
			INP_INFO_RUNLOCK(&V_tcbinfo);
		}
		INP_WUNLOCK(tp->t_inpcb);
		return;
	}

	tp->sackhint.last_sack_ack = 0;

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 * XXX: This should be done after segment
	 * validation to ignore broken/spoofed segs.
	 */
	tp->t_rcvtime = ticks;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));

	/*
	 * Unscale the window into a 32-bit value.
	 * For the SYN_SENT state the scale is zero.
	 */
	tiwin = th->th_win << tp->snd_scale;

	/*
	 * TCP ECN processing.
	 */
	if (tp->t_flags & TF_ECN_PERMIT) {
		if (thflags & TH_CWR)
			tp->t_flags &= ~TF_ECN_SND_ECE;
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			tp->t_flags |= TF_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_ce);
			break;
		case IPTOS_ECN_ECT0:
			TCPSTAT_INC(tcps_ecn_ect0);
			break;
		case IPTOS_ECN_ECT1:
			TCPSTAT_INC(tcps_ecn_ect1);
			break;
		}
		/* Congestion experienced. */
		if (thflags & TH_ECE) {
			cc_cong_signal(tp, th, CC_ECN);
		}
	}

	/*
	 * Parse options on any incoming segment.
	 */
	tcp_dooptions(&to, (u_char *)(th + 1),
	    (th->th_off << 2) - sizeof(struct tcphdr),
	    (thflags & TH_SYN) ? TO_SYN : 0);

	/*
	 * If echoed timestamp is later than the current time,
	 * fall back to non RFC1323 RTT calculation.  Normalize
	 * timestamp if syncookies were used when this connection
	 * was established.
	 */
	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
		to.to_tsecr -= tp->ts_offset;
		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
			to.to_tsecr = 0;
	}

	/*
	 * Process options only when we get SYN/ACK back. The SYN case
	 * for incoming connections is handled in tcp_syncache.
	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
	 * or <SYN,ACK>) segment itself is never scaled.
	 * XXX this is traditional behavior, may need to be cleaned up.
	 */
	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
		if ((to.to_flags & TOF_SCALE) &&
		    (tp->t_flags & TF_REQ_SCALE)) {
			tp->t_flags |= TF_RCVD_SCALE;
			tp->snd_scale = to.to_wscale;
		}
		/*
		 * Initial send window.  It will be updated with
		 * the next incoming segment to the scaled value.
		 */
		tp->snd_wnd = th->th_win;
		if (to.to_flags & TOF_TS) {
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = to.to_tsval;
			tp->ts_recent_age = tcp_ts_getticks();
		}
		if (to.to_flags & TOF_MSS)
			tcp_mss(tp, to.to_mss);
		if ((tp->t_flags & TF_SACK_PERMIT) &&
		    (to.to_flags & TOF_SACKPERM) == 0)
			tp->t_flags &= ~TF_SACK_PERMIT;
	}

	/*
	 * If timestamps were negotiated during SYN/ACK they should
	 * appear on every segment during this session and vice versa.
	 * Mismatches are only logged here, not acted upon.
	 */
	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
			    "no action\n", s, __func__);
			free(s, M_TCPLOG);
		}
	}
	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
			    "no action\n", s, __func__);
			free(s, M_TCPLOG);
		}
	}

	/*
	 * can_enter gates the fast-path below.  For pure ACKs (tlen == 0)
	 * window updates are also allowed in; for data segments the
	 * traditional "window unchanged" criterion applies.
	 */
	can_enter = 0;
	if (__predict_true((tlen == 0))) {
		/*
		 * The ack moved forward and we have a window (non-zero)
		 * <or>
		 * The ack did not move forward, but the window increased.
		 */
		if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) ||
		    ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) {
			can_enter = 1;
		}
	} else {
		/*
		 * Data incoming, use the old entry criteria
		 * for fast-path with data.
		 */
		if ((tiwin && tiwin == tp->snd_wnd)) {
			can_enter = 1;
		}
	}
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 * Make sure that the hidden state-flags are also off.
	 * Since we check for TCPS_ESTABLISHED first, it can only
	 * be TH_NEEDSYN.
	 */
	if (__predict_true(tp->t_state == TCPS_ESTABLISHED &&
	    th->th_seq == tp->rcv_nxt &&
	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    tp->snd_nxt == tp->snd_max &&
	    can_enter &&
	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
	    SEGQ_EMPTY(tp) &&
	    ((to.to_flags & TOF_TS) == 0 ||
	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) {
		if (__predict_true((tlen == 0) &&
		    (SEQ_LEQ(th->th_ack, tp->snd_max) &&
		    !IN_RECOVERY(tp->t_flags) &&
		    (to.to_flags & TOF_SACK) == 0 &&
		    TAILQ_EMPTY(&tp->snd_holes)))) {
			/* We are done */
			tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
			    ti_locked, tiwin);
			return;
		} else if ((tlen) &&
		    (th->th_ack == tp->snd_una &&
		    tlen <= sbspace(&so->so_rcv))) {
			/* In-sequence data with room in the receive buffer. */
			tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen,
			    ti_locked, tiwin);
			/* We are done */
			return;
		}
	}
	/* Anything else takes the full (slow) segment-processing path. */
	tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
	    ti_locked, tiwin, thflags);
}


/*
 * This subfunction is used to try to highly optimize the
 * fast path. We again allow window updates that are
 * in sequence to remain in the fast-path. We also add
 * in the __predict's to attempt to help the compiler.
 * Note that if we return a 0, then we can *not* process
 * it and the caller should push the packet into the
 * slow-path.
 */
/*
 * Returns 1 when the segment was fully handled here (m freed, V_tcbinfo
 * read lock dropped if held, tp->t_inpcb unlocked).  Returns 0 when the
 * segment does not qualify; in that case no locks are released and no
 * state is modified, and the caller must take the slow path.
 */
static int
tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
    int ti_locked, u_long tiwin)
{
	int acked;
	int winup_only=0;
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif


	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
		/* Old ack, behind (or duplicate to) the last one rcv'd */
		return (0);
	}
	/*
	 * NOTE(review): SEQ_LEQ above already rejects th_ack == snd_una,
	 * so this duplicate-ack test appears unreachable -- confirm before
	 * relying on it.
	 */
	if (__predict_false(th->th_ack == tp->snd_una) &&
	    __predict_false(tiwin <= tp->snd_wnd)) {
		/* duplicate ack <or> a shrinking dup ack with shrinking window */
		return (0);
	}
	if (__predict_false(tiwin == 0)) {
		/* zero window */
		return (0);
	}
	if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
		/* Above what we have sent? */
		return (0);
	}
	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
		/* We are retransmitting */
		return (0);
	}
	if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) {
		/* We need a SYN or a FIN, unlikely.. */
		return (0);
	}
	if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
		/* Timestamp is behind .. old ack with seq wrap? */
		return (0);
	}
	if (__predict_false(IN_RECOVERY(tp->t_flags))) {
		/* Still recovering */
		return (0);
	}
	if (__predict_false(to->to_flags & TOF_SACK)) {
		/* Sack included in the ack.. */
		return (0);
	}
	if (!TAILQ_EMPTY(&tp->snd_holes)) {
		/* We have sack holes on our scoreboard */
		return (0);
	}
	/* Ok if we reach here, we can process a fast-ack */

	/* Did the window get updated? */
	if (tiwin != tp->snd_wnd) {
		/* keep track of pure window updates */
		if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
			winup_only = 1;
			TCPSTAT_INC(tcps_rcvwinupd);
		}
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
	}
	/*
	 * Pull snd_wl2 up to prevent seq wrap relative
	 * to th_ack.
	 */
	tp->snd_wl2 = th->th_ack;
	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record the timestamp.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	/*
	 * This is a pure ack for outstanding data.
	 * The global tcbinfo lock is no longer needed past this point.
	 */
	if (ti_locked == TI_RLOCKED) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
	}
	ti_locked = TI_UNLOCKED;

	TCPSTAT_INC(tcps_predack);

	/*
	 * "bad retransmit" recovery: the ACK arrived inside the bad
	 * retransmit window, so the RTO was spurious -- undo its effects.
	 */
	if (tp->t_rxtshift == 1 &&
	    tp->t_flags & TF_PREVVALID &&
	    (int)(ticks - tp->t_badrxtwin) < 0) {
		cc_cong_signal(tp, th, CC_RTO_ERR);
	}

	/*
	 * Recalculate the transmit timer / rtt.
	 *
	 * Some boxes send broken timestamp replies
	 * during the SYN+ACK phase, ignore
	 * timestamps of 0 or we could calculate a
	 * huge RTT and blow up the retransmit timer.
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    to->to_tsecr) {
		u_int t;

		t = tcp_ts_getticks() - to->to_tsecr;
		if (!tp->t_rttlow || tp->t_rttlow > t)
			tp->t_rttlow = t;
		tcp_xmit_timer(tp,
		    TCP_TS_TO_TICKS(t) + 1);
	} else if (tp->t_rtttime &&
	    SEQ_GT(th->th_ack, tp->t_rtseq)) {
		if (!tp->t_rttlow ||
		    tp->t_rttlow > ticks - tp->t_rtttime)
			tp->t_rttlow = ticks - tp->t_rtttime;
		tcp_xmit_timer(tp,
		    ticks - tp->t_rtttime);
	}
	if (winup_only == 0) {
		/* The ACK advanced snd_una: release acked data and update CC. */
		acked = BYTES_THIS_ACK(tp, th);

		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
		hhook_run_tcp_est_in(tp, th, to);

		TCPSTAT_ADD(tcps_rcvackbyte, acked);
		sbdrop(&so->so_snd, acked);
		if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
		    SEQ_LEQ(th->th_ack, tp->snd_recover))
			tp->snd_recover = th->th_ack - 1;

		/*
		 * Let the congestion control algorithm update
		 * congestion control related information. This
		 * typically means increasing the congestion
		 * window.
		 */
		cc_ack_received(tp, th, CC_ACK);

		tp->snd_una = th->th_ack;
		tp->t_dupacks = 0;

		/*
		 * If all outstanding data are acked, stop
		 * retransmit timer, otherwise restart timer
		 * using current (possibly backed-off) value.
		 * If process is waiting for space,
		 * wakeup/selwakeup/signal.  If data
		 * are ready to send, let tcp_output
		 * decide between more output or persist.
		 */
#ifdef TCPDEBUG
		if (so->so_options & SO_DEBUG)
			tcp_trace(TA_INPUT, ostate, tp,
			    (void *)tcp_saveipgen,
			    &tcp_savetcp, 0);
#endif
		TCP_PROBE3(debug__input, tp, th, m);
		m_freem(m);
		if (tp->snd_una == tp->snd_max)
			tcp_timer_activate(tp, TT_REXMT, 0);
		else if (!tcp_timer_active(tp, TT_PERSIST))
			tcp_timer_activate(tp, TT_REXMT,
			    tp->t_rxtcur);
		/* Wake up the socket if we have room to write more */
		sowwakeup(so);
	} else {
		/*
		 * Window update only, just free the mbufs and
		 * send out whatever we can.
		 */
		m_freem(m);
	}
	if (sbavail(&so->so_snd))
		(void) tcp_output(tp);
	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
	    __func__, ti_locked));
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (tp->t_flags & TF_DELACK) {
		tp->t_flags &= ~TF_DELACK;
		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
	}
	INP_WUNLOCK(tp->t_inpcb);
	return (1);
}

/*
 * This tcp-do-segment concentrates on making the fastest
 * ack processing path. It does not have a fast-path for
 * data (it possibly could which would then eliminate the
 * need for fast-slow above). For a content distributor having
 * large outgoing elephants and very very little coming in
 * having no fastpath for data does not really help (since you
 * don't get much data in). The most important thing is
 * processing ack's quickly and getting the rest of the data
 * output to the peer as quickly as possible. This routine
 * seems to be about an overall 3% faster then the old
 * tcp_do_segment and keeps us in the fast-path for packets
 * much more (by allowing window updates to also stay in the fastpath).
 */
/*
 * See the block comment above: ack-oriented do_segment variant.
 * Parameters match tcp_do_segment_fastslow(); m is consumed by either
 * tcp_fastack() (when it returns nonzero), tcp_do_slowpath(), or the
 * early-drop paths in this function.
 */
void
tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
    int ti_locked)
{
	int thflags;
	u_long tiwin;
	char *s;
	struct in_conninfo *inc;
	struct tcpopt to;

	thflags = th->th_flags;
	inc = &tp->t_inpcb->inp_inc;
	/*
	 * If this is either a state-changing packet or current state isn't
	 * established, we require a read lock on tcbinfo.  Otherwise, we
	 * allow the tcbinfo to be in either locked or unlocked state, as the
	 * caller may have unnecessarily acquired a lock due to a race.
	 */
	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
	    tp->t_state != TCPS_ESTABLISHED) {
		KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
		    "SYN/FIN/RST/!EST", __func__, ti_locked));
		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	} else {
#ifdef INVARIANTS
		if (ti_locked == TI_RLOCKED) {
			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
		} else {
			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
			    "ti_locked: %d", __func__, ti_locked));
			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
		}
#endif
	}
	INP_WLOCK_ASSERT(tp->t_inpcb);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

	/* Discard forged SYN|FIN segments when the sysctl asks for it. */
	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: "
			    "SYN|FIN segment ignored (based on "
			    "sysctl setting)\n", s, __func__);
			free(s, M_TCPLOG);
		}
		if (ti_locked == TI_RLOCKED) {
			INP_INFO_RUNLOCK(&V_tcbinfo);
		}
		INP_WUNLOCK(tp->t_inpcb);
		m_freem(m);
		return;
	}

	/*
	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
	 * check SEQ.ACK first.
	 */
	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
		tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED);
		if (ti_locked == TI_RLOCKED) {
			INP_INFO_RUNLOCK(&V_tcbinfo);
		}
		INP_WUNLOCK(tp->t_inpcb);
		return;
	}

	tp->sackhint.last_sack_ack = 0;

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 * XXX: This should be done after segment
	 * validation to ignore broken/spoofed segs.
	 */
	tp->t_rcvtime = ticks;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));

	/*
	 * Unscale the window into a 32-bit value.
	 * For the SYN_SENT state the scale is zero.
	 */
	tiwin = th->th_win << tp->snd_scale;

	/*
	 * TCP ECN processing.
	 */
	if (tp->t_flags & TF_ECN_PERMIT) {
		if (thflags & TH_CWR)
			tp->t_flags &= ~TF_ECN_SND_ECE;
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			tp->t_flags |= TF_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_ce);
			break;
		case IPTOS_ECN_ECT0:
			TCPSTAT_INC(tcps_ecn_ect0);
			break;
		case IPTOS_ECN_ECT1:
			TCPSTAT_INC(tcps_ecn_ect1);
			break;
		}
		/* Congestion experienced. */
		if (thflags & TH_ECE) {
			cc_cong_signal(tp, th, CC_ECN);
		}
	}

	/*
	 * Parse options on any incoming segment.
	 */
	tcp_dooptions(&to, (u_char *)(th + 1),
	    (th->th_off << 2) - sizeof(struct tcphdr),
	    (thflags & TH_SYN) ? TO_SYN : 0);

	/*
	 * If echoed timestamp is later than the current time,
	 * fall back to non RFC1323 RTT calculation.  Normalize
	 * timestamp if syncookies were used when this connection
	 * was established.
	 */
	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
		to.to_tsecr -= tp->ts_offset;
		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
			to.to_tsecr = 0;
	}

	/*
	 * Process options only when we get SYN/ACK back. The SYN case
	 * for incoming connections is handled in tcp_syncache.
	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
	 * or <SYN,ACK>) segment itself is never scaled.
	 * XXX this is traditional behavior, may need to be cleaned up.
	 */
	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
		if ((to.to_flags & TOF_SCALE) &&
		    (tp->t_flags & TF_REQ_SCALE)) {
			tp->t_flags |= TF_RCVD_SCALE;
			tp->snd_scale = to.to_wscale;
		}
		/*
		 * Initial send window.  It will be updated with
		 * the next incoming segment to the scaled value.
		 */
		tp->snd_wnd = th->th_win;
		if (to.to_flags & TOF_TS) {
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = to.to_tsval;
			tp->ts_recent_age = tcp_ts_getticks();
		}
		if (to.to_flags & TOF_MSS)
			tcp_mss(tp, to.to_mss);
		if ((tp->t_flags & TF_SACK_PERMIT) &&
		    (to.to_flags & TOF_SACKPERM) == 0)
			tp->t_flags &= ~TF_SACK_PERMIT;
	}

	/*
	 * If timestamps were negotiated during SYN/ACK they should
	 * appear on every segment during this session and vice versa.
	 * Mismatches are only logged here, not acted upon.
	 */
	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
			    "no action\n", s, __func__);
			free(s, M_TCPLOG);
		}
	}
	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
			    "no action\n", s, __func__);
			free(s, M_TCPLOG);
		}
	}

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 * Make sure that the hidden state-flags are also off.
	 * Since we check for TCPS_ESTABLISHED first, it can only
	 * be TH_NEEDSYN.
	 */
	if (__predict_true(tp->t_state == TCPS_ESTABLISHED) &&
	    __predict_true(((to.to_flags & TOF_SACK) == 0)) &&
	    __predict_true(tlen == 0) &&
	    __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) &&
	    __predict_true(SEGQ_EMPTY(tp)) &&
	    __predict_true(th->th_seq == tp->rcv_nxt)) {
		/* tcp_fastack() returns nonzero iff it consumed the segment. */
		if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
		    ti_locked, tiwin)) {
			return;
		}
	}
	tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
	    ti_locked, tiwin, thflags);
}

/* Function-block descriptors exported to the TCP stack-selection framework. */
struct tcp_function_block __tcp_fastslow = {
	.tfb_tcp_block_name = "fastslow",
	.tfb_tcp_output = tcp_output,
	.tfb_tcp_do_segment = tcp_do_segment_fastslow,
	.tfb_tcp_ctloutput = tcp_default_ctloutput,
};

struct tcp_function_block __tcp_fastack = {
	.tfb_tcp_block_name = "fastack",
	.tfb_tcp_output = tcp_output,
	.tfb_tcp_do_segment = tcp_do_segment_fastack,
	.tfb_tcp_ctloutput = tcp_default_ctloutput
};

/*
 * Module event handler: registers both fast-path function blocks on load
 * (rolling back the first registration if the second fails), refuses to
 * quiesce while either block still has references, and deregisters both
 * on unload.  Returns 0 on success or an errno value.
 */
static int
tcp_addfastpaths(module_t mod, int type, void *data)
{
	int err=0;

	switch (type) {
	case MOD_LOAD:
		err = register_tcp_functions(&__tcp_fastack, M_WAITOK);
		if (err) {
			printf("Failed to register fastack module -- err:%d\n", err);
			return(err);
		}
		err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
		if (err) {
			printf("Failed to register fastslow module -- err:%d\n", err);
			/* Roll back the fastack registration on partial failure. */
			deregister_tcp_functions(&__tcp_fastack);
			return(err);
		}
		break;
	case MOD_QUIESCE:
		if ((__tcp_fastslow.tfb_refcnt) || (__tcp_fastack.tfb_refcnt)) {
			return(EBUSY);
		}
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_fastack);
		if (err == EBUSY)
			break;
		err = deregister_tcp_functions(&__tcp_fastslow);
		if (err == EBUSY)
			break;
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}

static moduledata_t new_tcp_fastpaths = {
	.name = "tcp_fastpaths",
	.evhand = tcp_addfastpaths,
	.priv = 0
};

MODULE_VERSION(kern_tcpfastpaths, 1);
DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);