fastpath.c revision 298995
/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2007-2008,2010
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * Copyright (c) 2015 Netflix Inc.
 * All rights reserved.
 *
 * Portions of this software were developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
 * James Healy and David Hayes, made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Robert N. M. Watson under
 * contract to Juniper Networks, Inc.
 *
 * Portions of this software were developed by Randall R. Stewart while
 * working for Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netinet/tcp_stacks/fastpath.c 298995 2016-05-03 18:05:43Z pfg $");

#include "opt_ipfw.h"		/* for ipfw_fwd */
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_kdtrace.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */

#include <vm/uma.h>

#include <net/route.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_syncache.h>
#include <netinet/cc/cc.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif

#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /*IPSEC*/

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

VNET_DECLARE(int, tcp_autorcvbuf_inc);
#define	V_tcp_autorcvbuf_inc	VNET(tcp_autorcvbuf_inc)
VNET_DECLARE(int, tcp_autorcvbuf_max);
#define	V_tcp_autorcvbuf_max	VNET(tcp_autorcvbuf_max)
VNET_DECLARE(int, tcp_do_rfc3042);
#define	V_tcp_do_rfc3042	VNET(tcp_do_rfc3042)
VNET_DECLARE(int, tcp_do_autorcvbuf);
#define	V_tcp_do_autorcvbuf	VNET(tcp_do_autorcvbuf)
VNET_DECLARE(int, tcp_insecure_rst);
#define	V_tcp_insecure_rst	VNET(tcp_insecure_rst)
VNET_DECLARE(int, tcp_insecure_syn);
#define	V_tcp_insecure_syn	VNET(tcp_insecure_syn)
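
/*
 * Editor's illustration -- not part of the original revision.  The
 * VNET_DECLARE()/#define V_* pairs above are the usual FreeBSD idiom for
 * reaching per-VIMAGE (virtualized network stack) globals that are defined
 * elsewhere in the TCP stack.  With VIMAGE compiled in, VNET() indirects
 * through the current vnet instance; without it, the V_ alias collapses to
 * the plain global.  A minimal sketch of a consumer (hypothetical helper,
 * never compiled):
 */
#if 0
static int
example_autorcvbuf_enabled(void)
{

	return (V_tcp_do_autorcvbuf != 0 &&
	    V_tcp_autorcvbuf_inc > 0 &&
	    V_tcp_autorcvbuf_max > 0);
}
#endif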
static void	tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
			struct socket *, struct tcpcb *, int, int, uint8_t,
			int);

static void	tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
			struct socket *, struct tcpcb *, int, int, uint8_t,
			int);

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 * the following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment.  We make sure by checking that the
 *	  segment size is not larger than the MSS.
 */
#define DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active(tp, TT_DELACK) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (tlen <= tp->t_maxseg) &&					\
	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))

/*
 * So how is this faster than the normal fast ack?
 * It basically allows us to also stay in the fastpath
 * when a window-update ack also arrives.  In testing
 * we saw only 25-30% of connections doing fastpath
 * due to the fact that along with moving forward
 * in sequence the window was also updated.
 */
static void
tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
	struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
	int ti_locked, u_long tiwin)
{
	int acked;
	int winup_only = 0;
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif
	/*
	 * The following if statement will be true if
	 * we are doing the win_up_in_fp <and>
	 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
	 * - No more new data, but we have an ack for new data
	 *   (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) <or>
	 * - No more new data, the same ack point but the window grew
	 *   (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
	 */
	if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
			winup_only = 1;
			TCPSTAT_INC(tcps_rcvwinupd);
		}
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
	}
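	/*
	 * Editor's illustration -- not part of the original revision.  A
	 * segment may update snd_wnd only if it is at least as "new" as the
	 * segment that last did so, which is what snd_wl1 (its seq) and
	 * snd_wl2 (its ack) remember.  For example, with snd_wl1 = 1000,
	 * snd_wl2 = 5000 and snd_wnd = 8192, a segment with th_seq = 1000,
	 * th_ack = 5000 and tiwin = 16384 is a pure window update:
	 * winup_only is set and tcps_rcvwinupd is bumped.  A stand-alone
	 * restatement of the test above (illustration only, never compiled):
	 */
#if 0
	int accept_update;

	accept_update = (SEQ_LT(tp->snd_wl1, th->th_seq) ||
	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))));
#endif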
	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record the timestamp.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	/*
	 * This is a pure ack for outstanding data.
	 */
	if (ti_locked == TI_RLOCKED) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
	}
	ti_locked = TI_UNLOCKED;

	TCPSTAT_INC(tcps_predack);

	/*
	 * "bad retransmit" recovery.
	 */
	if (tp->t_rxtshift == 1 &&
	    tp->t_flags & TF_PREVVALID &&
	    (int)(ticks - tp->t_badrxtwin) < 0) {
		cc_cong_signal(tp, th, CC_RTO_ERR);
	}

	/*
	 * Recalculate the transmit timer / rtt.
	 *
	 * Some boxes send broken timestamp replies
	 * during the SYN+ACK phase, ignore
	 * timestamps of 0 or we could calculate a
	 * huge RTT and blow up the retransmit timer.
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    to->to_tsecr) {
		u_int t;

		t = tcp_ts_getticks() - to->to_tsecr;
		if (!tp->t_rttlow || tp->t_rttlow > t)
			tp->t_rttlow = t;
		tcp_xmit_timer(tp,
		    TCP_TS_TO_TICKS(t) + 1);
	} else if (tp->t_rtttime &&
	    SEQ_GT(th->th_ack, tp->t_rtseq)) {
		if (!tp->t_rttlow ||
		    tp->t_rttlow > ticks - tp->t_rtttime)
			tp->t_rttlow = ticks - tp->t_rtttime;
		tcp_xmit_timer(tp,
		    ticks - tp->t_rtttime);
	}
	if (winup_only == 0) {
		acked = BYTES_THIS_ACK(tp, th);

		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
		hhook_run_tcp_est_in(tp, th, to);

		TCPSTAT_ADD(tcps_rcvackbyte, acked);
		sbdrop(&so->so_snd, acked);
		if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
		    SEQ_LEQ(th->th_ack, tp->snd_recover))
			tp->snd_recover = th->th_ack - 1;

		/*
		 * Let the congestion control algorithm update
		 * congestion control related information. This
		 * typically means increasing the congestion
		 * window.
		 */
		cc_ack_received(tp, th, CC_ACK);

		tp->snd_una = th->th_ack;
		/*
		 * Pull snd_wl2 up to prevent seq wrap relative
		 * to th_ack.
		 */
		tp->snd_wl2 = th->th_ack;
		tp->t_dupacks = 0;
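		/*
		 * Editor's illustration -- not part of the original
		 * revision.  A worked example of the timestamp-based RTT
		 * sample taken above, with made-up numbers: if
		 * tcp_ts_getticks() returns 100500 and the peer echoed
		 * to_tsecr = 100460, then t = 40 timestamp units and
		 * tcp_xmit_timer() is fed TCP_TS_TO_TICKS(40) + 1, roughly
		 * 41 ticks at hz = 1000.  Echoes of 0 are deliberately
		 * skipped, as the comment above explains.
		 */
#if 0
		u_int example_t;

		example_t = 100500 - 100460;	/* 40 timestamp units */
		tcp_xmit_timer(tp, TCP_TS_TO_TICKS(example_t) + 1);
#endif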

		/*
		 * If all outstanding data are acked, stop
		 * retransmit timer, otherwise restart timer
		 * using current (possibly backed-off) value.
		 * If process is waiting for space,
		 * wakeup/selwakeup/signal.  If data
		 * are ready to send, let tcp_output
		 * decide between more output or persist.
		 */
#ifdef TCPDEBUG
		if (so->so_options & SO_DEBUG)
			tcp_trace(TA_INPUT, ostate, tp,
			    (void *)tcp_saveipgen,
			    &tcp_savetcp, 0);
#endif
		TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
		m_freem(m);
		if (tp->snd_una == tp->snd_max)
			tcp_timer_activate(tp, TT_REXMT, 0);
		else if (!tcp_timer_active(tp, TT_PERSIST))
			tcp_timer_activate(tp, TT_REXMT,
			    tp->t_rxtcur);
	} else {
		/*
		 * Window update only, just free the mbufs and
		 * send out whatever we can.
		 */
		m_freem(m);
	}
	sowwakeup(so);
	if (sbavail(&so->so_snd))
		(void) tcp_output(tp);
	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
	    __func__, ti_locked));
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (tp->t_flags & TF_DELACK) {
		tp->t_flags &= ~TF_DELACK;
		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
	}
	INP_WUNLOCK(tp->t_inpcb);
}

/*
 * Here nothing is really faster, it's just that we
 * have broken out the fast-data path also, just like
 * the fast-ack.
 */
static void
tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
	struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
	int ti_locked, u_long tiwin)
{
	int newsize = 0;	/* automatic sockbuf scaling */
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif
	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record the timestamp.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}

	/*
	 * This is a pure, in-sequence data packet with
	 * nothing on the reassembly queue and we have enough
	 * buffer space to take it.
	 */
	if (ti_locked == TI_RLOCKED) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
	}
	ti_locked = TI_UNLOCKED;

	/* Clean receiver SACK report if present */
	if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
		tcp_clean_sackreport(tp);
	TCPSTAT_INC(tcps_preddat);
	tp->rcv_nxt += tlen;
	/*
	 * Pull snd_wl1 up to prevent seq wrap relative to
	 * th_seq.
	 */
	tp->snd_wl1 = th->th_seq;
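	/*
	 * Editor's illustration -- not part of the original revision.
	 * "Pulling up" snd_wl1 (and rcv_up just below) matters because
	 * sequence numbers are compared modulo 2^32: SEQ_LT(a, b) is
	 * essentially ((int)((a) - (b)) < 0), so a left edge allowed to
	 * fall more than 2^31 behind would start comparing the wrong way
	 * once the space wraps.  A sketch of the modular compare
	 * (illustration only, never compiled):
	 */
#if 0
	{
		tcp_seq a = 0xfffffff0U;	/* just before wrap */
		tcp_seq b = 0x00000010U;	/* just after wrap  */

		KASSERT(SEQ_LT(a, b), ("wrapped b still compares as newer"));
	}
#endif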
	/*
	 * Pull rcv_up up to prevent seq wrap relative to
	 * rcv_nxt.
	 */
	tp->rcv_up = tp->rcv_nxt;
	TCPSTAT_ADD(tcps_rcvbyte, tlen);
#ifdef TCPDEBUG
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_INPUT, ostate, tp,
		    (void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
	TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
	/*
	 * Automatic sizing of receive socket buffer.  Often the send
	 * buffer size is not optimally adjusted to the actual network
	 * conditions at hand (delay bandwidth product).  Setting the
	 * buffer size too small limits throughput on links with high
	 * bandwidth and high delay (eg. trans-continental/oceanic links).
	 *
	 * On the receive side the socket buffer memory is only rarely
	 * used to any significant extent.  This allows us to be much
	 * more aggressive in scaling the receive socket buffer.  For
	 * the case that the buffer space is actually used to a large
	 * extent and we run out of kernel memory we can simply drop
	 * the new segments; TCP on the sender will just retransmit them
	 * later.  Setting the buffer size too big may only consume too
	 * much kernel memory if the application doesn't read() from
	 * the socket or packet loss or reordering makes use of the
	 * reassembly queue.
	 *
	 * The criteria to step up the receive buffer one notch are:
	 *  1. Application has not set receive buffer size with
	 *     SO_RCVBUF.  Setting SO_RCVBUF clears SB_AUTOSIZE.
	 *  2. the number of bytes received during the time it takes
	 *     one timestamp to be reflected back to us (the RTT) is
	 *     measured;
	 *  3. the bytes received per RTT are within seven eighths of
	 *     the current socket buffer size;
	 *  4. receive buffer size has not hit maximal automatic size;
	 *
	 * This algorithm does one step per RTT at most and only if
	 * we receive a bulk stream w/o packet losses or reorderings.
	 * Shrinking the buffer during idle times is not necessary as
	 * it doesn't consume any memory when idle.
	 *
	 * TODO: Only step up if the application is actually serving
	 * the buffer to better manage the socket buffer resources.
	 */
	if (V_tcp_do_autorcvbuf &&
	    (to->to_flags & TOF_TS) &&
	    to->to_tsecr &&
	    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
		if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
		    to->to_tsecr - tp->rfbuf_ts < hz) {
			if (tp->rfbuf_cnt >
			    (so->so_rcv.sb_hiwat / 8 * 7) &&
			    so->so_rcv.sb_hiwat <
			    V_tcp_autorcvbuf_max) {
				newsize =
				    min(so->so_rcv.sb_hiwat +
				    V_tcp_autorcvbuf_inc,
				    V_tcp_autorcvbuf_max);
			}
			/* Start over with next RTT. */
			tp->rfbuf_ts = 0;
			tp->rfbuf_cnt = 0;
		} else
			tp->rfbuf_cnt += tlen;	/* add up */
	}

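	/*
	 * Editor's illustration -- not part of the original revision.  A
	 * worked pass through the auto-scaling block above, with made-up
	 * values: sb_hiwat = 65536, V_tcp_autorcvbuf_inc = 16384 and
	 * V_tcp_autorcvbuf_max = 2097152.  Seven eighths of 65536 is 57344,
	 * so if more than 57344 bytes arrived within one timestamp round
	 * trip, the buffer is stepped up by one increment (illustration
	 * only, never compiled):
	 */
#if 0
	if (tp->rfbuf_cnt > 57344 && so->so_rcv.sb_hiwat < 2097152)
		newsize = min(65536 + 16384, 2097152);	/* 81920 bytes */
#endif
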
	/* Add data to socket buffer. */
	SOCKBUF_LOCK(&so->so_rcv);
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		m_freem(m);
	} else {
		/*
		 * Set new socket buffer size.
		 * Give up when limit is reached.
		 */
		if (newsize)
			if (!sbreserve_locked(&so->so_rcv,
			    newsize, so, NULL))
				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
		m_adj(m, drop_hdrlen);	/* delayed header drop */
		sbappendstream_locked(&so->so_rcv, m, 0);
	}
	/* NB: sorwakeup_locked() does an implicit unlock. */
	sorwakeup_locked(so);
	if (DELAY_ACK(tp, tlen)) {
		tp->t_flags |= TF_DELACK;
	} else {
		tp->t_flags |= TF_ACKNOW;
		tcp_output(tp);
	}
	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
	    __func__, ti_locked));
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (tp->t_flags & TF_DELACK) {
		tp->t_flags &= ~TF_DELACK;
		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
	}
	INP_WUNLOCK(tp->t_inpcb);
}

/*
 * The slow path is a clone of the long part of tcp_do_segment
 * past all the fast-path stuff.  It is used by two different
 * callers: the fast/slow path and the fastack-only path.
 */
static void
tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
	struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
	int ti_locked, u_long tiwin, int thflags)
{
	int acked, ourfinisacked, needoutput = 0;
	int rstreason, todrop, win;
	char *s;
	struct in_conninfo *inc;
	struct mbuf *mfree = NULL;
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif
	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	inc = &tp->t_inpcb->inp_inc;
	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_ts = 0;
	tp->rfbuf_cnt = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
540292309Srrs */ 541292309Srrs case TCPS_SYN_RECEIVED: 542292309Srrs if ((thflags & TH_ACK) && 543292309Srrs (SEQ_LEQ(th->th_ack, tp->snd_una) || 544292309Srrs SEQ_GT(th->th_ack, tp->snd_max))) { 545292309Srrs rstreason = BANDLIM_RST_OPENPORT; 546292309Srrs goto dropwithreset; 547292309Srrs } 548292309Srrs break; 549292309Srrs 550292309Srrs /* 551292309Srrs * If the state is SYN_SENT: 552292309Srrs * if seg contains an ACK, but not for our SYN, drop the input. 553292309Srrs * if seg contains a RST, then drop the connection. 554292309Srrs * if seg does not contain SYN, then drop it. 555292309Srrs * Otherwise this is an acceptable SYN segment 556292309Srrs * initialize tp->rcv_nxt and tp->irs 557292309Srrs * if seg contains ack then advance tp->snd_una 558292309Srrs * if seg contains an ECE and ECN support is enabled, the stream 559292309Srrs * is ECN capable. 560292309Srrs * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 561292309Srrs * arrange for segment to be acked (eventually) 562292309Srrs * continue processing rest of data/controls, beginning with URG 563292309Srrs */ 564292309Srrs case TCPS_SYN_SENT: 565292309Srrs if ((thflags & TH_ACK) && 566292309Srrs (SEQ_LEQ(th->th_ack, tp->iss) || 567292309Srrs SEQ_GT(th->th_ack, tp->snd_max))) { 568292309Srrs rstreason = BANDLIM_UNLIMITED; 569292309Srrs goto dropwithreset; 570292309Srrs } 571292309Srrs if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { 572292309Srrs TCP_PROBE5(connect__refused, NULL, tp, 573292309Srrs mtod(m, const char *), tp, th); 574292309Srrs tp = tcp_drop(tp, ECONNREFUSED); 575292309Srrs } 576292309Srrs if (thflags & TH_RST) 577292309Srrs goto drop; 578292309Srrs if (!(thflags & TH_SYN)) 579292309Srrs goto drop; 580292309Srrs 581292309Srrs tp->irs = th->th_seq; 582292309Srrs tcp_rcvseqinit(tp); 583292309Srrs if (thflags & TH_ACK) { 584292309Srrs TCPSTAT_INC(tcps_connects); 585292309Srrs soisconnected(so); 586292309Srrs#ifdef MAC 587292309Srrs mac_socketpeer_set_from_mbuf(m, so); 588292309Srrs#endif 589292309Srrs /* Do window scaling on this connection? */ 590292309Srrs if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 591292309Srrs (TF_RCVD_SCALE|TF_REQ_SCALE)) { 592292309Srrs tp->rcv_scale = tp->request_r_scale; 593292309Srrs } 594292309Srrs tp->rcv_adv += imin(tp->rcv_wnd, 595292309Srrs TCP_MAXWIN << tp->rcv_scale); 596292309Srrs tp->snd_una++; /* SYN is acked */ 597292309Srrs /* 598292309Srrs * If there's data, delay ACK; if there's also a FIN 599292309Srrs * ACKNOW will be turned on later. 600292309Srrs */ 601292309Srrs if (DELAY_ACK(tp, tlen) && tlen != 0) 602292309Srrs tcp_timer_activate(tp, TT_DELACK, 603292309Srrs tcp_delacktime); 604292309Srrs else 605292309Srrs tp->t_flags |= TF_ACKNOW; 606292309Srrs 607292309Srrs if ((thflags & TH_ECE) && V_tcp_do_ecn) { 608292309Srrs tp->t_flags |= TF_ECN_PERMIT; 609292309Srrs TCPSTAT_INC(tcps_ecn_shs); 610292309Srrs } 611292309Srrs 612292309Srrs /* 613292309Srrs * Received <SYN,ACK> in SYN_SENT[*] state. 
614292309Srrs * Transitions: 615292309Srrs * SYN_SENT --> ESTABLISHED 616292309Srrs * SYN_SENT* --> FIN_WAIT_1 617292309Srrs */ 618292309Srrs tp->t_starttime = ticks; 619292309Srrs if (tp->t_flags & TF_NEEDFIN) { 620292309Srrs tcp_state_change(tp, TCPS_FIN_WAIT_1); 621292309Srrs tp->t_flags &= ~TF_NEEDFIN; 622292309Srrs thflags &= ~TH_SYN; 623292309Srrs } else { 624292309Srrs tcp_state_change(tp, TCPS_ESTABLISHED); 625292309Srrs TCP_PROBE5(connect__established, NULL, tp, 626292309Srrs mtod(m, const char *), tp, th); 627292309Srrs cc_conn_init(tp); 628292309Srrs tcp_timer_activate(tp, TT_KEEP, 629292309Srrs TP_KEEPIDLE(tp)); 630292309Srrs } 631292309Srrs } else { 632292309Srrs /* 633292309Srrs * Received initial SYN in SYN-SENT[*] state => 634292309Srrs * simultaneous open. 635292309Srrs * If it succeeds, connection is * half-synchronized. 636292309Srrs * Otherwise, do 3-way handshake: 637292309Srrs * SYN-SENT -> SYN-RECEIVED 638292309Srrs * SYN-SENT* -> SYN-RECEIVED* 639292309Srrs */ 640292309Srrs tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 641292309Srrs tcp_timer_activate(tp, TT_REXMT, 0); 642292309Srrs tcp_state_change(tp, TCPS_SYN_RECEIVED); 643292309Srrs } 644292309Srrs 645292309Srrs KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " 646292309Srrs "ti_locked %d", __func__, ti_locked)); 647292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 648292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 649292309Srrs 650292309Srrs /* 651292309Srrs * Advance th->th_seq to correspond to first data byte. 652292309Srrs * If data, trim to stay within window, 653292309Srrs * dropping FIN if necessary. 654292309Srrs */ 655292309Srrs th->th_seq++; 656292309Srrs if (tlen > tp->rcv_wnd) { 657292309Srrs todrop = tlen - tp->rcv_wnd; 658292309Srrs m_adj(m, -todrop); 659292309Srrs tlen = tp->rcv_wnd; 660292309Srrs thflags &= ~TH_FIN; 661292309Srrs TCPSTAT_INC(tcps_rcvpackafterwin); 662292309Srrs TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 663292309Srrs } 664292309Srrs tp->snd_wl1 = th->th_seq - 1; 665292309Srrs tp->rcv_up = th->th_seq; 666292309Srrs /* 667292309Srrs * Client side of transaction: already sent SYN and data. 668292309Srrs * If the remote host used T/TCP to validate the SYN, 669292309Srrs * our data will be ACK'd; if so, enter normal data segment 670292309Srrs * processing in the middle of step 5, ack processing. 671292309Srrs * Otherwise, goto step 6. 672292309Srrs */ 673292309Srrs if (thflags & TH_ACK) 674292309Srrs goto process_ACK; 675292309Srrs 676292309Srrs goto step6; 677292309Srrs 678292309Srrs /* 679292309Srrs * If the state is LAST_ACK or CLOSING or TIME_WAIT: 680292309Srrs * do normal processing. 681292309Srrs * 682292309Srrs * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 683292309Srrs */ 684292309Srrs case TCPS_LAST_ACK: 685292309Srrs case TCPS_CLOSING: 686292309Srrs break; /* continue normal processing */ 687292309Srrs } 688292309Srrs 689292309Srrs /* 690292309Srrs * States other than LISTEN or SYN_SENT. 691292309Srrs * First check the RST flag and sequence number since reset segments 692292309Srrs * are exempt from the timestamp and connection count tests. This 693292309Srrs * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 694292309Srrs * below which allowed reset segments in half the sequence space 695292309Srrs * to fall though and be processed (which gives forged reset 696292309Srrs * segments with a random sequence number a 50 percent chance of 697292309Srrs * killing a connection). 698292309Srrs * Then check timestamp, if present. 
699292309Srrs * Then check the connection count, if present. 700292309Srrs * Then check that at least some bytes of segment are within 701292309Srrs * receive window. If segment begins before rcv_nxt, 702292309Srrs * drop leading data (and SYN); if nothing left, just ack. 703292309Srrs */ 704292309Srrs if (thflags & TH_RST) { 705292309Srrs /* 706292309Srrs * RFC5961 Section 3.2 707292309Srrs * 708292309Srrs * - RST drops connection only if SEG.SEQ == RCV.NXT. 709292309Srrs * - If RST is in window, we send challenge ACK. 710292309Srrs * 711292309Srrs * Note: to take into account delayed ACKs, we should 712292309Srrs * test against last_ack_sent instead of rcv_nxt. 713292309Srrs * Note 2: we handle special case of closed window, not 714292309Srrs * covered by the RFC. 715292309Srrs */ 716292309Srrs if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 717292309Srrs SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 718292309Srrs (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 719292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 720292309Srrs KASSERT(ti_locked == TI_RLOCKED, 721292309Srrs ("%s: TH_RST ti_locked %d, th %p tp %p", 722292309Srrs __func__, ti_locked, th, tp)); 723292309Srrs KASSERT(tp->t_state != TCPS_SYN_SENT, 724292309Srrs ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 725292309Srrs __func__, th, tp)); 726292309Srrs 727292309Srrs if (V_tcp_insecure_rst || 728292309Srrs tp->last_ack_sent == th->th_seq) { 729292309Srrs TCPSTAT_INC(tcps_drops); 730292309Srrs /* Drop the connection. */ 731292309Srrs switch (tp->t_state) { 732292309Srrs case TCPS_SYN_RECEIVED: 733292309Srrs so->so_error = ECONNREFUSED; 734292309Srrs goto close; 735292309Srrs case TCPS_ESTABLISHED: 736292309Srrs case TCPS_FIN_WAIT_1: 737292309Srrs case TCPS_FIN_WAIT_2: 738292309Srrs case TCPS_CLOSE_WAIT: 739292309Srrs so->so_error = ECONNRESET; 740292309Srrs close: 741292309Srrs tcp_state_change(tp, TCPS_CLOSED); 742292309Srrs /* FALLTHROUGH */ 743292309Srrs default: 744292309Srrs tp = tcp_close(tp); 745292309Srrs } 746292309Srrs } else { 747292309Srrs TCPSTAT_INC(tcps_badrst); 748292309Srrs /* Send challenge ACK. */ 749292309Srrs tcp_respond(tp, mtod(m, void *), th, m, 750292309Srrs tp->rcv_nxt, tp->snd_nxt, TH_ACK); 751292309Srrs tp->last_ack_sent = tp->rcv_nxt; 752292309Srrs m = NULL; 753292309Srrs } 754292309Srrs } 755292309Srrs goto drop; 756292309Srrs } 757292309Srrs 758292309Srrs /* 759292309Srrs * RFC5961 Section 4.2 760292309Srrs * Send challenge ACK for any SYN in synchronized state. 761292309Srrs */ 762292309Srrs if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { 763292309Srrs KASSERT(ti_locked == TI_RLOCKED, 764292309Srrs ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); 765292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 766292309Srrs 767292309Srrs TCPSTAT_INC(tcps_badsyn); 768292309Srrs if (V_tcp_insecure_syn && 769292309Srrs SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 770292309Srrs SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 771292309Srrs tp = tcp_drop(tp, ECONNRESET); 772292309Srrs rstreason = BANDLIM_UNLIMITED; 773292309Srrs } else { 774292309Srrs /* Send challenge ACK. */ 775292309Srrs tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 776292309Srrs tp->snd_nxt, TH_ACK); 777292309Srrs tp->last_ack_sent = tp->rcv_nxt; 778292309Srrs m = NULL; 779292309Srrs } 780292309Srrs goto drop; 781292309Srrs } 782292309Srrs 783292309Srrs /* 784292309Srrs * RFC 1323 PAWS: If we have a timestamp reply on this segment 785292309Srrs * and it's less than ts_recent, drop it. 
786292309Srrs */ 787292309Srrs if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 788292309Srrs TSTMP_LT(to->to_tsval, tp->ts_recent)) { 789292309Srrs 790292309Srrs /* Check to see if ts_recent is over 24 days old. */ 791292309Srrs if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 792292309Srrs /* 793292309Srrs * Invalidate ts_recent. If this segment updates 794292309Srrs * ts_recent, the age will be reset later and ts_recent 795292309Srrs * will get a valid value. If it does not, setting 796292309Srrs * ts_recent to zero will at least satisfy the 797292309Srrs * requirement that zero be placed in the timestamp 798292309Srrs * echo reply when ts_recent isn't valid. The 799292309Srrs * age isn't reset until we get a valid ts_recent 800292309Srrs * because we don't want out-of-order segments to be 801292309Srrs * dropped when ts_recent is old. 802292309Srrs */ 803292309Srrs tp->ts_recent = 0; 804292309Srrs } else { 805292309Srrs TCPSTAT_INC(tcps_rcvduppack); 806292309Srrs TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 807292309Srrs TCPSTAT_INC(tcps_pawsdrop); 808292309Srrs if (tlen) 809292309Srrs goto dropafterack; 810292309Srrs goto drop; 811292309Srrs } 812292309Srrs } 813292309Srrs 814292309Srrs /* 815292309Srrs * In the SYN-RECEIVED state, validate that the packet belongs to 816292309Srrs * this connection before trimming the data to fit the receive 817292309Srrs * window. Check the sequence number versus IRS since we know 818292309Srrs * the sequence numbers haven't wrapped. This is a partial fix 819292309Srrs * for the "LAND" DoS attack. 820292309Srrs */ 821292309Srrs if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 822292309Srrs rstreason = BANDLIM_RST_OPENPORT; 823292309Srrs goto dropwithreset; 824292309Srrs } 825292309Srrs 826292309Srrs todrop = tp->rcv_nxt - th->th_seq; 827292309Srrs if (todrop > 0) { 828292309Srrs if (thflags & TH_SYN) { 829292309Srrs thflags &= ~TH_SYN; 830292309Srrs th->th_seq++; 831292309Srrs if (th->th_urp > 1) 832292309Srrs th->th_urp--; 833292309Srrs else 834292309Srrs thflags &= ~TH_URG; 835292309Srrs todrop--; 836292309Srrs } 837292309Srrs /* 838292309Srrs * Following if statement from Stevens, vol. 2, p. 960. 839292309Srrs */ 840292309Srrs if (todrop > tlen 841292309Srrs || (todrop == tlen && (thflags & TH_FIN) == 0)) { 842292309Srrs /* 843292309Srrs * Any valid FIN must be to the left of the window. 844292309Srrs * At this point the FIN must be a duplicate or out 845292309Srrs * of sequence; drop it. 846292309Srrs */ 847292309Srrs thflags &= ~TH_FIN; 848292309Srrs 849292309Srrs /* 850292309Srrs * Send an ACK to resynchronize and drop any data. 851292309Srrs * But keep on processing for RST or ACK. 852292309Srrs */ 853292309Srrs tp->t_flags |= TF_ACKNOW; 854292309Srrs todrop = tlen; 855292309Srrs TCPSTAT_INC(tcps_rcvduppack); 856292309Srrs TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 857292309Srrs } else { 858292309Srrs TCPSTAT_INC(tcps_rcvpartduppack); 859292309Srrs TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 860292309Srrs } 861292309Srrs drop_hdrlen += todrop; /* drop from the top afterwards */ 862292309Srrs th->th_seq += todrop; 863292309Srrs tlen -= todrop; 864292309Srrs if (th->th_urp > todrop) 865292309Srrs th->th_urp -= todrop; 866292309Srrs else { 867292309Srrs thflags &= ~TH_URG; 868292309Srrs th->th_urp = 0; 869292309Srrs } 870292309Srrs } 871292309Srrs 872292309Srrs /* 873292309Srrs * If new data are received on a connection after the 874292309Srrs * user processes are gone, then RST the other end. 
875292309Srrs */ 876292309Srrs if ((so->so_state & SS_NOFDREF) && 877292309Srrs tp->t_state > TCPS_CLOSE_WAIT && tlen) { 878292309Srrs KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " 879292309Srrs "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); 880292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 881292309Srrs 882292309Srrs if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 883292309Srrs log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " 884292309Srrs "after socket was closed, " 885292309Srrs "sending RST and removing tcpcb\n", 886292309Srrs s, __func__, tcpstates[tp->t_state], tlen); 887292309Srrs free(s, M_TCPLOG); 888292309Srrs } 889292309Srrs tp = tcp_close(tp); 890292309Srrs TCPSTAT_INC(tcps_rcvafterclose); 891292309Srrs rstreason = BANDLIM_UNLIMITED; 892292309Srrs goto dropwithreset; 893292309Srrs } 894292309Srrs 895292309Srrs /* 896292309Srrs * If segment ends after window, drop trailing data 897292309Srrs * (and PUSH and FIN); if nothing left, just ACK. 898292309Srrs */ 899292309Srrs todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 900292309Srrs if (todrop > 0) { 901292309Srrs TCPSTAT_INC(tcps_rcvpackafterwin); 902292309Srrs if (todrop >= tlen) { 903292309Srrs TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 904292309Srrs /* 905292309Srrs * If window is closed can only take segments at 906292309Srrs * window edge, and have to drop data and PUSH from 907292309Srrs * incoming segments. Continue processing, but 908292309Srrs * remember to ack. Otherwise, drop segment 909292309Srrs * and ack. 910292309Srrs */ 911292309Srrs if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 912292309Srrs tp->t_flags |= TF_ACKNOW; 913292309Srrs TCPSTAT_INC(tcps_rcvwinprobe); 914292309Srrs } else 915292309Srrs goto dropafterack; 916292309Srrs } else 917292309Srrs TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 918292309Srrs m_adj(m, -todrop); 919292309Srrs tlen -= todrop; 920292309Srrs thflags &= ~(TH_PUSH|TH_FIN); 921292309Srrs } 922292309Srrs 923292309Srrs /* 924292309Srrs * If last ACK falls within this segment's sequence numbers, 925292309Srrs * record its timestamp. 926292309Srrs * NOTE: 927292309Srrs * 1) That the test incorporates suggestions from the latest 928292309Srrs * proposal of the tcplw@cray.com list (Braden 1993/04/26). 929292309Srrs * 2) That updating only on newer timestamps interferes with 930292309Srrs * our earlier PAWS tests, so this check should be solely 931292309Srrs * predicated on the sequence space of this segment. 932292309Srrs * 3) That we modify the segment boundary check to be 933292309Srrs * Last.ACK.Sent <= SEG.SEQ + SEG.Len 934292309Srrs * instead of RFC1323's 935292309Srrs * Last.ACK.Sent < SEG.SEQ + SEG.Len, 936292309Srrs * This modified check allows us to overcome RFC1323's 937292309Srrs * limitations as described in Stevens TCP/IP Illustrated 938292309Srrs * Vol. 2 p.869. In such cases, we can still calculate the 939292309Srrs * RTT correctly when RCV.NXT == Last.ACK.Sent. 
940292309Srrs */ 941292309Srrs if ((to->to_flags & TOF_TS) != 0 && 942292309Srrs SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 943292309Srrs SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 944292309Srrs ((thflags & (TH_SYN|TH_FIN)) != 0))) { 945292309Srrs tp->ts_recent_age = tcp_ts_getticks(); 946292309Srrs tp->ts_recent = to->to_tsval; 947292309Srrs } 948292309Srrs 949292309Srrs /* 950292309Srrs * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 951292309Srrs * flag is on (half-synchronized state), then queue data for 952292309Srrs * later processing; else drop segment and return. 953292309Srrs */ 954292309Srrs if ((thflags & TH_ACK) == 0) { 955292309Srrs if (tp->t_state == TCPS_SYN_RECEIVED || 956292309Srrs (tp->t_flags & TF_NEEDSYN)) 957292309Srrs goto step6; 958292309Srrs else if (tp->t_flags & TF_ACKNOW) 959292309Srrs goto dropafterack; 960292309Srrs else 961292309Srrs goto drop; 962292309Srrs } 963292309Srrs 964292309Srrs /* 965292309Srrs * Ack processing. 966292309Srrs */ 967292309Srrs switch (tp->t_state) { 968292309Srrs 969292309Srrs /* 970292309Srrs * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 971292309Srrs * ESTABLISHED state and continue processing. 972292309Srrs * The ACK was checked above. 973292309Srrs */ 974292309Srrs case TCPS_SYN_RECEIVED: 975292309Srrs 976292309Srrs TCPSTAT_INC(tcps_connects); 977292309Srrs soisconnected(so); 978292309Srrs /* Do window scaling? */ 979292309Srrs if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 980292309Srrs (TF_RCVD_SCALE|TF_REQ_SCALE)) { 981292309Srrs tp->rcv_scale = tp->request_r_scale; 982292309Srrs tp->snd_wnd = tiwin; 983292309Srrs } 984292309Srrs /* 985292309Srrs * Make transitions: 986292309Srrs * SYN-RECEIVED -> ESTABLISHED 987292309Srrs * SYN-RECEIVED* -> FIN-WAIT-1 988292309Srrs */ 989292309Srrs tp->t_starttime = ticks; 990292309Srrs if (tp->t_flags & TF_NEEDFIN) { 991292309Srrs tcp_state_change(tp, TCPS_FIN_WAIT_1); 992292309Srrs tp->t_flags &= ~TF_NEEDFIN; 993292309Srrs } else { 994292309Srrs tcp_state_change(tp, TCPS_ESTABLISHED); 995292309Srrs TCP_PROBE5(accept__established, NULL, tp, 996292309Srrs mtod(m, const char *), tp, th); 997292309Srrs cc_conn_init(tp); 998292309Srrs tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 999292309Srrs } 1000292309Srrs /* 1001292309Srrs * If segment contains data or ACK, will call tcp_reass() 1002292309Srrs * later; if not, do so now to pass queued data to user. 1003292309Srrs */ 1004292309Srrs if (tlen == 0 && (thflags & TH_FIN) == 0) 1005292309Srrs (void) tcp_reass(tp, (struct tcphdr *)0, 0, 1006292309Srrs (struct mbuf *)0); 1007292309Srrs tp->snd_wl1 = th->th_seq - 1; 1008292309Srrs /* FALLTHROUGH */ 1009292309Srrs 1010292309Srrs /* 1011292309Srrs * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1012292309Srrs * ACKs. If the ack is in the range 1013292309Srrs * tp->snd_una < th->th_ack <= tp->snd_max 1014292309Srrs * then advance tp->snd_una to th->th_ack and drop 1015292309Srrs * data from the retransmission queue. If this ACK reflects 1016292309Srrs * more up to date window information we update our window information. 
1017292309Srrs */ 1018292309Srrs case TCPS_ESTABLISHED: 1019292309Srrs case TCPS_FIN_WAIT_1: 1020292309Srrs case TCPS_FIN_WAIT_2: 1021292309Srrs case TCPS_CLOSE_WAIT: 1022292309Srrs case TCPS_CLOSING: 1023292309Srrs case TCPS_LAST_ACK: 1024292309Srrs if (SEQ_GT(th->th_ack, tp->snd_max)) { 1025292309Srrs TCPSTAT_INC(tcps_rcvacktoomuch); 1026292309Srrs goto dropafterack; 1027292309Srrs } 1028292309Srrs if ((tp->t_flags & TF_SACK_PERMIT) && 1029292309Srrs ((to->to_flags & TOF_SACK) || 1030292309Srrs !TAILQ_EMPTY(&tp->snd_holes))) 1031292309Srrs tcp_sack_doack(tp, to, th->th_ack); 1032292309Srrs else 1033292309Srrs /* 1034292309Srrs * Reset the value so that previous (valid) value 1035292309Srrs * from the last ack with SACK doesn't get used. 1036292309Srrs */ 1037292309Srrs tp->sackhint.sacked_bytes = 0; 1038292309Srrs 1039292309Srrs /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 1040292309Srrs hhook_run_tcp_est_in(tp, th, to); 1041292309Srrs 1042292309Srrs if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1043292309Srrs if (tlen == 0 && tiwin == tp->snd_wnd) { 1044292309Srrs /* 1045292309Srrs * If this is the first time we've seen a 1046292309Srrs * FIN from the remote, this is not a 1047292309Srrs * duplicate and it needs to be processed 1048292309Srrs * normally. This happens during a 1049292309Srrs * simultaneous close. 1050292309Srrs */ 1051292309Srrs if ((thflags & TH_FIN) && 1052292309Srrs (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { 1053292309Srrs tp->t_dupacks = 0; 1054292309Srrs break; 1055292309Srrs } 1056292309Srrs TCPSTAT_INC(tcps_rcvdupack); 1057292309Srrs /* 1058292309Srrs * If we have outstanding data (other than 1059292309Srrs * a window probe), this is a completely 1060292309Srrs * duplicate ack (ie, window info didn't 1061292309Srrs * change and FIN isn't set), 1062292309Srrs * the ack is the biggest we've 1063292309Srrs * seen and we've seen exactly our rexmt 1064298995Spfg * threshold of them, assume a packet 1065292309Srrs * has been dropped and retransmit it. 1066292309Srrs * Kludge snd_nxt & the congestion 1067292309Srrs * window so we send only this one 1068292309Srrs * packet. 1069292309Srrs * 1070292309Srrs * We know we're losing at the current 1071292309Srrs * window size so do congestion avoidance 1072292309Srrs * (set ssthresh to half the current window 1073292309Srrs * and pull our congestion window back to 1074292309Srrs * the new ssthresh). 1075292309Srrs * 1076292309Srrs * Dup acks mean that packets have left the 1077292309Srrs * network (they're now cached at the receiver) 1078292309Srrs * so bump cwnd by the amount in the receiver 1079292309Srrs * to keep a constant cwnd packets in the 1080292309Srrs * network. 1081292309Srrs * 1082292309Srrs * When using TCP ECN, notify the peer that 1083292309Srrs * we reduced the cwnd. 1084292309Srrs */ 1085292309Srrs if (!tcp_timer_active(tp, TT_REXMT) || 1086292309Srrs th->th_ack != tp->snd_una) 1087292309Srrs tp->t_dupacks = 0; 1088292309Srrs else if (++tp->t_dupacks > tcprexmtthresh || 1089292309Srrs IN_FASTRECOVERY(tp->t_flags)) { 1090292309Srrs cc_ack_received(tp, th, CC_DUPACK); 1091292309Srrs if ((tp->t_flags & TF_SACK_PERMIT) && 1092292309Srrs IN_FASTRECOVERY(tp->t_flags)) { 1093292309Srrs int awnd; 1094292309Srrs 1095292309Srrs /* 1096292309Srrs * Compute the amount of data in flight first. 1097292309Srrs * We can inject new data into the pipe iff 1098292309Srrs * we have less than 1/2 the original window's 1099292309Srrs * worth of data in flight. 
1100292309Srrs */ 1101292309Srrs if (V_tcp_do_rfc6675_pipe) 1102292309Srrs awnd = tcp_compute_pipe(tp); 1103292309Srrs else 1104292309Srrs awnd = (tp->snd_nxt - tp->snd_fack) + 1105292309Srrs tp->sackhint.sack_bytes_rexmit; 1106292309Srrs 1107292309Srrs if (awnd < tp->snd_ssthresh) { 1108292309Srrs tp->snd_cwnd += tp->t_maxseg; 1109292309Srrs if (tp->snd_cwnd > tp->snd_ssthresh) 1110292309Srrs tp->snd_cwnd = tp->snd_ssthresh; 1111292309Srrs } 1112292309Srrs } else 1113292309Srrs tp->snd_cwnd += tp->t_maxseg; 1114292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1115292309Srrs goto drop; 1116292309Srrs } else if (tp->t_dupacks == tcprexmtthresh) { 1117292309Srrs tcp_seq onxt = tp->snd_nxt; 1118292309Srrs 1119292309Srrs /* 1120292309Srrs * If we're doing sack, check to 1121292309Srrs * see if we're already in sack 1122292309Srrs * recovery. If we're not doing sack, 1123292309Srrs * check to see if we're in newreno 1124292309Srrs * recovery. 1125292309Srrs */ 1126292309Srrs if (tp->t_flags & TF_SACK_PERMIT) { 1127292309Srrs if (IN_FASTRECOVERY(tp->t_flags)) { 1128292309Srrs tp->t_dupacks = 0; 1129292309Srrs break; 1130292309Srrs } 1131292309Srrs } else { 1132292309Srrs if (SEQ_LEQ(th->th_ack, 1133292309Srrs tp->snd_recover)) { 1134292309Srrs tp->t_dupacks = 0; 1135292309Srrs break; 1136292309Srrs } 1137292309Srrs } 1138292309Srrs /* Congestion signal before ack. */ 1139292309Srrs cc_cong_signal(tp, th, CC_NDUPACK); 1140292309Srrs cc_ack_received(tp, th, CC_DUPACK); 1141292309Srrs tcp_timer_activate(tp, TT_REXMT, 0); 1142292309Srrs tp->t_rtttime = 0; 1143292309Srrs if (tp->t_flags & TF_SACK_PERMIT) { 1144292309Srrs TCPSTAT_INC( 1145292309Srrs tcps_sack_recovery_episode); 1146292309Srrs tp->sack_newdata = tp->snd_nxt; 1147292309Srrs tp->snd_cwnd = tp->t_maxseg; 1148292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1149292309Srrs goto drop; 1150292309Srrs } 1151292309Srrs tp->snd_nxt = th->th_ack; 1152292309Srrs tp->snd_cwnd = tp->t_maxseg; 1153292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1154292309Srrs KASSERT(tp->snd_limited <= 2, 1155292309Srrs ("%s: tp->snd_limited too big", 1156292309Srrs __func__)); 1157292309Srrs tp->snd_cwnd = tp->snd_ssthresh + 1158292309Srrs tp->t_maxseg * 1159292309Srrs (tp->t_dupacks - tp->snd_limited); 1160292309Srrs if (SEQ_GT(onxt, tp->snd_nxt)) 1161292309Srrs tp->snd_nxt = onxt; 1162292309Srrs goto drop; 1163292309Srrs } else if (V_tcp_do_rfc3042) { 1164292309Srrs /* 1165292309Srrs * Process first and second duplicate 1166292309Srrs * ACKs. Each indicates a segment 1167292309Srrs * leaving the network, creating room 1168292309Srrs * for more. Make sure we can send a 1169292309Srrs * packet on reception of each duplicate 1170292309Srrs * ACK by increasing snd_cwnd by one 1171292309Srrs * segment. Restore the original 1172292309Srrs * snd_cwnd after packet transmission. 
1173292309Srrs */ 1174292309Srrs cc_ack_received(tp, th, CC_DUPACK); 1175292309Srrs u_long oldcwnd = tp->snd_cwnd; 1176292309Srrs tcp_seq oldsndmax = tp->snd_max; 1177292309Srrs u_int sent; 1178292309Srrs int avail; 1179292309Srrs 1180292309Srrs KASSERT(tp->t_dupacks == 1 || 1181292309Srrs tp->t_dupacks == 2, 1182292309Srrs ("%s: dupacks not 1 or 2", 1183292309Srrs __func__)); 1184292309Srrs if (tp->t_dupacks == 1) 1185292309Srrs tp->snd_limited = 0; 1186292309Srrs tp->snd_cwnd = 1187292309Srrs (tp->snd_nxt - tp->snd_una) + 1188292309Srrs (tp->t_dupacks - tp->snd_limited) * 1189292309Srrs tp->t_maxseg; 1190292309Srrs /* 1191292309Srrs * Only call tcp_output when there 1192292309Srrs * is new data available to be sent. 1193292309Srrs * Otherwise we would send pure ACKs. 1194292309Srrs */ 1195292309Srrs SOCKBUF_LOCK(&so->so_snd); 1196292309Srrs avail = sbavail(&so->so_snd) - 1197292309Srrs (tp->snd_nxt - tp->snd_una); 1198292309Srrs SOCKBUF_UNLOCK(&so->so_snd); 1199292309Srrs if (avail > 0) 1200292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1201292309Srrs sent = tp->snd_max - oldsndmax; 1202292309Srrs if (sent > tp->t_maxseg) { 1203292309Srrs KASSERT((tp->t_dupacks == 2 && 1204292309Srrs tp->snd_limited == 0) || 1205292309Srrs (sent == tp->t_maxseg + 1 && 1206292309Srrs tp->t_flags & TF_SENTFIN), 1207292309Srrs ("%s: sent too much", 1208292309Srrs __func__)); 1209292309Srrs tp->snd_limited = 2; 1210292309Srrs } else if (sent > 0) 1211292309Srrs ++tp->snd_limited; 1212292309Srrs tp->snd_cwnd = oldcwnd; 1213292309Srrs goto drop; 1214292309Srrs } 1215292309Srrs } else 1216292309Srrs tp->t_dupacks = 0; 1217292309Srrs break; 1218292309Srrs } 1219292309Srrs 1220292309Srrs KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 1221292309Srrs ("%s: th_ack <= snd_una", __func__)); 1222292309Srrs 1223292309Srrs /* 1224292309Srrs * If the congestion window was inflated to account 1225292309Srrs * for the other side's cached packets, retract it. 1226292309Srrs */ 1227292309Srrs if (IN_FASTRECOVERY(tp->t_flags)) { 1228292309Srrs if (SEQ_LT(th->th_ack, tp->snd_recover)) { 1229292309Srrs if (tp->t_flags & TF_SACK_PERMIT) 1230292309Srrs tcp_sack_partialack(tp, th); 1231292309Srrs else 1232292309Srrs tcp_newreno_partial_ack(tp, th); 1233292309Srrs } else 1234292309Srrs cc_post_recovery(tp, th); 1235292309Srrs } 1236292309Srrs tp->t_dupacks = 0; 1237292309Srrs /* 1238292309Srrs * If we reach this point, ACK is not a duplicate, 1239292309Srrs * i.e., it ACKs something we sent. 1240292309Srrs */ 1241292309Srrs if (tp->t_flags & TF_NEEDSYN) { 1242292309Srrs /* 1243292309Srrs * T/TCP: Connection was half-synchronized, and our 1244292309Srrs * SYN has been ACK'd (so connection is now fully 1245292309Srrs * synchronized). Go to non-starred state, 1246292309Srrs * increment snd_una for ACK of SYN, and check if 1247292309Srrs * we can do window scaling. 1248292309Srrs */ 1249292309Srrs tp->t_flags &= ~TF_NEEDSYN; 1250292309Srrs tp->snd_una++; 1251292309Srrs /* Do window scaling? */ 1252292309Srrs if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1253292309Srrs (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1254292309Srrs tp->rcv_scale = tp->request_r_scale; 1255292309Srrs /* Send window already scaled. 
*/ 1256292309Srrs } 1257292309Srrs } 1258292309Srrs 1259292309Srrsprocess_ACK: 1260292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 1261292309Srrs 1262292309Srrs acked = BYTES_THIS_ACK(tp, th); 1263292309Srrs TCPSTAT_INC(tcps_rcvackpack); 1264292309Srrs TCPSTAT_ADD(tcps_rcvackbyte, acked); 1265292309Srrs 1266292309Srrs /* 1267292309Srrs * If we just performed our first retransmit, and the ACK 1268292309Srrs * arrives within our recovery window, then it was a mistake 1269292309Srrs * to do the retransmit in the first place. Recover our 1270292309Srrs * original cwnd and ssthresh, and proceed to transmit where 1271292309Srrs * we left off. 1272292309Srrs */ 1273292309Srrs if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && 1274292309Srrs (int)(ticks - tp->t_badrxtwin) < 0) 1275292309Srrs cc_cong_signal(tp, th, CC_RTO_ERR); 1276292309Srrs 1277292309Srrs /* 1278292309Srrs * If we have a timestamp reply, update smoothed 1279292309Srrs * round trip time. If no timestamp is present but 1280292309Srrs * transmit timer is running and timed sequence 1281292309Srrs * number was acked, update smoothed round trip time. 1282292309Srrs * Since we now have an rtt measurement, cancel the 1283292309Srrs * timer backoff (cf., Phil Karn's retransmit alg.). 1284292309Srrs * Recompute the initial retransmit timer. 1285292309Srrs * 1286292309Srrs * Some boxes send broken timestamp replies 1287292309Srrs * during the SYN+ACK phase, ignore 1288292309Srrs * timestamps of 0 or we could calculate a 1289292309Srrs * huge RTT and blow up the retransmit timer. 1290292309Srrs */ 1291292309Srrs if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 1292292309Srrs u_int t; 1293292309Srrs 1294292309Srrs t = tcp_ts_getticks() - to->to_tsecr; 1295292309Srrs if (!tp->t_rttlow || tp->t_rttlow > t) 1296292309Srrs tp->t_rttlow = t; 1297292309Srrs tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); 1298292309Srrs } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 1299292309Srrs if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 1300292309Srrs tp->t_rttlow = ticks - tp->t_rtttime; 1301292309Srrs tcp_xmit_timer(tp, ticks - tp->t_rtttime); 1302292309Srrs } 1303292309Srrs 1304292309Srrs /* 1305292309Srrs * If all outstanding data is acked, stop retransmit 1306292309Srrs * timer and remember to restart (more output or persist). 1307292309Srrs * If there is more data to be acked, restart retransmit 1308292309Srrs * timer, using current (possibly backed-off) value. 1309292309Srrs */ 1310292309Srrs if (th->th_ack == tp->snd_max) { 1311292309Srrs tcp_timer_activate(tp, TT_REXMT, 0); 1312292309Srrs needoutput = 1; 1313292309Srrs } else if (!tcp_timer_active(tp, TT_PERSIST)) 1314292309Srrs tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 1315292309Srrs 1316292309Srrs /* 1317292309Srrs * If no data (only SYN) was ACK'd, 1318292309Srrs * skip rest of ACK processing. 1319292309Srrs */ 1320292309Srrs if (acked == 0) 1321292309Srrs goto step6; 1322292309Srrs 1323292309Srrs /* 1324292309Srrs * Let the congestion control algorithm update congestion 1325292309Srrs * control related information. This typically means increasing 1326292309Srrs * the congestion window. 
1327292309Srrs */ 1328292309Srrs cc_ack_received(tp, th, CC_ACK); 1329292309Srrs 1330292309Srrs SOCKBUF_LOCK(&so->so_snd); 1331292309Srrs if (acked > sbavail(&so->so_snd)) { 1332292309Srrs tp->snd_wnd -= sbavail(&so->so_snd); 1333292309Srrs mfree = sbcut_locked(&so->so_snd, 1334292309Srrs (int)sbavail(&so->so_snd)); 1335292309Srrs ourfinisacked = 1; 1336292309Srrs } else { 1337292309Srrs mfree = sbcut_locked(&so->so_snd, acked); 1338292309Srrs tp->snd_wnd -= acked; 1339292309Srrs ourfinisacked = 0; 1340292309Srrs } 1341292309Srrs /* NB: sowwakeup_locked() does an implicit unlock. */ 1342292309Srrs sowwakeup_locked(so); 1343292309Srrs m_freem(mfree); 1344292309Srrs /* Detect una wraparound. */ 1345292309Srrs if (!IN_RECOVERY(tp->t_flags) && 1346292309Srrs SEQ_GT(tp->snd_una, tp->snd_recover) && 1347292309Srrs SEQ_LEQ(th->th_ack, tp->snd_recover)) 1348292309Srrs tp->snd_recover = th->th_ack - 1; 1349292309Srrs /* XXXLAS: Can this be moved up into cc_post_recovery? */ 1350292309Srrs if (IN_RECOVERY(tp->t_flags) && 1351292309Srrs SEQ_GEQ(th->th_ack, tp->snd_recover)) { 1352292309Srrs EXIT_RECOVERY(tp->t_flags); 1353292309Srrs } 1354292309Srrs tp->snd_una = th->th_ack; 1355292309Srrs if (tp->t_flags & TF_SACK_PERMIT) { 1356292309Srrs if (SEQ_GT(tp->snd_una, tp->snd_recover)) 1357292309Srrs tp->snd_recover = tp->snd_una; 1358292309Srrs } 1359292309Srrs if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1360292309Srrs tp->snd_nxt = tp->snd_una; 1361292309Srrs 1362292309Srrs switch (tp->t_state) { 1363292309Srrs 1364292309Srrs /* 1365292309Srrs * In FIN_WAIT_1 STATE in addition to the processing 1366292309Srrs * for the ESTABLISHED state if our FIN is now acknowledged 1367292309Srrs * then enter FIN_WAIT_2. 1368292309Srrs */ 1369292309Srrs case TCPS_FIN_WAIT_1: 1370292309Srrs if (ourfinisacked) { 1371292309Srrs /* 1372292309Srrs * If we can't receive any more 1373292309Srrs * data, then closing user can proceed. 1374292309Srrs * Starting the timer is contrary to the 1375292309Srrs * specification, but if we don't get a FIN 1376292309Srrs * we'll hang forever. 1377292309Srrs * 1378292309Srrs * XXXjl: 1379292309Srrs * we should release the tp also, and use a 1380292309Srrs * compressed state. 1381292309Srrs */ 1382292309Srrs if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1383292309Srrs soisdisconnected(so); 1384292309Srrs tcp_timer_activate(tp, TT_2MSL, 1385292309Srrs (tcp_fast_finwait2_recycle ? 1386292309Srrs tcp_finwait2_timeout : 1387292309Srrs TP_MAXIDLE(tp))); 1388292309Srrs } 1389292309Srrs tcp_state_change(tp, TCPS_FIN_WAIT_2); 1390292309Srrs } 1391292309Srrs break; 1392292309Srrs 1393292309Srrs /* 1394292309Srrs * In CLOSING STATE in addition to the processing for 1395292309Srrs * the ESTABLISHED state if the ACK acknowledges our FIN 1396292309Srrs * then enter the TIME-WAIT state, otherwise ignore 1397292309Srrs * the segment. 1398292309Srrs */ 1399292309Srrs case TCPS_CLOSING: 1400292309Srrs if (ourfinisacked) { 1401292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1402292309Srrs tcp_twstart(tp); 1403292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1404292309Srrs m_freem(m); 1405292309Srrs return; 1406292309Srrs } 1407292309Srrs break; 1408292309Srrs 1409292309Srrs /* 1410292309Srrs * In LAST_ACK, we may still be waiting for data to drain 1411292309Srrs * and/or to be acked, as well as for the ack of our FIN. 1412292309Srrs * If our FIN is now acknowledged, delete the TCB, 1413292309Srrs * enter the closed state and return. 
1414292309Srrs */ 1415292309Srrs case TCPS_LAST_ACK: 1416292309Srrs if (ourfinisacked) { 1417292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1418292309Srrs tp = tcp_close(tp); 1419292309Srrs goto drop; 1420292309Srrs } 1421292309Srrs break; 1422292309Srrs } 1423292309Srrs } 1424292309Srrs 1425292309Srrsstep6: 1426292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 1427292309Srrs 1428292309Srrs /* 1429292309Srrs * Update window information. 1430292309Srrs * Don't look at window if no ACK: TAC's send garbage on first SYN. 1431292309Srrs */ 1432292309Srrs if ((thflags & TH_ACK) && 1433292309Srrs (SEQ_LT(tp->snd_wl1, th->th_seq) || 1434292309Srrs (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 1435292309Srrs (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1436292309Srrs /* keep track of pure window updates */ 1437292309Srrs if (tlen == 0 && 1438292309Srrs tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1439292309Srrs TCPSTAT_INC(tcps_rcvwinupd); 1440292309Srrs tp->snd_wnd = tiwin; 1441292309Srrs tp->snd_wl1 = th->th_seq; 1442292309Srrs tp->snd_wl2 = th->th_ack; 1443292309Srrs if (tp->snd_wnd > tp->max_sndwnd) 1444292309Srrs tp->max_sndwnd = tp->snd_wnd; 1445292309Srrs needoutput = 1; 1446292309Srrs } 1447292309Srrs 1448292309Srrs /* 1449292309Srrs * Process segments with URG. 1450292309Srrs */ 1451292309Srrs if ((thflags & TH_URG) && th->th_urp && 1452292309Srrs TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1453292309Srrs /* 1454292309Srrs * This is a kludge, but if we receive and accept 1455292309Srrs * random urgent pointers, we'll crash in 1456292309Srrs * soreceive. It's hard to imagine someone 1457292309Srrs * actually wanting to send this much urgent data. 1458292309Srrs */ 1459292309Srrs SOCKBUF_LOCK(&so->so_rcv); 1460292309Srrs if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 1461292309Srrs th->th_urp = 0; /* XXX */ 1462292309Srrs thflags &= ~TH_URG; /* XXX */ 1463292309Srrs SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 1464292309Srrs goto dodata; /* XXX */ 1465292309Srrs } 1466292309Srrs /* 1467292309Srrs * If this segment advances the known urgent pointer, 1468292309Srrs * then mark the data stream. This should not happen 1469292309Srrs * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1470292309Srrs * a FIN has been received from the remote side. 1471292309Srrs * In these states we ignore the URG. 1472292309Srrs * 1473292309Srrs * According to RFC961 (Assigned Protocols), 1474292309Srrs * the urgent pointer points to the last octet 1475292309Srrs * of urgent data. We continue, however, 1476292309Srrs * to consider it to indicate the first octet 1477292309Srrs * of data past the urgent section as the original 1478292309Srrs * spec states (in one of two places). 1479292309Srrs */ 1480292309Srrs if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1481292309Srrs tp->rcv_up = th->th_seq + th->th_urp; 1482292309Srrs so->so_oobmark = sbavail(&so->so_rcv) + 1483292309Srrs (tp->rcv_up - tp->rcv_nxt) - 1; 1484292309Srrs if (so->so_oobmark == 0) 1485292309Srrs so->so_rcv.sb_state |= SBS_RCVATMARK; 1486292309Srrs sohasoutofband(so); 1487292309Srrs tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1488292309Srrs } 1489292309Srrs SOCKBUF_UNLOCK(&so->so_rcv); 1490292309Srrs /* 1491292309Srrs * Remove out of band data so doesn't get presented to user. 1492292309Srrs * This can happen independent of advancing the URG pointer, 1493292309Srrs * but if two URG's are pending at once, some out-of-band 1494292309Srrs * data may creep in... ick. 
1495292309Srrs */ 1496292309Srrs if (th->th_urp <= (u_long)tlen && 1497292309Srrs !(so->so_options & SO_OOBINLINE)) { 1498292309Srrs /* hdr drop is delayed */ 1499292309Srrs tcp_pulloutofband(so, th, m, drop_hdrlen); 1500292309Srrs } 1501292309Srrs } else { 1502292309Srrs /* 1503292309Srrs * If no out of band data is expected, 1504292309Srrs * pull receive urgent pointer along 1505292309Srrs * with the receive window. 1506292309Srrs */ 1507292309Srrs if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1508292309Srrs tp->rcv_up = tp->rcv_nxt; 1509292309Srrs } 1510292309Srrsdodata: /* XXX */ 1511292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 1512292309Srrs 1513292309Srrs /* 1514292309Srrs * Process the segment text, merging it into the TCP sequencing queue, 1515292309Srrs * and arranging for acknowledgment of receipt if necessary. 1516292309Srrs * This process logically involves adjusting tp->rcv_wnd as data 1517292309Srrs * is presented to the user (this happens in tcp_usrreq.c, 1518292309Srrs * case PRU_RCVD). If a FIN has already been received on this 1519292309Srrs * connection then we just ignore the text. 1520292309Srrs */ 1521292309Srrs if ((tlen || (thflags & TH_FIN)) && 1522292309Srrs TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1523292309Srrs tcp_seq save_start = th->th_seq; 1524292309Srrs m_adj(m, drop_hdrlen); /* delayed header drop */ 1525292309Srrs /* 1526292309Srrs * Insert segment which includes th into TCP reassembly queue 1527292309Srrs * with control block tp. Set thflags to whether reassembly now 1528292309Srrs * includes a segment with FIN. This handles the common case 1529292309Srrs * inline (segment is the next to be received on an established 1530292309Srrs * connection, and the queue is empty), avoiding linkage into 1531292309Srrs * and removal from the queue and repetition of various 1532292309Srrs * conversions. 1533292309Srrs * Set DELACK for segments received in order, but ack 1534292309Srrs * immediately when segments are out of order (so 1535292309Srrs * fast retransmit can work). 1536292309Srrs */ 1537292309Srrs if (th->th_seq == tp->rcv_nxt && 1538292309Srrs LIST_EMPTY(&tp->t_segq) && 1539292309Srrs TCPS_HAVEESTABLISHED(tp->t_state)) { 1540292309Srrs if (DELAY_ACK(tp, tlen)) 1541292309Srrs tp->t_flags |= TF_DELACK; 1542292309Srrs else 1543292309Srrs tp->t_flags |= TF_ACKNOW; 1544292309Srrs tp->rcv_nxt += tlen; 1545292309Srrs thflags = th->th_flags & TH_FIN; 1546292309Srrs TCPSTAT_INC(tcps_rcvpack); 1547292309Srrs TCPSTAT_ADD(tcps_rcvbyte, tlen); 1548292309Srrs SOCKBUF_LOCK(&so->so_rcv); 1549292309Srrs if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1550292309Srrs m_freem(m); 1551292309Srrs else 1552292309Srrs sbappendstream_locked(&so->so_rcv, m, 0); 1553292309Srrs /* NB: sorwakeup_locked() does an implicit unlock. */ 1554292309Srrs sorwakeup_locked(so); 1555292309Srrs } else { 1556292309Srrs /* 1557292309Srrs * XXX: Due to the header drop above "th" is 1558292309Srrs * theoretically invalid by now. Fortunately 1559292309Srrs * m_adj() doesn't actually free any mbufs 1560292309Srrs * when trimming from the head. 1561292309Srrs */ 1562292309Srrs thflags = tcp_reass(tp, th, &tlen, m); 1563292309Srrs tp->t_flags |= TF_ACKNOW; 1564292309Srrs } 1565292309Srrs if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 1566292309Srrs tcp_update_sack_list(tp, save_start, save_start + tlen); 1567292309Srrs#if 0 1568292309Srrs /* 1569292309Srrs * Note the amount of data that peer has sent into 1570292309Srrs * our window, in order to estimate the sender's 1571292309Srrs * buffer size.
1572292309Srrs * XXX: Unused. 1573292309Srrs */ 1574292309Srrs if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) 1575292309Srrs len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1576292309Srrs else 1577292309Srrs len = so->so_rcv.sb_hiwat; 1578292309Srrs#endif 1579292309Srrs } else { 1580292309Srrs m_freem(m); 1581292309Srrs thflags &= ~TH_FIN; 1582292309Srrs } 1583292309Srrs 1584292309Srrs /* 1585292309Srrs * If FIN is received ACK the FIN and let the user know 1586292309Srrs * that the connection is closing. 1587292309Srrs */ 1588292309Srrs if (thflags & TH_FIN) { 1589292309Srrs if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1590292309Srrs socantrcvmore(so); 1591292309Srrs /* 1592292309Srrs * If connection is half-synchronized 1593292309Srrs * (ie NEEDSYN flag on) then delay ACK, 1594292309Srrs * so it may be piggybacked when SYN is sent. 1595292309Srrs * Otherwise, since we received a FIN then no 1596292309Srrs * more input can be expected, send ACK now. 1597292309Srrs */ 1598292309Srrs if (tp->t_flags & TF_NEEDSYN) 1599292309Srrs tp->t_flags |= TF_DELACK; 1600292309Srrs else 1601292309Srrs tp->t_flags |= TF_ACKNOW; 1602292309Srrs tp->rcv_nxt++; 1603292309Srrs } 1604292309Srrs switch (tp->t_state) { 1605292309Srrs 1606292309Srrs /* 1607292309Srrs * In SYN_RECEIVED and ESTABLISHED STATES 1608292309Srrs * enter the CLOSE_WAIT state. 1609292309Srrs */ 1610292309Srrs case TCPS_SYN_RECEIVED: 1611292309Srrs tp->t_starttime = ticks; 1612292309Srrs /* FALLTHROUGH */ 1613292309Srrs case TCPS_ESTABLISHED: 1614292309Srrs tcp_state_change(tp, TCPS_CLOSE_WAIT); 1615292309Srrs break; 1616292309Srrs 1617292309Srrs /* 1618292309Srrs * If still in FIN_WAIT_1 STATE FIN has not been acked so 1619292309Srrs * enter the CLOSING state. 1620292309Srrs */ 1621292309Srrs case TCPS_FIN_WAIT_1: 1622292309Srrs tcp_state_change(tp, TCPS_CLOSING); 1623292309Srrs break; 1624292309Srrs 1625292309Srrs /* 1626292309Srrs * In FIN_WAIT_2 state enter the TIME_WAIT state, 1627292309Srrs * starting the time-wait timer, turning off the other 1628292309Srrs * standard timers. 1629292309Srrs */ 1630292309Srrs case TCPS_FIN_WAIT_2: 1631292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1632292309Srrs KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " 1633292309Srrs "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 1634292309Srrs ti_locked)); 1635292309Srrs 1636292309Srrs tcp_twstart(tp); 1637292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1638292309Srrs return; 1639292309Srrs } 1640292309Srrs } 1641292309Srrs if (ti_locked == TI_RLOCKED) { 1642292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1643292309Srrs } 1644292309Srrs ti_locked = TI_UNLOCKED; 1645292309Srrs 1646292309Srrs#ifdef TCPDEBUG 1647292309Srrs if (so->so_options & SO_DEBUG) 1648292309Srrs tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 1649292309Srrs &tcp_savetcp, 0); 1650292309Srrs#endif 1651292309Srrs TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); 1652292309Srrs 1653292309Srrs /* 1654292309Srrs * Return any desired output. 
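 *
 * needoutput was set above when the ACK covered all outstanding
 * data or when the peer advertised a new window; TF_ACKNOW forces
 * an immediate ACK. Either way this stack's output routine
 * (tfb_tcp_output, which is tcp_output() for these stacks) is
 * invoked below.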
1655292309Srrs */ 1656292309Srrs if (needoutput || (tp->t_flags & TF_ACKNOW)) 1657292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1658292309Srrs 1659292309Srrs KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 1660292309Srrs __func__, ti_locked)); 1661292309Srrs INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1662292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 1663292309Srrs 1664292309Srrs if (tp->t_flags & TF_DELACK) { 1665292309Srrs tp->t_flags &= ~TF_DELACK; 1666292309Srrs tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 1667292309Srrs } 1668292309Srrs INP_WUNLOCK(tp->t_inpcb); 1669292309Srrs return; 1670292309Srrs 1671292309Srrsdropafterack: 1672292309Srrs /* 1673292309Srrs * Generate an ACK dropping incoming segment if it occupies 1674292309Srrs * sequence space, where the ACK reflects our state. 1675292309Srrs * 1676292309Srrs * We can now skip the test for the RST flag since all 1677292309Srrs * paths to this code happen after packets containing 1678292309Srrs * RST have been dropped. 1679292309Srrs * 1680292309Srrs * In the SYN-RECEIVED state, don't send an ACK unless the 1681292309Srrs * segment we received passes the SYN-RECEIVED ACK test. 1682292309Srrs * If it fails send a RST. This breaks the loop in the 1683292309Srrs * "LAND" DoS attack, and also prevents an ACK storm 1684292309Srrs * between two listening ports that have been sent forged 1685292309Srrs * SYN segments, each with the source address of the other. 1686292309Srrs */ 1687292309Srrs if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 1688292309Srrs (SEQ_GT(tp->snd_una, th->th_ack) || 1689292309Srrs SEQ_GT(th->th_ack, tp->snd_max)) ) { 1690292309Srrs rstreason = BANDLIM_RST_OPENPORT; 1691292309Srrs goto dropwithreset; 1692292309Srrs } 1693292309Srrs#ifdef TCPDEBUG 1694292309Srrs if (so->so_options & SO_DEBUG) 1695292309Srrs tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 1696292309Srrs &tcp_savetcp, 0); 1697292309Srrs#endif 1698296352Sgnn TCP_PROBE3(debug__drop, tp, th, mtod(m, const char *)); 1699292309Srrs if (ti_locked == TI_RLOCKED) { 1700292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1701292309Srrs } 1702292309Srrs ti_locked = TI_UNLOCKED; 1703292309Srrs 1704292309Srrs tp->t_flags |= TF_ACKNOW; 1705292309Srrs (void) tp->t_fb->tfb_tcp_output(tp); 1706292309Srrs INP_WUNLOCK(tp->t_inpcb); 1707292309Srrs m_freem(m); 1708292309Srrs return; 1709292309Srrs 1710292309Srrsdropwithreset: 1711292309Srrs if (ti_locked == TI_RLOCKED) { 1712292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1713292309Srrs } 1714292309Srrs ti_locked = TI_UNLOCKED; 1715292309Srrs 1716292309Srrs if (tp != NULL) { 1717292309Srrs tcp_dropwithreset(m, th, tp, tlen, rstreason); 1718292309Srrs INP_WUNLOCK(tp->t_inpcb); 1719292309Srrs } else 1720292309Srrs tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1721292309Srrs return; 1722292309Srrs 1723292309Srrsdrop: 1724292309Srrs if (ti_locked == TI_RLOCKED) { 1725292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 1726292309Srrs ti_locked = TI_UNLOCKED; 1727292309Srrs } 1728292309Srrs#ifdef INVARIANTS 1729292309Srrs else 1730292309Srrs INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1731292309Srrs#endif 1732292309Srrs 1733292309Srrs /* 1734292309Srrs * Drop space held by incoming segment and return. 
1735292309Srrs */ 1736292309Srrs#ifdef TCPDEBUG 1737292309Srrs if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 1738292309Srrs tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 1739292309Srrs &tcp_savetcp, 0); 1740292309Srrs#endif 1741296352Sgnn TCP_PROBE3(debug__drop, tp, th, mtod(m, const char *)); 1742292309Srrs if (tp != NULL) 1743292309Srrs INP_WUNLOCK(tp->t_inpcb); 1744292309Srrs m_freem(m); 1745292309Srrs} 1746292309Srrs 1747292309Srrs 1748292309Srrs/* 1749292309Srrs * Do-fast-slow combines the original 1750292309Srrs * tcp_do_segment with a split fastpath: one function 1751292309Srrs * handles the fast-ack case (and also keeps window updates 1752292309Srrs * that arrive in sequence on the fastpath), and a 1753292309Srrs * sub-function handles the in-sequence data. 1754292309Srrs */ 1755292309Srrsvoid 1756292309Srrstcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so, 1757292309Srrs struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, 1758292309Srrs int ti_locked) 1759292309Srrs{ 1760292309Srrs int thflags; 1761292309Srrs u_long tiwin; 1762292309Srrs char *s; 1763292309Srrs int can_enter; 1764292309Srrs struct in_conninfo *inc; 1765292309Srrs struct tcpopt to; 1766292309Srrs 1767292309Srrs thflags = th->th_flags; 1768292309Srrs tp->sackhint.last_sack_ack = 0; 1769292309Srrs inc = &tp->t_inpcb->inp_inc; 1770292309Srrs /* 1771292309Srrs * If this is either a state-changing packet or current state isn't 1772292309Srrs * established, we require a write lock on tcbinfo. Otherwise, we 1773292309Srrs * allow the tcbinfo to be in either a locked or unlocked state, as the 1774292309Srrs * caller may have unnecessarily acquired a write lock due to a race. 1775292309Srrs */ 1776292309Srrs if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 1777292309Srrs tp->t_state != TCPS_ESTABLISHED) { 1778292309Srrs KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 1779292309Srrs "SYN/FIN/RST/!EST", __func__, ti_locked)); 1780292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1781292309Srrs } else { 1782292309Srrs#ifdef INVARIANTS 1783292309Srrs if (ti_locked == TI_RLOCKED) { 1784292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1785292309Srrs } else { 1786292309Srrs KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 1787292309Srrs "ti_locked: %d", __func__, ti_locked)); 1788292309Srrs INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1789292309Srrs } 1790292309Srrs#endif 1791292309Srrs } 1792292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 1793292309Srrs KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 1794292309Srrs __func__)); 1795292309Srrs KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 1796292309Srrs __func__)); 1797292309Srrs 1798292309Srrs /* 1799292309Srrs * Segment received on connection. 1800292309Srrs * Reset idle time and keep-alive timer. 1801292309Srrs * XXX: This should be done after segment 1802292309Srrs * validation to ignore broken/spoofed segs. 1803292309Srrs */ 1804292309Srrs tp->t_rcvtime = ticks; 1805292309Srrs if (TCPS_HAVEESTABLISHED(tp->t_state)) 1806292309Srrs tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 1807292309Srrs 1808292309Srrs /* 1809292309Srrs * Unscale the window into a 32-bit value. 1810292309Srrs * For the SYN_SENT state the scale is zero. 1811292309Srrs */ 1812292309Srrs tiwin = th->th_win << tp->snd_scale; 1813292309Srrs 1814292309Srrs /* 1815292309Srrs * TCP ECN processing.
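 *
 * In rough RFC 3168 terms: an incoming CE codepoint makes us echo
 * ECE on the ACKs we send (TF_ECN_SND_ECE); an incoming ECE tells
 * us, as data sender, to react as if a segment had been lost
 * (cc_cong_signal() with CC_ECN); and an incoming CWR means the
 * peer has reacted, so we stop echoing ECE.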
1816292309Srrs */ 1817292309Srrs if (tp->t_flags & TF_ECN_PERMIT) { 1818292309Srrs if (thflags & TH_CWR) 1819292309Srrs tp->t_flags &= ~TF_ECN_SND_ECE; 1820292309Srrs switch (iptos & IPTOS_ECN_MASK) { 1821292309Srrs case IPTOS_ECN_CE: 1822292309Srrs tp->t_flags |= TF_ECN_SND_ECE; 1823292309Srrs TCPSTAT_INC(tcps_ecn_ce); 1824292309Srrs break; 1825292309Srrs case IPTOS_ECN_ECT0: 1826292309Srrs TCPSTAT_INC(tcps_ecn_ect0); 1827292309Srrs break; 1828292309Srrs case IPTOS_ECN_ECT1: 1829292309Srrs TCPSTAT_INC(tcps_ecn_ect1); 1830292309Srrs break; 1831292309Srrs } 1832292309Srrs /* Congestion experienced. */ 1833292309Srrs if (thflags & TH_ECE) { 1834292309Srrs cc_cong_signal(tp, th, CC_ECN); 1835292309Srrs } 1836292309Srrs } 1837292309Srrs 1838292309Srrs /* 1839292309Srrs * Parse options on any incoming segment. 1840292309Srrs */ 1841292309Srrs tcp_dooptions(&to, (u_char *)(th + 1), 1842292309Srrs (th->th_off << 2) - sizeof(struct tcphdr), 1843292309Srrs (thflags & TH_SYN) ? TO_SYN : 0); 1844292309Srrs 1845292309Srrs /* 1846292309Srrs * If echoed timestamp is later than the current time, 1847292309Srrs * fall back to non RFC1323 RTT calculation. Normalize 1848292309Srrs * timestamp if syncookies were used when this connection 1849292309Srrs * was established. 1850292309Srrs */ 1851292309Srrs if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 1852292309Srrs to.to_tsecr -= tp->ts_offset; 1853292309Srrs if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) 1854292309Srrs to.to_tsecr = 0; 1855292309Srrs } 1856292309Srrs /* 1857292309Srrs * If timestamps were negotiated during SYN/ACK they should 1858292309Srrs * appear on every segment during this session and vice versa. 1859292309Srrs */ 1860292309Srrs if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { 1861292309Srrs if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1862292309Srrs log(LOG_DEBUG, "%s; %s: Timestamp missing, " 1863292309Srrs "no action\n", s, __func__); 1864292309Srrs free(s, M_TCPLOG); 1865292309Srrs } 1866292309Srrs } 1867292309Srrs if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { 1868292309Srrs if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1869292309Srrs log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 1870292309Srrs "no action\n", s, __func__); 1871292309Srrs free(s, M_TCPLOG); 1872292309Srrs } 1873292309Srrs } 1874292309Srrs 1875292309Srrs /* 1876292309Srrs * Process options only when we get SYN/ACK back. The SYN case 1877292309Srrs * for incoming connections is handled in tcp_syncache. 1878292309Srrs * According to RFC1323 the window field in a SYN (i.e., a <SYN> 1879292309Srrs * or <SYN,ACK>) segment itself is never scaled. 1880292309Srrs * XXX this is traditional behavior, may need to be cleaned up. 1881292309Srrs */ 1882292309Srrs if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 1883292309Srrs if ((to.to_flags & TOF_SCALE) && 1884292309Srrs (tp->t_flags & TF_REQ_SCALE)) { 1885292309Srrs tp->t_flags |= TF_RCVD_SCALE; 1886292309Srrs tp->snd_scale = to.to_wscale; 1887292309Srrs } 1888292309Srrs /* 1889292309Srrs * Initial send window. It will be updated with 1890292309Srrs * the next incoming segment to the scaled value. 
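 *
 * For example, with a negotiated wscale of 7 a later segment
 * carrying th_win == 1000 advertises 1000 << 7 = 128000 bytes
 * (see the tiwin computation above); the window in the SYN or
 * SYN|ACK itself is taken unscaled.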
1891292309Srrs */ 1892292309Srrs tp->snd_wnd = th->th_win; 1893292309Srrs if (to.to_flags & TOF_TS) { 1894292309Srrs tp->t_flags |= TF_RCVD_TSTMP; 1895292309Srrs tp->ts_recent = to.to_tsval; 1896292309Srrs tp->ts_recent_age = tcp_ts_getticks(); 1897292309Srrs } 1898292309Srrs if (to.to_flags & TOF_MSS) 1899292309Srrs tcp_mss(tp, to.to_mss); 1900292309Srrs if ((tp->t_flags & TF_SACK_PERMIT) && 1901292309Srrs (to.to_flags & TOF_SACKPERM) == 0) 1902292309Srrs tp->t_flags &= ~TF_SACK_PERMIT; 1903292309Srrs } 1904292309Srrs can_enter = 0; 1905292309Srrs if (__predict_true((tlen == 0))) { 1906292309Srrs /* 1907292309Srrs * The ack moved forward and we have a window (non-zero) 1908292309Srrs * <or> 1909292309Srrs * The ack did not move forward, but the window increased. 1910292309Srrs */ 1911292309Srrs if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) || 1912292309Srrs ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) { 1913292309Srrs can_enter = 1; 1914292309Srrs } 1915292309Srrs } else { 1916292309Srrs /* 1917292309Srrs * Data incoming, use the old entry criteria 1918292309Srrs * for fast-path with data. 1919292309Srrs */ 1920292309Srrs if ((tiwin && tiwin == tp->snd_wnd)) { 1921292309Srrs can_enter = 1; 1922292309Srrs } 1923292309Srrs } 1924292309Srrs /* 1925292309Srrs * Header prediction: check for the two common cases 1926292309Srrs * of a uni-directional data xfer. If the packet has 1927292309Srrs * no control flags, is in-sequence, the window didn't 1928292309Srrs * change and we're not retransmitting, it's a 1929292309Srrs * candidate. If the length is zero and the ack moved 1930292309Srrs * forward, we're the sender side of the xfer. Just 1931292309Srrs * free the data acked & wake any higher level process 1932292309Srrs * that was blocked waiting for space. If the length 1933292309Srrs * is non-zero and the ack didn't move, we're the 1934292309Srrs * receiver side. If we're getting packets in-order 1935292309Srrs * (the reassembly queue is empty), add the data to 1936292309Srrs * the socket buffer and note that we need a delayed ack. 1937292309Srrs * Make sure that the hidden state-flags are also off. 1938292309Srrs * Since we check for TCPS_ESTABLISHED first, it can only 1939292309Srrs * be TH_NEEDSYN. 
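 *
 * Restated, the fast-path test below requires roughly: state is
 * ESTABLISHED, the segment is the next expected one
 * (th_seq == rcv_nxt), only ACK is set among SYN/FIN/RST/URG/ACK,
 * we are not retransmitting (snd_nxt == snd_max), the reassembly
 * queue is empty, any timestamp is not older than ts_recent, and
 * the can_enter criteria computed above hold (a pure ACK that
 * advances snd_una or grows the window, or data arriving with an
 * unchanged window).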
1940292309Srrs */ 1941292309Srrs if (__predict_true(tp->t_state == TCPS_ESTABLISHED && 1942292309Srrs th->th_seq == tp->rcv_nxt && 1943292309Srrs (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1944292309Srrs tp->snd_nxt == tp->snd_max && 1945292309Srrs can_enter && 1946292309Srrs ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 1947292309Srrs LIST_EMPTY(&tp->t_segq) && 1948292309Srrs ((to.to_flags & TOF_TS) == 0 || 1949292309Srrs TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) { 1950292309Srrs if (__predict_true((tlen == 0) && 1951292309Srrs (SEQ_LEQ(th->th_ack, tp->snd_max) && 1952292309Srrs !IN_RECOVERY(tp->t_flags) && 1953292309Srrs (to.to_flags & TOF_SACK) == 0 && 1954292309Srrs TAILQ_EMPTY(&tp->snd_holes)))) { 1955292309Srrs /* We are done */ 1956292309Srrs tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, 1957292309Srrs ti_locked, tiwin); 1958292309Srrs return; 1959292309Srrs } else if ((tlen) && 1960292309Srrs (th->th_ack == tp->snd_una && 1961292309Srrs tlen <= sbspace(&so->so_rcv))) { 1962292309Srrs tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen, 1963292309Srrs ti_locked, tiwin); 1964292309Srrs /* We are done */ 1965292309Srrs return; 1966292309Srrs } 1967292309Srrs } 1968292309Srrs tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, 1969292309Srrs ti_locked, tiwin, thflags); 1970292309Srrs} 1971292309Srrs 1972292309Srrs 1973292309Srrs/* 1974292309Srrs * This subfunction is used to try to highly optimize the 1975292309Srrs * fast path. We again allow window updates that are 1976292309Srrs * in sequence to remain in the fast-path. We also add 1977292309Srrs * in the __predict's to attempt to help the compiler. 1978292309Srrs * Note that if we return a 0, then we can *not* process 1979292309Srrs * it and the caller should push the packet into the 1980292309Srrs * slow-path. 1981292309Srrs */ 1982292309Srrsstatic int 1983292309Srrstcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 1984292309Srrs struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 1985292309Srrs int ti_locked, u_long tiwin) 1986292309Srrs{ 1987292309Srrs int acked; 1988292309Srrs int winup_only=0; 1989292309Srrs#ifdef TCPDEBUG 1990292309Srrs /* 1991292309Srrs * The size of tcp_saveipgen must be the size of the max ip header, 1992292309Srrs * now IPv6. 1993292309Srrs */ 1994292309Srrs u_char tcp_saveipgen[IP6_HDR_LEN]; 1995292309Srrs struct tcphdr tcp_savetcp; 1996292309Srrs short ostate = 0; 1997292309Srrs#endif 1998292309Srrs 1999292309Srrs 2000292309Srrs if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 2001292309Srrs /* Old ack, behind (or duplicate to) the last one rcv'd */ 2002292309Srrs return (0); 2003292309Srrs } 2004292309Srrs if (__predict_false(th->th_ack == tp->snd_una) && 2005292309Srrs __predict_false(tiwin <= tp->snd_wnd)) { 2006292309Srrs /* duplicate ack <or> a shrinking dup ack with shrinking window */ 2007292309Srrs return (0); 2008292309Srrs } 2009292309Srrs if (__predict_false(tiwin == 0)) { 2010292309Srrs /* zero window */ 2011292309Srrs return (0); 2012292309Srrs } 2013292309Srrs if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 2014292309Srrs /* Above what we have sent? */ 2015292309Srrs return (0); 2016292309Srrs } 2017292309Srrs if (__predict_false(tp->snd_nxt != tp->snd_max)) { 2018292309Srrs /* We are retransmitting */ 2019292309Srrs return (0); 2020292309Srrs } 2021292309Srrs if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) { 2022292309Srrs /* We need a SYN or a FIN, unlikely.. 
*/ 2023292309Srrs return (0); 2024292309Srrs } 2025292309Srrs if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 2026292309Srrs /* Timestamp is behind .. old ack with seq wrap? */ 2027292309Srrs return (0); 2028292309Srrs } 2029292309Srrs if (__predict_false(IN_RECOVERY(tp->t_flags))) { 2030292309Srrs /* Still recovering */ 2031292309Srrs return (0); 2032292309Srrs } 2033292309Srrs if (__predict_false(to->to_flags & TOF_SACK)) { 2034292309Srrs /* Sack included in the ack.. */ 2035292309Srrs return (0); 2036292309Srrs } 2037292309Srrs if (!TAILQ_EMPTY(&tp->snd_holes)) { 2038292309Srrs /* We have sack holes on our scoreboard */ 2039292309Srrs return (0); 2040292309Srrs } 2041292309Srrs /* Ok if we reach here, we can process a fast-ack */ 2042292309Srrs 2043292309Srrs /* Did the window get updated? */ 2044292309Srrs if (tiwin != tp->snd_wnd) { 2045292309Srrs /* keep track of pure window updates */ 2046292309Srrs if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { 2047292309Srrs winup_only = 1; 2048292309Srrs TCPSTAT_INC(tcps_rcvwinupd); 2049292309Srrs } 2050292309Srrs tp->snd_wnd = tiwin; 2051292309Srrs tp->snd_wl1 = th->th_seq; 2052292309Srrs if (tp->snd_wnd > tp->max_sndwnd) 2053292309Srrs tp->max_sndwnd = tp->snd_wnd; 2054292309Srrs } 2055292309Srrs /* 2056292309Srrs * Pull snd_wl2 up to prevent seq wrap relative 2057292309Srrs * to th_ack. 2058292309Srrs */ 2059292309Srrs tp->snd_wl2 = th->th_ack; 2060292309Srrs /* 2061292309Srrs * If last ACK falls within this segment's sequence numbers, 2062292309Srrs * record the timestamp. 2063292309Srrs * NOTE that the test is modified according to the latest 2064292309Srrs * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2065292309Srrs */ 2066292309Srrs if ((to->to_flags & TOF_TS) != 0 && 2067292309Srrs SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 2068292309Srrs tp->ts_recent_age = tcp_ts_getticks(); 2069292309Srrs tp->ts_recent = to->to_tsval; 2070292309Srrs } 2071292309Srrs /* 2072292309Srrs * This is a pure ack for outstanding data. 2073292309Srrs */ 2074292309Srrs if (ti_locked == TI_RLOCKED) { 2075292309Srrs INP_INFO_RUNLOCK(&V_tcbinfo); 2076292309Srrs } 2077292309Srrs ti_locked = TI_UNLOCKED; 2078292309Srrs 2079292309Srrs TCPSTAT_INC(tcps_predack); 2080292309Srrs 2081292309Srrs /* 2082292309Srrs * "bad retransmit" recovery. 2083292309Srrs */ 2084292309Srrs if (tp->t_rxtshift == 1 && 2085292309Srrs tp->t_flags & TF_PREVVALID && 2086292309Srrs (int)(ticks - tp->t_badrxtwin) < 0) { 2087292309Srrs cc_cong_signal(tp, th, CC_RTO_ERR); 2088292309Srrs } 2089292309Srrs 2090292309Srrs /* 2091292309Srrs * Recalculate the transmit timer / rtt. 2092292309Srrs * 2093292309Srrs * Some boxes send broken timestamp replies 2094292309Srrs * during the SYN+ACK phase, ignore 2095292309Srrs * timestamps of 0 or we could calculate a 2096292309Srrs * huge RTT and blow up the retransmit timer. 
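 *
 * Ignoring the fixed-point scaling inside tcp_xmit_timer(), the
 * smoothing is the classic Van Jacobson scheme, roughly:
 *
 *	delta   = rtt - srtt;
 *	srtt   += delta / 8;
 *	rttvar += (|delta| - rttvar) / 4;
 *	rexmt   = srtt + 4 * rttvar, clamped to a sane range;
 *
 * with rtt taken from the echoed timestamp when one is available
 * and from t_rtttime otherwise.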
2097292309Srrs */ 2098292309Srrs if ((to->to_flags & TOF_TS) != 0 && 2099292309Srrs to->to_tsecr) { 2100292309Srrs u_int t; 2101292309Srrs 2102292309Srrs t = tcp_ts_getticks() - to->to_tsecr; 2103292309Srrs if (!tp->t_rttlow || tp->t_rttlow > t) 2104292309Srrs tp->t_rttlow = t; 2105292309Srrs tcp_xmit_timer(tp, 2106292309Srrs TCP_TS_TO_TICKS(t) + 1); 2107292309Srrs } else if (tp->t_rtttime && 2108292309Srrs SEQ_GT(th->th_ack, tp->t_rtseq)) { 2109292309Srrs if (!tp->t_rttlow || 2110292309Srrs tp->t_rttlow > ticks - tp->t_rtttime) 2111292309Srrs tp->t_rttlow = ticks - tp->t_rtttime; 2112292309Srrs tcp_xmit_timer(tp, 2113292309Srrs ticks - tp->t_rtttime); 2114292309Srrs } 2115292309Srrs if (winup_only == 0) { 2116292309Srrs acked = BYTES_THIS_ACK(tp, th); 2117292309Srrs 2118292309Srrs /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 2119292309Srrs hhook_run_tcp_est_in(tp, th, to); 2120292309Srrs 2121292309Srrs TCPSTAT_ADD(tcps_rcvackbyte, acked); 2122292309Srrs sbdrop(&so->so_snd, acked); 2123292309Srrs if (SEQ_GT(tp->snd_una, tp->snd_recover) && 2124292309Srrs SEQ_LEQ(th->th_ack, tp->snd_recover)) 2125292309Srrs tp->snd_recover = th->th_ack - 1; 2126292309Srrs 2127292309Srrs /* 2128292309Srrs * Let the congestion control algorithm update 2129292309Srrs * congestion control related information. This 2130292309Srrs * typically means increasing the congestion 2131292309Srrs * window. 2132292309Srrs */ 2133292309Srrs cc_ack_received(tp, th, CC_ACK); 2134292309Srrs 2135292309Srrs tp->snd_una = th->th_ack; 2136292309Srrs tp->t_dupacks = 0; 2137292309Srrs 2138292309Srrs /* 2139292309Srrs * If all outstanding data are acked, stop 2140292309Srrs * retransmit timer, otherwise restart timer 2141292309Srrs * using current (possibly backed-off) value. 2142292309Srrs * If process is waiting for space, 2143292309Srrs * wakeup/selwakeup/signal. If data 2144292309Srrs * are ready to send, let tcp_output 2145292309Srrs * decide between more output or persist. 2146292309Srrs */ 2147292309Srrs#ifdef TCPDEBUG 2148292309Srrs if (so->so_options & SO_DEBUG) 2149292309Srrs tcp_trace(TA_INPUT, ostate, tp, 2150292309Srrs (void *)tcp_saveipgen, 2151292309Srrs &tcp_savetcp, 0); 2152292309Srrs#endif 2153296352Sgnn TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); 2154296352Sgnn m_freem(m); 2155292309Srrs if (tp->snd_una == tp->snd_max) 2156292309Srrs tcp_timer_activate(tp, TT_REXMT, 0); 2157292309Srrs else if (!tcp_timer_active(tp, TT_PERSIST)) 2158292309Srrs tcp_timer_activate(tp, TT_REXMT, 2159292309Srrs tp->t_rxtcur); 2160292309Srrs /* Wake up the socket if we have room to write more */ 2161292309Srrs sowwakeup(so); 2162292309Srrs } else { 2163292309Srrs /* 2164292309Srrs * Window update only, just free the mbufs and 2165292309Srrs * send out whatever we can. 
2166292309Srrs */ 2167292309Srrs m_freem(m); 2168292309Srrs } 2169292309Srrs if (sbavail(&so->so_snd)) 2170292309Srrs (void) tcp_output(tp); 2171292309Srrs KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 2172292309Srrs __func__, ti_locked)); 2173292309Srrs INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2174292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 2175292309Srrs 2176292309Srrs if (tp->t_flags & TF_DELACK) { 2177292309Srrs tp->t_flags &= ~TF_DELACK; 2178292309Srrs tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 2179292309Srrs } 2180292309Srrs INP_WUNLOCK(tp->t_inpcb); 2181292309Srrs return (1); 2182292309Srrs} 2183292309Srrs 2184292309Srrs/* 2185292309Srrs * This tcp_do_segment variant concentrates on the fastest possible 2186292309Srrs * ack processing path. It does not have a fast-path for 2187292309Srrs * data (it possibly could, which would then eliminate the 2188292309Srrs * need for fast-slow above). For a content distributor with 2189292309Srrs * large outgoing elephant flows and very little incoming data, 2190292309Srrs * the lack of a data fastpath does not really hurt (since 2191292309Srrs * little data comes in). The most important thing is 2192292309Srrs * processing ack's quickly and getting the rest of the data 2193292309Srrs * output to the peer as quickly as possible. This routine 2194292309Srrs * seems to be about 3% faster overall than the old 2195292309Srrs * tcp_do_segment and keeps us in the fast-path for many 2196292309Srrs * more packets (by allowing window updates to also stay in the fastpath). 2197292309Srrs */ 2198292309Srrsvoid 2199292309Srrstcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 2200292309Srrs struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, 2201292309Srrs int ti_locked) 2202292309Srrs{ 2203292309Srrs int thflags; 2204292309Srrs u_long tiwin; 2205292309Srrs char *s; 2206292309Srrs struct in_conninfo *inc; 2207292309Srrs struct tcpopt to; 2208292309Srrs 2209292309Srrs thflags = th->th_flags; 2210292309Srrs tp->sackhint.last_sack_ack = 0; 2211292309Srrs inc = &tp->t_inpcb->inp_inc; 2212292309Srrs /* 2213292309Srrs * If this is either a state-changing packet or current state isn't 2214292309Srrs * established, we require a write lock on tcbinfo. Otherwise, we 2215292309Srrs * allow the tcbinfo to be in either a locked or unlocked state, as the 2216292309Srrs * caller may have unnecessarily acquired a write lock due to a race. 2217292309Srrs */ 2218292309Srrs if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 2219292309Srrs tp->t_state != TCPS_ESTABLISHED) { 2220292309Srrs KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 2221292309Srrs "SYN/FIN/RST/!EST", __func__, ti_locked)); 2222292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2223292309Srrs } else { 2224292309Srrs#ifdef INVARIANTS 2225292309Srrs if (ti_locked == TI_RLOCKED) { 2226292309Srrs INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2227292309Srrs } else { 2228292309Srrs KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 2229292309Srrs "ti_locked: %d", __func__, ti_locked)); 2230292309Srrs INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2231292309Srrs } 2232292309Srrs#endif 2233292309Srrs } 2234292309Srrs INP_WLOCK_ASSERT(tp->t_inpcb); 2235292309Srrs KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 2236292309Srrs __func__)); 2237292309Srrs KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 2238292309Srrs __func__)); 2239292309Srrs 2240292309Srrs /* 2241292309Srrs * Segment received on connection. 2242292309Srrs * Reset idle time and keep-alive timer.
2243292309Srrs * XXX: This should be done after segment 2244292309Srrs * validation to ignore broken/spoofed segs. 2245292309Srrs */ 2246292309Srrs tp->t_rcvtime = ticks; 2247292309Srrs if (TCPS_HAVEESTABLISHED(tp->t_state)) 2248292309Srrs tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 2249292309Srrs 2250292309Srrs /* 2251292309Srrs * Unscale the window into a 32-bit value. 2252292309Srrs * For the SYN_SENT state the scale is zero. 2253292309Srrs */ 2254292309Srrs tiwin = th->th_win << tp->snd_scale; 2255292309Srrs 2256292309Srrs /* 2257292309Srrs * TCP ECN processing. 2258292309Srrs */ 2259292309Srrs if (tp->t_flags & TF_ECN_PERMIT) { 2260292309Srrs if (thflags & TH_CWR) 2261292309Srrs tp->t_flags &= ~TF_ECN_SND_ECE; 2262292309Srrs switch (iptos & IPTOS_ECN_MASK) { 2263292309Srrs case IPTOS_ECN_CE: 2264292309Srrs tp->t_flags |= TF_ECN_SND_ECE; 2265292309Srrs TCPSTAT_INC(tcps_ecn_ce); 2266292309Srrs break; 2267292309Srrs case IPTOS_ECN_ECT0: 2268292309Srrs TCPSTAT_INC(tcps_ecn_ect0); 2269292309Srrs break; 2270292309Srrs case IPTOS_ECN_ECT1: 2271292309Srrs TCPSTAT_INC(tcps_ecn_ect1); 2272292309Srrs break; 2273292309Srrs } 2274292309Srrs /* Congestion experienced. */ 2275292309Srrs if (thflags & TH_ECE) { 2276292309Srrs cc_cong_signal(tp, th, CC_ECN); 2277292309Srrs } 2278292309Srrs } 2279292309Srrs 2280292309Srrs /* 2281292309Srrs * Parse options on any incoming segment. 2282292309Srrs */ 2283292309Srrs tcp_dooptions(&to, (u_char *)(th + 1), 2284292309Srrs (th->th_off << 2) - sizeof(struct tcphdr), 2285292309Srrs (thflags & TH_SYN) ? TO_SYN : 0); 2286292309Srrs 2287292309Srrs /* 2288292309Srrs * If echoed timestamp is later than the current time, 2289292309Srrs * fall back to non RFC1323 RTT calculation. Normalize 2290292309Srrs * timestamp if syncookies were used when this connection 2291292309Srrs * was established. 2292292309Srrs */ 2293292309Srrs if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 2294292309Srrs to.to_tsecr -= tp->ts_offset; 2295292309Srrs if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) 2296292309Srrs to.to_tsecr = 0; 2297292309Srrs } 2298292309Srrs /* 2299292309Srrs * If timestamps were negotiated during SYN/ACK they should 2300292309Srrs * appear on every segment during this session and vice versa. 2301292309Srrs */ 2302292309Srrs if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { 2303292309Srrs if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2304292309Srrs log(LOG_DEBUG, "%s; %s: Timestamp missing, " 2305292309Srrs "no action\n", s, __func__); 2306292309Srrs free(s, M_TCPLOG); 2307292309Srrs } 2308292309Srrs } 2309292309Srrs if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { 2310292309Srrs if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2311292309Srrs log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 2312292309Srrs "no action\n", s, __func__); 2313292309Srrs free(s, M_TCPLOG); 2314292309Srrs } 2315292309Srrs } 2316292309Srrs 2317292309Srrs /* 2318292309Srrs * Process options only when we get SYN/ACK back. The SYN case 2319292309Srrs * for incoming connections is handled in tcp_syncache. 2320292309Srrs * According to RFC1323 the window field in a SYN (i.e., a <SYN> 2321292309Srrs * or <SYN,ACK>) segment itself is never scaled. 2322292309Srrs * XXX this is traditional behavior, may need to be cleaned up. 
2323292309Srrs */ 2324292309Srrs if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 2325292309Srrs if ((to.to_flags & TOF_SCALE) && 2326292309Srrs (tp->t_flags & TF_REQ_SCALE)) { 2327292309Srrs tp->t_flags |= TF_RCVD_SCALE; 2328292309Srrs tp->snd_scale = to.to_wscale; 2329292309Srrs } 2330292309Srrs /* 2331292309Srrs * Initial send window. It will be updated with 2332292309Srrs * the next incoming segment to the scaled value. 2333292309Srrs */ 2334292309Srrs tp->snd_wnd = th->th_win; 2335292309Srrs if (to.to_flags & TOF_TS) { 2336292309Srrs tp->t_flags |= TF_RCVD_TSTMP; 2337292309Srrs tp->ts_recent = to.to_tsval; 2338292309Srrs tp->ts_recent_age = tcp_ts_getticks(); 2339292309Srrs } 2340292309Srrs if (to.to_flags & TOF_MSS) 2341292309Srrs tcp_mss(tp, to.to_mss); 2342292309Srrs if ((tp->t_flags & TF_SACK_PERMIT) && 2343292309Srrs (to.to_flags & TOF_SACKPERM) == 0) 2344292309Srrs tp->t_flags &= ~TF_SACK_PERMIT; 2345292309Srrs } 2346292309Srrs /* 2347292309Srrs * Header prediction: check for the two common cases 2348292309Srrs * of a uni-directional data xfer. If the packet has 2349292309Srrs * no control flags, is in-sequence, the window didn't 2350292309Srrs * change and we're not retransmitting, it's a 2351292309Srrs * candidate. If the length is zero and the ack moved 2352292309Srrs * forward, we're the sender side of the xfer. Just 2353292309Srrs * free the data acked & wake any higher level process 2354292309Srrs * that was blocked waiting for space. If the length 2355292309Srrs * is non-zero and the ack didn't move, we're the 2356292309Srrs * receiver side. If we're getting packets in-order 2357292309Srrs * (the reassembly queue is empty), add the data to 2358292309Srrs * the socket buffer and note that we need a delayed ack. 2359292309Srrs * Make sure that the hidden state-flags are also off. 2360292309Srrs * Since we check for TCPS_ESTABLISHED first, it can only 2361292309Srrs * be TH_NEEDSYN. 
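 *
 * Unlike the fastslow variant above, only the cheap, segment-local
 * checks are made inline here (ESTABLISHED, no SACK option,
 * tlen == 0, pure ACK, empty reassembly queue, in-sequence);
 * tcp_fastack() performs the remaining per-connection checks and
 * returns 0 if the segment has to take the slow path instead.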
2362292309Srrs */ 2363292309Srrs if (__predict_true(tp->t_state == TCPS_ESTABLISHED) && 2364292309Srrs __predict_true(((to.to_flags & TOF_SACK) == 0)) && 2365292309Srrs __predict_true(tlen == 0) && 2366292309Srrs __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) && 2367292309Srrs __predict_true(LIST_EMPTY(&tp->t_segq)) && 2368292309Srrs __predict_true(th->th_seq == tp->rcv_nxt)) { 2369292309Srrs if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, 2370292309Srrs ti_locked, tiwin)) { 2371292309Srrs return; 2372292309Srrs } 2373292309Srrs } 2374292309Srrs tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, 2375292309Srrs ti_locked, tiwin, thflags); 2376292309Srrs} 2377292309Srrs 2378292309Srrsstruct tcp_function_block __tcp_fastslow = { 2379292309Srrs "fastslow", 2380292309Srrs tcp_output, 2381292309Srrs tcp_do_segment_fastslow, 2382292309Srrs tcp_default_ctloutput, 2383292309Srrs NULL, 2384292309Srrs NULL, 2385292309Srrs NULL, 2386292309Srrs NULL, 2387292309Srrs NULL, 2388292309Srrs NULL, 2389292309Srrs 0, 2390292309Srrs 0 2391292309Srrs 2392292309Srrs}; 2393292309Srrs 2394292309Srrsstruct tcp_function_block __tcp_fastack = { 2395292309Srrs "fastack", 2396292309Srrs tcp_output, 2397292309Srrs tcp_do_segment_fastack, 2398292309Srrs tcp_default_ctloutput, 2399292309Srrs NULL, 2400292309Srrs NULL, 2401292309Srrs NULL, 2402292309Srrs NULL, 2403292309Srrs NULL, 2404292309Srrs NULL, 2405292309Srrs 0, 2406292309Srrs 0 2407292309Srrs}; 2408292309Srrs 2409292309Srrsstatic int 2410292309Srrstcp_addfastpaths(module_t mod, int type, void *data) 2411292309Srrs{ 2412292309Srrs int err=0; 2413292309Srrs 2414292309Srrs switch (type) { 2415292309Srrs case MOD_LOAD: 2416292309Srrs err = register_tcp_functions(&__tcp_fastack, M_WAITOK); 2417292309Srrs if (err) { 2418292309Srrs printf("Failed to register fastack module -- err:%d\n", err); 2419292309Srrs return(err); 2420292309Srrs } 2421292309Srrs err = register_tcp_functions(&__tcp_fastslow, M_WAITOK); 2422292309Srrs if (err) { 2423292309Srrs printf("Failed to register fastslow module -- err:%d\n", err); 2424292309Srrs deregister_tcp_functions(&__tcp_fastack); 2425292309Srrs return(err); 2426292309Srrs } 2427292309Srrs break; 2428292309Srrs case MOD_QUIESCE: 2429292309Srrs if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) { 2430292309Srrs return(EBUSY); 2431292309Srrs } 2432292309Srrs break; 2433292309Srrs case MOD_UNLOAD: 2434292309Srrs err = deregister_tcp_functions(&__tcp_fastack); 2435292309Srrs if (err == EBUSY) 2436292309Srrs break; 2437292309Srrs err = deregister_tcp_functions(&__tcp_fastslow); 2438292309Srrs if (err == EBUSY) 2439292309Srrs break; 2440292309Srrs err = 0; 2441292309Srrs break; 2442292309Srrs default: 2443292309Srrs return (EOPNOTSUPP); 2444292309Srrs } 2445292309Srrs return (err); 2446292309Srrs} 2447292309Srrs 2448292309Srrsstatic moduledata_t new_tcp_fastpaths = { 2449292309Srrs .name = "tcp_fastpaths", 2450292309Srrs .evhand = tcp_addfastpaths, 2451292309Srrs .priv = 0 2452292309Srrs}; 2453292309Srrs 2454292309SrrsMODULE_VERSION(kern_tcpfastpaths, 1); 2455295927SrrsDECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 2456
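
/*
 * A minimal usage sketch, assuming the tcp_functions sysctls and the
 * TCP_FUNCTION_BLK socket option from the function-block framework are
 * available on the running system (the module name below is only
 * illustrative):
 *
 *	# kldload tcp_fastpath
 *	# sysctl net.inet.tcp.functions_available
 *	# sysctl net.inet.tcp.functions_default=fastack
 *
 * or, to switch a single socket over, roughly:
 *
 *	struct tcp_function_set tfs = { .function_set_name = "fastslow" };
 *	setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs));
 *
 * Connections using one of these stacks are then handled by its
 * tcp_function_block (tcp_do_segment_fastslow/tcp_do_segment_fastack
 * and tcp_output) instead of the default tcp_do_segment() path.
 */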