/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2007-2008,2010
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * Copyright (c) 2015 Netflix Inc.
 * All rights reserved.
 *
 * Portions of this software were developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
 * James Healy and David Hayes, made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Robert N. M. Watson under
 * contract to Juniper Networks, Inc.
 *
 * Portions of this software were developed by Randall R. Stewart while
 * working for Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/netinet/tcp_stacks/fastpath.c 344511 2019-02-25 10:38:37Z tuexen $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */

#include <vm/uma.h>

#include <net/route.h>
#include <net/vnet.h>

#define	TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_syncache.h>
#include <netinet/cc/cc.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

VNET_DECLARE(int, tcp_autorcvbuf_inc);
#define	V_tcp_autorcvbuf_inc	VNET(tcp_autorcvbuf_inc)
VNET_DECLARE(int, tcp_autorcvbuf_max);
#define	V_tcp_autorcvbuf_max	VNET(tcp_autorcvbuf_max)
VNET_DECLARE(int, tcp_do_rfc3042);
#define	V_tcp_do_rfc3042	VNET(tcp_do_rfc3042)
VNET_DECLARE(int, tcp_do_autorcvbuf);
#define	V_tcp_do_autorcvbuf	VNET(tcp_do_autorcvbuf)
VNET_DECLARE(int, tcp_insecure_rst);
#define	V_tcp_insecure_rst	VNET(tcp_insecure_rst)
VNET_DECLARE(int, tcp_insecure_syn);
#define	V_tcp_insecure_syn	VNET(tcp_insecure_syn)
VNET_DECLARE(int, drop_synfin);
#define	V_drop_synfin	VNET(drop_synfin)

static void	tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
		    struct socket *, struct tcpcb *, int, int, uint8_t,
		    int);

static void	tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
		    struct socket *, struct tcpcb *, int, int, uint8_t,
		    int);

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 * following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment.  We make sure by checking that the
 *	  segment size is not larger than the MSS.
 */
#define	DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active(tp, TT_DELACK) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (tlen <= tp->t_maxseg) &&					\
	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))

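/*
 * Descriptive note (added): this file provides the "fastslow" and "fastack"
 * alternate TCP input paths declared above.  Both keep a trimmed fast path
 * for common established-state segments and fall back to tcp_do_slowpath(),
 * a copy of the generic tcp_do_segment() processing, for everything else.
 */
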
/*
 * So how is this faster than the normal fast ack?
 * It basically allows us to also stay in the fastpath
 * when a window-update ack also arrives.  In testing
 * we saw only 25-30% of connections doing fastpath
 * due to the fact that along with moving forward
 * in sequence the window was also updated.
 */
static void
tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
	       struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
	       int ti_locked, u_long tiwin)
{
	int acked;
	int winup_only = 0;
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif
	/*
	 * The following if statement will be true if
	 * we are doing the win_up_in_fp <and>
	 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
	 * - No more new data, but we have an ack for new data
	 *   (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
	 * - No more new data, the same ack point but the window grew
	 *   (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
	 */
	if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
		/* keep track of pure window updates */
		if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
			winup_only = 1;
			TCPSTAT_INC(tcps_rcvwinupd);
		}
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		tp->snd_wl2 = th->th_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
	}
	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record the timestamp.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	/*
	 * This is a pure ack for outstanding data.
	 */
	if (ti_locked == TI_RLOCKED) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
	}
	ti_locked = TI_UNLOCKED;

	TCPSTAT_INC(tcps_predack);

	/*
	 * "bad retransmit" recovery.
	 */
	if (tp->t_rxtshift == 1 &&
	    tp->t_flags & TF_PREVVALID &&
	    (int)(ticks - tp->t_badrxtwin) < 0) {
		cc_cong_signal(tp, th, CC_RTO_ERR);
	}

	/*
	 * Recalculate the transmit timer / rtt.
	 *
	 * Some boxes send broken timestamp replies
	 * during the SYN+ACK phase, ignore
	 * timestamps of 0 or we could calculate a
	 * huge RTT and blow up the retransmit timer.
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    to->to_tsecr) {
		u_int t;

		t = tcp_ts_getticks() - to->to_tsecr;
		if (!tp->t_rttlow || tp->t_rttlow > t)
			tp->t_rttlow = t;
		tcp_xmit_timer(tp,
		    TCP_TS_TO_TICKS(t) + 1);
	} else if (tp->t_rtttime &&
	    SEQ_GT(th->th_ack, tp->t_rtseq)) {
		if (!tp->t_rttlow ||
		    tp->t_rttlow > ticks - tp->t_rtttime)
			tp->t_rttlow = ticks - tp->t_rtttime;
		tcp_xmit_timer(tp,
		    ticks - tp->t_rtttime);
	}
	if (winup_only == 0) {
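		/*
		 * Descriptive note (added): the ACK advanced snd_una rather
		 * than only updating the window, so account for the newly
		 * acknowledged data; BYTES_THIS_ACK() is th_ack - snd_una.
		 */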
		acked = BYTES_THIS_ACK(tp, th);

		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
		hhook_run_tcp_est_in(tp, th, to);

		TCPSTAT_ADD(tcps_rcvackbyte, acked);
		sbdrop(&so->so_snd, acked);
		if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
		    SEQ_LEQ(th->th_ack, tp->snd_recover))
			tp->snd_recover = th->th_ack - 1;

		/*
		 * Let the congestion control algorithm update
		 * congestion control related information. This
		 * typically means increasing the congestion
		 * window.
		 */
		cc_ack_received(tp, th, CC_ACK);

		tp->snd_una = th->th_ack;
		/*
		 * Pull snd_wl2 up to prevent seq wrap relative
		 * to th_ack.
		 */
		tp->snd_wl2 = th->th_ack;
		tp->t_dupacks = 0;

		/*
		 * If all outstanding data are acked, stop
		 * retransmit timer, otherwise restart timer
		 * using current (possibly backed-off) value.
		 * If process is waiting for space,
		 * wakeup/selwakeup/signal.  If data
		 * are ready to send, let tcp_output
		 * decide between more output or persist.
		 */
#ifdef TCPDEBUG
		if (so->so_options & SO_DEBUG)
			tcp_trace(TA_INPUT, ostate, tp,
			    (void *)tcp_saveipgen,
			    &tcp_savetcp, 0);
#endif
		TCP_PROBE3(debug__input, tp, th, m);
		m_freem(m);
		if (tp->snd_una == tp->snd_max)
			tcp_timer_activate(tp, TT_REXMT, 0);
		else if (!tcp_timer_active(tp, TT_PERSIST))
			tcp_timer_activate(tp, TT_REXMT,
			    tp->t_rxtcur);
	} else {
		/*
		 * Window update only, just free the mbufs and
		 * send out whatever we can.
		 */
		m_freem(m);
	}
	sowwakeup(so);
	if (sbavail(&so->so_snd))
		(void) tcp_output(tp);
	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
	    __func__, ti_locked));
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (tp->t_flags & TF_DELACK) {
		tp->t_flags &= ~TF_DELACK;
		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
	}
	INP_WUNLOCK(tp->t_inpcb);
}

/*
 * Here nothing is really faster, it's just that we
 * have broken out the fast-data path also just like
 * the fast-ack.
 */
static void
tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
		   struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
		   int ti_locked, u_long tiwin)
{
	int newsize = 0;	/* automatic sockbuf scaling */
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif
	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record the timestamp.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}

	/*
	 * This is a pure, in-sequence data packet with
	 * nothing on the reassembly queue and we have enough
	 * buffer space to take it.
	 */
	if (ti_locked == TI_RLOCKED) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
	}
	ti_locked = TI_UNLOCKED;

	/* Clean receiver SACK report if present */
	if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
		tcp_clean_sackreport(tp);
	TCPSTAT_INC(tcps_preddat);
	tp->rcv_nxt += tlen;
	/*
	 * Pull snd_wl1 up to prevent seq wrap relative to
	 * th_seq.
	 */
	tp->snd_wl1 = th->th_seq;
	/*
	 * Pull rcv_up up to prevent seq wrap relative to
	 * rcv_nxt.
386 */ 387 tp->rcv_up = tp->rcv_nxt; 388 TCPSTAT_ADD(tcps_rcvbyte, tlen); 389#ifdef TCPDEBUG 390 if (so->so_options & SO_DEBUG) 391 tcp_trace(TA_INPUT, ostate, tp, 392 (void *)tcp_saveipgen, &tcp_savetcp, 0); 393#endif 394 TCP_PROBE3(debug__input, tp, th, m); 395 396 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 397 398 /* Add data to socket buffer. */ 399 SOCKBUF_LOCK(&so->so_rcv); 400 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 401 m_freem(m); 402 } else { 403 /* 404 * Set new socket buffer size. 405 * Give up when limit is reached. 406 */ 407 if (newsize) 408 if (!sbreserve_locked(&so->so_rcv, 409 newsize, so, NULL)) 410 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 411 m_adj(m, drop_hdrlen); /* delayed header drop */ 412 sbappendstream_locked(&so->so_rcv, m, 0); 413 } 414 /* NB: sorwakeup_locked() does an implicit unlock. */ 415 sorwakeup_locked(so); 416 if (DELAY_ACK(tp, tlen)) { 417 tp->t_flags |= TF_DELACK; 418 } else { 419 tp->t_flags |= TF_ACKNOW; 420 tcp_output(tp); 421 } 422 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 423 __func__, ti_locked)); 424 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 425 INP_WLOCK_ASSERT(tp->t_inpcb); 426 427 if (tp->t_flags & TF_DELACK) { 428 tp->t_flags &= ~TF_DELACK; 429 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 430 } 431 INP_WUNLOCK(tp->t_inpcb); 432} 433 434/* 435 * The slow-path is the clone of the long long part 436 * of tcp_do_segment past all the fast-path stuff. We 437 * use it here by two different callers, the fast/slow and 438 * the fastack only. 439 */ 440static void 441tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so, 442 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 443 int ti_locked, u_long tiwin, int thflags) 444{ 445 int acked, ourfinisacked, needoutput = 0; 446 int rstreason, todrop, win; 447 char *s; 448 struct in_conninfo *inc; 449 struct mbuf *mfree = NULL; 450#ifdef TCPDEBUG 451 /* 452 * The size of tcp_saveipgen must be the size of the max ip header, 453 * now IPv6. 454 */ 455 u_char tcp_saveipgen[IP6_HDR_LEN]; 456 struct tcphdr tcp_savetcp; 457 short ostate = 0; 458#endif 459 /* 460 * Calculate amount of space in receive window, 461 * and then do TCP input processing. 462 * Receive window is amount of space in rcv queue, 463 * but not less than advertised window. 464 */ 465 inc = &tp->t_inpcb->inp_inc; 466 win = sbspace(&so->so_rcv); 467 if (win < 0) 468 win = 0; 469 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 470 471 switch (tp->t_state) { 472 473 /* 474 * If the state is SYN_RECEIVED: 475 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 476 */ 477 case TCPS_SYN_RECEIVED: 478 if ((thflags & TH_ACK) && 479 (SEQ_LEQ(th->th_ack, tp->snd_una) || 480 SEQ_GT(th->th_ack, tp->snd_max))) { 481 rstreason = BANDLIM_RST_OPENPORT; 482 goto dropwithreset; 483 } 484 break; 485 486 /* 487 * If the state is SYN_SENT: 488 * if seg contains a RST, then drop the connection. 489 * if seg does not contain SYN, then drop it. 490 * Otherwise this is an acceptable SYN segment 491 * initialize tp->rcv_nxt and tp->irs 492 * if seg contains ack then advance tp->snd_una 493 * if seg contains an ECE and ECN support is enabled, the stream 494 * is ECN capable. 
495 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 496 * arrange for segment to be acked (eventually) 497 * continue processing rest of data/controls, beginning with URG 498 */ 499 case TCPS_SYN_SENT: 500 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { 501 TCP_PROBE5(connect__refused, NULL, tp, m, tp, th); 502 tp = tcp_drop(tp, ECONNREFUSED); 503 } 504 if (thflags & TH_RST) 505 goto drop; 506 if (!(thflags & TH_SYN)) 507 goto drop; 508 509 tp->irs = th->th_seq; 510 tcp_rcvseqinit(tp); 511 if (thflags & TH_ACK) { 512 TCPSTAT_INC(tcps_connects); 513 soisconnected(so); 514#ifdef MAC 515 mac_socketpeer_set_from_mbuf(m, so); 516#endif 517 /* Do window scaling on this connection? */ 518 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 519 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 520 tp->rcv_scale = tp->request_r_scale; 521 } 522 tp->rcv_adv += imin(tp->rcv_wnd, 523 TCP_MAXWIN << tp->rcv_scale); 524 tp->snd_una++; /* SYN is acked */ 525 /* 526 * If there's data, delay ACK; if there's also a FIN 527 * ACKNOW will be turned on later. 528 */ 529 if (DELAY_ACK(tp, tlen) && tlen != 0) 530 tcp_timer_activate(tp, TT_DELACK, 531 tcp_delacktime); 532 else 533 tp->t_flags |= TF_ACKNOW; 534 535 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 536 tp->t_flags |= TF_ECN_PERMIT; 537 TCPSTAT_INC(tcps_ecn_shs); 538 } 539 540 /* 541 * Received <SYN,ACK> in SYN_SENT[*] state. 542 * Transitions: 543 * SYN_SENT --> ESTABLISHED 544 * SYN_SENT* --> FIN_WAIT_1 545 */ 546 tp->t_starttime = ticks; 547 if (tp->t_flags & TF_NEEDFIN) { 548 tcp_state_change(tp, TCPS_FIN_WAIT_1); 549 tp->t_flags &= ~TF_NEEDFIN; 550 thflags &= ~TH_SYN; 551 } else { 552 tcp_state_change(tp, TCPS_ESTABLISHED); 553 TCP_PROBE5(connect__established, NULL, tp, 554 m, tp, th); 555 cc_conn_init(tp); 556 tcp_timer_activate(tp, TT_KEEP, 557 TP_KEEPIDLE(tp)); 558 } 559 } else { 560 /* 561 * Received initial SYN in SYN-SENT[*] state => 562 * simultaneous open. 563 * If it succeeds, connection is * half-synchronized. 564 * Otherwise, do 3-way handshake: 565 * SYN-SENT -> SYN-RECEIVED 566 * SYN-SENT* -> SYN-RECEIVED* 567 */ 568 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 569 tcp_timer_activate(tp, TT_REXMT, 0); 570 tcp_state_change(tp, TCPS_SYN_RECEIVED); 571 } 572 573 KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " 574 "ti_locked %d", __func__, ti_locked)); 575 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 576 INP_WLOCK_ASSERT(tp->t_inpcb); 577 578 /* 579 * Advance th->th_seq to correspond to first data byte. 580 * If data, trim to stay within window, 581 * dropping FIN if necessary. 582 */ 583 th->th_seq++; 584 if (tlen > tp->rcv_wnd) { 585 todrop = tlen - tp->rcv_wnd; 586 m_adj(m, -todrop); 587 tlen = tp->rcv_wnd; 588 thflags &= ~TH_FIN; 589 TCPSTAT_INC(tcps_rcvpackafterwin); 590 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 591 } 592 tp->snd_wl1 = th->th_seq - 1; 593 tp->rcv_up = th->th_seq; 594 /* 595 * Client side of transaction: already sent SYN and data. 596 * If the remote host used T/TCP to validate the SYN, 597 * our data will be ACK'd; if so, enter normal data segment 598 * processing in the middle of step 5, ack processing. 599 * Otherwise, goto step 6. 600 */ 601 if (thflags & TH_ACK) 602 goto process_ACK; 603 604 goto step6; 605 606 /* 607 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 608 * do normal processing. 609 * 610 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 
611 */ 612 case TCPS_LAST_ACK: 613 case TCPS_CLOSING: 614 break; /* continue normal processing */ 615 } 616 617 /* 618 * States other than LISTEN or SYN_SENT. 619 * First check the RST flag and sequence number since reset segments 620 * are exempt from the timestamp and connection count tests. This 621 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 622 * below which allowed reset segments in half the sequence space 623 * to fall though and be processed (which gives forged reset 624 * segments with a random sequence number a 50 percent chance of 625 * killing a connection). 626 * Then check timestamp, if present. 627 * Then check the connection count, if present. 628 * Then check that at least some bytes of segment are within 629 * receive window. If segment begins before rcv_nxt, 630 * drop leading data (and SYN); if nothing left, just ack. 631 */ 632 if (thflags & TH_RST) { 633 /* 634 * RFC5961 Section 3.2 635 * 636 * - RST drops connection only if SEG.SEQ == RCV.NXT. 637 * - If RST is in window, we send challenge ACK. 638 * 639 * Note: to take into account delayed ACKs, we should 640 * test against last_ack_sent instead of rcv_nxt. 641 * Note 2: we handle special case of closed window, not 642 * covered by the RFC. 643 */ 644 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 645 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 646 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 647 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 648 KASSERT(ti_locked == TI_RLOCKED, 649 ("%s: TH_RST ti_locked %d, th %p tp %p", 650 __func__, ti_locked, th, tp)); 651 KASSERT(tp->t_state != TCPS_SYN_SENT, 652 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 653 __func__, th, tp)); 654 655 if (V_tcp_insecure_rst || 656 tp->last_ack_sent == th->th_seq) { 657 TCPSTAT_INC(tcps_drops); 658 /* Drop the connection. */ 659 switch (tp->t_state) { 660 case TCPS_SYN_RECEIVED: 661 so->so_error = ECONNREFUSED; 662 goto close; 663 case TCPS_ESTABLISHED: 664 case TCPS_FIN_WAIT_1: 665 case TCPS_FIN_WAIT_2: 666 case TCPS_CLOSE_WAIT: 667 case TCPS_CLOSING: 668 case TCPS_LAST_ACK: 669 so->so_error = ECONNRESET; 670 close: 671 /* FALLTHROUGH */ 672 default: 673 tp = tcp_close(tp); 674 } 675 } else { 676 TCPSTAT_INC(tcps_badrst); 677 /* Send challenge ACK. */ 678 tcp_respond(tp, mtod(m, void *), th, m, 679 tp->rcv_nxt, tp->snd_nxt, TH_ACK); 680 tp->last_ack_sent = tp->rcv_nxt; 681 m = NULL; 682 } 683 } 684 goto drop; 685 } 686 687 /* 688 * RFC5961 Section 4.2 689 * Send challenge ACK for any SYN in synchronized state. 690 */ 691 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { 692 KASSERT(ti_locked == TI_RLOCKED, 693 ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); 694 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 695 696 TCPSTAT_INC(tcps_badsyn); 697 if (V_tcp_insecure_syn && 698 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 699 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 700 tp = tcp_drop(tp, ECONNRESET); 701 rstreason = BANDLIM_UNLIMITED; 702 } else { 703 /* Send challenge ACK. */ 704 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 705 tp->snd_nxt, TH_ACK); 706 tp->last_ack_sent = tp->rcv_nxt; 707 m = NULL; 708 } 709 goto drop; 710 } 711 712 /* 713 * RFC 1323 PAWS: If we have a timestamp reply on this segment 714 * and it's less than ts_recent, drop it. 715 */ 716 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 717 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 718 719 /* Check to see if ts_recent is over 24 days old. 
		/* Check to see if ts_recent is over 24 days old.  */
		if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			TCPSTAT_INC(tcps_rcvduppack);
			TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
			TCPSTAT_INC(tcps_pawsdrop);
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}

	/*
	 * In the SYN-RECEIVED state, validate that the packet belongs to
	 * this connection before trimming the data to fit the receive
	 * window.  Check the sequence number versus IRS since we know
	 * the sequence numbers haven't wrapped.  This is a partial fix
	 * for the "LAND" DoS attack.
	 */
	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
		rstreason = BANDLIM_RST_OPENPORT;
		goto dropwithreset;
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (thflags & TH_SYN) {
			thflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				thflags &= ~TH_URG;
			todrop--;
		}
		/*
		 * Following if statement from Stevens, vol. 2, p. 960.
		 */
		if (todrop > tlen
		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the window.
			 * At this point the FIN must be a duplicate or out
			 * of sequence; drop it.
			 */
			thflags &= ~TH_FIN;

			/*
			 * Send an ACK to resynchronize and drop any data.
			 * But keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			TCPSTAT_INC(tcps_rcvduppack);
			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
		} else {
			TCPSTAT_INC(tcps_rcvpartduppack);
			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
		}
		drop_hdrlen += todrop;	/* drop from the top afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			thflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
		KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
		    "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);

		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
			    "after socket was closed, "
			    "sending RST and removing tcpcb\n",
			    s, __func__, tcpstates[tp->t_state], tlen);
			free(s, M_TCPLOG);
		}
		tp = tcp_close(tp);
		TCPSTAT_INC(tcps_rcvafterclose);
		rstreason = BANDLIM_UNLIMITED;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
827 */ 828 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 829 if (todrop > 0) { 830 TCPSTAT_INC(tcps_rcvpackafterwin); 831 if (todrop >= tlen) { 832 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 833 /* 834 * If window is closed can only take segments at 835 * window edge, and have to drop data and PUSH from 836 * incoming segments. Continue processing, but 837 * remember to ack. Otherwise, drop segment 838 * and ack. 839 */ 840 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 841 tp->t_flags |= TF_ACKNOW; 842 TCPSTAT_INC(tcps_rcvwinprobe); 843 } else 844 goto dropafterack; 845 } else 846 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 847 m_adj(m, -todrop); 848 tlen -= todrop; 849 thflags &= ~(TH_PUSH|TH_FIN); 850 } 851 852 /* 853 * If last ACK falls within this segment's sequence numbers, 854 * record its timestamp. 855 * NOTE: 856 * 1) That the test incorporates suggestions from the latest 857 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 858 * 2) That updating only on newer timestamps interferes with 859 * our earlier PAWS tests, so this check should be solely 860 * predicated on the sequence space of this segment. 861 * 3) That we modify the segment boundary check to be 862 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 863 * instead of RFC1323's 864 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 865 * This modified check allows us to overcome RFC1323's 866 * limitations as described in Stevens TCP/IP Illustrated 867 * Vol. 2 p.869. In such cases, we can still calculate the 868 * RTT correctly when RCV.NXT == Last.ACK.Sent. 869 */ 870 if ((to->to_flags & TOF_TS) != 0 && 871 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 872 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 873 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 874 tp->ts_recent_age = tcp_ts_getticks(); 875 tp->ts_recent = to->to_tsval; 876 } 877 878 /* 879 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 880 * flag is on (half-synchronized state), then queue data for 881 * later processing; else drop segment and return. 882 */ 883 if ((thflags & TH_ACK) == 0) { 884 if (tp->t_state == TCPS_SYN_RECEIVED || 885 (tp->t_flags & TF_NEEDSYN)) 886 goto step6; 887 else if (tp->t_flags & TF_ACKNOW) 888 goto dropafterack; 889 else 890 goto drop; 891 } 892 893 /* 894 * Ack processing. 895 */ 896 switch (tp->t_state) { 897 898 /* 899 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 900 * ESTABLISHED state and continue processing. 901 * The ACK was checked above. 902 */ 903 case TCPS_SYN_RECEIVED: 904 905 TCPSTAT_INC(tcps_connects); 906 soisconnected(so); 907 /* Do window scaling? */ 908 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 909 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 910 tp->rcv_scale = tp->request_r_scale; 911 tp->snd_wnd = tiwin; 912 } 913 /* 914 * Make transitions: 915 * SYN-RECEIVED -> ESTABLISHED 916 * SYN-RECEIVED* -> FIN-WAIT-1 917 */ 918 tp->t_starttime = ticks; 919 if (tp->t_flags & TF_NEEDFIN) { 920 tcp_state_change(tp, TCPS_FIN_WAIT_1); 921 tp->t_flags &= ~TF_NEEDFIN; 922 } else { 923 tcp_state_change(tp, TCPS_ESTABLISHED); 924 TCP_PROBE5(accept__established, NULL, tp, 925 m, tp, th); 926 cc_conn_init(tp); 927 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 928 } 929 /* 930 * If segment contains data or ACK, will call tcp_reass() 931 * later; if not, do so now to pass queued data to user. 
932 */ 933 if (tlen == 0 && (thflags & TH_FIN) == 0) 934 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 935 (struct mbuf *)0); 936 tp->snd_wl1 = th->th_seq - 1; 937 /* FALLTHROUGH */ 938 939 /* 940 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 941 * ACKs. If the ack is in the range 942 * tp->snd_una < th->th_ack <= tp->snd_max 943 * then advance tp->snd_una to th->th_ack and drop 944 * data from the retransmission queue. If this ACK reflects 945 * more up to date window information we update our window information. 946 */ 947 case TCPS_ESTABLISHED: 948 case TCPS_FIN_WAIT_1: 949 case TCPS_FIN_WAIT_2: 950 case TCPS_CLOSE_WAIT: 951 case TCPS_CLOSING: 952 case TCPS_LAST_ACK: 953 if (SEQ_GT(th->th_ack, tp->snd_max)) { 954 TCPSTAT_INC(tcps_rcvacktoomuch); 955 goto dropafterack; 956 } 957 if ((tp->t_flags & TF_SACK_PERMIT) && 958 ((to->to_flags & TOF_SACK) || 959 !TAILQ_EMPTY(&tp->snd_holes))) 960 tcp_sack_doack(tp, to, th->th_ack); 961 else 962 /* 963 * Reset the value so that previous (valid) value 964 * from the last ack with SACK doesn't get used. 965 */ 966 tp->sackhint.sacked_bytes = 0; 967 968 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 969 hhook_run_tcp_est_in(tp, th, to); 970 971 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 972 if (tlen == 0 && tiwin == tp->snd_wnd) { 973 /* 974 * If this is the first time we've seen a 975 * FIN from the remote, this is not a 976 * duplicate and it needs to be processed 977 * normally. This happens during a 978 * simultaneous close. 979 */ 980 if ((thflags & TH_FIN) && 981 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { 982 tp->t_dupacks = 0; 983 break; 984 } 985 TCPSTAT_INC(tcps_rcvdupack); 986 /* 987 * If we have outstanding data (other than 988 * a window probe), this is a completely 989 * duplicate ack (ie, window info didn't 990 * change and FIN isn't set), 991 * the ack is the biggest we've 992 * seen and we've seen exactly our rexmt 993 * threshold of them, assume a packet 994 * has been dropped and retransmit it. 995 * Kludge snd_nxt & the congestion 996 * window so we send only this one 997 * packet. 998 * 999 * We know we're losing at the current 1000 * window size so do congestion avoidance 1001 * (set ssthresh to half the current window 1002 * and pull our congestion window back to 1003 * the new ssthresh). 1004 * 1005 * Dup acks mean that packets have left the 1006 * network (they're now cached at the receiver) 1007 * so bump cwnd by the amount in the receiver 1008 * to keep a constant cwnd packets in the 1009 * network. 1010 * 1011 * When using TCP ECN, notify the peer that 1012 * we reduced the cwnd. 1013 */ 1014 if (!tcp_timer_active(tp, TT_REXMT) || 1015 th->th_ack != tp->snd_una) 1016 tp->t_dupacks = 0; 1017 else if (++tp->t_dupacks > tcprexmtthresh || 1018 IN_FASTRECOVERY(tp->t_flags)) { 1019 cc_ack_received(tp, th, CC_DUPACK); 1020 if ((tp->t_flags & TF_SACK_PERMIT) && 1021 IN_FASTRECOVERY(tp->t_flags)) { 1022 int awnd; 1023 1024 /* 1025 * Compute the amount of data in flight first. 1026 * We can inject new data into the pipe iff 1027 * we have less than 1/2 the original window's 1028 * worth of data in flight. 
1029 */ 1030 if (V_tcp_do_rfc6675_pipe) 1031 awnd = tcp_compute_pipe(tp); 1032 else 1033 awnd = (tp->snd_nxt - tp->snd_fack) + 1034 tp->sackhint.sack_bytes_rexmit; 1035 1036 if (awnd < tp->snd_ssthresh) { 1037 tp->snd_cwnd += tp->t_maxseg; 1038 if (tp->snd_cwnd > tp->snd_ssthresh) 1039 tp->snd_cwnd = tp->snd_ssthresh; 1040 } 1041 } else 1042 tp->snd_cwnd += tp->t_maxseg; 1043 (void) tp->t_fb->tfb_tcp_output(tp); 1044 goto drop; 1045 } else if (tp->t_dupacks == tcprexmtthresh) { 1046 tcp_seq onxt = tp->snd_nxt; 1047 1048 /* 1049 * If we're doing sack, check to 1050 * see if we're already in sack 1051 * recovery. If we're not doing sack, 1052 * check to see if we're in newreno 1053 * recovery. 1054 */ 1055 if (tp->t_flags & TF_SACK_PERMIT) { 1056 if (IN_FASTRECOVERY(tp->t_flags)) { 1057 tp->t_dupacks = 0; 1058 break; 1059 } 1060 } else { 1061 if (SEQ_LEQ(th->th_ack, 1062 tp->snd_recover)) { 1063 tp->t_dupacks = 0; 1064 break; 1065 } 1066 } 1067 /* Congestion signal before ack. */ 1068 cc_cong_signal(tp, th, CC_NDUPACK); 1069 cc_ack_received(tp, th, CC_DUPACK); 1070 tcp_timer_activate(tp, TT_REXMT, 0); 1071 tp->t_rtttime = 0; 1072 if (tp->t_flags & TF_SACK_PERMIT) { 1073 TCPSTAT_INC( 1074 tcps_sack_recovery_episode); 1075 tp->sack_newdata = tp->snd_nxt; 1076 tp->snd_cwnd = tp->t_maxseg; 1077 (void) tp->t_fb->tfb_tcp_output(tp); 1078 goto drop; 1079 } 1080 tp->snd_nxt = th->th_ack; 1081 tp->snd_cwnd = tp->t_maxseg; 1082 (void) tp->t_fb->tfb_tcp_output(tp); 1083 KASSERT(tp->snd_limited <= 2, 1084 ("%s: tp->snd_limited too big", 1085 __func__)); 1086 tp->snd_cwnd = tp->snd_ssthresh + 1087 tp->t_maxseg * 1088 (tp->t_dupacks - tp->snd_limited); 1089 if (SEQ_GT(onxt, tp->snd_nxt)) 1090 tp->snd_nxt = onxt; 1091 goto drop; 1092 } else if (V_tcp_do_rfc3042) { 1093 /* 1094 * Process first and second duplicate 1095 * ACKs. Each indicates a segment 1096 * leaving the network, creating room 1097 * for more. Make sure we can send a 1098 * packet on reception of each duplicate 1099 * ACK by increasing snd_cwnd by one 1100 * segment. Restore the original 1101 * snd_cwnd after packet transmission. 1102 */ 1103 cc_ack_received(tp, th, CC_DUPACK); 1104 u_long oldcwnd = tp->snd_cwnd; 1105 tcp_seq oldsndmax = tp->snd_max; 1106 u_int sent; 1107 int avail; 1108 1109 KASSERT(tp->t_dupacks == 1 || 1110 tp->t_dupacks == 2, 1111 ("%s: dupacks not 1 or 2", 1112 __func__)); 1113 if (tp->t_dupacks == 1) 1114 tp->snd_limited = 0; 1115 tp->snd_cwnd = 1116 (tp->snd_nxt - tp->snd_una) + 1117 (tp->t_dupacks - tp->snd_limited) * 1118 tp->t_maxseg; 1119 /* 1120 * Only call tcp_output when there 1121 * is new data available to be sent. 1122 * Otherwise we would send pure ACKs. 1123 */ 1124 SOCKBUF_LOCK(&so->so_snd); 1125 avail = sbavail(&so->so_snd) - 1126 (tp->snd_nxt - tp->snd_una); 1127 SOCKBUF_UNLOCK(&so->so_snd); 1128 if (avail > 0) 1129 (void) tp->t_fb->tfb_tcp_output(tp); 1130 sent = tp->snd_max - oldsndmax; 1131 if (sent > tp->t_maxseg) { 1132 KASSERT((tp->t_dupacks == 2 && 1133 tp->snd_limited == 0) || 1134 (sent == tp->t_maxseg + 1 && 1135 tp->t_flags & TF_SENTFIN), 1136 ("%s: sent too much", 1137 __func__)); 1138 tp->snd_limited = 2; 1139 } else if (sent > 0) 1140 ++tp->snd_limited; 1141 tp->snd_cwnd = oldcwnd; 1142 goto drop; 1143 } 1144 } else 1145 tp->t_dupacks = 0; 1146 break; 1147 } 1148 1149 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 1150 ("%s: th_ack <= snd_una", __func__)); 1151 1152 /* 1153 * If the congestion window was inflated to account 1154 * for the other side's cached packets, retract it. 
1155 */ 1156 if (IN_FASTRECOVERY(tp->t_flags)) { 1157 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 1158 if (tp->t_flags & TF_SACK_PERMIT) 1159 tcp_sack_partialack(tp, th); 1160 else 1161 tcp_newreno_partial_ack(tp, th); 1162 } else 1163 cc_post_recovery(tp, th); 1164 } 1165 tp->t_dupacks = 0; 1166 /* 1167 * If we reach this point, ACK is not a duplicate, 1168 * i.e., it ACKs something we sent. 1169 */ 1170 if (tp->t_flags & TF_NEEDSYN) { 1171 /* 1172 * T/TCP: Connection was half-synchronized, and our 1173 * SYN has been ACK'd (so connection is now fully 1174 * synchronized). Go to non-starred state, 1175 * increment snd_una for ACK of SYN, and check if 1176 * we can do window scaling. 1177 */ 1178 tp->t_flags &= ~TF_NEEDSYN; 1179 tp->snd_una++; 1180 /* Do window scaling? */ 1181 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1182 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1183 tp->rcv_scale = tp->request_r_scale; 1184 /* Send window already scaled. */ 1185 } 1186 } 1187 1188process_ACK: 1189 INP_WLOCK_ASSERT(tp->t_inpcb); 1190 1191 acked = BYTES_THIS_ACK(tp, th); 1192 TCPSTAT_INC(tcps_rcvackpack); 1193 TCPSTAT_ADD(tcps_rcvackbyte, acked); 1194 1195 /* 1196 * If we just performed our first retransmit, and the ACK 1197 * arrives within our recovery window, then it was a mistake 1198 * to do the retransmit in the first place. Recover our 1199 * original cwnd and ssthresh, and proceed to transmit where 1200 * we left off. 1201 */ 1202 if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && 1203 (int)(ticks - tp->t_badrxtwin) < 0) 1204 cc_cong_signal(tp, th, CC_RTO_ERR); 1205 1206 /* 1207 * If we have a timestamp reply, update smoothed 1208 * round trip time. If no timestamp is present but 1209 * transmit timer is running and timed sequence 1210 * number was acked, update smoothed round trip time. 1211 * Since we now have an rtt measurement, cancel the 1212 * timer backoff (cf., Phil Karn's retransmit alg.). 1213 * Recompute the initial retransmit timer. 1214 * 1215 * Some boxes send broken timestamp replies 1216 * during the SYN+ACK phase, ignore 1217 * timestamps of 0 or we could calculate a 1218 * huge RTT and blow up the retransmit timer. 1219 */ 1220 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 1221 u_int t; 1222 1223 t = tcp_ts_getticks() - to->to_tsecr; 1224 if (!tp->t_rttlow || tp->t_rttlow > t) 1225 tp->t_rttlow = t; 1226 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); 1227 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 1228 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 1229 tp->t_rttlow = ticks - tp->t_rtttime; 1230 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 1231 } 1232 1233 /* 1234 * If all outstanding data is acked, stop retransmit 1235 * timer and remember to restart (more output or persist). 1236 * If there is more data to be acked, restart retransmit 1237 * timer, using current (possibly backed-off) value. 1238 */ 1239 if (th->th_ack == tp->snd_max) { 1240 tcp_timer_activate(tp, TT_REXMT, 0); 1241 needoutput = 1; 1242 } else if (!tcp_timer_active(tp, TT_PERSIST)) 1243 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 1244 1245 /* 1246 * If no data (only SYN) was ACK'd, 1247 * skip rest of ACK processing. 1248 */ 1249 if (acked == 0) 1250 goto step6; 1251 1252 /* 1253 * Let the congestion control algorithm update congestion 1254 * control related information. This typically means increasing 1255 * the congestion window. 
1256 */ 1257 cc_ack_received(tp, th, CC_ACK); 1258 1259 SOCKBUF_LOCK(&so->so_snd); 1260 if (acked > sbavail(&so->so_snd)) { 1261 tp->snd_wnd -= sbavail(&so->so_snd); 1262 mfree = sbcut_locked(&so->so_snd, 1263 (int)sbavail(&so->so_snd)); 1264 ourfinisacked = 1; 1265 } else { 1266 mfree = sbcut_locked(&so->so_snd, acked); 1267 tp->snd_wnd -= acked; 1268 ourfinisacked = 0; 1269 } 1270 /* NB: sowwakeup_locked() does an implicit unlock. */ 1271 sowwakeup_locked(so); 1272 m_freem(mfree); 1273 /* Detect una wraparound. */ 1274 if (!IN_RECOVERY(tp->t_flags) && 1275 SEQ_GT(tp->snd_una, tp->snd_recover) && 1276 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1277 tp->snd_recover = th->th_ack - 1; 1278 /* XXXLAS: Can this be moved up into cc_post_recovery? */ 1279 if (IN_RECOVERY(tp->t_flags) && 1280 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 1281 EXIT_RECOVERY(tp->t_flags); 1282 } 1283 tp->snd_una = th->th_ack; 1284 if (tp->t_flags & TF_SACK_PERMIT) { 1285 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 1286 tp->snd_recover = tp->snd_una; 1287 } 1288 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1289 tp->snd_nxt = tp->snd_una; 1290 1291 switch (tp->t_state) { 1292 1293 /* 1294 * In FIN_WAIT_1 STATE in addition to the processing 1295 * for the ESTABLISHED state if our FIN is now acknowledged 1296 * then enter FIN_WAIT_2. 1297 */ 1298 case TCPS_FIN_WAIT_1: 1299 if (ourfinisacked) { 1300 /* 1301 * If we can't receive any more 1302 * data, then closing user can proceed. 1303 * Starting the timer is contrary to the 1304 * specification, but if we don't get a FIN 1305 * we'll hang forever. 1306 * 1307 * XXXjl: 1308 * we should release the tp also, and use a 1309 * compressed state. 1310 */ 1311 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1312 soisdisconnected(so); 1313 tcp_timer_activate(tp, TT_2MSL, 1314 (tcp_fast_finwait2_recycle ? 1315 tcp_finwait2_timeout : 1316 TP_MAXIDLE(tp))); 1317 } 1318 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1319 } 1320 break; 1321 1322 /* 1323 * In CLOSING STATE in addition to the processing for 1324 * the ESTABLISHED state if the ACK acknowledges our FIN 1325 * then enter the TIME-WAIT state, otherwise ignore 1326 * the segment. 1327 */ 1328 case TCPS_CLOSING: 1329 if (ourfinisacked) { 1330 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1331 tcp_twstart(tp); 1332 INP_INFO_RUNLOCK(&V_tcbinfo); 1333 m_freem(m); 1334 return; 1335 } 1336 break; 1337 1338 /* 1339 * In LAST_ACK, we may still be waiting for data to drain 1340 * and/or to be acked, as well as for the ack of our FIN. 1341 * If our FIN is now acknowledged, delete the TCB, 1342 * enter the closed state and return. 1343 */ 1344 case TCPS_LAST_ACK: 1345 if (ourfinisacked) { 1346 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1347 tp = tcp_close(tp); 1348 goto drop; 1349 } 1350 break; 1351 } 1352 } 1353 1354step6: 1355 INP_WLOCK_ASSERT(tp->t_inpcb); 1356 1357 /* 1358 * Update window information. 1359 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1360 */ 1361 if ((thflags & TH_ACK) && 1362 (SEQ_LT(tp->snd_wl1, th->th_seq) || 1363 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 1364 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1365 /* keep track of pure window updates */ 1366 if (tlen == 0 && 1367 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1368 TCPSTAT_INC(tcps_rcvwinupd); 1369 tp->snd_wnd = tiwin; 1370 tp->snd_wl1 = th->th_seq; 1371 tp->snd_wl2 = th->th_ack; 1372 if (tp->snd_wnd > tp->max_sndwnd) 1373 tp->max_sndwnd = tp->snd_wnd; 1374 needoutput = 1; 1375 } 1376 1377 /* 1378 * Process segments with URG. 
1379 */ 1380 if ((thflags & TH_URG) && th->th_urp && 1381 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1382 /* 1383 * This is a kludge, but if we receive and accept 1384 * random urgent pointers, we'll crash in 1385 * soreceive. It's hard to imagine someone 1386 * actually wanting to send this much urgent data. 1387 */ 1388 SOCKBUF_LOCK(&so->so_rcv); 1389 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 1390 th->th_urp = 0; /* XXX */ 1391 thflags &= ~TH_URG; /* XXX */ 1392 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 1393 goto dodata; /* XXX */ 1394 } 1395 /* 1396 * If this segment advances the known urgent pointer, 1397 * then mark the data stream. This should not happen 1398 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1399 * a FIN has been received from the remote side. 1400 * In these states we ignore the URG. 1401 * 1402 * According to RFC961 (Assigned Protocols), 1403 * the urgent pointer points to the last octet 1404 * of urgent data. We continue, however, 1405 * to consider it to indicate the first octet 1406 * of data past the urgent section as the original 1407 * spec states (in one of two places). 1408 */ 1409 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1410 tp->rcv_up = th->th_seq + th->th_urp; 1411 so->so_oobmark = sbavail(&so->so_rcv) + 1412 (tp->rcv_up - tp->rcv_nxt) - 1; 1413 if (so->so_oobmark == 0) 1414 so->so_rcv.sb_state |= SBS_RCVATMARK; 1415 sohasoutofband(so); 1416 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1417 } 1418 SOCKBUF_UNLOCK(&so->so_rcv); 1419 /* 1420 * Remove out of band data so doesn't get presented to user. 1421 * This can happen independent of advancing the URG pointer, 1422 * but if two URG's are pending at once, some out-of-band 1423 * data may creep in... ick. 1424 */ 1425 if (th->th_urp <= (u_long)tlen && 1426 !(so->so_options & SO_OOBINLINE)) { 1427 /* hdr drop is delayed */ 1428 tcp_pulloutofband(so, th, m, drop_hdrlen); 1429 } 1430 } else { 1431 /* 1432 * If no out of band data is expected, 1433 * pull receive urgent pointer along 1434 * with the receive window. 1435 */ 1436 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1437 tp->rcv_up = tp->rcv_nxt; 1438 } 1439dodata: /* XXX */ 1440 INP_WLOCK_ASSERT(tp->t_inpcb); 1441 1442 /* 1443 * Process the segment text, merging it into the TCP sequencing queue, 1444 * and arranging for acknowledgment of receipt if necessary. 1445 * This process logically involves adjusting tp->rcv_wnd as data 1446 * is presented to the user (this happens in tcp_usrreq.c, 1447 * case PRU_RCVD). If a FIN has already been received on this 1448 * connection then we just ignore the text. 1449 */ 1450 if ((tlen || (thflags & TH_FIN)) && 1451 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1452 tcp_seq save_start = th->th_seq; 1453 m_adj(m, drop_hdrlen); /* delayed header drop */ 1454 /* 1455 * Insert segment which includes th into TCP reassembly queue 1456 * with control block tp. Set thflags to whether reassembly now 1457 * includes a segment with FIN. This handles the common case 1458 * inline (segment is the next to be received on an established 1459 * connection, and the queue is empty), avoiding linkage into 1460 * and removal from the queue and repetition of various 1461 * conversions. 1462 * Set DELACK for segments received in order, but ack 1463 * immediately when segments are out of order (so 1464 * fast retransmit can work). 
1465 */ 1466 if (th->th_seq == tp->rcv_nxt && 1467 SEGQ_EMPTY(tp) && 1468 TCPS_HAVEESTABLISHED(tp->t_state)) { 1469 if (DELAY_ACK(tp, tlen)) 1470 tp->t_flags |= TF_DELACK; 1471 else 1472 tp->t_flags |= TF_ACKNOW; 1473 tp->rcv_nxt += tlen; 1474 thflags = th->th_flags & TH_FIN; 1475 TCPSTAT_INC(tcps_rcvpack); 1476 TCPSTAT_ADD(tcps_rcvbyte, tlen); 1477 SOCKBUF_LOCK(&so->so_rcv); 1478 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1479 m_freem(m); 1480 else 1481 sbappendstream_locked(&so->so_rcv, m, 0); 1482 /* NB: sorwakeup_locked() does an implicit unlock. */ 1483 sorwakeup_locked(so); 1484 } else { 1485 /* 1486 * XXX: Due to the header drop above "th" is 1487 * theoretically invalid by now. Fortunately 1488 * m_adj() doesn't actually frees any mbufs 1489 * when trimming from the head. 1490 */ 1491 thflags = tcp_reass(tp, th, &save_start, &tlen, m); 1492 tp->t_flags |= TF_ACKNOW; 1493 } 1494 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 1495 tcp_update_sack_list(tp, save_start, save_start + tlen); 1496#if 0 1497 /* 1498 * Note the amount of data that peer has sent into 1499 * our window, in order to estimate the sender's 1500 * buffer size. 1501 * XXX: Unused. 1502 */ 1503 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) 1504 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1505 else 1506 len = so->so_rcv.sb_hiwat; 1507#endif 1508 } else { 1509 m_freem(m); 1510 thflags &= ~TH_FIN; 1511 } 1512 1513 /* 1514 * If FIN is received ACK the FIN and let the user know 1515 * that the connection is closing. 1516 */ 1517 if (thflags & TH_FIN) { 1518 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1519 socantrcvmore(so); 1520 /* 1521 * If connection is half-synchronized 1522 * (ie NEEDSYN flag on) then delay ACK, 1523 * so it may be piggybacked when SYN is sent. 1524 * Otherwise, since we received a FIN then no 1525 * more input can be expected, send ACK now. 1526 */ 1527 if (tp->t_flags & TF_NEEDSYN) 1528 tp->t_flags |= TF_DELACK; 1529 else 1530 tp->t_flags |= TF_ACKNOW; 1531 tp->rcv_nxt++; 1532 } 1533 switch (tp->t_state) { 1534 1535 /* 1536 * In SYN_RECEIVED and ESTABLISHED STATES 1537 * enter the CLOSE_WAIT state. 1538 */ 1539 case TCPS_SYN_RECEIVED: 1540 tp->t_starttime = ticks; 1541 /* FALLTHROUGH */ 1542 case TCPS_ESTABLISHED: 1543 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1544 break; 1545 1546 /* 1547 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1548 * enter the CLOSING state. 1549 */ 1550 case TCPS_FIN_WAIT_1: 1551 tcp_state_change(tp, TCPS_CLOSING); 1552 break; 1553 1554 /* 1555 * In FIN_WAIT_2 state enter the TIME_WAIT state, 1556 * starting the time-wait timer, turning off the other 1557 * standard timers. 1558 */ 1559 case TCPS_FIN_WAIT_2: 1560 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1561 KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " 1562 "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 1563 ti_locked)); 1564 1565 tcp_twstart(tp); 1566 INP_INFO_RUNLOCK(&V_tcbinfo); 1567 return; 1568 } 1569 } 1570 if (ti_locked == TI_RLOCKED) { 1571 INP_INFO_RUNLOCK(&V_tcbinfo); 1572 } 1573 ti_locked = TI_UNLOCKED; 1574 1575#ifdef TCPDEBUG 1576 if (so->so_options & SO_DEBUG) 1577 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 1578 &tcp_savetcp, 0); 1579#endif 1580 TCP_PROBE3(debug__input, tp, th, m); 1581 1582 /* 1583 * Return any desired output. 
1584 */ 1585 if (needoutput || (tp->t_flags & TF_ACKNOW)) 1586 (void) tp->t_fb->tfb_tcp_output(tp); 1587 1588 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 1589 __func__, ti_locked)); 1590 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1591 INP_WLOCK_ASSERT(tp->t_inpcb); 1592 1593 if (tp->t_flags & TF_DELACK) { 1594 tp->t_flags &= ~TF_DELACK; 1595 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 1596 } 1597 INP_WUNLOCK(tp->t_inpcb); 1598 return; 1599 1600dropafterack: 1601 /* 1602 * Generate an ACK dropping incoming segment if it occupies 1603 * sequence space, where the ACK reflects our state. 1604 * 1605 * We can now skip the test for the RST flag since all 1606 * paths to this code happen after packets containing 1607 * RST have been dropped. 1608 * 1609 * In the SYN-RECEIVED state, don't send an ACK unless the 1610 * segment we received passes the SYN-RECEIVED ACK test. 1611 * If it fails send a RST. This breaks the loop in the 1612 * "LAND" DoS attack, and also prevents an ACK storm 1613 * between two listening ports that have been sent forged 1614 * SYN segments, each with the source address of the other. 1615 */ 1616 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 1617 (SEQ_GT(tp->snd_una, th->th_ack) || 1618 SEQ_GT(th->th_ack, tp->snd_max)) ) { 1619 rstreason = BANDLIM_RST_OPENPORT; 1620 goto dropwithreset; 1621 } 1622#ifdef TCPDEBUG 1623 if (so->so_options & SO_DEBUG) 1624 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 1625 &tcp_savetcp, 0); 1626#endif 1627 TCP_PROBE3(debug__drop, tp, th, m); 1628 if (ti_locked == TI_RLOCKED) { 1629 INP_INFO_RUNLOCK(&V_tcbinfo); 1630 } 1631 ti_locked = TI_UNLOCKED; 1632 1633 tp->t_flags |= TF_ACKNOW; 1634 (void) tp->t_fb->tfb_tcp_output(tp); 1635 INP_WUNLOCK(tp->t_inpcb); 1636 m_freem(m); 1637 return; 1638 1639dropwithreset: 1640 if (ti_locked == TI_RLOCKED) { 1641 INP_INFO_RUNLOCK(&V_tcbinfo); 1642 } 1643 ti_locked = TI_UNLOCKED; 1644 1645 if (tp != NULL) { 1646 tcp_dropwithreset(m, th, tp, tlen, rstreason); 1647 INP_WUNLOCK(tp->t_inpcb); 1648 } else 1649 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1650 return; 1651 1652drop: 1653 if (ti_locked == TI_RLOCKED) { 1654 INP_INFO_RUNLOCK(&V_tcbinfo); 1655 ti_locked = TI_UNLOCKED; 1656 } 1657#ifdef INVARIANTS 1658 else 1659 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1660#endif 1661 1662 /* 1663 * Drop space held by incoming segment and return. 1664 */ 1665#ifdef TCPDEBUG 1666 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 1667 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 1668 &tcp_savetcp, 0); 1669#endif 1670 TCP_PROBE3(debug__drop, tp, th, m); 1671 if (tp != NULL) 1672 INP_WUNLOCK(tp->t_inpcb); 1673 m_freem(m); 1674} 1675 1676 1677/* 1678 * Do fast slow is a combination of the original 1679 * tcp_dosegment and a split fastpath, one function 1680 * for the fast-ack which also includes allowing fastpath 1681 * for window advanced in sequence acks. And also a 1682 * sub-function that handles the insequence data. 1683 */ 1684void 1685tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so, 1686 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, 1687 int ti_locked) 1688{ 1689 int thflags; 1690 u_long tiwin; 1691 char *s; 1692 int can_enter; 1693 struct in_conninfo *inc; 1694 struct tcpopt to; 1695 1696 thflags = th->th_flags; 1697 inc = &tp->t_inpcb->inp_inc; 1698 /* 1699 * If this is either a state-changing packet or current state isn't 1700 * established, we require a write lock on tcbinfo. 
	/*
	 * If this is either a state-changing packet or current state isn't
	 * established, we require a write lock on tcbinfo.  Otherwise, we
	 * allow the tcbinfo to be in either locked or unlocked, as the
	 * caller may have unnecessarily acquired a write lock due to a race.
	 */
	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
	    tp->t_state != TCPS_ESTABLISHED) {
		KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
		    "SYN/FIN/RST/!EST", __func__, ti_locked));
		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	} else {
#ifdef INVARIANTS
		if (ti_locked == TI_RLOCKED) {
			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
		} else {
			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
			    "ti_locked: %d", __func__, ti_locked));
			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
		}
#endif
	}
	INP_WLOCK_ASSERT(tp->t_inpcb);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: "
			    "SYN|FIN segment ignored (based on "
			    "sysctl setting)\n", s, __func__);
			free(s, M_TCPLOG);
		}
		if (ti_locked == TI_RLOCKED) {
			INP_INFO_RUNLOCK(&V_tcbinfo);
		}
		INP_WUNLOCK(tp->t_inpcb);
		m_freem(m);
		return;
	}

	/*
	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
	 * check SEQ.ACK first.
	 */
	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
		tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED);
		if (ti_locked == TI_RLOCKED) {
			INP_INFO_RUNLOCK(&V_tcbinfo);
		}
		INP_WUNLOCK(tp->t_inpcb);
		return;
	}

	tp->sackhint.last_sack_ack = 0;

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 * XXX: This should be done after segment
	 * validation to ignore broken/spoofed segs.
	 */
	tp->t_rcvtime = ticks;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));

	/*
	 * Unscale the window into a 32-bit value.
	 * For the SYN_SENT state the scale is zero.
	 */
	tiwin = th->th_win << tp->snd_scale;

	/*
	 * TCP ECN processing.
	 */
	if (tp->t_flags & TF_ECN_PERMIT) {
		if (thflags & TH_CWR)
			tp->t_flags &= ~TF_ECN_SND_ECE;
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			tp->t_flags |= TF_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_ce);
			break;
		case IPTOS_ECN_ECT0:
			TCPSTAT_INC(tcps_ecn_ect0);
			break;
		case IPTOS_ECN_ECT1:
			TCPSTAT_INC(tcps_ecn_ect1);
			break;
		}
		/* Congestion experienced. */
		if (thflags & TH_ECE) {
			cc_cong_signal(tp, th, CC_ECN);
		}
	}

	/*
	 * Parse options on any incoming segment.
	 */
	tcp_dooptions(&to, (u_char *)(th + 1),
	    (th->th_off << 2) - sizeof(struct tcphdr),
	    (thflags & TH_SYN) ? TO_SYN : 0);

	/*
	 * If echoed timestamp is later than the current time,
	 * fall back to non RFC1323 RTT calculation.  Normalize
	 * timestamp if syncookies were used when this connection
	 * was established.
	 */
	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
		to.to_tsecr -= tp->ts_offset;
		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
			to.to_tsecr = 0;
	}

	/*
	 * Process options only when we get SYN/ACK back.  The SYN case
The SYN case 1818 * for incoming connections is handled in tcp_syncache. 1819 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 1820 * or <SYN,ACK>) segment itself is never scaled. 1821 * XXX this is traditional behavior, may need to be cleaned up. 1822 */ 1823 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 1824 if ((to.to_flags & TOF_SCALE) && 1825 (tp->t_flags & TF_REQ_SCALE)) { 1826 tp->t_flags |= TF_RCVD_SCALE; 1827 tp->snd_scale = to.to_wscale; 1828 } 1829 /* 1830 * Initial send window. It will be updated with 1831 * the next incoming segment to the scaled value. 1832 */ 1833 tp->snd_wnd = th->th_win; 1834 if (to.to_flags & TOF_TS) { 1835 tp->t_flags |= TF_RCVD_TSTMP; 1836 tp->ts_recent = to.to_tsval; 1837 tp->ts_recent_age = tcp_ts_getticks(); 1838 } 1839 if (to.to_flags & TOF_MSS) 1840 tcp_mss(tp, to.to_mss); 1841 if ((tp->t_flags & TF_SACK_PERMIT) && 1842 (to.to_flags & TOF_SACKPERM) == 0) 1843 tp->t_flags &= ~TF_SACK_PERMIT; 1844 } 1845 1846 /* 1847 * If timestamps were negotiated during SYN/ACK they should 1848 * appear on every segment during this session and vice versa. 1849 */ 1850 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { 1851 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1852 log(LOG_DEBUG, "%s; %s: Timestamp missing, " 1853 "no action\n", s, __func__); 1854 free(s, M_TCPLOG); 1855 } 1856 } 1857 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { 1858 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1859 log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 1860 "no action\n", s, __func__); 1861 free(s, M_TCPLOG); 1862 } 1863 } 1864 1865 can_enter = 0; 1866 if (__predict_true((tlen == 0))) { 1867 /* 1868 * The ack moved forward and we have a window (non-zero) 1869 * <or> 1870 * The ack did not move forward, but the window increased. 1871 */ 1872 if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) || 1873 ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) { 1874 can_enter = 1; 1875 } 1876 } else { 1877 /* 1878 * Data incoming, use the old entry criteria 1879 * for fast-path with data. 1880 */ 1881 if ((tiwin && tiwin == tp->snd_wnd)) { 1882 can_enter = 1; 1883 } 1884 } 1885 /* 1886 * Header prediction: check for the two common cases 1887 * of a uni-directional data xfer. If the packet has 1888 * no control flags, is in-sequence, the window didn't 1889 * change and we're not retransmitting, it's a 1890 * candidate. If the length is zero and the ack moved 1891 * forward, we're the sender side of the xfer. Just 1892 * free the data acked & wake any higher level process 1893 * that was blocked waiting for space. If the length 1894 * is non-zero and the ack didn't move, we're the 1895 * receiver side. If we're getting packets in-order 1896 * (the reassembly queue is empty), add the data to 1897 * the socket buffer and note that we need a delayed ack. 1898 * Make sure that the hidden state-flags are also off. 1899 * Since we check for TCPS_ESTABLISHED first, it can only 1900 * be TH_NEEDSYN. 
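 *
 * In this stack the candidate test is split in two: "can_enter",
 * computed above, covers the ACK/window half (a pure ACK must move
 * snd_una forward with a non-zero window or be a pure window
 * increase, while data must arrive with a non-zero, unchanged
 * window), and the check below adds the classic conditions
 * (ESTABLISHED, in-sequence, only ACK set, not retransmitting,
 * empty reassembly queue, timestamp not older than ts_recent).
 * Pure ACKs that pass a few further checks go to tcp_do_fastack(),
 * in-sequence data that fits in the socket buffer goes to
 * tcp_do_fastnewdata(), and everything else to tcp_do_slowpath().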
1901 */ 1902 if (__predict_true(tp->t_state == TCPS_ESTABLISHED && 1903 th->th_seq == tp->rcv_nxt && 1904 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1905 tp->snd_nxt == tp->snd_max && 1906 can_enter && 1907 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 1908 SEGQ_EMPTY(tp) && 1909 ((to.to_flags & TOF_TS) == 0 || 1910 TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) { 1911 if (__predict_true((tlen == 0) && 1912 (SEQ_LEQ(th->th_ack, tp->snd_max) && 1913 !IN_RECOVERY(tp->t_flags) && 1914 (to.to_flags & TOF_SACK) == 0 && 1915 TAILQ_EMPTY(&tp->snd_holes)))) { 1916 /* We are done */ 1917 tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, 1918 ti_locked, tiwin); 1919 return; 1920 } else if ((tlen) && 1921 (th->th_ack == tp->snd_una && 1922 tlen <= sbspace(&so->so_rcv))) { 1923 tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen, 1924 ti_locked, tiwin); 1925 /* We are done */ 1926 return; 1927 } 1928 } 1929 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, 1930 ti_locked, tiwin, thflags); 1931} 1932 1933 1934/* 1935 * This subfunction is used to try to highly optimize the 1936 * fast path. We again allow window updates that are 1937 * in sequence to remain in the fast-path. We also add 1938 * in the __predict's to attempt to help the compiler. 1939 * Note that if we return a 0, then we can *not* process 1940 * it and the caller should push the packet into the 1941 * slow-path. 1942 */ 1943static int 1944tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 1945 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 1946 int ti_locked, u_long tiwin) 1947{ 1948 int acked; 1949 int winup_only=0; 1950#ifdef TCPDEBUG 1951 /* 1952 * The size of tcp_saveipgen must be the size of the max ip header, 1953 * now IPv6. 1954 */ 1955 u_char tcp_saveipgen[IP6_HDR_LEN]; 1956 struct tcphdr tcp_savetcp; 1957 short ostate = 0; 1958#endif 1959 1960 1961 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 1962 /* Old ack, behind (or duplicate to) the last one rcv'd */ 1963 return (0); 1964 } 1965 if (__predict_false(th->th_ack == tp->snd_una) && 1966 __predict_false(tiwin <= tp->snd_wnd)) { 1967 /* duplicate ack <or> a shrinking dup ack with shrinking window */ 1968 return (0); 1969 } 1970 if (__predict_false(tiwin == 0)) { 1971 /* zero window */ 1972 return (0); 1973 } 1974 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 1975 /* Above what we have sent? */ 1976 return (0); 1977 } 1978 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 1979 /* We are retransmitting */ 1980 return (0); 1981 } 1982 if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) { 1983 /* We need a SYN or a FIN, unlikely.. */ 1984 return (0); 1985 } 1986 if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 1987 /* Timestamp is behind .. old ack with seq wrap? */ 1988 return (0); 1989 } 1990 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 1991 /* Still recovering */ 1992 return (0); 1993 } 1994 if (__predict_false(to->to_flags & TOF_SACK)) { 1995 /* Sack included in the ack.. */ 1996 return (0); 1997 } 1998 if (!TAILQ_EMPTY(&tp->snd_holes)) { 1999 /* We have sack holes on our scoreboard */ 2000 return (0); 2001 } 2002 /* Ok if we reach here, we can process a fast-ack */ 2003 2004 /* Did the window get updated? 
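 * If it did, and snd_wl2 still matches th_ack while the window grew,
 * treat it as a pure window update (winup_only) so the ACK
 * bookkeeping further down can be skipped.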
*/ 2005 if (tiwin != tp->snd_wnd) { 2006 /* keep track of pure window updates */ 2007 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { 2008 winup_only = 1; 2009 TCPSTAT_INC(tcps_rcvwinupd); 2010 } 2011 tp->snd_wnd = tiwin; 2012 tp->snd_wl1 = th->th_seq; 2013 if (tp->snd_wnd > tp->max_sndwnd) 2014 tp->max_sndwnd = tp->snd_wnd; 2015 } 2016 /* 2017 * Pull snd_wl2 up to prevent seq wrap relative 2018 * to th_ack. 2019 */ 2020 tp->snd_wl2 = th->th_ack; 2021 /* 2022 * If last ACK falls within this segment's sequence numbers, 2023 * record the timestamp. 2024 * NOTE that the test is modified according to the latest 2025 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2026 */ 2027 if ((to->to_flags & TOF_TS) != 0 && 2028 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 2029 tp->ts_recent_age = tcp_ts_getticks(); 2030 tp->ts_recent = to->to_tsval; 2031 } 2032 /* 2033 * This is a pure ack for outstanding data. 2034 */ 2035 if (ti_locked == TI_RLOCKED) { 2036 INP_INFO_RUNLOCK(&V_tcbinfo); 2037 } 2038 ti_locked = TI_UNLOCKED; 2039 2040 TCPSTAT_INC(tcps_predack); 2041 2042 /* 2043 * "bad retransmit" recovery. 2044 */ 2045 if (tp->t_rxtshift == 1 && 2046 tp->t_flags & TF_PREVVALID && 2047 (int)(ticks - tp->t_badrxtwin) < 0) { 2048 cc_cong_signal(tp, th, CC_RTO_ERR); 2049 } 2050 2051 /* 2052 * Recalculate the transmit timer / rtt. 2053 * 2054 * Some boxes send broken timestamp replies 2055 * during the SYN+ACK phase, ignore 2056 * timestamps of 0 or we could calculate a 2057 * huge RTT and blow up the retransmit timer. 2058 */ 2059 if ((to->to_flags & TOF_TS) != 0 && 2060 to->to_tsecr) { 2061 u_int t; 2062 2063 t = tcp_ts_getticks() - to->to_tsecr; 2064 if (!tp->t_rttlow || tp->t_rttlow > t) 2065 tp->t_rttlow = t; 2066 tcp_xmit_timer(tp, 2067 TCP_TS_TO_TICKS(t) + 1); 2068 } else if (tp->t_rtttime && 2069 SEQ_GT(th->th_ack, tp->t_rtseq)) { 2070 if (!tp->t_rttlow || 2071 tp->t_rttlow > ticks - tp->t_rtttime) 2072 tp->t_rttlow = ticks - tp->t_rtttime; 2073 tcp_xmit_timer(tp, 2074 ticks - tp->t_rtttime); 2075 } 2076 if (winup_only == 0) { 2077 acked = BYTES_THIS_ACK(tp, th); 2078 2079 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 2080 hhook_run_tcp_est_in(tp, th, to); 2081 2082 TCPSTAT_ADD(tcps_rcvackbyte, acked); 2083 sbdrop(&so->so_snd, acked); 2084 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 2085 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2086 tp->snd_recover = th->th_ack - 1; 2087 2088 /* 2089 * Let the congestion control algorithm update 2090 * congestion control related information. This 2091 * typically means increasing the congestion 2092 * window. 2093 */ 2094 cc_ack_received(tp, th, CC_ACK); 2095 2096 tp->snd_una = th->th_ack; 2097 tp->t_dupacks = 0; 2098 2099 /* 2100 * If all outstanding data are acked, stop 2101 * retransmit timer, otherwise restart timer 2102 * using current (possibly backed-off) value. 2103 * If process is waiting for space, 2104 * wakeup/selwakeup/signal. If data 2105 * are ready to send, let tcp_output 2106 * decide between more output or persist. 
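 * (A pure window update, winup_only above, skips this bookkeeping
 * entirely: the mbuf is simply freed and we fall through to the
 * sbavail()/tcp_output() check below.)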
2107 */
2108#ifdef TCPDEBUG
2109 if (so->so_options & SO_DEBUG)
2110 tcp_trace(TA_INPUT, ostate, tp,
2111 (void *)tcp_saveipgen,
2112 &tcp_savetcp, 0);
2113#endif
2114 TCP_PROBE3(debug__input, tp, th, m);
2115 m_freem(m);
2116 if (tp->snd_una == tp->snd_max)
2117 tcp_timer_activate(tp, TT_REXMT, 0);
2118 else if (!tcp_timer_active(tp, TT_PERSIST))
2119 tcp_timer_activate(tp, TT_REXMT,
2120 tp->t_rxtcur);
2121 /* Wake up the socket if we have room to write more */
2122 sowwakeup(so);
2123 } else {
2124 /*
2125 * Window update only, just free the mbufs and
2126 * send out whatever we can.
2127 */
2128 m_freem(m);
2129 }
2130 if (sbavail(&so->so_snd))
2131 (void) tcp_output(tp);
2132 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
2133 __func__, ti_locked));
2134 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2135 INP_WLOCK_ASSERT(tp->t_inpcb);
2136
2137 if (tp->t_flags & TF_DELACK) {
2138 tp->t_flags &= ~TF_DELACK;
2139 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
2140 }
2141 INP_WUNLOCK(tp->t_inpcb);
2142 return (1);
2143}
2144
2145/*
2146 * This tcp_do_segment variant concentrates on making the fastest
2147 * possible ACK-processing path. It does not have a fast path for
2148 * data (it could, which would then eliminate the
2149 * need for fastslow above). For a content distributor with
2150 * large outgoing elephant flows and very little incoming
2151 * traffic, the lack of a data fast path hardly matters (since
2152 * little data comes in). The most important thing is
2153 * processing ACKs quickly and getting the rest of the data
2154 * out to the peer as quickly as possible. This routine
2155 * seems to be about 3% faster overall than the old
2156 * tcp_do_segment and keeps us in the fast path for many
2157 * more packets (by letting window updates stay in the fast path).
2158 */
2159void
2160tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
2161 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
2162 int ti_locked)
2163{
2164 int thflags;
2165 u_long tiwin;
2166 char *s;
2167 struct in_conninfo *inc;
2168 struct tcpopt to;
2169
2170 thflags = th->th_flags;
2171 inc = &tp->t_inpcb->inp_inc;
2172 /*
2173 * If this is either a state-changing packet or current state isn't
2174 * established, we require a write lock on tcbinfo. Otherwise, we
2175 * allow the tcbinfo to be either locked or unlocked, as the
2176 * caller may have unnecessarily acquired a write lock due to a race.
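 * The validation, ECN and option-processing preamble that follows is
 * identical to the one in tcp_do_segment_fastslow() above; the two
 * stacks differ only in how they enter and handle the fast path.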
2177 */ 2178 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 2179 tp->t_state != TCPS_ESTABLISHED) { 2180 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 2181 "SYN/FIN/RST/!EST", __func__, ti_locked)); 2182 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2183 } else { 2184#ifdef INVARIANTS 2185 if (ti_locked == TI_RLOCKED) { 2186 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2187 } else { 2188 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 2189 "ti_locked: %d", __func__, ti_locked)); 2190 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2191 } 2192#endif 2193 } 2194 INP_WLOCK_ASSERT(tp->t_inpcb); 2195 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 2196 __func__)); 2197 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 2198 __func__)); 2199 2200 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 2201 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2202 log(LOG_DEBUG, "%s; %s: " 2203 "SYN|FIN segment ignored (based on " 2204 "sysctl setting)\n", s, __func__); 2205 free(s, M_TCPLOG); 2206 } 2207 if (ti_locked == TI_RLOCKED) { 2208 INP_INFO_RUNLOCK(&V_tcbinfo); 2209 } 2210 INP_WUNLOCK(tp->t_inpcb); 2211 m_freem(m); 2212 return; 2213 } 2214 2215 /* 2216 * If a segment with the ACK-bit set arrives in the SYN-SENT state 2217 * check SEQ.ACK first. 2218 */ 2219 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 2220 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 2221 tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED); 2222 if (ti_locked == TI_RLOCKED) { 2223 INP_INFO_RUNLOCK(&V_tcbinfo); 2224 } 2225 INP_WUNLOCK(tp->t_inpcb); 2226 return; 2227 } 2228 2229 tp->sackhint.last_sack_ack = 0; 2230 2231 /* 2232 * Segment received on connection. 2233 * Reset idle time and keep-alive timer. 2234 * XXX: This should be done after segment 2235 * validation to ignore broken/spoofed segs. 2236 */ 2237 tp->t_rcvtime = ticks; 2238 if (TCPS_HAVEESTABLISHED(tp->t_state)) 2239 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 2240 2241 /* 2242 * Unscale the window into a 32-bit value. 2243 * For the SYN_SENT state the scale is zero. 2244 */ 2245 tiwin = th->th_win << tp->snd_scale; 2246 2247 /* 2248 * TCP ECN processing. 2249 */ 2250 if (tp->t_flags & TF_ECN_PERMIT) { 2251 if (thflags & TH_CWR) 2252 tp->t_flags &= ~TF_ECN_SND_ECE; 2253 switch (iptos & IPTOS_ECN_MASK) { 2254 case IPTOS_ECN_CE: 2255 tp->t_flags |= TF_ECN_SND_ECE; 2256 TCPSTAT_INC(tcps_ecn_ce); 2257 break; 2258 case IPTOS_ECN_ECT0: 2259 TCPSTAT_INC(tcps_ecn_ect0); 2260 break; 2261 case IPTOS_ECN_ECT1: 2262 TCPSTAT_INC(tcps_ecn_ect1); 2263 break; 2264 } 2265 /* Congestion experienced. */ 2266 if (thflags & TH_ECE) { 2267 cc_cong_signal(tp, th, CC_ECN); 2268 } 2269 } 2270 2271 /* 2272 * Parse options on any incoming segment. 2273 */ 2274 tcp_dooptions(&to, (u_char *)(th + 1), 2275 (th->th_off << 2) - sizeof(struct tcphdr), 2276 (thflags & TH_SYN) ? TO_SYN : 0); 2277 2278 /* 2279 * If echoed timestamp is later than the current time, 2280 * fall back to non RFC1323 RTT calculation. Normalize 2281 * timestamp if syncookies were used when this connection 2282 * was established. 2283 */ 2284 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 2285 to.to_tsecr -= tp->ts_offset; 2286 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) 2287 to.to_tsecr = 0; 2288 } 2289 2290 /* 2291 * Process options only when we get SYN/ACK back. The SYN case 2292 * for incoming connections is handled in tcp_syncache. 2293 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 2294 * or <SYN,ACK>) segment itself is never scaled. 
2295 * XXX this is traditional behavior, may need to be cleaned up. 2296 */ 2297 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 2298 if ((to.to_flags & TOF_SCALE) && 2299 (tp->t_flags & TF_REQ_SCALE)) { 2300 tp->t_flags |= TF_RCVD_SCALE; 2301 tp->snd_scale = to.to_wscale; 2302 } 2303 /* 2304 * Initial send window. It will be updated with 2305 * the next incoming segment to the scaled value. 2306 */ 2307 tp->snd_wnd = th->th_win; 2308 if (to.to_flags & TOF_TS) { 2309 tp->t_flags |= TF_RCVD_TSTMP; 2310 tp->ts_recent = to.to_tsval; 2311 tp->ts_recent_age = tcp_ts_getticks(); 2312 } 2313 if (to.to_flags & TOF_MSS) 2314 tcp_mss(tp, to.to_mss); 2315 if ((tp->t_flags & TF_SACK_PERMIT) && 2316 (to.to_flags & TOF_SACKPERM) == 0) 2317 tp->t_flags &= ~TF_SACK_PERMIT; 2318 } 2319 2320 /* 2321 * If timestamps were negotiated during SYN/ACK they should 2322 * appear on every segment during this session and vice versa. 2323 */ 2324 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { 2325 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2326 log(LOG_DEBUG, "%s; %s: Timestamp missing, " 2327 "no action\n", s, __func__); 2328 free(s, M_TCPLOG); 2329 } 2330 } 2331 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { 2332 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2333 log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 2334 "no action\n", s, __func__); 2335 free(s, M_TCPLOG); 2336 } 2337 } 2338 2339 /* 2340 * Header prediction: check for the two common cases 2341 * of a uni-directional data xfer. If the packet has 2342 * no control flags, is in-sequence, the window didn't 2343 * change and we're not retransmitting, it's a 2344 * candidate. If the length is zero and the ack moved 2345 * forward, we're the sender side of the xfer. Just 2346 * free the data acked & wake any higher level process 2347 * that was blocked waiting for space. If the length 2348 * is non-zero and the ack didn't move, we're the 2349 * receiver side. If we're getting packets in-order 2350 * (the reassembly queue is empty), add the data to 2351 * the socket buffer and note that we need a delayed ack. 2352 * Make sure that the hidden state-flags are also off. 2353 * Since we check for TCPS_ESTABLISHED first, it can only 2354 * be TH_NEEDSYN. 
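 *
 * For this ACK-only stack the prediction below is deliberately
 * minimal: ESTABLISHED, no SACK option, no data (tlen == 0), only
 * the ACK flag set, an empty reassembly queue and an in-sequence
 * segment.  The remaining checks (old or too-new ACK, zero or
 * shrunken window, retransmission in progress, pending SYN/FIN
 * flags, stale timestamp, loss recovery, a non-empty SACK
 * scoreboard) are made inside tcp_fastack(), which returns 0 to
 * push the segment onto the slow path.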
2355 */ 2356 if (__predict_true(tp->t_state == TCPS_ESTABLISHED) && 2357 __predict_true(((to.to_flags & TOF_SACK) == 0)) && 2358 __predict_true(tlen == 0) && 2359 __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) && 2360 __predict_true(SEGQ_EMPTY(tp)) && 2361 __predict_true(th->th_seq == tp->rcv_nxt)) { 2362 if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, 2363 ti_locked, tiwin)) { 2364 return; 2365 } 2366 } 2367 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, 2368 ti_locked, tiwin, thflags); 2369} 2370 2371struct tcp_function_block __tcp_fastslow = { 2372 .tfb_tcp_block_name = "fastslow", 2373 .tfb_tcp_output = tcp_output, 2374 .tfb_tcp_do_segment = tcp_do_segment_fastslow, 2375 .tfb_tcp_ctloutput = tcp_default_ctloutput, 2376}; 2377 2378struct tcp_function_block __tcp_fastack = { 2379 .tfb_tcp_block_name = "fastack", 2380 .tfb_tcp_output = tcp_output, 2381 .tfb_tcp_do_segment = tcp_do_segment_fastack, 2382 .tfb_tcp_ctloutput = tcp_default_ctloutput 2383}; 2384 2385static int 2386tcp_addfastpaths(module_t mod, int type, void *data) 2387{ 2388 int err=0; 2389 2390 switch (type) { 2391 case MOD_LOAD: 2392 err = register_tcp_functions(&__tcp_fastack, M_WAITOK); 2393 if (err) { 2394 printf("Failed to register fastack module -- err:%d\n", err); 2395 return(err); 2396 } 2397 err = register_tcp_functions(&__tcp_fastslow, M_WAITOK); 2398 if (err) { 2399 printf("Failed to register fastslow module -- err:%d\n", err); 2400 deregister_tcp_functions(&__tcp_fastack); 2401 return(err); 2402 } 2403 break; 2404 case MOD_QUIESCE: 2405 if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) { 2406 return(EBUSY); 2407 } 2408 break; 2409 case MOD_UNLOAD: 2410 err = deregister_tcp_functions(&__tcp_fastack); 2411 if (err == EBUSY) 2412 break; 2413 err = deregister_tcp_functions(&__tcp_fastslow); 2414 if (err == EBUSY) 2415 break; 2416 err = 0; 2417 break; 2418 default: 2419 return (EOPNOTSUPP); 2420 } 2421 return (err); 2422} 2423 2424static moduledata_t new_tcp_fastpaths = { 2425 .name = "tcp_fastpaths", 2426 .evhand = tcp_addfastpaths, 2427 .priv = 0 2428}; 2429 2430MODULE_VERSION(kern_tcpfastpaths, 1); 2431DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 2432
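/*
 * Usage note: once this module is loaded, the two stacks registered
 * above appear under their tfb_tcp_block_name ("fastslow" and
 * "fastack") and can be selected system-wide, e.g.:
 *
 *	sysctl net.inet.tcp.functions_available
 *	sysctl net.inet.tcp.functions_default=fastack
 *
 * or per connection with the TCP_FUNCTION_BLK socket option.  A
 * minimal userland sketch, assuming the tcp_function_set structure
 * exported by netinet/tcp.h on FreeBSD 11 and later:
 *
 *	struct tcp_function_set tfs;
 *
 *	memset(&tfs, 0, sizeof(tfs));
 *	strlcpy(tfs.function_set_name, "fastack",
 *	    sizeof(tfs.function_set_name));
 *	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
 *	    &tfs, sizeof(tfs)) == -1)
 *		warn("TCP_FUNCTION_BLK");
 */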