111#ifdef TCPDEBUG 112#include <netinet/tcp_debug.h> 113#endif /* TCPDEBUG */ 114#ifdef TCP_OFFLOAD 115#include <netinet/tcp_offload.h> 116#endif 117 118#ifdef IPSEC 119#include <netipsec/ipsec.h> 120#include <netipsec/ipsec6.h> 121#endif /*IPSEC*/ 122 123#include <machine/in_cksum.h> 124 125#include <security/mac/mac_framework.h> 126 127const int tcprexmtthresh; 128 129VNET_DECLARE(int, tcp_autorcvbuf_inc); 130#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 131VNET_DECLARE(int, tcp_autorcvbuf_max); 132#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 133VNET_DECLARE(int, tcp_do_rfc3042); 134#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) 135VNET_DECLARE(int, tcp_do_autorcvbuf); 136#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 137VNET_DECLARE(int, tcp_insecure_rst); 138#define V_tcp_insecure_rst VNET(tcp_insecure_rst) 139VNET_DECLARE(int, tcp_insecure_syn); 140#define V_tcp_insecure_syn VNET(tcp_insecure_syn) 141 142static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *, 143 struct socket *, struct tcpcb *, int, int, uint8_t, 144 int); 145 146static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *, 147 struct socket *, struct tcpcb *, int, int, uint8_t, 148 int); 149 150/* 151 * Indicate whether this ack should be delayed. We can delay the ack if 152 * following conditions are met: 153 * - There is no delayed ack timer in progress. 154 * - Our last ack wasn't a 0-sized window. We never want to delay 155 * the ack that opens up a 0-sized window. 156 * - LRO wasn't used for this segment. We make sure by checking that the 157 * segment size is not larger than the MSS. 158 */ 159#define DELAY_ACK(tp, tlen) \ 160 ((!tcp_timer_active(tp, TT_DELACK) && \ 161 (tp->t_flags & TF_RXWIN0SENT) == 0) && \ 162 (tlen <= tp->t_maxseg) && \ 163 (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) 164 165/* 166 * So how is this faster than the normal fast ack? 167 * It basically allows us to also stay in the fastpath 168 * when a window-update ack also arrives. In testing 169 * we saw only 25-30% of connections doing fastpath 170 * due to the fact that along with moving forward 171 * in sequence the window was also updated. 172 */ 173static void 174tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 175 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 176 int ti_locked, u_long tiwin) 177{ 178 int acked; 179 int winup_only=0; 180#ifdef TCPDEBUG 181 /* 182 * The size of tcp_saveipgen must be the size of the max ip header, 183 * now IPv6. 184 */ 185 u_char tcp_saveipgen[IP6_HDR_LEN]; 186 struct tcphdr tcp_savetcp; 187 short ostate = 0; 188#endif 189 /* 190 * The following if statment will be true if 191 * we are doing the win_up_in_fp <and> 192 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or> 193 * - No more new data, but we have an ack for new data 194 * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) 195 * - No more new data, the same ack point but the window grew 196 * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd) 197 */ 198 if ((SEQ_LT(tp->snd_wl1, th->th_seq) || 199 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 200 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 201 /* keep track of pure window updates */ 202 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { 203 winup_only = 1; 204 TCPSTAT_INC(tcps_rcvwinupd); 205 } 206 tp->snd_wnd = tiwin; 207 tp->snd_wl1 = th->th_seq; 208 tp->snd_wl2 = th->th_ack; 209 if (tp->snd_wnd > tp->max_sndwnd) 210 tp->max_sndwnd = tp->snd_wnd; 211 } 212 /* 213 * If last ACK falls within this segment's sequence numbers, 214 * record the timestamp. 215 * NOTE that the test is modified according to the latest 216 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 217 */ 218 if ((to->to_flags & TOF_TS) != 0 && 219 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 220 tp->ts_recent_age = tcp_ts_getticks(); 221 tp->ts_recent = to->to_tsval; 222 } 223 /* 224 * This is a pure ack for outstanding data. 225 */ 226 if (ti_locked == TI_RLOCKED) { 227 INP_INFO_RUNLOCK(&V_tcbinfo); 228 } 229 ti_locked = TI_UNLOCKED; 230 231 TCPSTAT_INC(tcps_predack); 232 233 /* 234 * "bad retransmit" recovery. 235 */ 236 if (tp->t_rxtshift == 1 && 237 tp->t_flags & TF_PREVVALID && 238 (int)(ticks - tp->t_badrxtwin) < 0) { 239 cc_cong_signal(tp, th, CC_RTO_ERR); 240 } 241 242 /* 243 * Recalculate the transmit timer / rtt. 244 * 245 * Some boxes send broken timestamp replies 246 * during the SYN+ACK phase, ignore 247 * timestamps of 0 or we could calculate a 248 * huge RTT and blow up the retransmit timer. 249 */ 250 if ((to->to_flags & TOF_TS) != 0 && 251 to->to_tsecr) { 252 u_int t; 253 254 t = tcp_ts_getticks() - to->to_tsecr; 255 if (!tp->t_rttlow || tp->t_rttlow > t) 256 tp->t_rttlow = t; 257 tcp_xmit_timer(tp, 258 TCP_TS_TO_TICKS(t) + 1); 259 } else if (tp->t_rtttime && 260 SEQ_GT(th->th_ack, tp->t_rtseq)) { 261 if (!tp->t_rttlow || 262 tp->t_rttlow > ticks - tp->t_rtttime) 263 tp->t_rttlow = ticks - tp->t_rtttime; 264 tcp_xmit_timer(tp, 265 ticks - tp->t_rtttime); 266 } 267 if (winup_only == 0) { 268 acked = BYTES_THIS_ACK(tp, th); 269 270 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 271 hhook_run_tcp_est_in(tp, th, to); 272 273 TCPSTAT_ADD(tcps_rcvackbyte, acked); 274 sbdrop(&so->so_snd, acked); 275 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 276 SEQ_LEQ(th->th_ack, tp->snd_recover)) 277 tp->snd_recover = th->th_ack - 1; 278 279 /* 280 * Let the congestion control algorithm update 281 * congestion control related information. This 282 * typically means increasing the congestion 283 * window. 284 */ 285 cc_ack_received(tp, th, CC_ACK); 286 287 tp->snd_una = th->th_ack; 288 /* 289 * Pull snd_wl2 up to prevent seq wrap relative 290 * to th_ack. 291 */ 292 tp->snd_wl2 = th->th_ack; 293 tp->t_dupacks = 0; 294 m_freem(m); 295 296 /* 297 * If all outstanding data are acked, stop 298 * retransmit timer, otherwise restart timer 299 * using current (possibly backed-off) value. 300 * If process is waiting for space, 301 * wakeup/selwakeup/signal. If data 302 * are ready to send, let tcp_output 303 * decide between more output or persist. 304 */ 305#ifdef TCPDEBUG 306 if (so->so_options & SO_DEBUG) 307 tcp_trace(TA_INPUT, ostate, tp, 308 (void *)tcp_saveipgen, 309 &tcp_savetcp, 0); 310#endif 311 if (tp->snd_una == tp->snd_max) 312 tcp_timer_activate(tp, TT_REXMT, 0); 313 else if (!tcp_timer_active(tp, TT_PERSIST)) 314 tcp_timer_activate(tp, TT_REXMT, 315 tp->t_rxtcur); 316 } else { 317 /* 318 * Window update only, just free the mbufs and 319 * send out whatever we can. 320 */ 321 m_freem(m); 322 } 323 sowwakeup(so); 324 if (sbavail(&so->so_snd)) 325 (void) tcp_output(tp); 326 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 327 __func__, ti_locked)); 328 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 329 INP_WLOCK_ASSERT(tp->t_inpcb); 330 331 if (tp->t_flags & TF_DELACK) { 332 tp->t_flags &= ~TF_DELACK; 333 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 334 } 335 INP_WUNLOCK(tp->t_inpcb); 336} 337 338/* 339 * Here nothing is really faster, its just that we 340 * have broken out the fast-data path also just like 341 * the fast-ack. 342 */ 343static void 344tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 345 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 346 int ti_locked, u_long tiwin) 347{ 348 int newsize = 0; /* automatic sockbuf scaling */ 349#ifdef TCPDEBUG 350 /* 351 * The size of tcp_saveipgen must be the size of the max ip header, 352 * now IPv6. 353 */ 354 u_char tcp_saveipgen[IP6_HDR_LEN]; 355 struct tcphdr tcp_savetcp; 356 short ostate = 0; 357#endif 358 /* 359 * If last ACK falls within this segment's sequence numbers, 360 * record the timestamp. 361 * NOTE that the test is modified according to the latest 362 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 363 */ 364 if ((to->to_flags & TOF_TS) != 0 && 365 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 366 tp->ts_recent_age = tcp_ts_getticks(); 367 tp->ts_recent = to->to_tsval; 368 } 369 370 /* 371 * This is a pure, in-sequence data packet with 372 * nothing on the reassembly queue and we have enough 373 * buffer space to take it. 374 */ 375 if (ti_locked == TI_RLOCKED) { 376 INP_INFO_RUNLOCK(&V_tcbinfo); 377 } 378 ti_locked = TI_UNLOCKED; 379 380 /* Clean receiver SACK report if present */ 381 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) 382 tcp_clean_sackreport(tp); 383 TCPSTAT_INC(tcps_preddat); 384 tp->rcv_nxt += tlen; 385 /* 386 * Pull snd_wl1 up to prevent seq wrap relative to 387 * th_seq. 388 */ 389 tp->snd_wl1 = th->th_seq; 390 /* 391 * Pull rcv_up up to prevent seq wrap relative to 392 * rcv_nxt. 393 */ 394 tp->rcv_up = tp->rcv_nxt; 395 TCPSTAT_ADD(tcps_rcvbyte, tlen); 396#ifdef TCPDEBUG 397 if (so->so_options & SO_DEBUG) 398 tcp_trace(TA_INPUT, ostate, tp, 399 (void *)tcp_saveipgen, &tcp_savetcp, 0); 400#endif 401 /* 402 * Automatic sizing of receive socket buffer. Often the send 403 * buffer size is not optimally adjusted to the actual network 404 * conditions at hand (delay bandwidth product). Setting the 405 * buffer size too small limits throughput on links with high 406 * bandwidth and high delay (eg. trans-continental/oceanic links). 407 * 408 * On the receive side the socket buffer memory is only rarely 409 * used to any significant extent. This allows us to be much 410 * more aggressive in scaling the receive socket buffer. For 411 * the case that the buffer space is actually used to a large 412 * extent and we run out of kernel memory we can simply drop 413 * the new segments; TCP on the sender will just retransmit it 414 * later. Setting the buffer size too big may only consume too 415 * much kernel memory if the application doesn't read() from 416 * the socket or packet loss or reordering makes use of the 417 * reassembly queue. 418 * 419 * The criteria to step up the receive buffer one notch are: 420 * 1. Application has not set receive buffer size with 421 * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. 422 * 2. the number of bytes received during the time it takes 423 * one timestamp to be reflected back to us (the RTT); 424 * 3. received bytes per RTT is within seven eighth of the 425 * current socket buffer size; 426 * 4. receive buffer size has not hit maximal automatic size; 427 * 428 * This algorithm does one step per RTT at most and only if 429 * we receive a bulk stream w/o packet losses or reorderings. 430 * Shrinking the buffer during idle times is not necessary as 431 * it doesn't consume any memory when idle. 432 * 433 * TODO: Only step up if the application is actually serving 434 * the buffer to better manage the socket buffer resources. 435 */ 436 if (V_tcp_do_autorcvbuf && 437 (to->to_flags & TOF_TS) && 438 to->to_tsecr && 439 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 440 if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) && 441 to->to_tsecr - tp->rfbuf_ts < hz) { 442 if (tp->rfbuf_cnt > 443 (so->so_rcv.sb_hiwat / 8 * 7) && 444 so->so_rcv.sb_hiwat < 445 V_tcp_autorcvbuf_max) { 446 newsize = 447 min(so->so_rcv.sb_hiwat + 448 V_tcp_autorcvbuf_inc, 449 V_tcp_autorcvbuf_max); 450 } 451 /* Start over with next RTT. */ 452 tp->rfbuf_ts = 0; 453 tp->rfbuf_cnt = 0; 454 } else 455 tp->rfbuf_cnt += tlen; /* add up */ 456 } 457 458 /* Add data to socket buffer. */ 459 SOCKBUF_LOCK(&so->so_rcv); 460 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 461 m_freem(m); 462 } else { 463 /* 464 * Set new socket buffer size. 465 * Give up when limit is reached. 466 */ 467 if (newsize) 468 if (!sbreserve_locked(&so->so_rcv, 469 newsize, so, NULL)) 470 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 471 m_adj(m, drop_hdrlen); /* delayed header drop */ 472 sbappendstream_locked(&so->so_rcv, m, 0); 473 } 474 /* NB: sorwakeup_locked() does an implicit unlock. */ 475 sorwakeup_locked(so); 476 if (DELAY_ACK(tp, tlen)) { 477 tp->t_flags |= TF_DELACK; 478 } else { 479 tp->t_flags |= TF_ACKNOW; 480 tcp_output(tp); 481 } 482 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 483 __func__, ti_locked)); 484 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 485 INP_WLOCK_ASSERT(tp->t_inpcb); 486 487 if (tp->t_flags & TF_DELACK) { 488 tp->t_flags &= ~TF_DELACK; 489 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 490 } 491 INP_WUNLOCK(tp->t_inpcb); 492} 493 494/* 495 * The slow-path is the clone of the long long part 496 * of tcp_do_segment past all the fast-path stuff. We 497 * use it here by two different callers, the fast/slow and 498 * the fastack only. 499 */ 500static void 501tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so, 502 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 503 int ti_locked, u_long tiwin, int thflags) 504{ 505 int acked, ourfinisacked, needoutput = 0; 506 int rstreason, todrop, win; 507 char *s; 508 struct in_conninfo *inc; 509 struct mbuf *mfree = NULL; 510#ifdef TCPDEBUG 511 /* 512 * The size of tcp_saveipgen must be the size of the max ip header, 513 * now IPv6. 514 */ 515 u_char tcp_saveipgen[IP6_HDR_LEN]; 516 struct tcphdr tcp_savetcp; 517 short ostate = 0; 518#endif 519 /* 520 * Calculate amount of space in receive window, 521 * and then do TCP input processing. 522 * Receive window is amount of space in rcv queue, 523 * but not less than advertised window. 524 */ 525 inc = &tp->t_inpcb->inp_inc; 526 win = sbspace(&so->so_rcv); 527 if (win < 0) 528 win = 0; 529 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 530 531 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 532 tp->rfbuf_ts = 0; 533 tp->rfbuf_cnt = 0; 534 535 switch (tp->t_state) { 536 537 /* 538 * If the state is SYN_RECEIVED: 539 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 540 */ 541 case TCPS_SYN_RECEIVED: 542 if ((thflags & TH_ACK) && 543 (SEQ_LEQ(th->th_ack, tp->snd_una) || 544 SEQ_GT(th->th_ack, tp->snd_max))) { 545 rstreason = BANDLIM_RST_OPENPORT; 546 goto dropwithreset; 547 } 548 break; 549 550 /* 551 * If the state is SYN_SENT: 552 * if seg contains an ACK, but not for our SYN, drop the input. 553 * if seg contains a RST, then drop the connection. 554 * if seg does not contain SYN, then drop it. 555 * Otherwise this is an acceptable SYN segment 556 * initialize tp->rcv_nxt and tp->irs 557 * if seg contains ack then advance tp->snd_una 558 * if seg contains an ECE and ECN support is enabled, the stream 559 * is ECN capable. 560 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 561 * arrange for segment to be acked (eventually) 562 * continue processing rest of data/controls, beginning with URG 563 */ 564 case TCPS_SYN_SENT: 565 if ((thflags & TH_ACK) && 566 (SEQ_LEQ(th->th_ack, tp->iss) || 567 SEQ_GT(th->th_ack, tp->snd_max))) { 568 rstreason = BANDLIM_UNLIMITED; 569 goto dropwithreset; 570 } 571 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { 572 TCP_PROBE5(connect__refused, NULL, tp, 573 mtod(m, const char *), tp, th); 574 tp = tcp_drop(tp, ECONNREFUSED); 575 } 576 if (thflags & TH_RST) 577 goto drop; 578 if (!(thflags & TH_SYN)) 579 goto drop; 580 581 tp->irs = th->th_seq; 582 tcp_rcvseqinit(tp); 583 if (thflags & TH_ACK) { 584 TCPSTAT_INC(tcps_connects); 585 soisconnected(so); 586#ifdef MAC 587 mac_socketpeer_set_from_mbuf(m, so); 588#endif 589 /* Do window scaling on this connection? */ 590 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 591 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 592 tp->rcv_scale = tp->request_r_scale; 593 } 594 tp->rcv_adv += imin(tp->rcv_wnd, 595 TCP_MAXWIN << tp->rcv_scale); 596 tp->snd_una++; /* SYN is acked */ 597 /* 598 * If there's data, delay ACK; if there's also a FIN 599 * ACKNOW will be turned on later. 600 */ 601 if (DELAY_ACK(tp, tlen) && tlen != 0) 602 tcp_timer_activate(tp, TT_DELACK, 603 tcp_delacktime); 604 else 605 tp->t_flags |= TF_ACKNOW; 606 607 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 608 tp->t_flags |= TF_ECN_PERMIT; 609 TCPSTAT_INC(tcps_ecn_shs); 610 } 611 612 /* 613 * Received <SYN,ACK> in SYN_SENT[*] state. 614 * Transitions: 615 * SYN_SENT --> ESTABLISHED 616 * SYN_SENT* --> FIN_WAIT_1 617 */ 618 tp->t_starttime = ticks; 619 if (tp->t_flags & TF_NEEDFIN) { 620 tcp_state_change(tp, TCPS_FIN_WAIT_1); 621 tp->t_flags &= ~TF_NEEDFIN; 622 thflags &= ~TH_SYN; 623 } else { 624 tcp_state_change(tp, TCPS_ESTABLISHED); 625 TCP_PROBE5(connect__established, NULL, tp, 626 mtod(m, const char *), tp, th); 627 cc_conn_init(tp); 628 tcp_timer_activate(tp, TT_KEEP, 629 TP_KEEPIDLE(tp)); 630 } 631 } else { 632 /* 633 * Received initial SYN in SYN-SENT[*] state => 634 * simultaneous open. 635 * If it succeeds, connection is * half-synchronized. 636 * Otherwise, do 3-way handshake: 637 * SYN-SENT -> SYN-RECEIVED 638 * SYN-SENT* -> SYN-RECEIVED* 639 */ 640 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 641 tcp_timer_activate(tp, TT_REXMT, 0); 642 tcp_state_change(tp, TCPS_SYN_RECEIVED); 643 } 644 645 KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " 646 "ti_locked %d", __func__, ti_locked)); 647 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 648 INP_WLOCK_ASSERT(tp->t_inpcb); 649 650 /* 651 * Advance th->th_seq to correspond to first data byte. 652 * If data, trim to stay within window, 653 * dropping FIN if necessary. 654 */ 655 th->th_seq++; 656 if (tlen > tp->rcv_wnd) { 657 todrop = tlen - tp->rcv_wnd; 658 m_adj(m, -todrop); 659 tlen = tp->rcv_wnd; 660 thflags &= ~TH_FIN; 661 TCPSTAT_INC(tcps_rcvpackafterwin); 662 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 663 } 664 tp->snd_wl1 = th->th_seq - 1; 665 tp->rcv_up = th->th_seq; 666 /* 667 * Client side of transaction: already sent SYN and data. 668 * If the remote host used T/TCP to validate the SYN, 669 * our data will be ACK'd; if so, enter normal data segment 670 * processing in the middle of step 5, ack processing. 671 * Otherwise, goto step 6. 672 */ 673 if (thflags & TH_ACK) 674 goto process_ACK; 675 676 goto step6; 677 678 /* 679 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 680 * do normal processing. 681 * 682 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 683 */ 684 case TCPS_LAST_ACK: 685 case TCPS_CLOSING: 686 break; /* continue normal processing */ 687 } 688 689 /* 690 * States other than LISTEN or SYN_SENT. 691 * First check the RST flag and sequence number since reset segments 692 * are exempt from the timestamp and connection count tests. This 693 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 694 * below which allowed reset segments in half the sequence space 695 * to fall though and be processed (which gives forged reset 696 * segments with a random sequence number a 50 percent chance of 697 * killing a connection). 698 * Then check timestamp, if present. 699 * Then check the connection count, if present. 700 * Then check that at least some bytes of segment are within 701 * receive window. If segment begins before rcv_nxt, 702 * drop leading data (and SYN); if nothing left, just ack. 703 */ 704 if (thflags & TH_RST) { 705 /* 706 * RFC5961 Section 3.2 707 * 708 * - RST drops connection only if SEG.SEQ == RCV.NXT. 709 * - If RST is in window, we send challenge ACK. 710 * 711 * Note: to take into account delayed ACKs, we should 712 * test against last_ack_sent instead of rcv_nxt. 713 * Note 2: we handle special case of closed window, not 714 * covered by the RFC. 715 */ 716 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 717 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 718 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 719 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 720 KASSERT(ti_locked == TI_RLOCKED, 721 ("%s: TH_RST ti_locked %d, th %p tp %p", 722 __func__, ti_locked, th, tp)); 723 KASSERT(tp->t_state != TCPS_SYN_SENT, 724 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 725 __func__, th, tp)); 726 727 if (V_tcp_insecure_rst || 728 tp->last_ack_sent == th->th_seq) { 729 TCPSTAT_INC(tcps_drops); 730 /* Drop the connection. */ 731 switch (tp->t_state) { 732 case TCPS_SYN_RECEIVED: 733 so->so_error = ECONNREFUSED; 734 goto close; 735 case TCPS_ESTABLISHED: 736 case TCPS_FIN_WAIT_1: 737 case TCPS_FIN_WAIT_2: 738 case TCPS_CLOSE_WAIT: 739 so->so_error = ECONNRESET; 740 close: 741 tcp_state_change(tp, TCPS_CLOSED); 742 /* FALLTHROUGH */ 743 default: 744 tp = tcp_close(tp); 745 } 746 } else { 747 TCPSTAT_INC(tcps_badrst); 748 /* Send challenge ACK. */ 749 tcp_respond(tp, mtod(m, void *), th, m, 750 tp->rcv_nxt, tp->snd_nxt, TH_ACK); 751 tp->last_ack_sent = tp->rcv_nxt; 752 m = NULL; 753 } 754 } 755 goto drop; 756 } 757 758 /* 759 * RFC5961 Section 4.2 760 * Send challenge ACK for any SYN in synchronized state. 761 */ 762 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { 763 KASSERT(ti_locked == TI_RLOCKED, 764 ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); 765 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 766 767 TCPSTAT_INC(tcps_badsyn); 768 if (V_tcp_insecure_syn && 769 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 770 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 771 tp = tcp_drop(tp, ECONNRESET); 772 rstreason = BANDLIM_UNLIMITED; 773 } else { 774 /* Send challenge ACK. */ 775 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 776 tp->snd_nxt, TH_ACK); 777 tp->last_ack_sent = tp->rcv_nxt; 778 m = NULL; 779 } 780 goto drop; 781 } 782 783 /* 784 * RFC 1323 PAWS: If we have a timestamp reply on this segment 785 * and it's less than ts_recent, drop it. 786 */ 787 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 788 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 789 790 /* Check to see if ts_recent is over 24 days old. */ 791 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 792 /* 793 * Invalidate ts_recent. If this segment updates 794 * ts_recent, the age will be reset later and ts_recent 795 * will get a valid value. If it does not, setting 796 * ts_recent to zero will at least satisfy the 797 * requirement that zero be placed in the timestamp 798 * echo reply when ts_recent isn't valid. The 799 * age isn't reset until we get a valid ts_recent 800 * because we don't want out-of-order segments to be 801 * dropped when ts_recent is old. 802 */ 803 tp->ts_recent = 0; 804 } else { 805 TCPSTAT_INC(tcps_rcvduppack); 806 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 807 TCPSTAT_INC(tcps_pawsdrop); 808 if (tlen) 809 goto dropafterack; 810 goto drop; 811 } 812 } 813 814 /* 815 * In the SYN-RECEIVED state, validate that the packet belongs to 816 * this connection before trimming the data to fit the receive 817 * window. Check the sequence number versus IRS since we know 818 * the sequence numbers haven't wrapped. This is a partial fix 819 * for the "LAND" DoS attack. 820 */ 821 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 822 rstreason = BANDLIM_RST_OPENPORT; 823 goto dropwithreset; 824 } 825 826 todrop = tp->rcv_nxt - th->th_seq; 827 if (todrop > 0) { 828 if (thflags & TH_SYN) { 829 thflags &= ~TH_SYN; 830 th->th_seq++; 831 if (th->th_urp > 1) 832 th->th_urp--; 833 else 834 thflags &= ~TH_URG; 835 todrop--; 836 } 837 /* 838 * Following if statement from Stevens, vol. 2, p. 960. 839 */ 840 if (todrop > tlen 841 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 842 /* 843 * Any valid FIN must be to the left of the window. 844 * At this point the FIN must be a duplicate or out 845 * of sequence; drop it. 846 */ 847 thflags &= ~TH_FIN; 848 849 /* 850 * Send an ACK to resynchronize and drop any data. 851 * But keep on processing for RST or ACK. 852 */ 853 tp->t_flags |= TF_ACKNOW; 854 todrop = tlen; 855 TCPSTAT_INC(tcps_rcvduppack); 856 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 857 } else { 858 TCPSTAT_INC(tcps_rcvpartduppack); 859 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 860 } 861 drop_hdrlen += todrop; /* drop from the top afterwards */ 862 th->th_seq += todrop; 863 tlen -= todrop; 864 if (th->th_urp > todrop) 865 th->th_urp -= todrop; 866 else { 867 thflags &= ~TH_URG; 868 th->th_urp = 0; 869 } 870 } 871 872 /* 873 * If new data are received on a connection after the 874 * user processes are gone, then RST the other end. 875 */ 876 if ((so->so_state & SS_NOFDREF) && 877 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 878 KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " 879 "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); 880 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 881 882 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 883 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " 884 "after socket was closed, " 885 "sending RST and removing tcpcb\n", 886 s, __func__, tcpstates[tp->t_state], tlen); 887 free(s, M_TCPLOG); 888 } 889 tp = tcp_close(tp); 890 TCPSTAT_INC(tcps_rcvafterclose); 891 rstreason = BANDLIM_UNLIMITED; 892 goto dropwithreset; 893 } 894 895 /* 896 * If segment ends after window, drop trailing data 897 * (and PUSH and FIN); if nothing left, just ACK. 898 */ 899 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 900 if (todrop > 0) { 901 TCPSTAT_INC(tcps_rcvpackafterwin); 902 if (todrop >= tlen) { 903 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 904 /* 905 * If window is closed can only take segments at 906 * window edge, and have to drop data and PUSH from 907 * incoming segments. Continue processing, but 908 * remember to ack. Otherwise, drop segment 909 * and ack. 910 */ 911 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 912 tp->t_flags |= TF_ACKNOW; 913 TCPSTAT_INC(tcps_rcvwinprobe); 914 } else 915 goto dropafterack; 916 } else 917 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 918 m_adj(m, -todrop); 919 tlen -= todrop; 920 thflags &= ~(TH_PUSH|TH_FIN); 921 } 922 923 /* 924 * If last ACK falls within this segment's sequence numbers, 925 * record its timestamp. 926 * NOTE: 927 * 1) That the test incorporates suggestions from the latest 928 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 929 * 2) That updating only on newer timestamps interferes with 930 * our earlier PAWS tests, so this check should be solely 931 * predicated on the sequence space of this segment. 932 * 3) That we modify the segment boundary check to be 933 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 934 * instead of RFC1323's 935 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 936 * This modified check allows us to overcome RFC1323's 937 * limitations as described in Stevens TCP/IP Illustrated 938 * Vol. 2 p.869. In such cases, we can still calculate the 939 * RTT correctly when RCV.NXT == Last.ACK.Sent. 940 */ 941 if ((to->to_flags & TOF_TS) != 0 && 942 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 943 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 944 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 945 tp->ts_recent_age = tcp_ts_getticks(); 946 tp->ts_recent = to->to_tsval; 947 } 948 949 /* 950 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 951 * flag is on (half-synchronized state), then queue data for 952 * later processing; else drop segment and return. 953 */ 954 if ((thflags & TH_ACK) == 0) { 955 if (tp->t_state == TCPS_SYN_RECEIVED || 956 (tp->t_flags & TF_NEEDSYN)) 957 goto step6; 958 else if (tp->t_flags & TF_ACKNOW) 959 goto dropafterack; 960 else 961 goto drop; 962 } 963 964 /* 965 * Ack processing. 966 */ 967 switch (tp->t_state) { 968 969 /* 970 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 971 * ESTABLISHED state and continue processing. 972 * The ACK was checked above. 973 */ 974 case TCPS_SYN_RECEIVED: 975 976 TCPSTAT_INC(tcps_connects); 977 soisconnected(so); 978 /* Do window scaling? */ 979 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 980 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 981 tp->rcv_scale = tp->request_r_scale; 982 tp->snd_wnd = tiwin; 983 } 984 /* 985 * Make transitions: 986 * SYN-RECEIVED -> ESTABLISHED 987 * SYN-RECEIVED* -> FIN-WAIT-1 988 */ 989 tp->t_starttime = ticks; 990 if (tp->t_flags & TF_NEEDFIN) { 991 tcp_state_change(tp, TCPS_FIN_WAIT_1); 992 tp->t_flags &= ~TF_NEEDFIN; 993 } else { 994 tcp_state_change(tp, TCPS_ESTABLISHED); 995 TCP_PROBE5(accept__established, NULL, tp, 996 mtod(m, const char *), tp, th); 997 cc_conn_init(tp); 998 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 999 } 1000 /* 1001 * If segment contains data or ACK, will call tcp_reass() 1002 * later; if not, do so now to pass queued data to user. 1003 */ 1004 if (tlen == 0 && (thflags & TH_FIN) == 0) 1005 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 1006 (struct mbuf *)0); 1007 tp->snd_wl1 = th->th_seq - 1; 1008 /* FALLTHROUGH */ 1009 1010 /* 1011 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1012 * ACKs. If the ack is in the range 1013 * tp->snd_una < th->th_ack <= tp->snd_max 1014 * then advance tp->snd_una to th->th_ack and drop 1015 * data from the retransmission queue. If this ACK reflects 1016 * more up to date window information we update our window information. 1017 */ 1018 case TCPS_ESTABLISHED: 1019 case TCPS_FIN_WAIT_1: 1020 case TCPS_FIN_WAIT_2: 1021 case TCPS_CLOSE_WAIT: 1022 case TCPS_CLOSING: 1023 case TCPS_LAST_ACK: 1024 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1025 TCPSTAT_INC(tcps_rcvacktoomuch); 1026 goto dropafterack; 1027 } 1028 if ((tp->t_flags & TF_SACK_PERMIT) && 1029 ((to->to_flags & TOF_SACK) || 1030 !TAILQ_EMPTY(&tp->snd_holes))) 1031 tcp_sack_doack(tp, to, th->th_ack); 1032 else 1033 /* 1034 * Reset the value so that previous (valid) value 1035 * from the last ack with SACK doesn't get used. 1036 */ 1037 tp->sackhint.sacked_bytes = 0; 1038 1039 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 1040 hhook_run_tcp_est_in(tp, th, to); 1041 1042 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1043 if (tlen == 0 && tiwin == tp->snd_wnd) { 1044 /* 1045 * If this is the first time we've seen a 1046 * FIN from the remote, this is not a 1047 * duplicate and it needs to be processed 1048 * normally. This happens during a 1049 * simultaneous close. 1050 */ 1051 if ((thflags & TH_FIN) && 1052 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { 1053 tp->t_dupacks = 0; 1054 break; 1055 } 1056 TCPSTAT_INC(tcps_rcvdupack); 1057 /* 1058 * If we have outstanding data (other than 1059 * a window probe), this is a completely 1060 * duplicate ack (ie, window info didn't 1061 * change and FIN isn't set), 1062 * the ack is the biggest we've 1063 * seen and we've seen exactly our rexmt 1064 * threshhold of them, assume a packet 1065 * has been dropped and retransmit it. 1066 * Kludge snd_nxt & the congestion 1067 * window so we send only this one 1068 * packet. 1069 * 1070 * We know we're losing at the current 1071 * window size so do congestion avoidance 1072 * (set ssthresh to half the current window 1073 * and pull our congestion window back to 1074 * the new ssthresh). 1075 * 1076 * Dup acks mean that packets have left the 1077 * network (they're now cached at the receiver) 1078 * so bump cwnd by the amount in the receiver 1079 * to keep a constant cwnd packets in the 1080 * network. 1081 * 1082 * When using TCP ECN, notify the peer that 1083 * we reduced the cwnd. 1084 */ 1085 if (!tcp_timer_active(tp, TT_REXMT) || 1086 th->th_ack != tp->snd_una) 1087 tp->t_dupacks = 0; 1088 else if (++tp->t_dupacks > tcprexmtthresh || 1089 IN_FASTRECOVERY(tp->t_flags)) { 1090 cc_ack_received(tp, th, CC_DUPACK); 1091 if ((tp->t_flags & TF_SACK_PERMIT) && 1092 IN_FASTRECOVERY(tp->t_flags)) { 1093 int awnd; 1094 1095 /* 1096 * Compute the amount of data in flight first. 1097 * We can inject new data into the pipe iff 1098 * we have less than 1/2 the original window's 1099 * worth of data in flight. 1100 */ 1101 if (V_tcp_do_rfc6675_pipe) 1102 awnd = tcp_compute_pipe(tp); 1103 else 1104 awnd = (tp->snd_nxt - tp->snd_fack) + 1105 tp->sackhint.sack_bytes_rexmit; 1106 1107 if (awnd < tp->snd_ssthresh) { 1108 tp->snd_cwnd += tp->t_maxseg; 1109 if (tp->snd_cwnd > tp->snd_ssthresh) 1110 tp->snd_cwnd = tp->snd_ssthresh; 1111 } 1112 } else 1113 tp->snd_cwnd += tp->t_maxseg; 1114 (void) tp->t_fb->tfb_tcp_output(tp); 1115 goto drop; 1116 } else if (tp->t_dupacks == tcprexmtthresh) { 1117 tcp_seq onxt = tp->snd_nxt; 1118 1119 /* 1120 * If we're doing sack, check to 1121 * see if we're already in sack 1122 * recovery. If we're not doing sack, 1123 * check to see if we're in newreno 1124 * recovery. 1125 */ 1126 if (tp->t_flags & TF_SACK_PERMIT) { 1127 if (IN_FASTRECOVERY(tp->t_flags)) { 1128 tp->t_dupacks = 0; 1129 break; 1130 } 1131 } else { 1132 if (SEQ_LEQ(th->th_ack, 1133 tp->snd_recover)) { 1134 tp->t_dupacks = 0; 1135 break; 1136 } 1137 } 1138 /* Congestion signal before ack. */ 1139 cc_cong_signal(tp, th, CC_NDUPACK); 1140 cc_ack_received(tp, th, CC_DUPACK); 1141 tcp_timer_activate(tp, TT_REXMT, 0); 1142 tp->t_rtttime = 0; 1143 if (tp->t_flags & TF_SACK_PERMIT) { 1144 TCPSTAT_INC( 1145 tcps_sack_recovery_episode); 1146 tp->sack_newdata = tp->snd_nxt; 1147 tp->snd_cwnd = tp->t_maxseg; 1148 (void) tp->t_fb->tfb_tcp_output(tp); 1149 goto drop; 1150 } 1151 tp->snd_nxt = th->th_ack; 1152 tp->snd_cwnd = tp->t_maxseg; 1153 (void) tp->t_fb->tfb_tcp_output(tp); 1154 KASSERT(tp->snd_limited <= 2, 1155 ("%s: tp->snd_limited too big", 1156 __func__)); 1157 tp->snd_cwnd = tp->snd_ssthresh + 1158 tp->t_maxseg * 1159 (tp->t_dupacks - tp->snd_limited); 1160 if (SEQ_GT(onxt, tp->snd_nxt)) 1161 tp->snd_nxt = onxt; 1162 goto drop; 1163 } else if (V_tcp_do_rfc3042) { 1164 /* 1165 * Process first and second duplicate 1166 * ACKs. Each indicates a segment 1167 * leaving the network, creating room 1168 * for more. Make sure we can send a 1169 * packet on reception of each duplicate 1170 * ACK by increasing snd_cwnd by one 1171 * segment. Restore the original 1172 * snd_cwnd after packet transmission. 1173 */ 1174 cc_ack_received(tp, th, CC_DUPACK); 1175 u_long oldcwnd = tp->snd_cwnd; 1176 tcp_seq oldsndmax = tp->snd_max; 1177 u_int sent; 1178 int avail; 1179 1180 KASSERT(tp->t_dupacks == 1 || 1181 tp->t_dupacks == 2, 1182 ("%s: dupacks not 1 or 2", 1183 __func__)); 1184 if (tp->t_dupacks == 1) 1185 tp->snd_limited = 0; 1186 tp->snd_cwnd = 1187 (tp->snd_nxt - tp->snd_una) + 1188 (tp->t_dupacks - tp->snd_limited) * 1189 tp->t_maxseg; 1190 /* 1191 * Only call tcp_output when there 1192 * is new data available to be sent. 1193 * Otherwise we would send pure ACKs. 1194 */ 1195 SOCKBUF_LOCK(&so->so_snd); 1196 avail = sbavail(&so->so_snd) - 1197 (tp->snd_nxt - tp->snd_una); 1198 SOCKBUF_UNLOCK(&so->so_snd); 1199 if (avail > 0) 1200 (void) tp->t_fb->tfb_tcp_output(tp); 1201 sent = tp->snd_max - oldsndmax; 1202 if (sent > tp->t_maxseg) { 1203 KASSERT((tp->t_dupacks == 2 && 1204 tp->snd_limited == 0) || 1205 (sent == tp->t_maxseg + 1 && 1206 tp->t_flags & TF_SENTFIN), 1207 ("%s: sent too much", 1208 __func__)); 1209 tp->snd_limited = 2; 1210 } else if (sent > 0) 1211 ++tp->snd_limited; 1212 tp->snd_cwnd = oldcwnd; 1213 goto drop; 1214 } 1215 } else 1216 tp->t_dupacks = 0; 1217 break; 1218 } 1219 1220 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 1221 ("%s: th_ack <= snd_una", __func__)); 1222 1223 /* 1224 * If the congestion window was inflated to account 1225 * for the other side's cached packets, retract it. 1226 */ 1227 if (IN_FASTRECOVERY(tp->t_flags)) { 1228 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 1229 if (tp->t_flags & TF_SACK_PERMIT) 1230 tcp_sack_partialack(tp, th); 1231 else 1232 tcp_newreno_partial_ack(tp, th); 1233 } else 1234 cc_post_recovery(tp, th); 1235 } 1236 tp->t_dupacks = 0; 1237 /* 1238 * If we reach this point, ACK is not a duplicate, 1239 * i.e., it ACKs something we sent. 1240 */ 1241 if (tp->t_flags & TF_NEEDSYN) { 1242 /* 1243 * T/TCP: Connection was half-synchronized, and our 1244 * SYN has been ACK'd (so connection is now fully 1245 * synchronized). Go to non-starred state, 1246 * increment snd_una for ACK of SYN, and check if 1247 * we can do window scaling. 1248 */ 1249 tp->t_flags &= ~TF_NEEDSYN; 1250 tp->snd_una++; 1251 /* Do window scaling? */ 1252 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1253 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1254 tp->rcv_scale = tp->request_r_scale; 1255 /* Send window already scaled. */ 1256 } 1257 } 1258 1259process_ACK: 1260 INP_WLOCK_ASSERT(tp->t_inpcb); 1261 1262 acked = BYTES_THIS_ACK(tp, th); 1263 TCPSTAT_INC(tcps_rcvackpack); 1264 TCPSTAT_ADD(tcps_rcvackbyte, acked); 1265 1266 /* 1267 * If we just performed our first retransmit, and the ACK 1268 * arrives within our recovery window, then it was a mistake 1269 * to do the retransmit in the first place. Recover our 1270 * original cwnd and ssthresh, and proceed to transmit where 1271 * we left off. 1272 */ 1273 if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && 1274 (int)(ticks - tp->t_badrxtwin) < 0) 1275 cc_cong_signal(tp, th, CC_RTO_ERR); 1276 1277 /* 1278 * If we have a timestamp reply, update smoothed 1279 * round trip time. If no timestamp is present but 1280 * transmit timer is running and timed sequence 1281 * number was acked, update smoothed round trip time. 1282 * Since we now have an rtt measurement, cancel the 1283 * timer backoff (cf., Phil Karn's retransmit alg.). 1284 * Recompute the initial retransmit timer. 1285 * 1286 * Some boxes send broken timestamp replies 1287 * during the SYN+ACK phase, ignore 1288 * timestamps of 0 or we could calculate a 1289 * huge RTT and blow up the retransmit timer. 1290 */ 1291 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 1292 u_int t; 1293 1294 t = tcp_ts_getticks() - to->to_tsecr; 1295 if (!tp->t_rttlow || tp->t_rttlow > t) 1296 tp->t_rttlow = t; 1297 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); 1298 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 1299 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 1300 tp->t_rttlow = ticks - tp->t_rtttime; 1301 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 1302 } 1303 1304 /* 1305 * If all outstanding data is acked, stop retransmit 1306 * timer and remember to restart (more output or persist). 1307 * If there is more data to be acked, restart retransmit 1308 * timer, using current (possibly backed-off) value. 1309 */ 1310 if (th->th_ack == tp->snd_max) { 1311 tcp_timer_activate(tp, TT_REXMT, 0); 1312 needoutput = 1; 1313 } else if (!tcp_timer_active(tp, TT_PERSIST)) 1314 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 1315 1316 /* 1317 * If no data (only SYN) was ACK'd, 1318 * skip rest of ACK processing. 1319 */ 1320 if (acked == 0) 1321 goto step6; 1322 1323 /* 1324 * Let the congestion control algorithm update congestion 1325 * control related information. This typically means increasing 1326 * the congestion window. 1327 */ 1328 cc_ack_received(tp, th, CC_ACK); 1329 1330 SOCKBUF_LOCK(&so->so_snd); 1331 if (acked > sbavail(&so->so_snd)) { 1332 tp->snd_wnd -= sbavail(&so->so_snd); 1333 mfree = sbcut_locked(&so->so_snd, 1334 (int)sbavail(&so->so_snd)); 1335 ourfinisacked = 1; 1336 } else { 1337 mfree = sbcut_locked(&so->so_snd, acked); 1338 tp->snd_wnd -= acked; 1339 ourfinisacked = 0; 1340 } 1341 /* NB: sowwakeup_locked() does an implicit unlock. */ 1342 sowwakeup_locked(so); 1343 m_freem(mfree); 1344 /* Detect una wraparound. */ 1345 if (!IN_RECOVERY(tp->t_flags) && 1346 SEQ_GT(tp->snd_una, tp->snd_recover) && 1347 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1348 tp->snd_recover = th->th_ack - 1; 1349 /* XXXLAS: Can this be moved up into cc_post_recovery? */ 1350 if (IN_RECOVERY(tp->t_flags) && 1351 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 1352 EXIT_RECOVERY(tp->t_flags); 1353 } 1354 tp->snd_una = th->th_ack; 1355 if (tp->t_flags & TF_SACK_PERMIT) { 1356 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 1357 tp->snd_recover = tp->snd_una; 1358 } 1359 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1360 tp->snd_nxt = tp->snd_una; 1361 1362 switch (tp->t_state) { 1363 1364 /* 1365 * In FIN_WAIT_1 STATE in addition to the processing 1366 * for the ESTABLISHED state if our FIN is now acknowledged 1367 * then enter FIN_WAIT_2. 1368 */ 1369 case TCPS_FIN_WAIT_1: 1370 if (ourfinisacked) { 1371 /* 1372 * If we can't receive any more 1373 * data, then closing user can proceed. 1374 * Starting the timer is contrary to the 1375 * specification, but if we don't get a FIN 1376 * we'll hang forever. 1377 * 1378 * XXXjl: 1379 * we should release the tp also, and use a 1380 * compressed state. 1381 */ 1382 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1383 soisdisconnected(so); 1384 tcp_timer_activate(tp, TT_2MSL, 1385 (tcp_fast_finwait2_recycle ? 1386 tcp_finwait2_timeout : 1387 TP_MAXIDLE(tp))); 1388 } 1389 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1390 } 1391 break; 1392 1393 /* 1394 * In CLOSING STATE in addition to the processing for 1395 * the ESTABLISHED state if the ACK acknowledges our FIN 1396 * then enter the TIME-WAIT state, otherwise ignore 1397 * the segment. 1398 */ 1399 case TCPS_CLOSING: 1400 if (ourfinisacked) { 1401 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1402 tcp_twstart(tp); 1403 INP_INFO_RUNLOCK(&V_tcbinfo); 1404 m_freem(m); 1405 return; 1406 } 1407 break; 1408 1409 /* 1410 * In LAST_ACK, we may still be waiting for data to drain 1411 * and/or to be acked, as well as for the ack of our FIN. 1412 * If our FIN is now acknowledged, delete the TCB, 1413 * enter the closed state and return. 1414 */ 1415 case TCPS_LAST_ACK: 1416 if (ourfinisacked) { 1417 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1418 tp = tcp_close(tp); 1419 goto drop; 1420 } 1421 break; 1422 } 1423 } 1424 1425step6: 1426 INP_WLOCK_ASSERT(tp->t_inpcb); 1427 1428 /* 1429 * Update window information. 1430 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1431 */ 1432 if ((thflags & TH_ACK) && 1433 (SEQ_LT(tp->snd_wl1, th->th_seq) || 1434 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 1435 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1436 /* keep track of pure window updates */ 1437 if (tlen == 0 && 1438 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1439 TCPSTAT_INC(tcps_rcvwinupd); 1440 tp->snd_wnd = tiwin; 1441 tp->snd_wl1 = th->th_seq; 1442 tp->snd_wl2 = th->th_ack; 1443 if (tp->snd_wnd > tp->max_sndwnd) 1444 tp->max_sndwnd = tp->snd_wnd; 1445 needoutput = 1; 1446 } 1447 1448 /* 1449 * Process segments with URG. 1450 */ 1451 if ((thflags & TH_URG) && th->th_urp && 1452 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1453 /* 1454 * This is a kludge, but if we receive and accept 1455 * random urgent pointers, we'll crash in 1456 * soreceive. It's hard to imagine someone 1457 * actually wanting to send this much urgent data. 1458 */ 1459 SOCKBUF_LOCK(&so->so_rcv); 1460 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 1461 th->th_urp = 0; /* XXX */ 1462 thflags &= ~TH_URG; /* XXX */ 1463 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 1464 goto dodata; /* XXX */ 1465 } 1466 /* 1467 * If this segment advances the known urgent pointer, 1468 * then mark the data stream. This should not happen 1469 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1470 * a FIN has been received from the remote side. 1471 * In these states we ignore the URG. 1472 * 1473 * According to RFC961 (Assigned Protocols), 1474 * the urgent pointer points to the last octet 1475 * of urgent data. We continue, however, 1476 * to consider it to indicate the first octet 1477 * of data past the urgent section as the original 1478 * spec states (in one of two places). 1479 */ 1480 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1481 tp->rcv_up = th->th_seq + th->th_urp; 1482 so->so_oobmark = sbavail(&so->so_rcv) + 1483 (tp->rcv_up - tp->rcv_nxt) - 1; 1484 if (so->so_oobmark == 0) 1485 so->so_rcv.sb_state |= SBS_RCVATMARK; 1486 sohasoutofband(so); 1487 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1488 } 1489 SOCKBUF_UNLOCK(&so->so_rcv); 1490 /* 1491 * Remove out of band data so doesn't get presented to user. 1492 * This can happen independent of advancing the URG pointer, 1493 * but if two URG's are pending at once, some out-of-band 1494 * data may creep in... ick. 1495 */ 1496 if (th->th_urp <= (u_long)tlen && 1497 !(so->so_options & SO_OOBINLINE)) { 1498 /* hdr drop is delayed */ 1499 tcp_pulloutofband(so, th, m, drop_hdrlen); 1500 } 1501 } else { 1502 /* 1503 * If no out of band data is expected, 1504 * pull receive urgent pointer along 1505 * with the receive window. 1506 */ 1507 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1508 tp->rcv_up = tp->rcv_nxt; 1509 } 1510dodata: /* XXX */ 1511 INP_WLOCK_ASSERT(tp->t_inpcb); 1512 1513 /* 1514 * Process the segment text, merging it into the TCP sequencing queue, 1515 * and arranging for acknowledgment of receipt if necessary. 1516 * This process logically involves adjusting tp->rcv_wnd as data 1517 * is presented to the user (this happens in tcp_usrreq.c, 1518 * case PRU_RCVD). If a FIN has already been received on this 1519 * connection then we just ignore the text. 1520 */ 1521 if ((tlen || (thflags & TH_FIN)) && 1522 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1523 tcp_seq save_start = th->th_seq; 1524 m_adj(m, drop_hdrlen); /* delayed header drop */ 1525 /* 1526 * Insert segment which includes th into TCP reassembly queue 1527 * with control block tp. Set thflags to whether reassembly now 1528 * includes a segment with FIN. This handles the common case 1529 * inline (segment is the next to be received on an established 1530 * connection, and the queue is empty), avoiding linkage into 1531 * and removal from the queue and repetition of various 1532 * conversions. 1533 * Set DELACK for segments received in order, but ack 1534 * immediately when segments are out of order (so 1535 * fast retransmit can work). 1536 */ 1537 if (th->th_seq == tp->rcv_nxt && 1538 LIST_EMPTY(&tp->t_segq) && 1539 TCPS_HAVEESTABLISHED(tp->t_state)) { 1540 if (DELAY_ACK(tp, tlen)) 1541 tp->t_flags |= TF_DELACK; 1542 else 1543 tp->t_flags |= TF_ACKNOW; 1544 tp->rcv_nxt += tlen; 1545 thflags = th->th_flags & TH_FIN; 1546 TCPSTAT_INC(tcps_rcvpack); 1547 TCPSTAT_ADD(tcps_rcvbyte, tlen); 1548 SOCKBUF_LOCK(&so->so_rcv); 1549 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1550 m_freem(m); 1551 else 1552 sbappendstream_locked(&so->so_rcv, m, 0); 1553 /* NB: sorwakeup_locked() does an implicit unlock. */ 1554 sorwakeup_locked(so); 1555 } else { 1556 /* 1557 * XXX: Due to the header drop above "th" is 1558 * theoretically invalid by now. Fortunately 1559 * m_adj() doesn't actually frees any mbufs 1560 * when trimming from the head. 1561 */ 1562 thflags = tcp_reass(tp, th, &tlen, m); 1563 tp->t_flags |= TF_ACKNOW; 1564 } 1565 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 1566 tcp_update_sack_list(tp, save_start, save_start + tlen); 1567#if 0 1568 /* 1569 * Note the amount of data that peer has sent into 1570 * our window, in order to estimate the sender's 1571 * buffer size. 1572 * XXX: Unused. 1573 */ 1574 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) 1575 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1576 else 1577 len = so->so_rcv.sb_hiwat; 1578#endif 1579 } else { 1580 m_freem(m); 1581 thflags &= ~TH_FIN; 1582 } 1583 1584 /* 1585 * If FIN is received ACK the FIN and let the user know 1586 * that the connection is closing. 1587 */ 1588 if (thflags & TH_FIN) { 1589 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1590 socantrcvmore(so); 1591 /* 1592 * If connection is half-synchronized 1593 * (ie NEEDSYN flag on) then delay ACK, 1594 * so it may be piggybacked when SYN is sent. 1595 * Otherwise, since we received a FIN then no 1596 * more input can be expected, send ACK now. 1597 */ 1598 if (tp->t_flags & TF_NEEDSYN) 1599 tp->t_flags |= TF_DELACK; 1600 else 1601 tp->t_flags |= TF_ACKNOW; 1602 tp->rcv_nxt++; 1603 } 1604 switch (tp->t_state) { 1605 1606 /* 1607 * In SYN_RECEIVED and ESTABLISHED STATES 1608 * enter the CLOSE_WAIT state. 1609 */ 1610 case TCPS_SYN_RECEIVED: 1611 tp->t_starttime = ticks; 1612 /* FALLTHROUGH */ 1613 case TCPS_ESTABLISHED: 1614 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1615 break; 1616 1617 /* 1618 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1619 * enter the CLOSING state. 1620 */ 1621 case TCPS_FIN_WAIT_1: 1622 tcp_state_change(tp, TCPS_CLOSING); 1623 break; 1624 1625 /* 1626 * In FIN_WAIT_2 state enter the TIME_WAIT state, 1627 * starting the time-wait timer, turning off the other 1628 * standard timers. 1629 */ 1630 case TCPS_FIN_WAIT_2: 1631 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1632 KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " 1633 "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 1634 ti_locked)); 1635 1636 tcp_twstart(tp); 1637 INP_INFO_RUNLOCK(&V_tcbinfo); 1638 return; 1639 } 1640 } 1641 if (ti_locked == TI_RLOCKED) { 1642 INP_INFO_RUNLOCK(&V_tcbinfo); 1643 } 1644 ti_locked = TI_UNLOCKED; 1645 1646#ifdef TCPDEBUG 1647 if (so->so_options & SO_DEBUG) 1648 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 1649 &tcp_savetcp, 0); 1650#endif 1651 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); 1652 1653 /* 1654 * Return any desired output. 1655 */ 1656 if (needoutput || (tp->t_flags & TF_ACKNOW)) 1657 (void) tp->t_fb->tfb_tcp_output(tp); 1658 1659 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 1660 __func__, ti_locked)); 1661 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1662 INP_WLOCK_ASSERT(tp->t_inpcb); 1663 1664 if (tp->t_flags & TF_DELACK) { 1665 tp->t_flags &= ~TF_DELACK; 1666 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 1667 } 1668 INP_WUNLOCK(tp->t_inpcb); 1669 return; 1670 1671dropafterack: 1672 /* 1673 * Generate an ACK dropping incoming segment if it occupies 1674 * sequence space, where the ACK reflects our state. 1675 * 1676 * We can now skip the test for the RST flag since all 1677 * paths to this code happen after packets containing 1678 * RST have been dropped. 1679 * 1680 * In the SYN-RECEIVED state, don't send an ACK unless the 1681 * segment we received passes the SYN-RECEIVED ACK test. 1682 * If it fails send a RST. This breaks the loop in the 1683 * "LAND" DoS attack, and also prevents an ACK storm 1684 * between two listening ports that have been sent forged 1685 * SYN segments, each with the source address of the other. 1686 */ 1687 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 1688 (SEQ_GT(tp->snd_una, th->th_ack) || 1689 SEQ_GT(th->th_ack, tp->snd_max)) ) { 1690 rstreason = BANDLIM_RST_OPENPORT; 1691 goto dropwithreset; 1692 } 1693#ifdef TCPDEBUG 1694 if (so->so_options & SO_DEBUG) 1695 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 1696 &tcp_savetcp, 0); 1697#endif 1698 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); 1699 if (ti_locked == TI_RLOCKED) { 1700 INP_INFO_RUNLOCK(&V_tcbinfo); 1701 } 1702 ti_locked = TI_UNLOCKED; 1703 1704 tp->t_flags |= TF_ACKNOW; 1705 (void) tp->t_fb->tfb_tcp_output(tp); 1706 INP_WUNLOCK(tp->t_inpcb); 1707 m_freem(m); 1708 return; 1709 1710dropwithreset: 1711 if (ti_locked == TI_RLOCKED) { 1712 INP_INFO_RUNLOCK(&V_tcbinfo); 1713 } 1714 ti_locked = TI_UNLOCKED; 1715 1716 if (tp != NULL) { 1717 tcp_dropwithreset(m, th, tp, tlen, rstreason); 1718 INP_WUNLOCK(tp->t_inpcb); 1719 } else 1720 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1721 return; 1722 1723drop: 1724 if (ti_locked == TI_RLOCKED) { 1725 INP_INFO_RUNLOCK(&V_tcbinfo); 1726 ti_locked = TI_UNLOCKED; 1727 } 1728#ifdef INVARIANTS 1729 else 1730 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1731#endif 1732 1733 /* 1734 * Drop space held by incoming segment and return. 1735 */ 1736#ifdef TCPDEBUG 1737 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 1738 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 1739 &tcp_savetcp, 0); 1740#endif 1741 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); 1742 if (tp != NULL) 1743 INP_WUNLOCK(tp->t_inpcb); 1744 m_freem(m); 1745} 1746 1747 1748/* 1749 * Do fast slow is a combination of the original 1750 * tcp_dosegment and a split fastpath, one function 1751 * for the fast-ack which also includes allowing fastpath 1752 * for window advanced in sequence acks. And also a 1753 * sub-function that handles the insequence data. 1754 */ 1755void 1756tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so, 1757 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, 1758 int ti_locked) 1759{ 1760 int thflags; 1761 u_long tiwin; 1762 char *s; 1763 int can_enter; 1764 struct in_conninfo *inc; 1765 struct tcpopt to; 1766 1767 thflags = th->th_flags; 1768 tp->sackhint.last_sack_ack = 0; 1769 inc = &tp->t_inpcb->inp_inc; 1770 /* 1771 * If this is either a state-changing packet or current state isn't 1772 * established, we require a write lock on tcbinfo. Otherwise, we 1773 * allow the tcbinfo to be in either alocked or unlocked, as the 1774 * caller may have unnecessarily acquired a write lock due to a race. 1775 */ 1776 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 1777 tp->t_state != TCPS_ESTABLISHED) { 1778 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 1779 "SYN/FIN/RST/!EST", __func__, ti_locked)); 1780 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1781 } else { 1782#ifdef INVARIANTS 1783 if (ti_locked == TI_RLOCKED) { 1784 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1785 } else { 1786 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 1787 "ti_locked: %d", __func__, ti_locked)); 1788 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1789 } 1790#endif 1791 } 1792 INP_WLOCK_ASSERT(tp->t_inpcb); 1793 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 1794 __func__)); 1795 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 1796 __func__)); 1797 1798 /* 1799 * Segment received on connection. 1800 * Reset idle time and keep-alive timer. 1801 * XXX: This should be done after segment 1802 * validation to ignore broken/spoofed segs. 1803 */ 1804 tp->t_rcvtime = ticks; 1805 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1806 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 1807 1808 /* 1809 * Unscale the window into a 32-bit value. 1810 * For the SYN_SENT state the scale is zero. 1811 */ 1812 tiwin = th->th_win << tp->snd_scale; 1813 1814 /* 1815 * TCP ECN processing. 1816 */ 1817 if (tp->t_flags & TF_ECN_PERMIT) { 1818 if (thflags & TH_CWR) 1819 tp->t_flags &= ~TF_ECN_SND_ECE; 1820 switch (iptos & IPTOS_ECN_MASK) { 1821 case IPTOS_ECN_CE: 1822 tp->t_flags |= TF_ECN_SND_ECE; 1823 TCPSTAT_INC(tcps_ecn_ce); 1824 break; 1825 case IPTOS_ECN_ECT0: 1826 TCPSTAT_INC(tcps_ecn_ect0); 1827 break; 1828 case IPTOS_ECN_ECT1: 1829 TCPSTAT_INC(tcps_ecn_ect1); 1830 break; 1831 } 1832 /* Congestion experienced. */ 1833 if (thflags & TH_ECE) { 1834 cc_cong_signal(tp, th, CC_ECN); 1835 } 1836 } 1837 1838 /* 1839 * Parse options on any incoming segment. 1840 */ 1841 tcp_dooptions(&to, (u_char *)(th + 1), 1842 (th->th_off << 2) - sizeof(struct tcphdr), 1843 (thflags & TH_SYN) ? TO_SYN : 0); 1844 1845 /* 1846 * If echoed timestamp is later than the current time, 1847 * fall back to non RFC1323 RTT calculation. Normalize 1848 * timestamp if syncookies were used when this connection 1849 * was established. 1850 */ 1851 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 1852 to.to_tsecr -= tp->ts_offset; 1853 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) 1854 to.to_tsecr = 0; 1855 } 1856 /* 1857 * If timestamps were negotiated during SYN/ACK they should 1858 * appear on every segment during this session and vice versa. 1859 */ 1860 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { 1861 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1862 log(LOG_DEBUG, "%s; %s: Timestamp missing, " 1863 "no action\n", s, __func__); 1864 free(s, M_TCPLOG); 1865 } 1866 } 1867 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { 1868 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1869 log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 1870 "no action\n", s, __func__); 1871 free(s, M_TCPLOG); 1872 } 1873 } 1874 1875 /* 1876 * Process options only when we get SYN/ACK back. The SYN case 1877 * for incoming connections is handled in tcp_syncache. 1878 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 1879 * or <SYN,ACK>) segment itself is never scaled. 1880 * XXX this is traditional behavior, may need to be cleaned up. 1881 */ 1882 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 1883 if ((to.to_flags & TOF_SCALE) && 1884 (tp->t_flags & TF_REQ_SCALE)) { 1885 tp->t_flags |= TF_RCVD_SCALE; 1886 tp->snd_scale = to.to_wscale; 1887 } 1888 /* 1889 * Initial send window. It will be updated with 1890 * the next incoming segment to the scaled value. 1891 */ 1892 tp->snd_wnd = th->th_win; 1893 if (to.to_flags & TOF_TS) { 1894 tp->t_flags |= TF_RCVD_TSTMP; 1895 tp->ts_recent = to.to_tsval; 1896 tp->ts_recent_age = tcp_ts_getticks(); 1897 } 1898 if (to.to_flags & TOF_MSS) 1899 tcp_mss(tp, to.to_mss); 1900 if ((tp->t_flags & TF_SACK_PERMIT) && 1901 (to.to_flags & TOF_SACKPERM) == 0) 1902 tp->t_flags &= ~TF_SACK_PERMIT; 1903 } 1904 can_enter = 0; 1905 if (__predict_true((tlen == 0))) { 1906 /* 1907 * The ack moved forward and we have a window (non-zero) 1908 * <or> 1909 * The ack did not move forward, but the window increased. 1910 */ 1911 if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) || 1912 ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) { 1913 can_enter = 1; 1914 } 1915 } else { 1916 /* 1917 * Data incoming, use the old entry criteria 1918 * for fast-path with data. 1919 */ 1920 if ((tiwin && tiwin == tp->snd_wnd)) { 1921 can_enter = 1; 1922 } 1923 } 1924 /* 1925 * Header prediction: check for the two common cases 1926 * of a uni-directional data xfer. If the packet has 1927 * no control flags, is in-sequence, the window didn't 1928 * change and we're not retransmitting, it's a 1929 * candidate. If the length is zero and the ack moved 1930 * forward, we're the sender side of the xfer. Just 1931 * free the data acked & wake any higher level process 1932 * that was blocked waiting for space. If the length 1933 * is non-zero and the ack didn't move, we're the 1934 * receiver side. If we're getting packets in-order 1935 * (the reassembly queue is empty), add the data to 1936 * the socket buffer and note that we need a delayed ack. 1937 * Make sure that the hidden state-flags are also off. 1938 * Since we check for TCPS_ESTABLISHED first, it can only 1939 * be TH_NEEDSYN. 1940 */ 1941 if (__predict_true(tp->t_state == TCPS_ESTABLISHED && 1942 th->th_seq == tp->rcv_nxt && 1943 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1944 tp->snd_nxt == tp->snd_max && 1945 can_enter && 1946 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 1947 LIST_EMPTY(&tp->t_segq) && 1948 ((to.to_flags & TOF_TS) == 0 || 1949 TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) { 1950 if (__predict_true((tlen == 0) && 1951 (SEQ_LEQ(th->th_ack, tp->snd_max) && 1952 !IN_RECOVERY(tp->t_flags) && 1953 (to.to_flags & TOF_SACK) == 0 && 1954 TAILQ_EMPTY(&tp->snd_holes)))) { 1955 /* We are done */ 1956 tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, 1957 ti_locked, tiwin); 1958 return; 1959 } else if ((tlen) && 1960 (th->th_ack == tp->snd_una && 1961 tlen <= sbspace(&so->so_rcv))) { 1962 tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen, 1963 ti_locked, tiwin); 1964 /* We are done */ 1965 return; 1966 } 1967 } 1968 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, 1969 ti_locked, tiwin, thflags); 1970} 1971 1972 1973/* 1974 * This subfunction is used to try to highly optimize the 1975 * fast path. We again allow window updates that are 1976 * in sequence to remain in the fast-path. We also add 1977 * in the __predict's to attempt to help the compiler. 1978 * Note that if we return a 0, then we can *not* process 1979 * it and the caller should push the packet into the 1980 * slow-path. 1981 */ 1982static int 1983tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 1984 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 1985 int ti_locked, u_long tiwin) 1986{ 1987 int acked; 1988 int winup_only=0; 1989#ifdef TCPDEBUG 1990 /* 1991 * The size of tcp_saveipgen must be the size of the max ip header, 1992 * now IPv6. 1993 */ 1994 u_char tcp_saveipgen[IP6_HDR_LEN]; 1995 struct tcphdr tcp_savetcp; 1996 short ostate = 0; 1997#endif 1998 1999 2000 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 2001 /* Old ack, behind (or duplicate to) the last one rcv'd */ 2002 return (0); 2003 } 2004 if (__predict_false(th->th_ack == tp->snd_una) && 2005 __predict_false(tiwin <= tp->snd_wnd)) { 2006 /* duplicate ack <or> a shrinking dup ack with shrinking window */ 2007 return (0); 2008 } 2009 if (__predict_false(tiwin == 0)) { 2010 /* zero window */ 2011 return (0); 2012 } 2013 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 2014 /* Above what we have sent? */ 2015 return (0); 2016 } 2017 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 2018 /* We are retransmitting */ 2019 return (0); 2020 } 2021 if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) { 2022 /* We need a SYN or a FIN, unlikely.. */ 2023 return (0); 2024 } 2025 if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 2026 /* Timestamp is behind .. old ack with seq wrap? */ 2027 return (0); 2028 } 2029 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 2030 /* Still recovering */ 2031 return (0); 2032 } 2033 if (__predict_false(to->to_flags & TOF_SACK)) { 2034 /* Sack included in the ack.. */ 2035 return (0); 2036 } 2037 if (!TAILQ_EMPTY(&tp->snd_holes)) { 2038 /* We have sack holes on our scoreboard */ 2039 return (0); 2040 } 2041 /* Ok if we reach here, we can process a fast-ack */ 2042 2043 /* Did the window get updated? */ 2044 if (tiwin != tp->snd_wnd) { 2045 /* keep track of pure window updates */ 2046 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { 2047 winup_only = 1; 2048 TCPSTAT_INC(tcps_rcvwinupd); 2049 } 2050 tp->snd_wnd = tiwin; 2051 tp->snd_wl1 = th->th_seq; 2052 if (tp->snd_wnd > tp->max_sndwnd) 2053 tp->max_sndwnd = tp->snd_wnd; 2054 } 2055 /* 2056 * Pull snd_wl2 up to prevent seq wrap relative 2057 * to th_ack. 2058 */ 2059 tp->snd_wl2 = th->th_ack; 2060 /* 2061 * If last ACK falls within this segment's sequence numbers, 2062 * record the timestamp. 2063 * NOTE that the test is modified according to the latest 2064 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2065 */ 2066 if ((to->to_flags & TOF_TS) != 0 && 2067 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 2068 tp->ts_recent_age = tcp_ts_getticks(); 2069 tp->ts_recent = to->to_tsval; 2070 } 2071 /* 2072 * This is a pure ack for outstanding data. 2073 */ 2074 if (ti_locked == TI_RLOCKED) { 2075 INP_INFO_RUNLOCK(&V_tcbinfo); 2076 } 2077 ti_locked = TI_UNLOCKED; 2078 2079 TCPSTAT_INC(tcps_predack); 2080 2081 /* 2082 * "bad retransmit" recovery. 2083 */ 2084 if (tp->t_rxtshift == 1 && 2085 tp->t_flags & TF_PREVVALID && 2086 (int)(ticks - tp->t_badrxtwin) < 0) { 2087 cc_cong_signal(tp, th, CC_RTO_ERR); 2088 } 2089 2090 /* 2091 * Recalculate the transmit timer / rtt. 2092 * 2093 * Some boxes send broken timestamp replies 2094 * during the SYN+ACK phase, ignore 2095 * timestamps of 0 or we could calculate a 2096 * huge RTT and blow up the retransmit timer. 2097 */ 2098 if ((to->to_flags & TOF_TS) != 0 && 2099 to->to_tsecr) { 2100 u_int t; 2101 2102 t = tcp_ts_getticks() - to->to_tsecr; 2103 if (!tp->t_rttlow || tp->t_rttlow > t) 2104 tp->t_rttlow = t; 2105 tcp_xmit_timer(tp, 2106 TCP_TS_TO_TICKS(t) + 1); 2107 } else if (tp->t_rtttime && 2108 SEQ_GT(th->th_ack, tp->t_rtseq)) { 2109 if (!tp->t_rttlow || 2110 tp->t_rttlow > ticks - tp->t_rtttime) 2111 tp->t_rttlow = ticks - tp->t_rtttime; 2112 tcp_xmit_timer(tp, 2113 ticks - tp->t_rtttime); 2114 } 2115 if (winup_only == 0) { 2116 acked = BYTES_THIS_ACK(tp, th); 2117 2118 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 2119 hhook_run_tcp_est_in(tp, th, to); 2120 2121 TCPSTAT_ADD(tcps_rcvackbyte, acked); 2122 sbdrop(&so->so_snd, acked); 2123 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 2124 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2125 tp->snd_recover = th->th_ack - 1; 2126 2127 /* 2128 * Let the congestion control algorithm update 2129 * congestion control related information. This 2130 * typically means increasing the congestion 2131 * window. 2132 */ 2133 cc_ack_received(tp, th, CC_ACK); 2134 2135 tp->snd_una = th->th_ack; 2136 tp->t_dupacks = 0; 2137 m_freem(m); 2138 2139 /* 2140 * If all outstanding data are acked, stop 2141 * retransmit timer, otherwise restart timer 2142 * using current (possibly backed-off) value. 2143 * If process is waiting for space, 2144 * wakeup/selwakeup/signal. If data 2145 * are ready to send, let tcp_output 2146 * decide between more output or persist. 2147 */ 2148#ifdef TCPDEBUG 2149 if (so->so_options & SO_DEBUG) 2150 tcp_trace(TA_INPUT, ostate, tp, 2151 (void *)tcp_saveipgen, 2152 &tcp_savetcp, 0); 2153#endif 2154 if (tp->snd_una == tp->snd_max) 2155 tcp_timer_activate(tp, TT_REXMT, 0); 2156 else if (!tcp_timer_active(tp, TT_PERSIST)) 2157 tcp_timer_activate(tp, TT_REXMT, 2158 tp->t_rxtcur); 2159 /* Wake up the socket if we have room to write more */ 2160 sowwakeup(so); 2161 } else { 2162 /* 2163 * Window update only, just free the mbufs and 2164 * send out whatever we can. 2165 */ 2166 m_freem(m); 2167 } 2168 if (sbavail(&so->so_snd)) 2169 (void) tcp_output(tp); 2170 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 2171 __func__, ti_locked)); 2172 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2173 INP_WLOCK_ASSERT(tp->t_inpcb); 2174 2175 if (tp->t_flags & TF_DELACK) { 2176 tp->t_flags &= ~TF_DELACK; 2177 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 2178 } 2179 INP_WUNLOCK(tp->t_inpcb); 2180 return (1); 2181} 2182 2183/* 2184 * This tcp-do-segment concentrates on making the fastest 2185 * ack processing path. It does not have a fast-path for 2186 * data (it possibly could which would then eliminate the 2187 * need for fast-slow above). For a content distributor having 2188 * large outgoing elephants and very very little coming in 2189 * having no fastpath for data does not really help (since you 2190 * don't get much data in). The most important thing is 2191 * processing ack's quickly and getting the rest of the data 2192 * output to the peer as quickly as possible. This routine 2193 * seems to be about an overall 3% faster then the old 2194 * tcp_do_segment and keeps us in the fast-path for packets 2195 * much more (by allowing window updates to also stay in the fastpath). 2196 */ 2197void 2198tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 2199 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, 2200 int ti_locked) 2201{ 2202 int thflags; 2203 u_long tiwin; 2204 char *s; 2205 struct in_conninfo *inc; 2206 struct tcpopt to; 2207 2208 thflags = th->th_flags; 2209 tp->sackhint.last_sack_ack = 0; 2210 inc = &tp->t_inpcb->inp_inc; 2211 /* 2212 * If this is either a state-changing packet or current state isn't 2213 * established, we require a write lock on tcbinfo. Otherwise, we 2214 * allow the tcbinfo to be in either alocked or unlocked, as the 2215 * caller may have unnecessarily acquired a write lock due to a race. 2216 */ 2217 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 2218 tp->t_state != TCPS_ESTABLISHED) { 2219 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 2220 "SYN/FIN/RST/!EST", __func__, ti_locked)); 2221 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2222 } else { 2223#ifdef INVARIANTS 2224 if (ti_locked == TI_RLOCKED) { 2225 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2226 } else { 2227 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 2228 "ti_locked: %d", __func__, ti_locked)); 2229 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2230 } 2231#endif 2232 } 2233 INP_WLOCK_ASSERT(tp->t_inpcb); 2234 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 2235 __func__)); 2236 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 2237 __func__)); 2238 2239 /* 2240 * Segment received on connection. 2241 * Reset idle time and keep-alive timer. 2242 * XXX: This should be done after segment 2243 * validation to ignore broken/spoofed segs. 2244 */ 2245 tp->t_rcvtime = ticks; 2246 if (TCPS_HAVEESTABLISHED(tp->t_state)) 2247 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 2248 2249 /* 2250 * Unscale the window into a 32-bit value. 2251 * For the SYN_SENT state the scale is zero. 2252 */ 2253 tiwin = th->th_win << tp->snd_scale; 2254 2255 /* 2256 * TCP ECN processing. 2257 */ 2258 if (tp->t_flags & TF_ECN_PERMIT) { 2259 if (thflags & TH_CWR) 2260 tp->t_flags &= ~TF_ECN_SND_ECE; 2261 switch (iptos & IPTOS_ECN_MASK) { 2262 case IPTOS_ECN_CE: 2263 tp->t_flags |= TF_ECN_SND_ECE; 2264 TCPSTAT_INC(tcps_ecn_ce); 2265 break; 2266 case IPTOS_ECN_ECT0: 2267 TCPSTAT_INC(tcps_ecn_ect0); 2268 break; 2269 case IPTOS_ECN_ECT1: 2270 TCPSTAT_INC(tcps_ecn_ect1); 2271 break; 2272 } 2273 /* Congestion experienced. */ 2274 if (thflags & TH_ECE) { 2275 cc_cong_signal(tp, th, CC_ECN); 2276 } 2277 } 2278 2279 /* 2280 * Parse options on any incoming segment. 2281 */ 2282 tcp_dooptions(&to, (u_char *)(th + 1), 2283 (th->th_off << 2) - sizeof(struct tcphdr), 2284 (thflags & TH_SYN) ? TO_SYN : 0); 2285 2286 /* 2287 * If echoed timestamp is later than the current time, 2288 * fall back to non RFC1323 RTT calculation. Normalize 2289 * timestamp if syncookies were used when this connection 2290 * was established. 2291 */ 2292 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 2293 to.to_tsecr -= tp->ts_offset; 2294 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) 2295 to.to_tsecr = 0; 2296 } 2297 /* 2298 * If timestamps were negotiated during SYN/ACK they should 2299 * appear on every segment during this session and vice versa. 2300 */ 2301 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { 2302 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2303 log(LOG_DEBUG, "%s; %s: Timestamp missing, " 2304 "no action\n", s, __func__); 2305 free(s, M_TCPLOG); 2306 } 2307 } 2308 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { 2309 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2310 log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 2311 "no action\n", s, __func__); 2312 free(s, M_TCPLOG); 2313 } 2314 } 2315 2316 /* 2317 * Process options only when we get SYN/ACK back. The SYN case 2318 * for incoming connections is handled in tcp_syncache. 2319 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 2320 * or <SYN,ACK>) segment itself is never scaled. 2321 * XXX this is traditional behavior, may need to be cleaned up. 2322 */ 2323 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 2324 if ((to.to_flags & TOF_SCALE) && 2325 (tp->t_flags & TF_REQ_SCALE)) { 2326 tp->t_flags |= TF_RCVD_SCALE; 2327 tp->snd_scale = to.to_wscale; 2328 } 2329 /* 2330 * Initial send window. It will be updated with 2331 * the next incoming segment to the scaled value. 2332 */ 2333 tp->snd_wnd = th->th_win; 2334 if (to.to_flags & TOF_TS) { 2335 tp->t_flags |= TF_RCVD_TSTMP; 2336 tp->ts_recent = to.to_tsval; 2337 tp->ts_recent_age = tcp_ts_getticks(); 2338 } 2339 if (to.to_flags & TOF_MSS) 2340 tcp_mss(tp, to.to_mss); 2341 if ((tp->t_flags & TF_SACK_PERMIT) && 2342 (to.to_flags & TOF_SACKPERM) == 0) 2343 tp->t_flags &= ~TF_SACK_PERMIT; 2344 } 2345 /* 2346 * Header prediction: check for the two common cases 2347 * of a uni-directional data xfer. If the packet has 2348 * no control flags, is in-sequence, the window didn't 2349 * change and we're not retransmitting, it's a 2350 * candidate. If the length is zero and the ack moved 2351 * forward, we're the sender side of the xfer. Just 2352 * free the data acked & wake any higher level process 2353 * that was blocked waiting for space. If the length 2354 * is non-zero and the ack didn't move, we're the 2355 * receiver side. If we're getting packets in-order 2356 * (the reassembly queue is empty), add the data to 2357 * the socket buffer and note that we need a delayed ack. 2358 * Make sure that the hidden state-flags are also off. 2359 * Since we check for TCPS_ESTABLISHED first, it can only 2360 * be TH_NEEDSYN. 2361 */ 2362 if (__predict_true(tp->t_state == TCPS_ESTABLISHED) && 2363 __predict_true(((to.to_flags & TOF_SACK) == 0)) && 2364 __predict_true(tlen == 0) && 2365 __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) && 2366 __predict_true(LIST_EMPTY(&tp->t_segq)) && 2367 __predict_true(th->th_seq == tp->rcv_nxt)) { 2368 if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, 2369 ti_locked, tiwin)) { 2370 return; 2371 } 2372 } 2373 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, 2374 ti_locked, tiwin, thflags); 2375} 2376 2377struct tcp_function_block __tcp_fastslow = { 2378 "fastslow", 2379 tcp_output, 2380 tcp_do_segment_fastslow, 2381 tcp_default_ctloutput, 2382 NULL, 2383 NULL, 2384 NULL, 2385 NULL, 2386 NULL, 2387 NULL, 2388 NULL, 2389 0, 2390 0 2391 2392}; 2393 2394struct tcp_function_block __tcp_fastack = { 2395 "fastack", 2396 tcp_output, 2397 tcp_do_segment_fastack, 2398 tcp_default_ctloutput, 2399 NULL, 2400 NULL, 2401 NULL, 2402 NULL, 2403 NULL, 2404 NULL, 2405 NULL, 2406 0, 2407 0 2408}; 2409 2410static int 2411tcp_addfastpaths(module_t mod, int type, void *data) 2412{ 2413 int err=0; 2414 2415 switch (type) { 2416 case MOD_LOAD: 2417 err = register_tcp_functions(&__tcp_fastack, M_WAITOK); 2418 if (err) { 2419 printf("Failed to register fastack module -- err:%d\n", err); 2420 return(err); 2421 } 2422 err = register_tcp_functions(&__tcp_fastslow, M_WAITOK); 2423 if (err) { 2424 printf("Failed to register fastslow module -- err:%d\n", err); 2425 deregister_tcp_functions(&__tcp_fastack); 2426 return(err); 2427 } 2428 break; 2429 case MOD_QUIESCE: 2430 if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) { 2431 return(EBUSY); 2432 } 2433 break; 2434 case MOD_UNLOAD: 2435 err = deregister_tcp_functions(&__tcp_fastack); 2436 if (err == EBUSY) 2437 break; 2438 err = deregister_tcp_functions(&__tcp_fastslow); 2439 if (err == EBUSY) 2440 break; 2441 err = 0; 2442 break; 2443 default: 2444 return (EOPNOTSUPP); 2445 } 2446 return (err); 2447} 2448 2449static moduledata_t new_tcp_fastpaths = { 2450 .name = "tcp_fastpaths", 2451 .evhand = tcp_addfastpaths, 2452 .priv = 0 2453}; 2454 2455MODULE_VERSION(kern_tcpfastpaths, 1); 2456DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PSEUDO, SI_ORDER_ANY);
| 111#ifdef TCPDEBUG 112#include <netinet/tcp_debug.h> 113#endif /* TCPDEBUG */ 114#ifdef TCP_OFFLOAD 115#include <netinet/tcp_offload.h> 116#endif 117 118#ifdef IPSEC 119#include <netipsec/ipsec.h> 120#include <netipsec/ipsec6.h> 121#endif /*IPSEC*/ 122 123#include <machine/in_cksum.h> 124 125#include <security/mac/mac_framework.h> 126 127const int tcprexmtthresh; 128 129VNET_DECLARE(int, tcp_autorcvbuf_inc); 130#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 131VNET_DECLARE(int, tcp_autorcvbuf_max); 132#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 133VNET_DECLARE(int, tcp_do_rfc3042); 134#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) 135VNET_DECLARE(int, tcp_do_autorcvbuf); 136#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 137VNET_DECLARE(int, tcp_insecure_rst); 138#define V_tcp_insecure_rst VNET(tcp_insecure_rst) 139VNET_DECLARE(int, tcp_insecure_syn); 140#define V_tcp_insecure_syn VNET(tcp_insecure_syn) 141 142static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *, 143 struct socket *, struct tcpcb *, int, int, uint8_t, 144 int); 145 146static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *, 147 struct socket *, struct tcpcb *, int, int, uint8_t, 148 int); 149 150/* 151 * Indicate whether this ack should be delayed. We can delay the ack if 152 * following conditions are met: 153 * - There is no delayed ack timer in progress. 154 * - Our last ack wasn't a 0-sized window. We never want to delay 155 * the ack that opens up a 0-sized window. 156 * - LRO wasn't used for this segment. We make sure by checking that the 157 * segment size is not larger than the MSS. 158 */ 159#define DELAY_ACK(tp, tlen) \ 160 ((!tcp_timer_active(tp, TT_DELACK) && \ 161 (tp->t_flags & TF_RXWIN0SENT) == 0) && \ 162 (tlen <= tp->t_maxseg) && \ 163 (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) 164 165/* 166 * So how is this faster than the normal fast ack? 167 * It basically allows us to also stay in the fastpath 168 * when a window-update ack also arrives. In testing 169 * we saw only 25-30% of connections doing fastpath 170 * due to the fact that along with moving forward 171 * in sequence the window was also updated. 172 */ 173static void 174tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 175 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 176 int ti_locked, u_long tiwin) 177{ 178 int acked; 179 int winup_only=0; 180#ifdef TCPDEBUG 181 /* 182 * The size of tcp_saveipgen must be the size of the max ip header, 183 * now IPv6. 184 */ 185 u_char tcp_saveipgen[IP6_HDR_LEN]; 186 struct tcphdr tcp_savetcp; 187 short ostate = 0; 188#endif 189 /* 190 * The following if statment will be true if 191 * we are doing the win_up_in_fp <and> 192 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or> 193 * - No more new data, but we have an ack for new data 194 * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) 195 * - No more new data, the same ack point but the window grew 196 * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd) 197 */ 198 if ((SEQ_LT(tp->snd_wl1, th->th_seq) || 199 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 200 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 201 /* keep track of pure window updates */ 202 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { 203 winup_only = 1; 204 TCPSTAT_INC(tcps_rcvwinupd); 205 } 206 tp->snd_wnd = tiwin; 207 tp->snd_wl1 = th->th_seq; 208 tp->snd_wl2 = th->th_ack; 209 if (tp->snd_wnd > tp->max_sndwnd) 210 tp->max_sndwnd = tp->snd_wnd; 211 } 212 /* 213 * If last ACK falls within this segment's sequence numbers, 214 * record the timestamp. 215 * NOTE that the test is modified according to the latest 216 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 217 */ 218 if ((to->to_flags & TOF_TS) != 0 && 219 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 220 tp->ts_recent_age = tcp_ts_getticks(); 221 tp->ts_recent = to->to_tsval; 222 } 223 /* 224 * This is a pure ack for outstanding data. 225 */ 226 if (ti_locked == TI_RLOCKED) { 227 INP_INFO_RUNLOCK(&V_tcbinfo); 228 } 229 ti_locked = TI_UNLOCKED; 230 231 TCPSTAT_INC(tcps_predack); 232 233 /* 234 * "bad retransmit" recovery. 235 */ 236 if (tp->t_rxtshift == 1 && 237 tp->t_flags & TF_PREVVALID && 238 (int)(ticks - tp->t_badrxtwin) < 0) { 239 cc_cong_signal(tp, th, CC_RTO_ERR); 240 } 241 242 /* 243 * Recalculate the transmit timer / rtt. 244 * 245 * Some boxes send broken timestamp replies 246 * during the SYN+ACK phase, ignore 247 * timestamps of 0 or we could calculate a 248 * huge RTT and blow up the retransmit timer. 249 */ 250 if ((to->to_flags & TOF_TS) != 0 && 251 to->to_tsecr) { 252 u_int t; 253 254 t = tcp_ts_getticks() - to->to_tsecr; 255 if (!tp->t_rttlow || tp->t_rttlow > t) 256 tp->t_rttlow = t; 257 tcp_xmit_timer(tp, 258 TCP_TS_TO_TICKS(t) + 1); 259 } else if (tp->t_rtttime && 260 SEQ_GT(th->th_ack, tp->t_rtseq)) { 261 if (!tp->t_rttlow || 262 tp->t_rttlow > ticks - tp->t_rtttime) 263 tp->t_rttlow = ticks - tp->t_rtttime; 264 tcp_xmit_timer(tp, 265 ticks - tp->t_rtttime); 266 } 267 if (winup_only == 0) { 268 acked = BYTES_THIS_ACK(tp, th); 269 270 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 271 hhook_run_tcp_est_in(tp, th, to); 272 273 TCPSTAT_ADD(tcps_rcvackbyte, acked); 274 sbdrop(&so->so_snd, acked); 275 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 276 SEQ_LEQ(th->th_ack, tp->snd_recover)) 277 tp->snd_recover = th->th_ack - 1; 278 279 /* 280 * Let the congestion control algorithm update 281 * congestion control related information. This 282 * typically means increasing the congestion 283 * window. 284 */ 285 cc_ack_received(tp, th, CC_ACK); 286 287 tp->snd_una = th->th_ack; 288 /* 289 * Pull snd_wl2 up to prevent seq wrap relative 290 * to th_ack. 291 */ 292 tp->snd_wl2 = th->th_ack; 293 tp->t_dupacks = 0; 294 m_freem(m); 295 296 /* 297 * If all outstanding data are acked, stop 298 * retransmit timer, otherwise restart timer 299 * using current (possibly backed-off) value. 300 * If process is waiting for space, 301 * wakeup/selwakeup/signal. If data 302 * are ready to send, let tcp_output 303 * decide between more output or persist. 304 */ 305#ifdef TCPDEBUG 306 if (so->so_options & SO_DEBUG) 307 tcp_trace(TA_INPUT, ostate, tp, 308 (void *)tcp_saveipgen, 309 &tcp_savetcp, 0); 310#endif 311 if (tp->snd_una == tp->snd_max) 312 tcp_timer_activate(tp, TT_REXMT, 0); 313 else if (!tcp_timer_active(tp, TT_PERSIST)) 314 tcp_timer_activate(tp, TT_REXMT, 315 tp->t_rxtcur); 316 } else { 317 /* 318 * Window update only, just free the mbufs and 319 * send out whatever we can. 320 */ 321 m_freem(m); 322 } 323 sowwakeup(so); 324 if (sbavail(&so->so_snd)) 325 (void) tcp_output(tp); 326 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 327 __func__, ti_locked)); 328 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 329 INP_WLOCK_ASSERT(tp->t_inpcb); 330 331 if (tp->t_flags & TF_DELACK) { 332 tp->t_flags &= ~TF_DELACK; 333 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 334 } 335 INP_WUNLOCK(tp->t_inpcb); 336} 337 338/* 339 * Here nothing is really faster, its just that we 340 * have broken out the fast-data path also just like 341 * the fast-ack. 342 */ 343static void 344tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 345 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 346 int ti_locked, u_long tiwin) 347{ 348 int newsize = 0; /* automatic sockbuf scaling */ 349#ifdef TCPDEBUG 350 /* 351 * The size of tcp_saveipgen must be the size of the max ip header, 352 * now IPv6. 353 */ 354 u_char tcp_saveipgen[IP6_HDR_LEN]; 355 struct tcphdr tcp_savetcp; 356 short ostate = 0; 357#endif 358 /* 359 * If last ACK falls within this segment's sequence numbers, 360 * record the timestamp. 361 * NOTE that the test is modified according to the latest 362 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 363 */ 364 if ((to->to_flags & TOF_TS) != 0 && 365 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 366 tp->ts_recent_age = tcp_ts_getticks(); 367 tp->ts_recent = to->to_tsval; 368 } 369 370 /* 371 * This is a pure, in-sequence data packet with 372 * nothing on the reassembly queue and we have enough 373 * buffer space to take it. 374 */ 375 if (ti_locked == TI_RLOCKED) { 376 INP_INFO_RUNLOCK(&V_tcbinfo); 377 } 378 ti_locked = TI_UNLOCKED; 379 380 /* Clean receiver SACK report if present */ 381 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) 382 tcp_clean_sackreport(tp); 383 TCPSTAT_INC(tcps_preddat); 384 tp->rcv_nxt += tlen; 385 /* 386 * Pull snd_wl1 up to prevent seq wrap relative to 387 * th_seq. 388 */ 389 tp->snd_wl1 = th->th_seq; 390 /* 391 * Pull rcv_up up to prevent seq wrap relative to 392 * rcv_nxt. 393 */ 394 tp->rcv_up = tp->rcv_nxt; 395 TCPSTAT_ADD(tcps_rcvbyte, tlen); 396#ifdef TCPDEBUG 397 if (so->so_options & SO_DEBUG) 398 tcp_trace(TA_INPUT, ostate, tp, 399 (void *)tcp_saveipgen, &tcp_savetcp, 0); 400#endif 401 /* 402 * Automatic sizing of receive socket buffer. Often the send 403 * buffer size is not optimally adjusted to the actual network 404 * conditions at hand (delay bandwidth product). Setting the 405 * buffer size too small limits throughput on links with high 406 * bandwidth and high delay (eg. trans-continental/oceanic links). 407 * 408 * On the receive side the socket buffer memory is only rarely 409 * used to any significant extent. This allows us to be much 410 * more aggressive in scaling the receive socket buffer. For 411 * the case that the buffer space is actually used to a large 412 * extent and we run out of kernel memory we can simply drop 413 * the new segments; TCP on the sender will just retransmit it 414 * later. Setting the buffer size too big may only consume too 415 * much kernel memory if the application doesn't read() from 416 * the socket or packet loss or reordering makes use of the 417 * reassembly queue. 418 * 419 * The criteria to step up the receive buffer one notch are: 420 * 1. Application has not set receive buffer size with 421 * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. 422 * 2. the number of bytes received during the time it takes 423 * one timestamp to be reflected back to us (the RTT); 424 * 3. received bytes per RTT is within seven eighth of the 425 * current socket buffer size; 426 * 4. receive buffer size has not hit maximal automatic size; 427 * 428 * This algorithm does one step per RTT at most and only if 429 * we receive a bulk stream w/o packet losses or reorderings. 430 * Shrinking the buffer during idle times is not necessary as 431 * it doesn't consume any memory when idle. 432 * 433 * TODO: Only step up if the application is actually serving 434 * the buffer to better manage the socket buffer resources. 435 */ 436 if (V_tcp_do_autorcvbuf && 437 (to->to_flags & TOF_TS) && 438 to->to_tsecr && 439 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 440 if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) && 441 to->to_tsecr - tp->rfbuf_ts < hz) { 442 if (tp->rfbuf_cnt > 443 (so->so_rcv.sb_hiwat / 8 * 7) && 444 so->so_rcv.sb_hiwat < 445 V_tcp_autorcvbuf_max) { 446 newsize = 447 min(so->so_rcv.sb_hiwat + 448 V_tcp_autorcvbuf_inc, 449 V_tcp_autorcvbuf_max); 450 } 451 /* Start over with next RTT. */ 452 tp->rfbuf_ts = 0; 453 tp->rfbuf_cnt = 0; 454 } else 455 tp->rfbuf_cnt += tlen; /* add up */ 456 } 457 458 /* Add data to socket buffer. */ 459 SOCKBUF_LOCK(&so->so_rcv); 460 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 461 m_freem(m); 462 } else { 463 /* 464 * Set new socket buffer size. 465 * Give up when limit is reached. 466 */ 467 if (newsize) 468 if (!sbreserve_locked(&so->so_rcv, 469 newsize, so, NULL)) 470 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 471 m_adj(m, drop_hdrlen); /* delayed header drop */ 472 sbappendstream_locked(&so->so_rcv, m, 0); 473 } 474 /* NB: sorwakeup_locked() does an implicit unlock. */ 475 sorwakeup_locked(so); 476 if (DELAY_ACK(tp, tlen)) { 477 tp->t_flags |= TF_DELACK; 478 } else { 479 tp->t_flags |= TF_ACKNOW; 480 tcp_output(tp); 481 } 482 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 483 __func__, ti_locked)); 484 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 485 INP_WLOCK_ASSERT(tp->t_inpcb); 486 487 if (tp->t_flags & TF_DELACK) { 488 tp->t_flags &= ~TF_DELACK; 489 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 490 } 491 INP_WUNLOCK(tp->t_inpcb); 492} 493 494/* 495 * The slow-path is the clone of the long long part 496 * of tcp_do_segment past all the fast-path stuff. We 497 * use it here by two different callers, the fast/slow and 498 * the fastack only. 499 */ 500static void 501tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so, 502 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 503 int ti_locked, u_long tiwin, int thflags) 504{ 505 int acked, ourfinisacked, needoutput = 0; 506 int rstreason, todrop, win; 507 char *s; 508 struct in_conninfo *inc; 509 struct mbuf *mfree = NULL; 510#ifdef TCPDEBUG 511 /* 512 * The size of tcp_saveipgen must be the size of the max ip header, 513 * now IPv6. 514 */ 515 u_char tcp_saveipgen[IP6_HDR_LEN]; 516 struct tcphdr tcp_savetcp; 517 short ostate = 0; 518#endif 519 /* 520 * Calculate amount of space in receive window, 521 * and then do TCP input processing. 522 * Receive window is amount of space in rcv queue, 523 * but not less than advertised window. 524 */ 525 inc = &tp->t_inpcb->inp_inc; 526 win = sbspace(&so->so_rcv); 527 if (win < 0) 528 win = 0; 529 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 530 531 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 532 tp->rfbuf_ts = 0; 533 tp->rfbuf_cnt = 0; 534 535 switch (tp->t_state) { 536 537 /* 538 * If the state is SYN_RECEIVED: 539 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 540 */ 541 case TCPS_SYN_RECEIVED: 542 if ((thflags & TH_ACK) && 543 (SEQ_LEQ(th->th_ack, tp->snd_una) || 544 SEQ_GT(th->th_ack, tp->snd_max))) { 545 rstreason = BANDLIM_RST_OPENPORT; 546 goto dropwithreset; 547 } 548 break; 549 550 /* 551 * If the state is SYN_SENT: 552 * if seg contains an ACK, but not for our SYN, drop the input. 553 * if seg contains a RST, then drop the connection. 554 * if seg does not contain SYN, then drop it. 555 * Otherwise this is an acceptable SYN segment 556 * initialize tp->rcv_nxt and tp->irs 557 * if seg contains ack then advance tp->snd_una 558 * if seg contains an ECE and ECN support is enabled, the stream 559 * is ECN capable. 560 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 561 * arrange for segment to be acked (eventually) 562 * continue processing rest of data/controls, beginning with URG 563 */ 564 case TCPS_SYN_SENT: 565 if ((thflags & TH_ACK) && 566 (SEQ_LEQ(th->th_ack, tp->iss) || 567 SEQ_GT(th->th_ack, tp->snd_max))) { 568 rstreason = BANDLIM_UNLIMITED; 569 goto dropwithreset; 570 } 571 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { 572 TCP_PROBE5(connect__refused, NULL, tp, 573 mtod(m, const char *), tp, th); 574 tp = tcp_drop(tp, ECONNREFUSED); 575 } 576 if (thflags & TH_RST) 577 goto drop; 578 if (!(thflags & TH_SYN)) 579 goto drop; 580 581 tp->irs = th->th_seq; 582 tcp_rcvseqinit(tp); 583 if (thflags & TH_ACK) { 584 TCPSTAT_INC(tcps_connects); 585 soisconnected(so); 586#ifdef MAC 587 mac_socketpeer_set_from_mbuf(m, so); 588#endif 589 /* Do window scaling on this connection? */ 590 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 591 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 592 tp->rcv_scale = tp->request_r_scale; 593 } 594 tp->rcv_adv += imin(tp->rcv_wnd, 595 TCP_MAXWIN << tp->rcv_scale); 596 tp->snd_una++; /* SYN is acked */ 597 /* 598 * If there's data, delay ACK; if there's also a FIN 599 * ACKNOW will be turned on later. 600 */ 601 if (DELAY_ACK(tp, tlen) && tlen != 0) 602 tcp_timer_activate(tp, TT_DELACK, 603 tcp_delacktime); 604 else 605 tp->t_flags |= TF_ACKNOW; 606 607 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 608 tp->t_flags |= TF_ECN_PERMIT; 609 TCPSTAT_INC(tcps_ecn_shs); 610 } 611 612 /* 613 * Received <SYN,ACK> in SYN_SENT[*] state. 614 * Transitions: 615 * SYN_SENT --> ESTABLISHED 616 * SYN_SENT* --> FIN_WAIT_1 617 */ 618 tp->t_starttime = ticks; 619 if (tp->t_flags & TF_NEEDFIN) { 620 tcp_state_change(tp, TCPS_FIN_WAIT_1); 621 tp->t_flags &= ~TF_NEEDFIN; 622 thflags &= ~TH_SYN; 623 } else { 624 tcp_state_change(tp, TCPS_ESTABLISHED); 625 TCP_PROBE5(connect__established, NULL, tp, 626 mtod(m, const char *), tp, th); 627 cc_conn_init(tp); 628 tcp_timer_activate(tp, TT_KEEP, 629 TP_KEEPIDLE(tp)); 630 } 631 } else { 632 /* 633 * Received initial SYN in SYN-SENT[*] state => 634 * simultaneous open. 635 * If it succeeds, connection is * half-synchronized. 636 * Otherwise, do 3-way handshake: 637 * SYN-SENT -> SYN-RECEIVED 638 * SYN-SENT* -> SYN-RECEIVED* 639 */ 640 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 641 tcp_timer_activate(tp, TT_REXMT, 0); 642 tcp_state_change(tp, TCPS_SYN_RECEIVED); 643 } 644 645 KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " 646 "ti_locked %d", __func__, ti_locked)); 647 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 648 INP_WLOCK_ASSERT(tp->t_inpcb); 649 650 /* 651 * Advance th->th_seq to correspond to first data byte. 652 * If data, trim to stay within window, 653 * dropping FIN if necessary. 654 */ 655 th->th_seq++; 656 if (tlen > tp->rcv_wnd) { 657 todrop = tlen - tp->rcv_wnd; 658 m_adj(m, -todrop); 659 tlen = tp->rcv_wnd; 660 thflags &= ~TH_FIN; 661 TCPSTAT_INC(tcps_rcvpackafterwin); 662 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 663 } 664 tp->snd_wl1 = th->th_seq - 1; 665 tp->rcv_up = th->th_seq; 666 /* 667 * Client side of transaction: already sent SYN and data. 668 * If the remote host used T/TCP to validate the SYN, 669 * our data will be ACK'd; if so, enter normal data segment 670 * processing in the middle of step 5, ack processing. 671 * Otherwise, goto step 6. 672 */ 673 if (thflags & TH_ACK) 674 goto process_ACK; 675 676 goto step6; 677 678 /* 679 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 680 * do normal processing. 681 * 682 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 683 */ 684 case TCPS_LAST_ACK: 685 case TCPS_CLOSING: 686 break; /* continue normal processing */ 687 } 688 689 /* 690 * States other than LISTEN or SYN_SENT. 691 * First check the RST flag and sequence number since reset segments 692 * are exempt from the timestamp and connection count tests. This 693 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 694 * below which allowed reset segments in half the sequence space 695 * to fall though and be processed (which gives forged reset 696 * segments with a random sequence number a 50 percent chance of 697 * killing a connection). 698 * Then check timestamp, if present. 699 * Then check the connection count, if present. 700 * Then check that at least some bytes of segment are within 701 * receive window. If segment begins before rcv_nxt, 702 * drop leading data (and SYN); if nothing left, just ack. 703 */ 704 if (thflags & TH_RST) { 705 /* 706 * RFC5961 Section 3.2 707 * 708 * - RST drops connection only if SEG.SEQ == RCV.NXT. 709 * - If RST is in window, we send challenge ACK. 710 * 711 * Note: to take into account delayed ACKs, we should 712 * test against last_ack_sent instead of rcv_nxt. 713 * Note 2: we handle special case of closed window, not 714 * covered by the RFC. 715 */ 716 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 717 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 718 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 719 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 720 KASSERT(ti_locked == TI_RLOCKED, 721 ("%s: TH_RST ti_locked %d, th %p tp %p", 722 __func__, ti_locked, th, tp)); 723 KASSERT(tp->t_state != TCPS_SYN_SENT, 724 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 725 __func__, th, tp)); 726 727 if (V_tcp_insecure_rst || 728 tp->last_ack_sent == th->th_seq) { 729 TCPSTAT_INC(tcps_drops); 730 /* Drop the connection. */ 731 switch (tp->t_state) { 732 case TCPS_SYN_RECEIVED: 733 so->so_error = ECONNREFUSED; 734 goto close; 735 case TCPS_ESTABLISHED: 736 case TCPS_FIN_WAIT_1: 737 case TCPS_FIN_WAIT_2: 738 case TCPS_CLOSE_WAIT: 739 so->so_error = ECONNRESET; 740 close: 741 tcp_state_change(tp, TCPS_CLOSED); 742 /* FALLTHROUGH */ 743 default: 744 tp = tcp_close(tp); 745 } 746 } else { 747 TCPSTAT_INC(tcps_badrst); 748 /* Send challenge ACK. */ 749 tcp_respond(tp, mtod(m, void *), th, m, 750 tp->rcv_nxt, tp->snd_nxt, TH_ACK); 751 tp->last_ack_sent = tp->rcv_nxt; 752 m = NULL; 753 } 754 } 755 goto drop; 756 } 757 758 /* 759 * RFC5961 Section 4.2 760 * Send challenge ACK for any SYN in synchronized state. 761 */ 762 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { 763 KASSERT(ti_locked == TI_RLOCKED, 764 ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); 765 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 766 767 TCPSTAT_INC(tcps_badsyn); 768 if (V_tcp_insecure_syn && 769 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 770 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 771 tp = tcp_drop(tp, ECONNRESET); 772 rstreason = BANDLIM_UNLIMITED; 773 } else { 774 /* Send challenge ACK. */ 775 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 776 tp->snd_nxt, TH_ACK); 777 tp->last_ack_sent = tp->rcv_nxt; 778 m = NULL; 779 } 780 goto drop; 781 } 782 783 /* 784 * RFC 1323 PAWS: If we have a timestamp reply on this segment 785 * and it's less than ts_recent, drop it. 786 */ 787 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 788 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 789 790 /* Check to see if ts_recent is over 24 days old. */ 791 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 792 /* 793 * Invalidate ts_recent. If this segment updates 794 * ts_recent, the age will be reset later and ts_recent 795 * will get a valid value. If it does not, setting 796 * ts_recent to zero will at least satisfy the 797 * requirement that zero be placed in the timestamp 798 * echo reply when ts_recent isn't valid. The 799 * age isn't reset until we get a valid ts_recent 800 * because we don't want out-of-order segments to be 801 * dropped when ts_recent is old. 802 */ 803 tp->ts_recent = 0; 804 } else { 805 TCPSTAT_INC(tcps_rcvduppack); 806 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 807 TCPSTAT_INC(tcps_pawsdrop); 808 if (tlen) 809 goto dropafterack; 810 goto drop; 811 } 812 } 813 814 /* 815 * In the SYN-RECEIVED state, validate that the packet belongs to 816 * this connection before trimming the data to fit the receive 817 * window. Check the sequence number versus IRS since we know 818 * the sequence numbers haven't wrapped. This is a partial fix 819 * for the "LAND" DoS attack. 820 */ 821 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 822 rstreason = BANDLIM_RST_OPENPORT; 823 goto dropwithreset; 824 } 825 826 todrop = tp->rcv_nxt - th->th_seq; 827 if (todrop > 0) { 828 if (thflags & TH_SYN) { 829 thflags &= ~TH_SYN; 830 th->th_seq++; 831 if (th->th_urp > 1) 832 th->th_urp--; 833 else 834 thflags &= ~TH_URG; 835 todrop--; 836 } 837 /* 838 * Following if statement from Stevens, vol. 2, p. 960. 839 */ 840 if (todrop > tlen 841 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 842 /* 843 * Any valid FIN must be to the left of the window. 844 * At this point the FIN must be a duplicate or out 845 * of sequence; drop it. 846 */ 847 thflags &= ~TH_FIN; 848 849 /* 850 * Send an ACK to resynchronize and drop any data. 851 * But keep on processing for RST or ACK. 852 */ 853 tp->t_flags |= TF_ACKNOW; 854 todrop = tlen; 855 TCPSTAT_INC(tcps_rcvduppack); 856 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 857 } else { 858 TCPSTAT_INC(tcps_rcvpartduppack); 859 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 860 } 861 drop_hdrlen += todrop; /* drop from the top afterwards */ 862 th->th_seq += todrop; 863 tlen -= todrop; 864 if (th->th_urp > todrop) 865 th->th_urp -= todrop; 866 else { 867 thflags &= ~TH_URG; 868 th->th_urp = 0; 869 } 870 } 871 872 /* 873 * If new data are received on a connection after the 874 * user processes are gone, then RST the other end. 875 */ 876 if ((so->so_state & SS_NOFDREF) && 877 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 878 KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " 879 "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); 880 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 881 882 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 883 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " 884 "after socket was closed, " 885 "sending RST and removing tcpcb\n", 886 s, __func__, tcpstates[tp->t_state], tlen); 887 free(s, M_TCPLOG); 888 } 889 tp = tcp_close(tp); 890 TCPSTAT_INC(tcps_rcvafterclose); 891 rstreason = BANDLIM_UNLIMITED; 892 goto dropwithreset; 893 } 894 895 /* 896 * If segment ends after window, drop trailing data 897 * (and PUSH and FIN); if nothing left, just ACK. 898 */ 899 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 900 if (todrop > 0) { 901 TCPSTAT_INC(tcps_rcvpackafterwin); 902 if (todrop >= tlen) { 903 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 904 /* 905 * If window is closed can only take segments at 906 * window edge, and have to drop data and PUSH from 907 * incoming segments. Continue processing, but 908 * remember to ack. Otherwise, drop segment 909 * and ack. 910 */ 911 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 912 tp->t_flags |= TF_ACKNOW; 913 TCPSTAT_INC(tcps_rcvwinprobe); 914 } else 915 goto dropafterack; 916 } else 917 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 918 m_adj(m, -todrop); 919 tlen -= todrop; 920 thflags &= ~(TH_PUSH|TH_FIN); 921 } 922 923 /* 924 * If last ACK falls within this segment's sequence numbers, 925 * record its timestamp. 926 * NOTE: 927 * 1) That the test incorporates suggestions from the latest 928 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 929 * 2) That updating only on newer timestamps interferes with 930 * our earlier PAWS tests, so this check should be solely 931 * predicated on the sequence space of this segment. 932 * 3) That we modify the segment boundary check to be 933 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 934 * instead of RFC1323's 935 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 936 * This modified check allows us to overcome RFC1323's 937 * limitations as described in Stevens TCP/IP Illustrated 938 * Vol. 2 p.869. In such cases, we can still calculate the 939 * RTT correctly when RCV.NXT == Last.ACK.Sent. 940 */ 941 if ((to->to_flags & TOF_TS) != 0 && 942 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 943 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 944 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 945 tp->ts_recent_age = tcp_ts_getticks(); 946 tp->ts_recent = to->to_tsval; 947 } 948 949 /* 950 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 951 * flag is on (half-synchronized state), then queue data for 952 * later processing; else drop segment and return. 953 */ 954 if ((thflags & TH_ACK) == 0) { 955 if (tp->t_state == TCPS_SYN_RECEIVED || 956 (tp->t_flags & TF_NEEDSYN)) 957 goto step6; 958 else if (tp->t_flags & TF_ACKNOW) 959 goto dropafterack; 960 else 961 goto drop; 962 } 963 964 /* 965 * Ack processing. 966 */ 967 switch (tp->t_state) { 968 969 /* 970 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 971 * ESTABLISHED state and continue processing. 972 * The ACK was checked above. 973 */ 974 case TCPS_SYN_RECEIVED: 975 976 TCPSTAT_INC(tcps_connects); 977 soisconnected(so); 978 /* Do window scaling? */ 979 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 980 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 981 tp->rcv_scale = tp->request_r_scale; 982 tp->snd_wnd = tiwin; 983 } 984 /* 985 * Make transitions: 986 * SYN-RECEIVED -> ESTABLISHED 987 * SYN-RECEIVED* -> FIN-WAIT-1 988 */ 989 tp->t_starttime = ticks; 990 if (tp->t_flags & TF_NEEDFIN) { 991 tcp_state_change(tp, TCPS_FIN_WAIT_1); 992 tp->t_flags &= ~TF_NEEDFIN; 993 } else { 994 tcp_state_change(tp, TCPS_ESTABLISHED); 995 TCP_PROBE5(accept__established, NULL, tp, 996 mtod(m, const char *), tp, th); 997 cc_conn_init(tp); 998 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 999 } 1000 /* 1001 * If segment contains data or ACK, will call tcp_reass() 1002 * later; if not, do so now to pass queued data to user. 1003 */ 1004 if (tlen == 0 && (thflags & TH_FIN) == 0) 1005 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 1006 (struct mbuf *)0); 1007 tp->snd_wl1 = th->th_seq - 1; 1008 /* FALLTHROUGH */ 1009 1010 /* 1011 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1012 * ACKs. If the ack is in the range 1013 * tp->snd_una < th->th_ack <= tp->snd_max 1014 * then advance tp->snd_una to th->th_ack and drop 1015 * data from the retransmission queue. If this ACK reflects 1016 * more up to date window information we update our window information. 1017 */ 1018 case TCPS_ESTABLISHED: 1019 case TCPS_FIN_WAIT_1: 1020 case TCPS_FIN_WAIT_2: 1021 case TCPS_CLOSE_WAIT: 1022 case TCPS_CLOSING: 1023 case TCPS_LAST_ACK: 1024 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1025 TCPSTAT_INC(tcps_rcvacktoomuch); 1026 goto dropafterack; 1027 } 1028 if ((tp->t_flags & TF_SACK_PERMIT) && 1029 ((to->to_flags & TOF_SACK) || 1030 !TAILQ_EMPTY(&tp->snd_holes))) 1031 tcp_sack_doack(tp, to, th->th_ack); 1032 else 1033 /* 1034 * Reset the value so that previous (valid) value 1035 * from the last ack with SACK doesn't get used. 1036 */ 1037 tp->sackhint.sacked_bytes = 0; 1038 1039 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 1040 hhook_run_tcp_est_in(tp, th, to); 1041 1042 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1043 if (tlen == 0 && tiwin == tp->snd_wnd) { 1044 /* 1045 * If this is the first time we've seen a 1046 * FIN from the remote, this is not a 1047 * duplicate and it needs to be processed 1048 * normally. This happens during a 1049 * simultaneous close. 1050 */ 1051 if ((thflags & TH_FIN) && 1052 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { 1053 tp->t_dupacks = 0; 1054 break; 1055 } 1056 TCPSTAT_INC(tcps_rcvdupack); 1057 /* 1058 * If we have outstanding data (other than 1059 * a window probe), this is a completely 1060 * duplicate ack (ie, window info didn't 1061 * change and FIN isn't set), 1062 * the ack is the biggest we've 1063 * seen and we've seen exactly our rexmt 1064 * threshhold of them, assume a packet 1065 * has been dropped and retransmit it. 1066 * Kludge snd_nxt & the congestion 1067 * window so we send only this one 1068 * packet. 1069 * 1070 * We know we're losing at the current 1071 * window size so do congestion avoidance 1072 * (set ssthresh to half the current window 1073 * and pull our congestion window back to 1074 * the new ssthresh). 1075 * 1076 * Dup acks mean that packets have left the 1077 * network (they're now cached at the receiver) 1078 * so bump cwnd by the amount in the receiver 1079 * to keep a constant cwnd packets in the 1080 * network. 1081 * 1082 * When using TCP ECN, notify the peer that 1083 * we reduced the cwnd. 1084 */ 1085 if (!tcp_timer_active(tp, TT_REXMT) || 1086 th->th_ack != tp->snd_una) 1087 tp->t_dupacks = 0; 1088 else if (++tp->t_dupacks > tcprexmtthresh || 1089 IN_FASTRECOVERY(tp->t_flags)) { 1090 cc_ack_received(tp, th, CC_DUPACK); 1091 if ((tp->t_flags & TF_SACK_PERMIT) && 1092 IN_FASTRECOVERY(tp->t_flags)) { 1093 int awnd; 1094 1095 /* 1096 * Compute the amount of data in flight first. 1097 * We can inject new data into the pipe iff 1098 * we have less than 1/2 the original window's 1099 * worth of data in flight. 1100 */ 1101 if (V_tcp_do_rfc6675_pipe) 1102 awnd = tcp_compute_pipe(tp); 1103 else 1104 awnd = (tp->snd_nxt - tp->snd_fack) + 1105 tp->sackhint.sack_bytes_rexmit; 1106 1107 if (awnd < tp->snd_ssthresh) { 1108 tp->snd_cwnd += tp->t_maxseg; 1109 if (tp->snd_cwnd > tp->snd_ssthresh) 1110 tp->snd_cwnd = tp->snd_ssthresh; 1111 } 1112 } else 1113 tp->snd_cwnd += tp->t_maxseg; 1114 (void) tp->t_fb->tfb_tcp_output(tp); 1115 goto drop; 1116 } else if (tp->t_dupacks == tcprexmtthresh) { 1117 tcp_seq onxt = tp->snd_nxt; 1118 1119 /* 1120 * If we're doing sack, check to 1121 * see if we're already in sack 1122 * recovery. If we're not doing sack, 1123 * check to see if we're in newreno 1124 * recovery. 1125 */ 1126 if (tp->t_flags & TF_SACK_PERMIT) { 1127 if (IN_FASTRECOVERY(tp->t_flags)) { 1128 tp->t_dupacks = 0; 1129 break; 1130 } 1131 } else { 1132 if (SEQ_LEQ(th->th_ack, 1133 tp->snd_recover)) { 1134 tp->t_dupacks = 0; 1135 break; 1136 } 1137 } 1138 /* Congestion signal before ack. */ 1139 cc_cong_signal(tp, th, CC_NDUPACK); 1140 cc_ack_received(tp, th, CC_DUPACK); 1141 tcp_timer_activate(tp, TT_REXMT, 0); 1142 tp->t_rtttime = 0; 1143 if (tp->t_flags & TF_SACK_PERMIT) { 1144 TCPSTAT_INC( 1145 tcps_sack_recovery_episode); 1146 tp->sack_newdata = tp->snd_nxt; 1147 tp->snd_cwnd = tp->t_maxseg; 1148 (void) tp->t_fb->tfb_tcp_output(tp); 1149 goto drop; 1150 } 1151 tp->snd_nxt = th->th_ack; 1152 tp->snd_cwnd = tp->t_maxseg; 1153 (void) tp->t_fb->tfb_tcp_output(tp); 1154 KASSERT(tp->snd_limited <= 2, 1155 ("%s: tp->snd_limited too big", 1156 __func__)); 1157 tp->snd_cwnd = tp->snd_ssthresh + 1158 tp->t_maxseg * 1159 (tp->t_dupacks - tp->snd_limited); 1160 if (SEQ_GT(onxt, tp->snd_nxt)) 1161 tp->snd_nxt = onxt; 1162 goto drop; 1163 } else if (V_tcp_do_rfc3042) { 1164 /* 1165 * Process first and second duplicate 1166 * ACKs. Each indicates a segment 1167 * leaving the network, creating room 1168 * for more. Make sure we can send a 1169 * packet on reception of each duplicate 1170 * ACK by increasing snd_cwnd by one 1171 * segment. Restore the original 1172 * snd_cwnd after packet transmission. 1173 */ 1174 cc_ack_received(tp, th, CC_DUPACK); 1175 u_long oldcwnd = tp->snd_cwnd; 1176 tcp_seq oldsndmax = tp->snd_max; 1177 u_int sent; 1178 int avail; 1179 1180 KASSERT(tp->t_dupacks == 1 || 1181 tp->t_dupacks == 2, 1182 ("%s: dupacks not 1 or 2", 1183 __func__)); 1184 if (tp->t_dupacks == 1) 1185 tp->snd_limited = 0; 1186 tp->snd_cwnd = 1187 (tp->snd_nxt - tp->snd_una) + 1188 (tp->t_dupacks - tp->snd_limited) * 1189 tp->t_maxseg; 1190 /* 1191 * Only call tcp_output when there 1192 * is new data available to be sent. 1193 * Otherwise we would send pure ACKs. 1194 */ 1195 SOCKBUF_LOCK(&so->so_snd); 1196 avail = sbavail(&so->so_snd) - 1197 (tp->snd_nxt - tp->snd_una); 1198 SOCKBUF_UNLOCK(&so->so_snd); 1199 if (avail > 0) 1200 (void) tp->t_fb->tfb_tcp_output(tp); 1201 sent = tp->snd_max - oldsndmax; 1202 if (sent > tp->t_maxseg) { 1203 KASSERT((tp->t_dupacks == 2 && 1204 tp->snd_limited == 0) || 1205 (sent == tp->t_maxseg + 1 && 1206 tp->t_flags & TF_SENTFIN), 1207 ("%s: sent too much", 1208 __func__)); 1209 tp->snd_limited = 2; 1210 } else if (sent > 0) 1211 ++tp->snd_limited; 1212 tp->snd_cwnd = oldcwnd; 1213 goto drop; 1214 } 1215 } else 1216 tp->t_dupacks = 0; 1217 break; 1218 } 1219 1220 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 1221 ("%s: th_ack <= snd_una", __func__)); 1222 1223 /* 1224 * If the congestion window was inflated to account 1225 * for the other side's cached packets, retract it. 1226 */ 1227 if (IN_FASTRECOVERY(tp->t_flags)) { 1228 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 1229 if (tp->t_flags & TF_SACK_PERMIT) 1230 tcp_sack_partialack(tp, th); 1231 else 1232 tcp_newreno_partial_ack(tp, th); 1233 } else 1234 cc_post_recovery(tp, th); 1235 } 1236 tp->t_dupacks = 0; 1237 /* 1238 * If we reach this point, ACK is not a duplicate, 1239 * i.e., it ACKs something we sent. 1240 */ 1241 if (tp->t_flags & TF_NEEDSYN) { 1242 /* 1243 * T/TCP: Connection was half-synchronized, and our 1244 * SYN has been ACK'd (so connection is now fully 1245 * synchronized). Go to non-starred state, 1246 * increment snd_una for ACK of SYN, and check if 1247 * we can do window scaling. 1248 */ 1249 tp->t_flags &= ~TF_NEEDSYN; 1250 tp->snd_una++; 1251 /* Do window scaling? */ 1252 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1253 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1254 tp->rcv_scale = tp->request_r_scale; 1255 /* Send window already scaled. */ 1256 } 1257 } 1258 1259process_ACK: 1260 INP_WLOCK_ASSERT(tp->t_inpcb); 1261 1262 acked = BYTES_THIS_ACK(tp, th); 1263 TCPSTAT_INC(tcps_rcvackpack); 1264 TCPSTAT_ADD(tcps_rcvackbyte, acked); 1265 1266 /* 1267 * If we just performed our first retransmit, and the ACK 1268 * arrives within our recovery window, then it was a mistake 1269 * to do the retransmit in the first place. Recover our 1270 * original cwnd and ssthresh, and proceed to transmit where 1271 * we left off. 1272 */ 1273 if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && 1274 (int)(ticks - tp->t_badrxtwin) < 0) 1275 cc_cong_signal(tp, th, CC_RTO_ERR); 1276 1277 /* 1278 * If we have a timestamp reply, update smoothed 1279 * round trip time. If no timestamp is present but 1280 * transmit timer is running and timed sequence 1281 * number was acked, update smoothed round trip time. 1282 * Since we now have an rtt measurement, cancel the 1283 * timer backoff (cf., Phil Karn's retransmit alg.). 1284 * Recompute the initial retransmit timer. 1285 * 1286 * Some boxes send broken timestamp replies 1287 * during the SYN+ACK phase, ignore 1288 * timestamps of 0 or we could calculate a 1289 * huge RTT and blow up the retransmit timer. 1290 */ 1291 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 1292 u_int t; 1293 1294 t = tcp_ts_getticks() - to->to_tsecr; 1295 if (!tp->t_rttlow || tp->t_rttlow > t) 1296 tp->t_rttlow = t; 1297 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); 1298 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 1299 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 1300 tp->t_rttlow = ticks - tp->t_rtttime; 1301 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 1302 } 1303 1304 /* 1305 * If all outstanding data is acked, stop retransmit 1306 * timer and remember to restart (more output or persist). 1307 * If there is more data to be acked, restart retransmit 1308 * timer, using current (possibly backed-off) value. 1309 */ 1310 if (th->th_ack == tp->snd_max) { 1311 tcp_timer_activate(tp, TT_REXMT, 0); 1312 needoutput = 1; 1313 } else if (!tcp_timer_active(tp, TT_PERSIST)) 1314 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 1315 1316 /* 1317 * If no data (only SYN) was ACK'd, 1318 * skip rest of ACK processing. 1319 */ 1320 if (acked == 0) 1321 goto step6; 1322 1323 /* 1324 * Let the congestion control algorithm update congestion 1325 * control related information. This typically means increasing 1326 * the congestion window. 1327 */ 1328 cc_ack_received(tp, th, CC_ACK); 1329 1330 SOCKBUF_LOCK(&so->so_snd); 1331 if (acked > sbavail(&so->so_snd)) { 1332 tp->snd_wnd -= sbavail(&so->so_snd); 1333 mfree = sbcut_locked(&so->so_snd, 1334 (int)sbavail(&so->so_snd)); 1335 ourfinisacked = 1; 1336 } else { 1337 mfree = sbcut_locked(&so->so_snd, acked); 1338 tp->snd_wnd -= acked; 1339 ourfinisacked = 0; 1340 } 1341 /* NB: sowwakeup_locked() does an implicit unlock. */ 1342 sowwakeup_locked(so); 1343 m_freem(mfree); 1344 /* Detect una wraparound. */ 1345 if (!IN_RECOVERY(tp->t_flags) && 1346 SEQ_GT(tp->snd_una, tp->snd_recover) && 1347 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1348 tp->snd_recover = th->th_ack - 1; 1349 /* XXXLAS: Can this be moved up into cc_post_recovery? */ 1350 if (IN_RECOVERY(tp->t_flags) && 1351 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 1352 EXIT_RECOVERY(tp->t_flags); 1353 } 1354 tp->snd_una = th->th_ack; 1355 if (tp->t_flags & TF_SACK_PERMIT) { 1356 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 1357 tp->snd_recover = tp->snd_una; 1358 } 1359 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1360 tp->snd_nxt = tp->snd_una; 1361 1362 switch (tp->t_state) { 1363 1364 /* 1365 * In FIN_WAIT_1 STATE in addition to the processing 1366 * for the ESTABLISHED state if our FIN is now acknowledged 1367 * then enter FIN_WAIT_2. 1368 */ 1369 case TCPS_FIN_WAIT_1: 1370 if (ourfinisacked) { 1371 /* 1372 * If we can't receive any more 1373 * data, then closing user can proceed. 1374 * Starting the timer is contrary to the 1375 * specification, but if we don't get a FIN 1376 * we'll hang forever. 1377 * 1378 * XXXjl: 1379 * we should release the tp also, and use a 1380 * compressed state. 1381 */ 1382 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1383 soisdisconnected(so); 1384 tcp_timer_activate(tp, TT_2MSL, 1385 (tcp_fast_finwait2_recycle ? 1386 tcp_finwait2_timeout : 1387 TP_MAXIDLE(tp))); 1388 } 1389 tcp_state_change(tp, TCPS_FIN_WAIT_2); 1390 } 1391 break; 1392 1393 /* 1394 * In CLOSING STATE in addition to the processing for 1395 * the ESTABLISHED state if the ACK acknowledges our FIN 1396 * then enter the TIME-WAIT state, otherwise ignore 1397 * the segment. 1398 */ 1399 case TCPS_CLOSING: 1400 if (ourfinisacked) { 1401 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1402 tcp_twstart(tp); 1403 INP_INFO_RUNLOCK(&V_tcbinfo); 1404 m_freem(m); 1405 return; 1406 } 1407 break; 1408 1409 /* 1410 * In LAST_ACK, we may still be waiting for data to drain 1411 * and/or to be acked, as well as for the ack of our FIN. 1412 * If our FIN is now acknowledged, delete the TCB, 1413 * enter the closed state and return. 1414 */ 1415 case TCPS_LAST_ACK: 1416 if (ourfinisacked) { 1417 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1418 tp = tcp_close(tp); 1419 goto drop; 1420 } 1421 break; 1422 } 1423 } 1424 1425step6: 1426 INP_WLOCK_ASSERT(tp->t_inpcb); 1427 1428 /* 1429 * Update window information. 1430 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1431 */ 1432 if ((thflags & TH_ACK) && 1433 (SEQ_LT(tp->snd_wl1, th->th_seq) || 1434 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 1435 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1436 /* keep track of pure window updates */ 1437 if (tlen == 0 && 1438 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1439 TCPSTAT_INC(tcps_rcvwinupd); 1440 tp->snd_wnd = tiwin; 1441 tp->snd_wl1 = th->th_seq; 1442 tp->snd_wl2 = th->th_ack; 1443 if (tp->snd_wnd > tp->max_sndwnd) 1444 tp->max_sndwnd = tp->snd_wnd; 1445 needoutput = 1; 1446 } 1447 1448 /* 1449 * Process segments with URG. 1450 */ 1451 if ((thflags & TH_URG) && th->th_urp && 1452 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1453 /* 1454 * This is a kludge, but if we receive and accept 1455 * random urgent pointers, we'll crash in 1456 * soreceive. It's hard to imagine someone 1457 * actually wanting to send this much urgent data. 1458 */ 1459 SOCKBUF_LOCK(&so->so_rcv); 1460 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 1461 th->th_urp = 0; /* XXX */ 1462 thflags &= ~TH_URG; /* XXX */ 1463 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 1464 goto dodata; /* XXX */ 1465 } 1466 /* 1467 * If this segment advances the known urgent pointer, 1468 * then mark the data stream. This should not happen 1469 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1470 * a FIN has been received from the remote side. 1471 * In these states we ignore the URG. 1472 * 1473 * According to RFC961 (Assigned Protocols), 1474 * the urgent pointer points to the last octet 1475 * of urgent data. We continue, however, 1476 * to consider it to indicate the first octet 1477 * of data past the urgent section as the original 1478 * spec states (in one of two places). 1479 */ 1480 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1481 tp->rcv_up = th->th_seq + th->th_urp; 1482 so->so_oobmark = sbavail(&so->so_rcv) + 1483 (tp->rcv_up - tp->rcv_nxt) - 1; 1484 if (so->so_oobmark == 0) 1485 so->so_rcv.sb_state |= SBS_RCVATMARK; 1486 sohasoutofband(so); 1487 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1488 } 1489 SOCKBUF_UNLOCK(&so->so_rcv); 1490 /* 1491 * Remove out of band data so doesn't get presented to user. 1492 * This can happen independent of advancing the URG pointer, 1493 * but if two URG's are pending at once, some out-of-band 1494 * data may creep in... ick. 1495 */ 1496 if (th->th_urp <= (u_long)tlen && 1497 !(so->so_options & SO_OOBINLINE)) { 1498 /* hdr drop is delayed */ 1499 tcp_pulloutofband(so, th, m, drop_hdrlen); 1500 } 1501 } else { 1502 /* 1503 * If no out of band data is expected, 1504 * pull receive urgent pointer along 1505 * with the receive window. 1506 */ 1507 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1508 tp->rcv_up = tp->rcv_nxt; 1509 } 1510dodata: /* XXX */ 1511 INP_WLOCK_ASSERT(tp->t_inpcb); 1512 1513 /* 1514 * Process the segment text, merging it into the TCP sequencing queue, 1515 * and arranging for acknowledgment of receipt if necessary. 1516 * This process logically involves adjusting tp->rcv_wnd as data 1517 * is presented to the user (this happens in tcp_usrreq.c, 1518 * case PRU_RCVD). If a FIN has already been received on this 1519 * connection then we just ignore the text. 1520 */ 1521 if ((tlen || (thflags & TH_FIN)) && 1522 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1523 tcp_seq save_start = th->th_seq; 1524 m_adj(m, drop_hdrlen); /* delayed header drop */ 1525 /* 1526 * Insert segment which includes th into TCP reassembly queue 1527 * with control block tp. Set thflags to whether reassembly now 1528 * includes a segment with FIN. This handles the common case 1529 * inline (segment is the next to be received on an established 1530 * connection, and the queue is empty), avoiding linkage into 1531 * and removal from the queue and repetition of various 1532 * conversions. 1533 * Set DELACK for segments received in order, but ack 1534 * immediately when segments are out of order (so 1535 * fast retransmit can work). 1536 */ 1537 if (th->th_seq == tp->rcv_nxt && 1538 LIST_EMPTY(&tp->t_segq) && 1539 TCPS_HAVEESTABLISHED(tp->t_state)) { 1540 if (DELAY_ACK(tp, tlen)) 1541 tp->t_flags |= TF_DELACK; 1542 else 1543 tp->t_flags |= TF_ACKNOW; 1544 tp->rcv_nxt += tlen; 1545 thflags = th->th_flags & TH_FIN; 1546 TCPSTAT_INC(tcps_rcvpack); 1547 TCPSTAT_ADD(tcps_rcvbyte, tlen); 1548 SOCKBUF_LOCK(&so->so_rcv); 1549 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 1550 m_freem(m); 1551 else 1552 sbappendstream_locked(&so->so_rcv, m, 0); 1553 /* NB: sorwakeup_locked() does an implicit unlock. */ 1554 sorwakeup_locked(so); 1555 } else { 1556 /* 1557 * XXX: Due to the header drop above "th" is 1558 * theoretically invalid by now. Fortunately 1559 * m_adj() doesn't actually frees any mbufs 1560 * when trimming from the head. 1561 */ 1562 thflags = tcp_reass(tp, th, &tlen, m); 1563 tp->t_flags |= TF_ACKNOW; 1564 } 1565 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 1566 tcp_update_sack_list(tp, save_start, save_start + tlen); 1567#if 0 1568 /* 1569 * Note the amount of data that peer has sent into 1570 * our window, in order to estimate the sender's 1571 * buffer size. 1572 * XXX: Unused. 1573 */ 1574 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) 1575 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1576 else 1577 len = so->so_rcv.sb_hiwat; 1578#endif 1579 } else { 1580 m_freem(m); 1581 thflags &= ~TH_FIN; 1582 } 1583 1584 /* 1585 * If FIN is received ACK the FIN and let the user know 1586 * that the connection is closing. 1587 */ 1588 if (thflags & TH_FIN) { 1589 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1590 socantrcvmore(so); 1591 /* 1592 * If connection is half-synchronized 1593 * (ie NEEDSYN flag on) then delay ACK, 1594 * so it may be piggybacked when SYN is sent. 1595 * Otherwise, since we received a FIN then no 1596 * more input can be expected, send ACK now. 1597 */ 1598 if (tp->t_flags & TF_NEEDSYN) 1599 tp->t_flags |= TF_DELACK; 1600 else 1601 tp->t_flags |= TF_ACKNOW; 1602 tp->rcv_nxt++; 1603 } 1604 switch (tp->t_state) { 1605 1606 /* 1607 * In SYN_RECEIVED and ESTABLISHED STATES 1608 * enter the CLOSE_WAIT state. 1609 */ 1610 case TCPS_SYN_RECEIVED: 1611 tp->t_starttime = ticks; 1612 /* FALLTHROUGH */ 1613 case TCPS_ESTABLISHED: 1614 tcp_state_change(tp, TCPS_CLOSE_WAIT); 1615 break; 1616 1617 /* 1618 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1619 * enter the CLOSING state. 1620 */ 1621 case TCPS_FIN_WAIT_1: 1622 tcp_state_change(tp, TCPS_CLOSING); 1623 break; 1624 1625 /* 1626 * In FIN_WAIT_2 state enter the TIME_WAIT state, 1627 * starting the time-wait timer, turning off the other 1628 * standard timers. 1629 */ 1630 case TCPS_FIN_WAIT_2: 1631 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1632 KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " 1633 "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 1634 ti_locked)); 1635 1636 tcp_twstart(tp); 1637 INP_INFO_RUNLOCK(&V_tcbinfo); 1638 return; 1639 } 1640 } 1641 if (ti_locked == TI_RLOCKED) { 1642 INP_INFO_RUNLOCK(&V_tcbinfo); 1643 } 1644 ti_locked = TI_UNLOCKED; 1645 1646#ifdef TCPDEBUG 1647 if (so->so_options & SO_DEBUG) 1648 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 1649 &tcp_savetcp, 0); 1650#endif 1651 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); 1652 1653 /* 1654 * Return any desired output. 1655 */ 1656 if (needoutput || (tp->t_flags & TF_ACKNOW)) 1657 (void) tp->t_fb->tfb_tcp_output(tp); 1658 1659 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 1660 __func__, ti_locked)); 1661 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1662 INP_WLOCK_ASSERT(tp->t_inpcb); 1663 1664 if (tp->t_flags & TF_DELACK) { 1665 tp->t_flags &= ~TF_DELACK; 1666 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 1667 } 1668 INP_WUNLOCK(tp->t_inpcb); 1669 return; 1670 1671dropafterack: 1672 /* 1673 * Generate an ACK dropping incoming segment if it occupies 1674 * sequence space, where the ACK reflects our state. 1675 * 1676 * We can now skip the test for the RST flag since all 1677 * paths to this code happen after packets containing 1678 * RST have been dropped. 1679 * 1680 * In the SYN-RECEIVED state, don't send an ACK unless the 1681 * segment we received passes the SYN-RECEIVED ACK test. 1682 * If it fails send a RST. This breaks the loop in the 1683 * "LAND" DoS attack, and also prevents an ACK storm 1684 * between two listening ports that have been sent forged 1685 * SYN segments, each with the source address of the other. 1686 */ 1687 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 1688 (SEQ_GT(tp->snd_una, th->th_ack) || 1689 SEQ_GT(th->th_ack, tp->snd_max)) ) { 1690 rstreason = BANDLIM_RST_OPENPORT; 1691 goto dropwithreset; 1692 } 1693#ifdef TCPDEBUG 1694 if (so->so_options & SO_DEBUG) 1695 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 1696 &tcp_savetcp, 0); 1697#endif 1698 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); 1699 if (ti_locked == TI_RLOCKED) { 1700 INP_INFO_RUNLOCK(&V_tcbinfo); 1701 } 1702 ti_locked = TI_UNLOCKED; 1703 1704 tp->t_flags |= TF_ACKNOW; 1705 (void) tp->t_fb->tfb_tcp_output(tp); 1706 INP_WUNLOCK(tp->t_inpcb); 1707 m_freem(m); 1708 return; 1709 1710dropwithreset: 1711 if (ti_locked == TI_RLOCKED) { 1712 INP_INFO_RUNLOCK(&V_tcbinfo); 1713 } 1714 ti_locked = TI_UNLOCKED; 1715 1716 if (tp != NULL) { 1717 tcp_dropwithreset(m, th, tp, tlen, rstreason); 1718 INP_WUNLOCK(tp->t_inpcb); 1719 } else 1720 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1721 return; 1722 1723drop: 1724 if (ti_locked == TI_RLOCKED) { 1725 INP_INFO_RUNLOCK(&V_tcbinfo); 1726 ti_locked = TI_UNLOCKED; 1727 } 1728#ifdef INVARIANTS 1729 else 1730 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1731#endif 1732 1733 /* 1734 * Drop space held by incoming segment and return. 1735 */ 1736#ifdef TCPDEBUG 1737 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 1738 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 1739 &tcp_savetcp, 0); 1740#endif 1741 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); 1742 if (tp != NULL) 1743 INP_WUNLOCK(tp->t_inpcb); 1744 m_freem(m); 1745} 1746 1747 1748/* 1749 * Do fast slow is a combination of the original 1750 * tcp_dosegment and a split fastpath, one function 1751 * for the fast-ack which also includes allowing fastpath 1752 * for window advanced in sequence acks. And also a 1753 * sub-function that handles the insequence data. 1754 */ 1755void 1756tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so, 1757 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, 1758 int ti_locked) 1759{ 1760 int thflags; 1761 u_long tiwin; 1762 char *s; 1763 int can_enter; 1764 struct in_conninfo *inc; 1765 struct tcpopt to; 1766 1767 thflags = th->th_flags; 1768 tp->sackhint.last_sack_ack = 0; 1769 inc = &tp->t_inpcb->inp_inc; 1770 /* 1771 * If this is either a state-changing packet or current state isn't 1772 * established, we require a write lock on tcbinfo. Otherwise, we 1773 * allow the tcbinfo to be in either alocked or unlocked, as the 1774 * caller may have unnecessarily acquired a write lock due to a race. 1775 */ 1776 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 1777 tp->t_state != TCPS_ESTABLISHED) { 1778 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 1779 "SYN/FIN/RST/!EST", __func__, ti_locked)); 1780 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1781 } else { 1782#ifdef INVARIANTS 1783 if (ti_locked == TI_RLOCKED) { 1784 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1785 } else { 1786 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 1787 "ti_locked: %d", __func__, ti_locked)); 1788 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1789 } 1790#endif 1791 } 1792 INP_WLOCK_ASSERT(tp->t_inpcb); 1793 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 1794 __func__)); 1795 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 1796 __func__)); 1797 1798 /* 1799 * Segment received on connection. 1800 * Reset idle time and keep-alive timer. 1801 * XXX: This should be done after segment 1802 * validation to ignore broken/spoofed segs. 1803 */ 1804 tp->t_rcvtime = ticks; 1805 if (TCPS_HAVEESTABLISHED(tp->t_state)) 1806 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 1807 1808 /* 1809 * Unscale the window into a 32-bit value. 1810 * For the SYN_SENT state the scale is zero. 1811 */ 1812 tiwin = th->th_win << tp->snd_scale; 1813 1814 /* 1815 * TCP ECN processing. 1816 */ 1817 if (tp->t_flags & TF_ECN_PERMIT) { 1818 if (thflags & TH_CWR) 1819 tp->t_flags &= ~TF_ECN_SND_ECE; 1820 switch (iptos & IPTOS_ECN_MASK) { 1821 case IPTOS_ECN_CE: 1822 tp->t_flags |= TF_ECN_SND_ECE; 1823 TCPSTAT_INC(tcps_ecn_ce); 1824 break; 1825 case IPTOS_ECN_ECT0: 1826 TCPSTAT_INC(tcps_ecn_ect0); 1827 break; 1828 case IPTOS_ECN_ECT1: 1829 TCPSTAT_INC(tcps_ecn_ect1); 1830 break; 1831 } 1832 /* Congestion experienced. */ 1833 if (thflags & TH_ECE) { 1834 cc_cong_signal(tp, th, CC_ECN); 1835 } 1836 } 1837 1838 /* 1839 * Parse options on any incoming segment. 1840 */ 1841 tcp_dooptions(&to, (u_char *)(th + 1), 1842 (th->th_off << 2) - sizeof(struct tcphdr), 1843 (thflags & TH_SYN) ? TO_SYN : 0); 1844 1845 /* 1846 * If echoed timestamp is later than the current time, 1847 * fall back to non RFC1323 RTT calculation. Normalize 1848 * timestamp if syncookies were used when this connection 1849 * was established. 1850 */ 1851 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 1852 to.to_tsecr -= tp->ts_offset; 1853 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) 1854 to.to_tsecr = 0; 1855 } 1856 /* 1857 * If timestamps were negotiated during SYN/ACK they should 1858 * appear on every segment during this session and vice versa. 1859 */ 1860 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { 1861 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1862 log(LOG_DEBUG, "%s; %s: Timestamp missing, " 1863 "no action\n", s, __func__); 1864 free(s, M_TCPLOG); 1865 } 1866 } 1867 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { 1868 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1869 log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 1870 "no action\n", s, __func__); 1871 free(s, M_TCPLOG); 1872 } 1873 } 1874 1875 /* 1876 * Process options only when we get SYN/ACK back. The SYN case 1877 * for incoming connections is handled in tcp_syncache. 1878 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 1879 * or <SYN,ACK>) segment itself is never scaled. 1880 * XXX this is traditional behavior, may need to be cleaned up. 1881 */ 1882 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 1883 if ((to.to_flags & TOF_SCALE) && 1884 (tp->t_flags & TF_REQ_SCALE)) { 1885 tp->t_flags |= TF_RCVD_SCALE; 1886 tp->snd_scale = to.to_wscale; 1887 } 1888 /* 1889 * Initial send window. It will be updated with 1890 * the next incoming segment to the scaled value. 1891 */ 1892 tp->snd_wnd = th->th_win; 1893 if (to.to_flags & TOF_TS) { 1894 tp->t_flags |= TF_RCVD_TSTMP; 1895 tp->ts_recent = to.to_tsval; 1896 tp->ts_recent_age = tcp_ts_getticks(); 1897 } 1898 if (to.to_flags & TOF_MSS) 1899 tcp_mss(tp, to.to_mss); 1900 if ((tp->t_flags & TF_SACK_PERMIT) && 1901 (to.to_flags & TOF_SACKPERM) == 0) 1902 tp->t_flags &= ~TF_SACK_PERMIT; 1903 } 1904 can_enter = 0; 1905 if (__predict_true((tlen == 0))) { 1906 /* 1907 * The ack moved forward and we have a window (non-zero) 1908 * <or> 1909 * The ack did not move forward, but the window increased. 1910 */ 1911 if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) || 1912 ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) { 1913 can_enter = 1; 1914 } 1915 } else { 1916 /* 1917 * Data incoming, use the old entry criteria 1918 * for fast-path with data. 1919 */ 1920 if ((tiwin && tiwin == tp->snd_wnd)) { 1921 can_enter = 1; 1922 } 1923 } 1924 /* 1925 * Header prediction: check for the two common cases 1926 * of a uni-directional data xfer. If the packet has 1927 * no control flags, is in-sequence, the window didn't 1928 * change and we're not retransmitting, it's a 1929 * candidate. If the length is zero and the ack moved 1930 * forward, we're the sender side of the xfer. Just 1931 * free the data acked & wake any higher level process 1932 * that was blocked waiting for space. If the length 1933 * is non-zero and the ack didn't move, we're the 1934 * receiver side. If we're getting packets in-order 1935 * (the reassembly queue is empty), add the data to 1936 * the socket buffer and note that we need a delayed ack. 1937 * Make sure that the hidden state-flags are also off. 1938 * Since we check for TCPS_ESTABLISHED first, it can only 1939 * be TH_NEEDSYN. 1940 */ 1941 if (__predict_true(tp->t_state == TCPS_ESTABLISHED && 1942 th->th_seq == tp->rcv_nxt && 1943 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1944 tp->snd_nxt == tp->snd_max && 1945 can_enter && 1946 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 1947 LIST_EMPTY(&tp->t_segq) && 1948 ((to.to_flags & TOF_TS) == 0 || 1949 TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) { 1950 if (__predict_true((tlen == 0) && 1951 (SEQ_LEQ(th->th_ack, tp->snd_max) && 1952 !IN_RECOVERY(tp->t_flags) && 1953 (to.to_flags & TOF_SACK) == 0 && 1954 TAILQ_EMPTY(&tp->snd_holes)))) { 1955 /* We are done */ 1956 tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, 1957 ti_locked, tiwin); 1958 return; 1959 } else if ((tlen) && 1960 (th->th_ack == tp->snd_una && 1961 tlen <= sbspace(&so->so_rcv))) { 1962 tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen, 1963 ti_locked, tiwin); 1964 /* We are done */ 1965 return; 1966 } 1967 } 1968 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, 1969 ti_locked, tiwin, thflags); 1970} 1971 1972 1973/* 1974 * This subfunction is used to try to highly optimize the 1975 * fast path. We again allow window updates that are 1976 * in sequence to remain in the fast-path. We also add 1977 * in the __predict's to attempt to help the compiler. 1978 * Note that if we return a 0, then we can *not* process 1979 * it and the caller should push the packet into the 1980 * slow-path. 1981 */ 1982static int 1983tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 1984 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 1985 int ti_locked, u_long tiwin) 1986{ 1987 int acked; 1988 int winup_only=0; 1989#ifdef TCPDEBUG 1990 /* 1991 * The size of tcp_saveipgen must be the size of the max ip header, 1992 * now IPv6. 1993 */ 1994 u_char tcp_saveipgen[IP6_HDR_LEN]; 1995 struct tcphdr tcp_savetcp; 1996 short ostate = 0; 1997#endif 1998 1999 2000 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 2001 /* Old ack, behind (or duplicate to) the last one rcv'd */ 2002 return (0); 2003 } 2004 if (__predict_false(th->th_ack == tp->snd_una) && 2005 __predict_false(tiwin <= tp->snd_wnd)) { 2006 /* duplicate ack <or> a shrinking dup ack with shrinking window */ 2007 return (0); 2008 } 2009 if (__predict_false(tiwin == 0)) { 2010 /* zero window */ 2011 return (0); 2012 } 2013 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 2014 /* Above what we have sent? */ 2015 return (0); 2016 } 2017 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 2018 /* We are retransmitting */ 2019 return (0); 2020 } 2021 if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) { 2022 /* We need a SYN or a FIN, unlikely.. */ 2023 return (0); 2024 } 2025 if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 2026 /* Timestamp is behind .. old ack with seq wrap? */ 2027 return (0); 2028 } 2029 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 2030 /* Still recovering */ 2031 return (0); 2032 } 2033 if (__predict_false(to->to_flags & TOF_SACK)) { 2034 /* Sack included in the ack.. */ 2035 return (0); 2036 } 2037 if (!TAILQ_EMPTY(&tp->snd_holes)) { 2038 /* We have sack holes on our scoreboard */ 2039 return (0); 2040 } 2041 /* Ok if we reach here, we can process a fast-ack */ 2042 2043 /* Did the window get updated? */ 2044 if (tiwin != tp->snd_wnd) { 2045 /* keep track of pure window updates */ 2046 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { 2047 winup_only = 1; 2048 TCPSTAT_INC(tcps_rcvwinupd); 2049 } 2050 tp->snd_wnd = tiwin; 2051 tp->snd_wl1 = th->th_seq; 2052 if (tp->snd_wnd > tp->max_sndwnd) 2053 tp->max_sndwnd = tp->snd_wnd; 2054 } 2055 /* 2056 * Pull snd_wl2 up to prevent seq wrap relative 2057 * to th_ack. 2058 */ 2059 tp->snd_wl2 = th->th_ack; 2060 /* 2061 * If last ACK falls within this segment's sequence numbers, 2062 * record the timestamp. 2063 * NOTE that the test is modified according to the latest 2064 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2065 */ 2066 if ((to->to_flags & TOF_TS) != 0 && 2067 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 2068 tp->ts_recent_age = tcp_ts_getticks(); 2069 tp->ts_recent = to->to_tsval; 2070 } 2071 /* 2072 * This is a pure ack for outstanding data. 2073 */ 2074 if (ti_locked == TI_RLOCKED) { 2075 INP_INFO_RUNLOCK(&V_tcbinfo); 2076 } 2077 ti_locked = TI_UNLOCKED; 2078 2079 TCPSTAT_INC(tcps_predack); 2080 2081 /* 2082 * "bad retransmit" recovery. 2083 */ 2084 if (tp->t_rxtshift == 1 && 2085 tp->t_flags & TF_PREVVALID && 2086 (int)(ticks - tp->t_badrxtwin) < 0) { 2087 cc_cong_signal(tp, th, CC_RTO_ERR); 2088 } 2089 2090 /* 2091 * Recalculate the transmit timer / rtt. 2092 * 2093 * Some boxes send broken timestamp replies 2094 * during the SYN+ACK phase, ignore 2095 * timestamps of 0 or we could calculate a 2096 * huge RTT and blow up the retransmit timer. 2097 */ 2098 if ((to->to_flags & TOF_TS) != 0 && 2099 to->to_tsecr) { 2100 u_int t; 2101 2102 t = tcp_ts_getticks() - to->to_tsecr; 2103 if (!tp->t_rttlow || tp->t_rttlow > t) 2104 tp->t_rttlow = t; 2105 tcp_xmit_timer(tp, 2106 TCP_TS_TO_TICKS(t) + 1); 2107 } else if (tp->t_rtttime && 2108 SEQ_GT(th->th_ack, tp->t_rtseq)) { 2109 if (!tp->t_rttlow || 2110 tp->t_rttlow > ticks - tp->t_rtttime) 2111 tp->t_rttlow = ticks - tp->t_rtttime; 2112 tcp_xmit_timer(tp, 2113 ticks - tp->t_rtttime); 2114 } 2115 if (winup_only == 0) { 2116 acked = BYTES_THIS_ACK(tp, th); 2117 2118 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 2119 hhook_run_tcp_est_in(tp, th, to); 2120 2121 TCPSTAT_ADD(tcps_rcvackbyte, acked); 2122 sbdrop(&so->so_snd, acked); 2123 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 2124 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2125 tp->snd_recover = th->th_ack - 1; 2126 2127 /* 2128 * Let the congestion control algorithm update 2129 * congestion control related information. This 2130 * typically means increasing the congestion 2131 * window. 2132 */ 2133 cc_ack_received(tp, th, CC_ACK); 2134 2135 tp->snd_una = th->th_ack; 2136 tp->t_dupacks = 0; 2137 m_freem(m); 2138 2139 /* 2140 * If all outstanding data are acked, stop 2141 * retransmit timer, otherwise restart timer 2142 * using current (possibly backed-off) value. 2143 * If process is waiting for space, 2144 * wakeup/selwakeup/signal. If data 2145 * are ready to send, let tcp_output 2146 * decide between more output or persist. 2147 */ 2148#ifdef TCPDEBUG 2149 if (so->so_options & SO_DEBUG) 2150 tcp_trace(TA_INPUT, ostate, tp, 2151 (void *)tcp_saveipgen, 2152 &tcp_savetcp, 0); 2153#endif 2154 if (tp->snd_una == tp->snd_max) 2155 tcp_timer_activate(tp, TT_REXMT, 0); 2156 else if (!tcp_timer_active(tp, TT_PERSIST)) 2157 tcp_timer_activate(tp, TT_REXMT, 2158 tp->t_rxtcur); 2159 /* Wake up the socket if we have room to write more */ 2160 sowwakeup(so); 2161 } else { 2162 /* 2163 * Window update only, just free the mbufs and 2164 * send out whatever we can. 2165 */ 2166 m_freem(m); 2167 } 2168 if (sbavail(&so->so_snd)) 2169 (void) tcp_output(tp); 2170 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 2171 __func__, ti_locked)); 2172 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2173 INP_WLOCK_ASSERT(tp->t_inpcb); 2174 2175 if (tp->t_flags & TF_DELACK) { 2176 tp->t_flags &= ~TF_DELACK; 2177 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 2178 } 2179 INP_WUNLOCK(tp->t_inpcb); 2180 return (1); 2181} 2182 2183/* 2184 * This tcp-do-segment concentrates on making the fastest 2185 * ack processing path. It does not have a fast-path for 2186 * data (it possibly could which would then eliminate the 2187 * need for fast-slow above). For a content distributor having 2188 * large outgoing elephants and very very little coming in 2189 * having no fastpath for data does not really help (since you 2190 * don't get much data in). The most important thing is 2191 * processing ack's quickly and getting the rest of the data 2192 * output to the peer as quickly as possible. This routine 2193 * seems to be about an overall 3% faster then the old 2194 * tcp_do_segment and keeps us in the fast-path for packets 2195 * much more (by allowing window updates to also stay in the fastpath). 2196 */ 2197void 2198tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 2199 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, 2200 int ti_locked) 2201{ 2202 int thflags; 2203 u_long tiwin; 2204 char *s; 2205 struct in_conninfo *inc; 2206 struct tcpopt to; 2207 2208 thflags = th->th_flags; 2209 tp->sackhint.last_sack_ack = 0; 2210 inc = &tp->t_inpcb->inp_inc; 2211 /* 2212 * If this is either a state-changing packet or current state isn't 2213 * established, we require a write lock on tcbinfo. Otherwise, we 2214 * allow the tcbinfo to be in either alocked or unlocked, as the 2215 * caller may have unnecessarily acquired a write lock due to a race. 2216 */ 2217 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 2218 tp->t_state != TCPS_ESTABLISHED) { 2219 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 2220 "SYN/FIN/RST/!EST", __func__, ti_locked)); 2221 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2222 } else { 2223#ifdef INVARIANTS 2224 if (ti_locked == TI_RLOCKED) { 2225 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2226 } else { 2227 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 2228 "ti_locked: %d", __func__, ti_locked)); 2229 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2230 } 2231#endif 2232 } 2233 INP_WLOCK_ASSERT(tp->t_inpcb); 2234 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 2235 __func__)); 2236 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 2237 __func__)); 2238 2239 /* 2240 * Segment received on connection. 2241 * Reset idle time and keep-alive timer. 2242 * XXX: This should be done after segment 2243 * validation to ignore broken/spoofed segs. 2244 */ 2245 tp->t_rcvtime = ticks; 2246 if (TCPS_HAVEESTABLISHED(tp->t_state)) 2247 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 2248 2249 /* 2250 * Unscale the window into a 32-bit value. 2251 * For the SYN_SENT state the scale is zero. 2252 */ 2253 tiwin = th->th_win << tp->snd_scale; 2254 2255 /* 2256 * TCP ECN processing. 2257 */ 2258 if (tp->t_flags & TF_ECN_PERMIT) { 2259 if (thflags & TH_CWR) 2260 tp->t_flags &= ~TF_ECN_SND_ECE; 2261 switch (iptos & IPTOS_ECN_MASK) { 2262 case IPTOS_ECN_CE: 2263 tp->t_flags |= TF_ECN_SND_ECE; 2264 TCPSTAT_INC(tcps_ecn_ce); 2265 break; 2266 case IPTOS_ECN_ECT0: 2267 TCPSTAT_INC(tcps_ecn_ect0); 2268 break; 2269 case IPTOS_ECN_ECT1: 2270 TCPSTAT_INC(tcps_ecn_ect1); 2271 break; 2272 } 2273 /* Congestion experienced. */ 2274 if (thflags & TH_ECE) { 2275 cc_cong_signal(tp, th, CC_ECN); 2276 } 2277 } 2278 2279 /* 2280 * Parse options on any incoming segment. 2281 */ 2282 tcp_dooptions(&to, (u_char *)(th + 1), 2283 (th->th_off << 2) - sizeof(struct tcphdr), 2284 (thflags & TH_SYN) ? TO_SYN : 0); 2285 2286 /* 2287 * If echoed timestamp is later than the current time, 2288 * fall back to non RFC1323 RTT calculation. Normalize 2289 * timestamp if syncookies were used when this connection 2290 * was established. 2291 */ 2292 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 2293 to.to_tsecr -= tp->ts_offset; 2294 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) 2295 to.to_tsecr = 0; 2296 } 2297 /* 2298 * If timestamps were negotiated during SYN/ACK they should 2299 * appear on every segment during this session and vice versa. 2300 */ 2301 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { 2302 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2303 log(LOG_DEBUG, "%s; %s: Timestamp missing, " 2304 "no action\n", s, __func__); 2305 free(s, M_TCPLOG); 2306 } 2307 } 2308 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { 2309 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2310 log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 2311 "no action\n", s, __func__); 2312 free(s, M_TCPLOG); 2313 } 2314 } 2315 2316 /* 2317 * Process options only when we get SYN/ACK back. The SYN case 2318 * for incoming connections is handled in tcp_syncache. 2319 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 2320 * or <SYN,ACK>) segment itself is never scaled. 2321 * XXX this is traditional behavior, may need to be cleaned up. 2322 */ 2323 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 2324 if ((to.to_flags & TOF_SCALE) && 2325 (tp->t_flags & TF_REQ_SCALE)) { 2326 tp->t_flags |= TF_RCVD_SCALE; 2327 tp->snd_scale = to.to_wscale; 2328 } 2329 /* 2330 * Initial send window. It will be updated with 2331 * the next incoming segment to the scaled value. 2332 */ 2333 tp->snd_wnd = th->th_win; 2334 if (to.to_flags & TOF_TS) { 2335 tp->t_flags |= TF_RCVD_TSTMP; 2336 tp->ts_recent = to.to_tsval; 2337 tp->ts_recent_age = tcp_ts_getticks(); 2338 } 2339 if (to.to_flags & TOF_MSS) 2340 tcp_mss(tp, to.to_mss); 2341 if ((tp->t_flags & TF_SACK_PERMIT) && 2342 (to.to_flags & TOF_SACKPERM) == 0) 2343 tp->t_flags &= ~TF_SACK_PERMIT; 2344 } 2345 /* 2346 * Header prediction: check for the two common cases 2347 * of a uni-directional data xfer. If the packet has 2348 * no control flags, is in-sequence, the window didn't 2349 * change and we're not retransmitting, it's a 2350 * candidate. If the length is zero and the ack moved 2351 * forward, we're the sender side of the xfer. Just 2352 * free the data acked & wake any higher level process 2353 * that was blocked waiting for space. If the length 2354 * is non-zero and the ack didn't move, we're the 2355 * receiver side. If we're getting packets in-order 2356 * (the reassembly queue is empty), add the data to 2357 * the socket buffer and note that we need a delayed ack. 2358 * Make sure that the hidden state-flags are also off. 2359 * Since we check for TCPS_ESTABLISHED first, it can only 2360 * be TH_NEEDSYN. 2361 */ 2362 if (__predict_true(tp->t_state == TCPS_ESTABLISHED) && 2363 __predict_true(((to.to_flags & TOF_SACK) == 0)) && 2364 __predict_true(tlen == 0) && 2365 __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) && 2366 __predict_true(LIST_EMPTY(&tp->t_segq)) && 2367 __predict_true(th->th_seq == tp->rcv_nxt)) { 2368 if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, 2369 ti_locked, tiwin)) { 2370 return; 2371 } 2372 } 2373 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, 2374 ti_locked, tiwin, thflags); 2375} 2376 2377struct tcp_function_block __tcp_fastslow = { 2378 "fastslow", 2379 tcp_output, 2380 tcp_do_segment_fastslow, 2381 tcp_default_ctloutput, 2382 NULL, 2383 NULL, 2384 NULL, 2385 NULL, 2386 NULL, 2387 NULL, 2388 NULL, 2389 0, 2390 0 2391 2392}; 2393 2394struct tcp_function_block __tcp_fastack = { 2395 "fastack", 2396 tcp_output, 2397 tcp_do_segment_fastack, 2398 tcp_default_ctloutput, 2399 NULL, 2400 NULL, 2401 NULL, 2402 NULL, 2403 NULL, 2404 NULL, 2405 NULL, 2406 0, 2407 0 2408}; 2409 2410static int 2411tcp_addfastpaths(module_t mod, int type, void *data) 2412{ 2413 int err=0; 2414 2415 switch (type) { 2416 case MOD_LOAD: 2417 err = register_tcp_functions(&__tcp_fastack, M_WAITOK); 2418 if (err) { 2419 printf("Failed to register fastack module -- err:%d\n", err); 2420 return(err); 2421 } 2422 err = register_tcp_functions(&__tcp_fastslow, M_WAITOK); 2423 if (err) { 2424 printf("Failed to register fastslow module -- err:%d\n", err); 2425 deregister_tcp_functions(&__tcp_fastack); 2426 return(err); 2427 } 2428 break; 2429 case MOD_QUIESCE: 2430 if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) { 2431 return(EBUSY); 2432 } 2433 break; 2434 case MOD_UNLOAD: 2435 err = deregister_tcp_functions(&__tcp_fastack); 2436 if (err == EBUSY) 2437 break; 2438 err = deregister_tcp_functions(&__tcp_fastslow); 2439 if (err == EBUSY) 2440 break; 2441 err = 0; 2442 break; 2443 default: 2444 return (EOPNOTSUPP); 2445 } 2446 return (err); 2447} 2448 2449static moduledata_t new_tcp_fastpaths = { 2450 .name = "tcp_fastpaths", 2451 .evhand = tcp_addfastpaths, 2452 .priv = 0 2453}; 2454 2455MODULE_VERSION(kern_tcpfastpaths, 1); 2456DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PSEUDO, SI_ORDER_ANY);
|