66#include <netinet6/ip6_var.h>
67#endif
68#include <netinet/tcp.h>
69#define TCPOUTFLAGS
70#include <netinet/tcp_fsm.h>
71#include <netinet/tcp_seq.h>
72#include <netinet/tcp_timer.h>
73#include <netinet/tcp_var.h>
74#include <netinet/tcpip.h>
75#ifdef TCPDEBUG
76#include <netinet/tcp_debug.h>
77#endif
78
79#ifdef IPSEC
80#include <netinet6/ipsec.h>
81#endif /*IPSEC*/
82
83#include <machine/in_cksum.h>
84
85#ifdef notyet
/*
 * Unprototyped declaration, compiled only when "notyet" is defined.
 * tcp_output() calls it in its own "notyet" branch to copy the data to
 * be sent while leaving room in front for the headers.
 */
86extern struct mbuf *m_copypack();
87#endif
88
/*
 * Tunables exported read-write (CTLFLAG_RW) under the _net_inet_tcp
 * sysctl node.  Each variable is immediately followed by the SYSCTL_INT()
 * declaration that publishes it.
 *
 * NOTE(review): the integer passed right after the variable's address
 * (1, 1, 1 and 0 below) is presumably the macro's "val" slot and not a
 * default value -- the defaults are the C initializers.  Confirm against
 * sysctl(9).
 */

/*
 * path_mtu_discovery: file-local switch, on (1) by default.  It is not
 * referenced in the portion of tcp_output() visible in this chunk;
 * presumably it is consulted where the IP header is finalized -- TODO
 * confirm.
 */
89static int path_mtu_discovery = 1;
90SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
91 &path_mtu_discovery, 1, "Enable Path MTU Discovery");
92
/*
 * ss_fltsz: number of segments (multiplied by t_maxseg) that snd_cwnd is
 * reset to when tcp_output() finds the connection idle for at least
 * t_rxtcur ticks and the peer is NOT on a local network.  Has external
 * linkage, so other files may read or set it.
 */
93int ss_fltsz = 1;
94SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
95 &ss_fltsz, 1, "Slow start flight size");
96
/*
 * ss_fltsz_local: same role as ss_fltsz, but used when in_localaddr()
 * (or in6_localaddr() for IPv6) reports the peer as local.  The default
 * of TCP_MAXWIN segments makes the restart window very large for local
 * peers.
 */
97int ss_fltsz_local = TCP_MAXWIN; /* something large */
98SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
99 &ss_fltsz_local, 1, "Slow start flight size for local networks");
100
/*
 * tcp_do_newreno: off (0) by default.  Not referenced in the portion of
 * tcp_output() visible in this chunk; presumably read by the input/timer
 * code -- TODO confirm.
 */
101int tcp_do_newreno = 0;
102SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
103 0, "Enable NewReno Algorithms");
104/*
105 * Tcp output routine: figure out what should be sent and send it.
106 */ 107int 108tcp_output(tp) 109 register struct tcpcb *tp; 110{ 111 register struct socket *so = tp->t_inpcb->inp_socket; 112 register long len, win; 113 int off, flags, error; 114 register struct mbuf *m; 115 struct ip *ip = NULL; 116 register struct ipovly *ipov = NULL; 117#ifdef INET6 118 struct ip6_hdr *ip6 = NULL; 119#endif /* INET6 */ 120 register struct tcphdr *th; 121 u_char opt[TCP_MAXOLEN]; 122 unsigned ipoptlen, optlen, hdrlen; 123 int idle, sendalot; 124 int maxburst = TCP_MAXBURST; 125 struct rmxp_tao *taop; 126 struct rmxp_tao tao_noncached; 127#ifdef INET6 128 int isipv6; 129#endif 130 131#ifdef INET6 132 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 133#endif 134 135 /* 136 * Determine length of data that should be transmitted, 137 * and flags that will be used. 138 * If there is some data or critical controls (SYN, RST) 139 * to send, then transmit; otherwise, investigate further. 140 */ 141 idle = (tp->snd_max == tp->snd_una); 142 if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { 143 /* 144 * We have been idle for "a while" and no acks are 145 * expected to clock out any data we send -- 146 * slow start to get ack "clock" running again. 147 * 148 * Set the slow-start flight size depending on whether 149 * this is a local network or not. 150 */ 151 if ( 152#ifdef INET6 153 (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) || 154 (!isipv6 && 155#endif 156 in_localaddr(tp->t_inpcb->inp_faddr) 157#ifdef INET6 158 ) 159#endif 160 ) 161 tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; 162 else 163 tp->snd_cwnd = tp->t_maxseg * ss_fltsz; 164 } 165again: 166 sendalot = 0; 167 off = tp->snd_nxt - tp->snd_una; 168 win = min(tp->snd_wnd, tp->snd_cwnd); 169 170 flags = tcp_outflags[tp->t_state]; 171 /* 172 * Get standard flags, and add SYN or FIN if requested by 'hidden' 173 * state flags. 
174 */ 175 if (tp->t_flags & TF_NEEDFIN) 176 flags |= TH_FIN; 177 if (tp->t_flags & TF_NEEDSYN) 178 flags |= TH_SYN; 179 180 /* 181 * If in persist timeout with window of 0, send 1 byte. 182 * Otherwise, if window is small but nonzero 183 * and timer expired, we will send what we can 184 * and go to transmit state. 185 */ 186 if (tp->t_force) { 187 if (win == 0) { 188 /* 189 * If we still have some data to send, then 190 * clear the FIN bit. Usually this would 191 * happen below when it realizes that we 192 * aren't sending all the data. However, 193 * if we have exactly 1 byte of unsent data, 194 * then it won't clear the FIN bit below, 195 * and if we are in persist state, we wind 196 * up sending the packet without recording 197 * that we sent the FIN bit. 198 * 199 * We can't just blindly clear the FIN bit, 200 * because if we don't have any more data 201 * to send then the probe will be the FIN 202 * itself. 203 */ 204 if (off < so->so_snd.sb_cc) 205 flags &= ~TH_FIN; 206 win = 1; 207 } else { 208 callout_stop(tp->tt_persist); 209 tp->t_rxtshift = 0; 210 } 211 } 212 213 len = (long)ulmin(so->so_snd.sb_cc, win) - off; 214 215 if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { 216 taop = &tao_noncached; 217 bzero(taop, sizeof(*taop)); 218 } 219 220 /* 221 * Lop off SYN bit if it has already been sent. However, if this 222 * is SYN-SENT state and if segment contains data and if we don't 223 * know that foreign host supports TAO, suppress sending segment. 224 */ 225 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { 226 flags &= ~TH_SYN; 227 off--, len++; 228 if (len > 0 && tp->t_state == TCPS_SYN_SENT && 229 taop->tao_ccsent == 0) 230 return 0; 231 } 232 233 /* 234 * Be careful not to send data and/or FIN on SYN segments 235 * in cases when no CC option will be sent. 236 * This measure is needed to prevent interoperability problems 237 * with not fully conformant TCP implementations. 
238 */ 239 if ((flags & TH_SYN) && 240 ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || 241 ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { 242 len = 0; 243 flags &= ~TH_FIN; 244 } 245 246 if (len < 0) { 247 /* 248 * If FIN has been sent but not acked, 249 * but we haven't been called to retransmit, 250 * len will be -1. Otherwise, window shrank 251 * after we sent into it. If window shrank to 0, 252 * cancel pending retransmit, pull snd_nxt back 253 * to (closed) window, and set the persist timer 254 * if it isn't already going. If the window didn't 255 * close completely, just wait for an ACK. 256 */ 257 len = 0; 258 if (win == 0) { 259 callout_stop(tp->tt_rexmt); 260 tp->t_rxtshift = 0; 261 tp->snd_nxt = tp->snd_una; 262 if (!callout_active(tp->tt_persist)) 263 tcp_setpersist(tp); 264 } 265 } 266 if (len > tp->t_maxseg) { 267 len = tp->t_maxseg; 268 sendalot = 1; 269 } 270 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 271 flags &= ~TH_FIN; 272 273 win = sbspace(&so->so_rcv); 274 275 /* 276 * Sender silly window avoidance. If connection is idle 277 * and can send all data, a maximum segment, 278 * at least a maximum default-size segment do it, 279 * or are forced, do it; otherwise don't bother. 280 * If peer's buffer is tiny, then send 281 * when window is at least half open. 282 * If retransmitting (possibly after persist timer forced us 283 * to send into a small window), then must resend. 284 */ 285 if (len) { 286 if (len == tp->t_maxseg) 287 goto send; 288 if (!(tp->t_flags & TF_MORETOCOME) && 289 (idle || tp->t_flags & TF_NODELAY) && 290 (tp->t_flags & TF_NOPUSH) == 0 && 291 len + off >= so->so_snd.sb_cc) 292 goto send; 293 if (tp->t_force) 294 goto send; 295 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) 296 goto send; 297 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 298 goto send; 299 } 300 301 /* 302 * Compare available window to amount of window 303 * known to peer (as advertised window less 304 * next expected input). 
If the difference is at least two 305 * max size segments, or at least 50% of the maximum possible 306 * window, then want to send a window update to peer. 307 */ 308 if (win > 0) { 309 /* 310 * "adv" is the amount we can increase the window, 311 * taking into account that we are limited by 312 * TCP_MAXWIN << tp->rcv_scale. 313 */ 314 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - 315 (tp->rcv_adv - tp->rcv_nxt); 316 317 if (adv >= (long) (2 * tp->t_maxseg)) 318 goto send; 319 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 320 goto send; 321 } 322 323 /* 324 * Send if we owe peer an ACK. 325 */ 326 if (tp->t_flags & TF_ACKNOW) 327 goto send; 328 if ((flags & TH_RST) || 329 ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) 330 goto send; 331 if (SEQ_GT(tp->snd_up, tp->snd_una)) 332 goto send; 333 /* 334 * If our state indicates that FIN should be sent 335 * and we have not yet done so, or we're retransmitting the FIN, 336 * then we need to send. 337 */ 338 if (flags & TH_FIN && 339 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 340 goto send; 341 342 /* 343 * TCP window updates are not reliable, rather a polling protocol 344 * using ``persist'' packets is used to insure receipt of window 345 * updates. The three ``states'' for the output side are: 346 * idle not doing retransmits or persists 347 * persisting to move a small or zero window 348 * (re)transmitting and thereby not persisting 349 * 350 * callout_active(tp->tt_persist) 351 * is true when we are in persist state. 352 * tp->t_force 353 * is set when we are called to send a persist packet. 354 * callout_active(tp->tt_rexmt) 355 * is set when we are retransmitting 356 * The output side is idle when both timers are zero. 357 * 358 * If send window is too small, there is data to transmit, and no 359 * retransmit or persist is pending, then go to persist state. 
360 * If nothing happens soon, send when timer expires: 361 * if window is nonzero, transmit what we can, 362 * otherwise force out a byte. 363 */ 364 if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && 365 !callout_active(tp->tt_persist)) { 366 tp->t_rxtshift = 0; 367 tcp_setpersist(tp); 368 } 369 370 /* 371 * No reason to send a segment, just return. 372 */ 373 return (0); 374 375send: 376 /* 377 * Before ESTABLISHED, force sending of initial options 378 * unless TCP set not to do any options. 379 * NOTE: we assume that the IP/TCP header plus TCP options 380 * always fit in a single mbuf, leaving room for a maximum 381 * link header, i.e. 382 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN 383 */ 384 optlen = 0; 385#ifdef INET6 386 if (isipv6) 387 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); 388 else 389#endif 390 hdrlen = sizeof (struct tcpiphdr); 391 if (flags & TH_SYN) { 392 tp->snd_nxt = tp->iss; 393 if ((tp->t_flags & TF_NOOPT) == 0) { 394 u_short mss; 395 396 opt[0] = TCPOPT_MAXSEG; 397 opt[1] = TCPOLEN_MAXSEG; 398 mss = htons((u_short) tcp_mssopt(tp)); 399 (void)memcpy(opt + 2, &mss, sizeof(mss)); 400 optlen = TCPOLEN_MAXSEG; 401 402 if ((tp->t_flags & TF_REQ_SCALE) && 403 ((flags & TH_ACK) == 0 || 404 (tp->t_flags & TF_RCVD_SCALE))) { 405 *((u_int32_t *)(opt + optlen)) = htonl( 406 TCPOPT_NOP << 24 | 407 TCPOPT_WINDOW << 16 | 408 TCPOLEN_WINDOW << 8 | 409 tp->request_r_scale); 410 optlen += 4; 411 } 412 } 413 } 414 415 /* 416 * Send a timestamp and echo-reply if this is a SYN and our side 417 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 418 * and our peer have sent timestamps in our SYN's. 419 */ 420 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 421 (flags & TH_RST) == 0 && 422 ((flags & TH_ACK) == 0 || 423 (tp->t_flags & TF_RCVD_TSTMP))) { 424 u_int32_t *lp = (u_int32_t *)(opt + optlen); 425 426 /* Form timestamp option as shown in appendix A of RFC 1323. 
*/ 427 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 428 *lp++ = htonl(ticks); 429 *lp = htonl(tp->ts_recent); 430 optlen += TCPOLEN_TSTAMP_APPA; 431 } 432 433 /* 434 * Send `CC-family' options if our side wants to use them (TF_REQ_CC), 435 * options are allowed (!TF_NOOPT) and it's not a RST. 436 */ 437 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && 438 (flags & TH_RST) == 0) { 439 switch (flags & (TH_SYN|TH_ACK)) { 440 /* 441 * This is a normal ACK, send CC if we received CC before 442 * from our peer. 443 */ 444 case TH_ACK: 445 if (!(tp->t_flags & TF_RCVD_CC)) 446 break; 447 /*FALLTHROUGH*/ 448 449 /* 450 * We can only get here in T/TCP's SYN_SENT* state, when 451 * we're a sending a non-SYN segment without waiting for 452 * the ACK of our SYN. A check above assures that we only 453 * do this if our peer understands T/TCP. 454 */ 455 case 0: 456 opt[optlen++] = TCPOPT_NOP; 457 opt[optlen++] = TCPOPT_NOP; 458 opt[optlen++] = TCPOPT_CC; 459 opt[optlen++] = TCPOLEN_CC; 460 *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); 461 462 optlen += 4; 463 break; 464 465 /* 466 * This is our initial SYN, check whether we have to use 467 * CC or CC.new. 468 */ 469 case TH_SYN: 470 opt[optlen++] = TCPOPT_NOP; 471 opt[optlen++] = TCPOPT_NOP; 472 opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? 473 TCPOPT_CCNEW : TCPOPT_CC; 474 opt[optlen++] = TCPOLEN_CC; 475 *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); 476 optlen += 4; 477 break; 478 479 /* 480 * This is a SYN,ACK; send CC and CC.echo if we received 481 * CC from our peer. 
482 */ 483 case (TH_SYN|TH_ACK): 484 if (tp->t_flags & TF_RCVD_CC) { 485 opt[optlen++] = TCPOPT_NOP; 486 opt[optlen++] = TCPOPT_NOP; 487 opt[optlen++] = TCPOPT_CC; 488 opt[optlen++] = TCPOLEN_CC; 489 *(u_int32_t *)&opt[optlen] = 490 htonl(tp->cc_send); 491 optlen += 4; 492 opt[optlen++] = TCPOPT_NOP; 493 opt[optlen++] = TCPOPT_NOP; 494 opt[optlen++] = TCPOPT_CCECHO; 495 opt[optlen++] = TCPOLEN_CC; 496 *(u_int32_t *)&opt[optlen] = 497 htonl(tp->cc_recv); 498 optlen += 4; 499 } 500 break; 501 } 502 } 503 504 hdrlen += optlen; 505 506#ifdef INET6 507 if (isipv6) 508 ipoptlen = ip6_optlen(tp->t_inpcb); 509 else 510#endif 511 { 512 if (tp->t_inpcb->inp_options) { 513 ipoptlen = tp->t_inpcb->inp_options->m_len - 514 offsetof(struct ipoption, ipopt_list); 515 } else { 516 ipoptlen = 0; 517 } 518 } 519#ifdef IPSEC 520 ipoptlen += ipsec_hdrsiz_tcp(tp); 521#endif 522 523 /* 524 * Adjust data length if insertion of options will 525 * bump the packet length beyond the t_maxopd length. 526 * Clear the FIN bit because we cut off the tail of 527 * the segment. 528 */ 529 if (len + optlen + ipoptlen > tp->t_maxopd) { 530 /* 531 * If there is still more to send, don't close the connection. 532 */ 533 flags &= ~TH_FIN; 534 len = tp->t_maxopd - optlen - ipoptlen; 535 sendalot = 1; 536 } 537 538/*#ifdef DIAGNOSTIC*/ 539#ifdef INET6 540 if (max_linkhdr + hdrlen > MCLBYTES) 541 panic("tcphdr too big"); 542#else 543 if (max_linkhdr + hdrlen > MHLEN) 544 panic("tcphdr too big"); 545#endif 546/*#endif*/ 547 548 /* 549 * Grab a header mbuf, attaching a copy of data to 550 * be transmitted, and initialize the header from 551 * the template for sends on this connection. 
552 */ 553 if (len) { 554 if (tp->t_force && len == 1) 555 tcpstat.tcps_sndprobe++; 556 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 557 tcpstat.tcps_sndrexmitpack++; 558 tcpstat.tcps_sndrexmitbyte += len; 559 } else { 560 tcpstat.tcps_sndpack++; 561 tcpstat.tcps_sndbyte += len; 562 } 563#ifdef notyet 564 if ((m = m_copypack(so->so_snd.sb_mb, off, 565 (int)len, max_linkhdr + hdrlen)) == 0) { 566 error = ENOBUFS; 567 goto out; 568 } 569 /* 570 * m_copypack left space for our hdr; use it. 571 */ 572 m->m_len += hdrlen; 573 m->m_data -= hdrlen; 574#else 575 MGETHDR(m, M_DONTWAIT, MT_HEADER); 576 if (m == NULL) { 577 error = ENOBUFS; 578 goto out; 579 } 580#ifdef INET6 581 if (MHLEN < hdrlen + max_linkhdr) { 582 MCLGET(m, M_DONTWAIT); 583 if ((m->m_flags & M_EXT) == 0) { 584 m_freem(m); 585 error = ENOBUFS; 586 goto out; 587 } 588 } 589#endif 590 m->m_data += max_linkhdr; 591 m->m_len = hdrlen; 592 if (len <= MHLEN - hdrlen - max_linkhdr) { 593 m_copydata(so->so_snd.sb_mb, off, (int) len, 594 mtod(m, caddr_t) + hdrlen); 595 m->m_len += len; 596 } else { 597 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 598 if (m->m_next == 0) { 599 (void) m_free(m); 600 error = ENOBUFS; 601 goto out; 602 } 603 } 604#endif 605 /* 606 * If we're sending everything we've got, set PUSH. 607 * (This will keep happy those implementations which only 608 * give data to the user when a buffer fills or 609 * a PUSH comes in.) 
610 */ 611 if (off + len == so->so_snd.sb_cc) 612 flags |= TH_PUSH; 613 } else { 614 if (tp->t_flags & TF_ACKNOW) 615 tcpstat.tcps_sndacks++; 616 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 617 tcpstat.tcps_sndctrl++; 618 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 619 tcpstat.tcps_sndurg++; 620 else 621 tcpstat.tcps_sndwinup++; 622 623 MGETHDR(m, M_DONTWAIT, MT_HEADER); 624 if (m == NULL) { 625 error = ENOBUFS; 626 goto out; 627 } 628#ifdef INET6 629 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 630 MHLEN >= hdrlen) { 631 MH_ALIGN(m, hdrlen); 632 } else 633#endif 634 m->m_data += max_linkhdr; 635 m->m_len = hdrlen; 636 } 637 m->m_pkthdr.rcvif = (struct ifnet *)0; 638 if (tp->t_template == 0) 639 panic("tcp_output"); 640#ifdef INET6 641 if (isipv6) { 642 ip6 = mtod(m, struct ip6_hdr *); 643 th = (struct tcphdr *)(ip6 + 1); 644 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6, 645 sizeof(struct ip6_hdr)); 646 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, 647 sizeof(struct tcphdr)); 648 } else 649#endif /* INET6 */ 650 { 651 ip = mtod(m, struct ip *); 652 ipov = (struct ipovly *)ip; 653 th = (struct tcphdr *)(ip + 1); 654 /* this picks up the pseudo header (w/o the length) */ 655 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip, 656 sizeof(struct ip)); 657 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, 658 sizeof(struct tcphdr)); 659 } 660 661 /* 662 * Fill in fields, remembering maximum advertised 663 * window for use in delaying messages about window sizes. 664 * If resending a FIN, be sure not to use a new sequence number. 665 */ 666 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 667 tp->snd_nxt == tp->snd_max) 668 tp->snd_nxt--; 669 /* 670 * If we are doing retransmissions, then snd_nxt will 671 * not reflect the first unsent octet. For ACK only 672 * packets, we do not want the sequence number of the 673 * retransmitted packet, we want the sequence number 674 * of the next unsent octet. 
So, if there is no data 675 * (and no SYN or FIN), use snd_max instead of snd_nxt 676 * when filling in ti_seq. But if we are in persist 677 * state, snd_max might reflect one byte beyond the 678 * right edge of the window, so use snd_nxt in that 679 * case, since we know we aren't doing a retransmission. 680 * (retransmit and persist are mutually exclusive...) 681 */ 682 if (len || (flags & (TH_SYN|TH_FIN)) 683 || callout_active(tp->tt_persist)) 684 th->th_seq = htonl(tp->snd_nxt); 685 else 686 th->th_seq = htonl(tp->snd_max); 687 th->th_ack = htonl(tp->rcv_nxt); 688 if (optlen) { 689 bcopy(opt, th + 1, optlen); 690 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 691 } 692 th->th_flags = flags; 693 /* 694 * Calculate receive window. Don't shrink window, 695 * but avoid silly window syndrome. 696 */ 697 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 698 win = 0; 699 if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 700 win = (long)(tp->rcv_adv - tp->rcv_nxt); 701 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 702 win = (long)TCP_MAXWIN << tp->rcv_scale; 703 th->th_win = htons((u_short) (win>>tp->rcv_scale)); 704 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 705 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 706 th->th_flags |= TH_URG; 707 } else 708 /* 709 * If no urgent pointer to send, then we pull 710 * the urgent pointer to the left edge of the send window 711 * so that it doesn't drift into the send window on sequence 712 * number wraparound. 713 */ 714 tp->snd_up = tp->snd_una; /* drag it along */ 715 716 /* 717 * Put TCP length in extended header, and then 718 * checksum extended header and data. 719 */ 720 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 721#ifdef INET6 722 if (isipv6) 723 /* 724 * ip6_plen is not need to be filled now, and will be filled 725 * in ip6_output. 
726 */ 727 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 728 sizeof(struct tcphdr) + optlen + len); 729 else 730#endif /* INET6 */ 731 { 732 m->m_pkthdr.csum_flags = CSUM_TCP; 733 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 734 if (len + optlen) 735 th->th_sum = in_addword(th->th_sum, 736 htons((u_short)(optlen + len))); 737 738 /* IP version must be set here for ipv4/ipv6 checking later */ 739 KASSERT(ip->ip_v == IPVERSION, 740 ("%s: IP version incorrect: %d", __FUNCTION__, ip->ip_v)); 741 } 742 743 /* 744 * In transmit state, time the transmission and arrange for 745 * the retransmit. In persist state, just set snd_max. 746 */ 747 if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { 748 tcp_seq startseq = tp->snd_nxt; 749 750 /* 751 * Advance snd_nxt over sequence space of this segment. 752 */ 753 if (flags & (TH_SYN|TH_FIN)) { 754 if (flags & TH_SYN) 755 tp->snd_nxt++; 756 if (flags & TH_FIN) { 757 tp->snd_nxt++; 758 tp->t_flags |= TF_SENTFIN; 759 } 760 } 761 tp->snd_nxt += len; 762 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 763 tp->snd_max = tp->snd_nxt; 764 /* 765 * Time this transmission if not a retransmission and 766 * not currently timing anything. 767 */ 768 if (tp->t_rtttime == 0) { 769 tp->t_rtttime = ticks; 770 tp->t_rtseq = startseq; 771 tcpstat.tcps_segstimed++; 772 } 773 } 774 775 /* 776 * Set retransmit timer if not currently set, 777 * and not doing an ack or a keep-alive probe. 778 * Initial value for retransmit timer is smoothed 779 * round-trip time + 2 * round-trip time variance. 780 * Initialize shift counter which is used for backoff 781 * of retransmit time. 
782 */ 783 if (!callout_active(tp->tt_rexmt) && 784 tp->snd_nxt != tp->snd_una) { 785 if (callout_active(tp->tt_persist)) { 786 callout_stop(tp->tt_persist); 787 tp->t_rxtshift = 0; 788 } 789 callout_reset(tp->tt_rexmt, tp->t_rxtcur, 790 tcp_timer_rexmt, tp); 791 } 792 } else 793 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 794 tp->snd_max = tp->snd_nxt + len; 795 796#ifdef TCPDEBUG 797 /* 798 * Trace. 799 */ 800 if (so->so_options & SO_DEBUG) 801 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 802#endif 803 804 /* 805 * Fill in IP length and desired time to live and 806 * send to IP level. There should be a better way 807 * to handle ttl and tos; we could keep them in 808 * the template, but need a way to checksum without them. 809 */ 810 /* 811 * m->m_pkthdr.len should have been set before cksum calcuration, 812 * because in6_cksum() need it. 813 */ 814#ifdef INET6 815 if (isipv6) {
| 63#include <netinet6/ip6_var.h>
64#endif
65#include <netinet/tcp.h>
66#define TCPOUTFLAGS
67#include <netinet/tcp_fsm.h>
68#include <netinet/tcp_seq.h>
69#include <netinet/tcp_timer.h>
70#include <netinet/tcp_var.h>
71#include <netinet/tcpip.h>
72#ifdef TCPDEBUG
73#include <netinet/tcp_debug.h>
74#endif
75
76#ifdef IPSEC
77#include <netinet6/ipsec.h>
78#endif /*IPSEC*/
79
80#include <machine/in_cksum.h>
81
82#ifdef notyet
/*
 * Unprototyped declaration, compiled only when "notyet" is defined.
 */
83extern struct mbuf *m_copypack();
84#endif
85
/*
 * Tunables exported read-write (CTLFLAG_RW) under the _net_inet_tcp
 * sysctl node.  Each variable is immediately followed by the SYSCTL_INT()
 * declaration that publishes it.
 *
 * NOTE(review): the integer passed right after the variable's address
 * (1, 1, 1 and 0 below) is presumably the macro's "val" slot and not a
 * default value -- the defaults are the C initializers.  Confirm against
 * sysctl(9).
 */

/*
 * path_mtu_discovery: file-local switch, on (1) by default.  It is not
 * referenced in the portion of tcp_output() visible in this chunk;
 * presumably it is consulted where the IP header is finalized -- TODO
 * confirm.
 */
86static int path_mtu_discovery = 1;
87SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
88 &path_mtu_discovery, 1, "Enable Path MTU Discovery");
89
/*
 * ss_fltsz: number of segments (multiplied by t_maxseg) that snd_cwnd is
 * reset to when tcp_output() finds the connection idle for at least
 * t_rxtcur ticks and the peer is NOT on a local network.  Has external
 * linkage, so other files may read or set it.
 */
90int ss_fltsz = 1;
91SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
92 &ss_fltsz, 1, "Slow start flight size");
93
/*
 * ss_fltsz_local: same role as ss_fltsz, but used when in_localaddr()
 * (or in6_localaddr() for IPv6) reports the peer as local.  The default
 * of TCP_MAXWIN segments makes the restart window very large for local
 * peers.
 */
94int ss_fltsz_local = TCP_MAXWIN; /* something large */
95SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
96 &ss_fltsz_local, 1, "Slow start flight size for local networks");
97
/*
 * tcp_do_newreno: off (0) by default.  Not referenced in the portion of
 * tcp_output() visible in this chunk; presumably read by the input/timer
 * code -- TODO confirm.
 */
98int tcp_do_newreno = 0;
99SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
100 0, "Enable NewReno Algorithms");
101/*
102 * Tcp output routine: figure out what should be sent and send it.
103 */ 104int 105tcp_output(tp) 106 register struct tcpcb *tp; 107{ 108 register struct socket *so = tp->t_inpcb->inp_socket; 109 register long len, win; 110 int off, flags, error; 111 register struct mbuf *m; 112 struct ip *ip = NULL; 113 register struct ipovly *ipov = NULL; 114#ifdef INET6 115 struct ip6_hdr *ip6 = NULL; 116#endif /* INET6 */ 117 register struct tcphdr *th; 118 u_char opt[TCP_MAXOLEN]; 119 unsigned ipoptlen, optlen, hdrlen; 120 int idle, sendalot; 121 int maxburst = TCP_MAXBURST; 122 struct rmxp_tao *taop; 123 struct rmxp_tao tao_noncached; 124#ifdef INET6 125 int isipv6; 126#endif 127 128#ifdef INET6 129 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 130#endif 131 132 /* 133 * Determine length of data that should be transmitted, 134 * and flags that will be used. 135 * If there is some data or critical controls (SYN, RST) 136 * to send, then transmit; otherwise, investigate further. 137 */ 138 idle = (tp->snd_max == tp->snd_una); 139 if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { 140 /* 141 * We have been idle for "a while" and no acks are 142 * expected to clock out any data we send -- 143 * slow start to get ack "clock" running again. 144 * 145 * Set the slow-start flight size depending on whether 146 * this is a local network or not. 147 */ 148 if ( 149#ifdef INET6 150 (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) || 151 (!isipv6 && 152#endif 153 in_localaddr(tp->t_inpcb->inp_faddr) 154#ifdef INET6 155 ) 156#endif 157 ) 158 tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; 159 else 160 tp->snd_cwnd = tp->t_maxseg * ss_fltsz; 161 } 162again: 163 sendalot = 0; 164 off = tp->snd_nxt - tp->snd_una; 165 win = min(tp->snd_wnd, tp->snd_cwnd); 166 167 flags = tcp_outflags[tp->t_state]; 168 /* 169 * Get standard flags, and add SYN or FIN if requested by 'hidden' 170 * state flags. 
171 */ 172 if (tp->t_flags & TF_NEEDFIN) 173 flags |= TH_FIN; 174 if (tp->t_flags & TF_NEEDSYN) 175 flags |= TH_SYN; 176 177 /* 178 * If in persist timeout with window of 0, send 1 byte. 179 * Otherwise, if window is small but nonzero 180 * and timer expired, we will send what we can 181 * and go to transmit state. 182 */ 183 if (tp->t_force) { 184 if (win == 0) { 185 /* 186 * If we still have some data to send, then 187 * clear the FIN bit. Usually this would 188 * happen below when it realizes that we 189 * aren't sending all the data. However, 190 * if we have exactly 1 byte of unsent data, 191 * then it won't clear the FIN bit below, 192 * and if we are in persist state, we wind 193 * up sending the packet without recording 194 * that we sent the FIN bit. 195 * 196 * We can't just blindly clear the FIN bit, 197 * because if we don't have any more data 198 * to send then the probe will be the FIN 199 * itself. 200 */ 201 if (off < so->so_snd.sb_cc) 202 flags &= ~TH_FIN; 203 win = 1; 204 } else { 205 callout_stop(tp->tt_persist); 206 tp->t_rxtshift = 0; 207 } 208 } 209 210 len = (long)ulmin(so->so_snd.sb_cc, win) - off; 211 212 if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { 213 taop = &tao_noncached; 214 bzero(taop, sizeof(*taop)); 215 } 216 217 /* 218 * Lop off SYN bit if it has already been sent. However, if this 219 * is SYN-SENT state and if segment contains data and if we don't 220 * know that foreign host supports TAO, suppress sending segment. 221 */ 222 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { 223 flags &= ~TH_SYN; 224 off--, len++; 225 if (len > 0 && tp->t_state == TCPS_SYN_SENT && 226 taop->tao_ccsent == 0) 227 return 0; 228 } 229 230 /* 231 * Be careful not to send data and/or FIN on SYN segments 232 * in cases when no CC option will be sent. 233 * This measure is needed to prevent interoperability problems 234 * with not fully conformant TCP implementations. 
235 */ 236 if ((flags & TH_SYN) && 237 ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || 238 ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { 239 len = 0; 240 flags &= ~TH_FIN; 241 } 242 243 if (len < 0) { 244 /* 245 * If FIN has been sent but not acked, 246 * but we haven't been called to retransmit, 247 * len will be -1. Otherwise, window shrank 248 * after we sent into it. If window shrank to 0, 249 * cancel pending retransmit, pull snd_nxt back 250 * to (closed) window, and set the persist timer 251 * if it isn't already going. If the window didn't 252 * close completely, just wait for an ACK. 253 */ 254 len = 0; 255 if (win == 0) { 256 callout_stop(tp->tt_rexmt); 257 tp->t_rxtshift = 0; 258 tp->snd_nxt = tp->snd_una; 259 if (!callout_active(tp->tt_persist)) 260 tcp_setpersist(tp); 261 } 262 } 263 if (len > tp->t_maxseg) { 264 len = tp->t_maxseg; 265 sendalot = 1; 266 } 267 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 268 flags &= ~TH_FIN; 269 270 win = sbspace(&so->so_rcv); 271 272 /* 273 * Sender silly window avoidance. If connection is idle 274 * and can send all data, a maximum segment, 275 * at least a maximum default-size segment do it, 276 * or are forced, do it; otherwise don't bother. 277 * If peer's buffer is tiny, then send 278 * when window is at least half open. 279 * If retransmitting (possibly after persist timer forced us 280 * to send into a small window), then must resend. 281 */ 282 if (len) { 283 if (len == tp->t_maxseg) 284 goto send; 285 if (!(tp->t_flags & TF_MORETOCOME) && 286 (idle || tp->t_flags & TF_NODELAY) && 287 (tp->t_flags & TF_NOPUSH) == 0 && 288 len + off >= so->so_snd.sb_cc) 289 goto send; 290 if (tp->t_force) 291 goto send; 292 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) 293 goto send; 294 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 295 goto send; 296 } 297 298 /* 299 * Compare available window to amount of window 300 * known to peer (as advertised window less 301 * next expected input). 
If the difference is at least two 302 * max size segments, or at least 50% of the maximum possible 303 * window, then want to send a window update to peer. 304 */ 305 if (win > 0) { 306 /* 307 * "adv" is the amount we can increase the window, 308 * taking into account that we are limited by 309 * TCP_MAXWIN << tp->rcv_scale. 310 */ 311 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - 312 (tp->rcv_adv - tp->rcv_nxt); 313 314 if (adv >= (long) (2 * tp->t_maxseg)) 315 goto send; 316 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 317 goto send; 318 } 319 320 /* 321 * Send if we owe peer an ACK. 322 */ 323 if (tp->t_flags & TF_ACKNOW) 324 goto send; 325 if ((flags & TH_RST) || 326 ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) 327 goto send; 328 if (SEQ_GT(tp->snd_up, tp->snd_una)) 329 goto send; 330 /* 331 * If our state indicates that FIN should be sent 332 * and we have not yet done so, or we're retransmitting the FIN, 333 * then we need to send. 334 */ 335 if (flags & TH_FIN && 336 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 337 goto send; 338 339 /* 340 * TCP window updates are not reliable, rather a polling protocol 341 * using ``persist'' packets is used to insure receipt of window 342 * updates. The three ``states'' for the output side are: 343 * idle not doing retransmits or persists 344 * persisting to move a small or zero window 345 * (re)transmitting and thereby not persisting 346 * 347 * callout_active(tp->tt_persist) 348 * is true when we are in persist state. 349 * tp->t_force 350 * is set when we are called to send a persist packet. 351 * callout_active(tp->tt_rexmt) 352 * is set when we are retransmitting 353 * The output side is idle when both timers are zero. 354 * 355 * If send window is too small, there is data to transmit, and no 356 * retransmit or persist is pending, then go to persist state. 
357 * If nothing happens soon, send when timer expires: 358 * if window is nonzero, transmit what we can, 359 * otherwise force out a byte. 360 */ 361 if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && 362 !callout_active(tp->tt_persist)) { 363 tp->t_rxtshift = 0; 364 tcp_setpersist(tp); 365 } 366 367 /* 368 * No reason to send a segment, just return. 369 */ 370 return (0); 371 372send: 373 /* 374 * Before ESTABLISHED, force sending of initial options 375 * unless TCP set not to do any options. 376 * NOTE: we assume that the IP/TCP header plus TCP options 377 * always fit in a single mbuf, leaving room for a maximum 378 * link header, i.e. 379 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN 380 */ 381 optlen = 0; 382#ifdef INET6 383 if (isipv6) 384 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); 385 else 386#endif 387 hdrlen = sizeof (struct tcpiphdr); 388 if (flags & TH_SYN) { 389 tp->snd_nxt = tp->iss; 390 if ((tp->t_flags & TF_NOOPT) == 0) { 391 u_short mss; 392 393 opt[0] = TCPOPT_MAXSEG; 394 opt[1] = TCPOLEN_MAXSEG; 395 mss = htons((u_short) tcp_mssopt(tp)); 396 (void)memcpy(opt + 2, &mss, sizeof(mss)); 397 optlen = TCPOLEN_MAXSEG; 398 399 if ((tp->t_flags & TF_REQ_SCALE) && 400 ((flags & TH_ACK) == 0 || 401 (tp->t_flags & TF_RCVD_SCALE))) { 402 *((u_int32_t *)(opt + optlen)) = htonl( 403 TCPOPT_NOP << 24 | 404 TCPOPT_WINDOW << 16 | 405 TCPOLEN_WINDOW << 8 | 406 tp->request_r_scale); 407 optlen += 4; 408 } 409 } 410 } 411 412 /* 413 * Send a timestamp and echo-reply if this is a SYN and our side 414 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 415 * and our peer have sent timestamps in our SYN's. 416 */ 417 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 418 (flags & TH_RST) == 0 && 419 ((flags & TH_ACK) == 0 || 420 (tp->t_flags & TF_RCVD_TSTMP))) { 421 u_int32_t *lp = (u_int32_t *)(opt + optlen); 422 423 /* Form timestamp option as shown in appendix A of RFC 1323. 
*/ 424 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 425 *lp++ = htonl(ticks); 426 *lp = htonl(tp->ts_recent); 427 optlen += TCPOLEN_TSTAMP_APPA; 428 } 429 430 /* 431 * Send `CC-family' options if our side wants to use them (TF_REQ_CC), 432 * options are allowed (!TF_NOOPT) and it's not a RST. 433 */ 434 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && 435 (flags & TH_RST) == 0) { 436 switch (flags & (TH_SYN|TH_ACK)) { 437 /* 438 * This is a normal ACK, send CC if we received CC before 439 * from our peer. 440 */ 441 case TH_ACK: 442 if (!(tp->t_flags & TF_RCVD_CC)) 443 break; 444 /*FALLTHROUGH*/ 445 446 /* 447 * We can only get here in T/TCP's SYN_SENT* state, when 448 * we're a sending a non-SYN segment without waiting for 449 * the ACK of our SYN. A check above assures that we only 450 * do this if our peer understands T/TCP. 451 */ 452 case 0: 453 opt[optlen++] = TCPOPT_NOP; 454 opt[optlen++] = TCPOPT_NOP; 455 opt[optlen++] = TCPOPT_CC; 456 opt[optlen++] = TCPOLEN_CC; 457 *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); 458 459 optlen += 4; 460 break; 461 462 /* 463 * This is our initial SYN, check whether we have to use 464 * CC or CC.new. 465 */ 466 case TH_SYN: 467 opt[optlen++] = TCPOPT_NOP; 468 opt[optlen++] = TCPOPT_NOP; 469 opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? 470 TCPOPT_CCNEW : TCPOPT_CC; 471 opt[optlen++] = TCPOLEN_CC; 472 *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); 473 optlen += 4; 474 break; 475 476 /* 477 * This is a SYN,ACK; send CC and CC.echo if we received 478 * CC from our peer. 
479 */ 480 case (TH_SYN|TH_ACK): 481 if (tp->t_flags & TF_RCVD_CC) { 482 opt[optlen++] = TCPOPT_NOP; 483 opt[optlen++] = TCPOPT_NOP; 484 opt[optlen++] = TCPOPT_CC; 485 opt[optlen++] = TCPOLEN_CC; 486 *(u_int32_t *)&opt[optlen] = 487 htonl(tp->cc_send); 488 optlen += 4; 489 opt[optlen++] = TCPOPT_NOP; 490 opt[optlen++] = TCPOPT_NOP; 491 opt[optlen++] = TCPOPT_CCECHO; 492 opt[optlen++] = TCPOLEN_CC; 493 *(u_int32_t *)&opt[optlen] = 494 htonl(tp->cc_recv); 495 optlen += 4; 496 } 497 break; 498 } 499 } 500 501 hdrlen += optlen; 502 503#ifdef INET6 504 if (isipv6) 505 ipoptlen = ip6_optlen(tp->t_inpcb); 506 else 507#endif 508 { 509 if (tp->t_inpcb->inp_options) { 510 ipoptlen = tp->t_inpcb->inp_options->m_len - 511 offsetof(struct ipoption, ipopt_list); 512 } else { 513 ipoptlen = 0; 514 } 515 } 516#ifdef IPSEC 517 ipoptlen += ipsec_hdrsiz_tcp(tp); 518#endif 519 520 /* 521 * Adjust data length if insertion of options will 522 * bump the packet length beyond the t_maxopd length. 523 * Clear the FIN bit because we cut off the tail of 524 * the segment. 525 */ 526 if (len + optlen + ipoptlen > tp->t_maxopd) { 527 /* 528 * If there is still more to send, don't close the connection. 529 */ 530 flags &= ~TH_FIN; 531 len = tp->t_maxopd - optlen - ipoptlen; 532 sendalot = 1; 533 } 534 535/*#ifdef DIAGNOSTIC*/ 536#ifdef INET6 537 if (max_linkhdr + hdrlen > MCLBYTES) 538 panic("tcphdr too big"); 539#else 540 if (max_linkhdr + hdrlen > MHLEN) 541 panic("tcphdr too big"); 542#endif 543/*#endif*/ 544 545 /* 546 * Grab a header mbuf, attaching a copy of data to 547 * be transmitted, and initialize the header from 548 * the template for sends on this connection. 
549 */ 550 if (len) { 551 if (tp->t_force && len == 1) 552 tcpstat.tcps_sndprobe++; 553 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 554 tcpstat.tcps_sndrexmitpack++; 555 tcpstat.tcps_sndrexmitbyte += len; 556 } else { 557 tcpstat.tcps_sndpack++; 558 tcpstat.tcps_sndbyte += len; 559 } 560#ifdef notyet 561 if ((m = m_copypack(so->so_snd.sb_mb, off, 562 (int)len, max_linkhdr + hdrlen)) == 0) { 563 error = ENOBUFS; 564 goto out; 565 } 566 /* 567 * m_copypack left space for our hdr; use it. 568 */ 569 m->m_len += hdrlen; 570 m->m_data -= hdrlen; 571#else 572 MGETHDR(m, M_DONTWAIT, MT_HEADER); 573 if (m == NULL) { 574 error = ENOBUFS; 575 goto out; 576 } 577#ifdef INET6 578 if (MHLEN < hdrlen + max_linkhdr) { 579 MCLGET(m, M_DONTWAIT); 580 if ((m->m_flags & M_EXT) == 0) { 581 m_freem(m); 582 error = ENOBUFS; 583 goto out; 584 } 585 } 586#endif 587 m->m_data += max_linkhdr; 588 m->m_len = hdrlen; 589 if (len <= MHLEN - hdrlen - max_linkhdr) { 590 m_copydata(so->so_snd.sb_mb, off, (int) len, 591 mtod(m, caddr_t) + hdrlen); 592 m->m_len += len; 593 } else { 594 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 595 if (m->m_next == 0) { 596 (void) m_free(m); 597 error = ENOBUFS; 598 goto out; 599 } 600 } 601#endif 602 /* 603 * If we're sending everything we've got, set PUSH. 604 * (This will keep happy those implementations which only 605 * give data to the user when a buffer fills or 606 * a PUSH comes in.) 
607 */ 608 if (off + len == so->so_snd.sb_cc) 609 flags |= TH_PUSH; 610 } else { 611 if (tp->t_flags & TF_ACKNOW) 612 tcpstat.tcps_sndacks++; 613 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 614 tcpstat.tcps_sndctrl++; 615 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 616 tcpstat.tcps_sndurg++; 617 else 618 tcpstat.tcps_sndwinup++; 619 620 MGETHDR(m, M_DONTWAIT, MT_HEADER); 621 if (m == NULL) { 622 error = ENOBUFS; 623 goto out; 624 } 625#ifdef INET6 626 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 627 MHLEN >= hdrlen) { 628 MH_ALIGN(m, hdrlen); 629 } else 630#endif 631 m->m_data += max_linkhdr; 632 m->m_len = hdrlen; 633 } 634 m->m_pkthdr.rcvif = (struct ifnet *)0; 635 if (tp->t_template == 0) 636 panic("tcp_output"); 637#ifdef INET6 638 if (isipv6) { 639 ip6 = mtod(m, struct ip6_hdr *); 640 th = (struct tcphdr *)(ip6 + 1); 641 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6, 642 sizeof(struct ip6_hdr)); 643 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, 644 sizeof(struct tcphdr)); 645 } else 646#endif /* INET6 */ 647 { 648 ip = mtod(m, struct ip *); 649 ipov = (struct ipovly *)ip; 650 th = (struct tcphdr *)(ip + 1); 651 /* this picks up the pseudo header (w/o the length) */ 652 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip, 653 sizeof(struct ip)); 654 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, 655 sizeof(struct tcphdr)); 656 } 657 658 /* 659 * Fill in fields, remembering maximum advertised 660 * window for use in delaying messages about window sizes. 661 * If resending a FIN, be sure not to use a new sequence number. 662 */ 663 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 664 tp->snd_nxt == tp->snd_max) 665 tp->snd_nxt--; 666 /* 667 * If we are doing retransmissions, then snd_nxt will 668 * not reflect the first unsent octet. For ACK only 669 * packets, we do not want the sequence number of the 670 * retransmitted packet, we want the sequence number 671 * of the next unsent octet. 
So, if there is no data 672 * (and no SYN or FIN), use snd_max instead of snd_nxt 673 * when filling in ti_seq. But if we are in persist 674 * state, snd_max might reflect one byte beyond the 675 * right edge of the window, so use snd_nxt in that 676 * case, since we know we aren't doing a retransmission. 677 * (retransmit and persist are mutually exclusive...) 678 */ 679 if (len || (flags & (TH_SYN|TH_FIN)) 680 || callout_active(tp->tt_persist)) 681 th->th_seq = htonl(tp->snd_nxt); 682 else 683 th->th_seq = htonl(tp->snd_max); 684 th->th_ack = htonl(tp->rcv_nxt); 685 if (optlen) { 686 bcopy(opt, th + 1, optlen); 687 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 688 } 689 th->th_flags = flags; 690 /* 691 * Calculate receive window. Don't shrink window, 692 * but avoid silly window syndrome. 693 */ 694 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 695 win = 0; 696 if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 697 win = (long)(tp->rcv_adv - tp->rcv_nxt); 698 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 699 win = (long)TCP_MAXWIN << tp->rcv_scale; 700 th->th_win = htons((u_short) (win>>tp->rcv_scale)); 701 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 702 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 703 th->th_flags |= TH_URG; 704 } else 705 /* 706 * If no urgent pointer to send, then we pull 707 * the urgent pointer to the left edge of the send window 708 * so that it doesn't drift into the send window on sequence 709 * number wraparound. 710 */ 711 tp->snd_up = tp->snd_una; /* drag it along */ 712 713 /* 714 * Put TCP length in extended header, and then 715 * checksum extended header and data. 716 */ 717 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 718#ifdef INET6 719 if (isipv6) 720 /* 721 * ip6_plen is not need to be filled now, and will be filled 722 * in ip6_output. 
723 */ 724 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 725 sizeof(struct tcphdr) + optlen + len); 726 else 727#endif /* INET6 */ 728 { 729 m->m_pkthdr.csum_flags = CSUM_TCP; 730 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 731 if (len + optlen) 732 th->th_sum = in_addword(th->th_sum, 733 htons((u_short)(optlen + len))); 734 735 /* IP version must be set here for ipv4/ipv6 checking later */ 736 KASSERT(ip->ip_v == IPVERSION, 737 ("%s: IP version incorrect: %d", __FUNCTION__, ip->ip_v)); 738 } 739 740 /* 741 * In transmit state, time the transmission and arrange for 742 * the retransmit. In persist state, just set snd_max. 743 */ 744 if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { 745 tcp_seq startseq = tp->snd_nxt; 746 747 /* 748 * Advance snd_nxt over sequence space of this segment. 749 */ 750 if (flags & (TH_SYN|TH_FIN)) { 751 if (flags & TH_SYN) 752 tp->snd_nxt++; 753 if (flags & TH_FIN) { 754 tp->snd_nxt++; 755 tp->t_flags |= TF_SENTFIN; 756 } 757 } 758 tp->snd_nxt += len; 759 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 760 tp->snd_max = tp->snd_nxt; 761 /* 762 * Time this transmission if not a retransmission and 763 * not currently timing anything. 764 */ 765 if (tp->t_rtttime == 0) { 766 tp->t_rtttime = ticks; 767 tp->t_rtseq = startseq; 768 tcpstat.tcps_segstimed++; 769 } 770 } 771 772 /* 773 * Set retransmit timer if not currently set, 774 * and not doing an ack or a keep-alive probe. 775 * Initial value for retransmit timer is smoothed 776 * round-trip time + 2 * round-trip time variance. 777 * Initialize shift counter which is used for backoff 778 * of retransmit time. 
779 */ 780 if (!callout_active(tp->tt_rexmt) && 781 tp->snd_nxt != tp->snd_una) { 782 if (callout_active(tp->tt_persist)) { 783 callout_stop(tp->tt_persist); 784 tp->t_rxtshift = 0; 785 } 786 callout_reset(tp->tt_rexmt, tp->t_rxtcur, 787 tcp_timer_rexmt, tp); 788 } 789 } else 790 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 791 tp->snd_max = tp->snd_nxt + len; 792 793#ifdef TCPDEBUG 794 /* 795 * Trace. 796 */ 797 if (so->so_options & SO_DEBUG) 798 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 799#endif 800 801 /* 802 * Fill in IP length and desired time to live and 803 * send to IP level. There should be a better way 804 * to handle ttl and tos; we could keep them in 805 * the template, but need a way to checksum without them. 806 */ 807 /* 808 * m->m_pkthdr.len should have been set before cksum calcuration, 809 * because in6_cksum() need it. 810 */ 811#ifdef INET6 812 if (isipv6) {
|