tcp_input.c revision 38513
1/* 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 34 * $Id: tcp_input.c,v 1.79 1998/07/06 03:20:19 julian Exp $ 35 */ 36 37#include "opt_ipfw.h" /* for ipfw_fwd */ 38#include "opt_tcpdebug.h" 39 40#include <sys/param.h> 41#include <sys/systm.h> 42#include <sys/kernel.h> 43#include <sys/sysctl.h> 44#include <sys/malloc.h> 45#include <sys/mbuf.h> 46#include <sys/proc.h> /* for proc0 declaration */ 47#include <sys/protosw.h> 48#include <sys/socket.h> 49#include <sys/socketvar.h> 50#include <sys/syslog.h> 51 52#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 53 54#include <net/if.h> 55#include <net/route.h> 56 57#include <netinet/in.h> 58#include <netinet/in_systm.h> 59#include <netinet/ip.h> 60#include <netinet/in_pcb.h> 61#include <netinet/ip_var.h> 62#include <netinet/tcp.h> 63#include <netinet/tcp_fsm.h> 64#include <netinet/tcp_seq.h> 65#include <netinet/tcp_timer.h> 66#include <netinet/tcp_var.h> 67#include <netinet/tcpip.h> 68#ifdef TCPDEBUG 69#include <netinet/tcp_debug.h> 70static struct tcpiphdr tcp_saveti; 71#endif 72 73static int tcprexmtthresh = 3; 74tcp_seq tcp_iss; 75tcp_cc tcp_ccgen; 76 77struct tcpstat tcpstat; 78SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, 79 CTLFLAG_RD, &tcpstat , tcpstat, ""); 80 81static int log_in_vain = 0; 82SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, 83 &log_in_vain, 0, ""); 84 85int tcp_delack_enabled = 1; 86SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, 87 &tcp_delack_enabled, 0, ""); 88 89u_long tcp_now; 90struct inpcbhead tcb; 91struct inpcbinfo tcbinfo; 92 93static void tcp_dooptions __P((struct tcpcb *, 94 u_char *, int, struct tcpiphdr *, struct tcpopt *)); 95static void tcp_pulloutofband __P((struct socket *, 96 struct tcpiphdr *, struct mbuf *)); 97static int tcp_reass __P((struct tcpcb *, struct tcpiphdr *, struct mbuf *)); 98static void tcp_xmit_timer __P((struct tcpcb *, int)); 99 100 101/* 102 * Insert segment ti into reassembly queue of tcp with 103 * control block tp. Return TH_FIN if reassembly now includes 104 * a segment with FIN. The macro form does the common case inline 105 * (segment is the next to be received on an established connection, 106 * and the queue is empty), avoiding linkage into and removal 107 * from the queue and repetition of various conversions. 108 * Set DELACK for segments received in order, but ack immediately 109 * when segments are out of order (so fast retransmit can work). 110 */ 111#define TCP_REASS(tp, ti, m, so, flags) { \ 112 if ((ti)->ti_seq == (tp)->rcv_nxt && \ 113 (tp)->t_segq == NULL && \ 114 (tp)->t_state == TCPS_ESTABLISHED) { \ 115 if (tcp_delack_enabled) \ 116 tp->t_flags |= TF_DELACK; \ 117 else \ 118 tp->t_flags |= TF_ACKNOW; \ 119 (tp)->rcv_nxt += (ti)->ti_len; \ 120 flags = (ti)->ti_flags & TH_FIN; \ 121 tcpstat.tcps_rcvpack++;\ 122 tcpstat.tcps_rcvbyte += (ti)->ti_len;\ 123 sbappend(&(so)->so_rcv, (m)); \ 124 sorwakeup(so); \ 125 } else { \ 126 (flags) = tcp_reass((tp), (ti), (m)); \ 127 tp->t_flags |= TF_ACKNOW; \ 128 } \ 129} 130 131static int 132tcp_reass(tp, ti, m) 133 register struct tcpcb *tp; 134 register struct tcpiphdr *ti; 135 struct mbuf *m; 136{ 137 struct mbuf *q; 138 struct mbuf *p; 139 struct mbuf *nq; 140 struct socket *so = tp->t_inpcb->inp_socket; 141 int flags; 142 143#define GETTCP(m) ((struct tcpiphdr *)m->m_pkthdr.header) 144 145 /* 146 * Call with ti==0 after become established to 147 * force pre-ESTABLISHED data up to user socket. 148 */ 149 if (ti == 0) 150 goto present; 151 152 m->m_pkthdr.header = ti; 153 154 /* 155 * Find a segment which begins after this one does. 156 */ 157 for (q = tp->t_segq, p = NULL; q; p = q, q = q->m_nextpkt) 158 if (SEQ_GT(GETTCP(q)->ti_seq, ti->ti_seq)) 159 break; 160 161 /* 162 * If there is a preceding segment, it may provide some of 163 * our data already. If so, drop the data from the incoming 164 * segment. If it provides all of our data, drop us. 165 */ 166 if (p != NULL) { 167 register int i; 168 /* conversion to int (in i) handles seq wraparound */ 169 i = GETTCP(p)->ti_seq + GETTCP(p)->ti_len - ti->ti_seq; 170 if (i > 0) { 171 if (i >= ti->ti_len) { 172 tcpstat.tcps_rcvduppack++; 173 tcpstat.tcps_rcvdupbyte += ti->ti_len; 174 m_freem(m); 175 /* 176 * Try to present any queued data 177 * at the left window edge to the user. 178 * This is needed after the 3-WHS 179 * completes. 180 */ 181 goto present; /* ??? */ 182 } 183 m_adj(m, i); 184 ti->ti_len -= i; 185 ti->ti_seq += i; 186 } 187 } 188 tcpstat.tcps_rcvoopack++; 189 tcpstat.tcps_rcvoobyte += ti->ti_len; 190 191 /* 192 * While we overlap succeeding segments trim them or, 193 * if they are completely covered, dequeue them. 194 */ 195 while (q) { 196 register int i = (ti->ti_seq + ti->ti_len) - GETTCP(q)->ti_seq; 197 if (i <= 0) 198 break; 199 if (i < GETTCP(q)->ti_len) { 200 GETTCP(q)->ti_seq += i; 201 GETTCP(q)->ti_len -= i; 202 m_adj(q, i); 203 break; 204 } 205 206 nq = q->m_nextpkt; 207 if (p) 208 p->m_nextpkt = nq; 209 else 210 tp->t_segq = nq; 211 m_freem(q); 212 q = nq; 213 } 214 215 if (p == NULL) { 216 m->m_nextpkt = tp->t_segq; 217 tp->t_segq = m; 218 } else { 219 m->m_nextpkt = p->m_nextpkt; 220 p->m_nextpkt = m; 221 } 222 223present: 224 /* 225 * Present data to user, advancing rcv_nxt through 226 * completed sequence space. 227 */ 228 if (!TCPS_HAVEESTABLISHED(tp->t_state)) 229 return (0); 230 q = tp->t_segq; 231 if (!q || GETTCP(q)->ti_seq != tp->rcv_nxt) 232 return (0); 233 do { 234 tp->rcv_nxt += GETTCP(q)->ti_len; 235 flags = GETTCP(q)->ti_flags & TH_FIN; 236 nq = q->m_nextpkt; 237 tp->t_segq = nq; 238 q->m_nextpkt = NULL; 239 if (so->so_state & SS_CANTRCVMORE) 240 m_freem(q); 241 else 242 sbappend(&so->so_rcv, q); 243 q = nq; 244 } while (q && GETTCP(q)->ti_seq == tp->rcv_nxt); 245 sorwakeup(so); 246 return (flags); 247 248#undef GETTCP 249} 250 251/* 252 * TCP input routine, follows pages 65-76 of the 253 * protocol specification dated September, 1981 very closely. 254 */ 255void 256tcp_input(m, iphlen) 257 register struct mbuf *m; 258 int iphlen; 259{ 260 register struct tcpiphdr *ti; 261 register struct inpcb *inp; 262 u_char *optp = NULL; 263 int optlen = 0; 264 int len, tlen, off; 265 register struct tcpcb *tp = 0; 266 register int tiflags; 267 struct socket *so = 0; 268 int todrop, acked, ourfinisacked, needoutput = 0; 269 struct in_addr laddr; 270 int dropsocket = 0; 271 int iss = 0; 272 u_long tiwin; 273 struct tcpopt to; /* options in this segment */ 274 struct rmxp_tao *taop; /* pointer to our TAO cache entry */ 275 struct rmxp_tao tao_noncached; /* in case there's no cached entry */ 276#ifdef TCPDEBUG 277 short ostate = 0; 278#endif 279 280 bzero((char *)&to, sizeof(to)); 281 282 tcpstat.tcps_rcvtotal++; 283 /* 284 * Get IP and TCP header together in first mbuf. 285 * Note: IP leaves IP header in first mbuf. 286 */ 287 ti = mtod(m, struct tcpiphdr *); 288 if (iphlen > sizeof (struct ip)) 289 ip_stripoptions(m, (struct mbuf *)0); 290 if (m->m_len < sizeof (struct tcpiphdr)) { 291 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { 292 tcpstat.tcps_rcvshort++; 293 return; 294 } 295 ti = mtod(m, struct tcpiphdr *); 296 } 297 298 /* 299 * Checksum extended TCP header and data. 300 */ 301 tlen = ((struct ip *)ti)->ip_len; 302 len = sizeof (struct ip) + tlen; 303 bzero(ti->ti_x1, sizeof(ti->ti_x1)); 304 ti->ti_len = (u_short)tlen; 305 HTONS(ti->ti_len); 306 ti->ti_sum = in_cksum(m, len); 307 if (ti->ti_sum) { 308 tcpstat.tcps_rcvbadsum++; 309 goto drop; 310 } 311 312 /* 313 * Check that TCP offset makes sense, 314 * pull out TCP options and adjust length. XXX 315 */ 316 off = ti->ti_off << 2; 317 if (off < sizeof (struct tcphdr) || off > tlen) { 318 tcpstat.tcps_rcvbadoff++; 319 goto drop; 320 } 321 tlen -= off; 322 ti->ti_len = tlen; 323 if (off > sizeof (struct tcphdr)) { 324 if (m->m_len < sizeof(struct ip) + off) { 325 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { 326 tcpstat.tcps_rcvshort++; 327 return; 328 } 329 ti = mtod(m, struct tcpiphdr *); 330 } 331 optlen = off - sizeof (struct tcphdr); 332 optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); 333 } 334 tiflags = ti->ti_flags; 335 336 /* 337 * Convert TCP protocol specific fields to host format. 338 */ 339 NTOHL(ti->ti_seq); 340 NTOHL(ti->ti_ack); 341 NTOHS(ti->ti_win); 342 NTOHS(ti->ti_urp); 343 344 /* 345 * Drop TCP, IP headers and TCP options. 346 */ 347 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); 348 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); 349 350 /* 351 * Locate pcb for segment. 352 */ 353findpcb: 354#ifdef IPFIREWALL_FORWARD 355 if (ip_fw_fwd_addr != NULL) { 356 /* 357 * Diverted. Pretend to be the destination. 358 * already got one like this? 359 */ 360 inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport, 361 ti->ti_dst, ti->ti_dport, 0); 362 if (!inp) { 363 /* 364 * No, then it's new. Try find the ambushing socket 365 */ 366 if (!ip_fw_fwd_addr->sin_port) { 367 inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, 368 ti->ti_sport, ip_fw_fwd_addr->sin_addr, 369 ti->ti_dport, 1); 370 } else { 371 inp = in_pcblookup_hash(&tcbinfo, 372 ti->ti_src, ti->ti_sport, 373 ip_fw_fwd_addr->sin_addr, 374 ntohs(ip_fw_fwd_addr->sin_port), 1); 375 } 376 } 377 ip_fw_fwd_addr = NULL; 378 } else 379#endif /* IPFIREWALL_FORWARD */ 380 381 inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport, 382 ti->ti_dst, ti->ti_dport, 1); 383 384 /* 385 * If the state is CLOSED (i.e., TCB does not exist) then 386 * all data in the incoming segment is discarded. 387 * If the TCB exists but is in CLOSED state, it is embryonic, 388 * but should either do a listen or a connect soon. 389 */ 390 if (inp == NULL) { 391 if (log_in_vain && tiflags & TH_SYN) { 392 char buf[4*sizeof "123"]; 393 394 strcpy(buf, inet_ntoa(ti->ti_dst)); 395 log(LOG_INFO, 396 "Connection attempt to TCP %s:%d from %s:%d\n", 397 buf, ntohs(ti->ti_dport), inet_ntoa(ti->ti_src), 398 ntohs(ti->ti_sport)); 399 } 400 goto dropwithreset; 401 } 402 tp = intotcpcb(inp); 403 if (tp == 0) 404 goto dropwithreset; 405 if (tp->t_state == TCPS_CLOSED) 406 goto drop; 407 408 /* Unscale the window into a 32-bit value. */ 409 if ((tiflags & TH_SYN) == 0) 410 tiwin = ti->ti_win << tp->snd_scale; 411 else 412 tiwin = ti->ti_win; 413 414 so = inp->inp_socket; 415 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 416#ifdef TCPDEBUG 417 if (so->so_options & SO_DEBUG) { 418 ostate = tp->t_state; 419 tcp_saveti = *ti; 420 } 421#endif 422 if (so->so_options & SO_ACCEPTCONN) { 423 register struct tcpcb *tp0 = tp; 424 struct socket *so2; 425 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 426 /* 427 * Note: dropwithreset makes sure we don't 428 * send a RST in response to a RST. 429 */ 430 if (tiflags & TH_ACK) { 431 tcpstat.tcps_badsyn++; 432 goto dropwithreset; 433 } 434 goto drop; 435 } 436 so2 = sonewconn(so, 0); 437 if (so2 == 0) { 438 tcpstat.tcps_listendrop++; 439 so2 = sodropablereq(so); 440 if (so2) { 441 tcp_drop(sototcpcb(so2), ETIMEDOUT); 442 so2 = sonewconn(so, 0); 443 } 444 if (!so2) 445 goto drop; 446 } 447 so = so2; 448 /* 449 * This is ugly, but .... 450 * 451 * Mark socket as temporary until we're 452 * committed to keeping it. The code at 453 * ``drop'' and ``dropwithreset'' check the 454 * flag dropsocket to see if the temporary 455 * socket created here should be discarded. 456 * We mark the socket as discardable until 457 * we're committed to it below in TCPS_LISTEN. 458 */ 459 dropsocket++; 460 inp = (struct inpcb *)so->so_pcb; 461 inp->inp_laddr = ti->ti_dst; 462 inp->inp_lport = ti->ti_dport; 463 if (in_pcbinshash(inp) != 0) { 464 /* 465 * Undo the assignments above if we failed to put 466 * the PCB on the hash lists. 467 */ 468 inp->inp_laddr.s_addr = INADDR_ANY; 469 inp->inp_lport = 0; 470 goto drop; 471 } 472 inp->inp_options = ip_srcroute(); 473 tp = intotcpcb(inp); 474 tp->t_state = TCPS_LISTEN; 475 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT); 476 477 /* Compute proper scaling value from buffer space */ 478 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 479 TCP_MAXWIN << tp->request_r_scale < so->so_rcv.sb_hiwat) 480 tp->request_r_scale++; 481 } 482 } 483 484 /* 485 * Segment received on connection. 486 * Reset idle time and keep-alive timer. 487 */ 488 tp->t_idle = 0; 489 if (TCPS_HAVEESTABLISHED(tp->t_state)) 490 tp->t_timer[TCPT_KEEP] = tcp_keepidle; 491 492 /* 493 * Process options if not in LISTEN state, 494 * else do it below (after getting remote address). 495 */ 496 if (tp->t_state != TCPS_LISTEN) 497 tcp_dooptions(tp, optp, optlen, ti, &to); 498 499 /* 500 * Header prediction: check for the two common cases 501 * of a uni-directional data xfer. If the packet has 502 * no control flags, is in-sequence, the window didn't 503 * change and we're not retransmitting, it's a 504 * candidate. If the length is zero and the ack moved 505 * forward, we're the sender side of the xfer. Just 506 * free the data acked & wake any higher level process 507 * that was blocked waiting for space. If the length 508 * is non-zero and the ack didn't move, we're the 509 * receiver side. If we're getting packets in-order 510 * (the reassembly queue is empty), add the data to 511 * the socket buffer and note that we need a delayed ack. 512 * Make sure that the hidden state-flags are also off. 513 * Since we check for TCPS_ESTABLISHED above, it can only 514 * be TH_NEEDSYN. 515 */ 516 if (tp->t_state == TCPS_ESTABLISHED && 517 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 518 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 519 ((to.to_flag & TOF_TS) == 0 || 520 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && 521 /* 522 * Using the CC option is compulsory if once started: 523 * the segment is OK if no T/TCP was negotiated or 524 * if the segment has a CC option equal to CCrecv 525 */ 526 ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || 527 (to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv) && 528 ti->ti_seq == tp->rcv_nxt && 529 tiwin && tiwin == tp->snd_wnd && 530 tp->snd_nxt == tp->snd_max) { 531 532 /* 533 * If last ACK falls within this segment's sequence numbers, 534 * record the timestamp. 535 * NOTE that the test is modified according to the latest 536 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 537 */ 538 if ((to.to_flag & TOF_TS) != 0 && 539 SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { 540 tp->ts_recent_age = tcp_now; 541 tp->ts_recent = to.to_tsval; 542 } 543 544 if (ti->ti_len == 0) { 545 if (SEQ_GT(ti->ti_ack, tp->snd_una) && 546 SEQ_LEQ(ti->ti_ack, tp->snd_max) && 547 tp->snd_cwnd >= tp->snd_wnd && 548 tp->t_dupacks < tcprexmtthresh) { 549 /* 550 * this is a pure ack for outstanding data. 551 */ 552 ++tcpstat.tcps_predack; 553 if ((to.to_flag & TOF_TS) != 0) 554 tcp_xmit_timer(tp, 555 tcp_now - to.to_tsecr + 1); 556 else if (tp->t_rtt && 557 SEQ_GT(ti->ti_ack, tp->t_rtseq)) 558 tcp_xmit_timer(tp, tp->t_rtt); 559 acked = ti->ti_ack - tp->snd_una; 560 tcpstat.tcps_rcvackpack++; 561 tcpstat.tcps_rcvackbyte += acked; 562 sbdrop(&so->so_snd, acked); 563 tp->snd_una = ti->ti_ack; 564 m_freem(m); 565 566 /* 567 * If all outstanding data are acked, stop 568 * retransmit timer, otherwise restart timer 569 * using current (possibly backed-off) value. 570 * If process is waiting for space, 571 * wakeup/selwakeup/signal. If data 572 * are ready to send, let tcp_output 573 * decide between more output or persist. 574 */ 575 if (tp->snd_una == tp->snd_max) 576 tp->t_timer[TCPT_REXMT] = 0; 577 else if (tp->t_timer[TCPT_PERSIST] == 0) 578 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 579 580 sowwakeup(so); 581 if (so->so_snd.sb_cc) 582 (void) tcp_output(tp); 583 return; 584 } 585 } else if (ti->ti_ack == tp->snd_una && 586 tp->t_segq == NULL && 587 ti->ti_len <= sbspace(&so->so_rcv)) { 588 /* 589 * this is a pure, in-sequence data packet 590 * with nothing on the reassembly queue and 591 * we have enough buffer space to take it. 592 */ 593 ++tcpstat.tcps_preddat; 594 tp->rcv_nxt += ti->ti_len; 595 tcpstat.tcps_rcvpack++; 596 tcpstat.tcps_rcvbyte += ti->ti_len; 597 /* 598 * Add data to socket buffer. 599 */ 600 sbappend(&so->so_rcv, m); 601 sorwakeup(so); 602 if (tcp_delack_enabled) { 603 tp->t_flags |= TF_DELACK; 604 } else { 605 tp->t_flags |= TF_ACKNOW; 606 tcp_output(tp); 607 } 608 return; 609 } 610 } 611 612 /* 613 * Calculate amount of space in receive window, 614 * and then do TCP input processing. 615 * Receive window is amount of space in rcv queue, 616 * but not less than advertised window. 617 */ 618 { int win; 619 620 win = sbspace(&so->so_rcv); 621 if (win < 0) 622 win = 0; 623 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 624 } 625 626 switch (tp->t_state) { 627 628 /* 629 * If the state is LISTEN then ignore segment if it contains an RST. 630 * If the segment contains an ACK then it is bad and send a RST. 631 * If it does not contain a SYN then it is not interesting; drop it. 632 * If it is from this socket, drop it, it must be forged. 633 * Don't bother responding if the destination was a broadcast. 634 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial 635 * tp->iss, and send a segment: 636 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 637 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. 638 * Fill in remote peer address fields if not previously specified. 639 * Enter SYN_RECEIVED state, and process any other fields of this 640 * segment in this state. 641 */ 642 case TCPS_LISTEN: { 643 register struct sockaddr_in *sin; 644 645 if (tiflags & TH_RST) 646 goto drop; 647 if (tiflags & TH_ACK) 648 goto dropwithreset; 649 if ((tiflags & TH_SYN) == 0) 650 goto drop; 651 if ((ti->ti_dport == ti->ti_sport) && 652 (ti->ti_dst.s_addr == ti->ti_src.s_addr)) 653 goto drop; 654 /* 655 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 656 * in_broadcast() should never return true on a received 657 * packet with M_BCAST not set. 658 */ 659 if (m->m_flags & (M_BCAST|M_MCAST) || 660 IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) 661 goto drop; 662 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, 663 M_NOWAIT); 664 if (sin == NULL) 665 goto drop; 666 sin->sin_family = AF_INET; 667 sin->sin_len = sizeof(*sin); 668 sin->sin_addr = ti->ti_src; 669 sin->sin_port = ti->ti_sport; 670 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); 671 laddr = inp->inp_laddr; 672 if (inp->inp_laddr.s_addr == INADDR_ANY) 673 inp->inp_laddr = ti->ti_dst; 674 if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) { 675 inp->inp_laddr = laddr; 676 FREE(sin, M_SONAME); 677 goto drop; 678 } 679 FREE(sin, M_SONAME); 680 tp->t_template = tcp_template(tp); 681 if (tp->t_template == 0) { 682 tp = tcp_drop(tp, ENOBUFS); 683 dropsocket = 0; /* socket is already gone */ 684 goto drop; 685 } 686 if ((taop = tcp_gettaocache(inp)) == NULL) { 687 taop = &tao_noncached; 688 bzero(taop, sizeof(*taop)); 689 } 690 tcp_dooptions(tp, optp, optlen, ti, &to); 691 if (iss) 692 tp->iss = iss; 693 else 694 tp->iss = tcp_iss; 695 tcp_iss += TCP_ISSINCR/4; 696 tp->irs = ti->ti_seq; 697 tcp_sendseqinit(tp); 698 tcp_rcvseqinit(tp); 699 /* 700 * Initialization of the tcpcb for transaction; 701 * set SND.WND = SEG.WND, 702 * initialize CCsend and CCrecv. 703 */ 704 tp->snd_wnd = tiwin; /* initial send-window */ 705 tp->cc_send = CC_INC(tcp_ccgen); 706 tp->cc_recv = to.to_cc; 707 /* 708 * Perform TAO test on incoming CC (SEG.CC) option, if any. 709 * - compare SEG.CC against cached CC from the same host, 710 * if any. 711 * - if SEG.CC > chached value, SYN must be new and is accepted 712 * immediately: save new CC in the cache, mark the socket 713 * connected, enter ESTABLISHED state, turn on flag to 714 * send a SYN in the next segment. 715 * A virtual advertised window is set in rcv_adv to 716 * initialize SWS prevention. Then enter normal segment 717 * processing: drop SYN, process data and FIN. 718 * - otherwise do a normal 3-way handshake. 719 */ 720 if ((to.to_flag & TOF_CC) != 0) { 721 if (((tp->t_flags & TF_NOPUSH) != 0) && 722 taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { 723 724 taop->tao_cc = to.to_cc; 725 tp->t_state = TCPS_ESTABLISHED; 726 727 /* 728 * If there is a FIN, or if there is data and the 729 * connection is local, then delay SYN,ACK(SYN) in 730 * the hope of piggy-backing it on a response 731 * segment. Otherwise must send ACK now in case 732 * the other side is slow starting. 733 */ 734 if (tcp_delack_enabled && ((tiflags & TH_FIN) || (ti->ti_len != 0 && 735 in_localaddr(inp->inp_faddr)))) 736 tp->t_flags |= (TF_DELACK | TF_NEEDSYN); 737 else 738 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 739 740 /* 741 * Limit the `virtual advertised window' to TCP_MAXWIN 742 * here. Even if we requested window scaling, it will 743 * become effective only later when our SYN is acked. 744 */ 745 tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN); 746 tcpstat.tcps_connects++; 747 soisconnected(so); 748 tp->t_timer[TCPT_KEEP] = tcp_keepinit; 749 dropsocket = 0; /* committed to socket */ 750 tcpstat.tcps_accepts++; 751 goto trimthenstep6; 752 } 753 /* else do standard 3-way handshake */ 754 } else { 755 /* 756 * No CC option, but maybe CC.NEW: 757 * invalidate cached value. 758 */ 759 taop->tao_cc = 0; 760 } 761 /* 762 * TAO test failed or there was no CC option, 763 * do a standard 3-way handshake. 764 */ 765 tp->t_flags |= TF_ACKNOW; 766 tp->t_state = TCPS_SYN_RECEIVED; 767 tp->t_timer[TCPT_KEEP] = tcp_keepinit; 768 dropsocket = 0; /* committed to socket */ 769 tcpstat.tcps_accepts++; 770 goto trimthenstep6; 771 } 772 773 /* 774 * If the state is SYN_RECEIVED: 775 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 776 */ 777 case TCPS_SYN_RECEIVED: 778 if ((tiflags & TH_ACK) && 779 (SEQ_LEQ(ti->ti_ack, tp->snd_una) || 780 SEQ_GT(ti->ti_ack, tp->snd_max))) 781 goto dropwithreset; 782 break; 783 784 /* 785 * If the state is SYN_SENT: 786 * if seg contains an ACK, but not for our SYN, drop the input. 787 * if seg contains a RST, then drop the connection. 788 * if seg does not contain SYN, then drop it. 789 * Otherwise this is an acceptable SYN segment 790 * initialize tp->rcv_nxt and tp->irs 791 * if seg contains ack then advance tp->snd_una 792 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 793 * arrange for segment to be acked (eventually) 794 * continue processing rest of data/controls, beginning with URG 795 */ 796 case TCPS_SYN_SENT: 797 if ((taop = tcp_gettaocache(inp)) == NULL) { 798 taop = &tao_noncached; 799 bzero(taop, sizeof(*taop)); 800 } 801 802 if ((tiflags & TH_ACK) && 803 (SEQ_LEQ(ti->ti_ack, tp->iss) || 804 SEQ_GT(ti->ti_ack, tp->snd_max))) { 805 /* 806 * If we have a cached CCsent for the remote host, 807 * hence we haven't just crashed and restarted, 808 * do not send a RST. This may be a retransmission 809 * from the other side after our earlier ACK was lost. 810 * Our new SYN, when it arrives, will serve as the 811 * needed ACK. 812 */ 813 if (taop->tao_ccsent != 0) 814 goto drop; 815 else 816 goto dropwithreset; 817 } 818 if (tiflags & TH_RST) { 819 if (tiflags & TH_ACK) 820 tp = tcp_drop(tp, ECONNREFUSED); 821 goto drop; 822 } 823 if ((tiflags & TH_SYN) == 0) 824 goto drop; 825 tp->snd_wnd = ti->ti_win; /* initial send window */ 826 tp->cc_recv = to.to_cc; /* foreign CC */ 827 828 tp->irs = ti->ti_seq; 829 tcp_rcvseqinit(tp); 830 if (tiflags & TH_ACK) { 831 /* 832 * Our SYN was acked. If segment contains CC.ECHO 833 * option, check it to make sure this segment really 834 * matches our SYN. If not, just drop it as old 835 * duplicate, but send an RST if we're still playing 836 * by the old rules. If no CC.ECHO option, make sure 837 * we don't get fooled into using T/TCP. 838 */ 839 if (to.to_flag & TOF_CCECHO) { 840 if (tp->cc_send != to.to_ccecho) 841 if (taop->tao_ccsent != 0) 842 goto drop; 843 else 844 goto dropwithreset; 845 } else 846 tp->t_flags &= ~TF_RCVD_CC; 847 tcpstat.tcps_connects++; 848 soisconnected(so); 849 /* Do window scaling on this connection? */ 850 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 851 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 852 tp->snd_scale = tp->requested_s_scale; 853 tp->rcv_scale = tp->request_r_scale; 854 } 855 /* Segment is acceptable, update cache if undefined. */ 856 if (taop->tao_ccsent == 0) 857 taop->tao_ccsent = to.to_ccecho; 858 859 tp->rcv_adv += tp->rcv_wnd; 860 tp->snd_una++; /* SYN is acked */ 861 /* 862 * If there's data, delay ACK; if there's also a FIN 863 * ACKNOW will be turned on later. 864 */ 865 if (tcp_delack_enabled && ti->ti_len != 0) 866 tp->t_flags |= TF_DELACK; 867 else 868 tp->t_flags |= TF_ACKNOW; 869 /* 870 * Received <SYN,ACK> in SYN_SENT[*] state. 871 * Transitions: 872 * SYN_SENT --> ESTABLISHED 873 * SYN_SENT* --> FIN_WAIT_1 874 */ 875 if (tp->t_flags & TF_NEEDFIN) { 876 tp->t_state = TCPS_FIN_WAIT_1; 877 tp->t_flags &= ~TF_NEEDFIN; 878 tiflags &= ~TH_SYN; 879 } else { 880 tp->t_state = TCPS_ESTABLISHED; 881 tp->t_timer[TCPT_KEEP] = tcp_keepidle; 882 } 883 } else { 884 /* 885 * Received initial SYN in SYN-SENT[*] state => simul- 886 * taneous open. If segment contains CC option and there is 887 * a cached CC, apply TAO test; if it succeeds, connection is 888 * half-synchronized. Otherwise, do 3-way handshake: 889 * SYN-SENT -> SYN-RECEIVED 890 * SYN-SENT* -> SYN-RECEIVED* 891 * If there was no CC option, clear cached CC value. 892 */ 893 tp->t_flags |= TF_ACKNOW; 894 tp->t_timer[TCPT_REXMT] = 0; 895 if (to.to_flag & TOF_CC) { 896 if (taop->tao_cc != 0 && 897 CC_GT(to.to_cc, taop->tao_cc)) { 898 /* 899 * update cache and make transition: 900 * SYN-SENT -> ESTABLISHED* 901 * SYN-SENT* -> FIN-WAIT-1* 902 */ 903 taop->tao_cc = to.to_cc; 904 if (tp->t_flags & TF_NEEDFIN) { 905 tp->t_state = TCPS_FIN_WAIT_1; 906 tp->t_flags &= ~TF_NEEDFIN; 907 } else { 908 tp->t_state = TCPS_ESTABLISHED; 909 tp->t_timer[TCPT_KEEP] = tcp_keepidle; 910 } 911 tp->t_flags |= TF_NEEDSYN; 912 } else 913 tp->t_state = TCPS_SYN_RECEIVED; 914 } else { 915 /* CC.NEW or no option => invalidate cache */ 916 taop->tao_cc = 0; 917 tp->t_state = TCPS_SYN_RECEIVED; 918 } 919 } 920 921trimthenstep6: 922 /* 923 * Advance ti->ti_seq to correspond to first data byte. 924 * If data, trim to stay within window, 925 * dropping FIN if necessary. 926 */ 927 ti->ti_seq++; 928 if (ti->ti_len > tp->rcv_wnd) { 929 todrop = ti->ti_len - tp->rcv_wnd; 930 m_adj(m, -todrop); 931 ti->ti_len = tp->rcv_wnd; 932 tiflags &= ~TH_FIN; 933 tcpstat.tcps_rcvpackafterwin++; 934 tcpstat.tcps_rcvbyteafterwin += todrop; 935 } 936 tp->snd_wl1 = ti->ti_seq - 1; 937 tp->rcv_up = ti->ti_seq; 938 /* 939 * Client side of transaction: already sent SYN and data. 940 * If the remote host used T/TCP to validate the SYN, 941 * our data will be ACK'd; if so, enter normal data segment 942 * processing in the middle of step 5, ack processing. 943 * Otherwise, goto step 6. 944 */ 945 if (tiflags & TH_ACK) 946 goto process_ACK; 947 goto step6; 948 /* 949 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 950 * if segment contains a SYN and CC [not CC.NEW] option: 951 * if state == TIME_WAIT and connection duration > MSL, 952 * drop packet and send RST; 953 * 954 * if SEG.CC > CCrecv then is new SYN, and can implicitly 955 * ack the FIN (and data) in retransmission queue. 956 * Complete close and delete TCPCB. Then reprocess 957 * segment, hoping to find new TCPCB in LISTEN state; 958 * 959 * else must be old SYN; drop it. 960 * else do normal processing. 961 */ 962 case TCPS_LAST_ACK: 963 case TCPS_CLOSING: 964 case TCPS_TIME_WAIT: 965 if ((tiflags & TH_SYN) && 966 (to.to_flag & TOF_CC) && tp->cc_recv != 0) { 967 if (tp->t_state == TCPS_TIME_WAIT && 968 tp->t_duration > TCPTV_MSL) 969 goto dropwithreset; 970 if (CC_GT(to.to_cc, tp->cc_recv)) { 971 tp = tcp_close(tp); 972 goto findpcb; 973 } 974 else 975 goto drop; 976 } 977 break; /* continue normal processing */ 978 } 979 980 /* 981 * States other than LISTEN or SYN_SENT. 982 * First check timestamp, if present. 983 * Then check the connection count, if present. 984 * Then check that at least some bytes of segment are within 985 * receive window. If segment begins before rcv_nxt, 986 * drop leading data (and SYN); if nothing left, just ack. 987 * 988 * RFC 1323 PAWS: If we have a timestamp reply on this segment 989 * and it's less than ts_recent, drop it. 990 */ 991 if ((to.to_flag & TOF_TS) != 0 && (tiflags & TH_RST) == 0 && 992 tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { 993 994 /* Check to see if ts_recent is over 24 days old. */ 995 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 996 /* 997 * Invalidate ts_recent. If this segment updates 998 * ts_recent, the age will be reset later and ts_recent 999 * will get a valid value. If it does not, setting 1000 * ts_recent to zero will at least satisfy the 1001 * requirement that zero be placed in the timestamp 1002 * echo reply when ts_recent isn't valid. The 1003 * age isn't reset until we get a valid ts_recent 1004 * because we don't want out-of-order segments to be 1005 * dropped when ts_recent is old. 1006 */ 1007 tp->ts_recent = 0; 1008 } else { 1009 tcpstat.tcps_rcvduppack++; 1010 tcpstat.tcps_rcvdupbyte += ti->ti_len; 1011 tcpstat.tcps_pawsdrop++; 1012 goto dropafterack; 1013 } 1014 } 1015 1016 /* 1017 * T/TCP mechanism 1018 * If T/TCP was negotiated and the segment doesn't have CC, 1019 * or if its CC is wrong then drop the segment. 1020 * RST segments do not have to comply with this. 1021 */ 1022 if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && 1023 ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc) && 1024 (tiflags & TH_RST) == 0) 1025 goto dropafterack; 1026 1027 todrop = tp->rcv_nxt - ti->ti_seq; 1028 if (todrop > 0) { 1029 if (tiflags & TH_SYN) { 1030 tiflags &= ~TH_SYN; 1031 ti->ti_seq++; 1032 if (ti->ti_urp > 1) 1033 ti->ti_urp--; 1034 else 1035 tiflags &= ~TH_URG; 1036 todrop--; 1037 } 1038 /* 1039 * Following if statement from Stevens, vol. 2, p. 960. 1040 */ 1041 if (todrop > ti->ti_len 1042 || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) { 1043 /* 1044 * Any valid FIN must be to the left of the window. 1045 * At this point the FIN must be a duplicate or out 1046 * of sequence; drop it. 1047 */ 1048 tiflags &= ~TH_FIN; 1049 1050 /* 1051 * Send an ACK to resynchronize and drop any data. 1052 * But keep on processing for RST or ACK. 1053 */ 1054 tp->t_flags |= TF_ACKNOW; 1055 todrop = ti->ti_len; 1056 tcpstat.tcps_rcvduppack++; 1057 tcpstat.tcps_rcvdupbyte += todrop; 1058 } else { 1059 tcpstat.tcps_rcvpartduppack++; 1060 tcpstat.tcps_rcvpartdupbyte += todrop; 1061 } 1062 m_adj(m, todrop); 1063 ti->ti_seq += todrop; 1064 ti->ti_len -= todrop; 1065 if (ti->ti_urp > todrop) 1066 ti->ti_urp -= todrop; 1067 else { 1068 tiflags &= ~TH_URG; 1069 ti->ti_urp = 0; 1070 } 1071 } 1072 1073 /* 1074 * If new data are received on a connection after the 1075 * user processes are gone, then RST the other end. 1076 */ 1077 if ((so->so_state & SS_NOFDREF) && 1078 tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { 1079 tp = tcp_close(tp); 1080 tcpstat.tcps_rcvafterclose++; 1081 goto dropwithreset; 1082 } 1083 1084 /* 1085 * If segment ends after window, drop trailing data 1086 * (and PUSH and FIN); if nothing left, just ACK. 1087 */ 1088 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); 1089 if (todrop > 0) { 1090 tcpstat.tcps_rcvpackafterwin++; 1091 if (todrop >= ti->ti_len) { 1092 tcpstat.tcps_rcvbyteafterwin += ti->ti_len; 1093 /* 1094 * If a new connection request is received 1095 * while in TIME_WAIT, drop the old connection 1096 * and start over if the sequence numbers 1097 * are above the previous ones. 1098 */ 1099 if (tiflags & TH_SYN && 1100 tp->t_state == TCPS_TIME_WAIT && 1101 SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { 1102 iss = tp->rcv_nxt + TCP_ISSINCR; 1103 tp = tcp_close(tp); 1104 goto findpcb; 1105 } 1106 /* 1107 * If window is closed can only take segments at 1108 * window edge, and have to drop data and PUSH from 1109 * incoming segments. Continue processing, but 1110 * remember to ack. Otherwise, drop segment 1111 * and ack. 1112 */ 1113 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { 1114 tp->t_flags |= TF_ACKNOW; 1115 tcpstat.tcps_rcvwinprobe++; 1116 } else 1117 goto dropafterack; 1118 } else 1119 tcpstat.tcps_rcvbyteafterwin += todrop; 1120 m_adj(m, -todrop); 1121 ti->ti_len -= todrop; 1122 tiflags &= ~(TH_PUSH|TH_FIN); 1123 } 1124 1125 /* 1126 * If last ACK falls within this segment's sequence numbers, 1127 * record its timestamp. 1128 * NOTE that the test is modified according to the latest 1129 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1130 */ 1131 if ((to.to_flag & TOF_TS) != 0 && 1132 SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { 1133 tp->ts_recent_age = tcp_now; 1134 tp->ts_recent = to.to_tsval; 1135 } 1136 1137 /* 1138 * If the RST bit is set examine the state: 1139 * SYN_RECEIVED STATE: 1140 * If passive open, return to LISTEN state. 1141 * If active open, inform user that connection was refused. 1142 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1143 * Inform user that connection was reset, and close tcb. 1144 * CLOSING, LAST_ACK, TIME_WAIT STATES 1145 * Close the tcb. 1146 */ 1147 if (tiflags&TH_RST) switch (tp->t_state) { 1148 1149 case TCPS_SYN_RECEIVED: 1150 so->so_error = ECONNREFUSED; 1151 goto close; 1152 1153 case TCPS_ESTABLISHED: 1154 case TCPS_FIN_WAIT_1: 1155 case TCPS_FIN_WAIT_2: 1156 case TCPS_CLOSE_WAIT: 1157 so->so_error = ECONNRESET; 1158 close: 1159 tp->t_state = TCPS_CLOSED; 1160 tcpstat.tcps_drops++; 1161 tp = tcp_close(tp); 1162 goto drop; 1163 1164 case TCPS_CLOSING: 1165 case TCPS_LAST_ACK: 1166 case TCPS_TIME_WAIT: 1167 tp = tcp_close(tp); 1168 goto drop; 1169 } 1170 1171 /* 1172 * If a SYN is in the window, then this is an 1173 * error and we send an RST and drop the connection. 1174 */ 1175 if (tiflags & TH_SYN) { 1176 tp = tcp_drop(tp, ECONNRESET); 1177 goto dropwithreset; 1178 } 1179 1180 /* 1181 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 1182 * flag is on (half-synchronized state), then queue data for 1183 * later processing; else drop segment and return. 1184 */ 1185 if ((tiflags & TH_ACK) == 0) { 1186 if (tp->t_state == TCPS_SYN_RECEIVED || 1187 (tp->t_flags & TF_NEEDSYN)) 1188 goto step6; 1189 else 1190 goto drop; 1191 } 1192 1193 /* 1194 * Ack processing. 1195 */ 1196 switch (tp->t_state) { 1197 1198 /* 1199 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1200 * ESTABLISHED state and continue processing. 1201 * The ACK was checked above. 1202 */ 1203 case TCPS_SYN_RECEIVED: 1204 1205 tcpstat.tcps_connects++; 1206 soisconnected(so); 1207 /* Do window scaling? */ 1208 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1209 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1210 tp->snd_scale = tp->requested_s_scale; 1211 tp->rcv_scale = tp->request_r_scale; 1212 } 1213 /* 1214 * Upon successful completion of 3-way handshake, 1215 * update cache.CC if it was undefined, pass any queued 1216 * data to the user, and advance state appropriately. 1217 */ 1218 if ((taop = tcp_gettaocache(inp)) != NULL && 1219 taop->tao_cc == 0) 1220 taop->tao_cc = tp->cc_recv; 1221 1222 /* 1223 * Make transitions: 1224 * SYN-RECEIVED -> ESTABLISHED 1225 * SYN-RECEIVED* -> FIN-WAIT-1 1226 */ 1227 if (tp->t_flags & TF_NEEDFIN) { 1228 tp->t_state = TCPS_FIN_WAIT_1; 1229 tp->t_flags &= ~TF_NEEDFIN; 1230 } else { 1231 tp->t_state = TCPS_ESTABLISHED; 1232 tp->t_timer[TCPT_KEEP] = tcp_keepidle; 1233 } 1234 /* 1235 * If segment contains data or ACK, will call tcp_reass() 1236 * later; if not, do so now to pass queued data to user. 1237 */ 1238 if (ti->ti_len == 0 && (tiflags & TH_FIN) == 0) 1239 (void) tcp_reass(tp, (struct tcpiphdr *)0, 1240 (struct mbuf *)0); 1241 tp->snd_wl1 = ti->ti_seq - 1; 1242 /* fall into ... */ 1243 1244 /* 1245 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1246 * ACKs. If the ack is in the range 1247 * tp->snd_una < ti->ti_ack <= tp->snd_max 1248 * then advance tp->snd_una to ti->ti_ack and drop 1249 * data from the retransmission queue. If this ACK reflects 1250 * more up to date window information we update our window information. 1251 */ 1252 case TCPS_ESTABLISHED: 1253 case TCPS_FIN_WAIT_1: 1254 case TCPS_FIN_WAIT_2: 1255 case TCPS_CLOSE_WAIT: 1256 case TCPS_CLOSING: 1257 case TCPS_LAST_ACK: 1258 case TCPS_TIME_WAIT: 1259 1260 if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { 1261 if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { 1262 tcpstat.tcps_rcvdupack++; 1263 /* 1264 * If we have outstanding data (other than 1265 * a window probe), this is a completely 1266 * duplicate ack (ie, window info didn't 1267 * change), the ack is the biggest we've 1268 * seen and we've seen exactly our rexmt 1269 * threshhold of them, assume a packet 1270 * has been dropped and retransmit it. 1271 * Kludge snd_nxt & the congestion 1272 * window so we send only this one 1273 * packet. 1274 * 1275 * We know we're losing at the current 1276 * window size so do congestion avoidance 1277 * (set ssthresh to half the current window 1278 * and pull our congestion window back to 1279 * the new ssthresh). 1280 * 1281 * Dup acks mean that packets have left the 1282 * network (they're now cached at the receiver) 1283 * so bump cwnd by the amount in the receiver 1284 * to keep a constant cwnd packets in the 1285 * network. 1286 */ 1287 if (tp->t_timer[TCPT_REXMT] == 0 || 1288 ti->ti_ack != tp->snd_una) 1289 tp->t_dupacks = 0; 1290 else if (++tp->t_dupacks == tcprexmtthresh) { 1291 tcp_seq onxt = tp->snd_nxt; 1292 u_int win = 1293 min(tp->snd_wnd, tp->snd_cwnd) / 2 / 1294 tp->t_maxseg; 1295 1296 if (win < 2) 1297 win = 2; 1298 tp->snd_ssthresh = win * tp->t_maxseg; 1299 tp->t_timer[TCPT_REXMT] = 0; 1300 tp->t_rtt = 0; 1301 tp->snd_nxt = ti->ti_ack; 1302 tp->snd_cwnd = tp->t_maxseg; 1303 (void) tcp_output(tp); 1304 tp->snd_cwnd = tp->snd_ssthresh + 1305 tp->t_maxseg * tp->t_dupacks; 1306 if (SEQ_GT(onxt, tp->snd_nxt)) 1307 tp->snd_nxt = onxt; 1308 goto drop; 1309 } else if (tp->t_dupacks > tcprexmtthresh) { 1310 tp->snd_cwnd += tp->t_maxseg; 1311 (void) tcp_output(tp); 1312 goto drop; 1313 } 1314 } else 1315 tp->t_dupacks = 0; 1316 break; 1317 } 1318 /* 1319 * If the congestion window was inflated to account 1320 * for the other side's cached packets, retract it. 1321 */ 1322 if (tp->t_dupacks >= tcprexmtthresh && 1323 tp->snd_cwnd > tp->snd_ssthresh) 1324 tp->snd_cwnd = tp->snd_ssthresh; 1325 tp->t_dupacks = 0; 1326 if (SEQ_GT(ti->ti_ack, tp->snd_max)) { 1327 tcpstat.tcps_rcvacktoomuch++; 1328 goto dropafterack; 1329 } 1330 /* 1331 * If we reach this point, ACK is not a duplicate, 1332 * i.e., it ACKs something we sent. 1333 */ 1334 if (tp->t_flags & TF_NEEDSYN) { 1335 /* 1336 * T/TCP: Connection was half-synchronized, and our 1337 * SYN has been ACK'd (so connection is now fully 1338 * synchronized). Go to non-starred state, 1339 * increment snd_una for ACK of SYN, and check if 1340 * we can do window scaling. 1341 */ 1342 tp->t_flags &= ~TF_NEEDSYN; 1343 tp->snd_una++; 1344 /* Do window scaling? */ 1345 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1346 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1347 tp->snd_scale = tp->requested_s_scale; 1348 tp->rcv_scale = tp->request_r_scale; 1349 } 1350 } 1351 1352process_ACK: 1353 acked = ti->ti_ack - tp->snd_una; 1354 tcpstat.tcps_rcvackpack++; 1355 tcpstat.tcps_rcvackbyte += acked; 1356 1357 /* 1358 * If we have a timestamp reply, update smoothed 1359 * round trip time. If no timestamp is present but 1360 * transmit timer is running and timed sequence 1361 * number was acked, update smoothed round trip time. 1362 * Since we now have an rtt measurement, cancel the 1363 * timer backoff (cf., Phil Karn's retransmit alg.). 1364 * Recompute the initial retransmit timer. 1365 */ 1366 if (to.to_flag & TOF_TS) 1367 tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1); 1368 else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) 1369 tcp_xmit_timer(tp,tp->t_rtt); 1370 1371 /* 1372 * If all outstanding data is acked, stop retransmit 1373 * timer and remember to restart (more output or persist). 1374 * If there is more data to be acked, restart retransmit 1375 * timer, using current (possibly backed-off) value. 1376 */ 1377 if (ti->ti_ack == tp->snd_max) { 1378 tp->t_timer[TCPT_REXMT] = 0; 1379 needoutput = 1; 1380 } else if (tp->t_timer[TCPT_PERSIST] == 0) 1381 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 1382 1383 /* 1384 * If no data (only SYN) was ACK'd, 1385 * skip rest of ACK processing. 1386 */ 1387 if (acked == 0) 1388 goto step6; 1389 1390 /* 1391 * When new data is acked, open the congestion window. 1392 * If the window gives us less than ssthresh packets 1393 * in flight, open exponentially (maxseg per packet). 1394 * Otherwise open linearly: maxseg per window 1395 * (maxseg^2 / cwnd per packet). 1396 */ 1397 { 1398 register u_int cw = tp->snd_cwnd; 1399 register u_int incr = tp->t_maxseg; 1400 1401 if (cw > tp->snd_ssthresh) 1402 incr = incr * incr / cw; 1403 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); 1404 } 1405 if (acked > so->so_snd.sb_cc) { 1406 tp->snd_wnd -= so->so_snd.sb_cc; 1407 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1408 ourfinisacked = 1; 1409 } else { 1410 sbdrop(&so->so_snd, acked); 1411 tp->snd_wnd -= acked; 1412 ourfinisacked = 0; 1413 } 1414 sowwakeup(so); 1415 tp->snd_una = ti->ti_ack; 1416 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1417 tp->snd_nxt = tp->snd_una; 1418 1419 switch (tp->t_state) { 1420 1421 /* 1422 * In FIN_WAIT_1 STATE in addition to the processing 1423 * for the ESTABLISHED state if our FIN is now acknowledged 1424 * then enter FIN_WAIT_2. 1425 */ 1426 case TCPS_FIN_WAIT_1: 1427 if (ourfinisacked) { 1428 /* 1429 * If we can't receive any more 1430 * data, then closing user can proceed. 1431 * Starting the timer is contrary to the 1432 * specification, but if we don't get a FIN 1433 * we'll hang forever. 1434 */ 1435 if (so->so_state & SS_CANTRCVMORE) { 1436 soisdisconnected(so); 1437 tp->t_timer[TCPT_2MSL] = tcp_maxidle; 1438 } 1439 tp->t_state = TCPS_FIN_WAIT_2; 1440 } 1441 break; 1442 1443 /* 1444 * In CLOSING STATE in addition to the processing for 1445 * the ESTABLISHED state if the ACK acknowledges our FIN 1446 * then enter the TIME-WAIT state, otherwise ignore 1447 * the segment. 1448 */ 1449 case TCPS_CLOSING: 1450 if (ourfinisacked) { 1451 tp->t_state = TCPS_TIME_WAIT; 1452 tcp_canceltimers(tp); 1453 /* Shorten TIME_WAIT [RFC-1644, p.28] */ 1454 if (tp->cc_recv != 0 && 1455 tp->t_duration < TCPTV_MSL) 1456 tp->t_timer[TCPT_2MSL] = 1457 tp->t_rxtcur * TCPTV_TWTRUNC; 1458 else 1459 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1460 soisdisconnected(so); 1461 } 1462 break; 1463 1464 /* 1465 * In LAST_ACK, we may still be waiting for data to drain 1466 * and/or to be acked, as well as for the ack of our FIN. 1467 * If our FIN is now acknowledged, delete the TCB, 1468 * enter the closed state and return. 1469 */ 1470 case TCPS_LAST_ACK: 1471 if (ourfinisacked) { 1472 tp = tcp_close(tp); 1473 goto drop; 1474 } 1475 break; 1476 1477 /* 1478 * In TIME_WAIT state the only thing that should arrive 1479 * is a retransmission of the remote FIN. Acknowledge 1480 * it and restart the finack timer. 1481 */ 1482 case TCPS_TIME_WAIT: 1483 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1484 goto dropafterack; 1485 } 1486 } 1487 1488step6: 1489 /* 1490 * Update window information. 1491 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1492 */ 1493 if ((tiflags & TH_ACK) && 1494 (SEQ_LT(tp->snd_wl1, ti->ti_seq) || 1495 (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || 1496 (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) { 1497 /* keep track of pure window updates */ 1498 if (ti->ti_len == 0 && 1499 tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) 1500 tcpstat.tcps_rcvwinupd++; 1501 tp->snd_wnd = tiwin; 1502 tp->snd_wl1 = ti->ti_seq; 1503 tp->snd_wl2 = ti->ti_ack; 1504 if (tp->snd_wnd > tp->max_sndwnd) 1505 tp->max_sndwnd = tp->snd_wnd; 1506 needoutput = 1; 1507 } 1508 1509 /* 1510 * Process segments with URG. 1511 */ 1512 if ((tiflags & TH_URG) && ti->ti_urp && 1513 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1514 /* 1515 * This is a kludge, but if we receive and accept 1516 * random urgent pointers, we'll crash in 1517 * soreceive. It's hard to imagine someone 1518 * actually wanting to send this much urgent data. 1519 */ 1520 if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) { 1521 ti->ti_urp = 0; /* XXX */ 1522 tiflags &= ~TH_URG; /* XXX */ 1523 goto dodata; /* XXX */ 1524 } 1525 /* 1526 * If this segment advances the known urgent pointer, 1527 * then mark the data stream. This should not happen 1528 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1529 * a FIN has been received from the remote side. 1530 * In these states we ignore the URG. 1531 * 1532 * According to RFC961 (Assigned Protocols), 1533 * the urgent pointer points to the last octet 1534 * of urgent data. We continue, however, 1535 * to consider it to indicate the first octet 1536 * of data past the urgent section as the original 1537 * spec states (in one of two places). 1538 */ 1539 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { 1540 tp->rcv_up = ti->ti_seq + ti->ti_urp; 1541 so->so_oobmark = so->so_rcv.sb_cc + 1542 (tp->rcv_up - tp->rcv_nxt) - 1; 1543 if (so->so_oobmark == 0) 1544 so->so_state |= SS_RCVATMARK; 1545 sohasoutofband(so); 1546 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1547 } 1548 /* 1549 * Remove out of band data so doesn't get presented to user. 1550 * This can happen independent of advancing the URG pointer, 1551 * but if two URG's are pending at once, some out-of-band 1552 * data may creep in... ick. 1553 */ 1554 if (ti->ti_urp <= (u_long)ti->ti_len 1555#ifdef SO_OOBINLINE 1556 && (so->so_options & SO_OOBINLINE) == 0 1557#endif 1558 ) 1559 tcp_pulloutofband(so, ti, m); 1560 } else 1561 /* 1562 * If no out of band data is expected, 1563 * pull receive urgent pointer along 1564 * with the receive window. 1565 */ 1566 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1567 tp->rcv_up = tp->rcv_nxt; 1568dodata: /* XXX */ 1569 1570 /* 1571 * Process the segment text, merging it into the TCP sequencing queue, 1572 * and arranging for acknowledgment of receipt if necessary. 1573 * This process logically involves adjusting tp->rcv_wnd as data 1574 * is presented to the user (this happens in tcp_usrreq.c, 1575 * case PRU_RCVD). If a FIN has already been received on this 1576 * connection then we just ignore the text. 1577 */ 1578 if ((ti->ti_len || (tiflags&TH_FIN)) && 1579 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1580 TCP_REASS(tp, ti, m, so, tiflags); 1581 /* 1582 * Note the amount of data that peer has sent into 1583 * our window, in order to estimate the sender's 1584 * buffer size. 1585 */ 1586 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1587 } else { 1588 m_freem(m); 1589 tiflags &= ~TH_FIN; 1590 } 1591 1592 /* 1593 * If FIN is received ACK the FIN and let the user know 1594 * that the connection is closing. 1595 */ 1596 if (tiflags & TH_FIN) { 1597 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1598 socantrcvmore(so); 1599 /* 1600 * If connection is half-synchronized 1601 * (ie NEEDSYN flag on) then delay ACK, 1602 * so it may be piggybacked when SYN is sent. 1603 * Otherwise, since we received a FIN then no 1604 * more input can be expected, send ACK now. 1605 */ 1606 if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN)) 1607 tp->t_flags |= TF_DELACK; 1608 else 1609 tp->t_flags |= TF_ACKNOW; 1610 tp->rcv_nxt++; 1611 } 1612 switch (tp->t_state) { 1613 1614 /* 1615 * In SYN_RECEIVED and ESTABLISHED STATES 1616 * enter the CLOSE_WAIT state. 1617 */ 1618 case TCPS_SYN_RECEIVED: 1619 case TCPS_ESTABLISHED: 1620 tp->t_state = TCPS_CLOSE_WAIT; 1621 break; 1622 1623 /* 1624 * If still in FIN_WAIT_1 STATE FIN has not been acked so 1625 * enter the CLOSING state. 1626 */ 1627 case TCPS_FIN_WAIT_1: 1628 tp->t_state = TCPS_CLOSING; 1629 break; 1630 1631 /* 1632 * In FIN_WAIT_2 state enter the TIME_WAIT state, 1633 * starting the time-wait timer, turning off the other 1634 * standard timers. 1635 */ 1636 case TCPS_FIN_WAIT_2: 1637 tp->t_state = TCPS_TIME_WAIT; 1638 tcp_canceltimers(tp); 1639 /* Shorten TIME_WAIT [RFC-1644, p.28] */ 1640 if (tp->cc_recv != 0 && 1641 tp->t_duration < TCPTV_MSL) { 1642 tp->t_timer[TCPT_2MSL] = 1643 tp->t_rxtcur * TCPTV_TWTRUNC; 1644 /* For transaction client, force ACK now. */ 1645 tp->t_flags |= TF_ACKNOW; 1646 } 1647 else 1648 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1649 soisdisconnected(so); 1650 break; 1651 1652 /* 1653 * In TIME_WAIT state restart the 2 MSL time_wait timer. 1654 */ 1655 case TCPS_TIME_WAIT: 1656 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1657 break; 1658 } 1659 } 1660#ifdef TCPDEBUG 1661 if (so->so_options & SO_DEBUG) 1662 tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); 1663#endif 1664 1665 /* 1666 * Return any desired output. 1667 */ 1668 if (needoutput || (tp->t_flags & TF_ACKNOW)) 1669 (void) tcp_output(tp); 1670 return; 1671 1672dropafterack: 1673 /* 1674 * Generate an ACK dropping incoming segment if it occupies 1675 * sequence space, where the ACK reflects our state. 1676 */ 1677 if (tiflags & TH_RST) 1678 goto drop; 1679#ifdef TCPDEBUG 1680 if (so->so_options & SO_DEBUG) 1681 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); 1682#endif 1683 m_freem(m); 1684 tp->t_flags |= TF_ACKNOW; 1685 (void) tcp_output(tp); 1686 return; 1687 1688dropwithreset: 1689 /* 1690 * Generate a RST, dropping incoming segment. 1691 * Make ACK acceptable to originator of segment. 1692 * Don't bother to respond if destination was broadcast/multicast. 1693 */ 1694 if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) || 1695 IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) 1696 goto drop; 1697#ifdef TCPDEBUG 1698 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 1699 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); 1700#endif 1701 if (tiflags & TH_ACK) 1702 tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); 1703 else { 1704 if (tiflags & TH_SYN) 1705 ti->ti_len++; 1706 tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, 1707 TH_RST|TH_ACK); 1708 } 1709 /* destroy temporarily created socket */ 1710 if (dropsocket) 1711 (void) soabort(so); 1712 return; 1713 1714drop: 1715 /* 1716 * Drop space held by incoming segment and return. 1717 */ 1718#ifdef TCPDEBUG 1719 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 1720 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); 1721#endif 1722 m_freem(m); 1723 /* destroy temporarily created socket */ 1724 if (dropsocket) 1725 (void) soabort(so); 1726 return; 1727} 1728 1729static void 1730tcp_dooptions(tp, cp, cnt, ti, to) 1731 struct tcpcb *tp; 1732 u_char *cp; 1733 int cnt; 1734 struct tcpiphdr *ti; 1735 struct tcpopt *to; 1736{ 1737 u_short mss = 0; 1738 int opt, optlen; 1739 1740 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1741 opt = cp[0]; 1742 if (opt == TCPOPT_EOL) 1743 break; 1744 if (opt == TCPOPT_NOP) 1745 optlen = 1; 1746 else { 1747 optlen = cp[1]; 1748 if (optlen <= 0) 1749 break; 1750 } 1751 switch (opt) { 1752 1753 default: 1754 continue; 1755 1756 case TCPOPT_MAXSEG: 1757 if (optlen != TCPOLEN_MAXSEG) 1758 continue; 1759 if (!(ti->ti_flags & TH_SYN)) 1760 continue; 1761 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); 1762 NTOHS(mss); 1763 break; 1764 1765 case TCPOPT_WINDOW: 1766 if (optlen != TCPOLEN_WINDOW) 1767 continue; 1768 if (!(ti->ti_flags & TH_SYN)) 1769 continue; 1770 tp->t_flags |= TF_RCVD_SCALE; 1771 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 1772 break; 1773 1774 case TCPOPT_TIMESTAMP: 1775 if (optlen != TCPOLEN_TIMESTAMP) 1776 continue; 1777 to->to_flag |= TOF_TS; 1778 bcopy((char *)cp + 2, 1779 (char *)&to->to_tsval, sizeof(to->to_tsval)); 1780 NTOHL(to->to_tsval); 1781 bcopy((char *)cp + 6, 1782 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 1783 NTOHL(to->to_tsecr); 1784 1785 /* 1786 * A timestamp received in a SYN makes 1787 * it ok to send timestamp requests and replies. 1788 */ 1789 if (ti->ti_flags & TH_SYN) { 1790 tp->t_flags |= TF_RCVD_TSTMP; 1791 tp->ts_recent = to->to_tsval; 1792 tp->ts_recent_age = tcp_now; 1793 } 1794 break; 1795 case TCPOPT_CC: 1796 if (optlen != TCPOLEN_CC) 1797 continue; 1798 to->to_flag |= TOF_CC; 1799 bcopy((char *)cp + 2, 1800 (char *)&to->to_cc, sizeof(to->to_cc)); 1801 NTOHL(to->to_cc); 1802 /* 1803 * A CC or CC.new option received in a SYN makes 1804 * it ok to send CC in subsequent segments. 1805 */ 1806 if (ti->ti_flags & TH_SYN) 1807 tp->t_flags |= TF_RCVD_CC; 1808 break; 1809 case TCPOPT_CCNEW: 1810 if (optlen != TCPOLEN_CC) 1811 continue; 1812 if (!(ti->ti_flags & TH_SYN)) 1813 continue; 1814 to->to_flag |= TOF_CCNEW; 1815 bcopy((char *)cp + 2, 1816 (char *)&to->to_cc, sizeof(to->to_cc)); 1817 NTOHL(to->to_cc); 1818 /* 1819 * A CC or CC.new option received in a SYN makes 1820 * it ok to send CC in subsequent segments. 1821 */ 1822 tp->t_flags |= TF_RCVD_CC; 1823 break; 1824 case TCPOPT_CCECHO: 1825 if (optlen != TCPOLEN_CC) 1826 continue; 1827 if (!(ti->ti_flags & TH_SYN)) 1828 continue; 1829 to->to_flag |= TOF_CCECHO; 1830 bcopy((char *)cp + 2, 1831 (char *)&to->to_ccecho, sizeof(to->to_ccecho)); 1832 NTOHL(to->to_ccecho); 1833 break; 1834 } 1835 } 1836 if (ti->ti_flags & TH_SYN) 1837 tcp_mss(tp, mss); /* sets t_maxseg */ 1838} 1839 1840/* 1841 * Pull out of band byte out of a segment so 1842 * it doesn't appear in the user's data queue. 1843 * It is still reflected in the segment length for 1844 * sequencing purposes. 1845 */ 1846static void 1847tcp_pulloutofband(so, ti, m) 1848 struct socket *so; 1849 struct tcpiphdr *ti; 1850 register struct mbuf *m; 1851{ 1852 int cnt = ti->ti_urp - 1; 1853 1854 while (cnt >= 0) { 1855 if (m->m_len > cnt) { 1856 char *cp = mtod(m, caddr_t) + cnt; 1857 struct tcpcb *tp = sototcpcb(so); 1858 1859 tp->t_iobc = *cp; 1860 tp->t_oobflags |= TCPOOB_HAVEDATA; 1861 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 1862 m->m_len--; 1863 return; 1864 } 1865 cnt -= m->m_len; 1866 m = m->m_next; 1867 if (m == 0) 1868 break; 1869 } 1870 panic("tcp_pulloutofband"); 1871} 1872 1873/* 1874 * Collect new round-trip time estimate 1875 * and update averages and current timeout. 1876 */ 1877static void 1878tcp_xmit_timer(tp, rtt) 1879 register struct tcpcb *tp; 1880 short rtt; 1881{ 1882 register int delta; 1883 1884 tcpstat.tcps_rttupdated++; 1885 tp->t_rttupdated++; 1886 if (tp->t_srtt != 0) { 1887 /* 1888 * srtt is stored as fixed point with 5 bits after the 1889 * binary point (i.e., scaled by 8). The following magic 1890 * is equivalent to the smoothing algorithm in rfc793 with 1891 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 1892 * point). Adjust rtt to origin 0. 1893 */ 1894 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 1895 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 1896 1897 if ((tp->t_srtt += delta) <= 0) 1898 tp->t_srtt = 1; 1899 1900 /* 1901 * We accumulate a smoothed rtt variance (actually, a 1902 * smoothed mean difference), then set the retransmit 1903 * timer to smoothed rtt + 4 times the smoothed variance. 1904 * rttvar is stored as fixed point with 4 bits after the 1905 * binary point (scaled by 16). The following is 1906 * equivalent to rfc793 smoothing with an alpha of .75 1907 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 1908 * rfc793's wired-in beta. 1909 */ 1910 if (delta < 0) 1911 delta = -delta; 1912 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 1913 if ((tp->t_rttvar += delta) <= 0) 1914 tp->t_rttvar = 1; 1915 } else { 1916 /* 1917 * No rtt measurement yet - use the unsmoothed rtt. 1918 * Set the variance to half the rtt (so our first 1919 * retransmit happens at 3*rtt). 1920 */ 1921 tp->t_srtt = rtt << TCP_RTT_SHIFT; 1922 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 1923 } 1924 tp->t_rtt = 0; 1925 tp->t_rxtshift = 0; 1926 1927 /* 1928 * the retransmit should happen at rtt + 4 * rttvar. 1929 * Because of the way we do the smoothing, srtt and rttvar 1930 * will each average +1/2 tick of bias. When we compute 1931 * the retransmit timer, we want 1/2 tick of rounding and 1932 * 1 extra tick because of +-1/2 tick uncertainty in the 1933 * firing of the timer. The bias will give us exactly the 1934 * 1.5 tick we need. But, because the bias is 1935 * statistical, we have to test that we don't drop below 1936 * the minimum feasible timer (which is 2 ticks). 1937 */ 1938 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 1939 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 1940 1941 /* 1942 * We received an ack for a packet that wasn't retransmitted; 1943 * it is probably safe to discard any error indications we've 1944 * received recently. This isn't quite right, but close enough 1945 * for now (a route might have failed after we sent a segment, 1946 * and the return path might not be symmetrical). 1947 */ 1948 tp->t_softerror = 0; 1949} 1950 1951/* 1952 * Determine a reasonable value for maxseg size. 1953 * If the route is known, check route for mtu. 1954 * If none, use an mss that can be handled on the outgoing 1955 * interface without forcing IP to fragment; if bigger than 1956 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 1957 * to utilize large mbufs. If no route is found, route has no mtu, 1958 * or the destination isn't local, use a default, hopefully conservative 1959 * size (usually 512 or the default IP max size, but no more than the mtu 1960 * of the interface), as we can't discover anything about intervening 1961 * gateways or networks. We also initialize the congestion/slow start 1962 * window to be a single segment if the destination isn't local. 1963 * While looking at the routing entry, we also initialize other path-dependent 1964 * parameters from pre-set or cached values in the routing entry. 1965 * 1966 * Also take into account the space needed for options that we 1967 * send regularly. Make maxseg shorter by that amount to assure 1968 * that we can send maxseg amount of data even when the options 1969 * are present. Store the upper limit of the length of options plus 1970 * data in maxopd. 1971 * 1972 * NOTE that this routine is only called when we process an incoming 1973 * segment, for outgoing segments only tcp_mssopt is called. 1974 * 1975 * In case of T/TCP, we call this routine during implicit connection 1976 * setup as well (offer = -1), to initialize maxseg from the cached 1977 * MSS of our peer. 1978 */ 1979void 1980tcp_mss(tp, offer) 1981 struct tcpcb *tp; 1982 int offer; 1983{ 1984 register struct rtentry *rt; 1985 struct ifnet *ifp; 1986 register int rtt, mss; 1987 u_long bufsize; 1988 struct inpcb *inp; 1989 struct socket *so; 1990 struct rmxp_tao *taop; 1991 int origoffer = offer; 1992 1993 inp = tp->t_inpcb; 1994 if ((rt = tcp_rtlookup(inp)) == NULL) { 1995 tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; 1996 return; 1997 } 1998 ifp = rt->rt_ifp; 1999 so = inp->inp_socket; 2000 2001 taop = rmx_taop(rt->rt_rmx); 2002 /* 2003 * Offer == -1 means that we didn't receive SYN yet, 2004 * use cached value in that case; 2005 */ 2006 if (offer == -1) 2007 offer = taop->tao_mssopt; 2008 /* 2009 * Offer == 0 means that there was no MSS on the SYN segment, 2010 * in this case we use tcp_mssdflt. 2011 */ 2012 if (offer == 0) 2013 offer = tcp_mssdflt; 2014 else 2015 /* 2016 * Sanity check: make sure that maxopd will be large 2017 * enough to allow some data on segments even is the 2018 * all the option space is used (40bytes). Otherwise 2019 * funny things may happen in tcp_output. 2020 */ 2021 offer = max(offer, 64); 2022 taop->tao_mssopt = offer; 2023 2024 /* 2025 * While we're here, check if there's an initial rtt 2026 * or rttvar. Convert from the route-table units 2027 * to scaled multiples of the slow timeout timer. 2028 */ 2029 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { 2030 /* 2031 * XXX the lock bit for RTT indicates that the value 2032 * is also a minimum value; this is subject to time. 2033 */ 2034 if (rt->rt_rmx.rmx_locks & RTV_RTT) 2035 tp->t_rttmin = rtt / (RTM_RTTUNIT / PR_SLOWHZ); 2036 tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE)); 2037 tcpstat.tcps_usedrtt++; 2038 if (rt->rt_rmx.rmx_rttvar) { 2039 tp->t_rttvar = rt->rt_rmx.rmx_rttvar / 2040 (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE)); 2041 tcpstat.tcps_usedrttvar++; 2042 } else { 2043 /* default variation is +- 1 rtt */ 2044 tp->t_rttvar = 2045 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; 2046 } 2047 TCPT_RANGESET(tp->t_rxtcur, 2048 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, 2049 tp->t_rttmin, TCPTV_REXMTMAX); 2050 } 2051 /* 2052 * if there's an mtu associated with the route, use it 2053 */ 2054 if (rt->rt_rmx.rmx_mtu) 2055 mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); 2056 else 2057 { 2058 mss = ifp->if_mtu - sizeof(struct tcpiphdr); 2059 if (!in_localaddr(inp->inp_faddr)) 2060 mss = min(mss, tcp_mssdflt); 2061 } 2062 mss = min(mss, offer); 2063 /* 2064 * maxopd stores the maximum length of data AND options 2065 * in a segment; maxseg is the amount of data in a normal 2066 * segment. We need to store this value (maxopd) apart 2067 * from maxseg, because now every segment carries options 2068 * and thus we normally have somewhat less data in segments. 2069 */ 2070 tp->t_maxopd = mss; 2071 2072 /* 2073 * In case of T/TCP, origoffer==-1 indicates, that no segments 2074 * were received yet. In this case we just guess, otherwise 2075 * we do the same as before T/TCP. 2076 */ 2077 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2078 (origoffer == -1 || 2079 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) 2080 mss -= TCPOLEN_TSTAMP_APPA; 2081 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && 2082 (origoffer == -1 || 2083 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) 2084 mss -= TCPOLEN_CC_APPA; 2085 2086#if (MCLBYTES & (MCLBYTES - 1)) == 0 2087 if (mss > MCLBYTES) 2088 mss &= ~(MCLBYTES-1); 2089#else 2090 if (mss > MCLBYTES) 2091 mss = mss / MCLBYTES * MCLBYTES; 2092#endif 2093 /* 2094 * If there's a pipesize, change the socket buffer 2095 * to that size. Make the socket buffers an integral 2096 * number of mss units; if the mss is larger than 2097 * the socket buffer, decrease the mss. 2098 */ 2099#ifdef RTV_SPIPE 2100 if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) 2101#endif 2102 bufsize = so->so_snd.sb_hiwat; 2103 if (bufsize < mss) 2104 mss = bufsize; 2105 else { 2106 bufsize = roundup(bufsize, mss); 2107 if (bufsize > sb_max) 2108 bufsize = sb_max; 2109 (void)sbreserve(&so->so_snd, bufsize); 2110 } 2111 tp->t_maxseg = mss; 2112 2113#ifdef RTV_RPIPE 2114 if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) 2115#endif 2116 bufsize = so->so_rcv.sb_hiwat; 2117 if (bufsize > mss) { 2118 bufsize = roundup(bufsize, mss); 2119 if (bufsize > sb_max) 2120 bufsize = sb_max; 2121 (void)sbreserve(&so->so_rcv, bufsize); 2122 } 2123 /* 2124 * Don't force slow-start on local network. 2125 */ 2126 if (!in_localaddr(inp->inp_faddr)) 2127 tp->snd_cwnd = mss; 2128 2129 if (rt->rt_rmx.rmx_ssthresh) { 2130 /* 2131 * There's some sort of gateway or interface 2132 * buffer limit on the path. Use this to set 2133 * the slow start threshhold, but set the 2134 * threshold to no less than 2*mss. 2135 */ 2136 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); 2137 tcpstat.tcps_usedssthresh++; 2138 } 2139} 2140 2141/* 2142 * Determine the MSS option to send on an outgoing SYN. 2143 */ 2144int 2145tcp_mssopt(tp) 2146 struct tcpcb *tp; 2147{ 2148 struct rtentry *rt; 2149 2150 rt = tcp_rtlookup(tp->t_inpcb); 2151 if (rt == NULL) 2152 return tcp_mssdflt; 2153 2154 return rt->rt_ifp->if_mtu - sizeof(struct tcpiphdr); 2155} 2156