/* tcp_input.c — FreeBSD sys/netinet/tcp_input.c, revision 101106 (listing header repaired into a comment) */
1/* 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 34 * $FreeBSD: head/sys/netinet/tcp_input.c 101106 2002-07-31 19:06:49Z rwatson $ 35 */ 36 37#include "opt_ipfw.h" /* for ipfw_fwd */ 38#include "opt_inet6.h" 39#include "opt_ipsec.h" 40#include "opt_mac.h" 41#include "opt_tcpdebug.h" 42#include "opt_tcp_input.h" 43 44#include <sys/param.h> 45#include <sys/kernel.h> 46#include <sys/mac.h> 47#include <sys/malloc.h> 48#include <sys/mbuf.h> 49#include <sys/proc.h> /* for proc0 declaration */ 50#include <sys/protosw.h> 51#include <sys/signalvar.h> 52#include <sys/socket.h> 53#include <sys/socketvar.h> 54#include <sys/sysctl.h> 55#include <sys/syslog.h> 56#include <sys/systm.h> 57 58#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 59 60#include <net/if.h> 61#include <net/route.h> 62 63#include <netinet/in.h> 64#include <netinet/in_pcb.h> 65#include <netinet/in_systm.h> 66#include <netinet/in_var.h> 67#include <netinet/ip.h> 68#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */ 69#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 70#include <netinet/ip_var.h> 71#ifdef INET6 72#include <netinet/ip6.h> 73#include <netinet/icmp6.h> 74#include <netinet6/in6_pcb.h> 75#include <netinet6/ip6_var.h> 76#include <netinet6/nd6.h> 77#endif 78#include <netinet/tcp.h> 79#include <netinet/tcp_fsm.h> 80#include <netinet/tcp_seq.h> 81#include <netinet/tcp_timer.h> 82#include <netinet/tcp_var.h> 83#ifdef INET6 84#include <netinet6/tcp6_var.h> 85#endif 86#include <netinet/tcpip.h> 87#ifdef TCPDEBUG 88#include <netinet/tcp_debug.h> 89 90#endif /* TCPDEBUG */ 91 92#ifdef IPSEC 93#include <netinet6/ipsec.h> 94#ifdef INET6 95#include <netinet6/ipsec6.h> 96#endif 97#include <netkey/key.h> 98#endif /*IPSEC*/ 99 100#include <machine/in_cksum.h> 101 102MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); 103 104static int tcprexmtthresh = 3; 105tcp_cc tcp_ccgen; 106 107struct tcpstat tcpstat; 108SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, 109 
&tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");

/* net.inet.tcp.log_in_vain: 0 = off, 1 = log SYNs to closed ports,
 * 2 = log all segments to closed ports (see switch in tcp_input()). */
static int log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
	&log_in_vain, 0, "Log all incoming TCP connections");

/* net.inet.tcp.blackhole: 1 = drop SYNs to closed ports without RST,
 * 2 = drop all segments to closed ports without RST. */
static int blackhole = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
	&blackhole, 0, "Do not send RST when dropping refused connections");

/* net.inet.tcp.delayed_ack: master switch consulted by DELAY_ACK() below. */
int tcp_delack_enabled = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
	&tcp_delack_enabled, 0,
	"Delay ACK to try and piggyback it onto a data packet");

#ifdef TCP_DROP_SYNFIN
/* net.inet.tcp.drop_synfin: drop SYN|FIN probes (anti-fingerprinting;
 * violates the TCP specification — see comment at the check site). */
static int drop_synfin = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
	&drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
#endif

/* Head of the global list of TCP inpcbs, and its lookup info/lock. */
struct inpcbhead tcb;
#define	tcb6	tcb  /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
struct mtx	*tcbinfo_mtx;

/* Forward declarations for this file's static helpers. */
static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
static void	 tcp_pulloutofband(struct socket *,
		     struct tcphdr *, struct mbuf *, int);
static int	 tcp_reass(struct tcpcb *, struct tcphdr *, int *,
		     struct mbuf *);
static void	 tcp_xmit_timer(struct tcpcb *, int);
static int	 tcp_newreno(struct tcpcb *, struct tcphdr *);

/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) && (tp)->t_inpcb && \
	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
	    (tp)->t_inpcb->in6p_route.ro_rt) \
		nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
#endif

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 *	- delayed acks are enabled and
 *	- there is no delayed ack timer in progress and
 *	- our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 */
#define DELAY_ACK(tp) \
	(tcp_delack_enabled && !callout_pending(tp->tt_delack) && \
	(tp->t_flags & TF_RXWIN0SENT) == 0)

/*
 * tcp_reass() -- insert out-of-order segment "m" (whose TCP header is
 * "th" and payload length is *tlenp) into tp's reassembly queue,
 * trimming any bytes that overlap segments already queued, then hand
 * any in-order run starting at tp->rcv_nxt up to the socket buffer.
 *
 * Returns the TH_FIN flag of the last segment delivered to the user,
 * or 0 when nothing could be presented yet.  May modify *tlenp and
 * th->th_seq in place when the front of the segment is trimmed.
 * Called with th == 0 purely to flush queued pre-ESTABLISHED data.
 *
 * NOTE(review): caller is presumed to hold the inpcb lock for tp —
 * confirm against tcp_input(); not visible from this function alone.
 */
static int
tcp_reass(tp, th, tlenp, m)
	register struct tcpcb *tp;
	register struct tcphdr *th;
	int *tlenp;
	struct mbuf *m;
{
	struct tseg_qent *q;
	struct tseg_qent *p = NULL;	/* queue entry preceding the new one */
	struct tseg_qent *nq;
	struct tseg_qent *te;		/* entry being inserted for (th, m) */
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
	MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ,
	       M_NOWAIT);
	if (te == NULL) {
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		return (0);
	}

	/*
	 * Find a segment which begins after this one does.
	 * On exit, p is the last entry starting at or before th_seq
	 * (or NULL) and q is the first entry starting after it.
	 */
	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
			break;
		p = q;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		register int i;
		/* conversion to int (in i) handles seq wraparound */
		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
		if (i > 0) {
			if (i >= *tlenp) {
				/* Fully duplicated: count it and drop. */
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlenp;
				m_freem(m);
				FREE(te, M_TSEGQ);
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto present;	/* ??? */
			}
			/* Partially duplicated: trim the front i bytes. */
			m_adj(m, i);
			*tlenp -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlenp;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (q) {
		register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
		if (i <= 0)
			break;
		if (i < q->tqe_len) {
			/* Partial overlap: trim the front of q and stop. */
			q->tqe_th->th_seq += i;
			q->tqe_len -= i;
			m_adj(q->tqe_m, i);
			break;
		}

		/* q completely covered by the new segment: unlink and free. */
		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		FREE(q, M_TSEGQ);
		q = nq;
	}

	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_th = th;
	te->tqe_len = *tlenp;

	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
	} else {
		LIST_INSERT_AFTER(p, te, tqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);
	q = LIST_FIRST(&tp->t_segq);
	if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
		return (0);
	/*
	 * The guard above guarantees the do-while body runs at least
	 * once, so "flags" is always assigned before the final return.
	 */
	do {
		tp->rcv_nxt += q->tqe_len;
		flags = q->tqe_th->th_flags & TH_FIN;
		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		/* If the receive side was shut down, discard instead of queuing. */
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tqe_m);
		else
			sbappend(&so->so_rcv, q->tqe_m);
		FREE(q, M_TSEGQ);
		q = nq;
	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
	ND6_HINT(tp);
	sorwakeup(so);
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
#ifdef INET6
/*
 * tcp6_input() -- IPv6 protosw entry point.  Rejects segments whose
 * destination is an anycast address (with an ICMPv6 error), then hands
 * the mbuf to the common tcp_input().  Always returns IPPROTO_DONE.
 */
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	register struct mbuf *m = *mp;
	struct in6_ifaddr *ia6;

	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
315 */ 316 ia6 = ip6_getdstifaddr(m); 317 if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { 318 struct ip6_hdr *ip6; 319 320 ip6 = mtod(m, struct ip6_hdr *); 321 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 322 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 323 return IPPROTO_DONE; 324 } 325 326 tcp_input(m, *offp); 327 return IPPROTO_DONE; 328} 329#endif 330 331void 332tcp_input(m, off0) 333 register struct mbuf *m; 334 int off0; 335{ 336 register struct tcphdr *th; 337 register struct ip *ip = NULL; 338 register struct ipovly *ipov; 339 register struct inpcb *inp = NULL; 340 u_char *optp = NULL; 341 int optlen = 0; 342 int len, tlen, off; 343 int drop_hdrlen; 344 register struct tcpcb *tp = 0; 345 register int thflags; 346 struct socket *so = 0; 347 int todrop, acked, ourfinisacked, needoutput = 0; 348 u_long tiwin; 349 struct tcpopt to; /* options in this segment */ 350 struct rmxp_tao *taop; /* pointer to our TAO cache entry */ 351 struct rmxp_tao tao_noncached; /* in case there's no cached entry */ 352 int headlocked = 0; 353 354#ifdef TCPDEBUG 355 u_char tcp_saveipgen[40]; 356 /* the size of the above must be of max ip header, now IPv6 */ 357 struct tcphdr tcp_savetcp; 358 short ostate = 0; 359#endif 360#ifdef INET6 361 struct ip6_hdr *ip6 = NULL; 362 int isipv6; 363#endif /* INET6 */ 364 struct sockaddr_in *next_hop = NULL; 365#ifdef MAC 366 int error; 367#endif 368 int rstreason; /* For badport_bandlim accounting purposes */ 369 370 /* Grab info from MT_TAG mbufs prepended to the chain. */ 371 for (;m && m->m_type == MT_TAG; m = m->m_next) { 372 if (m->m_tag_id == PACKET_TAG_IPFORWARD) 373 next_hop = (struct sockaddr_in *)m->m_hdr.mh_data; 374 } 375 376#ifdef INET6 377 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 
1 : 0; 378#endif 379 bzero((char *)&to, sizeof(to)); 380 381 tcpstat.tcps_rcvtotal++; 382 383#ifdef INET6 384 if (isipv6) { 385 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ 386 ip6 = mtod(m, struct ip6_hdr *); 387 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; 388 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { 389 tcpstat.tcps_rcvbadsum++; 390 goto drop; 391 } 392 th = (struct tcphdr *)((caddr_t)ip6 + off0); 393 394 /* 395 * Be proactive about unspecified IPv6 address in source. 396 * As we use all-zero to indicate unbounded/unconnected pcb, 397 * unspecified IPv6 address can be used to confuse us. 398 * 399 * Note that packets with unspecified IPv6 destination is 400 * already dropped in ip6_input. 401 */ 402 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 403 /* XXX stat */ 404 goto drop; 405 } 406 } else 407#endif /* INET6 */ 408 { 409 /* 410 * Get IP and TCP header together in first mbuf. 411 * Note: IP leaves IP header in first mbuf. 412 */ 413 if (off0 > sizeof (struct ip)) { 414 ip_stripoptions(m, (struct mbuf *)0); 415 off0 = sizeof(struct ip); 416 } 417 if (m->m_len < sizeof (struct tcpiphdr)) { 418 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { 419 tcpstat.tcps_rcvshort++; 420 return; 421 } 422 } 423 ip = mtod(m, struct ip *); 424 ipov = (struct ipovly *)ip; 425 th = (struct tcphdr *)((caddr_t)ip + off0); 426 tlen = ip->ip_len; 427 428 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 429 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 430 th->th_sum = m->m_pkthdr.csum_data; 431 else 432 th->th_sum = in_pseudo(ip->ip_src.s_addr, 433 ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + 434 ip->ip_len + IPPROTO_TCP)); 435 th->th_sum ^= 0xffff; 436 } else { 437 /* 438 * Checksum extended TCP header and data. 
439 */ 440 len = sizeof (struct ip) + tlen; 441 bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 442 ipov->ih_len = (u_short)tlen; 443 ipov->ih_len = htons(ipov->ih_len); 444 th->th_sum = in_cksum(m, len); 445 } 446 if (th->th_sum) { 447 tcpstat.tcps_rcvbadsum++; 448 goto drop; 449 } 450#ifdef INET6 451 /* Re-initialization for later version check */ 452 ip->ip_v = IPVERSION; 453#endif 454 } 455 456 /* 457 * Check that TCP offset makes sense, 458 * pull out TCP options and adjust length. XXX 459 */ 460 off = th->th_off << 2; 461 if (off < sizeof (struct tcphdr) || off > tlen) { 462 tcpstat.tcps_rcvbadoff++; 463 goto drop; 464 } 465 tlen -= off; /* tlen is used instead of ti->ti_len */ 466 if (off > sizeof (struct tcphdr)) { 467#ifdef INET6 468 if (isipv6) { 469 IP6_EXTHDR_CHECK(m, off0, off, ); 470 ip6 = mtod(m, struct ip6_hdr *); 471 th = (struct tcphdr *)((caddr_t)ip6 + off0); 472 } else 473#endif /* INET6 */ 474 { 475 if (m->m_len < sizeof(struct ip) + off) { 476 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { 477 tcpstat.tcps_rcvshort++; 478 return; 479 } 480 ip = mtod(m, struct ip *); 481 ipov = (struct ipovly *)ip; 482 th = (struct tcphdr *)((caddr_t)ip + off0); 483 } 484 } 485 optlen = off - sizeof (struct tcphdr); 486 optp = (u_char *)(th + 1); 487 } 488 thflags = th->th_flags; 489 490#ifdef TCP_DROP_SYNFIN 491 /* 492 * If the drop_synfin option is enabled, drop all packets with 493 * both the SYN and FIN bits set. This prevents e.g. nmap from 494 * identifying the TCP/IP stack. 495 * 496 * This is a violation of the TCP specification. 497 */ 498 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) 499 goto drop; 500#endif 501 502 /* 503 * Convert TCP protocol specific fields to host format. 
504 */ 505 th->th_seq = ntohl(th->th_seq); 506 th->th_ack = ntohl(th->th_ack); 507 th->th_win = ntohs(th->th_win); 508 th->th_urp = ntohs(th->th_urp); 509 510 /* 511 * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, 512 * until after ip6_savecontrol() is called and before other functions 513 * which don't want those proto headers. 514 * Because ip6_savecontrol() is going to parse the mbuf to 515 * search for data to be passed up to user-land, it wants mbuf 516 * parameters to be unchanged. 517 * XXX: the call of ip6_savecontrol() has been obsoleted based on 518 * latest version of the advanced API (20020110). 519 */ 520 drop_hdrlen = off0 + off; 521 522 /* 523 * Locate pcb for segment. 524 */ 525 INP_INFO_WLOCK(&tcbinfo); 526 headlocked = 1; 527findpcb: 528 /* IPFIREWALL_FORWARD section */ 529 if (next_hop != NULL 530#ifdef INET6 531 && isipv6 == NULL /* IPv6 support is not yet */ 532#endif /* INET6 */ 533 ) { 534 /* 535 * Transparently forwarded. Pretend to be the destination. 536 * already got one like this? 537 */ 538 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, 539 ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); 540 if (!inp) { 541 /* 542 * No, then it's new. 
Try find the ambushing socket 543 */ 544 if (next_hop->sin_port == 0) { 545 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, 546 th->th_sport, next_hop->sin_addr, 547 th->th_dport, 1, m->m_pkthdr.rcvif); 548 } else { 549 inp = in_pcblookup_hash(&tcbinfo, 550 ip->ip_src, th->th_sport, 551 next_hop->sin_addr, 552 ntohs(next_hop->sin_port), 1, 553 m->m_pkthdr.rcvif); 554 } 555 } 556 } else 557 { 558#ifdef INET6 559 if (isipv6) 560 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, 561 &ip6->ip6_dst, th->th_dport, 1, 562 m->m_pkthdr.rcvif); 563 else 564#endif /* INET6 */ 565 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, 566 ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); 567 } 568 569#ifdef IPSEC 570#ifdef INET6 571 if (isipv6) { 572 if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { 573 ipsec6stat.in_polvio++; 574 goto drop; 575 } 576 } else 577#endif /* INET6 */ 578 if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { 579 ipsecstat.in_polvio++; 580 goto drop; 581 } 582#endif /*IPSEC*/ 583 584 /* 585 * If the state is CLOSED (i.e., TCB does not exist) then 586 * all data in the incoming segment is discarded. 587 * If the TCB exists but is in CLOSED state, it is embryonic, 588 * but should either do a listen or a connect soon. 
589 */ 590 if (inp == NULL) { 591 if (log_in_vain) { 592#ifdef INET6 593 char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN]; 594#else /* INET6 */ 595 char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; 596#endif /* INET6 */ 597 598#ifdef INET6 599 if (isipv6) { 600 strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst)); 601 strcpy(sbuf, ip6_sprintf(&ip6->ip6_src)); 602 } else 603#endif 604 { 605 strcpy(dbuf, inet_ntoa(ip->ip_dst)); 606 strcpy(sbuf, inet_ntoa(ip->ip_src)); 607 } 608 switch (log_in_vain) { 609 case 1: 610 if(thflags & TH_SYN) 611 log(LOG_INFO, 612 "Connection attempt to TCP %s:%d from %s:%d\n", 613 dbuf, ntohs(th->th_dport), 614 sbuf, 615 ntohs(th->th_sport)); 616 break; 617 case 2: 618 log(LOG_INFO, 619 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", 620 dbuf, ntohs(th->th_dport), sbuf, 621 ntohs(th->th_sport), thflags); 622 break; 623 default: 624 break; 625 } 626 } 627 if (blackhole) { 628 switch (blackhole) { 629 case 1: 630 if (thflags & TH_SYN) 631 goto drop; 632 break; 633 case 2: 634 goto drop; 635 default: 636 goto drop; 637 } 638 } 639 rstreason = BANDLIM_RST_CLOSEDPORT; 640 goto dropwithreset; 641 } 642 INP_LOCK(inp); 643 tp = intotcpcb(inp); 644 if (tp == 0) { 645 INP_UNLOCK(inp); 646 rstreason = BANDLIM_RST_CLOSEDPORT; 647 goto dropwithreset; 648 } 649 if (tp->t_state == TCPS_CLOSED) 650 goto drop; 651 652 /* Unscale the window into a 32-bit value. 
*/ 653 if ((thflags & TH_SYN) == 0) 654 tiwin = th->th_win << tp->snd_scale; 655 else 656 tiwin = th->th_win; 657 658 so = inp->inp_socket; 659#ifdef MAC 660 error = mac_check_socket_receive(so, m); 661 if (error) 662 goto drop; 663#endif 664 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 665 struct in_conninfo inc; 666#ifdef TCPDEBUG 667 if (so->so_options & SO_DEBUG) { 668 ostate = tp->t_state; 669#ifdef INET6 670 if (isipv6) 671 bcopy((char *)ip6, (char *)tcp_saveipgen, 672 sizeof(*ip6)); 673 else 674#endif /* INET6 */ 675 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); 676 tcp_savetcp = *th; 677 } 678#endif 679 /* skip if this isn't a listen socket */ 680 if ((so->so_options & SO_ACCEPTCONN) == 0) 681 goto after_listen; 682#ifdef INET6 683 inc.inc_isipv6 = isipv6; 684 if (isipv6) { 685 inc.inc6_faddr = ip6->ip6_src; 686 inc.inc6_laddr = ip6->ip6_dst; 687 inc.inc6_route.ro_rt = NULL; /* XXX */ 688 689 } else 690#endif /* INET6 */ 691 { 692 inc.inc_faddr = ip->ip_src; 693 inc.inc_laddr = ip->ip_dst; 694 inc.inc_route.ro_rt = NULL; /* XXX */ 695 } 696 inc.inc_fport = th->th_sport; 697 inc.inc_lport = th->th_dport; 698 699 /* 700 * If the state is LISTEN then ignore segment if it contains 701 * a RST. If the segment contains an ACK then it is bad and 702 * send a RST. If it does not contain a SYN then it is not 703 * interesting; drop it. 704 * 705 * If the state is SYN_RECEIVED (syncache) and seg contains 706 * an ACK, but not for our SYN/ACK, send a RST. If the seg 707 * contains a RST, check the sequence number to see if it 708 * is a valid reset segment. 709 */ 710 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 711 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { 712 if (!syncache_expand(&inc, th, &so, m)) { 713 /* 714 * No syncache entry, or ACK was not 715 * for our SYN/ACK. Send a RST. 
716 */ 717 tcpstat.tcps_badsyn++; 718 rstreason = BANDLIM_RST_OPENPORT; 719 goto dropwithreset; 720 } 721 if (so == NULL) { 722 /* 723 * Could not complete 3-way handshake, 724 * connection is being closed down, and 725 * syncache will free mbuf. 726 */ 727 INP_UNLOCK(inp); 728 INP_INFO_WUNLOCK(&tcbinfo); 729 return; 730 } 731 /* 732 * Socket is created in state SYN_RECEIVED. 733 * Continue processing segment. 734 */ 735 INP_UNLOCK(inp); 736 inp = sotoinpcb(so); 737 INP_LOCK(inp); 738 tp = intotcpcb(inp); 739 /* 740 * This is what would have happened in 741 * tcp_output() when the SYN,ACK was sent. 742 */ 743 tp->snd_up = tp->snd_una; 744 tp->snd_max = tp->snd_nxt = tp->iss + 1; 745 tp->last_ack_sent = tp->rcv_nxt; 746/* 747 * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled 748 * until the _second_ ACK is received: 749 * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window. 750 * rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale, 751 * move to ESTAB, set snd_wnd to tiwin. 752 */ 753 tp->snd_wnd = tiwin; /* unscaled */ 754 goto after_listen; 755 } 756 if (thflags & TH_RST) { 757 syncache_chkrst(&inc, th); 758 goto drop; 759 } 760 if (thflags & TH_ACK) { 761 syncache_badack(&inc); 762 tcpstat.tcps_badsyn++; 763 rstreason = BANDLIM_RST_OPENPORT; 764 goto dropwithreset; 765 } 766 goto drop; 767 } 768 769 /* 770 * Segment's flags are (SYN) or (SYN|FIN). 771 */ 772#ifdef INET6 773 /* 774 * If deprecated address is forbidden, 775 * we do not accept SYN to deprecated interface 776 * address to prevent any new inbound connection from 777 * getting established. 778 * When we do not accept SYN, we send a TCP RST, 779 * with deprecated source address (instead of dropping 780 * it). We compromise it as it is much better for peer 781 * to send a RST, and RST will be the final packet 782 * for the exchange. 783 * 784 * If we do not forbid deprecated addresses, we accept 785 * the SYN packet. 
RFC2462 does not suggest dropping 786 * SYN in this case. 787 * If we decipher RFC2462 5.5.4, it says like this: 788 * 1. use of deprecated addr with existing 789 * communication is okay - "SHOULD continue to be 790 * used" 791 * 2. use of it with new communication: 792 * (2a) "SHOULD NOT be used if alternate address 793 * with sufficient scope is available" 794 * (2b) nothing mentioned otherwise. 795 * Here we fall into (2b) case as we have no choice in 796 * our source address selection - we must obey the peer. 797 * 798 * The wording in RFC2462 is confusing, and there are 799 * multiple description text for deprecated address 800 * handling - worse, they are not exactly the same. 801 * I believe 5.5.4 is the best one, so we follow 5.5.4. 802 */ 803 if (isipv6 && !ip6_use_deprecated) { 804 struct in6_ifaddr *ia6; 805 806 if ((ia6 = ip6_getdstifaddr(m)) && 807 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 808 INP_UNLOCK(inp); 809 tp = NULL; 810 rstreason = BANDLIM_RST_OPENPORT; 811 goto dropwithreset; 812 } 813 } 814#endif 815 /* 816 * If it is from this socket, drop it, it must be forged. 817 * Don't bother responding if the destination was a broadcast. 818 */ 819 if (th->th_dport == th->th_sport) { 820#ifdef INET6 821 if (isipv6) { 822 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, 823 &ip6->ip6_src)) 824 goto drop; 825 } else 826#endif /* INET6 */ 827 if (ip->ip_dst.s_addr == ip->ip_src.s_addr) 828 goto drop; 829 } 830 /* 831 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 832 * 833 * Note that it is quite possible to receive unicast 834 * link-layer packets with a broadcast IP address. Use 835 * in_broadcast() to find them. 
836 */ 837 if (m->m_flags & (M_BCAST|M_MCAST)) 838 goto drop; 839#ifdef INET6 840 if (isipv6) { 841 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 842 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 843 goto drop; 844 } else 845#endif 846 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 847 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 848 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 849 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 850 goto drop; 851 /* 852 * SYN appears to be valid; create compressed TCP state 853 * for syncache, or perform t/tcp connection. 854 */ 855 if (so->so_qlen <= so->so_qlimit) { 856 tcp_dooptions(&to, optp, optlen, 1); 857 if (!syncache_add(&inc, &to, th, &so, m)) 858 goto drop; 859 if (so == NULL) { 860 /* 861 * Entry added to syncache, mbuf used to 862 * send SYN,ACK packet. 863 */ 864 KASSERT(headlocked, ("headlocked")); 865 INP_UNLOCK(inp); 866 INP_INFO_WUNLOCK(&tcbinfo); 867 return; 868 } 869 /* 870 * Segment passed TAO tests. 871 */ 872 INP_UNLOCK(inp); 873 inp = sotoinpcb(so); 874 INP_LOCK(inp); 875 tp = intotcpcb(inp); 876 tp->snd_wnd = tiwin; 877 tp->t_starttime = ticks; 878 tp->t_state = TCPS_ESTABLISHED; 879 880 /* 881 * If there is a FIN, or if there is data and the 882 * connection is local, then delay SYN,ACK(SYN) in 883 * the hope of piggy-backing it on a response 884 * segment. Otherwise must send ACK now in case 885 * the other side is slow starting. 
886 */ 887 if (DELAY_ACK(tp) && ((thflags & TH_FIN) || 888 (tlen != 0 && 889#ifdef INET6 890 ((isipv6 && in6_localaddr(&inp->in6p_faddr)) 891 || 892 (!isipv6 && 893#endif 894 in_localaddr(inp->inp_faddr) 895#ifdef INET6 896 )) 897#endif 898 ))) { 899 callout_reset(tp->tt_delack, tcp_delacktime, 900 tcp_timer_delack, tp); 901 tp->t_flags |= TF_NEEDSYN; 902 } else 903 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 904 905 tcpstat.tcps_connects++; 906 soisconnected(so); 907 goto trimthenstep6; 908 } 909 goto drop; 910 } 911after_listen: 912 913/* XXX temp debugging */ 914 /* should not happen - syncache should pick up these connections */ 915 if (tp->t_state == TCPS_LISTEN) 916 panic("tcp_input: TCPS_LISTEN"); 917 918 /* 919 * Segment received on connection. 920 * Reset idle time and keep-alive timer. 921 */ 922 tp->t_rcvtime = ticks; 923 if (TCPS_HAVEESTABLISHED(tp->t_state)) 924 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); 925 926 /* 927 * Process options. 928 * XXX this is tradtitional behavior, may need to be cleaned up. 929 */ 930 tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); 931 if (thflags & TH_SYN) { 932 if (to.to_flags & TOF_SCALE) { 933 tp->t_flags |= TF_RCVD_SCALE; 934 tp->requested_s_scale = to.to_requested_s_scale; 935 } 936 if (to.to_flags & TOF_TS) { 937 tp->t_flags |= TF_RCVD_TSTMP; 938 tp->ts_recent = to.to_tsval; 939 tp->ts_recent_age = ticks; 940 } 941 if (to.to_flags & (TOF_CC|TOF_CCNEW)) 942 tp->t_flags |= TF_RCVD_CC; 943 if (to.to_flags & TOF_MSS) 944 tcp_mss(tp, to.to_mss); 945 } 946 947 /* 948 * Header prediction: check for the two common cases 949 * of a uni-directional data xfer. If the packet has 950 * no control flags, is in-sequence, the window didn't 951 * change and we're not retransmitting, it's a 952 * candidate. If the length is zero and the ack moved 953 * forward, we're the sender side of the xfer. Just 954 * free the data acked & wake any higher level process 955 * that was blocked waiting for space. 
If the length 956 * is non-zero and the ack didn't move, we're the 957 * receiver side. If we're getting packets in-order 958 * (the reassembly queue is empty), add the data to 959 * the socket buffer and note that we need a delayed ack. 960 * Make sure that the hidden state-flags are also off. 961 * Since we check for TCPS_ESTABLISHED above, it can only 962 * be TH_NEEDSYN. 963 */ 964 if (tp->t_state == TCPS_ESTABLISHED && 965 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 966 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 967 ((to.to_flags & TOF_TS) == 0 || 968 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && 969 /* 970 * Using the CC option is compulsory if once started: 971 * the segment is OK if no T/TCP was negotiated or 972 * if the segment has a CC option equal to CCrecv 973 */ 974 ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || 975 ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && 976 th->th_seq == tp->rcv_nxt && 977 tiwin && tiwin == tp->snd_wnd && 978 tp->snd_nxt == tp->snd_max) { 979 980 /* 981 * If last ACK falls within this segment's sequence numbers, 982 * record the timestamp. 983 * NOTE that the test is modified according to the latest 984 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 985 */ 986 if ((to.to_flags & TOF_TS) != 0 && 987 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 988 tp->ts_recent_age = ticks; 989 tp->ts_recent = to.to_tsval; 990 } 991 992 if (tlen == 0) { 993 if (SEQ_GT(th->th_ack, tp->snd_una) && 994 SEQ_LEQ(th->th_ack, tp->snd_max) && 995 tp->snd_cwnd >= tp->snd_wnd && 996 tp->t_dupacks < tcprexmtthresh) { 997 KASSERT(headlocked, ("headlocked")); 998 INP_INFO_WUNLOCK(&tcbinfo); 999 headlocked = 0; 1000 /* 1001 * this is a pure ack for outstanding data. 
1002 */ 1003 ++tcpstat.tcps_predack; 1004 /* 1005 * "bad retransmit" recovery 1006 */ 1007 if (tp->t_rxtshift == 1 && 1008 ticks < tp->t_badrxtwin) { 1009 tp->snd_cwnd = tp->snd_cwnd_prev; 1010 tp->snd_ssthresh = 1011 tp->snd_ssthresh_prev; 1012 tp->snd_nxt = tp->snd_max; 1013 tp->t_badrxtwin = 0; 1014 } 1015 if ((to.to_flags & TOF_TS) != 0) 1016 tcp_xmit_timer(tp, 1017 ticks - to.to_tsecr + 1); 1018 else if (tp->t_rtttime && 1019 SEQ_GT(th->th_ack, tp->t_rtseq)) 1020 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 1021 acked = th->th_ack - tp->snd_una; 1022 tcpstat.tcps_rcvackpack++; 1023 tcpstat.tcps_rcvackbyte += acked; 1024 sbdrop(&so->so_snd, acked); 1025 tp->snd_una = th->th_ack; 1026 m_freem(m); 1027 ND6_HINT(tp); /* some progress has been done */ 1028 1029 /* 1030 * If all outstanding data are acked, stop 1031 * retransmit timer, otherwise restart timer 1032 * using current (possibly backed-off) value. 1033 * If process is waiting for space, 1034 * wakeup/selwakeup/signal. If data 1035 * are ready to send, let tcp_output 1036 * decide between more output or persist. 1037 */ 1038 if (tp->snd_una == tp->snd_max) 1039 callout_stop(tp->tt_rexmt); 1040 else if (!callout_active(tp->tt_persist)) 1041 callout_reset(tp->tt_rexmt, 1042 tp->t_rxtcur, 1043 tcp_timer_rexmt, tp); 1044 1045 sowwakeup(so); 1046 if (so->so_snd.sb_cc) 1047 (void) tcp_output(tp); 1048 INP_UNLOCK(inp); 1049 return; 1050 } 1051 } else if (th->th_ack == tp->snd_una && 1052 LIST_EMPTY(&tp->t_segq) && 1053 tlen <= sbspace(&so->so_rcv)) { 1054 KASSERT(headlocked, ("headlocked")); 1055 INP_INFO_WUNLOCK(&tcbinfo); 1056 headlocked = 0; 1057 /* 1058 * this is a pure, in-sequence data packet 1059 * with nothing on the reassembly queue and 1060 * we have enough buffer space to take it. 1061 */ 1062 ++tcpstat.tcps_preddat; 1063 tp->rcv_nxt += tlen; 1064 tcpstat.tcps_rcvpack++; 1065 tcpstat.tcps_rcvbyte += tlen; 1066 ND6_HINT(tp); /* some progress has been done */ 1067 /* 1068 * Add data to socket buffer. 
1069 */ 1070 m_adj(m, drop_hdrlen); /* delayed header drop */ 1071 sbappend(&so->so_rcv, m); 1072 sorwakeup(so); 1073 if (DELAY_ACK(tp)) { 1074 callout_reset(tp->tt_delack, tcp_delacktime, 1075 tcp_timer_delack, tp); 1076 } else { 1077 tp->t_flags |= TF_ACKNOW; 1078 tcp_output(tp); 1079 } 1080 INP_UNLOCK(inp); 1081 return; 1082 } 1083 } 1084 1085 /* 1086 * Calculate amount of space in receive window, 1087 * and then do TCP input processing. 1088 * Receive window is amount of space in rcv queue, 1089 * but not less than advertised window. 1090 */ 1091 { int win; 1092 1093 win = sbspace(&so->so_rcv); 1094 if (win < 0) 1095 win = 0; 1096 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1097 } 1098 1099 switch (tp->t_state) { 1100 1101 /* 1102 * If the state is SYN_RECEIVED: 1103 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 1104 */ 1105 case TCPS_SYN_RECEIVED: 1106 if ((thflags & TH_ACK) && 1107 (SEQ_LEQ(th->th_ack, tp->snd_una) || 1108 SEQ_GT(th->th_ack, tp->snd_max))) { 1109 rstreason = BANDLIM_RST_OPENPORT; 1110 goto dropwithreset; 1111 } 1112 break; 1113 1114 /* 1115 * If the state is SYN_SENT: 1116 * if seg contains an ACK, but not for our SYN, drop the input. 1117 * if seg contains a RST, then drop the connection. 1118 * if seg does not contain SYN, then drop it. 
1119 * Otherwise this is an acceptable SYN segment 1120 * initialize tp->rcv_nxt and tp->irs 1121 * if seg contains ack then advance tp->snd_una 1122 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1123 * arrange for segment to be acked (eventually) 1124 * continue processing rest of data/controls, beginning with URG 1125 */ 1126 case TCPS_SYN_SENT: 1127 if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) { 1128 taop = &tao_noncached; 1129 bzero(taop, sizeof(*taop)); 1130 } 1131 1132 if ((thflags & TH_ACK) && 1133 (SEQ_LEQ(th->th_ack, tp->iss) || 1134 SEQ_GT(th->th_ack, tp->snd_max))) { 1135 /* 1136 * If we have a cached CCsent for the remote host, 1137 * hence we haven't just crashed and restarted, 1138 * do not send a RST. This may be a retransmission 1139 * from the other side after our earlier ACK was lost. 1140 * Our new SYN, when it arrives, will serve as the 1141 * needed ACK. 1142 */ 1143 if (taop->tao_ccsent != 0) 1144 goto drop; 1145 else { 1146 rstreason = BANDLIM_UNLIMITED; 1147 goto dropwithreset; 1148 } 1149 } 1150 if (thflags & TH_RST) { 1151 if (thflags & TH_ACK) 1152 tp = tcp_drop(tp, ECONNREFUSED); 1153 goto drop; 1154 } 1155 if ((thflags & TH_SYN) == 0) 1156 goto drop; 1157 tp->snd_wnd = th->th_win; /* initial send window */ 1158 tp->cc_recv = to.to_cc; /* foreign CC */ 1159 1160 tp->irs = th->th_seq; 1161 tcp_rcvseqinit(tp); 1162 if (thflags & TH_ACK) { 1163 /* 1164 * Our SYN was acked. If segment contains CC.ECHO 1165 * option, check it to make sure this segment really 1166 * matches our SYN. If not, just drop it as old 1167 * duplicate, but send an RST if we're still playing 1168 * by the old rules. If no CC.ECHO option, make sure 1169 * we don't get fooled into using T/TCP. 
1170 */ 1171 if (to.to_flags & TOF_CCECHO) { 1172 if (tp->cc_send != to.to_ccecho) { 1173 if (taop->tao_ccsent != 0) 1174 goto drop; 1175 else { 1176 rstreason = BANDLIM_UNLIMITED; 1177 goto dropwithreset; 1178 } 1179 } 1180 } else 1181 tp->t_flags &= ~TF_RCVD_CC; 1182 tcpstat.tcps_connects++; 1183 soisconnected(so); 1184#ifdef MAC 1185 mac_set_socket_peer_from_mbuf(m, so); 1186#endif 1187 /* Do window scaling on this connection? */ 1188 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1189 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1190 tp->snd_scale = tp->requested_s_scale; 1191 tp->rcv_scale = tp->request_r_scale; 1192 } 1193 /* Segment is acceptable, update cache if undefined. */ 1194 if (taop->tao_ccsent == 0) 1195 taop->tao_ccsent = to.to_ccecho; 1196 1197 tp->rcv_adv += tp->rcv_wnd; 1198 tp->snd_una++; /* SYN is acked */ 1199 /* 1200 * If there's data, delay ACK; if there's also a FIN 1201 * ACKNOW will be turned on later. 1202 */ 1203 if (DELAY_ACK(tp) && tlen != 0) 1204 callout_reset(tp->tt_delack, tcp_delacktime, 1205 tcp_timer_delack, tp); 1206 else 1207 tp->t_flags |= TF_ACKNOW; 1208 /* 1209 * Received <SYN,ACK> in SYN_SENT[*] state. 1210 * Transitions: 1211 * SYN_SENT --> ESTABLISHED 1212 * SYN_SENT* --> FIN_WAIT_1 1213 */ 1214 tp->t_starttime = ticks; 1215 if (tp->t_flags & TF_NEEDFIN) { 1216 tp->t_state = TCPS_FIN_WAIT_1; 1217 tp->t_flags &= ~TF_NEEDFIN; 1218 thflags &= ~TH_SYN; 1219 } else { 1220 tp->t_state = TCPS_ESTABLISHED; 1221 callout_reset(tp->tt_keep, tcp_keepidle, 1222 tcp_timer_keep, tp); 1223 } 1224 } else { 1225 /* 1226 * Received initial SYN in SYN-SENT[*] state => simul- 1227 * taneous open. If segment contains CC option and there is 1228 * a cached CC, apply TAO test; if it succeeds, connection is 1229 * half-synchronized. Otherwise, do 3-way handshake: 1230 * SYN-SENT -> SYN-RECEIVED 1231 * SYN-SENT* -> SYN-RECEIVED* 1232 * If there was no CC option, clear cached CC value. 
1233 */ 1234 tp->t_flags |= TF_ACKNOW; 1235 callout_stop(tp->tt_rexmt); 1236 if (to.to_flags & TOF_CC) { 1237 if (taop->tao_cc != 0 && 1238 CC_GT(to.to_cc, taop->tao_cc)) { 1239 /* 1240 * update cache and make transition: 1241 * SYN-SENT -> ESTABLISHED* 1242 * SYN-SENT* -> FIN-WAIT-1* 1243 */ 1244 taop->tao_cc = to.to_cc; 1245 tp->t_starttime = ticks; 1246 if (tp->t_flags & TF_NEEDFIN) { 1247 tp->t_state = TCPS_FIN_WAIT_1; 1248 tp->t_flags &= ~TF_NEEDFIN; 1249 } else { 1250 tp->t_state = TCPS_ESTABLISHED; 1251 callout_reset(tp->tt_keep, 1252 tcp_keepidle, 1253 tcp_timer_keep, 1254 tp); 1255 } 1256 tp->t_flags |= TF_NEEDSYN; 1257 } else 1258 tp->t_state = TCPS_SYN_RECEIVED; 1259 } else { 1260 /* CC.NEW or no option => invalidate cache */ 1261 taop->tao_cc = 0; 1262 tp->t_state = TCPS_SYN_RECEIVED; 1263 } 1264 } 1265 1266trimthenstep6: 1267 /* 1268 * Advance th->th_seq to correspond to first data byte. 1269 * If data, trim to stay within window, 1270 * dropping FIN if necessary. 1271 */ 1272 th->th_seq++; 1273 if (tlen > tp->rcv_wnd) { 1274 todrop = tlen - tp->rcv_wnd; 1275 m_adj(m, -todrop); 1276 tlen = tp->rcv_wnd; 1277 thflags &= ~TH_FIN; 1278 tcpstat.tcps_rcvpackafterwin++; 1279 tcpstat.tcps_rcvbyteafterwin += todrop; 1280 } 1281 tp->snd_wl1 = th->th_seq - 1; 1282 tp->rcv_up = th->th_seq; 1283 /* 1284 * Client side of transaction: already sent SYN and data. 1285 * If the remote host used T/TCP to validate the SYN, 1286 * our data will be ACK'd; if so, enter normal data segment 1287 * processing in the middle of step 5, ack processing. 1288 * Otherwise, goto step 6. 
1289 */ 1290 if (thflags & TH_ACK) 1291 goto process_ACK; 1292 goto step6; 1293 /* 1294 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 1295 * if segment contains a SYN and CC [not CC.NEW] option: 1296 * if state == TIME_WAIT and connection duration > MSL, 1297 * drop packet and send RST; 1298 * 1299 * if SEG.CC > CCrecv then is new SYN, and can implicitly 1300 * ack the FIN (and data) in retransmission queue. 1301 * Complete close and delete TCPCB. Then reprocess 1302 * segment, hoping to find new TCPCB in LISTEN state; 1303 * 1304 * else must be old SYN; drop it. 1305 * else do normal processing. 1306 */ 1307 case TCPS_LAST_ACK: 1308 case TCPS_CLOSING: 1309 case TCPS_TIME_WAIT: 1310 if ((thflags & TH_SYN) && 1311 (to.to_flags & TOF_CC) && tp->cc_recv != 0) { 1312 if (tp->t_state == TCPS_TIME_WAIT && 1313 (ticks - tp->t_starttime) > tcp_msl) { 1314 rstreason = BANDLIM_UNLIMITED; 1315 goto dropwithreset; 1316 } 1317 if (CC_GT(to.to_cc, tp->cc_recv)) { 1318 tp = tcp_close(tp); 1319 goto findpcb; 1320 } 1321 else 1322 goto drop; 1323 } 1324 break; /* continue normal processing */ 1325 } 1326 1327 /* 1328 * States other than LISTEN or SYN_SENT. 1329 * First check the RST flag and sequence number since reset segments 1330 * are exempt from the timestamp and connection count tests. This 1331 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 1332 * below which allowed reset segments in half the sequence space 1333 * to fall though and be processed (which gives forged reset 1334 * segments with a random sequence number a 50 percent chance of 1335 * killing a connection). 1336 * Then check timestamp, if present. 1337 * Then check the connection count, if present. 1338 * Then check that at least some bytes of segment are within 1339 * receive window. If segment begins before rcv_nxt, 1340 * drop leading data (and SYN); if nothing left, just ack. 
1341 * 1342 * 1343 * If the RST bit is set, check the sequence number to see 1344 * if this is a valid reset segment. 1345 * RFC 793 page 37: 1346 * In all states except SYN-SENT, all reset (RST) segments 1347 * are validated by checking their SEQ-fields. A reset is 1348 * valid if its sequence number is in the window. 1349 * Note: this does not take into account delayed ACKs, so 1350 * we should test against last_ack_sent instead of rcv_nxt. 1351 * The sequence number in the reset segment is normally an 1352 * echo of our outgoing acknowledgement numbers, but some hosts 1353 * send a reset with the sequence number at the rightmost edge 1354 * of our receive window, and we have to handle this case. 1355 * If we have multiple segments in flight, the initial reset 1356 * segment sequence numbers will be to the left of last_ack_sent, 1357 * but they will eventually catch up. 1358 * In any case, it never made sense to trim reset segments to 1359 * fit the receive window since RFC 1122 says: 1360 * 4.2.2.12 RST Segment: RFC-793 Section 3.4 1361 * 1362 * A TCP SHOULD allow a received RST segment to include data. 1363 * 1364 * DISCUSSION 1365 * It has been suggested that a RST segment could contain 1366 * ASCII text that encoded and explained the cause of the 1367 * RST. No standard has yet been established for such 1368 * data. 1369 * 1370 * If the reset segment passes the sequence number test examine 1371 * the state: 1372 * SYN_RECEIVED STATE: 1373 * If passive open, return to LISTEN state. 1374 * If active open, inform user that connection was refused. 1375 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: 1376 * Inform user that connection was reset, and close tcb. 1377 * CLOSING, LAST_ACK STATES: 1378 * Close the tcb. 1379 * TIME_WAIT STATE: 1380 * Drop the segment - see Stevens, vol. 2, p. 964 and 1381 * RFC 1337. 
1382 */ 1383 if (thflags & TH_RST) { 1384 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 1385 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1386 switch (tp->t_state) { 1387 1388 case TCPS_SYN_RECEIVED: 1389 so->so_error = ECONNREFUSED; 1390 goto close; 1391 1392 case TCPS_ESTABLISHED: 1393 case TCPS_FIN_WAIT_1: 1394 case TCPS_FIN_WAIT_2: 1395 case TCPS_CLOSE_WAIT: 1396 so->so_error = ECONNRESET; 1397 close: 1398 tp->t_state = TCPS_CLOSED; 1399 tcpstat.tcps_drops++; 1400 tp = tcp_close(tp); 1401 break; 1402 1403 case TCPS_CLOSING: 1404 case TCPS_LAST_ACK: 1405 tp = tcp_close(tp); 1406 break; 1407 1408 case TCPS_TIME_WAIT: 1409 break; 1410 } 1411 } 1412 goto drop; 1413 } 1414 1415 /* 1416 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1417 * and it's less than ts_recent, drop it. 1418 */ 1419 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 1420 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 1421 1422 /* Check to see if ts_recent is over 24 days old. */ 1423 if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1424 /* 1425 * Invalidate ts_recent. If this segment updates 1426 * ts_recent, the age will be reset later and ts_recent 1427 * will get a valid value. If it does not, setting 1428 * ts_recent to zero will at least satisfy the 1429 * requirement that zero be placed in the timestamp 1430 * echo reply when ts_recent isn't valid. The 1431 * age isn't reset until we get a valid ts_recent 1432 * because we don't want out-of-order segments to be 1433 * dropped when ts_recent is old. 1434 */ 1435 tp->ts_recent = 0; 1436 } else { 1437 tcpstat.tcps_rcvduppack++; 1438 tcpstat.tcps_rcvdupbyte += tlen; 1439 tcpstat.tcps_pawsdrop++; 1440 goto dropafterack; 1441 } 1442 } 1443 1444 /* 1445 * T/TCP mechanism 1446 * If T/TCP was negotiated and the segment doesn't have CC, 1447 * or if its CC is wrong then drop the segment. 1448 * RST segments do not have to comply with this. 
1449 */ 1450 if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && 1451 ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) 1452 goto dropafterack; 1453 1454 /* 1455 * In the SYN-RECEIVED state, validate that the packet belongs to 1456 * this connection before trimming the data to fit the receive 1457 * window. Check the sequence number versus IRS since we know 1458 * the sequence numbers haven't wrapped. This is a partial fix 1459 * for the "LAND" DoS attack. 1460 */ 1461 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 1462 rstreason = BANDLIM_RST_OPENPORT; 1463 goto dropwithreset; 1464 } 1465 1466 todrop = tp->rcv_nxt - th->th_seq; 1467 if (todrop > 0) { 1468 if (thflags & TH_SYN) { 1469 thflags &= ~TH_SYN; 1470 th->th_seq++; 1471 if (th->th_urp > 1) 1472 th->th_urp--; 1473 else 1474 thflags &= ~TH_URG; 1475 todrop--; 1476 } 1477 /* 1478 * Following if statement from Stevens, vol. 2, p. 960. 1479 */ 1480 if (todrop > tlen 1481 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1482 /* 1483 * Any valid FIN must be to the left of the window. 1484 * At this point the FIN must be a duplicate or out 1485 * of sequence; drop it. 1486 */ 1487 thflags &= ~TH_FIN; 1488 1489 /* 1490 * Send an ACK to resynchronize and drop any data. 1491 * But keep on processing for RST or ACK. 1492 */ 1493 tp->t_flags |= TF_ACKNOW; 1494 todrop = tlen; 1495 tcpstat.tcps_rcvduppack++; 1496 tcpstat.tcps_rcvdupbyte += todrop; 1497 } else { 1498 tcpstat.tcps_rcvpartduppack++; 1499 tcpstat.tcps_rcvpartdupbyte += todrop; 1500 } 1501 drop_hdrlen += todrop; /* drop from the top afterwards */ 1502 th->th_seq += todrop; 1503 tlen -= todrop; 1504 if (th->th_urp > todrop) 1505 th->th_urp -= todrop; 1506 else { 1507 thflags &= ~TH_URG; 1508 th->th_urp = 0; 1509 } 1510 } 1511 1512 /* 1513 * If new data are received on a connection after the 1514 * user processes are gone, then RST the other end. 
1515 */ 1516 if ((so->so_state & SS_NOFDREF) && 1517 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1518 tp = tcp_close(tp); 1519 tcpstat.tcps_rcvafterclose++; 1520 rstreason = BANDLIM_UNLIMITED; 1521 goto dropwithreset; 1522 } 1523 1524 /* 1525 * If segment ends after window, drop trailing data 1526 * (and PUSH and FIN); if nothing left, just ACK. 1527 */ 1528 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1529 if (todrop > 0) { 1530 tcpstat.tcps_rcvpackafterwin++; 1531 if (todrop >= tlen) { 1532 tcpstat.tcps_rcvbyteafterwin += tlen; 1533 /* 1534 * If a new connection request is received 1535 * while in TIME_WAIT, drop the old connection 1536 * and start over if the sequence numbers 1537 * are above the previous ones. 1538 */ 1539 if (thflags & TH_SYN && 1540 tp->t_state == TCPS_TIME_WAIT && 1541 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1542 tp = tcp_close(tp); 1543 goto findpcb; 1544 } 1545 /* 1546 * If window is closed can only take segments at 1547 * window edge, and have to drop data and PUSH from 1548 * incoming segments. Continue processing, but 1549 * remember to ack. Otherwise, drop segment 1550 * and ack. 1551 */ 1552 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1553 tp->t_flags |= TF_ACKNOW; 1554 tcpstat.tcps_rcvwinprobe++; 1555 } else 1556 goto dropafterack; 1557 } else 1558 tcpstat.tcps_rcvbyteafterwin += todrop; 1559 m_adj(m, -todrop); 1560 tlen -= todrop; 1561 thflags &= ~(TH_PUSH|TH_FIN); 1562 } 1563 1564 /* 1565 * If last ACK falls within this segment's sequence numbers, 1566 * record its timestamp. 1567 * NOTE that the test is modified according to the latest 1568 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1569 */ 1570 if ((to.to_flags & TOF_TS) != 0 && 1571 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1572 tp->ts_recent_age = ticks; 1573 tp->ts_recent = to.to_tsval; 1574 } 1575 1576 /* 1577 * If a SYN is in the window, then this is an 1578 * error and we send an RST and drop the connection. 
1579 */ 1580 if (thflags & TH_SYN) { 1581 tp = tcp_drop(tp, ECONNRESET); 1582 rstreason = BANDLIM_UNLIMITED; 1583 goto dropwithreset; 1584 } 1585 1586 /* 1587 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 1588 * flag is on (half-synchronized state), then queue data for 1589 * later processing; else drop segment and return. 1590 */ 1591 if ((thflags & TH_ACK) == 0) { 1592 if (tp->t_state == TCPS_SYN_RECEIVED || 1593 (tp->t_flags & TF_NEEDSYN)) 1594 goto step6; 1595 else 1596 goto drop; 1597 } 1598 1599 /* 1600 * Ack processing. 1601 */ 1602 switch (tp->t_state) { 1603 1604 /* 1605 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1606 * ESTABLISHED state and continue processing. 1607 * The ACK was checked above. 1608 */ 1609 case TCPS_SYN_RECEIVED: 1610 1611 tcpstat.tcps_connects++; 1612 soisconnected(so); 1613 /* Do window scaling? */ 1614 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1615 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1616 tp->snd_scale = tp->requested_s_scale; 1617 tp->rcv_scale = tp->request_r_scale; 1618 } 1619 /* 1620 * Upon successful completion of 3-way handshake, 1621 * update cache.CC if it was undefined, pass any queued 1622 * data to the user, and advance state appropriately. 1623 */ 1624 if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL && 1625 taop->tao_cc == 0) 1626 taop->tao_cc = tp->cc_recv; 1627 1628 /* 1629 * Make transitions: 1630 * SYN-RECEIVED -> ESTABLISHED 1631 * SYN-RECEIVED* -> FIN-WAIT-1 1632 */ 1633 tp->t_starttime = ticks; 1634 if (tp->t_flags & TF_NEEDFIN) { 1635 tp->t_state = TCPS_FIN_WAIT_1; 1636 tp->t_flags &= ~TF_NEEDFIN; 1637 } else { 1638 tp->t_state = TCPS_ESTABLISHED; 1639 callout_reset(tp->tt_keep, tcp_keepidle, 1640 tcp_timer_keep, tp); 1641 } 1642 /* 1643 * If segment contains data or ACK, will call tcp_reass() 1644 * later; if not, do so now to pass queued data to user. 
1645 */ 1646 if (tlen == 0 && (thflags & TH_FIN) == 0) 1647 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 1648 (struct mbuf *)0); 1649 tp->snd_wl1 = th->th_seq - 1; 1650 /* fall into ... */ 1651 1652 /* 1653 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1654 * ACKs. If the ack is in the range 1655 * tp->snd_una < th->th_ack <= tp->snd_max 1656 * then advance tp->snd_una to th->th_ack and drop 1657 * data from the retransmission queue. If this ACK reflects 1658 * more up to date window information we update our window information. 1659 */ 1660 case TCPS_ESTABLISHED: 1661 case TCPS_FIN_WAIT_1: 1662 case TCPS_FIN_WAIT_2: 1663 case TCPS_CLOSE_WAIT: 1664 case TCPS_CLOSING: 1665 case TCPS_LAST_ACK: 1666 case TCPS_TIME_WAIT: 1667 1668 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1669 if (tlen == 0 && tiwin == tp->snd_wnd) { 1670 tcpstat.tcps_rcvdupack++; 1671 /* 1672 * If we have outstanding data (other than 1673 * a window probe), this is a completely 1674 * duplicate ack (ie, window info didn't 1675 * change), the ack is the biggest we've 1676 * seen and we've seen exactly our rexmt 1677 * threshhold of them, assume a packet 1678 * has been dropped and retransmit it. 1679 * Kludge snd_nxt & the congestion 1680 * window so we send only this one 1681 * packet. 1682 * 1683 * We know we're losing at the current 1684 * window size so do congestion avoidance 1685 * (set ssthresh to half the current window 1686 * and pull our congestion window back to 1687 * the new ssthresh). 1688 * 1689 * Dup acks mean that packets have left the 1690 * network (they're now cached at the receiver) 1691 * so bump cwnd by the amount in the receiver 1692 * to keep a constant cwnd packets in the 1693 * network. 
1694 */ 1695 if (!callout_active(tp->tt_rexmt) || 1696 th->th_ack != tp->snd_una) 1697 tp->t_dupacks = 0; 1698 else if (++tp->t_dupacks == tcprexmtthresh) { 1699 tcp_seq onxt = tp->snd_nxt; 1700 u_int win = 1701 min(tp->snd_wnd, tp->snd_cwnd) / 2 / 1702 tp->t_maxseg; 1703 if (tcp_do_newreno && SEQ_LT(th->th_ack, 1704 tp->snd_recover)) { 1705 /* False retransmit, should not 1706 * cut window 1707 */ 1708 tp->snd_cwnd += tp->t_maxseg; 1709 tp->t_dupacks = 0; 1710 (void) tcp_output(tp); 1711 goto drop; 1712 } 1713 if (win < 2) 1714 win = 2; 1715 tp->snd_ssthresh = win * tp->t_maxseg; 1716 tp->snd_recover = tp->snd_max; 1717 callout_stop(tp->tt_rexmt); 1718 tp->t_rtttime = 0; 1719 tp->snd_nxt = th->th_ack; 1720 tp->snd_cwnd = tp->t_maxseg; 1721 (void) tcp_output(tp); 1722 tp->snd_cwnd = tp->snd_ssthresh + 1723 tp->t_maxseg * tp->t_dupacks; 1724 if (SEQ_GT(onxt, tp->snd_nxt)) 1725 tp->snd_nxt = onxt; 1726 goto drop; 1727 } else if (tp->t_dupacks > tcprexmtthresh) { 1728 tp->snd_cwnd += tp->t_maxseg; 1729 (void) tcp_output(tp); 1730 goto drop; 1731 } 1732 } else 1733 tp->t_dupacks = 0; 1734 break; 1735 } 1736 /* 1737 * If the congestion window was inflated to account 1738 * for the other side's cached packets, retract it. 1739 */ 1740 if (tcp_do_newreno == 0) { 1741 if (tp->t_dupacks >= tcprexmtthresh && 1742 tp->snd_cwnd > tp->snd_ssthresh) 1743 tp->snd_cwnd = tp->snd_ssthresh; 1744 tp->t_dupacks = 0; 1745 } else if (tp->t_dupacks >= tcprexmtthresh && 1746 !tcp_newreno(tp, th)) { 1747 /* 1748 * Window inflation should have left us with approx. 1749 * snd_ssthresh outstanding data. But in case we 1750 * would be inclined to send a burst, better to do 1751 * it via the slow start mechanism. 
1752 */ 1753 if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) 1754 tp->snd_cwnd = 1755 tp->snd_max - th->th_ack + tp->t_maxseg; 1756 else 1757 tp->snd_cwnd = tp->snd_ssthresh; 1758 tp->t_dupacks = 0; 1759 } 1760 if (tp->t_dupacks < tcprexmtthresh) 1761 tp->t_dupacks = 0; 1762 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1763 tcpstat.tcps_rcvacktoomuch++; 1764 goto dropafterack; 1765 } 1766 /* 1767 * If we reach this point, ACK is not a duplicate, 1768 * i.e., it ACKs something we sent. 1769 */ 1770 if (tp->t_flags & TF_NEEDSYN) { 1771 /* 1772 * T/TCP: Connection was half-synchronized, and our 1773 * SYN has been ACK'd (so connection is now fully 1774 * synchronized). Go to non-starred state, 1775 * increment snd_una for ACK of SYN, and check if 1776 * we can do window scaling. 1777 */ 1778 tp->t_flags &= ~TF_NEEDSYN; 1779 tp->snd_una++; 1780 /* Do window scaling? */ 1781 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1782 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1783 tp->snd_scale = tp->requested_s_scale; 1784 tp->rcv_scale = tp->request_r_scale; 1785 } 1786 } 1787 1788process_ACK: 1789 acked = th->th_ack - tp->snd_una; 1790 tcpstat.tcps_rcvackpack++; 1791 tcpstat.tcps_rcvackbyte += acked; 1792 1793 /* 1794 * If we just performed our first retransmit, and the ACK 1795 * arrives within our recovery window, then it was a mistake 1796 * to do the retransmit in the first place. Recover our 1797 * original cwnd and ssthresh, and proceed to transmit where 1798 * we left off. 1799 */ 1800 if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { 1801 ++tcpstat.tcps_sndrexmitbad; 1802 tp->snd_cwnd = tp->snd_cwnd_prev; 1803 tp->snd_ssthresh = tp->snd_ssthresh_prev; 1804 tp->snd_nxt = tp->snd_max; 1805 tp->t_badrxtwin = 0; /* XXX probably not required */ 1806 } 1807 1808 /* 1809 * If we have a timestamp reply, update smoothed 1810 * round trip time. 
If no timestamp is present but 1811 * transmit timer is running and timed sequence 1812 * number was acked, update smoothed round trip time. 1813 * Since we now have an rtt measurement, cancel the 1814 * timer backoff (cf., Phil Karn's retransmit alg.). 1815 * Recompute the initial retransmit timer. 1816 */ 1817 if (to.to_flags & TOF_TS) 1818 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); 1819 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1820 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 1821 1822 /* 1823 * If all outstanding data is acked, stop retransmit 1824 * timer and remember to restart (more output or persist). 1825 * If there is more data to be acked, restart retransmit 1826 * timer, using current (possibly backed-off) value. 1827 */ 1828 if (th->th_ack == tp->snd_max) { 1829 callout_stop(tp->tt_rexmt); 1830 needoutput = 1; 1831 } else if (!callout_active(tp->tt_persist)) 1832 callout_reset(tp->tt_rexmt, tp->t_rxtcur, 1833 tcp_timer_rexmt, tp); 1834 1835 /* 1836 * If no data (only SYN) was ACK'd, 1837 * skip rest of ACK processing. 1838 */ 1839 if (acked == 0) 1840 goto step6; 1841 1842 /* 1843 * When new data is acked, open the congestion window. 1844 * If the window gives us less than ssthresh packets 1845 * in flight, open exponentially (maxseg per packet). 1846 * Otherwise open linearly: maxseg per window 1847 * (maxseg^2 / cwnd per packet). 1848 */ 1849 { 1850 register u_int cw = tp->snd_cwnd; 1851 register u_int incr = tp->t_maxseg; 1852 1853 if (cw > tp->snd_ssthresh) 1854 incr = incr * incr / cw; 1855 /* 1856 * If t_dupacks != 0 here, it indicates that we are still 1857 * in NewReno fast recovery mode, so we leave the congestion 1858 * window alone. 
1859 */ 1860 if (tcp_do_newreno == 0 || tp->t_dupacks == 0) 1861 tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale); 1862 } 1863 if (acked > so->so_snd.sb_cc) { 1864 tp->snd_wnd -= so->so_snd.sb_cc; 1865 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1866 ourfinisacked = 1; 1867 } else { 1868 sbdrop(&so->so_snd, acked); 1869 tp->snd_wnd -= acked; 1870 ourfinisacked = 0; 1871 } 1872 sowwakeup(so); 1873 tp->snd_una = th->th_ack; 1874 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1875 tp->snd_nxt = tp->snd_una; 1876 1877 switch (tp->t_state) { 1878 1879 /* 1880 * In FIN_WAIT_1 STATE in addition to the processing 1881 * for the ESTABLISHED state if our FIN is now acknowledged 1882 * then enter FIN_WAIT_2. 1883 */ 1884 case TCPS_FIN_WAIT_1: 1885 if (ourfinisacked) { 1886 /* 1887 * If we can't receive any more 1888 * data, then closing user can proceed. 1889 * Starting the timer is contrary to the 1890 * specification, but if we don't get a FIN 1891 * we'll hang forever. 1892 */ 1893 if (so->so_state & SS_CANTRCVMORE) { 1894 soisdisconnected(so); 1895 callout_reset(tp->tt_2msl, tcp_maxidle, 1896 tcp_timer_2msl, tp); 1897 } 1898 tp->t_state = TCPS_FIN_WAIT_2; 1899 } 1900 break; 1901 1902 /* 1903 * In CLOSING STATE in addition to the processing for 1904 * the ESTABLISHED state if the ACK acknowledges our FIN 1905 * then enter the TIME-WAIT state, otherwise ignore 1906 * the segment. 1907 */ 1908 case TCPS_CLOSING: 1909 if (ourfinisacked) { 1910 tp->t_state = TCPS_TIME_WAIT; 1911 tcp_canceltimers(tp); 1912 /* Shorten TIME_WAIT [RFC-1644, p.28] */ 1913 if (tp->cc_recv != 0 && 1914 (ticks - tp->t_starttime) < tcp_msl) 1915 callout_reset(tp->tt_2msl, 1916 tp->t_rxtcur * 1917 TCPTV_TWTRUNC, 1918 tcp_timer_2msl, tp); 1919 else 1920 callout_reset(tp->tt_2msl, 2 * tcp_msl, 1921 tcp_timer_2msl, tp); 1922 soisdisconnected(so); 1923 } 1924 break; 1925 1926 /* 1927 * In LAST_ACK, we may still be waiting for data to drain 1928 * and/or to be acked, as well as for the ack of our FIN. 
1929 * If our FIN is now acknowledged, delete the TCB, 1930 * enter the closed state and return. 1931 */ 1932 case TCPS_LAST_ACK: 1933 if (ourfinisacked) { 1934 tp = tcp_close(tp); 1935 goto drop; 1936 } 1937 break; 1938 1939 /* 1940 * In TIME_WAIT state the only thing that should arrive 1941 * is a retransmission of the remote FIN. Acknowledge 1942 * it and restart the finack timer. 1943 */ 1944 case TCPS_TIME_WAIT: 1945 callout_reset(tp->tt_2msl, 2 * tcp_msl, 1946 tcp_timer_2msl, tp); 1947 goto dropafterack; 1948 } 1949 } 1950 1951step6: 1952 /* 1953 * Update window information. 1954 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1955 */ 1956 if ((thflags & TH_ACK) && 1957 (SEQ_LT(tp->snd_wl1, th->th_seq) || 1958 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 1959 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1960 /* keep track of pure window updates */ 1961 if (tlen == 0 && 1962 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1963 tcpstat.tcps_rcvwinupd++; 1964 tp->snd_wnd = tiwin; 1965 tp->snd_wl1 = th->th_seq; 1966 tp->snd_wl2 = th->th_ack; 1967 if (tp->snd_wnd > tp->max_sndwnd) 1968 tp->max_sndwnd = tp->snd_wnd; 1969 needoutput = 1; 1970 } 1971 1972 /* 1973 * Process segments with URG. 1974 */ 1975 if ((thflags & TH_URG) && th->th_urp && 1976 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1977 /* 1978 * This is a kludge, but if we receive and accept 1979 * random urgent pointers, we'll crash in 1980 * soreceive. It's hard to imagine someone 1981 * actually wanting to send this much urgent data. 1982 */ 1983 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1984 th->th_urp = 0; /* XXX */ 1985 thflags &= ~TH_URG; /* XXX */ 1986 goto dodata; /* XXX */ 1987 } 1988 /* 1989 * If this segment advances the known urgent pointer, 1990 * then mark the data stream. This should not happen 1991 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1992 * a FIN has been received from the remote side. 
1993 * In these states we ignore the URG. 1994 * 1995 * According to RFC961 (Assigned Protocols), 1996 * the urgent pointer points to the last octet 1997 * of urgent data. We continue, however, 1998 * to consider it to indicate the first octet 1999 * of data past the urgent section as the original 2000 * spec states (in one of two places). 2001 */ 2002 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2003 tp->rcv_up = th->th_seq + th->th_urp; 2004 so->so_oobmark = so->so_rcv.sb_cc + 2005 (tp->rcv_up - tp->rcv_nxt) - 1; 2006 if (so->so_oobmark == 0) 2007 so->so_state |= SS_RCVATMARK; 2008 sohasoutofband(so); 2009 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2010 } 2011 /* 2012 * Remove out of band data so doesn't get presented to user. 2013 * This can happen independent of advancing the URG pointer, 2014 * but if two URG's are pending at once, some out-of-band 2015 * data may creep in... ick. 2016 */ 2017 if (th->th_urp <= (u_long)tlen 2018#ifdef SO_OOBINLINE 2019 && (so->so_options & SO_OOBINLINE) == 0 2020#endif 2021 ) 2022 tcp_pulloutofband(so, th, m, 2023 drop_hdrlen); /* hdr drop is delayed */ 2024 } else 2025 /* 2026 * If no out of band data is expected, 2027 * pull receive urgent pointer along 2028 * with the receive window. 2029 */ 2030 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2031 tp->rcv_up = tp->rcv_nxt; 2032dodata: /* XXX */ 2033 KASSERT(headlocked, ("headlocked")); 2034 INP_INFO_WUNLOCK(&tcbinfo); 2035 headlocked = 0; 2036 /* 2037 * Process the segment text, merging it into the TCP sequencing queue, 2038 * and arranging for acknowledgment of receipt if necessary. 2039 * This process logically involves adjusting tp->rcv_wnd as data 2040 * is presented to the user (this happens in tcp_usrreq.c, 2041 * case PRU_RCVD). If a FIN has already been received on this 2042 * connection then we just ignore the text. 
2043 */ 2044 if ((tlen || (thflags&TH_FIN)) && 2045 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2046 m_adj(m, drop_hdrlen); /* delayed header drop */ 2047 /* 2048 * Insert segment which inludes th into reassembly queue of tcp with 2049 * control block tp. Return TH_FIN if reassembly now includes 2050 * a segment with FIN. This handle the common case inline (segment 2051 * is the next to be received on an established connection, and the 2052 * queue is empty), avoiding linkage into and removal from the queue 2053 * and repetition of various conversions. 2054 * Set DELACK for segments received in order, but ack immediately 2055 * when segments are out of order (so fast retransmit can work). 2056 */ 2057 if (th->th_seq == tp->rcv_nxt && 2058 LIST_EMPTY(&tp->t_segq) && 2059 TCPS_HAVEESTABLISHED(tp->t_state)) { 2060 if (DELAY_ACK(tp)) 2061 callout_reset(tp->tt_delack, tcp_delacktime, 2062 tcp_timer_delack, tp); 2063 else 2064 tp->t_flags |= TF_ACKNOW; 2065 tp->rcv_nxt += tlen; 2066 thflags = th->th_flags & TH_FIN; 2067 tcpstat.tcps_rcvpack++; 2068 tcpstat.tcps_rcvbyte += tlen; 2069 ND6_HINT(tp); 2070 sbappend(&so->so_rcv, m); 2071 sorwakeup(so); 2072 } else { 2073 thflags = tcp_reass(tp, th, &tlen, m); 2074 tp->t_flags |= TF_ACKNOW; 2075 } 2076 2077 /* 2078 * Note the amount of data that peer has sent into 2079 * our window, in order to estimate the sender's 2080 * buffer size. 2081 */ 2082 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2083 } else { 2084 m_freem(m); 2085 thflags &= ~TH_FIN; 2086 } 2087 2088 /* 2089 * If FIN is received ACK the FIN and let the user know 2090 * that the connection is closing. 2091 */ 2092 if (thflags & TH_FIN) { 2093 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2094 socantrcvmore(so); 2095 /* 2096 * If connection is half-synchronized 2097 * (ie NEEDSYN flag on) then delay ACK, 2098 * so it may be piggybacked when SYN is sent. 2099 * Otherwise, since we received a FIN then no 2100 * more input can be expected, send ACK now. 
2101 */ 2102 if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) 2103 callout_reset(tp->tt_delack, tcp_delacktime, 2104 tcp_timer_delack, tp); 2105 else 2106 tp->t_flags |= TF_ACKNOW; 2107 tp->rcv_nxt++; 2108 } 2109 switch (tp->t_state) { 2110 2111 /* 2112 * In SYN_RECEIVED and ESTABLISHED STATES 2113 * enter the CLOSE_WAIT state. 2114 */ 2115 case TCPS_SYN_RECEIVED: 2116 tp->t_starttime = ticks; 2117 /*FALLTHROUGH*/ 2118 case TCPS_ESTABLISHED: 2119 tp->t_state = TCPS_CLOSE_WAIT; 2120 break; 2121 2122 /* 2123 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2124 * enter the CLOSING state. 2125 */ 2126 case TCPS_FIN_WAIT_1: 2127 tp->t_state = TCPS_CLOSING; 2128 break; 2129 2130 /* 2131 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2132 * starting the time-wait timer, turning off the other 2133 * standard timers. 2134 */ 2135 case TCPS_FIN_WAIT_2: 2136 tp->t_state = TCPS_TIME_WAIT; 2137 tcp_canceltimers(tp); 2138 /* Shorten TIME_WAIT [RFC-1644, p.28] */ 2139 if (tp->cc_recv != 0 && 2140 (ticks - tp->t_starttime) < tcp_msl) { 2141 callout_reset(tp->tt_2msl, 2142 tp->t_rxtcur * TCPTV_TWTRUNC, 2143 tcp_timer_2msl, tp); 2144 /* For transaction client, force ACK now. */ 2145 tp->t_flags |= TF_ACKNOW; 2146 } 2147 else 2148 callout_reset(tp->tt_2msl, 2 * tcp_msl, 2149 tcp_timer_2msl, tp); 2150 soisdisconnected(so); 2151 break; 2152 2153 /* 2154 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2155 */ 2156 case TCPS_TIME_WAIT: 2157 callout_reset(tp->tt_2msl, 2 * tcp_msl, 2158 tcp_timer_2msl, tp); 2159 break; 2160 } 2161 } 2162#ifdef TCPDEBUG 2163 if (so->so_options & SO_DEBUG) 2164 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 2165 &tcp_savetcp, 0); 2166#endif 2167 2168 /* 2169 * Return any desired output. 
2170 */ 2171 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2172 (void) tcp_output(tp); 2173 INP_UNLOCK(inp); 2174 return; 2175 2176dropafterack: 2177 /* 2178 * Generate an ACK dropping incoming segment if it occupies 2179 * sequence space, where the ACK reflects our state. 2180 * 2181 * We can now skip the test for the RST flag since all 2182 * paths to this code happen after packets containing 2183 * RST have been dropped. 2184 * 2185 * In the SYN-RECEIVED state, don't send an ACK unless the 2186 * segment we received passes the SYN-RECEIVED ACK test. 2187 * If it fails send a RST. This breaks the loop in the 2188 * "LAND" DoS attack, and also prevents an ACK storm 2189 * between two listening ports that have been sent forged 2190 * SYN segments, each with the source address of the other. 2191 */ 2192 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 2193 (SEQ_GT(tp->snd_una, th->th_ack) || 2194 SEQ_GT(th->th_ack, tp->snd_max)) ) { 2195 rstreason = BANDLIM_RST_OPENPORT; 2196 goto dropwithreset; 2197 } 2198#ifdef TCPDEBUG 2199 if (so->so_options & SO_DEBUG) 2200 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2201 &tcp_savetcp, 0); 2202#endif 2203 if (headlocked) 2204 INP_INFO_WUNLOCK(&tcbinfo); 2205 m_freem(m); 2206 tp->t_flags |= TF_ACKNOW; 2207 (void) tcp_output(tp); 2208 INP_UNLOCK(inp); 2209 return; 2210 2211dropwithreset: 2212 /* 2213 * Generate a RST, dropping incoming segment. 2214 * Make ACK acceptable to originator of segment. 2215 * Don't bother to respond if destination was broadcast/multicast. 
2216 */ 2217 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 2218 goto drop; 2219#ifdef INET6 2220 if (isipv6) { 2221 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 2222 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 2223 goto drop; 2224 } else 2225#endif /* INET6 */ 2226 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 2227 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 2228 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 2229 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 2230 goto drop; 2231 /* IPv6 anycast check is done at tcp6_input() */ 2232 2233 /* 2234 * Perform bandwidth limiting. 2235 */ 2236 if (badport_bandlim(rstreason) < 0) 2237 goto drop; 2238 2239#ifdef TCPDEBUG 2240 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 2241 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2242 &tcp_savetcp, 0); 2243#endif 2244 2245 if (tp) 2246 INP_UNLOCK(inp); 2247 2248 if (thflags & TH_ACK) 2249 /* mtod() below is safe as long as hdr dropping is delayed */ 2250 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, 2251 TH_RST); 2252 else { 2253 if (thflags & TH_SYN) 2254 tlen++; 2255 /* mtod() below is safe as long as hdr dropping is delayed */ 2256 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 2257 (tcp_seq)0, TH_RST|TH_ACK); 2258 } 2259 if (headlocked) 2260 INP_INFO_WUNLOCK(&tcbinfo); 2261 return; 2262 2263drop: 2264 /* 2265 * Drop space held by incoming segment and return. 2266 */ 2267#ifdef TCPDEBUG 2268 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 2269 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2270 &tcp_savetcp, 0); 2271#endif 2272 if (tp) 2273 INP_UNLOCK(inp); 2274 m_freem(m); 2275 if (headlocked) 2276 INP_INFO_WUNLOCK(&tcbinfo); 2277 return; 2278} 2279 2280/* 2281 * Parse TCP options and place in tcpopt. 
2282 */ 2283static void 2284tcp_dooptions(to, cp, cnt, is_syn) 2285 struct tcpopt *to; 2286 u_char *cp; 2287 int cnt; 2288{ 2289 int opt, optlen; 2290 2291 to->to_flags = 0; 2292 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2293 opt = cp[0]; 2294 if (opt == TCPOPT_EOL) 2295 break; 2296 if (opt == TCPOPT_NOP) 2297 optlen = 1; 2298 else { 2299 if (cnt < 2) 2300 break; 2301 optlen = cp[1]; 2302 if (optlen < 2 || optlen > cnt) 2303 break; 2304 } 2305 switch (opt) { 2306 case TCPOPT_MAXSEG: 2307 if (optlen != TCPOLEN_MAXSEG) 2308 continue; 2309 if (!is_syn) 2310 continue; 2311 to->to_flags |= TOF_MSS; 2312 bcopy((char *)cp + 2, 2313 (char *)&to->to_mss, sizeof(to->to_mss)); 2314 to->to_mss = ntohs(to->to_mss); 2315 break; 2316 case TCPOPT_WINDOW: 2317 if (optlen != TCPOLEN_WINDOW) 2318 continue; 2319 if (! is_syn) 2320 continue; 2321 to->to_flags |= TOF_SCALE; 2322 to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2323 break; 2324 case TCPOPT_TIMESTAMP: 2325 if (optlen != TCPOLEN_TIMESTAMP) 2326 continue; 2327 to->to_flags |= TOF_TS; 2328 bcopy((char *)cp + 2, 2329 (char *)&to->to_tsval, sizeof(to->to_tsval)); 2330 to->to_tsval = ntohl(to->to_tsval); 2331 bcopy((char *)cp + 6, 2332 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 2333 to->to_tsecr = ntohl(to->to_tsecr); 2334 break; 2335 case TCPOPT_CC: 2336 if (optlen != TCPOLEN_CC) 2337 continue; 2338 to->to_flags |= TOF_CC; 2339 bcopy((char *)cp + 2, 2340 (char *)&to->to_cc, sizeof(to->to_cc)); 2341 to->to_cc = ntohl(to->to_cc); 2342 break; 2343 case TCPOPT_CCNEW: 2344 if (optlen != TCPOLEN_CC) 2345 continue; 2346 if (!is_syn) 2347 continue; 2348 to->to_flags |= TOF_CCNEW; 2349 bcopy((char *)cp + 2, 2350 (char *)&to->to_cc, sizeof(to->to_cc)); 2351 to->to_cc = ntohl(to->to_cc); 2352 break; 2353 case TCPOPT_CCECHO: 2354 if (optlen != TCPOLEN_CC) 2355 continue; 2356 if (!is_syn) 2357 continue; 2358 to->to_flags |= TOF_CCECHO; 2359 bcopy((char *)cp + 2, 2360 (char *)&to->to_ccecho, sizeof(to->to_ccecho)); 2361 
to->to_ccecho = ntohl(to->to_ccecho); 2362 break; 2363 default: 2364 continue; 2365 } 2366 } 2367} 2368 2369/* 2370 * Pull out of band byte out of a segment so 2371 * it doesn't appear in the user's data queue. 2372 * It is still reflected in the segment length for 2373 * sequencing purposes. 2374 */ 2375static void 2376tcp_pulloutofband(so, th, m, off) 2377 struct socket *so; 2378 struct tcphdr *th; 2379 register struct mbuf *m; 2380 int off; /* delayed to be droped hdrlen */ 2381{ 2382 int cnt = off + th->th_urp - 1; 2383 2384 while (cnt >= 0) { 2385 if (m->m_len > cnt) { 2386 char *cp = mtod(m, caddr_t) + cnt; 2387 struct tcpcb *tp = sototcpcb(so); 2388 2389 tp->t_iobc = *cp; 2390 tp->t_oobflags |= TCPOOB_HAVEDATA; 2391 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2392 m->m_len--; 2393 if (m->m_flags & M_PKTHDR) 2394 m->m_pkthdr.len--; 2395 return; 2396 } 2397 cnt -= m->m_len; 2398 m = m->m_next; 2399 if (m == 0) 2400 break; 2401 } 2402 panic("tcp_pulloutofband"); 2403} 2404 2405/* 2406 * Collect new round-trip time estimate 2407 * and update averages and current timeout. 2408 */ 2409static void 2410tcp_xmit_timer(tp, rtt) 2411 register struct tcpcb *tp; 2412 int rtt; 2413{ 2414 register int delta; 2415 2416 tcpstat.tcps_rttupdated++; 2417 tp->t_rttupdated++; 2418 if (tp->t_srtt != 0) { 2419 /* 2420 * srtt is stored as fixed point with 5 bits after the 2421 * binary point (i.e., scaled by 8). The following magic 2422 * is equivalent to the smoothing algorithm in rfc793 with 2423 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2424 * point). Adjust rtt to origin 0. 2425 */ 2426 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 2427 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 2428 2429 if ((tp->t_srtt += delta) <= 0) 2430 tp->t_srtt = 1; 2431 2432 /* 2433 * We accumulate a smoothed rtt variance (actually, a 2434 * smoothed mean difference), then set the retransmit 2435 * timer to smoothed rtt + 4 times the smoothed variance. 
2436 * rttvar is stored as fixed point with 4 bits after the 2437 * binary point (scaled by 16). The following is 2438 * equivalent to rfc793 smoothing with an alpha of .75 2439 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2440 * rfc793's wired-in beta. 2441 */ 2442 if (delta < 0) 2443 delta = -delta; 2444 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 2445 if ((tp->t_rttvar += delta) <= 0) 2446 tp->t_rttvar = 1; 2447 } else { 2448 /* 2449 * No rtt measurement yet - use the unsmoothed rtt. 2450 * Set the variance to half the rtt (so our first 2451 * retransmit happens at 3*rtt). 2452 */ 2453 tp->t_srtt = rtt << TCP_RTT_SHIFT; 2454 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 2455 } 2456 tp->t_rtttime = 0; 2457 tp->t_rxtshift = 0; 2458 2459 /* 2460 * the retransmit should happen at rtt + 4 * rttvar. 2461 * Because of the way we do the smoothing, srtt and rttvar 2462 * will each average +1/2 tick of bias. When we compute 2463 * the retransmit timer, we want 1/2 tick of rounding and 2464 * 1 extra tick because of +-1/2 tick uncertainty in the 2465 * firing of the timer. The bias will give us exactly the 2466 * 1.5 tick we need. But, because the bias is 2467 * statistical, we have to test that we don't drop below 2468 * the minimum feasible timer (which is 2 ticks). 2469 */ 2470 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 2471 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 2472 2473 /* 2474 * We received an ack for a packet that wasn't retransmitted; 2475 * it is probably safe to discard any error indications we've 2476 * received recently. This isn't quite right, but close enough 2477 * for now (a route might have failed after we sent a segment, 2478 * and the return path might not be symmetrical). 2479 */ 2480 tp->t_softerror = 0; 2481} 2482 2483/* 2484 * Determine a reasonable value for maxseg size. 2485 * If the route is known, check route for mtu. 
2486 * If none, use an mss that can be handled on the outgoing 2487 * interface without forcing IP to fragment; if bigger than 2488 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2489 * to utilize large mbufs. If no route is found, route has no mtu, 2490 * or the destination isn't local, use a default, hopefully conservative 2491 * size (usually 512 or the default IP max size, but no more than the mtu 2492 * of the interface), as we can't discover anything about intervening 2493 * gateways or networks. We also initialize the congestion/slow start 2494 * window to be a single segment if the destination isn't local. 2495 * While looking at the routing entry, we also initialize other path-dependent 2496 * parameters from pre-set or cached values in the routing entry. 2497 * 2498 * Also take into account the space needed for options that we 2499 * send regularly. Make maxseg shorter by that amount to assure 2500 * that we can send maxseg amount of data even when the options 2501 * are present. Store the upper limit of the length of options plus 2502 * data in maxopd. 2503 * 2504 * NOTE that this routine is only called when we process an incoming 2505 * segment, for outgoing segments only tcp_mssopt is called. 2506 * 2507 * In case of T/TCP, we call this routine during implicit connection 2508 * setup as well (offer = -1), to initialize maxseg from the cached 2509 * MSS of our peer. 2510 */ 2511void 2512tcp_mss(tp, offer) 2513 struct tcpcb *tp; 2514 int offer; 2515{ 2516 register struct rtentry *rt; 2517 struct ifnet *ifp; 2518 register int rtt, mss; 2519 u_long bufsize; 2520 struct inpcb *inp; 2521 struct socket *so; 2522 struct rmxp_tao *taop; 2523 int origoffer = offer; 2524#ifdef INET6 2525 int isipv6; 2526 int min_protoh; 2527#endif 2528 2529 inp = tp->t_inpcb; 2530#ifdef INET6 2531 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 2532 min_protoh = isipv6 ? 
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) 2533 : sizeof (struct tcpiphdr); 2534#else 2535#define min_protoh (sizeof (struct tcpiphdr)) 2536#endif 2537#ifdef INET6 2538 if (isipv6) 2539 rt = tcp_rtlookup6(&inp->inp_inc); 2540 else 2541#endif 2542 rt = tcp_rtlookup(&inp->inp_inc); 2543 if (rt == NULL) { 2544 tp->t_maxopd = tp->t_maxseg = 2545#ifdef INET6 2546 isipv6 ? tcp_v6mssdflt : 2547#endif /* INET6 */ 2548 tcp_mssdflt; 2549 return; 2550 } 2551 ifp = rt->rt_ifp; 2552 so = inp->inp_socket; 2553 2554 taop = rmx_taop(rt->rt_rmx); 2555 /* 2556 * Offer == -1 means that we didn't receive SYN yet, 2557 * use cached value in that case; 2558 */ 2559 if (offer == -1) 2560 offer = taop->tao_mssopt; 2561 /* 2562 * Offer == 0 means that there was no MSS on the SYN segment, 2563 * in this case we use tcp_mssdflt. 2564 */ 2565 if (offer == 0) 2566 offer = 2567#ifdef INET6 2568 isipv6 ? tcp_v6mssdflt : 2569#endif /* INET6 */ 2570 tcp_mssdflt; 2571 else 2572 /* 2573 * Sanity check: make sure that maxopd will be large 2574 * enough to allow some data on segments even is the 2575 * all the option space is used (40bytes). Otherwise 2576 * funny things may happen in tcp_output. 2577 */ 2578 offer = max(offer, 64); 2579 taop->tao_mssopt = offer; 2580 2581 /* 2582 * While we're here, check if there's an initial rtt 2583 * or rttvar. Convert from the route-table units 2584 * to scaled multiples of the slow timeout timer. 2585 */ 2586 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { 2587 /* 2588 * XXX the lock bit for RTT indicates that the value 2589 * is also a minimum value; this is subject to time. 
2590 */ 2591 if (rt->rt_rmx.rmx_locks & RTV_RTT) 2592 tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); 2593 tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); 2594 tcpstat.tcps_usedrtt++; 2595 if (rt->rt_rmx.rmx_rttvar) { 2596 tp->t_rttvar = rt->rt_rmx.rmx_rttvar / 2597 (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); 2598 tcpstat.tcps_usedrttvar++; 2599 } else { 2600 /* default variation is +- 1 rtt */ 2601 tp->t_rttvar = 2602 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; 2603 } 2604 TCPT_RANGESET(tp->t_rxtcur, 2605 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, 2606 tp->t_rttmin, TCPTV_REXMTMAX); 2607 } 2608 /* 2609 * if there's an mtu associated with the route, use it 2610 * else, use the link mtu. 2611 */ 2612 if (rt->rt_rmx.rmx_mtu) 2613 mss = rt->rt_rmx.rmx_mtu - min_protoh; 2614 else 2615 { 2616 mss = 2617#ifdef INET6 2618 (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu : 2619#endif 2620 ifp->if_mtu 2621#ifdef INET6 2622 ) 2623#endif 2624 - min_protoh; 2625#ifdef INET6 2626 if (isipv6) { 2627 if (!in6_localaddr(&inp->in6p_faddr)) 2628 mss = min(mss, tcp_v6mssdflt); 2629 } else 2630#endif 2631 if (!in_localaddr(inp->inp_faddr)) 2632 mss = min(mss, tcp_mssdflt); 2633 } 2634 mss = min(mss, offer); 2635 /* 2636 * maxopd stores the maximum length of data AND options 2637 * in a segment; maxseg is the amount of data in a normal 2638 * segment. We need to store this value (maxopd) apart 2639 * from maxseg, because now every segment carries options 2640 * and thus we normally have somewhat less data in segments. 2641 */ 2642 tp->t_maxopd = mss; 2643 2644 /* 2645 * In case of T/TCP, origoffer==-1 indicates, that no segments 2646 * were received yet. In this case we just guess, otherwise 2647 * we do the same as before T/TCP. 
2648 */ 2649 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2650 (origoffer == -1 || 2651 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) 2652 mss -= TCPOLEN_TSTAMP_APPA; 2653 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && 2654 (origoffer == -1 || 2655 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) 2656 mss -= TCPOLEN_CC_APPA; 2657 2658#if (MCLBYTES & (MCLBYTES - 1)) == 0 2659 if (mss > MCLBYTES) 2660 mss &= ~(MCLBYTES-1); 2661#else 2662 if (mss > MCLBYTES) 2663 mss = mss / MCLBYTES * MCLBYTES; 2664#endif 2665 /* 2666 * If there's a pipesize, change the socket buffer 2667 * to that size. Make the socket buffers an integral 2668 * number of mss units; if the mss is larger than 2669 * the socket buffer, decrease the mss. 2670 */ 2671#ifdef RTV_SPIPE 2672 if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) 2673#endif 2674 bufsize = so->so_snd.sb_hiwat; 2675 if (bufsize < mss) 2676 mss = bufsize; 2677 else { 2678 bufsize = roundup(bufsize, mss); 2679 if (bufsize > sb_max) 2680 bufsize = sb_max; 2681 if (bufsize > so->so_snd.sb_hiwat) 2682 (void)sbreserve(&so->so_snd, bufsize, so, NULL); 2683 } 2684 tp->t_maxseg = mss; 2685 2686#ifdef RTV_RPIPE 2687 if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) 2688#endif 2689 bufsize = so->so_rcv.sb_hiwat; 2690 if (bufsize > mss) { 2691 bufsize = roundup(bufsize, mss); 2692 if (bufsize > sb_max) 2693 bufsize = sb_max; 2694 if (bufsize > so->so_rcv.sb_hiwat) 2695 (void)sbreserve(&so->so_rcv, bufsize, so, NULL); 2696 } 2697 2698 /* 2699 * Set the slow-start flight size depending on whether this 2700 * is a local network or not. 
2701 */ 2702 if ( 2703#ifdef INET6 2704 (isipv6 && in6_localaddr(&inp->in6p_faddr)) || 2705 (!isipv6 && 2706#endif 2707 in_localaddr(inp->inp_faddr) 2708#ifdef INET6 2709 ) 2710#endif 2711 ) 2712 tp->snd_cwnd = mss * ss_fltsz_local; 2713 else 2714 tp->snd_cwnd = mss * ss_fltsz; 2715 2716 if (rt->rt_rmx.rmx_ssthresh) { 2717 /* 2718 * There's some sort of gateway or interface 2719 * buffer limit on the path. Use this to set 2720 * the slow start threshhold, but set the 2721 * threshold to no less than 2*mss. 2722 */ 2723 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); 2724 tcpstat.tcps_usedssthresh++; 2725 } 2726} 2727 2728/* 2729 * Determine the MSS option to send on an outgoing SYN. 2730 */ 2731int 2732tcp_mssopt(tp) 2733 struct tcpcb *tp; 2734{ 2735 struct rtentry *rt; 2736#ifdef INET6 2737 int isipv6; 2738 int min_protoh; 2739#endif 2740 2741#ifdef INET6 2742 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 2743 min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) 2744 : sizeof (struct tcpiphdr); 2745#else 2746#define min_protoh (sizeof (struct tcpiphdr)) 2747#endif 2748#ifdef INET6 2749 if (isipv6) 2750 rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc); 2751 else 2752#endif /* INET6 */ 2753 rt = tcp_rtlookup(&tp->t_inpcb->inp_inc); 2754 if (rt == NULL) 2755 return 2756#ifdef INET6 2757 isipv6 ? tcp_v6mssdflt : 2758#endif /* INET6 */ 2759 tcp_mssdflt; 2760 2761 return rt->rt_ifp->if_mtu - min_protoh; 2762} 2763 2764 2765/* 2766 * Checks for partial ack. If partial ack arrives, force the retransmission 2767 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return 2768 * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to 2769 * be started again. If the ack advances at least to tp->snd_recover, return 0. 
2770 */ 2771static int 2772tcp_newreno(tp, th) 2773 struct tcpcb *tp; 2774 struct tcphdr *th; 2775{ 2776 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2777 tcp_seq onxt = tp->snd_nxt; 2778 u_long ocwnd = tp->snd_cwnd; 2779 2780 callout_stop(tp->tt_rexmt); 2781 tp->t_rtttime = 0; 2782 tp->snd_nxt = th->th_ack; 2783 /* 2784 * Set snd_cwnd to one segment beyond acknowledged offset 2785 * (tp->snd_una has not yet been updated when this function 2786 * is called) 2787 */ 2788 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 2789 (void) tcp_output(tp); 2790 tp->snd_cwnd = ocwnd; 2791 if (SEQ_GT(onxt, tp->snd_nxt)) 2792 tp->snd_nxt = onxt; 2793 /* 2794 * Partial window deflation. Relies on fact that tp->snd_una 2795 * not updated yet. 2796 */ 2797 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); 2798 return (1); 2799 } 2800 return (0); 2801} 2802