tcp_input.c revision 169683
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
28 * 29 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 30 * $FreeBSD: head/sys/netinet/tcp_input.c 169683 2007-05-18 19:58:37Z andre $ 31 */ 32 33#include "opt_ipfw.h" /* for ipfw_fwd */ 34#include "opt_inet.h" 35#include "opt_inet6.h" 36#include "opt_ipsec.h" 37#include "opt_mac.h" 38#include "opt_tcpdebug.h" 39 40#include <sys/param.h> 41#include <sys/kernel.h> 42#include <sys/malloc.h> 43#include <sys/mbuf.h> 44#include <sys/proc.h> /* for proc0 declaration */ 45#include <sys/protosw.h> 46#include <sys/signalvar.h> 47#include <sys/socket.h> 48#include <sys/socketvar.h> 49#include <sys/sysctl.h> 50#include <sys/syslog.h> 51#include <sys/systm.h> 52 53#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 54 55#include <vm/uma.h> 56 57#include <net/if.h> 58#include <net/route.h> 59 60#include <netinet/in.h> 61#include <netinet/in_pcb.h> 62#include <netinet/in_systm.h> 63#include <netinet/in_var.h> 64#include <netinet/ip.h> 65#include <netinet/ip_icmp.h> /* required for icmp_var.h */ 66#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 67#include <netinet/ip_var.h> 68#include <netinet/ip_options.h> 69#include <netinet/ip6.h> 70#include <netinet/icmp6.h> 71#include <netinet6/in6_pcb.h> 72#include <netinet6/ip6_var.h> 73#include <netinet6/nd6.h> 74#include <netinet/tcp.h> 75#include <netinet/tcp_fsm.h> 76#include <netinet/tcp_seq.h> 77#include <netinet/tcp_timer.h> 78#include <netinet/tcp_var.h> 79#include <netinet6/tcp6_var.h> 80#include <netinet/tcpip.h> 81#ifdef TCPDEBUG 82#include <netinet/tcp_debug.h> 83#endif /* TCPDEBUG */ 84 85#ifdef FAST_IPSEC 86#include <netipsec/ipsec.h> 87#include <netipsec/ipsec6.h> 88#endif /*FAST_IPSEC*/ 89 90#ifdef IPSEC 91#include <netinet6/ipsec.h> 92#include <netinet6/ipsec6.h> 93#include <netkey/key.h> 94#endif /*IPSEC*/ 95 96#include <machine/in_cksum.h> 97 98#include <security/mac/mac_framework.h> 99 100static const int tcprexmtthresh = 3; 101 102struct tcpstat tcpstat; 103SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, 
stats, CTLFLAG_RW, 104 &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); 105 106static int tcp_log_in_vain = 0; 107SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, 108 &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); 109 110static int blackhole = 0; 111SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, 112 &blackhole, 0, "Do not send RST on segments to closed ports"); 113 114int tcp_delack_enabled = 1; 115SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, 116 &tcp_delack_enabled, 0, 117 "Delay ACK to try and piggyback it onto a data packet"); 118 119static int drop_synfin = 0; 120SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, 121 &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); 122 123static int tcp_do_rfc3042 = 1; 124SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, 125 &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); 126 127static int tcp_do_rfc3390 = 1; 128SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, 129 &tcp_do_rfc3390, 0, 130 "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); 131 132static int tcp_insecure_rst = 0; 133SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, 134 &tcp_insecure_rst, 0, 135 "Follow the old (insecure) criteria for accepting RST packets"); 136 137int tcp_do_autorcvbuf = 1; 138SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, 139 &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing"); 140 141int tcp_autorcvbuf_inc = 16*1024; 142SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, 143 &tcp_autorcvbuf_inc, 0, 144 "Incrementor step size of automatic receive buffer"); 145 146int tcp_autorcvbuf_max = 256*1024; 147SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, 148 &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer"); 149 150struct inpcbhead tcb; 151#define tcb6 tcb /* for KAME src sync over BSD*'s */ 152struct inpcbinfo tcbinfo; 153 154static 
void tcp_dooptions(struct tcpopt *, u_char *, int, int); 155static void tcp_do_segment(struct mbuf *, struct tcphdr *, 156 struct socket *, struct tcpcb *, int, int); 157static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, 158 struct tcpcb *, int, int); 159static void tcp_pulloutofband(struct socket *, 160 struct tcphdr *, struct mbuf *, int); 161static void tcp_xmit_timer(struct tcpcb *, int); 162static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); 163 164/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ 165#ifdef INET6 166#define ND6_HINT(tp) \ 167do { \ 168 if ((tp) && (tp)->t_inpcb && \ 169 ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ 170 nd6_nud_hint(NULL, NULL, 0); \ 171} while (0) 172#else 173#define ND6_HINT(tp) 174#endif 175 176/* 177 * Indicate whether this ack should be delayed. We can delay the ack if 178 * - there is no delayed ack timer in progress and 179 * - our last ack wasn't a 0-sized window. We never want to delay 180 * the ack that opens up a 0-sized window and 181 * - delayed acks are enabled or 182 * - this is a half-synchronized T/TCP connection. 183 */ 184#define DELAY_ACK(tp) \ 185 ((!tcp_timer_active(tp, TT_DELACK) && \ 186 (tp->t_flags & TF_RXWIN0SENT) == 0) && \ 187 (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) 188 189 190/* 191 * TCP input routine, follows pages 65-76 of the 192 * protocol specification dated September, 1981 very closely. 193 */ 194#ifdef INET6 195int 196tcp6_input(struct mbuf **mp, int *offp, int proto) 197{ 198 struct mbuf *m = *mp; 199 struct in6_ifaddr *ia6; 200 201 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); 202 203 /* 204 * draft-itojun-ipv6-tcp-to-anycast 205 * better place to put this in? 
206 */ 207 ia6 = ip6_getdstifaddr(m); 208 if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { 209 struct ip6_hdr *ip6; 210 211 ip6 = mtod(m, struct ip6_hdr *); 212 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 213 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 214 return IPPROTO_DONE; 215 } 216 217 tcp_input(m, *offp); 218 return IPPROTO_DONE; 219} 220#endif 221 222void 223tcp_input(struct mbuf *m, int off0) 224{ 225 struct tcphdr *th; 226 struct ip *ip = NULL; 227 struct ipovly *ipov; 228 struct inpcb *inp = NULL; 229 struct tcpcb *tp = NULL; 230 struct socket *so = NULL; 231 u_char *optp = NULL; 232 int optlen = 0; 233 int len, tlen, off; 234 int drop_hdrlen; 235 int thflags; 236 int rstreason = 0; /* For badport_bandlim accounting purposes */ 237#ifdef IPFIREWALL_FORWARD 238 struct m_tag *fwd_tag; 239#endif 240#ifdef INET6 241 struct ip6_hdr *ip6 = NULL; 242 int isipv6; 243#else 244 const int isipv6 = 0; 245#endif 246 struct tcpopt to; /* options in this segment */ 247 248#ifdef TCPDEBUG 249 /* 250 * The size of tcp_saveipgen must be the size of the max ip header, 251 * now IPv6. 252 */ 253 u_char tcp_saveipgen[IP6_HDR_LEN]; 254 struct tcphdr tcp_savetcp; 255 short ostate = 0; 256#endif 257 258#ifdef INET6 259 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; 260#endif 261 262 to.to_flags = 0; 263 tcpstat.tcps_rcvtotal++; 264 265 if (isipv6) { 266#ifdef INET6 267 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ 268 ip6 = mtod(m, struct ip6_hdr *); 269 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; 270 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { 271 tcpstat.tcps_rcvbadsum++; 272 goto drop; 273 } 274 th = (struct tcphdr *)((caddr_t)ip6 + off0); 275 276 /* 277 * Be proactive about unspecified IPv6 address in source. 278 * As we use all-zero to indicate unbounded/unconnected pcb, 279 * unspecified IPv6 address can be used to confuse us. 280 * 281 * Note that packets with unspecified IPv6 destination is 282 * already dropped in ip6_input. 
283 */ 284 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 285 /* XXX stat */ 286 goto drop; 287 } 288#else 289 th = NULL; /* XXX: avoid compiler warning */ 290#endif 291 } else { 292 /* 293 * Get IP and TCP header together in first mbuf. 294 * Note: IP leaves IP header in first mbuf. 295 */ 296 if (off0 > sizeof (struct ip)) { 297 ip_stripoptions(m, (struct mbuf *)0); 298 off0 = sizeof(struct ip); 299 } 300 if (m->m_len < sizeof (struct tcpiphdr)) { 301 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) 302 == NULL) { 303 tcpstat.tcps_rcvshort++; 304 return; 305 } 306 } 307 ip = mtod(m, struct ip *); 308 ipov = (struct ipovly *)ip; 309 th = (struct tcphdr *)((caddr_t)ip + off0); 310 tlen = ip->ip_len; 311 312 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 313 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 314 th->th_sum = m->m_pkthdr.csum_data; 315 else 316 th->th_sum = in_pseudo(ip->ip_src.s_addr, 317 ip->ip_dst.s_addr, 318 htonl(m->m_pkthdr.csum_data + 319 ip->ip_len + 320 IPPROTO_TCP)); 321 th->th_sum ^= 0xffff; 322#ifdef TCPDEBUG 323 ipov->ih_len = (u_short)tlen; 324 ipov->ih_len = htons(ipov->ih_len); 325#endif 326 } else { 327 /* 328 * Checksum extended TCP header and data. 329 */ 330 len = sizeof (struct ip) + tlen; 331 bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 332 ipov->ih_len = (u_short)tlen; 333 ipov->ih_len = htons(ipov->ih_len); 334 th->th_sum = in_cksum(m, len); 335 } 336 if (th->th_sum) { 337 tcpstat.tcps_rcvbadsum++; 338 goto drop; 339 } 340 /* Re-initialization for later version check */ 341 ip->ip_v = IPVERSION; 342 } 343 344 /* 345 * Check that TCP offset makes sense, 346 * pull out TCP options and adjust length. 
XXX 347 */ 348 off = th->th_off << 2; 349 if (off < sizeof (struct tcphdr) || off > tlen) { 350 tcpstat.tcps_rcvbadoff++; 351 goto drop; 352 } 353 tlen -= off; /* tlen is used instead of ti->ti_len */ 354 if (off > sizeof (struct tcphdr)) { 355 if (isipv6) { 356#ifdef INET6 357 IP6_EXTHDR_CHECK(m, off0, off, ); 358 ip6 = mtod(m, struct ip6_hdr *); 359 th = (struct tcphdr *)((caddr_t)ip6 + off0); 360#endif 361 } else { 362 if (m->m_len < sizeof(struct ip) + off) { 363 if ((m = m_pullup(m, sizeof (struct ip) + off)) 364 == NULL) { 365 tcpstat.tcps_rcvshort++; 366 return; 367 } 368 ip = mtod(m, struct ip *); 369 ipov = (struct ipovly *)ip; 370 th = (struct tcphdr *)((caddr_t)ip + off0); 371 } 372 } 373 optlen = off - sizeof (struct tcphdr); 374 optp = (u_char *)(th + 1); 375 } 376 thflags = th->th_flags; 377 378 /* 379 * If the drop_synfin option is enabled, drop all packets with 380 * both the SYN and FIN bits set. This prevents e.g. nmap from 381 * identifying the TCP/IP stack. 382 * 383 * This is a violation of the TCP specification. 384 */ 385 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) 386 goto drop; 387 388 /* 389 * Convert TCP protocol specific fields to host format. 390 */ 391 th->th_seq = ntohl(th->th_seq); 392 th->th_ack = ntohl(th->th_ack); 393 th->th_win = ntohs(th->th_win); 394 th->th_urp = ntohs(th->th_urp); 395 396 /* 397 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. 398 */ 399 drop_hdrlen = off0 + off; 400 401 /* 402 * Locate pcb for segment. 403 */ 404 INP_INFO_WLOCK(&tcbinfo); 405findpcb: 406 INP_INFO_WLOCK_ASSERT(&tcbinfo); 407#ifdef IPFIREWALL_FORWARD 408 /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ 409 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 410 411 if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ 412 struct sockaddr_in *next_hop; 413 414 next_hop = (struct sockaddr_in *)(fwd_tag+1); 415 /* 416 * Transparently forwarded. 
Pretend to be the destination. 417 * already got one like this? 418 */ 419 inp = in_pcblookup_hash(&tcbinfo, 420 ip->ip_src, th->th_sport, 421 ip->ip_dst, th->th_dport, 422 0, m->m_pkthdr.rcvif); 423 if (!inp) { 424 /* It's new. Try to find the ambushing socket. */ 425 inp = in_pcblookup_hash(&tcbinfo, 426 ip->ip_src, th->th_sport, 427 next_hop->sin_addr, 428 next_hop->sin_port ? 429 ntohs(next_hop->sin_port) : 430 th->th_dport, 431 INPLOOKUP_WILDCARD, 432 m->m_pkthdr.rcvif); 433 } 434 /* Remove the tag from the packet. We don't need it anymore. */ 435 m_tag_delete(m, fwd_tag); 436 } else 437#endif /* IPFIREWALL_FORWARD */ 438 { 439 if (isipv6) { 440#ifdef INET6 441 inp = in6_pcblookup_hash(&tcbinfo, 442 &ip6->ip6_src, th->th_sport, 443 &ip6->ip6_dst, th->th_dport, 444 INPLOOKUP_WILDCARD, 445 m->m_pkthdr.rcvif); 446#endif 447 } else 448 inp = in_pcblookup_hash(&tcbinfo, 449 ip->ip_src, th->th_sport, 450 ip->ip_dst, th->th_dport, 451 INPLOOKUP_WILDCARD, 452 m->m_pkthdr.rcvif); 453 } 454 455#if defined(IPSEC) || defined(FAST_IPSEC) 456#ifdef INET6 457 if (isipv6 && inp != NULL && ipsec6_in_reject(m, inp)) { 458#ifdef IPSEC 459 ipsec6stat.in_polvio++; 460#endif 461 goto dropunlock; 462 } else 463#endif /* INET6 */ 464 if (inp != NULL && ipsec4_in_reject(m, inp)) { 465#ifdef IPSEC 466 ipsecstat.in_polvio++; 467#endif 468 goto dropunlock; 469 } 470#endif /*IPSEC || FAST_IPSEC*/ 471 472 /* 473 * If the INPCB does not exist then all data in the incoming 474 * segment is discarded and an appropriate RST is sent back. 475 */ 476 if (inp == NULL) { 477 /* 478 * Log communication attempts to ports that are not 479 * in use. 
480 */ 481 if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || 482 tcp_log_in_vain == 2) { 483 char *s; 484#ifdef INET6 485 s = tcp_log_addrs(NULL, th, (void *)ip, (void *)ip6); 486#else 487 s = tcp_log_addrs(NULL, th, (void *)ip, NULL); 488#endif /* INET6 */ 489 if (s != NULL) { 490 log(LOG_INFO, "%s; %s: Connection attempt " 491 "to closed port\n", s, __func__); 492 free(s, M_TCPLOG); 493 } 494 } 495 /* 496 * When blackholing do not respond with a RST but 497 * completely ignore the segment and drop it. 498 */ 499 if ((blackhole == 1 && (thflags & TH_SYN)) || 500 blackhole == 2) 501 goto dropunlock; 502 503 rstreason = BANDLIM_RST_CLOSEDPORT; 504 goto dropwithreset; 505 } 506 INP_LOCK(inp); 507 508 /* Check the minimum TTL for socket. */ 509 if (inp->inp_ip_minttl != 0) { 510#ifdef INET6 511 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) 512 goto dropunlock; 513 else 514#endif 515 if (inp->inp_ip_minttl > ip->ip_ttl) 516 goto dropunlock; 517 } 518 519 /* 520 * A previous connection in TIMEWAIT state is supposed to catch 521 * stray or duplicate segments arriving late. If this segment 522 * was a legitimate new connection attempt the old INPCB gets 523 * removed and we can try again to find a listening socket. 524 */ 525 if (inp->inp_vflag & INP_TIMEWAIT) { 526 if (thflags & TH_SYN) 527 tcp_dooptions(&to, optp, optlen, TO_SYN); 528 /* NB: tcp_twcheck unlocks the INP and frees the mbuf. */ 529 if (tcp_twcheck(inp, &to, th, m, tlen)) 530 goto findpcb; 531 INP_INFO_WUNLOCK(&tcbinfo); 532 return; 533 } 534 /* 535 * The TCPCB may no longer exist if the connection is winding 536 * down or it is in the CLOSED state. Either way we drop the 537 * segment and send an appropriate response. 538 */ 539 tp = intotcpcb(inp); 540 if (tp == NULL) { 541 rstreason = BANDLIM_RST_CLOSEDPORT; 542 goto dropwithreset; 543 } 544 if (tp->t_state == TCPS_CLOSED) 545 goto dropunlock; /* XXX: dropwithreset??? 
*/ 546 547#ifdef MAC 548 INP_LOCK_ASSERT(inp); 549 if (mac_check_inpcb_deliver(inp, m)) 550 goto dropunlock; 551#endif 552 so = inp->inp_socket; 553 KASSERT(so != NULL, ("%s: so == NULL", __func__)); 554#ifdef TCPDEBUG 555 if (so->so_options & SO_DEBUG) { 556 ostate = tp->t_state; 557 if (isipv6) { 558#ifdef INET6 559 bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); 560#endif 561 } else 562 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); 563 tcp_savetcp = *th; 564 } 565#endif 566 /* 567 * When the socket is accepting connections (the INPCB is in LISTEN 568 * state) we look into the SYN cache if this is a new connection 569 * attempt or the completion of a previous one. 570 */ 571 if (so->so_options & SO_ACCEPTCONN) { 572 struct in_conninfo inc; 573 574 KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " 575 "tp not listening", __func__)); 576 577 bzero(&inc, sizeof(inc)); 578 inc.inc_isipv6 = isipv6; 579#ifdef INET6 580 if (isipv6) { 581 inc.inc6_faddr = ip6->ip6_src; 582 inc.inc6_laddr = ip6->ip6_dst; 583 } else 584#endif 585 { 586 inc.inc_faddr = ip->ip_src; 587 inc.inc_laddr = ip->ip_dst; 588 } 589 inc.inc_fport = th->th_sport; 590 inc.inc_lport = th->th_dport; 591 592 /* 593 * If the state is LISTEN then ignore segment if it contains 594 * a RST. If the segment contains an ACK then it is bad and 595 * send a RST. If it does not contain a SYN then it is not 596 * interesting; drop it. 597 * 598 * If the state is SYN_RECEIVED (syncache) and seg contains 599 * an ACK, but not for our SYN/ACK, send a RST. If the seg 600 * contains a RST, check the sequence number to see if it 601 * is a valid reset segment. 602 */ 603 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 604 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { 605 /* 606 * Parse the TCP options here because 607 * syncookies need access to the reflected 608 * timestamp. 
609 */ 610 tcp_dooptions(&to, optp, optlen, 0); 611 /* 612 * NB: syncache_expand() doesn't unlock 613 * inp and tcpinfo locks. 614 */ 615 if (!syncache_expand(&inc, &to, th, &so, m)) { 616 /* 617 * No syncache entry or ACK was not 618 * for our SYN/ACK. Send a RST. 619 */ 620 rstreason = BANDLIM_RST_OPENPORT; 621 goto dropwithreset; 622 } 623 if (so == NULL) { 624 /* 625 * We completed the 3-way handshake 626 * but could not allocate a socket 627 * either due to memory shortage, 628 * listen queue length limits or 629 * global socket limits. 630 */ 631 rstreason = BANDLIM_UNLIMITED; 632 goto dropwithreset; 633 } 634 /* 635 * Socket is created in state SYN_RECEIVED. 636 * Continue processing segment. 637 */ 638 INP_UNLOCK(inp); /* listen socket */ 639 inp = sotoinpcb(so); 640 INP_LOCK(inp); /* new connection */ 641 tp = intotcpcb(inp); 642 /* 643 * Process the segment and the data it 644 * contains. tcp_do_segment() consumes 645 * the mbuf chain and unlocks the inpcb. 646 */ 647 tcp_do_segment(m, th, so, tp, drop_hdrlen, 648 tlen); 649 INP_INFO_UNLOCK_ASSERT(&tcbinfo); 650 return; 651 } 652 if (thflags & TH_RST) { 653 syncache_chkrst(&inc, th); 654 goto dropunlock; 655 } 656 if (thflags & TH_ACK) { 657 syncache_badack(&inc); 658 tcpstat.tcps_badsyn++; 659 rstreason = BANDLIM_RST_OPENPORT; 660 goto dropwithreset; 661 } 662 goto dropunlock; 663 } 664 665 /* 666 * Segment's flags are (SYN) or (SYN|FIN). 667 */ 668#ifdef INET6 669 /* 670 * If deprecated address is forbidden, 671 * we do not accept SYN to deprecated interface 672 * address to prevent any new inbound connection from 673 * getting established. 674 * When we do not accept SYN, we send a TCP RST, 675 * with deprecated source address (instead of dropping 676 * it). We compromise it as it is much better for peer 677 * to send a RST, and RST will be the final packet 678 * for the exchange. 679 * 680 * If we do not forbid deprecated addresses, we accept 681 * the SYN packet. 
RFC2462 does not suggest dropping 682 * SYN in this case. 683 * If we decipher RFC2462 5.5.4, it says like this: 684 * 1. use of deprecated addr with existing 685 * communication is okay - "SHOULD continue to be 686 * used" 687 * 2. use of it with new communication: 688 * (2a) "SHOULD NOT be used if alternate address 689 * with sufficient scope is available" 690 * (2b) nothing mentioned otherwise. 691 * Here we fall into (2b) case as we have no choice in 692 * our source address selection - we must obey the peer. 693 * 694 * The wording in RFC2462 is confusing, and there are 695 * multiple description text for deprecated address 696 * handling - worse, they are not exactly the same. 697 * I believe 5.5.4 is the best one, so we follow 5.5.4. 698 */ 699 if (isipv6 && !ip6_use_deprecated) { 700 struct in6_ifaddr *ia6; 701 702 if ((ia6 = ip6_getdstifaddr(m)) && 703 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 704 rstreason = BANDLIM_RST_OPENPORT; 705 goto dropwithreset; 706 } 707 } 708#endif 709 /* 710 * Basic sanity checks on incoming SYN requests: 711 * 712 * Don't bother responding if the destination was a 713 * broadcast according to RFC1122 4.2.3.10, p. 104. 714 * 715 * If it is from this socket, drop it, it must be forged. 716 * 717 * Note that it is quite possible to receive unicast 718 * link-layer packets with a broadcast IP address. Use 719 * in_broadcast() to find them. 
720 */ 721 if (m->m_flags & (M_BCAST|M_MCAST)) 722 goto dropunlock; 723 if (isipv6) { 724#ifdef INET6 725 if (th->th_dport == th->th_sport && 726 IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) 727 goto dropunlock; 728 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 729 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 730 goto dropunlock; 731#endif 732 } else { 733 if (th->th_dport == th->th_sport && 734 ip->ip_dst.s_addr == ip->ip_src.s_addr) 735 goto dropunlock; 736 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 737 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 738 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 739 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 740 goto dropunlock; 741 } 742 /* 743 * SYN appears to be valid. Create compressed TCP state 744 * for syncache. 745 */ 746#ifdef TCPDEBUG 747 if (so->so_options & SO_DEBUG) 748 tcp_trace(TA_INPUT, ostate, tp, 749 (void *)tcp_saveipgen, &tcp_savetcp, 0); 750#endif 751 tcp_dooptions(&to, optp, optlen, TO_SYN); 752 syncache_add(&inc, &to, th, inp, &so, m); 753 /* 754 * Entry added to syncache and mbuf consumed. 755 * Everything unlocked already by syncache_add(). 756 */ 757 return; 758 } 759 760 /* 761 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or late 762 * state. tcp_do_segment() always consumes the mbuf chain, unlocks the 763 * inpcb, and unlocks the pcbinfo. 764 */ 765 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); 766 INP_INFO_UNLOCK_ASSERT(&tcbinfo); 767 return; 768 769dropwithreset: 770 INP_INFO_WLOCK_ASSERT(&tcbinfo); 771 tcp_dropwithreset(m, th, tp, tlen, rstreason); 772 m = NULL; /* mbuf chain got consumed. 
*/ 773dropunlock: 774 INP_INFO_WLOCK_ASSERT(&tcbinfo); 775 if (inp != NULL) 776 INP_UNLOCK(inp); 777 INP_INFO_WUNLOCK(&tcbinfo); 778drop: 779 INP_INFO_UNLOCK_ASSERT(&tcbinfo); 780 if (m != NULL) 781 m_freem(m); 782 return; 783} 784 785static void 786tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 787 struct tcpcb *tp, int drop_hdrlen, int tlen) 788{ 789 int thflags, acked, ourfinisacked, needoutput = 0; 790 int headlocked = 1; 791 int rstreason, todrop, win; 792 u_long tiwin; 793 struct tcpopt to; 794 795#ifdef TCPDEBUG 796 /* 797 * The size of tcp_saveipgen must be the size of the max ip header, 798 * now IPv6. 799 */ 800 u_char tcp_saveipgen[IP6_HDR_LEN]; 801 struct tcphdr tcp_savetcp; 802 short ostate = 0; 803#endif 804 thflags = th->th_flags; 805 806 INP_INFO_WLOCK_ASSERT(&tcbinfo); 807 INP_LOCK_ASSERT(tp->t_inpcb); 808 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 809 __func__)); 810 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 811 __func__)); 812 813 /* 814 * Segment received on connection. 815 * Reset idle time and keep-alive timer. 816 */ 817 tp->t_rcvtime = ticks; 818 if (TCPS_HAVEESTABLISHED(tp->t_state)) 819 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 820 821 /* 822 * Unscale the window into a 32-bit value. 823 * This value is bogus for the TCPS_SYN_SENT state 824 * and is overwritten later. 825 */ 826 tiwin = th->th_win << tp->snd_scale; 827 828 /* 829 * Parse options on any incoming segment. 830 */ 831 tcp_dooptions(&to, (u_char *)(th + 1), 832 (th->th_off << 2) - sizeof(struct tcphdr), 833 (thflags & TH_SYN) ? TO_SYN : 0); 834 835 /* 836 * If echoed timestamp is later than the current time, 837 * fall back to non RFC1323 RTT calculation. Normalize 838 * timestamp if syncookies were used when this connection 839 * was established. 
840 */ 841 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 842 to.to_tsecr -= tp->ts_offset; 843 if (TSTMP_GT(to.to_tsecr, ticks)) 844 to.to_tsecr = 0; 845 } 846 847 /* 848 * Process options only when we get SYN/ACK back. The SYN case 849 * for incoming connections is handled in tcp_syncache. 850 * XXX this is traditional behavior, may need to be cleaned up. 851 */ 852 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 853 if ((to.to_flags & TOF_SCALE) && 854 (tp->t_flags & TF_REQ_SCALE)) { 855 tp->t_flags |= TF_RCVD_SCALE; 856 tp->snd_scale = to.to_wscale; 857 tp->snd_wnd = th->th_win << tp->snd_scale; 858 tiwin = tp->snd_wnd; 859 } 860 if (to.to_flags & TOF_TS) { 861 tp->t_flags |= TF_RCVD_TSTMP; 862 tp->ts_recent = to.to_tsval; 863 tp->ts_recent_age = ticks; 864 } 865 /* Initial send window, already scaled. */ 866 tp->snd_wnd = th->th_win; 867 if (to.to_flags & TOF_MSS) 868 tcp_mss(tp, to.to_mss); 869 if ((tp->t_flags & TF_SACK_PERMIT) && 870 (to.to_flags & TOF_SACKPERM) == 0) 871 tp->t_flags &= ~TF_SACK_PERMIT; 872 } 873 874 /* 875 * Header prediction: check for the two common cases 876 * of a uni-directional data xfer. If the packet has 877 * no control flags, is in-sequence, the window didn't 878 * change and we're not retransmitting, it's a 879 * candidate. If the length is zero and the ack moved 880 * forward, we're the sender side of the xfer. Just 881 * free the data acked & wake any higher level process 882 * that was blocked waiting for space. If the length 883 * is non-zero and the ack didn't move, we're the 884 * receiver side. If we're getting packets in-order 885 * (the reassembly queue is empty), add the data to 886 * the socket buffer and note that we need a delayed ack. 887 * Make sure that the hidden state-flags are also off. 888 * Since we check for TCPS_ESTABLISHED first, it can only 889 * be TH_NEEDSYN. 
890 */ 891 if (tp->t_state == TCPS_ESTABLISHED && 892 th->th_seq == tp->rcv_nxt && 893 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 894 tp->snd_nxt == tp->snd_max && 895 tiwin && tiwin == tp->snd_wnd && 896 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 897 LIST_EMPTY(&tp->t_segq) && 898 ((to.to_flags & TOF_TS) == 0 || 899 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { 900 901 /* 902 * If last ACK falls within this segment's sequence numbers, 903 * record the timestamp. 904 * NOTE that the test is modified according to the latest 905 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 906 */ 907 if ((to.to_flags & TOF_TS) != 0 && 908 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 909 tp->ts_recent_age = ticks; 910 tp->ts_recent = to.to_tsval; 911 } 912 913 if (tlen == 0) { 914 if (SEQ_GT(th->th_ack, tp->snd_una) && 915 SEQ_LEQ(th->th_ack, tp->snd_max) && 916 tp->snd_cwnd >= tp->snd_wnd && 917 ((!tcp_do_newreno && 918 !(tp->t_flags & TF_SACK_PERMIT) && 919 tp->t_dupacks < tcprexmtthresh) || 920 ((tcp_do_newreno || 921 (tp->t_flags & TF_SACK_PERMIT)) && 922 !IN_FASTRECOVERY(tp) && 923 (to.to_flags & TOF_SACK) == 0 && 924 TAILQ_EMPTY(&tp->snd_holes)))) { 925 KASSERT(headlocked, 926 ("%s: headlocked", __func__)); 927 INP_INFO_WUNLOCK(&tcbinfo); 928 headlocked = 0; 929 /* 930 * this is a pure ack for outstanding data. 931 */ 932 ++tcpstat.tcps_predack; 933 /* 934 * "bad retransmit" recovery 935 */ 936 if (tp->t_rxtshift == 1 && 937 ticks < tp->t_badrxtwin) { 938 ++tcpstat.tcps_sndrexmitbad; 939 tp->snd_cwnd = tp->snd_cwnd_prev; 940 tp->snd_ssthresh = 941 tp->snd_ssthresh_prev; 942 tp->snd_recover = tp->snd_recover_prev; 943 if (tp->t_flags & TF_WASFRECOVERY) 944 ENTER_FASTRECOVERY(tp); 945 tp->snd_nxt = tp->snd_max; 946 tp->t_badrxtwin = 0; 947 } 948 949 /* 950 * Recalculate the transmit timer / rtt. 
951 * 952 * Some boxes send broken timestamp replies 953 * during the SYN+ACK phase, ignore 954 * timestamps of 0 or we could calculate a 955 * huge RTT and blow up the retransmit timer. 956 */ 957 if ((to.to_flags & TOF_TS) != 0 && 958 to.to_tsecr) { 959 if (!tp->t_rttlow || 960 tp->t_rttlow > ticks - to.to_tsecr) 961 tp->t_rttlow = ticks - to.to_tsecr; 962 tcp_xmit_timer(tp, 963 ticks - to.to_tsecr + 1); 964 } else if (tp->t_rtttime && 965 SEQ_GT(th->th_ack, tp->t_rtseq)) { 966 if (!tp->t_rttlow || 967 tp->t_rttlow > ticks - tp->t_rtttime) 968 tp->t_rttlow = ticks - tp->t_rtttime; 969 tcp_xmit_timer(tp, 970 ticks - tp->t_rtttime); 971 } 972 tcp_xmit_bandwidth_limit(tp, th->th_ack); 973 acked = th->th_ack - tp->snd_una; 974 tcpstat.tcps_rcvackpack++; 975 tcpstat.tcps_rcvackbyte += acked; 976 sbdrop(&so->so_snd, acked); 977 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 978 SEQ_LEQ(th->th_ack, tp->snd_recover)) 979 tp->snd_recover = th->th_ack - 1; 980 tp->snd_una = th->th_ack; 981 /* 982 * pull snd_wl2 up to prevent seq wrap relative 983 * to th_ack. 984 */ 985 tp->snd_wl2 = th->th_ack; 986 tp->t_dupacks = 0; 987 m_freem(m); 988 ND6_HINT(tp); /* some progress has been done */ 989 990 /* 991 * If all outstanding data are acked, stop 992 * retransmit timer, otherwise restart timer 993 * using current (possibly backed-off) value. 994 * If process is waiting for space, 995 * wakeup/selwakeup/signal. If data 996 * are ready to send, let tcp_output 997 * decide between more output or persist. 
998 999#ifdef TCPDEBUG 1000 if (so->so_options & SO_DEBUG) 1001 tcp_trace(TA_INPUT, ostate, tp, 1002 (void *)tcp_saveipgen, 1003 &tcp_savetcp, 0); 1004#endif 1005 */ 1006 if (tp->snd_una == tp->snd_max) 1007 tcp_timer_activate(tp, TT_REXMT, 0); 1008 else if (!tcp_timer_active(tp, TT_PERSIST)) 1009 tcp_timer_activate(tp, TT_REXMT, 1010 tp->t_rxtcur); 1011 1012 sowwakeup(so); 1013 if (so->so_snd.sb_cc) 1014 (void) tcp_output(tp); 1015 goto check_delack; 1016 } 1017 } else if (th->th_ack == tp->snd_una && 1018 tlen <= sbspace(&so->so_rcv)) { 1019 int newsize = 0; /* automatic sockbuf scaling */ 1020 1021 KASSERT(headlocked, ("%s: headlocked", __func__)); 1022 INP_INFO_WUNLOCK(&tcbinfo); 1023 headlocked = 0; 1024 /* 1025 * this is a pure, in-sequence data packet 1026 * with nothing on the reassembly queue and 1027 * we have enough buffer space to take it. 1028 */ 1029 /* Clean receiver SACK report if present */ 1030 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) 1031 tcp_clean_sackreport(tp); 1032 ++tcpstat.tcps_preddat; 1033 tp->rcv_nxt += tlen; 1034 /* 1035 * Pull snd_wl1 up to prevent seq wrap relative to 1036 * th_seq. 1037 */ 1038 tp->snd_wl1 = th->th_seq; 1039 /* 1040 * Pull rcv_up up to prevent seq wrap relative to 1041 * rcv_nxt. 1042 */ 1043 tp->rcv_up = tp->rcv_nxt; 1044 tcpstat.tcps_rcvpack++; 1045 tcpstat.tcps_rcvbyte += tlen; 1046 ND6_HINT(tp); /* some progress has been done */ 1047#ifdef TCPDEBUG 1048 if (so->so_options & SO_DEBUG) 1049 tcp_trace(TA_INPUT, ostate, tp, 1050 (void *)tcp_saveipgen, &tcp_savetcp, 0); 1051#endif 1052 /* 1053 * Automatic sizing of receive socket buffer. Often the send 1054 * buffer size is not optimally adjusted to the actual network 1055 * conditions at hand (delay bandwidth product). Setting the 1056 * buffer size too small limits throughput on links with high 1057 * bandwidth and high delay (eg. trans-continental/oceanic links). 
1058 * 1059 * On the receive side the socket buffer memory is only rarely 1060 * used to any significant extent. This allows us to be much 1061 * more aggressive in scaling the receive socket buffer. For 1062 * the case that the buffer space is actually used to a large 1063 * extent and we run out of kernel memory we can simply drop 1064 * the new segments; TCP on the sender will just retransmit it 1065 * later. Setting the buffer size too big may only consume too 1066 * much kernel memory if the application doesn't read() from 1067 * the socket or packet loss or reordering makes use of the 1068 * reassembly queue. 1069 * 1070 * The criteria to step up the receive buffer one notch are: 1071 * 1. the number of bytes received during the time it takes 1072 * one timestamp to be reflected back to us (the RTT); 1073 * 2. received bytes per RTT is within seven eighth of the 1074 * current socket buffer size; 1075 * 3. receive buffer size has not hit maximal automatic size; 1076 * 1077 * This algorithm does one step per RTT at most and only if 1078 * we receive a bulk stream w/o packet losses or reorderings. 1079 * Shrinking the buffer during idle times is not necessary as 1080 * it doesn't consume any memory when idle. 1081 * 1082 * TODO: Only step up if the application is actually serving 1083 * the buffer to better manage the socket buffer resources. 1084 */ 1085 if (tcp_do_autorcvbuf && 1086 to.to_tsecr && 1087 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 1088 if (to.to_tsecr > tp->rfbuf_ts && 1089 to.to_tsecr - tp->rfbuf_ts < hz) { 1090 if (tp->rfbuf_cnt > 1091 (so->so_rcv.sb_hiwat / 8 * 7) && 1092 so->so_rcv.sb_hiwat < 1093 tcp_autorcvbuf_max) { 1094 newsize = 1095 min(so->so_rcv.sb_hiwat + 1096 tcp_autorcvbuf_inc, 1097 tcp_autorcvbuf_max); 1098 } 1099 /* Start over with next RTT. */ 1100 tp->rfbuf_ts = 0; 1101 tp->rfbuf_cnt = 0; 1102 } else 1103 tp->rfbuf_cnt += tlen; /* add up */ 1104 } 1105 1106 /* Add data to socket buffer. 
*/ 1107 SOCKBUF_LOCK(&so->so_rcv); 1108 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1109 m_freem(m); 1110 } else { 1111 /* 1112 * Set new socket buffer size. 1113 * Give up when limit is reached. 1114 */ 1115 if (newsize) 1116 if (!sbreserve_locked(&so->so_rcv, 1117 newsize, so, curthread)) 1118 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1119 m_adj(m, drop_hdrlen); /* delayed header drop */ 1120 sbappendstream_locked(&so->so_rcv, m); 1121 } 1122 sorwakeup_locked(so); 1123 if (DELAY_ACK(tp)) { 1124 tp->t_flags |= TF_DELACK; 1125 } else { 1126 tp->t_flags |= TF_ACKNOW; 1127 tcp_output(tp); 1128 } 1129 goto check_delack; 1130 } 1131 } 1132 1133 /* 1134 * Calculate amount of space in receive window, 1135 * and then do TCP input processing. 1136 * Receive window is amount of space in rcv queue, 1137 * but not less than advertised window. 1138 */ 1139 win = sbspace(&so->so_rcv); 1140 if (win < 0) 1141 win = 0; 1142 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1143 1144 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 1145 tp->rfbuf_ts = 0; 1146 tp->rfbuf_cnt = 0; 1147 1148 switch (tp->t_state) { 1149 1150 /* 1151 * If the state is SYN_RECEIVED: 1152 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 1153 */ 1154 case TCPS_SYN_RECEIVED: 1155 if ((thflags & TH_ACK) && 1156 (SEQ_LEQ(th->th_ack, tp->snd_una) || 1157 SEQ_GT(th->th_ack, tp->snd_max))) { 1158 rstreason = BANDLIM_RST_OPENPORT; 1159 goto dropwithreset; 1160 } 1161 break; 1162 1163 /* 1164 * If the state is SYN_SENT: 1165 * if seg contains an ACK, but not for our SYN, drop the input. 1166 * if seg contains a RST, then drop the connection. 1167 * if seg does not contain SYN, then drop it. 
1168 * Otherwise this is an acceptable SYN segment 1169 * initialize tp->rcv_nxt and tp->irs 1170 * if seg contains ack then advance tp->snd_una 1171 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1172 * arrange for segment to be acked (eventually) 1173 * continue processing rest of data/controls, beginning with URG 1174 */ 1175 case TCPS_SYN_SENT: 1176 if ((thflags & TH_ACK) && 1177 (SEQ_LEQ(th->th_ack, tp->iss) || 1178 SEQ_GT(th->th_ack, tp->snd_max))) { 1179 rstreason = BANDLIM_UNLIMITED; 1180 goto dropwithreset; 1181 } 1182 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) 1183 tp = tcp_drop(tp, ECONNREFUSED); 1184 if (thflags & TH_RST) 1185 goto drop; 1186 if (!(thflags & TH_SYN)) 1187 goto drop; 1188 1189 tp->irs = th->th_seq; 1190 tcp_rcvseqinit(tp); 1191 if (thflags & TH_ACK) { 1192 tcpstat.tcps_connects++; 1193 soisconnected(so); 1194#ifdef MAC 1195 SOCK_LOCK(so); 1196 mac_set_socket_peer_from_mbuf(m, so); 1197 SOCK_UNLOCK(so); 1198#endif 1199 /* Do window scaling on this connection? */ 1200 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1201 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1202 tp->rcv_scale = tp->request_r_scale; 1203 } 1204 tp->rcv_adv += tp->rcv_wnd; 1205 tp->snd_una++; /* SYN is acked */ 1206 /* 1207 * If there's data, delay ACK; if there's also a FIN 1208 * ACKNOW will be turned on later. 1209 */ 1210 if (DELAY_ACK(tp) && tlen != 0) 1211 tcp_timer_activate(tp, TT_DELACK, 1212 tcp_delacktime); 1213 else 1214 tp->t_flags |= TF_ACKNOW; 1215 /* 1216 * Received <SYN,ACK> in SYN_SENT[*] state. 
1217 * Transitions: 1218 * SYN_SENT --> ESTABLISHED 1219 * SYN_SENT* --> FIN_WAIT_1 1220 */ 1221 tp->t_starttime = ticks; 1222 if (tp->t_flags & TF_NEEDFIN) { 1223 tp->t_state = TCPS_FIN_WAIT_1; 1224 tp->t_flags &= ~TF_NEEDFIN; 1225 thflags &= ~TH_SYN; 1226 } else { 1227 tp->t_state = TCPS_ESTABLISHED; 1228 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1229 } 1230 } else { 1231 /* 1232 * Received initial SYN in SYN-SENT[*] state => 1233 * simultaneous open. If segment contains CC option 1234 * and there is a cached CC, apply TAO test. 1235 * If it succeeds, connection is * half-synchronized. 1236 * Otherwise, do 3-way handshake: 1237 * SYN-SENT -> SYN-RECEIVED 1238 * SYN-SENT* -> SYN-RECEIVED* 1239 * If there was no CC option, clear cached CC value. 1240 */ 1241 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 1242 tcp_timer_activate(tp, TT_REXMT, 0); 1243 tp->t_state = TCPS_SYN_RECEIVED; 1244 } 1245 1246 KASSERT(headlocked, ("%s: trimthenstep6: head not locked", 1247 __func__)); 1248 INP_LOCK_ASSERT(tp->t_inpcb); 1249 1250 /* 1251 * Advance th->th_seq to correspond to first data byte. 1252 * If data, trim to stay within window, 1253 * dropping FIN if necessary. 1254 */ 1255 th->th_seq++; 1256 if (tlen > tp->rcv_wnd) { 1257 todrop = tlen - tp->rcv_wnd; 1258 m_adj(m, -todrop); 1259 tlen = tp->rcv_wnd; 1260 thflags &= ~TH_FIN; 1261 tcpstat.tcps_rcvpackafterwin++; 1262 tcpstat.tcps_rcvbyteafterwin += todrop; 1263 } 1264 tp->snd_wl1 = th->th_seq - 1; 1265 tp->rcv_up = th->th_seq; 1266 /* 1267 * Client side of transaction: already sent SYN and data. 1268 * If the remote host used T/TCP to validate the SYN, 1269 * our data will be ACK'd; if so, enter normal data segment 1270 * processing in the middle of step 5, ack processing. 1271 * Otherwise, goto step 6. 1272 */ 1273 if (thflags & TH_ACK) 1274 goto process_ACK; 1275 1276 goto step6; 1277 1278 /* 1279 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 1280 * do normal processing. 
1281 * 1282 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 1283 */ 1284 case TCPS_LAST_ACK: 1285 case TCPS_CLOSING: 1286 break; /* continue normal processing */ 1287 } 1288 1289 /* 1290 * States other than LISTEN or SYN_SENT. 1291 * First check the RST flag and sequence number since reset segments 1292 * are exempt from the timestamp and connection count tests. This 1293 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 1294 * below which allowed reset segments in half the sequence space 1295 * to fall though and be processed (which gives forged reset 1296 * segments with a random sequence number a 50 percent chance of 1297 * killing a connection). 1298 * Then check timestamp, if present. 1299 * Then check the connection count, if present. 1300 * Then check that at least some bytes of segment are within 1301 * receive window. If segment begins before rcv_nxt, 1302 * drop leading data (and SYN); if nothing left, just ack. 1303 * 1304 * 1305 * If the RST bit is set, check the sequence number to see 1306 * if this is a valid reset segment. 1307 * RFC 793 page 37: 1308 * In all states except SYN-SENT, all reset (RST) segments 1309 * are validated by checking their SEQ-fields. A reset is 1310 * valid if its sequence number is in the window. 1311 * Note: this does not take into account delayed ACKs, so 1312 * we should test against last_ack_sent instead of rcv_nxt. 1313 * The sequence number in the reset segment is normally an 1314 * echo of our outgoing acknowlegement numbers, but some hosts 1315 * send a reset with the sequence number at the rightmost edge 1316 * of our receive window, and we have to handle this case. 1317 * Note 2: Paul Watson's paper "Slipping in the Window" has shown 1318 * that brute force RST attacks are possible. To combat this, 1319 * we use a much stricter check while in the ESTABLISHED state, 1320 * only accepting RSTs where the sequence number is equal to 1321 * last_ack_sent. 
In all other states (the states in which a 1322 * RST is more likely), the more permissive check is used. 1323 * If we have multiple segments in flight, the initial reset 1324 * segment sequence numbers will be to the left of last_ack_sent, 1325 * but they will eventually catch up. 1326 * In any case, it never made sense to trim reset segments to 1327 * fit the receive window since RFC 1122 says: 1328 * 4.2.2.12 RST Segment: RFC-793 Section 3.4 1329 * 1330 * A TCP SHOULD allow a received RST segment to include data. 1331 * 1332 * DISCUSSION 1333 * It has been suggested that a RST segment could contain 1334 * ASCII text that encoded and explained the cause of the 1335 * RST. No standard has yet been established for such 1336 * data. 1337 * 1338 * If the reset segment passes the sequence number test examine 1339 * the state: 1340 * SYN_RECEIVED STATE: 1341 * If passive open, return to LISTEN state. 1342 * If active open, inform user that connection was refused. 1343 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: 1344 * Inform user that connection was reset, and close tcb. 1345 * CLOSING, LAST_ACK STATES: 1346 * Close the tcb. 1347 * TIME_WAIT STATE: 1348 * Drop the segment - see Stevens, vol. 2, p. 964 and 1349 * RFC 1337. 
1350 */ 1351 if (thflags & TH_RST) { 1352 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 1353 SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1354 switch (tp->t_state) { 1355 1356 case TCPS_SYN_RECEIVED: 1357 so->so_error = ECONNREFUSED; 1358 goto close; 1359 1360 case TCPS_ESTABLISHED: 1361 if (tcp_insecure_rst == 0 && 1362 !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && 1363 SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && 1364 !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 1365 SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { 1366 tcpstat.tcps_badrst++; 1367 goto drop; 1368 } 1369 case TCPS_FIN_WAIT_1: 1370 case TCPS_FIN_WAIT_2: 1371 case TCPS_CLOSE_WAIT: 1372 so->so_error = ECONNRESET; 1373 close: 1374 tp->t_state = TCPS_CLOSED; 1375 tcpstat.tcps_drops++; 1376 KASSERT(headlocked, ("%s: trimthenstep6: " 1377 "tcp_close: head not locked", __func__)); 1378 tp = tcp_close(tp); 1379 break; 1380 1381 case TCPS_CLOSING: 1382 case TCPS_LAST_ACK: 1383 KASSERT(headlocked, ("%s: trimthenstep6: " 1384 "tcp_close.2: head not locked", __func__)); 1385 tp = tcp_close(tp); 1386 break; 1387 } 1388 } 1389 goto drop; 1390 } 1391 1392 /* 1393 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1394 * and it's less than ts_recent, drop it. 1395 */ 1396 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 1397 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 1398 1399 /* Check to see if ts_recent is over 24 days old. */ 1400 if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1401 /* 1402 * Invalidate ts_recent. If this segment updates 1403 * ts_recent, the age will be reset later and ts_recent 1404 * will get a valid value. If it does not, setting 1405 * ts_recent to zero will at least satisfy the 1406 * requirement that zero be placed in the timestamp 1407 * echo reply when ts_recent isn't valid. The 1408 * age isn't reset until we get a valid ts_recent 1409 * because we don't want out-of-order segments to be 1410 * dropped when ts_recent is old. 
1411 */ 1412 tp->ts_recent = 0; 1413 } else { 1414 tcpstat.tcps_rcvduppack++; 1415 tcpstat.tcps_rcvdupbyte += tlen; 1416 tcpstat.tcps_pawsdrop++; 1417 if (tlen) 1418 goto dropafterack; 1419 goto drop; 1420 } 1421 } 1422 1423 /* 1424 * In the SYN-RECEIVED state, validate that the packet belongs to 1425 * this connection before trimming the data to fit the receive 1426 * window. Check the sequence number versus IRS since we know 1427 * the sequence numbers haven't wrapped. This is a partial fix 1428 * for the "LAND" DoS attack. 1429 */ 1430 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 1431 rstreason = BANDLIM_RST_OPENPORT; 1432 goto dropwithreset; 1433 } 1434 1435 todrop = tp->rcv_nxt - th->th_seq; 1436 if (todrop > 0) { 1437 if (thflags & TH_SYN) { 1438 thflags &= ~TH_SYN; 1439 th->th_seq++; 1440 if (th->th_urp > 1) 1441 th->th_urp--; 1442 else 1443 thflags &= ~TH_URG; 1444 todrop--; 1445 } 1446 /* 1447 * Following if statement from Stevens, vol. 2, p. 960. 1448 */ 1449 if (todrop > tlen 1450 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1451 /* 1452 * Any valid FIN must be to the left of the window. 1453 * At this point the FIN must be a duplicate or out 1454 * of sequence; drop it. 1455 */ 1456 thflags &= ~TH_FIN; 1457 1458 /* 1459 * Send an ACK to resynchronize and drop any data. 1460 * But keep on processing for RST or ACK. 1461 */ 1462 tp->t_flags |= TF_ACKNOW; 1463 todrop = tlen; 1464 tcpstat.tcps_rcvduppack++; 1465 tcpstat.tcps_rcvdupbyte += todrop; 1466 } else { 1467 tcpstat.tcps_rcvpartduppack++; 1468 tcpstat.tcps_rcvpartdupbyte += todrop; 1469 } 1470 drop_hdrlen += todrop; /* drop from the top afterwards */ 1471 th->th_seq += todrop; 1472 tlen -= todrop; 1473 if (th->th_urp > todrop) 1474 th->th_urp -= todrop; 1475 else { 1476 thflags &= ~TH_URG; 1477 th->th_urp = 0; 1478 } 1479 } 1480 1481 /* 1482 * If new data are received on a connection after the 1483 * user processes are gone, then RST the other end. 
1484 */ 1485 if ((so->so_state & SS_NOFDREF) && 1486 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1487 KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head " 1488 "not locked", __func__)); 1489 tp = tcp_close(tp); 1490 tcpstat.tcps_rcvafterclose++; 1491 rstreason = BANDLIM_UNLIMITED; 1492 goto dropwithreset; 1493 } 1494 1495 /* 1496 * If segment ends after window, drop trailing data 1497 * (and PUSH and FIN); if nothing left, just ACK. 1498 */ 1499 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 1500 if (todrop > 0) { 1501 tcpstat.tcps_rcvpackafterwin++; 1502 if (todrop >= tlen) { 1503 tcpstat.tcps_rcvbyteafterwin += tlen; 1504 /* 1505 * If window is closed can only take segments at 1506 * window edge, and have to drop data and PUSH from 1507 * incoming segments. Continue processing, but 1508 * remember to ack. Otherwise, drop segment 1509 * and ack. 1510 */ 1511 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1512 tp->t_flags |= TF_ACKNOW; 1513 tcpstat.tcps_rcvwinprobe++; 1514 } else 1515 goto dropafterack; 1516 } else 1517 tcpstat.tcps_rcvbyteafterwin += todrop; 1518 m_adj(m, -todrop); 1519 tlen -= todrop; 1520 thflags &= ~(TH_PUSH|TH_FIN); 1521 } 1522 1523 /* 1524 * If last ACK falls within this segment's sequence numbers, 1525 * record its timestamp. 1526 * NOTE: 1527 * 1) That the test incorporates suggestions from the latest 1528 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1529 * 2) That updating only on newer timestamps interferes with 1530 * our earlier PAWS tests, so this check should be solely 1531 * predicated on the sequence space of this segment. 1532 * 3) That we modify the segment boundary check to be 1533 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 1534 * instead of RFC1323's 1535 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 1536 * This modified check allows us to overcome RFC1323's 1537 * limitations as described in Stevens TCP/IP Illustrated 1538 * Vol. 2 p.869. 
In such cases, we can still calculate the 1539 * RTT correctly when RCV.NXT == Last.ACK.Sent. 1540 */ 1541 if ((to.to_flags & TOF_TS) != 0 && 1542 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1543 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1544 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 1545 tp->ts_recent_age = ticks; 1546 tp->ts_recent = to.to_tsval; 1547 } 1548 1549 /* 1550 * If a SYN is in the window, then this is an 1551 * error and we send an RST and drop the connection. 1552 */ 1553 if (thflags & TH_SYN) { 1554 KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: " 1555 "head not locked", __func__)); 1556 tp = tcp_drop(tp, ECONNRESET); 1557 rstreason = BANDLIM_UNLIMITED; 1558 goto drop; 1559 } 1560 1561 /* 1562 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 1563 * flag is on (half-synchronized state), then queue data for 1564 * later processing; else drop segment and return. 1565 */ 1566 if ((thflags & TH_ACK) == 0) { 1567 if (tp->t_state == TCPS_SYN_RECEIVED || 1568 (tp->t_flags & TF_NEEDSYN)) 1569 goto step6; 1570 else if (tp->t_flags & TF_ACKNOW) 1571 goto dropafterack; 1572 else 1573 goto drop; 1574 } 1575 1576 /* 1577 * Ack processing. 1578 */ 1579 switch (tp->t_state) { 1580 1581 /* 1582 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1583 * ESTABLISHED state and continue processing. 1584 * The ACK was checked above. 1585 */ 1586 case TCPS_SYN_RECEIVED: 1587 1588 tcpstat.tcps_connects++; 1589 soisconnected(so); 1590 /* Do window scaling? 
*/ 1591 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1592 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1593 tp->rcv_scale = tp->request_r_scale; 1594 tp->snd_wnd = tiwin; 1595 } 1596 /* 1597 * Make transitions: 1598 * SYN-RECEIVED -> ESTABLISHED 1599 * SYN-RECEIVED* -> FIN-WAIT-1 1600 */ 1601 tp->t_starttime = ticks; 1602 if (tp->t_flags & TF_NEEDFIN) { 1603 tp->t_state = TCPS_FIN_WAIT_1; 1604 tp->t_flags &= ~TF_NEEDFIN; 1605 } else { 1606 tp->t_state = TCPS_ESTABLISHED; 1607 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1608 } 1609 /* 1610 * If segment contains data or ACK, will call tcp_reass() 1611 * later; if not, do so now to pass queued data to user. 1612 */ 1613 if (tlen == 0 && (thflags & TH_FIN) == 0) 1614 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 1615 (struct mbuf *)0); 1616 tp->snd_wl1 = th->th_seq - 1; 1617 /* FALLTHROUGH */ 1618 1619 /* 1620 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1621 * ACKs. If the ack is in the range 1622 * tp->snd_una < th->th_ack <= tp->snd_max 1623 * then advance tp->snd_una to th->th_ack and drop 1624 * data from the retransmission queue. If this ACK reflects 1625 * more up to date window information we update our window information. 
1626 */ 1627 case TCPS_ESTABLISHED: 1628 case TCPS_FIN_WAIT_1: 1629 case TCPS_FIN_WAIT_2: 1630 case TCPS_CLOSE_WAIT: 1631 case TCPS_CLOSING: 1632 case TCPS_LAST_ACK: 1633 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1634 tcpstat.tcps_rcvacktoomuch++; 1635 goto dropafterack; 1636 } 1637 if ((tp->t_flags & TF_SACK_PERMIT) && 1638 ((to.to_flags & TOF_SACK) || 1639 !TAILQ_EMPTY(&tp->snd_holes))) 1640 tcp_sack_doack(tp, &to, th->th_ack); 1641 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1642 if (tlen == 0 && tiwin == tp->snd_wnd) { 1643 tcpstat.tcps_rcvdupack++; 1644 /* 1645 * If we have outstanding data (other than 1646 * a window probe), this is a completely 1647 * duplicate ack (ie, window info didn't 1648 * change), the ack is the biggest we've 1649 * seen and we've seen exactly our rexmt 1650 * threshhold of them, assume a packet 1651 * has been dropped and retransmit it. 1652 * Kludge snd_nxt & the congestion 1653 * window so we send only this one 1654 * packet. 1655 * 1656 * We know we're losing at the current 1657 * window size so do congestion avoidance 1658 * (set ssthresh to half the current window 1659 * and pull our congestion window back to 1660 * the new ssthresh). 1661 * 1662 * Dup acks mean that packets have left the 1663 * network (they're now cached at the receiver) 1664 * so bump cwnd by the amount in the receiver 1665 * to keep a constant cwnd packets in the 1666 * network. 1667 */ 1668 if (!tcp_timer_active(tp, TT_REXMT) || 1669 th->th_ack != tp->snd_una) 1670 tp->t_dupacks = 0; 1671 else if (++tp->t_dupacks > tcprexmtthresh || 1672 ((tcp_do_newreno || 1673 (tp->t_flags & TF_SACK_PERMIT)) && 1674 IN_FASTRECOVERY(tp))) { 1675 if ((tp->t_flags & TF_SACK_PERMIT) && 1676 IN_FASTRECOVERY(tp)) { 1677 int awnd; 1678 1679 /* 1680 * Compute the amount of data in flight first. 1681 * We can inject new data into the pipe iff 1682 * we have less than 1/2 the original window's 1683 * worth of data in flight. 
1684 */ 1685 awnd = (tp->snd_nxt - tp->snd_fack) + 1686 tp->sackhint.sack_bytes_rexmit; 1687 if (awnd < tp->snd_ssthresh) { 1688 tp->snd_cwnd += tp->t_maxseg; 1689 if (tp->snd_cwnd > tp->snd_ssthresh) 1690 tp->snd_cwnd = tp->snd_ssthresh; 1691 } 1692 } else 1693 tp->snd_cwnd += tp->t_maxseg; 1694 (void) tcp_output(tp); 1695 goto drop; 1696 } else if (tp->t_dupacks == tcprexmtthresh) { 1697 tcp_seq onxt = tp->snd_nxt; 1698 u_int win; 1699 1700 /* 1701 * If we're doing sack, check to 1702 * see if we're already in sack 1703 * recovery. If we're not doing sack, 1704 * check to see if we're in newreno 1705 * recovery. 1706 */ 1707 if (tp->t_flags & TF_SACK_PERMIT) { 1708 if (IN_FASTRECOVERY(tp)) { 1709 tp->t_dupacks = 0; 1710 break; 1711 } 1712 } else if (tcp_do_newreno) { 1713 if (SEQ_LEQ(th->th_ack, 1714 tp->snd_recover)) { 1715 tp->t_dupacks = 0; 1716 break; 1717 } 1718 } 1719 win = min(tp->snd_wnd, tp->snd_cwnd) / 1720 2 / tp->t_maxseg; 1721 if (win < 2) 1722 win = 2; 1723 tp->snd_ssthresh = win * tp->t_maxseg; 1724 ENTER_FASTRECOVERY(tp); 1725 tp->snd_recover = tp->snd_max; 1726 tcp_timer_activate(tp, TT_REXMT, 0); 1727 tp->t_rtttime = 0; 1728 if (tp->t_flags & TF_SACK_PERMIT) { 1729 tcpstat.tcps_sack_recovery_episode++; 1730 tp->sack_newdata = tp->snd_nxt; 1731 tp->snd_cwnd = tp->t_maxseg; 1732 (void) tcp_output(tp); 1733 goto drop; 1734 } 1735 tp->snd_nxt = th->th_ack; 1736 tp->snd_cwnd = tp->t_maxseg; 1737 (void) tcp_output(tp); 1738 KASSERT(tp->snd_limited <= 2, 1739 ("%s: tp->snd_limited too big", 1740 __func__)); 1741 tp->snd_cwnd = tp->snd_ssthresh + 1742 tp->t_maxseg * 1743 (tp->t_dupacks - tp->snd_limited); 1744 if (SEQ_GT(onxt, tp->snd_nxt)) 1745 tp->snd_nxt = onxt; 1746 goto drop; 1747 } else if (tcp_do_rfc3042) { 1748 u_long oldcwnd = tp->snd_cwnd; 1749 tcp_seq oldsndmax = tp->snd_max; 1750 u_int sent; 1751 1752 KASSERT(tp->t_dupacks == 1 || 1753 tp->t_dupacks == 2, 1754 ("%s: dupacks not 1 or 2", 1755 __func__)); 1756 if (tp->t_dupacks == 1) 1757 
tp->snd_limited = 0; 1758 tp->snd_cwnd = 1759 (tp->snd_nxt - tp->snd_una) + 1760 (tp->t_dupacks - tp->snd_limited) * 1761 tp->t_maxseg; 1762 (void) tcp_output(tp); 1763 sent = tp->snd_max - oldsndmax; 1764 if (sent > tp->t_maxseg) { 1765 KASSERT((tp->t_dupacks == 2 && 1766 tp->snd_limited == 0) || 1767 (sent == tp->t_maxseg + 1 && 1768 tp->t_flags & TF_SENTFIN), 1769 ("%s: sent too much", 1770 __func__)); 1771 tp->snd_limited = 2; 1772 } else if (sent > 0) 1773 ++tp->snd_limited; 1774 tp->snd_cwnd = oldcwnd; 1775 goto drop; 1776 } 1777 } else 1778 tp->t_dupacks = 0; 1779 break; 1780 } 1781 1782 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 1783 ("%s: th_ack <= snd_una", __func__)); 1784 1785 /* 1786 * If the congestion window was inflated to account 1787 * for the other side's cached packets, retract it. 1788 */ 1789 if (tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { 1790 if (IN_FASTRECOVERY(tp)) { 1791 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 1792 if (tp->t_flags & TF_SACK_PERMIT) 1793 tcp_sack_partialack(tp, th); 1794 else 1795 tcp_newreno_partial_ack(tp, th); 1796 } else { 1797 /* 1798 * Out of fast recovery. 1799 * Window inflation should have left us 1800 * with approximately snd_ssthresh 1801 * outstanding data. 1802 * But in case we would be inclined to 1803 * send a burst, better to do it via 1804 * the slow start mechanism. 1805 */ 1806 if (SEQ_GT(th->th_ack + 1807 tp->snd_ssthresh, 1808 tp->snd_max)) 1809 tp->snd_cwnd = tp->snd_max - 1810 th->th_ack + 1811 tp->t_maxseg; 1812 else 1813 tp->snd_cwnd = tp->snd_ssthresh; 1814 } 1815 } 1816 } else { 1817 if (tp->t_dupacks >= tcprexmtthresh && 1818 tp->snd_cwnd > tp->snd_ssthresh) 1819 tp->snd_cwnd = tp->snd_ssthresh; 1820 } 1821 tp->t_dupacks = 0; 1822 /* 1823 * If we reach this point, ACK is not a duplicate, 1824 * i.e., it ACKs something we sent. 
1825 */ 1826 if (tp->t_flags & TF_NEEDSYN) { 1827 /* 1828 * T/TCP: Connection was half-synchronized, and our 1829 * SYN has been ACK'd (so connection is now fully 1830 * synchronized). Go to non-starred state, 1831 * increment snd_una for ACK of SYN, and check if 1832 * we can do window scaling. 1833 */ 1834 tp->t_flags &= ~TF_NEEDSYN; 1835 tp->snd_una++; 1836 /* Do window scaling? */ 1837 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1838 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1839 tp->rcv_scale = tp->request_r_scale; 1840 /* Send window already scaled. */ 1841 } 1842 } 1843 1844process_ACK: 1845 KASSERT(headlocked, ("%s: process_ACK: head not locked", 1846 __func__)); 1847 INP_LOCK_ASSERT(tp->t_inpcb); 1848 1849 acked = th->th_ack - tp->snd_una; 1850 tcpstat.tcps_rcvackpack++; 1851 tcpstat.tcps_rcvackbyte += acked; 1852 1853 /* 1854 * If we just performed our first retransmit, and the ACK 1855 * arrives within our recovery window, then it was a mistake 1856 * to do the retransmit in the first place. Recover our 1857 * original cwnd and ssthresh, and proceed to transmit where 1858 * we left off. 1859 */ 1860 if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { 1861 ++tcpstat.tcps_sndrexmitbad; 1862 tp->snd_cwnd = tp->snd_cwnd_prev; 1863 tp->snd_ssthresh = tp->snd_ssthresh_prev; 1864 tp->snd_recover = tp->snd_recover_prev; 1865 if (tp->t_flags & TF_WASFRECOVERY) 1866 ENTER_FASTRECOVERY(tp); 1867 tp->snd_nxt = tp->snd_max; 1868 tp->t_badrxtwin = 0; /* XXX probably not required */ 1869 } 1870 1871 /* 1872 * If we have a timestamp reply, update smoothed 1873 * round trip time. If no timestamp is present but 1874 * transmit timer is running and timed sequence 1875 * number was acked, update smoothed round trip time. 1876 * Since we now have an rtt measurement, cancel the 1877 * timer backoff (cf., Phil Karn's retransmit alg.). 1878 * Recompute the initial retransmit timer. 
1879 * 1880 * Some boxes send broken timestamp replies 1881 * during the SYN+ACK phase, ignore 1882 * timestamps of 0 or we could calculate a 1883 * huge RTT and blow up the retransmit timer. 1884 */ 1885 if ((to.to_flags & TOF_TS) != 0 && 1886 to.to_tsecr) { 1887 if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) 1888 tp->t_rttlow = ticks - to.to_tsecr; 1889 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); 1890 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 1891 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 1892 tp->t_rttlow = ticks - tp->t_rtttime; 1893 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 1894 } 1895 tcp_xmit_bandwidth_limit(tp, th->th_ack); 1896 1897 /* 1898 * If all outstanding data is acked, stop retransmit 1899 * timer and remember to restart (more output or persist). 1900 * If there is more data to be acked, restart retransmit 1901 * timer, using current (possibly backed-off) value. 1902 */ 1903 if (th->th_ack == tp->snd_max) { 1904 tcp_timer_activate(tp, TT_REXMT, 0); 1905 needoutput = 1; 1906 } else if (!tcp_timer_active(tp, TT_PERSIST)) 1907 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 1908 1909 /* 1910 * If no data (only SYN) was ACK'd, 1911 * skip rest of ACK processing. 1912 */ 1913 if (acked == 0) 1914 goto step6; 1915 1916 /* 1917 * When new data is acked, open the congestion window. 1918 * If the window gives us less than ssthresh packets 1919 * in flight, open exponentially (maxseg per packet). 1920 * Otherwise open linearly: maxseg per window 1921 * (maxseg^2 / cwnd per packet). 
1922 */ 1923 if ((!tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || 1924 !IN_FASTRECOVERY(tp)) { 1925 u_int cw = tp->snd_cwnd; 1926 u_int incr = tp->t_maxseg; 1927 if (cw > tp->snd_ssthresh) 1928 incr = incr * incr / cw; 1929 tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); 1930 } 1931 SOCKBUF_LOCK(&so->so_snd); 1932 if (acked > so->so_snd.sb_cc) { 1933 tp->snd_wnd -= so->so_snd.sb_cc; 1934 sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); 1935 ourfinisacked = 1; 1936 } else { 1937 sbdrop_locked(&so->so_snd, acked); 1938 tp->snd_wnd -= acked; 1939 ourfinisacked = 0; 1940 } 1941 sowwakeup_locked(so); 1942 /* detect una wraparound */ 1943 if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && 1944 !IN_FASTRECOVERY(tp) && 1945 SEQ_GT(tp->snd_una, tp->snd_recover) && 1946 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1947 tp->snd_recover = th->th_ack - 1; 1948 if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && 1949 IN_FASTRECOVERY(tp) && 1950 SEQ_GEQ(th->th_ack, tp->snd_recover)) 1951 EXIT_FASTRECOVERY(tp); 1952 tp->snd_una = th->th_ack; 1953 if (tp->t_flags & TF_SACK_PERMIT) { 1954 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 1955 tp->snd_recover = tp->snd_una; 1956 } 1957 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1958 tp->snd_nxt = tp->snd_una; 1959 1960 switch (tp->t_state) { 1961 1962 /* 1963 * In FIN_WAIT_1 STATE in addition to the processing 1964 * for the ESTABLISHED state if our FIN is now acknowledged 1965 * then enter FIN_WAIT_2. 1966 */ 1967 case TCPS_FIN_WAIT_1: 1968 if (ourfinisacked) { 1969 /* 1970 * If we can't receive any more 1971 * data, then closing user can proceed. 1972 * Starting the timer is contrary to the 1973 * specification, but if we don't get a FIN 1974 * we'll hang forever. 1975 */ 1976 /* XXXjl 1977 * we should release the tp also, and use a 1978 * compressed state. 1979 */ 1980 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1981 int timeout; 1982 1983 soisdisconnected(so); 1984 timeout = (tcp_fast_finwait2_recycle) ? 
1985 tcp_finwait2_timeout : tcp_maxidle; 1986 tcp_timer_activate(tp, TT_2MSL, timeout); 1987 } 1988 tp->t_state = TCPS_FIN_WAIT_2; 1989 } 1990 break; 1991 1992 /* 1993 * In CLOSING STATE in addition to the processing for 1994 * the ESTABLISHED state if the ACK acknowledges our FIN 1995 * then enter the TIME-WAIT state, otherwise ignore 1996 * the segment. 1997 */ 1998 case TCPS_CLOSING: 1999 if (ourfinisacked) { 2000 KASSERT(headlocked, ("%s: process_ACK: " 2001 "head not locked", __func__)); 2002 tcp_twstart(tp); 2003 INP_INFO_WUNLOCK(&tcbinfo); 2004 headlocked = 0; 2005 m_freem(m); 2006 return; 2007 } 2008 break; 2009 2010 /* 2011 * In LAST_ACK, we may still be waiting for data to drain 2012 * and/or to be acked, as well as for the ack of our FIN. 2013 * If our FIN is now acknowledged, delete the TCB, 2014 * enter the closed state and return. 2015 */ 2016 case TCPS_LAST_ACK: 2017 if (ourfinisacked) { 2018 KASSERT(headlocked, ("%s: process_ACK: " 2019 "tcp_close: head not locked", __func__)); 2020 tp = tcp_close(tp); 2021 goto drop; 2022 } 2023 break; 2024 } 2025 } 2026 2027step6: 2028 KASSERT(headlocked, ("%s: step6: head not locked", __func__)); 2029 INP_LOCK_ASSERT(tp->t_inpcb); 2030 2031 /* 2032 * Update window information. 2033 * Don't look at window if no ACK: TAC's send garbage on first SYN. 2034 */ 2035 if ((thflags & TH_ACK) && 2036 (SEQ_LT(tp->snd_wl1, th->th_seq) || 2037 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 2038 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2039 /* keep track of pure window updates */ 2040 if (tlen == 0 && 2041 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2042 tcpstat.tcps_rcvwinupd++; 2043 tp->snd_wnd = tiwin; 2044 tp->snd_wl1 = th->th_seq; 2045 tp->snd_wl2 = th->th_ack; 2046 if (tp->snd_wnd > tp->max_sndwnd) 2047 tp->max_sndwnd = tp->snd_wnd; 2048 needoutput = 1; 2049 } 2050 2051 /* 2052 * Process segments with URG. 
2053 */ 2054 if ((thflags & TH_URG) && th->th_urp && 2055 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2056 /* 2057 * This is a kludge, but if we receive and accept 2058 * random urgent pointers, we'll crash in 2059 * soreceive. It's hard to imagine someone 2060 * actually wanting to send this much urgent data. 2061 */ 2062 SOCKBUF_LOCK(&so->so_rcv); 2063 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2064 th->th_urp = 0; /* XXX */ 2065 thflags &= ~TH_URG; /* XXX */ 2066 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 2067 goto dodata; /* XXX */ 2068 } 2069 /* 2070 * If this segment advances the known urgent pointer, 2071 * then mark the data stream. This should not happen 2072 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2073 * a FIN has been received from the remote side. 2074 * In these states we ignore the URG. 2075 * 2076 * According to RFC961 (Assigned Protocols), 2077 * the urgent pointer points to the last octet 2078 * of urgent data. We continue, however, 2079 * to consider it to indicate the first octet 2080 * of data past the urgent section as the original 2081 * spec states (in one of two places). 2082 */ 2083 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2084 tp->rcv_up = th->th_seq + th->th_urp; 2085 so->so_oobmark = so->so_rcv.sb_cc + 2086 (tp->rcv_up - tp->rcv_nxt) - 1; 2087 if (so->so_oobmark == 0) 2088 so->so_rcv.sb_state |= SBS_RCVATMARK; 2089 sohasoutofband(so); 2090 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2091 } 2092 SOCKBUF_UNLOCK(&so->so_rcv); 2093 /* 2094 * Remove out of band data so doesn't get presented to user. 2095 * This can happen independent of advancing the URG pointer, 2096 * but if two URG's are pending at once, some out-of-band 2097 * data may creep in... ick. 
2098 */ 2099 if (th->th_urp <= (u_long)tlen && 2100 !(so->so_options & SO_OOBINLINE)) { 2101 /* hdr drop is delayed */ 2102 tcp_pulloutofband(so, th, m, drop_hdrlen); 2103 } 2104 } else { 2105 /* 2106 * If no out of band data is expected, 2107 * pull receive urgent pointer along 2108 * with the receive window. 2109 */ 2110 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2111 tp->rcv_up = tp->rcv_nxt; 2112 } 2113dodata: /* XXX */ 2114 KASSERT(headlocked, ("%s: dodata: head not locked", __func__)); 2115 INP_LOCK_ASSERT(tp->t_inpcb); 2116 2117 /* 2118 * Process the segment text, merging it into the TCP sequencing queue, 2119 * and arranging for acknowledgment of receipt if necessary. 2120 * This process logically involves adjusting tp->rcv_wnd as data 2121 * is presented to the user (this happens in tcp_usrreq.c, 2122 * case PRU_RCVD). If a FIN has already been received on this 2123 * connection then we just ignore the text. 2124 */ 2125 if ((tlen || (thflags & TH_FIN)) && 2126 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2127 tcp_seq save_start = th->th_seq; 2128 tcp_seq save_end = th->th_seq + tlen; 2129 m_adj(m, drop_hdrlen); /* delayed header drop */ 2130 /* 2131 * Insert segment which includes th into TCP reassembly queue 2132 * with control block tp. Set thflags to whether reassembly now 2133 * includes a segment with FIN. This handles the common case 2134 * inline (segment is the next to be received on an established 2135 * connection, and the queue is empty), avoiding linkage into 2136 * and removal from the queue and repetition of various 2137 * conversions. 2138 * Set DELACK for segments received in order, but ack 2139 * immediately when segments are out of order (so 2140 * fast retransmit can work). 
2141 */ 2142 if (th->th_seq == tp->rcv_nxt && 2143 LIST_EMPTY(&tp->t_segq) && 2144 TCPS_HAVEESTABLISHED(tp->t_state)) { 2145 if (DELAY_ACK(tp)) 2146 tp->t_flags |= TF_DELACK; 2147 else 2148 tp->t_flags |= TF_ACKNOW; 2149 tp->rcv_nxt += tlen; 2150 thflags = th->th_flags & TH_FIN; 2151 tcpstat.tcps_rcvpack++; 2152 tcpstat.tcps_rcvbyte += tlen; 2153 ND6_HINT(tp); 2154 SOCKBUF_LOCK(&so->so_rcv); 2155 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 2156 m_freem(m); 2157 else 2158 sbappendstream_locked(&so->so_rcv, m); 2159 sorwakeup_locked(so); 2160 } else { 2161 thflags = tcp_reass(tp, th, &tlen, m); 2162 tp->t_flags |= TF_ACKNOW; 2163 } 2164 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 2165 tcp_update_sack_list(tp, save_start, save_end); 2166#if 0 2167 /* 2168 * Note the amount of data that peer has sent into 2169 * our window, in order to estimate the sender's 2170 * buffer size. 2171 * XXX: Unused. 2172 */ 2173 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2174#endif 2175 } else { 2176 m_freem(m); 2177 thflags &= ~TH_FIN; 2178 } 2179 2180 /* 2181 * If FIN is received ACK the FIN and let the user know 2182 * that the connection is closing. 2183 */ 2184 if (thflags & TH_FIN) { 2185 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2186 socantrcvmore(so); 2187 /* 2188 * If connection is half-synchronized 2189 * (ie NEEDSYN flag on) then delay ACK, 2190 * so it may be piggybacked when SYN is sent. 2191 * Otherwise, since we received a FIN then no 2192 * more input can be expected, send ACK now. 2193 */ 2194 if (tp->t_flags & TF_NEEDSYN) 2195 tp->t_flags |= TF_DELACK; 2196 else 2197 tp->t_flags |= TF_ACKNOW; 2198 tp->rcv_nxt++; 2199 } 2200 switch (tp->t_state) { 2201 2202 /* 2203 * In SYN_RECEIVED and ESTABLISHED STATES 2204 * enter the CLOSE_WAIT state. 
2205 */ 2206 case TCPS_SYN_RECEIVED: 2207 tp->t_starttime = ticks; 2208 /*FALLTHROUGH*/ 2209 case TCPS_ESTABLISHED: 2210 tp->t_state = TCPS_CLOSE_WAIT; 2211 break; 2212 2213 /* 2214 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2215 * enter the CLOSING state. 2216 */ 2217 case TCPS_FIN_WAIT_1: 2218 tp->t_state = TCPS_CLOSING; 2219 break; 2220 2221 /* 2222 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2223 * starting the time-wait timer, turning off the other 2224 * standard timers. 2225 */ 2226 case TCPS_FIN_WAIT_2: 2227 KASSERT(headlocked == 1, ("%s: dodata: " 2228 "TCP_FIN_WAIT_2: head not locked", __func__)); 2229 tcp_twstart(tp); 2230 INP_INFO_WUNLOCK(&tcbinfo); 2231 return; 2232 } 2233 } 2234 INP_INFO_WUNLOCK(&tcbinfo); 2235 headlocked = 0; 2236#ifdef TCPDEBUG 2237 if (so->so_options & SO_DEBUG) 2238 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 2239 &tcp_savetcp, 0); 2240#endif 2241 2242 /* 2243 * Return any desired output. 2244 */ 2245 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2246 (void) tcp_output(tp); 2247 2248check_delack: 2249 KASSERT(headlocked == 0, ("%s: check_delack: head locked", 2250 __func__)); 2251 INP_INFO_UNLOCK_ASSERT(&tcbinfo); 2252 INP_LOCK_ASSERT(tp->t_inpcb); 2253 if (tp->t_flags & TF_DELACK) { 2254 tp->t_flags &= ~TF_DELACK; 2255 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 2256 } 2257 INP_UNLOCK(tp->t_inpcb); 2258 return; 2259 2260dropafterack: 2261 KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__)); 2262 /* 2263 * Generate an ACK dropping incoming segment if it occupies 2264 * sequence space, where the ACK reflects our state. 2265 * 2266 * We can now skip the test for the RST flag since all 2267 * paths to this code happen after packets containing 2268 * RST have been dropped. 2269 * 2270 * In the SYN-RECEIVED state, don't send an ACK unless the 2271 * segment we received passes the SYN-RECEIVED ACK test. 2272 * If it fails send a RST. 
This breaks the loop in the 2273 * "LAND" DoS attack, and also prevents an ACK storm 2274 * between two listening ports that have been sent forged 2275 * SYN segments, each with the source address of the other. 2276 */ 2277 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 2278 (SEQ_GT(tp->snd_una, th->th_ack) || 2279 SEQ_GT(th->th_ack, tp->snd_max)) ) { 2280 rstreason = BANDLIM_RST_OPENPORT; 2281 goto dropwithreset; 2282 } 2283#ifdef TCPDEBUG 2284 if (so->so_options & SO_DEBUG) 2285 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2286 &tcp_savetcp, 0); 2287#endif 2288 KASSERT(headlocked, ("%s: headlocked should be 1", __func__)); 2289 INP_INFO_WUNLOCK(&tcbinfo); 2290 tp->t_flags |= TF_ACKNOW; 2291 (void) tcp_output(tp); 2292 INP_UNLOCK(tp->t_inpcb); 2293 m_freem(m); 2294 return; 2295 2296dropwithreset: 2297 KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__)); 2298 2299 tcp_dropwithreset(m, th, tp, tlen, rstreason); 2300 2301 if (tp != NULL) 2302 INP_UNLOCK(tp->t_inpcb); 2303 if (headlocked) 2304 INP_INFO_WUNLOCK(&tcbinfo); 2305 return; 2306 2307drop: 2308 /* 2309 * Drop space held by incoming segment and return. 2310 */ 2311#ifdef TCPDEBUG 2312 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 2313 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2314 &tcp_savetcp, 0); 2315#endif 2316 if (tp != NULL) 2317 INP_UNLOCK(tp->t_inpcb); 2318 if (headlocked) 2319 INP_INFO_WUNLOCK(&tcbinfo); 2320 m_freem(m); 2321 return; 2322} 2323 2324/* 2325 * Issue RST and make ACK acceptable to originator of segment. 2326 * The mbuf must still include the original packet header. 2327 * tp may be NULL. 2328 */ 2329static void 2330tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, 2331 int tlen, int rstreason) 2332{ 2333 struct ip *ip; 2334#ifdef INET6 2335 struct ip6_hdr *ip6; 2336#endif 2337 /* Don't bother if destination was broadcast/multicast. 
*/ 2338 if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 2339 goto drop; 2340#ifdef INET6 2341 if (mtod(m, struct ip *)->ip_v == 6) { 2342 ip6 = mtod(m, struct ip6_hdr *); 2343 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 2344 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 2345 goto drop; 2346 /* IPv6 anycast check is done at tcp6_input() */ 2347 } else 2348#endif 2349 { 2350 ip = mtod(m, struct ip *); 2351 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 2352 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 2353 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 2354 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 2355 goto drop; 2356 } 2357 2358 /* Perform bandwidth limiting. */ 2359 if (badport_bandlim(rstreason) < 0) 2360 goto drop; 2361 2362 /* tcp_respond consumes the mbuf chain. */ 2363 if (th->th_flags & TH_ACK) { 2364 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, 2365 th->th_ack, TH_RST); 2366 } else { 2367 if (th->th_flags & TH_SYN) 2368 tlen++; 2369 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 2370 (tcp_seq)0, TH_RST|TH_ACK); 2371 } 2372 return; 2373drop: 2374 m_freem(m); 2375 return; 2376} 2377 2378/* 2379 * Parse TCP options and place in tcpopt. 
2380 */ 2381static void 2382tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) 2383{ 2384 int opt, optlen; 2385 2386 to->to_flags = 0; 2387 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2388 opt = cp[0]; 2389 if (opt == TCPOPT_EOL) 2390 break; 2391 if (opt == TCPOPT_NOP) 2392 optlen = 1; 2393 else { 2394 if (cnt < 2) 2395 break; 2396 optlen = cp[1]; 2397 if (optlen < 2 || optlen > cnt) 2398 break; 2399 } 2400 switch (opt) { 2401 case TCPOPT_MAXSEG: 2402 if (optlen != TCPOLEN_MAXSEG) 2403 continue; 2404 if (!(flags & TO_SYN)) 2405 continue; 2406 to->to_flags |= TOF_MSS; 2407 bcopy((char *)cp + 2, 2408 (char *)&to->to_mss, sizeof(to->to_mss)); 2409 to->to_mss = ntohs(to->to_mss); 2410 break; 2411 case TCPOPT_WINDOW: 2412 if (optlen != TCPOLEN_WINDOW) 2413 continue; 2414 if (!(flags & TO_SYN)) 2415 continue; 2416 to->to_flags |= TOF_SCALE; 2417 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); 2418 break; 2419 case TCPOPT_TIMESTAMP: 2420 if (optlen != TCPOLEN_TIMESTAMP) 2421 continue; 2422 to->to_flags |= TOF_TS; 2423 bcopy((char *)cp + 2, 2424 (char *)&to->to_tsval, sizeof(to->to_tsval)); 2425 to->to_tsval = ntohl(to->to_tsval); 2426 bcopy((char *)cp + 6, 2427 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 2428 to->to_tsecr = ntohl(to->to_tsecr); 2429 break; 2430#ifdef TCP_SIGNATURE 2431 /* 2432 * XXX In order to reply to a host which has set the 2433 * TCP_SIGNATURE option in its initial SYN, we have to 2434 * record the fact that the option was observed here 2435 * for the syncache code to perform the correct response. 
2436 */ 2437 case TCPOPT_SIGNATURE: 2438 if (optlen != TCPOLEN_SIGNATURE) 2439 continue; 2440 to->to_flags |= TOF_SIGNATURE; 2441 to->to_signature = cp + 2; 2442 break; 2443#endif 2444 case TCPOPT_SACK_PERMITTED: 2445 if (optlen != TCPOLEN_SACK_PERMITTED) 2446 continue; 2447 if (!(flags & TO_SYN)) 2448 continue; 2449 if (!tcp_do_sack) 2450 continue; 2451 to->to_flags |= TOF_SACKPERM; 2452 break; 2453 case TCPOPT_SACK: 2454 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2455 continue; 2456 if (flags & TO_SYN) 2457 continue; 2458 to->to_flags |= TOF_SACK; 2459 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; 2460 to->to_sacks = cp + 2; 2461 tcpstat.tcps_sack_rcv_blocks++; 2462 break; 2463 default: 2464 continue; 2465 } 2466 } 2467} 2468 2469/* 2470 * Pull out of band byte out of a segment so 2471 * it doesn't appear in the user's data queue. 2472 * It is still reflected in the segment length for 2473 * sequencing purposes. 2474 */ 2475static void 2476tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, 2477 int off) 2478{ 2479 int cnt = off + th->th_urp - 1; 2480 2481 while (cnt >= 0) { 2482 if (m->m_len > cnt) { 2483 char *cp = mtod(m, caddr_t) + cnt; 2484 struct tcpcb *tp = sototcpcb(so); 2485 2486 tp->t_iobc = *cp; 2487 tp->t_oobflags |= TCPOOB_HAVEDATA; 2488 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2489 m->m_len--; 2490 if (m->m_flags & M_PKTHDR) 2491 m->m_pkthdr.len--; 2492 return; 2493 } 2494 cnt -= m->m_len; 2495 m = m->m_next; 2496 if (m == NULL) 2497 break; 2498 } 2499 panic("tcp_pulloutofband"); 2500} 2501 2502/* 2503 * Collect new round-trip time estimate 2504 * and update averages and current timeout. 2505 */ 2506static void 2507tcp_xmit_timer(struct tcpcb *tp, int rtt) 2508{ 2509 int delta; 2510 2511 INP_LOCK_ASSERT(tp->t_inpcb); 2512 2513 tcpstat.tcps_rttupdated++; 2514 tp->t_rttupdated++; 2515 if (tp->t_srtt != 0) { 2516 /* 2517 * srtt is stored as fixed point with 5 bits after the 2518 * binary point (i.e., scaled by 8). 
The following magic 2519 * is equivalent to the smoothing algorithm in rfc793 with 2520 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2521 * point). Adjust rtt to origin 0. 2522 */ 2523 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 2524 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 2525 2526 if ((tp->t_srtt += delta) <= 0) 2527 tp->t_srtt = 1; 2528 2529 /* 2530 * We accumulate a smoothed rtt variance (actually, a 2531 * smoothed mean difference), then set the retransmit 2532 * timer to smoothed rtt + 4 times the smoothed variance. 2533 * rttvar is stored as fixed point with 4 bits after the 2534 * binary point (scaled by 16). The following is 2535 * equivalent to rfc793 smoothing with an alpha of .75 2536 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2537 * rfc793's wired-in beta. 2538 */ 2539 if (delta < 0) 2540 delta = -delta; 2541 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 2542 if ((tp->t_rttvar += delta) <= 0) 2543 tp->t_rttvar = 1; 2544 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 2545 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 2546 } else { 2547 /* 2548 * No rtt measurement yet - use the unsmoothed rtt. 2549 * Set the variance to half the rtt (so our first 2550 * retransmit happens at 3*rtt). 2551 */ 2552 tp->t_srtt = rtt << TCP_RTT_SHIFT; 2553 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 2554 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 2555 } 2556 tp->t_rtttime = 0; 2557 tp->t_rxtshift = 0; 2558 2559 /* 2560 * the retransmit should happen at rtt + 4 * rttvar. 2561 * Because of the way we do the smoothing, srtt and rttvar 2562 * will each average +1/2 tick of bias. When we compute 2563 * the retransmit timer, we want 1/2 tick of rounding and 2564 * 1 extra tick because of +-1/2 tick uncertainty in the 2565 * firing of the timer. The bias will give us exactly the 2566 * 1.5 tick we need. But, because the bias is 2567 * statistical, we have to test that we don't drop below 2568 * the minimum feasible timer (which is 2 ticks). 
2569 */ 2570 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 2571 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 2572 2573 /* 2574 * We received an ack for a packet that wasn't retransmitted; 2575 * it is probably safe to discard any error indications we've 2576 * received recently. This isn't quite right, but close enough 2577 * for now (a route might have failed after we sent a segment, 2578 * and the return path might not be symmetrical). 2579 */ 2580 tp->t_softerror = 0; 2581} 2582 2583/* 2584 * Determine a reasonable value for maxseg size. 2585 * If the route is known, check route for mtu. 2586 * If none, use an mss that can be handled on the outgoing 2587 * interface without forcing IP to fragment; if bigger than 2588 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2589 * to utilize large mbufs. If no route is found, route has no mtu, 2590 * or the destination isn't local, use a default, hopefully conservative 2591 * size (usually 512 or the default IP max size, but no more than the mtu 2592 * of the interface), as we can't discover anything about intervening 2593 * gateways or networks. We also initialize the congestion/slow start 2594 * window to be a single segment if the destination isn't local. 2595 * While looking at the routing entry, we also initialize other path-dependent 2596 * parameters from pre-set or cached values in the routing entry. 2597 * 2598 * Also take into account the space needed for options that we 2599 * send regularly. Make maxseg shorter by that amount to assure 2600 * that we can send maxseg amount of data even when the options 2601 * are present. Store the upper limit of the length of options plus 2602 * data in maxopd. 2603 * 2604 * 2605 * In case of T/TCP, we call this routine during implicit connection 2606 * setup as well (offer = -1), to initialize maxseg from the cached 2607 * MSS of our peer. 2608 * 2609 * NOTE that this routine is only called when we process an incoming 2610 * segment. 
Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). 2611 */ 2612void 2613tcp_mss(struct tcpcb *tp, int offer) 2614{ 2615 int rtt, mss; 2616 u_long bufsize; 2617 u_long maxmtu; 2618 struct inpcb *inp = tp->t_inpcb; 2619 struct socket *so; 2620 struct hc_metrics_lite metrics; 2621 int origoffer = offer; 2622 int mtuflags = 0; 2623#ifdef INET6 2624 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 2625 size_t min_protoh = isipv6 ? 2626 sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : 2627 sizeof (struct tcpiphdr); 2628#else 2629 const size_t min_protoh = sizeof(struct tcpiphdr); 2630#endif 2631 2632 /* initialize */ 2633#ifdef INET6 2634 if (isipv6) { 2635 maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags); 2636 tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; 2637 } else 2638#endif 2639 { 2640 maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags); 2641 tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; 2642 } 2643 so = inp->inp_socket; 2644 2645 /* 2646 * no route to sender, stay with default mss and return 2647 */ 2648 if (maxmtu == 0) 2649 return; 2650 2651 /* what have we got? */ 2652 switch (offer) { 2653 case 0: 2654 /* 2655 * Offer == 0 means that there was no MSS on the SYN 2656 * segment, in this case we use tcp_mssdflt. 2657 */ 2658 offer = 2659#ifdef INET6 2660 isipv6 ? tcp_v6mssdflt : 2661#endif 2662 tcp_mssdflt; 2663 break; 2664 2665 case -1: 2666 /* 2667 * Offer == -1 means that we didn't receive SYN yet. 2668 */ 2669 /* FALLTHROUGH */ 2670 2671 default: 2672 /* 2673 * Prevent DoS attack with too small MSS. Round up 2674 * to at least minmss. 2675 */ 2676 offer = max(offer, tcp_minmss); 2677 /* 2678 * Sanity check: make sure that maxopd will be large 2679 * enough to allow some data on segments even if the 2680 * all the option space is used (40bytes). Otherwise 2681 * funny things may happen in tcp_output. 
2682 */ 2683 offer = max(offer, 64); 2684 } 2685 2686 /* 2687 * rmx information is now retrieved from tcp_hostcache 2688 */ 2689 tcp_hc_get(&inp->inp_inc, &metrics); 2690 2691 /* 2692 * if there's a discovered mtu int tcp hostcache, use it 2693 * else, use the link mtu. 2694 */ 2695 if (metrics.rmx_mtu) 2696 mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; 2697 else { 2698#ifdef INET6 2699 if (isipv6) { 2700 mss = maxmtu - min_protoh; 2701 if (!path_mtu_discovery && 2702 !in6_localaddr(&inp->in6p_faddr)) 2703 mss = min(mss, tcp_v6mssdflt); 2704 } else 2705#endif 2706 { 2707 mss = maxmtu - min_protoh; 2708 if (!path_mtu_discovery && 2709 !in_localaddr(inp->inp_faddr)) 2710 mss = min(mss, tcp_mssdflt); 2711 } 2712 } 2713 mss = min(mss, offer); 2714 2715 /* 2716 * maxopd stores the maximum length of data AND options 2717 * in a segment; maxseg is the amount of data in a normal 2718 * segment. We need to store this value (maxopd) apart 2719 * from maxseg, because now every segment carries options 2720 * and thus we normally have somewhat less data in segments. 2721 */ 2722 tp->t_maxopd = mss; 2723 2724 /* 2725 * origoffer==-1 indicates, that no segments were received yet. 2726 * In this case we just guess. 2727 */ 2728 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2729 (origoffer == -1 || 2730 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) 2731 mss -= TCPOLEN_TSTAMP_APPA; 2732 tp->t_maxseg = mss; 2733 2734#if (MCLBYTES & (MCLBYTES - 1)) == 0 2735 if (mss > MCLBYTES) 2736 mss &= ~(MCLBYTES-1); 2737#else 2738 if (mss > MCLBYTES) 2739 mss = mss / MCLBYTES * MCLBYTES; 2740#endif 2741 tp->t_maxseg = mss; 2742 2743 /* 2744 * If there's a pipesize, change the socket buffer to that size, 2745 * don't change if sb_hiwat is different than default (then it 2746 * has been changed on purpose with setsockopt). 2747 * Make the socket buffers an integral number of mss units; 2748 * if the mss is larger than the socket buffer, decrease the mss. 
2749 */ 2750 SOCKBUF_LOCK(&so->so_snd); 2751 if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) 2752 bufsize = metrics.rmx_sendpipe; 2753 else 2754 bufsize = so->so_snd.sb_hiwat; 2755 if (bufsize < mss) 2756 mss = bufsize; 2757 else { 2758 bufsize = roundup(bufsize, mss); 2759 if (bufsize > sb_max) 2760 bufsize = sb_max; 2761 if (bufsize > so->so_snd.sb_hiwat) 2762 (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); 2763 } 2764 SOCKBUF_UNLOCK(&so->so_snd); 2765 tp->t_maxseg = mss; 2766 2767 SOCKBUF_LOCK(&so->so_rcv); 2768 if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) 2769 bufsize = metrics.rmx_recvpipe; 2770 else 2771 bufsize = so->so_rcv.sb_hiwat; 2772 if (bufsize > mss) { 2773 bufsize = roundup(bufsize, mss); 2774 if (bufsize > sb_max) 2775 bufsize = sb_max; 2776 if (bufsize > so->so_rcv.sb_hiwat) 2777 (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); 2778 } 2779 SOCKBUF_UNLOCK(&so->so_rcv); 2780 /* 2781 * While we're here, check the others too 2782 */ 2783 if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { 2784 tp->t_srtt = rtt; 2785 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; 2786 tcpstat.tcps_usedrtt++; 2787 if (metrics.rmx_rttvar) { 2788 tp->t_rttvar = metrics.rmx_rttvar; 2789 tcpstat.tcps_usedrttvar++; 2790 } else { 2791 /* default variation is +- 1 rtt */ 2792 tp->t_rttvar = 2793 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; 2794 } 2795 TCPT_RANGESET(tp->t_rxtcur, 2796 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, 2797 tp->t_rttmin, TCPTV_REXMTMAX); 2798 } 2799 if (metrics.rmx_ssthresh) { 2800 /* 2801 * There's some sort of gateway or interface 2802 * buffer limit on the path. Use this to set 2803 * the slow start threshhold, but set the 2804 * threshold to no less than 2*mss. 
2805 */ 2806 tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); 2807 tcpstat.tcps_usedssthresh++; 2808 } 2809 if (metrics.rmx_bandwidth) 2810 tp->snd_bandwidth = metrics.rmx_bandwidth; 2811 2812 /* 2813 * Set the slow-start flight size depending on whether this 2814 * is a local network or not. 2815 * 2816 * Extend this so we cache the cwnd too and retrieve it here. 2817 * Make cwnd even bigger than RFC3390 suggests but only if we 2818 * have previous experience with the remote host. Be careful 2819 * not make cwnd bigger than remote receive window or our own 2820 * send socket buffer. Maybe put some additional upper bound 2821 * on the retrieved cwnd. Should do incremental updates to 2822 * hostcache when cwnd collapses so next connection doesn't 2823 * overloads the path again. 2824 * 2825 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. 2826 * We currently check only in syncache_socket for that. 2827 */ 2828#define TCP_METRICS_CWND 2829#ifdef TCP_METRICS_CWND 2830 if (metrics.rmx_cwnd) 2831 tp->snd_cwnd = max(mss, 2832 min(metrics.rmx_cwnd / 2, 2833 min(tp->snd_wnd, so->so_snd.sb_hiwat))); 2834 else 2835#endif 2836 if (tcp_do_rfc3390) 2837 tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); 2838#ifdef INET6 2839 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || 2840 (!isipv6 && in_localaddr(inp->inp_faddr))) 2841#else 2842 else if (in_localaddr(inp->inp_faddr)) 2843#endif 2844 tp->snd_cwnd = mss * ss_fltsz_local; 2845 else 2846 tp->snd_cwnd = mss * ss_fltsz; 2847 2848 /* Check the interface for TSO capabilities. */ 2849 if (mtuflags & CSUM_TSO) 2850 tp->t_flags |= TF_TSO; 2851} 2852 2853/* 2854 * Determine the MSS option to send on an outgoing SYN. 2855 */ 2856int 2857tcp_mssopt(struct in_conninfo *inc) 2858{ 2859 int mss = 0; 2860 u_long maxmtu = 0; 2861 u_long thcmtu = 0; 2862 size_t min_protoh; 2863#ifdef INET6 2864 int isipv6 = inc->inc_isipv6 ? 
1 : 0; 2865#endif 2866 2867 KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); 2868 2869#ifdef INET6 2870 if (isipv6) { 2871 mss = tcp_v6mssdflt; 2872 maxmtu = tcp_maxmtu6(inc, NULL); 2873 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 2874 min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 2875 } else 2876#endif 2877 { 2878 mss = tcp_mssdflt; 2879 maxmtu = tcp_maxmtu(inc, NULL); 2880 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 2881 min_protoh = sizeof(struct tcpiphdr); 2882 } 2883 if (maxmtu && thcmtu) 2884 mss = min(maxmtu, thcmtu) - min_protoh; 2885 else if (maxmtu || thcmtu) 2886 mss = max(maxmtu, thcmtu) - min_protoh; 2887 2888 return (mss); 2889} 2890 2891 2892/* 2893 * On a partial ack arrives, force the retransmission of the 2894 * next unacknowledged segment. Do not clear tp->t_dupacks. 2895 * By setting snd_nxt to ti_ack, this forces retransmission timer to 2896 * be started again. 2897 */ 2898static void 2899tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) 2900{ 2901 tcp_seq onxt = tp->snd_nxt; 2902 u_long ocwnd = tp->snd_cwnd; 2903 2904 tcp_timer_activate(tp, TT_REXMT, 0); 2905 tp->t_rtttime = 0; 2906 tp->snd_nxt = th->th_ack; 2907 /* 2908 * Set snd_cwnd to one segment beyond acknowledged offset. 2909 * (tp->snd_una has not yet been updated when this function is called.) 2910 */ 2911 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 2912 tp->t_flags |= TF_ACKNOW; 2913 (void) tcp_output(tp); 2914 tp->snd_cwnd = ocwnd; 2915 if (SEQ_GT(onxt, tp->snd_nxt)) 2916 tp->snd_nxt = onxt; 2917 /* 2918 * Partial window deflation. Relies on fact that tp->snd_una 2919 * not updated yet. 2920 */ 2921 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 2922 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2923 else 2924 tp->snd_cwnd = 0; 2925 tp->snd_cwnd += tp->t_maxseg; 2926} 2927