tcp_input.c revision 170078
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
28 * 29 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 30 * $FreeBSD: head/sys/netinet/tcp_input.c 170078 2007-05-28 23:27:44Z andre $ 31 */ 32 33#include "opt_ipfw.h" /* for ipfw_fwd */ 34#include "opt_inet.h" 35#include "opt_inet6.h" 36#include "opt_ipsec.h" 37#include "opt_mac.h" 38#include "opt_tcpdebug.h" 39 40#include <sys/param.h> 41#include <sys/kernel.h> 42#include <sys/malloc.h> 43#include <sys/mbuf.h> 44#include <sys/proc.h> /* for proc0 declaration */ 45#include <sys/protosw.h> 46#include <sys/signalvar.h> 47#include <sys/socket.h> 48#include <sys/socketvar.h> 49#include <sys/sysctl.h> 50#include <sys/syslog.h> 51#include <sys/systm.h> 52 53#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 54 55#include <vm/uma.h> 56 57#include <net/if.h> 58#include <net/route.h> 59 60#include <netinet/in.h> 61#include <netinet/in_pcb.h> 62#include <netinet/in_systm.h> 63#include <netinet/in_var.h> 64#include <netinet/ip.h> 65#include <netinet/ip_icmp.h> /* required for icmp_var.h */ 66#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 67#include <netinet/ip_var.h> 68#include <netinet/ip_options.h> 69#include <netinet/ip6.h> 70#include <netinet/icmp6.h> 71#include <netinet6/in6_pcb.h> 72#include <netinet6/ip6_var.h> 73#include <netinet6/nd6.h> 74#include <netinet/tcp.h> 75#include <netinet/tcp_fsm.h> 76#include <netinet/tcp_seq.h> 77#include <netinet/tcp_timer.h> 78#include <netinet/tcp_var.h> 79#include <netinet6/tcp6_var.h> 80#include <netinet/tcpip.h> 81#ifdef TCPDEBUG 82#include <netinet/tcp_debug.h> 83#endif /* TCPDEBUG */ 84 85#ifdef FAST_IPSEC 86#include <netipsec/ipsec.h> 87#include <netipsec/ipsec6.h> 88#endif /*FAST_IPSEC*/ 89 90#ifdef IPSEC 91#include <netinet6/ipsec.h> 92#include <netinet6/ipsec6.h> 93#include <netkey/key.h> 94#endif /*IPSEC*/ 95 96#include <machine/in_cksum.h> 97 98#include <security/mac/mac_framework.h> 99 100static const int tcprexmtthresh = 3; 101 102struct tcpstat tcpstat; 103SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, 
stats, CTLFLAG_RW, 104 &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); 105 106static int tcp_log_in_vain = 0; 107SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, 108 &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); 109 110static int blackhole = 0; 111SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, 112 &blackhole, 0, "Do not send RST on segments to closed ports"); 113 114int tcp_delack_enabled = 1; 115SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, 116 &tcp_delack_enabled, 0, 117 "Delay ACK to try and piggyback it onto a data packet"); 118 119static int drop_synfin = 0; 120SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, 121 &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); 122 123static int tcp_do_rfc3042 = 1; 124SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, 125 &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); 126 127static int tcp_do_rfc3390 = 1; 128SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, 129 &tcp_do_rfc3390, 0, 130 "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); 131 132static int tcp_insecure_rst = 0; 133SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, 134 &tcp_insecure_rst, 0, 135 "Follow the old (insecure) criteria for accepting RST packets"); 136 137int tcp_do_autorcvbuf = 1; 138SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, 139 &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing"); 140 141int tcp_autorcvbuf_inc = 16*1024; 142SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, 143 &tcp_autorcvbuf_inc, 0, 144 "Incrementor step size of automatic receive buffer"); 145 146int tcp_autorcvbuf_max = 256*1024; 147SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, 148 &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer"); 149 150struct inpcbhead tcb; 151#define tcb6 tcb /* for KAME src sync over BSD*'s */ 152struct inpcbinfo tcbinfo; 153 154static 
void tcp_dooptions(struct tcpopt *, u_char *, int, int); 155static void tcp_do_segment(struct mbuf *, struct tcphdr *, 156 struct socket *, struct tcpcb *, int, int); 157static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, 158 struct tcpcb *, int, int); 159static void tcp_pulloutofband(struct socket *, 160 struct tcphdr *, struct mbuf *, int); 161static void tcp_xmit_timer(struct tcpcb *, int); 162static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); 163 164/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ 165#ifdef INET6 166#define ND6_HINT(tp) \ 167do { \ 168 if ((tp) && (tp)->t_inpcb && \ 169 ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ 170 nd6_nud_hint(NULL, NULL, 0); \ 171} while (0) 172#else 173#define ND6_HINT(tp) 174#endif 175 176/* 177 * Indicate whether this ack should be delayed. We can delay the ack if 178 * - there is no delayed ack timer in progress and 179 * - our last ack wasn't a 0-sized window. We never want to delay 180 * the ack that opens up a 0-sized window and 181 * - delayed acks are enabled or 182 * - this is a half-synchronized T/TCP connection. 
183 */ 184#define DELAY_ACK(tp) \ 185 ((!tcp_timer_active(tp, TT_DELACK) && \ 186 (tp->t_flags & TF_RXWIN0SENT) == 0) && \ 187 (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) 188 189 190/* 191 * TCP input handling is split into multiple parts: 192 * tcp6_input is a thin wrapper around tcp_input for the extended 193 * ip6_protox[] call format in ip6_input 194 * tcp_input handles primary segment validation, inpcb lookup and 195 * SYN processing on listen sockets 196 * tcp_do_segment processes the ACK and text of the segment for 197 * establishing, established and closing connections 198 */ 199#ifdef INET6 200int 201tcp6_input(struct mbuf **mp, int *offp, int proto) 202{ 203 struct mbuf *m = *mp; 204 struct in6_ifaddr *ia6; 205 206 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); 207 208 /* 209 * draft-itojun-ipv6-tcp-to-anycast 210 * better place to put this in? 211 */ 212 ia6 = ip6_getdstifaddr(m); 213 if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { 214 struct ip6_hdr *ip6; 215 216 ip6 = mtod(m, struct ip6_hdr *); 217 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 218 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 219 return IPPROTO_DONE; 220 } 221 222 tcp_input(m, *offp); 223 return IPPROTO_DONE; 224} 225#endif 226 227void 228tcp_input(struct mbuf *m, int off0) 229{ 230 struct tcphdr *th; 231 struct ip *ip = NULL; 232 struct ipovly *ipov; 233 struct inpcb *inp = NULL; 234 struct tcpcb *tp = NULL; 235 struct socket *so = NULL; 236 u_char *optp = NULL; 237 int optlen = 0; 238 int len, tlen, off; 239 int drop_hdrlen; 240 int thflags; 241 int rstreason = 0; /* For badport_bandlim accounting purposes */ 242#ifdef IPFIREWALL_FORWARD 243 struct m_tag *fwd_tag; 244#endif 245#ifdef INET6 246 struct ip6_hdr *ip6 = NULL; 247 int isipv6; 248#else 249 const void *ip6 = NULL; 250 const int isipv6 = 0; 251#endif 252 struct tcpopt to; /* options in this segment */ 253 char *s = NULL; /* address and port logging */ 254 255#ifdef TCPDEBUG 256 /* 257 * The 
size of tcp_saveipgen must be the size of the max ip header, 258 * now IPv6. 259 */ 260 u_char tcp_saveipgen[IP6_HDR_LEN]; 261 struct tcphdr tcp_savetcp; 262 short ostate = 0; 263#endif 264 265#ifdef INET6 266 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; 267#endif 268 269 to.to_flags = 0; 270 tcpstat.tcps_rcvtotal++; 271 272 if (isipv6) { 273#ifdef INET6 274 /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ 275 ip6 = mtod(m, struct ip6_hdr *); 276 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; 277 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { 278 tcpstat.tcps_rcvbadsum++; 279 goto drop; 280 } 281 th = (struct tcphdr *)((caddr_t)ip6 + off0); 282 283 /* 284 * Be proactive about unspecified IPv6 address in source. 285 * As we use all-zero to indicate unbounded/unconnected pcb, 286 * unspecified IPv6 address can be used to confuse us. 287 * 288 * Note that packets with unspecified IPv6 destination is 289 * already dropped in ip6_input. 290 */ 291 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 292 /* XXX stat */ 293 goto drop; 294 } 295#else 296 th = NULL; /* XXX: Avoid compiler warning. */ 297#endif 298 } else { 299 /* 300 * Get IP and TCP header together in first mbuf. 301 * Note: IP leaves IP header in first mbuf. 
302 */ 303 if (off0 > sizeof (struct ip)) { 304 ip_stripoptions(m, (struct mbuf *)0); 305 off0 = sizeof(struct ip); 306 } 307 if (m->m_len < sizeof (struct tcpiphdr)) { 308 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) 309 == NULL) { 310 tcpstat.tcps_rcvshort++; 311 return; 312 } 313 } 314 ip = mtod(m, struct ip *); 315 ipov = (struct ipovly *)ip; 316 th = (struct tcphdr *)((caddr_t)ip + off0); 317 tlen = ip->ip_len; 318 319 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 320 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 321 th->th_sum = m->m_pkthdr.csum_data; 322 else 323 th->th_sum = in_pseudo(ip->ip_src.s_addr, 324 ip->ip_dst.s_addr, 325 htonl(m->m_pkthdr.csum_data + 326 ip->ip_len + 327 IPPROTO_TCP)); 328 th->th_sum ^= 0xffff; 329#ifdef TCPDEBUG 330 ipov->ih_len = (u_short)tlen; 331 ipov->ih_len = htons(ipov->ih_len); 332#endif 333 } else { 334 /* 335 * Checksum extended TCP header and data. 336 */ 337 len = sizeof (struct ip) + tlen; 338 bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 339 ipov->ih_len = (u_short)tlen; 340 ipov->ih_len = htons(ipov->ih_len); 341 th->th_sum = in_cksum(m, len); 342 } 343 if (th->th_sum) { 344 tcpstat.tcps_rcvbadsum++; 345 goto drop; 346 } 347 /* Re-initialization for later version check */ 348 ip->ip_v = IPVERSION; 349 } 350 351 /* 352 * Check that TCP offset makes sense, 353 * pull out TCP options and adjust length. 
XXX 354 */ 355 off = th->th_off << 2; 356 if (off < sizeof (struct tcphdr) || off > tlen) { 357 tcpstat.tcps_rcvbadoff++; 358 goto drop; 359 } 360 tlen -= off; /* tlen is used instead of ti->ti_len */ 361 if (off > sizeof (struct tcphdr)) { 362 if (isipv6) { 363#ifdef INET6 364 IP6_EXTHDR_CHECK(m, off0, off, ); 365 ip6 = mtod(m, struct ip6_hdr *); 366 th = (struct tcphdr *)((caddr_t)ip6 + off0); 367#endif 368 } else { 369 if (m->m_len < sizeof(struct ip) + off) { 370 if ((m = m_pullup(m, sizeof (struct ip) + off)) 371 == NULL) { 372 tcpstat.tcps_rcvshort++; 373 return; 374 } 375 ip = mtod(m, struct ip *); 376 ipov = (struct ipovly *)ip; 377 th = (struct tcphdr *)((caddr_t)ip + off0); 378 } 379 } 380 optlen = off - sizeof (struct tcphdr); 381 optp = (u_char *)(th + 1); 382 } 383 thflags = th->th_flags; 384 385 /* 386 * Convert TCP protocol specific fields to host format. 387 */ 388 th->th_seq = ntohl(th->th_seq); 389 th->th_ack = ntohl(th->th_ack); 390 th->th_win = ntohs(th->th_win); 391 th->th_urp = ntohs(th->th_urp); 392 393 /* 394 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. 395 */ 396 drop_hdrlen = off0 + off; 397 398 /* 399 * Locate pcb for segment. 400 */ 401 INP_INFO_WLOCK(&tcbinfo); 402findpcb: 403 INP_INFO_WLOCK_ASSERT(&tcbinfo); 404#ifdef IPFIREWALL_FORWARD 405 /* 406 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 407 */ 408 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 409 410 if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ 411 struct sockaddr_in *next_hop; 412 413 next_hop = (struct sockaddr_in *)(fwd_tag+1); 414 /* 415 * Transparently forwarded. Pretend to be the destination. 416 * already got one like this? 417 */ 418 inp = in_pcblookup_hash(&tcbinfo, 419 ip->ip_src, th->th_sport, 420 ip->ip_dst, th->th_dport, 421 0, m->m_pkthdr.rcvif); 422 if (!inp) { 423 /* It's new. Try to find the ambushing socket. 
*/ 424 inp = in_pcblookup_hash(&tcbinfo, 425 ip->ip_src, th->th_sport, 426 next_hop->sin_addr, 427 next_hop->sin_port ? 428 ntohs(next_hop->sin_port) : 429 th->th_dport, 430 INPLOOKUP_WILDCARD, 431 m->m_pkthdr.rcvif); 432 } 433 /* Remove the tag from the packet. We don't need it anymore. */ 434 m_tag_delete(m, fwd_tag); 435 } else 436#endif /* IPFIREWALL_FORWARD */ 437 { 438 if (isipv6) { 439#ifdef INET6 440 inp = in6_pcblookup_hash(&tcbinfo, 441 &ip6->ip6_src, th->th_sport, 442 &ip6->ip6_dst, th->th_dport, 443 INPLOOKUP_WILDCARD, 444 m->m_pkthdr.rcvif); 445#endif 446 } else 447 inp = in_pcblookup_hash(&tcbinfo, 448 ip->ip_src, th->th_sport, 449 ip->ip_dst, th->th_dport, 450 INPLOOKUP_WILDCARD, 451 m->m_pkthdr.rcvif); 452 } 453 454#if defined(IPSEC) || defined(FAST_IPSEC) 455#ifdef INET6 456 if (isipv6 && inp != NULL && ipsec6_in_reject(m, inp)) { 457#ifdef IPSEC 458 ipsec6stat.in_polvio++; 459#endif 460 goto dropunlock; 461 } else 462#endif /* INET6 */ 463 if (inp != NULL && ipsec4_in_reject(m, inp)) { 464#ifdef IPSEC 465 ipsecstat.in_polvio++; 466#endif 467 goto dropunlock; 468 } 469#endif /*IPSEC || FAST_IPSEC*/ 470 471 /* 472 * If the INPCB does not exist then all data in the incoming 473 * segment is discarded and an appropriate RST is sent back. 474 */ 475 if (inp == NULL) { 476 /* 477 * Log communication attempts to ports that are not 478 * in use. 479 */ 480 if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || 481 tcp_log_in_vain == 2) { 482 if ((s = tcp_log_addrs(NULL, th, (void *)ip, 483 (void *)ip6))) 484 log(LOG_INFO, "%s; %s: Connection attempt " 485 "to closed port\n", s, __func__); 486 } 487 /* 488 * When blackholing do not respond with a RST but 489 * completely ignore the segment and drop it. 490 */ 491 if ((blackhole == 1 && (thflags & TH_SYN)) || 492 blackhole == 2) 493 goto dropunlock; 494 495 rstreason = BANDLIM_RST_CLOSEDPORT; 496 goto dropwithreset; 497 } 498 INP_LOCK(inp); 499 500 /* 501 * Check the minimum TTL for socket. 
502 */ 503 if (inp->inp_ip_minttl != 0) { 504#ifdef INET6 505 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) 506 goto dropunlock; 507 else 508#endif 509 if (inp->inp_ip_minttl > ip->ip_ttl) 510 goto dropunlock; 511 } 512 513 /* 514 * A previous connection in TIMEWAIT state is supposed to catch 515 * stray or duplicate segments arriving late. If this segment 516 * was a legitimate new connection attempt the old INPCB gets 517 * removed and we can try again to find a listening socket. 518 */ 519 if (inp->inp_vflag & INP_TIMEWAIT) { 520 if (thflags & TH_SYN) 521 tcp_dooptions(&to, optp, optlen, TO_SYN); 522 /* 523 * NB: tcp_twcheck unlocks the INP and frees the mbuf. 524 */ 525 if (tcp_twcheck(inp, &to, th, m, tlen)) 526 goto findpcb; 527 INP_INFO_WUNLOCK(&tcbinfo); 528 return; 529 } 530 /* 531 * The TCPCB may no longer exist if the connection is winding 532 * down or it is in the CLOSED state. Either way we drop the 533 * segment and send an appropriate response. 534 */ 535 tp = intotcpcb(inp); 536 if (tp == NULL || tp->t_state == TCPS_CLOSED) { 537 rstreason = BANDLIM_RST_CLOSEDPORT; 538 goto dropwithreset; 539 } 540 541#ifdef MAC 542 INP_LOCK_ASSERT(inp); 543 if (mac_check_inpcb_deliver(inp, m)) 544 goto dropunlock; 545#endif 546 so = inp->inp_socket; 547 KASSERT(so != NULL, ("%s: so == NULL", __func__)); 548#ifdef TCPDEBUG 549 if (so->so_options & SO_DEBUG) { 550 ostate = tp->t_state; 551 if (isipv6) { 552#ifdef INET6 553 bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); 554#endif 555 } else 556 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); 557 tcp_savetcp = *th; 558 } 559#endif 560 /* 561 * When the socket is accepting connections (the INPCB is in LISTEN 562 * state) we look into the SYN cache if this is a new connection 563 * attempt or the completion of a previous one. 
564 */ 565 if (so->so_options & SO_ACCEPTCONN) { 566 struct in_conninfo inc; 567 568 KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " 569 "tp not listening", __func__)); 570 571 bzero(&inc, sizeof(inc)); 572 inc.inc_isipv6 = isipv6; 573#ifdef INET6 574 if (isipv6) { 575 inc.inc6_faddr = ip6->ip6_src; 576 inc.inc6_laddr = ip6->ip6_dst; 577 } else 578#endif 579 { 580 inc.inc_faddr = ip->ip_src; 581 inc.inc_laddr = ip->ip_dst; 582 } 583 inc.inc_fport = th->th_sport; 584 inc.inc_lport = th->th_dport; 585 586 /* 587 * Check for an existing connection attempt in syncache if 588 * the flag is only ACK. A successful lookup creates a new 589 * socket appended to the listen queue in SYN_RECEIVED state. 590 */ 591 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { 592 /* 593 * Parse the TCP options here because 594 * syncookies need access to the reflected 595 * timestamp. 596 */ 597 tcp_dooptions(&to, optp, optlen, 0); 598 /* 599 * NB: syncache_expand() doesn't unlock 600 * inp and tcpinfo locks. 601 */ 602 if (!syncache_expand(&inc, &to, th, &so, m)) { 603 /* 604 * No syncache entry or ACK was not 605 * for our SYN/ACK. Send a RST. 606 * NB: syncache did its own logging 607 * of the failure cause. 608 */ 609 rstreason = BANDLIM_RST_OPENPORT; 610 goto dropwithreset; 611 } 612 if (so == NULL) { 613 /* 614 * We completed the 3-way handshake 615 * but could not allocate a socket 616 * either due to memory shortage, 617 * listen queue length limits or 618 * global socket limits. Send RST 619 * or wait and have the remote end 620 * retransmit the ACK for another 621 * try. 622 */ 623 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 624 log(LOG_DEBUG, "%s; %s: Listen socket: " 625 "Socket allocation failed due to " 626 "limits or memory shortage, %s\n", 627 s, __func__, (tcp_sc_rst_sock_fail ? 
628 "sending RST" : "try again")); 629 if (tcp_sc_rst_sock_fail) { 630 rstreason = BANDLIM_UNLIMITED; 631 goto dropwithreset; 632 } else 633 goto dropunlock; 634 } 635 /* 636 * Socket is created in state SYN_RECEIVED. 637 * Unlock the listen socket, lock the newly 638 * created socket and update the tp variable. 639 */ 640 INP_UNLOCK(inp); /* listen socket */ 641 inp = sotoinpcb(so); 642 INP_LOCK(inp); /* new connection */ 643 tp = intotcpcb(inp); 644 KASSERT(tp->t_state == TCPS_SYN_RECEIVED, 645 ("%s: ", __func__)); 646 /* 647 * Process the segment and the data it 648 * contains. tcp_do_segment() consumes 649 * the mbuf chain and unlocks the inpcb. 650 */ 651 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); 652 INP_INFO_UNLOCK_ASSERT(&tcbinfo); 653 return; 654 } 655 /* 656 * Segment flag validation for new connection attempts: 657 * 658 * Our (SYN|ACK) response was rejected. 659 * Check with syncache and remove entry to prevent 660 * retransmits. 661 */ 662 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { 663 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 664 log(LOG_DEBUG, "%s; %s: Listen socket: " 665 "Our SYN|ACK was rejected, connection " 666 "attempt aborted by remote endpoint\n", 667 s, __func__); 668 syncache_chkrst(&inc, th); 669 goto dropunlock; 670 } 671 /* 672 * Spurious RST. Ignore. 673 */ 674 if (thflags & TH_RST) { 675 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 676 log(LOG_DEBUG, "%s; %s: Listen socket: " 677 "Spurious RST, segment rejected\n", 678 s, __func__); 679 goto dropunlock; 680 } 681 /* 682 * We can't do anything without SYN. 683 */ 684 if ((thflags & TH_SYN) == 0) { 685 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 686 log(LOG_DEBUG, "%s; %s: Listen socket: " 687 "SYN is missing, segment rejected\n", 688 s, __func__); 689 tcpstat.tcps_badsyn++; 690 goto dropunlock; 691 } 692 /* 693 * (SYN|ACK) is bogus on a listen socket. 
694 */ 695 if (thflags & TH_ACK) { 696 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 697 log(LOG_DEBUG, "%s; %s: Listen socket: " 698 "SYN|ACK invalid, segment rejected\n", 699 s, __func__); 700 syncache_badack(&inc); /* XXX: Not needed! */ 701 tcpstat.tcps_badsyn++; 702 rstreason = BANDLIM_RST_OPENPORT; 703 goto dropwithreset; 704 } 705 /* 706 * If the drop_synfin option is enabled, drop all 707 * segments with both the SYN and FIN bits set. 708 * This prevents e.g. nmap from identifying the 709 * TCP/IP stack. 710 * XXX: Poor reasoning. nmap has other methods 711 * and is constantly refining its stack detection 712 * strategies. 713 * XXX: This is a violation of the TCP specification 714 * and was used by RFC1644. 715 */ 716 if ((thflags & TH_FIN) && drop_synfin) { 717 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 718 log(LOG_DEBUG, "%s; %s: Listen socket: " 719 "SYN|FIN segment rejected (based on " 720 "sysctl setting)\n", s, __func__); 721 tcpstat.tcps_badsyn++; 722 goto dropunlock; 723 } 724 /* 725 * Segment's flags are (SYN) or (SYN|FIN). 726 * 727 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored 728 * as they do not affect the state of the TCP FSM. 729 * The data pointed to by TH_URG and th_urp is ignored. 730 */ 731 KASSERT((thflags & (TH_RST|TH_ACK)) == 0, 732 ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); 733 KASSERT(thflags & (TH_SYN), 734 ("%s: Listen socket: TH_SYN not set", __func__)); 735#ifdef INET6 736 /* 737 * If deprecated address is forbidden, 738 * we do not accept SYN to deprecated interface 739 * address to prevent any new inbound connection from 740 * getting established. 741 * When we do not accept SYN, we send a TCP RST, 742 * with deprecated source address (instead of dropping 743 * it). We compromise it as it is much better for peer 744 * to send a RST, and RST will be the final packet 745 * for the exchange. 746 * 747 * If we do not forbid deprecated addresses, we accept 748 * the SYN packet. 
RFC2462 does not suggest dropping 749 * SYN in this case. 750 * If we decipher RFC2462 5.5.4, it says like this: 751 * 1. use of deprecated addr with existing 752 * communication is okay - "SHOULD continue to be 753 * used" 754 * 2. use of it with new communication: 755 * (2a) "SHOULD NOT be used if alternate address 756 * with sufficient scope is available" 757 * (2b) nothing mentioned otherwise. 758 * Here we fall into (2b) case as we have no choice in 759 * our source address selection - we must obey the peer. 760 * 761 * The wording in RFC2462 is confusing, and there are 762 * multiple description text for deprecated address 763 * handling - worse, they are not exactly the same. 764 * I believe 5.5.4 is the best one, so we follow 5.5.4. 765 */ 766 if (isipv6 && !ip6_use_deprecated) { 767 struct in6_ifaddr *ia6; 768 769 if ((ia6 = ip6_getdstifaddr(m)) && 770 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 771 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 772 log(LOG_DEBUG, "%s; %s: Listen socket: " 773 "Connection attempt to deprecated " 774 "IPv6 address rejected\n", 775 s, __func__); 776 rstreason = BANDLIM_RST_OPENPORT; 777 goto dropwithreset; 778 } 779 } 780#endif 781 /* 782 * Basic sanity checks on incoming SYN requests: 783 * Don't respond if the destination is a link layer 784 * broadcast according to RFC1122 4.2.3.10, p. 104. 785 * If it is from this socket it must be forged. 786 * Don't respond if the source or destination is a 787 * global or subnet broad- or multicast address. 788 * Note that it is quite possible to receive unicast 789 * link-layer packets with a broadcast IP address. Use 790 * in_broadcast() to find them. 
791 */ 792 if (m->m_flags & (M_BCAST|M_MCAST)) { 793 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 794 log(LOG_DEBUG, "%s; %s: Listen socket: " 795 "Connection attempt from broad- or multicast " 796 "link layer address rejected\n", s, __func__); 797 goto dropunlock; 798 } 799 if (isipv6) { 800#ifdef INET6 801 if (th->th_dport == th->th_sport && 802 IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { 803 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 804 log(LOG_DEBUG, "%s; %s: Listen socket: " 805 "Connection attempt to/from self " 806 "rejected\n", s, __func__); 807 goto dropunlock; 808 } 809 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 810 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { 811 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 812 log(LOG_DEBUG, "%s; %s: Listen socket: " 813 "Connection attempt from/to multicast " 814 "address rejected\n", s, __func__); 815 goto dropunlock; 816 } 817#endif 818 } else { 819 if (th->th_dport == th->th_sport && 820 ip->ip_dst.s_addr == ip->ip_src.s_addr) { 821 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 822 log(LOG_DEBUG, "%s; %s: Listen socket: " 823 "Connection attempt from/to self " 824 "rejected\n", s, __func__); 825 goto dropunlock; 826 } 827 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 828 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 829 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 830 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { 831 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 832 log(LOG_DEBUG, "%s; %s: Listen socket: " 833 "Connection attempt from/to broad- " 834 "or multicast address rejected\n", 835 s, __func__); 836 goto dropunlock; 837 } 838 } 839 /* 840 * SYN appears to be valid. Create compressed TCP state 841 * for syncache. 
842 */ 843#ifdef TCPDEBUG 844 if (so->so_options & SO_DEBUG) 845 tcp_trace(TA_INPUT, ostate, tp, 846 (void *)tcp_saveipgen, &tcp_savetcp, 0); 847#endif 848 tcp_dooptions(&to, optp, optlen, TO_SYN); 849 syncache_add(&inc, &to, th, inp, &so, m); 850 /* 851 * Entry added to syncache and mbuf consumed. 852 * Everything already unlocked by syncache_add(). 853 */ 854 INP_INFO_UNLOCK_ASSERT(&tcbinfo); 855 return; 856 } 857 858 /* 859 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later 860 * state. tcp_do_segment() always consumes the mbuf chain, unlocks 861 * the inpcb, and unlocks pcbinfo. 862 */ 863 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); 864 INP_INFO_UNLOCK_ASSERT(&tcbinfo); 865 return; 866 867dropwithreset: 868 INP_INFO_WLOCK_ASSERT(&tcbinfo); 869 tcp_dropwithreset(m, th, tp, tlen, rstreason); 870 m = NULL; /* mbuf chain got consumed. */ 871dropunlock: 872 INP_INFO_WLOCK_ASSERT(&tcbinfo); 873 if (inp != NULL) 874 INP_UNLOCK(inp); 875 INP_INFO_WUNLOCK(&tcbinfo); 876drop: 877 INP_INFO_UNLOCK_ASSERT(&tcbinfo); 878 if (s != NULL) 879 free(s, M_TCPLOG); 880 if (m != NULL) 881 m_freem(m); 882 return; 883} 884 885static void 886tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 887 struct tcpcb *tp, int drop_hdrlen, int tlen) 888{ 889 int thflags, acked, ourfinisacked, needoutput = 0; 890 int headlocked = 1; 891 int rstreason, todrop, win; 892 u_long tiwin; 893 struct tcpopt to; 894 895#ifdef TCPDEBUG 896 /* 897 * The size of tcp_saveipgen must be the size of the max ip header, 898 * now IPv6. 899 */ 900 u_char tcp_saveipgen[IP6_HDR_LEN]; 901 struct tcphdr tcp_savetcp; 902 short ostate = 0; 903#endif 904 thflags = th->th_flags; 905 906 INP_INFO_WLOCK_ASSERT(&tcbinfo); 907 INP_LOCK_ASSERT(tp->t_inpcb); 908 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 909 __func__)); 910 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 911 __func__)); 912 913 /* 914 * Segment received on connection. 
915 * Reset idle time and keep-alive timer. 916 */ 917 tp->t_rcvtime = ticks; 918 if (TCPS_HAVEESTABLISHED(tp->t_state)) 919 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 920 921 /* 922 * Unscale the window into a 32-bit value. 923 * This value is bogus for the TCPS_SYN_SENT state 924 * and is overwritten later. 925 */ 926 tiwin = th->th_win << tp->snd_scale; 927 928 /* 929 * Parse options on any incoming segment. 930 */ 931 tcp_dooptions(&to, (u_char *)(th + 1), 932 (th->th_off << 2) - sizeof(struct tcphdr), 933 (thflags & TH_SYN) ? TO_SYN : 0); 934 935 /* 936 * If echoed timestamp is later than the current time, 937 * fall back to non RFC1323 RTT calculation. Normalize 938 * timestamp if syncookies were used when this connection 939 * was established. 940 */ 941 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 942 to.to_tsecr -= tp->ts_offset; 943 if (TSTMP_GT(to.to_tsecr, ticks)) 944 to.to_tsecr = 0; 945 } 946 947 /* 948 * Process options only when we get SYN/ACK back. The SYN case 949 * for incoming connections is handled in tcp_syncache. 950 * XXX this is traditional behavior, may need to be cleaned up. 951 */ 952 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 953 if ((to.to_flags & TOF_SCALE) && 954 (tp->t_flags & TF_REQ_SCALE)) { 955 tp->t_flags |= TF_RCVD_SCALE; 956 tp->snd_scale = to.to_wscale; 957 tp->snd_wnd = th->th_win << tp->snd_scale; 958 tiwin = tp->snd_wnd; 959 } 960 if (to.to_flags & TOF_TS) { 961 tp->t_flags |= TF_RCVD_TSTMP; 962 tp->ts_recent = to.to_tsval; 963 tp->ts_recent_age = ticks; 964 } 965 /* Initial send window, already scaled. */ 966 tp->snd_wnd = th->th_win; 967 if (to.to_flags & TOF_MSS) 968 tcp_mss(tp, to.to_mss); 969 if ((tp->t_flags & TF_SACK_PERMIT) && 970 (to.to_flags & TOF_SACKPERM) == 0) 971 tp->t_flags &= ~TF_SACK_PERMIT; 972 } 973 974 /* 975 * Header prediction: check for the two common cases 976 * of a uni-directional data xfer. 
If the packet has 977 * no control flags, is in-sequence, the window didn't 978 * change and we're not retransmitting, it's a 979 * candidate. If the length is zero and the ack moved 980 * forward, we're the sender side of the xfer. Just 981 * free the data acked & wake any higher level process 982 * that was blocked waiting for space. If the length 983 * is non-zero and the ack didn't move, we're the 984 * receiver side. If we're getting packets in-order 985 * (the reassembly queue is empty), add the data to 986 * the socket buffer and note that we need a delayed ack. 987 * Make sure that the hidden state-flags are also off. 988 * Since we check for TCPS_ESTABLISHED first, it can only 989 * be TH_NEEDSYN. 990 */ 991 if (tp->t_state == TCPS_ESTABLISHED && 992 th->th_seq == tp->rcv_nxt && 993 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 994 tp->snd_nxt == tp->snd_max && 995 tiwin && tiwin == tp->snd_wnd && 996 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 997 LIST_EMPTY(&tp->t_segq) && 998 ((to.to_flags & TOF_TS) == 0 || 999 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { 1000 1001 /* 1002 * If last ACK falls within this segment's sequence numbers, 1003 * record the timestamp. 1004 * NOTE that the test is modified according to the latest 1005 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 
1006 */ 1007 if ((to.to_flags & TOF_TS) != 0 && 1008 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1009 tp->ts_recent_age = ticks; 1010 tp->ts_recent = to.to_tsval; 1011 } 1012 1013 if (tlen == 0) { 1014 if (SEQ_GT(th->th_ack, tp->snd_una) && 1015 SEQ_LEQ(th->th_ack, tp->snd_max) && 1016 tp->snd_cwnd >= tp->snd_wnd && 1017 ((!tcp_do_newreno && 1018 !(tp->t_flags & TF_SACK_PERMIT) && 1019 tp->t_dupacks < tcprexmtthresh) || 1020 ((tcp_do_newreno || 1021 (tp->t_flags & TF_SACK_PERMIT)) && 1022 !IN_FASTRECOVERY(tp) && 1023 (to.to_flags & TOF_SACK) == 0 && 1024 TAILQ_EMPTY(&tp->snd_holes)))) { 1025 KASSERT(headlocked, 1026 ("%s: headlocked", __func__)); 1027 INP_INFO_WUNLOCK(&tcbinfo); 1028 headlocked = 0; 1029 /* 1030 * this is a pure ack for outstanding data. 1031 */ 1032 ++tcpstat.tcps_predack; 1033 /* 1034 * "bad retransmit" recovery 1035 */ 1036 if (tp->t_rxtshift == 1 && 1037 ticks < tp->t_badrxtwin) { 1038 ++tcpstat.tcps_sndrexmitbad; 1039 tp->snd_cwnd = tp->snd_cwnd_prev; 1040 tp->snd_ssthresh = 1041 tp->snd_ssthresh_prev; 1042 tp->snd_recover = tp->snd_recover_prev; 1043 if (tp->t_flags & TF_WASFRECOVERY) 1044 ENTER_FASTRECOVERY(tp); 1045 tp->snd_nxt = tp->snd_max; 1046 tp->t_badrxtwin = 0; 1047 } 1048 1049 /* 1050 * Recalculate the transmit timer / rtt. 1051 * 1052 * Some boxes send broken timestamp replies 1053 * during the SYN+ACK phase, ignore 1054 * timestamps of 0 or we could calculate a 1055 * huge RTT and blow up the retransmit timer. 
1056 */ 1057 if ((to.to_flags & TOF_TS) != 0 && 1058 to.to_tsecr) { 1059 if (!tp->t_rttlow || 1060 tp->t_rttlow > ticks - to.to_tsecr) 1061 tp->t_rttlow = ticks - to.to_tsecr; 1062 tcp_xmit_timer(tp, 1063 ticks - to.to_tsecr + 1); 1064 } else if (tp->t_rtttime && 1065 SEQ_GT(th->th_ack, tp->t_rtseq)) { 1066 if (!tp->t_rttlow || 1067 tp->t_rttlow > ticks - tp->t_rtttime) 1068 tp->t_rttlow = ticks - tp->t_rtttime; 1069 tcp_xmit_timer(tp, 1070 ticks - tp->t_rtttime); 1071 } 1072 tcp_xmit_bandwidth_limit(tp, th->th_ack); 1073 acked = th->th_ack - tp->snd_una; 1074 tcpstat.tcps_rcvackpack++; 1075 tcpstat.tcps_rcvackbyte += acked; 1076 sbdrop(&so->so_snd, acked); 1077 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 1078 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1079 tp->snd_recover = th->th_ack - 1; 1080 tp->snd_una = th->th_ack; 1081 /* 1082 * pull snd_wl2 up to prevent seq wrap relative 1083 * to th_ack. 1084 */ 1085 tp->snd_wl2 = th->th_ack; 1086 tp->t_dupacks = 0; 1087 m_freem(m); 1088 ND6_HINT(tp); /* some progress has been done */ 1089 1090 /* 1091 * If all outstanding data are acked, stop 1092 * retransmit timer, otherwise restart timer 1093 * using current (possibly backed-off) value. 1094 * If process is waiting for space, 1095 * wakeup/selwakeup/signal. If data 1096 * are ready to send, let tcp_output 1097 * decide between more output or persist. 
1098 1099#ifdef TCPDEBUG 1100 if (so->so_options & SO_DEBUG) 1101 tcp_trace(TA_INPUT, ostate, tp, 1102 (void *)tcp_saveipgen, 1103 &tcp_savetcp, 0); 1104#endif 1105 */ 1106 if (tp->snd_una == tp->snd_max) 1107 tcp_timer_activate(tp, TT_REXMT, 0); 1108 else if (!tcp_timer_active(tp, TT_PERSIST)) 1109 tcp_timer_activate(tp, TT_REXMT, 1110 tp->t_rxtcur); 1111 1112 sowwakeup(so); 1113 if (so->so_snd.sb_cc) 1114 (void) tcp_output(tp); 1115 goto check_delack; 1116 } 1117 } else if (th->th_ack == tp->snd_una && 1118 tlen <= sbspace(&so->so_rcv)) { 1119 int newsize = 0; /* automatic sockbuf scaling */ 1120 1121 KASSERT(headlocked, ("%s: headlocked", __func__)); 1122 INP_INFO_WUNLOCK(&tcbinfo); 1123 headlocked = 0; 1124 /* 1125 * this is a pure, in-sequence data packet 1126 * with nothing on the reassembly queue and 1127 * we have enough buffer space to take it. 1128 */ 1129 /* Clean receiver SACK report if present */ 1130 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) 1131 tcp_clean_sackreport(tp); 1132 ++tcpstat.tcps_preddat; 1133 tp->rcv_nxt += tlen; 1134 /* 1135 * Pull snd_wl1 up to prevent seq wrap relative to 1136 * th_seq. 1137 */ 1138 tp->snd_wl1 = th->th_seq; 1139 /* 1140 * Pull rcv_up up to prevent seq wrap relative to 1141 * rcv_nxt. 1142 */ 1143 tp->rcv_up = tp->rcv_nxt; 1144 tcpstat.tcps_rcvpack++; 1145 tcpstat.tcps_rcvbyte += tlen; 1146 ND6_HINT(tp); /* some progress has been done */ 1147#ifdef TCPDEBUG 1148 if (so->so_options & SO_DEBUG) 1149 tcp_trace(TA_INPUT, ostate, tp, 1150 (void *)tcp_saveipgen, &tcp_savetcp, 0); 1151#endif 1152 /* 1153 * Automatic sizing of receive socket buffer. Often the send 1154 * buffer size is not optimally adjusted to the actual network 1155 * conditions at hand (delay bandwidth product). Setting the 1156 * buffer size too small limits throughput on links with high 1157 * bandwidth and high delay (eg. trans-continental/oceanic links). 
1158 * 1159 * On the receive side the socket buffer memory is only rarely 1160 * used to any significant extent. This allows us to be much 1161 * more aggressive in scaling the receive socket buffer. For 1162 * the case that the buffer space is actually used to a large 1163 * extent and we run out of kernel memory we can simply drop 1164 * the new segments; TCP on the sender will just retransmit it 1165 * later. Setting the buffer size too big may only consume too 1166 * much kernel memory if the application doesn't read() from 1167 * the socket or packet loss or reordering makes use of the 1168 * reassembly queue. 1169 * 1170 * The criteria to step up the receive buffer one notch are: 1171 * 1. the number of bytes received during the time it takes 1172 * one timestamp to be reflected back to us (the RTT); 1173 * 2. received bytes per RTT is within seven eighth of the 1174 * current socket buffer size; 1175 * 3. receive buffer size has not hit maximal automatic size; 1176 * 1177 * This algorithm does one step per RTT at most and only if 1178 * we receive a bulk stream w/o packet losses or reorderings. 1179 * Shrinking the buffer during idle times is not necessary as 1180 * it doesn't consume any memory when idle. 1181 * 1182 * TODO: Only step up if the application is actually serving 1183 * the buffer to better manage the socket buffer resources. 1184 */ 1185 if (tcp_do_autorcvbuf && 1186 to.to_tsecr && 1187 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 1188 if (to.to_tsecr > tp->rfbuf_ts && 1189 to.to_tsecr - tp->rfbuf_ts < hz) { 1190 if (tp->rfbuf_cnt > 1191 (so->so_rcv.sb_hiwat / 8 * 7) && 1192 so->so_rcv.sb_hiwat < 1193 tcp_autorcvbuf_max) { 1194 newsize = 1195 min(so->so_rcv.sb_hiwat + 1196 tcp_autorcvbuf_inc, 1197 tcp_autorcvbuf_max); 1198 } 1199 /* Start over with next RTT. */ 1200 tp->rfbuf_ts = 0; 1201 tp->rfbuf_cnt = 0; 1202 } else 1203 tp->rfbuf_cnt += tlen; /* add up */ 1204 } 1205 1206 /* Add data to socket buffer. 
*/ 1207 SOCKBUF_LOCK(&so->so_rcv); 1208 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1209 m_freem(m); 1210 } else { 1211 /* 1212 * Set new socket buffer size. 1213 * Give up when limit is reached. 1214 */ 1215 if (newsize) 1216 if (!sbreserve_locked(&so->so_rcv, 1217 newsize, so, curthread)) 1218 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1219 m_adj(m, drop_hdrlen); /* delayed header drop */ 1220 sbappendstream_locked(&so->so_rcv, m); 1221 } 1222 sorwakeup_locked(so); 1223 if (DELAY_ACK(tp)) { 1224 tp->t_flags |= TF_DELACK; 1225 } else { 1226 tp->t_flags |= TF_ACKNOW; 1227 tcp_output(tp); 1228 } 1229 goto check_delack; 1230 } 1231 } 1232 1233 /* 1234 * Calculate amount of space in receive window, 1235 * and then do TCP input processing. 1236 * Receive window is amount of space in rcv queue, 1237 * but not less than advertised window. 1238 */ 1239 win = sbspace(&so->so_rcv); 1240 if (win < 0) 1241 win = 0; 1242 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1243 1244 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 1245 tp->rfbuf_ts = 0; 1246 tp->rfbuf_cnt = 0; 1247 1248 switch (tp->t_state) { 1249 1250 /* 1251 * If the state is SYN_RECEIVED: 1252 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 1253 */ 1254 case TCPS_SYN_RECEIVED: 1255 if ((thflags & TH_ACK) && 1256 (SEQ_LEQ(th->th_ack, tp->snd_una) || 1257 SEQ_GT(th->th_ack, tp->snd_max))) { 1258 rstreason = BANDLIM_RST_OPENPORT; 1259 goto dropwithreset; 1260 } 1261 break; 1262 1263 /* 1264 * If the state is SYN_SENT: 1265 * if seg contains an ACK, but not for our SYN, drop the input. 1266 * if seg contains a RST, then drop the connection. 1267 * if seg does not contain SYN, then drop it. 
1268 * Otherwise this is an acceptable SYN segment 1269 * initialize tp->rcv_nxt and tp->irs 1270 * if seg contains ack then advance tp->snd_una 1271 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1272 * arrange for segment to be acked (eventually) 1273 * continue processing rest of data/controls, beginning with URG 1274 */ 1275 case TCPS_SYN_SENT: 1276 if ((thflags & TH_ACK) && 1277 (SEQ_LEQ(th->th_ack, tp->iss) || 1278 SEQ_GT(th->th_ack, tp->snd_max))) { 1279 rstreason = BANDLIM_UNLIMITED; 1280 goto dropwithreset; 1281 } 1282 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) 1283 tp = tcp_drop(tp, ECONNREFUSED); 1284 if (thflags & TH_RST) 1285 goto drop; 1286 if (!(thflags & TH_SYN)) 1287 goto drop; 1288 1289 tp->irs = th->th_seq; 1290 tcp_rcvseqinit(tp); 1291 if (thflags & TH_ACK) { 1292 tcpstat.tcps_connects++; 1293 soisconnected(so); 1294#ifdef MAC 1295 SOCK_LOCK(so); 1296 mac_set_socket_peer_from_mbuf(m, so); 1297 SOCK_UNLOCK(so); 1298#endif 1299 /* Do window scaling on this connection? */ 1300 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1301 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1302 tp->rcv_scale = tp->request_r_scale; 1303 } 1304 tp->rcv_adv += tp->rcv_wnd; 1305 tp->snd_una++; /* SYN is acked */ 1306 /* 1307 * If there's data, delay ACK; if there's also a FIN 1308 * ACKNOW will be turned on later. 1309 */ 1310 if (DELAY_ACK(tp) && tlen != 0) 1311 tcp_timer_activate(tp, TT_DELACK, 1312 tcp_delacktime); 1313 else 1314 tp->t_flags |= TF_ACKNOW; 1315 /* 1316 * Received <SYN,ACK> in SYN_SENT[*] state. 
1317 * Transitions: 1318 * SYN_SENT --> ESTABLISHED 1319 * SYN_SENT* --> FIN_WAIT_1 1320 */ 1321 tp->t_starttime = ticks; 1322 if (tp->t_flags & TF_NEEDFIN) { 1323 tp->t_state = TCPS_FIN_WAIT_1; 1324 tp->t_flags &= ~TF_NEEDFIN; 1325 thflags &= ~TH_SYN; 1326 } else { 1327 tp->t_state = TCPS_ESTABLISHED; 1328 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1329 } 1330 } else { 1331 /* 1332 * Received initial SYN in SYN-SENT[*] state => 1333 * simultaneous open. If segment contains CC option 1334 * and there is a cached CC, apply TAO test. 1335 * If it succeeds, connection is * half-synchronized. 1336 * Otherwise, do 3-way handshake: 1337 * SYN-SENT -> SYN-RECEIVED 1338 * SYN-SENT* -> SYN-RECEIVED* 1339 * If there was no CC option, clear cached CC value. 1340 */ 1341 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 1342 tcp_timer_activate(tp, TT_REXMT, 0); 1343 tp->t_state = TCPS_SYN_RECEIVED; 1344 } 1345 1346 KASSERT(headlocked, ("%s: trimthenstep6: head not locked", 1347 __func__)); 1348 INP_LOCK_ASSERT(tp->t_inpcb); 1349 1350 /* 1351 * Advance th->th_seq to correspond to first data byte. 1352 * If data, trim to stay within window, 1353 * dropping FIN if necessary. 1354 */ 1355 th->th_seq++; 1356 if (tlen > tp->rcv_wnd) { 1357 todrop = tlen - tp->rcv_wnd; 1358 m_adj(m, -todrop); 1359 tlen = tp->rcv_wnd; 1360 thflags &= ~TH_FIN; 1361 tcpstat.tcps_rcvpackafterwin++; 1362 tcpstat.tcps_rcvbyteafterwin += todrop; 1363 } 1364 tp->snd_wl1 = th->th_seq - 1; 1365 tp->rcv_up = th->th_seq; 1366 /* 1367 * Client side of transaction: already sent SYN and data. 1368 * If the remote host used T/TCP to validate the SYN, 1369 * our data will be ACK'd; if so, enter normal data segment 1370 * processing in the middle of step 5, ack processing. 1371 * Otherwise, goto step 6. 1372 */ 1373 if (thflags & TH_ACK) 1374 goto process_ACK; 1375 1376 goto step6; 1377 1378 /* 1379 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 1380 * do normal processing. 
1381 * 1382 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 1383 */ 1384 case TCPS_LAST_ACK: 1385 case TCPS_CLOSING: 1386 break; /* continue normal processing */ 1387 } 1388 1389 /* 1390 * States other than LISTEN or SYN_SENT. 1391 * First check the RST flag and sequence number since reset segments 1392 * are exempt from the timestamp and connection count tests. This 1393 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 1394 * below which allowed reset segments in half the sequence space 1395 * to fall though and be processed (which gives forged reset 1396 * segments with a random sequence number a 50 percent chance of 1397 * killing a connection). 1398 * Then check timestamp, if present. 1399 * Then check the connection count, if present. 1400 * Then check that at least some bytes of segment are within 1401 * receive window. If segment begins before rcv_nxt, 1402 * drop leading data (and SYN); if nothing left, just ack. 1403 * 1404 * 1405 * If the RST bit is set, check the sequence number to see 1406 * if this is a valid reset segment. 1407 * RFC 793 page 37: 1408 * In all states except SYN-SENT, all reset (RST) segments 1409 * are validated by checking their SEQ-fields. A reset is 1410 * valid if its sequence number is in the window. 1411 * Note: this does not take into account delayed ACKs, so 1412 * we should test against last_ack_sent instead of rcv_nxt. 1413 * The sequence number in the reset segment is normally an 1414 * echo of our outgoing acknowlegement numbers, but some hosts 1415 * send a reset with the sequence number at the rightmost edge 1416 * of our receive window, and we have to handle this case. 1417 * Note 2: Paul Watson's paper "Slipping in the Window" has shown 1418 * that brute force RST attacks are possible. To combat this, 1419 * we use a much stricter check while in the ESTABLISHED state, 1420 * only accepting RSTs where the sequence number is equal to 1421 * last_ack_sent. 
In all other states (the states in which a 1422 * RST is more likely), the more permissive check is used. 1423 * If we have multiple segments in flight, the initial reset 1424 * segment sequence numbers will be to the left of last_ack_sent, 1425 * but they will eventually catch up. 1426 * In any case, it never made sense to trim reset segments to 1427 * fit the receive window since RFC 1122 says: 1428 * 4.2.2.12 RST Segment: RFC-793 Section 3.4 1429 * 1430 * A TCP SHOULD allow a received RST segment to include data. 1431 * 1432 * DISCUSSION 1433 * It has been suggested that a RST segment could contain 1434 * ASCII text that encoded and explained the cause of the 1435 * RST. No standard has yet been established for such 1436 * data. 1437 * 1438 * If the reset segment passes the sequence number test examine 1439 * the state: 1440 * SYN_RECEIVED STATE: 1441 * If passive open, return to LISTEN state. 1442 * If active open, inform user that connection was refused. 1443 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: 1444 * Inform user that connection was reset, and close tcb. 1445 * CLOSING, LAST_ACK STATES: 1446 * Close the tcb. 1447 * TIME_WAIT STATE: 1448 * Drop the segment - see Stevens, vol. 2, p. 964 and 1449 * RFC 1337.
1450 */ 1451 if (thflags & TH_RST) { 1452 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 1453 SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1454 switch (tp->t_state) { 1455 1456 case TCPS_SYN_RECEIVED: 1457 so->so_error = ECONNREFUSED; 1458 goto close; 1459 1460 case TCPS_ESTABLISHED: 1461 if (tcp_insecure_rst == 0 && 1462 !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && 1463 SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && 1464 !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 1465 SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { 1466 tcpstat.tcps_badrst++; 1467 goto drop; 1468 } 1469 case TCPS_FIN_WAIT_1: 1470 case TCPS_FIN_WAIT_2: 1471 case TCPS_CLOSE_WAIT: 1472 so->so_error = ECONNRESET; 1473 close: 1474 tp->t_state = TCPS_CLOSED; 1475 tcpstat.tcps_drops++; 1476 KASSERT(headlocked, ("%s: trimthenstep6: " 1477 "tcp_close: head not locked", __func__)); 1478 tp = tcp_close(tp); 1479 break; 1480 1481 case TCPS_CLOSING: 1482 case TCPS_LAST_ACK: 1483 KASSERT(headlocked, ("%s: trimthenstep6: " 1484 "tcp_close.2: head not locked", __func__)); 1485 tp = tcp_close(tp); 1486 break; 1487 } 1488 } 1489 goto drop; 1490 } 1491 1492 /* 1493 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1494 * and it's less than ts_recent, drop it. 1495 */ 1496 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 1497 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 1498 1499 /* Check to see if ts_recent is over 24 days old. */ 1500 if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1501 /* 1502 * Invalidate ts_recent. If this segment updates 1503 * ts_recent, the age will be reset later and ts_recent 1504 * will get a valid value. If it does not, setting 1505 * ts_recent to zero will at least satisfy the 1506 * requirement that zero be placed in the timestamp 1507 * echo reply when ts_recent isn't valid. The 1508 * age isn't reset until we get a valid ts_recent 1509 * because we don't want out-of-order segments to be 1510 * dropped when ts_recent is old. 
1511 */ 1512 tp->ts_recent = 0; 1513 } else { 1514 tcpstat.tcps_rcvduppack++; 1515 tcpstat.tcps_rcvdupbyte += tlen; 1516 tcpstat.tcps_pawsdrop++; 1517 if (tlen) 1518 goto dropafterack; 1519 goto drop; 1520 } 1521 } 1522 1523 /* 1524 * In the SYN-RECEIVED state, validate that the packet belongs to 1525 * this connection before trimming the data to fit the receive 1526 * window. Check the sequence number versus IRS since we know 1527 * the sequence numbers haven't wrapped. This is a partial fix 1528 * for the "LAND" DoS attack. 1529 */ 1530 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 1531 rstreason = BANDLIM_RST_OPENPORT; 1532 goto dropwithreset; 1533 } 1534 1535 todrop = tp->rcv_nxt - th->th_seq; 1536 if (todrop > 0) { 1537 if (thflags & TH_SYN) { 1538 thflags &= ~TH_SYN; 1539 th->th_seq++; 1540 if (th->th_urp > 1) 1541 th->th_urp--; 1542 else 1543 thflags &= ~TH_URG; 1544 todrop--; 1545 } 1546 /* 1547 * Following if statement from Stevens, vol. 2, p. 960. 1548 */ 1549 if (todrop > tlen 1550 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1551 /* 1552 * Any valid FIN must be to the left of the window. 1553 * At this point the FIN must be a duplicate or out 1554 * of sequence; drop it. 1555 */ 1556 thflags &= ~TH_FIN; 1557 1558 /* 1559 * Send an ACK to resynchronize and drop any data. 1560 * But keep on processing for RST or ACK. 1561 */ 1562 tp->t_flags |= TF_ACKNOW; 1563 todrop = tlen; 1564 tcpstat.tcps_rcvduppack++; 1565 tcpstat.tcps_rcvdupbyte += todrop; 1566 } else { 1567 tcpstat.tcps_rcvpartduppack++; 1568 tcpstat.tcps_rcvpartdupbyte += todrop; 1569 } 1570 drop_hdrlen += todrop; /* drop from the top afterwards */ 1571 th->th_seq += todrop; 1572 tlen -= todrop; 1573 if (th->th_urp > todrop) 1574 th->th_urp -= todrop; 1575 else { 1576 thflags &= ~TH_URG; 1577 th->th_urp = 0; 1578 } 1579 } 1580 1581 /* 1582 * If new data are received on a connection after the 1583 * user processes are gone, then RST the other end. 
1584 */ 1585 if ((so->so_state & SS_NOFDREF) && 1586 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1587 KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head " 1588 "not locked", __func__)); 1589 tp = tcp_close(tp); 1590 tcpstat.tcps_rcvafterclose++; 1591 rstreason = BANDLIM_UNLIMITED; 1592 goto dropwithreset; 1593 } 1594 1595 /* 1596 * If segment ends after window, drop trailing data 1597 * (and PUSH and FIN); if nothing left, just ACK. 1598 */ 1599 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 1600 if (todrop > 0) { 1601 tcpstat.tcps_rcvpackafterwin++; 1602 if (todrop >= tlen) { 1603 tcpstat.tcps_rcvbyteafterwin += tlen; 1604 /* 1605 * If window is closed can only take segments at 1606 * window edge, and have to drop data and PUSH from 1607 * incoming segments. Continue processing, but 1608 * remember to ack. Otherwise, drop segment 1609 * and ack. 1610 */ 1611 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1612 tp->t_flags |= TF_ACKNOW; 1613 tcpstat.tcps_rcvwinprobe++; 1614 } else 1615 goto dropafterack; 1616 } else 1617 tcpstat.tcps_rcvbyteafterwin += todrop; 1618 m_adj(m, -todrop); 1619 tlen -= todrop; 1620 thflags &= ~(TH_PUSH|TH_FIN); 1621 } 1622 1623 /* 1624 * If last ACK falls within this segment's sequence numbers, 1625 * record its timestamp. 1626 * NOTE: 1627 * 1) That the test incorporates suggestions from the latest 1628 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1629 * 2) That updating only on newer timestamps interferes with 1630 * our earlier PAWS tests, so this check should be solely 1631 * predicated on the sequence space of this segment. 1632 * 3) That we modify the segment boundary check to be 1633 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 1634 * instead of RFC1323's 1635 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 1636 * This modified check allows us to overcome RFC1323's 1637 * limitations as described in Stevens TCP/IP Illustrated 1638 * Vol. 2 p.869. 
In such cases, we can still calculate the 1639 * RTT correctly when RCV.NXT == Last.ACK.Sent. 1640 */ 1641 if ((to.to_flags & TOF_TS) != 0 && 1642 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1643 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1644 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 1645 tp->ts_recent_age = ticks; 1646 tp->ts_recent = to.to_tsval; 1647 } 1648 1649 /* 1650 * If a SYN is in the window, then this is an 1651 * error and we send an RST and drop the connection. 1652 */ 1653 if (thflags & TH_SYN) { 1654 KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: " 1655 "head not locked", __func__)); 1656 tp = tcp_drop(tp, ECONNRESET); 1657 rstreason = BANDLIM_UNLIMITED; 1658 goto drop; 1659 } 1660 1661 /* 1662 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 1663 * flag is on (half-synchronized state), then queue data for 1664 * later processing; else drop segment and return. 1665 */ 1666 if ((thflags & TH_ACK) == 0) { 1667 if (tp->t_state == TCPS_SYN_RECEIVED || 1668 (tp->t_flags & TF_NEEDSYN)) 1669 goto step6; 1670 else if (tp->t_flags & TF_ACKNOW) 1671 goto dropafterack; 1672 else 1673 goto drop; 1674 } 1675 1676 /* 1677 * Ack processing. 1678 */ 1679 switch (tp->t_state) { 1680 1681 /* 1682 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1683 * ESTABLISHED state and continue processing. 1684 * The ACK was checked above. 1685 */ 1686 case TCPS_SYN_RECEIVED: 1687 1688 tcpstat.tcps_connects++; 1689 soisconnected(so); 1690 /* Do window scaling? 
*/ 1691 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1692 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1693 tp->rcv_scale = tp->request_r_scale; 1694 tp->snd_wnd = tiwin; 1695 } 1696 /* 1697 * Make transitions: 1698 * SYN-RECEIVED -> ESTABLISHED 1699 * SYN-RECEIVED* -> FIN-WAIT-1 1700 */ 1701 tp->t_starttime = ticks; 1702 if (tp->t_flags & TF_NEEDFIN) { 1703 tp->t_state = TCPS_FIN_WAIT_1; 1704 tp->t_flags &= ~TF_NEEDFIN; 1705 } else { 1706 tp->t_state = TCPS_ESTABLISHED; 1707 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1708 } 1709 /* 1710 * If segment contains data or ACK, will call tcp_reass() 1711 * later; if not, do so now to pass queued data to user. 1712 */ 1713 if (tlen == 0 && (thflags & TH_FIN) == 0) 1714 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 1715 (struct mbuf *)0); 1716 tp->snd_wl1 = th->th_seq - 1; 1717 /* FALLTHROUGH */ 1718 1719 /* 1720 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1721 * ACKs. If the ack is in the range 1722 * tp->snd_una < th->th_ack <= tp->snd_max 1723 * then advance tp->snd_una to th->th_ack and drop 1724 * data from the retransmission queue. If this ACK reflects 1725 * more up to date window information we update our window information. 
1726 */ 1727 case TCPS_ESTABLISHED: 1728 case TCPS_FIN_WAIT_1: 1729 case TCPS_FIN_WAIT_2: 1730 case TCPS_CLOSE_WAIT: 1731 case TCPS_CLOSING: 1732 case TCPS_LAST_ACK: 1733 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1734 tcpstat.tcps_rcvacktoomuch++; 1735 goto dropafterack; 1736 } 1737 if ((tp->t_flags & TF_SACK_PERMIT) && 1738 ((to.to_flags & TOF_SACK) || 1739 !TAILQ_EMPTY(&tp->snd_holes))) 1740 tcp_sack_doack(tp, &to, th->th_ack); 1741 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1742 if (tlen == 0 && tiwin == tp->snd_wnd) { 1743 tcpstat.tcps_rcvdupack++; 1744 /* 1745 * If we have outstanding data (other than 1746 * a window probe), this is a completely 1747 * duplicate ack (ie, window info didn't 1748 * change), the ack is the biggest we've 1749 * seen and we've seen exactly our rexmt 1750 * threshhold of them, assume a packet 1751 * has been dropped and retransmit it. 1752 * Kludge snd_nxt & the congestion 1753 * window so we send only this one 1754 * packet. 1755 * 1756 * We know we're losing at the current 1757 * window size so do congestion avoidance 1758 * (set ssthresh to half the current window 1759 * and pull our congestion window back to 1760 * the new ssthresh). 1761 * 1762 * Dup acks mean that packets have left the 1763 * network (they're now cached at the receiver) 1764 * so bump cwnd by the amount in the receiver 1765 * to keep a constant cwnd packets in the 1766 * network. 1767 */ 1768 if (!tcp_timer_active(tp, TT_REXMT) || 1769 th->th_ack != tp->snd_una) 1770 tp->t_dupacks = 0; 1771 else if (++tp->t_dupacks > tcprexmtthresh || 1772 ((tcp_do_newreno || 1773 (tp->t_flags & TF_SACK_PERMIT)) && 1774 IN_FASTRECOVERY(tp))) { 1775 if ((tp->t_flags & TF_SACK_PERMIT) && 1776 IN_FASTRECOVERY(tp)) { 1777 int awnd; 1778 1779 /* 1780 * Compute the amount of data in flight first. 1781 * We can inject new data into the pipe iff 1782 * we have less than 1/2 the original window's 1783 * worth of data in flight. 
1784 */ 1785 awnd = (tp->snd_nxt - tp->snd_fack) + 1786 tp->sackhint.sack_bytes_rexmit; 1787 if (awnd < tp->snd_ssthresh) { 1788 tp->snd_cwnd += tp->t_maxseg; 1789 if (tp->snd_cwnd > tp->snd_ssthresh) 1790 tp->snd_cwnd = tp->snd_ssthresh; 1791 } 1792 } else 1793 tp->snd_cwnd += tp->t_maxseg; 1794 (void) tcp_output(tp); 1795 goto drop; 1796 } else if (tp->t_dupacks == tcprexmtthresh) { 1797 tcp_seq onxt = tp->snd_nxt; 1798 u_int win; 1799 1800 /* 1801 * If we're doing sack, check to 1802 * see if we're already in sack 1803 * recovery. If we're not doing sack, 1804 * check to see if we're in newreno 1805 * recovery. 1806 */ 1807 if (tp->t_flags & TF_SACK_PERMIT) { 1808 if (IN_FASTRECOVERY(tp)) { 1809 tp->t_dupacks = 0; 1810 break; 1811 } 1812 } else if (tcp_do_newreno) { 1813 if (SEQ_LEQ(th->th_ack, 1814 tp->snd_recover)) { 1815 tp->t_dupacks = 0; 1816 break; 1817 } 1818 } 1819 win = min(tp->snd_wnd, tp->snd_cwnd) / 1820 2 / tp->t_maxseg; 1821 if (win < 2) 1822 win = 2; 1823 tp->snd_ssthresh = win * tp->t_maxseg; 1824 ENTER_FASTRECOVERY(tp); 1825 tp->snd_recover = tp->snd_max; 1826 tcp_timer_activate(tp, TT_REXMT, 0); 1827 tp->t_rtttime = 0; 1828 if (tp->t_flags & TF_SACK_PERMIT) { 1829 tcpstat.tcps_sack_recovery_episode++; 1830 tp->sack_newdata = tp->snd_nxt; 1831 tp->snd_cwnd = tp->t_maxseg; 1832 (void) tcp_output(tp); 1833 goto drop; 1834 } 1835 tp->snd_nxt = th->th_ack; 1836 tp->snd_cwnd = tp->t_maxseg; 1837 (void) tcp_output(tp); 1838 KASSERT(tp->snd_limited <= 2, 1839 ("%s: tp->snd_limited too big", 1840 __func__)); 1841 tp->snd_cwnd = tp->snd_ssthresh + 1842 tp->t_maxseg * 1843 (tp->t_dupacks - tp->snd_limited); 1844 if (SEQ_GT(onxt, tp->snd_nxt)) 1845 tp->snd_nxt = onxt; 1846 goto drop; 1847 } else if (tcp_do_rfc3042) { 1848 u_long oldcwnd = tp->snd_cwnd; 1849 tcp_seq oldsndmax = tp->snd_max; 1850 u_int sent; 1851 1852 KASSERT(tp->t_dupacks == 1 || 1853 tp->t_dupacks == 2, 1854 ("%s: dupacks not 1 or 2", 1855 __func__)); 1856 if (tp->t_dupacks == 1) 1857 
tp->snd_limited = 0; 1858 tp->snd_cwnd = 1859 (tp->snd_nxt - tp->snd_una) + 1860 (tp->t_dupacks - tp->snd_limited) * 1861 tp->t_maxseg; 1862 (void) tcp_output(tp); 1863 sent = tp->snd_max - oldsndmax; 1864 if (sent > tp->t_maxseg) { 1865 KASSERT((tp->t_dupacks == 2 && 1866 tp->snd_limited == 0) || 1867 (sent == tp->t_maxseg + 1 && 1868 tp->t_flags & TF_SENTFIN), 1869 ("%s: sent too much", 1870 __func__)); 1871 tp->snd_limited = 2; 1872 } else if (sent > 0) 1873 ++tp->snd_limited; 1874 tp->snd_cwnd = oldcwnd; 1875 goto drop; 1876 } 1877 } else 1878 tp->t_dupacks = 0; 1879 break; 1880 } 1881 1882 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 1883 ("%s: th_ack <= snd_una", __func__)); 1884 1885 /* 1886 * If the congestion window was inflated to account 1887 * for the other side's cached packets, retract it. 1888 */ 1889 if (tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { 1890 if (IN_FASTRECOVERY(tp)) { 1891 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 1892 if (tp->t_flags & TF_SACK_PERMIT) 1893 tcp_sack_partialack(tp, th); 1894 else 1895 tcp_newreno_partial_ack(tp, th); 1896 } else { 1897 /* 1898 * Out of fast recovery. 1899 * Window inflation should have left us 1900 * with approximately snd_ssthresh 1901 * outstanding data. 1902 * But in case we would be inclined to 1903 * send a burst, better to do it via 1904 * the slow start mechanism. 1905 */ 1906 if (SEQ_GT(th->th_ack + 1907 tp->snd_ssthresh, 1908 tp->snd_max)) 1909 tp->snd_cwnd = tp->snd_max - 1910 th->th_ack + 1911 tp->t_maxseg; 1912 else 1913 tp->snd_cwnd = tp->snd_ssthresh; 1914 } 1915 } 1916 } else { 1917 if (tp->t_dupacks >= tcprexmtthresh && 1918 tp->snd_cwnd > tp->snd_ssthresh) 1919 tp->snd_cwnd = tp->snd_ssthresh; 1920 } 1921 tp->t_dupacks = 0; 1922 /* 1923 * If we reach this point, ACK is not a duplicate, 1924 * i.e., it ACKs something we sent. 
1925 */ 1926 if (tp->t_flags & TF_NEEDSYN) { 1927 /* 1928 * T/TCP: Connection was half-synchronized, and our 1929 * SYN has been ACK'd (so connection is now fully 1930 * synchronized). Go to non-starred state, 1931 * increment snd_una for ACK of SYN, and check if 1932 * we can do window scaling. 1933 */ 1934 tp->t_flags &= ~TF_NEEDSYN; 1935 tp->snd_una++; 1936 /* Do window scaling? */ 1937 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1938 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1939 tp->rcv_scale = tp->request_r_scale; 1940 /* Send window already scaled. */ 1941 } 1942 } 1943 1944process_ACK: 1945 KASSERT(headlocked, ("%s: process_ACK: head not locked", 1946 __func__)); 1947 INP_LOCK_ASSERT(tp->t_inpcb); 1948 1949 acked = th->th_ack - tp->snd_una; 1950 tcpstat.tcps_rcvackpack++; 1951 tcpstat.tcps_rcvackbyte += acked; 1952 1953 /* 1954 * If we just performed our first retransmit, and the ACK 1955 * arrives within our recovery window, then it was a mistake 1956 * to do the retransmit in the first place. Recover our 1957 * original cwnd and ssthresh, and proceed to transmit where 1958 * we left off. 1959 */ 1960 if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { 1961 ++tcpstat.tcps_sndrexmitbad; 1962 tp->snd_cwnd = tp->snd_cwnd_prev; 1963 tp->snd_ssthresh = tp->snd_ssthresh_prev; 1964 tp->snd_recover = tp->snd_recover_prev; 1965 if (tp->t_flags & TF_WASFRECOVERY) 1966 ENTER_FASTRECOVERY(tp); 1967 tp->snd_nxt = tp->snd_max; 1968 tp->t_badrxtwin = 0; /* XXX probably not required */ 1969 } 1970 1971 /* 1972 * If we have a timestamp reply, update smoothed 1973 * round trip time. If no timestamp is present but 1974 * transmit timer is running and timed sequence 1975 * number was acked, update smoothed round trip time. 1976 * Since we now have an rtt measurement, cancel the 1977 * timer backoff (cf., Phil Karn's retransmit alg.). 1978 * Recompute the initial retransmit timer. 
1979 * 1980 * Some boxes send broken timestamp replies 1981 * during the SYN+ACK phase, ignore 1982 * timestamps of 0 or we could calculate a 1983 * huge RTT and blow up the retransmit timer. 1984 */ 1985 if ((to.to_flags & TOF_TS) != 0 && 1986 to.to_tsecr) { 1987 if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) 1988 tp->t_rttlow = ticks - to.to_tsecr; 1989 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); 1990 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 1991 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 1992 tp->t_rttlow = ticks - tp->t_rtttime; 1993 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 1994 } 1995 tcp_xmit_bandwidth_limit(tp, th->th_ack); 1996 1997 /* 1998 * If all outstanding data is acked, stop retransmit 1999 * timer and remember to restart (more output or persist). 2000 * If there is more data to be acked, restart retransmit 2001 * timer, using current (possibly backed-off) value. 2002 */ 2003 if (th->th_ack == tp->snd_max) { 2004 tcp_timer_activate(tp, TT_REXMT, 0); 2005 needoutput = 1; 2006 } else if (!tcp_timer_active(tp, TT_PERSIST)) 2007 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 2008 2009 /* 2010 * If no data (only SYN) was ACK'd, 2011 * skip rest of ACK processing. 2012 */ 2013 if (acked == 0) 2014 goto step6; 2015 2016 /* 2017 * When new data is acked, open the congestion window. 2018 * If the window gives us less than ssthresh packets 2019 * in flight, open exponentially (maxseg per packet). 2020 * Otherwise open linearly: maxseg per window 2021 * (maxseg^2 / cwnd per packet). 
2022 */ 2023 if ((!tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || 2024 !IN_FASTRECOVERY(tp)) { 2025 u_int cw = tp->snd_cwnd; 2026 u_int incr = tp->t_maxseg; 2027 if (cw > tp->snd_ssthresh) 2028 incr = incr * incr / cw; 2029 tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); 2030 } 2031 SOCKBUF_LOCK(&so->so_snd); 2032 if (acked > so->so_snd.sb_cc) { 2033 tp->snd_wnd -= so->so_snd.sb_cc; 2034 sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); 2035 ourfinisacked = 1; 2036 } else { 2037 sbdrop_locked(&so->so_snd, acked); 2038 tp->snd_wnd -= acked; 2039 ourfinisacked = 0; 2040 } 2041 sowwakeup_locked(so); 2042 /* detect una wraparound */ 2043 if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && 2044 !IN_FASTRECOVERY(tp) && 2045 SEQ_GT(tp->snd_una, tp->snd_recover) && 2046 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2047 tp->snd_recover = th->th_ack - 1; 2048 if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && 2049 IN_FASTRECOVERY(tp) && 2050 SEQ_GEQ(th->th_ack, tp->snd_recover)) 2051 EXIT_FASTRECOVERY(tp); 2052 tp->snd_una = th->th_ack; 2053 if (tp->t_flags & TF_SACK_PERMIT) { 2054 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 2055 tp->snd_recover = tp->snd_una; 2056 } 2057 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2058 tp->snd_nxt = tp->snd_una; 2059 2060 switch (tp->t_state) { 2061 2062 /* 2063 * In FIN_WAIT_1 STATE in addition to the processing 2064 * for the ESTABLISHED state if our FIN is now acknowledged 2065 * then enter FIN_WAIT_2. 2066 */ 2067 case TCPS_FIN_WAIT_1: 2068 if (ourfinisacked) { 2069 /* 2070 * If we can't receive any more 2071 * data, then closing user can proceed. 2072 * Starting the timer is contrary to the 2073 * specification, but if we don't get a FIN 2074 * we'll hang forever. 2075 */ 2076 /* XXXjl 2077 * we should release the tp also, and use a 2078 * compressed state. 2079 */ 2080 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2081 int timeout; 2082 2083 soisdisconnected(so); 2084 timeout = (tcp_fast_finwait2_recycle) ? 
2085 tcp_finwait2_timeout : tcp_maxidle; 2086 tcp_timer_activate(tp, TT_2MSL, timeout); 2087 } 2088 tp->t_state = TCPS_FIN_WAIT_2; 2089 } 2090 break; 2091 2092 /* 2093 * In CLOSING STATE in addition to the processing for 2094 * the ESTABLISHED state if the ACK acknowledges our FIN 2095 * then enter the TIME-WAIT state, otherwise ignore 2096 * the segment. 2097 */ 2098 case TCPS_CLOSING: 2099 if (ourfinisacked) { 2100 KASSERT(headlocked, ("%s: process_ACK: " 2101 "head not locked", __func__)); 2102 tcp_twstart(tp); 2103 INP_INFO_WUNLOCK(&tcbinfo); 2104 headlocked = 0; 2105 m_freem(m); 2106 return; 2107 } 2108 break; 2109 2110 /* 2111 * In LAST_ACK, we may still be waiting for data to drain 2112 * and/or to be acked, as well as for the ack of our FIN. 2113 * If our FIN is now acknowledged, delete the TCB, 2114 * enter the closed state and return. 2115 */ 2116 case TCPS_LAST_ACK: 2117 if (ourfinisacked) { 2118 KASSERT(headlocked, ("%s: process_ACK: " 2119 "tcp_close: head not locked", __func__)); 2120 tp = tcp_close(tp); 2121 goto drop; 2122 } 2123 break; 2124 } 2125 } 2126 2127step6: 2128 KASSERT(headlocked, ("%s: step6: head not locked", __func__)); 2129 INP_LOCK_ASSERT(tp->t_inpcb); 2130 2131 /* 2132 * Update window information. 2133 * Don't look at window if no ACK: TAC's send garbage on first SYN. 2134 */ 2135 if ((thflags & TH_ACK) && 2136 (SEQ_LT(tp->snd_wl1, th->th_seq) || 2137 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 2138 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2139 /* keep track of pure window updates */ 2140 if (tlen == 0 && 2141 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2142 tcpstat.tcps_rcvwinupd++; 2143 tp->snd_wnd = tiwin; 2144 tp->snd_wl1 = th->th_seq; 2145 tp->snd_wl2 = th->th_ack; 2146 if (tp->snd_wnd > tp->max_sndwnd) 2147 tp->max_sndwnd = tp->snd_wnd; 2148 needoutput = 1; 2149 } 2150 2151 /* 2152 * Process segments with URG. 
2153 */ 2154 if ((thflags & TH_URG) && th->th_urp && 2155 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2156 /* 2157 * This is a kludge, but if we receive and accept 2158 * random urgent pointers, we'll crash in 2159 * soreceive. It's hard to imagine someone 2160 * actually wanting to send this much urgent data. 2161 */ 2162 SOCKBUF_LOCK(&so->so_rcv); 2163 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2164 th->th_urp = 0; /* XXX */ 2165 thflags &= ~TH_URG; /* XXX */ 2166 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 2167 goto dodata; /* XXX */ 2168 } 2169 /* 2170 * If this segment advances the known urgent pointer, 2171 * then mark the data stream. This should not happen 2172 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2173 * a FIN has been received from the remote side. 2174 * In these states we ignore the URG. 2175 * 2176 * According to RFC961 (Assigned Protocols), 2177 * the urgent pointer points to the last octet 2178 * of urgent data. We continue, however, 2179 * to consider it to indicate the first octet 2180 * of data past the urgent section as the original 2181 * spec states (in one of two places). 2182 */ 2183 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2184 tp->rcv_up = th->th_seq + th->th_urp; 2185 so->so_oobmark = so->so_rcv.sb_cc + 2186 (tp->rcv_up - tp->rcv_nxt) - 1; 2187 if (so->so_oobmark == 0) 2188 so->so_rcv.sb_state |= SBS_RCVATMARK; 2189 sohasoutofband(so); 2190 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2191 } 2192 SOCKBUF_UNLOCK(&so->so_rcv); 2193 /* 2194 * Remove out of band data so doesn't get presented to user. 2195 * This can happen independent of advancing the URG pointer, 2196 * but if two URG's are pending at once, some out-of-band 2197 * data may creep in... ick. 
2198 */ 2199 if (th->th_urp <= (u_long)tlen && 2200 !(so->so_options & SO_OOBINLINE)) { 2201 /* hdr drop is delayed */ 2202 tcp_pulloutofband(so, th, m, drop_hdrlen); 2203 } 2204 } else { 2205 /* 2206 * If no out of band data is expected, 2207 * pull receive urgent pointer along 2208 * with the receive window. 2209 */ 2210 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2211 tp->rcv_up = tp->rcv_nxt; 2212 } 2213dodata: /* XXX */ 2214 KASSERT(headlocked, ("%s: dodata: head not locked", __func__)); 2215 INP_LOCK_ASSERT(tp->t_inpcb); 2216 2217 /* 2218 * Process the segment text, merging it into the TCP sequencing queue, 2219 * and arranging for acknowledgment of receipt if necessary. 2220 * This process logically involves adjusting tp->rcv_wnd as data 2221 * is presented to the user (this happens in tcp_usrreq.c, 2222 * case PRU_RCVD). If a FIN has already been received on this 2223 * connection then we just ignore the text. 2224 */ 2225 if ((tlen || (thflags & TH_FIN)) && 2226 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2227 tcp_seq save_start = th->th_seq; 2228 tcp_seq save_end = th->th_seq + tlen; 2229 m_adj(m, drop_hdrlen); /* delayed header drop */ 2230 /* 2231 * Insert segment which includes th into TCP reassembly queue 2232 * with control block tp. Set thflags to whether reassembly now 2233 * includes a segment with FIN. This handles the common case 2234 * inline (segment is the next to be received on an established 2235 * connection, and the queue is empty), avoiding linkage into 2236 * and removal from the queue and repetition of various 2237 * conversions. 2238 * Set DELACK for segments received in order, but ack 2239 * immediately when segments are out of order (so 2240 * fast retransmit can work). 
2241 */ 2242 if (th->th_seq == tp->rcv_nxt && 2243 LIST_EMPTY(&tp->t_segq) && 2244 TCPS_HAVEESTABLISHED(tp->t_state)) { 2245 if (DELAY_ACK(tp)) 2246 tp->t_flags |= TF_DELACK; 2247 else 2248 tp->t_flags |= TF_ACKNOW; 2249 tp->rcv_nxt += tlen; 2250 thflags = th->th_flags & TH_FIN; 2251 tcpstat.tcps_rcvpack++; 2252 tcpstat.tcps_rcvbyte += tlen; 2253 ND6_HINT(tp); 2254 SOCKBUF_LOCK(&so->so_rcv); 2255 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 2256 m_freem(m); 2257 else 2258 sbappendstream_locked(&so->so_rcv, m); 2259 sorwakeup_locked(so); 2260 } else { 2261 thflags = tcp_reass(tp, th, &tlen, m); 2262 tp->t_flags |= TF_ACKNOW; 2263 } 2264 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 2265 tcp_update_sack_list(tp, save_start, save_end); 2266#if 0 2267 /* 2268 * Note the amount of data that peer has sent into 2269 * our window, in order to estimate the sender's 2270 * buffer size. 2271 * XXX: Unused. 2272 */ 2273 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2274#endif 2275 } else { 2276 m_freem(m); 2277 thflags &= ~TH_FIN; 2278 } 2279 2280 /* 2281 * If FIN is received ACK the FIN and let the user know 2282 * that the connection is closing. 2283 */ 2284 if (thflags & TH_FIN) { 2285 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2286 socantrcvmore(so); 2287 /* 2288 * If connection is half-synchronized 2289 * (ie NEEDSYN flag on) then delay ACK, 2290 * so it may be piggybacked when SYN is sent. 2291 * Otherwise, since we received a FIN then no 2292 * more input can be expected, send ACK now. 2293 */ 2294 if (tp->t_flags & TF_NEEDSYN) 2295 tp->t_flags |= TF_DELACK; 2296 else 2297 tp->t_flags |= TF_ACKNOW; 2298 tp->rcv_nxt++; 2299 } 2300 switch (tp->t_state) { 2301 2302 /* 2303 * In SYN_RECEIVED and ESTABLISHED STATES 2304 * enter the CLOSE_WAIT state. 
2305 */ 2306 case TCPS_SYN_RECEIVED: 2307 tp->t_starttime = ticks; 2308 /*FALLTHROUGH*/ 2309 case TCPS_ESTABLISHED: 2310 tp->t_state = TCPS_CLOSE_WAIT; 2311 break; 2312 2313 /* 2314 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2315 * enter the CLOSING state. 2316 */ 2317 case TCPS_FIN_WAIT_1: 2318 tp->t_state = TCPS_CLOSING; 2319 break; 2320 2321 /* 2322 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2323 * starting the time-wait timer, turning off the other 2324 * standard timers. 2325 */ 2326 case TCPS_FIN_WAIT_2: 2327 KASSERT(headlocked == 1, ("%s: dodata: " 2328 "TCP_FIN_WAIT_2: head not locked", __func__)); 2329 tcp_twstart(tp); 2330 INP_INFO_WUNLOCK(&tcbinfo); 2331 return; 2332 } 2333 } 2334 INP_INFO_WUNLOCK(&tcbinfo); 2335 headlocked = 0; 2336#ifdef TCPDEBUG 2337 if (so->so_options & SO_DEBUG) 2338 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 2339 &tcp_savetcp, 0); 2340#endif 2341 2342 /* 2343 * Return any desired output. 2344 */ 2345 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2346 (void) tcp_output(tp); 2347 2348check_delack: 2349 KASSERT(headlocked == 0, ("%s: check_delack: head locked", 2350 __func__)); 2351 INP_INFO_UNLOCK_ASSERT(&tcbinfo); 2352 INP_LOCK_ASSERT(tp->t_inpcb); 2353 if (tp->t_flags & TF_DELACK) { 2354 tp->t_flags &= ~TF_DELACK; 2355 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 2356 } 2357 INP_UNLOCK(tp->t_inpcb); 2358 return; 2359 2360dropafterack: 2361 KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__)); 2362 /* 2363 * Generate an ACK dropping incoming segment if it occupies 2364 * sequence space, where the ACK reflects our state. 2365 * 2366 * We can now skip the test for the RST flag since all 2367 * paths to this code happen after packets containing 2368 * RST have been dropped. 2369 * 2370 * In the SYN-RECEIVED state, don't send an ACK unless the 2371 * segment we received passes the SYN-RECEIVED ACK test. 2372 * If it fails send a RST. 
This breaks the loop in the 2373 * "LAND" DoS attack, and also prevents an ACK storm 2374 * between two listening ports that have been sent forged 2375 * SYN segments, each with the source address of the other. 2376 */ 2377 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 2378 (SEQ_GT(tp->snd_una, th->th_ack) || 2379 SEQ_GT(th->th_ack, tp->snd_max)) ) { 2380 rstreason = BANDLIM_RST_OPENPORT; 2381 goto dropwithreset; 2382 } 2383#ifdef TCPDEBUG 2384 if (so->so_options & SO_DEBUG) 2385 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2386 &tcp_savetcp, 0); 2387#endif 2388 KASSERT(headlocked, ("%s: headlocked should be 1", __func__)); 2389 INP_INFO_WUNLOCK(&tcbinfo); 2390 tp->t_flags |= TF_ACKNOW; 2391 (void) tcp_output(tp); 2392 INP_UNLOCK(tp->t_inpcb); 2393 m_freem(m); 2394 return; 2395 2396dropwithreset: 2397 KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__)); 2398 2399 tcp_dropwithreset(m, th, tp, tlen, rstreason); 2400 2401 if (tp != NULL) 2402 INP_UNLOCK(tp->t_inpcb); 2403 if (headlocked) 2404 INP_INFO_WUNLOCK(&tcbinfo); 2405 return; 2406 2407drop: 2408 /* 2409 * Drop space held by incoming segment and return. 2410 */ 2411#ifdef TCPDEBUG 2412 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 2413 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2414 &tcp_savetcp, 0); 2415#endif 2416 if (tp != NULL) 2417 INP_UNLOCK(tp->t_inpcb); 2418 if (headlocked) 2419 INP_INFO_WUNLOCK(&tcbinfo); 2420 m_freem(m); 2421 return; 2422} 2423 2424/* 2425 * Issue RST and make ACK acceptable to originator of segment. 2426 * The mbuf must still include the original packet header. 2427 * tp may be NULL. 2428 */ 2429static void 2430tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, 2431 int tlen, int rstreason) 2432{ 2433 struct ip *ip; 2434#ifdef INET6 2435 struct ip6_hdr *ip6; 2436#endif 2437 /* Don't bother if destination was broadcast/multicast. 
*/ 2438 if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 2439 goto drop; 2440#ifdef INET6 2441 if (mtod(m, struct ip *)->ip_v == 6) { 2442 ip6 = mtod(m, struct ip6_hdr *); 2443 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 2444 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 2445 goto drop; 2446 /* IPv6 anycast check is done at tcp6_input() */ 2447 } else 2448#endif 2449 { 2450 ip = mtod(m, struct ip *); 2451 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 2452 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 2453 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 2454 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 2455 goto drop; 2456 } 2457 2458 /* Perform bandwidth limiting. */ 2459 if (badport_bandlim(rstreason) < 0) 2460 goto drop; 2461 2462 /* tcp_respond consumes the mbuf chain. */ 2463 if (th->th_flags & TH_ACK) { 2464 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, 2465 th->th_ack, TH_RST); 2466 } else { 2467 if (th->th_flags & TH_SYN) 2468 tlen++; 2469 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 2470 (tcp_seq)0, TH_RST|TH_ACK); 2471 } 2472 return; 2473drop: 2474 m_freem(m); 2475 return; 2476} 2477 2478/* 2479 * Parse TCP options and place in tcpopt. 
2480 */ 2481static void 2482tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) 2483{ 2484 int opt, optlen; 2485 2486 to->to_flags = 0; 2487 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2488 opt = cp[0]; 2489 if (opt == TCPOPT_EOL) 2490 break; 2491 if (opt == TCPOPT_NOP) 2492 optlen = 1; 2493 else { 2494 if (cnt < 2) 2495 break; 2496 optlen = cp[1]; 2497 if (optlen < 2 || optlen > cnt) 2498 break; 2499 } 2500 switch (opt) { 2501 case TCPOPT_MAXSEG: 2502 if (optlen != TCPOLEN_MAXSEG) 2503 continue; 2504 if (!(flags & TO_SYN)) 2505 continue; 2506 to->to_flags |= TOF_MSS; 2507 bcopy((char *)cp + 2, 2508 (char *)&to->to_mss, sizeof(to->to_mss)); 2509 to->to_mss = ntohs(to->to_mss); 2510 break; 2511 case TCPOPT_WINDOW: 2512 if (optlen != TCPOLEN_WINDOW) 2513 continue; 2514 if (!(flags & TO_SYN)) 2515 continue; 2516 to->to_flags |= TOF_SCALE; 2517 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); 2518 break; 2519 case TCPOPT_TIMESTAMP: 2520 if (optlen != TCPOLEN_TIMESTAMP) 2521 continue; 2522 to->to_flags |= TOF_TS; 2523 bcopy((char *)cp + 2, 2524 (char *)&to->to_tsval, sizeof(to->to_tsval)); 2525 to->to_tsval = ntohl(to->to_tsval); 2526 bcopy((char *)cp + 6, 2527 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 2528 to->to_tsecr = ntohl(to->to_tsecr); 2529 break; 2530#ifdef TCP_SIGNATURE 2531 /* 2532 * XXX In order to reply to a host which has set the 2533 * TCP_SIGNATURE option in its initial SYN, we have to 2534 * record the fact that the option was observed here 2535 * for the syncache code to perform the correct response. 
2536 */ 2537 case TCPOPT_SIGNATURE: 2538 if (optlen != TCPOLEN_SIGNATURE) 2539 continue; 2540 to->to_flags |= TOF_SIGNATURE; 2541 to->to_signature = cp + 2; 2542 break; 2543#endif 2544 case TCPOPT_SACK_PERMITTED: 2545 if (optlen != TCPOLEN_SACK_PERMITTED) 2546 continue; 2547 if (!(flags & TO_SYN)) 2548 continue; 2549 if (!tcp_do_sack) 2550 continue; 2551 to->to_flags |= TOF_SACKPERM; 2552 break; 2553 case TCPOPT_SACK: 2554 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2555 continue; 2556 if (flags & TO_SYN) 2557 continue; 2558 to->to_flags |= TOF_SACK; 2559 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; 2560 to->to_sacks = cp + 2; 2561 tcpstat.tcps_sack_rcv_blocks++; 2562 break; 2563 default: 2564 continue; 2565 } 2566 } 2567} 2568 2569/* 2570 * Pull out of band byte out of a segment so 2571 * it doesn't appear in the user's data queue. 2572 * It is still reflected in the segment length for 2573 * sequencing purposes. 2574 */ 2575static void 2576tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, 2577 int off) 2578{ 2579 int cnt = off + th->th_urp - 1; 2580 2581 while (cnt >= 0) { 2582 if (m->m_len > cnt) { 2583 char *cp = mtod(m, caddr_t) + cnt; 2584 struct tcpcb *tp = sototcpcb(so); 2585 2586 tp->t_iobc = *cp; 2587 tp->t_oobflags |= TCPOOB_HAVEDATA; 2588 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2589 m->m_len--; 2590 if (m->m_flags & M_PKTHDR) 2591 m->m_pkthdr.len--; 2592 return; 2593 } 2594 cnt -= m->m_len; 2595 m = m->m_next; 2596 if (m == NULL) 2597 break; 2598 } 2599 panic("tcp_pulloutofband"); 2600} 2601 2602/* 2603 * Collect new round-trip time estimate 2604 * and update averages and current timeout. 2605 */ 2606static void 2607tcp_xmit_timer(struct tcpcb *tp, int rtt) 2608{ 2609 int delta; 2610 2611 INP_LOCK_ASSERT(tp->t_inpcb); 2612 2613 tcpstat.tcps_rttupdated++; 2614 tp->t_rttupdated++; 2615 if (tp->t_srtt != 0) { 2616 /* 2617 * srtt is stored as fixed point with 5 bits after the 2618 * binary point (i.e., scaled by 8). 
The following magic 2619 * is equivalent to the smoothing algorithm in rfc793 with 2620 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2621 * point). Adjust rtt to origin 0. 2622 */ 2623 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 2624 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 2625 2626 if ((tp->t_srtt += delta) <= 0) 2627 tp->t_srtt = 1; 2628 2629 /* 2630 * We accumulate a smoothed rtt variance (actually, a 2631 * smoothed mean difference), then set the retransmit 2632 * timer to smoothed rtt + 4 times the smoothed variance. 2633 * rttvar is stored as fixed point with 4 bits after the 2634 * binary point (scaled by 16). The following is 2635 * equivalent to rfc793 smoothing with an alpha of .75 2636 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2637 * rfc793's wired-in beta. 2638 */ 2639 if (delta < 0) 2640 delta = -delta; 2641 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 2642 if ((tp->t_rttvar += delta) <= 0) 2643 tp->t_rttvar = 1; 2644 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 2645 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 2646 } else { 2647 /* 2648 * No rtt measurement yet - use the unsmoothed rtt. 2649 * Set the variance to half the rtt (so our first 2650 * retransmit happens at 3*rtt). 2651 */ 2652 tp->t_srtt = rtt << TCP_RTT_SHIFT; 2653 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 2654 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 2655 } 2656 tp->t_rtttime = 0; 2657 tp->t_rxtshift = 0; 2658 2659 /* 2660 * the retransmit should happen at rtt + 4 * rttvar. 2661 * Because of the way we do the smoothing, srtt and rttvar 2662 * will each average +1/2 tick of bias. When we compute 2663 * the retransmit timer, we want 1/2 tick of rounding and 2664 * 1 extra tick because of +-1/2 tick uncertainty in the 2665 * firing of the timer. The bias will give us exactly the 2666 * 1.5 tick we need. But, because the bias is 2667 * statistical, we have to test that we don't drop below 2668 * the minimum feasible timer (which is 2 ticks). 
2669 */ 2670 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 2671 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 2672 2673 /* 2674 * We received an ack for a packet that wasn't retransmitted; 2675 * it is probably safe to discard any error indications we've 2676 * received recently. This isn't quite right, but close enough 2677 * for now (a route might have failed after we sent a segment, 2678 * and the return path might not be symmetrical). 2679 */ 2680 tp->t_softerror = 0; 2681} 2682 2683/* 2684 * Determine a reasonable value for maxseg size. 2685 * If the route is known, check route for mtu. 2686 * If none, use an mss that can be handled on the outgoing 2687 * interface without forcing IP to fragment; if bigger than 2688 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2689 * to utilize large mbufs. If no route is found, route has no mtu, 2690 * or the destination isn't local, use a default, hopefully conservative 2691 * size (usually 512 or the default IP max size, but no more than the mtu 2692 * of the interface), as we can't discover anything about intervening 2693 * gateways or networks. We also initialize the congestion/slow start 2694 * window to be a single segment if the destination isn't local. 2695 * While looking at the routing entry, we also initialize other path-dependent 2696 * parameters from pre-set or cached values in the routing entry. 2697 * 2698 * Also take into account the space needed for options that we 2699 * send regularly. Make maxseg shorter by that amount to assure 2700 * that we can send maxseg amount of data even when the options 2701 * are present. Store the upper limit of the length of options plus 2702 * data in maxopd. 2703 * 2704 * 2705 * In case of T/TCP, we call this routine during implicit connection 2706 * setup as well (offer = -1), to initialize maxseg from the cached 2707 * MSS of our peer. 2708 * 2709 * NOTE that this routine is only called when we process an incoming 2710 * segment. 
Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). 2711 */ 2712void 2713tcp_mss(struct tcpcb *tp, int offer) 2714{ 2715 int rtt, mss; 2716 u_long bufsize; 2717 u_long maxmtu; 2718 struct inpcb *inp = tp->t_inpcb; 2719 struct socket *so; 2720 struct hc_metrics_lite metrics; 2721 int origoffer = offer; 2722 int mtuflags = 0; 2723#ifdef INET6 2724 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 2725 size_t min_protoh = isipv6 ? 2726 sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : 2727 sizeof (struct tcpiphdr); 2728#else 2729 const size_t min_protoh = sizeof(struct tcpiphdr); 2730#endif 2731 2732 /* initialize */ 2733#ifdef INET6 2734 if (isipv6) { 2735 maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags); 2736 tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; 2737 } else 2738#endif 2739 { 2740 maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags); 2741 tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; 2742 } 2743 so = inp->inp_socket; 2744 2745 /* 2746 * no route to sender, stay with default mss and return 2747 */ 2748 if (maxmtu == 0) 2749 return; 2750 2751 /* what have we got? */ 2752 switch (offer) { 2753 case 0: 2754 /* 2755 * Offer == 0 means that there was no MSS on the SYN 2756 * segment, in this case we use tcp_mssdflt. 2757 */ 2758 offer = 2759#ifdef INET6 2760 isipv6 ? tcp_v6mssdflt : 2761#endif 2762 tcp_mssdflt; 2763 break; 2764 2765 case -1: 2766 /* 2767 * Offer == -1 means that we didn't receive SYN yet. 2768 */ 2769 /* FALLTHROUGH */ 2770 2771 default: 2772 /* 2773 * Prevent DoS attack with too small MSS. Round up 2774 * to at least minmss. 2775 */ 2776 offer = max(offer, tcp_minmss); 2777 /* 2778 * Sanity check: make sure that maxopd will be large 2779 * enough to allow some data on segments even if the 2780 * all the option space is used (40bytes). Otherwise 2781 * funny things may happen in tcp_output. 
2782 */ 2783 offer = max(offer, 64); 2784 } 2785 2786 /* 2787 * rmx information is now retrieved from tcp_hostcache 2788 */ 2789 tcp_hc_get(&inp->inp_inc, &metrics); 2790 2791 /* 2792 * if there's a discovered mtu int tcp hostcache, use it 2793 * else, use the link mtu. 2794 */ 2795 if (metrics.rmx_mtu) 2796 mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; 2797 else { 2798#ifdef INET6 2799 if (isipv6) { 2800 mss = maxmtu - min_protoh; 2801 if (!path_mtu_discovery && 2802 !in6_localaddr(&inp->in6p_faddr)) 2803 mss = min(mss, tcp_v6mssdflt); 2804 } else 2805#endif 2806 { 2807 mss = maxmtu - min_protoh; 2808 if (!path_mtu_discovery && 2809 !in_localaddr(inp->inp_faddr)) 2810 mss = min(mss, tcp_mssdflt); 2811 } 2812 } 2813 mss = min(mss, offer); 2814 2815 /* 2816 * maxopd stores the maximum length of data AND options 2817 * in a segment; maxseg is the amount of data in a normal 2818 * segment. We need to store this value (maxopd) apart 2819 * from maxseg, because now every segment carries options 2820 * and thus we normally have somewhat less data in segments. 2821 */ 2822 tp->t_maxopd = mss; 2823 2824 /* 2825 * origoffer==-1 indicates, that no segments were received yet. 2826 * In this case we just guess. 2827 */ 2828 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2829 (origoffer == -1 || 2830 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) 2831 mss -= TCPOLEN_TSTAMP_APPA; 2832 tp->t_maxseg = mss; 2833 2834#if (MCLBYTES & (MCLBYTES - 1)) == 0 2835 if (mss > MCLBYTES) 2836 mss &= ~(MCLBYTES-1); 2837#else 2838 if (mss > MCLBYTES) 2839 mss = mss / MCLBYTES * MCLBYTES; 2840#endif 2841 tp->t_maxseg = mss; 2842 2843 /* 2844 * If there's a pipesize, change the socket buffer to that size, 2845 * don't change if sb_hiwat is different than default (then it 2846 * has been changed on purpose with setsockopt). 2847 * Make the socket buffers an integral number of mss units; 2848 * if the mss is larger than the socket buffer, decrease the mss. 
2849 */ 2850 SOCKBUF_LOCK(&so->so_snd); 2851 if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) 2852 bufsize = metrics.rmx_sendpipe; 2853 else 2854 bufsize = so->so_snd.sb_hiwat; 2855 if (bufsize < mss) 2856 mss = bufsize; 2857 else { 2858 bufsize = roundup(bufsize, mss); 2859 if (bufsize > sb_max) 2860 bufsize = sb_max; 2861 if (bufsize > so->so_snd.sb_hiwat) 2862 (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); 2863 } 2864 SOCKBUF_UNLOCK(&so->so_snd); 2865 tp->t_maxseg = mss; 2866 2867 SOCKBUF_LOCK(&so->so_rcv); 2868 if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) 2869 bufsize = metrics.rmx_recvpipe; 2870 else 2871 bufsize = so->so_rcv.sb_hiwat; 2872 if (bufsize > mss) { 2873 bufsize = roundup(bufsize, mss); 2874 if (bufsize > sb_max) 2875 bufsize = sb_max; 2876 if (bufsize > so->so_rcv.sb_hiwat) 2877 (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); 2878 } 2879 SOCKBUF_UNLOCK(&so->so_rcv); 2880 /* 2881 * While we're here, check the others too 2882 */ 2883 if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { 2884 tp->t_srtt = rtt; 2885 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; 2886 tcpstat.tcps_usedrtt++; 2887 if (metrics.rmx_rttvar) { 2888 tp->t_rttvar = metrics.rmx_rttvar; 2889 tcpstat.tcps_usedrttvar++; 2890 } else { 2891 /* default variation is +- 1 rtt */ 2892 tp->t_rttvar = 2893 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; 2894 } 2895 TCPT_RANGESET(tp->t_rxtcur, 2896 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, 2897 tp->t_rttmin, TCPTV_REXMTMAX); 2898 } 2899 if (metrics.rmx_ssthresh) { 2900 /* 2901 * There's some sort of gateway or interface 2902 * buffer limit on the path. Use this to set 2903 * the slow start threshhold, but set the 2904 * threshold to no less than 2*mss. 
2905 */ 2906 tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); 2907 tcpstat.tcps_usedssthresh++; 2908 } 2909 if (metrics.rmx_bandwidth) 2910 tp->snd_bandwidth = metrics.rmx_bandwidth; 2911 2912 /* 2913 * Set the slow-start flight size depending on whether this 2914 * is a local network or not. 2915 * 2916 * Extend this so we cache the cwnd too and retrieve it here. 2917 * Make cwnd even bigger than RFC3390 suggests but only if we 2918 * have previous experience with the remote host. Be careful 2919 * not make cwnd bigger than remote receive window or our own 2920 * send socket buffer. Maybe put some additional upper bound 2921 * on the retrieved cwnd. Should do incremental updates to 2922 * hostcache when cwnd collapses so next connection doesn't 2923 * overloads the path again. 2924 * 2925 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. 2926 * We currently check only in syncache_socket for that. 2927 */ 2928#define TCP_METRICS_CWND 2929#ifdef TCP_METRICS_CWND 2930 if (metrics.rmx_cwnd) 2931 tp->snd_cwnd = max(mss, 2932 min(metrics.rmx_cwnd / 2, 2933 min(tp->snd_wnd, so->so_snd.sb_hiwat))); 2934 else 2935#endif 2936 if (tcp_do_rfc3390) 2937 tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); 2938#ifdef INET6 2939 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || 2940 (!isipv6 && in_localaddr(inp->inp_faddr))) 2941#else 2942 else if (in_localaddr(inp->inp_faddr)) 2943#endif 2944 tp->snd_cwnd = mss * ss_fltsz_local; 2945 else 2946 tp->snd_cwnd = mss * ss_fltsz; 2947 2948 /* Check the interface for TSO capabilities. */ 2949 if (mtuflags & CSUM_TSO) 2950 tp->t_flags |= TF_TSO; 2951} 2952 2953/* 2954 * Determine the MSS option to send on an outgoing SYN. 2955 */ 2956int 2957tcp_mssopt(struct in_conninfo *inc) 2958{ 2959 int mss = 0; 2960 u_long maxmtu = 0; 2961 u_long thcmtu = 0; 2962 size_t min_protoh; 2963#ifdef INET6 2964 int isipv6 = inc->inc_isipv6 ? 
1 : 0; 2965#endif 2966 2967 KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); 2968 2969#ifdef INET6 2970 if (isipv6) { 2971 mss = tcp_v6mssdflt; 2972 maxmtu = tcp_maxmtu6(inc, NULL); 2973 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 2974 min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 2975 } else 2976#endif 2977 { 2978 mss = tcp_mssdflt; 2979 maxmtu = tcp_maxmtu(inc, NULL); 2980 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 2981 min_protoh = sizeof(struct tcpiphdr); 2982 } 2983 if (maxmtu && thcmtu) 2984 mss = min(maxmtu, thcmtu) - min_protoh; 2985 else if (maxmtu || thcmtu) 2986 mss = max(maxmtu, thcmtu) - min_protoh; 2987 2988 return (mss); 2989} 2990 2991 2992/* 2993 * On a partial ack arrives, force the retransmission of the 2994 * next unacknowledged segment. Do not clear tp->t_dupacks. 2995 * By setting snd_nxt to ti_ack, this forces retransmission timer to 2996 * be started again. 2997 */ 2998static void 2999tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) 3000{ 3001 tcp_seq onxt = tp->snd_nxt; 3002 u_long ocwnd = tp->snd_cwnd; 3003 3004 tcp_timer_activate(tp, TT_REXMT, 0); 3005 tp->t_rtttime = 0; 3006 tp->snd_nxt = th->th_ack; 3007 /* 3008 * Set snd_cwnd to one segment beyond acknowledged offset. 3009 * (tp->snd_una has not yet been updated when this function is called.) 3010 */ 3011 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3012 tp->t_flags |= TF_ACKNOW; 3013 (void) tcp_output(tp); 3014 tp->snd_cwnd = ocwnd; 3015 if (SEQ_GT(onxt, tp->snd_nxt)) 3016 tp->snd_nxt = onxt; 3017 /* 3018 * Partial window deflation. Relies on fact that tp->snd_una 3019 * not updated yet. 3020 */ 3021 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3022 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3023 else 3024 tp->snd_cwnd = 0; 3025 tp->snd_cwnd += tp->t_maxseg; 3026} 3027