tcp_input.c revision 183662
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
28 * 29 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: head/sys/netinet/tcp_input.c 183662 2008-10-07 09:41:07Z rwatson $"); 34 35#include "opt_ipfw.h" /* for ipfw_fwd */ 36#include "opt_inet.h" 37#include "opt_inet6.h" 38#include "opt_ipsec.h" 39#include "opt_mac.h" 40#include "opt_tcpdebug.h" 41 42#include <sys/param.h> 43#include <sys/kernel.h> 44#include <sys/malloc.h> 45#include <sys/mbuf.h> 46#include <sys/proc.h> /* for proc0 declaration */ 47#include <sys/protosw.h> 48#include <sys/signalvar.h> 49#include <sys/socket.h> 50#include <sys/socketvar.h> 51#include <sys/sysctl.h> 52#include <sys/syslog.h> 53#include <sys/systm.h> 54#include <sys/vimage.h> 55 56#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 57 58#include <vm/uma.h> 59 60#include <net/if.h> 61#include <net/route.h> 62 63#define TCPSTATES /* for logging */ 64 65#include <netinet/in.h> 66#include <netinet/in_pcb.h> 67#include <netinet/in_systm.h> 68#include <netinet/in_var.h> 69#include <netinet/ip.h> 70#include <netinet/ip_icmp.h> /* required for icmp_var.h */ 71#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 72#include <netinet/ip_var.h> 73#include <netinet/ip_options.h> 74#include <netinet/ip6.h> 75#include <netinet/icmp6.h> 76#include <netinet6/in6_pcb.h> 77#include <netinet6/ip6_var.h> 78#include <netinet6/nd6.h> 79#include <netinet/tcp.h> 80#include <netinet/tcp_fsm.h> 81#include <netinet/tcp_seq.h> 82#include <netinet/tcp_timer.h> 83#include <netinet/tcp_var.h> 84#include <netinet6/tcp6_var.h> 85#include <netinet/tcpip.h> 86#include <netinet/tcp_syncache.h> 87#ifdef TCPDEBUG 88#include <netinet/tcp_debug.h> 89#endif /* TCPDEBUG */ 90 91#ifdef IPSEC 92#include <netipsec/ipsec.h> 93#include <netipsec/ipsec6.h> 94#endif /*IPSEC*/ 95 96#include <machine/in_cksum.h> 97 98#include <security/mac/mac_framework.h> 99 100static const int tcprexmtthresh = 3; 101 102struct tcpstat tcpstat; 103SYSCTL_V_STRUCT(V_NET, vnet_inet, 
_net_inet_tcp, TCPCTL_STATS, stats,
    CTLFLAG_RW, tcpstat , tcpstat,
    "TCP statistics (struct tcpstat, netinet/tcp_var.h)");

/*
 * Logging of segments that match no inpcb: tcp_input() logs only SYN
 * segments when this is 1 and every such segment when it is 2.
 */
int tcp_log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
    &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports");

/*
 * Suppress the RST normally sent for segments that match no inpcb:
 * tcp_input() silently drops SYN segments when this is 1 and all such
 * segments when it is 2.
 */
static int	blackhole = 0;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
    blackhole, 0, "Do not send RST on segments to closed ports");

/* Consulted by the DELAY_ACK() macro when deciding whether to delay an ACK. */
int tcp_delack_enabled = 1;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, delayed_ack,
    CTLFLAG_RW, tcp_delack_enabled, 0,
    "Delay ACK to try and piggyback it onto a data packet");

/*
 * When non-zero, tcp_input() drops SYN segments that also carry FIN
 * when they arrive on a listen socket.
 */
static int drop_synfin = 0;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, drop_synfin,
    CTLFLAG_RW, drop_synfin, 0, "Drop TCP packets with SYN+FIN set");

/* Limited Transmit switch; enabled by default. */
static int tcp_do_rfc3042 = 1;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
    tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)");

/* Larger initial congestion window switch; enabled by default. */
static int tcp_do_rfc3390 = 1;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
    tcp_do_rfc3390, 0,
    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");

/* ECN knobs, exported under their own net.inet.tcp.ecn sysctl node. */
int	tcp_do_ecn = 0;
int	tcp_ecn_maxretries = 1;
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, enable,
    CTLFLAG_RW, tcp_do_ecn, 0, "TCP ECN support");
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, maxretries,
    CTLFLAG_RW, tcp_ecn_maxretries, 0, "Max retries before giving up on ECN");

/* Off by default, i.e. the stricter RST acceptance criteria are in effect. */
static int tcp_insecure_rst = 0;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, insecure_rst,
    CTLFLAG_RW, tcp_insecure_rst, 0,
    "Follow the old (insecure) criteria for accepting RST packets");

/* Automatic receive buffer sizing; enabled by default. */
int	tcp_do_autorcvbuf = 1;
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_auto,
CTLFLAG_RW, tcp_do_autorcvbuf, 0, 149 "Enable automatic receive buffer sizing"); 150 151int tcp_autorcvbuf_inc = 16*1024; 152SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_inc, 153 CTLFLAG_RW, tcp_autorcvbuf_inc, 0, 154 "Incrementor step size of automatic receive buffer"); 155 156int tcp_autorcvbuf_max = 256*1024; 157SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_max, 158 CTLFLAG_RW, tcp_autorcvbuf_max, 0, 159 "Max size of automatic receive buffer"); 160 161struct inpcbhead tcb; 162#define tcb6 tcb /* for KAME src sync over BSD*'s */ 163struct inpcbinfo tcbinfo; 164 165static void tcp_dooptions(struct tcpopt *, u_char *, int, int); 166static void tcp_do_segment(struct mbuf *, struct tcphdr *, 167 struct socket *, struct tcpcb *, int, int, uint8_t); 168static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, 169 struct tcpcb *, int, int); 170static void tcp_pulloutofband(struct socket *, 171 struct tcphdr *, struct mbuf *, int); 172static void tcp_xmit_timer(struct tcpcb *, int); 173static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); 174static void inline 175 tcp_congestion_exp(struct tcpcb *); 176 177static void inline 178tcp_congestion_exp(struct tcpcb *tp) 179{ 180 u_int win; 181 182 win = min(tp->snd_wnd, tp->snd_cwnd) / 183 2 / tp->t_maxseg; 184 if (win < 2) 185 win = 2; 186 tp->snd_ssthresh = win * tp->t_maxseg; 187 ENTER_FASTRECOVERY(tp); 188 tp->snd_recover = tp->snd_max; 189 if (tp->t_flags & TF_ECN_PERMIT) 190 tp->t_flags |= TF_ECN_SND_CWR; 191} 192 193/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ 194#ifdef INET6 195#define ND6_HINT(tp) \ 196do { \ 197 if ((tp) && (tp)->t_inpcb && \ 198 ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ 199 nd6_nud_hint(NULL, NULL, 0); \ 200} while (0) 201#else 202#define ND6_HINT(tp) 203#endif 204 205/* 206 * Indicate whether this ack should be delayed. 
We can delay the ack if 207 * - there is no delayed ack timer in progress and 208 * - our last ack wasn't a 0-sized window. We never want to delay 209 * the ack that opens up a 0-sized window and 210 * - delayed acks are enabled or 211 * - this is a half-synchronized T/TCP connection. 212 */ 213#define DELAY_ACK(tp) \ 214 ((!tcp_timer_active(tp, TT_DELACK) && \ 215 (tp->t_flags & TF_RXWIN0SENT) == 0) && \ 216 (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) 217 218/* 219 * TCP input handling is split into multiple parts: 220 * tcp6_input is a thin wrapper around tcp_input for the extended 221 * ip6_protox[] call format in ip6_input 222 * tcp_input handles primary segment validation, inpcb lookup and 223 * SYN processing on listen sockets 224 * tcp_do_segment processes the ACK and text of the segment for 225 * establishing, established and closing connections 226 */ 227#ifdef INET6 228int 229tcp6_input(struct mbuf **mp, int *offp, int proto) 230{ 231 INIT_VNET_INET6(curvnet); 232 struct mbuf *m = *mp; 233 struct in6_ifaddr *ia6; 234 235 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); 236 237 /* 238 * draft-itojun-ipv6-tcp-to-anycast 239 * better place to put this in? 
240 */ 241 ia6 = ip6_getdstifaddr(m); 242 if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { 243 struct ip6_hdr *ip6; 244 245 ip6 = mtod(m, struct ip6_hdr *); 246 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 247 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 248 return IPPROTO_DONE; 249 } 250 251 tcp_input(m, *offp); 252 return IPPROTO_DONE; 253} 254#endif 255 256void 257tcp_input(struct mbuf *m, int off0) 258{ 259 INIT_VNET_INET(curvnet); 260#ifdef INET6 261 INIT_VNET_INET6(curvnet); 262#endif 263#ifdef IPSEC 264 INIT_VNET_IPSEC(curvnet); 265#endif 266 struct tcphdr *th; 267 struct ip *ip = NULL; 268 struct ipovly *ipov; 269 struct inpcb *inp = NULL; 270 struct tcpcb *tp = NULL; 271 struct socket *so = NULL; 272 u_char *optp = NULL; 273 int optlen = 0; 274 int len, tlen, off; 275 int drop_hdrlen; 276 int thflags; 277 int rstreason = 0; /* For badport_bandlim accounting purposes */ 278 uint8_t iptos; 279#ifdef IPFIREWALL_FORWARD 280 struct m_tag *fwd_tag; 281#endif 282#ifdef INET6 283 struct ip6_hdr *ip6 = NULL; 284 int isipv6; 285#else 286 const void *ip6 = NULL; 287 const int isipv6 = 0; 288#endif 289 struct tcpopt to; /* options in this segment */ 290 char *s = NULL; /* address and port logging */ 291 292#ifdef TCPDEBUG 293 /* 294 * The size of tcp_saveipgen must be the size of the max ip header, 295 * now IPv6. 296 */ 297 u_char tcp_saveipgen[IP6_HDR_LEN]; 298 struct tcphdr tcp_savetcp; 299 short ostate = 0; 300#endif 301 302#ifdef INET6 303 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; 304#endif 305 306 to.to_flags = 0; 307 V_tcpstat.tcps_rcvtotal++; 308 309 if (isipv6) { 310#ifdef INET6 311 /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). 
*/ 312 ip6 = mtod(m, struct ip6_hdr *); 313 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; 314 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { 315 V_tcpstat.tcps_rcvbadsum++; 316 goto drop; 317 } 318 th = (struct tcphdr *)((caddr_t)ip6 + off0); 319 320 /* 321 * Be proactive about unspecified IPv6 address in source. 322 * As we use all-zero to indicate unbounded/unconnected pcb, 323 * unspecified IPv6 address can be used to confuse us. 324 * 325 * Note that packets with unspecified IPv6 destination is 326 * already dropped in ip6_input. 327 */ 328 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 329 /* XXX stat */ 330 goto drop; 331 } 332#else 333 th = NULL; /* XXX: Avoid compiler warning. */ 334#endif 335 } else { 336 /* 337 * Get IP and TCP header together in first mbuf. 338 * Note: IP leaves IP header in first mbuf. 339 */ 340 if (off0 > sizeof (struct ip)) { 341 ip_stripoptions(m, (struct mbuf *)0); 342 off0 = sizeof(struct ip); 343 } 344 if (m->m_len < sizeof (struct tcpiphdr)) { 345 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) 346 == NULL) { 347 V_tcpstat.tcps_rcvshort++; 348 return; 349 } 350 } 351 ip = mtod(m, struct ip *); 352 ipov = (struct ipovly *)ip; 353 th = (struct tcphdr *)((caddr_t)ip + off0); 354 tlen = ip->ip_len; 355 356 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 357 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 358 th->th_sum = m->m_pkthdr.csum_data; 359 else 360 th->th_sum = in_pseudo(ip->ip_src.s_addr, 361 ip->ip_dst.s_addr, 362 htonl(m->m_pkthdr.csum_data + 363 ip->ip_len + 364 IPPROTO_TCP)); 365 th->th_sum ^= 0xffff; 366#ifdef TCPDEBUG 367 ipov->ih_len = (u_short)tlen; 368 ipov->ih_len = htons(ipov->ih_len); 369#endif 370 } else { 371 /* 372 * Checksum extended TCP header and data. 
373 */ 374 len = sizeof (struct ip) + tlen; 375 bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 376 ipov->ih_len = (u_short)tlen; 377 ipov->ih_len = htons(ipov->ih_len); 378 th->th_sum = in_cksum(m, len); 379 } 380 if (th->th_sum) { 381 V_tcpstat.tcps_rcvbadsum++; 382 goto drop; 383 } 384 /* Re-initialization for later version check */ 385 ip->ip_v = IPVERSION; 386 } 387 388#ifdef INET6 389 if (isipv6) 390 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 391 else 392#endif 393 iptos = ip->ip_tos; 394 395 /* 396 * Check that TCP offset makes sense, 397 * pull out TCP options and adjust length. XXX 398 */ 399 off = th->th_off << 2; 400 if (off < sizeof (struct tcphdr) || off > tlen) { 401 V_tcpstat.tcps_rcvbadoff++; 402 goto drop; 403 } 404 tlen -= off; /* tlen is used instead of ti->ti_len */ 405 if (off > sizeof (struct tcphdr)) { 406 if (isipv6) { 407#ifdef INET6 408 IP6_EXTHDR_CHECK(m, off0, off, ); 409 ip6 = mtod(m, struct ip6_hdr *); 410 th = (struct tcphdr *)((caddr_t)ip6 + off0); 411#endif 412 } else { 413 if (m->m_len < sizeof(struct ip) + off) { 414 if ((m = m_pullup(m, sizeof (struct ip) + off)) 415 == NULL) { 416 V_tcpstat.tcps_rcvshort++; 417 return; 418 } 419 ip = mtod(m, struct ip *); 420 ipov = (struct ipovly *)ip; 421 th = (struct tcphdr *)((caddr_t)ip + off0); 422 } 423 } 424 optlen = off - sizeof (struct tcphdr); 425 optp = (u_char *)(th + 1); 426 } 427 thflags = th->th_flags; 428 429 /* 430 * Convert TCP protocol specific fields to host format. 431 */ 432 th->th_seq = ntohl(th->th_seq); 433 th->th_ack = ntohl(th->th_ack); 434 th->th_win = ntohs(th->th_win); 435 th->th_urp = ntohs(th->th_urp); 436 437 /* 438 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. 439 */ 440 drop_hdrlen = off0 + off; 441 442 /* 443 * Locate pcb for segment. 444 */ 445 INP_INFO_WLOCK(&V_tcbinfo); 446findpcb: 447 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 448#ifdef IPFIREWALL_FORWARD 449 /* 450 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
451 */ 452 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 453 454 if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ 455 struct sockaddr_in *next_hop; 456 457 next_hop = (struct sockaddr_in *)(fwd_tag+1); 458 /* 459 * Transparently forwarded. Pretend to be the destination. 460 * already got one like this? 461 */ 462 inp = in_pcblookup_hash(&V_tcbinfo, 463 ip->ip_src, th->th_sport, 464 ip->ip_dst, th->th_dport, 465 0, m->m_pkthdr.rcvif); 466 if (!inp) { 467 /* It's new. Try to find the ambushing socket. */ 468 inp = in_pcblookup_hash(&V_tcbinfo, 469 ip->ip_src, th->th_sport, 470 next_hop->sin_addr, 471 next_hop->sin_port ? 472 ntohs(next_hop->sin_port) : 473 th->th_dport, 474 INPLOOKUP_WILDCARD, 475 m->m_pkthdr.rcvif); 476 } 477 /* Remove the tag from the packet. We don't need it anymore. */ 478 m_tag_delete(m, fwd_tag); 479 } else 480#endif /* IPFIREWALL_FORWARD */ 481 { 482 if (isipv6) { 483#ifdef INET6 484 inp = in6_pcblookup_hash(&V_tcbinfo, 485 &ip6->ip6_src, th->th_sport, 486 &ip6->ip6_dst, th->th_dport, 487 INPLOOKUP_WILDCARD, 488 m->m_pkthdr.rcvif); 489#endif 490 } else 491 inp = in_pcblookup_hash(&V_tcbinfo, 492 ip->ip_src, th->th_sport, 493 ip->ip_dst, th->th_dport, 494 INPLOOKUP_WILDCARD, 495 m->m_pkthdr.rcvif); 496 } 497 498 /* 499 * If the INPCB does not exist then all data in the incoming 500 * segment is discarded and an appropriate RST is sent back. 501 * XXX MRT Send RST using which routing table? 502 */ 503 if (inp == NULL) { 504 /* 505 * Log communication attempts to ports that are not 506 * in use. 507 */ 508 if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || 509 tcp_log_in_vain == 2) { 510 if ((s = tcp_log_addrs(NULL, th, (void *)ip, ip6))) 511 log(LOG_INFO, "%s; %s: Connection attempt " 512 "to closed port\n", s, __func__); 513 } 514 /* 515 * When blackholing do not respond with a RST but 516 * completely ignore the segment and drop it. 
517 */ 518 if ((V_blackhole == 1 && (thflags & TH_SYN)) || 519 V_blackhole == 2) 520 goto dropunlock; 521 522 rstreason = BANDLIM_RST_CLOSEDPORT; 523 goto dropwithreset; 524 } 525 INP_WLOCK(inp); 526 527#ifdef IPSEC 528#ifdef INET6 529 if (isipv6 && ipsec6_in_reject(m, inp)) { 530 V_ipsec6stat.in_polvio++; 531 goto dropunlock; 532 } else 533#endif /* INET6 */ 534 if (ipsec4_in_reject(m, inp) != 0) { 535 V_ipsec4stat.in_polvio++; 536 goto dropunlock; 537 } 538#endif /* IPSEC */ 539 540 /* 541 * Check the minimum TTL for socket. 542 */ 543 if (inp->inp_ip_minttl != 0) { 544#ifdef INET6 545 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) 546 goto dropunlock; 547 else 548#endif 549 if (inp->inp_ip_minttl > ip->ip_ttl) 550 goto dropunlock; 551 } 552 553 /* 554 * A previous connection in TIMEWAIT state is supposed to catch 555 * stray or duplicate segments arriving late. If this segment 556 * was a legitimate new connection attempt the old INPCB gets 557 * removed and we can try again to find a listening socket. 558 */ 559 if (inp->inp_vflag & INP_TIMEWAIT) { 560 if (thflags & TH_SYN) 561 tcp_dooptions(&to, optp, optlen, TO_SYN); 562 /* 563 * NB: tcp_twcheck unlocks the INP and frees the mbuf. 564 */ 565 if (tcp_twcheck(inp, &to, th, m, tlen)) 566 goto findpcb; 567 INP_INFO_WUNLOCK(&V_tcbinfo); 568 return; 569 } 570 /* 571 * The TCPCB may no longer exist if the connection is winding 572 * down or it is in the CLOSED state. Either way we drop the 573 * segment and send an appropriate response. 
574 */ 575 tp = intotcpcb(inp); 576 if (tp == NULL || tp->t_state == TCPS_CLOSED) { 577 rstreason = BANDLIM_RST_CLOSEDPORT; 578 goto dropwithreset; 579 } 580 581#ifdef MAC 582 INP_WLOCK_ASSERT(inp); 583 if (mac_inpcb_check_deliver(inp, m)) 584 goto dropunlock; 585#endif 586 so = inp->inp_socket; 587 KASSERT(so != NULL, ("%s: so == NULL", __func__)); 588#ifdef TCPDEBUG 589 if (so->so_options & SO_DEBUG) { 590 ostate = tp->t_state; 591 if (isipv6) { 592#ifdef INET6 593 bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); 594#endif 595 } else 596 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); 597 tcp_savetcp = *th; 598 } 599#endif 600 /* 601 * When the socket is accepting connections (the INPCB is in LISTEN 602 * state) we look into the SYN cache if this is a new connection 603 * attempt or the completion of a previous one. 604 */ 605 if (so->so_options & SO_ACCEPTCONN) { 606 struct in_conninfo inc; 607 608 KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " 609 "tp not listening", __func__)); 610 611 bzero(&inc, sizeof(inc)); 612 inc.inc_isipv6 = isipv6; 613#ifdef INET6 614 if (isipv6) { 615 inc.inc6_faddr = ip6->ip6_src; 616 inc.inc6_laddr = ip6->ip6_dst; 617 } else 618#endif 619 { 620 inc.inc_faddr = ip->ip_src; 621 inc.inc_laddr = ip->ip_dst; 622 } 623 inc.inc_fport = th->th_sport; 624 inc.inc_lport = th->th_dport; 625 626 /* 627 * Check for an existing connection attempt in syncache if 628 * the flag is only ACK. A successful lookup creates a new 629 * socket appended to the listen queue in SYN_RECEIVED state. 630 */ 631 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { 632 /* 633 * Parse the TCP options here because 634 * syncookies need access to the reflected 635 * timestamp. 636 */ 637 tcp_dooptions(&to, optp, optlen, 0); 638 /* 639 * NB: syncache_expand() doesn't unlock 640 * inp and tcpinfo locks. 641 */ 642 if (!syncache_expand(&inc, &to, th, &so, m)) { 643 /* 644 * No syncache entry or ACK was not 645 * for our SYN/ACK. Send a RST. 
646 * NB: syncache did its own logging 647 * of the failure cause. 648 */ 649 rstreason = BANDLIM_RST_OPENPORT; 650 goto dropwithreset; 651 } 652 if (so == NULL) { 653 /* 654 * We completed the 3-way handshake 655 * but could not allocate a socket 656 * either due to memory shortage, 657 * listen queue length limits or 658 * global socket limits. Send RST 659 * or wait and have the remote end 660 * retransmit the ACK for another 661 * try. 662 */ 663 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 664 log(LOG_DEBUG, "%s; %s: Listen socket: " 665 "Socket allocation failed due to " 666 "limits or memory shortage, %s\n", 667 s, __func__, 668 V_tcp_sc_rst_sock_fail ? 669 "sending RST" : "try again"); 670 if (V_tcp_sc_rst_sock_fail) { 671 rstreason = BANDLIM_UNLIMITED; 672 goto dropwithreset; 673 } else 674 goto dropunlock; 675 } 676 /* 677 * Socket is created in state SYN_RECEIVED. 678 * Unlock the listen socket, lock the newly 679 * created socket and update the tp variable. 680 */ 681 INP_WUNLOCK(inp); /* listen socket */ 682 inp = sotoinpcb(so); 683 INP_WLOCK(inp); /* new connection */ 684 tp = intotcpcb(inp); 685 KASSERT(tp->t_state == TCPS_SYN_RECEIVED, 686 ("%s: ", __func__)); 687 /* 688 * Process the segment and the data it 689 * contains. tcp_do_segment() consumes 690 * the mbuf chain and unlocks the inpcb. 691 */ 692 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, 693 iptos); 694 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 695 return; 696 } 697 /* 698 * Segment flag validation for new connection attempts: 699 * 700 * Our (SYN|ACK) response was rejected. 701 * Check with syncache and remove entry to prevent 702 * retransmits. 703 * 704 * NB: syncache_chkrst does its own logging of failure 705 * causes. 706 */ 707 if (thflags & TH_RST) { 708 syncache_chkrst(&inc, th); 709 goto dropunlock; 710 } 711 /* 712 * We can't do anything without SYN. 
713 */ 714 if ((thflags & TH_SYN) == 0) { 715 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 716 log(LOG_DEBUG, "%s; %s: Listen socket: " 717 "SYN is missing, segment ignored\n", 718 s, __func__); 719 V_tcpstat.tcps_badsyn++; 720 goto dropunlock; 721 } 722 /* 723 * (SYN|ACK) is bogus on a listen socket. 724 */ 725 if (thflags & TH_ACK) { 726 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 727 log(LOG_DEBUG, "%s; %s: Listen socket: " 728 "SYN|ACK invalid, segment rejected\n", 729 s, __func__); 730 syncache_badack(&inc); /* XXX: Not needed! */ 731 V_tcpstat.tcps_badsyn++; 732 rstreason = BANDLIM_RST_OPENPORT; 733 goto dropwithreset; 734 } 735 /* 736 * If the drop_synfin option is enabled, drop all 737 * segments with both the SYN and FIN bits set. 738 * This prevents e.g. nmap from identifying the 739 * TCP/IP stack. 740 * XXX: Poor reasoning. nmap has other methods 741 * and is constantly refining its stack detection 742 * strategies. 743 * XXX: This is a violation of the TCP specification 744 * and was used by RFC1644. 745 */ 746 if ((thflags & TH_FIN) && V_drop_synfin) { 747 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 748 log(LOG_DEBUG, "%s; %s: Listen socket: " 749 "SYN|FIN segment ignored (based on " 750 "sysctl setting)\n", s, __func__); 751 V_tcpstat.tcps_badsyn++; 752 goto dropunlock; 753 } 754 /* 755 * Segment's flags are (SYN) or (SYN|FIN). 756 * 757 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored 758 * as they do not affect the state of the TCP FSM. 759 * The data pointed to by TH_URG and th_urp is ignored. 760 */ 761 KASSERT((thflags & (TH_RST|TH_ACK)) == 0, 762 ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); 763 KASSERT(thflags & (TH_SYN), 764 ("%s: Listen socket: TH_SYN not set", __func__)); 765#ifdef INET6 766 /* 767 * If deprecated address is forbidden, 768 * we do not accept SYN to deprecated interface 769 * address to prevent any new inbound connection from 770 * getting established. 
771 * When we do not accept SYN, we send a TCP RST, 772 * with deprecated source address (instead of dropping 773 * it). We compromise it as it is much better for peer 774 * to send a RST, and RST will be the final packet 775 * for the exchange. 776 * 777 * If we do not forbid deprecated addresses, we accept 778 * the SYN packet. RFC2462 does not suggest dropping 779 * SYN in this case. 780 * If we decipher RFC2462 5.5.4, it says like this: 781 * 1. use of deprecated addr with existing 782 * communication is okay - "SHOULD continue to be 783 * used" 784 * 2. use of it with new communication: 785 * (2a) "SHOULD NOT be used if alternate address 786 * with sufficient scope is available" 787 * (2b) nothing mentioned otherwise. 788 * Here we fall into (2b) case as we have no choice in 789 * our source address selection - we must obey the peer. 790 * 791 * The wording in RFC2462 is confusing, and there are 792 * multiple description text for deprecated address 793 * handling - worse, they are not exactly the same. 794 * I believe 5.5.4 is the best one, so we follow 5.5.4. 795 */ 796 if (isipv6 && !V_ip6_use_deprecated) { 797 struct in6_ifaddr *ia6; 798 799 if ((ia6 = ip6_getdstifaddr(m)) && 800 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 801 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 802 log(LOG_DEBUG, "%s; %s: Listen socket: " 803 "Connection attempt to deprecated " 804 "IPv6 address rejected\n", 805 s, __func__); 806 rstreason = BANDLIM_RST_OPENPORT; 807 goto dropwithreset; 808 } 809 } 810#endif 811 /* 812 * Basic sanity checks on incoming SYN requests: 813 * Don't respond if the destination is a link layer 814 * broadcast according to RFC1122 4.2.3.10, p. 104. 815 * If it is from this socket it must be forged. 816 * Don't respond if the source or destination is a 817 * global or subnet broad- or multicast address. 818 * Note that it is quite possible to receive unicast 819 * link-layer packets with a broadcast IP address. Use 820 * in_broadcast() to find them. 
821 */ 822 if (m->m_flags & (M_BCAST|M_MCAST)) { 823 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 824 log(LOG_DEBUG, "%s; %s: Listen socket: " 825 "Connection attempt from broad- or multicast " 826 "link layer address ignored\n", s, __func__); 827 goto dropunlock; 828 } 829 if (isipv6) { 830#ifdef INET6 831 if (th->th_dport == th->th_sport && 832 IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { 833 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 834 log(LOG_DEBUG, "%s; %s: Listen socket: " 835 "Connection attempt to/from self " 836 "ignored\n", s, __func__); 837 goto dropunlock; 838 } 839 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 840 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { 841 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 842 log(LOG_DEBUG, "%s; %s: Listen socket: " 843 "Connection attempt from/to multicast " 844 "address ignored\n", s, __func__); 845 goto dropunlock; 846 } 847#endif 848 } else { 849 if (th->th_dport == th->th_sport && 850 ip->ip_dst.s_addr == ip->ip_src.s_addr) { 851 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 852 log(LOG_DEBUG, "%s; %s: Listen socket: " 853 "Connection attempt from/to self " 854 "ignored\n", s, __func__); 855 goto dropunlock; 856 } 857 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 858 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 859 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 860 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { 861 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 862 log(LOG_DEBUG, "%s; %s: Listen socket: " 863 "Connection attempt from/to broad- " 864 "or multicast address ignored\n", 865 s, __func__); 866 goto dropunlock; 867 } 868 } 869 /* 870 * SYN appears to be valid. Create compressed TCP state 871 * for syncache. 
872 */ 873#ifdef TCPDEBUG 874 if (so->so_options & SO_DEBUG) 875 tcp_trace(TA_INPUT, ostate, tp, 876 (void *)tcp_saveipgen, &tcp_savetcp, 0); 877#endif 878 tcp_dooptions(&to, optp, optlen, TO_SYN); 879 syncache_add(&inc, &to, th, inp, &so, m); 880 /* 881 * Entry added to syncache and mbuf consumed. 882 * Everything already unlocked by syncache_add(). 883 */ 884 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 885 return; 886 } 887 888 /* 889 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later 890 * state. tcp_do_segment() always consumes the mbuf chain, unlocks 891 * the inpcb, and unlocks pcbinfo. 892 */ 893 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); 894 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 895 return; 896 897dropwithreset: 898 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 899 900 /* 901 * If inp is non-NULL, we call tcp_dropwithreset() holding both inpcb 902 * and global locks. However, if NULL, we must hold neither as 903 * firewalls may acquire the global lock in order to look for a 904 * matching inpcb. 905 */ 906 if (inp != NULL) { 907 tcp_dropwithreset(m, th, tp, tlen, rstreason); 908 INP_WUNLOCK(inp); 909 } 910 INP_INFO_WUNLOCK(&V_tcbinfo); 911 if (inp == NULL) 912 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 913 m = NULL; /* mbuf chain got consumed. 
*/ 914 goto drop; 915 916dropunlock: 917 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 918 if (inp != NULL) 919 INP_WUNLOCK(inp); 920 INP_INFO_WUNLOCK(&V_tcbinfo); 921 922drop: 923 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 924 if (s != NULL) 925 free(s, M_TCPLOG); 926 if (m != NULL) 927 m_freem(m); 928 return; 929} 930 931static void 932tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 933 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) 934{ 935 INIT_VNET_INET(tp->t_vnet); 936 int thflags, acked, ourfinisacked, needoutput = 0; 937 int headlocked = 1; 938 int rstreason, todrop, win; 939 u_long tiwin; 940 struct tcpopt to; 941 942#ifdef TCPDEBUG 943 /* 944 * The size of tcp_saveipgen must be the size of the max ip header, 945 * now IPv6. 946 */ 947 u_char tcp_saveipgen[IP6_HDR_LEN]; 948 struct tcphdr tcp_savetcp; 949 short ostate = 0; 950#endif 951 thflags = th->th_flags; 952 953 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 954 INP_WLOCK_ASSERT(tp->t_inpcb); 955 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 956 __func__)); 957 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 958 __func__)); 959 960 /* 961 * Segment received on connection. 962 * Reset idle time and keep-alive timer. 963 * XXX: This should be done after segment 964 * validation to ignore broken/spoofed segs. 965 */ 966 tp->t_rcvtime = ticks; 967 if (TCPS_HAVEESTABLISHED(tp->t_state)) 968 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 969 970 /* 971 * Unscale the window into a 32-bit value. 972 * For the SYN_SENT state the scale is zero. 973 */ 974 tiwin = th->th_win << tp->snd_scale; 975 976 /* 977 * TCP ECN processing. 
978 */ 979 if (tp->t_flags & TF_ECN_PERMIT) { 980 switch (iptos & IPTOS_ECN_MASK) { 981 case IPTOS_ECN_CE: 982 tp->t_flags |= TF_ECN_SND_ECE; 983 V_tcpstat.tcps_ecn_ce++; 984 break; 985 case IPTOS_ECN_ECT0: 986 V_tcpstat.tcps_ecn_ect0++; 987 break; 988 case IPTOS_ECN_ECT1: 989 V_tcpstat.tcps_ecn_ect1++; 990 break; 991 } 992 993 if (thflags & TH_CWR) 994 tp->t_flags &= ~TF_ECN_SND_ECE; 995 996 /* 997 * Congestion experienced. 998 * Ignore if we are already trying to recover. 999 */ 1000 if ((thflags & TH_ECE) && 1001 SEQ_LEQ(th->th_ack, tp->snd_recover)) { 1002 V_tcpstat.tcps_ecn_rcwnd++; 1003 tcp_congestion_exp(tp); 1004 } 1005 } 1006 1007 /* 1008 * Parse options on any incoming segment. 1009 */ 1010 tcp_dooptions(&to, (u_char *)(th + 1), 1011 (th->th_off << 2) - sizeof(struct tcphdr), 1012 (thflags & TH_SYN) ? TO_SYN : 0); 1013 1014 /* 1015 * If echoed timestamp is later than the current time, 1016 * fall back to non RFC1323 RTT calculation. Normalize 1017 * timestamp if syncookies were used when this connection 1018 * was established. 1019 */ 1020 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 1021 to.to_tsecr -= tp->ts_offset; 1022 if (TSTMP_GT(to.to_tsecr, ticks)) 1023 to.to_tsecr = 0; 1024 } 1025 1026 /* 1027 * Process options only when we get SYN/ACK back. The SYN case 1028 * for incoming connections is handled in tcp_syncache. 1029 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 1030 * or <SYN,ACK>) segment itself is never scaled. 1031 * XXX this is traditional behavior, may need to be cleaned up. 1032 */ 1033 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 1034 if ((to.to_flags & TOF_SCALE) && 1035 (tp->t_flags & TF_REQ_SCALE)) { 1036 tp->t_flags |= TF_RCVD_SCALE; 1037 tp->snd_scale = to.to_wscale; 1038 } 1039 /* 1040 * Initial send window. It will be updated with 1041 * the next incoming segment to the scaled value. 
1042 */ 1043 tp->snd_wnd = th->th_win; 1044 if (to.to_flags & TOF_TS) { 1045 tp->t_flags |= TF_RCVD_TSTMP; 1046 tp->ts_recent = to.to_tsval; 1047 tp->ts_recent_age = ticks; 1048 } 1049 if (to.to_flags & TOF_MSS) 1050 tcp_mss(tp, to.to_mss); 1051 if ((tp->t_flags & TF_SACK_PERMIT) && 1052 (to.to_flags & TOF_SACKPERM) == 0) 1053 tp->t_flags &= ~TF_SACK_PERMIT; 1054 } 1055 1056 /* 1057 * Header prediction: check for the two common cases 1058 * of a uni-directional data xfer. If the packet has 1059 * no control flags, is in-sequence, the window didn't 1060 * change and we're not retransmitting, it's a 1061 * candidate. If the length is zero and the ack moved 1062 * forward, we're the sender side of the xfer. Just 1063 * free the data acked & wake any higher level process 1064 * that was blocked waiting for space. If the length 1065 * is non-zero and the ack didn't move, we're the 1066 * receiver side. If we're getting packets in-order 1067 * (the reassembly queue is empty), add the data to 1068 * the socket buffer and note that we need a delayed ack. 1069 * Make sure that the hidden state-flags are also off. 1070 * Since we check for TCPS_ESTABLISHED first, it can only 1071 * be TH_NEEDSYN. 1072 */ 1073 if (tp->t_state == TCPS_ESTABLISHED && 1074 th->th_seq == tp->rcv_nxt && 1075 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1076 tp->snd_nxt == tp->snd_max && 1077 tiwin && tiwin == tp->snd_wnd && 1078 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 1079 LIST_EMPTY(&tp->t_segq) && 1080 ((to.to_flags & TOF_TS) == 0 || 1081 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { 1082 1083 /* 1084 * If last ACK falls within this segment's sequence numbers, 1085 * record the timestamp. 1086 * NOTE that the test is modified according to the latest 1087 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 
1088 */ 1089 if ((to.to_flags & TOF_TS) != 0 && 1090 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1091 tp->ts_recent_age = ticks; 1092 tp->ts_recent = to.to_tsval; 1093 } 1094 1095 if (tlen == 0) { 1096 if (SEQ_GT(th->th_ack, tp->snd_una) && 1097 SEQ_LEQ(th->th_ack, tp->snd_max) && 1098 tp->snd_cwnd >= tp->snd_wnd && 1099 ((!V_tcp_do_newreno && 1100 !(tp->t_flags & TF_SACK_PERMIT) && 1101 tp->t_dupacks < tcprexmtthresh) || 1102 ((V_tcp_do_newreno || 1103 (tp->t_flags & TF_SACK_PERMIT)) && 1104 !IN_FASTRECOVERY(tp) && 1105 (to.to_flags & TOF_SACK) == 0 && 1106 TAILQ_EMPTY(&tp->snd_holes)))) { 1107 KASSERT(headlocked, 1108 ("%s: headlocked", __func__)); 1109 INP_INFO_WUNLOCK(&V_tcbinfo); 1110 headlocked = 0; 1111 /* 1112 * This is a pure ack for outstanding data. 1113 */ 1114 ++V_tcpstat.tcps_predack; 1115 /* 1116 * "bad retransmit" recovery. 1117 */ 1118 if (tp->t_rxtshift == 1 && 1119 ticks < tp->t_badrxtwin) { 1120 ++V_tcpstat.tcps_sndrexmitbad; 1121 tp->snd_cwnd = tp->snd_cwnd_prev; 1122 tp->snd_ssthresh = 1123 tp->snd_ssthresh_prev; 1124 tp->snd_recover = tp->snd_recover_prev; 1125 if (tp->t_flags & TF_WASFRECOVERY) 1126 ENTER_FASTRECOVERY(tp); 1127 tp->snd_nxt = tp->snd_max; 1128 tp->t_badrxtwin = 0; 1129 } 1130 1131 /* 1132 * Recalculate the transmit timer / rtt. 1133 * 1134 * Some boxes send broken timestamp replies 1135 * during the SYN+ACK phase, ignore 1136 * timestamps of 0 or we could calculate a 1137 * huge RTT and blow up the retransmit timer. 
1138 */ 1139 if ((to.to_flags & TOF_TS) != 0 && 1140 to.to_tsecr) { 1141 if (!tp->t_rttlow || 1142 tp->t_rttlow > ticks - to.to_tsecr) 1143 tp->t_rttlow = ticks - to.to_tsecr; 1144 tcp_xmit_timer(tp, 1145 ticks - to.to_tsecr + 1); 1146 } else if (tp->t_rtttime && 1147 SEQ_GT(th->th_ack, tp->t_rtseq)) { 1148 if (!tp->t_rttlow || 1149 tp->t_rttlow > ticks - tp->t_rtttime) 1150 tp->t_rttlow = ticks - tp->t_rtttime; 1151 tcp_xmit_timer(tp, 1152 ticks - tp->t_rtttime); 1153 } 1154 tcp_xmit_bandwidth_limit(tp, th->th_ack); 1155 acked = th->th_ack - tp->snd_una; 1156 V_tcpstat.tcps_rcvackpack++; 1157 V_tcpstat.tcps_rcvackbyte += acked; 1158 sbdrop(&so->so_snd, acked); 1159 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 1160 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1161 tp->snd_recover = th->th_ack - 1; 1162 tp->snd_una = th->th_ack; 1163 /* 1164 * Pull snd_wl2 up to prevent seq wrap relative 1165 * to th_ack. 1166 */ 1167 tp->snd_wl2 = th->th_ack; 1168 tp->t_dupacks = 0; 1169 m_freem(m); 1170 ND6_HINT(tp); /* Some progress has been made. */ 1171 1172 /* 1173 * If all outstanding data are acked, stop 1174 * retransmit timer, otherwise restart timer 1175 * using current (possibly backed-off) value. 1176 * If process is waiting for space, 1177 * wakeup/selwakeup/signal. If data 1178 * are ready to send, let tcp_output 1179 * decide between more output or persist. 
1180 */ 1181#ifdef TCPDEBUG 1182 if (so->so_options & SO_DEBUG) 1183 tcp_trace(TA_INPUT, ostate, tp, 1184 (void *)tcp_saveipgen, 1185 &tcp_savetcp, 0); 1186#endif 1187 if (tp->snd_una == tp->snd_max) 1188 tcp_timer_activate(tp, TT_REXMT, 0); 1189 else if (!tcp_timer_active(tp, TT_PERSIST)) 1190 tcp_timer_activate(tp, TT_REXMT, 1191 tp->t_rxtcur); 1192 sowwakeup(so); 1193 if (so->so_snd.sb_cc) 1194 (void) tcp_output(tp); 1195 goto check_delack; 1196 } 1197 } else if (th->th_ack == tp->snd_una && 1198 tlen <= sbspace(&so->so_rcv)) { 1199 int newsize = 0; /* automatic sockbuf scaling */ 1200 1201 KASSERT(headlocked, ("%s: headlocked", __func__)); 1202 INP_INFO_WUNLOCK(&V_tcbinfo); 1203 headlocked = 0; 1204 /* 1205 * This is a pure, in-sequence data packet 1206 * with nothing on the reassembly queue and 1207 * we have enough buffer space to take it. 1208 */ 1209 /* Clean receiver SACK report if present */ 1210 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) 1211 tcp_clean_sackreport(tp); 1212 ++V_tcpstat.tcps_preddat; 1213 tp->rcv_nxt += tlen; 1214 /* 1215 * Pull snd_wl1 up to prevent seq wrap relative to 1216 * th_seq. 1217 */ 1218 tp->snd_wl1 = th->th_seq; 1219 /* 1220 * Pull rcv_up up to prevent seq wrap relative to 1221 * rcv_nxt. 1222 */ 1223 tp->rcv_up = tp->rcv_nxt; 1224 V_tcpstat.tcps_rcvpack++; 1225 V_tcpstat.tcps_rcvbyte += tlen; 1226 ND6_HINT(tp); /* Some progress has been made */ 1227#ifdef TCPDEBUG 1228 if (so->so_options & SO_DEBUG) 1229 tcp_trace(TA_INPUT, ostate, tp, 1230 (void *)tcp_saveipgen, &tcp_savetcp, 0); 1231#endif 1232 /* 1233 * Automatic sizing of receive socket buffer. Often the send 1234 * buffer size is not optimally adjusted to the actual network 1235 * conditions at hand (delay bandwidth product). Setting the 1236 * buffer size too small limits throughput on links with high 1237 * bandwidth and high delay (eg. trans-continental/oceanic links). 
1238 * 1239 * On the receive side the socket buffer memory is only rarely 1240 * used to any significant extent. This allows us to be much 1241 * more aggressive in scaling the receive socket buffer. For 1242 * the case that the buffer space is actually used to a large 1243 * extent and we run out of kernel memory we can simply drop 1244 * the new segments; TCP on the sender will just retransmit it 1245 * later. Setting the buffer size too big may only consume too 1246 * much kernel memory if the application doesn't read() from 1247 * the socket or packet loss or reordering makes use of the 1248 * reassembly queue. 1249 * 1250 * The criteria to step up the receive buffer one notch are: 1251 * 1. the number of bytes received during the time it takes 1252 * one timestamp to be reflected back to us (the RTT); 1253 * 2. received bytes per RTT is within seven eighth of the 1254 * current socket buffer size; 1255 * 3. receive buffer size has not hit maximal automatic size; 1256 * 1257 * This algorithm does one step per RTT at most and only if 1258 * we receive a bulk stream w/o packet losses or reorderings. 1259 * Shrinking the buffer during idle times is not necessary as 1260 * it doesn't consume any memory when idle. 1261 * 1262 * TODO: Only step up if the application is actually serving 1263 * the buffer to better manage the socket buffer resources. 1264 */ 1265 if (V_tcp_do_autorcvbuf && 1266 to.to_tsecr && 1267 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 1268 if (to.to_tsecr > tp->rfbuf_ts && 1269 to.to_tsecr - tp->rfbuf_ts < hz) { 1270 if (tp->rfbuf_cnt > 1271 (so->so_rcv.sb_hiwat / 8 * 7) && 1272 so->so_rcv.sb_hiwat < 1273 V_tcp_autorcvbuf_max) { 1274 newsize = 1275 min(so->so_rcv.sb_hiwat + 1276 V_tcp_autorcvbuf_inc, 1277 V_tcp_autorcvbuf_max); 1278 } 1279 /* Start over with next RTT. */ 1280 tp->rfbuf_ts = 0; 1281 tp->rfbuf_cnt = 0; 1282 } else 1283 tp->rfbuf_cnt += tlen; /* add up */ 1284 } 1285 1286 /* Add data to socket buffer. 
*/ 1287 SOCKBUF_LOCK(&so->so_rcv); 1288 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1289 m_freem(m); 1290 } else { 1291 /* 1292 * Set new socket buffer size. 1293 * Give up when limit is reached. 1294 */ 1295 if (newsize) 1296 if (!sbreserve_locked(&so->so_rcv, 1297 newsize, so, NULL)) 1298 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1299 m_adj(m, drop_hdrlen); /* delayed header drop */ 1300 sbappendstream_locked(&so->so_rcv, m); 1301 } 1302 /* NB: sorwakeup_locked() does an implicit unlock. */ 1303 sorwakeup_locked(so); 1304 if (DELAY_ACK(tp)) { 1305 tp->t_flags |= TF_DELACK; 1306 } else { 1307 tp->t_flags |= TF_ACKNOW; 1308 tcp_output(tp); 1309 } 1310 goto check_delack; 1311 } 1312 } 1313 1314 /* 1315 * Calculate amount of space in receive window, 1316 * and then do TCP input processing. 1317 * Receive window is amount of space in rcv queue, 1318 * but not less than advertised window. 1319 */ 1320 win = sbspace(&so->so_rcv); 1321 if (win < 0) 1322 win = 0; 1323 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1324 1325 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 1326 tp->rfbuf_ts = 0; 1327 tp->rfbuf_cnt = 0; 1328 1329 switch (tp->t_state) { 1330 1331 /* 1332 * If the state is SYN_RECEIVED: 1333 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 1334 */ 1335 case TCPS_SYN_RECEIVED: 1336 if ((thflags & TH_ACK) && 1337 (SEQ_LEQ(th->th_ack, tp->snd_una) || 1338 SEQ_GT(th->th_ack, tp->snd_max))) { 1339 rstreason = BANDLIM_RST_OPENPORT; 1340 goto dropwithreset; 1341 } 1342 break; 1343 1344 /* 1345 * If the state is SYN_SENT: 1346 * if seg contains an ACK, but not for our SYN, drop the input. 1347 * if seg contains a RST, then drop the connection. 1348 * if seg does not contain SYN, then drop it. 
1349 * Otherwise this is an acceptable SYN segment 1350 * initialize tp->rcv_nxt and tp->irs 1351 * if seg contains ack then advance tp->snd_una 1352 * if seg contains an ECE and ECN support is enabled, the stream 1353 * is ECN capable. 1354 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1355 * arrange for segment to be acked (eventually) 1356 * continue processing rest of data/controls, beginning with URG 1357 */ 1358 case TCPS_SYN_SENT: 1359 if ((thflags & TH_ACK) && 1360 (SEQ_LEQ(th->th_ack, tp->iss) || 1361 SEQ_GT(th->th_ack, tp->snd_max))) { 1362 rstreason = BANDLIM_UNLIMITED; 1363 goto dropwithreset; 1364 } 1365 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) 1366 tp = tcp_drop(tp, ECONNREFUSED); 1367 if (thflags & TH_RST) 1368 goto drop; 1369 if (!(thflags & TH_SYN)) 1370 goto drop; 1371 1372 tp->irs = th->th_seq; 1373 tcp_rcvseqinit(tp); 1374 if (thflags & TH_ACK) { 1375 V_tcpstat.tcps_connects++; 1376 soisconnected(so); 1377#ifdef MAC 1378 SOCK_LOCK(so); 1379 mac_socketpeer_set_from_mbuf(m, so); 1380 SOCK_UNLOCK(so); 1381#endif 1382 /* Do window scaling on this connection? */ 1383 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1384 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1385 tp->rcv_scale = tp->request_r_scale; 1386 } 1387 tp->rcv_adv += tp->rcv_wnd; 1388 tp->snd_una++; /* SYN is acked */ 1389 /* 1390 * If there's data, delay ACK; if there's also a FIN 1391 * ACKNOW will be turned on later. 1392 */ 1393 if (DELAY_ACK(tp) && tlen != 0) 1394 tcp_timer_activate(tp, TT_DELACK, 1395 tcp_delacktime); 1396 else 1397 tp->t_flags |= TF_ACKNOW; 1398 1399 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 1400 tp->t_flags |= TF_ECN_PERMIT; 1401 V_tcpstat.tcps_ecn_shs++; 1402 } 1403 1404 /* 1405 * Received <SYN,ACK> in SYN_SENT[*] state. 
1406 * Transitions: 1407 * SYN_SENT --> ESTABLISHED 1408 * SYN_SENT* --> FIN_WAIT_1 1409 */ 1410 tp->t_starttime = ticks; 1411 if (tp->t_flags & TF_NEEDFIN) { 1412 tp->t_state = TCPS_FIN_WAIT_1; 1413 tp->t_flags &= ~TF_NEEDFIN; 1414 thflags &= ~TH_SYN; 1415 } else { 1416 tp->t_state = TCPS_ESTABLISHED; 1417 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1418 } 1419 } else { 1420 /* 1421 * Received initial SYN in SYN-SENT[*] state => 1422 * simultaneous open. If segment contains CC option 1423 * and there is a cached CC, apply TAO test. 1424 * If it succeeds, connection is * half-synchronized. 1425 * Otherwise, do 3-way handshake: 1426 * SYN-SENT -> SYN-RECEIVED 1427 * SYN-SENT* -> SYN-RECEIVED* 1428 * If there was no CC option, clear cached CC value. 1429 */ 1430 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 1431 tcp_timer_activate(tp, TT_REXMT, 0); 1432 tp->t_state = TCPS_SYN_RECEIVED; 1433 } 1434 1435 KASSERT(headlocked, ("%s: trimthenstep6: head not locked", 1436 __func__)); 1437 INP_WLOCK_ASSERT(tp->t_inpcb); 1438 1439 /* 1440 * Advance th->th_seq to correspond to first data byte. 1441 * If data, trim to stay within window, 1442 * dropping FIN if necessary. 1443 */ 1444 th->th_seq++; 1445 if (tlen > tp->rcv_wnd) { 1446 todrop = tlen - tp->rcv_wnd; 1447 m_adj(m, -todrop); 1448 tlen = tp->rcv_wnd; 1449 thflags &= ~TH_FIN; 1450 V_tcpstat.tcps_rcvpackafterwin++; 1451 V_tcpstat.tcps_rcvbyteafterwin += todrop; 1452 } 1453 tp->snd_wl1 = th->th_seq - 1; 1454 tp->rcv_up = th->th_seq; 1455 /* 1456 * Client side of transaction: already sent SYN and data. 1457 * If the remote host used T/TCP to validate the SYN, 1458 * our data will be ACK'd; if so, enter normal data segment 1459 * processing in the middle of step 5, ack processing. 1460 * Otherwise, goto step 6. 1461 */ 1462 if (thflags & TH_ACK) 1463 goto process_ACK; 1464 1465 goto step6; 1466 1467 /* 1468 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 1469 * do normal processing. 
1470 * 1471 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 1472 */ 1473 case TCPS_LAST_ACK: 1474 case TCPS_CLOSING: 1475 break; /* continue normal processing */ 1476 } 1477 1478 /* 1479 * States other than LISTEN or SYN_SENT. 1480 * First check the RST flag and sequence number since reset segments 1481 * are exempt from the timestamp and connection count tests. This 1482 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 1483 * below which allowed reset segments in half the sequence space 1484 * to fall though and be processed (which gives forged reset 1485 * segments with a random sequence number a 50 percent chance of 1486 * killing a connection). 1487 * Then check timestamp, if present. 1488 * Then check the connection count, if present. 1489 * Then check that at least some bytes of segment are within 1490 * receive window. If segment begins before rcv_nxt, 1491 * drop leading data (and SYN); if nothing left, just ack. 1492 * 1493 * 1494 * If the RST bit is set, check the sequence number to see 1495 * if this is a valid reset segment. 1496 * RFC 793 page 37: 1497 * In all states except SYN-SENT, all reset (RST) segments 1498 * are validated by checking their SEQ-fields. A reset is 1499 * valid if its sequence number is in the window. 1500 * Note: this does not take into account delayed ACKs, so 1501 * we should test against last_ack_sent instead of rcv_nxt. 1502 * The sequence number in the reset segment is normally an 1503 * echo of our outgoing acknowlegement numbers, but some hosts 1504 * send a reset with the sequence number at the rightmost edge 1505 * of our receive window, and we have to handle this case. 1506 * Note 2: Paul Watson's paper "Slipping in the Window" has shown 1507 * that brute force RST attacks are possible. To combat this, 1508 * we use a much stricter check while in the ESTABLISHED state, 1509 * only accepting RSTs where the sequence number is equal to 1510 * last_ack_sent. 
In all other states (the states in which a 1511 * RST is more likely), the more permissive check is used. 1512 * If we have multiple segments in flight, the initial reset 1513 * segment sequence numbers will be to the left of last_ack_sent, 1514 * but they will eventually catch up. 1515 * In any case, it never made sense to trim reset segments to 1516 * fit the receive window since RFC 1122 says: 1517 * 4.2.2.12 RST Segment: RFC-793 Section 3.4 1518 * 1519 * A TCP SHOULD allow a received RST segment to include data. 1520 * 1521 * DISCUSSION 1522 * It has been suggested that a RST segment could contain 1523 * ASCII text that encoded and explained the cause of the 1524 * RST. No standard has yet been established for such 1525 * data. 1526 * 1527 * If the reset segment passes the sequence number test examine 1528 * the state: 1529 * SYN_RECEIVED STATE: 1530 * If passive open, return to LISTEN state. 1531 * If active open, inform user that connection was refused. 1532 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: 1533 * Inform user that connection was reset, and close tcb. 1534 * CLOSING, LAST_ACK STATES: 1535 * Close the tcb. 1536 * TIME_WAIT STATE: 1537 * Drop the segment - see Stevens, vol. 2, p. 964 and 1538 * RFC 1337. 
1539 */ 1540 if (thflags & TH_RST) { 1541 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 1542 SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1543 switch (tp->t_state) { 1544 1545 case TCPS_SYN_RECEIVED: 1546 so->so_error = ECONNREFUSED; 1547 goto close; 1548 1549 case TCPS_ESTABLISHED: 1550 if (V_tcp_insecure_rst == 0 && 1551 !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && 1552 SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && 1553 !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 1554 SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { 1555 V_tcpstat.tcps_badrst++; 1556 goto drop; 1557 } 1558 /* FALLTHROUGH */ 1559 case TCPS_FIN_WAIT_1: 1560 case TCPS_FIN_WAIT_2: 1561 case TCPS_CLOSE_WAIT: 1562 so->so_error = ECONNRESET; 1563 close: 1564 tp->t_state = TCPS_CLOSED; 1565 V_tcpstat.tcps_drops++; 1566 KASSERT(headlocked, ("%s: trimthenstep6: " 1567 "tcp_close: head not locked", __func__)); 1568 tp = tcp_close(tp); 1569 break; 1570 1571 case TCPS_CLOSING: 1572 case TCPS_LAST_ACK: 1573 KASSERT(headlocked, ("%s: trimthenstep6: " 1574 "tcp_close.2: head not locked", __func__)); 1575 tp = tcp_close(tp); 1576 break; 1577 } 1578 } 1579 goto drop; 1580 } 1581 1582 /* 1583 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1584 * and it's less than ts_recent, drop it. 1585 */ 1586 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 1587 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 1588 1589 /* Check to see if ts_recent is over 24 days old. */ 1590 if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1591 /* 1592 * Invalidate ts_recent. If this segment updates 1593 * ts_recent, the age will be reset later and ts_recent 1594 * will get a valid value. If it does not, setting 1595 * ts_recent to zero will at least satisfy the 1596 * requirement that zero be placed in the timestamp 1597 * echo reply when ts_recent isn't valid. The 1598 * age isn't reset until we get a valid ts_recent 1599 * because we don't want out-of-order segments to be 1600 * dropped when ts_recent is old. 
1601 */ 1602 tp->ts_recent = 0; 1603 } else { 1604 V_tcpstat.tcps_rcvduppack++; 1605 V_tcpstat.tcps_rcvdupbyte += tlen; 1606 V_tcpstat.tcps_pawsdrop++; 1607 if (tlen) 1608 goto dropafterack; 1609 goto drop; 1610 } 1611 } 1612 1613 /* 1614 * In the SYN-RECEIVED state, validate that the packet belongs to 1615 * this connection before trimming the data to fit the receive 1616 * window. Check the sequence number versus IRS since we know 1617 * the sequence numbers haven't wrapped. This is a partial fix 1618 * for the "LAND" DoS attack. 1619 */ 1620 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 1621 rstreason = BANDLIM_RST_OPENPORT; 1622 goto dropwithreset; 1623 } 1624 1625 todrop = tp->rcv_nxt - th->th_seq; 1626 if (todrop > 0) { 1627 if (thflags & TH_SYN) { 1628 thflags &= ~TH_SYN; 1629 th->th_seq++; 1630 if (th->th_urp > 1) 1631 th->th_urp--; 1632 else 1633 thflags &= ~TH_URG; 1634 todrop--; 1635 } 1636 /* 1637 * Following if statement from Stevens, vol. 2, p. 960. 1638 */ 1639 if (todrop > tlen 1640 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1641 /* 1642 * Any valid FIN must be to the left of the window. 1643 * At this point the FIN must be a duplicate or out 1644 * of sequence; drop it. 1645 */ 1646 thflags &= ~TH_FIN; 1647 1648 /* 1649 * Send an ACK to resynchronize and drop any data. 1650 * But keep on processing for RST or ACK. 1651 */ 1652 tp->t_flags |= TF_ACKNOW; 1653 todrop = tlen; 1654 V_tcpstat.tcps_rcvduppack++; 1655 V_tcpstat.tcps_rcvdupbyte += todrop; 1656 } else { 1657 V_tcpstat.tcps_rcvpartduppack++; 1658 V_tcpstat.tcps_rcvpartdupbyte += todrop; 1659 } 1660 drop_hdrlen += todrop; /* drop from the top afterwards */ 1661 th->th_seq += todrop; 1662 tlen -= todrop; 1663 if (th->th_urp > todrop) 1664 th->th_urp -= todrop; 1665 else { 1666 thflags &= ~TH_URG; 1667 th->th_urp = 0; 1668 } 1669 } 1670 1671 /* 1672 * If new data are received on a connection after the 1673 * user processes are gone, then RST the other end. 
1674 */ 1675 if ((so->so_state & SS_NOFDREF) && 1676 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1677 char *s; 1678 1679 KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head " 1680 "not locked", __func__)); 1681 if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { 1682 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " 1683 "was closed, sending RST and removing tcpcb\n", 1684 s, __func__, tcpstates[tp->t_state], tlen); 1685 free(s, M_TCPLOG); 1686 } 1687 tp = tcp_close(tp); 1688 V_tcpstat.tcps_rcvafterclose++; 1689 rstreason = BANDLIM_UNLIMITED; 1690 goto dropwithreset; 1691 } 1692 1693 /* 1694 * If segment ends after window, drop trailing data 1695 * (and PUSH and FIN); if nothing left, just ACK. 1696 */ 1697 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 1698 if (todrop > 0) { 1699 V_tcpstat.tcps_rcvpackafterwin++; 1700 if (todrop >= tlen) { 1701 V_tcpstat.tcps_rcvbyteafterwin += tlen; 1702 /* 1703 * If window is closed can only take segments at 1704 * window edge, and have to drop data and PUSH from 1705 * incoming segments. Continue processing, but 1706 * remember to ack. Otherwise, drop segment 1707 * and ack. 1708 */ 1709 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1710 tp->t_flags |= TF_ACKNOW; 1711 V_tcpstat.tcps_rcvwinprobe++; 1712 } else 1713 goto dropafterack; 1714 } else 1715 V_tcpstat.tcps_rcvbyteafterwin += todrop; 1716 m_adj(m, -todrop); 1717 tlen -= todrop; 1718 thflags &= ~(TH_PUSH|TH_FIN); 1719 } 1720 1721 /* 1722 * If last ACK falls within this segment's sequence numbers, 1723 * record its timestamp. 1724 * NOTE: 1725 * 1) That the test incorporates suggestions from the latest 1726 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1727 * 2) That updating only on newer timestamps interferes with 1728 * our earlier PAWS tests, so this check should be solely 1729 * predicated on the sequence space of this segment. 
1730 * 3) That we modify the segment boundary check to be 1731 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 1732 * instead of RFC1323's 1733 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 1734 * This modified check allows us to overcome RFC1323's 1735 * limitations as described in Stevens TCP/IP Illustrated 1736 * Vol. 2 p.869. In such cases, we can still calculate the 1737 * RTT correctly when RCV.NXT == Last.ACK.Sent. 1738 */ 1739 if ((to.to_flags & TOF_TS) != 0 && 1740 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1741 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1742 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 1743 tp->ts_recent_age = ticks; 1744 tp->ts_recent = to.to_tsval; 1745 } 1746 1747 /* 1748 * If a SYN is in the window, then this is an 1749 * error and we send an RST and drop the connection. 1750 */ 1751 if (thflags & TH_SYN) { 1752 KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: " 1753 "head not locked", __func__)); 1754 tp = tcp_drop(tp, ECONNRESET); 1755 rstreason = BANDLIM_UNLIMITED; 1756 goto drop; 1757 } 1758 1759 /* 1760 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 1761 * flag is on (half-synchronized state), then queue data for 1762 * later processing; else drop segment and return. 1763 */ 1764 if ((thflags & TH_ACK) == 0) { 1765 if (tp->t_state == TCPS_SYN_RECEIVED || 1766 (tp->t_flags & TF_NEEDSYN)) 1767 goto step6; 1768 else if (tp->t_flags & TF_ACKNOW) 1769 goto dropafterack; 1770 else 1771 goto drop; 1772 } 1773 1774 /* 1775 * Ack processing. 1776 */ 1777 switch (tp->t_state) { 1778 1779 /* 1780 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1781 * ESTABLISHED state and continue processing. 1782 * The ACK was checked above. 1783 */ 1784 case TCPS_SYN_RECEIVED: 1785 1786 V_tcpstat.tcps_connects++; 1787 soisconnected(so); 1788 /* Do window scaling? 
*/ 1789 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1790 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1791 tp->rcv_scale = tp->request_r_scale; 1792 tp->snd_wnd = tiwin; 1793 } 1794 /* 1795 * Make transitions: 1796 * SYN-RECEIVED -> ESTABLISHED 1797 * SYN-RECEIVED* -> FIN-WAIT-1 1798 */ 1799 tp->t_starttime = ticks; 1800 if (tp->t_flags & TF_NEEDFIN) { 1801 tp->t_state = TCPS_FIN_WAIT_1; 1802 tp->t_flags &= ~TF_NEEDFIN; 1803 } else { 1804 tp->t_state = TCPS_ESTABLISHED; 1805 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1806 } 1807 /* 1808 * If segment contains data or ACK, will call tcp_reass() 1809 * later; if not, do so now to pass queued data to user. 1810 */ 1811 if (tlen == 0 && (thflags & TH_FIN) == 0) 1812 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 1813 (struct mbuf *)0); 1814 tp->snd_wl1 = th->th_seq - 1; 1815 /* FALLTHROUGH */ 1816 1817 /* 1818 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1819 * ACKs. If the ack is in the range 1820 * tp->snd_una < th->th_ack <= tp->snd_max 1821 * then advance tp->snd_una to th->th_ack and drop 1822 * data from the retransmission queue. If this ACK reflects 1823 * more up to date window information we update our window information. 
1824 */ 1825 case TCPS_ESTABLISHED: 1826 case TCPS_FIN_WAIT_1: 1827 case TCPS_FIN_WAIT_2: 1828 case TCPS_CLOSE_WAIT: 1829 case TCPS_CLOSING: 1830 case TCPS_LAST_ACK: 1831 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1832 V_tcpstat.tcps_rcvacktoomuch++; 1833 goto dropafterack; 1834 } 1835 if ((tp->t_flags & TF_SACK_PERMIT) && 1836 ((to.to_flags & TOF_SACK) || 1837 !TAILQ_EMPTY(&tp->snd_holes))) 1838 tcp_sack_doack(tp, &to, th->th_ack); 1839 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1840 if (tlen == 0 && tiwin == tp->snd_wnd) { 1841 V_tcpstat.tcps_rcvdupack++; 1842 /* 1843 * If we have outstanding data (other than 1844 * a window probe), this is a completely 1845 * duplicate ack (ie, window info didn't 1846 * change), the ack is the biggest we've 1847 * seen and we've seen exactly our rexmt 1848 * threshhold of them, assume a packet 1849 * has been dropped and retransmit it. 1850 * Kludge snd_nxt & the congestion 1851 * window so we send only this one 1852 * packet. 1853 * 1854 * We know we're losing at the current 1855 * window size so do congestion avoidance 1856 * (set ssthresh to half the current window 1857 * and pull our congestion window back to 1858 * the new ssthresh). 1859 * 1860 * Dup acks mean that packets have left the 1861 * network (they're now cached at the receiver) 1862 * so bump cwnd by the amount in the receiver 1863 * to keep a constant cwnd packets in the 1864 * network. 1865 * 1866 * When using TCP ECN, notify the peer that 1867 * we reduced the cwnd. 1868 */ 1869 if (!tcp_timer_active(tp, TT_REXMT) || 1870 th->th_ack != tp->snd_una) 1871 tp->t_dupacks = 0; 1872 else if (++tp->t_dupacks > tcprexmtthresh || 1873 ((V_tcp_do_newreno || 1874 (tp->t_flags & TF_SACK_PERMIT)) && 1875 IN_FASTRECOVERY(tp))) { 1876 if ((tp->t_flags & TF_SACK_PERMIT) && 1877 IN_FASTRECOVERY(tp)) { 1878 int awnd; 1879 1880 /* 1881 * Compute the amount of data in flight first. 
1882 * We can inject new data into the pipe iff 1883 * we have less than 1/2 the original window's 1884 * worth of data in flight. 1885 */ 1886 awnd = (tp->snd_nxt - tp->snd_fack) + 1887 tp->sackhint.sack_bytes_rexmit; 1888 if (awnd < tp->snd_ssthresh) { 1889 tp->snd_cwnd += tp->t_maxseg; 1890 if (tp->snd_cwnd > tp->snd_ssthresh) 1891 tp->snd_cwnd = tp->snd_ssthresh; 1892 } 1893 } else 1894 tp->snd_cwnd += tp->t_maxseg; 1895 (void) tcp_output(tp); 1896 goto drop; 1897 } else if (tp->t_dupacks == tcprexmtthresh) { 1898 tcp_seq onxt = tp->snd_nxt; 1899 1900 /* 1901 * If we're doing sack, check to 1902 * see if we're already in sack 1903 * recovery. If we're not doing sack, 1904 * check to see if we're in newreno 1905 * recovery. 1906 */ 1907 if (tp->t_flags & TF_SACK_PERMIT) { 1908 if (IN_FASTRECOVERY(tp)) { 1909 tp->t_dupacks = 0; 1910 break; 1911 } 1912 } else if (V_tcp_do_newreno || 1913 V_tcp_do_ecn) { 1914 if (SEQ_LEQ(th->th_ack, 1915 tp->snd_recover)) { 1916 tp->t_dupacks = 0; 1917 break; 1918 } 1919 } 1920 tcp_congestion_exp(tp); 1921 tcp_timer_activate(tp, TT_REXMT, 0); 1922 tp->t_rtttime = 0; 1923 if (tp->t_flags & TF_SACK_PERMIT) { 1924 V_tcpstat.tcps_sack_recovery_episode++; 1925 tp->sack_newdata = tp->snd_nxt; 1926 tp->snd_cwnd = tp->t_maxseg; 1927 (void) tcp_output(tp); 1928 goto drop; 1929 } 1930 tp->snd_nxt = th->th_ack; 1931 tp->snd_cwnd = tp->t_maxseg; 1932 (void) tcp_output(tp); 1933 KASSERT(tp->snd_limited <= 2, 1934 ("%s: tp->snd_limited too big", 1935 __func__)); 1936 tp->snd_cwnd = tp->snd_ssthresh + 1937 tp->t_maxseg * 1938 (tp->t_dupacks - tp->snd_limited); 1939 if (SEQ_GT(onxt, tp->snd_nxt)) 1940 tp->snd_nxt = onxt; 1941 goto drop; 1942 } else if (V_tcp_do_rfc3042) { 1943 u_long oldcwnd = tp->snd_cwnd; 1944 tcp_seq oldsndmax = tp->snd_max; 1945 u_int sent; 1946 1947 KASSERT(tp->t_dupacks == 1 || 1948 tp->t_dupacks == 2, 1949 ("%s: dupacks not 1 or 2", 1950 __func__)); 1951 if (tp->t_dupacks == 1) 1952 tp->snd_limited = 0; 1953 tp->snd_cwnd = 
1954 (tp->snd_nxt - tp->snd_una) + 1955 (tp->t_dupacks - tp->snd_limited) * 1956 tp->t_maxseg; 1957 (void) tcp_output(tp); 1958 sent = tp->snd_max - oldsndmax; 1959 if (sent > tp->t_maxseg) { 1960 KASSERT((tp->t_dupacks == 2 && 1961 tp->snd_limited == 0) || 1962 (sent == tp->t_maxseg + 1 && 1963 tp->t_flags & TF_SENTFIN), 1964 ("%s: sent too much", 1965 __func__)); 1966 tp->snd_limited = 2; 1967 } else if (sent > 0) 1968 ++tp->snd_limited; 1969 tp->snd_cwnd = oldcwnd; 1970 goto drop; 1971 } 1972 } else 1973 tp->t_dupacks = 0; 1974 break; 1975 } 1976 1977 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 1978 ("%s: th_ack <= snd_una", __func__)); 1979 1980 /* 1981 * If the congestion window was inflated to account 1982 * for the other side's cached packets, retract it. 1983 */ 1984 if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { 1985 if (IN_FASTRECOVERY(tp)) { 1986 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 1987 if (tp->t_flags & TF_SACK_PERMIT) 1988 tcp_sack_partialack(tp, th); 1989 else 1990 tcp_newreno_partial_ack(tp, th); 1991 } else { 1992 /* 1993 * Out of fast recovery. 1994 * Window inflation should have left us 1995 * with approximately snd_ssthresh 1996 * outstanding data. 1997 * But in case we would be inclined to 1998 * send a burst, better to do it via 1999 * the slow start mechanism. 2000 */ 2001 if (SEQ_GT(th->th_ack + 2002 tp->snd_ssthresh, 2003 tp->snd_max)) 2004 tp->snd_cwnd = tp->snd_max - 2005 th->th_ack + 2006 tp->t_maxseg; 2007 else 2008 tp->snd_cwnd = tp->snd_ssthresh; 2009 } 2010 } 2011 } else { 2012 if (tp->t_dupacks >= tcprexmtthresh && 2013 tp->snd_cwnd > tp->snd_ssthresh) 2014 tp->snd_cwnd = tp->snd_ssthresh; 2015 } 2016 tp->t_dupacks = 0; 2017 /* 2018 * If we reach this point, ACK is not a duplicate, 2019 * i.e., it ACKs something we sent. 2020 */ 2021 if (tp->t_flags & TF_NEEDSYN) { 2022 /* 2023 * T/TCP: Connection was half-synchronized, and our 2024 * SYN has been ACK'd (so connection is now fully 2025 * synchronized). 
Go to non-starred state, 2026 * increment snd_una for ACK of SYN, and check if 2027 * we can do window scaling. 2028 */ 2029 tp->t_flags &= ~TF_NEEDSYN; 2030 tp->snd_una++; 2031 /* Do window scaling? */ 2032 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2033 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2034 tp->rcv_scale = tp->request_r_scale; 2035 /* Send window already scaled. */ 2036 } 2037 } 2038 2039process_ACK: 2040 KASSERT(headlocked, ("%s: process_ACK: head not locked", 2041 __func__)); 2042 INP_WLOCK_ASSERT(tp->t_inpcb); 2043 2044 acked = th->th_ack - tp->snd_una; 2045 V_tcpstat.tcps_rcvackpack++; 2046 V_tcpstat.tcps_rcvackbyte += acked; 2047 2048 /* 2049 * If we just performed our first retransmit, and the ACK 2050 * arrives within our recovery window, then it was a mistake 2051 * to do the retransmit in the first place. Recover our 2052 * original cwnd and ssthresh, and proceed to transmit where 2053 * we left off. 2054 */ 2055 if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { 2056 ++V_tcpstat.tcps_sndrexmitbad; 2057 tp->snd_cwnd = tp->snd_cwnd_prev; 2058 tp->snd_ssthresh = tp->snd_ssthresh_prev; 2059 tp->snd_recover = tp->snd_recover_prev; 2060 if (tp->t_flags & TF_WASFRECOVERY) 2061 ENTER_FASTRECOVERY(tp); 2062 tp->snd_nxt = tp->snd_max; 2063 tp->t_badrxtwin = 0; /* XXX probably not required */ 2064 } 2065 2066 /* 2067 * If we have a timestamp reply, update smoothed 2068 * round trip time. If no timestamp is present but 2069 * transmit timer is running and timed sequence 2070 * number was acked, update smoothed round trip time. 2071 * Since we now have an rtt measurement, cancel the 2072 * timer backoff (cf., Phil Karn's retransmit alg.). 2073 * Recompute the initial retransmit timer. 2074 * 2075 * Some boxes send broken timestamp replies 2076 * during the SYN+ACK phase, ignore 2077 * timestamps of 0 or we could calculate a 2078 * huge RTT and blow up the retransmit timer. 
2079 */ 2080 if ((to.to_flags & TOF_TS) != 0 && 2081 to.to_tsecr) { 2082 if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) 2083 tp->t_rttlow = ticks - to.to_tsecr; 2084 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); 2085 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 2086 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 2087 tp->t_rttlow = ticks - tp->t_rtttime; 2088 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 2089 } 2090 tcp_xmit_bandwidth_limit(tp, th->th_ack); 2091 2092 /* 2093 * If all outstanding data is acked, stop retransmit 2094 * timer and remember to restart (more output or persist). 2095 * If there is more data to be acked, restart retransmit 2096 * timer, using current (possibly backed-off) value. 2097 */ 2098 if (th->th_ack == tp->snd_max) { 2099 tcp_timer_activate(tp, TT_REXMT, 0); 2100 needoutput = 1; 2101 } else if (!tcp_timer_active(tp, TT_PERSIST)) 2102 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 2103 2104 /* 2105 * If no data (only SYN) was ACK'd, 2106 * skip rest of ACK processing. 2107 */ 2108 if (acked == 0) 2109 goto step6; 2110 2111 /* 2112 * When new data is acked, open the congestion window. 2113 * If the window gives us less than ssthresh packets 2114 * in flight, open exponentially (maxseg per packet). 2115 * Otherwise open linearly: maxseg per window 2116 * (maxseg^2 / cwnd per packet). 2117 * If cwnd > maxseg^2, fix the cwnd increment at 1 byte 2118 * to avoid capping cwnd (as suggested in RFC2581). 
2119 */ 2120 if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || 2121 !IN_FASTRECOVERY(tp)) { 2122 u_int cw = tp->snd_cwnd; 2123 u_int incr = tp->t_maxseg; 2124 if (cw > tp->snd_ssthresh) 2125 incr = max((incr * incr / cw), 1); 2126 tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); 2127 } 2128 SOCKBUF_LOCK(&so->so_snd); 2129 if (acked > so->so_snd.sb_cc) { 2130 tp->snd_wnd -= so->so_snd.sb_cc; 2131 sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); 2132 ourfinisacked = 1; 2133 } else { 2134 sbdrop_locked(&so->so_snd, acked); 2135 tp->snd_wnd -= acked; 2136 ourfinisacked = 0; 2137 } 2138 /* NB: sowwakeup_locked() does an implicit unlock. */ 2139 sowwakeup_locked(so); 2140 /* Detect una wraparound. */ 2141 if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && 2142 !IN_FASTRECOVERY(tp) && 2143 SEQ_GT(tp->snd_una, tp->snd_recover) && 2144 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2145 tp->snd_recover = th->th_ack - 1; 2146 if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && 2147 IN_FASTRECOVERY(tp) && 2148 SEQ_GEQ(th->th_ack, tp->snd_recover)) 2149 EXIT_FASTRECOVERY(tp); 2150 tp->snd_una = th->th_ack; 2151 if (tp->t_flags & TF_SACK_PERMIT) { 2152 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 2153 tp->snd_recover = tp->snd_una; 2154 } 2155 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2156 tp->snd_nxt = tp->snd_una; 2157 2158 switch (tp->t_state) { 2159 2160 /* 2161 * In FIN_WAIT_1 STATE in addition to the processing 2162 * for the ESTABLISHED state if our FIN is now acknowledged 2163 * then enter FIN_WAIT_2. 2164 */ 2165 case TCPS_FIN_WAIT_1: 2166 if (ourfinisacked) { 2167 /* 2168 * If we can't receive any more 2169 * data, then closing user can proceed. 2170 * Starting the timer is contrary to the 2171 * specification, but if we don't get a FIN 2172 * we'll hang forever. 2173 * 2174 * XXXjl: 2175 * we should release the tp also, and use a 2176 * compressed state. 
2177 */ 2178 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2179 int timeout; 2180 2181 soisdisconnected(so); 2182 timeout = (tcp_fast_finwait2_recycle) ? 2183 tcp_finwait2_timeout : tcp_maxidle; 2184 tcp_timer_activate(tp, TT_2MSL, timeout); 2185 } 2186 tp->t_state = TCPS_FIN_WAIT_2; 2187 } 2188 break; 2189 2190 /* 2191 * In CLOSING STATE in addition to the processing for 2192 * the ESTABLISHED state if the ACK acknowledges our FIN 2193 * then enter the TIME-WAIT state, otherwise ignore 2194 * the segment. 2195 */ 2196 case TCPS_CLOSING: 2197 if (ourfinisacked) { 2198 KASSERT(headlocked, ("%s: process_ACK: " 2199 "head not locked", __func__)); 2200 tcp_twstart(tp); 2201 INP_INFO_WUNLOCK(&V_tcbinfo); 2202 headlocked = 0; 2203 m_freem(m); 2204 return; 2205 } 2206 break; 2207 2208 /* 2209 * In LAST_ACK, we may still be waiting for data to drain 2210 * and/or to be acked, as well as for the ack of our FIN. 2211 * If our FIN is now acknowledged, delete the TCB, 2212 * enter the closed state and return. 2213 */ 2214 case TCPS_LAST_ACK: 2215 if (ourfinisacked) { 2216 KASSERT(headlocked, ("%s: process_ACK: " 2217 "tcp_close: head not locked", __func__)); 2218 tp = tcp_close(tp); 2219 goto drop; 2220 } 2221 break; 2222 } 2223 } 2224 2225step6: 2226 KASSERT(headlocked, ("%s: step6: head not locked", __func__)); 2227 INP_WLOCK_ASSERT(tp->t_inpcb); 2228 2229 /* 2230 * Update window information. 2231 * Don't look at window if no ACK: TAC's send garbage on first SYN. 
2232 */ 2233 if ((thflags & TH_ACK) && 2234 (SEQ_LT(tp->snd_wl1, th->th_seq) || 2235 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 2236 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2237 /* keep track of pure window updates */ 2238 if (tlen == 0 && 2239 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2240 V_tcpstat.tcps_rcvwinupd++; 2241 tp->snd_wnd = tiwin; 2242 tp->snd_wl1 = th->th_seq; 2243 tp->snd_wl2 = th->th_ack; 2244 if (tp->snd_wnd > tp->max_sndwnd) 2245 tp->max_sndwnd = tp->snd_wnd; 2246 needoutput = 1; 2247 } 2248 2249 /* 2250 * Process segments with URG. 2251 */ 2252 if ((thflags & TH_URG) && th->th_urp && 2253 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2254 /* 2255 * This is a kludge, but if we receive and accept 2256 * random urgent pointers, we'll crash in 2257 * soreceive. It's hard to imagine someone 2258 * actually wanting to send this much urgent data. 2259 */ 2260 SOCKBUF_LOCK(&so->so_rcv); 2261 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2262 th->th_urp = 0; /* XXX */ 2263 thflags &= ~TH_URG; /* XXX */ 2264 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 2265 goto dodata; /* XXX */ 2266 } 2267 /* 2268 * If this segment advances the known urgent pointer, 2269 * then mark the data stream. This should not happen 2270 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2271 * a FIN has been received from the remote side. 2272 * In these states we ignore the URG. 2273 * 2274 * According to RFC961 (Assigned Protocols), 2275 * the urgent pointer points to the last octet 2276 * of urgent data. We continue, however, 2277 * to consider it to indicate the first octet 2278 * of data past the urgent section as the original 2279 * spec states (in one of two places). 
2280 */ 2281 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2282 tp->rcv_up = th->th_seq + th->th_urp; 2283 so->so_oobmark = so->so_rcv.sb_cc + 2284 (tp->rcv_up - tp->rcv_nxt) - 1; 2285 if (so->so_oobmark == 0) 2286 so->so_rcv.sb_state |= SBS_RCVATMARK; 2287 sohasoutofband(so); 2288 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2289 } 2290 SOCKBUF_UNLOCK(&so->so_rcv); 2291 /* 2292 * Remove out of band data so doesn't get presented to user. 2293 * This can happen independent of advancing the URG pointer, 2294 * but if two URG's are pending at once, some out-of-band 2295 * data may creep in... ick. 2296 */ 2297 if (th->th_urp <= (u_long)tlen && 2298 !(so->so_options & SO_OOBINLINE)) { 2299 /* hdr drop is delayed */ 2300 tcp_pulloutofband(so, th, m, drop_hdrlen); 2301 } 2302 } else { 2303 /* 2304 * If no out of band data is expected, 2305 * pull receive urgent pointer along 2306 * with the receive window. 2307 */ 2308 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2309 tp->rcv_up = tp->rcv_nxt; 2310 } 2311dodata: /* XXX */ 2312 KASSERT(headlocked, ("%s: dodata: head not locked", __func__)); 2313 INP_WLOCK_ASSERT(tp->t_inpcb); 2314 2315 /* 2316 * Process the segment text, merging it into the TCP sequencing queue, 2317 * and arranging for acknowledgment of receipt if necessary. 2318 * This process logically involves adjusting tp->rcv_wnd as data 2319 * is presented to the user (this happens in tcp_usrreq.c, 2320 * case PRU_RCVD). If a FIN has already been received on this 2321 * connection then we just ignore the text. 2322 */ 2323 if ((tlen || (thflags & TH_FIN)) && 2324 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2325 tcp_seq save_start = th->th_seq; 2326 m_adj(m, drop_hdrlen); /* delayed header drop */ 2327 /* 2328 * Insert segment which includes th into TCP reassembly queue 2329 * with control block tp. Set thflags to whether reassembly now 2330 * includes a segment with FIN. 
This handles the common case 2331 * inline (segment is the next to be received on an established 2332 * connection, and the queue is empty), avoiding linkage into 2333 * and removal from the queue and repetition of various 2334 * conversions. 2335 * Set DELACK for segments received in order, but ack 2336 * immediately when segments are out of order (so 2337 * fast retransmit can work). 2338 */ 2339 if (th->th_seq == tp->rcv_nxt && 2340 LIST_EMPTY(&tp->t_segq) && 2341 TCPS_HAVEESTABLISHED(tp->t_state)) { 2342 if (DELAY_ACK(tp)) 2343 tp->t_flags |= TF_DELACK; 2344 else 2345 tp->t_flags |= TF_ACKNOW; 2346 tp->rcv_nxt += tlen; 2347 thflags = th->th_flags & TH_FIN; 2348 V_tcpstat.tcps_rcvpack++; 2349 V_tcpstat.tcps_rcvbyte += tlen; 2350 ND6_HINT(tp); 2351 SOCKBUF_LOCK(&so->so_rcv); 2352 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 2353 m_freem(m); 2354 else 2355 sbappendstream_locked(&so->so_rcv, m); 2356 /* NB: sorwakeup_locked() does an implicit unlock. */ 2357 sorwakeup_locked(so); 2358 } else { 2359 /* 2360 * XXX: Due to the header drop above "th" is 2361 * theoretically invalid by now. Fortunately 2362 * m_adj() doesn't actually frees any mbufs 2363 * when trimming from the head. 2364 */ 2365 thflags = tcp_reass(tp, th, &tlen, m); 2366 tp->t_flags |= TF_ACKNOW; 2367 } 2368 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 2369 tcp_update_sack_list(tp, save_start, save_start + tlen); 2370#if 0 2371 /* 2372 * Note the amount of data that peer has sent into 2373 * our window, in order to estimate the sender's 2374 * buffer size. 2375 * XXX: Unused. 2376 */ 2377 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2378#endif 2379 } else { 2380 m_freem(m); 2381 thflags &= ~TH_FIN; 2382 } 2383 2384 /* 2385 * If FIN is received ACK the FIN and let the user know 2386 * that the connection is closing. 
2387 */ 2388 if (thflags & TH_FIN) { 2389 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2390 socantrcvmore(so); 2391 /* 2392 * If connection is half-synchronized 2393 * (ie NEEDSYN flag on) then delay ACK, 2394 * so it may be piggybacked when SYN is sent. 2395 * Otherwise, since we received a FIN then no 2396 * more input can be expected, send ACK now. 2397 */ 2398 if (tp->t_flags & TF_NEEDSYN) 2399 tp->t_flags |= TF_DELACK; 2400 else 2401 tp->t_flags |= TF_ACKNOW; 2402 tp->rcv_nxt++; 2403 } 2404 switch (tp->t_state) { 2405 2406 /* 2407 * In SYN_RECEIVED and ESTABLISHED STATES 2408 * enter the CLOSE_WAIT state. 2409 */ 2410 case TCPS_SYN_RECEIVED: 2411 tp->t_starttime = ticks; 2412 /* FALLTHROUGH */ 2413 case TCPS_ESTABLISHED: 2414 tp->t_state = TCPS_CLOSE_WAIT; 2415 break; 2416 2417 /* 2418 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2419 * enter the CLOSING state. 2420 */ 2421 case TCPS_FIN_WAIT_1: 2422 tp->t_state = TCPS_CLOSING; 2423 break; 2424 2425 /* 2426 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2427 * starting the time-wait timer, turning off the other 2428 * standard timers. 2429 */ 2430 case TCPS_FIN_WAIT_2: 2431 KASSERT(headlocked == 1, ("%s: dodata: " 2432 "TCP_FIN_WAIT_2: head not locked", __func__)); 2433 tcp_twstart(tp); 2434 INP_INFO_WUNLOCK(&V_tcbinfo); 2435 return; 2436 } 2437 } 2438 INP_INFO_WUNLOCK(&V_tcbinfo); 2439 headlocked = 0; 2440#ifdef TCPDEBUG 2441 if (so->so_options & SO_DEBUG) 2442 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 2443 &tcp_savetcp, 0); 2444#endif 2445 2446 /* 2447 * Return any desired output. 
2448 */ 2449 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2450 (void) tcp_output(tp); 2451 2452check_delack: 2453 KASSERT(headlocked == 0, ("%s: check_delack: head locked", 2454 __func__)); 2455 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2456 INP_WLOCK_ASSERT(tp->t_inpcb); 2457 if (tp->t_flags & TF_DELACK) { 2458 tp->t_flags &= ~TF_DELACK; 2459 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 2460 } 2461 INP_WUNLOCK(tp->t_inpcb); 2462 return; 2463 2464dropafterack: 2465 KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__)); 2466 /* 2467 * Generate an ACK dropping incoming segment if it occupies 2468 * sequence space, where the ACK reflects our state. 2469 * 2470 * We can now skip the test for the RST flag since all 2471 * paths to this code happen after packets containing 2472 * RST have been dropped. 2473 * 2474 * In the SYN-RECEIVED state, don't send an ACK unless the 2475 * segment we received passes the SYN-RECEIVED ACK test. 2476 * If it fails send a RST. This breaks the loop in the 2477 * "LAND" DoS attack, and also prevents an ACK storm 2478 * between two listening ports that have been sent forged 2479 * SYN segments, each with the source address of the other. 
2480 */ 2481 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 2482 (SEQ_GT(tp->snd_una, th->th_ack) || 2483 SEQ_GT(th->th_ack, tp->snd_max)) ) { 2484 rstreason = BANDLIM_RST_OPENPORT; 2485 goto dropwithreset; 2486 } 2487#ifdef TCPDEBUG 2488 if (so->so_options & SO_DEBUG) 2489 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2490 &tcp_savetcp, 0); 2491#endif 2492 KASSERT(headlocked, ("%s: headlocked should be 1", __func__)); 2493 INP_INFO_WUNLOCK(&V_tcbinfo); 2494 tp->t_flags |= TF_ACKNOW; 2495 (void) tcp_output(tp); 2496 INP_WUNLOCK(tp->t_inpcb); 2497 m_freem(m); 2498 return; 2499 2500dropwithreset: 2501 KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__)); 2502 2503 /* 2504 * If tp is non-NULL, we call tcp_dropwithreset() holding both inpcb 2505 * and global locks. However, if NULL, we must hold neither as 2506 * firewalls may acquire the global lock in order to look for a 2507 * matching inpcb. 2508 */ 2509 if (tp != NULL) { 2510 tcp_dropwithreset(m, th, tp, tlen, rstreason); 2511 INP_WUNLOCK(tp->t_inpcb); 2512 } 2513 INP_INFO_WUNLOCK(&V_tcbinfo); 2514 if (tp == NULL) 2515 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 2516 return; 2517 2518drop: 2519 /* 2520 * Drop space held by incoming segment and return. 2521 */ 2522#ifdef TCPDEBUG 2523 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 2524 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2525 &tcp_savetcp, 0); 2526#endif 2527 if (tp != NULL) 2528 INP_WUNLOCK(tp->t_inpcb); 2529 if (headlocked) 2530 INP_INFO_WUNLOCK(&V_tcbinfo); 2531 m_freem(m); 2532 return; 2533} 2534 2535/* 2536 * Issue RST and make ACK acceptable to originator of segment. 2537 * The mbuf must still include the original packet header. 2538 * tp may be NULL. 
2539 */ 2540static void 2541tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, 2542 int tlen, int rstreason) 2543{ 2544 struct ip *ip; 2545#ifdef INET6 2546 struct ip6_hdr *ip6; 2547#endif 2548 2549 if (tp != NULL) { 2550 INP_WLOCK_ASSERT(tp->t_inpcb); 2551 } 2552 2553 /* Don't bother if destination was broadcast/multicast. */ 2554 if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 2555 goto drop; 2556#ifdef INET6 2557 if (mtod(m, struct ip *)->ip_v == 6) { 2558 ip6 = mtod(m, struct ip6_hdr *); 2559 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 2560 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 2561 goto drop; 2562 /* IPv6 anycast check is done at tcp6_input() */ 2563 } else 2564#endif 2565 { 2566 ip = mtod(m, struct ip *); 2567 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 2568 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 2569 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 2570 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 2571 goto drop; 2572 } 2573 2574 /* Perform bandwidth limiting. */ 2575 if (badport_bandlim(rstreason) < 0) 2576 goto drop; 2577 2578 /* tcp_respond consumes the mbuf chain. */ 2579 if (th->th_flags & TH_ACK) { 2580 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, 2581 th->th_ack, TH_RST); 2582 } else { 2583 if (th->th_flags & TH_SYN) 2584 tlen++; 2585 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 2586 (tcp_seq)0, TH_RST|TH_ACK); 2587 } 2588 return; 2589drop: 2590 m_freem(m); 2591 return; 2592} 2593 2594/* 2595 * Parse TCP options and place in tcpopt. 
2596 */ 2597static void 2598tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) 2599{ 2600 INIT_VNET_INET(curvnet); 2601 int opt, optlen; 2602 2603 to->to_flags = 0; 2604 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2605 opt = cp[0]; 2606 if (opt == TCPOPT_EOL) 2607 break; 2608 if (opt == TCPOPT_NOP) 2609 optlen = 1; 2610 else { 2611 if (cnt < 2) 2612 break; 2613 optlen = cp[1]; 2614 if (optlen < 2 || optlen > cnt) 2615 break; 2616 } 2617 switch (opt) { 2618 case TCPOPT_MAXSEG: 2619 if (optlen != TCPOLEN_MAXSEG) 2620 continue; 2621 if (!(flags & TO_SYN)) 2622 continue; 2623 to->to_flags |= TOF_MSS; 2624 bcopy((char *)cp + 2, 2625 (char *)&to->to_mss, sizeof(to->to_mss)); 2626 to->to_mss = ntohs(to->to_mss); 2627 break; 2628 case TCPOPT_WINDOW: 2629 if (optlen != TCPOLEN_WINDOW) 2630 continue; 2631 if (!(flags & TO_SYN)) 2632 continue; 2633 to->to_flags |= TOF_SCALE; 2634 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); 2635 break; 2636 case TCPOPT_TIMESTAMP: 2637 if (optlen != TCPOLEN_TIMESTAMP) 2638 continue; 2639 to->to_flags |= TOF_TS; 2640 bcopy((char *)cp + 2, 2641 (char *)&to->to_tsval, sizeof(to->to_tsval)); 2642 to->to_tsval = ntohl(to->to_tsval); 2643 bcopy((char *)cp + 6, 2644 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 2645 to->to_tsecr = ntohl(to->to_tsecr); 2646 break; 2647#ifdef TCP_SIGNATURE 2648 /* 2649 * XXX In order to reply to a host which has set the 2650 * TCP_SIGNATURE option in its initial SYN, we have to 2651 * record the fact that the option was observed here 2652 * for the syncache code to perform the correct response. 
2653 */ 2654 case TCPOPT_SIGNATURE: 2655 if (optlen != TCPOLEN_SIGNATURE) 2656 continue; 2657 to->to_flags |= TOF_SIGNATURE; 2658 to->to_signature = cp + 2; 2659 break; 2660#endif 2661 case TCPOPT_SACK_PERMITTED: 2662 if (optlen != TCPOLEN_SACK_PERMITTED) 2663 continue; 2664 if (!(flags & TO_SYN)) 2665 continue; 2666 if (!V_tcp_do_sack) 2667 continue; 2668 to->to_flags |= TOF_SACKPERM; 2669 break; 2670 case TCPOPT_SACK: 2671 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2672 continue; 2673 if (flags & TO_SYN) 2674 continue; 2675 to->to_flags |= TOF_SACK; 2676 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; 2677 to->to_sacks = cp + 2; 2678 V_tcpstat.tcps_sack_rcv_blocks++; 2679 break; 2680 default: 2681 continue; 2682 } 2683 } 2684} 2685 2686/* 2687 * Pull out of band byte out of a segment so 2688 * it doesn't appear in the user's data queue. 2689 * It is still reflected in the segment length for 2690 * sequencing purposes. 2691 */ 2692static void 2693tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, 2694 int off) 2695{ 2696 int cnt = off + th->th_urp - 1; 2697 2698 while (cnt >= 0) { 2699 if (m->m_len > cnt) { 2700 char *cp = mtod(m, caddr_t) + cnt; 2701 struct tcpcb *tp = sototcpcb(so); 2702 2703 INP_WLOCK_ASSERT(tp->t_inpcb); 2704 2705 tp->t_iobc = *cp; 2706 tp->t_oobflags |= TCPOOB_HAVEDATA; 2707 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2708 m->m_len--; 2709 if (m->m_flags & M_PKTHDR) 2710 m->m_pkthdr.len--; 2711 return; 2712 } 2713 cnt -= m->m_len; 2714 m = m->m_next; 2715 if (m == NULL) 2716 break; 2717 } 2718 panic("tcp_pulloutofband"); 2719} 2720 2721/* 2722 * Collect new round-trip time estimate 2723 * and update averages and current timeout. 
2724 */ 2725static void 2726tcp_xmit_timer(struct tcpcb *tp, int rtt) 2727{ 2728 INIT_VNET_INET(tp->t_inpcb->inp_vnet); 2729 int delta; 2730 2731 INP_WLOCK_ASSERT(tp->t_inpcb); 2732 2733 V_tcpstat.tcps_rttupdated++; 2734 tp->t_rttupdated++; 2735 if (tp->t_srtt != 0) { 2736 /* 2737 * srtt is stored as fixed point with 5 bits after the 2738 * binary point (i.e., scaled by 8). The following magic 2739 * is equivalent to the smoothing algorithm in rfc793 with 2740 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2741 * point). Adjust rtt to origin 0. 2742 */ 2743 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 2744 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 2745 2746 if ((tp->t_srtt += delta) <= 0) 2747 tp->t_srtt = 1; 2748 2749 /* 2750 * We accumulate a smoothed rtt variance (actually, a 2751 * smoothed mean difference), then set the retransmit 2752 * timer to smoothed rtt + 4 times the smoothed variance. 2753 * rttvar is stored as fixed point with 4 bits after the 2754 * binary point (scaled by 16). The following is 2755 * equivalent to rfc793 smoothing with an alpha of .75 2756 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2757 * rfc793's wired-in beta. 2758 */ 2759 if (delta < 0) 2760 delta = -delta; 2761 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 2762 if ((tp->t_rttvar += delta) <= 0) 2763 tp->t_rttvar = 1; 2764 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 2765 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 2766 } else { 2767 /* 2768 * No rtt measurement yet - use the unsmoothed rtt. 2769 * Set the variance to half the rtt (so our first 2770 * retransmit happens at 3*rtt). 2771 */ 2772 tp->t_srtt = rtt << TCP_RTT_SHIFT; 2773 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 2774 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 2775 } 2776 tp->t_rtttime = 0; 2777 tp->t_rxtshift = 0; 2778 2779 /* 2780 * the retransmit should happen at rtt + 4 * rttvar. 
2781 * Because of the way we do the smoothing, srtt and rttvar 2782 * will each average +1/2 tick of bias. When we compute 2783 * the retransmit timer, we want 1/2 tick of rounding and 2784 * 1 extra tick because of +-1/2 tick uncertainty in the 2785 * firing of the timer. The bias will give us exactly the 2786 * 1.5 tick we need. But, because the bias is 2787 * statistical, we have to test that we don't drop below 2788 * the minimum feasible timer (which is 2 ticks). 2789 */ 2790 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 2791 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 2792 2793 /* 2794 * We received an ack for a packet that wasn't retransmitted; 2795 * it is probably safe to discard any error indications we've 2796 * received recently. This isn't quite right, but close enough 2797 * for now (a route might have failed after we sent a segment, 2798 * and the return path might not be symmetrical). 2799 */ 2800 tp->t_softerror = 0; 2801} 2802 2803/* 2804 * Determine a reasonable value for maxseg size. 2805 * If the route is known, check route for mtu. 2806 * If none, use an mss that can be handled on the outgoing 2807 * interface without forcing IP to fragment; if bigger than 2808 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2809 * to utilize large mbufs. If no route is found, route has no mtu, 2810 * or the destination isn't local, use a default, hopefully conservative 2811 * size (usually 512 or the default IP max size, but no more than the mtu 2812 * of the interface), as we can't discover anything about intervening 2813 * gateways or networks. We also initialize the congestion/slow start 2814 * window to be a single segment if the destination isn't local. 2815 * While looking at the routing entry, we also initialize other path-dependent 2816 * parameters from pre-set or cached values in the routing entry. 2817 * 2818 * Also take into account the space needed for options that we 2819 * send regularly. 
Make maxseg shorter by that amount to assure 2820 * that we can send maxseg amount of data even when the options 2821 * are present. Store the upper limit of the length of options plus 2822 * data in maxopd. 2823 * 2824 * In case of T/TCP, we call this routine during implicit connection 2825 * setup as well (offer = -1), to initialize maxseg from the cached 2826 * MSS of our peer. 2827 * 2828 * NOTE that this routine is only called when we process an incoming 2829 * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). 2830 */ 2831void 2832tcp_mss_update(struct tcpcb *tp, int offer, struct hc_metrics_lite *metricptr) 2833{ 2834 INIT_VNET_INET(tp->t_inpcb->inp_vnet); 2835 int mss; 2836 u_long maxmtu; 2837 struct inpcb *inp = tp->t_inpcb; 2838 struct hc_metrics_lite metrics; 2839 int origoffer = offer; 2840 int mtuflags = 0; 2841#ifdef INET6 2842 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 2843 size_t min_protoh = isipv6 ? 2844 sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : 2845 sizeof (struct tcpiphdr); 2846#else 2847 const size_t min_protoh = sizeof(struct tcpiphdr); 2848#endif 2849 2850 INP_WLOCK_ASSERT(tp->t_inpcb); 2851 2852 /* Initialize. */ 2853#ifdef INET6 2854 if (isipv6) { 2855 maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags); 2856 tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; 2857 } else 2858#endif 2859 { 2860 maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags); 2861 tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; 2862 } 2863 2864 /* 2865 * No route to sender, stay with default mss and return. 2866 */ 2867 if (maxmtu == 0) 2868 return; 2869 2870 /* Check the interface for TSO capabilities. */ 2871 if (mtuflags & CSUM_TSO) 2872 tp->t_flags |= TF_TSO; 2873 2874 /* What have we got? */ 2875 switch (offer) { 2876 case 0: 2877 /* 2878 * Offer == 0 means that there was no MSS on the SYN 2879 * segment, in this case we use tcp_mssdflt as 2880 * already assigned to t_maxopd above. 
2881 */ 2882 offer = tp->t_maxopd; 2883 break; 2884 2885 case -1: 2886 /* 2887 * Offer == -1 means that we didn't receive SYN yet. 2888 */ 2889 /* FALLTHROUGH */ 2890 2891 default: 2892 /* 2893 * Prevent DoS attack with too small MSS. Round up 2894 * to at least minmss. 2895 */ 2896 offer = max(offer, V_tcp_minmss); 2897 } 2898 2899 /* 2900 * rmx information is now retrieved from tcp_hostcache. 2901 */ 2902 tcp_hc_get(&inp->inp_inc, &metrics); 2903 if (metricptr != NULL) 2904 bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); 2905 2906 /* 2907 * If there's a discovered mtu int tcp hostcache, use it 2908 * else, use the link mtu. 2909 */ 2910 if (metrics.rmx_mtu) 2911 mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; 2912 else { 2913#ifdef INET6 2914 if (isipv6) { 2915 mss = maxmtu - min_protoh; 2916 if (!V_path_mtu_discovery && 2917 !in6_localaddr(&inp->in6p_faddr)) 2918 mss = min(mss, V_tcp_v6mssdflt); 2919 } else 2920#endif 2921 { 2922 mss = maxmtu - min_protoh; 2923 if (!V_path_mtu_discovery && 2924 !in_localaddr(inp->inp_faddr)) 2925 mss = min(mss, V_tcp_mssdflt); 2926 } 2927 /* 2928 * XXX - The above conditional (mss = maxmtu - min_protoh) 2929 * probably violates the TCP spec. 2930 * The problem is that, since we don't know the 2931 * other end's MSS, we are supposed to use a conservative 2932 * default. But, if we do that, then MTU discovery will 2933 * never actually take place, because the conservative 2934 * default is much less than the MTUs typically seen 2935 * on the Internet today. For the moment, we'll sweep 2936 * this under the carpet. 2937 * 2938 * The conservative default might not actually be a problem 2939 * if the only case this occurs is when sending an initial 2940 * SYN with options and data to a host we've never talked 2941 * to before. Then, they will reply with an MSS value which 2942 * will get recorded and the new parameters should get 2943 * recomputed. For Further Study. 
2944 */ 2945 } 2946 mss = min(mss, offer); 2947 2948 /* 2949 * Sanity check: make sure that maxopd will be large 2950 * enough to allow some data on segments even if the 2951 * all the option space is used (40bytes). Otherwise 2952 * funny things may happen in tcp_output. 2953 */ 2954 mss = max(mss, 64); 2955 2956 /* 2957 * maxopd stores the maximum length of data AND options 2958 * in a segment; maxseg is the amount of data in a normal 2959 * segment. We need to store this value (maxopd) apart 2960 * from maxseg, because now every segment carries options 2961 * and thus we normally have somewhat less data in segments. 2962 */ 2963 tp->t_maxopd = mss; 2964 2965 /* 2966 * origoffer==-1 indicates that no segments were received yet. 2967 * In this case we just guess. 2968 */ 2969 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2970 (origoffer == -1 || 2971 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) 2972 mss -= TCPOLEN_TSTAMP_APPA; 2973 2974#if (MCLBYTES & (MCLBYTES - 1)) == 0 2975 if (mss > MCLBYTES) 2976 mss &= ~(MCLBYTES-1); 2977#else 2978 if (mss > MCLBYTES) 2979 mss = mss / MCLBYTES * MCLBYTES; 2980#endif 2981 tp->t_maxseg = mss; 2982} 2983 2984void 2985tcp_mss(struct tcpcb *tp, int offer) 2986{ 2987 int rtt, mss; 2988 u_long bufsize; 2989 struct inpcb *inp; 2990 struct socket *so; 2991 struct hc_metrics_lite metrics; 2992#ifdef INET6 2993 int isipv6; 2994#endif 2995 KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); 2996 2997 tcp_mss_update(tp, offer, &metrics); 2998 2999 mss = tp->t_maxseg; 3000 inp = tp->t_inpcb; 3001#ifdef INET6 3002 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 3003#endif 3004 3005 /* 3006 * If there's a pipesize, change the socket buffer to that size, 3007 * don't change if sb_hiwat is different than default (then it 3008 * has been changed on purpose with setsockopt). 3009 * Make the socket buffers an integral number of mss units; 3010 * if the mss is larger than the socket buffer, decrease the mss. 
3011 */ 3012 so = inp->inp_socket; 3013 SOCKBUF_LOCK(&so->so_snd); 3014 if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) 3015 bufsize = metrics.rmx_sendpipe; 3016 else 3017 bufsize = so->so_snd.sb_hiwat; 3018 if (bufsize < mss) 3019 mss = bufsize; 3020 else { 3021 bufsize = roundup(bufsize, mss); 3022 if (bufsize > sb_max) 3023 bufsize = sb_max; 3024 if (bufsize > so->so_snd.sb_hiwat) 3025 (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); 3026 } 3027 SOCKBUF_UNLOCK(&so->so_snd); 3028 tp->t_maxseg = mss; 3029 3030 SOCKBUF_LOCK(&so->so_rcv); 3031 if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) 3032 bufsize = metrics.rmx_recvpipe; 3033 else 3034 bufsize = so->so_rcv.sb_hiwat; 3035 if (bufsize > mss) { 3036 bufsize = roundup(bufsize, mss); 3037 if (bufsize > sb_max) 3038 bufsize = sb_max; 3039 if (bufsize > so->so_rcv.sb_hiwat) 3040 (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); 3041 } 3042 SOCKBUF_UNLOCK(&so->so_rcv); 3043 /* 3044 * While we're here, check the others too. 3045 */ 3046 if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { 3047 tp->t_srtt = rtt; 3048 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; 3049 V_tcpstat.tcps_usedrtt++; 3050 if (metrics.rmx_rttvar) { 3051 tp->t_rttvar = metrics.rmx_rttvar; 3052 V_tcpstat.tcps_usedrttvar++; 3053 } else { 3054 /* default variation is +- 1 rtt */ 3055 tp->t_rttvar = 3056 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; 3057 } 3058 TCPT_RANGESET(tp->t_rxtcur, 3059 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, 3060 tp->t_rttmin, TCPTV_REXMTMAX); 3061 } 3062 if (metrics.rmx_ssthresh) { 3063 /* 3064 * There's some sort of gateway or interface 3065 * buffer limit on the path. Use this to set 3066 * the slow start threshhold, but set the 3067 * threshold to no less than 2*mss. 
3068 */ 3069 tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); 3070 V_tcpstat.tcps_usedssthresh++; 3071 } 3072 if (metrics.rmx_bandwidth) 3073 tp->snd_bandwidth = metrics.rmx_bandwidth; 3074 3075 /* 3076 * Set the slow-start flight size depending on whether this 3077 * is a local network or not. 3078 * 3079 * Extend this so we cache the cwnd too and retrieve it here. 3080 * Make cwnd even bigger than RFC3390 suggests but only if we 3081 * have previous experience with the remote host. Be careful 3082 * not make cwnd bigger than remote receive window or our own 3083 * send socket buffer. Maybe put some additional upper bound 3084 * on the retrieved cwnd. Should do incremental updates to 3085 * hostcache when cwnd collapses so next connection doesn't 3086 * overloads the path again. 3087 * 3088 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. 3089 * We currently check only in syncache_socket for that. 3090 */ 3091#define TCP_METRICS_CWND 3092#ifdef TCP_METRICS_CWND 3093 if (metrics.rmx_cwnd) 3094 tp->snd_cwnd = max(mss, 3095 min(metrics.rmx_cwnd / 2, 3096 min(tp->snd_wnd, so->so_snd.sb_hiwat))); 3097 else 3098#endif 3099 if (V_tcp_do_rfc3390) 3100 tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); 3101#ifdef INET6 3102 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || 3103 (!isipv6 && in_localaddr(inp->inp_faddr))) 3104#else 3105 else if (in_localaddr(inp->inp_faddr)) 3106#endif 3107 tp->snd_cwnd = mss * V_ss_fltsz_local; 3108 else 3109 tp->snd_cwnd = mss * V_ss_fltsz; 3110} 3111 3112/* 3113 * Determine the MSS option to send on an outgoing SYN. 3114 */ 3115int 3116tcp_mssopt(struct in_conninfo *inc) 3117{ 3118 INIT_VNET_INET(curvnet); 3119 int mss = 0; 3120 u_long maxmtu = 0; 3121 u_long thcmtu = 0; 3122 size_t min_protoh; 3123#ifdef INET6 3124 int isipv6 = inc->inc_isipv6 ? 
1 : 0; 3125#endif 3126 3127 KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); 3128 3129#ifdef INET6 3130 if (isipv6) { 3131 mss = V_tcp_v6mssdflt; 3132 maxmtu = tcp_maxmtu6(inc, NULL); 3133 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3134 min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 3135 } else 3136#endif 3137 { 3138 mss = V_tcp_mssdflt; 3139 maxmtu = tcp_maxmtu(inc, NULL); 3140 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3141 min_protoh = sizeof(struct tcpiphdr); 3142 } 3143 if (maxmtu && thcmtu) 3144 mss = min(maxmtu, thcmtu) - min_protoh; 3145 else if (maxmtu || thcmtu) 3146 mss = max(maxmtu, thcmtu) - min_protoh; 3147 3148 return (mss); 3149} 3150 3151 3152/* 3153 * On a partial ack arrives, force the retransmission of the 3154 * next unacknowledged segment. Do not clear tp->t_dupacks. 3155 * By setting snd_nxt to ti_ack, this forces retransmission timer to 3156 * be started again. 3157 */ 3158static void 3159tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) 3160{ 3161 tcp_seq onxt = tp->snd_nxt; 3162 u_long ocwnd = tp->snd_cwnd; 3163 3164 INP_WLOCK_ASSERT(tp->t_inpcb); 3165 3166 tcp_timer_activate(tp, TT_REXMT, 0); 3167 tp->t_rtttime = 0; 3168 tp->snd_nxt = th->th_ack; 3169 /* 3170 * Set snd_cwnd to one segment beyond acknowledged offset. 3171 * (tp->snd_una has not yet been updated when this function is called.) 3172 */ 3173 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3174 tp->t_flags |= TF_ACKNOW; 3175 (void) tcp_output(tp); 3176 tp->snd_cwnd = ocwnd; 3177 if (SEQ_GT(onxt, tp->snd_nxt)) 3178 tp->snd_nxt = onxt; 3179 /* 3180 * Partial window deflation. Relies on fact that tp->snd_una 3181 * not updated yet. 3182 */ 3183 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3184 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3185 else 3186 tp->snd_cwnd = 0; 3187 tp->snd_cwnd += tp->t_maxseg; 3188} 3189