tcp_input.c revision 184720
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
28 * 29 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: head/sys/netinet/tcp_input.c 184720 2008-11-06 12:33:33Z bz $"); 34 35#include "opt_ipfw.h" /* for ipfw_fwd */ 36#include "opt_inet.h" 37#include "opt_inet6.h" 38#include "opt_ipsec.h" 39#include "opt_mac.h" 40#include "opt_tcpdebug.h" 41 42#include <sys/param.h> 43#include <sys/kernel.h> 44#include <sys/malloc.h> 45#include <sys/mbuf.h> 46#include <sys/proc.h> /* for proc0 declaration */ 47#include <sys/protosw.h> 48#include <sys/signalvar.h> 49#include <sys/socket.h> 50#include <sys/socketvar.h> 51#include <sys/sysctl.h> 52#include <sys/syslog.h> 53#include <sys/systm.h> 54#include <sys/vimage.h> 55 56#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 57 58#include <vm/uma.h> 59 60#include <net/if.h> 61#include <net/route.h> 62 63#define TCPSTATES /* for logging */ 64 65#include <netinet/in.h> 66#include <netinet/in_pcb.h> 67#include <netinet/in_systm.h> 68#include <netinet/in_var.h> 69#include <netinet/ip.h> 70#include <netinet/ip_icmp.h> /* required for icmp_var.h */ 71#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 72#include <netinet/ip_var.h> 73#include <netinet/ip_options.h> 74#include <netinet/ip6.h> 75#include <netinet/icmp6.h> 76#include <netinet6/in6_pcb.h> 77#include <netinet6/ip6_var.h> 78#include <netinet6/nd6.h> 79#include <netinet/tcp.h> 80#include <netinet/tcp_fsm.h> 81#include <netinet/tcp_seq.h> 82#include <netinet/tcp_timer.h> 83#include <netinet/tcp_var.h> 84#include <netinet6/tcp6_var.h> 85#include <netinet/tcpip.h> 86#include <netinet/tcp_syncache.h> 87#ifdef TCPDEBUG 88#include <netinet/tcp_debug.h> 89#endif /* TCPDEBUG */ 90 91#ifdef IPSEC 92#include <netipsec/ipsec.h> 93#include <netipsec/ipsec6.h> 94#endif /*IPSEC*/ 95 96#include <machine/in_cksum.h> 97 98#include <security/mac/mac_framework.h> 99 100static const int tcprexmtthresh = 3; 101 102struct tcpstat tcpstat; 103SYSCTL_V_STRUCT(V_NET, vnet_inet, 
_net_inet_tcp, TCPCTL_STATS, stats, 104 CTLFLAG_RW, tcpstat , tcpstat, 105 "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); 106 107int tcp_log_in_vain = 0; 108SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, 109 &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); 110 111static int blackhole = 0; 112SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, 113 blackhole, 0, "Do not send RST on segments to closed ports"); 114 115int tcp_delack_enabled = 1; 116SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, delayed_ack, 117 CTLFLAG_RW, tcp_delack_enabled, 0, 118 "Delay ACK to try and piggyback it onto a data packet"); 119 120static int drop_synfin = 0; 121SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, drop_synfin, 122 CTLFLAG_RW, drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); 123 124static int tcp_do_rfc3042 = 1; 125SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, 126 tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); 127 128static int tcp_do_rfc3390 = 1; 129SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, 130 tcp_do_rfc3390, 0, 131 "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); 132 133int tcp_do_ecn = 0; 134int tcp_ecn_maxretries = 1; 135SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); 136SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, enable, 137 CTLFLAG_RW, tcp_do_ecn, 0, "TCP ECN support"); 138SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, maxretries, 139 CTLFLAG_RW, tcp_ecn_maxretries, 0, "Max retries before giving up on ECN"); 140 141static int tcp_insecure_rst = 0; 142SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, insecure_rst, 143 CTLFLAG_RW, tcp_insecure_rst, 0, 144 "Follow the old (insecure) criteria for accepting RST packets"); 145 146int tcp_do_autorcvbuf = 1; 147SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_auto, 148 
CTLFLAG_RW, tcp_do_autorcvbuf, 0, 149 "Enable automatic receive buffer sizing"); 150 151int tcp_autorcvbuf_inc = 16*1024; 152SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_inc, 153 CTLFLAG_RW, tcp_autorcvbuf_inc, 0, 154 "Incrementor step size of automatic receive buffer"); 155 156int tcp_autorcvbuf_max = 256*1024; 157SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_max, 158 CTLFLAG_RW, tcp_autorcvbuf_max, 0, 159 "Max size of automatic receive buffer"); 160 161struct inpcbhead tcb; 162#define tcb6 tcb /* for KAME src sync over BSD*'s */ 163struct inpcbinfo tcbinfo; 164 165static void tcp_dooptions(struct tcpopt *, u_char *, int, int); 166static void tcp_do_segment(struct mbuf *, struct tcphdr *, 167 struct socket *, struct tcpcb *, int, int, uint8_t); 168static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, 169 struct tcpcb *, int, int); 170static void tcp_pulloutofband(struct socket *, 171 struct tcphdr *, struct mbuf *, int); 172static void tcp_xmit_timer(struct tcpcb *, int); 173static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); 174static void inline 175 tcp_congestion_exp(struct tcpcb *); 176 177static void inline 178tcp_congestion_exp(struct tcpcb *tp) 179{ 180 u_int win; 181 182 win = min(tp->snd_wnd, tp->snd_cwnd) / 183 2 / tp->t_maxseg; 184 if (win < 2) 185 win = 2; 186 tp->snd_ssthresh = win * tp->t_maxseg; 187 ENTER_FASTRECOVERY(tp); 188 tp->snd_recover = tp->snd_max; 189 if (tp->t_flags & TF_ECN_PERMIT) 190 tp->t_flags |= TF_ECN_SND_CWR; 191} 192 193/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ 194#ifdef INET6 195#define ND6_HINT(tp) \ 196do { \ 197 if ((tp) && (tp)->t_inpcb && \ 198 ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ 199 nd6_nud_hint(NULL, NULL, 0); \ 200} while (0) 201#else 202#define ND6_HINT(tp) 203#endif 204 205/* 206 * Indicate whether this ack should be delayed. 
We can delay the ack if 207 * - there is no delayed ack timer in progress and 208 * - our last ack wasn't a 0-sized window. We never want to delay 209 * the ack that opens up a 0-sized window and 210 * - delayed acks are enabled or 211 * - this is a half-synchronized T/TCP connection. 212 */ 213#define DELAY_ACK(tp) \ 214 ((!tcp_timer_active(tp, TT_DELACK) && \ 215 (tp->t_flags & TF_RXWIN0SENT) == 0) && \ 216 (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) 217 218/* 219 * TCP input handling is split into multiple parts: 220 * tcp6_input is a thin wrapper around tcp_input for the extended 221 * ip6_protox[] call format in ip6_input 222 * tcp_input handles primary segment validation, inpcb lookup and 223 * SYN processing on listen sockets 224 * tcp_do_segment processes the ACK and text of the segment for 225 * establishing, established and closing connections 226 */ 227#ifdef INET6 228int 229tcp6_input(struct mbuf **mp, int *offp, int proto) 230{ 231 INIT_VNET_INET6(curvnet); 232 struct mbuf *m = *mp; 233 struct in6_ifaddr *ia6; 234 235 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); 236 237 /* 238 * draft-itojun-ipv6-tcp-to-anycast 239 * better place to put this in? 
240 */ 241 ia6 = ip6_getdstifaddr(m); 242 if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { 243 struct ip6_hdr *ip6; 244 245 ip6 = mtod(m, struct ip6_hdr *); 246 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 247 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 248 return IPPROTO_DONE; 249 } 250 251 tcp_input(m, *offp); 252 return IPPROTO_DONE; 253} 254#endif 255 256void 257tcp_input(struct mbuf *m, int off0) 258{ 259 INIT_VNET_INET(curvnet); 260#ifdef INET6 261 INIT_VNET_INET6(curvnet); 262#endif 263#ifdef IPSEC 264 INIT_VNET_IPSEC(curvnet); 265#endif 266 struct tcphdr *th; 267 struct ip *ip = NULL; 268 struct ipovly *ipov; 269 struct inpcb *inp = NULL; 270 struct tcpcb *tp = NULL; 271 struct socket *so = NULL; 272 u_char *optp = NULL; 273 int optlen = 0; 274 int len, tlen, off; 275 int drop_hdrlen; 276 int thflags; 277 int rstreason = 0; /* For badport_bandlim accounting purposes */ 278 uint8_t iptos; 279#ifdef IPFIREWALL_FORWARD 280 struct m_tag *fwd_tag; 281#endif 282#ifdef INET6 283 struct ip6_hdr *ip6 = NULL; 284 int isipv6; 285#else 286 const void *ip6 = NULL; 287 const int isipv6 = 0; 288#endif 289 struct tcpopt to; /* options in this segment */ 290 char *s = NULL; /* address and port logging */ 291 292#ifdef TCPDEBUG 293 /* 294 * The size of tcp_saveipgen must be the size of the max ip header, 295 * now IPv6. 296 */ 297 u_char tcp_saveipgen[IP6_HDR_LEN]; 298 struct tcphdr tcp_savetcp; 299 short ostate = 0; 300#endif 301 302#ifdef INET6 303 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; 304#endif 305 306 to.to_flags = 0; 307 V_tcpstat.tcps_rcvtotal++; 308 309 if (isipv6) { 310#ifdef INET6 311 /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). 
*/ 312 ip6 = mtod(m, struct ip6_hdr *); 313 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; 314 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { 315 V_tcpstat.tcps_rcvbadsum++; 316 goto drop; 317 } 318 th = (struct tcphdr *)((caddr_t)ip6 + off0); 319 320 /* 321 * Be proactive about unspecified IPv6 address in source. 322 * As we use all-zero to indicate unbounded/unconnected pcb, 323 * unspecified IPv6 address can be used to confuse us. 324 * 325 * Note that packets with unspecified IPv6 destination is 326 * already dropped in ip6_input. 327 */ 328 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 329 /* XXX stat */ 330 goto drop; 331 } 332#else 333 th = NULL; /* XXX: Avoid compiler warning. */ 334#endif 335 } else { 336 /* 337 * Get IP and TCP header together in first mbuf. 338 * Note: IP leaves IP header in first mbuf. 339 */ 340 if (off0 > sizeof (struct ip)) { 341 ip_stripoptions(m, (struct mbuf *)0); 342 off0 = sizeof(struct ip); 343 } 344 if (m->m_len < sizeof (struct tcpiphdr)) { 345 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) 346 == NULL) { 347 V_tcpstat.tcps_rcvshort++; 348 return; 349 } 350 } 351 ip = mtod(m, struct ip *); 352 ipov = (struct ipovly *)ip; 353 th = (struct tcphdr *)((caddr_t)ip + off0); 354 tlen = ip->ip_len; 355 356 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 357 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 358 th->th_sum = m->m_pkthdr.csum_data; 359 else 360 th->th_sum = in_pseudo(ip->ip_src.s_addr, 361 ip->ip_dst.s_addr, 362 htonl(m->m_pkthdr.csum_data + 363 ip->ip_len + 364 IPPROTO_TCP)); 365 th->th_sum ^= 0xffff; 366#ifdef TCPDEBUG 367 ipov->ih_len = (u_short)tlen; 368 ipov->ih_len = htons(ipov->ih_len); 369#endif 370 } else { 371 /* 372 * Checksum extended TCP header and data. 
373 */ 374 len = sizeof (struct ip) + tlen; 375 bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 376 ipov->ih_len = (u_short)tlen; 377 ipov->ih_len = htons(ipov->ih_len); 378 th->th_sum = in_cksum(m, len); 379 } 380 if (th->th_sum) { 381 V_tcpstat.tcps_rcvbadsum++; 382 goto drop; 383 } 384 /* Re-initialization for later version check */ 385 ip->ip_v = IPVERSION; 386 } 387 388#ifdef INET6 389 if (isipv6) 390 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 391 else 392#endif 393 iptos = ip->ip_tos; 394 395 /* 396 * Check that TCP offset makes sense, 397 * pull out TCP options and adjust length. XXX 398 */ 399 off = th->th_off << 2; 400 if (off < sizeof (struct tcphdr) || off > tlen) { 401 V_tcpstat.tcps_rcvbadoff++; 402 goto drop; 403 } 404 tlen -= off; /* tlen is used instead of ti->ti_len */ 405 if (off > sizeof (struct tcphdr)) { 406 if (isipv6) { 407#ifdef INET6 408 IP6_EXTHDR_CHECK(m, off0, off, ); 409 ip6 = mtod(m, struct ip6_hdr *); 410 th = (struct tcphdr *)((caddr_t)ip6 + off0); 411#endif 412 } else { 413 if (m->m_len < sizeof(struct ip) + off) { 414 if ((m = m_pullup(m, sizeof (struct ip) + off)) 415 == NULL) { 416 V_tcpstat.tcps_rcvshort++; 417 return; 418 } 419 ip = mtod(m, struct ip *); 420 ipov = (struct ipovly *)ip; 421 th = (struct tcphdr *)((caddr_t)ip + off0); 422 } 423 } 424 optlen = off - sizeof (struct tcphdr); 425 optp = (u_char *)(th + 1); 426 } 427 thflags = th->th_flags; 428 429 /* 430 * Convert TCP protocol specific fields to host format. 431 */ 432 th->th_seq = ntohl(th->th_seq); 433 th->th_ack = ntohl(th->th_ack); 434 th->th_win = ntohs(th->th_win); 435 th->th_urp = ntohs(th->th_urp); 436 437 /* 438 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. 439 */ 440 drop_hdrlen = off0 + off; 441 442 /* 443 * Locate pcb for segment. 444 */ 445 INP_INFO_WLOCK(&V_tcbinfo); 446findpcb: 447 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 448#ifdef IPFIREWALL_FORWARD 449 /* 450 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
451 */ 452 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 453 454 if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ 455 struct sockaddr_in *next_hop; 456 457 next_hop = (struct sockaddr_in *)(fwd_tag+1); 458 /* 459 * Transparently forwarded. Pretend to be the destination. 460 * already got one like this? 461 */ 462 inp = in_pcblookup_hash(&V_tcbinfo, 463 ip->ip_src, th->th_sport, 464 ip->ip_dst, th->th_dport, 465 0, m->m_pkthdr.rcvif); 466 if (!inp) { 467 /* It's new. Try to find the ambushing socket. */ 468 inp = in_pcblookup_hash(&V_tcbinfo, 469 ip->ip_src, th->th_sport, 470 next_hop->sin_addr, 471 next_hop->sin_port ? 472 ntohs(next_hop->sin_port) : 473 th->th_dport, 474 INPLOOKUP_WILDCARD, 475 m->m_pkthdr.rcvif); 476 } 477 /* Remove the tag from the packet. We don't need it anymore. */ 478 m_tag_delete(m, fwd_tag); 479 } else 480#endif /* IPFIREWALL_FORWARD */ 481 { 482 if (isipv6) { 483#ifdef INET6 484 inp = in6_pcblookup_hash(&V_tcbinfo, 485 &ip6->ip6_src, th->th_sport, 486 &ip6->ip6_dst, th->th_dport, 487 INPLOOKUP_WILDCARD, 488 m->m_pkthdr.rcvif); 489#endif 490 } else 491 inp = in_pcblookup_hash(&V_tcbinfo, 492 ip->ip_src, th->th_sport, 493 ip->ip_dst, th->th_dport, 494 INPLOOKUP_WILDCARD, 495 m->m_pkthdr.rcvif); 496 } 497 498 /* 499 * If the INPCB does not exist then all data in the incoming 500 * segment is discarded and an appropriate RST is sent back. 501 * XXX MRT Send RST using which routing table? 502 */ 503 if (inp == NULL) { 504 /* 505 * Log communication attempts to ports that are not 506 * in use. 507 */ 508 if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || 509 tcp_log_in_vain == 2) { 510 if ((s = tcp_log_addrs(NULL, th, (void *)ip, ip6))) 511 log(LOG_INFO, "%s; %s: Connection attempt " 512 "to closed port\n", s, __func__); 513 } 514 /* 515 * When blackholing do not respond with a RST but 516 * completely ignore the segment and drop it. 
517 */ 518 if ((V_blackhole == 1 && (thflags & TH_SYN)) || 519 V_blackhole == 2) 520 goto dropunlock; 521 522 rstreason = BANDLIM_RST_CLOSEDPORT; 523 goto dropwithreset; 524 } 525 INP_WLOCK(inp); 526 527#ifdef IPSEC 528#ifdef INET6 529 if (isipv6 && ipsec6_in_reject(m, inp)) { 530 V_ipsec6stat.in_polvio++; 531 goto dropunlock; 532 } else 533#endif /* INET6 */ 534 if (ipsec4_in_reject(m, inp) != 0) { 535 V_ipsec4stat.in_polvio++; 536 goto dropunlock; 537 } 538#endif /* IPSEC */ 539 540 /* 541 * Check the minimum TTL for socket. 542 */ 543 if (inp->inp_ip_minttl != 0) { 544#ifdef INET6 545 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) 546 goto dropunlock; 547 else 548#endif 549 if (inp->inp_ip_minttl > ip->ip_ttl) 550 goto dropunlock; 551 } 552 553 /* 554 * A previous connection in TIMEWAIT state is supposed to catch 555 * stray or duplicate segments arriving late. If this segment 556 * was a legitimate new connection attempt the old INPCB gets 557 * removed and we can try again to find a listening socket. 558 */ 559 if (inp->inp_vflag & INP_TIMEWAIT) { 560 if (thflags & TH_SYN) 561 tcp_dooptions(&to, optp, optlen, TO_SYN); 562 /* 563 * NB: tcp_twcheck unlocks the INP and frees the mbuf. 564 */ 565 if (tcp_twcheck(inp, &to, th, m, tlen)) 566 goto findpcb; 567 INP_INFO_WUNLOCK(&V_tcbinfo); 568 return; 569 } 570 /* 571 * The TCPCB may no longer exist if the connection is winding 572 * down or it is in the CLOSED state. Either way we drop the 573 * segment and send an appropriate response. 
574 */ 575 tp = intotcpcb(inp); 576 if (tp == NULL || tp->t_state == TCPS_CLOSED) { 577 rstreason = BANDLIM_RST_CLOSEDPORT; 578 goto dropwithreset; 579 } 580 581#ifdef MAC 582 INP_WLOCK_ASSERT(inp); 583 if (mac_inpcb_check_deliver(inp, m)) 584 goto dropunlock; 585#endif 586 so = inp->inp_socket; 587 KASSERT(so != NULL, ("%s: so == NULL", __func__)); 588#ifdef TCPDEBUG 589 if (so->so_options & SO_DEBUG) { 590 ostate = tp->t_state; 591 if (isipv6) { 592#ifdef INET6 593 bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); 594#endif 595 } else 596 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); 597 tcp_savetcp = *th; 598 } 599#endif 600 /* 601 * When the socket is accepting connections (the INPCB is in LISTEN 602 * state) we look into the SYN cache if this is a new connection 603 * attempt or the completion of a previous one. 604 */ 605 if (so->so_options & SO_ACCEPTCONN) { 606 struct in_conninfo inc; 607 608 KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " 609 "tp not listening", __func__)); 610 611 bzero(&inc, sizeof(inc)); 612 inc.inc_isipv6 = isipv6; 613#ifdef INET6 614 if (isipv6) { 615 inc.inc6_faddr = ip6->ip6_src; 616 inc.inc6_laddr = ip6->ip6_dst; 617 } else 618#endif 619 { 620 inc.inc_faddr = ip->ip_src; 621 inc.inc_laddr = ip->ip_dst; 622 } 623 inc.inc_fport = th->th_sport; 624 inc.inc_lport = th->th_dport; 625 626 /* 627 * Check for an existing connection attempt in syncache if 628 * the flag is only ACK. A successful lookup creates a new 629 * socket appended to the listen queue in SYN_RECEIVED state. 630 */ 631 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { 632 /* 633 * Parse the TCP options here because 634 * syncookies need access to the reflected 635 * timestamp. 636 */ 637 tcp_dooptions(&to, optp, optlen, 0); 638 /* 639 * NB: syncache_expand() doesn't unlock 640 * inp and tcpinfo locks. 641 */ 642 if (!syncache_expand(&inc, &to, th, &so, m)) { 643 /* 644 * No syncache entry or ACK was not 645 * for our SYN/ACK. Send a RST. 
646 * NB: syncache did its own logging 647 * of the failure cause. 648 */ 649 rstreason = BANDLIM_RST_OPENPORT; 650 goto dropwithreset; 651 } 652 if (so == NULL) { 653 /* 654 * We completed the 3-way handshake 655 * but could not allocate a socket 656 * either due to memory shortage, 657 * listen queue length limits or 658 * global socket limits. Send RST 659 * or wait and have the remote end 660 * retransmit the ACK for another 661 * try. 662 */ 663 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 664 log(LOG_DEBUG, "%s; %s: Listen socket: " 665 "Socket allocation failed due to " 666 "limits or memory shortage, %s\n", 667 s, __func__, 668 V_tcp_sc_rst_sock_fail ? 669 "sending RST" : "try again"); 670 if (V_tcp_sc_rst_sock_fail) { 671 rstreason = BANDLIM_UNLIMITED; 672 goto dropwithreset; 673 } else 674 goto dropunlock; 675 } 676 /* 677 * Socket is created in state SYN_RECEIVED. 678 * Unlock the listen socket, lock the newly 679 * created socket and update the tp variable. 680 */ 681 INP_WUNLOCK(inp); /* listen socket */ 682 inp = sotoinpcb(so); 683 INP_WLOCK(inp); /* new connection */ 684 tp = intotcpcb(inp); 685 KASSERT(tp->t_state == TCPS_SYN_RECEIVED, 686 ("%s: ", __func__)); 687 /* 688 * Process the segment and the data it 689 * contains. tcp_do_segment() consumes 690 * the mbuf chain and unlocks the inpcb. 691 */ 692 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, 693 iptos); 694 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 695 return; 696 } 697 /* 698 * Segment flag validation for new connection attempts: 699 * 700 * Our (SYN|ACK) response was rejected. 701 * Check with syncache and remove entry to prevent 702 * retransmits. 703 * 704 * NB: syncache_chkrst does its own logging of failure 705 * causes. 706 */ 707 if (thflags & TH_RST) { 708 syncache_chkrst(&inc, th); 709 goto dropunlock; 710 } 711 /* 712 * We can't do anything without SYN. 
713 */ 714 if ((thflags & TH_SYN) == 0) { 715 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 716 log(LOG_DEBUG, "%s; %s: Listen socket: " 717 "SYN is missing, segment ignored\n", 718 s, __func__); 719 V_tcpstat.tcps_badsyn++; 720 goto dropunlock; 721 } 722 /* 723 * (SYN|ACK) is bogus on a listen socket. 724 */ 725 if (thflags & TH_ACK) { 726 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 727 log(LOG_DEBUG, "%s; %s: Listen socket: " 728 "SYN|ACK invalid, segment rejected\n", 729 s, __func__); 730 syncache_badack(&inc); /* XXX: Not needed! */ 731 V_tcpstat.tcps_badsyn++; 732 rstreason = BANDLIM_RST_OPENPORT; 733 goto dropwithreset; 734 } 735 /* 736 * If the drop_synfin option is enabled, drop all 737 * segments with both the SYN and FIN bits set. 738 * This prevents e.g. nmap from identifying the 739 * TCP/IP stack. 740 * XXX: Poor reasoning. nmap has other methods 741 * and is constantly refining its stack detection 742 * strategies. 743 * XXX: This is a violation of the TCP specification 744 * and was used by RFC1644. 745 */ 746 if ((thflags & TH_FIN) && V_drop_synfin) { 747 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 748 log(LOG_DEBUG, "%s; %s: Listen socket: " 749 "SYN|FIN segment ignored (based on " 750 "sysctl setting)\n", s, __func__); 751 V_tcpstat.tcps_badsyn++; 752 goto dropunlock; 753 } 754 /* 755 * Segment's flags are (SYN) or (SYN|FIN). 756 * 757 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored 758 * as they do not affect the state of the TCP FSM. 759 * The data pointed to by TH_URG and th_urp is ignored. 760 */ 761 KASSERT((thflags & (TH_RST|TH_ACK)) == 0, 762 ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); 763 KASSERT(thflags & (TH_SYN), 764 ("%s: Listen socket: TH_SYN not set", __func__)); 765#ifdef INET6 766 /* 767 * If deprecated address is forbidden, 768 * we do not accept SYN to deprecated interface 769 * address to prevent any new inbound connection from 770 * getting established. 
771 * When we do not accept SYN, we send a TCP RST, 772 * with deprecated source address (instead of dropping 773 * it). We compromise it as it is much better for peer 774 * to send a RST, and RST will be the final packet 775 * for the exchange. 776 * 777 * If we do not forbid deprecated addresses, we accept 778 * the SYN packet. RFC2462 does not suggest dropping 779 * SYN in this case. 780 * If we decipher RFC2462 5.5.4, it says like this: 781 * 1. use of deprecated addr with existing 782 * communication is okay - "SHOULD continue to be 783 * used" 784 * 2. use of it with new communication: 785 * (2a) "SHOULD NOT be used if alternate address 786 * with sufficient scope is available" 787 * (2b) nothing mentioned otherwise. 788 * Here we fall into (2b) case as we have no choice in 789 * our source address selection - we must obey the peer. 790 * 791 * The wording in RFC2462 is confusing, and there are 792 * multiple description text for deprecated address 793 * handling - worse, they are not exactly the same. 794 * I believe 5.5.4 is the best one, so we follow 5.5.4. 795 */ 796 if (isipv6 && !V_ip6_use_deprecated) { 797 struct in6_ifaddr *ia6; 798 799 if ((ia6 = ip6_getdstifaddr(m)) && 800 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 801 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 802 log(LOG_DEBUG, "%s; %s: Listen socket: " 803 "Connection attempt to deprecated " 804 "IPv6 address rejected\n", 805 s, __func__); 806 rstreason = BANDLIM_RST_OPENPORT; 807 goto dropwithreset; 808 } 809 } 810#endif 811 /* 812 * Basic sanity checks on incoming SYN requests: 813 * Don't respond if the destination is a link layer 814 * broadcast according to RFC1122 4.2.3.10, p. 104. 815 * If it is from this socket it must be forged. 816 * Don't respond if the source or destination is a 817 * global or subnet broad- or multicast address. 818 * Note that it is quite possible to receive unicast 819 * link-layer packets with a broadcast IP address. Use 820 * in_broadcast() to find them. 
821 */ 822 if (m->m_flags & (M_BCAST|M_MCAST)) { 823 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 824 log(LOG_DEBUG, "%s; %s: Listen socket: " 825 "Connection attempt from broad- or multicast " 826 "link layer address ignored\n", s, __func__); 827 goto dropunlock; 828 } 829 if (isipv6) { 830#ifdef INET6 831 if (th->th_dport == th->th_sport && 832 IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { 833 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 834 log(LOG_DEBUG, "%s; %s: Listen socket: " 835 "Connection attempt to/from self " 836 "ignored\n", s, __func__); 837 goto dropunlock; 838 } 839 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 840 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { 841 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 842 log(LOG_DEBUG, "%s; %s: Listen socket: " 843 "Connection attempt from/to multicast " 844 "address ignored\n", s, __func__); 845 goto dropunlock; 846 } 847#endif 848 } else { 849 if (th->th_dport == th->th_sport && 850 ip->ip_dst.s_addr == ip->ip_src.s_addr) { 851 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 852 log(LOG_DEBUG, "%s; %s: Listen socket: " 853 "Connection attempt from/to self " 854 "ignored\n", s, __func__); 855 goto dropunlock; 856 } 857 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 858 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 859 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 860 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { 861 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 862 log(LOG_DEBUG, "%s; %s: Listen socket: " 863 "Connection attempt from/to broad- " 864 "or multicast address ignored\n", 865 s, __func__); 866 goto dropunlock; 867 } 868 } 869 /* 870 * SYN appears to be valid. Create compressed TCP state 871 * for syncache. 
872 */ 873#ifdef TCPDEBUG 874 if (so->so_options & SO_DEBUG) 875 tcp_trace(TA_INPUT, ostate, tp, 876 (void *)tcp_saveipgen, &tcp_savetcp, 0); 877#endif 878 tcp_dooptions(&to, optp, optlen, TO_SYN); 879 syncache_add(&inc, &to, th, inp, &so, m); 880 /* 881 * Entry added to syncache and mbuf consumed. 882 * Everything already unlocked by syncache_add(). 883 */ 884 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 885 return; 886 } 887 888 /* 889 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later 890 * state. tcp_do_segment() always consumes the mbuf chain, unlocks 891 * the inpcb, and unlocks pcbinfo. 892 */ 893 tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); 894 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 895 return; 896 897dropwithreset: 898 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 899 INP_INFO_WUNLOCK(&V_tcbinfo); 900 901 if (inp != NULL) { 902 tcp_dropwithreset(m, th, tp, tlen, rstreason); 903 INP_WUNLOCK(inp); 904 } else 905 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 906 m = NULL; /* mbuf chain got consumed. */ 907 goto drop; 908 909dropunlock: 910 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 911 if (inp != NULL) 912 INP_WUNLOCK(inp); 913 INP_INFO_WUNLOCK(&V_tcbinfo); 914 915drop: 916 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 917 if (s != NULL) 918 free(s, M_TCPLOG); 919 if (m != NULL) 920 m_freem(m); 921} 922 923static void 924tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 925 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) 926{ 927 INIT_VNET_INET(tp->t_vnet); 928 int thflags, acked, ourfinisacked, needoutput = 0; 929 int headlocked = 1; 930 int rstreason, todrop, win; 931 u_long tiwin; 932 struct tcpopt to; 933 934#ifdef TCPDEBUG 935 /* 936 * The size of tcp_saveipgen must be the size of the max ip header, 937 * now IPv6. 
938 */ 939 u_char tcp_saveipgen[IP6_HDR_LEN]; 940 struct tcphdr tcp_savetcp; 941 short ostate = 0; 942#endif 943 thflags = th->th_flags; 944 945 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 946 INP_WLOCK_ASSERT(tp->t_inpcb); 947 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 948 __func__)); 949 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 950 __func__)); 951 952 /* 953 * Segment received on connection. 954 * Reset idle time and keep-alive timer. 955 * XXX: This should be done after segment 956 * validation to ignore broken/spoofed segs. 957 */ 958 tp->t_rcvtime = ticks; 959 if (TCPS_HAVEESTABLISHED(tp->t_state)) 960 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 961 962 /* 963 * Unscale the window into a 32-bit value. 964 * For the SYN_SENT state the scale is zero. 965 */ 966 tiwin = th->th_win << tp->snd_scale; 967 968 /* 969 * TCP ECN processing. 970 */ 971 if (tp->t_flags & TF_ECN_PERMIT) { 972 switch (iptos & IPTOS_ECN_MASK) { 973 case IPTOS_ECN_CE: 974 tp->t_flags |= TF_ECN_SND_ECE; 975 V_tcpstat.tcps_ecn_ce++; 976 break; 977 case IPTOS_ECN_ECT0: 978 V_tcpstat.tcps_ecn_ect0++; 979 break; 980 case IPTOS_ECN_ECT1: 981 V_tcpstat.tcps_ecn_ect1++; 982 break; 983 } 984 985 if (thflags & TH_CWR) 986 tp->t_flags &= ~TF_ECN_SND_ECE; 987 988 /* 989 * Congestion experienced. 990 * Ignore if we are already trying to recover. 991 */ 992 if ((thflags & TH_ECE) && 993 SEQ_LEQ(th->th_ack, tp->snd_recover)) { 994 V_tcpstat.tcps_ecn_rcwnd++; 995 tcp_congestion_exp(tp); 996 } 997 } 998 999 /* 1000 * Parse options on any incoming segment. 1001 */ 1002 tcp_dooptions(&to, (u_char *)(th + 1), 1003 (th->th_off << 2) - sizeof(struct tcphdr), 1004 (thflags & TH_SYN) ? TO_SYN : 0); 1005 1006 /* 1007 * If echoed timestamp is later than the current time, 1008 * fall back to non RFC1323 RTT calculation. Normalize 1009 * timestamp if syncookies were used when this connection 1010 * was established. 
1011 */ 1012 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 1013 to.to_tsecr -= tp->ts_offset; 1014 if (TSTMP_GT(to.to_tsecr, ticks)) 1015 to.to_tsecr = 0; 1016 } 1017 1018 /* 1019 * Process options only when we get SYN/ACK back. The SYN case 1020 * for incoming connections is handled in tcp_syncache. 1021 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 1022 * or <SYN,ACK>) segment itself is never scaled. 1023 * XXX this is traditional behavior, may need to be cleaned up. 1024 */ 1025 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 1026 if ((to.to_flags & TOF_SCALE) && 1027 (tp->t_flags & TF_REQ_SCALE)) { 1028 tp->t_flags |= TF_RCVD_SCALE; 1029 tp->snd_scale = to.to_wscale; 1030 } 1031 /* 1032 * Initial send window. It will be updated with 1033 * the next incoming segment to the scaled value. 1034 */ 1035 tp->snd_wnd = th->th_win; 1036 if (to.to_flags & TOF_TS) { 1037 tp->t_flags |= TF_RCVD_TSTMP; 1038 tp->ts_recent = to.to_tsval; 1039 tp->ts_recent_age = ticks; 1040 } 1041 if (to.to_flags & TOF_MSS) 1042 tcp_mss(tp, to.to_mss); 1043 if ((tp->t_flags & TF_SACK_PERMIT) && 1044 (to.to_flags & TOF_SACKPERM) == 0) 1045 tp->t_flags &= ~TF_SACK_PERMIT; 1046 } 1047 1048 /* 1049 * Header prediction: check for the two common cases 1050 * of a uni-directional data xfer. If the packet has 1051 * no control flags, is in-sequence, the window didn't 1052 * change and we're not retransmitting, it's a 1053 * candidate. If the length is zero and the ack moved 1054 * forward, we're the sender side of the xfer. Just 1055 * free the data acked & wake any higher level process 1056 * that was blocked waiting for space. If the length 1057 * is non-zero and the ack didn't move, we're the 1058 * receiver side. If we're getting packets in-order 1059 * (the reassembly queue is empty), add the data to 1060 * the socket buffer and note that we need a delayed ack. 1061 * Make sure that the hidden state-flags are also off. 
1062 * Since we check for TCPS_ESTABLISHED first, it can only 1063 * be TH_NEEDSYN. 1064 */ 1065 if (tp->t_state == TCPS_ESTABLISHED && 1066 th->th_seq == tp->rcv_nxt && 1067 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1068 tp->snd_nxt == tp->snd_max && 1069 tiwin && tiwin == tp->snd_wnd && 1070 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 1071 LIST_EMPTY(&tp->t_segq) && 1072 ((to.to_flags & TOF_TS) == 0 || 1073 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { 1074 1075 /* 1076 * If last ACK falls within this segment's sequence numbers, 1077 * record the timestamp. 1078 * NOTE that the test is modified according to the latest 1079 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1080 */ 1081 if ((to.to_flags & TOF_TS) != 0 && 1082 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1083 tp->ts_recent_age = ticks; 1084 tp->ts_recent = to.to_tsval; 1085 } 1086 1087 if (tlen == 0) { 1088 if (SEQ_GT(th->th_ack, tp->snd_una) && 1089 SEQ_LEQ(th->th_ack, tp->snd_max) && 1090 tp->snd_cwnd >= tp->snd_wnd && 1091 ((!V_tcp_do_newreno && 1092 !(tp->t_flags & TF_SACK_PERMIT) && 1093 tp->t_dupacks < tcprexmtthresh) || 1094 ((V_tcp_do_newreno || 1095 (tp->t_flags & TF_SACK_PERMIT)) && 1096 !IN_FASTRECOVERY(tp) && 1097 (to.to_flags & TOF_SACK) == 0 && 1098 TAILQ_EMPTY(&tp->snd_holes)))) { 1099 KASSERT(headlocked, 1100 ("%s: headlocked", __func__)); 1101 INP_INFO_WUNLOCK(&V_tcbinfo); 1102 headlocked = 0; 1103 /* 1104 * This is a pure ack for outstanding data. 1105 */ 1106 ++V_tcpstat.tcps_predack; 1107 /* 1108 * "bad retransmit" recovery. 
1109 */ 1110 if (tp->t_rxtshift == 1 && 1111 ticks < tp->t_badrxtwin) { 1112 ++V_tcpstat.tcps_sndrexmitbad; 1113 tp->snd_cwnd = tp->snd_cwnd_prev; 1114 tp->snd_ssthresh = 1115 tp->snd_ssthresh_prev; 1116 tp->snd_recover = tp->snd_recover_prev; 1117 if (tp->t_flags & TF_WASFRECOVERY) 1118 ENTER_FASTRECOVERY(tp); 1119 tp->snd_nxt = tp->snd_max; 1120 tp->t_badrxtwin = 0; 1121 } 1122 1123 /* 1124 * Recalculate the transmit timer / rtt. 1125 * 1126 * Some boxes send broken timestamp replies 1127 * during the SYN+ACK phase, ignore 1128 * timestamps of 0 or we could calculate a 1129 * huge RTT and blow up the retransmit timer. 1130 */ 1131 if ((to.to_flags & TOF_TS) != 0 && 1132 to.to_tsecr) { 1133 if (!tp->t_rttlow || 1134 tp->t_rttlow > ticks - to.to_tsecr) 1135 tp->t_rttlow = ticks - to.to_tsecr; 1136 tcp_xmit_timer(tp, 1137 ticks - to.to_tsecr + 1); 1138 } else if (tp->t_rtttime && 1139 SEQ_GT(th->th_ack, tp->t_rtseq)) { 1140 if (!tp->t_rttlow || 1141 tp->t_rttlow > ticks - tp->t_rtttime) 1142 tp->t_rttlow = ticks - tp->t_rtttime; 1143 tcp_xmit_timer(tp, 1144 ticks - tp->t_rtttime); 1145 } 1146 tcp_xmit_bandwidth_limit(tp, th->th_ack); 1147 acked = th->th_ack - tp->snd_una; 1148 V_tcpstat.tcps_rcvackpack++; 1149 V_tcpstat.tcps_rcvackbyte += acked; 1150 sbdrop(&so->so_snd, acked); 1151 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 1152 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1153 tp->snd_recover = th->th_ack - 1; 1154 tp->snd_una = th->th_ack; 1155 /* 1156 * Pull snd_wl2 up to prevent seq wrap relative 1157 * to th_ack. 1158 */ 1159 tp->snd_wl2 = th->th_ack; 1160 tp->t_dupacks = 0; 1161 m_freem(m); 1162 ND6_HINT(tp); /* Some progress has been made. */ 1163 1164 /* 1165 * If all outstanding data are acked, stop 1166 * retransmit timer, otherwise restart timer 1167 * using current (possibly backed-off) value. 1168 * If process is waiting for space, 1169 * wakeup/selwakeup/signal. If data 1170 * are ready to send, let tcp_output 1171 * decide between more output or persist. 
1172 */ 1173#ifdef TCPDEBUG 1174 if (so->so_options & SO_DEBUG) 1175 tcp_trace(TA_INPUT, ostate, tp, 1176 (void *)tcp_saveipgen, 1177 &tcp_savetcp, 0); 1178#endif 1179 if (tp->snd_una == tp->snd_max) 1180 tcp_timer_activate(tp, TT_REXMT, 0); 1181 else if (!tcp_timer_active(tp, TT_PERSIST)) 1182 tcp_timer_activate(tp, TT_REXMT, 1183 tp->t_rxtcur); 1184 sowwakeup(so); 1185 if (so->so_snd.sb_cc) 1186 (void) tcp_output(tp); 1187 goto check_delack; 1188 } 1189 } else if (th->th_ack == tp->snd_una && 1190 tlen <= sbspace(&so->so_rcv)) { 1191 int newsize = 0; /* automatic sockbuf scaling */ 1192 1193 KASSERT(headlocked, ("%s: headlocked", __func__)); 1194 INP_INFO_WUNLOCK(&V_tcbinfo); 1195 headlocked = 0; 1196 /* 1197 * This is a pure, in-sequence data packet 1198 * with nothing on the reassembly queue and 1199 * we have enough buffer space to take it. 1200 */ 1201 /* Clean receiver SACK report if present */ 1202 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) 1203 tcp_clean_sackreport(tp); 1204 ++V_tcpstat.tcps_preddat; 1205 tp->rcv_nxt += tlen; 1206 /* 1207 * Pull snd_wl1 up to prevent seq wrap relative to 1208 * th_seq. 1209 */ 1210 tp->snd_wl1 = th->th_seq; 1211 /* 1212 * Pull rcv_up up to prevent seq wrap relative to 1213 * rcv_nxt. 1214 */ 1215 tp->rcv_up = tp->rcv_nxt; 1216 V_tcpstat.tcps_rcvpack++; 1217 V_tcpstat.tcps_rcvbyte += tlen; 1218 ND6_HINT(tp); /* Some progress has been made */ 1219#ifdef TCPDEBUG 1220 if (so->so_options & SO_DEBUG) 1221 tcp_trace(TA_INPUT, ostate, tp, 1222 (void *)tcp_saveipgen, &tcp_savetcp, 0); 1223#endif 1224 /* 1225 * Automatic sizing of receive socket buffer. Often the send 1226 * buffer size is not optimally adjusted to the actual network 1227 * conditions at hand (delay bandwidth product). Setting the 1228 * buffer size too small limits throughput on links with high 1229 * bandwidth and high delay (eg. trans-continental/oceanic links). 
1230 * 1231 * On the receive side the socket buffer memory is only rarely 1232 * used to any significant extent. This allows us to be much 1233 * more aggressive in scaling the receive socket buffer. For 1234 * the case that the buffer space is actually used to a large 1235 * extent and we run out of kernel memory we can simply drop 1236 * the new segments; TCP on the sender will just retransmit it 1237 * later. Setting the buffer size too big may only consume too 1238 * much kernel memory if the application doesn't read() from 1239 * the socket or packet loss or reordering makes use of the 1240 * reassembly queue. 1241 * 1242 * The criteria to step up the receive buffer one notch are: 1243 * 1. the number of bytes received during the time it takes 1244 * one timestamp to be reflected back to us (the RTT); 1245 * 2. received bytes per RTT is within seven eighth of the 1246 * current socket buffer size; 1247 * 3. receive buffer size has not hit maximal automatic size; 1248 * 1249 * This algorithm does one step per RTT at most and only if 1250 * we receive a bulk stream w/o packet losses or reorderings. 1251 * Shrinking the buffer during idle times is not necessary as 1252 * it doesn't consume any memory when idle. 1253 * 1254 * TODO: Only step up if the application is actually serving 1255 * the buffer to better manage the socket buffer resources. 1256 */ 1257 if (V_tcp_do_autorcvbuf && 1258 to.to_tsecr && 1259 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 1260 if (to.to_tsecr > tp->rfbuf_ts && 1261 to.to_tsecr - tp->rfbuf_ts < hz) { 1262 if (tp->rfbuf_cnt > 1263 (so->so_rcv.sb_hiwat / 8 * 7) && 1264 so->so_rcv.sb_hiwat < 1265 V_tcp_autorcvbuf_max) { 1266 newsize = 1267 min(so->so_rcv.sb_hiwat + 1268 V_tcp_autorcvbuf_inc, 1269 V_tcp_autorcvbuf_max); 1270 } 1271 /* Start over with next RTT. */ 1272 tp->rfbuf_ts = 0; 1273 tp->rfbuf_cnt = 0; 1274 } else 1275 tp->rfbuf_cnt += tlen; /* add up */ 1276 } 1277 1278 /* Add data to socket buffer. 
*/ 1279 SOCKBUF_LOCK(&so->so_rcv); 1280 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1281 m_freem(m); 1282 } else { 1283 /* 1284 * Set new socket buffer size. 1285 * Give up when limit is reached. 1286 */ 1287 if (newsize) 1288 if (!sbreserve_locked(&so->so_rcv, 1289 newsize, so, NULL)) 1290 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1291 m_adj(m, drop_hdrlen); /* delayed header drop */ 1292 sbappendstream_locked(&so->so_rcv, m); 1293 } 1294 /* NB: sorwakeup_locked() does an implicit unlock. */ 1295 sorwakeup_locked(so); 1296 if (DELAY_ACK(tp)) { 1297 tp->t_flags |= TF_DELACK; 1298 } else { 1299 tp->t_flags |= TF_ACKNOW; 1300 tcp_output(tp); 1301 } 1302 goto check_delack; 1303 } 1304 } 1305 1306 /* 1307 * Calculate amount of space in receive window, 1308 * and then do TCP input processing. 1309 * Receive window is amount of space in rcv queue, 1310 * but not less than advertised window. 1311 */ 1312 win = sbspace(&so->so_rcv); 1313 if (win < 0) 1314 win = 0; 1315 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1316 1317 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 1318 tp->rfbuf_ts = 0; 1319 tp->rfbuf_cnt = 0; 1320 1321 switch (tp->t_state) { 1322 1323 /* 1324 * If the state is SYN_RECEIVED: 1325 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 1326 */ 1327 case TCPS_SYN_RECEIVED: 1328 if ((thflags & TH_ACK) && 1329 (SEQ_LEQ(th->th_ack, tp->snd_una) || 1330 SEQ_GT(th->th_ack, tp->snd_max))) { 1331 rstreason = BANDLIM_RST_OPENPORT; 1332 goto dropwithreset; 1333 } 1334 break; 1335 1336 /* 1337 * If the state is SYN_SENT: 1338 * if seg contains an ACK, but not for our SYN, drop the input. 1339 * if seg contains a RST, then drop the connection. 1340 * if seg does not contain SYN, then drop it. 
1341 * Otherwise this is an acceptable SYN segment 1342 * initialize tp->rcv_nxt and tp->irs 1343 * if seg contains ack then advance tp->snd_una 1344 * if seg contains an ECE and ECN support is enabled, the stream 1345 * is ECN capable. 1346 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1347 * arrange for segment to be acked (eventually) 1348 * continue processing rest of data/controls, beginning with URG 1349 */ 1350 case TCPS_SYN_SENT: 1351 if ((thflags & TH_ACK) && 1352 (SEQ_LEQ(th->th_ack, tp->iss) || 1353 SEQ_GT(th->th_ack, tp->snd_max))) { 1354 rstreason = BANDLIM_UNLIMITED; 1355 goto dropwithreset; 1356 } 1357 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) 1358 tp = tcp_drop(tp, ECONNREFUSED); 1359 if (thflags & TH_RST) 1360 goto drop; 1361 if (!(thflags & TH_SYN)) 1362 goto drop; 1363 1364 tp->irs = th->th_seq; 1365 tcp_rcvseqinit(tp); 1366 if (thflags & TH_ACK) { 1367 V_tcpstat.tcps_connects++; 1368 soisconnected(so); 1369#ifdef MAC 1370 SOCK_LOCK(so); 1371 mac_socketpeer_set_from_mbuf(m, so); 1372 SOCK_UNLOCK(so); 1373#endif 1374 /* Do window scaling on this connection? */ 1375 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1376 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1377 tp->rcv_scale = tp->request_r_scale; 1378 } 1379 tp->rcv_adv += tp->rcv_wnd; 1380 tp->snd_una++; /* SYN is acked */ 1381 /* 1382 * If there's data, delay ACK; if there's also a FIN 1383 * ACKNOW will be turned on later. 1384 */ 1385 if (DELAY_ACK(tp) && tlen != 0) 1386 tcp_timer_activate(tp, TT_DELACK, 1387 tcp_delacktime); 1388 else 1389 tp->t_flags |= TF_ACKNOW; 1390 1391 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 1392 tp->t_flags |= TF_ECN_PERMIT; 1393 V_tcpstat.tcps_ecn_shs++; 1394 } 1395 1396 /* 1397 * Received <SYN,ACK> in SYN_SENT[*] state. 
1398 * Transitions: 1399 * SYN_SENT --> ESTABLISHED 1400 * SYN_SENT* --> FIN_WAIT_1 1401 */ 1402 tp->t_starttime = ticks; 1403 if (tp->t_flags & TF_NEEDFIN) { 1404 tp->t_state = TCPS_FIN_WAIT_1; 1405 tp->t_flags &= ~TF_NEEDFIN; 1406 thflags &= ~TH_SYN; 1407 } else { 1408 tp->t_state = TCPS_ESTABLISHED; 1409 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1410 } 1411 } else { 1412 /* 1413 * Received initial SYN in SYN-SENT[*] state => 1414 * simultaneous open. If segment contains CC option 1415 * and there is a cached CC, apply TAO test. 1416 * If it succeeds, connection is * half-synchronized. 1417 * Otherwise, do 3-way handshake: 1418 * SYN-SENT -> SYN-RECEIVED 1419 * SYN-SENT* -> SYN-RECEIVED* 1420 * If there was no CC option, clear cached CC value. 1421 */ 1422 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 1423 tcp_timer_activate(tp, TT_REXMT, 0); 1424 tp->t_state = TCPS_SYN_RECEIVED; 1425 } 1426 1427 KASSERT(headlocked, ("%s: trimthenstep6: head not locked", 1428 __func__)); 1429 INP_WLOCK_ASSERT(tp->t_inpcb); 1430 1431 /* 1432 * Advance th->th_seq to correspond to first data byte. 1433 * If data, trim to stay within window, 1434 * dropping FIN if necessary. 1435 */ 1436 th->th_seq++; 1437 if (tlen > tp->rcv_wnd) { 1438 todrop = tlen - tp->rcv_wnd; 1439 m_adj(m, -todrop); 1440 tlen = tp->rcv_wnd; 1441 thflags &= ~TH_FIN; 1442 V_tcpstat.tcps_rcvpackafterwin++; 1443 V_tcpstat.tcps_rcvbyteafterwin += todrop; 1444 } 1445 tp->snd_wl1 = th->th_seq - 1; 1446 tp->rcv_up = th->th_seq; 1447 /* 1448 * Client side of transaction: already sent SYN and data. 1449 * If the remote host used T/TCP to validate the SYN, 1450 * our data will be ACK'd; if so, enter normal data segment 1451 * processing in the middle of step 5, ack processing. 1452 * Otherwise, goto step 6. 1453 */ 1454 if (thflags & TH_ACK) 1455 goto process_ACK; 1456 1457 goto step6; 1458 1459 /* 1460 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 1461 * do normal processing. 
1462 * 1463 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 1464 */ 1465 case TCPS_LAST_ACK: 1466 case TCPS_CLOSING: 1467 break; /* continue normal processing */ 1468 } 1469 1470 /* 1471 * States other than LISTEN or SYN_SENT. 1472 * First check the RST flag and sequence number since reset segments 1473 * are exempt from the timestamp and connection count tests. This 1474 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 1475 * below which allowed reset segments in half the sequence space 1476 * to fall though and be processed (which gives forged reset 1477 * segments with a random sequence number a 50 percent chance of 1478 * killing a connection). 1479 * Then check timestamp, if present. 1480 * Then check the connection count, if present. 1481 * Then check that at least some bytes of segment are within 1482 * receive window. If segment begins before rcv_nxt, 1483 * drop leading data (and SYN); if nothing left, just ack. 1484 * 1485 * 1486 * If the RST bit is set, check the sequence number to see 1487 * if this is a valid reset segment. 1488 * RFC 793 page 37: 1489 * In all states except SYN-SENT, all reset (RST) segments 1490 * are validated by checking their SEQ-fields. A reset is 1491 * valid if its sequence number is in the window. 1492 * Note: this does not take into account delayed ACKs, so 1493 * we should test against last_ack_sent instead of rcv_nxt. 1494 * The sequence number in the reset segment is normally an 1495 * echo of our outgoing acknowlegement numbers, but some hosts 1496 * send a reset with the sequence number at the rightmost edge 1497 * of our receive window, and we have to handle this case. 1498 * Note 2: Paul Watson's paper "Slipping in the Window" has shown 1499 * that brute force RST attacks are possible. To combat this, 1500 * we use a much stricter check while in the ESTABLISHED state, 1501 * only accepting RSTs where the sequence number is equal to 1502 * last_ack_sent. 
In all other states (the states in which a 1503 * RST is more likely), the more permissive check is used. 1504 * If we have multiple segments in flight, the initial reset 1505 * segment sequence numbers will be to the left of last_ack_sent, 1506 * but they will eventually catch up. 1507 * In any case, it never made sense to trim reset segments to 1508 * fit the receive window since RFC 1122 says: 1509 * 4.2.2.12 RST Segment: RFC-793 Section 3.4 1510 * 1511 * A TCP SHOULD allow a received RST segment to include data. 1512 * 1513 * DISCUSSION 1514 * It has been suggested that a RST segment could contain 1515 * ASCII text that encoded and explained the cause of the 1516 * RST. No standard has yet been established for such 1517 * data. 1518 * 1519 * If the reset segment passes the sequence number test examine 1520 * the state: 1521 * SYN_RECEIVED STATE: 1522 * If passive open, return to LISTEN state. 1523 * If active open, inform user that connection was refused. 1524 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: 1525 * Inform user that connection was reset, and close tcb. 1526 * CLOSING, LAST_ACK STATES: 1527 * Close the tcb. 1528 * TIME_WAIT STATE: 1529 * Drop the segment - see Stevens, vol. 2, p. 964 and 1530 * RFC 1337. 
1531 */ 1532 if (thflags & TH_RST) { 1533 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 1534 SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1535 switch (tp->t_state) { 1536 1537 case TCPS_SYN_RECEIVED: 1538 so->so_error = ECONNREFUSED; 1539 goto close; 1540 1541 case TCPS_ESTABLISHED: 1542 if (V_tcp_insecure_rst == 0 && 1543 !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && 1544 SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && 1545 !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 1546 SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { 1547 V_tcpstat.tcps_badrst++; 1548 goto drop; 1549 } 1550 /* FALLTHROUGH */ 1551 case TCPS_FIN_WAIT_1: 1552 case TCPS_FIN_WAIT_2: 1553 case TCPS_CLOSE_WAIT: 1554 so->so_error = ECONNRESET; 1555 close: 1556 tp->t_state = TCPS_CLOSED; 1557 V_tcpstat.tcps_drops++; 1558 KASSERT(headlocked, ("%s: trimthenstep6: " 1559 "tcp_close: head not locked", __func__)); 1560 tp = tcp_close(tp); 1561 break; 1562 1563 case TCPS_CLOSING: 1564 case TCPS_LAST_ACK: 1565 KASSERT(headlocked, ("%s: trimthenstep6: " 1566 "tcp_close.2: head not locked", __func__)); 1567 tp = tcp_close(tp); 1568 break; 1569 } 1570 } 1571 goto drop; 1572 } 1573 1574 /* 1575 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1576 * and it's less than ts_recent, drop it. 1577 */ 1578 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 1579 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 1580 1581 /* Check to see if ts_recent is over 24 days old. */ 1582 if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1583 /* 1584 * Invalidate ts_recent. If this segment updates 1585 * ts_recent, the age will be reset later and ts_recent 1586 * will get a valid value. If it does not, setting 1587 * ts_recent to zero will at least satisfy the 1588 * requirement that zero be placed in the timestamp 1589 * echo reply when ts_recent isn't valid. The 1590 * age isn't reset until we get a valid ts_recent 1591 * because we don't want out-of-order segments to be 1592 * dropped when ts_recent is old. 
1593 */ 1594 tp->ts_recent = 0; 1595 } else { 1596 V_tcpstat.tcps_rcvduppack++; 1597 V_tcpstat.tcps_rcvdupbyte += tlen; 1598 V_tcpstat.tcps_pawsdrop++; 1599 if (tlen) 1600 goto dropafterack; 1601 goto drop; 1602 } 1603 } 1604 1605 /* 1606 * In the SYN-RECEIVED state, validate that the packet belongs to 1607 * this connection before trimming the data to fit the receive 1608 * window. Check the sequence number versus IRS since we know 1609 * the sequence numbers haven't wrapped. This is a partial fix 1610 * for the "LAND" DoS attack. 1611 */ 1612 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 1613 rstreason = BANDLIM_RST_OPENPORT; 1614 goto dropwithreset; 1615 } 1616 1617 todrop = tp->rcv_nxt - th->th_seq; 1618 if (todrop > 0) { 1619 if (thflags & TH_SYN) { 1620 thflags &= ~TH_SYN; 1621 th->th_seq++; 1622 if (th->th_urp > 1) 1623 th->th_urp--; 1624 else 1625 thflags &= ~TH_URG; 1626 todrop--; 1627 } 1628 /* 1629 * Following if statement from Stevens, vol. 2, p. 960. 1630 */ 1631 if (todrop > tlen 1632 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1633 /* 1634 * Any valid FIN must be to the left of the window. 1635 * At this point the FIN must be a duplicate or out 1636 * of sequence; drop it. 1637 */ 1638 thflags &= ~TH_FIN; 1639 1640 /* 1641 * Send an ACK to resynchronize and drop any data. 1642 * But keep on processing for RST or ACK. 1643 */ 1644 tp->t_flags |= TF_ACKNOW; 1645 todrop = tlen; 1646 V_tcpstat.tcps_rcvduppack++; 1647 V_tcpstat.tcps_rcvdupbyte += todrop; 1648 } else { 1649 V_tcpstat.tcps_rcvpartduppack++; 1650 V_tcpstat.tcps_rcvpartdupbyte += todrop; 1651 } 1652 drop_hdrlen += todrop; /* drop from the top afterwards */ 1653 th->th_seq += todrop; 1654 tlen -= todrop; 1655 if (th->th_urp > todrop) 1656 th->th_urp -= todrop; 1657 else { 1658 thflags &= ~TH_URG; 1659 th->th_urp = 0; 1660 } 1661 } 1662 1663 /* 1664 * If new data are received on a connection after the 1665 * user processes are gone, then RST the other end. 
1666 */ 1667 if ((so->so_state & SS_NOFDREF) && 1668 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1669 char *s; 1670 1671 KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head " 1672 "not locked", __func__)); 1673 if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { 1674 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " 1675 "was closed, sending RST and removing tcpcb\n", 1676 s, __func__, tcpstates[tp->t_state], tlen); 1677 free(s, M_TCPLOG); 1678 } 1679 tp = tcp_close(tp); 1680 V_tcpstat.tcps_rcvafterclose++; 1681 rstreason = BANDLIM_UNLIMITED; 1682 goto dropwithreset; 1683 } 1684 1685 /* 1686 * If segment ends after window, drop trailing data 1687 * (and PUSH and FIN); if nothing left, just ACK. 1688 */ 1689 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 1690 if (todrop > 0) { 1691 V_tcpstat.tcps_rcvpackafterwin++; 1692 if (todrop >= tlen) { 1693 V_tcpstat.tcps_rcvbyteafterwin += tlen; 1694 /* 1695 * If window is closed can only take segments at 1696 * window edge, and have to drop data and PUSH from 1697 * incoming segments. Continue processing, but 1698 * remember to ack. Otherwise, drop segment 1699 * and ack. 1700 */ 1701 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1702 tp->t_flags |= TF_ACKNOW; 1703 V_tcpstat.tcps_rcvwinprobe++; 1704 } else 1705 goto dropafterack; 1706 } else 1707 V_tcpstat.tcps_rcvbyteafterwin += todrop; 1708 m_adj(m, -todrop); 1709 tlen -= todrop; 1710 thflags &= ~(TH_PUSH|TH_FIN); 1711 } 1712 1713 /* 1714 * If last ACK falls within this segment's sequence numbers, 1715 * record its timestamp. 1716 * NOTE: 1717 * 1) That the test incorporates suggestions from the latest 1718 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1719 * 2) That updating only on newer timestamps interferes with 1720 * our earlier PAWS tests, so this check should be solely 1721 * predicated on the sequence space of this segment. 
1722 * 3) That we modify the segment boundary check to be 1723 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 1724 * instead of RFC1323's 1725 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 1726 * This modified check allows us to overcome RFC1323's 1727 * limitations as described in Stevens TCP/IP Illustrated 1728 * Vol. 2 p.869. In such cases, we can still calculate the 1729 * RTT correctly when RCV.NXT == Last.ACK.Sent. 1730 */ 1731 if ((to.to_flags & TOF_TS) != 0 && 1732 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1733 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1734 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 1735 tp->ts_recent_age = ticks; 1736 tp->ts_recent = to.to_tsval; 1737 } 1738 1739 /* 1740 * If a SYN is in the window, then this is an 1741 * error and we send an RST and drop the connection. 1742 */ 1743 if (thflags & TH_SYN) { 1744 KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: " 1745 "head not locked", __func__)); 1746 tp = tcp_drop(tp, ECONNRESET); 1747 rstreason = BANDLIM_UNLIMITED; 1748 goto drop; 1749 } 1750 1751 /* 1752 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 1753 * flag is on (half-synchronized state), then queue data for 1754 * later processing; else drop segment and return. 1755 */ 1756 if ((thflags & TH_ACK) == 0) { 1757 if (tp->t_state == TCPS_SYN_RECEIVED || 1758 (tp->t_flags & TF_NEEDSYN)) 1759 goto step6; 1760 else if (tp->t_flags & TF_ACKNOW) 1761 goto dropafterack; 1762 else 1763 goto drop; 1764 } 1765 1766 /* 1767 * Ack processing. 1768 */ 1769 switch (tp->t_state) { 1770 1771 /* 1772 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1773 * ESTABLISHED state and continue processing. 1774 * The ACK was checked above. 1775 */ 1776 case TCPS_SYN_RECEIVED: 1777 1778 V_tcpstat.tcps_connects++; 1779 soisconnected(so); 1780 /* Do window scaling? 
*/ 1781 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1782 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1783 tp->rcv_scale = tp->request_r_scale; 1784 tp->snd_wnd = tiwin; 1785 } 1786 /* 1787 * Make transitions: 1788 * SYN-RECEIVED -> ESTABLISHED 1789 * SYN-RECEIVED* -> FIN-WAIT-1 1790 */ 1791 tp->t_starttime = ticks; 1792 if (tp->t_flags & TF_NEEDFIN) { 1793 tp->t_state = TCPS_FIN_WAIT_1; 1794 tp->t_flags &= ~TF_NEEDFIN; 1795 } else { 1796 tp->t_state = TCPS_ESTABLISHED; 1797 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); 1798 } 1799 /* 1800 * If segment contains data or ACK, will call tcp_reass() 1801 * later; if not, do so now to pass queued data to user. 1802 */ 1803 if (tlen == 0 && (thflags & TH_FIN) == 0) 1804 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 1805 (struct mbuf *)0); 1806 tp->snd_wl1 = th->th_seq - 1; 1807 /* FALLTHROUGH */ 1808 1809 /* 1810 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1811 * ACKs. If the ack is in the range 1812 * tp->snd_una < th->th_ack <= tp->snd_max 1813 * then advance tp->snd_una to th->th_ack and drop 1814 * data from the retransmission queue. If this ACK reflects 1815 * more up to date window information we update our window information. 
1816 */ 1817 case TCPS_ESTABLISHED: 1818 case TCPS_FIN_WAIT_1: 1819 case TCPS_FIN_WAIT_2: 1820 case TCPS_CLOSE_WAIT: 1821 case TCPS_CLOSING: 1822 case TCPS_LAST_ACK: 1823 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1824 V_tcpstat.tcps_rcvacktoomuch++; 1825 goto dropafterack; 1826 } 1827 if ((tp->t_flags & TF_SACK_PERMIT) && 1828 ((to.to_flags & TOF_SACK) || 1829 !TAILQ_EMPTY(&tp->snd_holes))) 1830 tcp_sack_doack(tp, &to, th->th_ack); 1831 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1832 if (tlen == 0 && tiwin == tp->snd_wnd) { 1833 V_tcpstat.tcps_rcvdupack++; 1834 /* 1835 * If we have outstanding data (other than 1836 * a window probe), this is a completely 1837 * duplicate ack (ie, window info didn't 1838 * change), the ack is the biggest we've 1839 * seen and we've seen exactly our rexmt 1840 * threshhold of them, assume a packet 1841 * has been dropped and retransmit it. 1842 * Kludge snd_nxt & the congestion 1843 * window so we send only this one 1844 * packet. 1845 * 1846 * We know we're losing at the current 1847 * window size so do congestion avoidance 1848 * (set ssthresh to half the current window 1849 * and pull our congestion window back to 1850 * the new ssthresh). 1851 * 1852 * Dup acks mean that packets have left the 1853 * network (they're now cached at the receiver) 1854 * so bump cwnd by the amount in the receiver 1855 * to keep a constant cwnd packets in the 1856 * network. 1857 * 1858 * When using TCP ECN, notify the peer that 1859 * we reduced the cwnd. 1860 */ 1861 if (!tcp_timer_active(tp, TT_REXMT) || 1862 th->th_ack != tp->snd_una) 1863 tp->t_dupacks = 0; 1864 else if (++tp->t_dupacks > tcprexmtthresh || 1865 ((V_tcp_do_newreno || 1866 (tp->t_flags & TF_SACK_PERMIT)) && 1867 IN_FASTRECOVERY(tp))) { 1868 if ((tp->t_flags & TF_SACK_PERMIT) && 1869 IN_FASTRECOVERY(tp)) { 1870 int awnd; 1871 1872 /* 1873 * Compute the amount of data in flight first. 
1874 * We can inject new data into the pipe iff 1875 * we have less than 1/2 the original window's 1876 * worth of data in flight. 1877 */ 1878 awnd = (tp->snd_nxt - tp->snd_fack) + 1879 tp->sackhint.sack_bytes_rexmit; 1880 if (awnd < tp->snd_ssthresh) { 1881 tp->snd_cwnd += tp->t_maxseg; 1882 if (tp->snd_cwnd > tp->snd_ssthresh) 1883 tp->snd_cwnd = tp->snd_ssthresh; 1884 } 1885 } else 1886 tp->snd_cwnd += tp->t_maxseg; 1887 (void) tcp_output(tp); 1888 goto drop; 1889 } else if (tp->t_dupacks == tcprexmtthresh) { 1890 tcp_seq onxt = tp->snd_nxt; 1891 1892 /* 1893 * If we're doing sack, check to 1894 * see if we're already in sack 1895 * recovery. If we're not doing sack, 1896 * check to see if we're in newreno 1897 * recovery. 1898 */ 1899 if (tp->t_flags & TF_SACK_PERMIT) { 1900 if (IN_FASTRECOVERY(tp)) { 1901 tp->t_dupacks = 0; 1902 break; 1903 } 1904 } else if (V_tcp_do_newreno || 1905 V_tcp_do_ecn) { 1906 if (SEQ_LEQ(th->th_ack, 1907 tp->snd_recover)) { 1908 tp->t_dupacks = 0; 1909 break; 1910 } 1911 } 1912 tcp_congestion_exp(tp); 1913 tcp_timer_activate(tp, TT_REXMT, 0); 1914 tp->t_rtttime = 0; 1915 if (tp->t_flags & TF_SACK_PERMIT) { 1916 V_tcpstat.tcps_sack_recovery_episode++; 1917 tp->sack_newdata = tp->snd_nxt; 1918 tp->snd_cwnd = tp->t_maxseg; 1919 (void) tcp_output(tp); 1920 goto drop; 1921 } 1922 tp->snd_nxt = th->th_ack; 1923 tp->snd_cwnd = tp->t_maxseg; 1924 (void) tcp_output(tp); 1925 KASSERT(tp->snd_limited <= 2, 1926 ("%s: tp->snd_limited too big", 1927 __func__)); 1928 tp->snd_cwnd = tp->snd_ssthresh + 1929 tp->t_maxseg * 1930 (tp->t_dupacks - tp->snd_limited); 1931 if (SEQ_GT(onxt, tp->snd_nxt)) 1932 tp->snd_nxt = onxt; 1933 goto drop; 1934 } else if (V_tcp_do_rfc3042) { 1935 u_long oldcwnd = tp->snd_cwnd; 1936 tcp_seq oldsndmax = tp->snd_max; 1937 u_int sent; 1938 1939 KASSERT(tp->t_dupacks == 1 || 1940 tp->t_dupacks == 2, 1941 ("%s: dupacks not 1 or 2", 1942 __func__)); 1943 if (tp->t_dupacks == 1) 1944 tp->snd_limited = 0; 1945 tp->snd_cwnd = 
1946 (tp->snd_nxt - tp->snd_una) + 1947 (tp->t_dupacks - tp->snd_limited) * 1948 tp->t_maxseg; 1949 (void) tcp_output(tp); 1950 sent = tp->snd_max - oldsndmax; 1951 if (sent > tp->t_maxseg) { 1952 KASSERT((tp->t_dupacks == 2 && 1953 tp->snd_limited == 0) || 1954 (sent == tp->t_maxseg + 1 && 1955 tp->t_flags & TF_SENTFIN), 1956 ("%s: sent too much", 1957 __func__)); 1958 tp->snd_limited = 2; 1959 } else if (sent > 0) 1960 ++tp->snd_limited; 1961 tp->snd_cwnd = oldcwnd; 1962 goto drop; 1963 } 1964 } else 1965 tp->t_dupacks = 0; 1966 break; 1967 } 1968 1969 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 1970 ("%s: th_ack <= snd_una", __func__)); 1971 1972 /* 1973 * If the congestion window was inflated to account 1974 * for the other side's cached packets, retract it. 1975 */ 1976 if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { 1977 if (IN_FASTRECOVERY(tp)) { 1978 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 1979 if (tp->t_flags & TF_SACK_PERMIT) 1980 tcp_sack_partialack(tp, th); 1981 else 1982 tcp_newreno_partial_ack(tp, th); 1983 } else { 1984 /* 1985 * Out of fast recovery. 1986 * Window inflation should have left us 1987 * with approximately snd_ssthresh 1988 * outstanding data. 1989 * But in case we would be inclined to 1990 * send a burst, better to do it via 1991 * the slow start mechanism. 1992 */ 1993 if (SEQ_GT(th->th_ack + 1994 tp->snd_ssthresh, 1995 tp->snd_max)) 1996 tp->snd_cwnd = tp->snd_max - 1997 th->th_ack + 1998 tp->t_maxseg; 1999 else 2000 tp->snd_cwnd = tp->snd_ssthresh; 2001 } 2002 } 2003 } else { 2004 if (tp->t_dupacks >= tcprexmtthresh && 2005 tp->snd_cwnd > tp->snd_ssthresh) 2006 tp->snd_cwnd = tp->snd_ssthresh; 2007 } 2008 tp->t_dupacks = 0; 2009 /* 2010 * If we reach this point, ACK is not a duplicate, 2011 * i.e., it ACKs something we sent. 2012 */ 2013 if (tp->t_flags & TF_NEEDSYN) { 2014 /* 2015 * T/TCP: Connection was half-synchronized, and our 2016 * SYN has been ACK'd (so connection is now fully 2017 * synchronized). 
Go to non-starred state, 2018 * increment snd_una for ACK of SYN, and check if 2019 * we can do window scaling. 2020 */ 2021 tp->t_flags &= ~TF_NEEDSYN; 2022 tp->snd_una++; 2023 /* Do window scaling? */ 2024 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2025 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2026 tp->rcv_scale = tp->request_r_scale; 2027 /* Send window already scaled. */ 2028 } 2029 } 2030 2031process_ACK: 2032 KASSERT(headlocked, ("%s: process_ACK: head not locked", 2033 __func__)); 2034 INP_WLOCK_ASSERT(tp->t_inpcb); 2035 2036 acked = th->th_ack - tp->snd_una; 2037 V_tcpstat.tcps_rcvackpack++; 2038 V_tcpstat.tcps_rcvackbyte += acked; 2039 2040 /* 2041 * If we just performed our first retransmit, and the ACK 2042 * arrives within our recovery window, then it was a mistake 2043 * to do the retransmit in the first place. Recover our 2044 * original cwnd and ssthresh, and proceed to transmit where 2045 * we left off. 2046 */ 2047 if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { 2048 ++V_tcpstat.tcps_sndrexmitbad; 2049 tp->snd_cwnd = tp->snd_cwnd_prev; 2050 tp->snd_ssthresh = tp->snd_ssthresh_prev; 2051 tp->snd_recover = tp->snd_recover_prev; 2052 if (tp->t_flags & TF_WASFRECOVERY) 2053 ENTER_FASTRECOVERY(tp); 2054 tp->snd_nxt = tp->snd_max; 2055 tp->t_badrxtwin = 0; /* XXX probably not required */ 2056 } 2057 2058 /* 2059 * If we have a timestamp reply, update smoothed 2060 * round trip time. If no timestamp is present but 2061 * transmit timer is running and timed sequence 2062 * number was acked, update smoothed round trip time. 2063 * Since we now have an rtt measurement, cancel the 2064 * timer backoff (cf., Phil Karn's retransmit alg.). 2065 * Recompute the initial retransmit timer. 2066 * 2067 * Some boxes send broken timestamp replies 2068 * during the SYN+ACK phase, ignore 2069 * timestamps of 0 or we could calculate a 2070 * huge RTT and blow up the retransmit timer. 
2071 */ 2072 if ((to.to_flags & TOF_TS) != 0 && 2073 to.to_tsecr) { 2074 if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) 2075 tp->t_rttlow = ticks - to.to_tsecr; 2076 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); 2077 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 2078 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 2079 tp->t_rttlow = ticks - tp->t_rtttime; 2080 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 2081 } 2082 tcp_xmit_bandwidth_limit(tp, th->th_ack); 2083 2084 /* 2085 * If all outstanding data is acked, stop retransmit 2086 * timer and remember to restart (more output or persist). 2087 * If there is more data to be acked, restart retransmit 2088 * timer, using current (possibly backed-off) value. 2089 */ 2090 if (th->th_ack == tp->snd_max) { 2091 tcp_timer_activate(tp, TT_REXMT, 0); 2092 needoutput = 1; 2093 } else if (!tcp_timer_active(tp, TT_PERSIST)) 2094 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); 2095 2096 /* 2097 * If no data (only SYN) was ACK'd, 2098 * skip rest of ACK processing. 2099 */ 2100 if (acked == 0) 2101 goto step6; 2102 2103 /* 2104 * When new data is acked, open the congestion window. 2105 * If the window gives us less than ssthresh packets 2106 * in flight, open exponentially (maxseg per packet). 2107 * Otherwise open linearly: maxseg per window 2108 * (maxseg^2 / cwnd per packet). 2109 * If cwnd > maxseg^2, fix the cwnd increment at 1 byte 2110 * to avoid capping cwnd (as suggested in RFC2581). 
2111 */ 2112 if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || 2113 !IN_FASTRECOVERY(tp)) { 2114 u_int cw = tp->snd_cwnd; 2115 u_int incr = tp->t_maxseg; 2116 if (cw > tp->snd_ssthresh) 2117 incr = max((incr * incr / cw), 1); 2118 tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); 2119 } 2120 SOCKBUF_LOCK(&so->so_snd); 2121 if (acked > so->so_snd.sb_cc) { 2122 tp->snd_wnd -= so->so_snd.sb_cc; 2123 sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); 2124 ourfinisacked = 1; 2125 } else { 2126 sbdrop_locked(&so->so_snd, acked); 2127 tp->snd_wnd -= acked; 2128 ourfinisacked = 0; 2129 } 2130 /* NB: sowwakeup_locked() does an implicit unlock. */ 2131 sowwakeup_locked(so); 2132 /* Detect una wraparound. */ 2133 if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && 2134 !IN_FASTRECOVERY(tp) && 2135 SEQ_GT(tp->snd_una, tp->snd_recover) && 2136 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2137 tp->snd_recover = th->th_ack - 1; 2138 if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && 2139 IN_FASTRECOVERY(tp) && 2140 SEQ_GEQ(th->th_ack, tp->snd_recover)) 2141 EXIT_FASTRECOVERY(tp); 2142 tp->snd_una = th->th_ack; 2143 if (tp->t_flags & TF_SACK_PERMIT) { 2144 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 2145 tp->snd_recover = tp->snd_una; 2146 } 2147 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2148 tp->snd_nxt = tp->snd_una; 2149 2150 switch (tp->t_state) { 2151 2152 /* 2153 * In FIN_WAIT_1 STATE in addition to the processing 2154 * for the ESTABLISHED state if our FIN is now acknowledged 2155 * then enter FIN_WAIT_2. 2156 */ 2157 case TCPS_FIN_WAIT_1: 2158 if (ourfinisacked) { 2159 /* 2160 * If we can't receive any more 2161 * data, then closing user can proceed. 2162 * Starting the timer is contrary to the 2163 * specification, but if we don't get a FIN 2164 * we'll hang forever. 2165 * 2166 * XXXjl: 2167 * we should release the tp also, and use a 2168 * compressed state. 
2169 */ 2170 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2171 int timeout; 2172 2173 soisdisconnected(so); 2174 timeout = (tcp_fast_finwait2_recycle) ? 2175 tcp_finwait2_timeout : tcp_maxidle; 2176 tcp_timer_activate(tp, TT_2MSL, timeout); 2177 } 2178 tp->t_state = TCPS_FIN_WAIT_2; 2179 } 2180 break; 2181 2182 /* 2183 * In CLOSING STATE in addition to the processing for 2184 * the ESTABLISHED state if the ACK acknowledges our FIN 2185 * then enter the TIME-WAIT state, otherwise ignore 2186 * the segment. 2187 */ 2188 case TCPS_CLOSING: 2189 if (ourfinisacked) { 2190 KASSERT(headlocked, ("%s: process_ACK: " 2191 "head not locked", __func__)); 2192 tcp_twstart(tp); 2193 INP_INFO_WUNLOCK(&V_tcbinfo); 2194 headlocked = 0; 2195 m_freem(m); 2196 return; 2197 } 2198 break; 2199 2200 /* 2201 * In LAST_ACK, we may still be waiting for data to drain 2202 * and/or to be acked, as well as for the ack of our FIN. 2203 * If our FIN is now acknowledged, delete the TCB, 2204 * enter the closed state and return. 2205 */ 2206 case TCPS_LAST_ACK: 2207 if (ourfinisacked) { 2208 KASSERT(headlocked, ("%s: process_ACK: " 2209 "tcp_close: head not locked", __func__)); 2210 tp = tcp_close(tp); 2211 goto drop; 2212 } 2213 break; 2214 } 2215 } 2216 2217step6: 2218 KASSERT(headlocked, ("%s: step6: head not locked", __func__)); 2219 INP_WLOCK_ASSERT(tp->t_inpcb); 2220 2221 /* 2222 * Update window information. 2223 * Don't look at window if no ACK: TAC's send garbage on first SYN. 
2224 */ 2225 if ((thflags & TH_ACK) && 2226 (SEQ_LT(tp->snd_wl1, th->th_seq) || 2227 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 2228 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2229 /* keep track of pure window updates */ 2230 if (tlen == 0 && 2231 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2232 V_tcpstat.tcps_rcvwinupd++; 2233 tp->snd_wnd = tiwin; 2234 tp->snd_wl1 = th->th_seq; 2235 tp->snd_wl2 = th->th_ack; 2236 if (tp->snd_wnd > tp->max_sndwnd) 2237 tp->max_sndwnd = tp->snd_wnd; 2238 needoutput = 1; 2239 } 2240 2241 /* 2242 * Process segments with URG. 2243 */ 2244 if ((thflags & TH_URG) && th->th_urp && 2245 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2246 /* 2247 * This is a kludge, but if we receive and accept 2248 * random urgent pointers, we'll crash in 2249 * soreceive. It's hard to imagine someone 2250 * actually wanting to send this much urgent data. 2251 */ 2252 SOCKBUF_LOCK(&so->so_rcv); 2253 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2254 th->th_urp = 0; /* XXX */ 2255 thflags &= ~TH_URG; /* XXX */ 2256 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 2257 goto dodata; /* XXX */ 2258 } 2259 /* 2260 * If this segment advances the known urgent pointer, 2261 * then mark the data stream. This should not happen 2262 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2263 * a FIN has been received from the remote side. 2264 * In these states we ignore the URG. 2265 * 2266 * According to RFC961 (Assigned Protocols), 2267 * the urgent pointer points to the last octet 2268 * of urgent data. We continue, however, 2269 * to consider it to indicate the first octet 2270 * of data past the urgent section as the original 2271 * spec states (in one of two places). 
2272 */ 2273 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2274 tp->rcv_up = th->th_seq + th->th_urp; 2275 so->so_oobmark = so->so_rcv.sb_cc + 2276 (tp->rcv_up - tp->rcv_nxt) - 1; 2277 if (so->so_oobmark == 0) 2278 so->so_rcv.sb_state |= SBS_RCVATMARK; 2279 sohasoutofband(so); 2280 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2281 } 2282 SOCKBUF_UNLOCK(&so->so_rcv); 2283 /* 2284 * Remove out of band data so doesn't get presented to user. 2285 * This can happen independent of advancing the URG pointer, 2286 * but if two URG's are pending at once, some out-of-band 2287 * data may creep in... ick. 2288 */ 2289 if (th->th_urp <= (u_long)tlen && 2290 !(so->so_options & SO_OOBINLINE)) { 2291 /* hdr drop is delayed */ 2292 tcp_pulloutofband(so, th, m, drop_hdrlen); 2293 } 2294 } else { 2295 /* 2296 * If no out of band data is expected, 2297 * pull receive urgent pointer along 2298 * with the receive window. 2299 */ 2300 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2301 tp->rcv_up = tp->rcv_nxt; 2302 } 2303dodata: /* XXX */ 2304 KASSERT(headlocked, ("%s: dodata: head not locked", __func__)); 2305 INP_WLOCK_ASSERT(tp->t_inpcb); 2306 2307 /* 2308 * Process the segment text, merging it into the TCP sequencing queue, 2309 * and arranging for acknowledgment of receipt if necessary. 2310 * This process logically involves adjusting tp->rcv_wnd as data 2311 * is presented to the user (this happens in tcp_usrreq.c, 2312 * case PRU_RCVD). If a FIN has already been received on this 2313 * connection then we just ignore the text. 2314 */ 2315 if ((tlen || (thflags & TH_FIN)) && 2316 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2317 tcp_seq save_start = th->th_seq; 2318 m_adj(m, drop_hdrlen); /* delayed header drop */ 2319 /* 2320 * Insert segment which includes th into TCP reassembly queue 2321 * with control block tp. Set thflags to whether reassembly now 2322 * includes a segment with FIN. 
This handles the common case 2323 * inline (segment is the next to be received on an established 2324 * connection, and the queue is empty), avoiding linkage into 2325 * and removal from the queue and repetition of various 2326 * conversions. 2327 * Set DELACK for segments received in order, but ack 2328 * immediately when segments are out of order (so 2329 * fast retransmit can work). 2330 */ 2331 if (th->th_seq == tp->rcv_nxt && 2332 LIST_EMPTY(&tp->t_segq) && 2333 TCPS_HAVEESTABLISHED(tp->t_state)) { 2334 if (DELAY_ACK(tp)) 2335 tp->t_flags |= TF_DELACK; 2336 else 2337 tp->t_flags |= TF_ACKNOW; 2338 tp->rcv_nxt += tlen; 2339 thflags = th->th_flags & TH_FIN; 2340 V_tcpstat.tcps_rcvpack++; 2341 V_tcpstat.tcps_rcvbyte += tlen; 2342 ND6_HINT(tp); 2343 SOCKBUF_LOCK(&so->so_rcv); 2344 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 2345 m_freem(m); 2346 else 2347 sbappendstream_locked(&so->so_rcv, m); 2348 /* NB: sorwakeup_locked() does an implicit unlock. */ 2349 sorwakeup_locked(so); 2350 } else { 2351 /* 2352 * XXX: Due to the header drop above "th" is 2353 * theoretically invalid by now. Fortunately 2354 * m_adj() doesn't actually frees any mbufs 2355 * when trimming from the head. 2356 */ 2357 thflags = tcp_reass(tp, th, &tlen, m); 2358 tp->t_flags |= TF_ACKNOW; 2359 } 2360 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) 2361 tcp_update_sack_list(tp, save_start, save_start + tlen); 2362#if 0 2363 /* 2364 * Note the amount of data that peer has sent into 2365 * our window, in order to estimate the sender's 2366 * buffer size. 2367 * XXX: Unused. 2368 */ 2369 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2370#endif 2371 } else { 2372 m_freem(m); 2373 thflags &= ~TH_FIN; 2374 } 2375 2376 /* 2377 * If FIN is received ACK the FIN and let the user know 2378 * that the connection is closing. 
2379 */ 2380 if (thflags & TH_FIN) { 2381 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2382 socantrcvmore(so); 2383 /* 2384 * If connection is half-synchronized 2385 * (ie NEEDSYN flag on) then delay ACK, 2386 * so it may be piggybacked when SYN is sent. 2387 * Otherwise, since we received a FIN then no 2388 * more input can be expected, send ACK now. 2389 */ 2390 if (tp->t_flags & TF_NEEDSYN) 2391 tp->t_flags |= TF_DELACK; 2392 else 2393 tp->t_flags |= TF_ACKNOW; 2394 tp->rcv_nxt++; 2395 } 2396 switch (tp->t_state) { 2397 2398 /* 2399 * In SYN_RECEIVED and ESTABLISHED STATES 2400 * enter the CLOSE_WAIT state. 2401 */ 2402 case TCPS_SYN_RECEIVED: 2403 tp->t_starttime = ticks; 2404 /* FALLTHROUGH */ 2405 case TCPS_ESTABLISHED: 2406 tp->t_state = TCPS_CLOSE_WAIT; 2407 break; 2408 2409 /* 2410 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2411 * enter the CLOSING state. 2412 */ 2413 case TCPS_FIN_WAIT_1: 2414 tp->t_state = TCPS_CLOSING; 2415 break; 2416 2417 /* 2418 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2419 * starting the time-wait timer, turning off the other 2420 * standard timers. 2421 */ 2422 case TCPS_FIN_WAIT_2: 2423 KASSERT(headlocked == 1, ("%s: dodata: " 2424 "TCP_FIN_WAIT_2: head not locked", __func__)); 2425 tcp_twstart(tp); 2426 INP_INFO_WUNLOCK(&V_tcbinfo); 2427 return; 2428 } 2429 } 2430 INP_INFO_WUNLOCK(&V_tcbinfo); 2431 headlocked = 0; 2432#ifdef TCPDEBUG 2433 if (so->so_options & SO_DEBUG) 2434 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 2435 &tcp_savetcp, 0); 2436#endif 2437 2438 /* 2439 * Return any desired output. 
2440 */ 2441 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2442 (void) tcp_output(tp); 2443 2444check_delack: 2445 KASSERT(headlocked == 0, ("%s: check_delack: head locked", 2446 __func__)); 2447 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 2448 INP_WLOCK_ASSERT(tp->t_inpcb); 2449 if (tp->t_flags & TF_DELACK) { 2450 tp->t_flags &= ~TF_DELACK; 2451 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 2452 } 2453 INP_WUNLOCK(tp->t_inpcb); 2454 return; 2455 2456dropafterack: 2457 KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__)); 2458 /* 2459 * Generate an ACK dropping incoming segment if it occupies 2460 * sequence space, where the ACK reflects our state. 2461 * 2462 * We can now skip the test for the RST flag since all 2463 * paths to this code happen after packets containing 2464 * RST have been dropped. 2465 * 2466 * In the SYN-RECEIVED state, don't send an ACK unless the 2467 * segment we received passes the SYN-RECEIVED ACK test. 2468 * If it fails send a RST. This breaks the loop in the 2469 * "LAND" DoS attack, and also prevents an ACK storm 2470 * between two listening ports that have been sent forged 2471 * SYN segments, each with the source address of the other. 
2472 */ 2473 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 2474 (SEQ_GT(tp->snd_una, th->th_ack) || 2475 SEQ_GT(th->th_ack, tp->snd_max)) ) { 2476 rstreason = BANDLIM_RST_OPENPORT; 2477 goto dropwithreset; 2478 } 2479#ifdef TCPDEBUG 2480 if (so->so_options & SO_DEBUG) 2481 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2482 &tcp_savetcp, 0); 2483#endif 2484 KASSERT(headlocked, ("%s: headlocked should be 1", __func__)); 2485 INP_INFO_WUNLOCK(&V_tcbinfo); 2486 tp->t_flags |= TF_ACKNOW; 2487 (void) tcp_output(tp); 2488 INP_WUNLOCK(tp->t_inpcb); 2489 m_freem(m); 2490 return; 2491 2492dropwithreset: 2493 KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__)); 2494 INP_INFO_WUNLOCK(&V_tcbinfo); 2495 2496 if (tp != NULL) { 2497 tcp_dropwithreset(m, th, tp, tlen, rstreason); 2498 INP_WUNLOCK(tp->t_inpcb); 2499 } else 2500 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 2501 return; 2502 2503drop: 2504 /* 2505 * Drop space held by incoming segment and return. 2506 */ 2507#ifdef TCPDEBUG 2508 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 2509 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2510 &tcp_savetcp, 0); 2511#endif 2512 if (tp != NULL) 2513 INP_WUNLOCK(tp->t_inpcb); 2514 if (headlocked) 2515 INP_INFO_WUNLOCK(&V_tcbinfo); 2516 m_freem(m); 2517} 2518 2519/* 2520 * Issue RST and make ACK acceptable to originator of segment. 2521 * The mbuf must still include the original packet header. 2522 * tp may be NULL. 2523 */ 2524static void 2525tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, 2526 int tlen, int rstreason) 2527{ 2528 struct ip *ip; 2529#ifdef INET6 2530 struct ip6_hdr *ip6; 2531#endif 2532 2533 if (tp != NULL) { 2534 INP_WLOCK_ASSERT(tp->t_inpcb); 2535 } 2536 2537 /* Don't bother if destination was broadcast/multicast. 
*/ 2538 if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 2539 goto drop; 2540#ifdef INET6 2541 if (mtod(m, struct ip *)->ip_v == 6) { 2542 ip6 = mtod(m, struct ip6_hdr *); 2543 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 2544 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 2545 goto drop; 2546 /* IPv6 anycast check is done at tcp6_input() */ 2547 } else 2548#endif 2549 { 2550 ip = mtod(m, struct ip *); 2551 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 2552 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 2553 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 2554 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 2555 goto drop; 2556 } 2557 2558 /* Perform bandwidth limiting. */ 2559 if (badport_bandlim(rstreason) < 0) 2560 goto drop; 2561 2562 /* tcp_respond consumes the mbuf chain. */ 2563 if (th->th_flags & TH_ACK) { 2564 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, 2565 th->th_ack, TH_RST); 2566 } else { 2567 if (th->th_flags & TH_SYN) 2568 tlen++; 2569 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 2570 (tcp_seq)0, TH_RST|TH_ACK); 2571 } 2572 return; 2573drop: 2574 m_freem(m); 2575} 2576 2577/* 2578 * Parse TCP options and place in tcpopt. 
2579 */ 2580static void 2581tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) 2582{ 2583 INIT_VNET_INET(curvnet); 2584 int opt, optlen; 2585 2586 to->to_flags = 0; 2587 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2588 opt = cp[0]; 2589 if (opt == TCPOPT_EOL) 2590 break; 2591 if (opt == TCPOPT_NOP) 2592 optlen = 1; 2593 else { 2594 if (cnt < 2) 2595 break; 2596 optlen = cp[1]; 2597 if (optlen < 2 || optlen > cnt) 2598 break; 2599 } 2600 switch (opt) { 2601 case TCPOPT_MAXSEG: 2602 if (optlen != TCPOLEN_MAXSEG) 2603 continue; 2604 if (!(flags & TO_SYN)) 2605 continue; 2606 to->to_flags |= TOF_MSS; 2607 bcopy((char *)cp + 2, 2608 (char *)&to->to_mss, sizeof(to->to_mss)); 2609 to->to_mss = ntohs(to->to_mss); 2610 break; 2611 case TCPOPT_WINDOW: 2612 if (optlen != TCPOLEN_WINDOW) 2613 continue; 2614 if (!(flags & TO_SYN)) 2615 continue; 2616 to->to_flags |= TOF_SCALE; 2617 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); 2618 break; 2619 case TCPOPT_TIMESTAMP: 2620 if (optlen != TCPOLEN_TIMESTAMP) 2621 continue; 2622 to->to_flags |= TOF_TS; 2623 bcopy((char *)cp + 2, 2624 (char *)&to->to_tsval, sizeof(to->to_tsval)); 2625 to->to_tsval = ntohl(to->to_tsval); 2626 bcopy((char *)cp + 6, 2627 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 2628 to->to_tsecr = ntohl(to->to_tsecr); 2629 break; 2630#ifdef TCP_SIGNATURE 2631 /* 2632 * XXX In order to reply to a host which has set the 2633 * TCP_SIGNATURE option in its initial SYN, we have to 2634 * record the fact that the option was observed here 2635 * for the syncache code to perform the correct response. 
2636 */ 2637 case TCPOPT_SIGNATURE: 2638 if (optlen != TCPOLEN_SIGNATURE) 2639 continue; 2640 to->to_flags |= TOF_SIGNATURE; 2641 to->to_signature = cp + 2; 2642 break; 2643#endif 2644 case TCPOPT_SACK_PERMITTED: 2645 if (optlen != TCPOLEN_SACK_PERMITTED) 2646 continue; 2647 if (!(flags & TO_SYN)) 2648 continue; 2649 if (!V_tcp_do_sack) 2650 continue; 2651 to->to_flags |= TOF_SACKPERM; 2652 break; 2653 case TCPOPT_SACK: 2654 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2655 continue; 2656 if (flags & TO_SYN) 2657 continue; 2658 to->to_flags |= TOF_SACK; 2659 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; 2660 to->to_sacks = cp + 2; 2661 V_tcpstat.tcps_sack_rcv_blocks++; 2662 break; 2663 default: 2664 continue; 2665 } 2666 } 2667} 2668 2669/* 2670 * Pull out of band byte out of a segment so 2671 * it doesn't appear in the user's data queue. 2672 * It is still reflected in the segment length for 2673 * sequencing purposes. 2674 */ 2675static void 2676tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, 2677 int off) 2678{ 2679 int cnt = off + th->th_urp - 1; 2680 2681 while (cnt >= 0) { 2682 if (m->m_len > cnt) { 2683 char *cp = mtod(m, caddr_t) + cnt; 2684 struct tcpcb *tp = sototcpcb(so); 2685 2686 INP_WLOCK_ASSERT(tp->t_inpcb); 2687 2688 tp->t_iobc = *cp; 2689 tp->t_oobflags |= TCPOOB_HAVEDATA; 2690 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 2691 m->m_len--; 2692 if (m->m_flags & M_PKTHDR) 2693 m->m_pkthdr.len--; 2694 return; 2695 } 2696 cnt -= m->m_len; 2697 m = m->m_next; 2698 if (m == NULL) 2699 break; 2700 } 2701 panic("tcp_pulloutofband"); 2702} 2703 2704/* 2705 * Collect new round-trip time estimate 2706 * and update averages and current timeout. 
2707 */ 2708static void 2709tcp_xmit_timer(struct tcpcb *tp, int rtt) 2710{ 2711 INIT_VNET_INET(tp->t_inpcb->inp_vnet); 2712 int delta; 2713 2714 INP_WLOCK_ASSERT(tp->t_inpcb); 2715 2716 V_tcpstat.tcps_rttupdated++; 2717 tp->t_rttupdated++; 2718 if (tp->t_srtt != 0) { 2719 /* 2720 * srtt is stored as fixed point with 5 bits after the 2721 * binary point (i.e., scaled by 8). The following magic 2722 * is equivalent to the smoothing algorithm in rfc793 with 2723 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2724 * point). Adjust rtt to origin 0. 2725 */ 2726 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 2727 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 2728 2729 if ((tp->t_srtt += delta) <= 0) 2730 tp->t_srtt = 1; 2731 2732 /* 2733 * We accumulate a smoothed rtt variance (actually, a 2734 * smoothed mean difference), then set the retransmit 2735 * timer to smoothed rtt + 4 times the smoothed variance. 2736 * rttvar is stored as fixed point with 4 bits after the 2737 * binary point (scaled by 16). The following is 2738 * equivalent to rfc793 smoothing with an alpha of .75 2739 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2740 * rfc793's wired-in beta. 2741 */ 2742 if (delta < 0) 2743 delta = -delta; 2744 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 2745 if ((tp->t_rttvar += delta) <= 0) 2746 tp->t_rttvar = 1; 2747 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 2748 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 2749 } else { 2750 /* 2751 * No rtt measurement yet - use the unsmoothed rtt. 2752 * Set the variance to half the rtt (so our first 2753 * retransmit happens at 3*rtt). 2754 */ 2755 tp->t_srtt = rtt << TCP_RTT_SHIFT; 2756 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 2757 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 2758 } 2759 tp->t_rtttime = 0; 2760 tp->t_rxtshift = 0; 2761 2762 /* 2763 * the retransmit should happen at rtt + 4 * rttvar. 
2764 * Because of the way we do the smoothing, srtt and rttvar 2765 * will each average +1/2 tick of bias. When we compute 2766 * the retransmit timer, we want 1/2 tick of rounding and 2767 * 1 extra tick because of +-1/2 tick uncertainty in the 2768 * firing of the timer. The bias will give us exactly the 2769 * 1.5 tick we need. But, because the bias is 2770 * statistical, we have to test that we don't drop below 2771 * the minimum feasible timer (which is 2 ticks). 2772 */ 2773 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 2774 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 2775 2776 /* 2777 * We received an ack for a packet that wasn't retransmitted; 2778 * it is probably safe to discard any error indications we've 2779 * received recently. This isn't quite right, but close enough 2780 * for now (a route might have failed after we sent a segment, 2781 * and the return path might not be symmetrical). 2782 */ 2783 tp->t_softerror = 0; 2784} 2785 2786/* 2787 * Determine a reasonable value for maxseg size. 2788 * If the route is known, check route for mtu. 2789 * If none, use an mss that can be handled on the outgoing 2790 * interface without forcing IP to fragment; if bigger than 2791 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2792 * to utilize large mbufs. If no route is found, route has no mtu, 2793 * or the destination isn't local, use a default, hopefully conservative 2794 * size (usually 512 or the default IP max size, but no more than the mtu 2795 * of the interface), as we can't discover anything about intervening 2796 * gateways or networks. We also initialize the congestion/slow start 2797 * window to be a single segment if the destination isn't local. 2798 * While looking at the routing entry, we also initialize other path-dependent 2799 * parameters from pre-set or cached values in the routing entry. 2800 * 2801 * Also take into account the space needed for options that we 2802 * send regularly. 
Make maxseg shorter by that amount to assure 2803 * that we can send maxseg amount of data even when the options 2804 * are present. Store the upper limit of the length of options plus 2805 * data in maxopd. 2806 * 2807 * In case of T/TCP, we call this routine during implicit connection 2808 * setup as well (offer = -1), to initialize maxseg from the cached 2809 * MSS of our peer. 2810 * 2811 * NOTE that this routine is only called when we process an incoming 2812 * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). 2813 */ 2814void 2815tcp_mss_update(struct tcpcb *tp, int offer, struct hc_metrics_lite *metricptr) 2816{ 2817 INIT_VNET_INET(tp->t_inpcb->inp_vnet); 2818 int mss; 2819 u_long maxmtu; 2820 struct inpcb *inp = tp->t_inpcb; 2821 struct hc_metrics_lite metrics; 2822 int origoffer = offer; 2823 int mtuflags = 0; 2824#ifdef INET6 2825 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 2826 size_t min_protoh = isipv6 ? 2827 sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : 2828 sizeof (struct tcpiphdr); 2829#else 2830 const size_t min_protoh = sizeof(struct tcpiphdr); 2831#endif 2832 2833 INP_WLOCK_ASSERT(tp->t_inpcb); 2834 2835 /* Initialize. */ 2836#ifdef INET6 2837 if (isipv6) { 2838 maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags); 2839 tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; 2840 } else 2841#endif 2842 { 2843 maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags); 2844 tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; 2845 } 2846 2847 /* 2848 * No route to sender, stay with default mss and return. 2849 */ 2850 if (maxmtu == 0) { 2851 /* 2852 * In case we return early we need to intialize metrics 2853 * to a defined state as tcp_hc_get() would do for us 2854 * if there was no cache hit. 2855 */ 2856 if (metricptr != NULL) 2857 bzero(metricptr, sizeof(struct hc_metrics_lite)); 2858 return; 2859 } 2860 2861 /* Check the interface for TSO capabilities. */ 2862 if (mtuflags & CSUM_TSO) 2863 tp->t_flags |= TF_TSO; 2864 2865 /* What have we got? 
*/ 2866 switch (offer) { 2867 case 0: 2868 /* 2869 * Offer == 0 means that there was no MSS on the SYN 2870 * segment, in this case we use tcp_mssdflt as 2871 * already assigned to t_maxopd above. 2872 */ 2873 offer = tp->t_maxopd; 2874 break; 2875 2876 case -1: 2877 /* 2878 * Offer == -1 means that we didn't receive SYN yet. 2879 */ 2880 /* FALLTHROUGH */ 2881 2882 default: 2883 /* 2884 * Prevent DoS attack with too small MSS. Round up 2885 * to at least minmss. 2886 */ 2887 offer = max(offer, V_tcp_minmss); 2888 } 2889 2890 /* 2891 * rmx information is now retrieved from tcp_hostcache. 2892 */ 2893 tcp_hc_get(&inp->inp_inc, &metrics); 2894 if (metricptr != NULL) 2895 bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); 2896 2897 /* 2898 * If there's a discovered mtu int tcp hostcache, use it 2899 * else, use the link mtu. 2900 */ 2901 if (metrics.rmx_mtu) 2902 mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; 2903 else { 2904#ifdef INET6 2905 if (isipv6) { 2906 mss = maxmtu - min_protoh; 2907 if (!V_path_mtu_discovery && 2908 !in6_localaddr(&inp->in6p_faddr)) 2909 mss = min(mss, V_tcp_v6mssdflt); 2910 } else 2911#endif 2912 { 2913 mss = maxmtu - min_protoh; 2914 if (!V_path_mtu_discovery && 2915 !in_localaddr(inp->inp_faddr)) 2916 mss = min(mss, V_tcp_mssdflt); 2917 } 2918 /* 2919 * XXX - The above conditional (mss = maxmtu - min_protoh) 2920 * probably violates the TCP spec. 2921 * The problem is that, since we don't know the 2922 * other end's MSS, we are supposed to use a conservative 2923 * default. But, if we do that, then MTU discovery will 2924 * never actually take place, because the conservative 2925 * default is much less than the MTUs typically seen 2926 * on the Internet today. For the moment, we'll sweep 2927 * this under the carpet. 2928 * 2929 * The conservative default might not actually be a problem 2930 * if the only case this occurs is when sending an initial 2931 * SYN with options and data to a host we've never talked 2932 * to before. 
Then, they will reply with an MSS value which 2933 * will get recorded and the new parameters should get 2934 * recomputed. For Further Study. 2935 */ 2936 } 2937 mss = min(mss, offer); 2938 2939 /* 2940 * Sanity check: make sure that maxopd will be large 2941 * enough to allow some data on segments even if the 2942 * all the option space is used (40bytes). Otherwise 2943 * funny things may happen in tcp_output. 2944 */ 2945 mss = max(mss, 64); 2946 2947 /* 2948 * maxopd stores the maximum length of data AND options 2949 * in a segment; maxseg is the amount of data in a normal 2950 * segment. We need to store this value (maxopd) apart 2951 * from maxseg, because now every segment carries options 2952 * and thus we normally have somewhat less data in segments. 2953 */ 2954 tp->t_maxopd = mss; 2955 2956 /* 2957 * origoffer==-1 indicates that no segments were received yet. 2958 * In this case we just guess. 2959 */ 2960 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2961 (origoffer == -1 || 2962 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) 2963 mss -= TCPOLEN_TSTAMP_APPA; 2964 2965#if (MCLBYTES & (MCLBYTES - 1)) == 0 2966 if (mss > MCLBYTES) 2967 mss &= ~(MCLBYTES-1); 2968#else 2969 if (mss > MCLBYTES) 2970 mss = mss / MCLBYTES * MCLBYTES; 2971#endif 2972 tp->t_maxseg = mss; 2973} 2974 2975void 2976tcp_mss(struct tcpcb *tp, int offer) 2977{ 2978 int rtt, mss; 2979 u_long bufsize; 2980 struct inpcb *inp; 2981 struct socket *so; 2982 struct hc_metrics_lite metrics; 2983#ifdef INET6 2984 int isipv6; 2985#endif 2986 KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); 2987 2988 tcp_mss_update(tp, offer, &metrics); 2989 2990 mss = tp->t_maxseg; 2991 inp = tp->t_inpcb; 2992#ifdef INET6 2993 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 2994#endif 2995 2996 /* 2997 * If there's a pipesize, change the socket buffer to that size, 2998 * don't change if sb_hiwat is different than default (then it 2999 * has been changed on purpose with setsockopt). 
3000 * Make the socket buffers an integral number of mss units; 3001 * if the mss is larger than the socket buffer, decrease the mss. 3002 */ 3003 so = inp->inp_socket; 3004 SOCKBUF_LOCK(&so->so_snd); 3005 if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) 3006 bufsize = metrics.rmx_sendpipe; 3007 else 3008 bufsize = so->so_snd.sb_hiwat; 3009 if (bufsize < mss) 3010 mss = bufsize; 3011 else { 3012 bufsize = roundup(bufsize, mss); 3013 if (bufsize > sb_max) 3014 bufsize = sb_max; 3015 if (bufsize > so->so_snd.sb_hiwat) 3016 (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); 3017 } 3018 SOCKBUF_UNLOCK(&so->so_snd); 3019 tp->t_maxseg = mss; 3020 3021 SOCKBUF_LOCK(&so->so_rcv); 3022 if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) 3023 bufsize = metrics.rmx_recvpipe; 3024 else 3025 bufsize = so->so_rcv.sb_hiwat; 3026 if (bufsize > mss) { 3027 bufsize = roundup(bufsize, mss); 3028 if (bufsize > sb_max) 3029 bufsize = sb_max; 3030 if (bufsize > so->so_rcv.sb_hiwat) 3031 (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); 3032 } 3033 SOCKBUF_UNLOCK(&so->so_rcv); 3034 /* 3035 * While we're here, check the others too. 3036 */ 3037 if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { 3038 tp->t_srtt = rtt; 3039 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; 3040 V_tcpstat.tcps_usedrtt++; 3041 if (metrics.rmx_rttvar) { 3042 tp->t_rttvar = metrics.rmx_rttvar; 3043 V_tcpstat.tcps_usedrttvar++; 3044 } else { 3045 /* default variation is +- 1 rtt */ 3046 tp->t_rttvar = 3047 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; 3048 } 3049 TCPT_RANGESET(tp->t_rxtcur, 3050 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, 3051 tp->t_rttmin, TCPTV_REXMTMAX); 3052 } 3053 if (metrics.rmx_ssthresh) { 3054 /* 3055 * There's some sort of gateway or interface 3056 * buffer limit on the path. Use this to set 3057 * the slow start threshhold, but set the 3058 * threshold to no less than 2*mss. 
3059 */ 3060 tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); 3061 V_tcpstat.tcps_usedssthresh++; 3062 } 3063 if (metrics.rmx_bandwidth) 3064 tp->snd_bandwidth = metrics.rmx_bandwidth; 3065 3066 /* 3067 * Set the slow-start flight size depending on whether this 3068 * is a local network or not. 3069 * 3070 * Extend this so we cache the cwnd too and retrieve it here. 3071 * Make cwnd even bigger than RFC3390 suggests but only if we 3072 * have previous experience with the remote host. Be careful 3073 * not make cwnd bigger than remote receive window or our own 3074 * send socket buffer. Maybe put some additional upper bound 3075 * on the retrieved cwnd. Should do incremental updates to 3076 * hostcache when cwnd collapses so next connection doesn't 3077 * overloads the path again. 3078 * 3079 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. 3080 * We currently check only in syncache_socket for that. 3081 */ 3082#define TCP_METRICS_CWND 3083#ifdef TCP_METRICS_CWND 3084 if (metrics.rmx_cwnd) 3085 tp->snd_cwnd = max(mss, 3086 min(metrics.rmx_cwnd / 2, 3087 min(tp->snd_wnd, so->so_snd.sb_hiwat))); 3088 else 3089#endif 3090 if (V_tcp_do_rfc3390) 3091 tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); 3092#ifdef INET6 3093 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || 3094 (!isipv6 && in_localaddr(inp->inp_faddr))) 3095#else 3096 else if (in_localaddr(inp->inp_faddr)) 3097#endif 3098 tp->snd_cwnd = mss * V_ss_fltsz_local; 3099 else 3100 tp->snd_cwnd = mss * V_ss_fltsz; 3101} 3102 3103/* 3104 * Determine the MSS option to send on an outgoing SYN. 3105 */ 3106int 3107tcp_mssopt(struct in_conninfo *inc) 3108{ 3109 INIT_VNET_INET(curvnet); 3110 int mss = 0; 3111 u_long maxmtu = 0; 3112 u_long thcmtu = 0; 3113 size_t min_protoh; 3114#ifdef INET6 3115 int isipv6 = inc->inc_isipv6 ? 
1 : 0; 3116#endif 3117 3118 KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); 3119 3120#ifdef INET6 3121 if (isipv6) { 3122 mss = V_tcp_v6mssdflt; 3123 maxmtu = tcp_maxmtu6(inc, NULL); 3124 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3125 min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 3126 } else 3127#endif 3128 { 3129 mss = V_tcp_mssdflt; 3130 maxmtu = tcp_maxmtu(inc, NULL); 3131 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3132 min_protoh = sizeof(struct tcpiphdr); 3133 } 3134 if (maxmtu && thcmtu) 3135 mss = min(maxmtu, thcmtu) - min_protoh; 3136 else if (maxmtu || thcmtu) 3137 mss = max(maxmtu, thcmtu) - min_protoh; 3138 3139 return (mss); 3140} 3141 3142 3143/* 3144 * On a partial ack arrives, force the retransmission of the 3145 * next unacknowledged segment. Do not clear tp->t_dupacks. 3146 * By setting snd_nxt to ti_ack, this forces retransmission timer to 3147 * be started again. 3148 */ 3149static void 3150tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) 3151{ 3152 tcp_seq onxt = tp->snd_nxt; 3153 u_long ocwnd = tp->snd_cwnd; 3154 3155 INP_WLOCK_ASSERT(tp->t_inpcb); 3156 3157 tcp_timer_activate(tp, TT_REXMT, 0); 3158 tp->t_rtttime = 0; 3159 tp->snd_nxt = th->th_ack; 3160 /* 3161 * Set snd_cwnd to one segment beyond acknowledged offset. 3162 * (tp->snd_una has not yet been updated when this function is called.) 3163 */ 3164 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3165 tp->t_flags |= TF_ACKNOW; 3166 (void) tcp_output(tp); 3167 tp->snd_cwnd = ocwnd; 3168 if (SEQ_GT(onxt, tp->snd_nxt)) 3169 tp->snd_nxt = onxt; 3170 /* 3171 * Partial window deflation. Relies on fact that tp->snd_una 3172 * not updated yet. 3173 */ 3174 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3175 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3176 else 3177 tp->snd_cwnd = 0; 3178 tp->snd_cwnd += tp->t_maxseg; 3179} 3180