tcp_input.c revision 167834
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
28 * 29 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 30 * $FreeBSD: head/sys/netinet/tcp_input.c 167834 2007-03-23 19:11:22Z andre $ 31 */ 32 33#include "opt_ipfw.h" /* for ipfw_fwd */ 34#include "opt_inet.h" 35#include "opt_inet6.h" 36#include "opt_ipsec.h" 37#include "opt_mac.h" 38#include "opt_tcpdebug.h" 39#include "opt_tcp_sack.h" 40 41#include <sys/param.h> 42#include <sys/kernel.h> 43#include <sys/malloc.h> 44#include <sys/mbuf.h> 45#include <sys/proc.h> /* for proc0 declaration */ 46#include <sys/protosw.h> 47#include <sys/signalvar.h> 48#include <sys/socket.h> 49#include <sys/socketvar.h> 50#include <sys/sysctl.h> 51#include <sys/syslog.h> 52#include <sys/systm.h> 53 54#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 55 56#include <vm/uma.h> 57 58#include <net/if.h> 59#include <net/route.h> 60 61#include <netinet/in.h> 62#include <netinet/in_pcb.h> 63#include <netinet/in_systm.h> 64#include <netinet/in_var.h> 65#include <netinet/ip.h> 66#include <netinet/ip_icmp.h> /* required for icmp_var.h */ 67#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 68#include <netinet/ip_var.h> 69#include <netinet/ip_options.h> 70#include <netinet/ip6.h> 71#include <netinet/icmp6.h> 72#include <netinet6/in6_pcb.h> 73#include <netinet6/ip6_var.h> 74#include <netinet6/nd6.h> 75#include <netinet/tcp.h> 76#include <netinet/tcp_fsm.h> 77#include <netinet/tcp_seq.h> 78#include <netinet/tcp_timer.h> 79#include <netinet/tcp_var.h> 80#include <netinet6/tcp6_var.h> 81#include <netinet/tcpip.h> 82#ifdef TCPDEBUG 83#include <netinet/tcp_debug.h> 84#endif /* TCPDEBUG */ 85 86#ifdef FAST_IPSEC 87#include <netipsec/ipsec.h> 88#include <netipsec/ipsec6.h> 89#endif /*FAST_IPSEC*/ 90 91#ifdef IPSEC 92#include <netinet6/ipsec.h> 93#include <netinet6/ipsec6.h> 94#include <netkey/key.h> 95#endif /*IPSEC*/ 96 97#include <machine/in_cksum.h> 98 99#include <security/mac/mac_framework.h> 100 101static const int tcprexmtthresh = 3; 102 103struct tcpstat tcpstat; 
/*
 * TCP statistics and tunable parameters, exported via sysctl under
 * net.inet.tcp (and net.inet.tcp.reass for the reassembly queue).
 */
SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
    &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");

static int tcp_log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
    &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports");

static int blackhole = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
    &blackhole, 0, "Do not send RST on segments to closed ports");

int tcp_delack_enabled = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
    &tcp_delack_enabled, 0,
    "Delay ACK to try and piggyback it onto a data packet");

static int drop_synfin = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
    &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");

static int tcp_do_rfc3042 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
    &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)");

static int tcp_do_rfc3390 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
    &tcp_do_rfc3390, 0,
    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");

static int tcp_insecure_rst = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
    &tcp_insecure_rst, 0,
    "Follow the old (insecure) criteria for accepting RST packets");

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
    "TCP Segment Reassembly Queue");

/* Global and per-connection limits on the reassembly queues (tcp_reass()). */
static int tcp_reass_maxseg = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
    &tcp_reass_maxseg, 0,
    "Global maximum number of TCP Segments in Reassembly Queue");

int tcp_reass_qsize = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
    &tcp_reass_qsize, 0,
    "Global number of TCP Segments currently in Reassembly Queue");

static int tcp_reass_maxqlen = 48;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW,
    &tcp_reass_maxqlen, 0,
    "Maximum number of TCP Segments per individual Reassembly Queue");

static int tcp_reass_overflows = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
    &tcp_reass_overflows, 0,
    "Global number of TCP Segment Reassembly Queue Overflows");

int tcp_do_autorcvbuf = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
    &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");

int tcp_autorcvbuf_inc = 16*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
    &tcp_autorcvbuf_inc, 0,
    "Incrementor step size of automatic receive buffer");

int tcp_autorcvbuf_max = 256*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
    &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");

/* Head of the global list of TCP inpcbs, its pcbinfo and lock. */
struct inpcbhead tcb;
#define	tcb6	tcb  /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
struct mtx *tcbinfo_mtx;

/* Forward declarations for static helpers defined later in this file. */
static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
static void	 tcp_pulloutofband(struct socket *,
		     struct tcphdr *, struct mbuf *, int);
static int	 tcp_reass(struct tcpcb *, struct tcphdr *, int *,
		     struct mbuf *);
static void	 tcp_xmit_timer(struct tcpcb *, int);
static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
static int	 tcp_timewait(struct inpcb *, struct tcpopt *,
		     struct tcphdr *, struct mbuf *, int);

/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) && (tp)->t_inpcb && \
	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
		nd6_nud_hint(NULL, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
#endif

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 *	- there is no delayed ack timer in progress and
 *	- our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window and
 *		- delayed acks are enabled or
 *		- this is a half-synchronized T/TCP connection.
 */
#define DELAY_ACK(tp)							\
	((!callout_active(tp->tt_delack) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))

/* Initialize TCP reassembly queue */
static void
tcp_reass_zone_change(void *tag)
{

	/*
	 * Eventhandler callback, registered in tcp_reass_init(): the
	 * global mbuf cluster count (nmbclusters) changed, so re-derive
	 * the reassembly segment limit and push it into the UMA zone.
	 */
	tcp_reass_maxseg = nmbclusters / 16;
	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg);
}

/* UMA zone from which struct tseg_qent reassembly entries are allocated. */
uma_zone_t	tcp_reass_zone;

/*
 * Create the segment reassembly zone.  The limit is sized from
 * nmbclusters, may be overridden by the net.inet.tcp.reass.maxsegments
 * tunable, and tracks later nmbclusters changes via the eventhandler.
 */
void
tcp_reass_init()
{
	tcp_reass_maxseg = nmbclusters / 16;
	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
	    &tcp_reass_maxseg);
	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg);
	EVENTHANDLER_REGISTER(nmbclusters_change,
	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
}

/*
 * Insert segment th/m (payload length *tlenp) into tp's reassembly queue,
 * trimming any overlap against segments already queued, and hand any data
 * that is now contiguous at rcv_nxt up to the socket buffer.
 *
 * Call with th == NULL after the connection becomes established to force
 * queued pre-ESTABLISHED data up to the user socket.
 *
 * Returns the TH_FIN flag of the last segment appended to the socket
 * buffer, or 0 when nothing was delivered.  When the segment is dropped
 * (queue limits or allocation failure), m is freed and *tlenp zeroed.
 */
static int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
{
	struct tseg_qent *q;
	struct tseg_qent *p = NULL;
	struct tseg_qent *nq;
	struct tseg_qent *te = NULL;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	INP_LOCK_ASSERT(tp->t_inpcb);

	/*
	 * XXX: tcp_reass() is rather inefficient with its data structures
	 * and should be rewritten (see NetBSD for optimizations).  While
	 * doing that it should move to its own file tcp_reass.c.
	 */

	/*
	 * Call with th==NULL after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == NULL)
		goto present;

	/*
	 * Limit the number of segments in the reassembly queue to prevent
	 * holding on to too many segments (and thus running out of mbufs).
	 * Make sure to let the missing segment through which caused this
	 * queue.  Always keep one global queue entry spare to be able to
	 * process the missing segment.
	 */
	if (th->th_seq != tp->rcv_nxt &&
	    (tcp_reass_qsize + 1 >= tcp_reass_maxseg ||
	    tp->t_segqlen >= tcp_reass_maxqlen)) {
		tcp_reass_overflows++;
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		*tlenp = 0;
		return (0);
	}

	/*
	 * Allocate a new queue entry. If we can't, or hit the zone limit
	 * just drop the pkt.
	 */
	te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
	if (te == NULL) {
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		*tlenp = 0;
		return (0);
	}
	tp->t_segqlen++;
	tcp_reass_qsize++;

	/*
	 * Find a segment which begins after this one does.
	 * On exit p is the entry immediately preceding the insert point
	 * (or NULL when inserting at the head).
	 */
	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
			break;
		p = q;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		int i;
		/* conversion to int (in i) handles seq wraparound */
		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
		if (i > 0) {
			if (i >= *tlenp) {
				/* Entirely duplicate data; drop segment. */
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlenp;
				m_freem(m);
				uma_zfree(tcp_reass_zone, te);
				tp->t_segqlen--;
				tcp_reass_qsize--;
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto present;	/* ??? */
			}
			/* Trim the duplicated leading bytes. */
			m_adj(m, i);
			*tlenp -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlenp;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (q) {
		int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
		if (i <= 0)
			break;
		if (i < q->tqe_len) {
			/* Partial overlap: trim the front of q and stop. */
			q->tqe_th->th_seq += i;
			q->tqe_len -= i;
			m_adj(q->tqe_m, i);
			break;
		}

		/* q is completely covered by the new segment; drop it. */
		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		uma_zfree(tcp_reass_zone, q);
		tp->t_segqlen--;
		tcp_reass_qsize--;
		q = nq;
	}

	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_th = th;
	te->tqe_len = *tlenp;

	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
	} else {
		LIST_INSERT_AFTER(p, te, tqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);
	q = LIST_FIRST(&tp->t_segq);
	if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
		return (0);
	SOCKBUF_LOCK(&so->so_rcv);
	/* The body runs at least once, so flags is always set below. */
	do {
		tp->rcv_nxt += q->tqe_len;
		flags = q->tqe_th->th_flags & TH_FIN;
		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			m_freem(q->tqe_m);
		else
			sbappendstream_locked(&so->so_rcv, q->tqe_m);
		uma_zfree(tcp_reass_zone, q);
		tp->t_segqlen--;
		tcp_reass_qsize--;
		q = nq;
	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
	ND6_HINT(tp);
	sorwakeup_locked(so);
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct in6_ifaddr *ia6;

	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
417 */ 418 ia6 = ip6_getdstifaddr(m); 419 if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { 420 struct ip6_hdr *ip6; 421 422 ip6 = mtod(m, struct ip6_hdr *); 423 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 424 (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); 425 return IPPROTO_DONE; 426 } 427 428 tcp_input(m, *offp); 429 return IPPROTO_DONE; 430} 431#endif 432 433void 434tcp_input(struct mbuf *m, int off0) 435{ 436 struct tcphdr *th; 437 struct ip *ip = NULL; 438 struct ipovly *ipov; 439 struct inpcb *inp = NULL; 440 u_char *optp = NULL; 441 int optlen = 0; 442 int len, tlen, off; 443 int drop_hdrlen; 444 struct tcpcb *tp = NULL; 445 int thflags; 446 struct socket *so = NULL; 447 int todrop, acked, ourfinisacked, needoutput = 0; 448 u_long tiwin; 449 struct tcpopt to; /* options in this segment */ 450 int headlocked = 0; 451#ifdef IPFIREWALL_FORWARD 452 struct m_tag *fwd_tag; 453#endif 454 int rstreason; /* For badport_bandlim accounting purposes */ 455 456 struct ip6_hdr *ip6 = NULL; 457#ifdef INET6 458 int isipv6; 459 char ip6buf[INET6_ADDRSTRLEN]; 460#else 461 const int isipv6 = 0; 462#endif 463 464#ifdef TCPDEBUG 465 /* 466 * The size of tcp_saveipgen must be the size of the max ip header, 467 * now IPv6. 468 */ 469 u_char tcp_saveipgen[40]; 470 struct tcphdr tcp_savetcp; 471 short ostate = 0; 472#endif 473 474#ifdef INET6 475 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; 476#endif 477 bzero((char *)&to, sizeof(to)); 478 479 tcpstat.tcps_rcvtotal++; 480 481 if (isipv6) { 482#ifdef INET6 483 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ 484 ip6 = mtod(m, struct ip6_hdr *); 485 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; 486 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { 487 tcpstat.tcps_rcvbadsum++; 488 goto drop; 489 } 490 th = (struct tcphdr *)((caddr_t)ip6 + off0); 491 492 /* 493 * Be proactive about unspecified IPv6 address in source. 
494 * As we use all-zero to indicate unbounded/unconnected pcb, 495 * unspecified IPv6 address can be used to confuse us. 496 * 497 * Note that packets with unspecified IPv6 destination is 498 * already dropped in ip6_input. 499 */ 500 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 501 /* XXX stat */ 502 goto drop; 503 } 504#else 505 th = NULL; /* XXX: avoid compiler warning */ 506#endif 507 } else { 508 /* 509 * Get IP and TCP header together in first mbuf. 510 * Note: IP leaves IP header in first mbuf. 511 */ 512 if (off0 > sizeof (struct ip)) { 513 ip_stripoptions(m, (struct mbuf *)0); 514 off0 = sizeof(struct ip); 515 } 516 if (m->m_len < sizeof (struct tcpiphdr)) { 517 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) 518 == NULL) { 519 tcpstat.tcps_rcvshort++; 520 return; 521 } 522 } 523 ip = mtod(m, struct ip *); 524 ipov = (struct ipovly *)ip; 525 th = (struct tcphdr *)((caddr_t)ip + off0); 526 tlen = ip->ip_len; 527 528 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 529 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 530 th->th_sum = m->m_pkthdr.csum_data; 531 else 532 th->th_sum = in_pseudo(ip->ip_src.s_addr, 533 ip->ip_dst.s_addr, 534 htonl(m->m_pkthdr.csum_data + 535 ip->ip_len + 536 IPPROTO_TCP)); 537 th->th_sum ^= 0xffff; 538#ifdef TCPDEBUG 539 ipov->ih_len = (u_short)tlen; 540 ipov->ih_len = htons(ipov->ih_len); 541#endif 542 } else { 543 /* 544 * Checksum extended TCP header and data. 545 */ 546 len = sizeof (struct ip) + tlen; 547 bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 548 ipov->ih_len = (u_short)tlen; 549 ipov->ih_len = htons(ipov->ih_len); 550 th->th_sum = in_cksum(m, len); 551 } 552 if (th->th_sum) { 553 tcpstat.tcps_rcvbadsum++; 554 goto drop; 555 } 556 /* Re-initialization for later version check */ 557 ip->ip_v = IPVERSION; 558 } 559 560 /* 561 * Check that TCP offset makes sense, 562 * pull out TCP options and adjust length. 
XXX 563 */ 564 off = th->th_off << 2; 565 if (off < sizeof (struct tcphdr) || off > tlen) { 566 tcpstat.tcps_rcvbadoff++; 567 goto drop; 568 } 569 tlen -= off; /* tlen is used instead of ti->ti_len */ 570 if (off > sizeof (struct tcphdr)) { 571 if (isipv6) { 572#ifdef INET6 573 IP6_EXTHDR_CHECK(m, off0, off, ); 574 ip6 = mtod(m, struct ip6_hdr *); 575 th = (struct tcphdr *)((caddr_t)ip6 + off0); 576#endif 577 } else { 578 if (m->m_len < sizeof(struct ip) + off) { 579 if ((m = m_pullup(m, sizeof (struct ip) + off)) 580 == NULL) { 581 tcpstat.tcps_rcvshort++; 582 return; 583 } 584 ip = mtod(m, struct ip *); 585 ipov = (struct ipovly *)ip; 586 th = (struct tcphdr *)((caddr_t)ip + off0); 587 } 588 } 589 optlen = off - sizeof (struct tcphdr); 590 optp = (u_char *)(th + 1); 591 } 592 thflags = th->th_flags; 593 594 /* 595 * If the drop_synfin option is enabled, drop all packets with 596 * both the SYN and FIN bits set. This prevents e.g. nmap from 597 * identifying the TCP/IP stack. 598 * 599 * This is a violation of the TCP specification. 600 */ 601 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) 602 goto drop; 603 604 /* 605 * Convert TCP protocol specific fields to host format. 606 */ 607 th->th_seq = ntohl(th->th_seq); 608 th->th_ack = ntohl(th->th_ack); 609 th->th_win = ntohs(th->th_win); 610 th->th_urp = ntohs(th->th_urp); 611 612 /* 613 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. 614 */ 615 drop_hdrlen = off0 + off; 616 617 /* 618 * Locate pcb for segment. 619 */ 620 INP_INFO_WLOCK(&tcbinfo); 621 headlocked = 1; 622findpcb: 623 KASSERT(headlocked, ("tcp_input: findpcb: head not locked")); 624#ifdef IPFIREWALL_FORWARD 625 /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
*/ 626 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 627 628 if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ 629 struct sockaddr_in *next_hop; 630 631 next_hop = (struct sockaddr_in *)(fwd_tag+1); 632 /* 633 * Transparently forwarded. Pretend to be the destination. 634 * already got one like this? 635 */ 636 inp = in_pcblookup_hash(&tcbinfo, 637 ip->ip_src, th->th_sport, 638 ip->ip_dst, th->th_dport, 639 0, m->m_pkthdr.rcvif); 640 if (!inp) { 641 /* It's new. Try to find the ambushing socket. */ 642 inp = in_pcblookup_hash(&tcbinfo, 643 ip->ip_src, th->th_sport, 644 next_hop->sin_addr, 645 next_hop->sin_port ? 646 ntohs(next_hop->sin_port) : 647 th->th_dport, 648 INPLOOKUP_WILDCARD, 649 m->m_pkthdr.rcvif); 650 } 651 /* Remove the tag from the packet. We don't need it anymore. */ 652 m_tag_delete(m, fwd_tag); 653 } else 654#endif /* IPFIREWALL_FORWARD */ 655 { 656 if (isipv6) { 657#ifdef INET6 658 inp = in6_pcblookup_hash(&tcbinfo, 659 &ip6->ip6_src, th->th_sport, 660 &ip6->ip6_dst, th->th_dport, 661 INPLOOKUP_WILDCARD, 662 m->m_pkthdr.rcvif); 663#endif 664 } else 665 inp = in_pcblookup_hash(&tcbinfo, 666 ip->ip_src, th->th_sport, 667 ip->ip_dst, th->th_dport, 668 INPLOOKUP_WILDCARD, 669 m->m_pkthdr.rcvif); 670 } 671 672#if defined(IPSEC) || defined(FAST_IPSEC) 673#ifdef INET6 674 if (isipv6 && inp != NULL && ipsec6_in_reject(m, inp)) { 675#ifdef IPSEC 676 ipsec6stat.in_polvio++; 677#endif 678 goto drop; 679 } else 680#endif /* INET6 */ 681 if (inp != NULL && ipsec4_in_reject(m, inp)) { 682#ifdef IPSEC 683 ipsecstat.in_polvio++; 684#endif 685 goto drop; 686 } 687#endif /*IPSEC || FAST_IPSEC*/ 688 689 /* 690 * If the INPCB does not exist then all data in the incoming 691 * segment is discarded and an appropriate RST is sent back. 692 */ 693 if (inp == NULL) { 694 /* 695 * Log communication attempts to ports that are not 696 * in use. 
697 */ 698 if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || 699 tcp_log_in_vain == 2) { 700#ifndef INET6 701 char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; 702#else 703 char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2]; 704 if (isipv6) { 705 strcpy(dbuf, "["); 706 strcat(dbuf, 707 ip6_sprintf(ip6buf, &ip6->ip6_dst)); 708 strcat(dbuf, "]"); 709 strcpy(sbuf, "["); 710 strcat(sbuf, 711 ip6_sprintf(ip6buf, &ip6->ip6_src)); 712 strcat(sbuf, "]"); 713 } else 714#endif /* INET6 */ 715 { 716 strcpy(dbuf, inet_ntoa(ip->ip_dst)); 717 strcpy(sbuf, inet_ntoa(ip->ip_src)); 718 } 719 log(LOG_INFO, 720 "Connection attempt to TCP %s:%d " 721 "from %s:%d flags:0x%02x\n", 722 dbuf, ntohs(th->th_dport), sbuf, 723 ntohs(th->th_sport), thflags); 724 } 725 /* 726 * When blackholing do not respond with a RST but 727 * completely ignore the segment and drop it. 728 */ 729 if ((blackhole == 1 && (thflags & TH_SYN)) || 730 blackhole == 2) 731 goto drop; 732 733 rstreason = BANDLIM_RST_CLOSEDPORT; 734 goto dropwithreset; 735 } 736 INP_LOCK(inp); 737 738 /* Check the minimum TTL for socket. */ 739 if (inp->inp_ip_minttl != 0) { 740#ifdef INET6 741 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) 742 goto drop; 743 else 744#endif 745 if (inp->inp_ip_minttl > ip->ip_ttl) 746 goto drop; 747 } 748 749 /* 750 * A previous connection in TIMEWAIT state is supposed to catch 751 * stray or duplicate segments arriving late. If this segment 752 * was a legitimate new connection attempt the old INPCB gets 753 * removed and we can try again to find a listening socket. 754 */ 755 if (inp->inp_vflag & INP_TIMEWAIT) { 756 if (thflags & TH_SYN) 757 tcp_dooptions(&to, optp, optlen, TO_SYN); 758 if (tcp_timewait(inp, &to, th, m, tlen)) 759 goto findpcb; 760 /* tcp_timewait unlocks inp. */ 761 INP_INFO_WUNLOCK(&tcbinfo); 762 return; 763 } 764 /* 765 * The TCPCB may no longer exist if the connection is winding 766 * down or it is in the CLOSED state. 
Either way we drop the 767 * segment and send an appropriate response. 768 */ 769 tp = intotcpcb(inp); 770 if (tp == NULL) { 771 INP_UNLOCK(inp); 772 rstreason = BANDLIM_RST_CLOSEDPORT; 773 goto dropwithreset; 774 } 775 if (tp->t_state == TCPS_CLOSED) 776 goto drop; 777 778#ifdef MAC 779 INP_LOCK_ASSERT(inp); 780 if (mac_check_inpcb_deliver(inp, m)) 781 goto drop; 782#endif 783 so = inp->inp_socket; 784 KASSERT(so != NULL, ("tcp_input: so == NULL")); 785#ifdef TCPDEBUG 786 if (so->so_options & SO_DEBUG) { 787 ostate = tp->t_state; 788 if (isipv6) 789 bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); 790 else 791 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); 792 tcp_savetcp = *th; 793 } 794#endif 795 /* 796 * When the socket is accepting connections (the INPCB is in LISTEN 797 * state) we look into the SYN cache if this is a new connection 798 * attempt or the completion of a previous one. 799 */ 800 if (so->so_options & SO_ACCEPTCONN) { 801 struct in_conninfo inc; 802 803 bzero(&inc, sizeof(inc)); 804 inc.inc_isipv6 = isipv6; 805 if (isipv6) { 806 inc.inc6_faddr = ip6->ip6_src; 807 inc.inc6_laddr = ip6->ip6_dst; 808 } else { 809 inc.inc_faddr = ip->ip_src; 810 inc.inc_laddr = ip->ip_dst; 811 } 812 inc.inc_fport = th->th_sport; 813 inc.inc_lport = th->th_dport; 814 815 /* 816 * If the state is LISTEN then ignore segment if it contains 817 * a RST. If the segment contains an ACK then it is bad and 818 * send a RST. If it does not contain a SYN then it is not 819 * interesting; drop it. 820 * 821 * If the state is SYN_RECEIVED (syncache) and seg contains 822 * an ACK, but not for our SYN/ACK, send a RST. If the seg 823 * contains a RST, check the sequence number to see if it 824 * is a valid reset segment. 825 */ 826 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 827 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { 828 /* 829 * Parse the TCP options here because 830 * syncookies need access to the reflected 831 * timestamp. 
832 */ 833 tcp_dooptions(&to, optp, optlen, 0); 834 if (!syncache_expand(&inc, &to, th, &so, m)) { 835 /* 836 * No syncache entry, or ACK was not 837 * for our SYN/ACK. Send a RST. 838 */ 839 tcpstat.tcps_badsyn++; 840 rstreason = BANDLIM_RST_OPENPORT; 841 goto dropwithreset; 842 } 843 if (so == NULL) { 844 /* 845 * Could not complete 3-way handshake, 846 * connection is being closed down, and 847 * syncache has free'd mbuf. 848 */ 849 INP_UNLOCK(inp); 850 INP_INFO_WUNLOCK(&tcbinfo); 851 return; 852 } 853 /* 854 * Socket is created in state SYN_RECEIVED. 855 * Continue processing segment. 856 */ 857 INP_UNLOCK(inp); 858 inp = sotoinpcb(so); 859 INP_LOCK(inp); 860 tp = intotcpcb(inp); 861 /* 862 * This is what would have happened in 863 * tcp_output() when the SYN,ACK was sent. 864 */ 865 tp->snd_up = tp->snd_una; 866 tp->snd_max = tp->snd_nxt = tp->iss + 1; 867 tp->last_ack_sent = tp->rcv_nxt; 868 goto after_listen; 869 } 870 if (thflags & TH_RST) { 871 syncache_chkrst(&inc, th); 872 goto drop; 873 } 874 if (thflags & TH_ACK) { 875 syncache_badack(&inc); 876 tcpstat.tcps_badsyn++; 877 rstreason = BANDLIM_RST_OPENPORT; 878 goto dropwithreset; 879 } 880 goto drop; 881 } 882 883 /* 884 * Segment's flags are (SYN) or (SYN|FIN). 885 */ 886#ifdef INET6 887 /* 888 * If deprecated address is forbidden, 889 * we do not accept SYN to deprecated interface 890 * address to prevent any new inbound connection from 891 * getting established. 892 * When we do not accept SYN, we send a TCP RST, 893 * with deprecated source address (instead of dropping 894 * it). We compromise it as it is much better for peer 895 * to send a RST, and RST will be the final packet 896 * for the exchange. 897 * 898 * If we do not forbid deprecated addresses, we accept 899 * the SYN packet. RFC2462 does not suggest dropping 900 * SYN in this case. 901 * If we decipher RFC2462 5.5.4, it says like this: 902 * 1. 
use of deprecated addr with existing 903 * communication is okay - "SHOULD continue to be 904 * used" 905 * 2. use of it with new communication: 906 * (2a) "SHOULD NOT be used if alternate address 907 * with sufficient scope is available" 908 * (2b) nothing mentioned otherwise. 909 * Here we fall into (2b) case as we have no choice in 910 * our source address selection - we must obey the peer. 911 * 912 * The wording in RFC2462 is confusing, and there are 913 * multiple description text for deprecated address 914 * handling - worse, they are not exactly the same. 915 * I believe 5.5.4 is the best one, so we follow 5.5.4. 916 */ 917 if (isipv6 && !ip6_use_deprecated) { 918 struct in6_ifaddr *ia6; 919 920 if ((ia6 = ip6_getdstifaddr(m)) && 921 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 922 INP_UNLOCK(inp); 923 tp = NULL; 924 rstreason = BANDLIM_RST_OPENPORT; 925 goto dropwithreset; 926 } 927 } 928#endif 929 /* 930 * Basic sanity checks on incoming SYN requests: 931 * 932 * Don't bother responding if the destination was a 933 * broadcast according to RFC1122 4.2.3.10, p. 104. 934 * 935 * If it is from this socket, drop it, it must be forged. 936 * 937 * Note that it is quite possible to receive unicast 938 * link-layer packets with a broadcast IP address. Use 939 * in_broadcast() to find them. 940 */ 941 if (m->m_flags & (M_BCAST|M_MCAST)) 942 goto drop; 943 if (isipv6) { 944#ifdef INET6 945 if (th->th_dport == th->th_sport && 946 IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) 947 goto drop; 948 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 949 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 950 goto drop; 951#endif 952 } else { 953 if (th->th_dport == th->th_sport && 954 ip->ip_dst.s_addr == ip->ip_src.s_addr) 955 goto drop; 956 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 957 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 958 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 959 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 960 goto drop; 961 } 962 /* 963 * SYN appears to be valid. 
Create compressed TCP state 964 * for syncache. 965 */ 966 if (so->so_qlen <= so->so_qlimit) { 967#ifdef TCPDEBUG 968 if (so->so_options & SO_DEBUG) 969 tcp_trace(TA_INPUT, ostate, tp, 970 (void *)tcp_saveipgen, &tcp_savetcp, 0); 971#endif 972 tcp_dooptions(&to, optp, optlen, TO_SYN); 973 if (!syncache_add(&inc, &to, th, inp, &so, m)) 974 goto drop; 975 /* 976 * Entry added to syncache, mbuf used to 977 * send SYN-ACK packet. Everything unlocked 978 * already. 979 */ 980 return; 981 } 982 /* Catch all. Everthing that makes it down here is junk. */ 983 goto drop; 984 } 985 986after_listen: 987 KASSERT(headlocked, ("tcp_input: after_listen: head not locked")); 988 INP_LOCK_ASSERT(inp); 989 990 /* Syncache takes care of sockets in the listen state. */ 991 KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN")); 992 993 /* 994 * Segment received on connection. 995 * Reset idle time and keep-alive timer. 996 */ 997 tp->t_rcvtime = ticks; 998 if (TCPS_HAVEESTABLISHED(tp->t_state)) 999 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); 1000 1001 /* 1002 * Unscale the window into a 32-bit value. 1003 * This value is bogus for the TCPS_SYN_SENT state 1004 * and is overwritten later. 1005 */ 1006 tiwin = th->th_win << tp->snd_scale; 1007 1008 /* 1009 * Parse options on any incoming segment. 1010 */ 1011 tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); 1012 1013 /* 1014 * If echoed timestamp is later than the current time, 1015 * fall back to non RFC1323 RTT calculation. Normalize 1016 * timestamp if syncookies were used when this connection 1017 * was established. 1018 */ 1019 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 1020 to.to_tsecr -= tp->ts_offset; 1021 if (TSTMP_GT(to.to_tsecr, ticks)) 1022 to.to_tsecr = 0; 1023 } 1024 1025 /* 1026 * Process options only when we get SYN/ACK back. The SYN case 1027 * for incoming connections is handled in tcp_syncache. 1028 * XXX this is traditional behavior, may need to be cleaned up. 
1029 */ 1030 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 1031 if ((to.to_flags & TOF_SCALE) && 1032 (tp->t_flags & TF_REQ_SCALE)) { 1033 tp->t_flags |= TF_RCVD_SCALE; 1034 tp->snd_scale = to.to_wscale; 1035 tp->snd_wnd = th->th_win << tp->snd_scale; 1036 tiwin = tp->snd_wnd; 1037 } 1038 if (to.to_flags & TOF_TS) { 1039 tp->t_flags |= TF_RCVD_TSTMP; 1040 tp->ts_recent = to.to_tsval; 1041 tp->ts_recent_age = ticks; 1042 } 1043 /* Initial send window, already scaled. */ 1044 tp->snd_wnd = th->th_win; 1045 if (to.to_flags & TOF_MSS) 1046 tcp_mss(tp, to.to_mss); 1047 if (tp->sack_enable) { 1048 if (!(to.to_flags & TOF_SACKPERM)) 1049 tp->sack_enable = 0; 1050 else 1051 tp->t_flags |= TF_SACK_PERMIT; 1052 } 1053 1054 } 1055 1056 /* 1057 * Header prediction: check for the two common cases 1058 * of a uni-directional data xfer. If the packet has 1059 * no control flags, is in-sequence, the window didn't 1060 * change and we're not retransmitting, it's a 1061 * candidate. If the length is zero and the ack moved 1062 * forward, we're the sender side of the xfer. Just 1063 * free the data acked & wake any higher level process 1064 * that was blocked waiting for space. If the length 1065 * is non-zero and the ack didn't move, we're the 1066 * receiver side. If we're getting packets in-order 1067 * (the reassembly queue is empty), add the data to 1068 * the socket buffer and note that we need a delayed ack. 1069 * Make sure that the hidden state-flags are also off. 1070 * Since we check for TCPS_ESTABLISHED above, it can only 1071 * be TH_NEEDSYN. 
1072 */ 1073 if (tp->t_state == TCPS_ESTABLISHED && 1074 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1075 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 1076 ((to.to_flags & TOF_TS) == 0 || 1077 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && 1078 th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && 1079 tp->snd_nxt == tp->snd_max) { 1080 1081 /* 1082 * If last ACK falls within this segment's sequence numbers, 1083 * record the timestamp. 1084 * NOTE that the test is modified according to the latest 1085 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1086 */ 1087 if ((to.to_flags & TOF_TS) != 0 && 1088 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1089 tp->ts_recent_age = ticks; 1090 tp->ts_recent = to.to_tsval; 1091 } 1092 1093 if (tlen == 0) { 1094 if (SEQ_GT(th->th_ack, tp->snd_una) && 1095 SEQ_LEQ(th->th_ack, tp->snd_max) && 1096 tp->snd_cwnd >= tp->snd_wnd && 1097 ((!tcp_do_newreno && !tp->sack_enable && 1098 tp->t_dupacks < tcprexmtthresh) || 1099 ((tcp_do_newreno || tp->sack_enable) && 1100 !IN_FASTRECOVERY(tp) && 1101 (to.to_flags & TOF_SACK) == 0 && 1102 TAILQ_EMPTY(&tp->snd_holes)))) { 1103 KASSERT(headlocked, ("headlocked")); 1104 INP_INFO_WUNLOCK(&tcbinfo); 1105 headlocked = 0; 1106 /* 1107 * this is a pure ack for outstanding data. 1108 */ 1109 ++tcpstat.tcps_predack; 1110 /* 1111 * "bad retransmit" recovery 1112 */ 1113 if (tp->t_rxtshift == 1 && 1114 ticks < tp->t_badrxtwin) { 1115 ++tcpstat.tcps_sndrexmitbad; 1116 tp->snd_cwnd = tp->snd_cwnd_prev; 1117 tp->snd_ssthresh = 1118 tp->snd_ssthresh_prev; 1119 tp->snd_recover = tp->snd_recover_prev; 1120 if (tp->t_flags & TF_WASFRECOVERY) 1121 ENTER_FASTRECOVERY(tp); 1122 tp->snd_nxt = tp->snd_max; 1123 tp->t_badrxtwin = 0; 1124 } 1125 1126 /* 1127 * Recalculate the transmit timer / rtt. 
1128 * 1129 * Some boxes send broken timestamp replies 1130 * during the SYN+ACK phase, ignore 1131 * timestamps of 0 or we could calculate a 1132 * huge RTT and blow up the retransmit timer. 1133 */ 1134 if ((to.to_flags & TOF_TS) != 0 && 1135 to.to_tsecr) { 1136 if (!tp->t_rttlow || 1137 tp->t_rttlow > ticks - to.to_tsecr) 1138 tp->t_rttlow = ticks - to.to_tsecr; 1139 tcp_xmit_timer(tp, 1140 ticks - to.to_tsecr + 1); 1141 } else if (tp->t_rtttime && 1142 SEQ_GT(th->th_ack, tp->t_rtseq)) { 1143 if (!tp->t_rttlow || 1144 tp->t_rttlow > ticks - tp->t_rtttime) 1145 tp->t_rttlow = ticks - tp->t_rtttime; 1146 tcp_xmit_timer(tp, 1147 ticks - tp->t_rtttime); 1148 } 1149 tcp_xmit_bandwidth_limit(tp, th->th_ack); 1150 acked = th->th_ack - tp->snd_una; 1151 tcpstat.tcps_rcvackpack++; 1152 tcpstat.tcps_rcvackbyte += acked; 1153 sbdrop(&so->so_snd, acked); 1154 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 1155 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1156 tp->snd_recover = th->th_ack - 1; 1157 tp->snd_una = th->th_ack; 1158 /* 1159 * pull snd_wl2 up to prevent seq wrap relative 1160 * to th_ack. 1161 */ 1162 tp->snd_wl2 = th->th_ack; 1163 tp->t_dupacks = 0; 1164 m_freem(m); 1165 ND6_HINT(tp); /* some progress has been done */ 1166 1167 /* 1168 * If all outstanding data are acked, stop 1169 * retransmit timer, otherwise restart timer 1170 * using current (possibly backed-off) value. 1171 * If process is waiting for space, 1172 * wakeup/selwakeup/signal. If data 1173 * are ready to send, let tcp_output 1174 * decide between more output or persist. 
1175 1176#ifdef TCPDEBUG 1177 if (so->so_options & SO_DEBUG) 1178 tcp_trace(TA_INPUT, ostate, tp, 1179 (void *)tcp_saveipgen, 1180 &tcp_savetcp, 0); 1181#endif 1182 */ 1183 if (tp->snd_una == tp->snd_max) 1184 callout_stop(tp->tt_rexmt); 1185 else if (!callout_active(tp->tt_persist)) 1186 callout_reset(tp->tt_rexmt, 1187 tp->t_rxtcur, 1188 tcp_timer_rexmt, tp); 1189 1190 sowwakeup(so); 1191 if (so->so_snd.sb_cc) 1192 (void) tcp_output(tp); 1193 goto check_delack; 1194 } 1195 } else if (th->th_ack == tp->snd_una && 1196 LIST_EMPTY(&tp->t_segq) && 1197 tlen <= sbspace(&so->so_rcv)) { 1198 int newsize = 0; /* automatic sockbuf scaling */ 1199 1200 KASSERT(headlocked, ("headlocked")); 1201 INP_INFO_WUNLOCK(&tcbinfo); 1202 headlocked = 0; 1203 /* 1204 * this is a pure, in-sequence data packet 1205 * with nothing on the reassembly queue and 1206 * we have enough buffer space to take it. 1207 */ 1208 /* Clean receiver SACK report if present */ 1209 if (tp->sack_enable && tp->rcv_numsacks) 1210 tcp_clean_sackreport(tp); 1211 ++tcpstat.tcps_preddat; 1212 tp->rcv_nxt += tlen; 1213 /* 1214 * Pull snd_wl1 up to prevent seq wrap relative to 1215 * th_seq. 1216 */ 1217 tp->snd_wl1 = th->th_seq; 1218 /* 1219 * Pull rcv_up up to prevent seq wrap relative to 1220 * rcv_nxt. 1221 */ 1222 tp->rcv_up = tp->rcv_nxt; 1223 tcpstat.tcps_rcvpack++; 1224 tcpstat.tcps_rcvbyte += tlen; 1225 ND6_HINT(tp); /* some progress has been done */ 1226#ifdef TCPDEBUG 1227 if (so->so_options & SO_DEBUG) 1228 tcp_trace(TA_INPUT, ostate, tp, 1229 (void *)tcp_saveipgen, &tcp_savetcp, 0); 1230#endif 1231 /* 1232 * Automatic sizing of receive socket buffer. Often the send 1233 * buffer size is not optimally adjusted to the actual network 1234 * conditions at hand (delay bandwidth product). Setting the 1235 * buffer size too small limits throughput on links with high 1236 * bandwidth and high delay (eg. trans-continental/oceanic links). 
1237 * 1238 * On the receive side the socket buffer memory is only rarely 1239 * used to any significant extent. This allows us to be much 1240 * more aggressive in scaling the receive socket buffer. For 1241 * the case that the buffer space is actually used to a large 1242 * extent and we run out of kernel memory we can simply drop 1243 * the new segments; TCP on the sender will just retransmit it 1244 * later. Setting the buffer size too big may only consume too 1245 * much kernel memory if the application doesn't read() from 1246 * the socket or packet loss or reordering makes use of the 1247 * reassembly queue. 1248 * 1249 * The criteria to step up the receive buffer one notch are: 1250 * 1. the number of bytes received during the time it takes 1251 * one timestamp to be reflected back to us (the RTT); 1252 * 2. received bytes per RTT is within seven eighth of the 1253 * current socket buffer size; 1254 * 3. receive buffer size has not hit maximal automatic size; 1255 * 1256 * This algorithm does one step per RTT at most and only if 1257 * we receive a bulk stream w/o packet losses or reorderings. 1258 * Shrinking the buffer during idle times is not necessary as 1259 * it doesn't consume any memory when idle. 1260 * 1261 * TODO: Only step up if the application is actually serving 1262 * the buffer to better manage the socket buffer resources. 1263 */ 1264 if (tcp_do_autorcvbuf && 1265 to.to_tsecr && 1266 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 1267 if (to.to_tsecr > tp->rfbuf_ts && 1268 to.to_tsecr - tp->rfbuf_ts < hz) { 1269 if (tp->rfbuf_cnt > 1270 (so->so_rcv.sb_hiwat / 8 * 7) && 1271 so->so_rcv.sb_hiwat < 1272 tcp_autorcvbuf_max) { 1273 newsize = 1274 min(so->so_rcv.sb_hiwat + 1275 tcp_autorcvbuf_inc, 1276 tcp_autorcvbuf_max); 1277 } 1278 /* Start over with next RTT. */ 1279 tp->rfbuf_ts = 0; 1280 tp->rfbuf_cnt = 0; 1281 } else 1282 tp->rfbuf_cnt += tlen; /* add up */ 1283 } 1284 1285 /* Add data to socket buffer. 
*/ 1286 SOCKBUF_LOCK(&so->so_rcv); 1287 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1288 m_freem(m); 1289 } else { 1290 /* 1291 * Set new socket buffer size. 1292 * Give up when limit is reached. 1293 */ 1294 if (newsize) 1295 if (!sbreserve_locked(&so->so_rcv, 1296 newsize, so, curthread)) 1297 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1298 m_adj(m, drop_hdrlen); /* delayed header drop */ 1299 sbappendstream_locked(&so->so_rcv, m); 1300 } 1301 sorwakeup_locked(so); 1302 if (DELAY_ACK(tp)) { 1303 tp->t_flags |= TF_DELACK; 1304 } else { 1305 tp->t_flags |= TF_ACKNOW; 1306 tcp_output(tp); 1307 } 1308 goto check_delack; 1309 } 1310 } 1311 1312 /* 1313 * Calculate amount of space in receive window, 1314 * and then do TCP input processing. 1315 * Receive window is amount of space in rcv queue, 1316 * but not less than advertised window. 1317 */ 1318 { int win; 1319 1320 win = sbspace(&so->so_rcv); 1321 if (win < 0) 1322 win = 0; 1323 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1324 } 1325 1326 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 1327 tp->rfbuf_ts = 0; 1328 tp->rfbuf_cnt = 0; 1329 1330 switch (tp->t_state) { 1331 1332 /* 1333 * If the state is SYN_RECEIVED: 1334 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 1335 */ 1336 case TCPS_SYN_RECEIVED: 1337 if ((thflags & TH_ACK) && 1338 (SEQ_LEQ(th->th_ack, tp->snd_una) || 1339 SEQ_GT(th->th_ack, tp->snd_max))) { 1340 rstreason = BANDLIM_RST_OPENPORT; 1341 goto dropwithreset; 1342 } 1343 break; 1344 1345 /* 1346 * If the state is SYN_SENT: 1347 * if seg contains an ACK, but not for our SYN, drop the input. 1348 * if seg contains a RST, then drop the connection. 1349 * if seg does not contain SYN, then drop it. 
1350 * Otherwise this is an acceptable SYN segment 1351 * initialize tp->rcv_nxt and tp->irs 1352 * if seg contains ack then advance tp->snd_una 1353 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1354 * arrange for segment to be acked (eventually) 1355 * continue processing rest of data/controls, beginning with URG 1356 */ 1357 case TCPS_SYN_SENT: 1358 if ((thflags & TH_ACK) && 1359 (SEQ_LEQ(th->th_ack, tp->iss) || 1360 SEQ_GT(th->th_ack, tp->snd_max))) { 1361 rstreason = BANDLIM_UNLIMITED; 1362 goto dropwithreset; 1363 } 1364 if (thflags & TH_RST) { 1365 if (thflags & TH_ACK) { 1366 KASSERT(headlocked, ("tcp_input: after_listen" 1367 ": tcp_drop.2: head not locked")); 1368 tp = tcp_drop(tp, ECONNREFUSED); 1369 } 1370 goto drop; 1371 } 1372 if ((thflags & TH_SYN) == 0) 1373 goto drop; 1374 1375 tp->irs = th->th_seq; 1376 tcp_rcvseqinit(tp); 1377 if (thflags & TH_ACK) { 1378 tcpstat.tcps_connects++; 1379 soisconnected(so); 1380#ifdef MAC 1381 SOCK_LOCK(so); 1382 mac_set_socket_peer_from_mbuf(m, so); 1383 SOCK_UNLOCK(so); 1384#endif 1385 /* Do window scaling on this connection? */ 1386 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1387 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1388 tp->rcv_scale = tp->request_r_scale; 1389 } 1390 tp->rcv_adv += tp->rcv_wnd; 1391 tp->snd_una++; /* SYN is acked */ 1392 /* 1393 * If there's data, delay ACK; if there's also a FIN 1394 * ACKNOW will be turned on later. 1395 */ 1396 if (DELAY_ACK(tp) && tlen != 0) 1397 callout_reset(tp->tt_delack, tcp_delacktime, 1398 tcp_timer_delack, tp); 1399 else 1400 tp->t_flags |= TF_ACKNOW; 1401 /* 1402 * Received <SYN,ACK> in SYN_SENT[*] state. 
1403 * Transitions: 1404 * SYN_SENT --> ESTABLISHED 1405 * SYN_SENT* --> FIN_WAIT_1 1406 */ 1407 tp->t_starttime = ticks; 1408 if (tp->t_flags & TF_NEEDFIN) { 1409 tp->t_state = TCPS_FIN_WAIT_1; 1410 tp->t_flags &= ~TF_NEEDFIN; 1411 thflags &= ~TH_SYN; 1412 } else { 1413 tp->t_state = TCPS_ESTABLISHED; 1414 callout_reset(tp->tt_keep, tcp_keepidle, 1415 tcp_timer_keep, tp); 1416 } 1417 } else { 1418 /* 1419 * Received initial SYN in SYN-SENT[*] state => 1420 * simultaneous open. If segment contains CC option 1421 * and there is a cached CC, apply TAO test. 1422 * If it succeeds, connection is * half-synchronized. 1423 * Otherwise, do 3-way handshake: 1424 * SYN-SENT -> SYN-RECEIVED 1425 * SYN-SENT* -> SYN-RECEIVED* 1426 * If there was no CC option, clear cached CC value. 1427 */ 1428 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 1429 callout_stop(tp->tt_rexmt); 1430 tp->t_state = TCPS_SYN_RECEIVED; 1431 } 1432 1433 KASSERT(headlocked, ("tcp_input: trimthenstep6: head not " 1434 "locked")); 1435 INP_LOCK_ASSERT(inp); 1436 1437 /* 1438 * Advance th->th_seq to correspond to first data byte. 1439 * If data, trim to stay within window, 1440 * dropping FIN if necessary. 1441 */ 1442 th->th_seq++; 1443 if (tlen > tp->rcv_wnd) { 1444 todrop = tlen - tp->rcv_wnd; 1445 m_adj(m, -todrop); 1446 tlen = tp->rcv_wnd; 1447 thflags &= ~TH_FIN; 1448 tcpstat.tcps_rcvpackafterwin++; 1449 tcpstat.tcps_rcvbyteafterwin += todrop; 1450 } 1451 tp->snd_wl1 = th->th_seq - 1; 1452 tp->rcv_up = th->th_seq; 1453 /* 1454 * Client side of transaction: already sent SYN and data. 1455 * If the remote host used T/TCP to validate the SYN, 1456 * our data will be ACK'd; if so, enter normal data segment 1457 * processing in the middle of step 5, ack processing. 1458 * Otherwise, goto step 6. 1459 */ 1460 if (thflags & TH_ACK) 1461 goto process_ACK; 1462 1463 goto step6; 1464 1465 /* 1466 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 1467 * do normal processing. 
1468 * 1469 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 1470 */ 1471 case TCPS_LAST_ACK: 1472 case TCPS_CLOSING: 1473 case TCPS_TIME_WAIT: 1474 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); 1475 break; /* continue normal processing */ 1476 } 1477 1478 /* 1479 * States other than LISTEN or SYN_SENT. 1480 * First check the RST flag and sequence number since reset segments 1481 * are exempt from the timestamp and connection count tests. This 1482 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 1483 * below which allowed reset segments in half the sequence space 1484 * to fall though and be processed (which gives forged reset 1485 * segments with a random sequence number a 50 percent chance of 1486 * killing a connection). 1487 * Then check timestamp, if present. 1488 * Then check the connection count, if present. 1489 * Then check that at least some bytes of segment are within 1490 * receive window. If segment begins before rcv_nxt, 1491 * drop leading data (and SYN); if nothing left, just ack. 1492 * 1493 * 1494 * If the RST bit is set, check the sequence number to see 1495 * if this is a valid reset segment. 1496 * RFC 793 page 37: 1497 * In all states except SYN-SENT, all reset (RST) segments 1498 * are validated by checking their SEQ-fields. A reset is 1499 * valid if its sequence number is in the window. 1500 * Note: this does not take into account delayed ACKs, so 1501 * we should test against last_ack_sent instead of rcv_nxt. 1502 * The sequence number in the reset segment is normally an 1503 * echo of our outgoing acknowlegement numbers, but some hosts 1504 * send a reset with the sequence number at the rightmost edge 1505 * of our receive window, and we have to handle this case. 1506 * Note 2: Paul Watson's paper "Slipping in the Window" has shown 1507 * that brute force RST attacks are possible. 
To combat this, 1508 * we use a much stricter check while in the ESTABLISHED state, 1509 * only accepting RSTs where the sequence number is equal to 1510 * last_ack_sent. In all other states (the states in which a 1511 * RST is more likely), the more permissive check is used. 1512 * If we have multiple segments in flight, the intial reset 1513 * segment sequence numbers will be to the left of last_ack_sent, 1514 * but they will eventually catch up. 1515 * In any case, it never made sense to trim reset segments to 1516 * fit the receive window since RFC 1122 says: 1517 * 4.2.2.12 RST Segment: RFC-793 Section 3.4 1518 * 1519 * A TCP SHOULD allow a received RST segment to include data. 1520 * 1521 * DISCUSSION 1522 * It has been suggested that a RST segment could contain 1523 * ASCII text that encoded and explained the cause of the 1524 * RST. No standard has yet been established for such 1525 * data. 1526 * 1527 * If the reset segment passes the sequence number test examine 1528 * the state: 1529 * SYN_RECEIVED STATE: 1530 * If passive open, return to LISTEN state. 1531 * If active open, inform user that connection was refused. 1532 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: 1533 * Inform user that connection was reset, and close tcb. 1534 * CLOSING, LAST_ACK STATES: 1535 * Close the tcb. 1536 * TIME_WAIT STATE: 1537 * Drop the segment - see Stevens, vol. 2, p. 964 and 1538 * RFC 1337. 
1539 */ 1540 if (thflags & TH_RST) { 1541 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 1542 SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1543 switch (tp->t_state) { 1544 1545 case TCPS_SYN_RECEIVED: 1546 so->so_error = ECONNREFUSED; 1547 goto close; 1548 1549 case TCPS_ESTABLISHED: 1550 if (tcp_insecure_rst == 0 && 1551 !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && 1552 SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && 1553 !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && 1554 SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { 1555 tcpstat.tcps_badrst++; 1556 goto drop; 1557 } 1558 case TCPS_FIN_WAIT_1: 1559 case TCPS_FIN_WAIT_2: 1560 case TCPS_CLOSE_WAIT: 1561 so->so_error = ECONNRESET; 1562 close: 1563 tp->t_state = TCPS_CLOSED; 1564 tcpstat.tcps_drops++; 1565 KASSERT(headlocked, ("tcp_input: " 1566 "trimthenstep6: tcp_close: head not " 1567 "locked")); 1568 tp = tcp_close(tp); 1569 break; 1570 1571 case TCPS_CLOSING: 1572 case TCPS_LAST_ACK: 1573 KASSERT(headlocked, ("trimthenstep6: " 1574 "tcp_close.2: head not locked")); 1575 tp = tcp_close(tp); 1576 break; 1577 1578 case TCPS_TIME_WAIT: 1579 KASSERT(tp->t_state != TCPS_TIME_WAIT, 1580 ("timewait")); 1581 break; 1582 } 1583 } 1584 goto drop; 1585 } 1586 1587 /* 1588 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1589 * and it's less than ts_recent, drop it. 1590 */ 1591 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 1592 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 1593 1594 /* Check to see if ts_recent is over 24 days old. */ 1595 if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1596 /* 1597 * Invalidate ts_recent. If this segment updates 1598 * ts_recent, the age will be reset later and ts_recent 1599 * will get a valid value. If it does not, setting 1600 * ts_recent to zero will at least satisfy the 1601 * requirement that zero be placed in the timestamp 1602 * echo reply when ts_recent isn't valid. 
The 1603 * age isn't reset until we get a valid ts_recent 1604 * because we don't want out-of-order segments to be 1605 * dropped when ts_recent is old. 1606 */ 1607 tp->ts_recent = 0; 1608 } else { 1609 tcpstat.tcps_rcvduppack++; 1610 tcpstat.tcps_rcvdupbyte += tlen; 1611 tcpstat.tcps_pawsdrop++; 1612 if (tlen) 1613 goto dropafterack; 1614 goto drop; 1615 } 1616 } 1617 1618 /* 1619 * In the SYN-RECEIVED state, validate that the packet belongs to 1620 * this connection before trimming the data to fit the receive 1621 * window. Check the sequence number versus IRS since we know 1622 * the sequence numbers haven't wrapped. This is a partial fix 1623 * for the "LAND" DoS attack. 1624 */ 1625 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 1626 rstreason = BANDLIM_RST_OPENPORT; 1627 goto dropwithreset; 1628 } 1629 1630 todrop = tp->rcv_nxt - th->th_seq; 1631 if (todrop > 0) { 1632 if (thflags & TH_SYN) { 1633 thflags &= ~TH_SYN; 1634 th->th_seq++; 1635 if (th->th_urp > 1) 1636 th->th_urp--; 1637 else 1638 thflags &= ~TH_URG; 1639 todrop--; 1640 } 1641 /* 1642 * Following if statement from Stevens, vol. 2, p. 960. 1643 */ 1644 if (todrop > tlen 1645 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1646 /* 1647 * Any valid FIN must be to the left of the window. 1648 * At this point the FIN must be a duplicate or out 1649 * of sequence; drop it. 1650 */ 1651 thflags &= ~TH_FIN; 1652 1653 /* 1654 * Send an ACK to resynchronize and drop any data. 1655 * But keep on processing for RST or ACK. 
1656 */ 1657 tp->t_flags |= TF_ACKNOW; 1658 todrop = tlen; 1659 tcpstat.tcps_rcvduppack++; 1660 tcpstat.tcps_rcvdupbyte += todrop; 1661 } else { 1662 tcpstat.tcps_rcvpartduppack++; 1663 tcpstat.tcps_rcvpartdupbyte += todrop; 1664 } 1665 drop_hdrlen += todrop; /* drop from the top afterwards */ 1666 th->th_seq += todrop; 1667 tlen -= todrop; 1668 if (th->th_urp > todrop) 1669 th->th_urp -= todrop; 1670 else { 1671 thflags &= ~TH_URG; 1672 th->th_urp = 0; 1673 } 1674 } 1675 1676 /* 1677 * If new data are received on a connection after the 1678 * user processes are gone, then RST the other end. 1679 */ 1680 if ((so->so_state & SS_NOFDREF) && 1681 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1682 KASSERT(headlocked, ("trimthenstep6: tcp_close.3: head not " 1683 "locked")); 1684 tp = tcp_close(tp); 1685 tcpstat.tcps_rcvafterclose++; 1686 rstreason = BANDLIM_UNLIMITED; 1687 goto dropwithreset; 1688 } 1689 1690 /* 1691 * If segment ends after window, drop trailing data 1692 * (and PUSH and FIN); if nothing left, just ACK. 1693 */ 1694 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1695 if (todrop > 0) { 1696 tcpstat.tcps_rcvpackafterwin++; 1697 if (todrop >= tlen) { 1698 tcpstat.tcps_rcvbyteafterwin += tlen; 1699 /* 1700 * If a new connection request is received 1701 * while in TIME_WAIT, drop the old connection 1702 * and start over if the sequence numbers 1703 * are above the previous ones. 1704 */ 1705 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); 1706 if (thflags & TH_SYN && 1707 tp->t_state == TCPS_TIME_WAIT && 1708 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1709 KASSERT(headlocked, ("trimthenstep6: " 1710 "tcp_close.4: head not locked")); 1711 tp = tcp_close(tp); 1712 goto findpcb; 1713 } 1714 /* 1715 * If window is closed can only take segments at 1716 * window edge, and have to drop data and PUSH from 1717 * incoming segments. Continue processing, but 1718 * remember to ack. Otherwise, drop segment 1719 * and ack. 
1720 */ 1721 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1722 tp->t_flags |= TF_ACKNOW; 1723 tcpstat.tcps_rcvwinprobe++; 1724 } else 1725 goto dropafterack; 1726 } else 1727 tcpstat.tcps_rcvbyteafterwin += todrop; 1728 m_adj(m, -todrop); 1729 tlen -= todrop; 1730 thflags &= ~(TH_PUSH|TH_FIN); 1731 } 1732 1733 /* 1734 * If last ACK falls within this segment's sequence numbers, 1735 * record its timestamp. 1736 * NOTE: 1737 * 1) That the test incorporates suggestions from the latest 1738 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1739 * 2) That updating only on newer timestamps interferes with 1740 * our earlier PAWS tests, so this check should be solely 1741 * predicated on the sequence space of this segment. 1742 * 3) That we modify the segment boundary check to be 1743 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 1744 * instead of RFC1323's 1745 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 1746 * This modified check allows us to overcome RFC1323's 1747 * limitations as described in Stevens TCP/IP Illustrated 1748 * Vol. 2 p.869. In such cases, we can still calculate the 1749 * RTT correctly when RCV.NXT == Last.ACK.Sent. 1750 */ 1751 if ((to.to_flags & TOF_TS) != 0 && 1752 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 1753 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 1754 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 1755 tp->ts_recent_age = ticks; 1756 tp->ts_recent = to.to_tsval; 1757 } 1758 1759 /* 1760 * If a SYN is in the window, then this is an 1761 * error and we send an RST and drop the connection. 1762 */ 1763 if (thflags & TH_SYN) { 1764 KASSERT(headlocked, ("tcp_input: tcp_drop: trimthenstep6: " 1765 "head not locked")); 1766 tp = tcp_drop(tp, ECONNRESET); 1767 rstreason = BANDLIM_UNLIMITED; 1768 goto drop; 1769 } 1770 1771 /* 1772 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 1773 * flag is on (half-synchronized state), then queue data for 1774 * later processing; else drop segment and return. 
1775 */ 1776 if ((thflags & TH_ACK) == 0) { 1777 if (tp->t_state == TCPS_SYN_RECEIVED || 1778 (tp->t_flags & TF_NEEDSYN)) 1779 goto step6; 1780 else if (tp->t_flags & TF_ACKNOW) 1781 goto dropafterack; 1782 else 1783 goto drop; 1784 } 1785 1786 /* 1787 * Ack processing. 1788 */ 1789 switch (tp->t_state) { 1790 1791 /* 1792 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1793 * ESTABLISHED state and continue processing. 1794 * The ACK was checked above. 1795 */ 1796 case TCPS_SYN_RECEIVED: 1797 1798 tcpstat.tcps_connects++; 1799 soisconnected(so); 1800 /* Do window scaling? */ 1801 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1802 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1803 tp->rcv_scale = tp->request_r_scale; 1804 tp->snd_wnd = tiwin; 1805 } 1806 /* 1807 * Make transitions: 1808 * SYN-RECEIVED -> ESTABLISHED 1809 * SYN-RECEIVED* -> FIN-WAIT-1 1810 */ 1811 tp->t_starttime = ticks; 1812 if (tp->t_flags & TF_NEEDFIN) { 1813 tp->t_state = TCPS_FIN_WAIT_1; 1814 tp->t_flags &= ~TF_NEEDFIN; 1815 } else { 1816 tp->t_state = TCPS_ESTABLISHED; 1817 callout_reset(tp->tt_keep, tcp_keepidle, 1818 tcp_timer_keep, tp); 1819 } 1820 /* 1821 * If segment contains data or ACK, will call tcp_reass() 1822 * later; if not, do so now to pass queued data to user. 1823 */ 1824 if (tlen == 0 && (thflags & TH_FIN) == 0) 1825 (void) tcp_reass(tp, (struct tcphdr *)0, 0, 1826 (struct mbuf *)0); 1827 tp->snd_wl1 = th->th_seq - 1; 1828 /* FALLTHROUGH */ 1829 1830 /* 1831 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1832 * ACKs. If the ack is in the range 1833 * tp->snd_una < th->th_ack <= tp->snd_max 1834 * then advance tp->snd_una to th->th_ack and drop 1835 * data from the retransmission queue. If this ACK reflects 1836 * more up to date window information we update our window information. 
1837 */ 1838 case TCPS_ESTABLISHED: 1839 case TCPS_FIN_WAIT_1: 1840 case TCPS_FIN_WAIT_2: 1841 case TCPS_CLOSE_WAIT: 1842 case TCPS_CLOSING: 1843 case TCPS_LAST_ACK: 1844 case TCPS_TIME_WAIT: 1845 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); 1846 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1847 tcpstat.tcps_rcvacktoomuch++; 1848 goto dropafterack; 1849 } 1850 if (tp->sack_enable && 1851 ((to.to_flags & TOF_SACK) || 1852 !TAILQ_EMPTY(&tp->snd_holes))) 1853 tcp_sack_doack(tp, &to, th->th_ack); 1854 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1855 if (tlen == 0 && tiwin == tp->snd_wnd) { 1856 tcpstat.tcps_rcvdupack++; 1857 /* 1858 * If we have outstanding data (other than 1859 * a window probe), this is a completely 1860 * duplicate ack (ie, window info didn't 1861 * change), the ack is the biggest we've 1862 * seen and we've seen exactly our rexmt 1863 * threshhold of them, assume a packet 1864 * has been dropped and retransmit it. 1865 * Kludge snd_nxt & the congestion 1866 * window so we send only this one 1867 * packet. 1868 * 1869 * We know we're losing at the current 1870 * window size so do congestion avoidance 1871 * (set ssthresh to half the current window 1872 * and pull our congestion window back to 1873 * the new ssthresh). 1874 * 1875 * Dup acks mean that packets have left the 1876 * network (they're now cached at the receiver) 1877 * so bump cwnd by the amount in the receiver 1878 * to keep a constant cwnd packets in the 1879 * network. 1880 */ 1881 if (!callout_active(tp->tt_rexmt) || 1882 th->th_ack != tp->snd_una) 1883 tp->t_dupacks = 0; 1884 else if (++tp->t_dupacks > tcprexmtthresh || 1885 ((tcp_do_newreno || tp->sack_enable) && 1886 IN_FASTRECOVERY(tp))) { 1887 if (tp->sack_enable && IN_FASTRECOVERY(tp)) { 1888 int awnd; 1889 1890 /* 1891 * Compute the amount of data in flight first. 1892 * We can inject new data into the pipe iff 1893 * we have less than 1/2 the original window's 1894 * worth of data in flight. 
1895 */ 1896 awnd = (tp->snd_nxt - tp->snd_fack) + 1897 tp->sackhint.sack_bytes_rexmit; 1898 if (awnd < tp->snd_ssthresh) { 1899 tp->snd_cwnd += tp->t_maxseg; 1900 if (tp->snd_cwnd > tp->snd_ssthresh) 1901 tp->snd_cwnd = tp->snd_ssthresh; 1902 } 1903 } else 1904 tp->snd_cwnd += tp->t_maxseg; 1905 (void) tcp_output(tp); 1906 goto drop; 1907 } else if (tp->t_dupacks == tcprexmtthresh) { 1908 tcp_seq onxt = tp->snd_nxt; 1909 u_int win; 1910 1911 /* 1912 * If we're doing sack, check to 1913 * see if we're already in sack 1914 * recovery. If we're not doing sack, 1915 * check to see if we're in newreno 1916 * recovery. 1917 */ 1918 if (tp->sack_enable) { 1919 if (IN_FASTRECOVERY(tp)) { 1920 tp->t_dupacks = 0; 1921 break; 1922 } 1923 } else if (tcp_do_newreno) { 1924 if (SEQ_LEQ(th->th_ack, 1925 tp->snd_recover)) { 1926 tp->t_dupacks = 0; 1927 break; 1928 } 1929 } 1930 win = min(tp->snd_wnd, tp->snd_cwnd) / 1931 2 / tp->t_maxseg; 1932 if (win < 2) 1933 win = 2; 1934 tp->snd_ssthresh = win * tp->t_maxseg; 1935 ENTER_FASTRECOVERY(tp); 1936 tp->snd_recover = tp->snd_max; 1937 callout_stop(tp->tt_rexmt); 1938 tp->t_rtttime = 0; 1939 if (tp->sack_enable) { 1940 tcpstat.tcps_sack_recovery_episode++; 1941 tp->sack_newdata = tp->snd_nxt; 1942 tp->snd_cwnd = tp->t_maxseg; 1943 (void) tcp_output(tp); 1944 goto drop; 1945 } 1946 tp->snd_nxt = th->th_ack; 1947 tp->snd_cwnd = tp->t_maxseg; 1948 (void) tcp_output(tp); 1949 KASSERT(tp->snd_limited <= 2, 1950 ("tp->snd_limited too big")); 1951 tp->snd_cwnd = tp->snd_ssthresh + 1952 tp->t_maxseg * 1953 (tp->t_dupacks - tp->snd_limited); 1954 if (SEQ_GT(onxt, tp->snd_nxt)) 1955 tp->snd_nxt = onxt; 1956 goto drop; 1957 } else if (tcp_do_rfc3042) { 1958 u_long oldcwnd = tp->snd_cwnd; 1959 tcp_seq oldsndmax = tp->snd_max; 1960 u_int sent; 1961 1962 KASSERT(tp->t_dupacks == 1 || 1963 tp->t_dupacks == 2, 1964 ("dupacks not 1 or 2")); 1965 if (tp->t_dupacks == 1) 1966 tp->snd_limited = 0; 1967 tp->snd_cwnd = 1968 (tp->snd_nxt - tp->snd_una) + 
1969 (tp->t_dupacks - tp->snd_limited) * 1970 tp->t_maxseg; 1971 (void) tcp_output(tp); 1972 sent = tp->snd_max - oldsndmax; 1973 if (sent > tp->t_maxseg) { 1974 KASSERT((tp->t_dupacks == 2 && 1975 tp->snd_limited == 0) || 1976 (sent == tp->t_maxseg + 1 && 1977 tp->t_flags & TF_SENTFIN), 1978 ("sent too much")); 1979 tp->snd_limited = 2; 1980 } else if (sent > 0) 1981 ++tp->snd_limited; 1982 tp->snd_cwnd = oldcwnd; 1983 goto drop; 1984 } 1985 } else 1986 tp->t_dupacks = 0; 1987 break; 1988 } 1989 1990 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("th_ack <= snd_una")); 1991 1992 /* 1993 * If the congestion window was inflated to account 1994 * for the other side's cached packets, retract it. 1995 */ 1996 if (tcp_do_newreno || tp->sack_enable) { 1997 if (IN_FASTRECOVERY(tp)) { 1998 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 1999 if (tp->sack_enable) 2000 tcp_sack_partialack(tp, th); 2001 else 2002 tcp_newreno_partial_ack(tp, th); 2003 } else { 2004 /* 2005 * Out of fast recovery. 2006 * Window inflation should have left us 2007 * with approximately snd_ssthresh 2008 * outstanding data. 2009 * But in case we would be inclined to 2010 * send a burst, better to do it via 2011 * the slow start mechanism. 2012 */ 2013 if (SEQ_GT(th->th_ack + 2014 tp->snd_ssthresh, 2015 tp->snd_max)) 2016 tp->snd_cwnd = tp->snd_max - 2017 th->th_ack + 2018 tp->t_maxseg; 2019 else 2020 tp->snd_cwnd = tp->snd_ssthresh; 2021 } 2022 } 2023 } else { 2024 if (tp->t_dupacks >= tcprexmtthresh && 2025 tp->snd_cwnd > tp->snd_ssthresh) 2026 tp->snd_cwnd = tp->snd_ssthresh; 2027 } 2028 tp->t_dupacks = 0; 2029 /* 2030 * If we reach this point, ACK is not a duplicate, 2031 * i.e., it ACKs something we sent. 2032 */ 2033 if (tp->t_flags & TF_NEEDSYN) { 2034 /* 2035 * T/TCP: Connection was half-synchronized, and our 2036 * SYN has been ACK'd (so connection is now fully 2037 * synchronized). Go to non-starred state, 2038 * increment snd_una for ACK of SYN, and check if 2039 * we can do window scaling. 
2040 */ 2041 tp->t_flags &= ~TF_NEEDSYN; 2042 tp->snd_una++; 2043 /* Do window scaling? */ 2044 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2045 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2046 tp->rcv_scale = tp->request_r_scale; 2047 /* Send window already scaled. */ 2048 } 2049 } 2050 2051process_ACK: 2052 KASSERT(headlocked, ("tcp_input: process_ACK: head not " 2053 "locked")); 2054 INP_LOCK_ASSERT(inp); 2055 2056 acked = th->th_ack - tp->snd_una; 2057 tcpstat.tcps_rcvackpack++; 2058 tcpstat.tcps_rcvackbyte += acked; 2059 2060 /* 2061 * If we just performed our first retransmit, and the ACK 2062 * arrives within our recovery window, then it was a mistake 2063 * to do the retransmit in the first place. Recover our 2064 * original cwnd and ssthresh, and proceed to transmit where 2065 * we left off. 2066 */ 2067 if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { 2068 ++tcpstat.tcps_sndrexmitbad; 2069 tp->snd_cwnd = tp->snd_cwnd_prev; 2070 tp->snd_ssthresh = tp->snd_ssthresh_prev; 2071 tp->snd_recover = tp->snd_recover_prev; 2072 if (tp->t_flags & TF_WASFRECOVERY) 2073 ENTER_FASTRECOVERY(tp); 2074 tp->snd_nxt = tp->snd_max; 2075 tp->t_badrxtwin = 0; /* XXX probably not required */ 2076 } 2077 2078 /* 2079 * If we have a timestamp reply, update smoothed 2080 * round trip time. If no timestamp is present but 2081 * transmit timer is running and timed sequence 2082 * number was acked, update smoothed round trip time. 2083 * Since we now have an rtt measurement, cancel the 2084 * timer backoff (cf., Phil Karn's retransmit alg.). 2085 * Recompute the initial retransmit timer. 2086 * 2087 * Some boxes send broken timestamp replies 2088 * during the SYN+ACK phase, ignore 2089 * timestamps of 0 or we could calculate a 2090 * huge RTT and blow up the retransmit timer. 
2091 */ 2092 if ((to.to_flags & TOF_TS) != 0 && 2093 to.to_tsecr) { 2094 if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) 2095 tp->t_rttlow = ticks - to.to_tsecr; 2096 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); 2097 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 2098 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 2099 tp->t_rttlow = ticks - tp->t_rtttime; 2100 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 2101 } 2102 tcp_xmit_bandwidth_limit(tp, th->th_ack); 2103 2104 /* 2105 * If all outstanding data is acked, stop retransmit 2106 * timer and remember to restart (more output or persist). 2107 * If there is more data to be acked, restart retransmit 2108 * timer, using current (possibly backed-off) value. 2109 */ 2110 if (th->th_ack == tp->snd_max) { 2111 callout_stop(tp->tt_rexmt); 2112 needoutput = 1; 2113 } else if (!callout_active(tp->tt_persist)) 2114 callout_reset(tp->tt_rexmt, tp->t_rxtcur, 2115 tcp_timer_rexmt, tp); 2116 2117 /* 2118 * If no data (only SYN) was ACK'd, 2119 * skip rest of ACK processing. 2120 */ 2121 if (acked == 0) 2122 goto step6; 2123 2124 /* 2125 * When new data is acked, open the congestion window. 2126 * If the window gives us less than ssthresh packets 2127 * in flight, open exponentially (maxseg per packet). 2128 * Otherwise open linearly: maxseg per window 2129 * (maxseg^2 / cwnd per packet). 
2130 */ 2131 if ((!tcp_do_newreno && !tp->sack_enable) || 2132 !IN_FASTRECOVERY(tp)) { 2133 u_int cw = tp->snd_cwnd; 2134 u_int incr = tp->t_maxseg; 2135 if (cw > tp->snd_ssthresh) 2136 incr = incr * incr / cw; 2137 tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); 2138 } 2139 SOCKBUF_LOCK(&so->so_snd); 2140 if (acked > so->so_snd.sb_cc) { 2141 tp->snd_wnd -= so->so_snd.sb_cc; 2142 sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); 2143 ourfinisacked = 1; 2144 } else { 2145 sbdrop_locked(&so->so_snd, acked); 2146 tp->snd_wnd -= acked; 2147 ourfinisacked = 0; 2148 } 2149 sowwakeup_locked(so); 2150 /* detect una wraparound */ 2151 if ((tcp_do_newreno || tp->sack_enable) && 2152 !IN_FASTRECOVERY(tp) && 2153 SEQ_GT(tp->snd_una, tp->snd_recover) && 2154 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2155 tp->snd_recover = th->th_ack - 1; 2156 if ((tcp_do_newreno || tp->sack_enable) && 2157 IN_FASTRECOVERY(tp) && 2158 SEQ_GEQ(th->th_ack, tp->snd_recover)) 2159 EXIT_FASTRECOVERY(tp); 2160 tp->snd_una = th->th_ack; 2161 if (tp->sack_enable) { 2162 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 2163 tp->snd_recover = tp->snd_una; 2164 } 2165 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2166 tp->snd_nxt = tp->snd_una; 2167 2168 switch (tp->t_state) { 2169 2170 /* 2171 * In FIN_WAIT_1 STATE in addition to the processing 2172 * for the ESTABLISHED state if our FIN is now acknowledged 2173 * then enter FIN_WAIT_2. 2174 */ 2175 case TCPS_FIN_WAIT_1: 2176 if (ourfinisacked) { 2177 /* 2178 * If we can't receive any more 2179 * data, then closing user can proceed. 2180 * Starting the timer is contrary to the 2181 * specification, but if we don't get a FIN 2182 * we'll hang forever. 2183 */ 2184 /* XXXjl 2185 * we should release the tp also, and use a 2186 * compressed state. 2187 */ 2188 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2189 int timeout; 2190 2191 soisdisconnected(so); 2192 timeout = (tcp_fast_finwait2_recycle) ? 
2193 tcp_finwait2_timeout : tcp_maxidle; 2194 callout_reset(tp->tt_2msl, timeout, 2195 tcp_timer_2msl, tp); 2196 } 2197 tp->t_state = TCPS_FIN_WAIT_2; 2198 } 2199 break; 2200 2201 /* 2202 * In CLOSING STATE in addition to the processing for 2203 * the ESTABLISHED state if the ACK acknowledges our FIN 2204 * then enter the TIME-WAIT state, otherwise ignore 2205 * the segment. 2206 */ 2207 case TCPS_CLOSING: 2208 if (ourfinisacked) { 2209 KASSERT(headlocked, ("tcp_input: process_ACK: " 2210 "head not locked")); 2211 tcp_twstart(tp); 2212 INP_INFO_WUNLOCK(&tcbinfo); 2213 m_freem(m); 2214 return; 2215 } 2216 break; 2217 2218 /* 2219 * In LAST_ACK, we may still be waiting for data to drain 2220 * and/or to be acked, as well as for the ack of our FIN. 2221 * If our FIN is now acknowledged, delete the TCB, 2222 * enter the closed state and return. 2223 */ 2224 case TCPS_LAST_ACK: 2225 if (ourfinisacked) { 2226 KASSERT(headlocked, ("tcp_input: process_ACK:" 2227 " tcp_close: head not locked")); 2228 tp = tcp_close(tp); 2229 goto drop; 2230 } 2231 break; 2232 2233 /* 2234 * In TIME_WAIT state the only thing that should arrive 2235 * is a retransmission of the remote FIN. Acknowledge 2236 * it and restart the finack timer. 2237 */ 2238 case TCPS_TIME_WAIT: 2239 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); 2240 callout_reset(tp->tt_2msl, 2 * tcp_msl, 2241 tcp_timer_2msl, tp); 2242 goto dropafterack; 2243 } 2244 } 2245 2246step6: 2247 KASSERT(headlocked, ("tcp_input: step6: head not locked")); 2248 INP_LOCK_ASSERT(inp); 2249 2250 /* 2251 * Update window information. 2252 * Don't look at window if no ACK: TAC's send garbage on first SYN. 
2253 */ 2254 if ((thflags & TH_ACK) && 2255 (SEQ_LT(tp->snd_wl1, th->th_seq) || 2256 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 2257 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2258 /* keep track of pure window updates */ 2259 if (tlen == 0 && 2260 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2261 tcpstat.tcps_rcvwinupd++; 2262 tp->snd_wnd = tiwin; 2263 tp->snd_wl1 = th->th_seq; 2264 tp->snd_wl2 = th->th_ack; 2265 if (tp->snd_wnd > tp->max_sndwnd) 2266 tp->max_sndwnd = tp->snd_wnd; 2267 needoutput = 1; 2268 } 2269 2270 /* 2271 * Process segments with URG. 2272 */ 2273 if ((thflags & TH_URG) && th->th_urp && 2274 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2275 /* 2276 * This is a kludge, but if we receive and accept 2277 * random urgent pointers, we'll crash in 2278 * soreceive. It's hard to imagine someone 2279 * actually wanting to send this much urgent data. 2280 */ 2281 SOCKBUF_LOCK(&so->so_rcv); 2282 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2283 th->th_urp = 0; /* XXX */ 2284 thflags &= ~TH_URG; /* XXX */ 2285 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 2286 goto dodata; /* XXX */ 2287 } 2288 /* 2289 * If this segment advances the known urgent pointer, 2290 * then mark the data stream. This should not happen 2291 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2292 * a FIN has been received from the remote side. 2293 * In these states we ignore the URG. 2294 * 2295 * According to RFC961 (Assigned Protocols), 2296 * the urgent pointer points to the last octet 2297 * of urgent data. We continue, however, 2298 * to consider it to indicate the first octet 2299 * of data past the urgent section as the original 2300 * spec states (in one of two places). 
2301 */ 2302 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2303 tp->rcv_up = th->th_seq + th->th_urp; 2304 so->so_oobmark = so->so_rcv.sb_cc + 2305 (tp->rcv_up - tp->rcv_nxt) - 1; 2306 if (so->so_oobmark == 0) 2307 so->so_rcv.sb_state |= SBS_RCVATMARK; 2308 sohasoutofband(so); 2309 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2310 } 2311 SOCKBUF_UNLOCK(&so->so_rcv); 2312 /* 2313 * Remove out of band data so doesn't get presented to user. 2314 * This can happen independent of advancing the URG pointer, 2315 * but if two URG's are pending at once, some out-of-band 2316 * data may creep in... ick. 2317 */ 2318 if (th->th_urp <= (u_long)tlen && 2319 !(so->so_options & SO_OOBINLINE)) { 2320 /* hdr drop is delayed */ 2321 tcp_pulloutofband(so, th, m, drop_hdrlen); 2322 } 2323 } else { 2324 /* 2325 * If no out of band data is expected, 2326 * pull receive urgent pointer along 2327 * with the receive window. 2328 */ 2329 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2330 tp->rcv_up = tp->rcv_nxt; 2331 } 2332dodata: /* XXX */ 2333 KASSERT(headlocked, ("tcp_input: dodata: head not locked")); 2334 INP_LOCK_ASSERT(inp); 2335 2336 /* 2337 * Process the segment text, merging it into the TCP sequencing queue, 2338 * and arranging for acknowledgment of receipt if necessary. 2339 * This process logically involves adjusting tp->rcv_wnd as data 2340 * is presented to the user (this happens in tcp_usrreq.c, 2341 * case PRU_RCVD). If a FIN has already been received on this 2342 * connection then we just ignore the text. 2343 */ 2344 if ((tlen || (thflags & TH_FIN)) && 2345 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2346 tcp_seq save_start = th->th_seq; 2347 tcp_seq save_end = th->th_seq + tlen; 2348 m_adj(m, drop_hdrlen); /* delayed header drop */ 2349 /* 2350 * Insert segment which includes th into TCP reassembly queue 2351 * with control block tp. Set thflags to whether reassembly now 2352 * includes a segment with FIN. 
This handles the common case 2353 * inline (segment is the next to be received on an established 2354 * connection, and the queue is empty), avoiding linkage into 2355 * and removal from the queue and repetition of various 2356 * conversions. 2357 * Set DELACK for segments received in order, but ack 2358 * immediately when segments are out of order (so 2359 * fast retransmit can work). 2360 */ 2361 if (th->th_seq == tp->rcv_nxt && 2362 LIST_EMPTY(&tp->t_segq) && 2363 TCPS_HAVEESTABLISHED(tp->t_state)) { 2364 if (DELAY_ACK(tp)) 2365 tp->t_flags |= TF_DELACK; 2366 else 2367 tp->t_flags |= TF_ACKNOW; 2368 tp->rcv_nxt += tlen; 2369 thflags = th->th_flags & TH_FIN; 2370 tcpstat.tcps_rcvpack++; 2371 tcpstat.tcps_rcvbyte += tlen; 2372 ND6_HINT(tp); 2373 SOCKBUF_LOCK(&so->so_rcv); 2374 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 2375 m_freem(m); 2376 else 2377 sbappendstream_locked(&so->so_rcv, m); 2378 sorwakeup_locked(so); 2379 } else { 2380 thflags = tcp_reass(tp, th, &tlen, m); 2381 tp->t_flags |= TF_ACKNOW; 2382 } 2383 if (tlen > 0 && tp->sack_enable) 2384 tcp_update_sack_list(tp, save_start, save_end); 2385 /* 2386 * Note the amount of data that peer has sent into 2387 * our window, in order to estimate the sender's 2388 * buffer size. 2389 */ 2390 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2391 } else { 2392 m_freem(m); 2393 thflags &= ~TH_FIN; 2394 } 2395 2396 /* 2397 * If FIN is received ACK the FIN and let the user know 2398 * that the connection is closing. 2399 */ 2400 if (thflags & TH_FIN) { 2401 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2402 socantrcvmore(so); 2403 /* 2404 * If connection is half-synchronized 2405 * (ie NEEDSYN flag on) then delay ACK, 2406 * so it may be piggybacked when SYN is sent. 2407 * Otherwise, since we received a FIN then no 2408 * more input can be expected, send ACK now. 
2409 */ 2410 if (tp->t_flags & TF_NEEDSYN) 2411 tp->t_flags |= TF_DELACK; 2412 else 2413 tp->t_flags |= TF_ACKNOW; 2414 tp->rcv_nxt++; 2415 } 2416 switch (tp->t_state) { 2417 2418 /* 2419 * In SYN_RECEIVED and ESTABLISHED STATES 2420 * enter the CLOSE_WAIT state. 2421 */ 2422 case TCPS_SYN_RECEIVED: 2423 tp->t_starttime = ticks; 2424 /*FALLTHROUGH*/ 2425 case TCPS_ESTABLISHED: 2426 tp->t_state = TCPS_CLOSE_WAIT; 2427 break; 2428 2429 /* 2430 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2431 * enter the CLOSING state. 2432 */ 2433 case TCPS_FIN_WAIT_1: 2434 tp->t_state = TCPS_CLOSING; 2435 break; 2436 2437 /* 2438 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2439 * starting the time-wait timer, turning off the other 2440 * standard timers. 2441 */ 2442 case TCPS_FIN_WAIT_2: 2443 KASSERT(headlocked == 1, ("tcp_input: dodata: " 2444 "TCP_FIN_WAIT_2: head not locked")); 2445 tcp_twstart(tp); 2446 INP_INFO_WUNLOCK(&tcbinfo); 2447 return; 2448 2449 /* 2450 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2451 */ 2452 case TCPS_TIME_WAIT: 2453 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("timewait")); 2454 callout_reset(tp->tt_2msl, 2 * tcp_msl, 2455 tcp_timer_2msl, tp); 2456 break; 2457 } 2458 } 2459 INP_INFO_WUNLOCK(&tcbinfo); 2460 headlocked = 0; 2461#ifdef TCPDEBUG 2462 if (so->so_options & SO_DEBUG) 2463 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 2464 &tcp_savetcp, 0); 2465#endif 2466 2467 /* 2468 * Return any desired output. 
2469 */ 2470 if (needoutput || (tp->t_flags & TF_ACKNOW)) 2471 (void) tcp_output(tp); 2472 2473check_delack: 2474 KASSERT(headlocked == 0, ("tcp_input: check_delack: head locked")); 2475 INP_LOCK_ASSERT(inp); 2476 if (tp->t_flags & TF_DELACK) { 2477 tp->t_flags &= ~TF_DELACK; 2478 callout_reset(tp->tt_delack, tcp_delacktime, 2479 tcp_timer_delack, tp); 2480 } 2481 INP_UNLOCK(inp); 2482 return; 2483 2484dropafterack: 2485 KASSERT(headlocked, ("tcp_input: dropafterack: head not locked")); 2486 /* 2487 * Generate an ACK dropping incoming segment if it occupies 2488 * sequence space, where the ACK reflects our state. 2489 * 2490 * We can now skip the test for the RST flag since all 2491 * paths to this code happen after packets containing 2492 * RST have been dropped. 2493 * 2494 * In the SYN-RECEIVED state, don't send an ACK unless the 2495 * segment we received passes the SYN-RECEIVED ACK test. 2496 * If it fails send a RST. This breaks the loop in the 2497 * "LAND" DoS attack, and also prevents an ACK storm 2498 * between two listening ports that have been sent forged 2499 * SYN segments, each with the source address of the other. 2500 */ 2501 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 2502 (SEQ_GT(tp->snd_una, th->th_ack) || 2503 SEQ_GT(th->th_ack, tp->snd_max)) ) { 2504 rstreason = BANDLIM_RST_OPENPORT; 2505 goto dropwithreset; 2506 } 2507#ifdef TCPDEBUG 2508 if (so->so_options & SO_DEBUG) 2509 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2510 &tcp_savetcp, 0); 2511#endif 2512 KASSERT(headlocked, ("headlocked should be 1")); 2513 INP_INFO_WUNLOCK(&tcbinfo); 2514 tp->t_flags |= TF_ACKNOW; 2515 (void) tcp_output(tp); 2516 INP_UNLOCK(inp); 2517 m_freem(m); 2518 return; 2519 2520dropwithreset: 2521 KASSERT(headlocked, ("tcp_input: dropwithreset: head not locked")); 2522 /* 2523 * Generate a RST, dropping incoming segment. 2524 * Make ACK acceptable to originator of segment. 
2525 * Don't bother to respond if destination was broadcast/multicast. 2526 */ 2527 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 2528 goto drop; 2529 if (isipv6) { 2530 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 2531 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 2532 goto drop; 2533 } else { 2534 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 2535 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 2536 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 2537 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 2538 goto drop; 2539 } 2540 /* IPv6 anycast check is done at tcp6_input() */ 2541 2542 /* 2543 * Perform bandwidth limiting. 2544 */ 2545 if (badport_bandlim(rstreason) < 0) 2546 goto drop; 2547 2548#ifdef TCPDEBUG 2549 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 2550 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2551 &tcp_savetcp, 0); 2552#endif 2553 2554 if (thflags & TH_ACK) 2555 /* mtod() below is safe as long as hdr dropping is delayed */ 2556 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, 2557 TH_RST); 2558 else { 2559 if (thflags & TH_SYN) 2560 tlen++; 2561 /* mtod() below is safe as long as hdr dropping is delayed */ 2562 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 2563 (tcp_seq)0, TH_RST|TH_ACK); 2564 } 2565 2566 if (tp != NULL) 2567 INP_UNLOCK(inp); 2568 if (headlocked) 2569 INP_INFO_WUNLOCK(&tcbinfo); 2570 return; 2571 2572drop: 2573 /* 2574 * Drop space held by incoming segment and return. 2575 */ 2576#ifdef TCPDEBUG 2577 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 2578 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 2579 &tcp_savetcp, 0); 2580#endif 2581 if (tp != NULL) 2582 INP_UNLOCK(inp); 2583 if (headlocked) 2584 INP_INFO_WUNLOCK(&tcbinfo); 2585 m_freem(m); 2586 return; 2587} 2588 2589/* 2590 * Parse TCP options and place in tcpopt. 
 */
static void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
	int opt, optlen;

	to->to_flags = 0;
	/*
	 * Walk the option list.  Each iteration validates optlen
	 * (2 <= optlen <= remaining cnt) before dispatching, so the
	 * per-option bcopy()s below stay within the option buffer.
	 */
	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {
		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			/* MSS is only legal on SYN segments. */
			if (!(flags & TO_SYN))
				continue;
			to->to_flags |= TOF_MSS;
			bcopy((char *)cp + 2,
			    (char *)&to->to_mss, sizeof(to->to_mss));
			to->to_mss = ntohs(to->to_mss);
			break;
		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			/* Window scale is only legal on SYN segments. */
			if (!(flags & TO_SYN))
				continue;
			to->to_flags |= TOF_SCALE;
			/* Clamp the peer's shift to our maximum. */
			to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
			break;
		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			to->to_flags |= TOF_TS;
			bcopy((char *)cp + 2,
			    (char *)&to->to_tsval, sizeof(to->to_tsval));
			to->to_tsval = ntohl(to->to_tsval);
			bcopy((char *)cp + 6,
			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
			to->to_tsecr = ntohl(to->to_tsecr);
			break;
#ifdef TCP_SIGNATURE
		/*
		 * XXX In order to reply to a host which has set the
		 * TCP_SIGNATURE option in its initial SYN, we have to
		 * record the fact that the option was observed here
		 * for the syncache code to perform the correct response.
		 */
		case TCPOPT_SIGNATURE:
			if (optlen != TCPOLEN_SIGNATURE)
				continue;
			to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
			break;
#endif
		case TCPOPT_SACK_PERMITTED:
			if (optlen != TCPOLEN_SACK_PERMITTED)
				continue;
			if (!(flags & TO_SYN))
				continue;
			/* Honor the option only if SACK is enabled globally. */
			if (!tcp_do_sack)
				continue;
			to->to_flags |= TOF_SACKPERM;
			break;
		case TCPOPT_SACK:
			/* Must carry a whole number of SACK blocks. */
			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
				continue;
			to->to_flags |= TOF_SACK;
			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
			/* to_sacks points into the caller's segment buffer. */
			to->to_sacks = cp + 2;
			tcpstat.tcps_sack_rcv_blocks++;
			break;
		default:
			continue;
		}
	}
}

/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 *
 * The caller (tcp_input) only invokes this when th_urp <= tlen, so the
 * urgent byte must lie within the mbuf chain; reaching the end of the
 * chain without finding it indicates a caller bug, hence the panic.
 */
static void
tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
    int off)
{
	/* Offset of the urgent byte from the start of the chain. */
	int cnt = off + th->th_urp - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			/* Stash the OOB byte in the tcpcb for PRU_RCVOOB. */
			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			/* Close the gap left by removing the byte. */
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			if (m->m_flags & M_PKTHDR)
				m->m_pkthdr.len--;
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == NULL)
			break;
	}
	panic("tcp_pulloutofband");
}

/*
 * Collect new round-trip time estimate
 * and update averages and current timeout.
 */
static void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
	int delta;

	INP_LOCK_ASSERT(tp->t_inpcb);

	tcpstat.tcps_rttupdated++;
	tp->t_rttupdated++;
	if (tp->t_srtt != 0) {
		/*
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 8).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 *
		 * NOTE(review): with 5 bits after the binary point the
		 * scale is 32 (alpha 31/32), not 8 as stated above; the
		 * "scaled by 8" wording appears stale -- confirm against
		 * TCP_RTT_SHIFT in tcp_var.h.
		 */
		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));

		/* srtt must stay positive; clamp at the minimum tick. */
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1;

		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1;
		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << TCP_RTT_SHIFT;
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	}
	/* A valid measurement cancels Karn backoff state. */
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing
 * interface without forcing IP to fragment; if bigger than
 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 * to utilize large mbufs.  If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * Also take into account the space needed for options that we
 * send regularly.  Make maxseg shorter by that amount to assure
 * that we can send maxseg amount of data even when the options
 * are present.  Store the upper limit of the length of options plus
 * data in maxopd.
 *
 *
 * In case of T/TCP, we call this routine during implicit connection
 * setup as well (offer = -1), to initialize maxseg from the cached
 * MSS of our peer.
 *
 * NOTE that this routine is only called when we process an incoming
 * segment.
 * Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
 */
void
tcp_mss(struct tcpcb *tp, int offer)
{
	int rtt, mss;
	u_long bufsize;
	u_long maxmtu;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;
	struct hc_metrics_lite metrics;
	/* Remember the raw offer; -1 below means "no SYN received yet". */
	int origoffer = offer;
	int mtuflags = 0;
#ifdef INET6
	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	size_t min_protoh = isipv6 ?
			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
			    sizeof (struct tcpiphdr);
#else
	const size_t min_protoh = sizeof(struct tcpiphdr);
#endif

	/* initialize */
#ifdef INET6
	if (isipv6) {
		maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags);
		tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt;
	} else
#endif
	{
		maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags);
		tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
	}
	so = inp->inp_socket;

	/*
	 * no route to sender, stay with default mss and return
	 */
	if (maxmtu == 0)
		return;

	/* what have we got? */
	switch (offer) {
	case 0:
		/*
		 * Offer == 0 means that there was no MSS on the SYN
		 * segment, in this case we use tcp_mssdflt.
		 */
		offer =
#ifdef INET6
			isipv6 ? tcp_v6mssdflt :
#endif
			tcp_mssdflt;
		break;

	case -1:
		/*
		 * Offer == -1 means that we didn't receive SYN yet.
		 */
		/* FALLTHROUGH */

	default:
		/*
		 * Prevent DoS attack with too small MSS. Round up
		 * to at least minmss.
		 */
		offer = max(offer, tcp_minmss);
		/*
		 * Sanity check: make sure that maxopd will be large
		 * enough to allow some data on segments even if the
		 * all the option space is used (40bytes). Otherwise
		 * funny things may happen in tcp_output.
		 */
		offer = max(offer, 64);
	}

	/*
	 * rmx information is now retrieved from tcp_hostcache
	 */
	tcp_hc_get(&inp->inp_inc, &metrics);

	/*
	 * if there's a discovered mtu int tcp hostcache, use it
	 * else, use the link mtu.
	 */
	if (metrics.rmx_mtu)
		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
	else {
#ifdef INET6
		if (isipv6) {
			mss = maxmtu - min_protoh;
			if (!path_mtu_discovery &&
			    !in6_localaddr(&inp->in6p_faddr))
				mss = min(mss, tcp_v6mssdflt);
		} else
#endif
		{
			mss = maxmtu - min_protoh;
			if (!path_mtu_discovery &&
			    !in_localaddr(inp->inp_faddr))
				mss = min(mss, tcp_mssdflt);
		}
	}
	/* Never use more than the peer offered. */
	mss = min(mss, offer);

	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	/*
	 * origoffer==-1 indicates, that no segments were received yet.
	 * In this case we just guess.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (origoffer == -1 ||
	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
		mss -= TCPOLEN_TSTAMP_APPA;
	/*
	 * NOTE(review): t_maxseg is stored here and again twice below
	 * (after the MCLBYTES rounding and after the send-buffer clamp);
	 * only the final store matters -- the earlier ones look redundant.
	 */
	tp->t_maxseg = mss;

#if	(MCLBYTES & (MCLBYTES - 1)) == 0
	/* MCLBYTES is a power of two: round mss down with a mask. */
	if (mss > MCLBYTES)
		mss &= ~(MCLBYTES-1);
#else
	if (mss > MCLBYTES)
		mss = mss / MCLBYTES * MCLBYTES;
#endif
	tp->t_maxseg = mss;

	/*
	 * If there's a pipesize, change the socket buffer to that size,
	 * don't change if sb_hiwat is different than default (then it
	 * has been changed on purpose with setsockopt).
	 * Make the socket buffers an integral number of mss units;
	 * if the mss is larger than the socket buffer, decrease the mss.
	 */
	SOCKBUF_LOCK(&so->so_snd);
	if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
		bufsize = metrics.rmx_sendpipe;
	else
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;
	else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_snd.sb_hiwat)
			(void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
	}
	SOCKBUF_UNLOCK(&so->so_snd);
	tp->t_maxseg = mss;

	SOCKBUF_LOCK(&so->so_rcv);
	if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
		bufsize = metrics.rmx_recvpipe;
	else
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_rcv.sb_hiwat)
			(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
	}
	SOCKBUF_UNLOCK(&so->so_rcv);
	/*
	 * While we're here, check the others too.
	 * Seed srtt/rttvar/rxtcur from cached hostcache metrics, but only
	 * if we have no measurement of our own yet (t_srtt == 0).
	 */
	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
		tp->t_srtt = rtt;
		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
		tcpstat.tcps_usedrtt++;
		if (metrics.rmx_rttvar) {
			tp->t_rttvar = metrics.rmx_rttvar;
			tcpstat.tcps_usedrttvar++;
		} else {
			/* default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
		}
		TCPT_RANGESET(tp->t_rxtcur,
			      ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
			      tp->t_rttmin, TCPTV_REXMTMAX);
	}
	if (metrics.rmx_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * the slow start threshhold, but set the
		 * threshold to no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
		tcpstat.tcps_usedssthresh++;
	}
	if (metrics.rmx_bandwidth)
		tp->snd_bandwidth = metrics.rmx_bandwidth;

	/*
	 * Set the slow-start flight size depending on whether this
	 * is a local network or not.
	 *
	 * Extend this so we cache the cwnd too and retrieve it here.
	 * Make cwnd even bigger than RFC3390 suggests but only if we
	 * have previous experience with the remote host. Be careful
	 * not make cwnd bigger than remote receive window or our own
	 * send socket buffer. Maybe put some additional upper bound
	 * on the retrieved cwnd. Should do incremental updates to
	 * hostcache when cwnd collapses so next connection doesn't
	 * overloads the path again.
	 *
	 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
	 * We currently check only in syncache_socket for that.
	 */
#define TCP_METRICS_CWND
#ifdef TCP_METRICS_CWND
	if (metrics.rmx_cwnd)
		tp->snd_cwnd = max(mss,
				min(metrics.rmx_cwnd / 2,
				 min(tp->snd_wnd, so->so_snd.sb_hiwat)));
	else
#endif
	if (tcp_do_rfc3390)
		tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
#ifdef INET6
	else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
		 (!isipv6 && in_localaddr(inp->inp_faddr)))
#else
	else if (in_localaddr(inp->inp_faddr))
#endif
		tp->snd_cwnd = mss * ss_fltsz_local;
	else
		tp->snd_cwnd = mss * ss_fltsz;

	/* Check the interface for TSO capabilities. */
	if (mtuflags & CSUM_TSO)
		tp->t_flags |= TF_TSO;
}

/*
 * Determine the MSS option to send on an outgoing SYN.
 */
int
tcp_mssopt(struct in_conninfo *inc)
{
	int mss = 0;
	u_long maxmtu = 0;
	u_long thcmtu = 0;
	size_t min_protoh;
#ifdef INET6
	int isipv6 = inc->inc_isipv6 ?
1 : 0; 3073#endif 3074 3075 KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); 3076 3077#ifdef INET6 3078 if (isipv6) { 3079 mss = tcp_v6mssdflt; 3080 maxmtu = tcp_maxmtu6(inc, NULL); 3081 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3082 min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 3083 } else 3084#endif 3085 { 3086 mss = tcp_mssdflt; 3087 maxmtu = tcp_maxmtu(inc, NULL); 3088 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ 3089 min_protoh = sizeof(struct tcpiphdr); 3090 } 3091 if (maxmtu && thcmtu) 3092 mss = min(maxmtu, thcmtu) - min_protoh; 3093 else if (maxmtu || thcmtu) 3094 mss = max(maxmtu, thcmtu) - min_protoh; 3095 3096 return (mss); 3097} 3098 3099 3100/* 3101 * On a partial ack arrives, force the retransmission of the 3102 * next unacknowledged segment. Do not clear tp->t_dupacks. 3103 * By setting snd_nxt to ti_ack, this forces retransmission timer to 3104 * be started again. 3105 */ 3106static void 3107tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) 3108{ 3109 tcp_seq onxt = tp->snd_nxt; 3110 u_long ocwnd = tp->snd_cwnd; 3111 3112 callout_stop(tp->tt_rexmt); 3113 tp->t_rtttime = 0; 3114 tp->snd_nxt = th->th_ack; 3115 /* 3116 * Set snd_cwnd to one segment beyond acknowledged offset. 3117 * (tp->snd_una has not yet been updated when this function is called.) 3118 */ 3119 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3120 tp->t_flags |= TF_ACKNOW; 3121 (void) tcp_output(tp); 3122 tp->snd_cwnd = ocwnd; 3123 if (SEQ_GT(onxt, tp->snd_nxt)) 3124 tp->snd_nxt = onxt; 3125 /* 3126 * Partial window deflation. Relies on fact that tp->snd_una 3127 * not updated yet. 3128 */ 3129 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3130 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3131 else 3132 tp->snd_cwnd = 0; 3133 tp->snd_cwnd += tp->t_maxseg; 3134} 3135 3136/* 3137 * Returns 1 if the TIME_WAIT state was killed and we should start over, 3138 * looking for a pcb in the listen state. Returns 0 otherwise. 
3139 */ 3140static int 3141tcp_timewait(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, 3142 struct mbuf *m, int tlen) 3143{ 3144 struct tcptw *tw; 3145 int thflags; 3146 tcp_seq seq; 3147#ifdef INET6 3148 int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; 3149#else 3150 const int isipv6 = 0; 3151#endif 3152 3153 /* tcbinfo lock required for tcp_twclose(), tcp_timer_2msl_reset(). */ 3154 INP_INFO_WLOCK_ASSERT(&tcbinfo); 3155 INP_LOCK_ASSERT(inp); 3156 3157 /* 3158 * XXXRW: Time wait state for inpcb has been recycled, but inpcb is 3159 * still present. This is undesirable, but temporarily necessary 3160 * until we work out how to handle inpcb's who's timewait state has 3161 * been removed. 3162 */ 3163 tw = intotw(inp); 3164 if (tw == NULL) 3165 goto drop; 3166 3167 thflags = th->th_flags; 3168 3169 /* 3170 * NOTE: for FIN_WAIT_2 (to be added later), 3171 * must validate sequence number before accepting RST 3172 */ 3173 3174 /* 3175 * If the segment contains RST: 3176 * Drop the segment - see Stevens, vol. 2, p. 964 and 3177 * RFC 1337. 3178 */ 3179 if (thflags & TH_RST) 3180 goto drop; 3181 3182#if 0 3183/* PAWS not needed at the moment */ 3184 /* 3185 * RFC 1323 PAWS: If we have a timestamp reply on this segment 3186 * and it's less than ts_recent, drop it. 3187 */ 3188 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 3189 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 3190 if ((thflags & TH_ACK) == 0) 3191 goto drop; 3192 goto ack; 3193 } 3194 /* 3195 * ts_recent is never updated because we never accept new segments. 3196 */ 3197#endif 3198 3199 /* 3200 * If a new connection request is received 3201 * while in TIME_WAIT, drop the old connection 3202 * and start over if the sequence numbers 3203 * are above the previous ones. 3204 */ 3205 if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { 3206 tcp_twclose(tw, 0); 3207 return (1); 3208 } 3209 3210 /* 3211 * Drop the the segment if it does not contain an ACK. 
3212 */ 3213 if ((thflags & TH_ACK) == 0) 3214 goto drop; 3215 3216 /* 3217 * Reset the 2MSL timer if this is a duplicate FIN. 3218 */ 3219 if (thflags & TH_FIN) { 3220 seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); 3221 if (seq + 1 == tw->rcv_nxt) 3222 tcp_timer_2msl_reset(tw, 1); 3223 } 3224 3225 /* 3226 * Acknowledge the segment if it has data or is not a duplicate ACK. 3227 */ 3228 if (thflags != TH_ACK || tlen != 0 || 3229 th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) 3230 tcp_twrespond(tw, TH_ACK); 3231 goto drop; 3232 3233 /* 3234 * Generate a RST, dropping incoming segment. 3235 * Make ACK acceptable to originator of segment. 3236 * Don't bother to respond if destination was broadcast/multicast. 3237 */ 3238 if (m->m_flags & (M_BCAST|M_MCAST)) 3239 goto drop; 3240 if (isipv6) { 3241 struct ip6_hdr *ip6; 3242 3243 /* IPv6 anycast check is done at tcp6_input() */ 3244 ip6 = mtod(m, struct ip6_hdr *); 3245 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 3246 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 3247 goto drop; 3248 } else { 3249 struct ip *ip; 3250 3251 ip = mtod(m, struct ip *); 3252 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 3253 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 3254 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 3255 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 3256 goto drop; 3257 } 3258 if (thflags & TH_ACK) { 3259 tcp_respond(NULL, 3260 mtod(m, void *), th, m, 0, th->th_ack, TH_RST); 3261 } else { 3262 seq = th->th_seq + (thflags & TH_SYN ? 1 : 0); 3263 tcp_respond(NULL, 3264 mtod(m, void *), th, m, seq, 0, TH_RST|TH_ACK); 3265 } 3266 INP_UNLOCK(inp); 3267 return (0); 3268 3269drop: 3270 INP_UNLOCK(inp); 3271 m_freem(m); 3272 return (0); 3273} 3274