tcp_timewait.c revision 122922
1/* 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 34 * $FreeBSD: head/sys/netinet/tcp_timewait.c 122922 2003-11-20 20:07:39Z andre $ 35 */ 36 37#include "opt_compat.h" 38#include "opt_inet6.h" 39#include "opt_ipsec.h" 40#include "opt_mac.h" 41#include "opt_tcpdebug.h" 42 43#include <sys/param.h> 44#include <sys/systm.h> 45#include <sys/callout.h> 46#include <sys/kernel.h> 47#include <sys/sysctl.h> 48#include <sys/mac.h> 49#include <sys/malloc.h> 50#include <sys/mbuf.h> 51#ifdef INET6 52#include <sys/domain.h> 53#endif 54#include <sys/proc.h> 55#include <sys/socket.h> 56#include <sys/socketvar.h> 57#include <sys/protosw.h> 58#include <sys/random.h> 59 60#include <vm/uma.h> 61 62#include <net/route.h> 63#include <net/if.h> 64 65#include <netinet/in.h> 66#include <netinet/in_systm.h> 67#include <netinet/ip.h> 68#ifdef INET6 69#include <netinet/ip6.h> 70#endif 71#include <netinet/in_pcb.h> 72#ifdef INET6 73#include <netinet6/in6_pcb.h> 74#endif 75#include <netinet/in_var.h> 76#include <netinet/ip_var.h> 77#ifdef INET6 78#include <netinet6/ip6_var.h> 79#include <netinet6/nd6.h> 80#endif 81#include <netinet/tcp.h> 82#include <netinet/tcp_fsm.h> 83#include <netinet/tcp_seq.h> 84#include <netinet/tcp_timer.h> 85#include <netinet/tcp_var.h> 86#ifdef INET6 87#include <netinet6/tcp6_var.h> 88#endif 89#include <netinet/tcpip.h> 90#ifdef TCPDEBUG 91#include <netinet/tcp_debug.h> 92#endif 93#include <netinet6/ip6protosw.h> 94 95#ifdef IPSEC 96#include <netinet6/ipsec.h> 97#ifdef INET6 98#include <netinet6/ipsec6.h> 99#endif 100#endif /*IPSEC*/ 101 102#ifdef FAST_IPSEC 103#include <netipsec/ipsec.h> 104#ifdef INET6 105#include <netipsec/ipsec6.h> 106#endif 107#define IPSEC 108#endif /*FAST_IPSEC*/ 109 110#include <machine/in_cksum.h> 111#include <sys/md5.h> 112 113int tcp_mssdflt = TCP_MSS; 114SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, 115 &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); 116 117#ifdef INET6 118int tcp_v6mssdflt = TCP6_MSS; 119SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, 120 CTLFLAG_RW, &tcp_v6mssdflt , 0, 121 "Default TCP Maximum Segment Size for IPv6"); 122#endif 123 124#if 0 125static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; 126SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, 127 &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); 128#endif 129 130int tcp_do_rfc1323 = 1; 131SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, 132 &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); 133 134int tcp_do_rfc1644 = 0; 135SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, 136 &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); 137 138static int tcp_tcbhashsize = 0; 139SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, 140 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); 141 142static int do_tcpdrain = 1; 143SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, 144 "Enable tcp_drain routine for extra help when low on mbufs"); 145 146SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, 147 &tcbinfo.ipi_count, 0, "Number of active PCBs"); 148 149static int icmp_may_rst = 1; 150SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, 151 "Certain ICMP unreachable messages may abort connections in SYN_SENT"); 152 153static int tcp_isn_reseed_interval = 0; 154SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, 155 &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); 156 157/* 158 * TCP bandwidth limiting sysctls. Note that the default lower bound of 159 * 1024 exists only for debugging. A good production default would be 160 * something like 6100. 161 */ 162static int tcp_inflight_enable = 0; 163SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW, 164 &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); 165 166static int tcp_inflight_debug = 0; 167SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW, 168 &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); 169 170static int tcp_inflight_min = 6144; 171SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW, 172 &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); 173 174static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; 175SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW, 176 &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); 177static int tcp_inflight_stab = 20; 178SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, 179 &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); 180 181static struct inpcb *tcp_notify(struct inpcb *, int); 182static void tcp_discardcb(struct tcpcb *); 183 184/* 185 * Target size of TCP PCB hash tables. Must be a power of two. 186 * 187 * Note that this can be overridden by the kernel environment 188 * variable net.inet.tcp.tcbhashsize 189 */ 190#ifndef TCBHASHSIZE 191#define TCBHASHSIZE 512 192#endif 193 194/* 195 * XXX 196 * Callouts should be moved into struct tcp directly. They are currently 197 * separate becuase the tcpcb structure is exported to userland for sysctl 198 * parsing purposes, which do not know about callouts. 199 */ 200struct tcpcb_mem { 201 struct tcpcb tcb; 202 struct callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep; 203 struct callout tcpcb_mem_2msl, tcpcb_mem_delack; 204}; 205 206static uma_zone_t tcpcb_zone; 207static uma_zone_t tcptw_zone; 208 209/* 210 * Tcp initialization 211 */ 212void 213tcp_init() 214{ 215 int hashsize = TCBHASHSIZE; 216 217 tcp_ccgen = 1; 218 219 tcp_delacktime = TCPTV_DELACK; 220 tcp_keepinit = TCPTV_KEEP_INIT; 221 tcp_keepidle = TCPTV_KEEP_IDLE; 222 tcp_keepintvl = TCPTV_KEEPINTVL; 223 tcp_maxpersistidle = TCPTV_KEEP_IDLE; 224 tcp_msl = TCPTV_MSL; 225 tcp_rexmit_min = TCPTV_MIN; 226 tcp_rexmit_slop = TCPTV_CPU_VAR; 227 228 INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); 229 LIST_INIT(&tcb); 230 tcbinfo.listhead = &tcb; 231 TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); 232 if (!powerof2(hashsize)) { 233 printf("WARNING: TCB hash size not a power of 2\n"); 234 hashsize = 512; /* safe default */ 235 } 236 tcp_tcbhashsize = hashsize; 237 tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); 238 tcbinfo.porthashbase = hashinit(hashsize, M_PCB, 239 &tcbinfo.porthashmask); 240 tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), 241 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 242 uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); 243#ifdef INET6 244#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) 245#else /* INET6 */ 246#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) 247#endif /* INET6 */ 248 if (max_protohdr < TCP_MINPROTOHDR) 249 max_protohdr = TCP_MINPROTOHDR; 250 if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) 251 panic("tcp_init"); 252#undef TCP_MINPROTOHDR 253 /* 254 * These have to be type stable for the benefit of the timers. 255 */ 256 tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), 257 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 258 uma_zone_set_max(tcpcb_zone, maxsockets); 259 tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), 260 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 261 uma_zone_set_max(tcptw_zone, maxsockets / 5); 262 tcp_timer_init(); 263 syncache_init(); 264 tcp_hc_init(); 265} 266 267/* 268 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. 269 * tcp_template used to store this data in mbufs, but we now recopy it out 270 * of the tcpcb each time to conserve mbufs. 271 */ 272void 273tcpip_fillheaders(inp, ip_ptr, tcp_ptr) 274 struct inpcb *inp; 275 void *ip_ptr; 276 void *tcp_ptr; 277{ 278 struct tcphdr *th = (struct tcphdr *)tcp_ptr; 279 280#ifdef INET6 281 if ((inp->inp_vflag & INP_IPV6) != 0) { 282 struct ip6_hdr *ip6; 283 284 ip6 = (struct ip6_hdr *)ip_ptr; 285 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | 286 (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); 287 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | 288 (IPV6_VERSION & IPV6_VERSION_MASK); 289 ip6->ip6_nxt = IPPROTO_TCP; 290 ip6->ip6_plen = sizeof(struct tcphdr); 291 ip6->ip6_src = inp->in6p_laddr; 292 ip6->ip6_dst = inp->in6p_faddr; 293 } else 294#endif 295 { 296 struct ip *ip; 297 298 ip = (struct ip *)ip_ptr; 299 ip->ip_v = IPVERSION; 300 ip->ip_hl = 5; 301 ip->ip_tos = inp->inp_ip_tos; 302 ip->ip_len = 0; 303 ip->ip_id = 0; 304 ip->ip_off = 0; 305 ip->ip_ttl = inp->inp_ip_ttl; 306 ip->ip_sum = 0; 307 ip->ip_p = IPPROTO_TCP; 308 ip->ip_src = inp->inp_laddr; 309 ip->ip_dst = inp->inp_faddr; 310 } 311 th->th_sport = inp->inp_lport; 312 th->th_dport = inp->inp_fport; 313 th->th_seq = 0; 314 th->th_ack = 0; 315 th->th_x2 = 0; 316 th->th_off = 5; 317 th->th_flags = 0; 318 th->th_win = 0; 319 th->th_urp = 0; 320 th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ 321} 322 323/* 324 * Create template to be used to send tcp packets on a connection. 325 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only 326 * use for this function is in keepalives, which use tcp_respond. 327 */ 328struct tcptemp * 329tcpip_maketemplate(inp) 330 struct inpcb *inp; 331{ 332 struct mbuf *m; 333 struct tcptemp *n; 334 335 m = m_get(M_DONTWAIT, MT_HEADER); 336 if (m == NULL) 337 return (0); 338 m->m_len = sizeof(struct tcptemp); 339 n = mtod(m, struct tcptemp *); 340 341 tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t); 342 return (n); 343} 344 345/* 346 * Send a single message to the TCP at address specified by 347 * the given TCP/IP header. If m == 0, then we make a copy 348 * of the tcpiphdr at ti and send directly to the addressed host. 349 * This is used to force keep alive messages out using the TCP 350 * template for a connection. If flags are given then we send 351 * a message back to the TCP which originated the * segment ti, 352 * and discard the mbuf containing it and any other attached mbufs. 353 * 354 * In any case the ack and sequence number of the transmitted 355 * segment are as specified by the parameters. 356 * 357 * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 358 */ 359void 360tcp_respond(tp, ipgen, th, m, ack, seq, flags) 361 struct tcpcb *tp; 362 void *ipgen; 363 register struct tcphdr *th; 364 register struct mbuf *m; 365 tcp_seq ack, seq; 366 int flags; 367{ 368 register int tlen; 369 int win = 0; 370 struct ip *ip; 371 struct tcphdr *nth; 372#ifdef INET6 373 struct ip6_hdr *ip6; 374 int isipv6; 375#endif /* INET6 */ 376 int ipflags = 0; 377 struct inpcb *inp = NULL; 378 379 KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); 380 381#ifdef INET6 382 isipv6 = ((struct ip *)ipgen)->ip_v == 6; 383 ip6 = ipgen; 384#endif /* INET6 */ 385 ip = ipgen; 386 387 if (tp) { 388 inp = tp->t_inpcb; 389 KASSERT(inp != NULL, ("tcp control block w/o inpcb")); 390 INP_INFO_WLOCK_ASSERT(&tcbinfo); 391 INP_LOCK_ASSERT(inp); 392 if (!(flags & TH_RST)) { 393 win = sbspace(&inp->inp_socket->so_rcv); 394 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 395 win = (long)TCP_MAXWIN << tp->rcv_scale; 396 } 397 } 398 if (m == 0) { 399 m = m_gethdr(M_DONTWAIT, MT_HEADER); 400 if (m == NULL) 401 return; 402 tlen = 0; 403 m->m_data += max_linkhdr; 404#ifdef INET6 405 if (isipv6) { 406 bcopy((caddr_t)ip6, mtod(m, caddr_t), 407 sizeof(struct ip6_hdr)); 408 ip6 = mtod(m, struct ip6_hdr *); 409 nth = (struct tcphdr *)(ip6 + 1); 410 } else 411#endif /* INET6 */ 412 { 413 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); 414 ip = mtod(m, struct ip *); 415 nth = (struct tcphdr *)(ip + 1); 416 } 417 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); 418 flags = TH_ACK; 419 } else { 420 m_freem(m->m_next); 421 m->m_next = 0; 422 m->m_data = (caddr_t)ipgen; 423 /* m_len is set later */ 424 tlen = 0; 425#define xchg(a,b,type) { type t; t=a; a=b; b=t; } 426#ifdef INET6 427 if (isipv6) { 428 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); 429 nth = (struct tcphdr *)(ip6 + 1); 430 } else 431#endif /* INET6 */ 432 { 433 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); 434 nth = (struct tcphdr *)(ip + 1); 435 } 436 if (th != nth) { 437 /* 438 * this is usually a case when an extension header 439 * exists between the IPv6 header and the 440 * TCP header. 441 */ 442 nth->th_sport = th->th_sport; 443 nth->th_dport = th->th_dport; 444 } 445 xchg(nth->th_dport, nth->th_sport, n_short); 446#undef xchg 447 } 448#ifdef INET6 449 if (isipv6) { 450 ip6->ip6_flow = 0; 451 ip6->ip6_vfc = IPV6_VERSION; 452 ip6->ip6_nxt = IPPROTO_TCP; 453 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + 454 tlen)); 455 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); 456 } else 457#endif 458 { 459 tlen += sizeof (struct tcpiphdr); 460 ip->ip_len = tlen; 461 ip->ip_ttl = ip_defttl; 462 } 463 m->m_len = tlen; 464 m->m_pkthdr.len = tlen; 465 m->m_pkthdr.rcvif = (struct ifnet *) 0; 466#ifdef MAC 467 if (inp != NULL) { 468 /* 469 * Packet is associated with a socket, so allow the 470 * label of the response to reflect the socket label. 471 */ 472 mac_create_mbuf_from_socket(inp->inp_socket, m); 473 } else { 474 /* 475 * Packet is not associated with a socket, so possibly 476 * update the label in place. 477 */ 478 mac_reflect_mbuf_tcp(m); 479 } 480#endif 481 nth->th_seq = htonl(seq); 482 nth->th_ack = htonl(ack); 483 nth->th_x2 = 0; 484 nth->th_off = sizeof (struct tcphdr) >> 2; 485 nth->th_flags = flags; 486 if (tp) 487 nth->th_win = htons((u_short) (win >> tp->rcv_scale)); 488 else 489 nth->th_win = htons((u_short)win); 490 nth->th_urp = 0; 491#ifdef INET6 492 if (isipv6) { 493 nth->th_sum = 0; 494 nth->th_sum = in6_cksum(m, IPPROTO_TCP, 495 sizeof(struct ip6_hdr), 496 tlen - sizeof(struct ip6_hdr)); 497 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL); 498 } else 499#endif /* INET6 */ 500 { 501 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 502 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); 503 m->m_pkthdr.csum_flags = CSUM_TCP; 504 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 505 } 506#ifdef TCPDEBUG 507 if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) 508 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); 509#endif 510#ifdef INET6 511 if (isipv6) 512 (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); 513 else 514#endif /* INET6 */ 515 (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); 516} 517 518/* 519 * Create a new TCP control block, making an 520 * empty reassembly queue and hooking it to the argument 521 * protocol control block. The `inp' parameter must have 522 * come from the zone allocator set up in tcp_init(). 523 */ 524struct tcpcb * 525tcp_newtcpcb(inp) 526 struct inpcb *inp; 527{ 528 struct tcpcb_mem *tm; 529 struct tcpcb *tp; 530#ifdef INET6 531 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 532#endif /* INET6 */ 533 534 tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); 535 if (tm == NULL) 536 return (NULL); 537 tp = &tm->tcb; 538 /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ 539 tp->t_maxseg = tp->t_maxopd = 540#ifdef INET6 541 isipv6 ? tcp_v6mssdflt : 542#endif /* INET6 */ 543 tcp_mssdflt; 544 545 /* Set up our timeouts. */ 546 callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0); 547 callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0); 548 callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0); 549 callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0); 550 callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0); 551 552 if (tcp_do_rfc1323) 553 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); 554 if (tcp_do_rfc1644) 555 tp->t_flags |= TF_REQ_CC; 556 tp->t_inpcb = inp; /* XXX */ 557 /* 558 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 559 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives 560 * reasonable initial retransmit time. 561 */ 562 tp->t_srtt = TCPTV_SRTTBASE; 563 tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; 564 tp->t_rttmin = tcp_rexmit_min; 565 tp->t_rxtcur = TCPTV_RTOBASE; 566 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 567 tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 568 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 569 tp->t_rcvtime = ticks; 570 tp->t_bw_rtttime = ticks; 571 /* 572 * IPv4 TTL initialization is necessary for an IPv6 socket as well, 573 * because the socket may be bound to an IPv6 wildcard address, 574 * which may match an IPv4-mapped IPv6 address. 575 */ 576 inp->inp_ip_ttl = ip_defttl; 577 inp->inp_ppcb = (caddr_t)tp; 578 return (tp); /* XXX */ 579} 580 581/* 582 * Drop a TCP connection, reporting 583 * the specified error. If connection is synchronized, 584 * then send a RST to peer. 585 */ 586struct tcpcb * 587tcp_drop(tp, errno) 588 register struct tcpcb *tp; 589 int errno; 590{ 591 struct socket *so = tp->t_inpcb->inp_socket; 592 593 if (TCPS_HAVERCVDSYN(tp->t_state)) { 594 tp->t_state = TCPS_CLOSED; 595 (void) tcp_output(tp); 596 tcpstat.tcps_drops++; 597 } else 598 tcpstat.tcps_conndrops++; 599 if (errno == ETIMEDOUT && tp->t_softerror) 600 errno = tp->t_softerror; 601 so->so_error = errno; 602 return (tcp_close(tp)); 603} 604 605static void 606tcp_discardcb(tp) 607 struct tcpcb *tp; 608{ 609 struct tseg_qent *q; 610 struct inpcb *inp = tp->t_inpcb; 611 struct socket *so = inp->inp_socket; 612#ifdef INET6 613 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 614#endif /* INET6 */ 615 616 /* 617 * Make sure that all of our timers are stopped before we 618 * delete the PCB. 619 */ 620 callout_stop(tp->tt_rexmt); 621 callout_stop(tp->tt_persist); 622 callout_stop(tp->tt_keep); 623 callout_stop(tp->tt_2msl); 624 callout_stop(tp->tt_delack); 625 626 /* 627 * If we got enough samples through the srtt filter, 628 * save the rtt and rttvar in the routing entry. 629 * 'Enough' is arbitrarily defined as 4 rtt samples. 630 * 4 samples is enough for the srtt filter to converge 631 * to within enough % of the correct value; fewer samples 632 * and we could save a bogus rtt. The danger is not high 633 * as tcp quickly recovers from everything. 634 * XXX: Works very well but needs some more statistics! 635 */ 636 if (tp->t_rttupdated >= 4) { 637 struct hc_metrics_lite metrics; 638 u_long ssthresh; 639 640 bzero(&metrics, sizeof(metrics)); 641 /* 642 * Update the ssthresh always when the conditions below 643 * are satisfied. This gives us better new start value 644 * for the congestion avoidance for new connections. 645 * ssthresh is only set if packet loss occured on a session. 646 */ 647 ssthresh = tp->snd_ssthresh; 648 if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { 649 /* 650 * convert the limit from user data bytes to 651 * packets then to packet data bytes. 652 */ 653 ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; 654 if (ssthresh < 2) 655 ssthresh = 2; 656 ssthresh *= (u_long)(tp->t_maxseg + 657#ifdef INET6 658 (isipv6 ? sizeof (struct ip6_hdr) + 659 sizeof (struct tcphdr) : 660#endif 661 sizeof (struct tcpiphdr) 662#ifdef INET6 663 ) 664#endif 665 ); 666 } else 667 ssthresh = 0; 668 metrics.rmx_ssthresh = ssthresh; 669 670 metrics.rmx_rtt = tp->t_srtt; 671 metrics.rmx_rttvar = tp->t_rttvar; 672 /* XXX: This wraps if the pipe is more than 4 Gbit per second */ 673 metrics.rmx_bandwidth = tp->snd_bandwidth; 674 metrics.rmx_cwnd = tp->snd_cwnd; 675 metrics.rmx_sendpipe = 0; 676 metrics.rmx_recvpipe = 0; 677 678 tcp_hc_update(&inp->inp_inc, &metrics); 679 } 680 681 /* free the reassembly queue, if any */ 682 while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { 683 LIST_REMOVE(q, tqe_q); 684 m_freem(q->tqe_m); 685 FREE(q, M_TSEGQ); 686 } 687 inp->inp_ppcb = NULL; 688 tp->t_inpcb = NULL; 689 uma_zfree(tcpcb_zone, tp); 690 soisdisconnected(so); 691} 692 693/* 694 * Close a TCP control block: 695 * discard all space held by the tcp 696 * discard internet protocol block 697 * wake up any sleepers 698 */ 699struct tcpcb * 700tcp_close(tp) 701 struct tcpcb *tp; 702{ 703 struct inpcb *inp = tp->t_inpcb; 704#ifdef INET6 705 struct socket *so = inp->inp_socket; 706#endif 707 708 tcp_discardcb(tp); 709#ifdef INET6 710 if (INP_CHECK_SOCKAF(so, AF_INET6)) 711 in6_pcbdetach(inp); 712 else 713#endif 714 in_pcbdetach(inp); 715 tcpstat.tcps_closed++; 716 return ((struct tcpcb *)0); 717} 718 719void 720tcp_drain() 721{ 722 if (do_tcpdrain) 723 { 724 struct inpcb *inpb; 725 struct tcpcb *tcpb; 726 struct tseg_qent *te; 727 728 /* 729 * Walk the tcpbs, if existing, and flush the reassembly queue, 730 * if there is one... 731 * XXX: The "Net/3" implementation doesn't imply that the TCP 732 * reassembly queue should be flushed, but in a situation 733 * where we're really low on mbufs, this is potentially 734 * usefull. 735 */ 736 INP_INFO_RLOCK(&tcbinfo); 737 LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { 738 if (inpb->inp_vflag & INP_TIMEWAIT) 739 continue; 740 INP_LOCK(inpb); 741 if ((tcpb = intotcpcb(inpb))) { 742 while ((te = LIST_FIRST(&tcpb->t_segq)) 743 != NULL) { 744 LIST_REMOVE(te, tqe_q); 745 m_freem(te->tqe_m); 746 FREE(te, M_TSEGQ); 747 } 748 } 749 INP_UNLOCK(inpb); 750 } 751 INP_INFO_RUNLOCK(&tcbinfo); 752 } 753} 754 755/* 756 * Notify a tcp user of an asynchronous error; 757 * store error as soft error, but wake up user 758 * (for now, won't do anything until can select for soft error). 759 * 760 * Do not wake up user since there currently is no mechanism for 761 * reporting soft errors (yet - a kqueue filter may be added). 762 */ 763static struct inpcb * 764tcp_notify(inp, error) 765 struct inpcb *inp; 766 int error; 767{ 768 struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; 769 770 /* 771 * Ignore some errors if we are hooked up. 772 * If connection hasn't completed, has retransmitted several times, 773 * and receives a second error, give up now. This is better 774 * than waiting a long time to establish a connection that 775 * can never complete. 776 */ 777 if (tp->t_state == TCPS_ESTABLISHED && 778 (error == EHOSTUNREACH || error == ENETUNREACH || 779 error == EHOSTDOWN)) { 780 return inp; 781 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && 782 tp->t_softerror) { 783 tcp_drop(tp, error); 784 return (struct inpcb *)0; 785 } else { 786 tp->t_softerror = error; 787 return inp; 788 } 789#if 0 790 wakeup( &so->so_timeo); 791 sorwakeup(so); 792 sowwakeup(so); 793#endif 794} 795 796static int 797tcp_pcblist(SYSCTL_HANDLER_ARGS) 798{ 799 int error, i, n, s; 800 struct inpcb *inp, **inp_list; 801 inp_gen_t gencnt; 802 struct xinpgen xig; 803 804 /* 805 * The process of preparing the TCB list is too time-consuming and 806 * resource-intensive to repeat twice on every request. 807 */ 808 if (req->oldptr == 0) { 809 n = tcbinfo.ipi_count; 810 req->oldidx = 2 * (sizeof xig) 811 + (n + n/8) * sizeof(struct xtcpcb); 812 return 0; 813 } 814 815 if (req->newptr != 0) 816 return EPERM; 817 818 /* 819 * OK, now we're committed to doing something. 820 */ 821 s = splnet(); 822 INP_INFO_RLOCK(&tcbinfo); 823 gencnt = tcbinfo.ipi_gencnt; 824 n = tcbinfo.ipi_count; 825 INP_INFO_RUNLOCK(&tcbinfo); 826 splx(s); 827 828 sysctl_wire_old_buffer(req, 2 * (sizeof xig) 829 + n * sizeof(struct xtcpcb)); 830 831 xig.xig_len = sizeof xig; 832 xig.xig_count = n; 833 xig.xig_gen = gencnt; 834 xig.xig_sogen = so_gencnt; 835 error = SYSCTL_OUT(req, &xig, sizeof xig); 836 if (error) 837 return error; 838 839 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 840 if (inp_list == 0) 841 return ENOMEM; 842 843 s = splnet(); 844 INP_INFO_RLOCK(&tcbinfo); 845 for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; 846 inp = LIST_NEXT(inp, inp_list)) { 847 INP_LOCK(inp); 848 if (inp->inp_gencnt <= gencnt) { 849 /* 850 * XXX: This use of cr_cansee(), introduced with 851 * TCP state changes, is not quite right, but for 852 * now, better than nothing. 853 */ 854 if (inp->inp_vflag & INP_TIMEWAIT) 855 error = cr_cansee(req->td->td_ucred, 856 intotw(inp)->tw_cred); 857 else 858 error = cr_canseesocket(req->td->td_ucred, 859 inp->inp_socket); 860 if (error == 0) 861 inp_list[i++] = inp; 862 } 863 INP_UNLOCK(inp); 864 } 865 INP_INFO_RUNLOCK(&tcbinfo); 866 splx(s); 867 n = i; 868 869 error = 0; 870 for (i = 0; i < n; i++) { 871 inp = inp_list[i]; 872 if (inp->inp_gencnt <= gencnt) { 873 struct xtcpcb xt; 874 caddr_t inp_ppcb; 875 xt.xt_len = sizeof xt; 876 /* XXX should avoid extra copy */ 877 bcopy(inp, &xt.xt_inp, sizeof *inp); 878 inp_ppcb = inp->inp_ppcb; 879 if (inp_ppcb == NULL) 880 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 881 else if (inp->inp_vflag & INP_TIMEWAIT) { 882 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 883 xt.xt_tp.t_state = TCPS_TIME_WAIT; 884 } else 885 bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); 886 if (inp->inp_socket) 887 sotoxsocket(inp->inp_socket, &xt.xt_socket); 888 else { 889 bzero(&xt.xt_socket, sizeof xt.xt_socket); 890 xt.xt_socket.xso_protocol = IPPROTO_TCP; 891 } 892 xt.xt_inp.inp_gencnt = inp->inp_gencnt; 893 error = SYSCTL_OUT(req, &xt, sizeof xt); 894 } 895 } 896 if (!error) { 897 /* 898 * Give the user an updated idea of our state. 899 * If the generation differs from what we told 900 * her before, she knows that something happened 901 * while we were processing this request, and it 902 * might be necessary to retry. 903 */ 904 s = splnet(); 905 INP_INFO_RLOCK(&tcbinfo); 906 xig.xig_gen = tcbinfo.ipi_gencnt; 907 xig.xig_sogen = so_gencnt; 908 xig.xig_count = tcbinfo.ipi_count; 909 INP_INFO_RUNLOCK(&tcbinfo); 910 splx(s); 911 error = SYSCTL_OUT(req, &xig, sizeof xig); 912 } 913 free(inp_list, M_TEMP); 914 return error; 915} 916 917SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, 918 tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); 919 920static int 921tcp_getcred(SYSCTL_HANDLER_ARGS) 922{ 923 struct xucred xuc; 924 struct sockaddr_in addrs[2]; 925 struct inpcb *inp; 926 int error, s; 927 928 error = suser_cred(req->td->td_ucred, PRISON_ROOT); 929 if (error) 930 return (error); 931 error = SYSCTL_IN(req, addrs, sizeof(addrs)); 932 if (error) 933 return (error); 934 s = splnet(); 935 INP_INFO_RLOCK(&tcbinfo); 936 inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, 937 addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); 938 if (inp == NULL) { 939 error = ENOENT; 940 goto outunlocked; 941 } 942 INP_LOCK(inp); 943 if (inp->inp_socket == NULL) { 944 error = ENOENT; 945 goto out; 946 } 947 error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); 948 if (error) 949 goto out; 950 cru2x(inp->inp_socket->so_cred, &xuc); 951out: 952 INP_UNLOCK(inp); 953outunlocked: 954 INP_INFO_RUNLOCK(&tcbinfo); 955 splx(s); 956 if (error == 0) 957 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 958 return (error); 959} 960 961SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, 962 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, 963 tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); 964 965#ifdef INET6 966static int 967tcp6_getcred(SYSCTL_HANDLER_ARGS) 968{ 969 struct xucred xuc; 970 struct sockaddr_in6 addrs[2]; 971 struct inpcb *inp; 972 int error, s, mapped = 0; 973 974 error = suser_cred(req->td->td_ucred, PRISON_ROOT); 975 if (error) 976 return (error); 977 error = SYSCTL_IN(req, addrs, sizeof(addrs)); 978 if (error) 979 return (error); 980 if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { 981 if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) 982 mapped = 1; 983 else 984 return (EINVAL); 985 } 986 s = splnet(); 987 INP_INFO_RLOCK(&tcbinfo); 988 if (mapped == 1) 989 inp = in_pcblookup_hash(&tcbinfo, 990 *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], 991 addrs[1].sin6_port, 992 *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], 993 addrs[0].sin6_port, 994 0, NULL); 995 else 996 inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, 997 addrs[1].sin6_port, 998 &addrs[0].sin6_addr, addrs[0].sin6_port, 999 0, NULL); 1000 if (inp == NULL) { 1001 error = ENOENT; 1002 goto outunlocked; 1003 } 1004 INP_LOCK(inp); 1005 if (inp->inp_socket == NULL) { 1006 error = ENOENT; 1007 goto out; 1008 } 1009 error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); 1010 if (error) 1011 goto out; 1012 cru2x(inp->inp_socket->so_cred, &xuc); 1013out: 1014 INP_UNLOCK(inp); 1015outunlocked: 1016 INP_INFO_RUNLOCK(&tcbinfo); 1017 splx(s); 1018 if (error == 0) 1019 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 1020 return (error); 1021} 1022 1023SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, 1024 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, 1025 tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); 1026#endif 1027 1028 1029void 1030tcp_ctlinput(cmd, sa, vip) 1031 int cmd; 1032 struct sockaddr *sa; 1033 void *vip; 1034{ 1035 struct ip *ip = vip; 1036 struct tcphdr *th; 1037 struct in_addr faddr; 1038 struct inpcb *inp; 1039 struct tcpcb *tp; 1040 struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; 1041 tcp_seq icmp_seq; 1042 int s; 1043 1044 faddr = ((struct sockaddr_in *)sa)->sin_addr; 1045 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1046 return; 1047 1048 if (cmd == PRC_QUENCH) 1049 notify = tcp_quench; 1050 else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || 1051 cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) 1052 notify = tcp_drop_syn_sent; 1053 else if (cmd == PRC_MSGSIZE) 1054 notify = tcp_mtudisc; 1055 /* 1056 * Redirects don't need to be handled up here. 1057 */ 1058 else if (PRC_IS_REDIRECT(cmd)) 1059 return; 1060 /* 1061 * Hostdead is ugly because it goes linearly through all PCBs. 1062 * XXX: We never get this from ICMP, otherwise it makes an 1063 * excellent DoS attack on machines with many connections. 1064 */ 1065 else if (cmd == PRC_HOSTDEAD) 1066 ip = 0; 1067 else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) 1068 return; 1069 if (ip) { 1070 s = splnet(); 1071 th = (struct tcphdr *)((caddr_t)ip 1072 + (ip->ip_hl << 2)); 1073 INP_INFO_WLOCK(&tcbinfo); 1074 inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, 1075 ip->ip_src, th->th_sport, 0, NULL); 1076 if (inp != NULL) { 1077 INP_LOCK(inp); 1078 if (inp->inp_socket != NULL) { 1079 icmp_seq = htonl(th->th_seq); 1080 tp = intotcpcb(inp); 1081 if (SEQ_GEQ(icmp_seq, tp->snd_una) && 1082 SEQ_LT(icmp_seq, tp->snd_max)) 1083 inp = (*notify)(inp, inetctlerrmap[cmd]); 1084 } 1085 if (inp) 1086 INP_UNLOCK(inp); 1087 } else { 1088 struct in_conninfo inc; 1089 1090 inc.inc_fport = th->th_dport; 1091 inc.inc_lport = th->th_sport; 1092 inc.inc_faddr = faddr; 1093 inc.inc_laddr = ip->ip_src; 1094#ifdef INET6 1095 inc.inc_isipv6 = 0; 1096#endif 1097 syncache_unreach(&inc, th); 1098 } 1099 INP_INFO_WUNLOCK(&tcbinfo); 1100 splx(s); 1101 } else 1102 in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); 1103} 1104 1105#ifdef INET6 1106void 1107tcp6_ctlinput(cmd, sa, d) 1108 int cmd; 1109 struct sockaddr *sa; 1110 void *d; 1111{ 1112 struct tcphdr th; 1113 struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; 1114 struct ip6_hdr *ip6; 1115 struct mbuf *m; 1116 struct ip6ctlparam *ip6cp = NULL; 1117 const struct sockaddr_in6 *sa6_src = NULL; 1118 int off; 1119 struct tcp_portonly { 1120 u_int16_t th_sport; 1121 u_int16_t th_dport; 1122 } *thp; 1123 1124 if (sa->sa_family != AF_INET6 || 1125 sa->sa_len != sizeof(struct sockaddr_in6)) 1126 return; 1127 1128 if (cmd == PRC_QUENCH) 1129 notify = tcp_quench; 1130 else if (cmd == PRC_MSGSIZE) 1131 notify = tcp_mtudisc; 1132 else if (!PRC_IS_REDIRECT(cmd) && 1133 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) 1134 return; 1135 1136 /* if the parameter is from icmp6, decode it. */ 1137 if (d != NULL) { 1138 ip6cp = (struct ip6ctlparam *)d; 1139 m = ip6cp->ip6c_m; 1140 ip6 = ip6cp->ip6c_ip6; 1141 off = ip6cp->ip6c_off; 1142 sa6_src = ip6cp->ip6c_src; 1143 } else { 1144 m = NULL; 1145 ip6 = NULL; 1146 off = 0; /* fool gcc */ 1147 sa6_src = &sa6_any; 1148 } 1149 1150 if (ip6) { 1151 struct in_conninfo inc; 1152 /* 1153 * XXX: We assume that when IPV6 is non NULL, 1154 * M and OFF are valid. 1155 */ 1156 1157 /* check if we can safely examine src and dst ports */ 1158 if (m->m_pkthdr.len < off + sizeof(*thp)) 1159 return; 1160 1161 bzero(&th, sizeof(th)); 1162 m_copydata(m, off, sizeof(*thp), (caddr_t)&th); 1163 1164 in6_pcbnotify(&tcb, sa, th.th_dport, 1165 (struct sockaddr *)ip6cp->ip6c_src, 1166 th.th_sport, cmd, notify); 1167 1168 inc.inc_fport = th.th_dport; 1169 inc.inc_lport = th.th_sport; 1170 inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; 1171 inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; 1172 inc.inc_isipv6 = 1; 1173 syncache_unreach(&inc, &th); 1174 } else 1175 in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src, 1176 0, cmd, notify); 1177} 1178#endif /* INET6 */ 1179 1180 1181/* 1182 * Following is where TCP initial sequence number generation occurs. 1183 * 1184 * There are two places where we must use initial sequence numbers: 1185 * 1. In SYN-ACK packets. 1186 * 2. In SYN packets. 1187 * 1188 * All ISNs for SYN-ACK packets are generated by the syncache. See 1189 * tcp_syncache.c for details. 1190 * 1191 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling 1192 * depends on this property. In addition, these ISNs should be 1193 * unguessable so as to prevent connection hijacking. To satisfy 1194 * the requirements of this situation, the algorithm outlined in 1195 * RFC 1948 is used to generate sequence numbers. 1196 * 1197 * Implementation details: 1198 * 1199 * Time is based off the system timer, and is corrected so that it 1200 * increases by one megabyte per second. This allows for proper 1201 * recycling on high speed LANs while still leaving over an hour 1202 * before rollover. 1203 * 1204 * net.inet.tcp.isn_reseed_interval controls the number of seconds 1205 * between seeding of isn_secret. This is normally set to zero, 1206 * as reseeding should not be necessary. 1207 * 1208 */ 1209 1210#define ISN_BYTES_PER_SECOND 1048576 1211 1212u_char isn_secret[32]; 1213int isn_last_reseed; 1214MD5_CTX isn_ctx; 1215 1216tcp_seq 1217tcp_new_isn(tp) 1218 struct tcpcb *tp; 1219{ 1220 u_int32_t md5_buffer[4]; 1221 tcp_seq new_isn; 1222 1223 /* Seed if this is the first use, reseed if requested. */ 1224 if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && 1225 (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) 1226 < (u_int)ticks))) { 1227 read_random(&isn_secret, sizeof(isn_secret)); 1228 isn_last_reseed = ticks; 1229 } 1230 1231 /* Compute the md5 hash and return the ISN. */ 1232 MD5Init(&isn_ctx); 1233 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); 1234 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); 1235#ifdef INET6 1236 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { 1237 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, 1238 sizeof(struct in6_addr)); 1239 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, 1240 sizeof(struct in6_addr)); 1241 } else 1242#endif 1243 { 1244 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, 1245 sizeof(struct in_addr)); 1246 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, 1247 sizeof(struct in_addr)); 1248 } 1249 MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); 1250 MD5Final((u_char *) &md5_buffer, &isn_ctx); 1251 new_isn = (tcp_seq) md5_buffer[0]; 1252 new_isn += ticks * (ISN_BYTES_PER_SECOND / hz); 1253 return new_isn; 1254} 1255 1256/* 1257 * When a source quench is received, close congestion window 1258 * to one segment. We will gradually open it again as we proceed. 1259 */ 1260struct inpcb * 1261tcp_quench(inp, errno) 1262 struct inpcb *inp; 1263 int errno; 1264{ 1265 struct tcpcb *tp = intotcpcb(inp); 1266 1267 if (tp) 1268 tp->snd_cwnd = tp->t_maxseg; 1269 return (inp); 1270} 1271 1272/* 1273 * When a specific ICMP unreachable message is received and the 1274 * connection state is SYN-SENT, drop the connection. This behavior 1275 * is controlled by the icmp_may_rst sysctl. 1276 */ 1277struct inpcb * 1278tcp_drop_syn_sent(inp, errno) 1279 struct inpcb *inp; 1280 int errno; 1281{ 1282 struct tcpcb *tp = intotcpcb(inp); 1283 1284 if (tp && tp->t_state == TCPS_SYN_SENT) { 1285 tcp_drop(tp, errno); 1286 return (struct inpcb *)0; 1287 } 1288 return inp; 1289} 1290 1291/* 1292 * When `need fragmentation' ICMP is received, update our idea of the MSS 1293 * based on the new value in the route. Also nudge TCP to send something, 1294 * since we know the packet we just sent was dropped. 1295 * This duplicates some code in the tcp_mss() function in tcp_input.c. 1296 */ 1297struct inpcb * 1298tcp_mtudisc(inp, errno) 1299 struct inpcb *inp; 1300 int errno; 1301{ 1302 struct tcpcb *tp = intotcpcb(inp); 1303 struct rmxp_tao tao; 1304 struct socket *so = inp->inp_socket; 1305 u_int maxmtu; 1306 u_int romtu; 1307 int mss; 1308#ifdef INET6 1309 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 1310#endif /* INET6 */ 1311 bzero(&tao, sizeof(tao)); 1312 1313 if (tp) { 1314 maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */ 1315 romtu = 1316#ifdef INET6 1317 isipv6 ? tcp_maxmtu6(&inp->inp_inc) : 1318#endif /* INET6 */ 1319 tcp_maxmtu(&inp->inp_inc); 1320 if (!maxmtu) 1321 maxmtu = romtu; 1322 else 1323 maxmtu = min(maxmtu, romtu); 1324 if (!maxmtu) { 1325 tp->t_maxopd = tp->t_maxseg = 1326#ifdef INET6 1327 isipv6 ? tcp_v6mssdflt : 1328#endif /* INET6 */ 1329 tcp_mssdflt; 1330 return inp; 1331 } 1332 mss = maxmtu - 1333#ifdef INET6 1334 (isipv6 ? 1335 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : 1336#endif /* INET6 */ 1337 sizeof(struct tcpiphdr) 1338#ifdef INET6 1339 ) 1340#endif /* INET6 */ 1341 ; 1342 1343 if (tcp_do_rfc1644) { 1344 tcp_hc_gettao(&inp->inp_inc, &tao); 1345 if (tao.tao_mssopt) 1346 mss = min(mss, tao.tao_mssopt); 1347 } 1348 /* 1349 * XXX - The above conditional probably violates the TCP 1350 * spec. The problem is that, since we don't know the 1351 * other end's MSS, we are supposed to use a conservative 1352 * default. But, if we do that, then MTU discovery will 1353 * never actually take place, because the conservative 1354 * default is much less than the MTUs typically seen 1355 * on the Internet today. For the moment, we'll sweep 1356 * this under the carpet. 1357 * 1358 * The conservative default might not actually be a problem 1359 * if the only case this occurs is when sending an initial 1360 * SYN with options and data to a host we've never talked 1361 * to before. Then, they will reply with an MSS value which 1362 * will get recorded and the new parameters should get 1363 * recomputed. For Further Study. 1364 */ 1365 if (tp->t_maxopd <= mss) 1366 return inp; 1367 tp->t_maxopd = mss; 1368 1369 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 1370 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 1371 mss -= TCPOLEN_TSTAMP_APPA; 1372 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && 1373 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) 1374 mss -= TCPOLEN_CC_APPA; 1375#if (MCLBYTES & (MCLBYTES - 1)) == 0 1376 if (mss > MCLBYTES) 1377 mss &= ~(MCLBYTES-1); 1378#else 1379 if (mss > MCLBYTES) 1380 mss = mss / MCLBYTES * MCLBYTES; 1381#endif 1382 if (so->so_snd.sb_hiwat < mss) 1383 mss = so->so_snd.sb_hiwat; 1384 1385 tp->t_maxseg = mss; 1386 1387 tcpstat.tcps_mturesent++; 1388 tp->t_rtttime = 0; 1389 tp->snd_nxt = tp->snd_una; 1390 tcp_output(tp); 1391 } 1392 return inp; 1393} 1394 1395/* 1396 * Look-up the routing entry to the peer of this inpcb. If no route 1397 * is found and it cannot be allocated, then return NULL. This routine 1398 * is called by TCP routines that access the rmx structure and by tcp_mss 1399 * to get the interface MTU. 1400 */ 1401u_long 1402tcp_maxmtu(inc) 1403 struct in_conninfo *inc; 1404{ 1405 struct route sro; 1406 struct sockaddr_in *dst; 1407 struct ifnet *ifp; 1408 u_long maxmtu = 0; 1409 1410 KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); 1411 1412 sro.ro_rt = NULL; 1413 if (inc->inc_faddr.s_addr != INADDR_ANY) { 1414 dst = (struct sockaddr_in *)&sro.ro_dst; 1415 dst->sin_family = AF_INET; 1416 dst->sin_len = sizeof(*dst); 1417 dst->sin_addr = inc->inc_faddr; 1418 rtalloc_ign(&sro, RTF_CLONING); 1419 } 1420 if (sro.ro_rt != NULL) { 1421 ifp = sro.ro_rt->rt_ifp; 1422 if (sro.ro_rt->rt_rmx.rmx_mtu == 0) 1423 maxmtu = ifp->if_mtu; 1424 else 1425 maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); 1426 RTFREE(sro.ro_rt); 1427 } 1428 return (maxmtu); 1429} 1430 1431#ifdef INET6 1432u_long 1433tcp_maxmtu6(inc) 1434 struct in_conninfo *inc; 1435{ 1436 struct route_in6 sro6; 1437 struct ifnet *ifp; 1438 u_long maxmtu = 0; 1439 1440 KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); 1441 1442 sro6.ro_rt = NULL; 1443 if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { 1444 sro6.ro_dst.sin6_family = AF_INET6; 1445 sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); 1446 sro6.ro_dst.sin6_addr = inc->inc6_faddr; 1447 rtalloc_ign((struct route *)&sro6, RTF_CLONING); 1448 } 1449 if (sro6.ro_rt != NULL) { 1450 ifp = sro6.ro_rt->rt_ifp; 1451 if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) 1452 maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); 1453 else 1454 maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, 1455 IN6_LINKMTU(sro6.ro_rt->rt_ifp)); 1456 RTFREE(sro6.ro_rt); 1457 } 1458 1459 return (maxmtu); 1460} 1461#endif /* INET6 */ 1462 1463#ifdef IPSEC 1464/* compute ESP/AH header size for TCP, including outer IP header. */ 1465size_t 1466ipsec_hdrsiz_tcp(tp) 1467 struct tcpcb *tp; 1468{ 1469 struct inpcb *inp; 1470 struct mbuf *m; 1471 size_t hdrsiz; 1472 struct ip *ip; 1473#ifdef INET6 1474 struct ip6_hdr *ip6; 1475#endif 1476 struct tcphdr *th; 1477 1478 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) 1479 return 0; 1480 MGETHDR(m, M_DONTWAIT, MT_DATA); 1481 if (!m) 1482 return 0; 1483 1484#ifdef INET6 1485 if ((inp->inp_vflag & INP_IPV6) != 0) { 1486 ip6 = mtod(m, struct ip6_hdr *); 1487 th = (struct tcphdr *)(ip6 + 1); 1488 m->m_pkthdr.len = m->m_len = 1489 sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 1490 tcpip_fillheaders(inp, ip6, th); 1491 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 1492 } else 1493#endif /* INET6 */ 1494 { 1495 ip = mtod(m, struct ip *); 1496 th = (struct tcphdr *)(ip + 1); 1497 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); 1498 tcpip_fillheaders(inp, ip, th); 1499 hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 1500 } 1501 1502 m_free(m); 1503 return hdrsiz; 1504} 1505#endif /*IPSEC*/ 1506 1507/* 1508 * Move a TCP connection into TIME_WAIT state. 1509 * tcbinfo is unlocked. 1510 * inp is locked, and is unlocked before returning. 1511 */ 1512void 1513tcp_twstart(tp) 1514 struct tcpcb *tp; 1515{ 1516 struct tcptw *tw; 1517 struct inpcb *inp; 1518 int tw_time, acknow; 1519 struct socket *so; 1520 1521 tw = uma_zalloc(tcptw_zone, M_NOWAIT); 1522 if (tw == NULL) { 1523 tw = tcp_timer_2msl_tw(1); 1524 if (tw == NULL) { 1525 tcp_close(tp); 1526 return; 1527 } 1528 } 1529 inp = tp->t_inpcb; 1530 tw->tw_inpcb = inp; 1531 1532 /* 1533 * Recover last window size sent. 1534 */ 1535 tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; 1536 1537 /* 1538 * Set t_recent if timestamps are used on the connection. 1539 */ 1540 if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == 1541 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) 1542 tw->t_recent = tp->ts_recent; 1543 else 1544 tw->t_recent = 0; 1545 1546 tw->snd_nxt = tp->snd_nxt; 1547 tw->rcv_nxt = tp->rcv_nxt; 1548 tw->iss = tp->iss; 1549 tw->irs = tp->irs; 1550 tw->cc_recv = tp->cc_recv; 1551 tw->cc_send = tp->cc_send; 1552 tw->t_starttime = tp->t_starttime; 1553 tw->tw_time = 0; 1554 1555/* XXX 1556 * If this code will 1557 * be used for fin-wait-2 state also, then we may need 1558 * a ts_recent from the last segment. 1559 */ 1560 /* Shorten TIME_WAIT [RFC-1644, p.28] */ 1561 if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) { 1562 tw_time = tp->t_rxtcur * TCPTV_TWTRUNC; 1563 /* For T/TCP client, force ACK now. */ 1564 acknow = 1; 1565 } else { 1566 tw_time = 2 * tcp_msl; 1567 acknow = tp->t_flags & TF_ACKNOW; 1568 } 1569 tcp_discardcb(tp); 1570 so = inp->inp_socket; 1571 so->so_pcb = NULL; 1572 tw->tw_cred = crhold(so->so_cred); 1573 tw->tw_so_options = so->so_options; 1574 if (acknow) 1575 tcp_twrespond(tw, so, NULL, TH_ACK); 1576 sotryfree(so); 1577 inp->inp_socket = NULL; 1578 inp->inp_ppcb = (caddr_t)tw; 1579 inp->inp_vflag |= INP_TIMEWAIT; 1580 tcp_timer_2msl_reset(tw, tw_time); 1581 INP_UNLOCK(inp); 1582} 1583 1584/* 1585 * The appromixate rate of ISN increase of Microsoft TCP stacks; 1586 * the actual rate is slightly higher due to the addition of 1587 * random positive increments. 1588 * 1589 * Most other new OSes use semi-randomized ISN values, so we 1590 * do not need to worry about them. 1591 */ 1592#define MS_ISN_BYTES_PER_SECOND 250000 1593 1594/* 1595 * Determine if the ISN we will generate has advanced beyond the last 1596 * sequence number used by the previous connection. If so, indicate 1597 * that it is safe to recycle this tw socket by returning 1. 1598 */ 1599int 1600tcp_twrecycleable(struct tcptw *tw) 1601{ 1602 tcp_seq new_iss = tw->iss; 1603 tcp_seq new_irs = tw->irs; 1604 1605 new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); 1606 new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); 1607 1608 if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt)) 1609 return 1; 1610 else 1611 return 0; 1612} 1613 1614struct tcptw * 1615tcp_twclose(struct tcptw *tw, int reuse) 1616{ 1617 struct inpcb *inp; 1618 1619 inp = tw->tw_inpcb; 1620 tw->tw_inpcb = NULL; 1621 tcp_timer_2msl_stop(tw); 1622 inp->inp_ppcb = NULL; 1623#ifdef INET6 1624 if (inp->inp_vflag & INP_IPV6PROTO) 1625 in6_pcbdetach(inp); 1626 else 1627#endif 1628 in_pcbdetach(inp); 1629 tcpstat.tcps_closed++; 1630 if (reuse) 1631 return (tw); 1632 uma_zfree(tcptw_zone, tw); 1633 return (NULL); 1634} 1635 1636/* 1637 * One of so and msrc must be non-NULL for use by the MAC Framework to 1638 * construct a label for ay resulting packet. 1639 */ 1640int 1641tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc, 1642 int flags) 1643{ 1644 struct inpcb *inp = tw->tw_inpcb; 1645 struct tcphdr *th; 1646 struct mbuf *m; 1647 struct ip *ip = NULL; 1648 u_int8_t *optp; 1649 u_int hdrlen, optlen; 1650 int error; 1651#ifdef INET6 1652 struct ip6_hdr *ip6 = NULL; 1653 int isipv6 = inp->inp_inc.inc_isipv6; 1654#endif 1655 1656 KASSERT(so != NULL || msrc != NULL, 1657 ("tcp_twrespond: so and msrc NULL")); 1658 1659 m = m_gethdr(M_DONTWAIT, MT_HEADER); 1660 if (m == NULL) 1661 return (ENOBUFS); 1662 m->m_data += max_linkhdr; 1663 1664#ifdef MAC 1665 if (so != NULL) 1666 mac_create_mbuf_from_socket(so, m); 1667 else 1668 mac_create_mbuf_netlayer(msrc, m); 1669#endif 1670 1671#ifdef INET6 1672 if (isipv6) { 1673 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 1674 ip6 = mtod(m, struct ip6_hdr *); 1675 th = (struct tcphdr *)(ip6 + 1); 1676 tcpip_fillheaders(inp, ip6, th); 1677 } else 1678#endif 1679 { 1680 hdrlen = sizeof(struct tcpiphdr); 1681 ip = mtod(m, struct ip *); 1682 th = (struct tcphdr *)(ip + 1); 1683 tcpip_fillheaders(inp, ip, th); 1684 } 1685 optp = (u_int8_t *)(th + 1); 1686 1687 /* 1688 * Send a timestamp and echo-reply if both our side and our peer 1689 * have sent timestamps in our SYN's and this is not a RST. 1690 */ 1691 if (tw->t_recent && flags == TH_ACK) { 1692 u_int32_t *lp = (u_int32_t *)optp; 1693 1694 /* Form timestamp option as shown in appendix A of RFC 1323. */ 1695 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 1696 *lp++ = htonl(ticks); 1697 *lp = htonl(tw->t_recent); 1698 optp += TCPOLEN_TSTAMP_APPA; 1699 } 1700 1701 /* 1702 * Send `CC-family' options if needed, and it's not a RST. 1703 */ 1704 if (tw->cc_recv != 0 && flags == TH_ACK) { 1705 u_int32_t *lp = (u_int32_t *)optp; 1706 1707 *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC)); 1708 *lp = htonl(tw->cc_send); 1709 optp += TCPOLEN_CC_APPA; 1710 } 1711 optlen = optp - (u_int8_t *)(th + 1); 1712 1713 m->m_len = hdrlen + optlen; 1714 m->m_pkthdr.len = m->m_len; 1715 1716 KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); 1717 1718 th->th_seq = htonl(tw->snd_nxt); 1719 th->th_ack = htonl(tw->rcv_nxt); 1720 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 1721 th->th_flags = flags; 1722 th->th_win = htons(tw->last_win); 1723 1724#ifdef INET6 1725 if (isipv6) { 1726 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 1727 sizeof(struct tcphdr) + optlen); 1728 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 1729 error = ip6_output(m, inp->in6p_outputopts, NULL, 1730 (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); 1731 } else 1732#endif 1733 { 1734 th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 1735 htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); 1736 m->m_pkthdr.csum_flags = CSUM_TCP; 1737 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 1738 ip->ip_len = m->m_pkthdr.len; 1739 error = ip_output(m, inp->inp_options, NULL, 1740 (tw->tw_so_options & SO_DONTROUTE), NULL, inp); 1741 } 1742 if (flags & TH_ACK) 1743 tcpstat.tcps_sndacks++; 1744 else 1745 tcpstat.tcps_sndctrl++; 1746 tcpstat.tcps_sndtotal++; 1747 return (error); 1748} 1749 1750/* 1751 * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING 1752 * 1753 * This code attempts to calculate the bandwidth-delay product as a 1754 * means of determining the optimal window size to maximize bandwidth, 1755 * minimize RTT, and avoid the over-allocation of buffers on interfaces and 1756 * routers. This code also does a fairly good job keeping RTTs in check 1757 * across slow links like modems. We implement an algorithm which is very 1758 * similar (but not meant to be) TCP/Vegas. The code operates on the 1759 * transmitter side of a TCP connection and so only effects the transmit 1760 * side of the connection. 1761 * 1762 * BACKGROUND: TCP makes no provision for the management of buffer space 1763 * at the end points or at the intermediate routers and switches. A TCP 1764 * stream, whether using NewReno or not, will eventually buffer as 1765 * many packets as it is able and the only reason this typically works is 1766 * due to the fairly small default buffers made available for a connection 1767 * (typicaly 16K or 32K). As machines use larger windows and/or window 1768 * scaling it is now fairly easy for even a single TCP connection to blow-out 1769 * all available buffer space not only on the local interface, but on 1770 * intermediate routers and switches as well. NewReno makes a misguided 1771 * attempt to 'solve' this problem by waiting for an actual failure to occur, 1772 * then backing off, then steadily increasing the window again until another 1773 * failure occurs, ad-infinitum. This results in terrible oscillation that 1774 * is only made worse as network loads increase and the idea of intentionally 1775 * blowing out network buffers is, frankly, a terrible way to manage network 1776 * resources. 1777 * 1778 * It is far better to limit the transmit window prior to the failure 1779 * condition being achieved. There are two general ways to do this: First 1780 * you can 'scan' through different transmit window sizes and locate the 1781 * point where the RTT stops increasing, indicating that you have filled the 1782 * pipe, then scan backwards until you note that RTT stops decreasing, then 1783 * repeat ad-infinitum. This method works in principle but has severe 1784 * implementation issues due to RTT variances, timer granularity, and 1785 * instability in the algorithm which can lead to many false positives and 1786 * create oscillations as well as interact badly with other TCP streams 1787 * implementing the same algorithm. 1788 * 1789 * The second method is to limit the window to the bandwidth delay product 1790 * of the link. This is the method we implement. RTT variances and our 1791 * own manipulation of the congestion window, bwnd, can potentially 1792 * destabilize the algorithm. For this reason we have to stabilize the 1793 * elements used to calculate the window. We do this by using the minimum 1794 * observed RTT, the long term average of the observed bandwidth, and 1795 * by adding two segments worth of slop. It isn't perfect but it is able 1796 * to react to changing conditions and gives us a very stable basis on 1797 * which to extend the algorithm. 1798 */ 1799void 1800tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) 1801{ 1802 u_long bw; 1803 u_long bwnd; 1804 int save_ticks; 1805 1806 /* 1807 * If inflight_enable is disabled in the middle of a tcp connection, 1808 * make sure snd_bwnd is effectively disabled. 1809 */ 1810 if (tcp_inflight_enable == 0) { 1811 tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 1812 tp->snd_bandwidth = 0; 1813 return; 1814 } 1815 1816 /* 1817 * Figure out the bandwidth. Due to the tick granularity this 1818 * is a very rough number and it MUST be averaged over a fairly 1819 * long period of time. XXX we need to take into account a link 1820 * that is not using all available bandwidth, but for now our 1821 * slop will ramp us up if this case occurs and the bandwidth later 1822 * increases. 1823 * 1824 * Note: if ticks rollover 'bw' may wind up negative. We must 1825 * effectively reset t_bw_rtttime for this case. 1826 */ 1827 save_ticks = ticks; 1828 if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) 1829 return; 1830 1831 bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / 1832 (save_ticks - tp->t_bw_rtttime); 1833 tp->t_bw_rtttime = save_ticks; 1834 tp->t_bw_rtseq = ack_seq; 1835 if (tp->t_bw_rtttime == 0 || (int)bw < 0) 1836 return; 1837 bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; 1838 1839 tp->snd_bandwidth = bw; 1840 1841 /* 1842 * Calculate the semi-static bandwidth delay product, plus two maximal 1843 * segments. The additional slop puts us squarely in the sweet 1844 * spot and also handles the bandwidth run-up case and stabilization. 1845 * Without the slop we could be locking ourselves into a lower 1846 * bandwidth. 1847 * 1848 * Situations Handled: 1849 * (1) Prevents over-queueing of packets on LANs, especially on 1850 * high speed LANs, allowing larger TCP buffers to be 1851 * specified, and also does a good job preventing 1852 * over-queueing of packets over choke points like modems 1853 * (at least for the transmit side). 1854 * 1855 * (2) Is able to handle changing network loads (bandwidth 1856 * drops so bwnd drops, bandwidth increases so bwnd 1857 * increases). 1858 * 1859 * (3) Theoretically should stabilize in the face of multiple 1860 * connections implementing the same algorithm (this may need 1861 * a little work). 1862 * 1863 * (4) Stability value (defaults to 20 = 2 maximal packets) can 1864 * be adjusted with a sysctl but typically only needs to be 1865 * on very slow connections. A value no smaller then 5 1866 * should be used, but only reduce this default if you have 1867 * no other choice. 1868 */ 1869#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) 1870 bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10; 1871#undef USERTT 1872 1873 if (tcp_inflight_debug > 0) { 1874 static int ltime; 1875 if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { 1876 ltime = ticks; 1877 printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", 1878 tp, 1879 bw, 1880 tp->t_rttbest, 1881 tp->t_srtt, 1882 bwnd 1883 ); 1884 } 1885 } 1886 if ((long)bwnd < tcp_inflight_min) 1887 bwnd = tcp_inflight_min; 1888 if (bwnd > tcp_inflight_max) 1889 bwnd = tcp_inflight_max; 1890 if ((long)bwnd < tp->t_maxseg * 2) 1891 bwnd = tp->t_maxseg * 2; 1892 tp->snd_bwnd = bwnd; 1893} 1894 1895