tcp_timewait.c revision 124248
1/* 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 34 * $FreeBSD: head/sys/netinet/tcp_timewait.c 124248 2004-01-08 11:17:11Z andre $ 35 */ 36 37#include "opt_compat.h" 38#include "opt_inet6.h" 39#include "opt_ipsec.h" 40#include "opt_mac.h" 41#include "opt_tcpdebug.h" 42 43#include <sys/param.h> 44#include <sys/systm.h> 45#include <sys/callout.h> 46#include <sys/kernel.h> 47#include <sys/sysctl.h> 48#include <sys/mac.h> 49#include <sys/malloc.h> 50#include <sys/mbuf.h> 51#ifdef INET6 52#include <sys/domain.h> 53#endif 54#include <sys/proc.h> 55#include <sys/socket.h> 56#include <sys/socketvar.h> 57#include <sys/protosw.h> 58#include <sys/random.h> 59 60#include <vm/uma.h> 61 62#include <net/route.h> 63#include <net/if.h> 64 65#include <netinet/in.h> 66#include <netinet/in_systm.h> 67#include <netinet/ip.h> 68#ifdef INET6 69#include <netinet/ip6.h> 70#endif 71#include <netinet/in_pcb.h> 72#ifdef INET6 73#include <netinet6/in6_pcb.h> 74#endif 75#include <netinet/in_var.h> 76#include <netinet/ip_var.h> 77#ifdef INET6 78#include <netinet6/ip6_var.h> 79#include <netinet6/nd6.h> 80#endif 81#include <netinet/tcp.h> 82#include <netinet/tcp_fsm.h> 83#include <netinet/tcp_seq.h> 84#include <netinet/tcp_timer.h> 85#include <netinet/tcp_var.h> 86#ifdef INET6 87#include <netinet6/tcp6_var.h> 88#endif 89#include <netinet/tcpip.h> 90#ifdef TCPDEBUG 91#include <netinet/tcp_debug.h> 92#endif 93#include <netinet6/ip6protosw.h> 94 95#ifdef IPSEC 96#include <netinet6/ipsec.h> 97#ifdef INET6 98#include <netinet6/ipsec6.h> 99#endif 100#endif /*IPSEC*/ 101 102#ifdef FAST_IPSEC 103#include <netipsec/ipsec.h> 104#ifdef INET6 105#include <netipsec/ipsec6.h> 106#endif 107#define IPSEC 108#endif /*FAST_IPSEC*/ 109 110#include <machine/in_cksum.h> 111#include <sys/md5.h> 112 113int tcp_mssdflt = TCP_MSS; 114SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, 115 &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); 116 117#ifdef INET6 118int tcp_v6mssdflt = TCP6_MSS; 
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
	CTLFLAG_RW, &tcp_v6mssdflt , 0,
	"Default TCP Maximum Segment Size for IPv6");
#endif

#if 0
static int	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
	&tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
#endif

/* RFC 1323 (window scaling + timestamps) negotiation on new connections. */
int	tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
	&tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");

/* RFC 1644 (T/TCP) CC option negotiation; off by default. */
int	tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
	&tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");

/*
 * Actual hash size in use; set once from the TCBHASHSIZE default or the
 * loader tunable in tcp_init(), hence read-only/tunable (CTLFLAG_RDTUN).
 */
static int	tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
	&tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

/* Gate for the reassembly-queue flush done by tcp_drain(). */
static int	do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
	"Enable tcp_drain routine for extra help when low on mbufs");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
	&tcbinfo.ipi_count, 0, "Number of active PCBs");

/* Consulted by tcp_ctlinput() before mapping ICMP errors to RSTs. */
static int	icmp_may_rst = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
	"Certain ICMP unreachable messages may abort connections in SYN_SENT");

/* 0 (the default) means the RFC 1948 ISN secret is never reseeded. */
static int	tcp_isn_reseed_interval = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
	&tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");

/*
 * TCP bandwidth limiting sysctls.  Note that the lower bound exists
 * only for debugging; the shipped default below is 6144.  A good
 * production default would be something like 6100.
 */
static int	tcp_inflight_enable = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
	&tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");

static int	tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
	&tcp_inflight_debug, 0, "Debug TCP inflight calculations");

static int	tcp_inflight_min = 6144;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
	&tcp_inflight_min, 0, "Lower-bound for TCP inflight window");

static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
	&tcp_inflight_max, 0, "Upper-bound for TCP inflight window");

/* Fixed-point: 20 == 2.0 packets of stabilization slop. */
static int	tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
	&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");

static struct inpcb *tcp_notify(struct inpcb *, int);
static void	tcp_discardcb(struct tcpcb *);

/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE	512
#endif

/*
 * XXX
 * Callouts should be moved into struct tcp directly.  They are currently
 * separate because the tcpcb structure is exported to userland for sysctl
 * parsing purposes, which do not know about callouts.
 */
struct tcpcb_mem {
	struct	tcpcb tcb;	/* must stay first: tp == &tm->tcb */
	struct	callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep;
	struct	callout tcpcb_mem_2msl, tcpcb_mem_delack;
};

static uma_zone_t tcpcb_zone;	/* tcpcb+callouts; type-stable (NOFREE) */
static uma_zone_t tcptw_zone;	/* compressed TIME_WAIT blocks */

/*
 * Tcp initialization: set up tunables, the global PCB list/hash and
 * the UMA zones backing inpcbs, tcpcbs and TIME_WAIT blocks, then
 * kick off the timer, syncache and hostcache subsystems.
 */
void
tcp_init()
{
	int hashsize = TCBHASHSIZE;

	tcp_ccgen = 1;

	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_msl = TCPTV_MSL;
	tcp_rexmit_min = TCPTV_MIN;
	tcp_rexmit_slop = TCPTV_CPU_VAR;

	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
	LIST_INIT(&tcb);
	tcbinfo.listhead = &tcb;
	/* Allow the loader tunable to override the compiled-in default. */
	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512; /* safe default */
	}
	tcp_tcbhashsize = hashsize;
	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
	    &tcbinfo.porthashmask);
	tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
	/* A TCP/IP header must fit in a single header mbuf. */
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR
	/*
	 * These have to be type stable for the benefit of the timers.
	 */
	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcpcb_zone, maxsockets);
	tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	/* TIME_WAIT blocks are small; cap them at a fifth of maxsockets. */
	uma_zone_set_max(tcptw_zone, maxsockets / 5);
	tcp_timer_init();
	syncache_init();
	tcp_hc_init();
}

/*
 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
 * tcp_template used to store this data in mbufs, but we now recopy it out
 * of the tcpcb each time to conserve mbufs.
 *
 * ip_ptr points at an IPv4 or IPv6 header (chosen by inp_vflag) and
 * tcp_ptr at the TCP header immediately following it; both are written
 * in place.  Length, id, sequence and checksum fields are zeroed here
 * and filled in later by the actual output path.
 */
void
tcpip_fillheaders(inp, ip_ptr, tcp_ptr)
	struct inpcb *inp;
	void *ip_ptr;
	void *tcp_ptr;
{
	struct tcphdr *th = (struct tcphdr *)tcp_ptr;

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		struct ip6_hdr *ip6;

		ip6 = (struct ip6_hdr *)ip_ptr;
		/* Preserve unrelated bits; splice in flowinfo and version. */
		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
			(IPV6_VERSION & IPV6_VERSION_MASK);
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = sizeof(struct tcphdr);
		ip6->ip6_src = inp->in6p_laddr;
		ip6->ip6_dst = inp->in6p_faddr;
	} else
#endif
	{
		struct ip *ip;

		ip = (struct ip *)ip_ptr;
		ip->ip_v = IPVERSION;
		ip->ip_hl = 5;		/* 20-byte header, no options */
		ip->ip_tos = inp->inp_ip_tos;
		ip->ip_len = 0;
		ip->ip_id = 0;
		ip->ip_off = 0;
		ip->ip_ttl = inp->inp_ip_ttl;
		ip->ip_sum = 0;
		ip->ip_p = IPPROTO_TCP;
		ip->ip_src = inp->inp_laddr;
		ip->ip_dst = inp->inp_faddr;
	}
	th->th_sport = inp->inp_lport;
	th->th_dport = inp->inp_fport;
	th->th_seq = 0;
	th->th_ack = 0;
	th->th_x2 = 0;
	th->th_off = 5;		/* 20-byte TCP header, no options */
	th->th_flags = 0;
	th->th_win = 0;
	th->th_urp = 0;
	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
}

/*
 * Create template to be used to send tcp packets on a connection.
 * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
 * use for this function is in keepalives, which use tcp_respond.
 *
 * NOTE: the returned tcptemp lives inside the allocated mbuf; the
 * caller owns (and must eventually free) that mbuf.  Returns NULL on
 * allocation failure.
 */
struct tcptemp *
tcpip_maketemplate(inp)
	struct inpcb *inp;
{
	struct mbuf *m;
	struct tcptemp *n;

	m = m_get(M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (0);
	m->m_len = sizeof(struct tcptemp);
	n = mtod(m, struct tcptemp *);

	tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
	return (n);
}

/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
	struct tcpcb *tp;
	void *ipgen;
	register struct tcphdr *th;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	register int tlen;
	int win = 0;
	struct ip *ip;
	struct tcphdr *nth;
#ifdef INET6
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;
	struct inpcb *inp = NULL;

	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));

#ifdef INET6
	/* Decide address family from the version nibble of the header. */
	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp) {
		inp = tp->t_inpcb;
		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
		INP_INFO_WLOCK_ASSERT(&tcbinfo);
		INP_LOCK_ASSERT(inp);
		/* Advertise receive space, scaled, except on RSTs. */
		if (!(flags & TH_RST)) {
			win = sbspace(&inp->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
	}
	if (m == 0) {
		/* Keepalive path: build a fresh packet from the template. */
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
		tlen = 0;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
	      {
		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
		ip = mtod(m, struct ip *);
		nth = (struct tcphdr *)(ip + 1);
	      }
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else {
		/*
		 * Reply path: reuse the received mbuf in place, swapping
		 * source/destination so it goes back where it came from.
		 */
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
	      {
		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
		nth = (struct tcphdr *)(ip + 1);
	      }
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
#ifdef INET6
	if (isipv6) {
		ip6->ip6_flow = 0;
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
						tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
      {
	tlen += sizeof (struct tcpiphdr);
	ip->ip_len = tlen;
	ip->ip_ttl = ip_defttl;
	if (path_mtu_discovery)
		ip->ip_off |= IP_DF;
      }
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *) 0;
#ifdef MAC
	if (inp != NULL) {
		/*
		 * Packet is associated with a socket, so allow the
		 * label of the response to reflect the socket label.
		 */
		mac_create_mbuf_from_socket(inp->inp_socket, m);
	} else {
		/*
		 * Packet is not associated with a socket, so possibly
		 * update the label in place.
		 */
		mac_reflect_mbuf_tcp(m);
	}
#endif
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	if (tp)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#ifdef INET6
	if (isipv6) {
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
					sizeof(struct ip6_hdr),
					tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);
	} else
#endif /* INET6 */
      {
	/* IPv4 checksum is offloaded: pseudo-header here, rest by hardware. */
	nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
	m->m_pkthdr.csum_flags = CSUM_TCP;
	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
      }
#ifdef TCPDEBUG
	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
	if (isipv6)
		(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
	else
#endif /* INET6 */
	(void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 *
 * Returns NULL if the tcpcb zone is exhausted.
 */
struct tcpcb *
tcp_newtcpcb(inp)
	struct inpcb *inp;
{
	struct tcpcb_mem *tm;
	struct tcpcb *tp;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
	if (tm == NULL)
		return (NULL);
	tp = &tm->tcb;
	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
	tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;

	/* Set up our timeouts; callout storage lives in the same zone item. */
	callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0);
	callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0);
	callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0);
	callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0);
	callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0);

	if (tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	if (tcp_do_rfc1644)
		tp->t_flags |= TF_REQ_CC;
	tp->t_inpcb = inp;	/* XXX */
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = tcp_rexmit_min;
	tp->t_rxtcur = TCPTV_RTOBASE;
	/* Start windows wide open; actual limits are learned later. */
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->t_rcvtime = ticks;
	tp->t_bw_rtttime = ticks;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = ip_defttl;
	inp->inp_ppcb = (caddr_t)tp;
	return (tp);		/* XXX */
}

/*
 * Drop a TCP connection, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 *
 * Always returns NULL (via tcp_close()), so callers can safely
 * clear their tcpcb pointer with the return value.
 */
struct tcpcb *
tcp_drop(tp, errno)
	register struct tcpcb *tp;
	int errno;
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVERCVDSYN(tp->t_state)) {
		tp->t_state = TCPS_CLOSED;
		(void) tcp_output(tp);
		tcpstat.tcps_drops++;
	} else
		tcpstat.tcps_conndrops++;
	/* Prefer a previously recorded soft error over a bare timeout. */
	if (errno == ETIMEDOUT && tp->t_softerror)
		errno = tp->t_softerror;
	so->so_error = errno;
	return (tcp_close(tp));
}

/*
 * Tear down a tcpcb: stop its timers, push learned path metrics into
 * the hostcache, drain the reassembly queue and release the tcpcb
 * back to its zone.  The inpcb itself is left for the caller.
 */
static void
tcp_discardcb(tp)
	struct tcpcb *tp;
{
	struct tseg_qent *q;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	/*
	 * Make sure that all of our timers are stopped before we
	 * delete the PCB.
	 */
	callout_stop(tp->tt_rexmt);
	callout_stop(tp->tt_persist);
	callout_stop(tp->tt_keep);
	callout_stop(tp->tt_2msl);
	callout_stop(tp->tt_delack);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as 4 rtt samples.
	 * 4 samples is enough for the srtt filter to converge
	 * to within enough % of the correct value; fewer samples
	 * and we could save a bogus rtt. The danger is not high
	 * as tcp quickly recovers from everything.
	 * XXX: Works very well but needs some more statistics!
	 */
	if (tp->t_rttupdated >= 4) {
		struct hc_metrics_lite metrics;
		u_long ssthresh;

		bzero(&metrics, sizeof(metrics));
		/*
		 * Update the ssthresh always when the conditions below
		 * are satisfied. This gives us better new start value
		 * for the congestion avoidance for new connections.
		 * ssthresh is only set if packet loss occured on a session.
		 */
		ssthresh = tp->snd_ssthresh;
		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
			if (ssthresh < 2)
				ssthresh = 2;
			ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
			    (isipv6 ? sizeof (struct ip6_hdr) +
				sizeof (struct tcphdr) :
#endif
				sizeof (struct tcpiphdr)
#ifdef INET6
			    )
#endif
			    );
		} else
			ssthresh = 0;
		metrics.rmx_ssthresh = ssthresh;

		metrics.rmx_rtt = tp->t_srtt;
		metrics.rmx_rttvar = tp->t_rttvar;
		/* XXX: This wraps if the pipe is more than 4 Gbit per second */
		metrics.rmx_bandwidth = tp->snd_bandwidth;
		metrics.rmx_cwnd = tp->snd_cwnd;
		metrics.rmx_sendpipe = 0;
		metrics.rmx_recvpipe = 0;

		tcp_hc_update(&inp->inp_inc, &metrics);
	}

	/* free the reassembly queue, if any */
	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		FREE(q, M_TSEGQ);
	}
	/* Unhook tcpcb and inpcb from each other before freeing the tcpcb. */
	inp->inp_ppcb = NULL;
	tp->t_inpcb = NULL;
	uma_zfree(tcpcb_zone, tp);
	soisdisconnected(so);
}

/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 *
 * Always returns NULL so callers can clear their tcpcb pointer.
 */
struct tcpcb *
tcp_close(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp = tp->t_inpcb;
#ifdef INET6
	/* so is only needed to pick the right pcbdetach below. */
	struct socket *so = inp->inp_socket;
#endif

	tcp_discardcb(tp);
#ifdef INET6
	if (INP_CHECK_SOCKAF(so, AF_INET6))
		in6_pcbdetach(inp);
	else
#endif
		in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	return ((struct tcpcb *)0);
}

/*
 * mbuf-pressure hook: flush every connection's reassembly queue to
 * give back mbufs.  Controlled by the net.inet.tcp.do_tcpdrain sysctl.
 */
void
tcp_drain()
{
	if (do_tcpdrain)
	{
		struct inpcb *inpb;
		struct tcpcb *tcpb;
		struct tseg_qent *te;

		/*
		 * Walk the tcpbs, if existing, and flush the reassembly queue,
		 * if there is one...
		 * XXX: The "Net/3" implementation doesn't imply that the TCP
		 *      reassembly queue should be flushed, but in a situation
		 * 	where we're really low on mbufs, this is potentially
		 *  	usefull.
		 */
		INP_INFO_RLOCK(&tcbinfo);
		LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
			/* TIME_WAIT inpcbs carry no tcpcb/reassembly queue. */
			if (inpb->inp_vflag & INP_TIMEWAIT)
				continue;
			INP_LOCK(inpb);
			if ((tcpb = intotcpcb(inpb))) {
				while ((te = LIST_FIRST(&tcpb->t_segq))
				    != NULL) {
					LIST_REMOVE(te, tqe_q);
					m_freem(te->tqe_m);
					FREE(te, M_TSEGQ);
				}
			}
			INP_UNLOCK(inpb);
		}
		INP_INFO_RUNLOCK(&tcbinfo);
	}
}

/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 *
 * Returns the inpcb (still locked) if it survives, or NULL if the
 * connection was dropped here.
 */
static struct inpcb *
tcp_notify(inp, error)
	struct inpcb *inp;
	int error;
{
	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now.  This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	     (error == EHOSTUNREACH || error == ENETUNREACH ||
	      error == EHOSTDOWN)) {
		return inp;
	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
	    tp->t_softerror) {
		tcp_drop(tp, error);
		return (struct inpcb *)0;
	} else {
		tp->t_softerror = error;
		return inp;
	}
	/* NB: unreachable; kept as a reminder of the intended wakeups. */
#if 0
	wakeup( &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
#endif
}

/*
 * Sysctl handler for net.inet.tcp.pcblist: export a snapshot of all
 * TCP inpcbs (as struct xtcpcb) bracketed by two xinpgen records so
 * userland can detect concurrent changes.
 */
static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, i, n, s;
	struct inpcb *inp, **inp_list;
	inp_gen_t gencnt;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == 0) {
		/* Size-probe only: report a generous estimate (+12.5% slop). */
		n = tcbinfo.ipi_count;
		req->oldidx = 2 * (sizeof xig)
			+ (n + n/8) * sizeof(struct xtcpcb);
		return 0;
	}

	if (req->newptr != 0)
		return EPERM;

	/*
	 * OK, now we're committed to doing something.
	 */
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	/* Snapshot the generation count to filter out later additions. */
	gencnt = tcbinfo.ipi_gencnt;
	n = tcbinfo.ipi_count;
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);

	sysctl_wire_old_buffer(req, 2 * (sizeof xig)
		+ n * sizeof(struct xtcpcb));

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return error;

	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	if (inp_list == 0)
		return ENOMEM;

	/* Pass 1: collect visible inpcbs under the info lock. */
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
	     inp = LIST_NEXT(inp, inp_list)) {
		INP_LOCK(inp);
		if (inp->inp_gencnt <= gencnt) {
			/*
			 * XXX: This use of cr_cansee(), introduced with
			 * TCP state changes, is not quite right, but for
			 * now, better than nothing.
			 */
			if (inp->inp_vflag & INP_TIMEWAIT)
				error = cr_cansee(req->td->td_ucred,
				    intotw(inp)->tw_cred);
			else
				error = cr_canseesocket(req->td->td_ucred,
				    inp->inp_socket);
			if (error == 0)
				inp_list[i++] = inp;
		}
		INP_UNLOCK(inp);
	}
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	n = i;

	/* Pass 2: copy each collected inpcb out to userland. */
	error = 0;
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		if (inp->inp_gencnt <= gencnt) {
			struct xtcpcb xt;
			caddr_t inp_ppcb;
			xt.xt_len = sizeof xt;
			/* XXX should avoid extra copy */
			bcopy(inp, &xt.xt_inp, sizeof *inp);
			inp_ppcb = inp->inp_ppcb;
			if (inp_ppcb == NULL)
				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
			else if (inp->inp_vflag & INP_TIMEWAIT) {
				/* TIME_WAIT uses a tcptw, not a tcpcb. */
				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
				xt.xt_tp.t_state = TCPS_TIME_WAIT;
			} else
				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
			if (inp->inp_socket)
				sotoxsocket(inp->inp_socket, &xt.xt_socket);
			else {
				bzero(&xt.xt_socket, sizeof xt.xt_socket);
				xt.xt_socket.xso_protocol = IPPROTO_TCP;
			}
			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
			error = SYSCTL_OUT(req, &xt, sizeof xt);
		}
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		s = splnet();
		INP_INFO_RLOCK(&tcbinfo);
		xig.xig_gen = tcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = tcbinfo.ipi_count;
		INP_INFO_RUNLOCK(&tcbinfo);
		splx(s);
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	free(inp_list, M_TEMP);
	return error;
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
	tcp_pcblist, "S,xtcpcb", "List of active TCP connections");

/*
 * Sysctl handler: given a {foreign, local} sockaddr_in pair, look up the
 * matching connection and return its owner's credentials.  Requires
 * superuser (within the prison) and visibility per cr_canseesocket().
 */
static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in addrs[2];
	struct inpcb *inp;
	int error, s;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	/* addrs[0] is the local end, addrs[1] the foreign end. */
	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");

#ifdef INET6
/*
 * IPv6 flavour of tcp_getcred(): accepts a pair of sockaddr_in6 and
 * transparently handles IPv4-mapped addresses by falling back to the
 * IPv4 PCB lookup.
 */
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in6 addrs[2];
	struct inpcb *inp;
	int error, s, mapped = 0;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
982 if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { 983 if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) 984 mapped = 1; 985 else 986 return (EINVAL); 987 } 988 s = splnet(); 989 INP_INFO_RLOCK(&tcbinfo); 990 if (mapped == 1) 991 inp = in_pcblookup_hash(&tcbinfo, 992 *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], 993 addrs[1].sin6_port, 994 *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], 995 addrs[0].sin6_port, 996 0, NULL); 997 else 998 inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, 999 addrs[1].sin6_port, 1000 &addrs[0].sin6_addr, addrs[0].sin6_port, 1001 0, NULL); 1002 if (inp == NULL) { 1003 error = ENOENT; 1004 goto outunlocked; 1005 } 1006 INP_LOCK(inp); 1007 if (inp->inp_socket == NULL) { 1008 error = ENOENT; 1009 goto out; 1010 } 1011 error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); 1012 if (error) 1013 goto out; 1014 cru2x(inp->inp_socket->so_cred, &xuc); 1015out: 1016 INP_UNLOCK(inp); 1017outunlocked: 1018 INP_INFO_RUNLOCK(&tcbinfo); 1019 splx(s); 1020 if (error == 0) 1021 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 1022 return (error); 1023} 1024 1025SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, 1026 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, 1027 tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); 1028#endif 1029 1030 1031void 1032tcp_ctlinput(cmd, sa, vip) 1033 int cmd; 1034 struct sockaddr *sa; 1035 void *vip; 1036{ 1037 struct ip *ip = vip; 1038 struct tcphdr *th; 1039 struct in_addr faddr; 1040 struct inpcb *inp; 1041 struct tcpcb *tp; 1042 struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; 1043 tcp_seq icmp_seq; 1044 int s; 1045 1046 faddr = ((struct sockaddr_in *)sa)->sin_addr; 1047 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1048 return; 1049 1050 if (cmd == PRC_QUENCH) 1051 notify = tcp_quench; 1052 else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || 1053 cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) 1054 notify = tcp_drop_syn_sent; 1055 
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	/*
	 * Redirects don't need to be handled up here.
	 */
	else if (PRC_IS_REDIRECT(cmd))
		return;
	/*
	 * Hostdead is ugly because it goes linearly through all PCBs.
	 * XXX: We never get this from ICMP, otherwise it makes an
	 * excellent DoS attack on machines with many connections.
	 */
	else if (cmd == PRC_HOSTDEAD)
		ip = 0;		/* NULL ip selects the notify-all path below */
	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
		return;
	if (ip) {
		s = splnet();
		/* TCP header follows the variable-length IP header. */
		th = (struct tcphdr *)((caddr_t)ip
				       + (ip->ip_hl << 2));
		INP_INFO_WLOCK(&tcbinfo);
		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
		    ip->ip_src, th->th_sport, 0, NULL);
		if (inp != NULL) {
			INP_LOCK(inp);
			if (inp->inp_socket != NULL) {
				/*
				 * Only act if the sequence number quoted in
				 * the ICMP payload falls within the window
				 * of unacknowledged data; anything else is
				 * stale or forged.
				 */
				icmp_seq = htonl(th->th_seq);
				tp = intotcpcb(inp);
				if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
					SEQ_LT(icmp_seq, tp->snd_max))
					inp = (*notify)(inp, inetctlerrmap[cmd]);
			}
			if (inp)	/* notify may have freed the pcb */
				INP_UNLOCK(inp);
		} else {
			/*
			 * No established PCB; the error may concern an
			 * embryonic connection held in the syncache.
			 */
			struct in_conninfo inc;

			inc.inc_fport = th->th_dport;
			inc.inc_lport = th->th_sport;
			inc.inc_faddr = faddr;
			inc.inc_laddr = ip->ip_src;
#ifdef INET6
			inc.inc_isipv6 = 0;
#endif
			syncache_unreach(&inc, th);
		}
		INP_INFO_WUNLOCK(&tcbinfo);
		splx(s);
	} else
		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
}

#ifdef INET6
/*
 * IPv6 counterpart of tcp_ctlinput(): dispatch ICMPv6-sourced control
 * messages for TCP.  "d", when non-NULL, carries an ip6ctlparam
 * describing the offending packet.
 */
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	struct tcphdr th;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	struct ip6ctlparam *ip6cp = NULL;
	const struct sockaddr_in6 *sa6_src = NULL;
	int off;
	/* Only the two port fields are guaranteed to be present. */
	struct tcp_portonly {
		u_int16_t th_sport;
		u_int16_t th_dport;
	} *thp;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
		 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
	} else {
		m = NULL;
		ip6 = NULL;
		off = 0;	/* fool gcc */
		sa6_src = &sa6_any;
	}

	if (ip6) {
		struct in_conninfo inc;
		/*
		 * XXX: We assume that when IPV6 is non NULL,
		 * M and OFF are valid.
		 */

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(*thp))
			return;

		bzero(&th, sizeof(th));
		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

		in6_pcbnotify(&tcb, sa, th.th_dport,
		    (struct sockaddr *)ip6cp->ip6c_src,
		    th.th_sport, cmd, notify);

		/* Also kick the syncache for embryonic connections. */
		inc.inc_fport = th.th_dport;
		inc.inc_lport = th.th_sport;
		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
		inc.inc_isipv6 = 1;
		syncache_unreach(&inc, &th);
	} else
		in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
			      0, cmd, notify);
}
#endif /* INET6 */


/*
 * Following is where TCP initial sequence number generation occurs.
 *
 * There are two places where we must use initial sequence numbers:
 * 1. In SYN-ACK packets.
 * 2. In SYN packets.
 *
 * All ISNs for SYN-ACK packets are generated by the syncache.  See
 * tcp_syncache.c for details.
 *
 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
 * depends on this property.  In addition, these ISNs should be
 * unguessable so as to prevent connection hijacking.
 To satisfy
 * the requirements of this situation, the algorithm outlined in
 * RFC 1948 is used to generate sequence numbers.
 *
 * Implementation details:
 *
 * Time is based off the system timer, and is corrected so that it
 * increases by one megabyte per second.  This allows for proper
 * recycling on high speed LANs while still leaving over an hour
 * before rollover.
 *
 * net.inet.tcp.isn_reseed_interval controls the number of seconds
 * between seeding of isn_secret.  This is normally set to zero,
 * as reseeding should not be necessary.
 *
 */

#define ISN_BYTES_PER_SECOND 1048576

/* RFC 1948 secret and MD5 state; isn_last_reseed is in units of ticks. */
u_char isn_secret[32];
int isn_last_reseed;
MD5_CTX isn_ctx;

/*
 * Generate the initial sequence number for a new connection owned by
 * "tp": MD5(ports, addresses, secret) plus a time component that
 * advances at ISN_BYTES_PER_SECOND, per RFC 1948.
 */
tcp_seq
tcp_new_isn(tp)
	struct tcpcb *tp;
{
	u_int32_t md5_buffer[4];
	tcp_seq new_isn;

	/* Seed if this is the first use, reseed if requested. */
	if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
	     (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
		< (u_int)ticks))) {
		read_random(&isn_secret, sizeof(isn_secret));
		isn_last_reseed = ticks;
	}

	/* Compute the md5 hash and return the ISN. */
	MD5Init(&isn_ctx);
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
#ifdef INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
			  sizeof(struct in6_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
			  sizeof(struct in6_addr));
	} else
#endif
	{
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
			  sizeof(struct in_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
			  sizeof(struct in_addr));
	}
	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
	MD5Final((u_char *) &md5_buffer, &isn_ctx);
	new_isn = (tcp_seq) md5_buffer[0];
	/* Add the monotonically advancing time component. */
	new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
	return new_isn;
}

/*
 * When a source quench is received, close congestion window
 * to one segment.  We will gradually open it again as we proceed.
 */
struct inpcb *
tcp_quench(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp)
		tp->snd_cwnd = tp->t_maxseg;
	return (inp);
}

/*
 * When a specific ICMP unreachable message is received and the
 * connection state is SYN-SENT, drop the connection.  This behavior
 * is controlled by the icmp_may_rst sysctl.
 */
struct inpcb *
tcp_drop_syn_sent(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp && tp->t_state == TCPS_SYN_SENT) {
		tcp_drop(tp, errno);	/* connection is gone */
		return (struct inpcb *)0;
	}
	return inp;
}

/*
 * When `need fragmentation' ICMP is received, update our idea of the MSS
 * based on the new value in the route.  Also nudge TCP to send something,
 * since we know the packet we just sent was dropped.
1297 * This duplicates some code in the tcp_mss() function in tcp_input.c. 1298 */ 1299struct inpcb * 1300tcp_mtudisc(inp, errno) 1301 struct inpcb *inp; 1302 int errno; 1303{ 1304 struct tcpcb *tp = intotcpcb(inp); 1305 struct rmxp_tao tao; 1306 struct socket *so = inp->inp_socket; 1307 u_int maxmtu; 1308 u_int romtu; 1309 int mss; 1310#ifdef INET6 1311 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 1312#endif /* INET6 */ 1313 bzero(&tao, sizeof(tao)); 1314 1315 if (tp) { 1316 maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */ 1317 romtu = 1318#ifdef INET6 1319 isipv6 ? tcp_maxmtu6(&inp->inp_inc) : 1320#endif /* INET6 */ 1321 tcp_maxmtu(&inp->inp_inc); 1322 if (!maxmtu) 1323 maxmtu = romtu; 1324 else 1325 maxmtu = min(maxmtu, romtu); 1326 if (!maxmtu) { 1327 tp->t_maxopd = tp->t_maxseg = 1328#ifdef INET6 1329 isipv6 ? tcp_v6mssdflt : 1330#endif /* INET6 */ 1331 tcp_mssdflt; 1332 return inp; 1333 } 1334 mss = maxmtu - 1335#ifdef INET6 1336 (isipv6 ? 1337 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : 1338#endif /* INET6 */ 1339 sizeof(struct tcpiphdr) 1340#ifdef INET6 1341 ) 1342#endif /* INET6 */ 1343 ; 1344 1345 if (tcp_do_rfc1644) { 1346 tcp_hc_gettao(&inp->inp_inc, &tao); 1347 if (tao.tao_mssopt) 1348 mss = min(mss, tao.tao_mssopt); 1349 } 1350 /* 1351 * XXX - The above conditional probably violates the TCP 1352 * spec. The problem is that, since we don't know the 1353 * other end's MSS, we are supposed to use a conservative 1354 * default. But, if we do that, then MTU discovery will 1355 * never actually take place, because the conservative 1356 * default is much less than the MTUs typically seen 1357 * on the Internet today. For the moment, we'll sweep 1358 * this under the carpet. 1359 * 1360 * The conservative default might not actually be a problem 1361 * if the only case this occurs is when sending an initial 1362 * SYN with options and data to a host we've never talked 1363 * to before. 
Then, they will reply with an MSS value which 1364 * will get recorded and the new parameters should get 1365 * recomputed. For Further Study. 1366 */ 1367 if (tp->t_maxopd <= mss) 1368 return inp; 1369 tp->t_maxopd = mss; 1370 1371 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 1372 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 1373 mss -= TCPOLEN_TSTAMP_APPA; 1374 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && 1375 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) 1376 mss -= TCPOLEN_CC_APPA; 1377#if (MCLBYTES & (MCLBYTES - 1)) == 0 1378 if (mss > MCLBYTES) 1379 mss &= ~(MCLBYTES-1); 1380#else 1381 if (mss > MCLBYTES) 1382 mss = mss / MCLBYTES * MCLBYTES; 1383#endif 1384 if (so->so_snd.sb_hiwat < mss) 1385 mss = so->so_snd.sb_hiwat; 1386 1387 tp->t_maxseg = mss; 1388 1389 tcpstat.tcps_mturesent++; 1390 tp->t_rtttime = 0; 1391 tp->snd_nxt = tp->snd_una; 1392 tcp_output(tp); 1393 } 1394 return inp; 1395} 1396 1397/* 1398 * Look-up the routing entry to the peer of this inpcb. If no route 1399 * is found and it cannot be allocated, then return NULL. This routine 1400 * is called by TCP routines that access the rmx structure and by tcp_mss 1401 * to get the interface MTU. 
 */
u_long
tcp_maxmtu(inc)
	struct in_conninfo *inc;
{
	struct route sro;
	struct sockaddr_in *dst;
	struct ifnet *ifp;
	u_long maxmtu = 0;

	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));

	bzero(&sro, sizeof(sro));
	if (inc->inc_faddr.s_addr != INADDR_ANY) {
		/* Build a destination sockaddr and look up the route. */
		dst = (struct sockaddr_in *)&sro.ro_dst;
		dst->sin_family = AF_INET;
		dst->sin_len = sizeof(*dst);
		dst->sin_addr = inc->inc_faddr;
		rtalloc_ign(&sro, RTF_CLONING);
	}
	if (sro.ro_rt != NULL) {
		ifp = sro.ro_rt->rt_ifp;
		/* A zero route MTU means "use the interface MTU". */
		if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
			maxmtu = ifp->if_mtu;
		else
			maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
		RTFREE(sro.ro_rt);	/* drop the rtalloc_ign() reference */
	}
	return (maxmtu);
}

#ifdef INET6
/*
 * IPv6 version of tcp_maxmtu(): return the usable MTU towards the
 * peer of "inc", or 0 if no route can be found.
 */
u_long
tcp_maxmtu6(inc)
	struct in_conninfo *inc;
{
	struct route_in6 sro6;
	struct ifnet *ifp;	/* XXX set but unused; IN6_LINKMTU() reads rt_ifp directly */
	u_long maxmtu = 0;

	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));

	bzero(&sro6, sizeof(sro6));
	if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
		sro6.ro_dst.sin6_family = AF_INET6;
		sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
		sro6.ro_dst.sin6_addr = inc->inc6_faddr;
		rtalloc_ign((struct route *)&sro6, RTF_CLONING);
	}
	if (sro6.ro_rt != NULL) {
		ifp = sro6.ro_rt->rt_ifp;
		/* A zero route MTU means "use the link MTU". */
		if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
			maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
		else
			maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
				     IN6_LINKMTU(sro6.ro_rt->rt_ifp));
		RTFREE(sro6.ro_rt);	/* drop the rtalloc_ign() reference */
	}

	return (maxmtu);
}
#endif /* INET6 */

#ifdef IPSEC
/* compute ESP/AH header size for TCP, including outer IP header.
 */
size_t
ipsec_hdrsiz_tcp(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp;
	struct mbuf *m;
	size_t hdrsiz;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct tcphdr *th;

	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
		return 0;
	/* Build a throw-away header-only mbuf to query the IPsec layer. */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (!m)
		return 0;

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		m->m_pkthdr.len = m->m_len =
			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		tcpip_fillheaders(inp, ip6, th);
		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	} else
#endif /* INET6 */
	{
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
		tcpip_fillheaders(inp, ip, th);
		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	}

	m_free(m);
	return hdrsiz;
}
#endif /*IPSEC*/

/*
 * Move a TCP connection into TIME_WAIT state.
 * tcbinfo is unlocked.
 * inp is locked, and is unlocked before returning.
 */
void
tcp_twstart(tp)
	struct tcpcb *tp;
{
	struct tcptw *tw;
	struct inpcb *inp;
	int tw_time, acknow;
	struct socket *so;

	tw = uma_zalloc(tcptw_zone, M_NOWAIT);
	if (tw == NULL) {
		/* Zone exhausted: recycle the oldest TIME_WAIT entry. */
		tw = tcp_timer_2msl_tw(1);
		if (tw == NULL) {
			tcp_close(tp);
			return;
		}
	}
	inp = tp->t_inpcb;
	tw->tw_inpcb = inp;

	/*
	 * Recover last window size sent.
	 */
	tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;

	/*
	 * Set t_recent if timestamps are used on the connection.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
		tw->t_recent = tp->ts_recent;
	else
		tw->t_recent = 0;

	/* Snapshot the state needed to answer segments in TIME_WAIT. */
	tw->snd_nxt = tp->snd_nxt;
	tw->rcv_nxt = tp->rcv_nxt;
	tw->iss = tp->iss;
	tw->irs = tp->irs;
	tw->cc_recv = tp->cc_recv;
	tw->cc_send = tp->cc_send;
	tw->t_starttime = tp->t_starttime;
	tw->tw_time = 0;

/* XXX
 * If this code will
 * be used for fin-wait-2 state also, then we may need
 * a ts_recent from the last segment.
 */
	/* Shorten TIME_WAIT [RFC-1644, p.28] */
	if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) {
		tw_time = tp->t_rxtcur * TCPTV_TWTRUNC;
		/* For T/TCP client, force ACK now. */
		acknow = 1;
	} else {
		tw_time = 2 * tcp_msl;
		acknow = tp->t_flags & TF_ACKNOW;
	}
	/* The tcpcb is gone from here on; only tw and inp survive. */
	tcp_discardcb(tp);
	so = inp->inp_socket;
	so->so_pcb = NULL;
	tw->tw_cred = crhold(so->so_cred);
	tw->tw_so_options = so->so_options;
	if (acknow)
		tcp_twrespond(tw, so, NULL, TH_ACK);
	sotryfree(so);
	inp->inp_socket = NULL;
	inp->inp_ppcb = (caddr_t)tw;
	inp->inp_vflag |= INP_TIMEWAIT;
	tcp_timer_2msl_reset(tw, tw_time);
	INP_UNLOCK(inp);
}

/*
 * The approximate rate of ISN increase of Microsoft TCP stacks;
 * the actual rate is slightly higher due to the addition of
 * random positive increments.
 *
 * Most other new OSes use semi-randomized ISN values, so we
 * do not need to worry about them.
 */
#define MS_ISN_BYTES_PER_SECOND 250000

/*
 * Determine if the ISN we will generate has advanced beyond the last
 * sequence number used by the previous connection.  If so, indicate
 * that it is safe to recycle this tw socket by returning 1.
 */
int
tcp_twrecycleable(struct tcptw *tw)
{
	tcp_seq new_iss = tw->iss;
	tcp_seq new_irs = tw->irs;

	/*
	 * Project our next ISN (advancing at ISN_BYTES_PER_SECOND per
	 * RFC 1948 above) and the peer's expected ISN (Microsoft stacks
	 * advance at a known minimum rate) forward by the time spent in
	 * TIME_WAIT.
	 */
	new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz);
	new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz);

	if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt))
		return 1;
	else
		return 0;
}

/*
 * Tear down a TIME_WAIT connection: stop its 2MSL timer and detach
 * the inpcb.  If "reuse" is set, the tcptw is returned to the caller
 * for recycling instead of being freed.
 */
struct tcptw *
tcp_twclose(struct tcptw *tw, int reuse)
{
	struct inpcb *inp;

	inp = tw->tw_inpcb;
	tw->tw_inpcb = NULL;
	tcp_timer_2msl_stop(tw);
	inp->inp_ppcb = NULL;
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO)
		in6_pcbdetach(inp);
	else
#endif
		in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	if (reuse)
		return (tw);
	uma_zfree(tcptw_zone, tw);
	return (NULL);
}

/*
 * Send a control segment (ACK or RST) on behalf of a TIME_WAIT
 * connection.  One of so and msrc must be non-NULL for use by the MAC
 * Framework to construct a label for any resulting packet.
 */
int
tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
    int flags)
{
	struct inpcb *inp = tw->tw_inpcb;
	struct tcphdr *th;
	struct mbuf *m;
	struct ip *ip = NULL;
	u_int8_t *optp;
	u_int hdrlen, optlen;
	int error;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6 = inp->inp_inc.inc_isipv6;
#endif

	KASSERT(so != NULL || msrc != NULL,
	    ("tcp_twrespond: so and msrc NULL"));

	m = m_gethdr(M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (ENOBUFS);
	m->m_data += max_linkhdr;	/* leave room for the link header */

#ifdef MAC
	mac_create_mbuf_from_inpcb(inp, m);
#endif

#ifdef INET6
	if (isipv6) {
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		tcpip_fillheaders(inp, ip6, th);
	} else
#endif
	{
		hdrlen = sizeof(struct tcpiphdr);
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		tcpip_fillheaders(inp, ip, th);
	}
	/* Options are appended immediately after the TCP header. */
	optp = (u_int8_t *)(th + 1);

	/*
	 * Send a timestamp and echo-reply if both our side and our peer
	 * have sent timestamps in our SYN's and this is not a RST.
	 */
	if (tw->t_recent && flags == TH_ACK) {
		u_int32_t *lp = (u_int32_t *)optp;

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tw->t_recent);
		optp += TCPOLEN_TSTAMP_APPA;
	}

	/*
	 * Send `CC-family' options if needed, and it's not a RST.
	 */
	if (tw->cc_recv != 0 && flags == TH_ACK) {
		u_int32_t *lp = (u_int32_t *)optp;

		*lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
		*lp   = htonl(tw->cc_send);
		optp += TCPOLEN_CC_APPA;
	}
	optlen = optp - (u_int8_t *)(th + 1);

	m->m_len = hdrlen + optlen;
	m->m_pkthdr.len = m->m_len;

	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));

	/* Fill in the TCP header from the TIME_WAIT snapshot. */
	th->th_seq = htonl(tw->snd_nxt);
	th->th_ack = htonl(tw->rcv_nxt);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = flags;
	th->th_win = htons(tw->last_win);

#ifdef INET6
	if (isipv6) {
		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
		    sizeof(struct tcphdr) + optlen);
		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
		error = ip6_output(m, inp->in6p_outputopts, NULL,
		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
	} else
#endif
	{
		/* Pseudo-header sum only; CSUM_TCP defers the rest. */
		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		ip->ip_len = m->m_pkthdr.len;
		if (path_mtu_discovery)
			ip->ip_off |= IP_DF;
		error = ip_output(m, inp->inp_options, NULL,
		    (tw->tw_so_options & SO_DONTROUTE), NULL, inp);
	}
	if (flags & TH_ACK)
		tcpstat.tcps_sndacks++;
	else
		tcpstat.tcps_sndctrl++;
	tcpstat.tcps_sndtotal++;
	return (error);
}

/*
 * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
 *
 * This code attempts to calculate the bandwidth-delay product as a
 * means of determining the optimal window size to maximize bandwidth,
 * minimize RTT, and avoid the over-allocation of buffers on interfaces and
 * routers.  This code also does a fairly good job keeping RTTs in check
 * across slow links like modems.  We implement an algorithm which is very
 * similar (but not meant to be) TCP/Vegas.  The code operates on the
 * transmitter side of a TCP connection and so only effects the transmit
 * side of the connection.
 *
 * BACKGROUND: TCP makes no provision for the management of buffer space
 * at the end points or at the intermediate routers and switches.  A TCP
 * stream, whether using NewReno or not, will eventually buffer as
 * many packets as it is able and the only reason this typically works is
 * due to the fairly small default buffers made available for a connection
 * (typically 16K or 32K).  As machines use larger windows and/or window
 * scaling it is now fairly easy for even a single TCP connection to blow-out
 * all available buffer space not only on the local interface, but on
 * intermediate routers and switches as well.  NewReno makes a misguided
 * attempt to 'solve' this problem by waiting for an actual failure to occur,
 * then backing off, then steadily increasing the window again until another
 * failure occurs, ad-infinitum.  This results in terrible oscillation that
 * is only made worse as network loads increase and the idea of intentionally
 * blowing out network buffers is, frankly, a terrible way to manage network
 * resources.
 *
 * It is far better to limit the transmit window prior to the failure
 * condition being achieved.  There are two general ways to do this: First
 * you can 'scan' through different transmit window sizes and locate the
 * point where the RTT stops increasing, indicating that you have filled the
 * pipe, then scan backwards until you note that RTT stops decreasing, then
 * repeat ad-infinitum.  This method works in principle but has severe
 * implementation issues due to RTT variances, timer granularity, and
 * instability in the algorithm which can lead to many false positives and
 * create oscillations as well as interact badly with other TCP streams
 * implementing the same algorithm.
 *
 * The second method is to limit the window to the bandwidth delay product
 * of the link.  This is the method we implement.  RTT variances and our
 * own manipulation of the congestion window, bwnd, can potentially
 * destabilize the algorithm.  For this reason we have to stabilize the
 * elements used to calculate the window.  We do this by using the minimum
 * observed RTT, the long term average of the observed bandwidth, and
 * by adding two segments worth of slop.  It isn't perfect but it is able
 * to react to changing conditions and gives us a very stable basis on
 * which to extend the algorithm.
 */
void
tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
{
	u_long bw;
	u_long bwnd;
	int save_ticks;

	/*
	 * If inflight_enable is disabled in the middle of a tcp connection,
	 * make sure snd_bwnd is effectively disabled.
	 */
	if (tcp_inflight_enable == 0) {
		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
		tp->snd_bandwidth = 0;
		return;
	}

	/*
	 * Figure out the bandwidth.  Due to the tick granularity this
	 * is a very rough number and it MUST be averaged over a fairly
	 * long period of time.  XXX we need to take into account a link
	 * that is not using all available bandwidth, but for now our
	 * slop will ramp us up if this case occurs and the bandwidth later
	 * increases.
	 *
	 * Note: if ticks rollover 'bw' may wind up negative.  We must
	 * effectively reset t_bw_rtttime for this case.
	 */
	save_ticks = ticks;
	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
		return;		/* need at least one tick between samples */

	/* Instantaneous bandwidth: bytes acked over ticks elapsed. */
	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
	    (save_ticks - tp->t_bw_rtttime);
	tp->t_bw_rtttime = save_ticks;
	tp->t_bw_rtseq = ack_seq;
	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
		return;
	/* Exponential moving average: 15/16 old estimate + 1/16 sample. */
	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;

	tp->snd_bandwidth = bw;

	/*
	 * Calculate the semi-static bandwidth delay product, plus two maximal
	 * segments.  The additional slop puts us squarely in the sweet
	 * spot and also handles the bandwidth run-up case and stabilization.
	 * Without the slop we could be locking ourselves into a lower
	 * bandwidth.
	 *
	 * Situations Handled:
	 *	(1) Prevents over-queueing of packets on LANs, especially on
	 *	    high speed LANs, allowing larger TCP buffers to be
	 *	    specified, and also does a good job preventing
	 *	    over-queueing of packets over choke points like modems
	 *	    (at least for the transmit side).
	 *
	 *	(2) Is able to handle changing network loads (bandwidth
	 *	    drops so bwnd drops, bandwidth increases so bwnd
	 *	    increases).
	 *
	 *	(3) Theoretically should stabilize in the face of multiple
	 *	    connections implementing the same algorithm (this may need
	 *	    a little work).
	 *
	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
	 *	    be adjusted with a sysctl but typically only needs to be
	 *	    on very slow connections.  A value no smaller then 5
	 *	    should be used, but only reduce this default if you have
	 *	    no other choice.
	 */
#define	USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
#undef USERTT

	if (tcp_inflight_debug > 0) {
		static int ltime;
		/* Rate-limit the diagnostic printf via the sysctl value. */
		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
			ltime = ticks;
			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
			    tp,
			    bw,
			    tp->t_rttbest,
			    tp->t_srtt,
			    bwnd
			);
		}
	}
	/* Clamp bwnd to the sysctl limits and to at least two segments. */
	if ((long)bwnd < tcp_inflight_min)
		bwnd = tcp_inflight_min;
	if (bwnd > tcp_inflight_max)
		bwnd = tcp_inflight_max;
	if ((long)bwnd < tp->t_maxseg * 2)
		bwnd = tp->t_maxseg * 2;
	tp->snd_bwnd = bwnd;
}