tcp_timewait.c revision 126351
11541Srgrimes/* 211150Swollman * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 31541Srgrimes * The Regents of the University of California. All rights reserved. 41541Srgrimes * 51541Srgrimes * Redistribution and use in source and binary forms, with or without 61541Srgrimes * modification, are permitted provided that the following conditions 71541Srgrimes * are met: 81541Srgrimes * 1. Redistributions of source code must retain the above copyright 91541Srgrimes * notice, this list of conditions and the following disclaimer. 101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer in the 121541Srgrimes * documentation and/or other materials provided with the distribution. 131541Srgrimes * 3. All advertising materials mentioning features or use of this software 141541Srgrimes * must display the following acknowledgement: 151541Srgrimes * This product includes software developed by the University of 161541Srgrimes * California, Berkeley and its contributors. 171541Srgrimes * 4. Neither the name of the University nor the names of its contributors 181541Srgrimes * may be used to endorse or promote products derived from this software 191541Srgrimes * without specific prior written permission. 201541Srgrimes * 211541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 221541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 231541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 241541Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 251541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 261541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 271541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 281541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 291541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 301541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 311541Srgrimes * SUCH DAMAGE. 321541Srgrimes * 3311150Swollman * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 3450477Speter * $FreeBSD: head/sys/netinet/tcp_timewait.c 126351 2004-02-28 15:12:20Z rwatson $ 351541Srgrimes */ 361541Srgrimes 3732752Seivind#include "opt_compat.h" 38125680Sbms#include "opt_inet.h" 3954263Sshin#include "opt_inet6.h" 4056041Sshin#include "opt_ipsec.h" 41101106Srwatson#include "opt_mac.h" 4229514Sjoerg#include "opt_tcpdebug.h" 4329514Sjoerg 441541Srgrimes#include <sys/param.h> 451541Srgrimes#include <sys/systm.h> 4650673Sjlemon#include <sys/callout.h> 4712172Sphk#include <sys/kernel.h> 4812172Sphk#include <sys/sysctl.h> 49101106Srwatson#include <sys/mac.h> 501541Srgrimes#include <sys/malloc.h> 511541Srgrimes#include <sys/mbuf.h> 5255679Sshin#ifdef INET6 5355679Sshin#include <sys/domain.h> 5455679Sshin#endif 5548758Sgreen#include <sys/proc.h> 561541Srgrimes#include <sys/socket.h> 571541Srgrimes#include <sys/socketvar.h> 581541Srgrimes#include <sys/protosw.h> 5975619Skris#include <sys/random.h> 6034923Sbde 6192760Sjeff#include <vm/uma.h> 621541Srgrimes 631541Srgrimes#include <net/route.h> 641541Srgrimes#include <net/if.h> 651541Srgrimes 661541Srgrimes#include <netinet/in.h> 671541Srgrimes#include <netinet/in_systm.h> 681541Srgrimes#include <netinet/ip.h> 6955679Sshin#ifdef INET6 7055679Sshin#include <netinet/ip6.h> 7155679Sshin#endif 721541Srgrimes#include <netinet/in_pcb.h> 7355679Sshin#ifdef INET6 7455679Sshin#include <netinet6/in6_pcb.h> 7555679Sshin#endif 767090Sbde#include <netinet/in_var.h> 771541Srgrimes#include <netinet/ip_var.h> 7855679Sshin#ifdef INET6 7955679Sshin#include <netinet6/ip6_var.h> 80122922Sandre#include <netinet6/nd6.h> 8155679Sshin#endif 821541Srgrimes#include <netinet/tcp.h> 831541Srgrimes#include <netinet/tcp_fsm.h> 841541Srgrimes#include <netinet/tcp_seq.h> 851541Srgrimes#include <netinet/tcp_timer.h> 861541Srgrimes#include <netinet/tcp_var.h> 8755679Sshin#ifdef INET6 8855679Sshin#include <netinet6/tcp6_var.h> 8955679Sshin#endif 901541Srgrimes#include <netinet/tcpip.h> 916283Swollman#ifdef TCPDEBUG 926283Swollman#include <netinet/tcp_debug.h> 936283Swollman#endif 9455679Sshin#include <netinet6/ip6protosw.h> 951541Srgrimes 9655679Sshin#ifdef IPSEC 9755679Sshin#include <netinet6/ipsec.h> 9862587Sitojun#ifdef INET6 9962587Sitojun#include <netinet6/ipsec6.h> 10062587Sitojun#endif 10155679Sshin#endif /*IPSEC*/ 10255679Sshin 103105199Ssam#ifdef FAST_IPSEC 104105199Ssam#include <netipsec/ipsec.h> 105125680Sbms#include <netipsec/xform.h> 106105199Ssam#ifdef INET6 107105199Ssam#include <netipsec/ipsec6.h> 108105199Ssam#endif 109125680Sbms#include <netipsec/key.h> 110105199Ssam#define IPSEC 111105199Ssam#endif /*FAST_IPSEC*/ 112105199Ssam 11358698Sjlemon#include <machine/in_cksum.h> 11482122Ssilby#include <sys/md5.h> 11558698Sjlemon 1161541Srgrimesint tcp_mssdflt = TCP_MSS; 11746381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, 11846381Sbillf &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); 11912296Sphk 12052904Sshin#ifdef INET6 12152904Sshinint tcp_v6mssdflt = TCP6_MSS; 12252904SshinSYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, 12355679Sshin CTLFLAG_RW, &tcp_v6mssdflt , 0, 12455679Sshin "Default TCP Maximum Segment Size for IPv6"); 12552904Sshin#endif 12652904Sshin 127124258Sandre/* 128124258Sandre * Minimum MSS we accept and use. This prevents DoS attacks where 129124258Sandre * we are forced to a ridiculous low MSS like 20 and send hundreds 130124258Sandre * of packets instead of one. The effect scales with the available 131124258Sandre * bandwidth and quickly saturates the CPU and network interface 132124258Sandre * with packet generation and sending. Set to zero to disable MINMSS 133124258Sandre * checking. This setting prevents us from sending too small packets. 134124258Sandre */ 135124258Sandreint tcp_minmss = TCP_MINMSS; 136124258SandreSYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, 137124258Sandre &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); 138124258Sandre/* 139124258Sandre * Number of TCP segments per second we accept from remote host 140124258Sandre * before we start to calculate average segment size. If average 141124258Sandre * segment size drops below the minimum TCP MSS we assume a DoS 142124258Sandre * attack and reset+drop the connection. Care has to be taken not to 143124258Sandre * set this value too small to not kill interactive type connections 144124258Sandre * (telnet, SSH) which send many small packets. 145124258Sandre */ 146124258Sandreint tcp_minmssoverload = TCP_MINMSSOVERLOAD; 147124258SandreSYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW, 148124258Sandre &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to" 149124258Sandre "be under the MINMSS Size"); 150124258Sandre 15150673Sjlemon#if 0 15212296Sphkstatic int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; 15346381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, 15446381Sbillf &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); 15550673Sjlemon#endif 15612296Sphk 15786764Sjlemonint tcp_do_rfc1323 = 1; 15846381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, 15946381Sbillf &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); 16012296Sphk 16186764Sjlemonint tcp_do_rfc1644 = 0; 16246381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, 16346381Sbillf &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); 1641541Srgrimes 16550426Sjlemonstatic int tcp_tcbhashsize = 0; 166121307SsilbySYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, 16750426Sjlemon &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); 16850426Sjlemon 16955198Smsmithstatic int do_tcpdrain = 1; 17066376SbmilekicSYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, 17166376Sbmilekic "Enable tcp_drain routine for extra help when low on mbufs"); 17255198Smsmith 17346381SbillfSYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, 17446381Sbillf &tcbinfo.ipi_count, 0, "Number of active PCBs"); 17536079Swollman 17672959Sjlemonstatic int icmp_may_rst = 1; 17772959SjlemonSYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, 17872959Sjlemon "Certain ICMP unreachable messages may abort connections in SYN_SENT"); 17970103Sphk 18082122Ssilbystatic int tcp_isn_reseed_interval = 0; 18182122SsilbySYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, 18282122Ssilby &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); 18382122Ssilby 184102017Sdillon/* 185102017Sdillon * TCP bandwidth limiting sysctls. Note that the default lower bound of 186102017Sdillon * 1024 exists only for debugging. A good production default would be 187102017Sdillon * something like 6100. 188102017Sdillon */ 189124199Sandrestatic int tcp_inflight_enable = 1; 190102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW, 191102017Sdillon &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); 192102017Sdillon 193104825Sdillonstatic int tcp_inflight_debug = 0; 194102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW, 195102017Sdillon &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); 196102017Sdillon 197107881Sdillonstatic int tcp_inflight_min = 6144; 198102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW, 199102017Sdillon &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); 200102017Sdillon 201102017Sdillonstatic int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; 202102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW, 203102017Sdillon &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); 204107881Sdillonstatic int tcp_inflight_stab = 20; 205107881SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, 206107881Sdillon &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); 207102017Sdillon 20898211Shsustatic struct inpcb *tcp_notify(struct inpcb *, int); 209111145Sjlemonstatic void tcp_discardcb(struct tcpcb *); 21012296Sphk 2117684Sdg/* 21232821Sdg * Target size of TCP PCB hash tables. Must be a power of two. 21343562Smsmith * 21443562Smsmith * Note that this can be overridden by the kernel environment 21543562Smsmith * variable net.inet.tcp.tcbhashsize 2167684Sdg */ 2177684Sdg#ifndef TCBHASHSIZE 21832821Sdg#define TCBHASHSIZE 512 2197684Sdg#endif 2201541Srgrimes 2211541Srgrimes/* 222111145Sjlemon * XXX 223111145Sjlemon * Callouts should be moved into struct tcp directly. They are currently 224123608Sjhb * separate because the tcpcb structure is exported to userland for sysctl 225111145Sjlemon * parsing purposes, which do not know about callouts. 22634881Swollman */ 227111145Sjlemonstruct tcpcb_mem { 22834881Swollman struct tcpcb tcb; 229111145Sjlemon struct callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep; 230111145Sjlemon struct callout tcpcb_mem_2msl, tcpcb_mem_delack; 23134881Swollman}; 23234881Swollman 233111145Sjlemonstatic uma_zone_t tcpcb_zone; 234111145Sjlemonstatic uma_zone_t tcptw_zone; 235111145Sjlemon 23634881Swollman/* 2371541Srgrimes * Tcp initialization 2381541Srgrimes */ 2391541Srgrimesvoid 2401541Srgrimestcp_init() 2411541Srgrimes{ 24277843Speter int hashsize = TCBHASHSIZE; 24343562Smsmith 2446283Swollman tcp_ccgen = 1; 24550673Sjlemon 24650673Sjlemon tcp_delacktime = TCPTV_DELACK; 24750673Sjlemon tcp_keepinit = TCPTV_KEEP_INIT; 24850673Sjlemon tcp_keepidle = TCPTV_KEEP_IDLE; 24950673Sjlemon tcp_keepintvl = TCPTV_KEEPINTVL; 25050673Sjlemon tcp_maxpersistidle = TCPTV_KEEP_IDLE; 25150673Sjlemon tcp_msl = TCPTV_MSL; 252100335Sdillon tcp_rexmit_min = TCPTV_MIN; 253100335Sdillon tcp_rexmit_slop = TCPTV_CPU_VAR; 25450673Sjlemon 25598102Shsu INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); 2567684Sdg LIST_INIT(&tcb); 2577684Sdg tcbinfo.listhead = &tcb; 25877900Speter TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); 25943576Smsmith if (!powerof2(hashsize)) { 26043562Smsmith printf("WARNING: TCB hash size not a power of 2\n"); 26143562Smsmith hashsize = 512; /* safe default */ 26243562Smsmith } 26350426Sjlemon tcp_tcbhashsize = hashsize; 26443562Smsmith tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); 26543562Smsmith tcbinfo.porthashbase = hashinit(hashsize, M_PCB, 26634923Sbde &tcbinfo.porthashmask); 267111145Sjlemon tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), 26892760Sjeff NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 26992760Sjeff uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); 27055679Sshin#ifdef INET6 27155679Sshin#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) 27255679Sshin#else /* INET6 */ 27355679Sshin#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) 27455679Sshin#endif /* INET6 */ 27555679Sshin if (max_protohdr < TCP_MINPROTOHDR) 27655679Sshin max_protohdr = TCP_MINPROTOHDR; 27755679Sshin if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) 2781541Srgrimes panic("tcp_init"); 27955679Sshin#undef TCP_MINPROTOHDR 280111145Sjlemon /* 281111145Sjlemon * These have to be type stable for the benefit of the timers. 282111145Sjlemon */ 283111145Sjlemon tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), 284111145Sjlemon NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 285111145Sjlemon uma_zone_set_max(tcpcb_zone, maxsockets); 286112009Sjlemon tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), 287111145Sjlemon NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 288121453Ssilby uma_zone_set_max(tcptw_zone, maxsockets / 5); 289112009Sjlemon tcp_timer_init(); 29086764Sjlemon syncache_init(); 291122922Sandre tcp_hc_init(); 292126193Sandre tcp_reass_init(); 2931541Srgrimes} 2941541Srgrimes 2951541Srgrimes/* 29678642Ssilby * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. 29778642Ssilby * tcp_template used to store this data in mbufs, but we now recopy it out 29878642Ssilby * of the tcpcb each time to conserve mbufs. 2991541Srgrimes */ 30078642Ssilbyvoid 301111144Sjlemontcpip_fillheaders(inp, ip_ptr, tcp_ptr) 302111144Sjlemon struct inpcb *inp; 30378642Ssilby void *ip_ptr; 30478642Ssilby void *tcp_ptr; 3051541Srgrimes{ 306111144Sjlemon struct tcphdr *th = (struct tcphdr *)tcp_ptr; 3071541Srgrimes 30855679Sshin#ifdef INET6 30955679Sshin if ((inp->inp_vflag & INP_IPV6) != 0) { 31078642Ssilby struct ip6_hdr *ip6; 31155679Sshin 31278642Ssilby ip6 = (struct ip6_hdr *)ip_ptr; 31355679Sshin ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | 31455679Sshin (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); 31555679Sshin ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | 31655679Sshin (IPV6_VERSION & IPV6_VERSION_MASK); 31755679Sshin ip6->ip6_nxt = IPPROTO_TCP; 31855679Sshin ip6->ip6_plen = sizeof(struct tcphdr); 31955679Sshin ip6->ip6_src = inp->in6p_laddr; 32055679Sshin ip6->ip6_dst = inp->in6p_faddr; 32155679Sshin } else 32255679Sshin#endif 32378642Ssilby { 324111144Sjlemon struct ip *ip; 32555679Sshin 326111144Sjlemon ip = (struct ip *)ip_ptr; 327111144Sjlemon ip->ip_v = IPVERSION; 328111144Sjlemon ip->ip_hl = 5; 329111144Sjlemon ip->ip_tos = inp->inp_ip_tos; 330111144Sjlemon ip->ip_len = 0; 331111144Sjlemon ip->ip_id = 0; 332111144Sjlemon ip->ip_off = 0; 333111144Sjlemon ip->ip_ttl = inp->inp_ip_ttl; 334111144Sjlemon ip->ip_sum = 0; 335111144Sjlemon ip->ip_p = IPPROTO_TCP; 336111144Sjlemon ip->ip_src = inp->inp_laddr; 337111144Sjlemon ip->ip_dst = inp->inp_faddr; 33878642Ssilby } 339111144Sjlemon th->th_sport = inp->inp_lport; 340111144Sjlemon th->th_dport = inp->inp_fport; 341111144Sjlemon th->th_seq = 0; 342111144Sjlemon th->th_ack = 0; 343111144Sjlemon th->th_x2 = 0; 344111144Sjlemon th->th_off = 5; 345111144Sjlemon th->th_flags = 0; 346111144Sjlemon th->th_win = 0; 347111144Sjlemon th->th_urp = 0; 348111144Sjlemon th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ 34978642Ssilby} 35078642Ssilby 35178642Ssilby/* 35278642Ssilby * Create template to be used to send tcp packets on a connection. 35378642Ssilby * Allocates an mbuf and fills in a skeletal tcp/ip header. The only 35478642Ssilby * use for this function is in keepalives, which use tcp_respond. 35578642Ssilby */ 35678642Ssilbystruct tcptemp * 357111144Sjlemontcpip_maketemplate(inp) 358111144Sjlemon struct inpcb *inp; 35978642Ssilby{ 36078642Ssilby struct mbuf *m; 36178642Ssilby struct tcptemp *n; 36278642Ssilby 363111119Simp m = m_get(M_DONTWAIT, MT_HEADER); 36478642Ssilby if (m == NULL) 36578642Ssilby return (0); 36678642Ssilby m->m_len = sizeof(struct tcptemp); 36778642Ssilby n = mtod(m, struct tcptemp *); 36878642Ssilby 369111144Sjlemon tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t); 3701541Srgrimes return (n); 3711541Srgrimes} 3721541Srgrimes 3731541Srgrimes/* 3741541Srgrimes * Send a single message to the TCP at address specified by 3751541Srgrimes * the given TCP/IP header. If m == 0, then we make a copy 3761541Srgrimes * of the tcpiphdr at ti and send directly to the addressed host. 3771541Srgrimes * This is used to force keep alive messages out using the TCP 37878642Ssilby * template for a connection. If flags are given then we send 37978642Ssilby * a message back to the TCP which originated the * segment ti, 38078642Ssilby * and discard the mbuf containing it and any other attached mbufs. 3811541Srgrimes * 3821541Srgrimes * In any case the ack and sequence number of the transmitted 3831541Srgrimes * segment are as specified by the parameters. 38431848Sjulian * 38531848Sjulian * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 3861541Srgrimes */ 3871541Srgrimesvoid 38855679Sshintcp_respond(tp, ipgen, th, m, ack, seq, flags) 3891541Srgrimes struct tcpcb *tp; 39055679Sshin void *ipgen; 39155679Sshin register struct tcphdr *th; 3921541Srgrimes register struct mbuf *m; 3931541Srgrimes tcp_seq ack, seq; 3941541Srgrimes int flags; 3951541Srgrimes{ 3961541Srgrimes register int tlen; 3971541Srgrimes int win = 0; 39855679Sshin struct ip *ip; 39955679Sshin struct tcphdr *nth; 40055679Sshin#ifdef INET6 40155679Sshin struct ip6_hdr *ip6; 40255679Sshin int isipv6; 40355679Sshin#endif /* INET6 */ 40455679Sshin int ipflags = 0; 405122922Sandre struct inpcb *inp = NULL; 4061541Srgrimes 407101137Srwatson KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); 408101137Srwatson 40955679Sshin#ifdef INET6 410105586Sphk isipv6 = ((struct ip *)ipgen)->ip_v == 6; 41155679Sshin ip6 = ipgen; 41255679Sshin#endif /* INET6 */ 41355679Sshin ip = ipgen; 41455679Sshin 4151541Srgrimes if (tp) { 416122327Ssam inp = tp->t_inpcb; 417122327Ssam KASSERT(inp != NULL, ("tcp control block w/o inpcb")); 418122327Ssam INP_INFO_WLOCK_ASSERT(&tcbinfo); 419122327Ssam INP_LOCK_ASSERT(inp); 42057576Sps if (!(flags & TH_RST)) { 421122327Ssam win = sbspace(&inp->inp_socket->so_rcv); 42257576Sps if (win > (long)TCP_MAXWIN << tp->rcv_scale) 42357576Sps win = (long)TCP_MAXWIN << tp->rcv_scale; 42457576Sps } 4251541Srgrimes } 4261541Srgrimes if (m == 0) { 427111119Simp m = m_gethdr(M_DONTWAIT, MT_HEADER); 4281541Srgrimes if (m == NULL) 4291541Srgrimes return; 4301541Srgrimes tlen = 0; 4311541Srgrimes m->m_data += max_linkhdr; 43255679Sshin#ifdef INET6 43355679Sshin if (isipv6) { 43455679Sshin bcopy((caddr_t)ip6, mtod(m, caddr_t), 43555679Sshin sizeof(struct ip6_hdr)); 43655679Sshin ip6 = mtod(m, struct ip6_hdr *); 43755679Sshin nth = (struct tcphdr *)(ip6 + 1); 43855679Sshin } else 43955679Sshin#endif /* INET6 */ 44055679Sshin { 44155679Sshin bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); 44255679Sshin ip = mtod(m, struct ip *); 44355679Sshin nth = (struct tcphdr *)(ip + 1); 44455679Sshin } 44555679Sshin bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); 4461541Srgrimes flags = TH_ACK; 4471541Srgrimes } else { 4481541Srgrimes m_freem(m->m_next); 4491541Srgrimes m->m_next = 0; 45055679Sshin m->m_data = (caddr_t)ipgen; 45155679Sshin /* m_len is set later */ 4521541Srgrimes tlen = 0; 4531541Srgrimes#define xchg(a,b,type) { type t; t=a; a=b; b=t; } 45455679Sshin#ifdef INET6 45555679Sshin if (isipv6) { 45655679Sshin xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); 45755679Sshin nth = (struct tcphdr *)(ip6 + 1); 45855679Sshin } else 45955679Sshin#endif /* INET6 */ 46055679Sshin { 46155679Sshin xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); 46255679Sshin nth = (struct tcphdr *)(ip + 1); 46355679Sshin } 46455679Sshin if (th != nth) { 46555679Sshin /* 46655679Sshin * this is usually a case when an extension header 46755679Sshin * exists between the IPv6 header and the 46855679Sshin * TCP header. 46955679Sshin */ 47055679Sshin nth->th_sport = th->th_sport; 47155679Sshin nth->th_dport = th->th_dport; 47255679Sshin } 47355679Sshin xchg(nth->th_dport, nth->th_sport, n_short); 4741541Srgrimes#undef xchg 4751541Srgrimes } 47655679Sshin#ifdef INET6 47755679Sshin if (isipv6) { 47890198Sume ip6->ip6_flow = 0; 47990198Sume ip6->ip6_vfc = IPV6_VERSION; 48090198Sume ip6->ip6_nxt = IPPROTO_TCP; 48155679Sshin ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + 48255679Sshin tlen)); 48355679Sshin tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); 48456039Sshin } else 48555679Sshin#endif 48655679Sshin { 4871541Srgrimes tlen += sizeof (struct tcpiphdr); 48858698Sjlemon ip->ip_len = tlen; 48958698Sjlemon ip->ip_ttl = ip_defttl; 490124248Sandre if (path_mtu_discovery) 491124248Sandre ip->ip_off |= IP_DF; 49255679Sshin } 4931541Srgrimes m->m_len = tlen; 4941541Srgrimes m->m_pkthdr.len = tlen; 4951541Srgrimes m->m_pkthdr.rcvif = (struct ifnet *) 0; 496101106Srwatson#ifdef MAC 497122327Ssam if (inp != NULL) { 498101106Srwatson /* 499101106Srwatson * Packet is associated with a socket, so allow the 500101106Srwatson * label of the response to reflect the socket label. 501101106Srwatson */ 502122327Ssam mac_create_mbuf_from_socket(inp->inp_socket, m); 503101106Srwatson } else { 504101106Srwatson /* 505119245Srwatson * Packet is not associated with a socket, so possibly 506119245Srwatson * update the label in place. 507101106Srwatson */ 508119245Srwatson mac_reflect_mbuf_tcp(m); 509101106Srwatson } 510101106Srwatson#endif 51155679Sshin nth->th_seq = htonl(seq); 51255679Sshin nth->th_ack = htonl(ack); 51355679Sshin nth->th_x2 = 0; 51455679Sshin nth->th_off = sizeof (struct tcphdr) >> 2; 51555679Sshin nth->th_flags = flags; 5161541Srgrimes if (tp) 51755679Sshin nth->th_win = htons((u_short) (win >> tp->rcv_scale)); 5181541Srgrimes else 51955679Sshin nth->th_win = htons((u_short)win); 52055679Sshin nth->th_urp = 0; 52155679Sshin#ifdef INET6 52255679Sshin if (isipv6) { 52359392Sshin nth->th_sum = 0; 52455679Sshin nth->th_sum = in6_cksum(m, IPPROTO_TCP, 52555679Sshin sizeof(struct ip6_hdr), 52655679Sshin tlen - sizeof(struct ip6_hdr)); 527122922Sandre ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL); 52855679Sshin } else 52955679Sshin#endif /* INET6 */ 53055679Sshin { 53158698Sjlemon nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 53258698Sjlemon htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); 53358698Sjlemon m->m_pkthdr.csum_flags = CSUM_TCP; 53458698Sjlemon m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 53555679Sshin } 5366283Swollman#ifdef TCPDEBUG 537122327Ssam if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) 53855679Sshin tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); 5396283Swollman#endif 54055679Sshin#ifdef INET6 541122922Sandre if (isipv6) 542122922Sandre (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); 543122922Sandre else 54455679Sshin#endif /* INET6 */ 545122922Sandre (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); 5461541Srgrimes} 5471541Srgrimes 5481541Srgrimes/* 5491541Srgrimes * Create a new TCP control block, making an 5501541Srgrimes * empty reassembly queue and hooking it to the argument 55134881Swollman * protocol control block. The `inp' parameter must have 55234881Swollman * come from the zone allocator set up in tcp_init(). 5531541Srgrimes */ 5541541Srgrimesstruct tcpcb * 5551541Srgrimestcp_newtcpcb(inp) 5561541Srgrimes struct inpcb *inp; 5571541Srgrimes{ 558111145Sjlemon struct tcpcb_mem *tm; 559111145Sjlemon struct tcpcb *tp; 56055679Sshin#ifdef INET6 56155679Sshin int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 56255679Sshin#endif /* INET6 */ 5631541Srgrimes 564111145Sjlemon tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); 565111145Sjlemon if (tm == NULL) 566111145Sjlemon return (NULL); 567111145Sjlemon tp = &tm->tcb; 568111145Sjlemon /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ 56955679Sshin tp->t_maxseg = tp->t_maxopd = 57055679Sshin#ifdef INET6 57155679Sshin isipv6 ? tcp_v6mssdflt : 57255679Sshin#endif /* INET6 */ 57355679Sshin tcp_mssdflt; 5741541Srgrimes 57550673Sjlemon /* Set up our timeouts. */ 576111145Sjlemon callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0); 577111145Sjlemon callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0); 578111145Sjlemon callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0); 579111145Sjlemon callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0); 580111145Sjlemon callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0); 58150673Sjlemon 5826283Swollman if (tcp_do_rfc1323) 5836283Swollman tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); 5846283Swollman if (tcp_do_rfc1644) 5856283Swollman tp->t_flags |= TF_REQ_CC; 58634881Swollman tp->t_inpcb = inp; /* XXX */ 5871541Srgrimes /* 5881541Srgrimes * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 58916367Swollman * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives 5901541Srgrimes * reasonable initial retransmit time. 5911541Srgrimes */ 5921541Srgrimes tp->t_srtt = TCPTV_SRTTBASE; 59316367Swollman tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; 594100335Sdillon tp->t_rttmin = tcp_rexmit_min; 59516367Swollman tp->t_rxtcur = TCPTV_RTOBASE; 5961541Srgrimes tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 597102017Sdillon tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 5981541Srgrimes tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 59950673Sjlemon tp->t_rcvtime = ticks; 600102017Sdillon tp->t_bw_rtttime = ticks; 60156564Sshin /* 60256564Sshin * IPv4 TTL initialization is necessary for an IPv6 socket as well, 60356564Sshin * because the socket may be bound to an IPv6 wildcard address, 60456564Sshin * which may match an IPv4-mapped IPv6 address. 60556564Sshin */ 60624570Sdg inp->inp_ip_ttl = ip_defttl; 6071541Srgrimes inp->inp_ppcb = (caddr_t)tp; 60834881Swollman return (tp); /* XXX */ 6091541Srgrimes} 6101541Srgrimes 6111541Srgrimes/* 6121541Srgrimes * Drop a TCP connection, reporting 6131541Srgrimes * the specified error. If connection is synchronized, 6141541Srgrimes * then send a RST to peer. 6151541Srgrimes */ 6161541Srgrimesstruct tcpcb * 6171541Srgrimestcp_drop(tp, errno) 6181541Srgrimes register struct tcpcb *tp; 6191541Srgrimes int errno; 6201541Srgrimes{ 6211541Srgrimes struct socket *so = tp->t_inpcb->inp_socket; 6221541Srgrimes 6231541Srgrimes if (TCPS_HAVERCVDSYN(tp->t_state)) { 6241541Srgrimes tp->t_state = TCPS_CLOSED; 6251541Srgrimes (void) tcp_output(tp); 6261541Srgrimes tcpstat.tcps_drops++; 6271541Srgrimes } else 6281541Srgrimes tcpstat.tcps_conndrops++; 6291541Srgrimes if (errno == ETIMEDOUT && tp->t_softerror) 6301541Srgrimes errno = tp->t_softerror; 6311541Srgrimes so->so_error = errno; 6321541Srgrimes return (tcp_close(tp)); 6331541Srgrimes} 6341541Srgrimes 635111145Sjlemonstatic void 636111145Sjlemontcp_discardcb(tp) 637111145Sjlemon struct tcpcb *tp; 6381541Srgrimes{ 639111145Sjlemon struct tseg_qent *q; 6401541Srgrimes struct inpcb *inp = tp->t_inpcb; 6411541Srgrimes struct socket *so = inp->inp_socket; 64255679Sshin#ifdef INET6 64355679Sshin int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 64455679Sshin#endif /* INET6 */ 6451541Srgrimes 6461541Srgrimes /* 64750673Sjlemon * Make sure that all of our timers are stopped before we 64850673Sjlemon * delete the PCB. 64950673Sjlemon */ 65050673Sjlemon callout_stop(tp->tt_rexmt); 65150673Sjlemon callout_stop(tp->tt_persist); 65250673Sjlemon callout_stop(tp->tt_keep); 65350673Sjlemon callout_stop(tp->tt_2msl); 65450673Sjlemon callout_stop(tp->tt_delack); 65550673Sjlemon 65650673Sjlemon /* 6579373Swollman * If we got enough samples through the srtt filter, 6589373Swollman * save the rtt and rttvar in the routing entry. 659122922Sandre * 'Enough' is arbitrarily defined as 4 rtt samples. 660122922Sandre * 4 samples is enough for the srtt filter to converge 661122922Sandre * to within enough % of the correct value; fewer samples 662122922Sandre * and we could save a bogus rtt. The danger is not high 663122922Sandre * as tcp quickly recovers from everything. 664122922Sandre * XXX: Works very well but needs some more statistics! 6651541Srgrimes */ 666122922Sandre if (tp->t_rttupdated >= 4) { 667122922Sandre struct hc_metrics_lite metrics; 668122922Sandre u_long ssthresh; 6691541Srgrimes 670122922Sandre bzero(&metrics, sizeof(metrics)); 6711541Srgrimes /* 672122922Sandre * Update the ssthresh always when the conditions below 673122922Sandre * are satisfied. This gives us better new start value 674122922Sandre * for the congestion avoidance for new connections. 675122922Sandre * ssthresh is only set if packet loss occured on a session. 6761541Srgrimes */ 677122922Sandre ssthresh = tp->snd_ssthresh; 678122922Sandre if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { 6791541Srgrimes /* 6801541Srgrimes * convert the limit from user data bytes to 6811541Srgrimes * packets then to packet data bytes. 6821541Srgrimes */ 683122922Sandre ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; 684122922Sandre if (ssthresh < 2) 685122922Sandre ssthresh = 2; 686122922Sandre ssthresh *= (u_long)(tp->t_maxseg + 68755679Sshin#ifdef INET6 68855679Sshin (isipv6 ? sizeof (struct ip6_hdr) + 68955679Sshin sizeof (struct tcphdr) : 69055679Sshin#endif 69155679Sshin sizeof (struct tcpiphdr) 69255679Sshin#ifdef INET6 69355679Sshin ) 69455679Sshin#endif 69555679Sshin ); 696122922Sandre } else 697122922Sandre ssthresh = 0; 698122922Sandre metrics.rmx_ssthresh = ssthresh; 699122922Sandre 700122922Sandre metrics.rmx_rtt = tp->t_srtt; 701122922Sandre metrics.rmx_rttvar = tp->t_rttvar; 702122922Sandre /* XXX: This wraps if the pipe is more than 4 Gbit per second */ 703122922Sandre metrics.rmx_bandwidth = tp->snd_bandwidth; 704122922Sandre metrics.rmx_cwnd = tp->snd_cwnd; 705122922Sandre metrics.rmx_sendpipe = 0; 706122922Sandre metrics.rmx_recvpipe = 0; 707122922Sandre 708122922Sandre tcp_hc_update(&inp->inp_inc, &metrics); 7091541Srgrimes } 710122922Sandre 7111541Srgrimes /* free the reassembly queue, if any */ 712111145Sjlemon while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { 71355679Sshin LIST_REMOVE(q, tqe_q); 71455679Sshin m_freem(q->tqe_m); 715126193Sandre uma_zfree(tcp_reass_zone, q); 716126193Sandre tp->t_segqlen--; 717126193Sandre tcp_reass_qsize--; 7181541Srgrimes } 71932821Sdg inp->inp_ppcb = NULL; 720108265Shsu tp->t_inpcb = NULL; 721111145Sjlemon uma_zfree(tcpcb_zone, tp); 7221541Srgrimes soisdisconnected(so); 723111145Sjlemon} 724111145Sjlemon 725111145Sjlemon/* 726111145Sjlemon * Close a TCP control block: 727111145Sjlemon * discard all space held by the tcp 728111145Sjlemon * discard internet protocol block 729111145Sjlemon * wake up any sleepers 730111145Sjlemon */ 731111145Sjlemonstruct tcpcb * 732111145Sjlemontcp_close(tp) 733111145Sjlemon struct tcpcb *tp; 734111145Sjlemon{ 735111145Sjlemon struct inpcb *inp = tp->t_inpcb; 736111153Sjlemon#ifdef INET6 737111145Sjlemon struct socket *so = inp->inp_socket; 738111153Sjlemon#endif 739111145Sjlemon 740111145Sjlemon tcp_discardcb(tp); 74155679Sshin#ifdef INET6 74255679Sshin if (INP_CHECK_SOCKAF(so, AF_INET6)) 74355679Sshin in6_pcbdetach(inp); 74455679Sshin else 745111145Sjlemon#endif 746111145Sjlemon in_pcbdetach(inp); 7471541Srgrimes tcpstat.tcps_closed++; 7481541Srgrimes return ((struct tcpcb *)0); 7491541Srgrimes} 7501541Srgrimes 7511541Srgrimesvoid 7521541Srgrimestcp_drain() 7531541Srgrimes{ 75455198Smsmith if (do_tcpdrain) 75555198Smsmith { 75655198Smsmith struct inpcb *inpb; 75755198Smsmith struct tcpcb *tcpb; 75855679Sshin struct tseg_qent *te; 7591541Srgrimes 76055198Smsmith /* 76155198Smsmith * Walk the tcpbs, if existing, and flush the reassembly queue, 76255198Smsmith * if there is one... 76355198Smsmith * XXX: The "Net/3" implementation doesn't imply that the TCP 76455198Smsmith * reassembly queue should be flushed, but in a situation 76555198Smsmith * where we're really low on mbufs, this is potentially 76655198Smsmith * usefull. 76755198Smsmith */ 76898102Shsu INP_INFO_RLOCK(&tcbinfo); 76974362Sphk LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { 770111145Sjlemon if (inpb->inp_vflag & INP_TIMEWAIT) 771111145Sjlemon continue; 77298102Shsu INP_LOCK(inpb); 77374362Sphk if ((tcpb = intotcpcb(inpb))) { 77474362Sphk while ((te = LIST_FIRST(&tcpb->t_segq)) 77574362Sphk != NULL) { 77655679Sshin LIST_REMOVE(te, tqe_q); 77755679Sshin m_freem(te->tqe_m); 778126193Sandre uma_zfree(tcp_reass_zone, te); 779126193Sandre tcpb->t_segqlen--; 780126193Sandre tcp_reass_qsize--; 78155198Smsmith } 78255198Smsmith } 78398102Shsu INP_UNLOCK(inpb); 78455198Smsmith } 78598102Shsu INP_INFO_RUNLOCK(&tcbinfo); 78655198Smsmith } 7871541Srgrimes} 7881541Srgrimes 7891541Srgrimes/* 7901541Srgrimes * Notify a tcp user of an asynchronous error; 7911541Srgrimes * store error as soft error, but wake up user 7921541Srgrimes * (for now, won't do anything until can select for soft error). 79372960Sjlemon * 79472960Sjlemon * Do not wake up user since there currently is no mechanism for 79572960Sjlemon * reporting soft errors (yet - a kqueue filter may be added). 7961541Srgrimes */ 79798211Shsustatic struct inpcb * 7981541Srgrimestcp_notify(inp, error) 7991541Srgrimes struct inpcb *inp; 8001541Srgrimes int error; 8011541Srgrimes{ 80272960Sjlemon struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; 8031541Srgrimes 8041541Srgrimes /* 8051541Srgrimes * Ignore some errors if we are hooked up. 8061541Srgrimes * If connection hasn't completed, has retransmitted several times, 8071541Srgrimes * and receives a second error, give up now. This is better 8081541Srgrimes * than waiting a long time to establish a connection that 8091541Srgrimes * can never complete. 8101541Srgrimes */ 8111541Srgrimes if (tp->t_state == TCPS_ESTABLISHED && 812110896Shsu (error == EHOSTUNREACH || error == ENETUNREACH || 813110896Shsu error == EHOSTDOWN)) { 81498211Shsu return inp; 8151541Srgrimes } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && 81698211Shsu tp->t_softerror) { 81772960Sjlemon tcp_drop(tp, error); 81898211Shsu return (struct inpcb *)0; 81998211Shsu } else { 8201541Srgrimes tp->t_softerror = error; 82198211Shsu return inp; 82298211Shsu } 82372960Sjlemon#if 0 824111748Sdes wakeup( &so->so_timeo); 8251541Srgrimes sorwakeup(so); 8261541Srgrimes sowwakeup(so); 82772960Sjlemon#endif 8281541Srgrimes} 8291541Srgrimes 83036079Swollmanstatic int 83162573Sphktcp_pcblist(SYSCTL_HANDLER_ARGS) 83236079Swollman{ 83336079Swollman int error, i, n, s; 83436079Swollman struct inpcb *inp, **inp_list; 83536079Swollman inp_gen_t gencnt; 83636079Swollman struct xinpgen xig; 83736079Swollman 83836079Swollman /* 83936079Swollman * The process of preparing the TCB list is too time-consuming and 84036079Swollman * resource-intensive to repeat twice on every request. 84136079Swollman */ 84236079Swollman if (req->oldptr == 0) { 84336079Swollman n = tcbinfo.ipi_count; 84436079Swollman req->oldidx = 2 * (sizeof xig) 84536079Swollman + (n + n/8) * sizeof(struct xtcpcb); 84636079Swollman return 0; 84736079Swollman } 84836079Swollman 84936079Swollman if (req->newptr != 0) 85036079Swollman return EPERM; 85136079Swollman 85236079Swollman /* 85336079Swollman * OK, now we're committed to doing something. 85436079Swollman */ 85536079Swollman s = splnet(); 85698102Shsu INP_INFO_RLOCK(&tcbinfo); 85736079Swollman gencnt = tcbinfo.ipi_gencnt; 85836079Swollman n = tcbinfo.ipi_count; 85998102Shsu INP_INFO_RUNLOCK(&tcbinfo); 86036079Swollman splx(s); 86136079Swollman 862126253Struckman error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) 863100831Struckman + n * sizeof(struct xtcpcb)); 864126253Struckman if (error != 0) 865126253Struckman return (error); 866100831Struckman 86736079Swollman xig.xig_len = sizeof xig; 86836079Swollman xig.xig_count = n; 86936079Swollman xig.xig_gen = gencnt; 87036079Swollman xig.xig_sogen = so_gencnt; 87136079Swollman error = SYSCTL_OUT(req, &xig, sizeof xig); 87236079Swollman if (error) 87336079Swollman return error; 87436079Swollman 875111119Simp inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 87636079Swollman if (inp_list == 0) 87736079Swollman return ENOMEM; 87836079Swollman 87936079Swollman s = splnet(); 88098102Shsu INP_INFO_RLOCK(&tcbinfo); 88171999Sphk for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; 88271999Sphk inp = LIST_NEXT(inp, inp_list)) { 88398102Shsu INP_LOCK(inp); 884113345Srwatson if (inp->inp_gencnt <= gencnt) { 885113345Srwatson /* 886113345Srwatson * XXX: This use of cr_cansee(), introduced with 887113345Srwatson * TCP state changes, is not quite right, but for 888113345Srwatson * now, better than nothing. 889113345Srwatson */ 890113345Srwatson if (inp->inp_vflag & INP_TIMEWAIT) 891113345Srwatson error = cr_cansee(req->td->td_ucred, 892113345Srwatson intotw(inp)->tw_cred); 893113345Srwatson else 894113345Srwatson error = cr_canseesocket(req->td->td_ucred, 895113345Srwatson inp->inp_socket); 896113345Srwatson if (error == 0) 897113345Srwatson inp_list[i++] = inp; 898113345Srwatson } 89998102Shsu INP_UNLOCK(inp); 90036079Swollman } 90198102Shsu INP_INFO_RUNLOCK(&tcbinfo); 90236079Swollman splx(s); 90336079Swollman n = i; 90436079Swollman 90536079Swollman error = 0; 90636079Swollman for (i = 0; i < n; i++) { 90736079Swollman inp = inp_list[i]; 90836079Swollman if (inp->inp_gencnt <= gencnt) { 90936079Swollman struct xtcpcb xt; 91047960Stegge caddr_t inp_ppcb; 91136079Swollman xt.xt_len = sizeof xt; 91236079Swollman /* XXX should avoid extra copy */ 91336079Swollman bcopy(inp, &xt.xt_inp, sizeof *inp); 91447960Stegge inp_ppcb = inp->inp_ppcb; 915111145Sjlemon if (inp_ppcb == NULL) 916111145Sjlemon bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 917111145Sjlemon else if (inp->inp_vflag & INP_TIMEWAIT) { 918111145Sjlemon bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 919111145Sjlemon xt.xt_tp.t_state = TCPS_TIME_WAIT; 920111145Sjlemon } else 92147960Stegge bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); 92236079Swollman if (inp->inp_socket) 92336079Swollman sotoxsocket(inp->inp_socket, &xt.xt_socket); 924111145Sjlemon else { 925111145Sjlemon bzero(&xt.xt_socket, sizeof xt.xt_socket); 926111145Sjlemon xt.xt_socket.xso_protocol = IPPROTO_TCP; 927111145Sjlemon } 928110896Shsu xt.xt_inp.inp_gencnt = inp->inp_gencnt; 92936079Swollman error = SYSCTL_OUT(req, &xt, sizeof xt); 93036079Swollman } 93136079Swollman } 93236079Swollman if (!error) { 93336079Swollman /* 93436079Swollman * Give the user an updated idea of our state. 93536079Swollman * If the generation differs from what we told 93636079Swollman * her before, she knows that something happened 93736079Swollman * while we were processing this request, and it 93836079Swollman * might be necessary to retry. 93936079Swollman */ 94036079Swollman s = splnet(); 94198102Shsu INP_INFO_RLOCK(&tcbinfo); 94236079Swollman xig.xig_gen = tcbinfo.ipi_gencnt; 94336079Swollman xig.xig_sogen = so_gencnt; 94436079Swollman xig.xig_count = tcbinfo.ipi_count; 94598102Shsu INP_INFO_RUNLOCK(&tcbinfo); 94636079Swollman splx(s); 94736079Swollman error = SYSCTL_OUT(req, &xig, sizeof xig); 94836079Swollman } 94936079Swollman free(inp_list, M_TEMP); 95036079Swollman return error; 95136079Swollman} 95236079Swollman 95336079SwollmanSYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, 95436079Swollman tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); 95536079Swollman 95648758Sgreenstatic int 95762573Sphktcp_getcred(SYSCTL_HANDLER_ARGS) 95848758Sgreen{ 95972650Sgreen struct xucred xuc; 96048758Sgreen struct sockaddr_in addrs[2]; 96148758Sgreen struct inpcb *inp; 96248758Sgreen int error, s; 96348758Sgreen 96493593Sjhb error = suser_cred(req->td->td_ucred, PRISON_ROOT); 96548758Sgreen if (error) 96648758Sgreen return (error); 96748758Sgreen error = SYSCTL_IN(req, addrs, sizeof(addrs)); 96848758Sgreen if (error) 96948758Sgreen return (error); 97048758Sgreen s = splnet(); 97198102Shsu INP_INFO_RLOCK(&tcbinfo); 97248758Sgreen inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, 97354263Sshin addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); 97498102Shsu if (inp == NULL) { 97548758Sgreen error = ENOENT; 97698102Shsu goto outunlocked; 97748758Sgreen } 97899837Struckman INP_LOCK(inp); 97999837Struckman if (inp->inp_socket == NULL) { 98099837Struckman error = ENOENT; 98199837Struckman goto out; 98299837Struckman } 98392976Srwatson error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); 98478697Sdwmalone if (error) 98578697Sdwmalone goto out; 98691354Sdd cru2x(inp->inp_socket->so_cred, &xuc); 98748758Sgreenout: 98898102Shsu INP_UNLOCK(inp); 98998102Shsuoutunlocked: 99098102Shsu INP_INFO_RUNLOCK(&tcbinfo); 99148758Sgreen splx(s); 99299838Struckman if (error == 0) 99399838Struckman error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 99448758Sgreen return (error); 99548758Sgreen} 99648758Sgreen 99778697SdwmaloneSYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, 99878697Sdwmalone CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, 99978697Sdwmalone tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); 100048758Sgreen 100155679Sshin#ifdef INET6 100255679Sshinstatic int 100362573Sphktcp6_getcred(SYSCTL_HANDLER_ARGS) 100455679Sshin{ 100572650Sgreen struct xucred xuc; 100655679Sshin struct sockaddr_in6 addrs[2]; 100755679Sshin struct inpcb *inp; 100855679Sshin int error, s, mapped = 0; 100955679Sshin 101093593Sjhb error = suser_cred(req->td->td_ucred, PRISON_ROOT); 101155679Sshin if (error) 101255679Sshin return (error); 101355679Sshin error = SYSCTL_IN(req, addrs, sizeof(addrs)); 101455679Sshin if (error) 101555679Sshin return (error); 101655679Sshin if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { 101755679Sshin if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) 101855679Sshin mapped = 1; 101955679Sshin else 102055679Sshin return (EINVAL); 102155679Sshin } 102255679Sshin s = splnet(); 102398102Shsu INP_INFO_RLOCK(&tcbinfo); 102455679Sshin if (mapped == 1) 102555679Sshin inp = in_pcblookup_hash(&tcbinfo, 102655679Sshin *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], 102755679Sshin addrs[1].sin6_port, 102855679Sshin *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], 102955679Sshin addrs[0].sin6_port, 103055679Sshin 0, NULL); 103155679Sshin else 103255679Sshin inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, 103355679Sshin addrs[1].sin6_port, 103455679Sshin &addrs[0].sin6_addr, addrs[0].sin6_port, 103555679Sshin 0, NULL); 103698102Shsu if (inp == NULL) { 103755679Sshin error = ENOENT; 103898102Shsu goto outunlocked; 103955679Sshin } 104099837Struckman INP_LOCK(inp); 104199837Struckman if (inp->inp_socket == NULL) { 104299837Struckman error = ENOENT; 104399837Struckman goto out; 104499837Struckman } 104592976Srwatson error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); 104678697Sdwmalone if (error) 104778697Sdwmalone goto out; 104891354Sdd cru2x(inp->inp_socket->so_cred, &xuc); 104955679Sshinout: 105098102Shsu INP_UNLOCK(inp); 105198102Shsuoutunlocked: 105298102Shsu INP_INFO_RUNLOCK(&tcbinfo); 105355679Sshin splx(s); 105499838Struckman if (error == 0) 105599838Struckman error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 105655679Sshin return (error); 105755679Sshin} 105855679Sshin 105978697SdwmaloneSYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, 106078697Sdwmalone CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, 106178697Sdwmalone tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); 106255679Sshin#endif 106355679Sshin 106455679Sshin 10651541Srgrimesvoid 106612881Sbdetcp_ctlinput(cmd, sa, vip) 10671541Srgrimes int cmd; 10681541Srgrimes struct sockaddr *sa; 106912881Sbde void *vip; 10701541Srgrimes{ 107172959Sjlemon struct ip *ip = vip; 107272959Sjlemon struct tcphdr *th; 107373109Sjlemon struct in_addr faddr; 107473109Sjlemon struct inpcb *inp; 107573109Sjlemon struct tcpcb *tp; 107698211Shsu struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; 107773109Sjlemon tcp_seq icmp_seq; 107873109Sjlemon int s; 10791541Srgrimes 108073109Sjlemon faddr = ((struct sockaddr_in *)sa)->sin_addr; 108173109Sjlemon if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 108273109Sjlemon return; 108373109Sjlemon 10841541Srgrimes if (cmd == PRC_QUENCH) 10851541Srgrimes notify = tcp_quench; 108674937Sjesper else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || 108799156Sjesper cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) 108872959Sjlemon notify = tcp_drop_syn_sent; 108973109Sjlemon else if (cmd == PRC_MSGSIZE) 109010881Swollman notify = tcp_mtudisc; 1091122922Sandre /* 1092122922Sandre * Redirects don't need to be handled up here. 1093122922Sandre */ 1094122922Sandre else if (PRC_IS_REDIRECT(cmd)) 1095122922Sandre return; 1096122922Sandre /* 1097122922Sandre * Hostdead is ugly because it goes linearly through all PCBs. 1098122922Sandre * XXX: We never get this from ICMP, otherwise it makes an 1099122922Sandre * excellent DoS attack on machines with many connections. 1100122922Sandre */ 1101122922Sandre else if (cmd == PRC_HOSTDEAD) 110272922Sjesper ip = 0; 1103119995Sru else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) 11041541Srgrimes return; 11051541Srgrimes if (ip) { 110673109Sjlemon s = splnet(); 110717269Swollman th = (struct tcphdr *)((caddr_t)ip 1108105586Sphk + (ip->ip_hl << 2)); 110998596Shsu INP_INFO_WLOCK(&tcbinfo); 111073109Sjlemon inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, 111173109Sjlemon ip->ip_src, th->th_sport, 0, NULL); 111298102Shsu if (inp != NULL) { 111398102Shsu INP_LOCK(inp); 111498102Shsu if (inp->inp_socket != NULL) { 111598102Shsu icmp_seq = htonl(th->th_seq); 111698102Shsu tp = intotcpcb(inp); 111798102Shsu if (SEQ_GEQ(icmp_seq, tp->snd_una) && 111898102Shsu SEQ_LT(icmp_seq, tp->snd_max)) 111998211Shsu inp = (*notify)(inp, inetctlerrmap[cmd]); 112098102Shsu } 112198211Shsu if (inp) 112298211Shsu INP_UNLOCK(inp); 112386764Sjlemon } else { 112486764Sjlemon struct in_conninfo inc; 112586764Sjlemon 112686764Sjlemon inc.inc_fport = th->th_dport; 112786764Sjlemon inc.inc_lport = th->th_sport; 112886764Sjlemon inc.inc_faddr = faddr; 112986764Sjlemon inc.inc_laddr = ip->ip_src; 113086764Sjlemon#ifdef INET6 113186764Sjlemon inc.inc_isipv6 = 0; 113286764Sjlemon#endif 113386764Sjlemon syncache_unreach(&inc, th); 113473109Sjlemon } 113598596Shsu INP_INFO_WUNLOCK(&tcbinfo); 113673109Sjlemon splx(s); 11371541Srgrimes } else 113898102Shsu in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); 11391541Srgrimes} 11401541Srgrimes 114155679Sshin#ifdef INET6 114255679Sshinvoid 114355679Sshintcp6_ctlinput(cmd, sa, d) 114455679Sshin int cmd; 114555679Sshin struct sockaddr *sa; 114655679Sshin void *d; 114755679Sshin{ 114855679Sshin struct tcphdr th; 114998211Shsu struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; 115055679Sshin struct ip6_hdr *ip6; 115155679Sshin struct mbuf *m; 115278064Sume struct ip6ctlparam *ip6cp = NULL; 115378064Sume const struct sockaddr_in6 *sa6_src = NULL; 115455679Sshin int off; 115578064Sume struct tcp_portonly { 115678064Sume u_int16_t th_sport; 115778064Sume u_int16_t th_dport; 115878064Sume } *thp; 115955679Sshin 116055679Sshin if (sa->sa_family != AF_INET6 || 116155679Sshin sa->sa_len != sizeof(struct sockaddr_in6)) 116255679Sshin return; 116355679Sshin 116455679Sshin if (cmd == PRC_QUENCH) 116555679Sshin notify = tcp_quench; 116655679Sshin else if (cmd == PRC_MSGSIZE) 116755679Sshin notify = tcp_mtudisc; 116855679Sshin else if (!PRC_IS_REDIRECT(cmd) && 1169119995Sru ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) 117055679Sshin return; 117155679Sshin 117255679Sshin /* if the parameter is from icmp6, decode it. */ 117355679Sshin if (d != NULL) { 117478064Sume ip6cp = (struct ip6ctlparam *)d; 117555679Sshin m = ip6cp->ip6c_m; 117655679Sshin ip6 = ip6cp->ip6c_ip6; 117755679Sshin off = ip6cp->ip6c_off; 117878064Sume sa6_src = ip6cp->ip6c_src; 117955679Sshin } else { 118055679Sshin m = NULL; 118155679Sshin ip6 = NULL; 118267456Sitojun off = 0; /* fool gcc */ 118378064Sume sa6_src = &sa6_any; 118455679Sshin } 118555679Sshin 118655679Sshin if (ip6) { 118786764Sjlemon struct in_conninfo inc; 118855679Sshin /* 118955679Sshin * XXX: We assume that when IPV6 is non NULL, 119055679Sshin * M and OFF are valid. 119155679Sshin */ 119255679Sshin 119367456Sitojun /* check if we can safely examine src and dst ports */ 119478064Sume if (m->m_pkthdr.len < off + sizeof(*thp)) 119567456Sitojun return; 119667456Sitojun 119778064Sume bzero(&th, sizeof(th)); 119878064Sume m_copydata(m, off, sizeof(*thp), (caddr_t)&th); 119978064Sume 120078064Sume in6_pcbnotify(&tcb, sa, th.th_dport, 120178064Sume (struct sockaddr *)ip6cp->ip6c_src, 1202125776Sume th.th_sport, cmd, NULL, notify); 120386764Sjlemon 120486764Sjlemon inc.inc_fport = th.th_dport; 120586764Sjlemon inc.inc_lport = th.th_sport; 120686764Sjlemon inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; 120786764Sjlemon inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; 120886764Sjlemon inc.inc_isipv6 = 1; 120986764Sjlemon syncache_unreach(&inc, &th); 121055679Sshin } else 121191357Salfred in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src, 1212125776Sume 0, cmd, NULL, notify); 121355679Sshin} 121455679Sshin#endif /* INET6 */ 121555679Sshin 121680428Speter 121782122Ssilby/* 121882122Ssilby * Following is where TCP initial sequence number generation occurs. 121982122Ssilby * 122082122Ssilby * There are two places where we must use initial sequence numbers: 122182122Ssilby * 1. In SYN-ACK packets. 122282122Ssilby * 2. In SYN packets. 122382122Ssilby * 122494390Ssilby * All ISNs for SYN-ACK packets are generated by the syncache. See 122594390Ssilby * tcp_syncache.c for details. 122682122Ssilby * 122782122Ssilby * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling 122882122Ssilby * depends on this property. In addition, these ISNs should be 122982122Ssilby * unguessable so as to prevent connection hijacking. To satisfy 123082122Ssilby * the requirements of this situation, the algorithm outlined in 123182122Ssilby * RFC 1948 is used to generate sequence numbers. 123282122Ssilby * 123382122Ssilby * Implementation details: 123482122Ssilby * 123582122Ssilby * Time is based off the system timer, and is corrected so that it 123682122Ssilby * increases by one megabyte per second. This allows for proper 123782122Ssilby * recycling on high speed LANs while still leaving over an hour 123882122Ssilby * before rollover. 123982122Ssilby * 124082122Ssilby * net.inet.tcp.isn_reseed_interval controls the number of seconds 124182122Ssilby * between seeding of isn_secret. This is normally set to zero, 124282122Ssilby * as reseeding should not be necessary. 124382122Ssilby * 124482122Ssilby */ 124579413Ssilby 124682122Ssilby#define ISN_BYTES_PER_SECOND 1048576 124779413Ssilby 124882122Ssilbyu_char isn_secret[32]; 124982122Ssilbyint isn_last_reseed; 125082122SsilbyMD5_CTX isn_ctx; 125175619Skris 125275619Skristcp_seq 125382122Ssilbytcp_new_isn(tp) 125482122Ssilby struct tcpcb *tp; 125575619Skris{ 125682122Ssilby u_int32_t md5_buffer[4]; 125782122Ssilby tcp_seq new_isn; 125875619Skris 125982122Ssilby /* Seed if this is the first use, reseed if requested. */ 126094390Ssilby if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && 126182122Ssilby (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) 126282122Ssilby < (u_int)ticks))) { 126382122Ssilby read_random(&isn_secret, sizeof(isn_secret)); 126482122Ssilby isn_last_reseed = ticks; 126582122Ssilby } 126682122Ssilby 126782122Ssilby /* Compute the md5 hash and return the ISN. */ 126882122Ssilby MD5Init(&isn_ctx); 126982122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); 127082122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); 127182122Ssilby#ifdef INET6 127282122Ssilby if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { 127382122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, 127482122Ssilby sizeof(struct in6_addr)); 127582122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, 127682122Ssilby sizeof(struct in6_addr)); 127782122Ssilby } else 127882122Ssilby#endif 127982122Ssilby { 128082122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, 128182122Ssilby sizeof(struct in_addr)); 128282122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, 128382122Ssilby sizeof(struct in_addr)); 128482122Ssilby } 128582122Ssilby MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); 128682122Ssilby MD5Final((u_char *) &md5_buffer, &isn_ctx); 128782122Ssilby new_isn = (tcp_seq) md5_buffer[0]; 128882122Ssilby new_isn += ticks * (ISN_BYTES_PER_SECOND / hz); 128982122Ssilby return new_isn; 129075619Skris} 129175619Skris 12921541Srgrimes/* 12931541Srgrimes * When a source quench is received, close congestion window 12941541Srgrimes * to one segment. We will gradually open it again as we proceed. 12951541Srgrimes */ 129698211Shsustruct inpcb * 12971541Srgrimestcp_quench(inp, errno) 12981541Srgrimes struct inpcb *inp; 12991541Srgrimes int errno; 13001541Srgrimes{ 13011541Srgrimes struct tcpcb *tp = intotcpcb(inp); 13021541Srgrimes 13031541Srgrimes if (tp) 13041541Srgrimes tp->snd_cwnd = tp->t_maxseg; 130598211Shsu return (inp); 13061541Srgrimes} 13076283Swollman 13086283Swollman/* 130972959Sjlemon * When a specific ICMP unreachable message is received and the 131072959Sjlemon * connection state is SYN-SENT, drop the connection. This behavior 131172959Sjlemon * is controlled by the icmp_may_rst sysctl. 131270103Sphk */ 131398211Shsustruct inpcb * 131470103Sphktcp_drop_syn_sent(inp, errno) 131570103Sphk struct inpcb *inp; 131670103Sphk int errno; 131770103Sphk{ 131870103Sphk struct tcpcb *tp = intotcpcb(inp); 131970103Sphk 132098211Shsu if (tp && tp->t_state == TCPS_SYN_SENT) { 132172638Sphk tcp_drop(tp, errno); 132298211Shsu return (struct inpcb *)0; 132398211Shsu } 132498211Shsu return inp; 132572638Sphk} 132672638Sphk 132772638Sphk/* 132810881Swollman * When `need fragmentation' ICMP is received, update our idea of the MSS 132910881Swollman * based on the new value in the route. Also nudge TCP to send something, 133010881Swollman * since we know the packet we just sent was dropped. 133110930Swollman * This duplicates some code in the tcp_mss() function in tcp_input.c. 133210881Swollman */ 133398211Shsustruct inpcb * 133410881Swollmantcp_mtudisc(inp, errno) 133510881Swollman struct inpcb *inp; 133610881Swollman int errno; 133710881Swollman{ 133810881Swollman struct tcpcb *tp = intotcpcb(inp); 1339122922Sandre struct rmxp_tao tao; 134010930Swollman struct socket *so = inp->inp_socket; 1341122922Sandre u_int maxmtu; 1342122922Sandre u_int romtu; 134310930Swollman int mss; 134455679Sshin#ifdef INET6 134555679Sshin int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 134655679Sshin#endif /* INET6 */ 1347122922Sandre bzero(&tao, sizeof(tao)); 134810881Swollman 134910930Swollman if (tp) { 1350122922Sandre maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */ 1351122922Sandre romtu = 135255679Sshin#ifdef INET6 1353122922Sandre isipv6 ? tcp_maxmtu6(&inp->inp_inc) : 1354122922Sandre#endif /* INET6 */ 1355122922Sandre tcp_maxmtu(&inp->inp_inc); 1356122922Sandre if (!maxmtu) 1357122922Sandre maxmtu = romtu; 135855679Sshin else 1359122922Sandre maxmtu = min(maxmtu, romtu); 1360122922Sandre if (!maxmtu) { 136155679Sshin tp->t_maxopd = tp->t_maxseg = 136255679Sshin#ifdef INET6 136355679Sshin isipv6 ? tcp_v6mssdflt : 136455679Sshin#endif /* INET6 */ 136555679Sshin tcp_mssdflt; 136698211Shsu return inp; 136710930Swollman } 1368122922Sandre mss = maxmtu - 136955679Sshin#ifdef INET6 137055679Sshin (isipv6 ? 137155679Sshin sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : 137255679Sshin#endif /* INET6 */ 137355679Sshin sizeof(struct tcpiphdr) 137455679Sshin#ifdef INET6 137555679Sshin ) 137655679Sshin#endif /* INET6 */ 137755679Sshin ; 137855679Sshin 1379122922Sandre if (tcp_do_rfc1644) { 1380122922Sandre tcp_hc_gettao(&inp->inp_inc, &tao); 1381122922Sandre if (tao.tao_mssopt) 1382122922Sandre mss = min(mss, tao.tao_mssopt); 1383122922Sandre } 138412939Swollman /* 138512939Swollman * XXX - The above conditional probably violates the TCP 138612939Swollman * spec. The problem is that, since we don't know the 138712939Swollman * other end's MSS, we are supposed to use a conservative 138812939Swollman * default. But, if we do that, then MTU discovery will 138912939Swollman * never actually take place, because the conservative 139012939Swollman * default is much less than the MTUs typically seen 139112939Swollman * on the Internet today. For the moment, we'll sweep 139212939Swollman * this under the carpet. 139312939Swollman * 139412939Swollman * The conservative default might not actually be a problem 139512939Swollman * if the only case this occurs is when sending an initial 139612939Swollman * SYN with options and data to a host we've never talked 139712939Swollman * to before. Then, they will reply with an MSS value which 139812939Swollman * will get recorded and the new parameters should get 139912939Swollman * recomputed. For Further Study. 140012939Swollman */ 140111415Swollman if (tp->t_maxopd <= mss) 140298211Shsu return inp; 140310930Swollman tp->t_maxopd = mss; 140410930Swollman 140510930Swollman if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 140610930Swollman (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 140710930Swollman mss -= TCPOLEN_TSTAMP_APPA; 140810930Swollman if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && 140910930Swollman (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) 141010930Swollman mss -= TCPOLEN_CC_APPA; 141110930Swollman#if (MCLBYTES & (MCLBYTES - 1)) == 0 141210930Swollman if (mss > MCLBYTES) 141310930Swollman mss &= ~(MCLBYTES-1); 141410930Swollman#else 141510930Swollman if (mss > MCLBYTES) 141610930Swollman mss = mss / MCLBYTES * MCLBYTES; 141710881Swollman#endif 141810930Swollman if (so->so_snd.sb_hiwat < mss) 141910930Swollman mss = so->so_snd.sb_hiwat; 142010930Swollman 142110930Swollman tp->t_maxseg = mss; 142210930Swollman 142311450Swollman tcpstat.tcps_mturesent++; 142450673Sjlemon tp->t_rtttime = 0; 142511450Swollman tp->snd_nxt = tp->snd_una; 142611450Swollman tcp_output(tp); 142710930Swollman } 142898211Shsu return inp; 142910881Swollman} 143010881Swollman 143110881Swollman/* 14326283Swollman * Look-up the routing entry to the peer of this inpcb. If no route 1433108265Shsu * is found and it cannot be allocated, then return NULL. This routine 14346283Swollman * is called by TCP routines that access the rmx structure and by tcp_mss 14356283Swollman * to get the interface MTU. 14366283Swollman */ 1437122922Sandreu_long 1438122922Sandretcp_maxmtu(inc) 143986764Sjlemon struct in_conninfo *inc; 14406283Swollman{ 1441122922Sandre struct route sro; 1442122922Sandre struct sockaddr_in *dst; 1443122922Sandre struct ifnet *ifp; 1444122922Sandre u_long maxmtu = 0; 14456283Swollman 1446122922Sandre KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); 1447122922Sandre 1448122996Sandre bzero(&sro, sizeof(sro)); 1449122922Sandre if (inc->inc_faddr.s_addr != INADDR_ANY) { 1450122922Sandre dst = (struct sockaddr_in *)&sro.ro_dst; 1451122922Sandre dst->sin_family = AF_INET; 1452122922Sandre dst->sin_len = sizeof(*dst); 1453122922Sandre dst->sin_addr = inc->inc_faddr; 1454122922Sandre rtalloc_ign(&sro, RTF_CLONING); 14556283Swollman } 1456122922Sandre if (sro.ro_rt != NULL) { 1457122922Sandre ifp = sro.ro_rt->rt_ifp; 1458122922Sandre if (sro.ro_rt->rt_rmx.rmx_mtu == 0) 1459122922Sandre maxmtu = ifp->if_mtu; 1460122922Sandre else 1461122922Sandre maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); 1462122922Sandre RTFREE(sro.ro_rt); 1463122922Sandre } 1464122922Sandre return (maxmtu); 14656283Swollman} 14666283Swollman 146755679Sshin#ifdef INET6 1468122922Sandreu_long 1469122922Sandretcp_maxmtu6(inc) 147086764Sjlemon struct in_conninfo *inc; 147155679Sshin{ 1472122922Sandre struct route_in6 sro6; 1473122922Sandre struct ifnet *ifp; 1474122922Sandre u_long maxmtu = 0; 147555679Sshin 1476122922Sandre KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); 1477122922Sandre 1478122996Sandre bzero(&sro6, sizeof(sro6)); 1479122922Sandre if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { 1480122922Sandre sro6.ro_dst.sin6_family = AF_INET6; 1481122922Sandre sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); 1482122922Sandre sro6.ro_dst.sin6_addr = inc->inc6_faddr; 1483122922Sandre rtalloc_ign((struct route *)&sro6, RTF_CLONING); 148455679Sshin } 1485122922Sandre if (sro6.ro_rt != NULL) { 1486122922Sandre ifp = sro6.ro_rt->rt_ifp; 1487122922Sandre if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) 1488122922Sandre maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); 1489122922Sandre else 1490122922Sandre maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, 1491122922Sandre IN6_LINKMTU(sro6.ro_rt->rt_ifp)); 1492122922Sandre RTFREE(sro6.ro_rt); 1493122922Sandre } 1494122922Sandre 1495122922Sandre return (maxmtu); 149655679Sshin} 149755679Sshin#endif /* INET6 */ 149855679Sshin 149955679Sshin#ifdef IPSEC 150055679Sshin/* compute ESP/AH header size for TCP, including outer IP header. */ 150155679Sshinsize_t 150255679Sshinipsec_hdrsiz_tcp(tp) 150355679Sshin struct tcpcb *tp; 150455679Sshin{ 150555679Sshin struct inpcb *inp; 150655679Sshin struct mbuf *m; 150755679Sshin size_t hdrsiz; 150855679Sshin struct ip *ip; 150955679Sshin#ifdef INET6 151055679Sshin struct ip6_hdr *ip6; 1511111145Sjlemon#endif 151255679Sshin struct tcphdr *th; 151355679Sshin 151478642Ssilby if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) 151555679Sshin return 0; 1516111119Simp MGETHDR(m, M_DONTWAIT, MT_DATA); 151755679Sshin if (!m) 151855679Sshin return 0; 151955679Sshin 152055679Sshin#ifdef INET6 152155679Sshin if ((inp->inp_vflag & INP_IPV6) != 0) { 152255679Sshin ip6 = mtod(m, struct ip6_hdr *); 152355679Sshin th = (struct tcphdr *)(ip6 + 1); 152455679Sshin m->m_pkthdr.len = m->m_len = 152555679Sshin sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 1526111144Sjlemon tcpip_fillheaders(inp, ip6, th); 152755679Sshin hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 152855679Sshin } else 152955679Sshin#endif /* INET6 */ 153055679Sshin { 153155679Sshin ip = mtod(m, struct ip *); 153255679Sshin th = (struct tcphdr *)(ip + 1); 153355679Sshin m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); 1534111144Sjlemon tcpip_fillheaders(inp, ip, th); 153555679Sshin hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 153655679Sshin } 153755679Sshin 153855679Sshin m_free(m); 153955679Sshin return hdrsiz; 154055679Sshin} 154155679Sshin#endif /*IPSEC*/ 154255679Sshin 15436283Swollman/* 1544111145Sjlemon * Move a TCP connection into TIME_WAIT state. 1545111145Sjlemon * tcbinfo is unlocked. 1546111145Sjlemon * inp is locked, and is unlocked before returning. 1547111145Sjlemon */ 1548111145Sjlemonvoid 1549111145Sjlemontcp_twstart(tp) 1550111145Sjlemon struct tcpcb *tp; 1551111145Sjlemon{ 1552111145Sjlemon struct tcptw *tw; 1553111145Sjlemon struct inpcb *inp; 1554111145Sjlemon int tw_time, acknow; 1555111145Sjlemon struct socket *so; 1556111145Sjlemon 1557112009Sjlemon tw = uma_zalloc(tcptw_zone, M_NOWAIT); 1558112009Sjlemon if (tw == NULL) { 1559112009Sjlemon tw = tcp_timer_2msl_tw(1); 1560112009Sjlemon if (tw == NULL) { 1561112009Sjlemon tcp_close(tp); 1562112009Sjlemon return; 1563112009Sjlemon } 1564112009Sjlemon } 1565111145Sjlemon inp = tp->t_inpcb; 1566111145Sjlemon tw->tw_inpcb = inp; 1567111145Sjlemon 1568111145Sjlemon /* 1569111145Sjlemon * Recover last window size sent. 1570111145Sjlemon */ 1571111145Sjlemon tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; 1572111145Sjlemon 1573111145Sjlemon /* 1574111145Sjlemon * Set t_recent if timestamps are used on the connection. 1575111145Sjlemon */ 1576111145Sjlemon if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == 1577111145Sjlemon (TF_REQ_TSTMP|TF_RCVD_TSTMP)) 1578111145Sjlemon tw->t_recent = tp->ts_recent; 1579111145Sjlemon else 1580111145Sjlemon tw->t_recent = 0; 1581111145Sjlemon 1582111145Sjlemon tw->snd_nxt = tp->snd_nxt; 1583111145Sjlemon tw->rcv_nxt = tp->rcv_nxt; 1584121850Ssilby tw->iss = tp->iss; 1585121884Ssilby tw->irs = tp->irs; 1586111145Sjlemon tw->cc_recv = tp->cc_recv; 1587111145Sjlemon tw->cc_send = tp->cc_send; 1588111145Sjlemon tw->t_starttime = tp->t_starttime; 1589112009Sjlemon tw->tw_time = 0; 1590111145Sjlemon 1591111145Sjlemon/* XXX 1592111145Sjlemon * If this code will 1593111145Sjlemon * be used for fin-wait-2 state also, then we may need 1594111145Sjlemon * a ts_recent from the last segment. 1595111145Sjlemon */ 1596111145Sjlemon /* Shorten TIME_WAIT [RFC-1644, p.28] */ 1597111145Sjlemon if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) { 1598111145Sjlemon tw_time = tp->t_rxtcur * TCPTV_TWTRUNC; 1599111145Sjlemon /* For T/TCP client, force ACK now. */ 1600111145Sjlemon acknow = 1; 1601111145Sjlemon } else { 1602111145Sjlemon tw_time = 2 * tcp_msl; 1603111145Sjlemon acknow = tp->t_flags & TF_ACKNOW; 1604111145Sjlemon } 1605111145Sjlemon tcp_discardcb(tp); 1606111145Sjlemon so = inp->inp_socket; 1607111145Sjlemon so->so_pcb = NULL; 1608111145Sjlemon tw->tw_cred = crhold(so->so_cred); 1609111145Sjlemon tw->tw_so_options = so->so_options; 1610114794Srwatson if (acknow) 1611126351Srwatson tcp_twrespond(tw, TH_ACK); 1612111145Sjlemon sotryfree(so); 1613111145Sjlemon inp->inp_socket = NULL; 1614111145Sjlemon inp->inp_ppcb = (caddr_t)tw; 1615111145Sjlemon inp->inp_vflag |= INP_TIMEWAIT; 1616112009Sjlemon tcp_timer_2msl_reset(tw, tw_time); 1617111145Sjlemon INP_UNLOCK(inp); 1618111145Sjlemon} 1619111145Sjlemon 1620121850Ssilby/* 1621121884Ssilby * The appromixate rate of ISN increase of Microsoft TCP stacks; 1622121884Ssilby * the actual rate is slightly higher due to the addition of 1623121884Ssilby * random positive increments. 1624121884Ssilby * 1625121884Ssilby * Most other new OSes use semi-randomized ISN values, so we 1626121884Ssilby * do not need to worry about them. 1627121884Ssilby */ 1628121884Ssilby#define MS_ISN_BYTES_PER_SECOND 250000 1629121884Ssilby 1630121884Ssilby/* 1631121850Ssilby * Determine if the ISN we will generate has advanced beyond the last 1632121850Ssilby * sequence number used by the previous connection. If so, indicate 1633121850Ssilby * that it is safe to recycle this tw socket by returning 1. 1634121850Ssilby */ 1635121850Ssilbyint 1636121850Ssilbytcp_twrecycleable(struct tcptw *tw) 1637121850Ssilby{ 1638121884Ssilby tcp_seq new_iss = tw->iss; 1639121884Ssilby tcp_seq new_irs = tw->irs; 1640121850Ssilby 1641121884Ssilby new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); 1642121884Ssilby new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); 1643121850Ssilby 1644121884Ssilby if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt)) 1645121850Ssilby return 1; 1646121850Ssilby else 1647121850Ssilby return 0; 1648121850Ssilby} 1649121850Ssilby 1650112009Sjlemonstruct tcptw * 1651112009Sjlemontcp_twclose(struct tcptw *tw, int reuse) 1652111145Sjlemon{ 1653111145Sjlemon struct inpcb *inp; 1654111145Sjlemon 1655111145Sjlemon inp = tw->tw_inpcb; 1656111145Sjlemon tw->tw_inpcb = NULL; 1657112009Sjlemon tcp_timer_2msl_stop(tw); 1658111145Sjlemon inp->inp_ppcb = NULL; 1659111145Sjlemon#ifdef INET6 1660111145Sjlemon if (inp->inp_vflag & INP_IPV6PROTO) 1661111145Sjlemon in6_pcbdetach(inp); 1662111145Sjlemon else 1663111145Sjlemon#endif 1664111145Sjlemon in_pcbdetach(inp); 1665111145Sjlemon tcpstat.tcps_closed++; 1666126002Spjd crfree(tw->tw_cred); 1667126002Spjd tw->tw_cred = NULL; 1668112009Sjlemon if (reuse) 1669112009Sjlemon return (tw); 1670112009Sjlemon uma_zfree(tcptw_zone, tw); 1671112009Sjlemon return (NULL); 1672111145Sjlemon} 1673111145Sjlemon 1674111145Sjlemonint 1675126351Srwatsontcp_twrespond(struct tcptw *tw, int flags) 1676111145Sjlemon{ 1677111145Sjlemon struct inpcb *inp = tw->tw_inpcb; 1678111145Sjlemon struct tcphdr *th; 1679111145Sjlemon struct mbuf *m; 1680111145Sjlemon struct ip *ip = NULL; 1681111145Sjlemon u_int8_t *optp; 1682111145Sjlemon u_int hdrlen, optlen; 1683111145Sjlemon int error; 1684111145Sjlemon#ifdef INET6 1685111145Sjlemon struct ip6_hdr *ip6 = NULL; 1686111145Sjlemon int isipv6 = inp->inp_inc.inc_isipv6; 1687111145Sjlemon#endif 1688111145Sjlemon 1689111231Sphk m = m_gethdr(M_DONTWAIT, MT_HEADER); 1690111145Sjlemon if (m == NULL) 1691111145Sjlemon return (ENOBUFS); 1692111145Sjlemon m->m_data += max_linkhdr; 1693111145Sjlemon 1694114794Srwatson#ifdef MAC 1695123607Srwatson mac_create_mbuf_from_inpcb(inp, m); 1696114794Srwatson#endif 1697114794Srwatson 1698111153Sjlemon#ifdef INET6 1699111145Sjlemon if (isipv6) { 1700111145Sjlemon hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 1701111145Sjlemon ip6 = mtod(m, struct ip6_hdr *); 1702111145Sjlemon th = (struct tcphdr *)(ip6 + 1); 1703111145Sjlemon tcpip_fillheaders(inp, ip6, th); 1704111153Sjlemon } else 1705111153Sjlemon#endif 1706111153Sjlemon { 1707111145Sjlemon hdrlen = sizeof(struct tcpiphdr); 1708111145Sjlemon ip = mtod(m, struct ip *); 1709111145Sjlemon th = (struct tcphdr *)(ip + 1); 1710111145Sjlemon tcpip_fillheaders(inp, ip, th); 1711111145Sjlemon } 1712111145Sjlemon optp = (u_int8_t *)(th + 1); 1713111145Sjlemon 1714111145Sjlemon /* 1715111145Sjlemon * Send a timestamp and echo-reply if both our side and our peer 1716111145Sjlemon * have sent timestamps in our SYN's and this is not a RST. 1717111145Sjlemon */ 1718111145Sjlemon if (tw->t_recent && flags == TH_ACK) { 1719111145Sjlemon u_int32_t *lp = (u_int32_t *)optp; 1720111145Sjlemon 1721111145Sjlemon /* Form timestamp option as shown in appendix A of RFC 1323. */ 1722111145Sjlemon *lp++ = htonl(TCPOPT_TSTAMP_HDR); 1723111145Sjlemon *lp++ = htonl(ticks); 1724111145Sjlemon *lp = htonl(tw->t_recent); 1725111145Sjlemon optp += TCPOLEN_TSTAMP_APPA; 1726111145Sjlemon } 1727111145Sjlemon 1728111145Sjlemon /* 1729111145Sjlemon * Send `CC-family' options if needed, and it's not a RST. 1730111145Sjlemon */ 1731111145Sjlemon if (tw->cc_recv != 0 && flags == TH_ACK) { 1732111145Sjlemon u_int32_t *lp = (u_int32_t *)optp; 1733111145Sjlemon 1734111145Sjlemon *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC)); 1735111145Sjlemon *lp = htonl(tw->cc_send); 1736111145Sjlemon optp += TCPOLEN_CC_APPA; 1737111145Sjlemon } 1738111145Sjlemon optlen = optp - (u_int8_t *)(th + 1); 1739111145Sjlemon 1740111145Sjlemon m->m_len = hdrlen + optlen; 1741111145Sjlemon m->m_pkthdr.len = m->m_len; 1742111145Sjlemon 1743111145Sjlemon KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); 1744111145Sjlemon 1745111145Sjlemon th->th_seq = htonl(tw->snd_nxt); 1746111145Sjlemon th->th_ack = htonl(tw->rcv_nxt); 1747111145Sjlemon th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 1748111145Sjlemon th->th_flags = flags; 1749111145Sjlemon th->th_win = htons(tw->last_win); 1750111145Sjlemon 1751111153Sjlemon#ifdef INET6 1752111145Sjlemon if (isipv6) { 1753111145Sjlemon th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 1754111145Sjlemon sizeof(struct tcphdr) + optlen); 1755122922Sandre ip6->ip6_hlim = in6_selecthlim(inp, NULL); 1756122922Sandre error = ip6_output(m, inp->in6p_outputopts, NULL, 1757111145Sjlemon (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); 1758111153Sjlemon } else 1759111153Sjlemon#endif 1760111153Sjlemon { 1761111145Sjlemon th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 1762111145Sjlemon htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); 1763111145Sjlemon m->m_pkthdr.csum_flags = CSUM_TCP; 1764111145Sjlemon m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 1765111145Sjlemon ip->ip_len = m->m_pkthdr.len; 1766124248Sandre if (path_mtu_discovery) 1767124248Sandre ip->ip_off |= IP_DF; 1768122922Sandre error = ip_output(m, inp->inp_options, NULL, 1769111145Sjlemon (tw->tw_so_options & SO_DONTROUTE), NULL, inp); 1770111145Sjlemon } 1771111145Sjlemon if (flags & TH_ACK) 1772111145Sjlemon tcpstat.tcps_sndacks++; 1773111145Sjlemon else 1774111145Sjlemon tcpstat.tcps_sndctrl++; 1775111145Sjlemon tcpstat.tcps_sndtotal++; 1776111145Sjlemon return (error); 1777111145Sjlemon} 1778111145Sjlemon 1779111145Sjlemon/* 1780102017Sdillon * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING 1781102017Sdillon * 1782102017Sdillon * This code attempts to calculate the bandwidth-delay product as a 1783102017Sdillon * means of determining the optimal window size to maximize bandwidth, 1784102017Sdillon * minimize RTT, and avoid the over-allocation of buffers on interfaces and 1785102017Sdillon * routers. This code also does a fairly good job keeping RTTs in check 1786102017Sdillon * across slow links like modems. We implement an algorithm which is very 1787102017Sdillon * similar (but not meant to be) TCP/Vegas. The code operates on the 1788102017Sdillon * transmitter side of a TCP connection and so only effects the transmit 1789102017Sdillon * side of the connection. 1790102017Sdillon * 1791102017Sdillon * BACKGROUND: TCP makes no provision for the management of buffer space 1792102017Sdillon * at the end points or at the intermediate routers and switches. A TCP 1793102017Sdillon * stream, whether using NewReno or not, will eventually buffer as 1794102017Sdillon * many packets as it is able and the only reason this typically works is 1795102017Sdillon * due to the fairly small default buffers made available for a connection 1796102017Sdillon * (typicaly 16K or 32K). As machines use larger windows and/or window 1797102017Sdillon * scaling it is now fairly easy for even a single TCP connection to blow-out 1798102017Sdillon * all available buffer space not only on the local interface, but on 1799102017Sdillon * intermediate routers and switches as well. NewReno makes a misguided 1800102017Sdillon * attempt to 'solve' this problem by waiting for an actual failure to occur, 1801102017Sdillon * then backing off, then steadily increasing the window again until another 1802102017Sdillon * failure occurs, ad-infinitum. This results in terrible oscillation that 1803102017Sdillon * is only made worse as network loads increase and the idea of intentionally 1804102017Sdillon * blowing out network buffers is, frankly, a terrible way to manage network 1805102017Sdillon * resources. 1806102017Sdillon * 1807102017Sdillon * It is far better to limit the transmit window prior to the failure 1808102017Sdillon * condition being achieved. There are two general ways to do this: First 1809102017Sdillon * you can 'scan' through different transmit window sizes and locate the 1810102017Sdillon * point where the RTT stops increasing, indicating that you have filled the 1811102017Sdillon * pipe, then scan backwards until you note that RTT stops decreasing, then 1812102017Sdillon * repeat ad-infinitum. This method works in principle but has severe 1813102017Sdillon * implementation issues due to RTT variances, timer granularity, and 1814102017Sdillon * instability in the algorithm which can lead to many false positives and 1815102017Sdillon * create oscillations as well as interact badly with other TCP streams 1816102017Sdillon * implementing the same algorithm. 1817102017Sdillon * 1818102017Sdillon * The second method is to limit the window to the bandwidth delay product 1819102017Sdillon * of the link. This is the method we implement. RTT variances and our 1820102017Sdillon * own manipulation of the congestion window, bwnd, can potentially 1821102017Sdillon * destabilize the algorithm. For this reason we have to stabilize the 1822102017Sdillon * elements used to calculate the window. We do this by using the minimum 1823102017Sdillon * observed RTT, the long term average of the observed bandwidth, and 1824102017Sdillon * by adding two segments worth of slop. It isn't perfect but it is able 1825102017Sdillon * to react to changing conditions and gives us a very stable basis on 1826102017Sdillon * which to extend the algorithm. 1827102017Sdillon */ 1828102017Sdillonvoid 1829102017Sdillontcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) 1830102017Sdillon{ 1831102017Sdillon u_long bw; 1832102017Sdillon u_long bwnd; 1833102017Sdillon int save_ticks; 1834102017Sdillon 1835102017Sdillon /* 1836102017Sdillon * If inflight_enable is disabled in the middle of a tcp connection, 1837102017Sdillon * make sure snd_bwnd is effectively disabled. 1838102017Sdillon */ 1839102017Sdillon if (tcp_inflight_enable == 0) { 1840102017Sdillon tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 1841102017Sdillon tp->snd_bandwidth = 0; 1842102017Sdillon return; 1843102017Sdillon } 1844102017Sdillon 1845102017Sdillon /* 1846102017Sdillon * Figure out the bandwidth. Due to the tick granularity this 1847102017Sdillon * is a very rough number and it MUST be averaged over a fairly 1848102017Sdillon * long period of time. XXX we need to take into account a link 1849102017Sdillon * that is not using all available bandwidth, but for now our 1850102017Sdillon * slop will ramp us up if this case occurs and the bandwidth later 1851102017Sdillon * increases. 1852102368Sdillon * 1853102368Sdillon * Note: if ticks rollover 'bw' may wind up negative. We must 1854102368Sdillon * effectively reset t_bw_rtttime for this case. 1855102017Sdillon */ 1856102017Sdillon save_ticks = ticks; 1857102017Sdillon if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) 1858102017Sdillon return; 1859102017Sdillon 1860102017Sdillon bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / 1861102017Sdillon (save_ticks - tp->t_bw_rtttime); 1862102017Sdillon tp->t_bw_rtttime = save_ticks; 1863102017Sdillon tp->t_bw_rtseq = ack_seq; 1864102368Sdillon if (tp->t_bw_rtttime == 0 || (int)bw < 0) 1865102017Sdillon return; 1866102017Sdillon bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; 1867102017Sdillon 1868102017Sdillon tp->snd_bandwidth = bw; 1869102017Sdillon 1870102017Sdillon /* 1871102017Sdillon * Calculate the semi-static bandwidth delay product, plus two maximal 1872102017Sdillon * segments. The additional slop puts us squarely in the sweet 1873107881Sdillon * spot and also handles the bandwidth run-up case and stabilization. 1874107881Sdillon * Without the slop we could be locking ourselves into a lower 1875107881Sdillon * bandwidth. 1876102017Sdillon * 1877102017Sdillon * Situations Handled: 1878102017Sdillon * (1) Prevents over-queueing of packets on LANs, especially on 1879102017Sdillon * high speed LANs, allowing larger TCP buffers to be 1880102017Sdillon * specified, and also does a good job preventing 1881102017Sdillon * over-queueing of packets over choke points like modems 1882102017Sdillon * (at least for the transmit side). 1883102017Sdillon * 1884102017Sdillon * (2) Is able to handle changing network loads (bandwidth 1885102017Sdillon * drops so bwnd drops, bandwidth increases so bwnd 1886102017Sdillon * increases). 1887102017Sdillon * 1888102017Sdillon * (3) Theoretically should stabilize in the face of multiple 1889102017Sdillon * connections implementing the same algorithm (this may need 1890102017Sdillon * a little work). 1891107881Sdillon * 1892107881Sdillon * (4) Stability value (defaults to 20 = 2 maximal packets) can 1893107881Sdillon * be adjusted with a sysctl but typically only needs to be 1894107881Sdillon * on very slow connections. A value no smaller then 5 1895107881Sdillon * should be used, but only reduce this default if you have 1896107881Sdillon * no other choice. 1897102017Sdillon */ 1898102017Sdillon#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) 1899107881Sdillon bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10; 1900102368Sdillon#undef USERTT 1901102017Sdillon 1902102017Sdillon if (tcp_inflight_debug > 0) { 1903102017Sdillon static int ltime; 1904102017Sdillon if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { 1905102017Sdillon ltime = ticks; 1906102017Sdillon printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", 1907102017Sdillon tp, 1908102017Sdillon bw, 1909102017Sdillon tp->t_rttbest, 1910102017Sdillon tp->t_srtt, 1911102017Sdillon bwnd 1912102017Sdillon ); 1913102017Sdillon } 1914102017Sdillon } 1915102017Sdillon if ((long)bwnd < tcp_inflight_min) 1916102017Sdillon bwnd = tcp_inflight_min; 1917102017Sdillon if (bwnd > tcp_inflight_max) 1918102017Sdillon bwnd = tcp_inflight_max; 1919102017Sdillon if ((long)bwnd < tp->t_maxseg * 2) 1920102017Sdillon bwnd = tp->t_maxseg * 2; 1921102017Sdillon tp->snd_bwnd = bwnd; 1922102017Sdillon} 1923102017Sdillon 1924125680Sbms#ifdef TCP_SIGNATURE 1925125680Sbms/* 1926125783Sbms * Callback function invoked by m_apply() to digest TCP segment data 1927125783Sbms * contained within an mbuf chain. 1928125783Sbms */ 1929125783Sbmsstatic int 1930125783Sbmstcp_signature_apply(void *fstate, void *data, u_int len) 1931125783Sbms{ 1932125783Sbms 1933125819Sbms MD5Update(fstate, (u_char *)data, len); 1934125783Sbms return (0); 1935125783Sbms} 1936125783Sbms 1937125783Sbms/* 1938125680Sbms * Compute TCP-MD5 hash of a TCPv4 segment. (RFC2385) 1939125680Sbms * 1940125741Sbms * Parameters: 1941125741Sbms * m pointer to head of mbuf chain 1942125741Sbms * off0 offset to TCP header within the mbuf chain 1943125741Sbms * len length of TCP segment data, excluding options 1944125741Sbms * optlen length of TCP segment options 1945125741Sbms * buf pointer to storage for computed MD5 digest 1946125741Sbms * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) 1947125741Sbms * 1948125680Sbms * We do this over ip, tcphdr, segment data, and the key in the SADB. 1949125680Sbms * When called from tcp_input(), we can be sure that th_sum has been 1950125680Sbms * zeroed out and verified already. 1951125680Sbms * 1952125680Sbms * This function is for IPv4 use only. Calling this function with an 1953125680Sbms * IPv6 packet in the mbuf chain will yield undefined results. 1954125680Sbms * 1955125680Sbms * Return 0 if successful, otherwise return -1. 1956125680Sbms * 1957125680Sbms * XXX The key is retrieved from the system's PF_KEY SADB, by keying a 1958125680Sbms * search with the destination IP address, and a 'magic SPI' to be 1959125680Sbms * determined by the application. This is hardcoded elsewhere to 1179 1960125680Sbms * right now. Another branch of this code exists which uses the SPD to 1961125680Sbms * specify per-application flows but it is unstable. 1962125680Sbms */ 1963125680Sbmsint 1964125783Sbmstcp_signature_compute(struct mbuf *m, int off0, int len, int optlen, 1965125741Sbms u_char *buf, u_int direction) 1966125680Sbms{ 1967125680Sbms union sockaddr_union dst; 1968125680Sbms struct ippseudo ippseudo; 1969125680Sbms MD5_CTX ctx; 1970125680Sbms int doff; 1971125680Sbms struct ip *ip; 1972125680Sbms struct ipovly *ipovly; 1973125680Sbms struct secasvar *sav; 1974125680Sbms struct tcphdr *th; 1975125680Sbms u_short savecsum; 1976125680Sbms 1977125741Sbms KASSERT(m != NULL, ("NULL mbuf chain")); 1978125741Sbms KASSERT(buf != NULL, ("NULL signature pointer")); 1979125741Sbms 1980125741Sbms /* Extract the destination from the IP header in the mbuf. */ 1981125680Sbms ip = mtod(m, struct ip *); 1982125680Sbms bzero(&dst, sizeof(union sockaddr_union)); 1983125680Sbms dst.sa.sa_len = sizeof(struct sockaddr_in); 1984125680Sbms dst.sa.sa_family = AF_INET; 1985125680Sbms dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? 1986125680Sbms ip->ip_src : ip->ip_dst; 1987125741Sbms 1988125741Sbms /* Look up an SADB entry which matches the address of the peer. */ 1989125680Sbms sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); 1990125680Sbms if (sav == NULL) { 1991125680Sbms printf("%s: SADB lookup failed for %s\n", __func__, 1992125680Sbms inet_ntoa(dst.sin.sin_addr)); 1993125680Sbms return (EINVAL); 1994125680Sbms } 1995125741Sbms 1996125680Sbms MD5Init(&ctx); 1997125680Sbms ipovly = (struct ipovly *)ip; 1998125680Sbms th = (struct tcphdr *)((u_char *)ip + off0); 1999125680Sbms doff = off0 + sizeof(struct tcphdr) + optlen; 2000125741Sbms 2001125680Sbms /* 2002125680Sbms * Step 1: Update MD5 hash with IP pseudo-header. 2003125680Sbms * 2004125680Sbms * XXX The ippseudo header MUST be digested in network byte order, 2005125680Sbms * or else we'll fail the regression test. Assume all fields we've 2006125680Sbms * been doing arithmetic on have been in host byte order. 2007125680Sbms * XXX One cannot depend on ipovly->ih_len here. When called from 2008125680Sbms * tcp_output(), the underlying ip_len member has not yet been set. 2009125680Sbms */ 2010125680Sbms ippseudo.ippseudo_src = ipovly->ih_src; 2011125680Sbms ippseudo.ippseudo_dst = ipovly->ih_dst; 2012125680Sbms ippseudo.ippseudo_pad = 0; 2013125680Sbms ippseudo.ippseudo_p = IPPROTO_TCP; 2014125680Sbms ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen); 2015125680Sbms MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); 2016125741Sbms 2017125680Sbms /* 2018125680Sbms * Step 2: Update MD5 hash with TCP header, excluding options. 2019125680Sbms * The TCP checksum must be set to zero. 2020125680Sbms */ 2021125680Sbms savecsum = th->th_sum; 2022125680Sbms th->th_sum = 0; 2023125680Sbms MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); 2024125680Sbms th->th_sum = savecsum; 2025125741Sbms 2026125680Sbms /* 2027125680Sbms * Step 3: Update MD5 hash with TCP segment data. 2028125680Sbms * Use m_apply() to avoid an early m_pullup(). 2029125680Sbms */ 2030125680Sbms if (len > 0) 2031125783Sbms m_apply(m, doff, len, tcp_signature_apply, &ctx); 2032125741Sbms 2033125680Sbms /* 2034125680Sbms * Step 4: Update MD5 hash with shared secret. 2035125680Sbms */ 2036125680Sbms MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); 2037125680Sbms MD5Final(buf, &ctx); 2038125741Sbms 2039125680Sbms key_sa_recordxfer(sav, m); 2040125680Sbms KEY_FREESAV(&sav); 2041125680Sbms return (0); 2042125680Sbms} 2043125680Sbms#endif /* TCP_SIGNATURE */ 2044