tcp_timewait.c revision 121850
11541Srgrimes/* 211150Swollman * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 31541Srgrimes * The Regents of the University of California. All rights reserved. 41541Srgrimes * 51541Srgrimes * Redistribution and use in source and binary forms, with or without 61541Srgrimes * modification, are permitted provided that the following conditions 71541Srgrimes * are met: 81541Srgrimes * 1. Redistributions of source code must retain the above copyright 91541Srgrimes * notice, this list of conditions and the following disclaimer. 101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer in the 121541Srgrimes * documentation and/or other materials provided with the distribution. 131541Srgrimes * 3. All advertising materials mentioning features or use of this software 141541Srgrimes * must display the following acknowledgement: 151541Srgrimes * This product includes software developed by the University of 161541Srgrimes * California, Berkeley and its contributors. 171541Srgrimes * 4. Neither the name of the University nor the names of its contributors 181541Srgrimes * may be used to endorse or promote products derived from this software 191541Srgrimes * without specific prior written permission. 201541Srgrimes * 211541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 221541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 231541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 241541Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 251541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 261541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 271541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 281541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 291541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 301541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 311541Srgrimes * SUCH DAMAGE. 321541Srgrimes * 3311150Swollman * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 3450477Speter * $FreeBSD: head/sys/netinet/tcp_timewait.c 121850 2003-11-01 07:30:08Z silby $ 351541Srgrimes */ 361541Srgrimes 3732752Seivind#include "opt_compat.h" 3854263Sshin#include "opt_inet6.h" 3956041Sshin#include "opt_ipsec.h" 40101106Srwatson#include "opt_mac.h" 4129514Sjoerg#include "opt_tcpdebug.h" 4229514Sjoerg 431541Srgrimes#include <sys/param.h> 441541Srgrimes#include <sys/systm.h> 4550673Sjlemon#include <sys/callout.h> 4612172Sphk#include <sys/kernel.h> 4712172Sphk#include <sys/sysctl.h> 48101106Srwatson#include <sys/mac.h> 491541Srgrimes#include <sys/malloc.h> 501541Srgrimes#include <sys/mbuf.h> 5155679Sshin#ifdef INET6 5255679Sshin#include <sys/domain.h> 5355679Sshin#endif 5448758Sgreen#include <sys/proc.h> 551541Srgrimes#include <sys/socket.h> 561541Srgrimes#include <sys/socketvar.h> 571541Srgrimes#include <sys/protosw.h> 5875619Skris#include <sys/random.h> 5934923Sbde 6092760Sjeff#include <vm/uma.h> 611541Srgrimes 621541Srgrimes#include <net/route.h> 631541Srgrimes#include <net/if.h> 641541Srgrimes 651541Srgrimes#include <netinet/in.h> 661541Srgrimes#include <netinet/in_systm.h> 671541Srgrimes#include <netinet/ip.h> 6855679Sshin#ifdef INET6 6955679Sshin#include <netinet/ip6.h> 7055679Sshin#endif 711541Srgrimes#include <netinet/in_pcb.h> 7255679Sshin#ifdef INET6 7355679Sshin#include <netinet6/in6_pcb.h> 7455679Sshin#endif 757090Sbde#include <netinet/in_var.h> 761541Srgrimes#include <netinet/ip_var.h> 7755679Sshin#ifdef INET6 7855679Sshin#include <netinet6/ip6_var.h> 7955679Sshin#endif 801541Srgrimes#include <netinet/tcp.h> 811541Srgrimes#include <netinet/tcp_fsm.h> 821541Srgrimes#include <netinet/tcp_seq.h> 831541Srgrimes#include <netinet/tcp_timer.h> 841541Srgrimes#include <netinet/tcp_var.h> 8555679Sshin#ifdef INET6 8655679Sshin#include <netinet6/tcp6_var.h> 8755679Sshin#endif 881541Srgrimes#include <netinet/tcpip.h> 896283Swollman#ifdef TCPDEBUG 906283Swollman#include <netinet/tcp_debug.h> 916283Swollman#endif 9255679Sshin#include <netinet6/ip6protosw.h> 931541Srgrimes 9455679Sshin#ifdef IPSEC 9555679Sshin#include <netinet6/ipsec.h> 9662587Sitojun#ifdef INET6 9762587Sitojun#include <netinet6/ipsec6.h> 9862587Sitojun#endif 9955679Sshin#endif /*IPSEC*/ 10055679Sshin 101105199Ssam#ifdef FAST_IPSEC 102105199Ssam#include <netipsec/ipsec.h> 103105199Ssam#ifdef INET6 104105199Ssam#include <netipsec/ipsec6.h> 105105199Ssam#endif 106105199Ssam#define IPSEC 107105199Ssam#endif /*FAST_IPSEC*/ 108105199Ssam 10958698Sjlemon#include <machine/in_cksum.h> 11082122Ssilby#include <sys/md5.h> 11158698Sjlemon 1121541Srgrimesint tcp_mssdflt = TCP_MSS; 11346381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, 11446381Sbillf &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); 11512296Sphk 11652904Sshin#ifdef INET6 11752904Sshinint tcp_v6mssdflt = TCP6_MSS; 11852904SshinSYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, 11955679Sshin CTLFLAG_RW, &tcp_v6mssdflt , 0, 12055679Sshin "Default TCP Maximum Segment Size for IPv6"); 12152904Sshin#endif 12252904Sshin 12350673Sjlemon#if 0 12412296Sphkstatic int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; 12546381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, 12646381Sbillf &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); 12750673Sjlemon#endif 12812296Sphk 12986764Sjlemonint tcp_do_rfc1323 = 1; 13046381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, 13146381Sbillf &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); 13212296Sphk 13386764Sjlemonint tcp_do_rfc1644 = 0; 13446381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, 13546381Sbillf &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); 1361541Srgrimes 13750426Sjlemonstatic int tcp_tcbhashsize = 0; 138121307SsilbySYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, 13950426Sjlemon &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); 14050426Sjlemon 14155198Smsmithstatic int do_tcpdrain = 1; 14266376SbmilekicSYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, 14366376Sbmilekic "Enable tcp_drain routine for extra help when low on mbufs"); 14455198Smsmith 14546381SbillfSYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, 14646381Sbillf &tcbinfo.ipi_count, 0, "Number of active PCBs"); 14736079Swollman 14872959Sjlemonstatic int icmp_may_rst = 1; 14972959SjlemonSYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, 15072959Sjlemon "Certain ICMP unreachable messages may abort connections in SYN_SENT"); 15170103Sphk 15282122Ssilbystatic int tcp_isn_reseed_interval = 0; 15382122SsilbySYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, 15482122Ssilby &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); 15582122Ssilby 156102017Sdillon/* 157102017Sdillon * TCP bandwidth limiting sysctls. Note that the default lower bound of 158102017Sdillon * 1024 exists only for debugging. A good production default would be 159102017Sdillon * something like 6100. 160102017Sdillon */ 161102017Sdillonstatic int tcp_inflight_enable = 0; 162102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW, 163102017Sdillon &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); 164102017Sdillon 165104825Sdillonstatic int tcp_inflight_debug = 0; 166102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW, 167102017Sdillon &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); 168102017Sdillon 169107881Sdillonstatic int tcp_inflight_min = 6144; 170102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW, 171102017Sdillon &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); 172102017Sdillon 173102017Sdillonstatic int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; 174102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW, 175102017Sdillon &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); 176107881Sdillonstatic int tcp_inflight_stab = 20; 177107881SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, 178107881Sdillon &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); 179102017Sdillon 18092723Salfredstatic void tcp_cleartaocache(void); 18198211Shsustatic struct inpcb *tcp_notify(struct inpcb *, int); 182111145Sjlemonstatic void tcp_discardcb(struct tcpcb *); 18312296Sphk 1847684Sdg/* 18532821Sdg * Target size of TCP PCB hash tables. Must be a power of two. 18643562Smsmith * 18743562Smsmith * Note that this can be overridden by the kernel environment 18843562Smsmith * variable net.inet.tcp.tcbhashsize 1897684Sdg */ 1907684Sdg#ifndef TCBHASHSIZE 19132821Sdg#define TCBHASHSIZE 512 1927684Sdg#endif 1931541Srgrimes 1941541Srgrimes/* 195111145Sjlemon * XXX 196111145Sjlemon * Callouts should be moved into struct tcp directly. They are currently 197111145Sjlemon * separate becuase the tcpcb structure is exported to userland for sysctl 198111145Sjlemon * parsing purposes, which do not know about callouts. 19934881Swollman */ 200111145Sjlemonstruct tcpcb_mem { 20134881Swollman struct tcpcb tcb; 202111145Sjlemon struct callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep; 203111145Sjlemon struct callout tcpcb_mem_2msl, tcpcb_mem_delack; 20434881Swollman}; 20534881Swollman 206111145Sjlemonstatic uma_zone_t tcpcb_zone; 207111145Sjlemonstatic uma_zone_t tcptw_zone; 208111145Sjlemon 20934881Swollman/* 2101541Srgrimes * Tcp initialization 2111541Srgrimes */ 2121541Srgrimesvoid 2131541Srgrimestcp_init() 2141541Srgrimes{ 21577843Speter int hashsize = TCBHASHSIZE; 21643562Smsmith 2176283Swollman tcp_ccgen = 1; 2186283Swollman tcp_cleartaocache(); 21950673Sjlemon 22050673Sjlemon tcp_delacktime = TCPTV_DELACK; 22150673Sjlemon tcp_keepinit = TCPTV_KEEP_INIT; 22250673Sjlemon tcp_keepidle = TCPTV_KEEP_IDLE; 22350673Sjlemon tcp_keepintvl = TCPTV_KEEPINTVL; 22450673Sjlemon tcp_maxpersistidle = TCPTV_KEEP_IDLE; 22550673Sjlemon tcp_msl = TCPTV_MSL; 226100335Sdillon tcp_rexmit_min = TCPTV_MIN; 227100335Sdillon tcp_rexmit_slop = TCPTV_CPU_VAR; 22850673Sjlemon 22998102Shsu INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); 2307684Sdg LIST_INIT(&tcb); 2317684Sdg tcbinfo.listhead = &tcb; 23277900Speter TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); 23343576Smsmith if (!powerof2(hashsize)) { 23443562Smsmith printf("WARNING: TCB hash size not a power of 2\n"); 23543562Smsmith hashsize = 512; /* safe default */ 23643562Smsmith } 23750426Sjlemon tcp_tcbhashsize = hashsize; 23843562Smsmith tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); 23943562Smsmith tcbinfo.porthashbase = hashinit(hashsize, M_PCB, 24034923Sbde &tcbinfo.porthashmask); 241111145Sjlemon tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), 24292760Sjeff NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 24392760Sjeff uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); 24455679Sshin#ifdef INET6 24555679Sshin#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) 24655679Sshin#else /* INET6 */ 24755679Sshin#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) 24855679Sshin#endif /* INET6 */ 24955679Sshin if (max_protohdr < TCP_MINPROTOHDR) 25055679Sshin max_protohdr = TCP_MINPROTOHDR; 25155679Sshin if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) 2521541Srgrimes panic("tcp_init"); 25355679Sshin#undef TCP_MINPROTOHDR 254111145Sjlemon /* 255111145Sjlemon * These have to be type stable for the benefit of the timers. 256111145Sjlemon */ 257111145Sjlemon tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), 258111145Sjlemon NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 259111145Sjlemon uma_zone_set_max(tcpcb_zone, maxsockets); 260112009Sjlemon tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), 261111145Sjlemon NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 262121453Ssilby uma_zone_set_max(tcptw_zone, maxsockets / 5); 263112009Sjlemon tcp_timer_init(); 26486764Sjlemon syncache_init(); 2651541Srgrimes} 2661541Srgrimes 2671541Srgrimes/* 26878642Ssilby * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. 26978642Ssilby * tcp_template used to store this data in mbufs, but we now recopy it out 27078642Ssilby * of the tcpcb each time to conserve mbufs. 2711541Srgrimes */ 27278642Ssilbyvoid 273111144Sjlemontcpip_fillheaders(inp, ip_ptr, tcp_ptr) 274111144Sjlemon struct inpcb *inp; 27578642Ssilby void *ip_ptr; 27678642Ssilby void *tcp_ptr; 2771541Srgrimes{ 278111144Sjlemon struct tcphdr *th = (struct tcphdr *)tcp_ptr; 2791541Srgrimes 28055679Sshin#ifdef INET6 28155679Sshin if ((inp->inp_vflag & INP_IPV6) != 0) { 28278642Ssilby struct ip6_hdr *ip6; 28355679Sshin 28478642Ssilby ip6 = (struct ip6_hdr *)ip_ptr; 28555679Sshin ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | 28655679Sshin (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); 28755679Sshin ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | 28855679Sshin (IPV6_VERSION & IPV6_VERSION_MASK); 28955679Sshin ip6->ip6_nxt = IPPROTO_TCP; 29055679Sshin ip6->ip6_plen = sizeof(struct tcphdr); 29155679Sshin ip6->ip6_src = inp->in6p_laddr; 29255679Sshin ip6->ip6_dst = inp->in6p_faddr; 29355679Sshin } else 29455679Sshin#endif 29578642Ssilby { 296111144Sjlemon struct ip *ip; 29755679Sshin 298111144Sjlemon ip = (struct ip *)ip_ptr; 299111144Sjlemon ip->ip_v = IPVERSION; 300111144Sjlemon ip->ip_hl = 5; 301111144Sjlemon ip->ip_tos = inp->inp_ip_tos; 302111144Sjlemon ip->ip_len = 0; 303111144Sjlemon ip->ip_id = 0; 304111144Sjlemon ip->ip_off = 0; 305111144Sjlemon ip->ip_ttl = inp->inp_ip_ttl; 306111144Sjlemon ip->ip_sum = 0; 307111144Sjlemon ip->ip_p = IPPROTO_TCP; 308111144Sjlemon ip->ip_src = inp->inp_laddr; 309111144Sjlemon ip->ip_dst = inp->inp_faddr; 31078642Ssilby } 311111144Sjlemon th->th_sport = inp->inp_lport; 312111144Sjlemon th->th_dport = inp->inp_fport; 313111144Sjlemon th->th_seq = 0; 314111144Sjlemon th->th_ack = 0; 315111144Sjlemon th->th_x2 = 0; 316111144Sjlemon th->th_off = 5; 317111144Sjlemon th->th_flags = 0; 318111144Sjlemon th->th_win = 0; 319111144Sjlemon th->th_urp = 0; 320111144Sjlemon th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ 32178642Ssilby} 32278642Ssilby 32378642Ssilby/* 32478642Ssilby * Create template to be used to send tcp packets on a connection. 32578642Ssilby * Allocates an mbuf and fills in a skeletal tcp/ip header. The only 32678642Ssilby * use for this function is in keepalives, which use tcp_respond. 32778642Ssilby */ 32878642Ssilbystruct tcptemp * 329111144Sjlemontcpip_maketemplate(inp) 330111144Sjlemon struct inpcb *inp; 33178642Ssilby{ 33278642Ssilby struct mbuf *m; 33378642Ssilby struct tcptemp *n; 33478642Ssilby 335111119Simp m = m_get(M_DONTWAIT, MT_HEADER); 33678642Ssilby if (m == NULL) 33778642Ssilby return (0); 33878642Ssilby m->m_len = sizeof(struct tcptemp); 33978642Ssilby n = mtod(m, struct tcptemp *); 34078642Ssilby 341111144Sjlemon tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t); 3421541Srgrimes return (n); 3431541Srgrimes} 3441541Srgrimes 3451541Srgrimes/* 3461541Srgrimes * Send a single message to the TCP at address specified by 3471541Srgrimes * the given TCP/IP header. If m == 0, then we make a copy 3481541Srgrimes * of the tcpiphdr at ti and send directly to the addressed host. 3491541Srgrimes * This is used to force keep alive messages out using the TCP 35078642Ssilby * template for a connection. If flags are given then we send 35178642Ssilby * a message back to the TCP which originated the * segment ti, 35278642Ssilby * and discard the mbuf containing it and any other attached mbufs. 3531541Srgrimes * 3541541Srgrimes * In any case the ack and sequence number of the transmitted 3551541Srgrimes * segment are as specified by the parameters. 35631848Sjulian * 35731848Sjulian * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 3581541Srgrimes */ 3591541Srgrimesvoid 36055679Sshintcp_respond(tp, ipgen, th, m, ack, seq, flags) 3611541Srgrimes struct tcpcb *tp; 36255679Sshin void *ipgen; 36355679Sshin register struct tcphdr *th; 3641541Srgrimes register struct mbuf *m; 3651541Srgrimes tcp_seq ack, seq; 3661541Srgrimes int flags; 3671541Srgrimes{ 3681541Srgrimes register int tlen; 3691541Srgrimes int win = 0; 3701541Srgrimes struct route *ro = 0; 37114754Swollman struct route sro; 37255679Sshin struct ip *ip; 37355679Sshin struct tcphdr *nth; 37455679Sshin#ifdef INET6 37555679Sshin struct route_in6 *ro6 = 0; 37655679Sshin struct route_in6 sro6; 37755679Sshin struct ip6_hdr *ip6; 37855679Sshin int isipv6; 37955679Sshin#endif /* INET6 */ 38055679Sshin int ipflags = 0; 3811541Srgrimes 382101137Srwatson KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); 383101137Srwatson 38455679Sshin#ifdef INET6 385105586Sphk isipv6 = ((struct ip *)ipgen)->ip_v == 6; 38655679Sshin ip6 = ipgen; 38755679Sshin#endif /* INET6 */ 38855679Sshin ip = ipgen; 38955679Sshin 3901541Srgrimes if (tp) { 39157576Sps if (!(flags & TH_RST)) { 39241187Sguido win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); 39357576Sps if (win > (long)TCP_MAXWIN << tp->rcv_scale) 39457576Sps win = (long)TCP_MAXWIN << tp->rcv_scale; 39557576Sps } 39655679Sshin#ifdef INET6 39755679Sshin if (isipv6) 39855679Sshin ro6 = &tp->t_inpcb->in6p_route; 39955679Sshin else 40055679Sshin#endif /* INET6 */ 4011541Srgrimes ro = &tp->t_inpcb->inp_route; 40214754Swollman } else { 40355679Sshin#ifdef INET6 40455679Sshin if (isipv6) { 40555679Sshin ro6 = &sro6; 40655679Sshin bzero(ro6, sizeof *ro6); 40755679Sshin } else 40855679Sshin#endif /* INET6 */ 40955679Sshin { 41014754Swollman ro = &sro; 41114754Swollman bzero(ro, sizeof *ro); 41255679Sshin } 4131541Srgrimes } 4141541Srgrimes if (m == 0) { 415111119Simp m = m_gethdr(M_DONTWAIT, MT_HEADER); 4161541Srgrimes if (m == NULL) 4171541Srgrimes return; 4181541Srgrimes tlen = 0; 4191541Srgrimes m->m_data += max_linkhdr; 42055679Sshin#ifdef INET6 42155679Sshin if (isipv6) { 42255679Sshin bcopy((caddr_t)ip6, mtod(m, caddr_t), 42355679Sshin sizeof(struct ip6_hdr)); 42455679Sshin ip6 = mtod(m, struct ip6_hdr *); 42555679Sshin nth = (struct tcphdr *)(ip6 + 1); 42655679Sshin } else 42755679Sshin#endif /* INET6 */ 42855679Sshin { 42955679Sshin bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); 43055679Sshin ip = mtod(m, struct ip *); 43155679Sshin nth = (struct tcphdr *)(ip + 1); 43255679Sshin } 43355679Sshin bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); 4341541Srgrimes flags = TH_ACK; 4351541Srgrimes } else { 4361541Srgrimes m_freem(m->m_next); 4371541Srgrimes m->m_next = 0; 43855679Sshin m->m_data = (caddr_t)ipgen; 43955679Sshin /* m_len is set later */ 4401541Srgrimes tlen = 0; 4411541Srgrimes#define xchg(a,b,type) { type t; t=a; a=b; b=t; } 44255679Sshin#ifdef INET6 44355679Sshin if (isipv6) { 44455679Sshin xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); 44555679Sshin nth = (struct tcphdr *)(ip6 + 1); 44655679Sshin } else 44755679Sshin#endif /* INET6 */ 44855679Sshin { 44955679Sshin xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); 45055679Sshin nth = (struct tcphdr *)(ip + 1); 45155679Sshin } 45255679Sshin if (th != nth) { 45355679Sshin /* 45455679Sshin * this is usually a case when an extension header 45555679Sshin * exists between the IPv6 header and the 45655679Sshin * TCP header. 45755679Sshin */ 45855679Sshin nth->th_sport = th->th_sport; 45955679Sshin nth->th_dport = th->th_dport; 46055679Sshin } 46155679Sshin xchg(nth->th_dport, nth->th_sport, n_short); 4621541Srgrimes#undef xchg 4631541Srgrimes } 46455679Sshin#ifdef INET6 46555679Sshin if (isipv6) { 46690198Sume ip6->ip6_flow = 0; 46790198Sume ip6->ip6_vfc = IPV6_VERSION; 46890198Sume ip6->ip6_nxt = IPPROTO_TCP; 46955679Sshin ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + 47055679Sshin tlen)); 47155679Sshin tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); 47256039Sshin } else 47355679Sshin#endif 47455679Sshin { 4751541Srgrimes tlen += sizeof (struct tcpiphdr); 47658698Sjlemon ip->ip_len = tlen; 47758698Sjlemon ip->ip_ttl = ip_defttl; 47855679Sshin } 4791541Srgrimes m->m_len = tlen; 4801541Srgrimes m->m_pkthdr.len = tlen; 4811541Srgrimes m->m_pkthdr.rcvif = (struct ifnet *) 0; 482101106Srwatson#ifdef MAC 483111483Srwatson if (tp != NULL && tp->t_inpcb != NULL) { 484101106Srwatson /* 485101106Srwatson * Packet is associated with a socket, so allow the 486101106Srwatson * label of the response to reflect the socket label. 487101106Srwatson */ 488101106Srwatson mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m); 489101106Srwatson } else { 490101106Srwatson /* 491119245Srwatson * Packet is not associated with a socket, so possibly 492119245Srwatson * update the label in place. 493101106Srwatson */ 494119245Srwatson mac_reflect_mbuf_tcp(m); 495101106Srwatson } 496101106Srwatson#endif 49755679Sshin nth->th_seq = htonl(seq); 49855679Sshin nth->th_ack = htonl(ack); 49955679Sshin nth->th_x2 = 0; 50055679Sshin nth->th_off = sizeof (struct tcphdr) >> 2; 50155679Sshin nth->th_flags = flags; 5021541Srgrimes if (tp) 50355679Sshin nth->th_win = htons((u_short) (win >> tp->rcv_scale)); 5041541Srgrimes else 50555679Sshin nth->th_win = htons((u_short)win); 50655679Sshin nth->th_urp = 0; 50755679Sshin#ifdef INET6 50855679Sshin if (isipv6) { 50959392Sshin nth->th_sum = 0; 51055679Sshin nth->th_sum = in6_cksum(m, IPPROTO_TCP, 51155679Sshin sizeof(struct ip6_hdr), 51255679Sshin tlen - sizeof(struct ip6_hdr)); 51355679Sshin ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, 51455679Sshin ro6 && ro6->ro_rt ? 51555679Sshin ro6->ro_rt->rt_ifp : 51655679Sshin NULL); 51755679Sshin } else 51855679Sshin#endif /* INET6 */ 51955679Sshin { 52058698Sjlemon nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 52158698Sjlemon htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); 52258698Sjlemon m->m_pkthdr.csum_flags = CSUM_TCP; 52358698Sjlemon m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 52455679Sshin } 5256283Swollman#ifdef TCPDEBUG 52697658Stanimura if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 52755679Sshin tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); 5286283Swollman#endif 52955679Sshin#ifdef INET6 53055679Sshin if (isipv6) { 531105194Ssam (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL, 532105194Ssam tp ? tp->t_inpcb : NULL); 53355913Sshin if (ro6 == &sro6 && ro6->ro_rt) { 53455679Sshin RTFREE(ro6->ro_rt); 53555913Sshin ro6->ro_rt = NULL; 53655913Sshin } 53755679Sshin } else 53855679Sshin#endif /* INET6 */ 53955679Sshin { 540105194Ssam (void) ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL); 54114841Swollman if (ro == &sro && ro->ro_rt) { 54214754Swollman RTFREE(ro->ro_rt); 54355913Sshin ro->ro_rt = NULL; 54414754Swollman } 54555679Sshin } 5461541Srgrimes} 5471541Srgrimes 5481541Srgrimes/* 5491541Srgrimes * Create a new TCP control block, making an 5501541Srgrimes * empty reassembly queue and hooking it to the argument 55134881Swollman * protocol control block. The `inp' parameter must have 55234881Swollman * come from the zone allocator set up in tcp_init(). 5531541Srgrimes */ 5541541Srgrimesstruct tcpcb * 5551541Srgrimestcp_newtcpcb(inp) 5561541Srgrimes struct inpcb *inp; 5571541Srgrimes{ 558111145Sjlemon struct tcpcb_mem *tm; 559111145Sjlemon struct tcpcb *tp; 56055679Sshin#ifdef INET6 56155679Sshin int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 56255679Sshin#endif /* INET6 */ 5631541Srgrimes 564111145Sjlemon tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); 565111145Sjlemon if (tm == NULL) 566111145Sjlemon return (NULL); 567111145Sjlemon tp = &tm->tcb; 568111145Sjlemon /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ 56955679Sshin tp->t_maxseg = tp->t_maxopd = 57055679Sshin#ifdef INET6 57155679Sshin isipv6 ? tcp_v6mssdflt : 57255679Sshin#endif /* INET6 */ 57355679Sshin tcp_mssdflt; 5741541Srgrimes 57550673Sjlemon /* Set up our timeouts. */ 576111145Sjlemon callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0); 577111145Sjlemon callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0); 578111145Sjlemon callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0); 579111145Sjlemon callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0); 580111145Sjlemon callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0); 58150673Sjlemon 5826283Swollman if (tcp_do_rfc1323) 5836283Swollman tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); 5846283Swollman if (tcp_do_rfc1644) 5856283Swollman tp->t_flags |= TF_REQ_CC; 58634881Swollman tp->t_inpcb = inp; /* XXX */ 5871541Srgrimes /* 5881541Srgrimes * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 58916367Swollman * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives 5901541Srgrimes * reasonable initial retransmit time. 5911541Srgrimes */ 5921541Srgrimes tp->t_srtt = TCPTV_SRTTBASE; 59316367Swollman tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; 594100335Sdillon tp->t_rttmin = tcp_rexmit_min; 59516367Swollman tp->t_rxtcur = TCPTV_RTOBASE; 5961541Srgrimes tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 597102017Sdillon tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 5981541Srgrimes tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 59950673Sjlemon tp->t_rcvtime = ticks; 600102017Sdillon tp->t_bw_rtttime = ticks; 60156564Sshin /* 60256564Sshin * IPv4 TTL initialization is necessary for an IPv6 socket as well, 60356564Sshin * because the socket may be bound to an IPv6 wildcard address, 60456564Sshin * which may match an IPv4-mapped IPv6 address. 60556564Sshin */ 60624570Sdg inp->inp_ip_ttl = ip_defttl; 6071541Srgrimes inp->inp_ppcb = (caddr_t)tp; 60834881Swollman return (tp); /* XXX */ 6091541Srgrimes} 6101541Srgrimes 6111541Srgrimes/* 6121541Srgrimes * Drop a TCP connection, reporting 6131541Srgrimes * the specified error. If connection is synchronized, 6141541Srgrimes * then send a RST to peer. 6151541Srgrimes */ 6161541Srgrimesstruct tcpcb * 6171541Srgrimestcp_drop(tp, errno) 6181541Srgrimes register struct tcpcb *tp; 6191541Srgrimes int errno; 6201541Srgrimes{ 6211541Srgrimes struct socket *so = tp->t_inpcb->inp_socket; 6221541Srgrimes 6231541Srgrimes if (TCPS_HAVERCVDSYN(tp->t_state)) { 6241541Srgrimes tp->t_state = TCPS_CLOSED; 6251541Srgrimes (void) tcp_output(tp); 6261541Srgrimes tcpstat.tcps_drops++; 6271541Srgrimes } else 6281541Srgrimes tcpstat.tcps_conndrops++; 6291541Srgrimes if (errno == ETIMEDOUT && tp->t_softerror) 6301541Srgrimes errno = tp->t_softerror; 6311541Srgrimes so->so_error = errno; 6321541Srgrimes return (tcp_close(tp)); 6331541Srgrimes} 6341541Srgrimes 635111145Sjlemonstatic void 636111145Sjlemontcp_discardcb(tp) 637111145Sjlemon struct tcpcb *tp; 6381541Srgrimes{ 639111145Sjlemon struct tseg_qent *q; 6401541Srgrimes struct inpcb *inp = tp->t_inpcb; 6411541Srgrimes struct socket *so = inp->inp_socket; 64255679Sshin#ifdef INET6 64355679Sshin int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 64455679Sshin#endif /* INET6 */ 645111145Sjlemon struct rtentry *rt; 64622719Swollman int dosavessthresh; 6471541Srgrimes 6481541Srgrimes /* 64950673Sjlemon * Make sure that all of our timers are stopped before we 65050673Sjlemon * delete the PCB. 65150673Sjlemon */ 65250673Sjlemon callout_stop(tp->tt_rexmt); 65350673Sjlemon callout_stop(tp->tt_persist); 65450673Sjlemon callout_stop(tp->tt_keep); 65550673Sjlemon callout_stop(tp->tt_2msl); 65650673Sjlemon callout_stop(tp->tt_delack); 65750673Sjlemon 65850673Sjlemon /* 6599373Swollman * If we got enough samples through the srtt filter, 6609373Swollman * save the rtt and rttvar in the routing entry. 6619373Swollman * 'Enough' is arbitrarily defined as the 16 samples. 6629373Swollman * 16 samples is enough for the srtt filter to converge 6639373Swollman * to within 5% of the correct value; fewer samples and 6649373Swollman * we could save a very bogus rtt. 6651541Srgrimes * 6661541Srgrimes * Don't update the default route's characteristics and don't 6671541Srgrimes * update anything that the user "locked". 6681541Srgrimes */ 66955679Sshin if (tp->t_rttupdated >= 16) { 6701549Srgrimes register u_long i = 0; 67155679Sshin#ifdef INET6 67255679Sshin if (isipv6) { 67355679Sshin struct sockaddr_in6 *sin6; 6741541Srgrimes 67555679Sshin if ((rt = inp->in6p_route.ro_rt) == NULL) 67655679Sshin goto no_valid_rt; 67755679Sshin sin6 = (struct sockaddr_in6 *)rt_key(rt); 67855679Sshin if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 67955679Sshin goto no_valid_rt; 68055679Sshin } 68155679Sshin else 68255679Sshin#endif /* INET6 */ 68355679Sshin if ((rt = inp->inp_route.ro_rt) == NULL || 68455679Sshin ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr 68555679Sshin == INADDR_ANY) 68655679Sshin goto no_valid_rt; 68755679Sshin 6881541Srgrimes if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { 6891541Srgrimes i = tp->t_srtt * 69050673Sjlemon (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); 6911541Srgrimes if (rt->rt_rmx.rmx_rtt && i) 6921541Srgrimes /* 6931541Srgrimes * filter this update to half the old & half 6941541Srgrimes * the new values, converting scale. 6951541Srgrimes * See route.h and tcp_var.h for a 6961541Srgrimes * description of the scaling constants. 6971541Srgrimes */ 6981541Srgrimes rt->rt_rmx.rmx_rtt = 6991541Srgrimes (rt->rt_rmx.rmx_rtt + i) / 2; 7001541Srgrimes else 7011541Srgrimes rt->rt_rmx.rmx_rtt = i; 7029263Swollman tcpstat.tcps_cachedrtt++; 7031541Srgrimes } 7041541Srgrimes if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { 7051541Srgrimes i = tp->t_rttvar * 70650673Sjlemon (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); 7071541Srgrimes if (rt->rt_rmx.rmx_rttvar && i) 7081541Srgrimes rt->rt_rmx.rmx_rttvar = 7091541Srgrimes (rt->rt_rmx.rmx_rttvar + i) / 2; 7101541Srgrimes else 7111541Srgrimes rt->rt_rmx.rmx_rttvar = i; 7129263Swollman tcpstat.tcps_cachedrttvar++; 7131541Srgrimes } 7141541Srgrimes /* 71522719Swollman * The old comment here said: 7161541Srgrimes * update the pipelimit (ssthresh) if it has been updated 7171541Srgrimes * already or if a pipesize was specified & the threshhold 7181541Srgrimes * got below half the pipesize. I.e., wait for bad news 7191541Srgrimes * before we start updating, then update on both good 7201541Srgrimes * and bad news. 72122719Swollman * 72222719Swollman * But we want to save the ssthresh even if no pipesize is 72322719Swollman * specified explicitly in the route, because such 72422719Swollman * connections still have an implicit pipesize specified 72522719Swollman * by the global tcp_sendspace. In the absence of a reliable 72622719Swollman * way to calculate the pipesize, it will have to do. 7271541Srgrimes */ 72822719Swollman i = tp->snd_ssthresh; 72922719Swollman if (rt->rt_rmx.rmx_sendpipe != 0) 73022719Swollman dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); 73122719Swollman else 73222719Swollman dosavessthresh = (i < so->so_snd.sb_hiwat / 2); 7333444Sphk if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && 73422719Swollman i != 0 && rt->rt_rmx.rmx_ssthresh != 0) 73522719Swollman || dosavessthresh) { 7361541Srgrimes /* 7371541Srgrimes * convert the limit from user data bytes to 7381541Srgrimes * packets then to packet data bytes. 7391541Srgrimes */ 7401541Srgrimes i = (i + tp->t_maxseg / 2) / tp->t_maxseg; 7411541Srgrimes if (i < 2) 7421541Srgrimes i = 2; 74355679Sshin i *= (u_long)(tp->t_maxseg + 74455679Sshin#ifdef INET6 74555679Sshin (isipv6 ? sizeof (struct ip6_hdr) + 74655679Sshin sizeof (struct tcphdr) : 74755679Sshin#endif 74855679Sshin sizeof (struct tcpiphdr) 74955679Sshin#ifdef INET6 75055679Sshin ) 75155679Sshin#endif 75255679Sshin ); 7531541Srgrimes if (rt->rt_rmx.rmx_ssthresh) 7541541Srgrimes rt->rt_rmx.rmx_ssthresh = 7551541Srgrimes (rt->rt_rmx.rmx_ssthresh + i) / 2; 7561541Srgrimes else 7571541Srgrimes rt->rt_rmx.rmx_ssthresh = i; 7589263Swollman tcpstat.tcps_cachedssthresh++; 7591541Srgrimes } 7601541Srgrimes } 76155679Sshin no_valid_rt: 7621541Srgrimes /* free the reassembly queue, if any */ 763111145Sjlemon while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { 76455679Sshin LIST_REMOVE(q, tqe_q); 76555679Sshin m_freem(q->tqe_m); 76655679Sshin FREE(q, M_TSEGQ); 7671541Srgrimes } 76832821Sdg inp->inp_ppcb = NULL; 769108265Shsu tp->t_inpcb = NULL; 770111145Sjlemon uma_zfree(tcpcb_zone, tp); 7711541Srgrimes soisdisconnected(so); 772111145Sjlemon} 773111145Sjlemon 774111145Sjlemon/* 775111145Sjlemon * Close a TCP control block: 776111145Sjlemon * discard all space held by the tcp 777111145Sjlemon * discard internet protocol block 778111145Sjlemon * wake up any sleepers 779111145Sjlemon */ 780111145Sjlemonstruct tcpcb * 781111145Sjlemontcp_close(tp) 782111145Sjlemon struct tcpcb *tp; 783111145Sjlemon{ 784111145Sjlemon struct inpcb *inp = tp->t_inpcb; 785111153Sjlemon#ifdef INET6 786111145Sjlemon struct socket *so = inp->inp_socket; 787111153Sjlemon#endif 788111145Sjlemon 789111145Sjlemon tcp_discardcb(tp); 79055679Sshin#ifdef INET6 79155679Sshin if (INP_CHECK_SOCKAF(so, AF_INET6)) 79255679Sshin in6_pcbdetach(inp); 79355679Sshin else 794111145Sjlemon#endif 795111145Sjlemon in_pcbdetach(inp); 7961541Srgrimes tcpstat.tcps_closed++; 7971541Srgrimes return ((struct tcpcb *)0); 7981541Srgrimes} 7991541Srgrimes 8001541Srgrimesvoid 8011541Srgrimestcp_drain() 8021541Srgrimes{ 80355198Smsmith if (do_tcpdrain) 80455198Smsmith { 80555198Smsmith struct inpcb *inpb; 80655198Smsmith struct tcpcb *tcpb; 80755679Sshin struct tseg_qent *te; 8081541Srgrimes 80955198Smsmith /* 81055198Smsmith * Walk the tcpbs, if existing, and flush the reassembly queue, 81155198Smsmith * if there is one... 81255198Smsmith * XXX: The "Net/3" implementation doesn't imply that the TCP 81355198Smsmith * reassembly queue should be flushed, but in a situation 81455198Smsmith * where we're really low on mbufs, this is potentially 81555198Smsmith * usefull. 81655198Smsmith */ 81798102Shsu INP_INFO_RLOCK(&tcbinfo); 81874362Sphk LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { 819111145Sjlemon if (inpb->inp_vflag & INP_TIMEWAIT) 820111145Sjlemon continue; 82198102Shsu INP_LOCK(inpb); 82274362Sphk if ((tcpb = intotcpcb(inpb))) { 82374362Sphk while ((te = LIST_FIRST(&tcpb->t_segq)) 82474362Sphk != NULL) { 82555679Sshin LIST_REMOVE(te, tqe_q); 82655679Sshin m_freem(te->tqe_m); 82755679Sshin FREE(te, M_TSEGQ); 82855198Smsmith } 82955198Smsmith } 83098102Shsu INP_UNLOCK(inpb); 83155198Smsmith } 83298102Shsu INP_INFO_RUNLOCK(&tcbinfo); 83355198Smsmith } 8341541Srgrimes} 8351541Srgrimes 8361541Srgrimes/* 8371541Srgrimes * Notify a tcp user of an asynchronous error; 8381541Srgrimes * store error as soft error, but wake up user 8391541Srgrimes * (for now, won't do anything until can select for soft error). 84072960Sjlemon * 84172960Sjlemon * Do not wake up user since there currently is no mechanism for 84272960Sjlemon * reporting soft errors (yet - a kqueue filter may be added). 8431541Srgrimes */ 84498211Shsustatic struct inpcb * 8451541Srgrimestcp_notify(inp, error) 8461541Srgrimes struct inpcb *inp; 8471541Srgrimes int error; 8481541Srgrimes{ 84972960Sjlemon struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; 8501541Srgrimes 8511541Srgrimes /* 8521541Srgrimes * Ignore some errors if we are hooked up. 8531541Srgrimes * If connection hasn't completed, has retransmitted several times, 8541541Srgrimes * and receives a second error, give up now. This is better 8551541Srgrimes * than waiting a long time to establish a connection that 8561541Srgrimes * can never complete. 8571541Srgrimes */ 8581541Srgrimes if (tp->t_state == TCPS_ESTABLISHED && 859110896Shsu (error == EHOSTUNREACH || error == ENETUNREACH || 860110896Shsu error == EHOSTDOWN)) { 86198211Shsu return inp; 8621541Srgrimes } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && 86398211Shsu tp->t_softerror) { 86472960Sjlemon tcp_drop(tp, error); 86598211Shsu return (struct inpcb *)0; 86698211Shsu } else { 8671541Srgrimes tp->t_softerror = error; 86898211Shsu return inp; 86998211Shsu } 87072960Sjlemon#if 0 871111748Sdes wakeup( &so->so_timeo); 8721541Srgrimes sorwakeup(so); 8731541Srgrimes sowwakeup(so); 87472960Sjlemon#endif 8751541Srgrimes} 8761541Srgrimes 87736079Swollmanstatic int 87862573Sphktcp_pcblist(SYSCTL_HANDLER_ARGS) 87936079Swollman{ 88036079Swollman int error, i, n, s; 88136079Swollman struct inpcb *inp, **inp_list; 88236079Swollman inp_gen_t gencnt; 88336079Swollman struct xinpgen xig; 88436079Swollman 88536079Swollman /* 88636079Swollman * The process of preparing the TCB list is too time-consuming and 88736079Swollman * resource-intensive to repeat twice on every request. 88836079Swollman */ 88936079Swollman if (req->oldptr == 0) { 89036079Swollman n = tcbinfo.ipi_count; 89136079Swollman req->oldidx = 2 * (sizeof xig) 89236079Swollman + (n + n/8) * sizeof(struct xtcpcb); 89336079Swollman return 0; 89436079Swollman } 89536079Swollman 89636079Swollman if (req->newptr != 0) 89736079Swollman return EPERM; 89836079Swollman 89936079Swollman /* 90036079Swollman * OK, now we're committed to doing something. 90136079Swollman */ 90236079Swollman s = splnet(); 90398102Shsu INP_INFO_RLOCK(&tcbinfo); 90436079Swollman gencnt = tcbinfo.ipi_gencnt; 90536079Swollman n = tcbinfo.ipi_count; 90698102Shsu INP_INFO_RUNLOCK(&tcbinfo); 90736079Swollman splx(s); 90836079Swollman 909100831Struckman sysctl_wire_old_buffer(req, 2 * (sizeof xig) 910100831Struckman + n * sizeof(struct xtcpcb)); 911100831Struckman 91236079Swollman xig.xig_len = sizeof xig; 91336079Swollman xig.xig_count = n; 91436079Swollman xig.xig_gen = gencnt; 91536079Swollman xig.xig_sogen = so_gencnt; 91636079Swollman error = SYSCTL_OUT(req, &xig, sizeof xig); 91736079Swollman if (error) 91836079Swollman return error; 91936079Swollman 920111119Simp inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 92136079Swollman if (inp_list == 0) 92236079Swollman return ENOMEM; 92336079Swollman 92436079Swollman s = splnet(); 92598102Shsu INP_INFO_RLOCK(&tcbinfo); 92671999Sphk for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; 92771999Sphk inp = LIST_NEXT(inp, inp_list)) { 92898102Shsu INP_LOCK(inp); 929113345Srwatson if (inp->inp_gencnt <= gencnt) { 930113345Srwatson /* 931113345Srwatson * XXX: This use of cr_cansee(), introduced with 932113345Srwatson * TCP state changes, is not quite right, but for 933113345Srwatson * now, better than nothing. 934113345Srwatson */ 935113345Srwatson if (inp->inp_vflag & INP_TIMEWAIT) 936113345Srwatson error = cr_cansee(req->td->td_ucred, 937113345Srwatson intotw(inp)->tw_cred); 938113345Srwatson else 939113345Srwatson error = cr_canseesocket(req->td->td_ucred, 940113345Srwatson inp->inp_socket); 941113345Srwatson if (error == 0) 942113345Srwatson inp_list[i++] = inp; 943113345Srwatson } 94498102Shsu INP_UNLOCK(inp); 94536079Swollman } 94698102Shsu INP_INFO_RUNLOCK(&tcbinfo); 94736079Swollman splx(s); 94836079Swollman n = i; 94936079Swollman 95036079Swollman error = 0; 95136079Swollman for (i = 0; i < n; i++) { 95236079Swollman inp = inp_list[i]; 95336079Swollman if (inp->inp_gencnt <= gencnt) { 95436079Swollman struct xtcpcb xt; 95547960Stegge caddr_t inp_ppcb; 95636079Swollman xt.xt_len = sizeof xt; 95736079Swollman /* XXX should avoid extra copy */ 95836079Swollman bcopy(inp, &xt.xt_inp, sizeof *inp); 95947960Stegge inp_ppcb = inp->inp_ppcb; 960111145Sjlemon if (inp_ppcb == NULL) 961111145Sjlemon bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 962111145Sjlemon else if (inp->inp_vflag & INP_TIMEWAIT) { 963111145Sjlemon bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 964111145Sjlemon xt.xt_tp.t_state = TCPS_TIME_WAIT; 965111145Sjlemon } else 96647960Stegge bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); 96736079Swollman if (inp->inp_socket) 96836079Swollman sotoxsocket(inp->inp_socket, &xt.xt_socket); 969111145Sjlemon else { 970111145Sjlemon bzero(&xt.xt_socket, sizeof xt.xt_socket); 971111145Sjlemon xt.xt_socket.xso_protocol = IPPROTO_TCP; 972111145Sjlemon } 973110896Shsu xt.xt_inp.inp_gencnt = inp->inp_gencnt; 97436079Swollman error = SYSCTL_OUT(req, &xt, sizeof xt); 97536079Swollman } 97636079Swollman } 97736079Swollman if (!error) { 97836079Swollman /* 97936079Swollman * Give the user an updated idea of our state. 98036079Swollman * If the generation differs from what we told 98136079Swollman * her before, she knows that something happened 98236079Swollman * while we were processing this request, and it 98336079Swollman * might be necessary to retry. 98436079Swollman */ 98536079Swollman s = splnet(); 98698102Shsu INP_INFO_RLOCK(&tcbinfo); 98736079Swollman xig.xig_gen = tcbinfo.ipi_gencnt; 98836079Swollman xig.xig_sogen = so_gencnt; 98936079Swollman xig.xig_count = tcbinfo.ipi_count; 99098102Shsu INP_INFO_RUNLOCK(&tcbinfo); 99136079Swollman splx(s); 99236079Swollman error = SYSCTL_OUT(req, &xig, sizeof xig); 99336079Swollman } 99436079Swollman free(inp_list, M_TEMP); 99536079Swollman return error; 99636079Swollman} 99736079Swollman 99836079SwollmanSYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, 99936079Swollman tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); 100036079Swollman 100148758Sgreenstatic int 100262573Sphktcp_getcred(SYSCTL_HANDLER_ARGS) 100348758Sgreen{ 100472650Sgreen struct xucred xuc; 100548758Sgreen struct sockaddr_in addrs[2]; 100648758Sgreen struct inpcb *inp; 100748758Sgreen int error, s; 100848758Sgreen 100993593Sjhb error = suser_cred(req->td->td_ucred, PRISON_ROOT); 101048758Sgreen if (error) 101148758Sgreen return (error); 101248758Sgreen error = SYSCTL_IN(req, addrs, sizeof(addrs)); 101348758Sgreen if (error) 101448758Sgreen return (error); 101548758Sgreen s = splnet(); 101698102Shsu INP_INFO_RLOCK(&tcbinfo); 101748758Sgreen inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, 101854263Sshin addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); 101998102Shsu if (inp == NULL) { 102048758Sgreen error = ENOENT; 102198102Shsu goto outunlocked; 102248758Sgreen } 102399837Struckman INP_LOCK(inp); 102499837Struckman if (inp->inp_socket == NULL) { 102599837Struckman error = ENOENT; 102699837Struckman goto out; 102799837Struckman } 102892976Srwatson error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); 102978697Sdwmalone if (error) 103078697Sdwmalone goto out; 103191354Sdd cru2x(inp->inp_socket->so_cred, &xuc); 103248758Sgreenout: 103398102Shsu INP_UNLOCK(inp); 103498102Shsuoutunlocked: 103598102Shsu INP_INFO_RUNLOCK(&tcbinfo); 103648758Sgreen splx(s); 103799838Struckman if (error == 0) 103899838Struckman error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 103948758Sgreen return (error); 104048758Sgreen} 104148758Sgreen 104278697SdwmaloneSYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, 104378697Sdwmalone CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, 104478697Sdwmalone tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); 104548758Sgreen 104655679Sshin#ifdef INET6 104755679Sshinstatic int 104862573Sphktcp6_getcred(SYSCTL_HANDLER_ARGS) 104955679Sshin{ 105072650Sgreen struct xucred xuc; 105155679Sshin struct sockaddr_in6 addrs[2]; 105255679Sshin struct inpcb *inp; 105355679Sshin int error, s, mapped = 0; 105455679Sshin 105593593Sjhb error = suser_cred(req->td->td_ucred, PRISON_ROOT); 105655679Sshin if (error) 105755679Sshin return (error); 105855679Sshin error = SYSCTL_IN(req, addrs, sizeof(addrs)); 105955679Sshin if (error) 106055679Sshin return (error); 106155679Sshin if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { 106255679Sshin if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) 106355679Sshin mapped = 1; 106455679Sshin else 106555679Sshin return (EINVAL); 106655679Sshin } 106755679Sshin s = splnet(); 106898102Shsu INP_INFO_RLOCK(&tcbinfo); 106955679Sshin if (mapped == 1) 107055679Sshin inp = in_pcblookup_hash(&tcbinfo, 107155679Sshin *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], 107255679Sshin addrs[1].sin6_port, 107355679Sshin *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], 107455679Sshin addrs[0].sin6_port, 107555679Sshin 0, NULL); 107655679Sshin else 107755679Sshin inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, 107855679Sshin addrs[1].sin6_port, 107955679Sshin &addrs[0].sin6_addr, addrs[0].sin6_port, 108055679Sshin 0, NULL); 108198102Shsu if (inp == NULL) { 108255679Sshin error = ENOENT; 108398102Shsu goto outunlocked; 108455679Sshin } 108599837Struckman INP_LOCK(inp); 108699837Struckman if (inp->inp_socket == NULL) { 108799837Struckman error = ENOENT; 108899837Struckman goto out; 108999837Struckman } 109092976Srwatson error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); 109178697Sdwmalone if (error) 109278697Sdwmalone goto out; 109391354Sdd cru2x(inp->inp_socket->so_cred, &xuc); 109455679Sshinout: 109598102Shsu INP_UNLOCK(inp); 109698102Shsuoutunlocked: 109798102Shsu INP_INFO_RUNLOCK(&tcbinfo); 109855679Sshin splx(s); 109999838Struckman if (error == 0) 110099838Struckman error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 110155679Sshin return (error); 110255679Sshin} 110355679Sshin 110478697SdwmaloneSYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, 110578697Sdwmalone CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, 110678697Sdwmalone tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); 110755679Sshin#endif 110855679Sshin 110955679Sshin 11101541Srgrimesvoid 111112881Sbdetcp_ctlinput(cmd, sa, vip) 11121541Srgrimes int cmd; 11131541Srgrimes struct sockaddr *sa; 111412881Sbde void *vip; 11151541Srgrimes{ 111672959Sjlemon struct ip *ip = vip; 111772959Sjlemon struct tcphdr *th; 111873109Sjlemon struct in_addr faddr; 111973109Sjlemon struct inpcb *inp; 112073109Sjlemon struct tcpcb *tp; 112198211Shsu struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; 112273109Sjlemon tcp_seq icmp_seq; 112373109Sjlemon int s; 11241541Srgrimes 112573109Sjlemon faddr = ((struct sockaddr_in *)sa)->sin_addr; 112673109Sjlemon if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 112773109Sjlemon return; 112873109Sjlemon 11291541Srgrimes if (cmd == PRC_QUENCH) 11301541Srgrimes notify = tcp_quench; 113174937Sjesper else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || 113299156Sjesper cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) 113372959Sjlemon notify = tcp_drop_syn_sent; 113473109Sjlemon else if (cmd == PRC_MSGSIZE) 113510881Swollman notify = tcp_mtudisc; 113672922Sjesper else if (PRC_IS_REDIRECT(cmd)) { 113772922Sjesper ip = 0; 113872922Sjesper notify = in_rtchange; 113972922Sjesper } else if (cmd == PRC_HOSTDEAD) 114072922Sjesper ip = 0; 1141119995Sru else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) 11421541Srgrimes return; 11431541Srgrimes if (ip) { 114473109Sjlemon s = splnet(); 114517269Swollman th = (struct tcphdr *)((caddr_t)ip 1146105586Sphk + (ip->ip_hl << 2)); 114798596Shsu INP_INFO_WLOCK(&tcbinfo); 114873109Sjlemon inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, 114973109Sjlemon ip->ip_src, th->th_sport, 0, NULL); 115098102Shsu if (inp != NULL) { 115198102Shsu INP_LOCK(inp); 115298102Shsu if (inp->inp_socket != NULL) { 115398102Shsu icmp_seq = htonl(th->th_seq); 115498102Shsu tp = intotcpcb(inp); 115598102Shsu if (SEQ_GEQ(icmp_seq, tp->snd_una) && 115698102Shsu SEQ_LT(icmp_seq, tp->snd_max)) 115798211Shsu inp = (*notify)(inp, inetctlerrmap[cmd]); 115898102Shsu } 115998211Shsu if (inp) 116098211Shsu INP_UNLOCK(inp); 116186764Sjlemon } else { 116286764Sjlemon struct in_conninfo inc; 116386764Sjlemon 116486764Sjlemon inc.inc_fport = th->th_dport; 116586764Sjlemon inc.inc_lport = th->th_sport; 116686764Sjlemon inc.inc_faddr = faddr; 116786764Sjlemon inc.inc_laddr = ip->ip_src; 116886764Sjlemon#ifdef INET6 116986764Sjlemon inc.inc_isipv6 = 0; 117086764Sjlemon#endif 117186764Sjlemon syncache_unreach(&inc, th); 117273109Sjlemon } 117398596Shsu INP_INFO_WUNLOCK(&tcbinfo); 117473109Sjlemon splx(s); 11751541Srgrimes } else 117698102Shsu in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); 11771541Srgrimes} 11781541Srgrimes 117955679Sshin#ifdef INET6 118055679Sshinvoid 118155679Sshintcp6_ctlinput(cmd, sa, d) 118255679Sshin int cmd; 118355679Sshin struct sockaddr *sa; 118455679Sshin void *d; 118555679Sshin{ 118655679Sshin struct tcphdr th; 118798211Shsu struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; 118855679Sshin struct ip6_hdr *ip6; 118955679Sshin struct mbuf *m; 119078064Sume struct ip6ctlparam *ip6cp = NULL; 119178064Sume const struct sockaddr_in6 *sa6_src = NULL; 119255679Sshin int off; 119378064Sume struct tcp_portonly { 119478064Sume u_int16_t th_sport; 119578064Sume u_int16_t th_dport; 119678064Sume } *thp; 119755679Sshin 119855679Sshin if (sa->sa_family != AF_INET6 || 119955679Sshin sa->sa_len != sizeof(struct sockaddr_in6)) 120055679Sshin return; 120155679Sshin 120255679Sshin if (cmd == PRC_QUENCH) 120355679Sshin notify = tcp_quench; 120455679Sshin else if (cmd == PRC_MSGSIZE) 120555679Sshin notify = tcp_mtudisc; 120655679Sshin else if (!PRC_IS_REDIRECT(cmd) && 1207119995Sru ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) 120855679Sshin return; 120955679Sshin 121055679Sshin /* if the parameter is from icmp6, decode it. */ 121155679Sshin if (d != NULL) { 121278064Sume ip6cp = (struct ip6ctlparam *)d; 121355679Sshin m = ip6cp->ip6c_m; 121455679Sshin ip6 = ip6cp->ip6c_ip6; 121555679Sshin off = ip6cp->ip6c_off; 121678064Sume sa6_src = ip6cp->ip6c_src; 121755679Sshin } else { 121855679Sshin m = NULL; 121955679Sshin ip6 = NULL; 122067456Sitojun off = 0; /* fool gcc */ 122178064Sume sa6_src = &sa6_any; 122255679Sshin } 122355679Sshin 122455679Sshin if (ip6) { 122586764Sjlemon struct in_conninfo inc; 122655679Sshin /* 122755679Sshin * XXX: We assume that when IPV6 is non NULL, 122855679Sshin * M and OFF are valid. 122955679Sshin */ 123055679Sshin 123167456Sitojun /* check if we can safely examine src and dst ports */ 123278064Sume if (m->m_pkthdr.len < off + sizeof(*thp)) 123367456Sitojun return; 123467456Sitojun 123578064Sume bzero(&th, sizeof(th)); 123678064Sume m_copydata(m, off, sizeof(*thp), (caddr_t)&th); 123778064Sume 123878064Sume in6_pcbnotify(&tcb, sa, th.th_dport, 123978064Sume (struct sockaddr *)ip6cp->ip6c_src, 124078064Sume th.th_sport, cmd, notify); 124186764Sjlemon 124286764Sjlemon inc.inc_fport = th.th_dport; 124386764Sjlemon inc.inc_lport = th.th_sport; 124486764Sjlemon inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; 124586764Sjlemon inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; 124686764Sjlemon inc.inc_isipv6 = 1; 124786764Sjlemon syncache_unreach(&inc, &th); 124855679Sshin } else 124991357Salfred in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src, 125055679Sshin 0, cmd, notify); 125155679Sshin} 125255679Sshin#endif /* INET6 */ 125355679Sshin 125480428Speter 125582122Ssilby/* 125682122Ssilby * Following is where TCP initial sequence number generation occurs. 125782122Ssilby * 125882122Ssilby * There are two places where we must use initial sequence numbers: 125982122Ssilby * 1. In SYN-ACK packets. 126082122Ssilby * 2. In SYN packets. 126182122Ssilby * 126294390Ssilby * All ISNs for SYN-ACK packets are generated by the syncache. See 126394390Ssilby * tcp_syncache.c for details. 126482122Ssilby * 126582122Ssilby * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling 126682122Ssilby * depends on this property. In addition, these ISNs should be 126782122Ssilby * unguessable so as to prevent connection hijacking. To satisfy 126882122Ssilby * the requirements of this situation, the algorithm outlined in 126982122Ssilby * RFC 1948 is used to generate sequence numbers. 127082122Ssilby * 127182122Ssilby * Implementation details: 127282122Ssilby * 127382122Ssilby * Time is based off the system timer, and is corrected so that it 127482122Ssilby * increases by one megabyte per second. This allows for proper 127582122Ssilby * recycling on high speed LANs while still leaving over an hour 127682122Ssilby * before rollover. 127782122Ssilby * 127882122Ssilby * net.inet.tcp.isn_reseed_interval controls the number of seconds 127982122Ssilby * between seeding of isn_secret. This is normally set to zero, 128082122Ssilby * as reseeding should not be necessary. 128182122Ssilby * 128282122Ssilby */ 128379413Ssilby 128482122Ssilby#define ISN_BYTES_PER_SECOND 1048576 128579413Ssilby 128682122Ssilbyu_char isn_secret[32]; 128782122Ssilbyint isn_last_reseed; 128882122SsilbyMD5_CTX isn_ctx; 128975619Skris 129075619Skristcp_seq 129182122Ssilbytcp_new_isn(tp) 129282122Ssilby struct tcpcb *tp; 129375619Skris{ 129482122Ssilby u_int32_t md5_buffer[4]; 129582122Ssilby tcp_seq new_isn; 129675619Skris 129782122Ssilby /* Seed if this is the first use, reseed if requested. */ 129894390Ssilby if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && 129982122Ssilby (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) 130082122Ssilby < (u_int)ticks))) { 130182122Ssilby read_random(&isn_secret, sizeof(isn_secret)); 130282122Ssilby isn_last_reseed = ticks; 130382122Ssilby } 130482122Ssilby 130582122Ssilby /* Compute the md5 hash and return the ISN. */ 130682122Ssilby MD5Init(&isn_ctx); 130782122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); 130882122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); 130982122Ssilby#ifdef INET6 131082122Ssilby if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { 131182122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, 131282122Ssilby sizeof(struct in6_addr)); 131382122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, 131482122Ssilby sizeof(struct in6_addr)); 131582122Ssilby } else 131682122Ssilby#endif 131782122Ssilby { 131882122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, 131982122Ssilby sizeof(struct in_addr)); 132082122Ssilby MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, 132182122Ssilby sizeof(struct in_addr)); 132282122Ssilby } 132382122Ssilby MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); 132482122Ssilby MD5Final((u_char *) &md5_buffer, &isn_ctx); 132582122Ssilby new_isn = (tcp_seq) md5_buffer[0]; 132682122Ssilby new_isn += ticks * (ISN_BYTES_PER_SECOND / hz); 132782122Ssilby return new_isn; 132875619Skris} 132975619Skris 13301541Srgrimes/* 13311541Srgrimes * When a source quench is received, close congestion window 13321541Srgrimes * to one segment. We will gradually open it again as we proceed. 13331541Srgrimes */ 133498211Shsustruct inpcb * 13351541Srgrimestcp_quench(inp, errno) 13361541Srgrimes struct inpcb *inp; 13371541Srgrimes int errno; 13381541Srgrimes{ 13391541Srgrimes struct tcpcb *tp = intotcpcb(inp); 13401541Srgrimes 13411541Srgrimes if (tp) 13421541Srgrimes tp->snd_cwnd = tp->t_maxseg; 134398211Shsu return (inp); 13441541Srgrimes} 13456283Swollman 13466283Swollman/* 134772959Sjlemon * When a specific ICMP unreachable message is received and the 134872959Sjlemon * connection state is SYN-SENT, drop the connection. This behavior 134972959Sjlemon * is controlled by the icmp_may_rst sysctl. 135070103Sphk */ 135198211Shsustruct inpcb * 135270103Sphktcp_drop_syn_sent(inp, errno) 135370103Sphk struct inpcb *inp; 135470103Sphk int errno; 135570103Sphk{ 135670103Sphk struct tcpcb *tp = intotcpcb(inp); 135770103Sphk 135898211Shsu if (tp && tp->t_state == TCPS_SYN_SENT) { 135972638Sphk tcp_drop(tp, errno); 136098211Shsu return (struct inpcb *)0; 136198211Shsu } 136298211Shsu return inp; 136372638Sphk} 136472638Sphk 136572638Sphk/* 136610881Swollman * When `need fragmentation' ICMP is received, update our idea of the MSS 136710881Swollman * based on the new value in the route. Also nudge TCP to send something, 136810881Swollman * since we know the packet we just sent was dropped. 136910930Swollman * This duplicates some code in the tcp_mss() function in tcp_input.c. 137010881Swollman */ 137198211Shsustruct inpcb * 137210881Swollmantcp_mtudisc(inp, errno) 137310881Swollman struct inpcb *inp; 137410881Swollman int errno; 137510881Swollman{ 137610881Swollman struct tcpcb *tp = intotcpcb(inp); 137710930Swollman struct rtentry *rt; 137810930Swollman struct rmxp_tao *taop; 137910930Swollman struct socket *so = inp->inp_socket; 138010930Swollman int offered; 138110930Swollman int mss; 138255679Sshin#ifdef INET6 138355679Sshin int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 138455679Sshin#endif /* INET6 */ 138510881Swollman 138610930Swollman if (tp) { 138755679Sshin#ifdef INET6 138855679Sshin if (isipv6) 138986764Sjlemon rt = tcp_rtlookup6(&inp->inp_inc); 139055679Sshin else 139155679Sshin#endif /* INET6 */ 139286764Sjlemon rt = tcp_rtlookup(&inp->inp_inc); 139310930Swollman if (!rt || !rt->rt_rmx.rmx_mtu) { 139455679Sshin tp->t_maxopd = tp->t_maxseg = 139555679Sshin#ifdef INET6 139655679Sshin isipv6 ? tcp_v6mssdflt : 139755679Sshin#endif /* INET6 */ 139855679Sshin tcp_mssdflt; 139998211Shsu return inp; 140010930Swollman } 140110930Swollman taop = rmx_taop(rt->rt_rmx); 140210930Swollman offered = taop->tao_mssopt; 140355679Sshin mss = rt->rt_rmx.rmx_mtu - 140455679Sshin#ifdef INET6 140555679Sshin (isipv6 ? 140655679Sshin sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : 140755679Sshin#endif /* INET6 */ 140855679Sshin sizeof(struct tcpiphdr) 140955679Sshin#ifdef INET6 141055679Sshin ) 141155679Sshin#endif /* INET6 */ 141255679Sshin ; 141355679Sshin 141412939Swollman if (offered) 141512939Swollman mss = min(mss, offered); 141612939Swollman /* 141712939Swollman * XXX - The above conditional probably violates the TCP 141812939Swollman * spec. The problem is that, since we don't know the 141912939Swollman * other end's MSS, we are supposed to use a conservative 142012939Swollman * default. But, if we do that, then MTU discovery will 142112939Swollman * never actually take place, because the conservative 142212939Swollman * default is much less than the MTUs typically seen 142312939Swollman * on the Internet today. For the moment, we'll sweep 142412939Swollman * this under the carpet. 142512939Swollman * 142612939Swollman * The conservative default might not actually be a problem 142712939Swollman * if the only case this occurs is when sending an initial 142812939Swollman * SYN with options and data to a host we've never talked 142912939Swollman * to before. Then, they will reply with an MSS value which 143012939Swollman * will get recorded and the new parameters should get 143112939Swollman * recomputed. For Further Study. 143212939Swollman */ 143311415Swollman if (tp->t_maxopd <= mss) 143498211Shsu return inp; 143510930Swollman tp->t_maxopd = mss; 143610930Swollman 143710930Swollman if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 143810930Swollman (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 143910930Swollman mss -= TCPOLEN_TSTAMP_APPA; 144010930Swollman if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && 144110930Swollman (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) 144210930Swollman mss -= TCPOLEN_CC_APPA; 144310930Swollman#if (MCLBYTES & (MCLBYTES - 1)) == 0 144410930Swollman if (mss > MCLBYTES) 144510930Swollman mss &= ~(MCLBYTES-1); 144610930Swollman#else 144710930Swollman if (mss > MCLBYTES) 144810930Swollman mss = mss / MCLBYTES * MCLBYTES; 144910881Swollman#endif 145010930Swollman if (so->so_snd.sb_hiwat < mss) 145110930Swollman mss = so->so_snd.sb_hiwat; 145210930Swollman 145310930Swollman tp->t_maxseg = mss; 145410930Swollman 145511450Swollman tcpstat.tcps_mturesent++; 145650673Sjlemon tp->t_rtttime = 0; 145711450Swollman tp->snd_nxt = tp->snd_una; 145811450Swollman tcp_output(tp); 145910930Swollman } 146098211Shsu return inp; 146110881Swollman} 146210881Swollman 146310881Swollman/* 14646283Swollman * Look-up the routing entry to the peer of this inpcb. If no route 1465108265Shsu * is found and it cannot be allocated, then return NULL. This routine 14666283Swollman * is called by TCP routines that access the rmx structure and by tcp_mss 14676283Swollman * to get the interface MTU. 14686283Swollman */ 14696283Swollmanstruct rtentry * 147086764Sjlemontcp_rtlookup(inc) 147186764Sjlemon struct in_conninfo *inc; 14726283Swollman{ 14736283Swollman struct route *ro; 14746283Swollman struct rtentry *rt; 14756283Swollman 147686764Sjlemon ro = &inc->inc_route; 14776283Swollman rt = ro->ro_rt; 14786283Swollman if (rt == NULL || !(rt->rt_flags & RTF_UP)) { 14796283Swollman /* No route yet, so try to acquire one */ 148086764Sjlemon if (inc->inc_faddr.s_addr != INADDR_ANY) { 14816283Swollman ro->ro_dst.sa_family = AF_INET; 148278492Sume ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 14836283Swollman ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = 148486764Sjlemon inc->inc_faddr; 14856283Swollman rtalloc(ro); 14866283Swollman rt = ro->ro_rt; 14876283Swollman } 14886283Swollman } 14896283Swollman return rt; 14906283Swollman} 14916283Swollman 149255679Sshin#ifdef INET6 149355679Sshinstruct rtentry * 149486764Sjlemontcp_rtlookup6(inc) 149586764Sjlemon struct in_conninfo *inc; 149655679Sshin{ 149755679Sshin struct route_in6 *ro6; 149855679Sshin struct rtentry *rt; 149955679Sshin 150086764Sjlemon ro6 = &inc->inc6_route; 150155679Sshin rt = ro6->ro_rt; 150255679Sshin if (rt == NULL || !(rt->rt_flags & RTF_UP)) { 150355679Sshin /* No route yet, so try to acquire one */ 150486764Sjlemon if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { 150586764Sjlemon ro6->ro_dst.sin6_family = AF_INET6; 150686764Sjlemon ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6); 150786764Sjlemon ro6->ro_dst.sin6_addr = inc->inc6_faddr; 150855679Sshin rtalloc((struct route *)ro6); 150955679Sshin rt = ro6->ro_rt; 151055679Sshin } 151155679Sshin } 151255679Sshin return rt; 151355679Sshin} 151455679Sshin#endif /* INET6 */ 151555679Sshin 151655679Sshin#ifdef IPSEC 151755679Sshin/* compute ESP/AH header size for TCP, including outer IP header. */ 151855679Sshinsize_t 151955679Sshinipsec_hdrsiz_tcp(tp) 152055679Sshin struct tcpcb *tp; 152155679Sshin{ 152255679Sshin struct inpcb *inp; 152355679Sshin struct mbuf *m; 152455679Sshin size_t hdrsiz; 152555679Sshin struct ip *ip; 152655679Sshin#ifdef INET6 152755679Sshin struct ip6_hdr *ip6; 1528111145Sjlemon#endif 152955679Sshin struct tcphdr *th; 153055679Sshin 153178642Ssilby if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) 153255679Sshin return 0; 1533111119Simp MGETHDR(m, M_DONTWAIT, MT_DATA); 153455679Sshin if (!m) 153555679Sshin return 0; 153655679Sshin 153755679Sshin#ifdef INET6 153855679Sshin if ((inp->inp_vflag & INP_IPV6) != 0) { 153955679Sshin ip6 = mtod(m, struct ip6_hdr *); 154055679Sshin th = (struct tcphdr *)(ip6 + 1); 154155679Sshin m->m_pkthdr.len = m->m_len = 154255679Sshin sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 1543111144Sjlemon tcpip_fillheaders(inp, ip6, th); 154455679Sshin hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 154555679Sshin } else 154655679Sshin#endif /* INET6 */ 154755679Sshin { 154855679Sshin ip = mtod(m, struct ip *); 154955679Sshin th = (struct tcphdr *)(ip + 1); 155055679Sshin m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); 1551111144Sjlemon tcpip_fillheaders(inp, ip, th); 155255679Sshin hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 155355679Sshin } 155455679Sshin 155555679Sshin m_free(m); 155655679Sshin return hdrsiz; 155755679Sshin} 155855679Sshin#endif /*IPSEC*/ 155955679Sshin 15606283Swollman/* 15616283Swollman * Return a pointer to the cached information about the remote host. 15626283Swollman * The cached information is stored in the protocol specific part of 15636283Swollman * the route metrics. 15646283Swollman */ 15656283Swollmanstruct rmxp_tao * 156686764Sjlemontcp_gettaocache(inc) 156786764Sjlemon struct in_conninfo *inc; 15686283Swollman{ 156955679Sshin struct rtentry *rt; 15706283Swollman 157155679Sshin#ifdef INET6 157286764Sjlemon if (inc->inc_isipv6) 157386764Sjlemon rt = tcp_rtlookup6(inc); 157455679Sshin else 157555679Sshin#endif /* INET6 */ 157686764Sjlemon rt = tcp_rtlookup(inc); 157755679Sshin 15786283Swollman /* Make sure this is a host route and is up. */ 15796283Swollman if (rt == NULL || 15806283Swollman (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) 15816283Swollman return NULL; 15826283Swollman 15836283Swollman return rmx_taop(rt->rt_rmx); 15846283Swollman} 15856283Swollman 15866283Swollman/* 15876283Swollman * Clear all the TAO cache entries, called from tcp_init. 15886283Swollman * 15896283Swollman * XXX 15906283Swollman * This routine is just an empty one, because we assume that the routing 15916283Swollman * routing tables are initialized at the same time when TCP, so there is 15926283Swollman * nothing in the cache left over. 15936283Swollman */ 15946283Swollmanstatic void 159529506Sbdetcp_cleartaocache() 159629506Sbde{ 159729506Sbde} 1598102017Sdillon 1599102017Sdillon/* 1600111145Sjlemon * Move a TCP connection into TIME_WAIT state. 1601111145Sjlemon * tcbinfo is unlocked. 1602111145Sjlemon * inp is locked, and is unlocked before returning. 1603111145Sjlemon */ 1604111145Sjlemonvoid 1605111145Sjlemontcp_twstart(tp) 1606111145Sjlemon struct tcpcb *tp; 1607111145Sjlemon{ 1608111145Sjlemon struct tcptw *tw; 1609111145Sjlemon struct inpcb *inp; 1610111145Sjlemon int tw_time, acknow; 1611111145Sjlemon struct socket *so; 1612111145Sjlemon 1613112009Sjlemon tw = uma_zalloc(tcptw_zone, M_NOWAIT); 1614112009Sjlemon if (tw == NULL) { 1615112009Sjlemon tw = tcp_timer_2msl_tw(1); 1616112009Sjlemon if (tw == NULL) { 1617112009Sjlemon tcp_close(tp); 1618112009Sjlemon return; 1619112009Sjlemon } 1620112009Sjlemon } 1621111145Sjlemon inp = tp->t_inpcb; 1622111145Sjlemon tw->tw_inpcb = inp; 1623111145Sjlemon 1624111145Sjlemon /* 1625111145Sjlemon * Recover last window size sent. 1626111145Sjlemon */ 1627111145Sjlemon tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; 1628111145Sjlemon 1629111145Sjlemon /* 1630111145Sjlemon * Set t_recent if timestamps are used on the connection. 1631111145Sjlemon */ 1632111145Sjlemon if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == 1633111145Sjlemon (TF_REQ_TSTMP|TF_RCVD_TSTMP)) 1634111145Sjlemon tw->t_recent = tp->ts_recent; 1635111145Sjlemon else 1636111145Sjlemon tw->t_recent = 0; 1637111145Sjlemon 1638111145Sjlemon tw->snd_nxt = tp->snd_nxt; 1639111145Sjlemon tw->rcv_nxt = tp->rcv_nxt; 1640121850Ssilby tw->iss = tp->iss; 1641111145Sjlemon tw->cc_recv = tp->cc_recv; 1642111145Sjlemon tw->cc_send = tp->cc_send; 1643111145Sjlemon tw->t_starttime = tp->t_starttime; 1644112009Sjlemon tw->tw_time = 0; 1645111145Sjlemon 1646111145Sjlemon/* XXX 1647111145Sjlemon * If this code will 1648111145Sjlemon * be used for fin-wait-2 state also, then we may need 1649111145Sjlemon * a ts_recent from the last segment. 1650111145Sjlemon */ 1651111145Sjlemon /* Shorten TIME_WAIT [RFC-1644, p.28] */ 1652111145Sjlemon if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) { 1653111145Sjlemon tw_time = tp->t_rxtcur * TCPTV_TWTRUNC; 1654111145Sjlemon /* For T/TCP client, force ACK now. */ 1655111145Sjlemon acknow = 1; 1656111145Sjlemon } else { 1657111145Sjlemon tw_time = 2 * tcp_msl; 1658111145Sjlemon acknow = tp->t_flags & TF_ACKNOW; 1659111145Sjlemon } 1660111145Sjlemon tcp_discardcb(tp); 1661111145Sjlemon so = inp->inp_socket; 1662111145Sjlemon so->so_pcb = NULL; 1663111145Sjlemon tw->tw_cred = crhold(so->so_cred); 1664111145Sjlemon tw->tw_so_options = so->so_options; 1665114794Srwatson if (acknow) 1666114794Srwatson tcp_twrespond(tw, so, NULL, TH_ACK); 1667111145Sjlemon sotryfree(so); 1668111145Sjlemon inp->inp_socket = NULL; 1669111145Sjlemon inp->inp_ppcb = (caddr_t)tw; 1670111145Sjlemon inp->inp_vflag |= INP_TIMEWAIT; 1671112009Sjlemon tcp_timer_2msl_reset(tw, tw_time); 1672111145Sjlemon INP_UNLOCK(inp); 1673111145Sjlemon} 1674111145Sjlemon 1675121850Ssilby/* 1676121850Ssilby * Determine if the ISN we will generate has advanced beyond the last 1677121850Ssilby * sequence number used by the previous connection. If so, indicate 1678121850Ssilby * that it is safe to recycle this tw socket by returning 1. 1679121850Ssilby */ 1680121850Ssilbyint 1681121850Ssilbytcp_twrecycleable(struct tcptw *tw) 1682121850Ssilby{ 1683121850Ssilby tcp_seq new_isn = tw->iss; 1684121850Ssilby 1685121850Ssilby new_isn += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); 1686121850Ssilby 1687121850Ssilby if (SEQ_GT(new_isn, tw->snd_nxt)) 1688121850Ssilby return 1; 1689121850Ssilby else 1690121850Ssilby return 0; 1691121850Ssilby} 1692121850Ssilby 1693112009Sjlemonstruct tcptw * 1694112009Sjlemontcp_twclose(struct tcptw *tw, int reuse) 1695111145Sjlemon{ 1696111145Sjlemon struct inpcb *inp; 1697111145Sjlemon 1698111145Sjlemon inp = tw->tw_inpcb; 1699111145Sjlemon tw->tw_inpcb = NULL; 1700112009Sjlemon tcp_timer_2msl_stop(tw); 1701111145Sjlemon inp->inp_ppcb = NULL; 1702111145Sjlemon#ifdef INET6 1703111145Sjlemon if (inp->inp_vflag & INP_IPV6PROTO) 1704111145Sjlemon in6_pcbdetach(inp); 1705111145Sjlemon else 1706111145Sjlemon#endif 1707111145Sjlemon in_pcbdetach(inp); 1708111145Sjlemon tcpstat.tcps_closed++; 1709112009Sjlemon if (reuse) 1710112009Sjlemon return (tw); 1711112009Sjlemon uma_zfree(tcptw_zone, tw); 1712112009Sjlemon return (NULL); 1713111145Sjlemon} 1714111145Sjlemon 1715114794Srwatson/* 1716114794Srwatson * One of so and msrc must be non-NULL for use by the MAC Framework to 1717114794Srwatson * construct a label for ay resulting packet. 1718114794Srwatson */ 1719111145Sjlemonint 1720114794Srwatsontcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc, 1721114794Srwatson int flags) 1722111145Sjlemon{ 1723111145Sjlemon struct inpcb *inp = tw->tw_inpcb; 1724111145Sjlemon struct tcphdr *th; 1725111145Sjlemon struct mbuf *m; 1726111145Sjlemon struct ip *ip = NULL; 1727111145Sjlemon u_int8_t *optp; 1728111145Sjlemon u_int hdrlen, optlen; 1729111145Sjlemon int error; 1730111145Sjlemon#ifdef INET6 1731111145Sjlemon struct ip6_hdr *ip6 = NULL; 1732111145Sjlemon int isipv6 = inp->inp_inc.inc_isipv6; 1733111145Sjlemon#endif 1734111145Sjlemon 1735114794Srwatson KASSERT(so != NULL || msrc != NULL, 1736114794Srwatson ("tcp_twrespond: so and msrc NULL")); 1737114794Srwatson 1738111231Sphk m = m_gethdr(M_DONTWAIT, MT_HEADER); 1739111145Sjlemon if (m == NULL) 1740111145Sjlemon return (ENOBUFS); 1741111145Sjlemon m->m_data += max_linkhdr; 1742111145Sjlemon 1743114794Srwatson#ifdef MAC 1744114794Srwatson if (so != NULL) 1745114794Srwatson mac_create_mbuf_from_socket(so, m); 1746114794Srwatson else 1747114794Srwatson mac_create_mbuf_netlayer(msrc, m); 1748114794Srwatson#endif 1749114794Srwatson 1750111153Sjlemon#ifdef INET6 1751111145Sjlemon if (isipv6) { 1752111145Sjlemon hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 1753111145Sjlemon ip6 = mtod(m, struct ip6_hdr *); 1754111145Sjlemon th = (struct tcphdr *)(ip6 + 1); 1755111145Sjlemon tcpip_fillheaders(inp, ip6, th); 1756111153Sjlemon } else 1757111153Sjlemon#endif 1758111153Sjlemon { 1759111145Sjlemon hdrlen = sizeof(struct tcpiphdr); 1760111145Sjlemon ip = mtod(m, struct ip *); 1761111145Sjlemon th = (struct tcphdr *)(ip + 1); 1762111145Sjlemon tcpip_fillheaders(inp, ip, th); 1763111145Sjlemon } 1764111145Sjlemon optp = (u_int8_t *)(th + 1); 1765111145Sjlemon 1766111145Sjlemon /* 1767111145Sjlemon * Send a timestamp and echo-reply if both our side and our peer 1768111145Sjlemon * have sent timestamps in our SYN's and this is not a RST. 1769111145Sjlemon */ 1770111145Sjlemon if (tw->t_recent && flags == TH_ACK) { 1771111145Sjlemon u_int32_t *lp = (u_int32_t *)optp; 1772111145Sjlemon 1773111145Sjlemon /* Form timestamp option as shown in appendix A of RFC 1323. */ 1774111145Sjlemon *lp++ = htonl(TCPOPT_TSTAMP_HDR); 1775111145Sjlemon *lp++ = htonl(ticks); 1776111145Sjlemon *lp = htonl(tw->t_recent); 1777111145Sjlemon optp += TCPOLEN_TSTAMP_APPA; 1778111145Sjlemon } 1779111145Sjlemon 1780111145Sjlemon /* 1781111145Sjlemon * Send `CC-family' options if needed, and it's not a RST. 1782111145Sjlemon */ 1783111145Sjlemon if (tw->cc_recv != 0 && flags == TH_ACK) { 1784111145Sjlemon u_int32_t *lp = (u_int32_t *)optp; 1785111145Sjlemon 1786111145Sjlemon *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC)); 1787111145Sjlemon *lp = htonl(tw->cc_send); 1788111145Sjlemon optp += TCPOLEN_CC_APPA; 1789111145Sjlemon } 1790111145Sjlemon optlen = optp - (u_int8_t *)(th + 1); 1791111145Sjlemon 1792111145Sjlemon m->m_len = hdrlen + optlen; 1793111145Sjlemon m->m_pkthdr.len = m->m_len; 1794111145Sjlemon 1795111145Sjlemon KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); 1796111145Sjlemon 1797111145Sjlemon th->th_seq = htonl(tw->snd_nxt); 1798111145Sjlemon th->th_ack = htonl(tw->rcv_nxt); 1799111145Sjlemon th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 1800111145Sjlemon th->th_flags = flags; 1801111145Sjlemon th->th_win = htons(tw->last_win); 1802111145Sjlemon 1803111153Sjlemon#ifdef INET6 1804111145Sjlemon if (isipv6) { 1805111145Sjlemon th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 1806111145Sjlemon sizeof(struct tcphdr) + optlen); 1807111145Sjlemon ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ? 1808111145Sjlemon inp->in6p_route.ro_rt->rt_ifp : NULL); 1809111145Sjlemon error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route, 1810111145Sjlemon (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); 1811111153Sjlemon } else 1812111153Sjlemon#endif 1813111153Sjlemon { 1814111145Sjlemon th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 1815111145Sjlemon htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); 1816111145Sjlemon m->m_pkthdr.csum_flags = CSUM_TCP; 1817111145Sjlemon m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 1818111145Sjlemon ip->ip_len = m->m_pkthdr.len; 1819111145Sjlemon error = ip_output(m, inp->inp_options, &inp->inp_route, 1820111145Sjlemon (tw->tw_so_options & SO_DONTROUTE), NULL, inp); 1821111145Sjlemon } 1822111145Sjlemon if (flags & TH_ACK) 1823111145Sjlemon tcpstat.tcps_sndacks++; 1824111145Sjlemon else 1825111145Sjlemon tcpstat.tcps_sndctrl++; 1826111145Sjlemon tcpstat.tcps_sndtotal++; 1827111145Sjlemon return (error); 1828111145Sjlemon} 1829111145Sjlemon 1830111145Sjlemon/* 1831102017Sdillon * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING 1832102017Sdillon * 1833102017Sdillon * This code attempts to calculate the bandwidth-delay product as a 1834102017Sdillon * means of determining the optimal window size to maximize bandwidth, 1835102017Sdillon * minimize RTT, and avoid the over-allocation of buffers on interfaces and 1836102017Sdillon * routers. This code also does a fairly good job keeping RTTs in check 1837102017Sdillon * across slow links like modems. We implement an algorithm which is very 1838102017Sdillon * similar (but not meant to be) TCP/Vegas. The code operates on the 1839102017Sdillon * transmitter side of a TCP connection and so only effects the transmit 1840102017Sdillon * side of the connection. 1841102017Sdillon * 1842102017Sdillon * BACKGROUND: TCP makes no provision for the management of buffer space 1843102017Sdillon * at the end points or at the intermediate routers and switches. A TCP 1844102017Sdillon * stream, whether using NewReno or not, will eventually buffer as 1845102017Sdillon * many packets as it is able and the only reason this typically works is 1846102017Sdillon * due to the fairly small default buffers made available for a connection 1847102017Sdillon * (typicaly 16K or 32K). As machines use larger windows and/or window 1848102017Sdillon * scaling it is now fairly easy for even a single TCP connection to blow-out 1849102017Sdillon * all available buffer space not only on the local interface, but on 1850102017Sdillon * intermediate routers and switches as well. NewReno makes a misguided 1851102017Sdillon * attempt to 'solve' this problem by waiting for an actual failure to occur, 1852102017Sdillon * then backing off, then steadily increasing the window again until another 1853102017Sdillon * failure occurs, ad-infinitum. This results in terrible oscillation that 1854102017Sdillon * is only made worse as network loads increase and the idea of intentionally 1855102017Sdillon * blowing out network buffers is, frankly, a terrible way to manage network 1856102017Sdillon * resources. 1857102017Sdillon * 1858102017Sdillon * It is far better to limit the transmit window prior to the failure 1859102017Sdillon * condition being achieved. There are two general ways to do this: First 1860102017Sdillon * you can 'scan' through different transmit window sizes and locate the 1861102017Sdillon * point where the RTT stops increasing, indicating that you have filled the 1862102017Sdillon * pipe, then scan backwards until you note that RTT stops decreasing, then 1863102017Sdillon * repeat ad-infinitum. This method works in principle but has severe 1864102017Sdillon * implementation issues due to RTT variances, timer granularity, and 1865102017Sdillon * instability in the algorithm which can lead to many false positives and 1866102017Sdillon * create oscillations as well as interact badly with other TCP streams 1867102017Sdillon * implementing the same algorithm. 1868102017Sdillon * 1869102017Sdillon * The second method is to limit the window to the bandwidth delay product 1870102017Sdillon * of the link. This is the method we implement. RTT variances and our 1871102017Sdillon * own manipulation of the congestion window, bwnd, can potentially 1872102017Sdillon * destabilize the algorithm. For this reason we have to stabilize the 1873102017Sdillon * elements used to calculate the window. We do this by using the minimum 1874102017Sdillon * observed RTT, the long term average of the observed bandwidth, and 1875102017Sdillon * by adding two segments worth of slop. It isn't perfect but it is able 1876102017Sdillon * to react to changing conditions and gives us a very stable basis on 1877102017Sdillon * which to extend the algorithm. 1878102017Sdillon */ 1879102017Sdillonvoid 1880102017Sdillontcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) 1881102017Sdillon{ 1882102017Sdillon u_long bw; 1883102017Sdillon u_long bwnd; 1884102017Sdillon int save_ticks; 1885102017Sdillon 1886102017Sdillon /* 1887102017Sdillon * If inflight_enable is disabled in the middle of a tcp connection, 1888102017Sdillon * make sure snd_bwnd is effectively disabled. 1889102017Sdillon */ 1890102017Sdillon if (tcp_inflight_enable == 0) { 1891102017Sdillon tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 1892102017Sdillon tp->snd_bandwidth = 0; 1893102017Sdillon return; 1894102017Sdillon } 1895102017Sdillon 1896102017Sdillon /* 1897102017Sdillon * Figure out the bandwidth. Due to the tick granularity this 1898102017Sdillon * is a very rough number and it MUST be averaged over a fairly 1899102017Sdillon * long period of time. XXX we need to take into account a link 1900102017Sdillon * that is not using all available bandwidth, but for now our 1901102017Sdillon * slop will ramp us up if this case occurs and the bandwidth later 1902102017Sdillon * increases. 1903102368Sdillon * 1904102368Sdillon * Note: if ticks rollover 'bw' may wind up negative. We must 1905102368Sdillon * effectively reset t_bw_rtttime for this case. 1906102017Sdillon */ 1907102017Sdillon save_ticks = ticks; 1908102017Sdillon if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) 1909102017Sdillon return; 1910102017Sdillon 1911102017Sdillon bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / 1912102017Sdillon (save_ticks - tp->t_bw_rtttime); 1913102017Sdillon tp->t_bw_rtttime = save_ticks; 1914102017Sdillon tp->t_bw_rtseq = ack_seq; 1915102368Sdillon if (tp->t_bw_rtttime == 0 || (int)bw < 0) 1916102017Sdillon return; 1917102017Sdillon bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; 1918102017Sdillon 1919102017Sdillon tp->snd_bandwidth = bw; 1920102017Sdillon 1921102017Sdillon /* 1922102017Sdillon * Calculate the semi-static bandwidth delay product, plus two maximal 1923102017Sdillon * segments. The additional slop puts us squarely in the sweet 1924107881Sdillon * spot and also handles the bandwidth run-up case and stabilization. 1925107881Sdillon * Without the slop we could be locking ourselves into a lower 1926107881Sdillon * bandwidth. 1927102017Sdillon * 1928102017Sdillon * Situations Handled: 1929102017Sdillon * (1) Prevents over-queueing of packets on LANs, especially on 1930102017Sdillon * high speed LANs, allowing larger TCP buffers to be 1931102017Sdillon * specified, and also does a good job preventing 1932102017Sdillon * over-queueing of packets over choke points like modems 1933102017Sdillon * (at least for the transmit side). 1934102017Sdillon * 1935102017Sdillon * (2) Is able to handle changing network loads (bandwidth 1936102017Sdillon * drops so bwnd drops, bandwidth increases so bwnd 1937102017Sdillon * increases). 1938102017Sdillon * 1939102017Sdillon * (3) Theoretically should stabilize in the face of multiple 1940102017Sdillon * connections implementing the same algorithm (this may need 1941102017Sdillon * a little work). 1942107881Sdillon * 1943107881Sdillon * (4) Stability value (defaults to 20 = 2 maximal packets) can 1944107881Sdillon * be adjusted with a sysctl but typically only needs to be 1945107881Sdillon * on very slow connections. A value no smaller then 5 1946107881Sdillon * should be used, but only reduce this default if you have 1947107881Sdillon * no other choice. 1948102017Sdillon */ 1949102017Sdillon#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) 1950107881Sdillon bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10; 1951102368Sdillon#undef USERTT 1952102017Sdillon 1953102017Sdillon if (tcp_inflight_debug > 0) { 1954102017Sdillon static int ltime; 1955102017Sdillon if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { 1956102017Sdillon ltime = ticks; 1957102017Sdillon printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", 1958102017Sdillon tp, 1959102017Sdillon bw, 1960102017Sdillon tp->t_rttbest, 1961102017Sdillon tp->t_srtt, 1962102017Sdillon bwnd 1963102017Sdillon ); 1964102017Sdillon } 1965102017Sdillon } 1966102017Sdillon if ((long)bwnd < tcp_inflight_min) 1967102017Sdillon bwnd = tcp_inflight_min; 1968102017Sdillon if (bwnd > tcp_inflight_max) 1969102017Sdillon bwnd = tcp_inflight_max; 1970102017Sdillon if ((long)bwnd < tp->t_maxseg * 2) 1971102017Sdillon bwnd = tp->t_maxseg * 2; 1972102017Sdillon tp->snd_bwnd = bwnd; 1973102017Sdillon} 1974102017Sdillon 1975