/* tcp_timewait.c revision 50673 */
11541Srgrimes/* 211150Swollman * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 31541Srgrimes * The Regents of the University of California. All rights reserved. 41541Srgrimes * 51541Srgrimes * Redistribution and use in source and binary forms, with or without 61541Srgrimes * modification, are permitted provided that the following conditions 71541Srgrimes * are met: 81541Srgrimes * 1. Redistributions of source code must retain the above copyright 91541Srgrimes * notice, this list of conditions and the following disclaimer. 101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 111541Srgrimes * notice, this list of conditions and the following disclaimer in the 121541Srgrimes * documentation and/or other materials provided with the distribution. 131541Srgrimes * 3. All advertising materials mentioning features or use of this software 141541Srgrimes * must display the following acknowledgement: 151541Srgrimes * This product includes software developed by the University of 161541Srgrimes * California, Berkeley and its contributors. 171541Srgrimes * 4. Neither the name of the University nor the names of its contributors 181541Srgrimes * may be used to endorse or promote products derived from this software 191541Srgrimes * without specific prior written permission. 201541Srgrimes * 211541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 221541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 231541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 241541Srgrimes * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 251541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 261541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 271541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 281541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 291541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 301541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 311541Srgrimes * SUCH DAMAGE. 321541Srgrimes * 3311150Swollman * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 3450477Speter * $FreeBSD: head/sys/netinet/tcp_timewait.c 50673 1999-08-30 21:17:07Z jlemon $ 351541Srgrimes */ 361541Srgrimes 3732752Seivind#include "opt_compat.h" 3829514Sjoerg#include "opt_tcpdebug.h" 3929514Sjoerg 401541Srgrimes#include <sys/param.h> 411541Srgrimes#include <sys/systm.h> 4250673Sjlemon#include <sys/callout.h> 4312172Sphk#include <sys/kernel.h> 4412172Sphk#include <sys/sysctl.h> 451541Srgrimes#include <sys/malloc.h> 461541Srgrimes#include <sys/mbuf.h> 4748758Sgreen#include <sys/proc.h> 481541Srgrimes#include <sys/socket.h> 491541Srgrimes#include <sys/socketvar.h> 501541Srgrimes#include <sys/protosw.h> 5134923Sbde 5234881Swollman#include <vm/vm_zone.h> 531541Srgrimes 541541Srgrimes#include <net/route.h> 551541Srgrimes#include <net/if.h> 561541Srgrimes 5717269Swollman#define _IP_VHL 581541Srgrimes#include <netinet/in.h> 591541Srgrimes#include <netinet/in_systm.h> 601541Srgrimes#include <netinet/ip.h> 611541Srgrimes#include <netinet/in_pcb.h> 627090Sbde#include <netinet/in_var.h> 631541Srgrimes#include <netinet/ip_var.h> 641541Srgrimes#include <netinet/tcp.h> 651541Srgrimes#include <netinet/tcp_fsm.h> 661541Srgrimes#include <netinet/tcp_seq.h> 671541Srgrimes#include <netinet/tcp_timer.h> 681541Srgrimes#include <netinet/tcp_var.h> 691541Srgrimes#include 
<netinet/tcpip.h> 706283Swollman#ifdef TCPDEBUG 716283Swollman#include <netinet/tcp_debug.h> 726283Swollman#endif 731541Srgrimes 741541Srgrimesint tcp_mssdflt = TCP_MSS; 7546381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, 7646381Sbillf &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); 7712296Sphk 7850673Sjlemon#if 0 7912296Sphkstatic int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; 8046381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, 8146381Sbillf &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); 8250673Sjlemon#endif 8312296Sphk 8412296Sphkstatic int tcp_do_rfc1323 = 1; 8546381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, 8646381Sbillf &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); 8712296Sphk 8838875Sphkstatic int tcp_do_rfc1644 = 0; 8946381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, 9046381Sbillf &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); 911541Srgrimes 9250426Sjlemonstatic int tcp_tcbhashsize = 0; 9350426SjlemonSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, 9450426Sjlemon &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); 9550426Sjlemon 9646381SbillfSYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, 9746381Sbillf &tcbinfo.ipi_count, 0, "Number of active PCBs"); 9836079Swollman 9929506Sbdestatic void tcp_cleartaocache __P((void)); 10012296Sphkstatic void tcp_notify __P((struct inpcb *, int)); 10112296Sphk 1027684Sdg/* 10332821Sdg * Target size of TCP PCB hash tables. Must be a power of two. 10443562Smsmith * 10543562Smsmith * Note that this can be overridden by the kernel environment 10643562Smsmith * variable net.inet.tcp.tcbhashsize 1077684Sdg */ 1087684Sdg#ifndef TCBHASHSIZE 10932821Sdg#define TCBHASHSIZE 512 1107684Sdg#endif 1111541Srgrimes 1121541Srgrimes/* 11334881Swollman * This is the actual shape of what we allocate using the zone 11434881Swollman * allocator. 
Doing it this way allows us to protect both structures 11534881Swollman * using the same generation count, and also eliminates the overhead 11634881Swollman * of allocating tcpcbs separately. By hiding the structure here, 11734881Swollman * we avoid changing most of the rest of the code (although it needs 11834881Swollman * to be changed, eventually, for greater efficiency). 11934881Swollman */ 12034923Sbde#define ALIGNMENT 32 12134923Sbde#define ALIGNM1 (ALIGNMENT - 1) 12234881Swollmanstruct inp_tp { 12334881Swollman union { 12434881Swollman struct inpcb inp; 12534881Swollman char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; 12634881Swollman } inp_tp_u; 12734881Swollman struct tcpcb tcb; 12850673Sjlemon struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; 12950673Sjlemon struct callout inp_tp_delack; 13034881Swollman}; 13134881Swollman#undef ALIGNMENT 13234881Swollman#undef ALIGNM1 13334881Swollman 13434881Swollman/* 1351541Srgrimes * Tcp initialization 1361541Srgrimes */ 1371541Srgrimesvoid 1381541Srgrimestcp_init() 1391541Srgrimes{ 14043562Smsmith int hashsize; 14143562Smsmith 14211150Swollman tcp_iss = random(); /* wrong, but better than a constant */ 1436283Swollman tcp_ccgen = 1; 1446283Swollman tcp_cleartaocache(); 14550673Sjlemon 14650673Sjlemon tcp_delacktime = TCPTV_DELACK; 14750673Sjlemon tcp_keepinit = TCPTV_KEEP_INIT; 14850673Sjlemon tcp_keepidle = TCPTV_KEEP_IDLE; 14950673Sjlemon tcp_keepintvl = TCPTV_KEEPINTVL; 15050673Sjlemon tcp_maxpersistidle = TCPTV_KEEP_IDLE; 15150673Sjlemon tcp_msl = TCPTV_MSL; 15250673Sjlemon 1537684Sdg LIST_INIT(&tcb); 1547684Sdg tcbinfo.listhead = &tcb; 15548578Smsmith TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", TCBHASHSIZE, hashsize); 15643576Smsmith if (!powerof2(hashsize)) { 15743562Smsmith printf("WARNING: TCB hash size not a power of 2\n"); 15843562Smsmith hashsize = 512; /* safe default */ 15943562Smsmith } 16050426Sjlemon tcp_tcbhashsize = hashsize; 16143562Smsmith tcbinfo.hashbase = 
hashinit(hashsize, M_PCB, &tcbinfo.hashmask); 16243562Smsmith tcbinfo.porthashbase = hashinit(hashsize, M_PCB, 16334923Sbde &tcbinfo.porthashmask); 16436079Swollman tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets, 16534881Swollman ZONE_INTERRUPT, 0); 16650673Sjlemon 1671541Srgrimes if (max_protohdr < sizeof(struct tcpiphdr)) 1681541Srgrimes max_protohdr = sizeof(struct tcpiphdr); 1691541Srgrimes if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN) 1701541Srgrimes panic("tcp_init"); 1711541Srgrimes} 1721541Srgrimes 1731541Srgrimes/* 1741541Srgrimes * Create template to be used to send tcp packets on a connection. 1751541Srgrimes * Call after host entry created, allocates an mbuf and fills 1761541Srgrimes * in a skeletal tcp/ip header, minimizing the amount of work 1771541Srgrimes * necessary when the connection is used. 1781541Srgrimes */ 1791541Srgrimesstruct tcpiphdr * 1801541Srgrimestcp_template(tp) 1811541Srgrimes struct tcpcb *tp; 1821541Srgrimes{ 1831541Srgrimes register struct inpcb *inp = tp->t_inpcb; 1841541Srgrimes register struct mbuf *m; 1851541Srgrimes register struct tcpiphdr *n; 1861541Srgrimes 1871541Srgrimes if ((n = tp->t_template) == 0) { 1881541Srgrimes m = m_get(M_DONTWAIT, MT_HEADER); 1891541Srgrimes if (m == NULL) 1901541Srgrimes return (0); 1911541Srgrimes m->m_len = sizeof (struct tcpiphdr); 1921541Srgrimes n = mtod(m, struct tcpiphdr *); 1931541Srgrimes } 19438513Sdfr bzero(n->ti_x1, sizeof(n->ti_x1)); 1951541Srgrimes n->ti_pr = IPPROTO_TCP; 1961541Srgrimes n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); 1971541Srgrimes n->ti_src = inp->inp_laddr; 1981541Srgrimes n->ti_dst = inp->inp_faddr; 1991541Srgrimes n->ti_sport = inp->inp_lport; 2001541Srgrimes n->ti_dport = inp->inp_fport; 2011541Srgrimes n->ti_seq = 0; 2021541Srgrimes n->ti_ack = 0; 2031541Srgrimes n->ti_x2 = 0; 2041541Srgrimes n->ti_off = 5; 2051541Srgrimes n->ti_flags = 0; 2061541Srgrimes n->ti_win = 0; 2071541Srgrimes n->ti_sum = 0; 
2081541Srgrimes n->ti_urp = 0; 2091541Srgrimes return (n); 2101541Srgrimes} 2111541Srgrimes 2121541Srgrimes/* 2131541Srgrimes * Send a single message to the TCP at address specified by 2141541Srgrimes * the given TCP/IP header. If m == 0, then we make a copy 2151541Srgrimes * of the tcpiphdr at ti and send directly to the addressed host. 2161541Srgrimes * This is used to force keep alive messages out using the TCP 2171541Srgrimes * template for a connection tp->t_template. If flags are given 2181541Srgrimes * then we send a message back to the TCP which originated the 2191541Srgrimes * segment ti, and discard the mbuf containing it and any other 2201541Srgrimes * attached mbufs. 2211541Srgrimes * 2221541Srgrimes * In any case the ack and sequence number of the transmitted 2231541Srgrimes * segment are as specified by the parameters. 22431848Sjulian * 22531848Sjulian * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 2261541Srgrimes */ 2271541Srgrimesvoid 2281541Srgrimestcp_respond(tp, ti, m, ack, seq, flags) 2291541Srgrimes struct tcpcb *tp; 2301541Srgrimes register struct tcpiphdr *ti; 2311541Srgrimes register struct mbuf *m; 2321541Srgrimes tcp_seq ack, seq; 2331541Srgrimes int flags; 2341541Srgrimes{ 2351541Srgrimes register int tlen; 2361541Srgrimes int win = 0; 2371541Srgrimes struct route *ro = 0; 23814754Swollman struct route sro; 2391541Srgrimes 2401541Srgrimes if (tp) { 24141187Sguido if (!(flags & TH_RST)) 24241187Sguido win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); 2431541Srgrimes ro = &tp->t_inpcb->inp_route; 24414754Swollman } else { 24514754Swollman ro = &sro; 24614754Swollman bzero(ro, sizeof *ro); 2471541Srgrimes } 2481541Srgrimes if (m == 0) { 2491541Srgrimes m = m_gethdr(M_DONTWAIT, MT_HEADER); 2501541Srgrimes if (m == NULL) 2511541Srgrimes return; 2521541Srgrimes#ifdef TCP_COMPAT_42 2531541Srgrimes tlen = 1; 2541541Srgrimes#else 2551541Srgrimes tlen = 0; 2561541Srgrimes#endif 2571541Srgrimes m->m_data += max_linkhdr; 
2581541Srgrimes *mtod(m, struct tcpiphdr *) = *ti; 2591541Srgrimes ti = mtod(m, struct tcpiphdr *); 2601541Srgrimes flags = TH_ACK; 2611541Srgrimes } else { 2621541Srgrimes m_freem(m->m_next); 2631541Srgrimes m->m_next = 0; 2641541Srgrimes m->m_data = (caddr_t)ti; 2651541Srgrimes m->m_len = sizeof (struct tcpiphdr); 2661541Srgrimes tlen = 0; 2671541Srgrimes#define xchg(a,b,type) { type t; t=a; a=b; b=t; } 26838513Sdfr xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, n_long); 26938513Sdfr xchg(ti->ti_dport, ti->ti_sport, n_short); 2701541Srgrimes#undef xchg 2711541Srgrimes } 2721541Srgrimes ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); 2731541Srgrimes tlen += sizeof (struct tcpiphdr); 2741541Srgrimes m->m_len = tlen; 2751541Srgrimes m->m_pkthdr.len = tlen; 2761541Srgrimes m->m_pkthdr.rcvif = (struct ifnet *) 0; 27738513Sdfr bzero(ti->ti_x1, sizeof(ti->ti_x1)); 2781541Srgrimes ti->ti_seq = htonl(seq); 2791541Srgrimes ti->ti_ack = htonl(ack); 2801541Srgrimes ti->ti_x2 = 0; 2811541Srgrimes ti->ti_off = sizeof (struct tcphdr) >> 2; 2821541Srgrimes ti->ti_flags = flags; 2831541Srgrimes if (tp) 2841541Srgrimes ti->ti_win = htons((u_short) (win >> tp->rcv_scale)); 2851541Srgrimes else 2861541Srgrimes ti->ti_win = htons((u_short)win); 2871541Srgrimes ti->ti_urp = 0; 2881541Srgrimes ti->ti_sum = 0; 2891541Srgrimes ti->ti_sum = in_cksum(m, tlen); 2901541Srgrimes ((struct ip *)ti)->ip_len = tlen; 2911541Srgrimes ((struct ip *)ti)->ip_ttl = ip_defttl; 2926283Swollman#ifdef TCPDEBUG 2936283Swollman if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 2946283Swollman tcp_trace(TA_OUTPUT, 0, tp, ti, 0); 2956283Swollman#endif 2961541Srgrimes (void) ip_output(m, NULL, ro, 0, NULL); 29714841Swollman if (ro == &sro && ro->ro_rt) { 29814754Swollman RTFREE(ro->ro_rt); 29914754Swollman } 3001541Srgrimes} 3011541Srgrimes 3021541Srgrimes/* 3031541Srgrimes * Create a new TCP control block, making an 3041541Srgrimes * empty reassembly queue and hooking it to the 
argument 30534881Swollman * protocol control block. The `inp' parameter must have 30634881Swollman * come from the zone allocator set up in tcp_init(). 3071541Srgrimes */ 3081541Srgrimesstruct tcpcb * 3091541Srgrimestcp_newtcpcb(inp) 3101541Srgrimes struct inpcb *inp; 3111541Srgrimes{ 31234923Sbde struct inp_tp *it; 3131541Srgrimes register struct tcpcb *tp; 3141541Srgrimes 31534881Swollman it = (struct inp_tp *)inp; 31634881Swollman tp = &it->tcb; 3171541Srgrimes bzero((char *) tp, sizeof(struct tcpcb)); 31838513Sdfr tp->t_segq = NULL; 3196283Swollman tp->t_maxseg = tp->t_maxopd = tcp_mssdflt; 3201541Srgrimes 32150673Sjlemon /* Set up our timeouts. */ 32250673Sjlemon callout_init(tp->tt_rexmt = &it->inp_tp_rexmt); 32350673Sjlemon callout_init(tp->tt_persist = &it->inp_tp_persist); 32450673Sjlemon callout_init(tp->tt_keep = &it->inp_tp_keep); 32550673Sjlemon callout_init(tp->tt_2msl = &it->inp_tp_2msl); 32650673Sjlemon callout_init(tp->tt_delack = &it->inp_tp_delack); 32750673Sjlemon 3286283Swollman if (tcp_do_rfc1323) 3296283Swollman tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); 3306283Swollman if (tcp_do_rfc1644) 3316283Swollman tp->t_flags |= TF_REQ_CC; 33234881Swollman tp->t_inpcb = inp; /* XXX */ 3331541Srgrimes /* 3341541Srgrimes * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 33516367Swollman * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives 3361541Srgrimes * reasonable initial retransmit time. 
3371541Srgrimes */ 3381541Srgrimes tp->t_srtt = TCPTV_SRTTBASE; 33916367Swollman tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; 3401541Srgrimes tp->t_rttmin = TCPTV_MIN; 34116367Swollman tp->t_rxtcur = TCPTV_RTOBASE; 3421541Srgrimes tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 3431541Srgrimes tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 34450673Sjlemon tp->t_rcvtime = ticks; 34524570Sdg inp->inp_ip_ttl = ip_defttl; 3461541Srgrimes inp->inp_ppcb = (caddr_t)tp; 34734881Swollman return (tp); /* XXX */ 3481541Srgrimes} 3491541Srgrimes 3501541Srgrimes/* 3511541Srgrimes * Drop a TCP connection, reporting 3521541Srgrimes * the specified error. If connection is synchronized, 3531541Srgrimes * then send a RST to peer. 3541541Srgrimes */ 3551541Srgrimesstruct tcpcb * 3561541Srgrimestcp_drop(tp, errno) 3571541Srgrimes register struct tcpcb *tp; 3581541Srgrimes int errno; 3591541Srgrimes{ 3601541Srgrimes struct socket *so = tp->t_inpcb->inp_socket; 3611541Srgrimes 3621541Srgrimes if (TCPS_HAVERCVDSYN(tp->t_state)) { 3631541Srgrimes tp->t_state = TCPS_CLOSED; 3641541Srgrimes (void) tcp_output(tp); 3651541Srgrimes tcpstat.tcps_drops++; 3661541Srgrimes } else 3671541Srgrimes tcpstat.tcps_conndrops++; 3681541Srgrimes if (errno == ETIMEDOUT && tp->t_softerror) 3691541Srgrimes errno = tp->t_softerror; 3701541Srgrimes so->so_error = errno; 3711541Srgrimes return (tcp_close(tp)); 3721541Srgrimes} 3731541Srgrimes 3741541Srgrimes/* 3751541Srgrimes * Close a TCP control block: 3761541Srgrimes * discard all space held by the tcp 3771541Srgrimes * discard internet protocol block 3781541Srgrimes * wake up any sleepers 3791541Srgrimes */ 3801541Srgrimesstruct tcpcb * 3811541Srgrimestcp_close(tp) 3821541Srgrimes register struct tcpcb *tp; 3831541Srgrimes{ 38438513Sdfr register struct mbuf *q; 38538513Sdfr register struct mbuf *nq; 3861541Srgrimes struct inpcb *inp = tp->t_inpcb; 3871541Srgrimes struct socket *so = inp->inp_socket; 3881541Srgrimes register 
struct rtentry *rt; 38922719Swollman int dosavessthresh; 3901541Srgrimes 3911541Srgrimes /* 39250673Sjlemon * Make sure that all of our timers are stopped before we 39350673Sjlemon * delete the PCB. 39450673Sjlemon */ 39550673Sjlemon callout_stop(tp->tt_rexmt); 39650673Sjlemon callout_stop(tp->tt_persist); 39750673Sjlemon callout_stop(tp->tt_keep); 39850673Sjlemon callout_stop(tp->tt_2msl); 39950673Sjlemon callout_stop(tp->tt_delack); 40050673Sjlemon 40150673Sjlemon /* 4029373Swollman * If we got enough samples through the srtt filter, 4039373Swollman * save the rtt and rttvar in the routing entry. 4049373Swollman * 'Enough' is arbitrarily defined as the 16 samples. 4059373Swollman * 16 samples is enough for the srtt filter to converge 4069373Swollman * to within 5% of the correct value; fewer samples and 4079373Swollman * we could save a very bogus rtt. 4081541Srgrimes * 4091541Srgrimes * Don't update the default route's characteristics and don't 4101541Srgrimes * update anything that the user "locked". 4111541Srgrimes */ 4129373Swollman if (tp->t_rttupdated >= 16 && 4131541Srgrimes (rt = inp->inp_route.ro_rt) && 4141541Srgrimes ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) { 4151549Srgrimes register u_long i = 0; 4161541Srgrimes 4171541Srgrimes if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { 4181541Srgrimes i = tp->t_srtt * 41950673Sjlemon (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); 4201541Srgrimes if (rt->rt_rmx.rmx_rtt && i) 4211541Srgrimes /* 4221541Srgrimes * filter this update to half the old & half 4231541Srgrimes * the new values, converting scale. 4241541Srgrimes * See route.h and tcp_var.h for a 4251541Srgrimes * description of the scaling constants. 
4261541Srgrimes */ 4271541Srgrimes rt->rt_rmx.rmx_rtt = 4281541Srgrimes (rt->rt_rmx.rmx_rtt + i) / 2; 4291541Srgrimes else 4301541Srgrimes rt->rt_rmx.rmx_rtt = i; 4319263Swollman tcpstat.tcps_cachedrtt++; 4321541Srgrimes } 4331541Srgrimes if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { 4341541Srgrimes i = tp->t_rttvar * 43550673Sjlemon (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); 4361541Srgrimes if (rt->rt_rmx.rmx_rttvar && i) 4371541Srgrimes rt->rt_rmx.rmx_rttvar = 4381541Srgrimes (rt->rt_rmx.rmx_rttvar + i) / 2; 4391541Srgrimes else 4401541Srgrimes rt->rt_rmx.rmx_rttvar = i; 4419263Swollman tcpstat.tcps_cachedrttvar++; 4421541Srgrimes } 4431541Srgrimes /* 44422719Swollman * The old comment here said: 4451541Srgrimes * update the pipelimit (ssthresh) if it has been updated 4461541Srgrimes * already or if a pipesize was specified & the threshhold 4471541Srgrimes * got below half the pipesize. I.e., wait for bad news 4481541Srgrimes * before we start updating, then update on both good 4491541Srgrimes * and bad news. 45022719Swollman * 45122719Swollman * But we want to save the ssthresh even if no pipesize is 45222719Swollman * specified explicitly in the route, because such 45322719Swollman * connections still have an implicit pipesize specified 45422719Swollman * by the global tcp_sendspace. In the absence of a reliable 45522719Swollman * way to calculate the pipesize, it will have to do. 4561541Srgrimes */ 45722719Swollman i = tp->snd_ssthresh; 45822719Swollman if (rt->rt_rmx.rmx_sendpipe != 0) 45922719Swollman dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); 46022719Swollman else 46122719Swollman dosavessthresh = (i < so->so_snd.sb_hiwat / 2); 4623444Sphk if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && 46322719Swollman i != 0 && rt->rt_rmx.rmx_ssthresh != 0) 46422719Swollman || dosavessthresh) { 4651541Srgrimes /* 4661541Srgrimes * convert the limit from user data bytes to 4671541Srgrimes * packets then to packet data bytes. 
4681541Srgrimes */ 4691541Srgrimes i = (i + tp->t_maxseg / 2) / tp->t_maxseg; 4701541Srgrimes if (i < 2) 4711541Srgrimes i = 2; 4721541Srgrimes i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr)); 4731541Srgrimes if (rt->rt_rmx.rmx_ssthresh) 4741541Srgrimes rt->rt_rmx.rmx_ssthresh = 4751541Srgrimes (rt->rt_rmx.rmx_ssthresh + i) / 2; 4761541Srgrimes else 4771541Srgrimes rt->rt_rmx.rmx_ssthresh = i; 4789263Swollman tcpstat.tcps_cachedssthresh++; 4791541Srgrimes } 4801541Srgrimes } 4811541Srgrimes /* free the reassembly queue, if any */ 48238513Sdfr for (q = tp->t_segq; q; q = nq) { 48338513Sdfr nq = q->m_nextpkt; 48438513Sdfr tp->t_segq = nq; 48538513Sdfr m_freem(q); 4861541Srgrimes } 4871541Srgrimes if (tp->t_template) 4881541Srgrimes (void) m_free(dtom(tp->t_template)); 48932821Sdg inp->inp_ppcb = NULL; 4901541Srgrimes soisdisconnected(so); 4911541Srgrimes in_pcbdetach(inp); 4921541Srgrimes tcpstat.tcps_closed++; 4931541Srgrimes return ((struct tcpcb *)0); 4941541Srgrimes} 4951541Srgrimes 4961541Srgrimesvoid 4971541Srgrimestcp_drain() 4981541Srgrimes{ 4991541Srgrimes 5001541Srgrimes} 5011541Srgrimes 5021541Srgrimes/* 5031541Srgrimes * Notify a tcp user of an asynchronous error; 5041541Srgrimes * store error as soft error, but wake up user 5051541Srgrimes * (for now, won't do anything until can select for soft error). 5061541Srgrimes */ 50712296Sphkstatic void 5081541Srgrimestcp_notify(inp, error) 5091541Srgrimes struct inpcb *inp; 5101541Srgrimes int error; 5111541Srgrimes{ 5121541Srgrimes register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; 5131541Srgrimes register struct socket *so = inp->inp_socket; 5141541Srgrimes 5151541Srgrimes /* 5161541Srgrimes * Ignore some errors if we are hooked up. 5171541Srgrimes * If connection hasn't completed, has retransmitted several times, 5181541Srgrimes * and receives a second error, give up now. 
This is better 5191541Srgrimes * than waiting a long time to establish a connection that 5201541Srgrimes * can never complete. 5211541Srgrimes */ 5221541Srgrimes if (tp->t_state == TCPS_ESTABLISHED && 5231541Srgrimes (error == EHOSTUNREACH || error == ENETUNREACH || 5241541Srgrimes error == EHOSTDOWN)) { 5251541Srgrimes return; 5261541Srgrimes } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && 5271541Srgrimes tp->t_softerror) 5281541Srgrimes so->so_error = error; 5298876Srgrimes else 5301541Srgrimes tp->t_softerror = error; 5311541Srgrimes wakeup((caddr_t) &so->so_timeo); 5321541Srgrimes sorwakeup(so); 5331541Srgrimes sowwakeup(so); 5341541Srgrimes} 5351541Srgrimes 53636079Swollmanstatic int 53736079Swollmantcp_pcblist SYSCTL_HANDLER_ARGS 53836079Swollman{ 53936079Swollman int error, i, n, s; 54036079Swollman struct inpcb *inp, **inp_list; 54136079Swollman inp_gen_t gencnt; 54236079Swollman struct xinpgen xig; 54336079Swollman 54436079Swollman /* 54536079Swollman * The process of preparing the TCB list is too time-consuming and 54636079Swollman * resource-intensive to repeat twice on every request. 54736079Swollman */ 54836079Swollman if (req->oldptr == 0) { 54936079Swollman n = tcbinfo.ipi_count; 55036079Swollman req->oldidx = 2 * (sizeof xig) 55136079Swollman + (n + n/8) * sizeof(struct xtcpcb); 55236079Swollman return 0; 55336079Swollman } 55436079Swollman 55536079Swollman if (req->newptr != 0) 55636079Swollman return EPERM; 55736079Swollman 55836079Swollman /* 55936079Swollman * OK, now we're committed to doing something. 
56036079Swollman */ 56136079Swollman s = splnet(); 56236079Swollman gencnt = tcbinfo.ipi_gencnt; 56336079Swollman n = tcbinfo.ipi_count; 56436079Swollman splx(s); 56536079Swollman 56636079Swollman xig.xig_len = sizeof xig; 56736079Swollman xig.xig_count = n; 56836079Swollman xig.xig_gen = gencnt; 56936079Swollman xig.xig_sogen = so_gencnt; 57036079Swollman error = SYSCTL_OUT(req, &xig, sizeof xig); 57136079Swollman if (error) 57236079Swollman return error; 57336079Swollman 57436079Swollman inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 57536079Swollman if (inp_list == 0) 57636079Swollman return ENOMEM; 57736079Swollman 57836079Swollman s = splnet(); 57936079Swollman for (inp = tcbinfo.listhead->lh_first, i = 0; inp && i < n; 58036079Swollman inp = inp->inp_list.le_next) { 58146155Sphk if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) 58236079Swollman inp_list[i++] = inp; 58336079Swollman } 58436079Swollman splx(s); 58536079Swollman n = i; 58636079Swollman 58736079Swollman error = 0; 58836079Swollman for (i = 0; i < n; i++) { 58936079Swollman inp = inp_list[i]; 59036079Swollman if (inp->inp_gencnt <= gencnt) { 59136079Swollman struct xtcpcb xt; 59247960Stegge caddr_t inp_ppcb; 59336079Swollman xt.xt_len = sizeof xt; 59436079Swollman /* XXX should avoid extra copy */ 59536079Swollman bcopy(inp, &xt.xt_inp, sizeof *inp); 59647960Stegge inp_ppcb = inp->inp_ppcb; 59747960Stegge if (inp_ppcb != NULL) 59847960Stegge bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); 59947960Stegge else 60047960Stegge bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 60136079Swollman if (inp->inp_socket) 60236079Swollman sotoxsocket(inp->inp_socket, &xt.xt_socket); 60336079Swollman error = SYSCTL_OUT(req, &xt, sizeof xt); 60436079Swollman } 60536079Swollman } 60636079Swollman if (!error) { 60736079Swollman /* 60836079Swollman * Give the user an updated idea of our state. 
60936079Swollman * If the generation differs from what we told 61036079Swollman * her before, she knows that something happened 61136079Swollman * while we were processing this request, and it 61236079Swollman * might be necessary to retry. 61336079Swollman */ 61436079Swollman s = splnet(); 61536079Swollman xig.xig_gen = tcbinfo.ipi_gencnt; 61636079Swollman xig.xig_sogen = so_gencnt; 61736079Swollman xig.xig_count = tcbinfo.ipi_count; 61836079Swollman splx(s); 61936079Swollman error = SYSCTL_OUT(req, &xig, sizeof xig); 62036079Swollman } 62136079Swollman free(inp_list, M_TEMP); 62236079Swollman return error; 62336079Swollman} 62436079Swollman 62536079SwollmanSYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, 62636079Swollman tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); 62736079Swollman 62848758Sgreenstatic int 62948758Sgreentcp_getcred SYSCTL_HANDLER_ARGS 63048758Sgreen{ 63148758Sgreen struct sockaddr_in addrs[2]; 63248758Sgreen struct inpcb *inp; 63348758Sgreen int error, s; 63448758Sgreen 63548758Sgreen error = suser(req->p); 63648758Sgreen if (error) 63748758Sgreen return (error); 63848758Sgreen error = SYSCTL_IN(req, addrs, sizeof(addrs)); 63948758Sgreen if (error) 64048758Sgreen return (error); 64148758Sgreen s = splnet(); 64248758Sgreen inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, 64348758Sgreen addrs[0].sin_addr, addrs[0].sin_port, 0); 64448758Sgreen if (inp == NULL || inp->inp_socket == NULL || 64548758Sgreen inp->inp_socket->so_cred == NULL) { 64648758Sgreen error = ENOENT; 64748758Sgreen goto out; 64848758Sgreen } 64948758Sgreen error = SYSCTL_OUT(req, inp->inp_socket->so_cred->pc_ucred, 65048758Sgreen sizeof(struct ucred)); 65148758Sgreenout: 65248758Sgreen splx(s); 65348758Sgreen return (error); 65448758Sgreen} 65548758Sgreen 65648758SgreenSYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 65748758Sgreen 0, 0, tcp_getcred, "S,ucred", "Get the ucred of a TCP connection"); 
65848758Sgreen 6591541Srgrimesvoid 66012881Sbdetcp_ctlinput(cmd, sa, vip) 6611541Srgrimes int cmd; 6621541Srgrimes struct sockaddr *sa; 66312881Sbde void *vip; 6641541Srgrimes{ 66512881Sbde register struct ip *ip = vip; 6661541Srgrimes register struct tcphdr *th; 6671541Srgrimes void (*notify) __P((struct inpcb *, int)) = tcp_notify; 6681541Srgrimes 6691541Srgrimes if (cmd == PRC_QUENCH) 6701541Srgrimes notify = tcp_quench; 67110881Swollman else if (cmd == PRC_MSGSIZE) 67210881Swollman notify = tcp_mtudisc; 6731541Srgrimes else if (!PRC_IS_REDIRECT(cmd) && 6741541Srgrimes ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)) 6751541Srgrimes return; 6761541Srgrimes if (ip) { 67717269Swollman th = (struct tcphdr *)((caddr_t)ip 67817269Swollman + (IP_VHL_HL(ip->ip_vhl) << 2)); 6791541Srgrimes in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport, 6801541Srgrimes cmd, notify); 6811541Srgrimes } else 6821541Srgrimes in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify); 6831541Srgrimes} 6841541Srgrimes 6851541Srgrimes/* 6861541Srgrimes * When a source quench is received, close congestion window 6871541Srgrimes * to one segment. We will gradually open it again as we proceed. 6881541Srgrimes */ 6891541Srgrimesvoid 6901541Srgrimestcp_quench(inp, errno) 6911541Srgrimes struct inpcb *inp; 6921541Srgrimes int errno; 6931541Srgrimes{ 6941541Srgrimes struct tcpcb *tp = intotcpcb(inp); 6951541Srgrimes 6961541Srgrimes if (tp) 6971541Srgrimes tp->snd_cwnd = tp->t_maxseg; 6981541Srgrimes} 6996283Swollman 7006283Swollman/* 70110881Swollman * When `need fragmentation' ICMP is received, update our idea of the MSS 70210881Swollman * based on the new value in the route. Also nudge TCP to send something, 70310881Swollman * since we know the packet we just sent was dropped. 70410930Swollman * This duplicates some code in the tcp_mss() function in tcp_input.c. 
70510881Swollman */ 70611537Swollmanvoid 70710881Swollmantcp_mtudisc(inp, errno) 70810881Swollman struct inpcb *inp; 70910881Swollman int errno; 71010881Swollman{ 71110881Swollman struct tcpcb *tp = intotcpcb(inp); 71210930Swollman struct rtentry *rt; 71310930Swollman struct rmxp_tao *taop; 71410930Swollman struct socket *so = inp->inp_socket; 71510930Swollman int offered; 71610930Swollman int mss; 71710881Swollman 71810930Swollman if (tp) { 71910930Swollman rt = tcp_rtlookup(inp); 72010930Swollman if (!rt || !rt->rt_rmx.rmx_mtu) { 72110930Swollman tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; 72210930Swollman return; 72310930Swollman } 72410930Swollman taop = rmx_taop(rt->rt_rmx); 72510930Swollman offered = taop->tao_mssopt; 72610930Swollman mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); 72712939Swollman if (offered) 72812939Swollman mss = min(mss, offered); 72912939Swollman /* 73012939Swollman * XXX - The above conditional probably violates the TCP 73112939Swollman * spec. The problem is that, since we don't know the 73212939Swollman * other end's MSS, we are supposed to use a conservative 73312939Swollman * default. But, if we do that, then MTU discovery will 73412939Swollman * never actually take place, because the conservative 73512939Swollman * default is much less than the MTUs typically seen 73612939Swollman * on the Internet today. For the moment, we'll sweep 73712939Swollman * this under the carpet. 73812939Swollman * 73912939Swollman * The conservative default might not actually be a problem 74012939Swollman * if the only case this occurs is when sending an initial 74112939Swollman * SYN with options and data to a host we've never talked 74212939Swollman * to before. Then, they will reply with an MSS value which 74312939Swollman * will get recorded and the new parameters should get 74412939Swollman * recomputed. For Further Study. 
74512939Swollman */ 74611415Swollman if (tp->t_maxopd <= mss) 74711415Swollman return; 74810930Swollman tp->t_maxopd = mss; 74910930Swollman 75010930Swollman if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 75110930Swollman (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 75210930Swollman mss -= TCPOLEN_TSTAMP_APPA; 75310930Swollman if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && 75410930Swollman (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) 75510930Swollman mss -= TCPOLEN_CC_APPA; 75610930Swollman#if (MCLBYTES & (MCLBYTES - 1)) == 0 75710930Swollman if (mss > MCLBYTES) 75810930Swollman mss &= ~(MCLBYTES-1); 75910930Swollman#else 76010930Swollman if (mss > MCLBYTES) 76110930Swollman mss = mss / MCLBYTES * MCLBYTES; 76210881Swollman#endif 76310930Swollman if (so->so_snd.sb_hiwat < mss) 76410930Swollman mss = so->so_snd.sb_hiwat; 76510930Swollman 76610930Swollman tp->t_maxseg = mss; 76710930Swollman 76811450Swollman tcpstat.tcps_mturesent++; 76950673Sjlemon tp->t_rtttime = 0; 77011450Swollman tp->snd_nxt = tp->snd_una; 77111450Swollman tcp_output(tp); 77210930Swollman } 77310881Swollman} 77410881Swollman 77510881Swollman/* 7766283Swollman * Look-up the routing entry to the peer of this inpcb. If no route 7776283Swollman * is found and it cannot be allocated the return NULL. This routine 7786283Swollman * is called by TCP routines that access the rmx structure and by tcp_mss 7796283Swollman * to get the interface MTU. 
7806283Swollman */ 7816283Swollmanstruct rtentry * 7826283Swollmantcp_rtlookup(inp) 7836283Swollman struct inpcb *inp; 7846283Swollman{ 7856283Swollman struct route *ro; 7866283Swollman struct rtentry *rt; 7876283Swollman 7886283Swollman ro = &inp->inp_route; 7896283Swollman rt = ro->ro_rt; 7906283Swollman if (rt == NULL || !(rt->rt_flags & RTF_UP)) { 7916283Swollman /* No route yet, so try to acquire one */ 7926283Swollman if (inp->inp_faddr.s_addr != INADDR_ANY) { 7936283Swollman ro->ro_dst.sa_family = AF_INET; 7946283Swollman ro->ro_dst.sa_len = sizeof(ro->ro_dst); 7956283Swollman ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = 7966283Swollman inp->inp_faddr; 7976283Swollman rtalloc(ro); 7986283Swollman rt = ro->ro_rt; 7996283Swollman } 8006283Swollman } 8016283Swollman return rt; 8026283Swollman} 8036283Swollman 8046283Swollman/* 8056283Swollman * Return a pointer to the cached information about the remote host. 8066283Swollman * The cached information is stored in the protocol specific part of 8076283Swollman * the route metrics. 8086283Swollman */ 8096283Swollmanstruct rmxp_tao * 8106283Swollmantcp_gettaocache(inp) 8116283Swollman struct inpcb *inp; 8126283Swollman{ 8136283Swollman struct rtentry *rt = tcp_rtlookup(inp); 8146283Swollman 8156283Swollman /* Make sure this is a host route and is up. */ 8166283Swollman if (rt == NULL || 8176283Swollman (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) 8186283Swollman return NULL; 8196283Swollman 8206283Swollman return rmx_taop(rt->rt_rmx); 8216283Swollman} 8226283Swollman 8236283Swollman/* 8246283Swollman * Clear all the TAO cache entries, called from tcp_init. 8256283Swollman * 8266283Swollman * XXX 8276283Swollman * This routine is just an empty one, because we assume that the routing 8286283Swollman * routing tables are initialized at the same time when TCP, so there is 8296283Swollman * nothing in the cache left over. 
8306283Swollman */ 8316283Swollmanstatic void 83229506Sbdetcp_cleartaocache() 83329506Sbde{ 83429506Sbde} 835