tcp_timewait.c revision 50673
11541Srgrimes/*
211150Swollman * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
31541Srgrimes *	The Regents of the University of California.  All rights reserved.
41541Srgrimes *
51541Srgrimes * Redistribution and use in source and binary forms, with or without
61541Srgrimes * modification, are permitted provided that the following conditions
71541Srgrimes * are met:
81541Srgrimes * 1. Redistributions of source code must retain the above copyright
91541Srgrimes *    notice, this list of conditions and the following disclaimer.
101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
111541Srgrimes *    notice, this list of conditions and the following disclaimer in the
121541Srgrimes *    documentation and/or other materials provided with the distribution.
131541Srgrimes * 3. All advertising materials mentioning features or use of this software
141541Srgrimes *    must display the following acknowledgement:
151541Srgrimes *	This product includes software developed by the University of
161541Srgrimes *	California, Berkeley and its contributors.
171541Srgrimes * 4. Neither the name of the University nor the names of its contributors
181541Srgrimes *    may be used to endorse or promote products derived from this software
191541Srgrimes *    without specific prior written permission.
201541Srgrimes *
211541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
221541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
231541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
241541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
251541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
261541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
271541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
281541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
291541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
301541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
311541Srgrimes * SUCH DAMAGE.
321541Srgrimes *
3311150Swollman *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
3450477Speter * $FreeBSD: head/sys/netinet/tcp_timewait.c 50673 1999-08-30 21:17:07Z jlemon $
351541Srgrimes */
361541Srgrimes
3732752Seivind#include "opt_compat.h"
3829514Sjoerg#include "opt_tcpdebug.h"
3929514Sjoerg
401541Srgrimes#include <sys/param.h>
411541Srgrimes#include <sys/systm.h>
4250673Sjlemon#include <sys/callout.h>
4312172Sphk#include <sys/kernel.h>
4412172Sphk#include <sys/sysctl.h>
451541Srgrimes#include <sys/malloc.h>
461541Srgrimes#include <sys/mbuf.h>
4748758Sgreen#include <sys/proc.h>
481541Srgrimes#include <sys/socket.h>
491541Srgrimes#include <sys/socketvar.h>
501541Srgrimes#include <sys/protosw.h>
5134923Sbde
5234881Swollman#include <vm/vm_zone.h>
531541Srgrimes
541541Srgrimes#include <net/route.h>
551541Srgrimes#include <net/if.h>
561541Srgrimes
5717269Swollman#define _IP_VHL
581541Srgrimes#include <netinet/in.h>
591541Srgrimes#include <netinet/in_systm.h>
601541Srgrimes#include <netinet/ip.h>
611541Srgrimes#include <netinet/in_pcb.h>
627090Sbde#include <netinet/in_var.h>
631541Srgrimes#include <netinet/ip_var.h>
641541Srgrimes#include <netinet/tcp.h>
651541Srgrimes#include <netinet/tcp_fsm.h>
661541Srgrimes#include <netinet/tcp_seq.h>
671541Srgrimes#include <netinet/tcp_timer.h>
681541Srgrimes#include <netinet/tcp_var.h>
691541Srgrimes#include <netinet/tcpip.h>
706283Swollman#ifdef TCPDEBUG
716283Swollman#include <netinet/tcp_debug.h>
726283Swollman#endif
731541Srgrimes
741541Srgrimesint 	tcp_mssdflt = TCP_MSS;
7546381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
7646381Sbillf    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
7712296Sphk
7850673Sjlemon#if 0
7912296Sphkstatic int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
8046381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
8146381Sbillf    &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
8250673Sjlemon#endif
8312296Sphk
8412296Sphkstatic int	tcp_do_rfc1323 = 1;
8546381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
8646381Sbillf    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
8712296Sphk
8838875Sphkstatic int	tcp_do_rfc1644 = 0;
8946381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
9046381Sbillf    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
911541Srgrimes
9250426Sjlemonstatic int	tcp_tcbhashsize = 0;
9350426SjlemonSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
9450426Sjlemon     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
9550426Sjlemon
9646381SbillfSYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
9746381Sbillf    &tcbinfo.ipi_count, 0, "Number of active PCBs");
9836079Swollman
9929506Sbdestatic void	tcp_cleartaocache __P((void));
10012296Sphkstatic void	tcp_notify __P((struct inpcb *, int));
10112296Sphk
1027684Sdg/*
10332821Sdg * Target size of TCP PCB hash tables. Must be a power of two.
10443562Smsmith *
10543562Smsmith * Note that this can be overridden by the kernel environment
10643562Smsmith * variable net.inet.tcp.tcbhashsize
1077684Sdg */
1087684Sdg#ifndef TCBHASHSIZE
10932821Sdg#define TCBHASHSIZE	512
1107684Sdg#endif
1111541Srgrimes
1121541Srgrimes/*
11334881Swollman * This is the actual shape of what we allocate using the zone
11434881Swollman * allocator.  Doing it this way allows us to protect both structures
11534881Swollman * using the same generation count, and also eliminates the overhead
11634881Swollman * of allocating tcpcbs separately.  By hiding the structure here,
11734881Swollman * we avoid changing most of the rest of the code (although it needs
11834881Swollman * to be changed, eventually, for greater efficiency).
11934881Swollman */
12034923Sbde#define	ALIGNMENT	32
12134923Sbde#define	ALIGNM1		(ALIGNMENT - 1)
12234881Swollmanstruct	inp_tp {
12334881Swollman	union {
12434881Swollman		struct	inpcb inp;
12534881Swollman		char	align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
12634881Swollman	} inp_tp_u;
12734881Swollman	struct	tcpcb tcb;
12850673Sjlemon	struct	callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
12950673Sjlemon	struct	callout inp_tp_delack;
13034881Swollman};
13134881Swollman#undef ALIGNMENT
13234881Swollman#undef ALIGNM1
13334881Swollman
13434881Swollman/*
1351541Srgrimes * Tcp initialization
1361541Srgrimes */
1371541Srgrimesvoid
1381541Srgrimestcp_init()
1391541Srgrimes{
14043562Smsmith	int hashsize;
14143562Smsmith
14211150Swollman	tcp_iss = random();	/* wrong, but better than a constant */
1436283Swollman	tcp_ccgen = 1;
1446283Swollman	tcp_cleartaocache();
14550673Sjlemon
14650673Sjlemon	tcp_delacktime = TCPTV_DELACK;
14750673Sjlemon	tcp_keepinit = TCPTV_KEEP_INIT;
14850673Sjlemon	tcp_keepidle = TCPTV_KEEP_IDLE;
14950673Sjlemon	tcp_keepintvl = TCPTV_KEEPINTVL;
15050673Sjlemon	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
15150673Sjlemon	tcp_msl = TCPTV_MSL;
15250673Sjlemon
1537684Sdg	LIST_INIT(&tcb);
1547684Sdg	tcbinfo.listhead = &tcb;
15548578Smsmith	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", TCBHASHSIZE, hashsize);
15643576Smsmith	if (!powerof2(hashsize)) {
15743562Smsmith		printf("WARNING: TCB hash size not a power of 2\n");
15843562Smsmith		hashsize = 512; /* safe default */
15943562Smsmith	}
16050426Sjlemon	tcp_tcbhashsize = hashsize;
16143562Smsmith	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
16243562Smsmith	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
16334923Sbde					&tcbinfo.porthashmask);
16436079Swollman	tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets,
16534881Swollman				 ZONE_INTERRUPT, 0);
16650673Sjlemon
1671541Srgrimes	if (max_protohdr < sizeof(struct tcpiphdr))
1681541Srgrimes		max_protohdr = sizeof(struct tcpiphdr);
1691541Srgrimes	if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN)
1701541Srgrimes		panic("tcp_init");
1711541Srgrimes}
1721541Srgrimes
1731541Srgrimes/*
1741541Srgrimes * Create template to be used to send tcp packets on a connection.
1751541Srgrimes * Call after host entry created, allocates an mbuf and fills
1761541Srgrimes * in a skeletal tcp/ip header, minimizing the amount of work
1771541Srgrimes * necessary when the connection is used.
1781541Srgrimes */
1791541Srgrimesstruct tcpiphdr *
1801541Srgrimestcp_template(tp)
1811541Srgrimes	struct tcpcb *tp;
1821541Srgrimes{
1831541Srgrimes	register struct inpcb *inp = tp->t_inpcb;
1841541Srgrimes	register struct mbuf *m;
1851541Srgrimes	register struct tcpiphdr *n;
1861541Srgrimes
1871541Srgrimes	if ((n = tp->t_template) == 0) {
1881541Srgrimes		m = m_get(M_DONTWAIT, MT_HEADER);
1891541Srgrimes		if (m == NULL)
1901541Srgrimes			return (0);
1911541Srgrimes		m->m_len = sizeof (struct tcpiphdr);
1921541Srgrimes		n = mtod(m, struct tcpiphdr *);
1931541Srgrimes	}
19438513Sdfr	bzero(n->ti_x1, sizeof(n->ti_x1));
1951541Srgrimes	n->ti_pr = IPPROTO_TCP;
1961541Srgrimes	n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip));
1971541Srgrimes	n->ti_src = inp->inp_laddr;
1981541Srgrimes	n->ti_dst = inp->inp_faddr;
1991541Srgrimes	n->ti_sport = inp->inp_lport;
2001541Srgrimes	n->ti_dport = inp->inp_fport;
2011541Srgrimes	n->ti_seq = 0;
2021541Srgrimes	n->ti_ack = 0;
2031541Srgrimes	n->ti_x2 = 0;
2041541Srgrimes	n->ti_off = 5;
2051541Srgrimes	n->ti_flags = 0;
2061541Srgrimes	n->ti_win = 0;
2071541Srgrimes	n->ti_sum = 0;
2081541Srgrimes	n->ti_urp = 0;
2091541Srgrimes	return (n);
2101541Srgrimes}
2111541Srgrimes
2121541Srgrimes/*
2131541Srgrimes * Send a single message to the TCP at address specified by
2141541Srgrimes * the given TCP/IP header.  If m == 0, then we make a copy
2151541Srgrimes * of the tcpiphdr at ti and send directly to the addressed host.
2161541Srgrimes * This is used to force keep alive messages out using the TCP
2171541Srgrimes * template for a connection tp->t_template.  If flags are given
2181541Srgrimes * then we send a message back to the TCP which originated the
2191541Srgrimes * segment ti, and discard the mbuf containing it and any other
2201541Srgrimes * attached mbufs.
2211541Srgrimes *
2221541Srgrimes * In any case the ack and sequence number of the transmitted
2231541Srgrimes * segment are as specified by the parameters.
22431848Sjulian *
22531848Sjulian * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
2261541Srgrimes */
2271541Srgrimesvoid
2281541Srgrimestcp_respond(tp, ti, m, ack, seq, flags)
2291541Srgrimes	struct tcpcb *tp;
2301541Srgrimes	register struct tcpiphdr *ti;
2311541Srgrimes	register struct mbuf *m;
2321541Srgrimes	tcp_seq ack, seq;
2331541Srgrimes	int flags;
2341541Srgrimes{
2351541Srgrimes	register int tlen;
2361541Srgrimes	int win = 0;
2371541Srgrimes	struct route *ro = 0;
23814754Swollman	struct route sro;
2391541Srgrimes
2401541Srgrimes	if (tp) {
24141187Sguido		if (!(flags & TH_RST))
24241187Sguido			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
2431541Srgrimes		ro = &tp->t_inpcb->inp_route;
24414754Swollman	} else {
24514754Swollman		ro = &sro;
24614754Swollman		bzero(ro, sizeof *ro);
2471541Srgrimes	}
2481541Srgrimes	if (m == 0) {
2491541Srgrimes		m = m_gethdr(M_DONTWAIT, MT_HEADER);
2501541Srgrimes		if (m == NULL)
2511541Srgrimes			return;
2521541Srgrimes#ifdef TCP_COMPAT_42
2531541Srgrimes		tlen = 1;
2541541Srgrimes#else
2551541Srgrimes		tlen = 0;
2561541Srgrimes#endif
2571541Srgrimes		m->m_data += max_linkhdr;
2581541Srgrimes		*mtod(m, struct tcpiphdr *) = *ti;
2591541Srgrimes		ti = mtod(m, struct tcpiphdr *);
2601541Srgrimes		flags = TH_ACK;
2611541Srgrimes	} else {
2621541Srgrimes		m_freem(m->m_next);
2631541Srgrimes		m->m_next = 0;
2641541Srgrimes		m->m_data = (caddr_t)ti;
2651541Srgrimes		m->m_len = sizeof (struct tcpiphdr);
2661541Srgrimes		tlen = 0;
2671541Srgrimes#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
26838513Sdfr		xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, n_long);
26938513Sdfr		xchg(ti->ti_dport, ti->ti_sport, n_short);
2701541Srgrimes#undef xchg
2711541Srgrimes	}
2721541Srgrimes	ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen));
2731541Srgrimes	tlen += sizeof (struct tcpiphdr);
2741541Srgrimes	m->m_len = tlen;
2751541Srgrimes	m->m_pkthdr.len = tlen;
2761541Srgrimes	m->m_pkthdr.rcvif = (struct ifnet *) 0;
27738513Sdfr	bzero(ti->ti_x1, sizeof(ti->ti_x1));
2781541Srgrimes	ti->ti_seq = htonl(seq);
2791541Srgrimes	ti->ti_ack = htonl(ack);
2801541Srgrimes	ti->ti_x2 = 0;
2811541Srgrimes	ti->ti_off = sizeof (struct tcphdr) >> 2;
2821541Srgrimes	ti->ti_flags = flags;
2831541Srgrimes	if (tp)
2841541Srgrimes		ti->ti_win = htons((u_short) (win >> tp->rcv_scale));
2851541Srgrimes	else
2861541Srgrimes		ti->ti_win = htons((u_short)win);
2871541Srgrimes	ti->ti_urp = 0;
2881541Srgrimes	ti->ti_sum = 0;
2891541Srgrimes	ti->ti_sum = in_cksum(m, tlen);
2901541Srgrimes	((struct ip *)ti)->ip_len = tlen;
2911541Srgrimes	((struct ip *)ti)->ip_ttl = ip_defttl;
2926283Swollman#ifdef TCPDEBUG
2936283Swollman	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2946283Swollman		tcp_trace(TA_OUTPUT, 0, tp, ti, 0);
2956283Swollman#endif
2961541Srgrimes	(void) ip_output(m, NULL, ro, 0, NULL);
29714841Swollman	if (ro == &sro && ro->ro_rt) {
29814754Swollman		RTFREE(ro->ro_rt);
29914754Swollman	}
3001541Srgrimes}
3011541Srgrimes
3021541Srgrimes/*
3031541Srgrimes * Create a new TCP control block, making an
3041541Srgrimes * empty reassembly queue and hooking it to the argument
30534881Swollman * protocol control block.  The `inp' parameter must have
30634881Swollman * come from the zone allocator set up in tcp_init().
3071541Srgrimes */
3081541Srgrimesstruct tcpcb *
3091541Srgrimestcp_newtcpcb(inp)
3101541Srgrimes	struct inpcb *inp;
3111541Srgrimes{
31234923Sbde	struct inp_tp *it;
3131541Srgrimes	register struct tcpcb *tp;
3141541Srgrimes
31534881Swollman	it = (struct inp_tp *)inp;
31634881Swollman	tp = &it->tcb;
3171541Srgrimes	bzero((char *) tp, sizeof(struct tcpcb));
31838513Sdfr	tp->t_segq = NULL;
3196283Swollman	tp->t_maxseg = tp->t_maxopd = tcp_mssdflt;
3201541Srgrimes
32150673Sjlemon	/* Set up our timeouts. */
32250673Sjlemon	callout_init(tp->tt_rexmt = &it->inp_tp_rexmt);
32350673Sjlemon	callout_init(tp->tt_persist = &it->inp_tp_persist);
32450673Sjlemon	callout_init(tp->tt_keep = &it->inp_tp_keep);
32550673Sjlemon	callout_init(tp->tt_2msl = &it->inp_tp_2msl);
32650673Sjlemon	callout_init(tp->tt_delack = &it->inp_tp_delack);
32750673Sjlemon
3286283Swollman	if (tcp_do_rfc1323)
3296283Swollman		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
3306283Swollman	if (tcp_do_rfc1644)
3316283Swollman		tp->t_flags |= TF_REQ_CC;
33234881Swollman	tp->t_inpcb = inp;	/* XXX */
3331541Srgrimes	/*
3341541Srgrimes	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
33516367Swollman	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
3361541Srgrimes	 * reasonable initial retransmit time.
3371541Srgrimes	 */
3381541Srgrimes	tp->t_srtt = TCPTV_SRTTBASE;
33916367Swollman	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
3401541Srgrimes	tp->t_rttmin = TCPTV_MIN;
34116367Swollman	tp->t_rxtcur = TCPTV_RTOBASE;
3421541Srgrimes	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
3431541Srgrimes	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
34450673Sjlemon	tp->t_rcvtime = ticks;
34524570Sdg	inp->inp_ip_ttl = ip_defttl;
3461541Srgrimes	inp->inp_ppcb = (caddr_t)tp;
34734881Swollman	return (tp);		/* XXX */
3481541Srgrimes}
3491541Srgrimes
3501541Srgrimes/*
3511541Srgrimes * Drop a TCP connection, reporting
3521541Srgrimes * the specified error.  If connection is synchronized,
3531541Srgrimes * then send a RST to peer.
3541541Srgrimes */
3551541Srgrimesstruct tcpcb *
3561541Srgrimestcp_drop(tp, errno)
3571541Srgrimes	register struct tcpcb *tp;
3581541Srgrimes	int errno;
3591541Srgrimes{
3601541Srgrimes	struct socket *so = tp->t_inpcb->inp_socket;
3611541Srgrimes
3621541Srgrimes	if (TCPS_HAVERCVDSYN(tp->t_state)) {
3631541Srgrimes		tp->t_state = TCPS_CLOSED;
3641541Srgrimes		(void) tcp_output(tp);
3651541Srgrimes		tcpstat.tcps_drops++;
3661541Srgrimes	} else
3671541Srgrimes		tcpstat.tcps_conndrops++;
3681541Srgrimes	if (errno == ETIMEDOUT && tp->t_softerror)
3691541Srgrimes		errno = tp->t_softerror;
3701541Srgrimes	so->so_error = errno;
3711541Srgrimes	return (tcp_close(tp));
3721541Srgrimes}
3731541Srgrimes
3741541Srgrimes/*
3751541Srgrimes * Close a TCP control block:
3761541Srgrimes *	discard all space held by the tcp
3771541Srgrimes *	discard internet protocol block
3781541Srgrimes *	wake up any sleepers
3791541Srgrimes */
3801541Srgrimesstruct tcpcb *
3811541Srgrimestcp_close(tp)
3821541Srgrimes	register struct tcpcb *tp;
3831541Srgrimes{
38438513Sdfr	register struct mbuf *q;
38538513Sdfr	register struct mbuf *nq;
3861541Srgrimes	struct inpcb *inp = tp->t_inpcb;
3871541Srgrimes	struct socket *so = inp->inp_socket;
3881541Srgrimes	register struct rtentry *rt;
38922719Swollman	int dosavessthresh;
3901541Srgrimes
3911541Srgrimes	/*
39250673Sjlemon	 * Make sure that all of our timers are stopped before we
39350673Sjlemon	 * delete the PCB.
39450673Sjlemon	 */
39550673Sjlemon	callout_stop(tp->tt_rexmt);
39650673Sjlemon	callout_stop(tp->tt_persist);
39750673Sjlemon	callout_stop(tp->tt_keep);
39850673Sjlemon	callout_stop(tp->tt_2msl);
39950673Sjlemon	callout_stop(tp->tt_delack);
40050673Sjlemon
40150673Sjlemon	/*
4029373Swollman	 * If we got enough samples through the srtt filter,
4039373Swollman	 * save the rtt and rttvar in the routing entry.
4049373Swollman	 * 'Enough' is arbitrarily defined as the 16 samples.
4059373Swollman	 * 16 samples is enough for the srtt filter to converge
4069373Swollman	 * to within 5% of the correct value; fewer samples and
4079373Swollman	 * we could save a very bogus rtt.
4081541Srgrimes	 *
4091541Srgrimes	 * Don't update the default route's characteristics and don't
4101541Srgrimes	 * update anything that the user "locked".
4111541Srgrimes	 */
4129373Swollman	if (tp->t_rttupdated >= 16 &&
4131541Srgrimes	    (rt = inp->inp_route.ro_rt) &&
4141541Srgrimes	    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) {
4151549Srgrimes		register u_long i = 0;
4161541Srgrimes
4171541Srgrimes		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
4181541Srgrimes			i = tp->t_srtt *
41950673Sjlemon			    (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
4201541Srgrimes			if (rt->rt_rmx.rmx_rtt && i)
4211541Srgrimes				/*
4221541Srgrimes				 * filter this update to half the old & half
4231541Srgrimes				 * the new values, converting scale.
4241541Srgrimes				 * See route.h and tcp_var.h for a
4251541Srgrimes				 * description of the scaling constants.
4261541Srgrimes				 */
4271541Srgrimes				rt->rt_rmx.rmx_rtt =
4281541Srgrimes				    (rt->rt_rmx.rmx_rtt + i) / 2;
4291541Srgrimes			else
4301541Srgrimes				rt->rt_rmx.rmx_rtt = i;
4319263Swollman			tcpstat.tcps_cachedrtt++;
4321541Srgrimes		}
4331541Srgrimes		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
4341541Srgrimes			i = tp->t_rttvar *
43550673Sjlemon			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
4361541Srgrimes			if (rt->rt_rmx.rmx_rttvar && i)
4371541Srgrimes				rt->rt_rmx.rmx_rttvar =
4381541Srgrimes				    (rt->rt_rmx.rmx_rttvar + i) / 2;
4391541Srgrimes			else
4401541Srgrimes				rt->rt_rmx.rmx_rttvar = i;
4419263Swollman			tcpstat.tcps_cachedrttvar++;
4421541Srgrimes		}
4431541Srgrimes		/*
44422719Swollman		 * The old comment here said:
4451541Srgrimes		 * update the pipelimit (ssthresh) if it has been updated
4461541Srgrimes		 * already or if a pipesize was specified & the threshhold
4471541Srgrimes		 * got below half the pipesize.  I.e., wait for bad news
4481541Srgrimes		 * before we start updating, then update on both good
4491541Srgrimes		 * and bad news.
45022719Swollman		 *
45122719Swollman		 * But we want to save the ssthresh even if no pipesize is
45222719Swollman		 * specified explicitly in the route, because such
45322719Swollman		 * connections still have an implicit pipesize specified
45422719Swollman		 * by the global tcp_sendspace.  In the absence of a reliable
45522719Swollman		 * way to calculate the pipesize, it will have to do.
4561541Srgrimes		 */
45722719Swollman		i = tp->snd_ssthresh;
45822719Swollman		if (rt->rt_rmx.rmx_sendpipe != 0)
45922719Swollman			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
46022719Swollman		else
46122719Swollman			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
4623444Sphk		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
46322719Swollman		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
46422719Swollman		    || dosavessthresh) {
4651541Srgrimes			/*
4661541Srgrimes			 * convert the limit from user data bytes to
4671541Srgrimes			 * packets then to packet data bytes.
4681541Srgrimes			 */
4691541Srgrimes			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
4701541Srgrimes			if (i < 2)
4711541Srgrimes				i = 2;
4721541Srgrimes			i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr));
4731541Srgrimes			if (rt->rt_rmx.rmx_ssthresh)
4741541Srgrimes				rt->rt_rmx.rmx_ssthresh =
4751541Srgrimes				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
4761541Srgrimes			else
4771541Srgrimes				rt->rt_rmx.rmx_ssthresh = i;
4789263Swollman			tcpstat.tcps_cachedssthresh++;
4791541Srgrimes		}
4801541Srgrimes	}
4811541Srgrimes	/* free the reassembly queue, if any */
48238513Sdfr	for (q = tp->t_segq; q; q = nq) {
48338513Sdfr		nq = q->m_nextpkt;
48438513Sdfr		tp->t_segq = nq;
48538513Sdfr		m_freem(q);
4861541Srgrimes	}
4871541Srgrimes	if (tp->t_template)
4881541Srgrimes		(void) m_free(dtom(tp->t_template));
48932821Sdg	inp->inp_ppcb = NULL;
4901541Srgrimes	soisdisconnected(so);
4911541Srgrimes	in_pcbdetach(inp);
4921541Srgrimes	tcpstat.tcps_closed++;
4931541Srgrimes	return ((struct tcpcb *)0);
4941541Srgrimes}
4951541Srgrimes
/*
 * Drain hook for TCP.  Intentionally empty: nothing is released here.
 */
void
tcp_drain()
{
	/* nothing to do */
}
5011541Srgrimes
5021541Srgrimes/*
5031541Srgrimes * Notify a tcp user of an asynchronous error;
5041541Srgrimes * store error as soft error, but wake up user
5051541Srgrimes * (for now, won't do anything until can select for soft error).
5061541Srgrimes */
50712296Sphkstatic void
5081541Srgrimestcp_notify(inp, error)
5091541Srgrimes	struct inpcb *inp;
5101541Srgrimes	int error;
5111541Srgrimes{
5121541Srgrimes	register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
5131541Srgrimes	register struct socket *so = inp->inp_socket;
5141541Srgrimes
5151541Srgrimes	/*
5161541Srgrimes	 * Ignore some errors if we are hooked up.
5171541Srgrimes	 * If connection hasn't completed, has retransmitted several times,
5181541Srgrimes	 * and receives a second error, give up now.  This is better
5191541Srgrimes	 * than waiting a long time to establish a connection that
5201541Srgrimes	 * can never complete.
5211541Srgrimes	 */
5221541Srgrimes	if (tp->t_state == TCPS_ESTABLISHED &&
5231541Srgrimes	     (error == EHOSTUNREACH || error == ENETUNREACH ||
5241541Srgrimes	      error == EHOSTDOWN)) {
5251541Srgrimes		return;
5261541Srgrimes	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
5271541Srgrimes	    tp->t_softerror)
5281541Srgrimes		so->so_error = error;
5298876Srgrimes	else
5301541Srgrimes		tp->t_softerror = error;
5311541Srgrimes	wakeup((caddr_t) &so->so_timeo);
5321541Srgrimes	sorwakeup(so);
5331541Srgrimes	sowwakeup(so);
5341541Srgrimes}
5351541Srgrimes
53636079Swollmanstatic int
53736079Swollmantcp_pcblist SYSCTL_HANDLER_ARGS
53836079Swollman{
53936079Swollman	int error, i, n, s;
54036079Swollman	struct inpcb *inp, **inp_list;
54136079Swollman	inp_gen_t gencnt;
54236079Swollman	struct xinpgen xig;
54336079Swollman
54436079Swollman	/*
54536079Swollman	 * The process of preparing the TCB list is too time-consuming and
54636079Swollman	 * resource-intensive to repeat twice on every request.
54736079Swollman	 */
54836079Swollman	if (req->oldptr == 0) {
54936079Swollman		n = tcbinfo.ipi_count;
55036079Swollman		req->oldidx = 2 * (sizeof xig)
55136079Swollman			+ (n + n/8) * sizeof(struct xtcpcb);
55236079Swollman		return 0;
55336079Swollman	}
55436079Swollman
55536079Swollman	if (req->newptr != 0)
55636079Swollman		return EPERM;
55736079Swollman
55836079Swollman	/*
55936079Swollman	 * OK, now we're committed to doing something.
56036079Swollman	 */
56136079Swollman	s = splnet();
56236079Swollman	gencnt = tcbinfo.ipi_gencnt;
56336079Swollman	n = tcbinfo.ipi_count;
56436079Swollman	splx(s);
56536079Swollman
56636079Swollman	xig.xig_len = sizeof xig;
56736079Swollman	xig.xig_count = n;
56836079Swollman	xig.xig_gen = gencnt;
56936079Swollman	xig.xig_sogen = so_gencnt;
57036079Swollman	error = SYSCTL_OUT(req, &xig, sizeof xig);
57136079Swollman	if (error)
57236079Swollman		return error;
57336079Swollman
57436079Swollman	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
57536079Swollman	if (inp_list == 0)
57636079Swollman		return ENOMEM;
57736079Swollman
57836079Swollman	s = splnet();
57936079Swollman	for (inp = tcbinfo.listhead->lh_first, i = 0; inp && i < n;
58036079Swollman	     inp = inp->inp_list.le_next) {
58146155Sphk		if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
58236079Swollman			inp_list[i++] = inp;
58336079Swollman	}
58436079Swollman	splx(s);
58536079Swollman	n = i;
58636079Swollman
58736079Swollman	error = 0;
58836079Swollman	for (i = 0; i < n; i++) {
58936079Swollman		inp = inp_list[i];
59036079Swollman		if (inp->inp_gencnt <= gencnt) {
59136079Swollman			struct xtcpcb xt;
59247960Stegge			caddr_t inp_ppcb;
59336079Swollman			xt.xt_len = sizeof xt;
59436079Swollman			/* XXX should avoid extra copy */
59536079Swollman			bcopy(inp, &xt.xt_inp, sizeof *inp);
59647960Stegge			inp_ppcb = inp->inp_ppcb;
59747960Stegge			if (inp_ppcb != NULL)
59847960Stegge				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
59947960Stegge			else
60047960Stegge				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
60136079Swollman			if (inp->inp_socket)
60236079Swollman				sotoxsocket(inp->inp_socket, &xt.xt_socket);
60336079Swollman			error = SYSCTL_OUT(req, &xt, sizeof xt);
60436079Swollman		}
60536079Swollman	}
60636079Swollman	if (!error) {
60736079Swollman		/*
60836079Swollman		 * Give the user an updated idea of our state.
60936079Swollman		 * If the generation differs from what we told
61036079Swollman		 * her before, she knows that something happened
61136079Swollman		 * while we were processing this request, and it
61236079Swollman		 * might be necessary to retry.
61336079Swollman		 */
61436079Swollman		s = splnet();
61536079Swollman		xig.xig_gen = tcbinfo.ipi_gencnt;
61636079Swollman		xig.xig_sogen = so_gencnt;
61736079Swollman		xig.xig_count = tcbinfo.ipi_count;
61836079Swollman		splx(s);
61936079Swollman		error = SYSCTL_OUT(req, &xig, sizeof xig);
62036079Swollman	}
62136079Swollman	free(inp_list, M_TEMP);
62236079Swollman	return error;
62336079Swollman}
62436079Swollman
62536079SwollmanSYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
62636079Swollman	    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
62736079Swollman
62848758Sgreenstatic int
62948758Sgreentcp_getcred SYSCTL_HANDLER_ARGS
63048758Sgreen{
63148758Sgreen	struct sockaddr_in addrs[2];
63248758Sgreen	struct inpcb *inp;
63348758Sgreen	int error, s;
63448758Sgreen
63548758Sgreen	error = suser(req->p);
63648758Sgreen	if (error)
63748758Sgreen		return (error);
63848758Sgreen	error = SYSCTL_IN(req, addrs, sizeof(addrs));
63948758Sgreen	if (error)
64048758Sgreen		return (error);
64148758Sgreen	s = splnet();
64248758Sgreen	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
64348758Sgreen	    addrs[0].sin_addr, addrs[0].sin_port, 0);
64448758Sgreen	if (inp == NULL || inp->inp_socket == NULL ||
64548758Sgreen	    inp->inp_socket->so_cred == NULL) {
64648758Sgreen		error = ENOENT;
64748758Sgreen		goto out;
64848758Sgreen	}
64948758Sgreen	error = SYSCTL_OUT(req, inp->inp_socket->so_cred->pc_ucred,
65048758Sgreen	    sizeof(struct ucred));
65148758Sgreenout:
65248758Sgreen	splx(s);
65348758Sgreen	return (error);
65448758Sgreen}
65548758Sgreen
65648758SgreenSYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
65748758Sgreen    0, 0, tcp_getcred, "S,ucred", "Get the ucred of a TCP connection");
65848758Sgreen
6591541Srgrimesvoid
66012881Sbdetcp_ctlinput(cmd, sa, vip)
6611541Srgrimes	int cmd;
6621541Srgrimes	struct sockaddr *sa;
66312881Sbde	void *vip;
6641541Srgrimes{
66512881Sbde	register struct ip *ip = vip;
6661541Srgrimes	register struct tcphdr *th;
6671541Srgrimes	void (*notify) __P((struct inpcb *, int)) = tcp_notify;
6681541Srgrimes
6691541Srgrimes	if (cmd == PRC_QUENCH)
6701541Srgrimes		notify = tcp_quench;
67110881Swollman	else if (cmd == PRC_MSGSIZE)
67210881Swollman		notify = tcp_mtudisc;
6731541Srgrimes	else if (!PRC_IS_REDIRECT(cmd) &&
6741541Srgrimes		 ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0))
6751541Srgrimes		return;
6761541Srgrimes	if (ip) {
67717269Swollman		th = (struct tcphdr *)((caddr_t)ip
67817269Swollman				       + (IP_VHL_HL(ip->ip_vhl) << 2));
6791541Srgrimes		in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport,
6801541Srgrimes			cmd, notify);
6811541Srgrimes	} else
6821541Srgrimes		in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify);
6831541Srgrimes}
6841541Srgrimes
6851541Srgrimes/*
6861541Srgrimes * When a source quench is received, close congestion window
6871541Srgrimes * to one segment.  We will gradually open it again as we proceed.
6881541Srgrimes */
6891541Srgrimesvoid
6901541Srgrimestcp_quench(inp, errno)
6911541Srgrimes	struct inpcb *inp;
6921541Srgrimes	int errno;
6931541Srgrimes{
6941541Srgrimes	struct tcpcb *tp = intotcpcb(inp);
6951541Srgrimes
6961541Srgrimes	if (tp)
6971541Srgrimes		tp->snd_cwnd = tp->t_maxseg;
6981541Srgrimes}
6996283Swollman
7006283Swollman/*
70110881Swollman * When `need fragmentation' ICMP is received, update our idea of the MSS
70210881Swollman * based on the new value in the route.  Also nudge TCP to send something,
70310881Swollman * since we know the packet we just sent was dropped.
70410930Swollman * This duplicates some code in the tcp_mss() function in tcp_input.c.
70510881Swollman */
70611537Swollmanvoid
70710881Swollmantcp_mtudisc(inp, errno)
70810881Swollman	struct inpcb *inp;
70910881Swollman	int errno;
71010881Swollman{
71110881Swollman	struct tcpcb *tp = intotcpcb(inp);
71210930Swollman	struct rtentry *rt;
71310930Swollman	struct rmxp_tao *taop;
71410930Swollman	struct socket *so = inp->inp_socket;
71510930Swollman	int offered;
71610930Swollman	int mss;
71710881Swollman
71810930Swollman	if (tp) {
71910930Swollman		rt = tcp_rtlookup(inp);
72010930Swollman		if (!rt || !rt->rt_rmx.rmx_mtu) {
72110930Swollman			tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
72210930Swollman			return;
72310930Swollman		}
72410930Swollman		taop = rmx_taop(rt->rt_rmx);
72510930Swollman		offered = taop->tao_mssopt;
72610930Swollman		mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr);
72712939Swollman		if (offered)
72812939Swollman			mss = min(mss, offered);
72912939Swollman		/*
73012939Swollman		 * XXX - The above conditional probably violates the TCP
73112939Swollman		 * spec.  The problem is that, since we don't know the
73212939Swollman		 * other end's MSS, we are supposed to use a conservative
73312939Swollman		 * default.  But, if we do that, then MTU discovery will
73412939Swollman		 * never actually take place, because the conservative
73512939Swollman		 * default is much less than the MTUs typically seen
73612939Swollman		 * on the Internet today.  For the moment, we'll sweep
73712939Swollman		 * this under the carpet.
73812939Swollman		 *
73912939Swollman		 * The conservative default might not actually be a problem
74012939Swollman		 * if the only case this occurs is when sending an initial
74112939Swollman		 * SYN with options and data to a host we've never talked
74212939Swollman		 * to before.  Then, they will reply with an MSS value which
74312939Swollman		 * will get recorded and the new parameters should get
74412939Swollman		 * recomputed.  For Further Study.
74512939Swollman		 */
74611415Swollman		if (tp->t_maxopd <= mss)
74711415Swollman			return;
74810930Swollman		tp->t_maxopd = mss;
74910930Swollman
75010930Swollman		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
75110930Swollman		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
75210930Swollman			mss -= TCPOLEN_TSTAMP_APPA;
75310930Swollman		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
75410930Swollman		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
75510930Swollman			mss -= TCPOLEN_CC_APPA;
75610930Swollman#if	(MCLBYTES & (MCLBYTES - 1)) == 0
75710930Swollman		if (mss > MCLBYTES)
75810930Swollman			mss &= ~(MCLBYTES-1);
75910930Swollman#else
76010930Swollman		if (mss > MCLBYTES)
76110930Swollman			mss = mss / MCLBYTES * MCLBYTES;
76210881Swollman#endif
76310930Swollman		if (so->so_snd.sb_hiwat < mss)
76410930Swollman			mss = so->so_snd.sb_hiwat;
76510930Swollman
76610930Swollman		tp->t_maxseg = mss;
76710930Swollman
76811450Swollman		tcpstat.tcps_mturesent++;
76950673Sjlemon		tp->t_rtttime = 0;
77011450Swollman		tp->snd_nxt = tp->snd_una;
77111450Swollman		tcp_output(tp);
77210930Swollman	}
77310881Swollman}
77410881Swollman
77510881Swollman/*
7766283Swollman * Look-up the routing entry to the peer of this inpcb.  If no route
7776283Swollman * is found and it cannot be allocated the return NULL.  This routine
7786283Swollman * is called by TCP routines that access the rmx structure and by tcp_mss
7796283Swollman * to get the interface MTU.
7806283Swollman */
7816283Swollmanstruct rtentry *
7826283Swollmantcp_rtlookup(inp)
7836283Swollman	struct inpcb *inp;
7846283Swollman{
7856283Swollman	struct route *ro;
7866283Swollman	struct rtentry *rt;
7876283Swollman
7886283Swollman	ro = &inp->inp_route;
7896283Swollman	rt = ro->ro_rt;
7906283Swollman	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
7916283Swollman		/* No route yet, so try to acquire one */
7926283Swollman		if (inp->inp_faddr.s_addr != INADDR_ANY) {
7936283Swollman			ro->ro_dst.sa_family = AF_INET;
7946283Swollman			ro->ro_dst.sa_len = sizeof(ro->ro_dst);
7956283Swollman			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
7966283Swollman				inp->inp_faddr;
7976283Swollman			rtalloc(ro);
7986283Swollman			rt = ro->ro_rt;
7996283Swollman		}
8006283Swollman	}
8016283Swollman	return rt;
8026283Swollman}
8036283Swollman
8046283Swollman/*
8056283Swollman * Return a pointer to the cached information about the remote host.
8066283Swollman * The cached information is stored in the protocol specific part of
8076283Swollman * the route metrics.
8086283Swollman */
8096283Swollmanstruct rmxp_tao *
8106283Swollmantcp_gettaocache(inp)
8116283Swollman	struct inpcb *inp;
8126283Swollman{
8136283Swollman	struct rtentry *rt = tcp_rtlookup(inp);
8146283Swollman
8156283Swollman	/* Make sure this is a host route and is up. */
8166283Swollman	if (rt == NULL ||
8176283Swollman	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
8186283Swollman		return NULL;
8196283Swollman
8206283Swollman	return rmx_taop(rt->rt_rmx);
8216283Swollman}
8226283Swollman
/*
 * Clear all the TAO cache entries; called from tcp_init.
 *
 * XXX
 * This routine is deliberately empty: the routing tables are assumed
 * to be initialized at the same time as TCP, so nothing can be left
 * over in the cache.
 */
static void
tcp_cleartaocache()
{
	/* nothing to clear */
}
835