tcp_timewait.c revision 47960
1139823Simp/*
211150Swollman * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
31541Srgrimes *	The Regents of the University of California.  All rights reserved.
41541Srgrimes *
51541Srgrimes * Redistribution and use in source and binary forms, with or without
61541Srgrimes * modification, are permitted provided that the following conditions
71541Srgrimes * are met:
81541Srgrimes * 1. Redistributions of source code must retain the above copyright
91541Srgrimes *    notice, this list of conditions and the following disclaimer.
101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
111541Srgrimes *    notice, this list of conditions and the following disclaimer in the
121541Srgrimes *    documentation and/or other materials provided with the distribution.
131541Srgrimes * 3. All advertising materials mentioning features or use of this software
141541Srgrimes *    must display the following acknowledgement:
151541Srgrimes *	This product includes software developed by the University of
161541Srgrimes *	California, Berkeley and its contributors.
171541Srgrimes * 4. Neither the name of the University nor the names of its contributors
181541Srgrimes *    may be used to endorse or promote products derived from this software
191541Srgrimes *    without specific prior written permission.
201541Srgrimes *
211541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
221541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
231541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
241541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
251541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
261541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
271541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
281541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2911150Swollman * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
301541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
311541Srgrimes * SUCH DAMAGE.
32172467Ssilby *
33172467Ssilby *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
34172467Ssilby *	$Id: tcp_subr.c,v 1.54 1999/05/03 23:57:31 billf Exp $
3555679Sshin */
3629514Sjoerg
3729514Sjoerg#include "opt_compat.h"
381541Srgrimes#include "opt_tcpdebug.h"
3912172Sphk
40102967Sbde#include <sys/param.h>
4178642Ssilby#include <sys/systm.h>
42102967Sbde#include <sys/kernel.h>
43102967Sbde#include <sys/sysctl.h>
44205391Skmacy#include <sys/malloc.h>
451541Srgrimes#include <sys/mbuf.h>
461541Srgrimes#include <sys/socket.h>
47102967Sbde#include <sys/socketvar.h>
48102967Sbde#include <sys/protosw.h>
491541Srgrimes
50185571Sbz#include <vm/vm_zone.h>
511541Srgrimes
52196019Srwatson#include <net/route.h>
531541Srgrimes#include <net/if.h>
54215166Slstewart
551541Srgrimes#define _IP_VHL
56102967Sbde#include <netinet/in.h>
571541Srgrimes#include <netinet/in_systm.h>
5855679Sshin#include <netinet/ip.h>
5955679Sshin#include <netinet/in_pcb.h>
6055679Sshin#include <netinet/in_var.h>
611541Srgrimes#include <netinet/ip_var.h>
621541Srgrimes#include <netinet/tcp.h>
631541Srgrimes#include <netinet/tcp_fsm.h>
641541Srgrimes#include <netinet/tcp_seq.h>
651541Srgrimes#include <netinet/tcp_timer.h>
6617138Sdg#include <netinet/tcp_var.h>
6717138Sdg#include <netinet/tcpip.h>
6817138Sdg#ifdef TCPDEBUG
691541Srgrimes#include <netinet/tcp_debug.h>
7050673Sjlemon#endif
7150682Sjlemon
72180631Strhodesint 	tcp_mssdflt = TCP_MSS;
7318280SpstSYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
7450673Sjlemon    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
7550682Sjlemon
76180631Strhodesstatic int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
7712172SphkSYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
7850673Sjlemon    &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
7950682Sjlemon
80180631Strhodesstatic int	tcp_do_rfc1323 = 1;
8112172SphkSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
8250673Sjlemon    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
83167721Sandre
84167721Sandrestatic int	tcp_do_rfc1644 = 0;
8550682SjlemonSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
86133874Srwatson    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
8750673Sjlemon
8850682SjlemonSYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
8950682Sjlemon    &tcbinfo.ipi_count, 0, "Number of active PCBs");
9050673Sjlemon
91100335Sdillonstatic void	tcp_cleartaocache __P((void));
92100335Sdillonstatic void	tcp_notify __P((struct inpcb *, int));
93167721Sandre
94167721Sandre/*
95100335Sdillon * Target size of TCP PCB hash tables. Must be a power of two.
96100335Sdillon *
97100335Sdillon * Note that this can be overridden by the kernel environment
98167721Sandre * variable net.inet.tcp.tcbhashsize
99167721Sandre */
100100335Sdillon#ifndef TCBHASHSIZE
10187499Srwatson#define TCBHASHSIZE	512
102133874Srwatson#endif
10346381Sbillf
10415039Sphk/*
105167036Smohans * This is the actual shape of what we allocate using the zone
106167036Smohans * allocator.  Doing it this way allows us to protect both structures
107167721Sandre * using the same generation count, and also eliminates the overhead
108167721Sandre * of allocating tcpcbs separately.  By hiding the structure here,
109167036Smohans * we avoid changing most of the rest of the code (although it needs
110167036Smohans * to be changed, eventually, for greater efficiency).
111167036Smohans */
112167721Sandre#define	ALIGNMENT	32
113167036Smohans#define	ALIGNM1		(ALIGNMENT - 1)
114167036Smohansstruct	inp_tp {
11512296Sphk	union {
11612296Sphk		struct	inpcb inp;
11750673Sjlemon		char	align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
11812296Sphk	} inp_tp_u;
1191541Srgrimes	struct	tcpcb tcb;
12011150Swollman};
121205391Skmacy#undef ALIGNMENT
122205391Skmacy#undef ALIGNM1
123205391Skmacy
124205391Skmacy/*
125205391Skmacy * Tcp initialization
126205391Skmacy */
127205391Skmacyvoid
1281541Srgrimestcp_init()
1291541Srgrimes{
13050673Sjlemon	int hashsize;
1311541Srgrimes
1321541Srgrimes	tcp_iss = random();	/* wrong, but better than a constant */
1331541Srgrimes	tcp_ccgen = 1;
134172309Ssilby	tcp_cleartaocache();
1351541Srgrimes	LIST_INIT(&tcb);
136183550Szec	tcbinfo.listhead = &tcb;
1371541Srgrimes	if (!(getenv_int("net.inet.tcp.tcbhashsize", &hashsize)))
138195760Srwatson		hashsize = TCBHASHSIZE;
139183550Szec	if (!powerof2(hashsize)) {
140183550Szec		printf("WARNING: TCB hash size not a power of 2\n");
141183550Szec		hashsize = 512; /* safe default */
142183550Szec	}
143183550Szec	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
144183550Szec	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
145183550Szec					&tcbinfo.porthashmask);
146183550Szec	tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets,
147195760Srwatson				 ZONE_INTERRUPT, 0);
1481541Srgrimes	if (max_protohdr < sizeof(struct tcpiphdr))
1491541Srgrimes		max_protohdr = sizeof(struct tcpiphdr);
15073110Sjlemon	if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN)
15173110Sjlemon		panic("tcp_init");
15273110Sjlemon}
1531541Srgrimes
154115824Shsu/*
1551541Srgrimes * Create template to be used to send tcp packets on a connection.
156115824Shsu * Call after host entry created, allocates an mbuf and fills
15711150Swollman * in a skeletal tcp/ip header, minimizing the amount of work
158157376Srwatson * necessary when the connection is used.
159157376Srwatson */
160157376Srwatsonstruct tcpiphdr *
161157376Srwatsontcp_template(tp)
162172074Srwatson	struct tcpcb *tp;
163172074Srwatson{
164172074Srwatson	register struct inpcb *inp = tp->t_inpcb;
165172074Srwatson	register struct mbuf *m;
16650673Sjlemon	register struct tcpiphdr *n;
167172074Srwatson
1681541Srgrimes	if ((n = tp->t_template) == 0) {
169172074Srwatson		m = m_get(M_DONTWAIT, MT_HEADER);
170172074Srwatson		if (m == NULL)
171183550Szec			return (0);
1721541Srgrimes		m->m_len = sizeof (struct tcpiphdr);
173172074Srwatson		n = mtod(m, struct tcpiphdr *);
174157376Srwatson	}
175172074Srwatson	bzero(n->ti_x1, sizeof(n->ti_x1));
176172074Srwatson	n->ti_pr = IPPROTO_TCP;
177172074Srwatson	n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip));
178172074Srwatson	n->ti_src = inp->inp_laddr;
179172074Srwatson	n->ti_dst = inp->inp_faddr;
180157376Srwatson	n->ti_sport = inp->inp_lport;
181172074Srwatson	n->ti_dport = inp->inp_fport;
182172074Srwatson	n->ti_seq = 0;
183183550Szec	n->ti_ack = 0;
184108265Shsu	n->ti_x2 = 0;
185108265Shsu	n->ti_off = 5;
186178285Srwatson	n->ti_flags = 0;
187189848Srwatson	n->ti_win = 0;
188172309Ssilby	n->ti_sum = 0;
189178285Srwatson	n->ti_urp = 0;
190183550Szec	return (n);
19150673Sjlemon}
19250673Sjlemon
193172309Ssilby/*
1941541Srgrimes * Send a single message to the TCP at address specified by
19550673Sjlemon * the given TCP/IP header.  If m == 0, then we make a copy
196190948Srwatson * of the tcpiphdr at ti and send directly to the addressed host.
19750673Sjlemon * This is used to force keep alive messages out using the TCP
198178285Srwatson * template for a connection tp->t_template.  If flags are given
199183550Szec * then we send a message back to the TCP which originated the
20050673Sjlemon * segment ti, and discard the mbuf containing it and any other
20150673Sjlemon * attached mbufs.
202172074Srwatson *
203172074Srwatson * In any case the ack and sequence number of the transmitted
20450673Sjlemon * segment are as specified by the parameters.
205172074Srwatson *
206172074Srwatson * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
207183550Szec */
20850673Sjlemonvoid
20950673Sjlemontcp_respond(tp, ti, m, ack, seq, flags)
21050673Sjlemon	struct tcpcb *tp;
21150673Sjlemon	register struct tcpiphdr *ti;
21250673Sjlemon	register struct mbuf *m;
213157376Srwatson	tcp_seq ack, seq;
214172074Srwatson	int flags;
215172074Srwatson{
216181803Sbz	register int tlen;
217172074Srwatson	int win = 0;
218172074Srwatson	struct route *ro = 0;
219172074Srwatson	struct route sro;
220172074Srwatson
221172074Srwatson	if (tp) {
222172074Srwatson		if (!(flags & TH_RST))
223172074Srwatson			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
224172074Srwatson		ro = &tp->t_inpcb->inp_route;
225172074Srwatson	} else {
226172074Srwatson		ro = &sro;
227181803Sbz		bzero(ro, sizeof *ro);
228183550Szec	}
229172074Srwatson	if (m == 0) {
230172074Srwatson		m = m_gethdr(M_DONTWAIT, MT_HEADER);
231178285Srwatson		if (m == NULL)
232172074Srwatson			return;
233189848Srwatson#ifdef TCP_COMPAT_42
234172309Ssilby		tlen = 1;
235178285Srwatson#else
236181803Sbz		tlen = 0;
237183550Szec#endif
238172074Srwatson		m->m_data += max_linkhdr;
239172074Srwatson		*mtod(m, struct tcpiphdr *) = *ti;
240172309Ssilby		ti = mtod(m, struct tcpiphdr *);
241172074Srwatson		flags = TH_ACK;
2421541Srgrimes	} else {
2431541Srgrimes		m_freem(m->m_next);
2441541Srgrimes		m->m_next = 0;
2451541Srgrimes		m->m_data = (caddr_t)ti;
246167036Smohans		m->m_len = sizeof (struct tcpiphdr);
247167036Smohans		tlen = 0;
248167036Smohans#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
249167036Smohans		xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, n_long);
2501541Srgrimes		xchg(ti->ti_dport, ti->ti_sport, n_short);
251167036Smohans#undef xchg
252172074Srwatson	}
253167036Smohans	ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen));
254190948Srwatson	tlen += sizeof (struct tcpiphdr);
255172074Srwatson	m->m_len = tlen;
256167036Smohans	m->m_pkthdr.len = tlen;
257167036Smohans	m->m_pkthdr.rcvif = (struct ifnet *) 0;
258194305Sjhb	bzero(ti->ti_x1, sizeof(ti->ti_x1));
259205391Skmacy	ti->ti_seq = htonl(seq);
260205391Skmacy	ti->ti_ack = htonl(ack);
261172074Srwatson	ti->ti_x2 = 0;
262172074Srwatson	ti->ti_off = sizeof (struct tcphdr) >> 2;
263172074Srwatson	ti->ti_flags = flags;
2641541Srgrimes	if (tp)
26550673Sjlemon		ti->ti_win = htons((u_short) (win >> tp->rcv_scale));
266172312Skib	else
26797658Stanimura		ti->ti_win = htons((u_short)win);
26897658Stanimura	ti->ti_urp = 0;
26950673Sjlemon	ti->ti_sum = 0;
270172074Srwatson	ti->ti_sum = in_cksum(m, tlen);
271178285Srwatson	((struct ip *)ti)->ip_len = tlen;
272181803Sbz	((struct ip *)ti)->ip_ttl = ip_defttl;
273183550Szec#ifdef TCPDEBUG
27450673Sjlemon	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
27550673Sjlemon		tcp_trace(TA_OUTPUT, 0, tp, ti, 0);
276172074Srwatson#endif
277172074Srwatson	(void) ip_output(m, NULL, ro, 0, NULL);
27850673Sjlemon	if (ro == &sro && ro->ro_rt) {
279172074Srwatson		RTFREE(ro->ro_rt);
28078642Ssilby	}
281172074Srwatson}
282183550Szec
28350673Sjlemon/*
28450673Sjlemon * Create a new TCP control block, making an
28550673Sjlemon * empty reassembly queue and hooking it to the argument
28650673Sjlemon * protocol control block.  The `inp' parameter must have
28750673Sjlemon * come from the zone allocator set up in tcp_init().
288181803Sbz */
289172074Srwatsonstruct tcpcb *
290157376Srwatsontcp_newtcpcb(inp)
291172074Srwatson	struct inpcb *inp;
292172074Srwatson{
293172074Srwatson	struct inp_tp *it;
294172074Srwatson	register struct tcpcb *tp;
295172074Srwatson
296172074Srwatson	it = (struct inp_tp *)inp;
297172074Srwatson	tp = &it->tcb;
298172074Srwatson	bzero((char *) tp, sizeof(struct tcpcb));
299181803Sbz	tp->t_segq = NULL;
300183550Szec	tp->t_maxseg = tp->t_maxopd = tcp_mssdflt;
301172074Srwatson
302172074Srwatson	if (tcp_do_rfc1323)
303178285Srwatson		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
304189848Srwatson	if (tcp_do_rfc1644)
305172309Ssilby		tp->t_flags |= TF_REQ_CC;
306178285Srwatson	tp->t_inpcb = inp;	/* XXX */
307181803Sbz	/*
308183550Szec	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
309172074Srwatson	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
310172074Srwatson	 * reasonable initial retransmit time.
311172309Ssilby	 */
312172074Srwatson	tp->t_srtt = TCPTV_SRTTBASE;
31350673Sjlemon	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
31450673Sjlemon	tp->t_rttmin = TCPTV_MIN;
3151541Srgrimes	tp->t_rxtcur = TCPTV_RTOBASE;
316190948Srwatson	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
31750673Sjlemon	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
31850673Sjlemon	inp->inp_ip_ttl = ip_defttl;
319122326Ssam	inp->inp_ppcb = (caddr_t)tp;
32050673Sjlemon	return (tp);		/* XXX */
321194305Sjhb}
32250673Sjlemon
3231541Srgrimes/*
32450673Sjlemon * Drop a TCP connection, reporting
32550673Sjlemon * the specified error.  If connection is synchronized,
32650673Sjlemon * then send a RST to peer.
32750673Sjlemon */
32850673Sjlemonstruct tcpcb *
32950673Sjlemontcp_drop(tp, errno)
33050673Sjlemon	register struct tcpcb *tp;
33150673Sjlemon	int errno;
33250673Sjlemon{
33350673Sjlemon	struct socket *so = tp->t_inpcb->inp_socket;
3341541Srgrimes
335190948Srwatson	if (TCPS_HAVERCVDSYN(tp->t_state)) {
336111144Sjlemon		tp->t_state = TCPS_CLOSED;
33778642Ssilby		(void) tcp_output(tp);
33878642Ssilby		tcpstat.tcps_drops++;
33978642Ssilby	} else
34078642Ssilby		tcpstat.tcps_conndrops++;
341179487Srwatson	if (errno == ETIMEDOUT && tp->t_softerror)
34278642Ssilby		errno = tp->t_softerror;
343205391Skmacy	so->so_error = errno;
34497658Stanimura	return (tcp_close(tp));
345205391Skmacy}
34650673Sjlemon
34750673Sjlemon/*
348122326Ssam * Close a TCP control block:
34955679Sshin *	discard all space held by the tcp
35050673Sjlemon *	discard internet protocol block
35150673Sjlemon *	wake up any sleepers
352178285Srwatson */
353181803Sbzstruct tcpcb *
354183550Szectcp_close(tp)
355172074Srwatson	register struct tcpcb *tp;
35650673Sjlemon{
35750673Sjlemon	register struct mbuf *q;
358190948Srwatson	register struct mbuf *nq;
359172074Srwatson	struct inpcb *inp = tp->t_inpcb;
360172074Srwatson	struct socket *so = inp->inp_socket;
361172074Srwatson	register struct rtentry *rt;
362172074Srwatson	int dosavessthresh;
363172074Srwatson
364172074Srwatson	/*
365172074Srwatson	 * If we got enough samples through the srtt filter,
366172074Srwatson	 * save the rtt and rttvar in the routing entry.
367178285Srwatson	 * 'Enough' is arbitrarily defined as the 16 samples.
368181803Sbz	 * 16 samples is enough for the srtt filter to converge
369183550Szec	 * to within 5% of the correct value; fewer samples and
37050673Sjlemon	 * we could save a very bogus rtt.
37150673Sjlemon	 *
372172074Srwatson	 * Don't update the default route's characteristics and don't
373172074Srwatson	 * update anything that the user "locked".
37450673Sjlemon	 */
375172074Srwatson	if (tp->t_rttupdated >= 16 &&
376172074Srwatson	    (rt = inp->inp_route.ro_rt) &&
377183550Szec	    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) {
37850673Sjlemon		register u_long i = 0;
37950673Sjlemon
38050673Sjlemon		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
38150673Sjlemon			i = tp->t_srtt *
38250673Sjlemon			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
383181803Sbz			if (rt->rt_rmx.rmx_rtt && i)
384172074Srwatson				/*
385157376Srwatson				 * filter this update to half the old & half
386172074Srwatson				 * the new values, converting scale.
387172074Srwatson				 * See route.h and tcp_var.h for a
388172074Srwatson				 * description of the scaling constants.
389172074Srwatson				 */
390172074Srwatson				rt->rt_rmx.rmx_rtt =
391172074Srwatson				    (rt->rt_rmx.rmx_rtt + i) / 2;
392172074Srwatson			else
393172074Srwatson				rt->rt_rmx.rmx_rtt = i;
394181803Sbz			tcpstat.tcps_cachedrtt++;
395183550Szec		}
396172074Srwatson		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
397172074Srwatson			i = tp->t_rttvar *
398178285Srwatson			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
399189848Srwatson			if (rt->rt_rmx.rmx_rttvar && i)
400172309Ssilby				rt->rt_rmx.rmx_rttvar =
401178285Srwatson				    (rt->rt_rmx.rmx_rttvar + i) / 2;
402181803Sbz			else
403183550Szec				rt->rt_rmx.rmx_rttvar = i;
404172074Srwatson			tcpstat.tcps_cachedrttvar++;
405172074Srwatson		}
406172309Ssilby		/*
407172074Srwatson		 * The old comment here said:
40850673Sjlemon		 * update the pipelimit (ssthresh) if it has been updated
40950673Sjlemon		 * already or if a pipesize was specified & the threshhold
41050673Sjlemon		 * got below half the pipesize.  I.e., wait for bad news
411190948Srwatson		 * before we start updating, then update on both good
41250673Sjlemon		 * and bad news.
41350673Sjlemon		 *
41450673Sjlemon		 * But we want to save the ssthresh even if no pipesize is
41550673Sjlemon		 * specified explicitly in the route, because such
41650673Sjlemon		 * connections still have an implicit pipesize specified
41750673Sjlemon		 * by the global tcp_sendspace.  In the absence of a reliable
41850673Sjlemon		 * way to calculate the pipesize, it will have to do.
41950673Sjlemon		 */
420194305Sjhb		i = tp->snd_ssthresh;
421194305Sjhb		if (rt->rt_rmx.rmx_sendpipe != 0)
422190948Srwatson			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
423172074Srwatson		else
424172074Srwatson			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
42550673Sjlemon		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
42650673Sjlemon		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
427146463Sps		    || dosavessthresh) {
42850673Sjlemon			/*
429146463Sps			 * convert the limit from user data bytes to
43050673Sjlemon			 * packets then to packet data bytes.
431172074Srwatson			 */
43250673Sjlemon			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
433158644Sglebius			if (i < 2)
434158644Sglebius				i = 2;
43550673Sjlemon			i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr));
436172074Srwatson			if (rt->rt_rmx.rmx_ssthresh)
437178285Srwatson				rt->rt_rmx.rmx_ssthresh =
438181803Sbz				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
439183550Szec			else
44050673Sjlemon				rt->rt_rmx.rmx_ssthresh = i;
44150673Sjlemon			tcpstat.tcps_cachedssthresh++;
442172074Srwatson		}
443172074Srwatson	}
44450673Sjlemon	/* free the reassembly queue, if any */
445172074Srwatson	for (q = tp->t_segq; q; q = nq) {
446183550Szec		nq = q->m_nextpkt;
44750673Sjlemon		tp->t_segq = nq;
448172074Srwatson		m_freem(q);
449172074Srwatson	}
45050673Sjlemon	if (tp->t_template)
45150673Sjlemon		(void) m_free(dtom(tp->t_template));
45250673Sjlemon	inp->inp_ppcb = NULL;
45350673Sjlemon	soisdisconnected(so);
45450673Sjlemon	in_pcbdetach(inp);
455205391Skmacy	tcpstat.tcps_closed++;
456172074Srwatson	return ((struct tcpcb *)0);
457172074Srwatson}
458172074Srwatson
/*
 * Drain hook for TCP.  Intentionally does nothing.
 */
void
tcp_drain()
{
	/* nothing to release */
}
464172074Srwatson
465172074Srwatson/*
466205391Skmacy * Notify a tcp user of an asynchronous error;
467183550Szec * store error as soft error, but wake up user
468172074Srwatson * (for now, won't do anything until can select for soft error).
469172074Srwatson */
470178285Srwatsonstatic void
471189848Srwatsontcp_notify(inp, error)
472172309Ssilby	struct inpcb *inp;
473178285Srwatson	int error;
474205391Skmacy{
475183550Szec	register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
476172074Srwatson	register struct socket *so = inp->inp_socket;
477172074Srwatson
478172309Ssilby	/*
479130989Sps	 * Ignore some errors if we are hooked up.
48050673Sjlemon	 * If connection hasn't completed, has retransmitted several times,
48150673Sjlemon	 * and receives a second error, give up now.  This is better
48250673Sjlemon	 * than waiting a long time to establish a connection that
48350673Sjlemon	 * can never complete.
48450673Sjlemon	 */
48550673Sjlemon	if (tp->t_state == TCPS_ESTABLISHED &&
48650673Sjlemon	     (error == EHOSTUNREACH || error == ENETUNREACH ||
487190948Srwatson	      error == EHOSTDOWN)) {
488205391Skmacy		return;
489217126Sjhb	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
490217126Sjhb	    tp->t_softerror)
491217126Sjhb		so->so_error = error;
492217126Sjhb	else
493217126Sjhb		tp->t_softerror = error;
494217126Sjhb	wakeup((caddr_t) &so->so_timeo);
495217126Sjhb	sorwakeup(so);
496217126Sjhb	sowwakeup(so);
497217126Sjhb}
498172074Srwatson
499172074Srwatsonstatic int
500205391Skmacytcp_pcblist SYSCTL_HANDLER_ARGS
501172074Srwatson{
50250673Sjlemon	int error, i, n, s;
503205391Skmacy	struct inpcb *inp, **inp_list;
504172074Srwatson	inp_gen_t gencnt;
50550673Sjlemon	struct xinpgen xig;
50613229Solah
50750673Sjlemon	/*
508133874Srwatson	 * The process of preparing the TCB list is too time-consuming and
509133874Srwatson	 * resource-intensive to repeat twice on every request.
51050673Sjlemon	 */
511133874Srwatson	if (req->oldptr == 0) {
51250673Sjlemon		n = tcbinfo.ipi_count;
51350673Sjlemon		req->oldidx = 2 * (sizeof xig)
5141541Srgrimes			+ (n + n/8) * sizeof(struct xtcpcb);
51550673Sjlemon		return 0;
51650673Sjlemon	}
517117650Shsu
518215166Slstewart	if (req->newptr != 0)
519215166Slstewart		return EPERM;
520117650Shsu
521215166Slstewart	/*
522215166Slstewart	 * OK, now we're committed to doing something.
523215166Slstewart	 */
524215166Slstewart	s = splnet();
525215166Slstewart	gencnt = tcbinfo.ipi_gencnt;
52650673Sjlemon	n = tcbinfo.ipi_count;
52750673Sjlemon	splx(s);
528190948Srwatson
52973110Sjlemon	xig.xig_len = sizeof xig;
53073110Sjlemon	xig.xig_count = n;
53173110Sjlemon	xig.xig_gen = gencnt;
53273110Sjlemon	xig.xig_sogen = so_gencnt;
53350673Sjlemon	error = SYSCTL_OUT(req, &xig, sizeof xig);
53450673Sjlemon	if (error)
53550673Sjlemon		return error;
536216621Sjhb
537133874Srwatson	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
538133874Srwatson	if (inp_list == 0)
539133874Srwatson		return ENOMEM;
54077539Sjesper
54177539Sjesper	s = splnet();
54277539Sjesper	for (inp = tcbinfo.listhead->lh_first, i = 0; inp && i < n;
543137139Sandre	     inp = inp->inp_list.le_next) {
54477539Sjesper		if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
545122922Sandre			inp_list[i++] = inp;
546122922Sandre	}
54750673Sjlemon	splx(s);
54850673Sjlemon	n = i;
54950673Sjlemon
55050673Sjlemon	error = 0;
55155679Sshin	for (i = 0; i < n; i++) {
55255679Sshin		inp = inp_list[i];
55355679Sshin		if (inp->inp_gencnt <= gencnt) {
55455679Sshin			struct xtcpcb xt;
55555679Sshin			caddr_t inp_ppcb;
55650673Sjlemon			xt.xt_len = sizeof xt;
55750673Sjlemon			/* XXX should avoid extra copy */
55850673Sjlemon			bcopy(inp, &xt.xt_inp, sizeof *inp);
55950673Sjlemon			inp_ppcb = inp->inp_ppcb;
560117650Shsu			if (inp_ppcb != NULL)
56150673Sjlemon				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
56250673Sjlemon			else
56350673Sjlemon				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
56450673Sjlemon			if (inp->inp_socket)
56550673Sjlemon				sotoxsocket(inp->inp_socket, &xt.xt_socket);
56650673Sjlemon			error = SYSCTL_OUT(req, &xt, sizeof xt);
56750673Sjlemon		}
56850673Sjlemon	}
569215166Slstewart	if (!error) {
570216101Slstewart		/*
571215166Slstewart		 * Give the user an updated idea of our state.
57250673Sjlemon		 * If the generation differs from what we told
5731541Srgrimes		 * her before, she knows that something happened
574172074Srwatson		 * while we were processing this request, and it
57550673Sjlemon		 * might be necessary to retry.
576157136Srwatson		 */
57797658Stanimura		s = splnet();
57897658Stanimura		xig.xig_gen = tcbinfo.ipi_gencnt;
5791541Srgrimes		xig.xig_sogen = so_gencnt;
580172074Srwatson		xig.xig_count = tcbinfo.ipi_count;
581178285Srwatson		splx(s);
582172074Srwatson		error = SYSCTL_OUT(req, &xig, sizeof xig);
583181803Sbz	}
584183550Szec	free(inp_list, M_TEMP);
5851541Srgrimes	return error;
586172074Srwatson}
587172074Srwatson
588172074SrwatsonSYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
589172074Srwatson	    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
590172074Srwatson
591172074Srwatsonvoid
592205391Skmacytcp_ctlinput(cmd, sa, vip)
593205391Skmacy	int cmd;
594172074Srwatson	struct sockaddr *sa;
595172074Srwatson	void *vip;
596172074Srwatson{
597172309Ssilby	register struct ip *ip = vip;
598172074Srwatson	register struct tcphdr *th;
599172074Srwatson	void (*notify) __P((struct inpcb *, int)) = tcp_notify;
600172074Srwatson
601172309Ssilby	if (cmd == PRC_QUENCH)
602172074Srwatson		notify = tcp_quench;
603172074Srwatson	else if (cmd == PRC_MSGSIZE)
604172074Srwatson		notify = tcp_mtudisc;
605172309Ssilby	else if (!PRC_IS_REDIRECT(cmd) &&
606172074Srwatson		 ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0))
607172074Srwatson		return;
608172074Srwatson	if (ip) {
609172309Ssilby		th = (struct tcphdr *)((caddr_t)ip
610172074Srwatson				       + (IP_VHL_HL(ip->ip_vhl) << 2));
611172074Srwatson		in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport,
612172074Srwatson			cmd, notify);
613172309Ssilby	} else
614172074Srwatson		in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify);
615172074Srwatson}
616172074Srwatson
617172074Srwatson/*
618172074Srwatson * When a source quench is received, close congestion window
619172074Srwatson * to one segment.  We will gradually open it again as we proceed.
620172074Srwatson */
621172074Srwatsonvoid
622205391Skmacytcp_quench(inp, errno)
623172074Srwatson	struct inpcb *inp;
624172074Srwatson	int errno;
625172074Srwatson{
626172074Srwatson	struct tcpcb *tp = intotcpcb(inp);
627172074Srwatson
628172074Srwatson	if (tp)
629172074Srwatson		tp->snd_cwnd = tp->t_maxseg;
630172074Srwatson}
631172074Srwatson
632172074Srwatson/*
633172309Ssilby * When `need fragmentation' ICMP is received, update our idea of the MSS
634172074Srwatson * based on the new value in the route.  Also nudge TCP to send something,
635172074Srwatson * since we know the packet we just sent was dropped.
636172309Ssilby * This duplicates some code in the tcp_mss() function in tcp_input.c.
637172074Srwatson */
638172074Srwatsonvoid
639172309Ssilbytcp_mtudisc(inp, errno)
640172074Srwatson	struct inpcb *inp;
641172074Srwatson	int errno;
642172309Ssilby{
643172074Srwatson	struct tcpcb *tp = intotcpcb(inp);
644172074Srwatson	struct rtentry *rt;
645172309Ssilby	struct rmxp_tao *taop;
646172074Srwatson	struct socket *so = inp->inp_socket;
647172074Srwatson	int offered;
648172074Srwatson	int mss;
649172074Srwatson
650172074Srwatson	if (tp) {
651172074Srwatson		rt = tcp_rtlookup(inp);
652197244Ssilby		if (!rt || !rt->rt_rmx.rmx_mtu) {
653197244Ssilby			tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
654197244Ssilby			return;
655197244Ssilby		}
656197244Ssilby		taop = rmx_taop(rt->rt_rmx);
657197244Ssilby		offered = taop->tao_mssopt;
658197244Ssilby		mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr);
659197244Ssilby		if (offered)
660197244Ssilby			mss = min(mss, offered);
661197244Ssilby		/*
662197244Ssilby		 * XXX - The above conditional probably violates the TCP
663197244Ssilby		 * spec.  The problem is that, since we don't know the
664197244Ssilby		 * other end's MSS, we are supposed to use a conservative
665197244Ssilby		 * default.  But, if we do that, then MTU discovery will
666197244Ssilby		 * never actually take place, because the conservative
667197244Ssilby		 * default is much less than the MTUs typically seen
668197244Ssilby		 * on the Internet today.  For the moment, we'll sweep
669197244Ssilby		 * this under the carpet.
670197244Ssilby		 *
671197244Ssilby		 * The conservative default might not actually be a problem
672197244Ssilby		 * if the only case this occurs is when sending an initial
673		 * SYN with options and data to a host we've never talked
674		 * to before.  Then, they will reply with an MSS value which
675		 * will get recorded and the new parameters should get
676		 * recomputed.  For Further Study.
677		 */
678		if (tp->t_maxopd <= mss)
679			return;
680		tp->t_maxopd = mss;
681
682		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
683		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
684			mss -= TCPOLEN_TSTAMP_APPA;
685		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
686		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
687			mss -= TCPOLEN_CC_APPA;
688#if	(MCLBYTES & (MCLBYTES - 1)) == 0
689		if (mss > MCLBYTES)
690			mss &= ~(MCLBYTES-1);
691#else
692		if (mss > MCLBYTES)
693			mss = mss / MCLBYTES * MCLBYTES;
694#endif
695		if (so->so_snd.sb_hiwat < mss)
696			mss = so->so_snd.sb_hiwat;
697
698		tp->t_maxseg = mss;
699
700		tcpstat.tcps_mturesent++;
701		tp->t_rtt = 0;
702		tp->snd_nxt = tp->snd_una;
703		tcp_output(tp);
704	}
705}
706
707/*
708 * Look-up the routing entry to the peer of this inpcb.  If no route
709 * is found and it cannot be allocated the return NULL.  This routine
710 * is called by TCP routines that access the rmx structure and by tcp_mss
711 * to get the interface MTU.
712 */
713struct rtentry *
714tcp_rtlookup(inp)
715	struct inpcb *inp;
716{
717	struct route *ro;
718	struct rtentry *rt;
719
720	ro = &inp->inp_route;
721	rt = ro->ro_rt;
722	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
723		/* No route yet, so try to acquire one */
724		if (inp->inp_faddr.s_addr != INADDR_ANY) {
725			ro->ro_dst.sa_family = AF_INET;
726			ro->ro_dst.sa_len = sizeof(ro->ro_dst);
727			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
728				inp->inp_faddr;
729			rtalloc(ro);
730			rt = ro->ro_rt;
731		}
732	}
733	return rt;
734}
735
736/*
737 * Return a pointer to the cached information about the remote host.
738 * The cached information is stored in the protocol specific part of
739 * the route metrics.
740 */
741struct rmxp_tao *
742tcp_gettaocache(inp)
743	struct inpcb *inp;
744{
745	struct rtentry *rt = tcp_rtlookup(inp);
746
747	/* Make sure this is a host route and is up. */
748	if (rt == NULL ||
749	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
750		return NULL;
751
752	return rmx_taop(rt->rt_rmx);
753}
754
/*
 * Clear all the TAO cache entries; called from tcp_init().
 *
 * XXX
 * Deliberately empty: the routing tables are assumed to be initialized
 * at the same time as TCP, so the cache cannot hold anything left over.
 */
static void
tcp_cleartaocache()
{
	/* nothing to clear */
}
767