tcp_timewait.c revision 78671
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
34 * $FreeBSD: head/sys/netinet/tcp_timewait.c 78671 2001-06-23 17:44:27Z jlemon $
35 */
36
37#include "opt_compat.h"
38#include "opt_inet6.h"
39#include "opt_ipsec.h"
40#include "opt_tcpdebug.h"
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/callout.h>
45#include <sys/kernel.h>
46#include <sys/sysctl.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#ifdef INET6
50#include <sys/domain.h>
51#endif
52#include <sys/proc.h>
53#include <sys/socket.h>
54#include <sys/socketvar.h>
55#include <sys/protosw.h>
56#include <sys/random.h>
57
58#include <vm/vm_zone.h>
59
60#include <net/route.h>
61#include <net/if.h>
62
63#define _IP_VHL
64#include <netinet/in.h>
65#include <netinet/in_systm.h>
66#include <netinet/ip.h>
67#ifdef INET6
68#include <netinet/ip6.h>
69#endif
70#include <netinet/in_pcb.h>
71#ifdef INET6
72#include <netinet6/in6_pcb.h>
73#endif
74#include <netinet/in_var.h>
75#include <netinet/ip_var.h>
76#ifdef INET6
77#include <netinet6/ip6_var.h>
78#endif
79#include <netinet/tcp.h>
80#include <netinet/tcp_fsm.h>
81#include <netinet/tcp_seq.h>
82#include <netinet/tcp_timer.h>
83#include <netinet/tcp_var.h>
84#ifdef INET6
85#include <netinet6/tcp6_var.h>
86#endif
87#include <netinet/tcpip.h>
88#ifdef TCPDEBUG
89#include <netinet/tcp_debug.h>
90#endif
91#include <netinet6/ip6protosw.h>
92
93#ifdef IPSEC
94#include <netinet6/ipsec.h>
95#ifdef INET6
96#include <netinet6/ipsec6.h>
97#endif
98#endif /*IPSEC*/
99
100#include <machine/in_cksum.h>
101
/*
 * Default maximum segment size for IPv4 connections.  tcp_newtcpcb()
 * seeds t_maxseg/t_maxopd of every new non-IPv6 tcpcb from this value.
 */
int 	tcp_mssdflt = TCP_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");

#ifdef INET6
/* IPv6 counterpart of tcp_mssdflt; used by tcp_newtcpcb() for INP_IPV6 pcbs. */
int	tcp_v6mssdflt = TCP6_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
	CTLFLAG_RW, &tcp_v6mssdflt , 0,
	"Default TCP Maximum Segment Size for IPv6");
#endif

/* Compiled out: the default-RTT knob is currently disabled. */
#if 0
static int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
    &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
#endif

/* When nonzero, tcp_newtcpcb() sets TF_REQ_SCALE|TF_REQ_TSTMP on new tcpcbs. */
static int	tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");

/* When nonzero, tcp_newtcpcb() additionally sets TF_REQ_CC on new tcpcbs. */
static int	tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");

/* Hash size actually chosen by tcp_init(); exported read-only. */
static int	tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

/* Gate for tcp_drain(): when zero, tcp_drain() does nothing. */
static int	do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");

/* Read-only export of tcbinfo.ipi_count. */
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
    &tcbinfo.ipi_count, 0, "Number of active PCBs");

/*
 * When nonzero, tcp_ctlinput() dispatches PRC_UNREACH_ADMIN_PROHIB and
 * PRC_UNREACH_PORT notifications to tcp_drop_syn_sent.
 */
static int	icmp_may_rst = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");

/* Forward declarations of file-local helpers. */
static void	tcp_cleartaocache __P((void));
static void	tcp_notify __P((struct inpcb *, int));
144
/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize; tcp_init() fetches that tunable
 * and falls back to 512 (with a warning) when the resulting value is
 * not a power of two.  The #ifndef lets a build-time definition of
 * TCBHASHSIZE take precedence over the default below.
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE	512
#endif
154
/*
 * This is the actual shape of what we allocate using the zone
 * allocator.  Doing it this way allows us to protect both structures
 * using the same generation count, and also eliminates the overhead
 * of allocating tcpcbs separately.  By hiding the structure here,
 * we avoid changing most of the rest of the code (although it needs
 * to be changed, eventually, for greater efficiency).
 *
 * tcp_init() sizes the "tcpcb" zone with sizeof(struct inp_tp), and
 * tcp_newtcpcb() casts the inpcb pointer it is handed back to a
 * struct inp_tp to reach the embedded tcpcb and timer callouts.
 */
#define	ALIGNMENT	32
#define	ALIGNM1		(ALIGNMENT - 1)
struct	inp_tp {
	/*
	 * The inpcb comes first so that a pointer to the inp_tp is also a
	 * pointer to the inpcb.  The align[] member rounds the union's size
	 * up to a multiple of ALIGNMENT bytes.
	 */
	union {
		struct	inpcb inp;
		char	align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
	} inp_tp_u;
	/* The TCP control block associated with the inpcb above. */
	struct	tcpcb tcb;
	/*
	 * Backing storage for the per-connection timers; tcp_newtcpcb()
	 * points tt_rexmt, tt_persist, tt_keep, tt_2msl and tt_delack at
	 * these.
	 */
	struct	callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
	struct	callout inp_tp_delack;
};
/* The alignment helpers are only needed for the declaration above. */
#undef ALIGNMENT
#undef ALIGNM1
176
177/*
178 * Tcp initialization
179 */
180void
181tcp_init()
182{
183	int hashsize = TCBHASHSIZE;
184
185	tcp_ccgen = 1;
186	tcp_cleartaocache();
187
188	tcp_delacktime = TCPTV_DELACK;
189	tcp_keepinit = TCPTV_KEEP_INIT;
190	tcp_keepidle = TCPTV_KEEP_IDLE;
191	tcp_keepintvl = TCPTV_KEEPINTVL;
192	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
193	tcp_msl = TCPTV_MSL;
194
195	LIST_INIT(&tcb);
196	tcbinfo.listhead = &tcb;
197	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
198	if (!powerof2(hashsize)) {
199		printf("WARNING: TCB hash size not a power of 2\n");
200		hashsize = 512; /* safe default */
201	}
202	tcp_tcbhashsize = hashsize;
203	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
204	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
205					&tcbinfo.porthashmask);
206	tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets,
207				 ZONE_INTERRUPT, 0);
208#ifdef INET6
209#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
210#else /* INET6 */
211#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
212#endif /* INET6 */
213	if (max_protohdr < TCP_MINPROTOHDR)
214		max_protohdr = TCP_MINPROTOHDR;
215	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
216		panic("tcp_init");
217#undef TCP_MINPROTOHDR
218}
219
220/*
221 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
222 * tcp_template used to store this data in mbufs, but we now recopy it out
223 * of the tcpcb each time to conserve mbufs.
224 */
225void
226tcp_fillheaders(tp, ip_ptr, tcp_ptr)
227	struct tcpcb *tp;
228	void *ip_ptr;
229	void *tcp_ptr;
230{
231	struct inpcb *inp = tp->t_inpcb;
232	struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
233
234#ifdef INET6
235	if ((inp->inp_vflag & INP_IPV6) != 0) {
236		struct ip6_hdr *ip6;
237
238		ip6 = (struct ip6_hdr *)ip_ptr;
239		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
240			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
241		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
242			(IPV6_VERSION & IPV6_VERSION_MASK);
243		ip6->ip6_nxt = IPPROTO_TCP;
244		ip6->ip6_plen = sizeof(struct tcphdr);
245		ip6->ip6_src = inp->in6p_laddr;
246		ip6->ip6_dst = inp->in6p_faddr;
247		tcp_hdr->th_sum = 0;
248	} else
249#endif
250	{
251	struct ip *ip = (struct ip *) ip_ptr;
252
253	ip->ip_vhl = IP_VHL_BORING;
254	ip->ip_tos = 0;
255	ip->ip_len = 0;
256	ip->ip_id = 0;
257	ip->ip_off = 0;
258	ip->ip_ttl = 0;
259	ip->ip_sum = 0;
260	ip->ip_p = IPPROTO_TCP;
261	ip->ip_src = inp->inp_laddr;
262	ip->ip_dst = inp->inp_faddr;
263	tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
264		htons(sizeof(struct tcphdr) + IPPROTO_TCP));
265	}
266
267	tcp_hdr->th_sport = inp->inp_lport;
268	tcp_hdr->th_dport = inp->inp_fport;
269	tcp_hdr->th_seq = 0;
270	tcp_hdr->th_ack = 0;
271	tcp_hdr->th_x2 = 0;
272	tcp_hdr->th_off = 5;
273	tcp_hdr->th_flags = 0;
274	tcp_hdr->th_win = 0;
275	tcp_hdr->th_urp = 0;
276}
277
278/*
279 * Create template to be used to send tcp packets on a connection.
280 * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
281 * use for this function is in keepalives, which use tcp_respond.
282 */
283struct tcptemp *
284tcp_maketemplate(tp)
285	struct tcpcb *tp;
286{
287	struct mbuf *m;
288	struct tcptemp *n;
289
290	m = m_get(M_DONTWAIT, MT_HEADER);
291	if (m == NULL)
292		return (0);
293	m->m_len = sizeof(struct tcptemp);
294	n = mtod(m, struct tcptemp *);
295
296	tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
297	return (n);
298}
299
/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the IP header at ipgen and the TCP header at th in a freshly
 * allocated mbuf and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection; in that case the flags argument is
 * overridden with TH_ACK.  If m is given then we send
 * a message back to the TCP which originated the segment,
 * reusing the first mbuf (with addresses and ports swapped)
 * and discarding any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * tp may be NULL; then the advertised window is zero and a temporary
 * route on the stack is used and released after output.
 *
 * NOTE: If m != NULL, then ipgen must point to *inside* the mbuf
 * (m_data is reset to it).
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
	struct tcpcb *tp;
	void *ipgen;
	register struct tcphdr *th;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	register int tlen;
	int win = 0;
	struct route *ro = 0;
	struct route sro;
	struct ip *ip;
	struct tcphdr *nth;
#ifdef INET6
	struct route_in6 *ro6 = 0;
	struct route_in6 sro6;
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;

#ifdef INET6
	/* The IP version nibble of the supplied header selects the path. */
	isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp) {
		/* A RST advertises a zero window; otherwise clamp to the scaled maximum. */
		if (!(flags & TH_RST)) {
			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
		/* Use the route cached in the connection's pcb. */
#ifdef INET6
		if (isipv6)
			ro6 = &tp->t_inpcb->in6p_route;
		else
#endif /* INET6 */
		ro = &tp->t_inpcb->inp_route;
	} else {
		/* No connection: use a zeroed scratch route, freed below. */
#ifdef INET6
		if (isipv6) {
			ro6 = &sro6;
			bzero(ro6, sizeof *ro6);
		} else
#endif /* INET6 */
	      {
		ro = &sro;
		bzero(ro, sizeof *ro);
	      }
	}
	if (m == 0) {
		/* Template case: build the segment in a new header mbuf. */
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
		tlen = 0;
		/* Leave room in front for the link-level header. */
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
	      {
		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
		ip = mtod(m, struct ip *);
		nth = (struct tcphdr *)(ip + 1);
	      }
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		/* Caller-supplied flags are ignored in this case. */
		flags = TH_ACK;
	} else {
		/* Reply case: recycle the first mbuf, drop the rest of the chain. */
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
	      {
		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
		nth = (struct tcphdr *)(ip + 1);
	      }
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
	/* tlen becomes the total length of IP header plus TCP header. */
#ifdef INET6
	if (isipv6) {
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
						tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
      {
	tlen += sizeof (struct tcpiphdr);
	ip->ip_len = tlen;
	ip->ip_ttl = ip_defttl;
      }
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *) 0;
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	/* Apply the connection's receive window scale when there is one. */
	if (tp)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#ifdef INET6
	if (isipv6) {
		/* IPv6: the full TCP checksum is computed here. */
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
					sizeof(struct ip6_hdr),
					tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
					       ro6 && ro6->ro_rt ?
					       ro6->ro_rt->rt_ifp :
					       NULL);
	} else
#endif /* INET6 */
      {
	/*
	 * IPv4: store only the pseudo-header sum and flag the packet with
	 * CSUM_TCP, recording the offset of th_sum in csum_data.
	 */
        nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
        m->m_pkthdr.csum_flags = CSUM_TCP;
        m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
      }
#ifdef TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef IPSEC
	/* On failure the segment is dropped silently. */
	if (ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
		m_freem(m);
		return;
	}
#endif
#ifdef INET6
	if (isipv6) {
		(void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL);
		/* Release the route only if it is our scratch one. */
		if (ro6 == &sro6 && ro6->ro_rt) {
			RTFREE(ro6->ro_rt);
			ro6->ro_rt = NULL;
		}
	} else
#endif /* INET6 */
      {
	(void) ip_output(m, NULL, ro, ipflags, NULL);
	/* Release the route only if it is our scratch one. */
	if (ro == &sro && ro->ro_rt) {
		RTFREE(ro->ro_rt);
		ro->ro_rt = NULL;
	}
      }
}
487
488/*
489 * Create a new TCP control block, making an
490 * empty reassembly queue and hooking it to the argument
491 * protocol control block.  The `inp' parameter must have
492 * come from the zone allocator set up in tcp_init().
493 */
494struct tcpcb *
495tcp_newtcpcb(inp)
496	struct inpcb *inp;
497{
498	struct inp_tp *it;
499	register struct tcpcb *tp;
500#ifdef INET6
501	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
502#endif /* INET6 */
503
504	it = (struct inp_tp *)inp;
505	tp = &it->tcb;
506	bzero((char *) tp, sizeof(struct tcpcb));
507	LIST_INIT(&tp->t_segq);
508	tp->t_maxseg = tp->t_maxopd =
509#ifdef INET6
510		isipv6 ? tcp_v6mssdflt :
511#endif /* INET6 */
512		tcp_mssdflt;
513
514	/* Set up our timeouts. */
515	callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0);
516	callout_init(tp->tt_persist = &it->inp_tp_persist, 0);
517	callout_init(tp->tt_keep = &it->inp_tp_keep, 0);
518	callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0);
519	callout_init(tp->tt_delack = &it->inp_tp_delack, 0);
520
521	if (tcp_do_rfc1323)
522		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
523	if (tcp_do_rfc1644)
524		tp->t_flags |= TF_REQ_CC;
525	tp->t_inpcb = inp;	/* XXX */
526	/*
527	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
528	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
529	 * reasonable initial retransmit time.
530	 */
531	tp->t_srtt = TCPTV_SRTTBASE;
532	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
533	tp->t_rttmin = TCPTV_MIN;
534	tp->t_rxtcur = TCPTV_RTOBASE;
535	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
536	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
537	tp->t_rcvtime = ticks;
538        /*
539	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
540	 * because the socket may be bound to an IPv6 wildcard address,
541	 * which may match an IPv4-mapped IPv6 address.
542	 */
543	inp->inp_ip_ttl = ip_defttl;
544	inp->inp_ppcb = (caddr_t)tp;
545	return (tp);		/* XXX */
546}
547
548/*
549 * Drop a TCP connection, reporting
550 * the specified error.  If connection is synchronized,
551 * then send a RST to peer.
552 */
553struct tcpcb *
554tcp_drop(tp, errno)
555	register struct tcpcb *tp;
556	int errno;
557{
558	struct socket *so = tp->t_inpcb->inp_socket;
559
560	if (TCPS_HAVERCVDSYN(tp->t_state)) {
561		tp->t_state = TCPS_CLOSED;
562		(void) tcp_output(tp);
563		tcpstat.tcps_drops++;
564	} else
565		tcpstat.tcps_conndrops++;
566	if (errno == ETIMEDOUT && tp->t_softerror)
567		errno = tp->t_softerror;
568	so->so_error = errno;
569	return (tcp_close(tp));
570}
571
/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 *
 * Before tearing down, the connection's RTT, RTT variance and slow-start
 * threshold may be folded into the cached route's metrics.
 * Always returns a null tcpcb pointer.
 */
struct tcpcb *
tcp_close(tp)
	register struct tcpcb *tp;
{
	register struct tseg_qent *q;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	register struct rtentry *rt;
	int dosavessthresh;

	/*
	 * Make sure that all of our timers are stopped before we
	 * delete the PCB.
	 */
	callout_stop(tp->tt_rexmt);
	callout_stop(tp->tt_persist);
	callout_stop(tp->tt_keep);
	callout_stop(tp->tt_2msl);
	callout_stop(tp->tt_delack);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as the 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		register u_long i = 0;
		/*
		 * Bail out to no_valid_rt when there is no cached route or
		 * its key is the unspecified/any address.  Note that the
		 * label sits after the RTF_DELCLONE handling below, so that
		 * step is skipped as well on these paths.
		 */
#ifdef INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			if ((rt = inp->in6p_route.ro_rt) == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		}
		else
#endif /* INET6 */
		if ((rt = inp->inp_route.ro_rt) == NULL ||
		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
		    == INADDR_ANY)
			goto no_valid_rt;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		/* Same averaging scheme for the RTT variance. */
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshhold
		 * got below half the pipesize.  I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace.  In the absence of a reliable
		 * way to calculate the pipesize, it will have to do.
		 */
		i = tp->snd_ssthresh;
		/* Compare against half the route's sendpipe, or half the socket's send high-water mark. */
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			/* Per-packet size is the MSS plus the IP and TCP header sizes. */
			i *= (u_long)(tp->t_maxseg +
#ifdef INET6
				      (isipv6 ? sizeof (struct ip6_hdr) +
					       sizeof (struct tcphdr) :
#endif
				       sizeof (struct tcpiphdr)
#ifdef INET6
				       )
#endif
				      );
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}
	/*
	 * NOTE(review): the route is re-read through inp_route here for
	 * both address families -- presumably inp_route and in6p_route
	 * share storage; confirm against in_pcb.h.
	 */
	rt = inp->inp_route.ro_rt;
	if (rt) {
		/*
		 * mark route for deletion if no information is
		 * cached.
		 */
		if ((tp->t_flags & TF_LQ_OVERFLOW) &&
		    ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0)){
			if (rt->rt_rmx.rmx_rtt == 0)
				rt->rt_flags |= RTF_DELCLONE;
		}
	}
    no_valid_rt:
	/* free the reassembly queue, if any */
	while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		FREE(q, M_TSEGQ);
	}
	/* Unhook the tcpcb from the inpcb before the pcb is detached. */
	inp->inp_ppcb = NULL;
	soisdisconnected(so);
#ifdef INET6
	if (INP_CHECK_SOCKAF(so, AF_INET6))
		in6_pcbdetach(inp);
	else
#endif /* INET6 */
	in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	return ((struct tcpcb *)0);
}
734
735void
736tcp_drain()
737{
738	if (do_tcpdrain)
739	{
740		struct inpcb *inpb;
741		struct tcpcb *tcpb;
742		struct tseg_qent *te;
743
744	/*
745	 * Walk the tcpbs, if existing, and flush the reassembly queue,
746	 * if there is one...
747	 * XXX: The "Net/3" implementation doesn't imply that the TCP
748	 *      reassembly queue should be flushed, but in a situation
749	 * 	where we're really low on mbufs, this is potentially
750	 *  	usefull.
751	 */
752		LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
753			if ((tcpb = intotcpcb(inpb))) {
754				while ((te = LIST_FIRST(&tcpb->t_segq))
755			            != NULL) {
756					LIST_REMOVE(te, tqe_q);
757					m_freem(te->tqe_m);
758					FREE(te, M_TSEGQ);
759				}
760			}
761		}
762	}
763}
764
765/*
766 * Notify a tcp user of an asynchronous error;
767 * store error as soft error, but wake up user
768 * (for now, won't do anything until can select for soft error).
769 *
770 * Do not wake up user since there currently is no mechanism for
771 * reporting soft errors (yet - a kqueue filter may be added).
772 */
773static void
774tcp_notify(inp, error)
775	struct inpcb *inp;
776	int error;
777{
778	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
779
780	/*
781	 * Ignore some errors if we are hooked up.
782	 * If connection hasn't completed, has retransmitted several times,
783	 * and receives a second error, give up now.  This is better
784	 * than waiting a long time to establish a connection that
785	 * can never complete.
786	 */
787	if (tp->t_state == TCPS_ESTABLISHED &&
788	     (error == EHOSTUNREACH || error == ENETUNREACH ||
789	      error == EHOSTDOWN)) {
790		return;
791	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
792	    tp->t_softerror)
793		tcp_drop(tp, error);
794	else
795		tp->t_softerror = error;
796#if 0
797	wakeup((caddr_t) &so->so_timeo);
798	sorwakeup(so);
799	sowwakeup(so);
800#endif
801}
802
803static int
804tcp_pcblist(SYSCTL_HANDLER_ARGS)
805{
806	int error, i, n, s;
807	struct inpcb *inp, **inp_list;
808	inp_gen_t gencnt;
809	struct xinpgen xig;
810
811	/*
812	 * The process of preparing the TCB list is too time-consuming and
813	 * resource-intensive to repeat twice on every request.
814	 */
815	if (req->oldptr == 0) {
816		n = tcbinfo.ipi_count;
817		req->oldidx = 2 * (sizeof xig)
818			+ (n + n/8) * sizeof(struct xtcpcb);
819		return 0;
820	}
821
822	if (req->newptr != 0)
823		return EPERM;
824
825	/*
826	 * OK, now we're committed to doing something.
827	 */
828	s = splnet();
829	gencnt = tcbinfo.ipi_gencnt;
830	n = tcbinfo.ipi_count;
831	splx(s);
832
833	xig.xig_len = sizeof xig;
834	xig.xig_count = n;
835	xig.xig_gen = gencnt;
836	xig.xig_sogen = so_gencnt;
837	error = SYSCTL_OUT(req, &xig, sizeof xig);
838	if (error)
839		return error;
840
841	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
842	if (inp_list == 0)
843		return ENOMEM;
844
845	s = splnet();
846	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
847	     inp = LIST_NEXT(inp, inp_list)) {
848		if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
849			inp_list[i++] = inp;
850	}
851	splx(s);
852	n = i;
853
854	error = 0;
855	for (i = 0; i < n; i++) {
856		inp = inp_list[i];
857		if (inp->inp_gencnt <= gencnt) {
858			struct xtcpcb xt;
859			caddr_t inp_ppcb;
860			xt.xt_len = sizeof xt;
861			/* XXX should avoid extra copy */
862			bcopy(inp, &xt.xt_inp, sizeof *inp);
863			inp_ppcb = inp->inp_ppcb;
864			if (inp_ppcb != NULL)
865				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
866			else
867				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
868			if (inp->inp_socket)
869				sotoxsocket(inp->inp_socket, &xt.xt_socket);
870			error = SYSCTL_OUT(req, &xt, sizeof xt);
871		}
872	}
873	if (!error) {
874		/*
875		 * Give the user an updated idea of our state.
876		 * If the generation differs from what we told
877		 * her before, she knows that something happened
878		 * while we were processing this request, and it
879		 * might be necessary to retry.
880		 */
881		s = splnet();
882		xig.xig_gen = tcbinfo.ipi_gencnt;
883		xig.xig_sogen = so_gencnt;
884		xig.xig_count = tcbinfo.ipi_count;
885		splx(s);
886		error = SYSCTL_OUT(req, &xig, sizeof xig);
887	}
888	free(inp_list, M_TEMP);
889	return error;
890}
891
892SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
893	    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
894
895static int
896tcp_getcred(SYSCTL_HANDLER_ARGS)
897{
898	struct xucred xuc;
899	struct sockaddr_in addrs[2];
900	struct inpcb *inp;
901	int error, s;
902
903	error = suser(req->p);
904	if (error)
905		return (error);
906	error = SYSCTL_IN(req, addrs, sizeof(addrs));
907	if (error)
908		return (error);
909	s = splnet();
910	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
911	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
912	if (inp == NULL || inp->inp_socket == NULL) {
913		error = ENOENT;
914		goto out;
915	}
916	bzero(&xuc, sizeof(xuc));
917	xuc.cr_uid = inp->inp_socket->so_cred->cr_uid;
918	xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups;
919	bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups,
920	    sizeof(xuc.cr_groups));
921	error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
922out:
923	splx(s);
924	return (error);
925}
926
927SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
928    0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
929
#ifdef INET6
/*
 * Sysctl handler returning the credentials of the socket that owns a
 * given TCP connection identified by IPv6 addresses.
 *
 * Requires superuser privilege.  The caller writes two sockaddr_in6
 * structures.  If addrs[0] is an IPv4-mapped address then addrs[1] must
 * be one as well (EINVAL otherwise) and the lookup is done in the IPv4
 * hash using the embedded IPv4 addresses; in all other cases the IPv6
 * lookup is used.  Returns ENOENT when no pcb with a socket is found.
 */
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct sockaddr_in6 addrs[2];
	struct xucred xuc;
	struct inpcb *inp;
	struct socket *so;
	int error, s, mapped = 0;

	if ((error = suser(req->p)) != 0)
		return (error);
	if ((error = SYSCTL_IN(req, addrs, sizeof(addrs))) != 0)
		return (error);

	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
		if (!IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
			return (EINVAL);
		mapped = 1;
	}

	s = splnet();
	if (mapped) {
		/* The IPv4 address occupies the last four bytes. */
		inp = in_pcblookup_hash(&tcbinfo,
			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
			addrs[1].sin6_port,
			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
			addrs[0].sin6_port,
			0, NULL);
	} else {
		inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
				 addrs[1].sin6_port,
				 &addrs[0].sin6_addr, addrs[0].sin6_port,
				 0, NULL);
	}
	if (inp == NULL || inp->inp_socket == NULL) {
		error = ENOENT;
	} else {
		so = inp->inp_socket;
		bzero(&xuc, sizeof(xuc));
		xuc.cr_uid = so->so_cred->cr_uid;
		xuc.cr_ngroups = so->so_cred->cr_ngroups;
		bcopy(so->so_cred->cr_groups, xuc.cr_groups,
		    sizeof(xuc.cr_groups));
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	}
	splx(s);
	return (error);
}

SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
	    0, 0,
	    tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
#endif
983
984
985void
986tcp_ctlinput(cmd, sa, vip)
987	int cmd;
988	struct sockaddr *sa;
989	void *vip;
990{
991	struct ip *ip = vip;
992	struct tcphdr *th;
993	struct in_addr faddr;
994	struct inpcb *inp;
995	struct tcpcb *tp;
996	void (*notify) __P((struct inpcb *, int)) = tcp_notify;
997	tcp_seq icmp_seq;
998	int s;
999
1000	faddr = ((struct sockaddr_in *)sa)->sin_addr;
1001	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1002		return;
1003
1004	if (cmd == PRC_QUENCH)
1005		notify = tcp_quench;
1006	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
1007		cmd == PRC_UNREACH_PORT) && ip)
1008		notify = tcp_drop_syn_sent;
1009	else if (cmd == PRC_MSGSIZE)
1010		notify = tcp_mtudisc;
1011	else if (PRC_IS_REDIRECT(cmd)) {
1012		ip = 0;
1013		notify = in_rtchange;
1014	} else if (cmd == PRC_HOSTDEAD)
1015		ip = 0;
1016	else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
1017		return;
1018	if (ip) {
1019		s = splnet();
1020		th = (struct tcphdr *)((caddr_t)ip
1021				       + (IP_VHL_HL(ip->ip_vhl) << 2));
1022		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
1023		    ip->ip_src, th->th_sport, 0, NULL);
1024		if (inp != NULL && inp->inp_socket != NULL) {
1025			icmp_seq = htonl(th->th_seq);
1026			tp = intotcpcb(inp);
1027			if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
1028			    SEQ_LT(icmp_seq, tp->snd_max))
1029				(*notify)(inp, inetctlerrmap[cmd]);
1030		}
1031		splx(s);
1032	} else
1033		in_pcbnotifyall(&tcb, faddr, inetctlerrmap[cmd], notify);
1034}
1035
#ifdef INET6
/*
 * Handle a control (ICMPv6-derived) notification for TCP over IPv6.
 *
 *	cmd	PRC_* command code
 *	sa	address the notification refers to; must be an AF_INET6
 *		sockaddr_in6 of the exact expected length
 *	d	if non-NULL, a struct ip6ctlparam describing the
 *		offending packet (mbuf, IPv6 header, offset of the
 *		transport header, source address)
 *
 * When the offending packet is available, the TCP ports are extracted
 * from it and only matching pcbs are notified; otherwise the
 * notification is delivered with wildcard ports and source.
 */
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	struct tcphdr th;
	void (*notify) __P((struct inpcb *, int)) = tcp_notify;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	struct ip6ctlparam *ip6cp = NULL;
	const struct sockaddr_in6 *sa6_src = NULL;
	int off;
	/* Only the two port fields of the TCP header are examined. */
	struct tcp_portonly {
		u_int16_t th_sport;
		u_int16_t th_dport;
	} *thp;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	/*
	 * Valid indices into inet6ctlerrmap[] are 0 .. PRC_NCMDS - 1, so
	 * cmd == PRC_NCMDS must be rejected as well.
	 */
	else if (!PRC_IS_REDIRECT(cmd) &&
		 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
	} else {
		m = NULL;
		ip6 = NULL;
		off = 0;	/* fool gcc */
		sa6_src = &sa6_any;
	}

	if (ip6) {
		/*
		 * XXX: We assume that when IPV6 is non NULL,
		 * M and OFF are valid.
		 */

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(*thp))
			return;

		/* Copy just the ports; the rest of th stays zero. */
		bzero(&th, sizeof(th));
		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

		in6_pcbnotify(&tcb, sa, th.th_dport,
		    (struct sockaddr *)sa6_src,
		    th.th_sport, cmd, notify);
	} else
		in6_pcbnotify(&tcb, sa, 0, (struct sockaddr *)sa6_src,
			      0, cmd, notify);
}
#endif /* INET6 */
1102
/*
 * Tunables and state for the randomized initial send sequence (ISS)
 * generator implemented by tcp_rndiss_encrypt(), tcp_rndiss_init() and
 * tcp_rndiss_next().
 */
#define TCP_RNDISS_ROUNDS	16	/* mixing rounds in tcp_rndiss_encrypt() */
#define TCP_RNDISS_OUT	7200	/* seconds until the next scheduled reseed */
#define TCP_RNDISS_MAX	30000	/* values handed out before a forced reseed */

u_int8_t tcp_rndiss_sbox[128];	/* random table, refilled on every reseed */
u_int16_t tcp_rndiss_msb;	/* 0 or 0x8000; flipped on every reseed */
u_int16_t tcp_rndiss_cnt;	/* values generated since the last reseed */
long tcp_rndiss_reseed;		/* tv_sec after which a reseed is due */
1111
1112u_int16_t
1113tcp_rndiss_encrypt(val)
1114	u_int16_t val;
1115{
1116	u_int16_t sum = 0, i;
1117
1118	for (i = 0; i < TCP_RNDISS_ROUNDS; i++) {
1119		sum += 0x79b9;
1120		val ^= ((u_int16_t)tcp_rndiss_sbox[(val^sum) & 0x7f]) << 7;
1121		val = ((val & 0xff) << 7) | (val >> 8);
1122	}
1123
1124	return val;
1125}
1126
1127void
1128tcp_rndiss_init()
1129{
1130	struct timeval time;
1131
1132	getmicrotime(&time);
1133	read_random(tcp_rndiss_sbox, sizeof(tcp_rndiss_sbox));
1134
1135	tcp_rndiss_reseed = time.tv_sec + TCP_RNDISS_OUT;
1136	tcp_rndiss_msb = tcp_rndiss_msb == 0x8000 ? 0 : 0x8000;
1137	tcp_rndiss_cnt = 0;
1138}
1139
1140tcp_seq
1141tcp_rndiss_next()
1142{
1143	u_int16_t tmp;
1144	struct timeval time;
1145
1146	getmicrotime(&time);
1147
1148        if (tcp_rndiss_cnt >= TCP_RNDISS_MAX ||
1149	    time.tv_sec > tcp_rndiss_reseed)
1150                tcp_rndiss_init();
1151
1152	read_random(&tmp, sizeof(tmp));
1153
1154	/* (tmp & 0x7fff) ensures a 32768 byte gap between ISS */
1155	return ((tcp_rndiss_encrypt(tcp_rndiss_cnt++) | tcp_rndiss_msb) <<16) |
1156		(tmp & 0x7fff);
1157}
1158
1159
1160/*
1161 * When a source quench is received, close congestion window
1162 * to one segment.  We will gradually open it again as we proceed.
1163 */
1164void
1165tcp_quench(inp, errno)
1166	struct inpcb *inp;
1167	int errno;
1168{
1169	struct tcpcb *tp = intotcpcb(inp);
1170
1171	if (tp)
1172		tp->snd_cwnd = tp->t_maxseg;
1173}
1174
1175/*
1176 * When a specific ICMP unreachable message is received and the
1177 * connection state is SYN-SENT, drop the connection.  This behavior
1178 * is controlled by the icmp_may_rst sysctl.
1179 */
1180void
1181tcp_drop_syn_sent(inp, errno)
1182	struct inpcb *inp;
1183	int errno;
1184{
1185	struct tcpcb *tp = intotcpcb(inp);
1186
1187	if (tp && tp->t_state == TCPS_SYN_SENT)
1188		tcp_drop(tp, errno);
1189}
1190
1191/*
1192 * When `need fragmentation' ICMP is received, update our idea of the MSS
1193 * based on the new value in the route.  Also nudge TCP to send something,
1194 * since we know the packet we just sent was dropped.
1195 * This duplicates some code in the tcp_mss() function in tcp_input.c.
1196 */
1197void
1198tcp_mtudisc(inp, errno)
1199	struct inpcb *inp;
1200	int errno;
1201{
1202	struct tcpcb *tp = intotcpcb(inp);
1203	struct rtentry *rt;
1204	struct rmxp_tao *taop;
1205	struct socket *so = inp->inp_socket;
1206	int offered;
1207	int mss;
1208#ifdef INET6
1209	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
1210#endif /* INET6 */
1211
1212	if (tp) {
1213#ifdef INET6
1214		if (isipv6)
1215			rt = tcp_rtlookup6(inp);
1216		else
1217#endif /* INET6 */
1218		rt = tcp_rtlookup(inp);
1219		if (!rt || !rt->rt_rmx.rmx_mtu) {
1220			tp->t_maxopd = tp->t_maxseg =
1221#ifdef INET6
1222				isipv6 ? tcp_v6mssdflt :
1223#endif /* INET6 */
1224				tcp_mssdflt;
1225			return;
1226		}
1227		taop = rmx_taop(rt->rt_rmx);
1228		offered = taop->tao_mssopt;
1229		mss = rt->rt_rmx.rmx_mtu -
1230#ifdef INET6
1231			(isipv6 ?
1232			 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
1233#endif /* INET6 */
1234			 sizeof(struct tcpiphdr)
1235#ifdef INET6
1236			 )
1237#endif /* INET6 */
1238			;
1239
1240		if (offered)
1241			mss = min(mss, offered);
1242		/*
1243		 * XXX - The above conditional probably violates the TCP
1244		 * spec.  The problem is that, since we don't know the
1245		 * other end's MSS, we are supposed to use a conservative
1246		 * default.  But, if we do that, then MTU discovery will
1247		 * never actually take place, because the conservative
1248		 * default is much less than the MTUs typically seen
1249		 * on the Internet today.  For the moment, we'll sweep
1250		 * this under the carpet.
1251		 *
1252		 * The conservative default might not actually be a problem
1253		 * if the only case this occurs is when sending an initial
1254		 * SYN with options and data to a host we've never talked
1255		 * to before.  Then, they will reply with an MSS value which
1256		 * will get recorded and the new parameters should get
1257		 * recomputed.  For Further Study.
1258		 */
1259		if (tp->t_maxopd <= mss)
1260			return;
1261		tp->t_maxopd = mss;
1262
1263		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1264		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
1265			mss -= TCPOLEN_TSTAMP_APPA;
1266		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
1267		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
1268			mss -= TCPOLEN_CC_APPA;
1269#if	(MCLBYTES & (MCLBYTES - 1)) == 0
1270		if (mss > MCLBYTES)
1271			mss &= ~(MCLBYTES-1);
1272#else
1273		if (mss > MCLBYTES)
1274			mss = mss / MCLBYTES * MCLBYTES;
1275#endif
1276		if (so->so_snd.sb_hiwat < mss)
1277			mss = so->so_snd.sb_hiwat;
1278
1279		tp->t_maxseg = mss;
1280
1281		tcpstat.tcps_mturesent++;
1282		tp->t_rtttime = 0;
1283		tp->snd_nxt = tp->snd_una;
1284		tcp_output(tp);
1285	}
1286}
1287
1288/*
1289 * Look-up the routing entry to the peer of this inpcb.  If no route
1290 * is found and it cannot be allocated the return NULL.  This routine
1291 * is called by TCP routines that access the rmx structure and by tcp_mss
1292 * to get the interface MTU.
1293 */
1294struct rtentry *
1295tcp_rtlookup(inp)
1296	struct inpcb *inp;
1297{
1298	struct route *ro;
1299	struct rtentry *rt;
1300
1301	ro = &inp->inp_route;
1302	rt = ro->ro_rt;
1303	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1304		/* No route yet, so try to acquire one */
1305		if (inp->inp_faddr.s_addr != INADDR_ANY) {
1306			ro->ro_dst.sa_family = AF_INET;
1307			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
1308			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
1309				inp->inp_faddr;
1310			rtalloc(ro);
1311			rt = ro->ro_rt;
1312		}
1313	}
1314	return rt;
1315}
1316
1317#ifdef INET6
1318struct rtentry *
1319tcp_rtlookup6(inp)
1320	struct inpcb *inp;
1321{
1322	struct route_in6 *ro6;
1323	struct rtentry *rt;
1324
1325	ro6 = &inp->in6p_route;
1326	rt = ro6->ro_rt;
1327	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1328		/* No route yet, so try to acquire one */
1329		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
1330			struct sockaddr_in6 *dst6;
1331
1332			dst6 = (struct sockaddr_in6 *)&ro6->ro_dst;
1333			dst6->sin6_family = AF_INET6;
1334			dst6->sin6_len = sizeof(*dst6);
1335			dst6->sin6_addr = inp->in6p_faddr;
1336			rtalloc((struct route *)ro6);
1337			rt = ro6->ro_rt;
1338		}
1339	}
1340	return rt;
1341}
1342#endif /* INET6 */
1343
1344#ifdef IPSEC
1345/* compute ESP/AH header size for TCP, including outer IP header. */
1346size_t
1347ipsec_hdrsiz_tcp(tp)
1348	struct tcpcb *tp;
1349{
1350	struct inpcb *inp;
1351	struct mbuf *m;
1352	size_t hdrsiz;
1353	struct ip *ip;
1354#ifdef INET6
1355	struct ip6_hdr *ip6;
1356#endif /* INET6 */
1357	struct tcphdr *th;
1358
1359	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
1360		return 0;
1361	MGETHDR(m, M_DONTWAIT, MT_DATA);
1362	if (!m)
1363		return 0;
1364
1365#ifdef INET6
1366	if ((inp->inp_vflag & INP_IPV6) != 0) {
1367		ip6 = mtod(m, struct ip6_hdr *);
1368		th = (struct tcphdr *)(ip6 + 1);
1369		m->m_pkthdr.len = m->m_len =
1370			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1371		tcp_fillheaders(tp, ip6, th);
1372		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1373	} else
1374#endif /* INET6 */
1375      {
1376	ip = mtod(m, struct ip *);
1377	th = (struct tcphdr *)(ip + 1);
1378	m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
1379	tcp_fillheaders(tp, ip, th);
1380	hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1381      }
1382
1383	m_free(m);
1384	return hdrsiz;
1385}
1386#endif /*IPSEC*/
1387
1388/*
1389 * Return a pointer to the cached information about the remote host.
1390 * The cached information is stored in the protocol specific part of
1391 * the route metrics.
1392 */
1393struct rmxp_tao *
1394tcp_gettaocache(inp)
1395	struct inpcb *inp;
1396{
1397	struct rtentry *rt;
1398
1399#ifdef INET6
1400	if ((inp->inp_vflag & INP_IPV6) != 0)
1401		rt = tcp_rtlookup6(inp);
1402	else
1403#endif /* INET6 */
1404	rt = tcp_rtlookup(inp);
1405
1406	/* Make sure this is a host route and is up. */
1407	if (rt == NULL ||
1408	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
1409		return NULL;
1410
1411	return rmx_taop(rt->rt_rmx);
1412}
1413
/*
 * Clear all the TAO cache entries, called from tcp_init.
 *
 * XXX
 * This routine is intentionally empty: we assume that the routing
 * tables are initialized at the same time as TCP, so there is nothing
 * left over in the cache to clear.
 */
static void
tcp_cleartaocache()
{
}
1426