/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
 * $FreeBSD: head/sys/netinet/tcp_timewait.c 72650 2001-02-18 13:30:20Z green $
 */

#include "opt_compat.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#ifdef INET6
#include <sys/domain.h>
#endif
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>

#include <vm/vm_zone.h>

#include <net/route.h>
#include <net/if.h>

#define _IP_VHL
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/in_pcb.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <netinet6/ip6protosw.h>

#ifdef IPSEC
#include <netinet6/ipsec.h>
#ifdef INET6
#include <netinet6/ipsec6.h>
#endif
#endif /*IPSEC*/

#include <machine/in_cksum.h>

int 	tcp_mssdflt = TCP_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");

#ifdef INET6
int	tcp_v6mssdflt = TCP6_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
	CTLFLAG_RW, &tcp_v6mssdflt , 0,
	"Default TCP Maximum Segment Size for IPv6");
#endif

#if 0
static int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
    &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
#endif

static int	tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");

static int	tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");

static int	tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

static int	do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
    &tcbinfo.ipi_count, 0, "Number of active PCBs");

/*
 * Treat ICMP unreachables like a TCP RST as required by rfc1122 section 3.2.2.1.
 *
 * Administratively prohibited messages kill sessions regardless of
 * their current state; other unreachables by default only kill
 * sessions in the SYN-SENT state, which ensures that temporary
 * routing problems do not kill existing TCP sessions.
 * This can be overridden by icmp_like_rst_syn_sent_only.
 */

static int	icmp_unreach_like_rst = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_unreach_like_rst, CTLFLAG_RW,
	&icmp_unreach_like_rst, 0,
	"Treat ICMP unreachable messages like TCP RST, rfc1122 section 3.2.2.1");

/*
 * Control whether ICMP unreachable messages other than administratively
 * prohibited ones will kill sessions not in the SYN-SENT state.
 *
 * Has no effect unless icmp_unreach_like_rst is enabled.
 */

static int	icmp_like_rst_syn_sent_only = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_like_rst_syn_sent_only, CTLFLAG_RW,
	&icmp_like_rst_syn_sent_only, 0,
	"When icmp_unreach_like_rst is enabled, only act on sessions in SYN-SENT state");

static void	tcp_cleartaocache __P((void));
static void	tcp_notify __P((struct inpcb *, int));

/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE	512
#endif

/*
 * This is the actual shape of what we allocate using the zone
 * allocator.  Doing it this way allows us to protect both structures
 * using the same generation count, and also eliminates the overhead
 * of allocating tcpcbs separately.  By hiding the structure here,
 * we avoid changing most of the rest of the code (although it needs
 * to be changed, eventually, for greater efficiency).
 */
#define	ALIGNMENT	32
#define	ALIGNM1		(ALIGNMENT - 1)
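/*
 * The union below rounds the inpcb up to a multiple of ALIGNMENT bytes,
 * so that the tcpcb and callouts that follow it within the single zone
 * allocation start on an aligned boundary.
 */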
struct	inp_tp {
	union {
		struct	inpcb inp;
		char	align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
	} inp_tp_u;
	struct	tcpcb tcb;
	struct	callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
	struct	callout inp_tp_delack;
};
#undef ALIGNMENT
#undef ALIGNM1

/*
 * Tcp initialization
 */
void
tcp_init()
{
	int hashsize;

	tcp_iss = arc4random();	/* wrong, but better than a constant */
	tcp_ccgen = 1;
	tcp_cleartaocache();

	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_msl = TCPTV_MSL;

	LIST_INIT(&tcb);
	tcbinfo.listhead = &tcb;
	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", TCBHASHSIZE, hashsize);
	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512; /* safe default */
	}
	tcp_tcbhashsize = hashsize;
	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
					&tcbinfo.porthashmask);
	tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets,
				 ZONE_INTERRUPT, 0);
#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
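	/*
	 * The TCP/IP (or TCP/IPv6) header together with the link-layer
	 * header must fit within a single mbuf with a packet header.
	 */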
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR
}

/*
 * Create template to be used to send tcp packets on a connection.
 * Call after host entry created, allocates an mbuf and fills
 * in a skeletal tcp/ip header, minimizing the amount of work
 * necessary when the connection is used.
 */
struct tcptemp *
tcp_template(tp)
	struct tcpcb *tp;
{
	register struct inpcb *inp = tp->t_inpcb;
	register struct mbuf *m;
	register struct tcptemp *n;

	if ((n = tp->t_template) == 0) {
		m = m_get(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return (0);
		m->m_len = sizeof (struct tcptemp);
		n = mtod(m, struct tcptemp *);
	}
#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		register struct ip6_hdr *ip6;

		ip6 = (struct ip6_hdr *)n->tt_ipgen;
		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
			(IPV6_VERSION & IPV6_VERSION_MASK);
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = sizeof(struct tcphdr);
		ip6->ip6_src = inp->in6p_laddr;
		ip6->ip6_dst = inp->in6p_faddr;
		n->tt_t.th_sum = 0;
	} else
#endif
      {
	struct ip *ip = (struct ip *)n->tt_ipgen;

	bzero(ip, sizeof(struct ip));		/* XXX overkill? */
	ip->ip_vhl = IP_VHL_BORING;
	ip->ip_p = IPPROTO_TCP;
	ip->ip_src = inp->inp_laddr;
	ip->ip_dst = inp->inp_faddr;
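	/*
	 * Precompute the IPv4 pseudo-header checksum for a bare TCP
	 * header (no options, no payload) over this address pair.
	 */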
	n->tt_t.th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htons(sizeof(struct tcphdr) + IPPROTO_TCP));
      }
	n->tt_t.th_sport = inp->inp_lport;
	n->tt_t.th_dport = inp->inp_fport;
	n->tt_t.th_seq = 0;
	n->tt_t.th_ack = 0;
	n->tt_t.th_x2 = 0;
	n->tt_t.th_off = 5;
	n->tt_t.th_flags = 0;
	n->tt_t.th_win = 0;
	n->tt_t.th_urp = 0;
	return (n);
}

/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection tp->t_template.  If flags are given
 * then we send a message back to the TCP which originated the
 * segment ti, and discard the mbuf containing it and any other
 * attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
	struct tcpcb *tp;
	void *ipgen;
	register struct tcphdr *th;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	register int tlen;
	int win = 0;
	struct route *ro = 0;
	struct route sro;
	struct ip *ip;
	struct tcphdr *nth;
#ifdef INET6
	struct route_in6 *ro6 = 0;
	struct route_in6 sro6;
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;

#ifdef INET6
	isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp) {
		if (!(flags & TH_RST)) {
			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
#ifdef INET6
		if (isipv6)
			ro6 = &tp->t_inpcb->in6p_route;
		else
#endif /* INET6 */
		ro = &tp->t_inpcb->inp_route;
	} else {
#ifdef INET6
		if (isipv6) {
			ro6 = &sro6;
			bzero(ro6, sizeof *ro6);
		} else
#endif /* INET6 */
	      {
		ro = &sro;
		bzero(ro, sizeof *ro);
	      }
	}
	if (m == 0) {
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
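		/*
		 * Keepalive path: when TCP_COMPAT_42 is defined the segment
		 * built from the template carries one garbage byte, for
		 * interoperability with old 4.2BSD-derived stacks that
		 * expect data in a keepalive before they will respond.
		 */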
#ifdef TCP_COMPAT_42
		tlen = 1;
#else
		tlen = 0;
#endif
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
	      {
		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
		ip = mtod(m, struct ip *);
		nth = (struct tcphdr *)(ip + 1);
	      }
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else {
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
	      {
		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
		nth = (struct tcphdr *)(ip + 1);
	      }
		if (th != nth) {
			/*
			 * this usually happens when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
#ifdef INET6
	if (isipv6) {
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
						tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
      {
	tlen += sizeof (struct tcpiphdr);
	ip->ip_len = tlen;
	ip->ip_ttl = ip_defttl;
      }
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *) 0;
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	if (tp)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#ifdef INET6
	if (isipv6) {
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
					sizeof(struct ip6_hdr),
					tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
					       ro6 && ro6->ro_rt ?
					       ro6->ro_rt->rt_ifp :
					       NULL);
	} else
#endif /* INET6 */
      {
	nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
	m->m_pkthdr.csum_flags = CSUM_TCP;
	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
      }
#ifdef TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef IPSEC
	ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL);
#endif
#ifdef INET6
	if (isipv6) {
		(void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL);
		if (ro6 == &sro6 && ro6->ro_rt) {
			RTFREE(ro6->ro_rt);
			ro6->ro_rt = NULL;
		}
	} else
#endif /* INET6 */
      {
	(void) ip_output(m, NULL, ro, ipflags, NULL);
	if (ro == &sro && ro->ro_rt) {
		RTFREE(ro->ro_rt);
		ro->ro_rt = NULL;
	}
      }
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 */
struct tcpcb *
tcp_newtcpcb(inp)
	struct inpcb *inp;
{
	struct inp_tp *it;
	register struct tcpcb *tp;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	it = (struct inp_tp *)inp;
	tp = &it->tcb;
	bzero((char *) tp, sizeof(struct tcpcb));
	LIST_INIT(&tp->t_segq);
	tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;

	/* Set up our timeouts. */
	callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0);
	callout_init(tp->tt_persist = &it->inp_tp_persist, 0);
	callout_init(tp->tt_keep = &it->inp_tp_keep, 0);
	callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0);
	callout_init(tp->tt_delack = &it->inp_tp_delack, 0);

	if (tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	if (tcp_do_rfc1644)
		tp->t_flags |= TF_REQ_CC;
	tp->t_inpcb = inp;	/* XXX */
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives a
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = TCPTV_MIN;
	tp->t_rxtcur = TCPTV_RTOBASE;
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
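	/*
	 * cwnd and ssthresh start at the largest possible window; they
	 * are reduced to sensible values once the connection is set up
	 * and the route metrics and the peer's MSS are known.
	 */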
	tp->t_rcvtime = ticks;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = ip_defttl;
	inp->inp_ppcb = (caddr_t)tp;
	return (tp);		/* XXX */
}

/*
 * Drop a TCP connection, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 */
struct tcpcb *
tcp_drop(tp, errno)
	register struct tcpcb *tp;
	int errno;
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVERCVDSYN(tp->t_state)) {
		tp->t_state = TCPS_CLOSED;
		(void) tcp_output(tp);
		tcpstat.tcps_drops++;
	} else
		tcpstat.tcps_conndrops++;
	if (errno == ETIMEDOUT && tp->t_softerror)
		errno = tp->t_softerror;
	so->so_error = errno;
	return (tcp_close(tp));
}

/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 */
struct tcpcb *
tcp_close(tp)
	register struct tcpcb *tp;
{
	register struct tseg_qent *q;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	register struct rtentry *rt;
	int dosavessthresh;

	/*
	 * Make sure that all of our timers are stopped before we
	 * delete the PCB.
	 */
	callout_stop(tp->tt_rexmt);
	callout_stop(tp->tt_persist);
	callout_stop(tp->tt_keep);
	callout_stop(tp->tt_2msl);
	callout_stop(tp->tt_delack);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		register u_long i = 0;
#ifdef INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			if ((rt = inp->in6p_route.ro_rt) == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		}
		else
#endif /* INET6 */
		if ((rt = inp->inp_route.ro_rt) == NULL ||
		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
		    == INADDR_ANY)
			goto no_valid_rt;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshold
		 * got below half the pipesize.  I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace.  In the absence of a reliable
		 * way to calculate the pipesize, it will have to do.
		 */
		i = tp->snd_ssthresh;
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			i *= (u_long)(tp->t_maxseg +
#ifdef INET6
				      (isipv6 ? sizeof (struct ip6_hdr) +
					       sizeof (struct tcphdr) :
#endif
				       sizeof (struct tcpiphdr)
#ifdef INET6
				       )
#endif
				      );
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}
	rt = inp->inp_route.ro_rt;
	if (rt) {
		/*
		 * mark route for deletion if no information is
		 * cached.
		 */
		if ((tp->t_flags & TF_LQ_OVERFLOW) &&
		    ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0)){
			if (rt->rt_rmx.rmx_rtt == 0)
				rt->rt_flags |= RTF_DELCLONE;
		}
	}
    no_valid_rt:
	/* free the reassembly queue, if any */
	while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		FREE(q, M_TSEGQ);
	}
	if (tp->t_template)
		(void) m_free(dtom(tp->t_template));
	inp->inp_ppcb = NULL;
	soisdisconnected(so);
#ifdef INET6
	if (INP_CHECK_SOCKAF(so, AF_INET6))
		in6_pcbdetach(inp);
	else
#endif /* INET6 */
	in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	return ((struct tcpcb *)0);
}

void
tcp_drain()
{
	if (do_tcpdrain)
	{
		struct inpcb *inpb;
		struct tcpcb *tcpb;
		struct tseg_qent *te;

	/*
	 * Walk the tcpbs, if existing, and flush the reassembly queue,
	 * if there is one...
	 * XXX: The "Net/3" implementation doesn't imply that the TCP
	 *      reassembly queue should be flushed, but in a situation
	 *	where we're really low on mbufs, this is potentially
	 *	useful.
	 */
		for (inpb = LIST_FIRST(tcbinfo.listhead); inpb;
		    inpb = LIST_NEXT(inpb, inp_list)) {
			if ((tcpb = intotcpcb(inpb))) {
				while ((te = LIST_FIRST(&tcpb->t_segq))
				       != NULL) {
					LIST_REMOVE(te, tqe_q);
					m_freem(te->tqe_m);
					FREE(te, M_TSEGQ);
				}
			}
		}

	}
}

/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 */
static void
tcp_notify(inp, error)
	struct inpcb *inp;
	int error;
{
	register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
	register struct socket *so = inp->inp_socket;

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now.  This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	     (error == EHOSTUNREACH || error == ENETUNREACH ||
	      error == EHOSTDOWN)) {
		return;
	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
	    tp->t_softerror)
		so->so_error = error;
	else
		tp->t_softerror = error;
	wakeup((caddr_t) &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}

static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, i, n, s;
	struct inpcb *inp, **inp_list;
	inp_gen_t gencnt;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
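	/*
	 * A NULL oldp means this is a size-only probe; report a slightly
	 * padded estimate so the caller's buffer still fits if a few
	 * connections are created before the real request arrives.
	 */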
	if (req->oldptr == 0) {
		n = tcbinfo.ipi_count;
		req->oldidx = 2 * (sizeof xig)
			+ (n + n/8) * sizeof(struct xtcpcb);
		return 0;
	}

	if (req->newptr != 0)
		return EPERM;

	/*
	 * OK, now we're committed to doing something.
	 */
	s = splnet();
	gencnt = tcbinfo.ipi_gencnt;
	n = tcbinfo.ipi_count;
	splx(s);

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return error;

	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	if (inp_list == 0)
		return ENOMEM;

	s = splnet();
	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
	     inp = LIST_NEXT(inp, inp_list)) {
		if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
			inp_list[i++] = inp;
	}
	splx(s);
	n = i;

	error = 0;
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		if (inp->inp_gencnt <= gencnt) {
			struct xtcpcb xt;
			caddr_t inp_ppcb;
			xt.xt_len = sizeof xt;
			/* XXX should avoid extra copy */
			bcopy(inp, &xt.xt_inp, sizeof *inp);
			inp_ppcb = inp->inp_ppcb;
			if (inp_ppcb != NULL)
				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
			else
				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
			if (inp->inp_socket)
				sotoxsocket(inp->inp_socket, &xt.xt_socket);
			error = SYSCTL_OUT(req, &xt, sizeof xt);
		}
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		s = splnet();
		xig.xig_gen = tcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = tcbinfo.ipi_count;
		splx(s);
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	free(inp_list, M_TEMP);
	return error;
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
	    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");

static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in addrs[2];
	struct inpcb *inp;
	int error, s;

	error = suser(req->p);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	s = splnet();
	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
	if (inp == NULL || inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	bzero(&xuc, sizeof(xuc));
	xuc.cr_uid = inp->inp_socket->so_cred->cr_uid;
	xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups;
	bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups,
	    sizeof(xuc.cr_groups));
	error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
out:
	splx(s);
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
    0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");

#ifdef INET6
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in6 addrs[2];
	struct inpcb *inp;
	int error, s, mapped = 0;

	error = suser(req->p);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
			mapped = 1;
		else
			return (EINVAL);
	}
	s = splnet();
	if (mapped == 1)
		inp = in_pcblookup_hash(&tcbinfo,
			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
			addrs[1].sin6_port,
			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
			addrs[0].sin6_port,
			0, NULL);
	else
		inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
				 addrs[1].sin6_port,
				 &addrs[0].sin6_addr, addrs[0].sin6_port,
				 0, NULL);
	if (inp == NULL || inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	bzero(&xuc, sizeof(xuc));
	xuc.cr_uid = inp->inp_socket->so_cred->cr_uid;
	xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups;
	bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups,
	    sizeof(xuc.cr_groups));
	error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
out:
	splx(s);
	return (error);
}

SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
	    0, 0,
	    tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
#endif


void
tcp_ctlinput(cmd, sa, vip)
	int cmd;
	struct sockaddr *sa;
	void *vip;
{
	register struct ip *ip = vip;
	register struct tcphdr *th;
	void (*notify) __P((struct inpcb *, int)) = tcp_notify;
	tcp_seq tcp_sequence = 0;
	int tcp_seq_check = 0;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if ((icmp_unreach_like_rst == 1) && ((cmd == PRC_UNREACH_HOST) ||
			(cmd == PRC_UNREACH_ADMIN_PROHIB)) && (ip) &&
			((IP_VHL_HL(ip->ip_vhl) << 2) == sizeof(struct ip))) {
		/*
		 * Only go here if the length of the IP header in the ICMP
		 * packet is 20 bytes, that is, it has no options; if it does
		 * have options, we will not have the first 8 bytes of the
		 * TCP header, and thus we cannot match against the TCP
		 * source/destination port numbers and TCP sequence number.
		 *
		 * For PRC_UNREACH_ADMIN_PROHIB drop the session regardless of
		 * its current state; otherwise check the sysctl
		 * icmp_like_rst_syn_sent_only to see if we should drop the
		 * session only in the SYN-SENT state, or in all states.
		 */
		tcp_seq_check = 1;
		if (cmd == PRC_UNREACH_ADMIN_PROHIB) {
			notify = tcp_drop_all_states;
		} else {
			notify = tcp_drop_syn_sent;
		}
	} else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
		 ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0))
		return;
	if (ip) {
		th = (struct tcphdr *)((caddr_t)ip
				       + (IP_VHL_HL(ip->ip_vhl) << 2));
		if (tcp_seq_check == 1)
			tcp_sequence = ntohl(th->th_seq);
		in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport,
			cmd, notify, tcp_sequence, tcp_seq_check);
	} else
		in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify, 0, 0);
}

#ifdef INET6
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	register struct tcphdr *thp;
	struct tcphdr th;
	void (*notify) __P((struct inpcb *, int)) = tcp_notify;
	struct sockaddr_in6 sa6;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	int off;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
		 ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
	} else {
		m = NULL;
		ip6 = NULL;
		off = 0;	/* fool gcc */
	}

	/*
	 * Translate addresses into internal form.
	 * The check that sa is AF_INET6 is done at the top of this function.
	 */
	sa6 = *(struct sockaddr_in6 *)sa;
	if (IN6_IS_ADDR_LINKLOCAL(&sa6.sin6_addr) != 0 && m != NULL &&
	    m->m_pkthdr.rcvif != NULL)
		sa6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index);

	if (ip6) {
		/*
		 * XXX: We assume that when IPV6 is non-NULL,
		 * M and OFF are valid.
		 */
		struct in6_addr s;

		/* translate addresses into internal form */
		memcpy(&s, &ip6->ip6_src, sizeof(s));
		if (IN6_IS_ADDR_LINKLOCAL(&s) != 0 && m != NULL &&
		    m->m_pkthdr.rcvif != NULL)
			s.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index);

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(th))
			return;

		if (m->m_len < off + sizeof(th)) {
			/*
			 * this should be a rare case
			 * because now MINCLSIZE is "(MHLEN + 1)",
			 * so we compromise on this copy...
			 */
			m_copydata(m, off, sizeof(th), (caddr_t)&th);
			thp = &th;
		} else
			thp = (struct tcphdr *)(mtod(m, caddr_t) + off);
		in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, thp->th_dport,
			      &s, thp->th_sport, cmd, notify);
	} else
		in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, 0, &zeroin6_addr,
			      0, cmd, notify);
}
#endif /* INET6 */

/*
 * Check if the supplied TCP sequence number is a sequence number
 * for a sent but unacknowledged packet on the given TCP session.
 */
int
tcp_seq_vs_sess(inp, tcp_sequence)
	struct inpcb *inp;
	tcp_seq tcp_sequence;
{
	struct tcpcb *tp = intotcpcb(inp);
	/*
	 * If the sequence number is less than that of the last
	 * unacknowledged packet, or greater than that of the
	 * last sent, the given sequence number is not that
	 * of a sent but unacknowledged packet for this session.
	 */
	if (SEQ_LT(tcp_sequence, tp->snd_una) ||
			SEQ_GT(tcp_sequence, tp->snd_max)) {
		return(0);
	} else {
		return(1);
	}
}

/*
 * When a source quench is received, close congestion window
 * to one segment.  We will gradually open it again as we proceed.
 */
void
tcp_quench(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp)
		tp->snd_cwnd = tp->t_maxseg;
}

/*
 * When an ICMP unreachable is received, drop the
 * TCP connection; depending on the sysctl
 * icmp_like_rst_syn_sent_only, it may only drop
 * the session if it is in the SYN-SENT state.
 */
void
tcp_drop_syn_sent(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);
	if((tp) && ((icmp_like_rst_syn_sent_only == 0) ||
			(tp->t_state == TCPS_SYN_SENT)))
		tcp_drop(tp, errno);
}

/*
 * When an ICMP unreachable is received, drop the
 * TCP connection regardless of its state.
 */
void
tcp_drop_all_states(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);
	if(tp)
		tcp_drop(tp, errno);
}

/*
 * When `need fragmentation' ICMP is received, update our idea of the MSS
 * based on the new value in the route.  Also nudge TCP to send something,
 * since we know the packet we just sent was dropped.
 * This duplicates some code in the tcp_mss() function in tcp_input.c.
 */
void
tcp_mtudisc(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);
	struct rtentry *rt;
	struct rmxp_tao *taop;
	struct socket *so = inp->inp_socket;
	int offered;
	int mss;
#ifdef INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	if (tp) {
#ifdef INET6
		if (isipv6)
			rt = tcp_rtlookup6(inp);
		else
#endif /* INET6 */
		rt = tcp_rtlookup(inp);
		if (!rt || !rt->rt_rmx.rmx_mtu) {
			tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
				isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
				tcp_mssdflt;
			return;
		}
		taop = rmx_taop(rt->rt_rmx);
		offered = taop->tao_mssopt;
		mss = rt->rt_rmx.rmx_mtu -
#ifdef INET6
			(isipv6 ?
			 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
#endif /* INET6 */
			 sizeof(struct tcpiphdr)
#ifdef INET6
			 )
#endif /* INET6 */
			;

		if (offered)
			mss = min(mss, offered);
		/*
		 * XXX - The above conditional probably violates the TCP
		 * spec.  The problem is that, since we don't know the
		 * other end's MSS, we are supposed to use a conservative
		 * default.  But, if we do that, then MTU discovery will
		 * never actually take place, because the conservative
		 * default is much less than the MTUs typically seen
		 * on the Internet today.  For the moment, we'll sweep
		 * this under the carpet.
		 *
		 * The conservative default might not actually be a problem
		 * if the only case this occurs is when sending an initial
		 * SYN with options and data to a host we've never talked
		 * to before.  Then, they will reply with an MSS value which
		 * will get recorded and the new parameters should get
		 * recomputed.  For Further Study.
		 */
		if (tp->t_maxopd <= mss)
			return;
		tp->t_maxopd = mss;

		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
			mss -= TCPOLEN_TSTAMP_APPA;
		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
			mss -= TCPOLEN_CC_APPA;
#if	(MCLBYTES & (MCLBYTES - 1)) == 0
		if (mss > MCLBYTES)
			mss &= ~(MCLBYTES-1);
#else
		if (mss > MCLBYTES)
			mss = mss / MCLBYTES * MCLBYTES;
#endif
		if (so->so_snd.sb_hiwat < mss)
			mss = so->so_snd.sb_hiwat;

		tp->t_maxseg = mss;

		tcpstat.tcps_mturesent++;
		tp->t_rtttime = 0;
		tp->snd_nxt = tp->snd_una;
		tcp_output(tp);
	}
}

/*
 * Look up the routing entry to the peer of this inpcb.  If no route
 * is found and one cannot be allocated, then return NULL.  This routine
 * is called by TCP routines that access the rmx structure and by tcp_mss
 * to get the interface MTU.
 */
struct rtentry *
tcp_rtlookup(inp)
	struct inpcb *inp;
{
	struct route *ro;
	struct rtentry *rt;

	ro = &inp->inp_route;
	rt = ro->ro_rt;
	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
		/* No route yet, so try to acquire one */
		if (inp->inp_faddr.s_addr != INADDR_ANY) {
			ro->ro_dst.sa_family = AF_INET;
			ro->ro_dst.sa_len = sizeof(ro->ro_dst);
			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
				inp->inp_faddr;
			rtalloc(ro);
			rt = ro->ro_rt;
		}
	}
	return rt;
}

#ifdef INET6
struct rtentry *
tcp_rtlookup6(inp)
	struct inpcb *inp;
{
	struct route_in6 *ro6;
	struct rtentry *rt;

	ro6 = &inp->in6p_route;
	rt = ro6->ro_rt;
	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
		/* No route yet, so try to acquire one */
		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
			ro6->ro_dst.sin6_family = AF_INET6;
			ro6->ro_dst.sin6_len = sizeof(ro6->ro_dst);
			ro6->ro_dst.sin6_addr = inp->in6p_faddr;
			rtalloc((struct route *)ro6);
			rt = ro6->ro_rt;
		}
	}
	return rt;
}
#endif /* INET6 */

#ifdef IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec_hdrsiz_tcp(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp;
	struct mbuf *m;
	size_t hdrsiz;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif /* INET6 */
	struct tcphdr *th;

	if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
		return 0;
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (!m)
		return 0;

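	/*
	 * Build a skeletal TCP/IP (or TCP/IPv6) header from the connection
	 * template in a throwaway mbuf, so the IPsec code can size the
	 * ESP/AH headers that its policy would add to this flow.
	 */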
#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		m->m_pkthdr.len = m->m_len =
			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6,
		      sizeof(struct ip6_hdr));
		bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
		      sizeof(struct tcphdr));
		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	} else
#endif /* INET6 */
      {
	ip = mtod(m, struct ip *);
	th = (struct tcphdr *)(ip + 1);
	m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
	bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip,
	      sizeof(struct ip));
	bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
	      sizeof(struct tcphdr));
	hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
      }

	m_free(m);
	return hdrsiz;
}
#endif /*IPSEC*/

/*
 * Return a pointer to the cached information about the remote host.
 * The cached information is stored in the protocol specific part of
 * the route metrics.
 */
struct rmxp_tao *
tcp_gettaocache(inp)
	struct inpcb *inp;
{
	struct rtentry *rt;

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0)
		rt = tcp_rtlookup6(inp);
	else
#endif /* INET6 */
	rt = tcp_rtlookup(inp);

	/* Make sure this is a host route and is up. */
	if (rt == NULL ||
	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
		return NULL;

	return rmx_taop(rt->rt_rmx);
}

/*
 * Clear all the TAO cache entries, called from tcp_init.
 *
 * XXX
 * This routine is just an empty one, because we assume that the
 * routing tables are initialized at the same time as TCP, so there is
 * nothing left over in the cache.
 */
static void
tcp_cleartaocache()
{
}