tcp_timewait.c revision 121850
11541Srgrimes/*
211150Swollman * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
31541Srgrimes *	The Regents of the University of California.  All rights reserved.
41541Srgrimes *
51541Srgrimes * Redistribution and use in source and binary forms, with or without
61541Srgrimes * modification, are permitted provided that the following conditions
71541Srgrimes * are met:
81541Srgrimes * 1. Redistributions of source code must retain the above copyright
91541Srgrimes *    notice, this list of conditions and the following disclaimer.
101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
111541Srgrimes *    notice, this list of conditions and the following disclaimer in the
121541Srgrimes *    documentation and/or other materials provided with the distribution.
131541Srgrimes * 3. All advertising materials mentioning features or use of this software
141541Srgrimes *    must display the following acknowledgement:
151541Srgrimes *	This product includes software developed by the University of
161541Srgrimes *	California, Berkeley and its contributors.
171541Srgrimes * 4. Neither the name of the University nor the names of its contributors
181541Srgrimes *    may be used to endorse or promote products derived from this software
191541Srgrimes *    without specific prior written permission.
201541Srgrimes *
211541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
221541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
231541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
241541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
251541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
261541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
271541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
281541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
291541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
301541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
311541Srgrimes * SUCH DAMAGE.
321541Srgrimes *
3311150Swollman *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
3450477Speter * $FreeBSD: head/sys/netinet/tcp_timewait.c 121850 2003-11-01 07:30:08Z silby $
351541Srgrimes */
361541Srgrimes
3732752Seivind#include "opt_compat.h"
3854263Sshin#include "opt_inet6.h"
3956041Sshin#include "opt_ipsec.h"
40101106Srwatson#include "opt_mac.h"
4129514Sjoerg#include "opt_tcpdebug.h"
4229514Sjoerg
431541Srgrimes#include <sys/param.h>
441541Srgrimes#include <sys/systm.h>
4550673Sjlemon#include <sys/callout.h>
4612172Sphk#include <sys/kernel.h>
4712172Sphk#include <sys/sysctl.h>
48101106Srwatson#include <sys/mac.h>
491541Srgrimes#include <sys/malloc.h>
501541Srgrimes#include <sys/mbuf.h>
5155679Sshin#ifdef INET6
5255679Sshin#include <sys/domain.h>
5355679Sshin#endif
5448758Sgreen#include <sys/proc.h>
551541Srgrimes#include <sys/socket.h>
561541Srgrimes#include <sys/socketvar.h>
571541Srgrimes#include <sys/protosw.h>
5875619Skris#include <sys/random.h>
5934923Sbde
6092760Sjeff#include <vm/uma.h>
611541Srgrimes
621541Srgrimes#include <net/route.h>
631541Srgrimes#include <net/if.h>
641541Srgrimes
651541Srgrimes#include <netinet/in.h>
661541Srgrimes#include <netinet/in_systm.h>
671541Srgrimes#include <netinet/ip.h>
6855679Sshin#ifdef INET6
6955679Sshin#include <netinet/ip6.h>
7055679Sshin#endif
711541Srgrimes#include <netinet/in_pcb.h>
7255679Sshin#ifdef INET6
7355679Sshin#include <netinet6/in6_pcb.h>
7455679Sshin#endif
757090Sbde#include <netinet/in_var.h>
761541Srgrimes#include <netinet/ip_var.h>
7755679Sshin#ifdef INET6
7855679Sshin#include <netinet6/ip6_var.h>
7955679Sshin#endif
801541Srgrimes#include <netinet/tcp.h>
811541Srgrimes#include <netinet/tcp_fsm.h>
821541Srgrimes#include <netinet/tcp_seq.h>
831541Srgrimes#include <netinet/tcp_timer.h>
841541Srgrimes#include <netinet/tcp_var.h>
8555679Sshin#ifdef INET6
8655679Sshin#include <netinet6/tcp6_var.h>
8755679Sshin#endif
881541Srgrimes#include <netinet/tcpip.h>
896283Swollman#ifdef TCPDEBUG
906283Swollman#include <netinet/tcp_debug.h>
916283Swollman#endif
9255679Sshin#include <netinet6/ip6protosw.h>
931541Srgrimes
9455679Sshin#ifdef IPSEC
9555679Sshin#include <netinet6/ipsec.h>
9662587Sitojun#ifdef INET6
9762587Sitojun#include <netinet6/ipsec6.h>
9862587Sitojun#endif
9955679Sshin#endif /*IPSEC*/
10055679Sshin
101105199Ssam#ifdef FAST_IPSEC
102105199Ssam#include <netipsec/ipsec.h>
103105199Ssam#ifdef INET6
104105199Ssam#include <netipsec/ipsec6.h>
105105199Ssam#endif
106105199Ssam#define	IPSEC
107105199Ssam#endif /*FAST_IPSEC*/
108105199Ssam
10958698Sjlemon#include <machine/in_cksum.h>
11082122Ssilby#include <sys/md5.h>
11158698Sjlemon
1121541Srgrimesint 	tcp_mssdflt = TCP_MSS;
11346381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
11446381Sbillf    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
11512296Sphk
11652904Sshin#ifdef INET6
11752904Sshinint	tcp_v6mssdflt = TCP6_MSS;
11852904SshinSYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
11955679Sshin	CTLFLAG_RW, &tcp_v6mssdflt , 0,
12055679Sshin	"Default TCP Maximum Segment Size for IPv6");
12152904Sshin#endif
12252904Sshin
12350673Sjlemon#if 0
12412296Sphkstatic int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
12546381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
12646381Sbillf    &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
12750673Sjlemon#endif
12812296Sphk
12986764Sjlemonint	tcp_do_rfc1323 = 1;
13046381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
13146381Sbillf    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
13212296Sphk
13386764Sjlemonint	tcp_do_rfc1644 = 0;
13446381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
13546381Sbillf    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
1361541Srgrimes
13750426Sjlemonstatic int	tcp_tcbhashsize = 0;
138121307SsilbySYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
13950426Sjlemon     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
14050426Sjlemon
14155198Smsmithstatic int	do_tcpdrain = 1;
14266376SbmilekicSYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
14366376Sbmilekic     "Enable tcp_drain routine for extra help when low on mbufs");
14455198Smsmith
14546381SbillfSYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
14646381Sbillf    &tcbinfo.ipi_count, 0, "Number of active PCBs");
14736079Swollman
14872959Sjlemonstatic int	icmp_may_rst = 1;
14972959SjlemonSYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
15072959Sjlemon    "Certain ICMP unreachable messages may abort connections in SYN_SENT");
15170103Sphk
15282122Ssilbystatic int	tcp_isn_reseed_interval = 0;
15382122SsilbySYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
15482122Ssilby    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
15582122Ssilby
156102017Sdillon/*
157102017Sdillon * TCP bandwidth limiting sysctls.  Note that the default lower bound of
158102017Sdillon * 1024 exists only for debugging.  A good production default would be
159102017Sdillon * something like 6100.
160102017Sdillon */
161102017Sdillonstatic int	tcp_inflight_enable = 0;
162102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
163102017Sdillon    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
164102017Sdillon
165104825Sdillonstatic int	tcp_inflight_debug = 0;
166102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
167102017Sdillon    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
168102017Sdillon
169107881Sdillonstatic int	tcp_inflight_min = 6144;
170102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
171102017Sdillon    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
172102017Sdillon
173102017Sdillonstatic int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
174102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
175102017Sdillon    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
176107881Sdillonstatic int	tcp_inflight_stab = 20;
177107881SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
178107881Sdillon    &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
179102017Sdillon
18092723Salfredstatic void	tcp_cleartaocache(void);
18198211Shsustatic struct inpcb *tcp_notify(struct inpcb *, int);
182111145Sjlemonstatic void	tcp_discardcb(struct tcpcb *);
18312296Sphk
/*
 * Target size of the TCP PCB hash tables.  The value must be a power of
 * two; tcp_init() falls back to a safe default when it is not.
 *
 * A kernel configuration may predefine TCBHASHSIZE, and the value can
 * also be overridden at boot through the kernel environment variable
 * net.inet.tcp.tcbhashsize.
 */
#if !defined(TCBHASHSIZE)
#define	TCBHASHSIZE	512
#endif
1931541Srgrimes
1941541Srgrimes/*
195111145Sjlemon * XXX
196111145Sjlemon * Callouts should be moved into struct tcp directly.  They are currently
197111145Sjlemon * separate becuase the tcpcb structure is exported to userland for sysctl
198111145Sjlemon * parsing purposes, which do not know about callouts.
19934881Swollman */
200111145Sjlemonstruct	tcpcb_mem {
20134881Swollman	struct	tcpcb tcb;
202111145Sjlemon	struct	callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep;
203111145Sjlemon	struct	callout tcpcb_mem_2msl, tcpcb_mem_delack;
20434881Swollman};
20534881Swollman
206111145Sjlemonstatic uma_zone_t tcpcb_zone;
207111145Sjlemonstatic uma_zone_t tcptw_zone;
208111145Sjlemon
20934881Swollman/*
2101541Srgrimes * Tcp initialization
2111541Srgrimes */
2121541Srgrimesvoid
2131541Srgrimestcp_init()
2141541Srgrimes{
21577843Speter	int hashsize = TCBHASHSIZE;
21643562Smsmith
2176283Swollman	tcp_ccgen = 1;
2186283Swollman	tcp_cleartaocache();
21950673Sjlemon
22050673Sjlemon	tcp_delacktime = TCPTV_DELACK;
22150673Sjlemon	tcp_keepinit = TCPTV_KEEP_INIT;
22250673Sjlemon	tcp_keepidle = TCPTV_KEEP_IDLE;
22350673Sjlemon	tcp_keepintvl = TCPTV_KEEPINTVL;
22450673Sjlemon	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
22550673Sjlemon	tcp_msl = TCPTV_MSL;
226100335Sdillon	tcp_rexmit_min = TCPTV_MIN;
227100335Sdillon	tcp_rexmit_slop = TCPTV_CPU_VAR;
22850673Sjlemon
22998102Shsu	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
2307684Sdg	LIST_INIT(&tcb);
2317684Sdg	tcbinfo.listhead = &tcb;
23277900Speter	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
23343576Smsmith	if (!powerof2(hashsize)) {
23443562Smsmith		printf("WARNING: TCB hash size not a power of 2\n");
23543562Smsmith		hashsize = 512; /* safe default */
23643562Smsmith	}
23750426Sjlemon	tcp_tcbhashsize = hashsize;
23843562Smsmith	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
23943562Smsmith	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
24034923Sbde					&tcbinfo.porthashmask);
241111145Sjlemon	tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
24292760Sjeff	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
24392760Sjeff	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
24455679Sshin#ifdef INET6
24555679Sshin#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
24655679Sshin#else /* INET6 */
24755679Sshin#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
24855679Sshin#endif /* INET6 */
24955679Sshin	if (max_protohdr < TCP_MINPROTOHDR)
25055679Sshin		max_protohdr = TCP_MINPROTOHDR;
25155679Sshin	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
2521541Srgrimes		panic("tcp_init");
25355679Sshin#undef TCP_MINPROTOHDR
254111145Sjlemon	/*
255111145Sjlemon	 * These have to be type stable for the benefit of the timers.
256111145Sjlemon	 */
257111145Sjlemon	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
258111145Sjlemon	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
259111145Sjlemon	uma_zone_set_max(tcpcb_zone, maxsockets);
260112009Sjlemon	tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
261111145Sjlemon	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
262121453Ssilby	uma_zone_set_max(tcptw_zone, maxsockets / 5);
263112009Sjlemon	tcp_timer_init();
26486764Sjlemon	syncache_init();
2651541Srgrimes}
2661541Srgrimes
2671541Srgrimes/*
26878642Ssilby * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
26978642Ssilby * tcp_template used to store this data in mbufs, but we now recopy it out
27078642Ssilby * of the tcpcb each time to conserve mbufs.
2711541Srgrimes */
27278642Ssilbyvoid
273111144Sjlemontcpip_fillheaders(inp, ip_ptr, tcp_ptr)
274111144Sjlemon	struct inpcb *inp;
27578642Ssilby	void *ip_ptr;
27678642Ssilby	void *tcp_ptr;
2771541Srgrimes{
278111144Sjlemon	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
2791541Srgrimes
28055679Sshin#ifdef INET6
28155679Sshin	if ((inp->inp_vflag & INP_IPV6) != 0) {
28278642Ssilby		struct ip6_hdr *ip6;
28355679Sshin
28478642Ssilby		ip6 = (struct ip6_hdr *)ip_ptr;
28555679Sshin		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
28655679Sshin			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
28755679Sshin		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
28855679Sshin			(IPV6_VERSION & IPV6_VERSION_MASK);
28955679Sshin		ip6->ip6_nxt = IPPROTO_TCP;
29055679Sshin		ip6->ip6_plen = sizeof(struct tcphdr);
29155679Sshin		ip6->ip6_src = inp->in6p_laddr;
29255679Sshin		ip6->ip6_dst = inp->in6p_faddr;
29355679Sshin	} else
29455679Sshin#endif
29578642Ssilby	{
296111144Sjlemon		struct ip *ip;
29755679Sshin
298111144Sjlemon		ip = (struct ip *)ip_ptr;
299111144Sjlemon		ip->ip_v = IPVERSION;
300111144Sjlemon		ip->ip_hl = 5;
301111144Sjlemon		ip->ip_tos = inp->inp_ip_tos;
302111144Sjlemon		ip->ip_len = 0;
303111144Sjlemon		ip->ip_id = 0;
304111144Sjlemon		ip->ip_off = 0;
305111144Sjlemon		ip->ip_ttl = inp->inp_ip_ttl;
306111144Sjlemon		ip->ip_sum = 0;
307111144Sjlemon		ip->ip_p = IPPROTO_TCP;
308111144Sjlemon		ip->ip_src = inp->inp_laddr;
309111144Sjlemon		ip->ip_dst = inp->inp_faddr;
31078642Ssilby	}
311111144Sjlemon	th->th_sport = inp->inp_lport;
312111144Sjlemon	th->th_dport = inp->inp_fport;
313111144Sjlemon	th->th_seq = 0;
314111144Sjlemon	th->th_ack = 0;
315111144Sjlemon	th->th_x2 = 0;
316111144Sjlemon	th->th_off = 5;
317111144Sjlemon	th->th_flags = 0;
318111144Sjlemon	th->th_win = 0;
319111144Sjlemon	th->th_urp = 0;
320111144Sjlemon	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
32178642Ssilby}
32278642Ssilby
32378642Ssilby/*
32478642Ssilby * Create template to be used to send tcp packets on a connection.
32578642Ssilby * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
32678642Ssilby * use for this function is in keepalives, which use tcp_respond.
32778642Ssilby */
32878642Ssilbystruct tcptemp *
329111144Sjlemontcpip_maketemplate(inp)
330111144Sjlemon	struct inpcb *inp;
33178642Ssilby{
33278642Ssilby	struct mbuf *m;
33378642Ssilby	struct tcptemp *n;
33478642Ssilby
335111119Simp	m = m_get(M_DONTWAIT, MT_HEADER);
33678642Ssilby	if (m == NULL)
33778642Ssilby		return (0);
33878642Ssilby	m->m_len = sizeof(struct tcptemp);
33978642Ssilby	n = mtod(m, struct tcptemp *);
34078642Ssilby
341111144Sjlemon	tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
3421541Srgrimes	return (n);
3431541Srgrimes}
3441541Srgrimes
3451541Srgrimes/*
3461541Srgrimes * Send a single message to the TCP at address specified by
3471541Srgrimes * the given TCP/IP header.  If m == 0, then we make a copy
3481541Srgrimes * of the tcpiphdr at ti and send directly to the addressed host.
3491541Srgrimes * This is used to force keep alive messages out using the TCP
35078642Ssilby * template for a connection.  If flags are given then we send
35178642Ssilby * a message back to the TCP which originated the * segment ti,
35278642Ssilby * and discard the mbuf containing it and any other attached mbufs.
3531541Srgrimes *
3541541Srgrimes * In any case the ack and sequence number of the transmitted
3551541Srgrimes * segment are as specified by the parameters.
35631848Sjulian *
35731848Sjulian * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
3581541Srgrimes */
3591541Srgrimesvoid
36055679Sshintcp_respond(tp, ipgen, th, m, ack, seq, flags)
3611541Srgrimes	struct tcpcb *tp;
36255679Sshin	void *ipgen;
36355679Sshin	register struct tcphdr *th;
3641541Srgrimes	register struct mbuf *m;
3651541Srgrimes	tcp_seq ack, seq;
3661541Srgrimes	int flags;
3671541Srgrimes{
3681541Srgrimes	register int tlen;
3691541Srgrimes	int win = 0;
3701541Srgrimes	struct route *ro = 0;
37114754Swollman	struct route sro;
37255679Sshin	struct ip *ip;
37355679Sshin	struct tcphdr *nth;
37455679Sshin#ifdef INET6
37555679Sshin	struct route_in6 *ro6 = 0;
37655679Sshin	struct route_in6 sro6;
37755679Sshin	struct ip6_hdr *ip6;
37855679Sshin	int isipv6;
37955679Sshin#endif /* INET6 */
38055679Sshin	int ipflags = 0;
3811541Srgrimes
382101137Srwatson	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
383101137Srwatson
38455679Sshin#ifdef INET6
385105586Sphk	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
38655679Sshin	ip6 = ipgen;
38755679Sshin#endif /* INET6 */
38855679Sshin	ip = ipgen;
38955679Sshin
3901541Srgrimes	if (tp) {
39157576Sps		if (!(flags & TH_RST)) {
39241187Sguido			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
39357576Sps			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
39457576Sps				win = (long)TCP_MAXWIN << tp->rcv_scale;
39557576Sps		}
39655679Sshin#ifdef INET6
39755679Sshin		if (isipv6)
39855679Sshin			ro6 = &tp->t_inpcb->in6p_route;
39955679Sshin		else
40055679Sshin#endif /* INET6 */
4011541Srgrimes		ro = &tp->t_inpcb->inp_route;
40214754Swollman	} else {
40355679Sshin#ifdef INET6
40455679Sshin		if (isipv6) {
40555679Sshin			ro6 = &sro6;
40655679Sshin			bzero(ro6, sizeof *ro6);
40755679Sshin		} else
40855679Sshin#endif /* INET6 */
40955679Sshin	      {
41014754Swollman		ro = &sro;
41114754Swollman		bzero(ro, sizeof *ro);
41255679Sshin	      }
4131541Srgrimes	}
4141541Srgrimes	if (m == 0) {
415111119Simp		m = m_gethdr(M_DONTWAIT, MT_HEADER);
4161541Srgrimes		if (m == NULL)
4171541Srgrimes			return;
4181541Srgrimes		tlen = 0;
4191541Srgrimes		m->m_data += max_linkhdr;
42055679Sshin#ifdef INET6
42155679Sshin		if (isipv6) {
42255679Sshin			bcopy((caddr_t)ip6, mtod(m, caddr_t),
42355679Sshin			      sizeof(struct ip6_hdr));
42455679Sshin			ip6 = mtod(m, struct ip6_hdr *);
42555679Sshin			nth = (struct tcphdr *)(ip6 + 1);
42655679Sshin		} else
42755679Sshin#endif /* INET6 */
42855679Sshin	      {
42955679Sshin		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
43055679Sshin		ip = mtod(m, struct ip *);
43155679Sshin		nth = (struct tcphdr *)(ip + 1);
43255679Sshin	      }
43355679Sshin		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
4341541Srgrimes		flags = TH_ACK;
4351541Srgrimes	} else {
4361541Srgrimes		m_freem(m->m_next);
4371541Srgrimes		m->m_next = 0;
43855679Sshin		m->m_data = (caddr_t)ipgen;
43955679Sshin		/* m_len is set later */
4401541Srgrimes		tlen = 0;
4411541Srgrimes#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
44255679Sshin#ifdef INET6
44355679Sshin		if (isipv6) {
44455679Sshin			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
44555679Sshin			nth = (struct tcphdr *)(ip6 + 1);
44655679Sshin		} else
44755679Sshin#endif /* INET6 */
44855679Sshin	      {
44955679Sshin		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
45055679Sshin		nth = (struct tcphdr *)(ip + 1);
45155679Sshin	      }
45255679Sshin		if (th != nth) {
45355679Sshin			/*
45455679Sshin			 * this is usually a case when an extension header
45555679Sshin			 * exists between the IPv6 header and the
45655679Sshin			 * TCP header.
45755679Sshin			 */
45855679Sshin			nth->th_sport = th->th_sport;
45955679Sshin			nth->th_dport = th->th_dport;
46055679Sshin		}
46155679Sshin		xchg(nth->th_dport, nth->th_sport, n_short);
4621541Srgrimes#undef xchg
4631541Srgrimes	}
46455679Sshin#ifdef INET6
46555679Sshin	if (isipv6) {
46690198Sume		ip6->ip6_flow = 0;
46790198Sume		ip6->ip6_vfc = IPV6_VERSION;
46890198Sume		ip6->ip6_nxt = IPPROTO_TCP;
46955679Sshin		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
47055679Sshin						tlen));
47155679Sshin		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
47256039Sshin	} else
47355679Sshin#endif
47455679Sshin      {
4751541Srgrimes	tlen += sizeof (struct tcpiphdr);
47658698Sjlemon	ip->ip_len = tlen;
47758698Sjlemon	ip->ip_ttl = ip_defttl;
47855679Sshin      }
4791541Srgrimes	m->m_len = tlen;
4801541Srgrimes	m->m_pkthdr.len = tlen;
4811541Srgrimes	m->m_pkthdr.rcvif = (struct ifnet *) 0;
482101106Srwatson#ifdef MAC
483111483Srwatson	if (tp != NULL && tp->t_inpcb != NULL) {
484101106Srwatson		/*
485101106Srwatson		 * Packet is associated with a socket, so allow the
486101106Srwatson		 * label of the response to reflect the socket label.
487101106Srwatson		 */
488101106Srwatson		mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m);
489101106Srwatson	} else {
490101106Srwatson		/*
491119245Srwatson		 * Packet is not associated with a socket, so possibly
492119245Srwatson		 * update the label in place.
493101106Srwatson		 */
494119245Srwatson		mac_reflect_mbuf_tcp(m);
495101106Srwatson	}
496101106Srwatson#endif
49755679Sshin	nth->th_seq = htonl(seq);
49855679Sshin	nth->th_ack = htonl(ack);
49955679Sshin	nth->th_x2 = 0;
50055679Sshin	nth->th_off = sizeof (struct tcphdr) >> 2;
50155679Sshin	nth->th_flags = flags;
5021541Srgrimes	if (tp)
50355679Sshin		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
5041541Srgrimes	else
50555679Sshin		nth->th_win = htons((u_short)win);
50655679Sshin	nth->th_urp = 0;
50755679Sshin#ifdef INET6
50855679Sshin	if (isipv6) {
50959392Sshin		nth->th_sum = 0;
51055679Sshin		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
51155679Sshin					sizeof(struct ip6_hdr),
51255679Sshin					tlen - sizeof(struct ip6_hdr));
51355679Sshin		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
51455679Sshin					       ro6 && ro6->ro_rt ?
51555679Sshin					       ro6->ro_rt->rt_ifp :
51655679Sshin					       NULL);
51755679Sshin	} else
51855679Sshin#endif /* INET6 */
51955679Sshin      {
52058698Sjlemon        nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
52158698Sjlemon	    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
52258698Sjlemon        m->m_pkthdr.csum_flags = CSUM_TCP;
52358698Sjlemon        m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
52455679Sshin      }
5256283Swollman#ifdef TCPDEBUG
52697658Stanimura	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
52755679Sshin		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
5286283Swollman#endif
52955679Sshin#ifdef INET6
53055679Sshin	if (isipv6) {
531105194Ssam		(void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL,
532105194Ssam			tp ? tp->t_inpcb : NULL);
53355913Sshin		if (ro6 == &sro6 && ro6->ro_rt) {
53455679Sshin			RTFREE(ro6->ro_rt);
53555913Sshin			ro6->ro_rt = NULL;
53655913Sshin		}
53755679Sshin	} else
53855679Sshin#endif /* INET6 */
53955679Sshin      {
540105194Ssam	(void) ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL);
54114841Swollman	if (ro == &sro && ro->ro_rt) {
54214754Swollman		RTFREE(ro->ro_rt);
54355913Sshin		ro->ro_rt = NULL;
54414754Swollman	}
54555679Sshin      }
5461541Srgrimes}
5471541Srgrimes
5481541Srgrimes/*
5491541Srgrimes * Create a new TCP control block, making an
5501541Srgrimes * empty reassembly queue and hooking it to the argument
55134881Swollman * protocol control block.  The `inp' parameter must have
55234881Swollman * come from the zone allocator set up in tcp_init().
5531541Srgrimes */
5541541Srgrimesstruct tcpcb *
5551541Srgrimestcp_newtcpcb(inp)
5561541Srgrimes	struct inpcb *inp;
5571541Srgrimes{
558111145Sjlemon	struct tcpcb_mem *tm;
559111145Sjlemon	struct tcpcb *tp;
56055679Sshin#ifdef INET6
56155679Sshin	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
56255679Sshin#endif /* INET6 */
5631541Srgrimes
564111145Sjlemon	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
565111145Sjlemon	if (tm == NULL)
566111145Sjlemon		return (NULL);
567111145Sjlemon	tp = &tm->tcb;
568111145Sjlemon	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
56955679Sshin	tp->t_maxseg = tp->t_maxopd =
57055679Sshin#ifdef INET6
57155679Sshin		isipv6 ? tcp_v6mssdflt :
57255679Sshin#endif /* INET6 */
57355679Sshin		tcp_mssdflt;
5741541Srgrimes
57550673Sjlemon	/* Set up our timeouts. */
576111145Sjlemon	callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0);
577111145Sjlemon	callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0);
578111145Sjlemon	callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0);
579111145Sjlemon	callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0);
580111145Sjlemon	callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0);
58150673Sjlemon
5826283Swollman	if (tcp_do_rfc1323)
5836283Swollman		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
5846283Swollman	if (tcp_do_rfc1644)
5856283Swollman		tp->t_flags |= TF_REQ_CC;
58634881Swollman	tp->t_inpcb = inp;	/* XXX */
5871541Srgrimes	/*
5881541Srgrimes	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
58916367Swollman	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
5901541Srgrimes	 * reasonable initial retransmit time.
5911541Srgrimes	 */
5921541Srgrimes	tp->t_srtt = TCPTV_SRTTBASE;
59316367Swollman	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
594100335Sdillon	tp->t_rttmin = tcp_rexmit_min;
59516367Swollman	tp->t_rxtcur = TCPTV_RTOBASE;
5961541Srgrimes	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
597102017Sdillon	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
5981541Srgrimes	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
59950673Sjlemon	tp->t_rcvtime = ticks;
600102017Sdillon	tp->t_bw_rtttime = ticks;
60156564Sshin        /*
60256564Sshin	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
60356564Sshin	 * because the socket may be bound to an IPv6 wildcard address,
60456564Sshin	 * which may match an IPv4-mapped IPv6 address.
60556564Sshin	 */
60624570Sdg	inp->inp_ip_ttl = ip_defttl;
6071541Srgrimes	inp->inp_ppcb = (caddr_t)tp;
60834881Swollman	return (tp);		/* XXX */
6091541Srgrimes}
6101541Srgrimes
6111541Srgrimes/*
6121541Srgrimes * Drop a TCP connection, reporting
6131541Srgrimes * the specified error.  If connection is synchronized,
6141541Srgrimes * then send a RST to peer.
6151541Srgrimes */
6161541Srgrimesstruct tcpcb *
6171541Srgrimestcp_drop(tp, errno)
6181541Srgrimes	register struct tcpcb *tp;
6191541Srgrimes	int errno;
6201541Srgrimes{
6211541Srgrimes	struct socket *so = tp->t_inpcb->inp_socket;
6221541Srgrimes
6231541Srgrimes	if (TCPS_HAVERCVDSYN(tp->t_state)) {
6241541Srgrimes		tp->t_state = TCPS_CLOSED;
6251541Srgrimes		(void) tcp_output(tp);
6261541Srgrimes		tcpstat.tcps_drops++;
6271541Srgrimes	} else
6281541Srgrimes		tcpstat.tcps_conndrops++;
6291541Srgrimes	if (errno == ETIMEDOUT && tp->t_softerror)
6301541Srgrimes		errno = tp->t_softerror;
6311541Srgrimes	so->so_error = errno;
6321541Srgrimes	return (tcp_close(tp));
6331541Srgrimes}
6341541Srgrimes
635111145Sjlemonstatic void
636111145Sjlemontcp_discardcb(tp)
637111145Sjlemon	struct tcpcb *tp;
6381541Srgrimes{
639111145Sjlemon	struct tseg_qent *q;
6401541Srgrimes	struct inpcb *inp = tp->t_inpcb;
6411541Srgrimes	struct socket *so = inp->inp_socket;
64255679Sshin#ifdef INET6
64355679Sshin	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
64455679Sshin#endif /* INET6 */
645111145Sjlemon	struct rtentry *rt;
64622719Swollman	int dosavessthresh;
6471541Srgrimes
6481541Srgrimes	/*
64950673Sjlemon	 * Make sure that all of our timers are stopped before we
65050673Sjlemon	 * delete the PCB.
65150673Sjlemon	 */
65250673Sjlemon	callout_stop(tp->tt_rexmt);
65350673Sjlemon	callout_stop(tp->tt_persist);
65450673Sjlemon	callout_stop(tp->tt_keep);
65550673Sjlemon	callout_stop(tp->tt_2msl);
65650673Sjlemon	callout_stop(tp->tt_delack);
65750673Sjlemon
65850673Sjlemon	/*
6599373Swollman	 * If we got enough samples through the srtt filter,
6609373Swollman	 * save the rtt and rttvar in the routing entry.
6619373Swollman	 * 'Enough' is arbitrarily defined as the 16 samples.
6629373Swollman	 * 16 samples is enough for the srtt filter to converge
6639373Swollman	 * to within 5% of the correct value; fewer samples and
6649373Swollman	 * we could save a very bogus rtt.
6651541Srgrimes	 *
6661541Srgrimes	 * Don't update the default route's characteristics and don't
6671541Srgrimes	 * update anything that the user "locked".
6681541Srgrimes	 */
66955679Sshin	if (tp->t_rttupdated >= 16) {
6701549Srgrimes		register u_long i = 0;
67155679Sshin#ifdef INET6
67255679Sshin		if (isipv6) {
67355679Sshin			struct sockaddr_in6 *sin6;
6741541Srgrimes
67555679Sshin			if ((rt = inp->in6p_route.ro_rt) == NULL)
67655679Sshin				goto no_valid_rt;
67755679Sshin			sin6 = (struct sockaddr_in6 *)rt_key(rt);
67855679Sshin			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
67955679Sshin				goto no_valid_rt;
68055679Sshin		}
68155679Sshin		else
68255679Sshin#endif /* INET6 */
68355679Sshin		if ((rt = inp->inp_route.ro_rt) == NULL ||
68455679Sshin		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
68555679Sshin		    == INADDR_ANY)
68655679Sshin			goto no_valid_rt;
68755679Sshin
6881541Srgrimes		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
6891541Srgrimes			i = tp->t_srtt *
69050673Sjlemon			    (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
6911541Srgrimes			if (rt->rt_rmx.rmx_rtt && i)
6921541Srgrimes				/*
6931541Srgrimes				 * filter this update to half the old & half
6941541Srgrimes				 * the new values, converting scale.
6951541Srgrimes				 * See route.h and tcp_var.h for a
6961541Srgrimes				 * description of the scaling constants.
6971541Srgrimes				 */
6981541Srgrimes				rt->rt_rmx.rmx_rtt =
6991541Srgrimes				    (rt->rt_rmx.rmx_rtt + i) / 2;
7001541Srgrimes			else
7011541Srgrimes				rt->rt_rmx.rmx_rtt = i;
7029263Swollman			tcpstat.tcps_cachedrtt++;
7031541Srgrimes		}
7041541Srgrimes		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
7051541Srgrimes			i = tp->t_rttvar *
70650673Sjlemon			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
7071541Srgrimes			if (rt->rt_rmx.rmx_rttvar && i)
7081541Srgrimes				rt->rt_rmx.rmx_rttvar =
7091541Srgrimes				    (rt->rt_rmx.rmx_rttvar + i) / 2;
7101541Srgrimes			else
7111541Srgrimes				rt->rt_rmx.rmx_rttvar = i;
7129263Swollman			tcpstat.tcps_cachedrttvar++;
7131541Srgrimes		}
7141541Srgrimes		/*
71522719Swollman		 * The old comment here said:
7161541Srgrimes		 * update the pipelimit (ssthresh) if it has been updated
7171541Srgrimes		 * already or if a pipesize was specified & the threshhold
7181541Srgrimes		 * got below half the pipesize.  I.e., wait for bad news
7191541Srgrimes		 * before we start updating, then update on both good
7201541Srgrimes		 * and bad news.
72122719Swollman		 *
72222719Swollman		 * But we want to save the ssthresh even if no pipesize is
72322719Swollman		 * specified explicitly in the route, because such
72422719Swollman		 * connections still have an implicit pipesize specified
72522719Swollman		 * by the global tcp_sendspace.  In the absence of a reliable
72622719Swollman		 * way to calculate the pipesize, it will have to do.
7271541Srgrimes		 */
72822719Swollman		i = tp->snd_ssthresh;
72922719Swollman		if (rt->rt_rmx.rmx_sendpipe != 0)
73022719Swollman			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
73122719Swollman		else
73222719Swollman			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
7333444Sphk		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
73422719Swollman		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
73522719Swollman		    || dosavessthresh) {
7361541Srgrimes			/*
7371541Srgrimes			 * convert the limit from user data bytes to
7381541Srgrimes			 * packets then to packet data bytes.
7391541Srgrimes			 */
7401541Srgrimes			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
7411541Srgrimes			if (i < 2)
7421541Srgrimes				i = 2;
74355679Sshin			i *= (u_long)(tp->t_maxseg +
74455679Sshin#ifdef INET6
74555679Sshin				      (isipv6 ? sizeof (struct ip6_hdr) +
74655679Sshin					       sizeof (struct tcphdr) :
74755679Sshin#endif
74855679Sshin				       sizeof (struct tcpiphdr)
74955679Sshin#ifdef INET6
75055679Sshin				       )
75155679Sshin#endif
75255679Sshin				      );
7531541Srgrimes			if (rt->rt_rmx.rmx_ssthresh)
7541541Srgrimes				rt->rt_rmx.rmx_ssthresh =
7551541Srgrimes				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
7561541Srgrimes			else
7571541Srgrimes				rt->rt_rmx.rmx_ssthresh = i;
7589263Swollman			tcpstat.tcps_cachedssthresh++;
7591541Srgrimes		}
7601541Srgrimes	}
76155679Sshin    no_valid_rt:
7621541Srgrimes	/* free the reassembly queue, if any */
763111145Sjlemon	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
76455679Sshin		LIST_REMOVE(q, tqe_q);
76555679Sshin		m_freem(q->tqe_m);
76655679Sshin		FREE(q, M_TSEGQ);
7671541Srgrimes	}
76832821Sdg	inp->inp_ppcb = NULL;
769108265Shsu	tp->t_inpcb = NULL;
770111145Sjlemon	uma_zfree(tcpcb_zone, tp);
7711541Srgrimes	soisdisconnected(so);
772111145Sjlemon}
773111145Sjlemon
774111145Sjlemon/*
775111145Sjlemon * Close a TCP control block:
776111145Sjlemon *    discard all space held by the tcp
777111145Sjlemon *    discard internet protocol block
778111145Sjlemon *    wake up any sleepers
779111145Sjlemon */
780111145Sjlemonstruct tcpcb *
781111145Sjlemontcp_close(tp)
782111145Sjlemon	struct tcpcb *tp;
783111145Sjlemon{
784111145Sjlemon	struct inpcb *inp = tp->t_inpcb;
785111153Sjlemon#ifdef INET6
786111145Sjlemon	struct socket *so = inp->inp_socket;
787111153Sjlemon#endif
788111145Sjlemon
789111145Sjlemon	tcp_discardcb(tp);
79055679Sshin#ifdef INET6
79155679Sshin	if (INP_CHECK_SOCKAF(so, AF_INET6))
79255679Sshin		in6_pcbdetach(inp);
79355679Sshin	else
794111145Sjlemon#endif
795111145Sjlemon		in_pcbdetach(inp);
7961541Srgrimes	tcpstat.tcps_closed++;
7971541Srgrimes	return ((struct tcpcb *)0);
7981541Srgrimes}
7991541Srgrimes
8001541Srgrimesvoid
8011541Srgrimestcp_drain()
8021541Srgrimes{
80355198Smsmith	if (do_tcpdrain)
80455198Smsmith	{
80555198Smsmith		struct inpcb *inpb;
80655198Smsmith		struct tcpcb *tcpb;
80755679Sshin		struct tseg_qent *te;
8081541Srgrimes
80955198Smsmith	/*
81055198Smsmith	 * Walk the tcpbs, if existing, and flush the reassembly queue,
81155198Smsmith	 * if there is one...
81255198Smsmith	 * XXX: The "Net/3" implementation doesn't imply that the TCP
81355198Smsmith	 *      reassembly queue should be flushed, but in a situation
81455198Smsmith	 * 	where we're really low on mbufs, this is potentially
81555198Smsmith	 *  	usefull.
81655198Smsmith	 */
81798102Shsu		INP_INFO_RLOCK(&tcbinfo);
81874362Sphk		LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
819111145Sjlemon			if (inpb->inp_vflag & INP_TIMEWAIT)
820111145Sjlemon				continue;
82198102Shsu			INP_LOCK(inpb);
82274362Sphk			if ((tcpb = intotcpcb(inpb))) {
82374362Sphk				while ((te = LIST_FIRST(&tcpb->t_segq))
82474362Sphk			            != NULL) {
82555679Sshin					LIST_REMOVE(te, tqe_q);
82655679Sshin					m_freem(te->tqe_m);
82755679Sshin					FREE(te, M_TSEGQ);
82855198Smsmith				}
82955198Smsmith			}
83098102Shsu			INP_UNLOCK(inpb);
83155198Smsmith		}
83298102Shsu		INP_INFO_RUNLOCK(&tcbinfo);
83355198Smsmith	}
8341541Srgrimes}
8351541Srgrimes
8361541Srgrimes/*
8371541Srgrimes * Notify a tcp user of an asynchronous error;
8381541Srgrimes * store error as soft error, but wake up user
8391541Srgrimes * (for now, won't do anything until can select for soft error).
84072960Sjlemon *
84172960Sjlemon * Do not wake up user since there currently is no mechanism for
84272960Sjlemon * reporting soft errors (yet - a kqueue filter may be added).
8431541Srgrimes */
84498211Shsustatic struct inpcb *
8451541Srgrimestcp_notify(inp, error)
8461541Srgrimes	struct inpcb *inp;
8471541Srgrimes	int error;
8481541Srgrimes{
84972960Sjlemon	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
8501541Srgrimes
8511541Srgrimes	/*
8521541Srgrimes	 * Ignore some errors if we are hooked up.
8531541Srgrimes	 * If connection hasn't completed, has retransmitted several times,
8541541Srgrimes	 * and receives a second error, give up now.  This is better
8551541Srgrimes	 * than waiting a long time to establish a connection that
8561541Srgrimes	 * can never complete.
8571541Srgrimes	 */
8581541Srgrimes	if (tp->t_state == TCPS_ESTABLISHED &&
859110896Shsu	    (error == EHOSTUNREACH || error == ENETUNREACH ||
860110896Shsu	     error == EHOSTDOWN)) {
86198211Shsu		return inp;
8621541Srgrimes	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
86398211Shsu	    tp->t_softerror) {
86472960Sjlemon		tcp_drop(tp, error);
86598211Shsu		return (struct inpcb *)0;
86698211Shsu	} else {
8671541Srgrimes		tp->t_softerror = error;
86898211Shsu		return inp;
86998211Shsu	}
87072960Sjlemon#if 0
871111748Sdes	wakeup( &so->so_timeo);
8721541Srgrimes	sorwakeup(so);
8731541Srgrimes	sowwakeup(so);
87472960Sjlemon#endif
8751541Srgrimes}
8761541Srgrimes
87736079Swollmanstatic int
87862573Sphktcp_pcblist(SYSCTL_HANDLER_ARGS)
87936079Swollman{
88036079Swollman	int error, i, n, s;
88136079Swollman	struct inpcb *inp, **inp_list;
88236079Swollman	inp_gen_t gencnt;
88336079Swollman	struct xinpgen xig;
88436079Swollman
88536079Swollman	/*
88636079Swollman	 * The process of preparing the TCB list is too time-consuming and
88736079Swollman	 * resource-intensive to repeat twice on every request.
88836079Swollman	 */
88936079Swollman	if (req->oldptr == 0) {
89036079Swollman		n = tcbinfo.ipi_count;
89136079Swollman		req->oldidx = 2 * (sizeof xig)
89236079Swollman			+ (n + n/8) * sizeof(struct xtcpcb);
89336079Swollman		return 0;
89436079Swollman	}
89536079Swollman
89636079Swollman	if (req->newptr != 0)
89736079Swollman		return EPERM;
89836079Swollman
89936079Swollman	/*
90036079Swollman	 * OK, now we're committed to doing something.
90136079Swollman	 */
90236079Swollman	s = splnet();
90398102Shsu	INP_INFO_RLOCK(&tcbinfo);
90436079Swollman	gencnt = tcbinfo.ipi_gencnt;
90536079Swollman	n = tcbinfo.ipi_count;
90698102Shsu	INP_INFO_RUNLOCK(&tcbinfo);
90736079Swollman	splx(s);
90836079Swollman
909100831Struckman	sysctl_wire_old_buffer(req, 2 * (sizeof xig)
910100831Struckman		+ n * sizeof(struct xtcpcb));
911100831Struckman
91236079Swollman	xig.xig_len = sizeof xig;
91336079Swollman	xig.xig_count = n;
91436079Swollman	xig.xig_gen = gencnt;
91536079Swollman	xig.xig_sogen = so_gencnt;
91636079Swollman	error = SYSCTL_OUT(req, &xig, sizeof xig);
91736079Swollman	if (error)
91836079Swollman		return error;
91936079Swollman
920111119Simp	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
92136079Swollman	if (inp_list == 0)
92236079Swollman		return ENOMEM;
92336079Swollman
92436079Swollman	s = splnet();
92598102Shsu	INP_INFO_RLOCK(&tcbinfo);
92671999Sphk	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
92771999Sphk	     inp = LIST_NEXT(inp, inp_list)) {
92898102Shsu		INP_LOCK(inp);
929113345Srwatson		if (inp->inp_gencnt <= gencnt) {
930113345Srwatson			/*
931113345Srwatson			 * XXX: This use of cr_cansee(), introduced with
932113345Srwatson			 * TCP state changes, is not quite right, but for
933113345Srwatson			 * now, better than nothing.
934113345Srwatson			 */
935113345Srwatson			if (inp->inp_vflag & INP_TIMEWAIT)
936113345Srwatson				error = cr_cansee(req->td->td_ucred,
937113345Srwatson				    intotw(inp)->tw_cred);
938113345Srwatson			else
939113345Srwatson				error = cr_canseesocket(req->td->td_ucred,
940113345Srwatson				    inp->inp_socket);
941113345Srwatson			if (error == 0)
942113345Srwatson				inp_list[i++] = inp;
943113345Srwatson		}
94498102Shsu		INP_UNLOCK(inp);
94536079Swollman	}
94698102Shsu	INP_INFO_RUNLOCK(&tcbinfo);
94736079Swollman	splx(s);
94836079Swollman	n = i;
94936079Swollman
95036079Swollman	error = 0;
95136079Swollman	for (i = 0; i < n; i++) {
95236079Swollman		inp = inp_list[i];
95336079Swollman		if (inp->inp_gencnt <= gencnt) {
95436079Swollman			struct xtcpcb xt;
95547960Stegge			caddr_t inp_ppcb;
95636079Swollman			xt.xt_len = sizeof xt;
95736079Swollman			/* XXX should avoid extra copy */
95836079Swollman			bcopy(inp, &xt.xt_inp, sizeof *inp);
95947960Stegge			inp_ppcb = inp->inp_ppcb;
960111145Sjlemon			if (inp_ppcb == NULL)
961111145Sjlemon				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
962111145Sjlemon			else if (inp->inp_vflag & INP_TIMEWAIT) {
963111145Sjlemon				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
964111145Sjlemon				xt.xt_tp.t_state = TCPS_TIME_WAIT;
965111145Sjlemon			} else
96647960Stegge				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
96736079Swollman			if (inp->inp_socket)
96836079Swollman				sotoxsocket(inp->inp_socket, &xt.xt_socket);
969111145Sjlemon			else {
970111145Sjlemon				bzero(&xt.xt_socket, sizeof xt.xt_socket);
971111145Sjlemon				xt.xt_socket.xso_protocol = IPPROTO_TCP;
972111145Sjlemon			}
973110896Shsu			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
97436079Swollman			error = SYSCTL_OUT(req, &xt, sizeof xt);
97536079Swollman		}
97636079Swollman	}
97736079Swollman	if (!error) {
97836079Swollman		/*
97936079Swollman		 * Give the user an updated idea of our state.
98036079Swollman		 * If the generation differs from what we told
98136079Swollman		 * her before, she knows that something happened
98236079Swollman		 * while we were processing this request, and it
98336079Swollman		 * might be necessary to retry.
98436079Swollman		 */
98536079Swollman		s = splnet();
98698102Shsu		INP_INFO_RLOCK(&tcbinfo);
98736079Swollman		xig.xig_gen = tcbinfo.ipi_gencnt;
98836079Swollman		xig.xig_sogen = so_gencnt;
98936079Swollman		xig.xig_count = tcbinfo.ipi_count;
99098102Shsu		INP_INFO_RUNLOCK(&tcbinfo);
99136079Swollman		splx(s);
99236079Swollman		error = SYSCTL_OUT(req, &xig, sizeof xig);
99336079Swollman	}
99436079Swollman	free(inp_list, M_TEMP);
99536079Swollman	return error;
99636079Swollman}
99736079Swollman
99836079SwollmanSYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
99936079Swollman	    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
100036079Swollman
100148758Sgreenstatic int
100262573Sphktcp_getcred(SYSCTL_HANDLER_ARGS)
100348758Sgreen{
100472650Sgreen	struct xucred xuc;
100548758Sgreen	struct sockaddr_in addrs[2];
100648758Sgreen	struct inpcb *inp;
100748758Sgreen	int error, s;
100848758Sgreen
100993593Sjhb	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
101048758Sgreen	if (error)
101148758Sgreen		return (error);
101248758Sgreen	error = SYSCTL_IN(req, addrs, sizeof(addrs));
101348758Sgreen	if (error)
101448758Sgreen		return (error);
101548758Sgreen	s = splnet();
101698102Shsu	INP_INFO_RLOCK(&tcbinfo);
101748758Sgreen	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
101854263Sshin	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
101998102Shsu	if (inp == NULL) {
102048758Sgreen		error = ENOENT;
102198102Shsu		goto outunlocked;
102248758Sgreen	}
102399837Struckman	INP_LOCK(inp);
102499837Struckman	if (inp->inp_socket == NULL) {
102599837Struckman		error = ENOENT;
102699837Struckman		goto out;
102799837Struckman	}
102892976Srwatson	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
102978697Sdwmalone	if (error)
103078697Sdwmalone		goto out;
103191354Sdd	cru2x(inp->inp_socket->so_cred, &xuc);
103248758Sgreenout:
103398102Shsu	INP_UNLOCK(inp);
103498102Shsuoutunlocked:
103598102Shsu	INP_INFO_RUNLOCK(&tcbinfo);
103648758Sgreen	splx(s);
103799838Struckman	if (error == 0)
103899838Struckman		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
103948758Sgreen	return (error);
104048758Sgreen}
104148758Sgreen
104278697SdwmaloneSYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
104378697Sdwmalone    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
104478697Sdwmalone    tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
104548758Sgreen
104655679Sshin#ifdef INET6
104755679Sshinstatic int
104862573Sphktcp6_getcred(SYSCTL_HANDLER_ARGS)
104955679Sshin{
105072650Sgreen	struct xucred xuc;
105155679Sshin	struct sockaddr_in6 addrs[2];
105255679Sshin	struct inpcb *inp;
105355679Sshin	int error, s, mapped = 0;
105455679Sshin
105593593Sjhb	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
105655679Sshin	if (error)
105755679Sshin		return (error);
105855679Sshin	error = SYSCTL_IN(req, addrs, sizeof(addrs));
105955679Sshin	if (error)
106055679Sshin		return (error);
106155679Sshin	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
106255679Sshin		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
106355679Sshin			mapped = 1;
106455679Sshin		else
106555679Sshin			return (EINVAL);
106655679Sshin	}
106755679Sshin	s = splnet();
106898102Shsu	INP_INFO_RLOCK(&tcbinfo);
106955679Sshin	if (mapped == 1)
107055679Sshin		inp = in_pcblookup_hash(&tcbinfo,
107155679Sshin			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
107255679Sshin			addrs[1].sin6_port,
107355679Sshin			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
107455679Sshin			addrs[0].sin6_port,
107555679Sshin			0, NULL);
107655679Sshin	else
107755679Sshin		inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
107855679Sshin				 addrs[1].sin6_port,
107955679Sshin				 &addrs[0].sin6_addr, addrs[0].sin6_port,
108055679Sshin				 0, NULL);
108198102Shsu	if (inp == NULL) {
108255679Sshin		error = ENOENT;
108398102Shsu		goto outunlocked;
108455679Sshin	}
108599837Struckman	INP_LOCK(inp);
108699837Struckman	if (inp->inp_socket == NULL) {
108799837Struckman		error = ENOENT;
108899837Struckman		goto out;
108999837Struckman	}
109092976Srwatson	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
109178697Sdwmalone	if (error)
109278697Sdwmalone		goto out;
109391354Sdd	cru2x(inp->inp_socket->so_cred, &xuc);
109455679Sshinout:
109598102Shsu	INP_UNLOCK(inp);
109698102Shsuoutunlocked:
109798102Shsu	INP_INFO_RUNLOCK(&tcbinfo);
109855679Sshin	splx(s);
109999838Struckman	if (error == 0)
110099838Struckman		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
110155679Sshin	return (error);
110255679Sshin}
110355679Sshin
110478697SdwmaloneSYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
110578697Sdwmalone    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
110678697Sdwmalone    tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
110755679Sshin#endif
110855679Sshin
110955679Sshin
11101541Srgrimesvoid
111112881Sbdetcp_ctlinput(cmd, sa, vip)
11121541Srgrimes	int cmd;
11131541Srgrimes	struct sockaddr *sa;
111412881Sbde	void *vip;
11151541Srgrimes{
111672959Sjlemon	struct ip *ip = vip;
111772959Sjlemon	struct tcphdr *th;
111873109Sjlemon	struct in_addr faddr;
111973109Sjlemon	struct inpcb *inp;
112073109Sjlemon	struct tcpcb *tp;
112198211Shsu	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
112273109Sjlemon	tcp_seq icmp_seq;
112373109Sjlemon	int s;
11241541Srgrimes
112573109Sjlemon	faddr = ((struct sockaddr_in *)sa)->sin_addr;
112673109Sjlemon	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
112773109Sjlemon		return;
112873109Sjlemon
11291541Srgrimes	if (cmd == PRC_QUENCH)
11301541Srgrimes		notify = tcp_quench;
113174937Sjesper	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
113299156Sjesper		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
113372959Sjlemon		notify = tcp_drop_syn_sent;
113473109Sjlemon	else if (cmd == PRC_MSGSIZE)
113510881Swollman		notify = tcp_mtudisc;
113672922Sjesper	else if (PRC_IS_REDIRECT(cmd)) {
113772922Sjesper		ip = 0;
113872922Sjesper		notify = in_rtchange;
113972922Sjesper	} else if (cmd == PRC_HOSTDEAD)
114072922Sjesper		ip = 0;
1141119995Sru	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
11421541Srgrimes		return;
11431541Srgrimes	if (ip) {
114473109Sjlemon		s = splnet();
114517269Swollman		th = (struct tcphdr *)((caddr_t)ip
1146105586Sphk				       + (ip->ip_hl << 2));
114798596Shsu		INP_INFO_WLOCK(&tcbinfo);
114873109Sjlemon		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
114973109Sjlemon		    ip->ip_src, th->th_sport, 0, NULL);
115098102Shsu		if (inp != NULL)  {
115198102Shsu			INP_LOCK(inp);
115298102Shsu			if (inp->inp_socket != NULL) {
115398102Shsu				icmp_seq = htonl(th->th_seq);
115498102Shsu				tp = intotcpcb(inp);
115598102Shsu				if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
115698102Shsu			    		SEQ_LT(icmp_seq, tp->snd_max))
115798211Shsu					inp = (*notify)(inp, inetctlerrmap[cmd]);
115898102Shsu			}
115998211Shsu			if (inp)
116098211Shsu				INP_UNLOCK(inp);
116186764Sjlemon		} else {
116286764Sjlemon			struct in_conninfo inc;
116386764Sjlemon
116486764Sjlemon			inc.inc_fport = th->th_dport;
116586764Sjlemon			inc.inc_lport = th->th_sport;
116686764Sjlemon			inc.inc_faddr = faddr;
116786764Sjlemon			inc.inc_laddr = ip->ip_src;
116886764Sjlemon#ifdef INET6
116986764Sjlemon			inc.inc_isipv6 = 0;
117086764Sjlemon#endif
117186764Sjlemon			syncache_unreach(&inc, th);
117273109Sjlemon		}
117398596Shsu		INP_INFO_WUNLOCK(&tcbinfo);
117473109Sjlemon		splx(s);
11751541Srgrimes	} else
117698102Shsu		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
11771541Srgrimes}
11781541Srgrimes
117955679Sshin#ifdef INET6
118055679Sshinvoid
118155679Sshintcp6_ctlinput(cmd, sa, d)
118255679Sshin	int cmd;
118355679Sshin	struct sockaddr *sa;
118455679Sshin	void *d;
118555679Sshin{
118655679Sshin	struct tcphdr th;
118798211Shsu	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
118855679Sshin	struct ip6_hdr *ip6;
118955679Sshin	struct mbuf *m;
119078064Sume	struct ip6ctlparam *ip6cp = NULL;
119178064Sume	const struct sockaddr_in6 *sa6_src = NULL;
119255679Sshin	int off;
119378064Sume	struct tcp_portonly {
119478064Sume		u_int16_t th_sport;
119578064Sume		u_int16_t th_dport;
119678064Sume	} *thp;
119755679Sshin
119855679Sshin	if (sa->sa_family != AF_INET6 ||
119955679Sshin	    sa->sa_len != sizeof(struct sockaddr_in6))
120055679Sshin		return;
120155679Sshin
120255679Sshin	if (cmd == PRC_QUENCH)
120355679Sshin		notify = tcp_quench;
120455679Sshin	else if (cmd == PRC_MSGSIZE)
120555679Sshin		notify = tcp_mtudisc;
120655679Sshin	else if (!PRC_IS_REDIRECT(cmd) &&
1207119995Sru		 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
120855679Sshin		return;
120955679Sshin
121055679Sshin	/* if the parameter is from icmp6, decode it. */
121155679Sshin	if (d != NULL) {
121278064Sume		ip6cp = (struct ip6ctlparam *)d;
121355679Sshin		m = ip6cp->ip6c_m;
121455679Sshin		ip6 = ip6cp->ip6c_ip6;
121555679Sshin		off = ip6cp->ip6c_off;
121678064Sume		sa6_src = ip6cp->ip6c_src;
121755679Sshin	} else {
121855679Sshin		m = NULL;
121955679Sshin		ip6 = NULL;
122067456Sitojun		off = 0;	/* fool gcc */
122178064Sume		sa6_src = &sa6_any;
122255679Sshin	}
122355679Sshin
122455679Sshin	if (ip6) {
122586764Sjlemon		struct in_conninfo inc;
122655679Sshin		/*
122755679Sshin		 * XXX: We assume that when IPV6 is non NULL,
122855679Sshin		 * M and OFF are valid.
122955679Sshin		 */
123055679Sshin
123167456Sitojun		/* check if we can safely examine src and dst ports */
123278064Sume		if (m->m_pkthdr.len < off + sizeof(*thp))
123367456Sitojun			return;
123467456Sitojun
123578064Sume		bzero(&th, sizeof(th));
123678064Sume		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
123778064Sume
123878064Sume		in6_pcbnotify(&tcb, sa, th.th_dport,
123978064Sume		    (struct sockaddr *)ip6cp->ip6c_src,
124078064Sume		    th.th_sport, cmd, notify);
124186764Sjlemon
124286764Sjlemon		inc.inc_fport = th.th_dport;
124386764Sjlemon		inc.inc_lport = th.th_sport;
124486764Sjlemon		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
124586764Sjlemon		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
124686764Sjlemon		inc.inc_isipv6 = 1;
124786764Sjlemon		syncache_unreach(&inc, &th);
124855679Sshin	} else
124991357Salfred		in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
125055679Sshin			      0, cmd, notify);
125155679Sshin}
125255679Sshin#endif /* INET6 */
125355679Sshin
125480428Speter
125582122Ssilby/*
125682122Ssilby * Following is where TCP initial sequence number generation occurs.
125782122Ssilby *
125882122Ssilby * There are two places where we must use initial sequence numbers:
125982122Ssilby * 1.  In SYN-ACK packets.
126082122Ssilby * 2.  In SYN packets.
126182122Ssilby *
126294390Ssilby * All ISNs for SYN-ACK packets are generated by the syncache.  See
126394390Ssilby * tcp_syncache.c for details.
126482122Ssilby *
126582122Ssilby * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
126682122Ssilby * depends on this property.  In addition, these ISNs should be
126782122Ssilby * unguessable so as to prevent connection hijacking.  To satisfy
126882122Ssilby * the requirements of this situation, the algorithm outlined in
126982122Ssilby * RFC 1948 is used to generate sequence numbers.
127082122Ssilby *
127182122Ssilby * Implementation details:
127282122Ssilby *
127382122Ssilby * Time is based off the system timer, and is corrected so that it
127482122Ssilby * increases by one megabyte per second.  This allows for proper
127582122Ssilby * recycling on high speed LANs while still leaving over an hour
127682122Ssilby * before rollover.
127782122Ssilby *
127882122Ssilby * net.inet.tcp.isn_reseed_interval controls the number of seconds
127982122Ssilby * between seeding of isn_secret.  This is normally set to zero,
128082122Ssilby * as reseeding should not be necessary.
128182122Ssilby *
128282122Ssilby */
128379413Ssilby
128482122Ssilby#define ISN_BYTES_PER_SECOND 1048576
128579413Ssilby
128682122Ssilbyu_char isn_secret[32];
128782122Ssilbyint isn_last_reseed;
128882122SsilbyMD5_CTX isn_ctx;
128975619Skris
129075619Skristcp_seq
129182122Ssilbytcp_new_isn(tp)
129282122Ssilby	struct tcpcb *tp;
129375619Skris{
129482122Ssilby	u_int32_t md5_buffer[4];
129582122Ssilby	tcp_seq new_isn;
129675619Skris
129782122Ssilby	/* Seed if this is the first use, reseed if requested. */
129894390Ssilby	if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
129982122Ssilby	     (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
130082122Ssilby		< (u_int)ticks))) {
130182122Ssilby		read_random(&isn_secret, sizeof(isn_secret));
130282122Ssilby		isn_last_reseed = ticks;
130382122Ssilby	}
130482122Ssilby
130582122Ssilby	/* Compute the md5 hash and return the ISN. */
130682122Ssilby	MD5Init(&isn_ctx);
130782122Ssilby	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
130882122Ssilby	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
130982122Ssilby#ifdef INET6
131082122Ssilby	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
131182122Ssilby		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
131282122Ssilby			  sizeof(struct in6_addr));
131382122Ssilby		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
131482122Ssilby			  sizeof(struct in6_addr));
131582122Ssilby	} else
131682122Ssilby#endif
131782122Ssilby	{
131882122Ssilby		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
131982122Ssilby			  sizeof(struct in_addr));
132082122Ssilby		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
132182122Ssilby			  sizeof(struct in_addr));
132282122Ssilby	}
132382122Ssilby	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
132482122Ssilby	MD5Final((u_char *) &md5_buffer, &isn_ctx);
132582122Ssilby	new_isn = (tcp_seq) md5_buffer[0];
132682122Ssilby	new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
132782122Ssilby	return new_isn;
132875619Skris}
132975619Skris
13301541Srgrimes/*
13311541Srgrimes * When a source quench is received, close congestion window
13321541Srgrimes * to one segment.  We will gradually open it again as we proceed.
13331541Srgrimes */
133498211Shsustruct inpcb *
13351541Srgrimestcp_quench(inp, errno)
13361541Srgrimes	struct inpcb *inp;
13371541Srgrimes	int errno;
13381541Srgrimes{
13391541Srgrimes	struct tcpcb *tp = intotcpcb(inp);
13401541Srgrimes
13411541Srgrimes	if (tp)
13421541Srgrimes		tp->snd_cwnd = tp->t_maxseg;
134398211Shsu	return (inp);
13441541Srgrimes}
13456283Swollman
13466283Swollman/*
134772959Sjlemon * When a specific ICMP unreachable message is received and the
134872959Sjlemon * connection state is SYN-SENT, drop the connection.  This behavior
134972959Sjlemon * is controlled by the icmp_may_rst sysctl.
135070103Sphk */
135198211Shsustruct inpcb *
135270103Sphktcp_drop_syn_sent(inp, errno)
135370103Sphk	struct inpcb *inp;
135470103Sphk	int errno;
135570103Sphk{
135670103Sphk	struct tcpcb *tp = intotcpcb(inp);
135770103Sphk
135898211Shsu	if (tp && tp->t_state == TCPS_SYN_SENT) {
135972638Sphk		tcp_drop(tp, errno);
136098211Shsu		return (struct inpcb *)0;
136198211Shsu	}
136298211Shsu	return inp;
136372638Sphk}
136472638Sphk
136572638Sphk/*
136610881Swollman * When `need fragmentation' ICMP is received, update our idea of the MSS
136710881Swollman * based on the new value in the route.  Also nudge TCP to send something,
136810881Swollman * since we know the packet we just sent was dropped.
136910930Swollman * This duplicates some code in the tcp_mss() function in tcp_input.c.
137010881Swollman */
137198211Shsustruct inpcb *
137210881Swollmantcp_mtudisc(inp, errno)
137310881Swollman	struct inpcb *inp;
137410881Swollman	int errno;
137510881Swollman{
137610881Swollman	struct tcpcb *tp = intotcpcb(inp);
137710930Swollman	struct rtentry *rt;
137810930Swollman	struct rmxp_tao *taop;
137910930Swollman	struct socket *so = inp->inp_socket;
138010930Swollman	int offered;
138110930Swollman	int mss;
138255679Sshin#ifdef INET6
138355679Sshin	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
138455679Sshin#endif /* INET6 */
138510881Swollman
138610930Swollman	if (tp) {
138755679Sshin#ifdef INET6
138855679Sshin		if (isipv6)
138986764Sjlemon			rt = tcp_rtlookup6(&inp->inp_inc);
139055679Sshin		else
139155679Sshin#endif /* INET6 */
139286764Sjlemon		rt = tcp_rtlookup(&inp->inp_inc);
139310930Swollman		if (!rt || !rt->rt_rmx.rmx_mtu) {
139455679Sshin			tp->t_maxopd = tp->t_maxseg =
139555679Sshin#ifdef INET6
139655679Sshin				isipv6 ? tcp_v6mssdflt :
139755679Sshin#endif /* INET6 */
139855679Sshin				tcp_mssdflt;
139998211Shsu			return inp;
140010930Swollman		}
140110930Swollman		taop = rmx_taop(rt->rt_rmx);
140210930Swollman		offered = taop->tao_mssopt;
140355679Sshin		mss = rt->rt_rmx.rmx_mtu -
140455679Sshin#ifdef INET6
140555679Sshin			(isipv6 ?
140655679Sshin			 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
140755679Sshin#endif /* INET6 */
140855679Sshin			 sizeof(struct tcpiphdr)
140955679Sshin#ifdef INET6
141055679Sshin			 )
141155679Sshin#endif /* INET6 */
141255679Sshin			;
141355679Sshin
141412939Swollman		if (offered)
141512939Swollman			mss = min(mss, offered);
141612939Swollman		/*
141712939Swollman		 * XXX - The above conditional probably violates the TCP
141812939Swollman		 * spec.  The problem is that, since we don't know the
141912939Swollman		 * other end's MSS, we are supposed to use a conservative
142012939Swollman		 * default.  But, if we do that, then MTU discovery will
142112939Swollman		 * never actually take place, because the conservative
142212939Swollman		 * default is much less than the MTUs typically seen
142312939Swollman		 * on the Internet today.  For the moment, we'll sweep
142412939Swollman		 * this under the carpet.
142512939Swollman		 *
142612939Swollman		 * The conservative default might not actually be a problem
142712939Swollman		 * if the only case this occurs is when sending an initial
142812939Swollman		 * SYN with options and data to a host we've never talked
142912939Swollman		 * to before.  Then, they will reply with an MSS value which
143012939Swollman		 * will get recorded and the new parameters should get
143112939Swollman		 * recomputed.  For Further Study.
143212939Swollman		 */
143311415Swollman		if (tp->t_maxopd <= mss)
143498211Shsu			return inp;
143510930Swollman		tp->t_maxopd = mss;
143610930Swollman
143710930Swollman		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
143810930Swollman		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
143910930Swollman			mss -= TCPOLEN_TSTAMP_APPA;
144010930Swollman		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
144110930Swollman		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
144210930Swollman			mss -= TCPOLEN_CC_APPA;
144310930Swollman#if	(MCLBYTES & (MCLBYTES - 1)) == 0
144410930Swollman		if (mss > MCLBYTES)
144510930Swollman			mss &= ~(MCLBYTES-1);
144610930Swollman#else
144710930Swollman		if (mss > MCLBYTES)
144810930Swollman			mss = mss / MCLBYTES * MCLBYTES;
144910881Swollman#endif
145010930Swollman		if (so->so_snd.sb_hiwat < mss)
145110930Swollman			mss = so->so_snd.sb_hiwat;
145210930Swollman
145310930Swollman		tp->t_maxseg = mss;
145410930Swollman
145511450Swollman		tcpstat.tcps_mturesent++;
145650673Sjlemon		tp->t_rtttime = 0;
145711450Swollman		tp->snd_nxt = tp->snd_una;
145811450Swollman		tcp_output(tp);
145910930Swollman	}
146098211Shsu	return inp;
146110881Swollman}
146210881Swollman
146310881Swollman/*
14646283Swollman * Look-up the routing entry to the peer of this inpcb.  If no route
1465108265Shsu * is found and it cannot be allocated, then return NULL.  This routine
14666283Swollman * is called by TCP routines that access the rmx structure and by tcp_mss
14676283Swollman * to get the interface MTU.
14686283Swollman */
14696283Swollmanstruct rtentry *
147086764Sjlemontcp_rtlookup(inc)
147186764Sjlemon	struct in_conninfo *inc;
14726283Swollman{
14736283Swollman	struct route *ro;
14746283Swollman	struct rtentry *rt;
14756283Swollman
147686764Sjlemon	ro = &inc->inc_route;
14776283Swollman	rt = ro->ro_rt;
14786283Swollman	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
14796283Swollman		/* No route yet, so try to acquire one */
148086764Sjlemon		if (inc->inc_faddr.s_addr != INADDR_ANY) {
14816283Swollman			ro->ro_dst.sa_family = AF_INET;
148278492Sume			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
14836283Swollman			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
148486764Sjlemon			    inc->inc_faddr;
14856283Swollman			rtalloc(ro);
14866283Swollman			rt = ro->ro_rt;
14876283Swollman		}
14886283Swollman	}
14896283Swollman	return rt;
14906283Swollman}
14916283Swollman
149255679Sshin#ifdef INET6
149355679Sshinstruct rtentry *
149486764Sjlemontcp_rtlookup6(inc)
149586764Sjlemon	struct in_conninfo *inc;
149655679Sshin{
149755679Sshin	struct route_in6 *ro6;
149855679Sshin	struct rtentry *rt;
149955679Sshin
150086764Sjlemon	ro6 = &inc->inc6_route;
150155679Sshin	rt = ro6->ro_rt;
150255679Sshin	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
150355679Sshin		/* No route yet, so try to acquire one */
150486764Sjlemon		if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
150586764Sjlemon			ro6->ro_dst.sin6_family = AF_INET6;
150686764Sjlemon			ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
150786764Sjlemon			ro6->ro_dst.sin6_addr = inc->inc6_faddr;
150855679Sshin			rtalloc((struct route *)ro6);
150955679Sshin			rt = ro6->ro_rt;
151055679Sshin		}
151155679Sshin	}
151255679Sshin	return rt;
151355679Sshin}
151455679Sshin#endif /* INET6 */
151555679Sshin
151655679Sshin#ifdef IPSEC
151755679Sshin/* compute ESP/AH header size for TCP, including outer IP header. */
151855679Sshinsize_t
151955679Sshinipsec_hdrsiz_tcp(tp)
152055679Sshin	struct tcpcb *tp;
152155679Sshin{
152255679Sshin	struct inpcb *inp;
152355679Sshin	struct mbuf *m;
152455679Sshin	size_t hdrsiz;
152555679Sshin	struct ip *ip;
152655679Sshin#ifdef INET6
152755679Sshin	struct ip6_hdr *ip6;
1528111145Sjlemon#endif
152955679Sshin	struct tcphdr *th;
153055679Sshin
153178642Ssilby	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
153255679Sshin		return 0;
1533111119Simp	MGETHDR(m, M_DONTWAIT, MT_DATA);
153455679Sshin	if (!m)
153555679Sshin		return 0;
153655679Sshin
153755679Sshin#ifdef INET6
153855679Sshin	if ((inp->inp_vflag & INP_IPV6) != 0) {
153955679Sshin		ip6 = mtod(m, struct ip6_hdr *);
154055679Sshin		th = (struct tcphdr *)(ip6 + 1);
154155679Sshin		m->m_pkthdr.len = m->m_len =
154255679Sshin			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1543111144Sjlemon		tcpip_fillheaders(inp, ip6, th);
154455679Sshin		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
154555679Sshin	} else
154655679Sshin#endif /* INET6 */
154755679Sshin      {
154855679Sshin	ip = mtod(m, struct ip *);
154955679Sshin	th = (struct tcphdr *)(ip + 1);
155055679Sshin	m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
1551111144Sjlemon	tcpip_fillheaders(inp, ip, th);
155255679Sshin	hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
155355679Sshin      }
155455679Sshin
155555679Sshin	m_free(m);
155655679Sshin	return hdrsiz;
155755679Sshin}
155855679Sshin#endif /*IPSEC*/
155955679Sshin
15606283Swollman/*
15616283Swollman * Return a pointer to the cached information about the remote host.
15626283Swollman * The cached information is stored in the protocol specific part of
15636283Swollman * the route metrics.
15646283Swollman */
15656283Swollmanstruct rmxp_tao *
156686764Sjlemontcp_gettaocache(inc)
156786764Sjlemon	struct in_conninfo *inc;
15686283Swollman{
156955679Sshin	struct rtentry *rt;
15706283Swollman
157155679Sshin#ifdef INET6
157286764Sjlemon	if (inc->inc_isipv6)
157386764Sjlemon		rt = tcp_rtlookup6(inc);
157455679Sshin	else
157555679Sshin#endif /* INET6 */
157686764Sjlemon	rt = tcp_rtlookup(inc);
157755679Sshin
15786283Swollman	/* Make sure this is a host route and is up. */
15796283Swollman	if (rt == NULL ||
15806283Swollman	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
15816283Swollman		return NULL;
15826283Swollman
15836283Swollman	return rmx_taop(rt->rt_rmx);
15846283Swollman}
15856283Swollman
/*
 * Clear all the TAO cache entries; called from tcp_init.
 *
 * XXX
 * This routine is intentionally empty: we assume that the routing tables
 * are initialized at the same time as TCP, so there is nothing left over
 * in the cache.
 */
static void
tcp_cleartaocache()
{
	/* Nothing to clear. */
}
1598102017Sdillon
1599102017Sdillon/*
1600111145Sjlemon * Move a TCP connection into TIME_WAIT state.
1601111145Sjlemon *    tcbinfo is unlocked.
1602111145Sjlemon *    inp is locked, and is unlocked before returning.
1603111145Sjlemon */
1604111145Sjlemonvoid
1605111145Sjlemontcp_twstart(tp)
1606111145Sjlemon	struct tcpcb *tp;
1607111145Sjlemon{
1608111145Sjlemon	struct tcptw *tw;
1609111145Sjlemon	struct inpcb *inp;
1610111145Sjlemon	int tw_time, acknow;
1611111145Sjlemon	struct socket *so;
1612111145Sjlemon
1613112009Sjlemon	tw = uma_zalloc(tcptw_zone, M_NOWAIT);
1614112009Sjlemon	if (tw == NULL) {
1615112009Sjlemon		tw = tcp_timer_2msl_tw(1);
1616112009Sjlemon		if (tw == NULL) {
1617112009Sjlemon			tcp_close(tp);
1618112009Sjlemon			return;
1619112009Sjlemon		}
1620112009Sjlemon	}
1621111145Sjlemon	inp = tp->t_inpcb;
1622111145Sjlemon	tw->tw_inpcb = inp;
1623111145Sjlemon
1624111145Sjlemon	/*
1625111145Sjlemon	 * Recover last window size sent.
1626111145Sjlemon	 */
1627111145Sjlemon	tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;
1628111145Sjlemon
1629111145Sjlemon	/*
1630111145Sjlemon	 * Set t_recent if timestamps are used on the connection.
1631111145Sjlemon	 */
1632111145Sjlemon        if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
1633111145Sjlemon            (TF_REQ_TSTMP|TF_RCVD_TSTMP))
1634111145Sjlemon		tw->t_recent = tp->ts_recent;
1635111145Sjlemon	else
1636111145Sjlemon		tw->t_recent = 0;
1637111145Sjlemon
1638111145Sjlemon	tw->snd_nxt = tp->snd_nxt;
1639111145Sjlemon	tw->rcv_nxt = tp->rcv_nxt;
1640121850Ssilby	tw->iss     = tp->iss;
1641111145Sjlemon	tw->cc_recv = tp->cc_recv;
1642111145Sjlemon	tw->cc_send = tp->cc_send;
1643111145Sjlemon	tw->t_starttime = tp->t_starttime;
1644112009Sjlemon	tw->tw_time = 0;
1645111145Sjlemon
1646111145Sjlemon/* XXX
1647111145Sjlemon * If this code will
1648111145Sjlemon * be used for fin-wait-2 state also, then we may need
1649111145Sjlemon * a ts_recent from the last segment.
1650111145Sjlemon */
1651111145Sjlemon	/* Shorten TIME_WAIT [RFC-1644, p.28] */
1652111145Sjlemon	if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) {
1653111145Sjlemon		tw_time = tp->t_rxtcur * TCPTV_TWTRUNC;
1654111145Sjlemon		/* For T/TCP client, force ACK now. */
1655111145Sjlemon		acknow = 1;
1656111145Sjlemon	} else {
1657111145Sjlemon		tw_time = 2 * tcp_msl;
1658111145Sjlemon		acknow = tp->t_flags & TF_ACKNOW;
1659111145Sjlemon	}
1660111145Sjlemon	tcp_discardcb(tp);
1661111145Sjlemon	so = inp->inp_socket;
1662111145Sjlemon	so->so_pcb = NULL;
1663111145Sjlemon	tw->tw_cred = crhold(so->so_cred);
1664111145Sjlemon	tw->tw_so_options = so->so_options;
1665114794Srwatson	if (acknow)
1666114794Srwatson		tcp_twrespond(tw, so, NULL, TH_ACK);
1667111145Sjlemon	sotryfree(so);
1668111145Sjlemon	inp->inp_socket = NULL;
1669111145Sjlemon	inp->inp_ppcb = (caddr_t)tw;
1670111145Sjlemon	inp->inp_vflag |= INP_TIMEWAIT;
1671112009Sjlemon	tcp_timer_2msl_reset(tw, tw_time);
1672111145Sjlemon	INP_UNLOCK(inp);
1673111145Sjlemon}
1674111145Sjlemon
1675121850Ssilby/*
1676121850Ssilby * Determine if the ISN we will generate has advanced beyond the last
1677121850Ssilby * sequence number used by the previous connection.  If so, indicate
1678121850Ssilby * that it is safe to recycle this tw socket by returning 1.
1679121850Ssilby */
1680121850Ssilbyint
1681121850Ssilbytcp_twrecycleable(struct tcptw *tw)
1682121850Ssilby{
1683121850Ssilby	tcp_seq new_isn = tw->iss;
1684121850Ssilby
1685121850Ssilby	new_isn += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz);
1686121850Ssilby
1687121850Ssilby	if (SEQ_GT(new_isn, tw->snd_nxt))
1688121850Ssilby		return 1;
1689121850Ssilby	else
1690121850Ssilby		return 0;
1691121850Ssilby}
1692121850Ssilby
1693112009Sjlemonstruct tcptw *
1694112009Sjlemontcp_twclose(struct tcptw *tw, int reuse)
1695111145Sjlemon{
1696111145Sjlemon	struct inpcb *inp;
1697111145Sjlemon
1698111145Sjlemon	inp = tw->tw_inpcb;
1699111145Sjlemon	tw->tw_inpcb = NULL;
1700112009Sjlemon	tcp_timer_2msl_stop(tw);
1701111145Sjlemon	inp->inp_ppcb = NULL;
1702111145Sjlemon#ifdef INET6
1703111145Sjlemon	if (inp->inp_vflag & INP_IPV6PROTO)
1704111145Sjlemon		in6_pcbdetach(inp);
1705111145Sjlemon	else
1706111145Sjlemon#endif
1707111145Sjlemon		in_pcbdetach(inp);
1708111145Sjlemon	tcpstat.tcps_closed++;
1709112009Sjlemon	if (reuse)
1710112009Sjlemon		return (tw);
1711112009Sjlemon	uma_zfree(tcptw_zone, tw);
1712112009Sjlemon	return (NULL);
1713111145Sjlemon}
1714111145Sjlemon
1715114794Srwatson/*
1716114794Srwatson * One of so and msrc must be non-NULL for use by the MAC Framework to
1717114794Srwatson * construct a label for ay resulting packet.
1718114794Srwatson */
1719111145Sjlemonint
1720114794Srwatsontcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
1721114794Srwatson    int flags)
1722111145Sjlemon{
1723111145Sjlemon	struct inpcb *inp = tw->tw_inpcb;
1724111145Sjlemon	struct tcphdr *th;
1725111145Sjlemon	struct mbuf *m;
1726111145Sjlemon	struct ip *ip = NULL;
1727111145Sjlemon	u_int8_t *optp;
1728111145Sjlemon	u_int hdrlen, optlen;
1729111145Sjlemon	int error;
1730111145Sjlemon#ifdef INET6
1731111145Sjlemon	struct ip6_hdr *ip6 = NULL;
1732111145Sjlemon	int isipv6 = inp->inp_inc.inc_isipv6;
1733111145Sjlemon#endif
1734111145Sjlemon
1735114794Srwatson	KASSERT(so != NULL || msrc != NULL,
1736114794Srwatson	    ("tcp_twrespond: so and msrc NULL"));
1737114794Srwatson
1738111231Sphk	m = m_gethdr(M_DONTWAIT, MT_HEADER);
1739111145Sjlemon	if (m == NULL)
1740111145Sjlemon		return (ENOBUFS);
1741111145Sjlemon	m->m_data += max_linkhdr;
1742111145Sjlemon
1743114794Srwatson#ifdef MAC
1744114794Srwatson	if (so != NULL)
1745114794Srwatson		mac_create_mbuf_from_socket(so, m);
1746114794Srwatson	else
1747114794Srwatson		mac_create_mbuf_netlayer(msrc, m);
1748114794Srwatson#endif
1749114794Srwatson
1750111153Sjlemon#ifdef INET6
1751111145Sjlemon	if (isipv6) {
1752111145Sjlemon		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1753111145Sjlemon		ip6 = mtod(m, struct ip6_hdr *);
1754111145Sjlemon		th = (struct tcphdr *)(ip6 + 1);
1755111145Sjlemon		tcpip_fillheaders(inp, ip6, th);
1756111153Sjlemon	} else
1757111153Sjlemon#endif
1758111153Sjlemon	{
1759111145Sjlemon		hdrlen = sizeof(struct tcpiphdr);
1760111145Sjlemon		ip = mtod(m, struct ip *);
1761111145Sjlemon		th = (struct tcphdr *)(ip + 1);
1762111145Sjlemon		tcpip_fillheaders(inp, ip, th);
1763111145Sjlemon	}
1764111145Sjlemon	optp = (u_int8_t *)(th + 1);
1765111145Sjlemon
1766111145Sjlemon 	/*
1767111145Sjlemon	 * Send a timestamp and echo-reply if both our side and our peer
1768111145Sjlemon	 * have sent timestamps in our SYN's and this is not a RST.
1769111145Sjlemon 	 */
1770111145Sjlemon	if (tw->t_recent && flags == TH_ACK) {
1771111145Sjlemon		u_int32_t *lp = (u_int32_t *)optp;
1772111145Sjlemon
1773111145Sjlemon 		/* Form timestamp option as shown in appendix A of RFC 1323. */
1774111145Sjlemon 		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
1775111145Sjlemon 		*lp++ = htonl(ticks);
1776111145Sjlemon 		*lp   = htonl(tw->t_recent);
1777111145Sjlemon 		optp += TCPOLEN_TSTAMP_APPA;
1778111145Sjlemon 	}
1779111145Sjlemon
1780111145Sjlemon 	/*
1781111145Sjlemon	 * Send `CC-family' options if needed, and it's not a RST.
1782111145Sjlemon 	 */
1783111145Sjlemon	if (tw->cc_recv != 0 && flags == TH_ACK) {
1784111145Sjlemon		u_int32_t *lp = (u_int32_t *)optp;
1785111145Sjlemon
1786111145Sjlemon		*lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
1787111145Sjlemon		*lp   = htonl(tw->cc_send);
1788111145Sjlemon		optp += TCPOLEN_CC_APPA;
1789111145Sjlemon 	}
1790111145Sjlemon	optlen = optp - (u_int8_t *)(th + 1);
1791111145Sjlemon
1792111145Sjlemon	m->m_len = hdrlen + optlen;
1793111145Sjlemon	m->m_pkthdr.len = m->m_len;
1794111145Sjlemon
1795111145Sjlemon	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
1796111145Sjlemon
1797111145Sjlemon	th->th_seq = htonl(tw->snd_nxt);
1798111145Sjlemon	th->th_ack = htonl(tw->rcv_nxt);
1799111145Sjlemon	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
1800111145Sjlemon	th->th_flags = flags;
1801111145Sjlemon	th->th_win = htons(tw->last_win);
1802111145Sjlemon
1803111153Sjlemon#ifdef INET6
1804111145Sjlemon	if (isipv6) {
1805111145Sjlemon		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
1806111145Sjlemon		    sizeof(struct tcphdr) + optlen);
1807111145Sjlemon		ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
1808111145Sjlemon		    inp->in6p_route.ro_rt->rt_ifp : NULL);
1809111145Sjlemon		error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
1810111145Sjlemon		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
1811111153Sjlemon	} else
1812111153Sjlemon#endif
1813111153Sjlemon	{
1814111145Sjlemon		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1815111145Sjlemon                    htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
1816111145Sjlemon		m->m_pkthdr.csum_flags = CSUM_TCP;
1817111145Sjlemon		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1818111145Sjlemon		ip->ip_len = m->m_pkthdr.len;
1819111145Sjlemon		error = ip_output(m, inp->inp_options, &inp->inp_route,
1820111145Sjlemon		    (tw->tw_so_options & SO_DONTROUTE), NULL, inp);
1821111145Sjlemon	}
1822111145Sjlemon	if (flags & TH_ACK)
1823111145Sjlemon		tcpstat.tcps_sndacks++;
1824111145Sjlemon	else
1825111145Sjlemon		tcpstat.tcps_sndctrl++;
1826111145Sjlemon	tcpstat.tcps_sndtotal++;
1827111145Sjlemon	return (error);
1828111145Sjlemon}
1829111145Sjlemon
1830111145Sjlemon/*
1831102017Sdillon * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
1832102017Sdillon *
1833102017Sdillon * This code attempts to calculate the bandwidth-delay product as a
1834102017Sdillon * means of determining the optimal window size to maximize bandwidth,
1835102017Sdillon * minimize RTT, and avoid the over-allocation of buffers on interfaces and
1836102017Sdillon * routers.  This code also does a fairly good job keeping RTTs in check
1837102017Sdillon * across slow links like modems.  We implement an algorithm which is very
1838102017Sdillon * similar (but not meant to be) TCP/Vegas.  The code operates on the
1839102017Sdillon * transmitter side of a TCP connection and so only affects the transmit
1840102017Sdillon * side of the connection.
1841102017Sdillon *
1842102017Sdillon * BACKGROUND:  TCP makes no provision for the management of buffer space
1843102017Sdillon * at the end points or at the intermediate routers and switches.  A TCP
1844102017Sdillon * stream, whether using NewReno or not, will eventually buffer as
1845102017Sdillon * many packets as it is able and the only reason this typically works is
1846102017Sdillon * due to the fairly small default buffers made available for a connection
1847102017Sdillon * (typically 16K or 32K).  As machines use larger windows and/or window
1848102017Sdillon * scaling it is now fairly easy for even a single TCP connection to blow-out
1849102017Sdillon * all available buffer space not only on the local interface, but on
1850102017Sdillon * intermediate routers and switches as well.  NewReno makes a misguided
1851102017Sdillon * attempt to 'solve' this problem by waiting for an actual failure to occur,
1852102017Sdillon * then backing off, then steadily increasing the window again until another
1853102017Sdillon * failure occurs, ad-infinitum.  This results in terrible oscillation that
1854102017Sdillon * is only made worse as network loads increase and the idea of intentionally
1855102017Sdillon * blowing out network buffers is, frankly, a terrible way to manage network
1856102017Sdillon * resources.
1857102017Sdillon *
1858102017Sdillon * It is far better to limit the transmit window prior to the failure
1859102017Sdillon * condition being achieved.  There are two general ways to do this:  First
1860102017Sdillon * you can 'scan' through different transmit window sizes and locate the
1861102017Sdillon * point where the RTT stops increasing, indicating that you have filled the
1862102017Sdillon * pipe, then scan backwards until you note that RTT stops decreasing, then
1863102017Sdillon * repeat ad-infinitum.  This method works in principle but has severe
1864102017Sdillon * implementation issues due to RTT variances, timer granularity, and
1865102017Sdillon * instability in the algorithm which can lead to many false positives and
1866102017Sdillon * create oscillations as well as interact badly with other TCP streams
1867102017Sdillon * implementing the same algorithm.
1868102017Sdillon *
1869102017Sdillon * The second method is to limit the window to the bandwidth delay product
1870102017Sdillon * of the link.  This is the method we implement.  RTT variances and our
1871102017Sdillon * own manipulation of the congestion window, bwnd, can potentially
1872102017Sdillon * destabilize the algorithm.  For this reason we have to stabilize the
1873102017Sdillon * elements used to calculate the window.  We do this by using the minimum
1874102017Sdillon * observed RTT, the long term average of the observed bandwidth, and
1875102017Sdillon * by adding two segments worth of slop.  It isn't perfect but it is able
1876102017Sdillon * to react to changing conditions and gives us a very stable basis on
1877102017Sdillon * which to extend the algorithm.
1878102017Sdillon */
1879102017Sdillonvoid
1880102017Sdillontcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
1881102017Sdillon{
1882102017Sdillon	u_long bw;
1883102017Sdillon	u_long bwnd;
1884102017Sdillon	int save_ticks;
1885102017Sdillon
1886102017Sdillon	/*
1887102017Sdillon	 * If inflight_enable is disabled in the middle of a tcp connection,
1888102017Sdillon	 * make sure snd_bwnd is effectively disabled.
1889102017Sdillon	 */
1890102017Sdillon	if (tcp_inflight_enable == 0) {
1891102017Sdillon		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1892102017Sdillon		tp->snd_bandwidth = 0;
1893102017Sdillon		return;
1894102017Sdillon	}
1895102017Sdillon
1896102017Sdillon	/*
1897102017Sdillon	 * Figure out the bandwidth.  Due to the tick granularity this
1898102017Sdillon	 * is a very rough number and it MUST be averaged over a fairly
1899102017Sdillon	 * long period of time.  XXX we need to take into account a link
1900102017Sdillon	 * that is not using all available bandwidth, but for now our
1901102017Sdillon	 * slop will ramp us up if this case occurs and the bandwidth later
1902102017Sdillon	 * increases.
1903102368Sdillon	 *
1904102368Sdillon	 * Note: if ticks rollover 'bw' may wind up negative.  We must
1905102368Sdillon	 * effectively reset t_bw_rtttime for this case.
1906102017Sdillon	 */
1907102017Sdillon	save_ticks = ticks;
1908102017Sdillon	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
1909102017Sdillon		return;
1910102017Sdillon
1911102017Sdillon	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
1912102017Sdillon	    (save_ticks - tp->t_bw_rtttime);
1913102017Sdillon	tp->t_bw_rtttime = save_ticks;
1914102017Sdillon	tp->t_bw_rtseq = ack_seq;
1915102368Sdillon	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
1916102017Sdillon		return;
1917102017Sdillon	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
1918102017Sdillon
1919102017Sdillon	tp->snd_bandwidth = bw;
1920102017Sdillon
1921102017Sdillon	/*
1922102017Sdillon	 * Calculate the semi-static bandwidth delay product, plus two maximal
1923102017Sdillon	 * segments.  The additional slop puts us squarely in the sweet
1924107881Sdillon	 * spot and also handles the bandwidth run-up case and stabilization.
1925107881Sdillon	 * Without the slop we could be locking ourselves into a lower
1926107881Sdillon	 * bandwidth.
1927102017Sdillon	 *
1928102017Sdillon	 * Situations Handled:
1929102017Sdillon	 *	(1) Prevents over-queueing of packets on LANs, especially on
1930102017Sdillon	 *	    high speed LANs, allowing larger TCP buffers to be
1931102017Sdillon	 *	    specified, and also does a good job preventing
1932102017Sdillon	 *	    over-queueing of packets over choke points like modems
1933102017Sdillon	 *	    (at least for the transmit side).
1934102017Sdillon	 *
1935102017Sdillon	 *	(2) Is able to handle changing network loads (bandwidth
1936102017Sdillon	 *	    drops so bwnd drops, bandwidth increases so bwnd
1937102017Sdillon	 *	    increases).
1938102017Sdillon	 *
1939102017Sdillon	 *	(3) Theoretically should stabilize in the face of multiple
1940102017Sdillon	 *	    connections implementing the same algorithm (this may need
1941102017Sdillon	 *	    a little work).
1942107881Sdillon	 *
1943107881Sdillon	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
1944107881Sdillon	 *	    be adjusted with a sysctl but typically only needs to be
1945107881Sdillon	 *	    on very slow connections.  A value no smaller then 5
1946107881Sdillon	 *	    should be used, but only reduce this default if you have
1947107881Sdillon	 *	    no other choice.
1948102017Sdillon	 */
1949102017Sdillon#define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
1950107881Sdillon	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
1951102368Sdillon#undef USERTT
1952102017Sdillon
1953102017Sdillon	if (tcp_inflight_debug > 0) {
1954102017Sdillon		static int ltime;
1955102017Sdillon		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
1956102017Sdillon			ltime = ticks;
1957102017Sdillon			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
1958102017Sdillon			    tp,
1959102017Sdillon			    bw,
1960102017Sdillon			    tp->t_rttbest,
1961102017Sdillon			    tp->t_srtt,
1962102017Sdillon			    bwnd
1963102017Sdillon			);
1964102017Sdillon		}
1965102017Sdillon	}
1966102017Sdillon	if ((long)bwnd < tcp_inflight_min)
1967102017Sdillon		bwnd = tcp_inflight_min;
1968102017Sdillon	if (bwnd > tcp_inflight_max)
1969102017Sdillon		bwnd = tcp_inflight_max;
1970102017Sdillon	if ((long)bwnd < tp->t_maxseg * 2)
1971102017Sdillon		bwnd = tp->t_maxseg * 2;
1972102017Sdillon	tp->snd_bwnd = bwnd;
1973102017Sdillon}
1974102017Sdillon
1975