tcp_timewait.c revision 126351
11541Srgrimes/*
211150Swollman * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
31541Srgrimes *	The Regents of the University of California.  All rights reserved.
41541Srgrimes *
51541Srgrimes * Redistribution and use in source and binary forms, with or without
61541Srgrimes * modification, are permitted provided that the following conditions
71541Srgrimes * are met:
81541Srgrimes * 1. Redistributions of source code must retain the above copyright
91541Srgrimes *    notice, this list of conditions and the following disclaimer.
101541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
111541Srgrimes *    notice, this list of conditions and the following disclaimer in the
121541Srgrimes *    documentation and/or other materials provided with the distribution.
131541Srgrimes * 3. All advertising materials mentioning features or use of this software
141541Srgrimes *    must display the following acknowledgement:
151541Srgrimes *	This product includes software developed by the University of
161541Srgrimes *	California, Berkeley and its contributors.
171541Srgrimes * 4. Neither the name of the University nor the names of its contributors
181541Srgrimes *    may be used to endorse or promote products derived from this software
191541Srgrimes *    without specific prior written permission.
201541Srgrimes *
211541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
221541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
231541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
241541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
251541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
261541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
271541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
281541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
291541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
301541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
311541Srgrimes * SUCH DAMAGE.
321541Srgrimes *
3311150Swollman *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
3450477Speter * $FreeBSD: head/sys/netinet/tcp_timewait.c 126351 2004-02-28 15:12:20Z rwatson $
351541Srgrimes */
361541Srgrimes
3732752Seivind#include "opt_compat.h"
38125680Sbms#include "opt_inet.h"
3954263Sshin#include "opt_inet6.h"
4056041Sshin#include "opt_ipsec.h"
41101106Srwatson#include "opt_mac.h"
4229514Sjoerg#include "opt_tcpdebug.h"
4329514Sjoerg
441541Srgrimes#include <sys/param.h>
451541Srgrimes#include <sys/systm.h>
4650673Sjlemon#include <sys/callout.h>
4712172Sphk#include <sys/kernel.h>
4812172Sphk#include <sys/sysctl.h>
49101106Srwatson#include <sys/mac.h>
501541Srgrimes#include <sys/malloc.h>
511541Srgrimes#include <sys/mbuf.h>
5255679Sshin#ifdef INET6
5355679Sshin#include <sys/domain.h>
5455679Sshin#endif
5548758Sgreen#include <sys/proc.h>
561541Srgrimes#include <sys/socket.h>
571541Srgrimes#include <sys/socketvar.h>
581541Srgrimes#include <sys/protosw.h>
5975619Skris#include <sys/random.h>
6034923Sbde
6192760Sjeff#include <vm/uma.h>
621541Srgrimes
631541Srgrimes#include <net/route.h>
641541Srgrimes#include <net/if.h>
651541Srgrimes
661541Srgrimes#include <netinet/in.h>
671541Srgrimes#include <netinet/in_systm.h>
681541Srgrimes#include <netinet/ip.h>
6955679Sshin#ifdef INET6
7055679Sshin#include <netinet/ip6.h>
7155679Sshin#endif
721541Srgrimes#include <netinet/in_pcb.h>
7355679Sshin#ifdef INET6
7455679Sshin#include <netinet6/in6_pcb.h>
7555679Sshin#endif
767090Sbde#include <netinet/in_var.h>
771541Srgrimes#include <netinet/ip_var.h>
7855679Sshin#ifdef INET6
7955679Sshin#include <netinet6/ip6_var.h>
80122922Sandre#include <netinet6/nd6.h>
8155679Sshin#endif
821541Srgrimes#include <netinet/tcp.h>
831541Srgrimes#include <netinet/tcp_fsm.h>
841541Srgrimes#include <netinet/tcp_seq.h>
851541Srgrimes#include <netinet/tcp_timer.h>
861541Srgrimes#include <netinet/tcp_var.h>
8755679Sshin#ifdef INET6
8855679Sshin#include <netinet6/tcp6_var.h>
8955679Sshin#endif
901541Srgrimes#include <netinet/tcpip.h>
916283Swollman#ifdef TCPDEBUG
926283Swollman#include <netinet/tcp_debug.h>
936283Swollman#endif
9455679Sshin#include <netinet6/ip6protosw.h>
951541Srgrimes
9655679Sshin#ifdef IPSEC
9755679Sshin#include <netinet6/ipsec.h>
9862587Sitojun#ifdef INET6
9962587Sitojun#include <netinet6/ipsec6.h>
10062587Sitojun#endif
10155679Sshin#endif /*IPSEC*/
10255679Sshin
103105199Ssam#ifdef FAST_IPSEC
104105199Ssam#include <netipsec/ipsec.h>
105125680Sbms#include <netipsec/xform.h>
106105199Ssam#ifdef INET6
107105199Ssam#include <netipsec/ipsec6.h>
108105199Ssam#endif
109125680Sbms#include <netipsec/key.h>
110105199Ssam#define	IPSEC
111105199Ssam#endif /*FAST_IPSEC*/
112105199Ssam
11358698Sjlemon#include <machine/in_cksum.h>
11482122Ssilby#include <sys/md5.h>
11558698Sjlemon
1161541Srgrimesint 	tcp_mssdflt = TCP_MSS;
11746381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
11846381Sbillf    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
11912296Sphk
12052904Sshin#ifdef INET6
12152904Sshinint	tcp_v6mssdflt = TCP6_MSS;
12252904SshinSYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
12355679Sshin	CTLFLAG_RW, &tcp_v6mssdflt , 0,
12455679Sshin	"Default TCP Maximum Segment Size for IPv6");
12552904Sshin#endif
12652904Sshin
127124258Sandre/*
128124258Sandre * Minimum MSS we accept and use. This prevents DoS attacks where
129124258Sandre * we are forced to a ridiculous low MSS like 20 and send hundreds
130124258Sandre * of packets instead of one. The effect scales with the available
131124258Sandre * bandwidth and quickly saturates the CPU and network interface
132124258Sandre * with packet generation and sending. Set to zero to disable MINMSS
133124258Sandre * checking. This setting prevents us from sending too small packets.
134124258Sandre */
135124258Sandreint	tcp_minmss = TCP_MINMSS;
136124258SandreSYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
137124258Sandre    &tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
138124258Sandre/*
139124258Sandre * Number of TCP segments per second we accept from remote host
140124258Sandre * before we start to calculate average segment size. If average
141124258Sandre * segment size drops below the minimum TCP MSS we assume a DoS
142124258Sandre * attack and reset+drop the connection. Care has to be taken not to
143124258Sandre * set this value too small to not kill interactive type connections
144124258Sandre * (telnet, SSH) which send many small packets.
145124258Sandre */
146124258Sandreint     tcp_minmssoverload = TCP_MINMSSOVERLOAD;
147124258SandreSYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW,
148124258Sandre    &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to"
149124258Sandre    "be under the MINMSS Size");
150124258Sandre
15150673Sjlemon#if 0
15212296Sphkstatic int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
15346381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
15446381Sbillf    &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
15550673Sjlemon#endif
15612296Sphk
15786764Sjlemonint	tcp_do_rfc1323 = 1;
15846381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
15946381Sbillf    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
16012296Sphk
16186764Sjlemonint	tcp_do_rfc1644 = 0;
16246381SbillfSYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
16346381Sbillf    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
1641541Srgrimes
16550426Sjlemonstatic int	tcp_tcbhashsize = 0;
166121307SsilbySYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
16750426Sjlemon     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
16850426Sjlemon
16955198Smsmithstatic int	do_tcpdrain = 1;
17066376SbmilekicSYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
17166376Sbmilekic     "Enable tcp_drain routine for extra help when low on mbufs");
17255198Smsmith
17346381SbillfSYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
17446381Sbillf    &tcbinfo.ipi_count, 0, "Number of active PCBs");
17536079Swollman
17672959Sjlemonstatic int	icmp_may_rst = 1;
17772959SjlemonSYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
17872959Sjlemon    "Certain ICMP unreachable messages may abort connections in SYN_SENT");
17970103Sphk
18082122Ssilbystatic int	tcp_isn_reseed_interval = 0;
18182122SsilbySYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
18282122Ssilby    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
18382122Ssilby
184102017Sdillon/*
185102017Sdillon * TCP bandwidth limiting sysctls.  Note that the default lower bound of
186102017Sdillon * 1024 exists only for debugging.  A good production default would be
187102017Sdillon * something like 6100.
188102017Sdillon */
189124199Sandrestatic int	tcp_inflight_enable = 1;
190102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
191102017Sdillon    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
192102017Sdillon
193104825Sdillonstatic int	tcp_inflight_debug = 0;
194102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
195102017Sdillon    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
196102017Sdillon
197107881Sdillonstatic int	tcp_inflight_min = 6144;
198102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
199102017Sdillon    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
200102017Sdillon
201102017Sdillonstatic int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
202102017SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
203102017Sdillon    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
204107881Sdillonstatic int	tcp_inflight_stab = 20;
205107881SdillonSYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
206107881Sdillon    &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
207102017Sdillon
/* Forward declarations of file-local helpers. */
static struct inpcb	*tcp_notify(struct inpcb *inp, int error);
static void		 tcp_discardcb(struct tcpcb *tp);
21012296Sphk
/*
 * Compiled-in target size of the TCP PCB hash tables; the value must be
 * a power of two.
 *
 * tcp_init() lets the kernel environment variable
 * net.inet.tcp.tcbhashsize override this value at boot.
 */
#if !defined(TCBHASHSIZE)
#define	TCBHASHSIZE	512
#endif
2201541Srgrimes
2211541Srgrimes/*
222111145Sjlemon * XXX
223111145Sjlemon * Callouts should be moved into struct tcp directly.  They are currently
224123608Sjhb * separate because the tcpcb structure is exported to userland for sysctl
225111145Sjlemon * parsing purposes, which do not know about callouts.
22634881Swollman */
227111145Sjlemonstruct	tcpcb_mem {
22834881Swollman	struct	tcpcb tcb;
229111145Sjlemon	struct	callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep;
230111145Sjlemon	struct	callout tcpcb_mem_2msl, tcpcb_mem_delack;
23134881Swollman};
23234881Swollman
233111145Sjlemonstatic uma_zone_t tcpcb_zone;
234111145Sjlemonstatic uma_zone_t tcptw_zone;
235111145Sjlemon
23634881Swollman/*
2371541Srgrimes * Tcp initialization
2381541Srgrimes */
2391541Srgrimesvoid
2401541Srgrimestcp_init()
2411541Srgrimes{
24277843Speter	int hashsize = TCBHASHSIZE;
24343562Smsmith
2446283Swollman	tcp_ccgen = 1;
24550673Sjlemon
24650673Sjlemon	tcp_delacktime = TCPTV_DELACK;
24750673Sjlemon	tcp_keepinit = TCPTV_KEEP_INIT;
24850673Sjlemon	tcp_keepidle = TCPTV_KEEP_IDLE;
24950673Sjlemon	tcp_keepintvl = TCPTV_KEEPINTVL;
25050673Sjlemon	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
25150673Sjlemon	tcp_msl = TCPTV_MSL;
252100335Sdillon	tcp_rexmit_min = TCPTV_MIN;
253100335Sdillon	tcp_rexmit_slop = TCPTV_CPU_VAR;
25450673Sjlemon
25598102Shsu	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
2567684Sdg	LIST_INIT(&tcb);
2577684Sdg	tcbinfo.listhead = &tcb;
25877900Speter	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
25943576Smsmith	if (!powerof2(hashsize)) {
26043562Smsmith		printf("WARNING: TCB hash size not a power of 2\n");
26143562Smsmith		hashsize = 512; /* safe default */
26243562Smsmith	}
26350426Sjlemon	tcp_tcbhashsize = hashsize;
26443562Smsmith	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
26543562Smsmith	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
26634923Sbde					&tcbinfo.porthashmask);
267111145Sjlemon	tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
26892760Sjeff	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
26992760Sjeff	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
27055679Sshin#ifdef INET6
27155679Sshin#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
27255679Sshin#else /* INET6 */
27355679Sshin#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
27455679Sshin#endif /* INET6 */
27555679Sshin	if (max_protohdr < TCP_MINPROTOHDR)
27655679Sshin		max_protohdr = TCP_MINPROTOHDR;
27755679Sshin	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
2781541Srgrimes		panic("tcp_init");
27955679Sshin#undef TCP_MINPROTOHDR
280111145Sjlemon	/*
281111145Sjlemon	 * These have to be type stable for the benefit of the timers.
282111145Sjlemon	 */
283111145Sjlemon	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
284111145Sjlemon	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
285111145Sjlemon	uma_zone_set_max(tcpcb_zone, maxsockets);
286112009Sjlemon	tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
287111145Sjlemon	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
288121453Ssilby	uma_zone_set_max(tcptw_zone, maxsockets / 5);
289112009Sjlemon	tcp_timer_init();
29086764Sjlemon	syncache_init();
291122922Sandre	tcp_hc_init();
292126193Sandre	tcp_reass_init();
2931541Srgrimes}
2941541Srgrimes
2951541Srgrimes/*
29678642Ssilby * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
29778642Ssilby * tcp_template used to store this data in mbufs, but we now recopy it out
29878642Ssilby * of the tcpcb each time to conserve mbufs.
2991541Srgrimes */
30078642Ssilbyvoid
301111144Sjlemontcpip_fillheaders(inp, ip_ptr, tcp_ptr)
302111144Sjlemon	struct inpcb *inp;
30378642Ssilby	void *ip_ptr;
30478642Ssilby	void *tcp_ptr;
3051541Srgrimes{
306111144Sjlemon	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
3071541Srgrimes
30855679Sshin#ifdef INET6
30955679Sshin	if ((inp->inp_vflag & INP_IPV6) != 0) {
31078642Ssilby		struct ip6_hdr *ip6;
31155679Sshin
31278642Ssilby		ip6 = (struct ip6_hdr *)ip_ptr;
31355679Sshin		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
31455679Sshin			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
31555679Sshin		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
31655679Sshin			(IPV6_VERSION & IPV6_VERSION_MASK);
31755679Sshin		ip6->ip6_nxt = IPPROTO_TCP;
31855679Sshin		ip6->ip6_plen = sizeof(struct tcphdr);
31955679Sshin		ip6->ip6_src = inp->in6p_laddr;
32055679Sshin		ip6->ip6_dst = inp->in6p_faddr;
32155679Sshin	} else
32255679Sshin#endif
32378642Ssilby	{
324111144Sjlemon		struct ip *ip;
32555679Sshin
326111144Sjlemon		ip = (struct ip *)ip_ptr;
327111144Sjlemon		ip->ip_v = IPVERSION;
328111144Sjlemon		ip->ip_hl = 5;
329111144Sjlemon		ip->ip_tos = inp->inp_ip_tos;
330111144Sjlemon		ip->ip_len = 0;
331111144Sjlemon		ip->ip_id = 0;
332111144Sjlemon		ip->ip_off = 0;
333111144Sjlemon		ip->ip_ttl = inp->inp_ip_ttl;
334111144Sjlemon		ip->ip_sum = 0;
335111144Sjlemon		ip->ip_p = IPPROTO_TCP;
336111144Sjlemon		ip->ip_src = inp->inp_laddr;
337111144Sjlemon		ip->ip_dst = inp->inp_faddr;
33878642Ssilby	}
339111144Sjlemon	th->th_sport = inp->inp_lport;
340111144Sjlemon	th->th_dport = inp->inp_fport;
341111144Sjlemon	th->th_seq = 0;
342111144Sjlemon	th->th_ack = 0;
343111144Sjlemon	th->th_x2 = 0;
344111144Sjlemon	th->th_off = 5;
345111144Sjlemon	th->th_flags = 0;
346111144Sjlemon	th->th_win = 0;
347111144Sjlemon	th->th_urp = 0;
348111144Sjlemon	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
34978642Ssilby}
35078642Ssilby
35178642Ssilby/*
35278642Ssilby * Create template to be used to send tcp packets on a connection.
35378642Ssilby * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
35478642Ssilby * use for this function is in keepalives, which use tcp_respond.
35578642Ssilby */
35678642Ssilbystruct tcptemp *
357111144Sjlemontcpip_maketemplate(inp)
358111144Sjlemon	struct inpcb *inp;
35978642Ssilby{
36078642Ssilby	struct mbuf *m;
36178642Ssilby	struct tcptemp *n;
36278642Ssilby
363111119Simp	m = m_get(M_DONTWAIT, MT_HEADER);
36478642Ssilby	if (m == NULL)
36578642Ssilby		return (0);
36678642Ssilby	m->m_len = sizeof(struct tcptemp);
36778642Ssilby	n = mtod(m, struct tcptemp *);
36878642Ssilby
369111144Sjlemon	tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
3701541Srgrimes	return (n);
3711541Srgrimes}
3721541Srgrimes
3731541Srgrimes/*
3741541Srgrimes * Send a single message to the TCP at address specified by
3751541Srgrimes * the given TCP/IP header.  If m == 0, then we make a copy
3761541Srgrimes * of the tcpiphdr at ti and send directly to the addressed host.
3771541Srgrimes * This is used to force keep alive messages out using the TCP
37878642Ssilby * template for a connection.  If flags are given then we send
37978642Ssilby * a message back to the TCP which originated the * segment ti,
38078642Ssilby * and discard the mbuf containing it and any other attached mbufs.
3811541Srgrimes *
3821541Srgrimes * In any case the ack and sequence number of the transmitted
3831541Srgrimes * segment are as specified by the parameters.
38431848Sjulian *
38531848Sjulian * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
3861541Srgrimes */
3871541Srgrimesvoid
38855679Sshintcp_respond(tp, ipgen, th, m, ack, seq, flags)
3891541Srgrimes	struct tcpcb *tp;
39055679Sshin	void *ipgen;
39155679Sshin	register struct tcphdr *th;
3921541Srgrimes	register struct mbuf *m;
3931541Srgrimes	tcp_seq ack, seq;
3941541Srgrimes	int flags;
3951541Srgrimes{
3961541Srgrimes	register int tlen;
3971541Srgrimes	int win = 0;
39855679Sshin	struct ip *ip;
39955679Sshin	struct tcphdr *nth;
40055679Sshin#ifdef INET6
40155679Sshin	struct ip6_hdr *ip6;
40255679Sshin	int isipv6;
40355679Sshin#endif /* INET6 */
40455679Sshin	int ipflags = 0;
405122922Sandre	struct inpcb *inp = NULL;
4061541Srgrimes
407101137Srwatson	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
408101137Srwatson
40955679Sshin#ifdef INET6
410105586Sphk	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
41155679Sshin	ip6 = ipgen;
41255679Sshin#endif /* INET6 */
41355679Sshin	ip = ipgen;
41455679Sshin
4151541Srgrimes	if (tp) {
416122327Ssam		inp = tp->t_inpcb;
417122327Ssam		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
418122327Ssam		INP_INFO_WLOCK_ASSERT(&tcbinfo);
419122327Ssam		INP_LOCK_ASSERT(inp);
42057576Sps		if (!(flags & TH_RST)) {
421122327Ssam			win = sbspace(&inp->inp_socket->so_rcv);
42257576Sps			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
42357576Sps				win = (long)TCP_MAXWIN << tp->rcv_scale;
42457576Sps		}
4251541Srgrimes	}
4261541Srgrimes	if (m == 0) {
427111119Simp		m = m_gethdr(M_DONTWAIT, MT_HEADER);
4281541Srgrimes		if (m == NULL)
4291541Srgrimes			return;
4301541Srgrimes		tlen = 0;
4311541Srgrimes		m->m_data += max_linkhdr;
43255679Sshin#ifdef INET6
43355679Sshin		if (isipv6) {
43455679Sshin			bcopy((caddr_t)ip6, mtod(m, caddr_t),
43555679Sshin			      sizeof(struct ip6_hdr));
43655679Sshin			ip6 = mtod(m, struct ip6_hdr *);
43755679Sshin			nth = (struct tcphdr *)(ip6 + 1);
43855679Sshin		} else
43955679Sshin#endif /* INET6 */
44055679Sshin	      {
44155679Sshin		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
44255679Sshin		ip = mtod(m, struct ip *);
44355679Sshin		nth = (struct tcphdr *)(ip + 1);
44455679Sshin	      }
44555679Sshin		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
4461541Srgrimes		flags = TH_ACK;
4471541Srgrimes	} else {
4481541Srgrimes		m_freem(m->m_next);
4491541Srgrimes		m->m_next = 0;
45055679Sshin		m->m_data = (caddr_t)ipgen;
45155679Sshin		/* m_len is set later */
4521541Srgrimes		tlen = 0;
4531541Srgrimes#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
45455679Sshin#ifdef INET6
45555679Sshin		if (isipv6) {
45655679Sshin			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
45755679Sshin			nth = (struct tcphdr *)(ip6 + 1);
45855679Sshin		} else
45955679Sshin#endif /* INET6 */
46055679Sshin	      {
46155679Sshin		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
46255679Sshin		nth = (struct tcphdr *)(ip + 1);
46355679Sshin	      }
46455679Sshin		if (th != nth) {
46555679Sshin			/*
46655679Sshin			 * this is usually a case when an extension header
46755679Sshin			 * exists between the IPv6 header and the
46855679Sshin			 * TCP header.
46955679Sshin			 */
47055679Sshin			nth->th_sport = th->th_sport;
47155679Sshin			nth->th_dport = th->th_dport;
47255679Sshin		}
47355679Sshin		xchg(nth->th_dport, nth->th_sport, n_short);
4741541Srgrimes#undef xchg
4751541Srgrimes	}
47655679Sshin#ifdef INET6
47755679Sshin	if (isipv6) {
47890198Sume		ip6->ip6_flow = 0;
47990198Sume		ip6->ip6_vfc = IPV6_VERSION;
48090198Sume		ip6->ip6_nxt = IPPROTO_TCP;
48155679Sshin		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
48255679Sshin						tlen));
48355679Sshin		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
48456039Sshin	} else
48555679Sshin#endif
48655679Sshin      {
4871541Srgrimes	tlen += sizeof (struct tcpiphdr);
48858698Sjlemon	ip->ip_len = tlen;
48958698Sjlemon	ip->ip_ttl = ip_defttl;
490124248Sandre	if (path_mtu_discovery)
491124248Sandre		ip->ip_off |= IP_DF;
49255679Sshin      }
4931541Srgrimes	m->m_len = tlen;
4941541Srgrimes	m->m_pkthdr.len = tlen;
4951541Srgrimes	m->m_pkthdr.rcvif = (struct ifnet *) 0;
496101106Srwatson#ifdef MAC
497122327Ssam	if (inp != NULL) {
498101106Srwatson		/*
499101106Srwatson		 * Packet is associated with a socket, so allow the
500101106Srwatson		 * label of the response to reflect the socket label.
501101106Srwatson		 */
502122327Ssam		mac_create_mbuf_from_socket(inp->inp_socket, m);
503101106Srwatson	} else {
504101106Srwatson		/*
505119245Srwatson		 * Packet is not associated with a socket, so possibly
506119245Srwatson		 * update the label in place.
507101106Srwatson		 */
508119245Srwatson		mac_reflect_mbuf_tcp(m);
509101106Srwatson	}
510101106Srwatson#endif
51155679Sshin	nth->th_seq = htonl(seq);
51255679Sshin	nth->th_ack = htonl(ack);
51355679Sshin	nth->th_x2 = 0;
51455679Sshin	nth->th_off = sizeof (struct tcphdr) >> 2;
51555679Sshin	nth->th_flags = flags;
5161541Srgrimes	if (tp)
51755679Sshin		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
5181541Srgrimes	else
51955679Sshin		nth->th_win = htons((u_short)win);
52055679Sshin	nth->th_urp = 0;
52155679Sshin#ifdef INET6
52255679Sshin	if (isipv6) {
52359392Sshin		nth->th_sum = 0;
52455679Sshin		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
52555679Sshin					sizeof(struct ip6_hdr),
52655679Sshin					tlen - sizeof(struct ip6_hdr));
527122922Sandre		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);
52855679Sshin	} else
52955679Sshin#endif /* INET6 */
53055679Sshin      {
53158698Sjlemon        nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
53258698Sjlemon	    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
53358698Sjlemon        m->m_pkthdr.csum_flags = CSUM_TCP;
53458698Sjlemon        m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
53555679Sshin      }
5366283Swollman#ifdef TCPDEBUG
537122327Ssam	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
53855679Sshin		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
5396283Swollman#endif
54055679Sshin#ifdef INET6
541122922Sandre	if (isipv6)
542122922Sandre		(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
543122922Sandre	else
54455679Sshin#endif /* INET6 */
545122922Sandre	(void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
5461541Srgrimes}
5471541Srgrimes
5481541Srgrimes/*
5491541Srgrimes * Create a new TCP control block, making an
5501541Srgrimes * empty reassembly queue and hooking it to the argument
55134881Swollman * protocol control block.  The `inp' parameter must have
55234881Swollman * come from the zone allocator set up in tcp_init().
5531541Srgrimes */
5541541Srgrimesstruct tcpcb *
5551541Srgrimestcp_newtcpcb(inp)
5561541Srgrimes	struct inpcb *inp;
5571541Srgrimes{
558111145Sjlemon	struct tcpcb_mem *tm;
559111145Sjlemon	struct tcpcb *tp;
56055679Sshin#ifdef INET6
56155679Sshin	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
56255679Sshin#endif /* INET6 */
5631541Srgrimes
564111145Sjlemon	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
565111145Sjlemon	if (tm == NULL)
566111145Sjlemon		return (NULL);
567111145Sjlemon	tp = &tm->tcb;
568111145Sjlemon	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
56955679Sshin	tp->t_maxseg = tp->t_maxopd =
57055679Sshin#ifdef INET6
57155679Sshin		isipv6 ? tcp_v6mssdflt :
57255679Sshin#endif /* INET6 */
57355679Sshin		tcp_mssdflt;
5741541Srgrimes
57550673Sjlemon	/* Set up our timeouts. */
576111145Sjlemon	callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0);
577111145Sjlemon	callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0);
578111145Sjlemon	callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0);
579111145Sjlemon	callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0);
580111145Sjlemon	callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0);
58150673Sjlemon
5826283Swollman	if (tcp_do_rfc1323)
5836283Swollman		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
5846283Swollman	if (tcp_do_rfc1644)
5856283Swollman		tp->t_flags |= TF_REQ_CC;
58634881Swollman	tp->t_inpcb = inp;	/* XXX */
5871541Srgrimes	/*
5881541Srgrimes	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
58916367Swollman	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
5901541Srgrimes	 * reasonable initial retransmit time.
5911541Srgrimes	 */
5921541Srgrimes	tp->t_srtt = TCPTV_SRTTBASE;
59316367Swollman	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
594100335Sdillon	tp->t_rttmin = tcp_rexmit_min;
59516367Swollman	tp->t_rxtcur = TCPTV_RTOBASE;
5961541Srgrimes	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
597102017Sdillon	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
5981541Srgrimes	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
59950673Sjlemon	tp->t_rcvtime = ticks;
600102017Sdillon	tp->t_bw_rtttime = ticks;
60156564Sshin        /*
60256564Sshin	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
60356564Sshin	 * because the socket may be bound to an IPv6 wildcard address,
60456564Sshin	 * which may match an IPv4-mapped IPv6 address.
60556564Sshin	 */
60624570Sdg	inp->inp_ip_ttl = ip_defttl;
6071541Srgrimes	inp->inp_ppcb = (caddr_t)tp;
60834881Swollman	return (tp);		/* XXX */
6091541Srgrimes}
6101541Srgrimes
6111541Srgrimes/*
6121541Srgrimes * Drop a TCP connection, reporting
6131541Srgrimes * the specified error.  If connection is synchronized,
6141541Srgrimes * then send a RST to peer.
6151541Srgrimes */
6161541Srgrimesstruct tcpcb *
6171541Srgrimestcp_drop(tp, errno)
6181541Srgrimes	register struct tcpcb *tp;
6191541Srgrimes	int errno;
6201541Srgrimes{
6211541Srgrimes	struct socket *so = tp->t_inpcb->inp_socket;
6221541Srgrimes
6231541Srgrimes	if (TCPS_HAVERCVDSYN(tp->t_state)) {
6241541Srgrimes		tp->t_state = TCPS_CLOSED;
6251541Srgrimes		(void) tcp_output(tp);
6261541Srgrimes		tcpstat.tcps_drops++;
6271541Srgrimes	} else
6281541Srgrimes		tcpstat.tcps_conndrops++;
6291541Srgrimes	if (errno == ETIMEDOUT && tp->t_softerror)
6301541Srgrimes		errno = tp->t_softerror;
6311541Srgrimes	so->so_error = errno;
6321541Srgrimes	return (tcp_close(tp));
6331541Srgrimes}
6341541Srgrimes
635111145Sjlemonstatic void
636111145Sjlemontcp_discardcb(tp)
637111145Sjlemon	struct tcpcb *tp;
6381541Srgrimes{
639111145Sjlemon	struct tseg_qent *q;
6401541Srgrimes	struct inpcb *inp = tp->t_inpcb;
6411541Srgrimes	struct socket *so = inp->inp_socket;
64255679Sshin#ifdef INET6
64355679Sshin	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
64455679Sshin#endif /* INET6 */
6451541Srgrimes
6461541Srgrimes	/*
64750673Sjlemon	 * Make sure that all of our timers are stopped before we
64850673Sjlemon	 * delete the PCB.
64950673Sjlemon	 */
65050673Sjlemon	callout_stop(tp->tt_rexmt);
65150673Sjlemon	callout_stop(tp->tt_persist);
65250673Sjlemon	callout_stop(tp->tt_keep);
65350673Sjlemon	callout_stop(tp->tt_2msl);
65450673Sjlemon	callout_stop(tp->tt_delack);
65550673Sjlemon
65650673Sjlemon	/*
6579373Swollman	 * If we got enough samples through the srtt filter,
6589373Swollman	 * save the rtt and rttvar in the routing entry.
659122922Sandre	 * 'Enough' is arbitrarily defined as 4 rtt samples.
660122922Sandre	 * 4 samples is enough for the srtt filter to converge
661122922Sandre	 * to within enough % of the correct value; fewer samples
662122922Sandre	 * and we could save a bogus rtt. The danger is not high
663122922Sandre	 * as tcp quickly recovers from everything.
664122922Sandre	 * XXX: Works very well but needs some more statistics!
6651541Srgrimes	 */
666122922Sandre	if (tp->t_rttupdated >= 4) {
667122922Sandre		struct hc_metrics_lite metrics;
668122922Sandre		u_long ssthresh;
6691541Srgrimes
670122922Sandre		bzero(&metrics, sizeof(metrics));
6711541Srgrimes		/*
672122922Sandre		 * Update the ssthresh always when the conditions below
673122922Sandre		 * are satisfied. This gives us better new start value
674122922Sandre		 * for the congestion avoidance for new connections.
675122922Sandre		 * ssthresh is only set if packet loss occured on a session.
6761541Srgrimes		 */
677122922Sandre		ssthresh = tp->snd_ssthresh;
678122922Sandre		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
6791541Srgrimes			/*
6801541Srgrimes			 * convert the limit from user data bytes to
6811541Srgrimes			 * packets then to packet data bytes.
6821541Srgrimes			 */
683122922Sandre			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
684122922Sandre			if (ssthresh < 2)
685122922Sandre				ssthresh = 2;
686122922Sandre			ssthresh *= (u_long)(tp->t_maxseg +
68755679Sshin#ifdef INET6
68855679Sshin				      (isipv6 ? sizeof (struct ip6_hdr) +
68955679Sshin					       sizeof (struct tcphdr) :
69055679Sshin#endif
69155679Sshin				       sizeof (struct tcpiphdr)
69255679Sshin#ifdef INET6
69355679Sshin				       )
69455679Sshin#endif
69555679Sshin				      );
696122922Sandre		} else
697122922Sandre			ssthresh = 0;
698122922Sandre		metrics.rmx_ssthresh = ssthresh;
699122922Sandre
700122922Sandre		metrics.rmx_rtt = tp->t_srtt;
701122922Sandre		metrics.rmx_rttvar = tp->t_rttvar;
702122922Sandre		/* XXX: This wraps if the pipe is more than 4 Gbit per second */
703122922Sandre		metrics.rmx_bandwidth = tp->snd_bandwidth;
704122922Sandre		metrics.rmx_cwnd = tp->snd_cwnd;
705122922Sandre		metrics.rmx_sendpipe = 0;
706122922Sandre		metrics.rmx_recvpipe = 0;
707122922Sandre
708122922Sandre		tcp_hc_update(&inp->inp_inc, &metrics);
7091541Srgrimes	}
710122922Sandre
7111541Srgrimes	/* free the reassembly queue, if any */
712111145Sjlemon	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
71355679Sshin		LIST_REMOVE(q, tqe_q);
71455679Sshin		m_freem(q->tqe_m);
715126193Sandre		uma_zfree(tcp_reass_zone, q);
716126193Sandre		tp->t_segqlen--;
717126193Sandre		tcp_reass_qsize--;
7181541Srgrimes	}
71932821Sdg	inp->inp_ppcb = NULL;
720108265Shsu	tp->t_inpcb = NULL;
721111145Sjlemon	uma_zfree(tcpcb_zone, tp);
7221541Srgrimes	soisdisconnected(so);
723111145Sjlemon}
724111145Sjlemon
725111145Sjlemon/*
726111145Sjlemon * Close a TCP control block:
727111145Sjlemon *    discard all space held by the tcp
728111145Sjlemon *    discard internet protocol block
729111145Sjlemon *    wake up any sleepers
730111145Sjlemon */
731111145Sjlemonstruct tcpcb *
732111145Sjlemontcp_close(tp)
733111145Sjlemon	struct tcpcb *tp;
734111145Sjlemon{
735111145Sjlemon	struct inpcb *inp = tp->t_inpcb;
736111153Sjlemon#ifdef INET6
737111145Sjlemon	struct socket *so = inp->inp_socket;
738111153Sjlemon#endif
739111145Sjlemon
740111145Sjlemon	tcp_discardcb(tp);
74155679Sshin#ifdef INET6
74255679Sshin	if (INP_CHECK_SOCKAF(so, AF_INET6))
74355679Sshin		in6_pcbdetach(inp);
74455679Sshin	else
745111145Sjlemon#endif
746111145Sjlemon		in_pcbdetach(inp);
7471541Srgrimes	tcpstat.tcps_closed++;
7481541Srgrimes	return ((struct tcpcb *)0);
7491541Srgrimes}
7501541Srgrimes
7511541Srgrimesvoid
7521541Srgrimestcp_drain()
7531541Srgrimes{
75455198Smsmith	if (do_tcpdrain)
75555198Smsmith	{
75655198Smsmith		struct inpcb *inpb;
75755198Smsmith		struct tcpcb *tcpb;
75855679Sshin		struct tseg_qent *te;
7591541Srgrimes
76055198Smsmith	/*
76155198Smsmith	 * Walk the tcpbs, if existing, and flush the reassembly queue,
76255198Smsmith	 * if there is one...
76355198Smsmith	 * XXX: The "Net/3" implementation doesn't imply that the TCP
76455198Smsmith	 *      reassembly queue should be flushed, but in a situation
76555198Smsmith	 * 	where we're really low on mbufs, this is potentially
76655198Smsmith	 *  	usefull.
76755198Smsmith	 */
76898102Shsu		INP_INFO_RLOCK(&tcbinfo);
76974362Sphk		LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
770111145Sjlemon			if (inpb->inp_vflag & INP_TIMEWAIT)
771111145Sjlemon				continue;
77298102Shsu			INP_LOCK(inpb);
77374362Sphk			if ((tcpb = intotcpcb(inpb))) {
77474362Sphk				while ((te = LIST_FIRST(&tcpb->t_segq))
77574362Sphk			            != NULL) {
77655679Sshin					LIST_REMOVE(te, tqe_q);
77755679Sshin					m_freem(te->tqe_m);
778126193Sandre					uma_zfree(tcp_reass_zone, te);
779126193Sandre					tcpb->t_segqlen--;
780126193Sandre					tcp_reass_qsize--;
78155198Smsmith				}
78255198Smsmith			}
78398102Shsu			INP_UNLOCK(inpb);
78455198Smsmith		}
78598102Shsu		INP_INFO_RUNLOCK(&tcbinfo);
78655198Smsmith	}
7871541Srgrimes}
7881541Srgrimes
7891541Srgrimes/*
7901541Srgrimes * Notify a tcp user of an asynchronous error;
7911541Srgrimes * store error as soft error, but wake up user
7921541Srgrimes * (for now, won't do anything until can select for soft error).
79372960Sjlemon *
79472960Sjlemon * Do not wake up user since there currently is no mechanism for
79572960Sjlemon * reporting soft errors (yet - a kqueue filter may be added).
7961541Srgrimes */
79798211Shsustatic struct inpcb *
7981541Srgrimestcp_notify(inp, error)
7991541Srgrimes	struct inpcb *inp;
8001541Srgrimes	int error;
8011541Srgrimes{
80272960Sjlemon	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
8031541Srgrimes
8041541Srgrimes	/*
8051541Srgrimes	 * Ignore some errors if we are hooked up.
8061541Srgrimes	 * If connection hasn't completed, has retransmitted several times,
8071541Srgrimes	 * and receives a second error, give up now.  This is better
8081541Srgrimes	 * than waiting a long time to establish a connection that
8091541Srgrimes	 * can never complete.
8101541Srgrimes	 */
8111541Srgrimes	if (tp->t_state == TCPS_ESTABLISHED &&
812110896Shsu	    (error == EHOSTUNREACH || error == ENETUNREACH ||
813110896Shsu	     error == EHOSTDOWN)) {
81498211Shsu		return inp;
8151541Srgrimes	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
81698211Shsu	    tp->t_softerror) {
81772960Sjlemon		tcp_drop(tp, error);
81898211Shsu		return (struct inpcb *)0;
81998211Shsu	} else {
8201541Srgrimes		tp->t_softerror = error;
82198211Shsu		return inp;
82298211Shsu	}
82372960Sjlemon#if 0
824111748Sdes	wakeup( &so->so_timeo);
8251541Srgrimes	sorwakeup(so);
8261541Srgrimes	sowwakeup(so);
82772960Sjlemon#endif
8281541Srgrimes}
8291541Srgrimes
83036079Swollmanstatic int
83162573Sphktcp_pcblist(SYSCTL_HANDLER_ARGS)
83236079Swollman{
83336079Swollman	int error, i, n, s;
83436079Swollman	struct inpcb *inp, **inp_list;
83536079Swollman	inp_gen_t gencnt;
83636079Swollman	struct xinpgen xig;
83736079Swollman
83836079Swollman	/*
83936079Swollman	 * The process of preparing the TCB list is too time-consuming and
84036079Swollman	 * resource-intensive to repeat twice on every request.
84136079Swollman	 */
84236079Swollman	if (req->oldptr == 0) {
84336079Swollman		n = tcbinfo.ipi_count;
84436079Swollman		req->oldidx = 2 * (sizeof xig)
84536079Swollman			+ (n + n/8) * sizeof(struct xtcpcb);
84636079Swollman		return 0;
84736079Swollman	}
84836079Swollman
84936079Swollman	if (req->newptr != 0)
85036079Swollman		return EPERM;
85136079Swollman
85236079Swollman	/*
85336079Swollman	 * OK, now we're committed to doing something.
85436079Swollman	 */
85536079Swollman	s = splnet();
85698102Shsu	INP_INFO_RLOCK(&tcbinfo);
85736079Swollman	gencnt = tcbinfo.ipi_gencnt;
85836079Swollman	n = tcbinfo.ipi_count;
85998102Shsu	INP_INFO_RUNLOCK(&tcbinfo);
86036079Swollman	splx(s);
86136079Swollman
862126253Struckman	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
863100831Struckman		+ n * sizeof(struct xtcpcb));
864126253Struckman	if (error != 0)
865126253Struckman		return (error);
866100831Struckman
86736079Swollman	xig.xig_len = sizeof xig;
86836079Swollman	xig.xig_count = n;
86936079Swollman	xig.xig_gen = gencnt;
87036079Swollman	xig.xig_sogen = so_gencnt;
87136079Swollman	error = SYSCTL_OUT(req, &xig, sizeof xig);
87236079Swollman	if (error)
87336079Swollman		return error;
87436079Swollman
875111119Simp	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
87636079Swollman	if (inp_list == 0)
87736079Swollman		return ENOMEM;
87836079Swollman
87936079Swollman	s = splnet();
88098102Shsu	INP_INFO_RLOCK(&tcbinfo);
88171999Sphk	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
88271999Sphk	     inp = LIST_NEXT(inp, inp_list)) {
88398102Shsu		INP_LOCK(inp);
884113345Srwatson		if (inp->inp_gencnt <= gencnt) {
885113345Srwatson			/*
886113345Srwatson			 * XXX: This use of cr_cansee(), introduced with
887113345Srwatson			 * TCP state changes, is not quite right, but for
888113345Srwatson			 * now, better than nothing.
889113345Srwatson			 */
890113345Srwatson			if (inp->inp_vflag & INP_TIMEWAIT)
891113345Srwatson				error = cr_cansee(req->td->td_ucred,
892113345Srwatson				    intotw(inp)->tw_cred);
893113345Srwatson			else
894113345Srwatson				error = cr_canseesocket(req->td->td_ucred,
895113345Srwatson				    inp->inp_socket);
896113345Srwatson			if (error == 0)
897113345Srwatson				inp_list[i++] = inp;
898113345Srwatson		}
89998102Shsu		INP_UNLOCK(inp);
90036079Swollman	}
90198102Shsu	INP_INFO_RUNLOCK(&tcbinfo);
90236079Swollman	splx(s);
90336079Swollman	n = i;
90436079Swollman
90536079Swollman	error = 0;
90636079Swollman	for (i = 0; i < n; i++) {
90736079Swollman		inp = inp_list[i];
90836079Swollman		if (inp->inp_gencnt <= gencnt) {
90936079Swollman			struct xtcpcb xt;
91047960Stegge			caddr_t inp_ppcb;
91136079Swollman			xt.xt_len = sizeof xt;
91236079Swollman			/* XXX should avoid extra copy */
91336079Swollman			bcopy(inp, &xt.xt_inp, sizeof *inp);
91447960Stegge			inp_ppcb = inp->inp_ppcb;
915111145Sjlemon			if (inp_ppcb == NULL)
916111145Sjlemon				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
917111145Sjlemon			else if (inp->inp_vflag & INP_TIMEWAIT) {
918111145Sjlemon				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
919111145Sjlemon				xt.xt_tp.t_state = TCPS_TIME_WAIT;
920111145Sjlemon			} else
92147960Stegge				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
92236079Swollman			if (inp->inp_socket)
92336079Swollman				sotoxsocket(inp->inp_socket, &xt.xt_socket);
924111145Sjlemon			else {
925111145Sjlemon				bzero(&xt.xt_socket, sizeof xt.xt_socket);
926111145Sjlemon				xt.xt_socket.xso_protocol = IPPROTO_TCP;
927111145Sjlemon			}
928110896Shsu			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
92936079Swollman			error = SYSCTL_OUT(req, &xt, sizeof xt);
93036079Swollman		}
93136079Swollman	}
93236079Swollman	if (!error) {
93336079Swollman		/*
93436079Swollman		 * Give the user an updated idea of our state.
93536079Swollman		 * If the generation differs from what we told
93636079Swollman		 * her before, she knows that something happened
93736079Swollman		 * while we were processing this request, and it
93836079Swollman		 * might be necessary to retry.
93936079Swollman		 */
94036079Swollman		s = splnet();
94198102Shsu		INP_INFO_RLOCK(&tcbinfo);
94236079Swollman		xig.xig_gen = tcbinfo.ipi_gencnt;
94336079Swollman		xig.xig_sogen = so_gencnt;
94436079Swollman		xig.xig_count = tcbinfo.ipi_count;
94598102Shsu		INP_INFO_RUNLOCK(&tcbinfo);
94636079Swollman		splx(s);
94736079Swollman		error = SYSCTL_OUT(req, &xig, sizeof xig);
94836079Swollman	}
94936079Swollman	free(inp_list, M_TEMP);
95036079Swollman	return error;
95136079Swollman}
95236079Swollman
95336079SwollmanSYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
95436079Swollman	    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
95536079Swollman
95648758Sgreenstatic int
95762573Sphktcp_getcred(SYSCTL_HANDLER_ARGS)
95848758Sgreen{
95972650Sgreen	struct xucred xuc;
96048758Sgreen	struct sockaddr_in addrs[2];
96148758Sgreen	struct inpcb *inp;
96248758Sgreen	int error, s;
96348758Sgreen
96493593Sjhb	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
96548758Sgreen	if (error)
96648758Sgreen		return (error);
96748758Sgreen	error = SYSCTL_IN(req, addrs, sizeof(addrs));
96848758Sgreen	if (error)
96948758Sgreen		return (error);
97048758Sgreen	s = splnet();
97198102Shsu	INP_INFO_RLOCK(&tcbinfo);
97248758Sgreen	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
97354263Sshin	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
97498102Shsu	if (inp == NULL) {
97548758Sgreen		error = ENOENT;
97698102Shsu		goto outunlocked;
97748758Sgreen	}
97899837Struckman	INP_LOCK(inp);
97999837Struckman	if (inp->inp_socket == NULL) {
98099837Struckman		error = ENOENT;
98199837Struckman		goto out;
98299837Struckman	}
98392976Srwatson	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
98478697Sdwmalone	if (error)
98578697Sdwmalone		goto out;
98691354Sdd	cru2x(inp->inp_socket->so_cred, &xuc);
98748758Sgreenout:
98898102Shsu	INP_UNLOCK(inp);
98998102Shsuoutunlocked:
99098102Shsu	INP_INFO_RUNLOCK(&tcbinfo);
99148758Sgreen	splx(s);
99299838Struckman	if (error == 0)
99399838Struckman		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
99448758Sgreen	return (error);
99548758Sgreen}
99648758Sgreen
99778697SdwmaloneSYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
99878697Sdwmalone    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
99978697Sdwmalone    tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
100048758Sgreen
100155679Sshin#ifdef INET6
100255679Sshinstatic int
100362573Sphktcp6_getcred(SYSCTL_HANDLER_ARGS)
100455679Sshin{
100572650Sgreen	struct xucred xuc;
100655679Sshin	struct sockaddr_in6 addrs[2];
100755679Sshin	struct inpcb *inp;
100855679Sshin	int error, s, mapped = 0;
100955679Sshin
101093593Sjhb	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
101155679Sshin	if (error)
101255679Sshin		return (error);
101355679Sshin	error = SYSCTL_IN(req, addrs, sizeof(addrs));
101455679Sshin	if (error)
101555679Sshin		return (error);
101655679Sshin	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
101755679Sshin		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
101855679Sshin			mapped = 1;
101955679Sshin		else
102055679Sshin			return (EINVAL);
102155679Sshin	}
102255679Sshin	s = splnet();
102398102Shsu	INP_INFO_RLOCK(&tcbinfo);
102455679Sshin	if (mapped == 1)
102555679Sshin		inp = in_pcblookup_hash(&tcbinfo,
102655679Sshin			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
102755679Sshin			addrs[1].sin6_port,
102855679Sshin			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
102955679Sshin			addrs[0].sin6_port,
103055679Sshin			0, NULL);
103155679Sshin	else
103255679Sshin		inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
103355679Sshin				 addrs[1].sin6_port,
103455679Sshin				 &addrs[0].sin6_addr, addrs[0].sin6_port,
103555679Sshin				 0, NULL);
103698102Shsu	if (inp == NULL) {
103755679Sshin		error = ENOENT;
103898102Shsu		goto outunlocked;
103955679Sshin	}
104099837Struckman	INP_LOCK(inp);
104199837Struckman	if (inp->inp_socket == NULL) {
104299837Struckman		error = ENOENT;
104399837Struckman		goto out;
104499837Struckman	}
104592976Srwatson	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
104678697Sdwmalone	if (error)
104778697Sdwmalone		goto out;
104891354Sdd	cru2x(inp->inp_socket->so_cred, &xuc);
104955679Sshinout:
105098102Shsu	INP_UNLOCK(inp);
105198102Shsuoutunlocked:
105298102Shsu	INP_INFO_RUNLOCK(&tcbinfo);
105355679Sshin	splx(s);
105499838Struckman	if (error == 0)
105599838Struckman		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
105655679Sshin	return (error);
105755679Sshin}
105855679Sshin
105978697SdwmaloneSYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
106078697Sdwmalone    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
106178697Sdwmalone    tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
106255679Sshin#endif
106355679Sshin
106455679Sshin
10651541Srgrimesvoid
106612881Sbdetcp_ctlinput(cmd, sa, vip)
10671541Srgrimes	int cmd;
10681541Srgrimes	struct sockaddr *sa;
106912881Sbde	void *vip;
10701541Srgrimes{
107172959Sjlemon	struct ip *ip = vip;
107272959Sjlemon	struct tcphdr *th;
107373109Sjlemon	struct in_addr faddr;
107473109Sjlemon	struct inpcb *inp;
107573109Sjlemon	struct tcpcb *tp;
107698211Shsu	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
107773109Sjlemon	tcp_seq icmp_seq;
107873109Sjlemon	int s;
10791541Srgrimes
108073109Sjlemon	faddr = ((struct sockaddr_in *)sa)->sin_addr;
108173109Sjlemon	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
108273109Sjlemon		return;
108373109Sjlemon
10841541Srgrimes	if (cmd == PRC_QUENCH)
10851541Srgrimes		notify = tcp_quench;
108674937Sjesper	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
108799156Sjesper		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
108872959Sjlemon		notify = tcp_drop_syn_sent;
108973109Sjlemon	else if (cmd == PRC_MSGSIZE)
109010881Swollman		notify = tcp_mtudisc;
1091122922Sandre	/*
1092122922Sandre	 * Redirects don't need to be handled up here.
1093122922Sandre	 */
1094122922Sandre	else if (PRC_IS_REDIRECT(cmd))
1095122922Sandre		return;
1096122922Sandre	/*
1097122922Sandre	 * Hostdead is ugly because it goes linearly through all PCBs.
1098122922Sandre	 * XXX: We never get this from ICMP, otherwise it makes an
1099122922Sandre	 * excellent DoS attack on machines with many connections.
1100122922Sandre	 */
1101122922Sandre	else if (cmd == PRC_HOSTDEAD)
110272922Sjesper		ip = 0;
1103119995Sru	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
11041541Srgrimes		return;
11051541Srgrimes	if (ip) {
110673109Sjlemon		s = splnet();
110717269Swollman		th = (struct tcphdr *)((caddr_t)ip
1108105586Sphk				       + (ip->ip_hl << 2));
110998596Shsu		INP_INFO_WLOCK(&tcbinfo);
111073109Sjlemon		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
111173109Sjlemon		    ip->ip_src, th->th_sport, 0, NULL);
111298102Shsu		if (inp != NULL)  {
111398102Shsu			INP_LOCK(inp);
111498102Shsu			if (inp->inp_socket != NULL) {
111598102Shsu				icmp_seq = htonl(th->th_seq);
111698102Shsu				tp = intotcpcb(inp);
111798102Shsu				if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
111898102Shsu			    		SEQ_LT(icmp_seq, tp->snd_max))
111998211Shsu					inp = (*notify)(inp, inetctlerrmap[cmd]);
112098102Shsu			}
112198211Shsu			if (inp)
112298211Shsu				INP_UNLOCK(inp);
112386764Sjlemon		} else {
112486764Sjlemon			struct in_conninfo inc;
112586764Sjlemon
112686764Sjlemon			inc.inc_fport = th->th_dport;
112786764Sjlemon			inc.inc_lport = th->th_sport;
112886764Sjlemon			inc.inc_faddr = faddr;
112986764Sjlemon			inc.inc_laddr = ip->ip_src;
113086764Sjlemon#ifdef INET6
113186764Sjlemon			inc.inc_isipv6 = 0;
113286764Sjlemon#endif
113386764Sjlemon			syncache_unreach(&inc, th);
113473109Sjlemon		}
113598596Shsu		INP_INFO_WUNLOCK(&tcbinfo);
113673109Sjlemon		splx(s);
11371541Srgrimes	} else
113898102Shsu		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
11391541Srgrimes}
11401541Srgrimes
114155679Sshin#ifdef INET6
114255679Sshinvoid
114355679Sshintcp6_ctlinput(cmd, sa, d)
114455679Sshin	int cmd;
114555679Sshin	struct sockaddr *sa;
114655679Sshin	void *d;
114755679Sshin{
114855679Sshin	struct tcphdr th;
114998211Shsu	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
115055679Sshin	struct ip6_hdr *ip6;
115155679Sshin	struct mbuf *m;
115278064Sume	struct ip6ctlparam *ip6cp = NULL;
115378064Sume	const struct sockaddr_in6 *sa6_src = NULL;
115455679Sshin	int off;
115578064Sume	struct tcp_portonly {
115678064Sume		u_int16_t th_sport;
115778064Sume		u_int16_t th_dport;
115878064Sume	} *thp;
115955679Sshin
116055679Sshin	if (sa->sa_family != AF_INET6 ||
116155679Sshin	    sa->sa_len != sizeof(struct sockaddr_in6))
116255679Sshin		return;
116355679Sshin
116455679Sshin	if (cmd == PRC_QUENCH)
116555679Sshin		notify = tcp_quench;
116655679Sshin	else if (cmd == PRC_MSGSIZE)
116755679Sshin		notify = tcp_mtudisc;
116855679Sshin	else if (!PRC_IS_REDIRECT(cmd) &&
1169119995Sru		 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
117055679Sshin		return;
117155679Sshin
117255679Sshin	/* if the parameter is from icmp6, decode it. */
117355679Sshin	if (d != NULL) {
117478064Sume		ip6cp = (struct ip6ctlparam *)d;
117555679Sshin		m = ip6cp->ip6c_m;
117655679Sshin		ip6 = ip6cp->ip6c_ip6;
117755679Sshin		off = ip6cp->ip6c_off;
117878064Sume		sa6_src = ip6cp->ip6c_src;
117955679Sshin	} else {
118055679Sshin		m = NULL;
118155679Sshin		ip6 = NULL;
118267456Sitojun		off = 0;	/* fool gcc */
118378064Sume		sa6_src = &sa6_any;
118455679Sshin	}
118555679Sshin
118655679Sshin	if (ip6) {
118786764Sjlemon		struct in_conninfo inc;
118855679Sshin		/*
118955679Sshin		 * XXX: We assume that when IPV6 is non NULL,
119055679Sshin		 * M and OFF are valid.
119155679Sshin		 */
119255679Sshin
119367456Sitojun		/* check if we can safely examine src and dst ports */
119478064Sume		if (m->m_pkthdr.len < off + sizeof(*thp))
119567456Sitojun			return;
119667456Sitojun
119778064Sume		bzero(&th, sizeof(th));
119878064Sume		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
119978064Sume
120078064Sume		in6_pcbnotify(&tcb, sa, th.th_dport,
120178064Sume		    (struct sockaddr *)ip6cp->ip6c_src,
1202125776Sume		    th.th_sport, cmd, NULL, notify);
120386764Sjlemon
120486764Sjlemon		inc.inc_fport = th.th_dport;
120586764Sjlemon		inc.inc_lport = th.th_sport;
120686764Sjlemon		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
120786764Sjlemon		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
120886764Sjlemon		inc.inc_isipv6 = 1;
120986764Sjlemon		syncache_unreach(&inc, &th);
121055679Sshin	} else
121191357Salfred		in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
1212125776Sume			      0, cmd, NULL, notify);
121355679Sshin}
121455679Sshin#endif /* INET6 */
121555679Sshin
121680428Speter
121782122Ssilby/*
121882122Ssilby * Following is where TCP initial sequence number generation occurs.
121982122Ssilby *
122082122Ssilby * There are two places where we must use initial sequence numbers:
122182122Ssilby * 1.  In SYN-ACK packets.
122282122Ssilby * 2.  In SYN packets.
122382122Ssilby *
122494390Ssilby * All ISNs for SYN-ACK packets are generated by the syncache.  See
122594390Ssilby * tcp_syncache.c for details.
122682122Ssilby *
122782122Ssilby * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
122882122Ssilby * depends on this property.  In addition, these ISNs should be
122982122Ssilby * unguessable so as to prevent connection hijacking.  To satisfy
123082122Ssilby * the requirements of this situation, the algorithm outlined in
123182122Ssilby * RFC 1948 is used to generate sequence numbers.
123282122Ssilby *
123382122Ssilby * Implementation details:
123482122Ssilby *
123582122Ssilby * Time is based off the system timer, and is corrected so that it
123682122Ssilby * increases by one megabyte per second.  This allows for proper
123782122Ssilby * recycling on high speed LANs while still leaving over an hour
123882122Ssilby * before rollover.
123982122Ssilby *
124082122Ssilby * net.inet.tcp.isn_reseed_interval controls the number of seconds
124182122Ssilby * between seeding of isn_secret.  This is normally set to zero,
124282122Ssilby * as reseeding should not be necessary.
124382122Ssilby *
124482122Ssilby */
124579413Ssilby
124682122Ssilby#define ISN_BYTES_PER_SECOND 1048576
124779413Ssilby
124882122Ssilbyu_char isn_secret[32];
124982122Ssilbyint isn_last_reseed;
125082122SsilbyMD5_CTX isn_ctx;
125175619Skris
125275619Skristcp_seq
125382122Ssilbytcp_new_isn(tp)
125482122Ssilby	struct tcpcb *tp;
125575619Skris{
125682122Ssilby	u_int32_t md5_buffer[4];
125782122Ssilby	tcp_seq new_isn;
125875619Skris
125982122Ssilby	/* Seed if this is the first use, reseed if requested. */
126094390Ssilby	if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
126182122Ssilby	     (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
126282122Ssilby		< (u_int)ticks))) {
126382122Ssilby		read_random(&isn_secret, sizeof(isn_secret));
126482122Ssilby		isn_last_reseed = ticks;
126582122Ssilby	}
126682122Ssilby
126782122Ssilby	/* Compute the md5 hash and return the ISN. */
126882122Ssilby	MD5Init(&isn_ctx);
126982122Ssilby	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
127082122Ssilby	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
127182122Ssilby#ifdef INET6
127282122Ssilby	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
127382122Ssilby		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
127482122Ssilby			  sizeof(struct in6_addr));
127582122Ssilby		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
127682122Ssilby			  sizeof(struct in6_addr));
127782122Ssilby	} else
127882122Ssilby#endif
127982122Ssilby	{
128082122Ssilby		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
128182122Ssilby			  sizeof(struct in_addr));
128282122Ssilby		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
128382122Ssilby			  sizeof(struct in_addr));
128482122Ssilby	}
128582122Ssilby	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
128682122Ssilby	MD5Final((u_char *) &md5_buffer, &isn_ctx);
128782122Ssilby	new_isn = (tcp_seq) md5_buffer[0];
128882122Ssilby	new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
128982122Ssilby	return new_isn;
129075619Skris}
129175619Skris
12921541Srgrimes/*
12931541Srgrimes * When a source quench is received, close congestion window
12941541Srgrimes * to one segment.  We will gradually open it again as we proceed.
12951541Srgrimes */
129698211Shsustruct inpcb *
12971541Srgrimestcp_quench(inp, errno)
12981541Srgrimes	struct inpcb *inp;
12991541Srgrimes	int errno;
13001541Srgrimes{
13011541Srgrimes	struct tcpcb *tp = intotcpcb(inp);
13021541Srgrimes
13031541Srgrimes	if (tp)
13041541Srgrimes		tp->snd_cwnd = tp->t_maxseg;
130598211Shsu	return (inp);
13061541Srgrimes}
13076283Swollman
13086283Swollman/*
130972959Sjlemon * When a specific ICMP unreachable message is received and the
131072959Sjlemon * connection state is SYN-SENT, drop the connection.  This behavior
131172959Sjlemon * is controlled by the icmp_may_rst sysctl.
131270103Sphk */
131398211Shsustruct inpcb *
131470103Sphktcp_drop_syn_sent(inp, errno)
131570103Sphk	struct inpcb *inp;
131670103Sphk	int errno;
131770103Sphk{
131870103Sphk	struct tcpcb *tp = intotcpcb(inp);
131970103Sphk
132098211Shsu	if (tp && tp->t_state == TCPS_SYN_SENT) {
132172638Sphk		tcp_drop(tp, errno);
132298211Shsu		return (struct inpcb *)0;
132398211Shsu	}
132498211Shsu	return inp;
132572638Sphk}
132672638Sphk
132772638Sphk/*
132810881Swollman * When `need fragmentation' ICMP is received, update our idea of the MSS
132910881Swollman * based on the new value in the route.  Also nudge TCP to send something,
133010881Swollman * since we know the packet we just sent was dropped.
133110930Swollman * This duplicates some code in the tcp_mss() function in tcp_input.c.
133210881Swollman */
133398211Shsustruct inpcb *
133410881Swollmantcp_mtudisc(inp, errno)
133510881Swollman	struct inpcb *inp;
133610881Swollman	int errno;
133710881Swollman{
133810881Swollman	struct tcpcb *tp = intotcpcb(inp);
1339122922Sandre	struct rmxp_tao tao;
134010930Swollman	struct socket *so = inp->inp_socket;
1341122922Sandre	u_int maxmtu;
1342122922Sandre	u_int romtu;
134310930Swollman	int mss;
134455679Sshin#ifdef INET6
134555679Sshin	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
134655679Sshin#endif /* INET6 */
1347122922Sandre	bzero(&tao, sizeof(tao));
134810881Swollman
134910930Swollman	if (tp) {
1350122922Sandre		maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */
1351122922Sandre		romtu =
135255679Sshin#ifdef INET6
1353122922Sandre		    isipv6 ? tcp_maxmtu6(&inp->inp_inc) :
1354122922Sandre#endif /* INET6 */
1355122922Sandre		    tcp_maxmtu(&inp->inp_inc);
1356122922Sandre		if (!maxmtu)
1357122922Sandre			maxmtu = romtu;
135855679Sshin		else
1359122922Sandre			maxmtu = min(maxmtu, romtu);
1360122922Sandre		if (!maxmtu) {
136155679Sshin			tp->t_maxopd = tp->t_maxseg =
136255679Sshin#ifdef INET6
136355679Sshin				isipv6 ? tcp_v6mssdflt :
136455679Sshin#endif /* INET6 */
136555679Sshin				tcp_mssdflt;
136698211Shsu			return inp;
136710930Swollman		}
1368122922Sandre		mss = maxmtu -
136955679Sshin#ifdef INET6
137055679Sshin			(isipv6 ?
137155679Sshin			 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
137255679Sshin#endif /* INET6 */
137355679Sshin			 sizeof(struct tcpiphdr)
137455679Sshin#ifdef INET6
137555679Sshin			 )
137655679Sshin#endif /* INET6 */
137755679Sshin			;
137855679Sshin
1379122922Sandre		if (tcp_do_rfc1644) {
1380122922Sandre			tcp_hc_gettao(&inp->inp_inc, &tao);
1381122922Sandre			if (tao.tao_mssopt)
1382122922Sandre				mss = min(mss, tao.tao_mssopt);
1383122922Sandre		}
138412939Swollman		/*
138512939Swollman		 * XXX - The above conditional probably violates the TCP
138612939Swollman		 * spec.  The problem is that, since we don't know the
138712939Swollman		 * other end's MSS, we are supposed to use a conservative
138812939Swollman		 * default.  But, if we do that, then MTU discovery will
138912939Swollman		 * never actually take place, because the conservative
139012939Swollman		 * default is much less than the MTUs typically seen
139112939Swollman		 * on the Internet today.  For the moment, we'll sweep
139212939Swollman		 * this under the carpet.
139312939Swollman		 *
139412939Swollman		 * The conservative default might not actually be a problem
139512939Swollman		 * if the only case this occurs is when sending an initial
139612939Swollman		 * SYN with options and data to a host we've never talked
139712939Swollman		 * to before.  Then, they will reply with an MSS value which
139812939Swollman		 * will get recorded and the new parameters should get
139912939Swollman		 * recomputed.  For Further Study.
140012939Swollman		 */
140111415Swollman		if (tp->t_maxopd <= mss)
140298211Shsu			return inp;
140310930Swollman		tp->t_maxopd = mss;
140410930Swollman
140510930Swollman		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
140610930Swollman		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
140710930Swollman			mss -= TCPOLEN_TSTAMP_APPA;
140810930Swollman		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
140910930Swollman		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
141010930Swollman			mss -= TCPOLEN_CC_APPA;
141110930Swollman#if	(MCLBYTES & (MCLBYTES - 1)) == 0
141210930Swollman		if (mss > MCLBYTES)
141310930Swollman			mss &= ~(MCLBYTES-1);
141410930Swollman#else
141510930Swollman		if (mss > MCLBYTES)
141610930Swollman			mss = mss / MCLBYTES * MCLBYTES;
141710881Swollman#endif
141810930Swollman		if (so->so_snd.sb_hiwat < mss)
141910930Swollman			mss = so->so_snd.sb_hiwat;
142010930Swollman
142110930Swollman		tp->t_maxseg = mss;
142210930Swollman
142311450Swollman		tcpstat.tcps_mturesent++;
142450673Sjlemon		tp->t_rtttime = 0;
142511450Swollman		tp->snd_nxt = tp->snd_una;
142611450Swollman		tcp_output(tp);
142710930Swollman	}
142898211Shsu	return inp;
142910881Swollman}
143010881Swollman
143110881Swollman/*
14326283Swollman * Look-up the routing entry to the peer of this inpcb.  If no route
1433108265Shsu * is found and it cannot be allocated, then return NULL.  This routine
14346283Swollman * is called by TCP routines that access the rmx structure and by tcp_mss
14356283Swollman * to get the interface MTU.
14366283Swollman */
1437122922Sandreu_long
1438122922Sandretcp_maxmtu(inc)
143986764Sjlemon	struct in_conninfo *inc;
14406283Swollman{
1441122922Sandre	struct route sro;
1442122922Sandre	struct sockaddr_in *dst;
1443122922Sandre	struct ifnet *ifp;
1444122922Sandre	u_long maxmtu = 0;
14456283Swollman
1446122922Sandre	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
1447122922Sandre
1448122996Sandre	bzero(&sro, sizeof(sro));
1449122922Sandre	if (inc->inc_faddr.s_addr != INADDR_ANY) {
1450122922Sandre	        dst = (struct sockaddr_in *)&sro.ro_dst;
1451122922Sandre		dst->sin_family = AF_INET;
1452122922Sandre		dst->sin_len = sizeof(*dst);
1453122922Sandre		dst->sin_addr = inc->inc_faddr;
1454122922Sandre		rtalloc_ign(&sro, RTF_CLONING);
14556283Swollman	}
1456122922Sandre	if (sro.ro_rt != NULL) {
1457122922Sandre		ifp = sro.ro_rt->rt_ifp;
1458122922Sandre		if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
1459122922Sandre			maxmtu = ifp->if_mtu;
1460122922Sandre		else
1461122922Sandre			maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
1462122922Sandre		RTFREE(sro.ro_rt);
1463122922Sandre	}
1464122922Sandre	return (maxmtu);
14656283Swollman}
14666283Swollman
146755679Sshin#ifdef INET6
1468122922Sandreu_long
1469122922Sandretcp_maxmtu6(inc)
147086764Sjlemon	struct in_conninfo *inc;
147155679Sshin{
1472122922Sandre	struct route_in6 sro6;
1473122922Sandre	struct ifnet *ifp;
1474122922Sandre	u_long maxmtu = 0;
147555679Sshin
1476122922Sandre	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
1477122922Sandre
1478122996Sandre	bzero(&sro6, sizeof(sro6));
1479122922Sandre	if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
1480122922Sandre		sro6.ro_dst.sin6_family = AF_INET6;
1481122922Sandre		sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
1482122922Sandre		sro6.ro_dst.sin6_addr = inc->inc6_faddr;
1483122922Sandre		rtalloc_ign((struct route *)&sro6, RTF_CLONING);
148455679Sshin	}
1485122922Sandre	if (sro6.ro_rt != NULL) {
1486122922Sandre		ifp = sro6.ro_rt->rt_ifp;
1487122922Sandre		if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
1488122922Sandre			maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
1489122922Sandre		else
1490122922Sandre			maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
1491122922Sandre				     IN6_LINKMTU(sro6.ro_rt->rt_ifp));
1492122922Sandre		RTFREE(sro6.ro_rt);
1493122922Sandre	}
1494122922Sandre
1495122922Sandre	return (maxmtu);
149655679Sshin}
149755679Sshin#endif /* INET6 */
149855679Sshin
149955679Sshin#ifdef IPSEC
150055679Sshin/* compute ESP/AH header size for TCP, including outer IP header. */
150155679Sshinsize_t
150255679Sshinipsec_hdrsiz_tcp(tp)
150355679Sshin	struct tcpcb *tp;
150455679Sshin{
150555679Sshin	struct inpcb *inp;
150655679Sshin	struct mbuf *m;
150755679Sshin	size_t hdrsiz;
150855679Sshin	struct ip *ip;
150955679Sshin#ifdef INET6
151055679Sshin	struct ip6_hdr *ip6;
1511111145Sjlemon#endif
151255679Sshin	struct tcphdr *th;
151355679Sshin
151478642Ssilby	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
151555679Sshin		return 0;
1516111119Simp	MGETHDR(m, M_DONTWAIT, MT_DATA);
151755679Sshin	if (!m)
151855679Sshin		return 0;
151955679Sshin
152055679Sshin#ifdef INET6
152155679Sshin	if ((inp->inp_vflag & INP_IPV6) != 0) {
152255679Sshin		ip6 = mtod(m, struct ip6_hdr *);
152355679Sshin		th = (struct tcphdr *)(ip6 + 1);
152455679Sshin		m->m_pkthdr.len = m->m_len =
152555679Sshin			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1526111144Sjlemon		tcpip_fillheaders(inp, ip6, th);
152755679Sshin		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
152855679Sshin	} else
152955679Sshin#endif /* INET6 */
153055679Sshin      {
153155679Sshin	ip = mtod(m, struct ip *);
153255679Sshin	th = (struct tcphdr *)(ip + 1);
153355679Sshin	m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
1534111144Sjlemon	tcpip_fillheaders(inp, ip, th);
153555679Sshin	hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
153655679Sshin      }
153755679Sshin
153855679Sshin	m_free(m);
153955679Sshin	return hdrsiz;
154055679Sshin}
154155679Sshin#endif /*IPSEC*/
154255679Sshin
15436283Swollman/*
1544111145Sjlemon * Move a TCP connection into TIME_WAIT state.
1545111145Sjlemon *    tcbinfo is unlocked.
1546111145Sjlemon *    inp is locked, and is unlocked before returning.
1547111145Sjlemon */
1548111145Sjlemonvoid
1549111145Sjlemontcp_twstart(tp)
1550111145Sjlemon	struct tcpcb *tp;
1551111145Sjlemon{
1552111145Sjlemon	struct tcptw *tw;
1553111145Sjlemon	struct inpcb *inp;
1554111145Sjlemon	int tw_time, acknow;
1555111145Sjlemon	struct socket *so;
1556111145Sjlemon
1557112009Sjlemon	tw = uma_zalloc(tcptw_zone, M_NOWAIT);
1558112009Sjlemon	if (tw == NULL) {
1559112009Sjlemon		tw = tcp_timer_2msl_tw(1);
1560112009Sjlemon		if (tw == NULL) {
1561112009Sjlemon			tcp_close(tp);
1562112009Sjlemon			return;
1563112009Sjlemon		}
1564112009Sjlemon	}
1565111145Sjlemon	inp = tp->t_inpcb;
1566111145Sjlemon	tw->tw_inpcb = inp;
1567111145Sjlemon
1568111145Sjlemon	/*
1569111145Sjlemon	 * Recover last window size sent.
1570111145Sjlemon	 */
1571111145Sjlemon	tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;
1572111145Sjlemon
1573111145Sjlemon	/*
1574111145Sjlemon	 * Set t_recent if timestamps are used on the connection.
1575111145Sjlemon	 */
1576111145Sjlemon        if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
1577111145Sjlemon            (TF_REQ_TSTMP|TF_RCVD_TSTMP))
1578111145Sjlemon		tw->t_recent = tp->ts_recent;
1579111145Sjlemon	else
1580111145Sjlemon		tw->t_recent = 0;
1581111145Sjlemon
1582111145Sjlemon	tw->snd_nxt = tp->snd_nxt;
1583111145Sjlemon	tw->rcv_nxt = tp->rcv_nxt;
1584121850Ssilby	tw->iss     = tp->iss;
1585121884Ssilby	tw->irs     = tp->irs;
1586111145Sjlemon	tw->cc_recv = tp->cc_recv;
1587111145Sjlemon	tw->cc_send = tp->cc_send;
1588111145Sjlemon	tw->t_starttime = tp->t_starttime;
1589112009Sjlemon	tw->tw_time = 0;
1590111145Sjlemon
1591111145Sjlemon/* XXX
1592111145Sjlemon * If this code will
1593111145Sjlemon * be used for fin-wait-2 state also, then we may need
1594111145Sjlemon * a ts_recent from the last segment.
1595111145Sjlemon */
1596111145Sjlemon	/* Shorten TIME_WAIT [RFC-1644, p.28] */
1597111145Sjlemon	if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) {
1598111145Sjlemon		tw_time = tp->t_rxtcur * TCPTV_TWTRUNC;
1599111145Sjlemon		/* For T/TCP client, force ACK now. */
1600111145Sjlemon		acknow = 1;
1601111145Sjlemon	} else {
1602111145Sjlemon		tw_time = 2 * tcp_msl;
1603111145Sjlemon		acknow = tp->t_flags & TF_ACKNOW;
1604111145Sjlemon	}
1605111145Sjlemon	tcp_discardcb(tp);
1606111145Sjlemon	so = inp->inp_socket;
1607111145Sjlemon	so->so_pcb = NULL;
1608111145Sjlemon	tw->tw_cred = crhold(so->so_cred);
1609111145Sjlemon	tw->tw_so_options = so->so_options;
1610114794Srwatson	if (acknow)
1611126351Srwatson		tcp_twrespond(tw, TH_ACK);
1612111145Sjlemon	sotryfree(so);
1613111145Sjlemon	inp->inp_socket = NULL;
1614111145Sjlemon	inp->inp_ppcb = (caddr_t)tw;
1615111145Sjlemon	inp->inp_vflag |= INP_TIMEWAIT;
1616112009Sjlemon	tcp_timer_2msl_reset(tw, tw_time);
1617111145Sjlemon	INP_UNLOCK(inp);
1618111145Sjlemon}
1619111145Sjlemon
1620121850Ssilby/*
1621121884Ssilby * The appromixate rate of ISN increase of Microsoft TCP stacks;
1622121884Ssilby * the actual rate is slightly higher due to the addition of
1623121884Ssilby * random positive increments.
1624121884Ssilby *
1625121884Ssilby * Most other new OSes use semi-randomized ISN values, so we
1626121884Ssilby * do not need to worry about them.
1627121884Ssilby */
1628121884Ssilby#define MS_ISN_BYTES_PER_SECOND		250000
1629121884Ssilby
1630121884Ssilby/*
1631121850Ssilby * Determine if the ISN we will generate has advanced beyond the last
1632121850Ssilby * sequence number used by the previous connection.  If so, indicate
1633121850Ssilby * that it is safe to recycle this tw socket by returning 1.
1634121850Ssilby */
1635121850Ssilbyint
1636121850Ssilbytcp_twrecycleable(struct tcptw *tw)
1637121850Ssilby{
1638121884Ssilby	tcp_seq new_iss = tw->iss;
1639121884Ssilby	tcp_seq new_irs = tw->irs;
1640121850Ssilby
1641121884Ssilby	new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz);
1642121884Ssilby	new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz);
1643121850Ssilby
1644121884Ssilby	if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt))
1645121850Ssilby		return 1;
1646121850Ssilby	else
1647121850Ssilby		return 0;
1648121850Ssilby}
1649121850Ssilby
1650112009Sjlemonstruct tcptw *
1651112009Sjlemontcp_twclose(struct tcptw *tw, int reuse)
1652111145Sjlemon{
1653111145Sjlemon	struct inpcb *inp;
1654111145Sjlemon
1655111145Sjlemon	inp = tw->tw_inpcb;
1656111145Sjlemon	tw->tw_inpcb = NULL;
1657112009Sjlemon	tcp_timer_2msl_stop(tw);
1658111145Sjlemon	inp->inp_ppcb = NULL;
1659111145Sjlemon#ifdef INET6
1660111145Sjlemon	if (inp->inp_vflag & INP_IPV6PROTO)
1661111145Sjlemon		in6_pcbdetach(inp);
1662111145Sjlemon	else
1663111145Sjlemon#endif
1664111145Sjlemon		in_pcbdetach(inp);
1665111145Sjlemon	tcpstat.tcps_closed++;
1666126002Spjd	crfree(tw->tw_cred);
1667126002Spjd	tw->tw_cred = NULL;
1668112009Sjlemon	if (reuse)
1669112009Sjlemon		return (tw);
1670112009Sjlemon	uma_zfree(tcptw_zone, tw);
1671112009Sjlemon	return (NULL);
1672111145Sjlemon}
1673111145Sjlemon
1674111145Sjlemonint
1675126351Srwatsontcp_twrespond(struct tcptw *tw, int flags)
1676111145Sjlemon{
1677111145Sjlemon	struct inpcb *inp = tw->tw_inpcb;
1678111145Sjlemon	struct tcphdr *th;
1679111145Sjlemon	struct mbuf *m;
1680111145Sjlemon	struct ip *ip = NULL;
1681111145Sjlemon	u_int8_t *optp;
1682111145Sjlemon	u_int hdrlen, optlen;
1683111145Sjlemon	int error;
1684111145Sjlemon#ifdef INET6
1685111145Sjlemon	struct ip6_hdr *ip6 = NULL;
1686111145Sjlemon	int isipv6 = inp->inp_inc.inc_isipv6;
1687111145Sjlemon#endif
1688111145Sjlemon
1689111231Sphk	m = m_gethdr(M_DONTWAIT, MT_HEADER);
1690111145Sjlemon	if (m == NULL)
1691111145Sjlemon		return (ENOBUFS);
1692111145Sjlemon	m->m_data += max_linkhdr;
1693111145Sjlemon
1694114794Srwatson#ifdef MAC
1695123607Srwatson	mac_create_mbuf_from_inpcb(inp, m);
1696114794Srwatson#endif
1697114794Srwatson
1698111153Sjlemon#ifdef INET6
1699111145Sjlemon	if (isipv6) {
1700111145Sjlemon		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1701111145Sjlemon		ip6 = mtod(m, struct ip6_hdr *);
1702111145Sjlemon		th = (struct tcphdr *)(ip6 + 1);
1703111145Sjlemon		tcpip_fillheaders(inp, ip6, th);
1704111153Sjlemon	} else
1705111153Sjlemon#endif
1706111153Sjlemon	{
1707111145Sjlemon		hdrlen = sizeof(struct tcpiphdr);
1708111145Sjlemon		ip = mtod(m, struct ip *);
1709111145Sjlemon		th = (struct tcphdr *)(ip + 1);
1710111145Sjlemon		tcpip_fillheaders(inp, ip, th);
1711111145Sjlemon	}
1712111145Sjlemon	optp = (u_int8_t *)(th + 1);
1713111145Sjlemon
1714111145Sjlemon 	/*
1715111145Sjlemon	 * Send a timestamp and echo-reply if both our side and our peer
1716111145Sjlemon	 * have sent timestamps in our SYN's and this is not a RST.
1717111145Sjlemon 	 */
1718111145Sjlemon	if (tw->t_recent && flags == TH_ACK) {
1719111145Sjlemon		u_int32_t *lp = (u_int32_t *)optp;
1720111145Sjlemon
1721111145Sjlemon 		/* Form timestamp option as shown in appendix A of RFC 1323. */
1722111145Sjlemon 		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
1723111145Sjlemon 		*lp++ = htonl(ticks);
1724111145Sjlemon 		*lp   = htonl(tw->t_recent);
1725111145Sjlemon 		optp += TCPOLEN_TSTAMP_APPA;
1726111145Sjlemon 	}
1727111145Sjlemon
1728111145Sjlemon 	/*
1729111145Sjlemon	 * Send `CC-family' options if needed, and it's not a RST.
1730111145Sjlemon 	 */
1731111145Sjlemon	if (tw->cc_recv != 0 && flags == TH_ACK) {
1732111145Sjlemon		u_int32_t *lp = (u_int32_t *)optp;
1733111145Sjlemon
1734111145Sjlemon		*lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
1735111145Sjlemon		*lp   = htonl(tw->cc_send);
1736111145Sjlemon		optp += TCPOLEN_CC_APPA;
1737111145Sjlemon 	}
1738111145Sjlemon	optlen = optp - (u_int8_t *)(th + 1);
1739111145Sjlemon
1740111145Sjlemon	m->m_len = hdrlen + optlen;
1741111145Sjlemon	m->m_pkthdr.len = m->m_len;
1742111145Sjlemon
1743111145Sjlemon	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
1744111145Sjlemon
1745111145Sjlemon	th->th_seq = htonl(tw->snd_nxt);
1746111145Sjlemon	th->th_ack = htonl(tw->rcv_nxt);
1747111145Sjlemon	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
1748111145Sjlemon	th->th_flags = flags;
1749111145Sjlemon	th->th_win = htons(tw->last_win);
1750111145Sjlemon
1751111153Sjlemon#ifdef INET6
1752111145Sjlemon	if (isipv6) {
1753111145Sjlemon		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
1754111145Sjlemon		    sizeof(struct tcphdr) + optlen);
1755122922Sandre		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
1756122922Sandre		error = ip6_output(m, inp->in6p_outputopts, NULL,
1757111145Sjlemon		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
1758111153Sjlemon	} else
1759111153Sjlemon#endif
1760111153Sjlemon	{
1761111145Sjlemon		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1762111145Sjlemon                    htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
1763111145Sjlemon		m->m_pkthdr.csum_flags = CSUM_TCP;
1764111145Sjlemon		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1765111145Sjlemon		ip->ip_len = m->m_pkthdr.len;
1766124248Sandre		if (path_mtu_discovery)
1767124248Sandre			ip->ip_off |= IP_DF;
1768122922Sandre		error = ip_output(m, inp->inp_options, NULL,
1769111145Sjlemon		    (tw->tw_so_options & SO_DONTROUTE), NULL, inp);
1770111145Sjlemon	}
1771111145Sjlemon	if (flags & TH_ACK)
1772111145Sjlemon		tcpstat.tcps_sndacks++;
1773111145Sjlemon	else
1774111145Sjlemon		tcpstat.tcps_sndctrl++;
1775111145Sjlemon	tcpstat.tcps_sndtotal++;
1776111145Sjlemon	return (error);
1777111145Sjlemon}
1778111145Sjlemon
1779111145Sjlemon/*
1780102017Sdillon * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
1781102017Sdillon *
1782102017Sdillon * This code attempts to calculate the bandwidth-delay product as a
1783102017Sdillon * means of determining the optimal window size to maximize bandwidth,
1784102017Sdillon * minimize RTT, and avoid the over-allocation of buffers on interfaces and
1785102017Sdillon * routers.  This code also does a fairly good job keeping RTTs in check
1786102017Sdillon * across slow links like modems.  We implement an algorithm which is very
1787102017Sdillon * similar (but not meant to be) TCP/Vegas.  The code operates on the
1788102017Sdillon * transmitter side of a TCP connection and so only effects the transmit
1789102017Sdillon * side of the connection.
1790102017Sdillon *
1791102017Sdillon * BACKGROUND:  TCP makes no provision for the management of buffer space
1792102017Sdillon * at the end points or at the intermediate routers and switches.  A TCP
1793102017Sdillon * stream, whether using NewReno or not, will eventually buffer as
1794102017Sdillon * many packets as it is able and the only reason this typically works is
1795102017Sdillon * due to the fairly small default buffers made available for a connection
1796102017Sdillon * (typicaly 16K or 32K).  As machines use larger windows and/or window
1797102017Sdillon * scaling it is now fairly easy for even a single TCP connection to blow-out
1798102017Sdillon * all available buffer space not only on the local interface, but on
1799102017Sdillon * intermediate routers and switches as well.  NewReno makes a misguided
1800102017Sdillon * attempt to 'solve' this problem by waiting for an actual failure to occur,
1801102017Sdillon * then backing off, then steadily increasing the window again until another
1802102017Sdillon * failure occurs, ad-infinitum.  This results in terrible oscillation that
1803102017Sdillon * is only made worse as network loads increase and the idea of intentionally
1804102017Sdillon * blowing out network buffers is, frankly, a terrible way to manage network
1805102017Sdillon * resources.
1806102017Sdillon *
1807102017Sdillon * It is far better to limit the transmit window prior to the failure
1808102017Sdillon * condition being achieved.  There are two general ways to do this:  First
1809102017Sdillon * you can 'scan' through different transmit window sizes and locate the
1810102017Sdillon * point where the RTT stops increasing, indicating that you have filled the
1811102017Sdillon * pipe, then scan backwards until you note that RTT stops decreasing, then
1812102017Sdillon * repeat ad-infinitum.  This method works in principle but has severe
1813102017Sdillon * implementation issues due to RTT variances, timer granularity, and
1814102017Sdillon * instability in the algorithm which can lead to many false positives and
1815102017Sdillon * create oscillations as well as interact badly with other TCP streams
1816102017Sdillon * implementing the same algorithm.
1817102017Sdillon *
1818102017Sdillon * The second method is to limit the window to the bandwidth delay product
1819102017Sdillon * of the link.  This is the method we implement.  RTT variances and our
1820102017Sdillon * own manipulation of the congestion window, bwnd, can potentially
1821102017Sdillon * destabilize the algorithm.  For this reason we have to stabilize the
1822102017Sdillon * elements used to calculate the window.  We do this by using the minimum
1823102017Sdillon * observed RTT, the long term average of the observed bandwidth, and
1824102017Sdillon * by adding two segments worth of slop.  It isn't perfect but it is able
1825102017Sdillon * to react to changing conditions and gives us a very stable basis on
1826102017Sdillon * which to extend the algorithm.
1827102017Sdillon */
1828102017Sdillonvoid
1829102017Sdillontcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
1830102017Sdillon{
1831102017Sdillon	u_long bw;
1832102017Sdillon	u_long bwnd;
1833102017Sdillon	int save_ticks;
1834102017Sdillon
1835102017Sdillon	/*
1836102017Sdillon	 * If inflight_enable is disabled in the middle of a tcp connection,
1837102017Sdillon	 * make sure snd_bwnd is effectively disabled.
1838102017Sdillon	 */
1839102017Sdillon	if (tcp_inflight_enable == 0) {
1840102017Sdillon		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1841102017Sdillon		tp->snd_bandwidth = 0;
1842102017Sdillon		return;
1843102017Sdillon	}
1844102017Sdillon
1845102017Sdillon	/*
1846102017Sdillon	 * Figure out the bandwidth.  Due to the tick granularity this
1847102017Sdillon	 * is a very rough number and it MUST be averaged over a fairly
1848102017Sdillon	 * long period of time.  XXX we need to take into account a link
1849102017Sdillon	 * that is not using all available bandwidth, but for now our
1850102017Sdillon	 * slop will ramp us up if this case occurs and the bandwidth later
1851102017Sdillon	 * increases.
1852102368Sdillon	 *
1853102368Sdillon	 * Note: if ticks rollover 'bw' may wind up negative.  We must
1854102368Sdillon	 * effectively reset t_bw_rtttime for this case.
1855102017Sdillon	 */
1856102017Sdillon	save_ticks = ticks;
1857102017Sdillon	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
1858102017Sdillon		return;
1859102017Sdillon
1860102017Sdillon	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
1861102017Sdillon	    (save_ticks - tp->t_bw_rtttime);
1862102017Sdillon	tp->t_bw_rtttime = save_ticks;
1863102017Sdillon	tp->t_bw_rtseq = ack_seq;
1864102368Sdillon	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
1865102017Sdillon		return;
1866102017Sdillon	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
1867102017Sdillon
1868102017Sdillon	tp->snd_bandwidth = bw;
1869102017Sdillon
1870102017Sdillon	/*
1871102017Sdillon	 * Calculate the semi-static bandwidth delay product, plus two maximal
1872102017Sdillon	 * segments.  The additional slop puts us squarely in the sweet
1873107881Sdillon	 * spot and also handles the bandwidth run-up case and stabilization.
1874107881Sdillon	 * Without the slop we could be locking ourselves into a lower
1875107881Sdillon	 * bandwidth.
1876102017Sdillon	 *
1877102017Sdillon	 * Situations Handled:
1878102017Sdillon	 *	(1) Prevents over-queueing of packets on LANs, especially on
1879102017Sdillon	 *	    high speed LANs, allowing larger TCP buffers to be
1880102017Sdillon	 *	    specified, and also does a good job preventing
1881102017Sdillon	 *	    over-queueing of packets over choke points like modems
1882102017Sdillon	 *	    (at least for the transmit side).
1883102017Sdillon	 *
1884102017Sdillon	 *	(2) Is able to handle changing network loads (bandwidth
1885102017Sdillon	 *	    drops so bwnd drops, bandwidth increases so bwnd
1886102017Sdillon	 *	    increases).
1887102017Sdillon	 *
1888102017Sdillon	 *	(3) Theoretically should stabilize in the face of multiple
1889102017Sdillon	 *	    connections implementing the same algorithm (this may need
1890102017Sdillon	 *	    a little work).
1891107881Sdillon	 *
1892107881Sdillon	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
1893107881Sdillon	 *	    be adjusted with a sysctl but typically only needs to be
1894107881Sdillon	 *	    on very slow connections.  A value no smaller then 5
1895107881Sdillon	 *	    should be used, but only reduce this default if you have
1896107881Sdillon	 *	    no other choice.
1897102017Sdillon	 */
1898102017Sdillon#define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
1899107881Sdillon	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
1900102368Sdillon#undef USERTT
1901102017Sdillon
1902102017Sdillon	if (tcp_inflight_debug > 0) {
1903102017Sdillon		static int ltime;
1904102017Sdillon		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
1905102017Sdillon			ltime = ticks;
1906102017Sdillon			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
1907102017Sdillon			    tp,
1908102017Sdillon			    bw,
1909102017Sdillon			    tp->t_rttbest,
1910102017Sdillon			    tp->t_srtt,
1911102017Sdillon			    bwnd
1912102017Sdillon			);
1913102017Sdillon		}
1914102017Sdillon	}
1915102017Sdillon	if ((long)bwnd < tcp_inflight_min)
1916102017Sdillon		bwnd = tcp_inflight_min;
1917102017Sdillon	if (bwnd > tcp_inflight_max)
1918102017Sdillon		bwnd = tcp_inflight_max;
1919102017Sdillon	if ((long)bwnd < tp->t_maxseg * 2)
1920102017Sdillon		bwnd = tp->t_maxseg * 2;
1921102017Sdillon	tp->snd_bwnd = bwnd;
1922102017Sdillon}
1923102017Sdillon
1924125680Sbms#ifdef TCP_SIGNATURE
1925125680Sbms/*
1926125783Sbms * Callback function invoked by m_apply() to digest TCP segment data
1927125783Sbms * contained within an mbuf chain.
1928125783Sbms */
1929125783Sbmsstatic int
1930125783Sbmstcp_signature_apply(void *fstate, void *data, u_int len)
1931125783Sbms{
1932125783Sbms
1933125819Sbms	MD5Update(fstate, (u_char *)data, len);
1934125783Sbms	return (0);
1935125783Sbms}
1936125783Sbms
1937125783Sbms/*
1938125680Sbms * Compute TCP-MD5 hash of a TCPv4 segment. (RFC2385)
1939125680Sbms *
1940125741Sbms * Parameters:
1941125741Sbms * m		pointer to head of mbuf chain
1942125741Sbms * off0		offset to TCP header within the mbuf chain
1943125741Sbms * len		length of TCP segment data, excluding options
1944125741Sbms * optlen	length of TCP segment options
1945125741Sbms * buf		pointer to storage for computed MD5 digest
1946125741Sbms * direction	direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
1947125741Sbms *
1948125680Sbms * We do this over ip, tcphdr, segment data, and the key in the SADB.
1949125680Sbms * When called from tcp_input(), we can be sure that th_sum has been
1950125680Sbms * zeroed out and verified already.
1951125680Sbms *
1952125680Sbms * This function is for IPv4 use only. Calling this function with an
1953125680Sbms * IPv6 packet in the mbuf chain will yield undefined results.
1954125680Sbms *
1955125680Sbms * Return 0 if successful, otherwise return -1.
1956125680Sbms *
1957125680Sbms * XXX The key is retrieved from the system's PF_KEY SADB, by keying a
1958125680Sbms * search with the destination IP address, and a 'magic SPI' to be
1959125680Sbms * determined by the application. This is hardcoded elsewhere to 1179
1960125680Sbms * right now. Another branch of this code exists which uses the SPD to
1961125680Sbms * specify per-application flows but it is unstable.
1962125680Sbms */
1963125680Sbmsint
1964125783Sbmstcp_signature_compute(struct mbuf *m, int off0, int len, int optlen,
1965125741Sbms    u_char *buf, u_int direction)
1966125680Sbms{
1967125680Sbms	union sockaddr_union dst;
1968125680Sbms	struct ippseudo ippseudo;
1969125680Sbms	MD5_CTX ctx;
1970125680Sbms	int doff;
1971125680Sbms	struct ip *ip;
1972125680Sbms	struct ipovly *ipovly;
1973125680Sbms	struct secasvar *sav;
1974125680Sbms	struct tcphdr *th;
1975125680Sbms	u_short savecsum;
1976125680Sbms
1977125741Sbms	KASSERT(m != NULL, ("NULL mbuf chain"));
1978125741Sbms	KASSERT(buf != NULL, ("NULL signature pointer"));
1979125741Sbms
1980125741Sbms	/* Extract the destination from the IP header in the mbuf. */
1981125680Sbms	ip = mtod(m, struct ip *);
1982125680Sbms	bzero(&dst, sizeof(union sockaddr_union));
1983125680Sbms	dst.sa.sa_len = sizeof(struct sockaddr_in);
1984125680Sbms	dst.sa.sa_family = AF_INET;
1985125680Sbms	dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
1986125680Sbms	    ip->ip_src : ip->ip_dst;
1987125741Sbms
1988125741Sbms	/* Look up an SADB entry which matches the address of the peer. */
1989125680Sbms	sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
1990125680Sbms	if (sav == NULL) {
1991125680Sbms		printf("%s: SADB lookup failed for %s\n", __func__,
1992125680Sbms		    inet_ntoa(dst.sin.sin_addr));
1993125680Sbms		return (EINVAL);
1994125680Sbms	}
1995125741Sbms
1996125680Sbms	MD5Init(&ctx);
1997125680Sbms	ipovly = (struct ipovly *)ip;
1998125680Sbms	th = (struct tcphdr *)((u_char *)ip + off0);
1999125680Sbms	doff = off0 + sizeof(struct tcphdr) + optlen;
2000125741Sbms
2001125680Sbms	/*
2002125680Sbms	 * Step 1: Update MD5 hash with IP pseudo-header.
2003125680Sbms	 *
2004125680Sbms	 * XXX The ippseudo header MUST be digested in network byte order,
2005125680Sbms	 * or else we'll fail the regression test. Assume all fields we've
2006125680Sbms	 * been doing arithmetic on have been in host byte order.
2007125680Sbms	 * XXX One cannot depend on ipovly->ih_len here. When called from
2008125680Sbms	 * tcp_output(), the underlying ip_len member has not yet been set.
2009125680Sbms	 */
2010125680Sbms	ippseudo.ippseudo_src = ipovly->ih_src;
2011125680Sbms	ippseudo.ippseudo_dst = ipovly->ih_dst;
2012125680Sbms	ippseudo.ippseudo_pad = 0;
2013125680Sbms	ippseudo.ippseudo_p = IPPROTO_TCP;
2014125680Sbms	ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen);
2015125680Sbms	MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));
2016125741Sbms
2017125680Sbms	/*
2018125680Sbms	 * Step 2: Update MD5 hash with TCP header, excluding options.
2019125680Sbms	 * The TCP checksum must be set to zero.
2020125680Sbms	 */
2021125680Sbms	savecsum = th->th_sum;
2022125680Sbms	th->th_sum = 0;
2023125680Sbms	MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
2024125680Sbms	th->th_sum = savecsum;
2025125741Sbms
2026125680Sbms	/*
2027125680Sbms	 * Step 3: Update MD5 hash with TCP segment data.
2028125680Sbms	 *         Use m_apply() to avoid an early m_pullup().
2029125680Sbms	 */
2030125680Sbms	if (len > 0)
2031125783Sbms		m_apply(m, doff, len, tcp_signature_apply, &ctx);
2032125741Sbms
2033125680Sbms	/*
2034125680Sbms	 * Step 4: Update MD5 hash with shared secret.
2035125680Sbms	 */
2036125680Sbms	MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
2037125680Sbms	MD5Final(buf, &ctx);
2038125741Sbms
2039125680Sbms	key_sa_recordxfer(sav, m);
2040125680Sbms	KEY_FREESAV(&sav);
2041125680Sbms	return (0);
2042125680Sbms}
2043125680Sbms#endif /* TCP_SIGNATURE */
2044