1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 *	The Regents of the University of California.  All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *	This product includes software developed by the University of
43 *	California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.73.2.22 2001/08/22 00:59:12 silby Exp $
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections.  This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/callout.h>
73#include <sys/kernel.h>
74#include <sys/sysctl.h>
75#include <sys/malloc.h>
76#include <sys/mbuf.h>
77#include <sys/domain.h>
78#include <sys/proc.h>
79#include <sys/kauth.h>
80#include <sys/socket.h>
81#include <sys/socketvar.h>
82#include <sys/protosw.h>
83#include <sys/random.h>
84#include <sys/syslog.h>
85#include <sys/mcache.h>
86#include <kern/locks.h>
87#include <kern/zalloc.h>
88
89#include <net/route.h>
90#include <net/if.h>
91
92#define tcp_minmssoverload fring
93#define _IP_VHL
94#include <netinet/in.h>
95#include <netinet/in_systm.h>
96#include <netinet/ip.h>
97#include <netinet/ip_icmp.h>
98#if INET6
99#include <netinet/ip6.h>
100#endif
101#include <netinet/in_pcb.h>
102#if INET6
103#include <netinet6/in6_pcb.h>
104#endif
105#include <netinet/in_var.h>
106#include <netinet/ip_var.h>
107#include <netinet/icmp_var.h>
108#if INET6
109#include <netinet6/ip6_var.h>
110#endif
111#include <netinet/tcp.h>
112#include <netinet/tcp_fsm.h>
113#include <netinet/tcp_seq.h>
114#include <netinet/tcp_timer.h>
115#include <netinet/tcp_var.h>
116#include <netinet/tcp_cc.h>
117#include <kern/thread_call.h>
118
119#if INET6
120#include <netinet6/tcp6_var.h>
121#endif
122#include <netinet/tcpip.h>
123#if TCPDEBUG
124#include <netinet/tcp_debug.h>
125#endif
126#include <netinet6/ip6protosw.h>
127
128#if IPSEC
129#include <netinet6/ipsec.h>
130#if INET6
131#include <netinet6/ipsec6.h>
132#endif
133#endif /*IPSEC*/
134
135#undef tcp_minmssoverload
136
137#if CONFIG_MACF_NET
138#include <security/mac_framework.h>
139#endif /* MAC_NET */
140
141#include <libkern/crypto/md5.h>
142#include <sys/kdebug.h>
143#include <mach/sdt.h>
144
145#include <netinet/lro_ext.h>
146
147#define DBG_FNC_TCP_CLOSE	NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))
148
149extern int tcp_lq_overflow;
150
151/* temporary: for testing */
152#if IPSEC
153extern int ipsec_bypass;
154#endif
155
156int 	tcp_mssdflt = TCP_MSS;
157SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED,
158    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
159
160#if INET6
161int	tcp_v6mssdflt = TCP6_MSS;
162SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
163	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_v6mssdflt , 0,
164	"Default TCP Maximum Segment Size for IPv6");
165#endif
166
167extern int tcp_do_autorcvbuf;
168
169/*
170 * Minimum MSS we accept and use. This prevents DoS attacks where
171 * we are forced to a ridiculous low MSS like 20 and send hundreds
172 * of packets instead of one. The effect scales with the available
173 * bandwidth and quickly saturates the CPU and network interface
174 * with packet generation and sending. Set to zero to disable MINMSS
175 * checking. This setting prevents us from sending too small packets.
176 */
177int	tcp_minmss = TCP_MINMSS;
178SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED,
179    &tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
180
181/*
182 * Number of TCP segments per second we accept from remote host
183 * before we start to calculate average segment size. If average
184 * segment size drops below the minimum TCP MSS we assume a DoS
185 * attack and reset+drop the connection. Care has to be taken not to
186 * set this value too small to not kill interactive type connections
187 * (telnet, SSH) which send many small packets.
188 */
189#ifdef FIX_WORKAROUND_FOR_3894301
190__private_extern__ int     tcp_minmssoverload = TCP_MINMSSOVERLOAD;
191#else
192__private_extern__ int     tcp_minmssoverload = 0;
193#endif
194SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW | CTLFLAG_LOCKED,
195    &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to"
196    "be under the MINMSS Size");
197
198static int	tcp_do_rfc1323 = 1;
199SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW | CTLFLAG_LOCKED,
200    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
201
202// Not used
203static int	tcp_do_rfc1644 = 0;
204SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW | CTLFLAG_LOCKED,
205    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
206
207static int	do_tcpdrain = 0;
208SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW | CTLFLAG_LOCKED, &do_tcpdrain, 0,
209     "Enable tcp_drain routine for extra help when low on mbufs");
210
211SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
212    &tcbinfo.ipi_count, 0, "Number of active PCBs");
213
214static int	icmp_may_rst = 1;
215SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp_may_rst, 0,
216    "Certain ICMP unreachable messages may abort connections in SYN_SENT");
217
218static int	tcp_strict_rfc1948 = 0;
219SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED,
220    &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly");
221
222static int	tcp_isn_reseed_interval = 0;
223SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW | CTLFLAG_LOCKED,
224    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
225static int 	tcp_background_io_enabled = 1;
226SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_enabled, CTLFLAG_RW | CTLFLAG_LOCKED,
227    &tcp_background_io_enabled, 0, "Background IO Enabled");
228
229int 	tcp_TCPTV_MIN = 100;	/* 100ms minimum RTT */
230SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED,
231    &tcp_TCPTV_MIN, 0, "min rtt value allowed");
232
233int tcp_rexmt_slop = TCPTV_REXMTSLOP;
234SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmt_slop, CTLFLAG_RW,
235	&tcp_rexmt_slop, 0, "Slop added to retransmit timeout");
236
237__private_extern__ int tcp_use_randomport = 0;
238SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED,
239    &tcp_use_randomport, 0, "Randomize TCP port numbers");
240
241extern struct tcp_cc_algo tcp_cc_newreno;
242SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets, CTLFLAG_RD | CTLFLAG_LOCKED,
243	&tcp_cc_newreno.num_sockets, 0, "Number of sockets using newreno");
244
245extern struct tcp_cc_algo tcp_cc_ledbat;
246SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets, CTLFLAG_RD | CTLFLAG_LOCKED,
247	&tcp_cc_ledbat.num_sockets, 0, "Number of sockets using background transport");
248
249__private_extern__ int	tcp_win_scale = 3;
250SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED,
251    &tcp_win_scale, 0, "Window scaling factor");
252
253static void	tcp_cleartaocache(void);
254static void	tcp_notify(struct inpcb *, int);
255static void	tcp_cc_init(void);
256
257struct zone	*sack_hole_zone;
258struct zone	*tcp_reass_zone;
259struct zone	*tcp_bwmeas_zone;
260
261/* The array containing pointers to currently implemented TCP CC algorithms */
262struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT];
263
264extern int slowlink_wsize;	/* window correction for slow links */
265extern int path_mtu_discovery;
266
267extern u_int32_t tcp_autorcvbuf_max;
268extern u_int32_t tcp_autorcvbuf_inc_shift;
269static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb);
270
271#define TCP_BWMEAS_BURST_MINSIZE 6
272#define TCP_BWMEAS_BURST_MAXSIZE 25
273
274static uint32_t bwmeas_elm_size;
275
276/*
277 * Target size of TCP PCB hash tables. Must be a power of two.
278 *
279 * Note that this can be overridden by the kernel environment
280 * variable net.inet.tcp.tcbhashsize
281 */
282#ifndef TCBHASHSIZE
283#define TCBHASHSIZE	CONFIG_TCBHASHSIZE
284#endif
285
286__private_extern__ int	tcp_tcbhashsize = TCBHASHSIZE;
287SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED,
288     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
289
290/*
291 * This is the actual shape of what we allocate using the zone
292 * allocator.  Doing it this way allows us to protect both structures
293 * using the same generation count, and also eliminates the overhead
294 * of allocating tcpcbs separately.  By hiding the structure here,
295 * we avoid changing most of the rest of the code (although it needs
296 * to be changed, eventually, for greater efficiency).
297 */
298#define	ALIGNMENT	32
299struct	inp_tp {
300	struct	inpcb	inp;
301	struct	tcpcb	tcb __attribute__((aligned(ALIGNMENT)));
302};
303#undef ALIGNMENT
304
305extern struct	inpcbhead	time_wait_slots[];
306extern struct tcptimerlist tcp_timer_list;
307
308int  get_inpcb_str_size(void);
309int  get_tcp_str_size(void);
310
311static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *);
312
313static lck_attr_t *tcp_uptime_mtx_attr = NULL;		/* mutex attributes */
314static lck_grp_t *tcp_uptime_mtx_grp = NULL;		/* mutex group definition */
315static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL;	/* mutex group attributes */
316int tcp_notsent_lowat_check(struct socket *so);
317
318
319int  get_inpcb_str_size(void)
320{
321	return sizeof(struct inpcb);
322}
323
324
325int  get_tcp_str_size(void)
326{
327	return sizeof(struct tcpcb);
328}
329
330int	tcp_freeq(struct tcpcb *tp);
331
332/*
333 * Initialize TCP congestion control algorithms.
334 */
335
336void
337tcp_cc_init(void)
338{
339	bzero(&tcp_cc_algo_list, sizeof(tcp_cc_algo_list));
340	tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno;
341	tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat;
342}
343
344/*
345 * Tcp initialization
346 */
347void
348tcp_init()
349{
350	vm_size_t       str_size;
351	int i;
352    	struct inpcbinfo *pcbinfo;
353
354	tcp_ccgen = 1;
355	tcp_cleartaocache();
356
357	tcp_keepinit = TCPTV_KEEP_INIT;
358	tcp_keepidle = TCPTV_KEEP_IDLE;
359	tcp_keepintvl = TCPTV_KEEPINTVL;
360	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
361	tcp_msl = TCPTV_MSL;
362
363	microuptime(&tcp_uptime);
364	read_random(&tcp_now, sizeof(tcp_now));
365	tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal clock at a random value */
366
367	LIST_INIT(&tcb);
368	tcbinfo.listhead = &tcb;
369	pcbinfo = &tcbinfo;
370	if (!powerof2(tcp_tcbhashsize)) {
371		printf("WARNING: TCB hash size not a power of 2\n");
372		tcp_tcbhashsize = 512; /* safe default */
373	}
374	tcbinfo.hashsize = tcp_tcbhashsize;
375	tcbinfo.hashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.hashmask);
376	tcbinfo.porthashbase = hashinit(tcp_tcbhashsize, M_PCB,
377					&tcbinfo.porthashmask);
378	str_size = P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t));
379	tcbinfo.ipi_zone = (void *) zinit(str_size, 120000*str_size, 8192, "tcpcb");
380	zone_change(tcbinfo.ipi_zone, Z_CALLERACCT, FALSE);
381	zone_change(tcbinfo.ipi_zone, Z_EXPAND, TRUE);
382
383	str_size = P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t));
384	sack_hole_zone = zinit(str_size, 120000*str_size, 8192, "sack_hole zone");
385	zone_change(sack_hole_zone, Z_CALLERACCT, FALSE);
386	zone_change(sack_hole_zone, Z_EXPAND, TRUE);
387
388	tcp_reass_maxseg = nmbclusters / 16;
389	str_size = P2ROUNDUP(sizeof(struct tseg_qent), sizeof(u_int64_t));
390	tcp_reass_zone = zinit(str_size, (tcp_reass_maxseg + 1) * str_size,
391		0, "tcp_reass_zone");
392	if (tcp_reass_zone == NULL) {
393		panic("%s: failed allocating tcp_reass_zone", __func__);
394		/* NOTREACHED */
395	}
396	zone_change(tcp_reass_zone, Z_CALLERACCT, FALSE);
397	zone_change(tcp_reass_zone, Z_EXPAND, TRUE);
398
399	bwmeas_elm_size = P2ROUNDUP(sizeof(struct bwmeas), sizeof(u_int64_t));
400	tcp_bwmeas_zone = zinit(bwmeas_elm_size, (100 * bwmeas_elm_size), 0, "tcp_bwmeas_zone");
401	if (tcp_bwmeas_zone == NULL) {
402		panic("%s: failed allocating tcp_bwmeas_zone", __func__);
403		/* NOTREACHED */
404	}
405	zone_change(tcp_bwmeas_zone, Z_CALLERACCT, FALSE);
406	zone_change(tcp_bwmeas_zone, Z_EXPAND, TRUE);
407
408#if INET6
409#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
410#else /* INET6 */
411#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
412#endif /* INET6 */
413	if (max_protohdr < TCP_MINPROTOHDR) {
414		_max_protohdr = TCP_MINPROTOHDR;
415		_max_protohdr = max_protohdr;	/* round it up */
416	}
417	if (max_linkhdr + max_protohdr > MCLBYTES)
418		panic("tcp_init");
419#undef TCP_MINPROTOHDR
420
421	/*
422	 * allocate lock group attribute and group for tcp pcb mutexes
423	 */
424	pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init();
425	pcbinfo->mtx_grp = lck_grp_alloc_init("tcppcb", pcbinfo->mtx_grp_attr);
426
427	/*
428	 * allocate the lock attribute for tcp pcb mutexes
429	 */
430	pcbinfo->mtx_attr = lck_attr_alloc_init();
431
432	if ((pcbinfo->mtx = lck_rw_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr)) == NULL) {
433		printf("tcp_init: mutex not alloced!\n");
434		return;	/* pretty much dead if this fails... */
435	}
436
437	for (i=0; i < N_TIME_WAIT_SLOTS; i++) {
438	     LIST_INIT(&time_wait_slots[i]);
439	}
440
441	bzero(&tcp_timer_list, sizeof(tcp_timer_list));
442	LIST_INIT(&tcp_timer_list.lhead);
443	/*
444	 * allocate lock group attribute, group and attribute for the tcp timer list
445	 */
446	tcp_timer_list.mtx_grp_attr = lck_grp_attr_alloc_init();
447	tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist", tcp_timer_list.mtx_grp_attr);
448	tcp_timer_list.mtx_attr = lck_attr_alloc_init();
449	if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp, tcp_timer_list.mtx_attr)) == NULL) {
450		panic("failed to allocate memory for tcp_timer_list.mtx\n");
451	};
452	tcp_timer_list.fast_quantum = TCP_FASTTIMER_QUANTUM;
453	tcp_timer_list.slow_quantum = TCP_SLOWTIMER_QUANTUM;
454	if ((tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL)) == NULL) {
455		panic("failed to allocate call entry 1 in tcp_init\n");
456	}
457
458	/*
459	 * allocate lock group attribute, group and attribute for tcp_uptime_lock
460	 */
461	tcp_uptime_mtx_grp_attr = lck_grp_attr_alloc_init();
462	tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime", tcp_uptime_mtx_grp_attr);
463	tcp_uptime_mtx_attr = lck_attr_alloc_init();
464	tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp, tcp_uptime_mtx_attr);
465
466	/* Initialize TCP congestion control algorithms list */
467	tcp_cc_init();
468
469	/* Initialize TCP LRO data structures */
470	tcp_lro_init();
471}
472
473/*
474 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
475 * tcp_template used to store this data in mbufs, but we now recopy it out
476 * of the tcpcb each time to conserve mbufs.
477 */
478void
479tcp_fillheaders(tp, ip_ptr, tcp_ptr)
480	struct tcpcb *tp;
481	void *ip_ptr;
482	void *tcp_ptr;
483{
484	struct inpcb *inp = tp->t_inpcb;
485	struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
486
487#if INET6
488	if ((inp->inp_vflag & INP_IPV6) != 0) {
489		struct ip6_hdr *ip6;
490
491		ip6 = (struct ip6_hdr *)ip_ptr;
492		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
493			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
494		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
495			(IPV6_VERSION & IPV6_VERSION_MASK);
496		ip6->ip6_nxt = IPPROTO_TCP;
497		ip6->ip6_plen = sizeof(struct tcphdr);
498		ip6->ip6_src = inp->in6p_laddr;
499		ip6->ip6_dst = inp->in6p_faddr;
500		tcp_hdr->th_sum = in6_cksum_phdr(&inp->in6p_laddr,
501		    &inp->in6p_faddr, htonl(sizeof(struct tcphdr)),
502		    htonl(IPPROTO_TCP));
503	} else
504#endif
505	{
506	struct ip *ip = (struct ip *) ip_ptr;
507
508	ip->ip_vhl = IP_VHL_BORING;
509	ip->ip_tos = 0;
510	ip->ip_len = 0;
511	ip->ip_id = 0;
512	ip->ip_off = 0;
513	ip->ip_ttl = 0;
514	ip->ip_sum = 0;
515	ip->ip_p = IPPROTO_TCP;
516	ip->ip_src = inp->inp_laddr;
517	ip->ip_dst = inp->inp_faddr;
518	tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
519		htons(sizeof(struct tcphdr) + IPPROTO_TCP));
520	}
521
522	tcp_hdr->th_sport = inp->inp_lport;
523	tcp_hdr->th_dport = inp->inp_fport;
524	tcp_hdr->th_seq = 0;
525	tcp_hdr->th_ack = 0;
526	tcp_hdr->th_x2 = 0;
527	tcp_hdr->th_off = 5;
528	tcp_hdr->th_flags = 0;
529	tcp_hdr->th_win = 0;
530	tcp_hdr->th_urp = 0;
531}
532
533/*
534 * Create template to be used to send tcp packets on a connection.
535 * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
536 * use for this function is in keepalives, which use tcp_respond.
537 */
538struct tcptemp *
539tcp_maketemplate(tp)
540	struct tcpcb *tp;
541{
542	struct mbuf *m;
543	struct tcptemp *n;
544
545	m = m_get(M_DONTWAIT, MT_HEADER);
546	if (m == NULL)
547		return (0);
548	m->m_len = sizeof(struct tcptemp);
549	n = mtod(m, struct tcptemp *);
550
551	tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
552	return (n);
553}
554
555/*
556 * Send a single message to the TCP at address specified by
557 * the given TCP/IP header.  If m == 0, then we make a copy
558 * of the tcpiphdr at ti and send directly to the addressed host.
559 * This is used to force keep alive messages out using the TCP
560 * template for a connection.  If flags are given then we send
561 * a message back to the TCP which originated the * segment ti,
562 * and discard the mbuf containing it and any other attached mbufs.
563 *
564 * In any case the ack and sequence number of the transmitted
565 * segment are as specified by the parameters.
566 *
567 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
568 */
569void
570tcp_respond(
571	struct tcpcb *tp,
572	void *ipgen,
573	register struct tcphdr *th,
574	register struct mbuf *m,
575	tcp_seq ack,
576	tcp_seq seq,
577	int flags,
578	unsigned int ifscope,
579	unsigned int nocell
580	)
581{
582	register int tlen;
583	int win = 0;
584	struct route *ro = 0;
585	struct route sro;
586	struct ip *ip;
587	struct tcphdr *nth;
588#if INET6
589	struct route_in6 *ro6 = 0;
590	struct route_in6 sro6;
591	struct ip6_hdr *ip6;
592	int isipv6;
593#endif /* INET6 */
594	struct ifnet *outif;
595
596#if INET6
597	isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
598	ip6 = ipgen;
599#endif /* INET6 */
600	ip = ipgen;
601
602	if (tp) {
603		if (!(flags & TH_RST)) {
604			win = tcp_sbspace(tp);
605			if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale)
606				win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
607		}
608#if INET6
609		if (isipv6)
610			ro6 = &tp->t_inpcb->in6p_route;
611		else
612#endif /* INET6 */
613		ro = &tp->t_inpcb->inp_route;
614	} else {
615#if INET6
616		if (isipv6) {
617			ro6 = &sro6;
618			bzero(ro6, sizeof *ro6);
619		} else
620#endif /* INET6 */
621		{
622			ro = &sro;
623			bzero(ro, sizeof *ro);
624		}
625	}
626	if (m == 0) {
627		m = m_gethdr(M_DONTWAIT, MT_HEADER);	/* MAC-OK */
628		if (m == NULL)
629			return;
630		tlen = 0;
631		m->m_data += max_linkhdr;
632#if INET6
633		if (isipv6) {
634			bcopy((caddr_t)ip6, mtod(m, caddr_t),
635			      sizeof(struct ip6_hdr));
636			ip6 = mtod(m, struct ip6_hdr *);
637			nth = (struct tcphdr *)(void *)(ip6 + 1);
638		} else
639#endif /* INET6 */
640		{
641			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
642			ip = mtod(m, struct ip *);
643			nth = (struct tcphdr *)(void *)(ip + 1);
644		}
645		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
646		flags = TH_ACK;
647	} else {
648		m_freem(m->m_next);
649		m->m_next = 0;
650		m->m_data = (caddr_t)ipgen;
651		/* m_len is set later */
652		tlen = 0;
653#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
654#if INET6
655		if (isipv6) {
656			/* Expect 32-bit aligned IP on strict-align platforms */
657			IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6);
658			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
659			nth = (struct tcphdr *)(void *)(ip6 + 1);
660		} else
661#endif /* INET6 */
662	      {
663		/* Expect 32-bit aligned IP on strict-align platforms */
664		IP_HDR_STRICT_ALIGNMENT_CHECK(ip);
665		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
666		nth = (struct tcphdr *)(void *)(ip + 1);
667	      }
668		if (th != nth) {
669			/*
670			 * this is usually a case when an extension header
671			 * exists between the IPv6 header and the
672			 * TCP header.
673			 */
674			nth->th_sport = th->th_sport;
675			nth->th_dport = th->th_dport;
676		}
677		xchg(nth->th_dport, nth->th_sport, n_short);
678#undef xchg
679	}
680#if INET6
681	if (isipv6) {
682		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
683						tlen));
684		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
685	} else
686#endif
687      {
688	tlen += sizeof (struct tcpiphdr);
689	ip->ip_len = tlen;
690	ip->ip_ttl = ip_defttl;
691      }
692	m->m_len = tlen;
693	m->m_pkthdr.len = tlen;
694	m->m_pkthdr.rcvif = 0;
695#if CONFIG_MACF_NET
696	if (tp != NULL && tp->t_inpcb != NULL) {
697		/*
698		 * Packet is associated with a socket, so allow the
699		 * label of the response to reflect the socket label.
700		 */
701		mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
702	} else {
703		/*
704		 * Packet is not associated with a socket, so possibly
705		 * update the label in place.
706		 */
707		mac_netinet_tcp_reply(m);
708	}
709#endif
710
711	nth->th_seq = htonl(seq);
712	nth->th_ack = htonl(ack);
713	nth->th_x2 = 0;
714	nth->th_off = sizeof (struct tcphdr) >> 2;
715	nth->th_flags = flags;
716	if (tp)
717		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
718	else
719		nth->th_win = htons((u_short)win);
720	nth->th_urp = 0;
721#if INET6
722	if (isipv6) {
723		nth->th_sum = 0;
724		nth->th_sum = in6_cksum_phdr(&ip6->ip6_src,
725		    &ip6->ip6_dst, htons((u_short)(tlen - sizeof(struct ip6_hdr))),
726		    		htonl(IPPROTO_TCP));
727		m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
728		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
729		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
730					       ro6 && ro6->ro_rt ?
731					       ro6->ro_rt->rt_ifp :
732					       NULL);
733	} else
734#endif /* INET6 */
735	{
736		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
737		htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
738		m->m_pkthdr.csum_flags = CSUM_TCP;
739		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
740	}
741#if TCPDEBUG
742	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
743		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
744#endif
745#if IPSEC
746	if (ipsec_bypass == 0 && ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
747		m_freem(m);
748		return;
749	}
750#endif
751
752	if (tp != NULL) {
753		u_int32_t svc_flags = 0;
754		if (isipv6) {
755			svc_flags |= PKT_SCF_IPV6;
756		}
757		set_packet_service_class(m, tp->t_inpcb->inp_socket,
758		    MBUF_SC_UNSPEC, svc_flags);
759
760		/* Embed flowhash and flow control flags */
761		m->m_pkthdr.m_flowhash = tp->t_inpcb->inp_flowhash;
762		m->m_pkthdr.m_fhflags |=
763		    (PF_TAG_TCP | PF_TAG_FLOWHASH | PF_TAG_FLOWADV);
764	}
765
766#if INET6
767	if (isipv6) {
768		struct ip6_out_args ip6oa = { ifscope, { 0 },
769		    IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR };
770
771		if (ifscope != IFSCOPE_NONE)
772			ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
773		if (nocell)
774			ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
775
776		(void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL,
777		    NULL, &ip6oa);
778		if (ro6->ro_rt != NULL) {
779			if (ro6 == &sro6) {
780				rtfree(ro6->ro_rt);
781				ro6->ro_rt = NULL;
782			} else if ((outif = ro6->ro_rt->rt_ifp) !=
783			    tp->t_inpcb->in6p_last_outifp) {
784				tp->t_inpcb->in6p_last_outifp = outif;
785			}
786		}
787	} else
788#endif /* INET6 */
789	{
790		struct ip_out_args ipoa = { ifscope, { 0 },
791		    IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR };
792
793		if (ifscope != IFSCOPE_NONE)
794			ipoa.ipoa_flags |= IPOAF_BOUND_IF;
795		if (nocell)
796			ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;
797
798		if (ro != &sro) {
799			/* Copy the cached route and take an extra reference */
800			inp_route_copyout(tp->t_inpcb, &sro);
801		}
802		/*
803		 * For consistency, pass a local route copy.
804		 */
805		(void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa);
806
807		if (ro != &sro) {
808			if (sro.ro_rt != NULL &&
809			    (outif = sro.ro_rt->rt_ifp) !=
810			    tp->t_inpcb->inp_last_outifp)
811				tp->t_inpcb->inp_last_outifp = outif;
812			/* Synchronize cached PCB route */
813			inp_route_copyin(tp->t_inpcb, &sro);
814		} else if (sro.ro_rt != NULL) {
815			rtfree(sro.ro_rt);
816		}
817	}
818}
819
820/*
821 * Create a new TCP control block, making an
822 * empty reassembly queue and hooking it to the argument
823 * protocol control block.  The `inp' parameter must have
824 * come from the zone allocator set up in tcp_init().
825 */
826struct tcpcb *
827tcp_newtcpcb(inp)
828	struct inpcb *inp;
829{
830	struct inp_tp *it;
831	register struct tcpcb *tp;
832	register struct socket *so = inp->inp_socket;
833#if INET6
834	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
835#endif /* INET6 */
836
837	calculate_tcp_clock();
838
839	if (so->cached_in_sock_layer == 0) {
840	     it = (struct inp_tp *)(void *)inp;
841	     tp = &it->tcb;
842	}
843	else
844	     tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb;
845
846	bzero((char *) tp, sizeof(struct tcpcb));
847	LIST_INIT(&tp->t_segq);
848	tp->t_maxseg = tp->t_maxopd =
849#if INET6
850		isipv6 ? tcp_v6mssdflt :
851#endif /* INET6 */
852		tcp_mssdflt;
853
854	if (tcp_do_rfc1323)
855		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
856	tp->sack_enable = tcp_do_sack;
857	TAILQ_INIT(&tp->snd_holes);
858	tp->t_inpcb = inp;	/* XXX */
859	/*
860	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
861	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
862	 * reasonable initial retransmit time.
863	 */
864	tp->t_srtt = TCPTV_SRTTBASE;
865	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
866	tp->t_rttmin = tcp_TCPTV_MIN;
867	tp->t_rxtcur = TCPTV_RTOBASE;
868
869	/* Initialize congestion control algorithm for this connection
870	 * to newreno by default
871	 */
872	tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
873	if (CC_ALGO(tp)->init != NULL) {
874		CC_ALGO(tp)->init(tp);
875	}
876
877	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
878	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
879	tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
880	tp->t_rcvtime = tcp_now;
881	tp->tentry.timer_start = tcp_now;
882	tp->t_persist_timeout = tcp_max_persist_timeout;
883	tp->t_persist_stop = 0;
884	tp->t_flagsext |= TF_RCVUNACK_WAITSS;
885	/*
886	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
887	 * because the socket may be bound to an IPv6 wildcard address,
888	 * which may match an IPv4-mapped IPv6 address.
889	 */
890	inp->inp_ip_ttl = ip_defttl;
891	inp->inp_ppcb = (caddr_t)tp;
892	return (tp);		/* XXX */
893}
894
895/*
896 * Drop a TCP connection, reporting
897 * the specified error.  If connection is synchronized,
898 * then send a RST to peer.
899 */
900struct tcpcb *
901tcp_drop(tp, errno)
902	register struct tcpcb *tp;
903	int errno;
904{
905	struct socket *so = tp->t_inpcb->inp_socket;
906#if CONFIG_DTRACE
907	struct inpcb *inp = tp->t_inpcb;
908#endif
909
910	if (TCPS_HAVERCVDSYN(tp->t_state)) {
911		DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
912			struct tcpcb *, tp, int32_t, TCPS_CLOSED);
913		tp->t_state = TCPS_CLOSED;
914		(void) tcp_output(tp);
915		tcpstat.tcps_drops++;
916	} else
917		tcpstat.tcps_conndrops++;
918	if (errno == ETIMEDOUT && tp->t_softerror)
919		errno = tp->t_softerror;
920	so->so_error = errno;
921	return (tcp_close(tp));
922}
923
924void
925tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt)
926{
927	u_int32_t rtt = rt->rt_rmx.rmx_rtt;
928	int isnetlocal = (tp->t_flags & TF_LOCAL);
929
930	if (rtt != 0) {
931		/*
932		 * XXX the lock bit for RTT indicates that the value
933		 * is also a minimum value; this is subject to time.
934		 */
935		if (rt->rt_rmx.rmx_locks & RTV_RTT)
936			tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
937		else
938			tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
939		tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
940		tcpstat.tcps_usedrtt++;
941		if (rt->rt_rmx.rmx_rttvar) {
942			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
943		    		(RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
944			tcpstat.tcps_usedrttvar++;
945		} else {
946			/* default variation is +- 1 rtt */
947			tp->t_rttvar =
948		    		tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
949		}
950		TCPT_RANGESET(tp->t_rxtcur,
951			((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
952			tp->t_rttmin, TCPTV_REXMTMAX,
953			TCP_ADD_REXMTSLOP(tp));
954	}
955}
956
957/*
958 * Close a TCP control block:
959 *	discard all space held by the tcp
960 *	discard internet protocol block
961 *	wake up any sleepers
962 */
963struct tcpcb *
964tcp_close(tp)
965	register struct tcpcb *tp;
966{
967	struct inpcb *inp = tp->t_inpcb;
968	struct socket *so = inp->inp_socket;
969#if INET6
970	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
971#endif /* INET6 */
972	struct rtentry *rt;
973	int dosavessthresh;
974
975	if ( inp->inp_ppcb == NULL) /* tcp_close was called previously, bail */
976		return(NULL);
977
978	tcp_canceltimers(tp);
979	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0);
980
981	/*
982	 * If another thread for this tcp is currently in ip (indicated by
983	 * the TF_SENDINPROG flag), defer the cleanup until after it returns
984	 * back to tcp.  This is done to serialize the close until after all
985	 * pending output is finished, in order to avoid having the PCB be
986	 * detached and the cached route cleaned, only for ip to cache the
987	 * route back into the PCB again.  Note that we've cleared all the
988	 * timers at this point.  Set TF_CLOSING to indicate to tcp_output()
989	 * that is should call us again once it returns from ip; at that
990	 * point both flags should be cleared and we can proceed further
991	 * with the cleanup.
992	 */
993	if ((tp->t_flags & TF_CLOSING) ||
994		inp->inp_sndinprog_cnt > 0) {
995		tp->t_flags |= TF_CLOSING;
996		return (NULL);
997	}
998
999	if (CC_ALGO(tp)->cleanup != NULL) {
1000		CC_ALGO(tp)->cleanup(tp);
1001	}
1002
1003#if INET6
1004	rt = isipv6 ? inp->in6p_route.ro_rt : inp->inp_route.ro_rt;
1005#else
1006	rt = inp->inp_route.ro_rt;
1007#endif
1008	if (rt != NULL)
1009		RT_LOCK_SPIN(rt);
1010
1011	/*
1012	 * If we got enough samples through the srtt filter,
1013	 * save the rtt and rttvar in the routing entry.
1014	 * 'Enough' is arbitrarily defined as the 16 samples.
1015	 * 16 samples is enough for the srtt filter to converge
1016	 * to within 5% of the correct value; fewer samples and
1017	 * we could save a very bogus rtt.
1018	 *
1019	 * Don't update the default route's characteristics and don't
1020	 * update anything that the user "locked".
1021	 */
1022	if (tp->t_rttupdated >= 16) {
1023		register u_int32_t i = 0;
1024
1025#if INET6
1026		if (isipv6) {
1027			struct sockaddr_in6 *sin6;
1028
1029			if (rt == NULL)
1030				goto no_valid_rt;
1031			sin6 = (struct sockaddr_in6 *)(void *)rt_key(rt);
1032			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
1033				goto no_valid_rt;
1034		}
1035		else
1036#endif /* INET6 */
1037		if (rt == NULL || !(rt->rt_flags & RTF_UP) ||
1038		    ((struct sockaddr_in *)(void *)rt_key(rt))->sin_addr.s_addr ==
1039		    INADDR_ANY || rt->generation_id != route_generation) {
1040			if (tp->t_state >= TCPS_CLOSE_WAIT) {
1041				DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1042					struct tcpcb *, tp, int32_t, TCPS_CLOSING);
1043				tp->t_state = TCPS_CLOSING;
1044			}
1045			goto no_valid_rt;
1046		}
1047
1048		RT_LOCK_ASSERT_HELD(rt);
1049		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
1050			i = tp->t_srtt *
1051			    (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1052			if (rt->rt_rmx.rmx_rtt && i)
1053				/*
1054				 * filter this update to half the old & half
1055				 * the new values, converting scale.
1056				 * See route.h and tcp_var.h for a
1057				 * description of the scaling constants.
1058				 */
1059				rt->rt_rmx.rmx_rtt =
1060				    (rt->rt_rmx.rmx_rtt + i) / 2;
1061			else
1062				rt->rt_rmx.rmx_rtt = i;
1063			tcpstat.tcps_cachedrtt++;
1064		}
1065		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
1066			i = tp->t_rttvar *
1067			    (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1068			if (rt->rt_rmx.rmx_rttvar && i)
1069				rt->rt_rmx.rmx_rttvar =
1070				    (rt->rt_rmx.rmx_rttvar + i) / 2;
1071			else
1072				rt->rt_rmx.rmx_rttvar = i;
1073			tcpstat.tcps_cachedrttvar++;
1074		}
1075		/*
1076		 * The old comment here said:
1077		 * update the pipelimit (ssthresh) if it has been updated
1078		 * already or if a pipesize was specified & the threshhold
1079		 * got below half the pipesize.  I.e., wait for bad news
1080		 * before we start updating, then update on both good
1081		 * and bad news.
1082		 *
1083		 * But we want to save the ssthresh even if no pipesize is
1084		 * specified explicitly in the route, because such
1085		 * connections still have an implicit pipesize specified
1086		 * by the global tcp_sendspace.  In the absence of a reliable
1087		 * way to calculate the pipesize, it will have to do.
1088		 */
1089		i = tp->snd_ssthresh;
1090		if (rt->rt_rmx.rmx_sendpipe != 0)
1091			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
1092		else
1093			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
1094		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
1095		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
1096		    || dosavessthresh) {
1097			/*
1098			 * convert the limit from user data bytes to
1099			 * packets then to packet data bytes.
1100			 */
1101			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
1102			if (i < 2)
1103				i = 2;
1104			i *= (u_int32_t)(tp->t_maxseg +
1105#if INET6
1106				      (isipv6 ? sizeof (struct ip6_hdr) +
1107					       sizeof (struct tcphdr) :
1108#endif
1109				       sizeof (struct tcpiphdr)
1110#if INET6
1111				       )
1112#endif
1113				      );
1114			if (rt->rt_rmx.rmx_ssthresh)
1115				rt->rt_rmx.rmx_ssthresh =
1116				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
1117			else
1118				rt->rt_rmx.rmx_ssthresh = i;
1119			tcpstat.tcps_cachedssthresh++;
1120		}
1121	}
1122
1123	/*
1124	 * Mark route for deletion if no information is cached.
1125	 */
1126	if (rt != NULL && (so->so_flags & SOF_OVERFLOW) && tcp_lq_overflow) {
1127		if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
1128		    rt->rt_rmx.rmx_rtt == 0) {
1129			rt->rt_flags |= RTF_DELCLONE;
1130		}
1131	}
1132
1133no_valid_rt:
1134	if (rt != NULL)
1135		RT_UNLOCK(rt);
1136
1137	/* free the reassembly queue, if any */
1138	(void) tcp_freeq(tp);
1139
1140	tcp_free_sackholes(tp);
1141	if (tp->t_bwmeas != NULL) {
1142		tcp_bwmeas_free(tp);
1143	}
1144
1145	/* Free the packet list */
1146	if (tp->t_pktlist_head != NULL)
1147		m_freem_list(tp->t_pktlist_head);
1148	TCP_PKTLIST_CLEAR(tp);
1149
1150#ifdef __APPLE__
1151	if (so->cached_in_sock_layer)
1152	    inp->inp_saved_ppcb = (caddr_t) tp;
1153#endif
1154	/* Issue a wakeup before detach so that we don't miss
1155	 * a wakeup
1156	 */
1157	sodisconnectwakeup(so);
1158
1159	/*
1160	 * Clean up any LRO state
1161	 */
1162	if (tp->t_flagsext & TF_LRO_OFFLOADED) {
1163		tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
1164			inp->inp_lport,
1165			inp->inp_fport);
1166		tp->t_flagsext &= ~TF_LRO_OFFLOADED;
1167	}
1168
1169#if INET6
1170	if (INP_CHECK_SOCKAF(so, AF_INET6))
1171		in6_pcbdetach(inp);
1172	else
1173#endif /* INET6 */
1174	in_pcbdetach(inp);
1175
1176	/* Call soisdisconnected after detach because it might unlock the socket */
1177	soisdisconnected(so);
1178	tcpstat.tcps_closed++;
1179	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END, tcpstat.tcps_closed,0,0,0,0);
1180	return(NULL);
1181}
1182
1183int
1184tcp_freeq(tp)
1185	struct tcpcb *tp;
1186{
1187
1188	register struct tseg_qent *q;
1189	int rv = 0;
1190
1191	while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
1192		LIST_REMOVE(q, tqe_q);
1193		m_freem(q->tqe_m);
1194		zfree(tcp_reass_zone, q);
1195		tcp_reass_qsize--;
1196		rv = 1;
1197	}
1198	return (rv);
1199}
1200
1201void
1202tcp_drain()
1203{
1204	if (do_tcpdrain)
1205	{
1206		struct inpcb *inpb;
1207		struct tcpcb *tcpb;
1208		struct tseg_qent *te;
1209
1210	/*
1211	 * Walk the tcpbs, if existing, and flush the reassembly queue,
1212	 * if there is one...
1213	 * XXX: The "Net/3" implementation doesn't imply that the TCP
1214	 *      reassembly queue should be flushed, but in a situation
1215	 * 	where we're really low on mbufs, this is potentially
1216	 *  	usefull.
1217	 */
1218		if (!lck_rw_try_lock_exclusive(tcbinfo.mtx)) /* do it next time if the lock is in use */
1219			return;
1220
1221		for (inpb = LIST_FIRST(tcbinfo.listhead); inpb;
1222	    		inpb = LIST_NEXT(inpb, inp_list)) {
1223				if ((tcpb = intotcpcb(inpb))) {
1224					while ((te = LIST_FIRST(&tcpb->t_segq))
1225					       != NULL) {
1226					LIST_REMOVE(te, tqe_q);
1227					m_freem(te->tqe_m);
1228					zfree(tcp_reass_zone, te);
1229					tcp_reass_qsize--;
1230				}
1231			}
1232		}
1233		lck_rw_done(tcbinfo.mtx);
1234
1235	}
1236}
1237
1238/*
1239 * Notify a tcp user of an asynchronous error;
1240 * store error as soft error, but wake up user
1241 * (for now, won't do anything until can select for soft error).
1242 *
1243 * Do not wake up user since there currently is no mechanism for
1244 * reporting soft errors (yet - a kqueue filter may be added).
1245 */
1246static void
1247tcp_notify(inp, error)
1248	struct inpcb *inp;
1249	int error;
1250{
1251	struct tcpcb *tp;
1252
1253	if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD))
1254		return; /* pcb is gone already */
1255
1256	tp = (struct tcpcb *)inp->inp_ppcb;
1257
1258	/*
1259	 * Ignore some errors if we are hooked up.
1260	 * If connection hasn't completed, has retransmitted several times,
1261	 * and receives a second error, give up now.  This is better
1262	 * than waiting a long time to establish a connection that
1263	 * can never complete.
1264	 */
1265	if (tp->t_state == TCPS_ESTABLISHED &&
1266	     (error == EHOSTUNREACH || error == ENETUNREACH ||
1267	      error == EHOSTDOWN)) {
1268		return;
1269	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
1270	    tp->t_softerror)
1271		tcp_drop(tp, error);
1272	else
1273		tp->t_softerror = error;
1274#if 0
1275	wakeup((caddr_t) &so->so_timeo);
1276	sorwakeup(so);
1277	sowwakeup(so);
1278#endif
1279}
1280
1281struct bwmeas*
1282tcp_bwmeas_alloc(struct tcpcb *tp)
1283{
1284	struct bwmeas *elm;
1285	elm = zalloc(tcp_bwmeas_zone);
1286	if (elm == NULL)
1287		return(elm);
1288
1289	bzero(elm, bwmeas_elm_size);
1290	elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE;
1291	elm->bw_maxsizepkts = TCP_BWMEAS_BURST_MAXSIZE;
1292	elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg;
1293	elm->bw_maxsize = elm->bw_maxsizepkts * tp->t_maxseg;
1294	return(elm);
1295}
1296
1297void
1298tcp_bwmeas_free(struct tcpcb* tp)
1299{
1300	zfree(tcp_bwmeas_zone, tp->t_bwmeas);
1301	tp->t_bwmeas = NULL;
1302	tp->t_flagsext &= ~(TF_MEASURESNDBW);
1303}
1304
1305/*
1306 * tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format.
1307 * The otcpcb data structure is passed to user space and must not change.
1308 */
1309static void
1310tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp)
1311{
1312	int i;
1313
1314	otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first;
1315	otp->t_dupacks = tp->t_dupacks;
1316	for (i = 0; i < TCPT_NTIMERS_EXT; i++)
1317		otp->t_timer[i] = tp->t_timer[i];
1318	otp->t_inpcb = (_TCPCB_PTR(struct inpcb *))(uintptr_t)tp->t_inpcb;
1319	otp->t_state = tp->t_state;
1320	otp->t_flags = tp->t_flags;
1321	otp->t_force = tp->t_force;
1322	otp->snd_una = tp->snd_una;
1323	otp->snd_max = tp->snd_max;
1324	otp->snd_nxt = tp->snd_nxt;
1325	otp->snd_up = tp->snd_up;
1326	otp->snd_wl1 = tp->snd_wl1;
1327	otp->snd_wl2 = tp->snd_wl2;
1328	otp->iss = tp->iss;
1329	otp->irs = tp->irs;
1330	otp->rcv_nxt = tp->rcv_nxt;
1331	otp->rcv_adv = tp->rcv_adv;
1332	otp->rcv_wnd = tp->rcv_wnd;
1333	otp->rcv_up = tp->rcv_up;
1334	otp->snd_wnd = tp->snd_wnd;
1335	otp->snd_cwnd = tp->snd_cwnd;
1336	otp->snd_ssthresh = tp->snd_ssthresh;
1337	otp->t_maxopd = tp->t_maxopd;
1338	otp->t_rcvtime = tp->t_rcvtime;
1339	otp->t_starttime = tp->t_starttime;
1340	otp->t_rtttime = tp->t_rtttime;
1341	otp->t_rtseq = tp->t_rtseq;
1342	otp->t_rxtcur = tp->t_rxtcur;
1343	otp->t_maxseg = tp->t_maxseg;
1344	otp->t_srtt = tp->t_srtt;
1345	otp->t_rttvar = tp->t_rttvar;
1346	otp->t_rxtshift = tp->t_rxtshift;
1347	otp->t_rttmin = tp->t_rttmin;
1348	otp->t_rttupdated = tp->t_rttupdated;
1349	otp->max_sndwnd = tp->max_sndwnd;
1350	otp->t_softerror = tp->t_softerror;
1351	otp->t_oobflags = tp->t_oobflags;
1352	otp->t_iobc = tp->t_iobc;
1353	otp->snd_scale = tp->snd_scale;
1354	otp->rcv_scale = tp->rcv_scale;
1355	otp->request_r_scale = tp->request_r_scale;
1356	otp->requested_s_scale = tp->requested_s_scale;
1357	otp->ts_recent = tp->ts_recent;
1358	otp->ts_recent_age = tp->ts_recent_age;
1359	otp->last_ack_sent = tp->last_ack_sent;
1360	otp->cc_send = tp->cc_send;
1361	otp->cc_recv = tp->cc_recv;
1362	otp->snd_recover = tp->snd_recover;
1363	otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1364	otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1365	otp->t_badrxtwin = tp->t_badrxtwin;
1366}
1367
1368static int
1369tcp_pcblist SYSCTL_HANDLER_ARGS
1370{
1371#pragma unused(oidp, arg1, arg2)
1372	int error, i, n;
1373	struct inpcb *inp, **inp_list;
1374	inp_gen_t gencnt;
1375	struct xinpgen xig;
1376	int slot;
1377
1378	/*
1379	 * The process of preparing the TCB list is too time-consuming and
1380	 * resource-intensive to repeat twice on every request.
1381	 */
1382	lck_rw_lock_shared(tcbinfo.mtx);
1383	if (req->oldptr == USER_ADDR_NULL) {
1384		n = tcbinfo.ipi_count;
1385		req->oldidx = 2 * (sizeof xig)
1386			+ (n + n/8) * sizeof(struct xtcpcb);
1387		lck_rw_done(tcbinfo.mtx);
1388		return 0;
1389	}
1390
1391	if (req->newptr != USER_ADDR_NULL) {
1392		lck_rw_done(tcbinfo.mtx);
1393		return EPERM;
1394	}
1395
1396	/*
1397	 * OK, now we're committed to doing something.
1398	 */
1399	gencnt = tcbinfo.ipi_gencnt;
1400	n = tcbinfo.ipi_count;
1401
1402	bzero(&xig, sizeof(xig));
1403	xig.xig_len = sizeof xig;
1404	xig.xig_count = n;
1405	xig.xig_gen = gencnt;
1406	xig.xig_sogen = so_gencnt;
1407	error = SYSCTL_OUT(req, &xig, sizeof xig);
1408	if (error) {
1409		lck_rw_done(tcbinfo.mtx);
1410		return error;
1411	}
1412	/*
1413	 * We are done if there is no pcb
1414	 */
1415	if (n == 0) {
1416		lck_rw_done(tcbinfo.mtx);
1417		return 0;
1418	}
1419
1420	inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
1421	if (inp_list == 0) {
1422		lck_rw_done(tcbinfo.mtx);
1423		return ENOMEM;
1424	}
1425
1426	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
1427	     inp = LIST_NEXT(inp, inp_list)) {
1428#ifdef __APPLE__
1429		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
1430#else
1431		if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
1432#endif
1433			inp_list[i++] = inp;
1434	}
1435
1436	for (slot = 0; slot < N_TIME_WAIT_SLOTS; slot++) {
1437		struct inpcb *inpnxt;
1438
1439		for (inp = time_wait_slots[slot].lh_first; inp && i < n; inp = inpnxt) {
1440			inpnxt = inp->inp_list.le_next;
1441			if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
1442				inp_list[i++] = inp;
1443		}
1444	}
1445
1446	n = i;
1447
1448	error = 0;
1449	for (i = 0; i < n; i++) {
1450		inp = inp_list[i];
1451		if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
1452			struct xtcpcb xt;
1453			caddr_t inp_ppcb;
1454
1455			bzero(&xt, sizeof(xt));
1456			xt.xt_len = sizeof xt;
1457			/* XXX should avoid extra copy */
1458			inpcb_to_compat(inp, &xt.xt_inp);
1459			inp_ppcb = inp->inp_ppcb;
1460			if (inp_ppcb != NULL) {
1461				tcpcb_to_otcpcb((struct tcpcb *)(void *)inp_ppcb,
1462				    &xt.xt_tp);
1463			} else {
1464				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
1465			}
1466			if (inp->inp_socket)
1467				sotoxsocket(inp->inp_socket, &xt.xt_socket);
1468			error = SYSCTL_OUT(req, &xt, sizeof xt);
1469		}
1470	}
1471	if (!error) {
1472		/*
1473		 * Give the user an updated idea of our state.
1474		 * If the generation differs from what we told
1475		 * them before, they know that something happened
1476		 * while we were processing this request, and it
1477		 * might be necessary to retry.
1478		 */
1479		bzero(&xig, sizeof(xig));
1480		xig.xig_len = sizeof xig;
1481		xig.xig_gen = tcbinfo.ipi_gencnt;
1482		xig.xig_sogen = so_gencnt;
1483		xig.xig_count = tcbinfo.ipi_count;
1484		error = SYSCTL_OUT(req, &xig, sizeof xig);
1485	}
1486	FREE(inp_list, M_TEMP);
1487	lck_rw_done(tcbinfo.mtx);
1488	return error;
1489}
1490
1491SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1492	    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
1493
1494#if !CONFIG_EMBEDDED
1495
1496static void
1497tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp)
1498{
1499        int i;
1500
1501        otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first;
1502        otp->t_dupacks = tp->t_dupacks;
1503        for (i = 0; i < TCPT_NTIMERS_EXT; i++)
1504                otp->t_timer[i] = tp->t_timer[i];
1505        otp->t_state = tp->t_state;
1506        otp->t_flags = tp->t_flags;
1507        otp->t_force = tp->t_force;
1508        otp->snd_una = tp->snd_una;
1509        otp->snd_max = tp->snd_max;
1510        otp->snd_nxt = tp->snd_nxt;
1511        otp->snd_up = tp->snd_up;
1512        otp->snd_wl1 = tp->snd_wl1;
1513        otp->snd_wl2 = tp->snd_wl2;
1514        otp->iss = tp->iss;
1515        otp->irs = tp->irs;
1516        otp->rcv_nxt = tp->rcv_nxt;
1517        otp->rcv_adv = tp->rcv_adv;
1518        otp->rcv_wnd = tp->rcv_wnd;
1519        otp->rcv_up = tp->rcv_up;
1520        otp->snd_wnd = tp->snd_wnd;
1521        otp->snd_cwnd = tp->snd_cwnd;
1522        otp->snd_ssthresh = tp->snd_ssthresh;
1523        otp->t_maxopd = tp->t_maxopd;
1524        otp->t_rcvtime = tp->t_rcvtime;
1525        otp->t_starttime = tp->t_starttime;
1526        otp->t_rtttime = tp->t_rtttime;
1527        otp->t_rtseq = tp->t_rtseq;
1528        otp->t_rxtcur = tp->t_rxtcur;
1529        otp->t_maxseg = tp->t_maxseg;
1530        otp->t_srtt = tp->t_srtt;
1531        otp->t_rttvar = tp->t_rttvar;
1532        otp->t_rxtshift = tp->t_rxtshift;
1533        otp->t_rttmin = tp->t_rttmin;
1534        otp->t_rttupdated = tp->t_rttupdated;
1535        otp->max_sndwnd = tp->max_sndwnd;
1536        otp->t_softerror = tp->t_softerror;
1537        otp->t_oobflags = tp->t_oobflags;
1538        otp->t_iobc = tp->t_iobc;
1539        otp->snd_scale = tp->snd_scale;
1540        otp->rcv_scale = tp->rcv_scale;
1541        otp->request_r_scale = tp->request_r_scale;
1542        otp->requested_s_scale = tp->requested_s_scale;
1543        otp->ts_recent = tp->ts_recent;
1544        otp->ts_recent_age = tp->ts_recent_age;
1545        otp->last_ack_sent = tp->last_ack_sent;
1546        otp->cc_send = tp->cc_send;
1547        otp->cc_recv = tp->cc_recv;
1548        otp->snd_recover = tp->snd_recover;
1549        otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1550        otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1551        otp->t_badrxtwin = tp->t_badrxtwin;
1552}
1553
1554
1555static int
1556tcp_pcblist64 SYSCTL_HANDLER_ARGS
1557{
1558#pragma unused(oidp, arg1, arg2)
1559        int error, i, n;
1560        struct inpcb *inp, **inp_list;
1561        inp_gen_t gencnt;
1562        struct xinpgen xig;
1563        int slot;
1564
1565        /*
1566         * The process of preparing the TCB list is too time-consuming and
1567         * resource-intensive to repeat twice on every request.
1568         */
1569        lck_rw_lock_shared(tcbinfo.mtx);
1570        if (req->oldptr == USER_ADDR_NULL) {
1571                n = tcbinfo.ipi_count;
1572                req->oldidx = 2 * (sizeof xig)
1573                        + (n + n/8) * sizeof(struct xtcpcb64);
1574                lck_rw_done(tcbinfo.mtx);
1575                return 0;
1576        }
1577
1578        if (req->newptr != USER_ADDR_NULL) {
1579                lck_rw_done(tcbinfo.mtx);
1580                return EPERM;
1581        }
1582
1583        /*
1584         * OK, now we're committed to doing something.
1585         */
1586        gencnt = tcbinfo.ipi_gencnt;
1587        n = tcbinfo.ipi_count;
1588
1589        bzero(&xig, sizeof(xig));
1590        xig.xig_len = sizeof xig;
1591        xig.xig_count = n;
1592        xig.xig_gen = gencnt;
1593        xig.xig_sogen = so_gencnt;
1594        error = SYSCTL_OUT(req, &xig, sizeof xig);
1595        if (error) {
1596                lck_rw_done(tcbinfo.mtx);
1597                return error;
1598        }
1599        /*
1600         * We are done if there is no pcb
1601         */
1602        if (n == 0) {
1603                lck_rw_done(tcbinfo.mtx);
1604                return 0;
1605        }
1606
1607        inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK);
1608        if (inp_list == 0) {
1609                lck_rw_done(tcbinfo.mtx);
1610                return ENOMEM;
1611        }
1612
1613        for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
1614             inp = LIST_NEXT(inp, inp_list)) {
1615#ifdef __APPLE__
1616                if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
1617#else
1618                if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
1619#endif
1620                        inp_list[i++] = inp;
1621        }
1622
1623        for (slot = 0; slot < N_TIME_WAIT_SLOTS; slot++) {
1624                struct inpcb *inpnxt;
1625
1626                for (inp = time_wait_slots[slot].lh_first; inp && i < n; inp = inpnxt) {
1627                        inpnxt = inp->inp_list.le_next;
1628                        if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD)
1629                                inp_list[i++] = inp;
1630                }
1631        }
1632
1633        n = i;
1634
1635        error = 0;
1636        for (i = 0; i < n; i++) {
1637                inp = inp_list[i];
1638                if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) {
1639					struct xtcpcb64 xt;
1640
1641					bzero(&xt, sizeof(xt));
1642					xt.xt_len = sizeof xt;
1643					inpcb_to_xinpcb64(inp, &xt.xt_inpcb);
1644					xt.xt_inpcb.inp_ppcb = (u_int64_t)(uintptr_t)inp->inp_ppcb;
1645					if (inp->inp_ppcb != NULL)
1646						tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, &xt);
1647					if (inp->inp_socket)
1648						sotoxsocket64(inp->inp_socket, &xt.xt_inpcb.xi_socket);
1649					error = SYSCTL_OUT(req, &xt, sizeof xt);
1650                }
1651        }
1652        if (!error) {
1653			/*
1654			 * Give the user an updated idea of our state.
1655			 * If the generation differs from what we told
1656			 * them before, they know that something happened
1657			 * while we were processing this request, and it
1658			 * might be necessary to retry.
1659			 */
1660			bzero(&xig, sizeof(xig));
1661			xig.xig_len = sizeof xig;
1662			xig.xig_gen = tcbinfo.ipi_gencnt;
1663			xig.xig_sogen = so_gencnt;
1664			xig.xig_count = tcbinfo.ipi_count;
1665			error = SYSCTL_OUT(req, &xig, sizeof xig);
1666        }
1667        FREE(inp_list, M_TEMP);
1668        lck_rw_done(tcbinfo.mtx);
1669        return error;
1670}
1671
1672SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1673            tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections");
1674
1675#endif /* !CONFIG_EMBEDDED */
1676
1677static int
1678tcp_pcblist_n SYSCTL_HANDLER_ARGS
1679{
1680#pragma unused(oidp, arg1, arg2)
1681	int error = 0;
1682
1683	error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo);
1684
1685	return error;
1686}
1687
1688
1689SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1690            tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
1691
1692
1693__private_extern__ void
1694tcp_get_ports_used(unsigned int ifindex, uint8_t *bitfield)
1695{
1696	inpcb_get_ports_used(ifindex, bitfield, &tcbinfo);
1697}
1698
1699__private_extern__ uint32_t
1700tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
1701{
1702	return inpcb_count_opportunistic(ifindex, &tcbinfo, flags);
1703}
1704
1705void
1706tcp_ctlinput(cmd, sa, vip)
1707	int cmd;
1708	struct sockaddr *sa;
1709	void *vip;
1710{
1711	tcp_seq icmp_tcp_seq;
1712	struct ip *ip = vip;
1713	struct in_addr faddr;
1714	struct inpcb *inp;
1715	struct tcpcb *tp;
1716
1717	void (*notify)(struct inpcb *, int) = tcp_notify;
1718
1719	faddr = ((struct sockaddr_in *)(void *)sa)->sin_addr;
1720	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1721		return;
1722
1723	if (cmd == PRC_MSGSIZE)
1724		notify = tcp_mtudisc;
1725	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
1726		cmd == PRC_UNREACH_PORT) && ip)
1727		notify = tcp_drop_syn_sent;
1728	else if (PRC_IS_REDIRECT(cmd)) {
1729		ip = 0;
1730		notify = in_rtchange;
1731	} else if (cmd == PRC_HOSTDEAD)
1732		ip = 0;
1733	/* Source quench is deprecated */
1734	else if (cmd == PRC_QUENCH)
1735		return;
1736	else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
1737		return;
1738	if (ip) {
1739		struct tcphdr th;
1740		struct icmp *icp;
1741
1742		icp = (struct icmp *)(void *)
1743		    ((caddr_t)ip - offsetof(struct icmp, icmp_ip));
1744		bcopy(((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)),
1745		    &th, sizeof (th));
1746		inp = in_pcblookup_hash(&tcbinfo, faddr, th.th_dport,
1747		    ip->ip_src, th.th_sport, 0, NULL);
1748		if (inp != NULL && inp->inp_socket != NULL) {
1749			tcp_lock(inp->inp_socket, 1, 0);
1750			if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1751				tcp_unlock(inp->inp_socket, 1, 0);
1752				return;
1753			}
1754			icmp_tcp_seq = htonl(th.th_seq);
1755			tp = intotcpcb(inp);
1756			if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
1757			    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
1758				if (cmd == PRC_MSGSIZE) {
1759
1760					/*
1761				  	 * MTU discovery:
1762				 	 * If we got a needfrag and there is a host route to the
1763				 	 * original destination, and the MTU is not locked, then
1764				 	 * set the MTU in the route to the suggested new value
1765				 	 * (if given) and then notify as usual.  The ULPs will
1766				 	 * notice that the MTU has changed and adapt accordingly.
1767				 	 * If no new MTU was suggested, then we guess a new one
1768				 	 * less than the current value.  If the new MTU is
1769				 	 * unreasonably small (defined by sysctl tcp_minmss), then
1770				 	 * we reset the MTU to the interface value and enable the
1771				 	 * lock bit, indicating that we are no longer doing MTU
1772				 	 * discovery.
1773				 	 */
1774					struct rtentry *rt;
1775					int mtu;
1776					struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET,
1777										0 , { 0 }, { 0,0,0,0,0,0,0,0 } };
1778					icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
1779
1780					rt = rtalloc1((struct sockaddr *)&icmpsrc, 0,
1781					    RTF_CLONING | RTF_PRCLONING);
1782					if (rt != NULL) {
1783						RT_LOCK(rt);
1784						if ((rt->rt_flags & RTF_HOST) &&
1785						    !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
1786							mtu = ntohs(icp->icmp_nextmtu);
1787							if (!mtu)
1788								mtu = ip_next_mtu(rt->rt_rmx.
1789								    rmx_mtu, 1);
1790#if DEBUG_MTUDISC
1791							printf("MTU for %s reduced to %d\n",
1792							    inet_ntop(AF_INET,
1793							    &icmpsrc.sin_addr, ipv4str,
1794							    sizeof (ipv4str)), mtu);
1795#endif
1796							if (mtu < max(296, (tcp_minmss +
1797							    sizeof (struct tcpiphdr)))) {
1798								/* rt->rt_rmx.rmx_mtu =
1799									rt->rt_ifp->if_mtu; */
1800								rt->rt_rmx.rmx_locks |= RTV_MTU;
1801							} else if (rt->rt_rmx.rmx_mtu > mtu) {
1802								rt->rt_rmx.rmx_mtu = mtu;
1803							}
1804						}
1805						RT_UNLOCK(rt);
1806						rtfree(rt);
1807					}
1808				}
1809
1810				(*notify)(inp, inetctlerrmap[cmd]);
1811			}
1812			tcp_unlock(inp->inp_socket, 1, 0);
1813		}
1814	} else
1815		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
1816}
1817
1818#if INET6
1819void
1820tcp6_ctlinput(cmd, sa, d)
1821	int cmd;
1822	struct sockaddr *sa;
1823	void *d;
1824{
1825	struct tcphdr th;
1826	void (*notify)(struct inpcb *, int) = tcp_notify;
1827	struct ip6_hdr *ip6;
1828	struct mbuf *m;
1829	struct ip6ctlparam *ip6cp = NULL;
1830	const struct sockaddr_in6 *sa6_src = NULL;
1831	int off;
1832	struct tcp_portonly {
1833		u_int16_t th_sport;
1834		u_int16_t th_dport;
1835	} *thp;
1836
1837	if (sa->sa_family != AF_INET6 ||
1838	    sa->sa_len != sizeof(struct sockaddr_in6))
1839		return;
1840
1841	if (cmd == PRC_MSGSIZE)
1842		notify = tcp_mtudisc;
1843	else if (!PRC_IS_REDIRECT(cmd) &&
1844		 ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
1845		return;
1846	/* Source quench is deprecated */
1847	else if (cmd == PRC_QUENCH)
1848		return;
1849
1850	/* if the parameter is from icmp6, decode it. */
1851	if (d != NULL) {
1852		ip6cp = (struct ip6ctlparam *)d;
1853		m = ip6cp->ip6c_m;
1854		ip6 = ip6cp->ip6c_ip6;
1855		off = ip6cp->ip6c_off;
1856		sa6_src = ip6cp->ip6c_src;
1857	} else {
1858		m = NULL;
1859		ip6 = NULL;
1860		off = 0;	/* fool gcc */
1861		sa6_src = &sa6_any;
1862	}
1863
1864	if (ip6) {
1865		/*
1866		 * XXX: We assume that when IPV6 is non NULL,
1867		 * M and OFF are valid.
1868		 */
1869
1870		/* check if we can safely examine src and dst ports */
1871		if (m->m_pkthdr.len < off + sizeof(*thp))
1872			return;
1873
1874		bzero(&th, sizeof(th));
1875		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
1876
1877		in6_pcbnotify(&tcbinfo, sa, th.th_dport,
1878		    (struct sockaddr *)ip6cp->ip6c_src,
1879		    th.th_sport, cmd, NULL, notify);
1880	} else {
1881		in6_pcbnotify(&tcbinfo, sa, 0,
1882		    (struct sockaddr *)(size_t)sa6_src, 0, cmd, NULL, notify);
1883	}
1884}
1885#endif /* INET6 */
1886
1887
1888/*
1889 * Following is where TCP initial sequence number generation occurs.
1890 *
1891 * There are two places where we must use initial sequence numbers:
1892 * 1.  In SYN-ACK packets.
1893 * 2.  In SYN packets.
1894 *
1895 * The ISNs in SYN-ACK packets have no monotonicity requirement,
1896 * and should be as unpredictable as possible to avoid the possibility
1897 * of spoofing and/or connection hijacking.  To satisfy this
1898 * requirement, SYN-ACK ISNs are generated via the arc4random()
1899 * function.  If exact RFC 1948 compliance is requested via sysctl,
1900 * these ISNs will be generated just like those in SYN packets.
1901 *
1902 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
1903 * depends on this property.  In addition, these ISNs should be
1904 * unguessable so as to prevent connection hijacking.  To satisfy
1905 * the requirements of this situation, the algorithm outlined in
1906 * RFC 1948 is used to generate sequence numbers.
1907 *
1908 * For more information on the theory of operation, please see
1909 * RFC 1948.
1910 *
1911 * Implementation details:
1912 *
1913 * Time is based off the system timer, and is corrected so that it
1914 * increases by one megabyte per second.  This allows for proper
1915 * recycling on high speed LANs while still leaving over an hour
1916 * before rollover.
1917 *
1918 * Two sysctls control the generation of ISNs:
1919 *
1920 * net.inet.tcp.isn_reseed_interval controls the number of seconds
1921 * between seeding of isn_secret.  This is normally set to zero,
1922 * as reseeding should not be necessary.
1923 *
1924 * net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed
1925 * strictly.  When strict compliance is requested, reseeding is
1926 * disabled and SYN-ACKs will be generated in the same manner as
1927 * SYNs.  Strict mode is disabled by default.
1928 *
1929 */
1930
1931#define ISN_BYTES_PER_SECOND 1048576
1932
1933tcp_seq
1934tcp_new_isn(tp)
1935	struct tcpcb *tp;
1936{
1937	u_int32_t md5_buffer[4];
1938	tcp_seq new_isn;
1939	struct timeval timenow;
1940	u_char isn_secret[32];
1941	int isn_last_reseed = 0;
1942	MD5_CTX isn_ctx;
1943
1944	/* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */
1945	if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT))
1946	   && tcp_strict_rfc1948 == 0)
1947#ifdef __APPLE__
1948		return random();
1949#else
1950		return arc4random();
1951#endif
1952	getmicrotime(&timenow);
1953
1954	/* Seed if this is the first use, reseed if requested. */
1955	if ((isn_last_reseed == 0) ||
1956	    ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) &&
1957	     (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
1958		< (u_int)timenow.tv_sec))) {
1959#ifdef __APPLE__
1960		read_random(&isn_secret, sizeof(isn_secret));
1961#else
1962		read_random_unlimited(&isn_secret, sizeof(isn_secret));
1963#endif
1964		isn_last_reseed = timenow.tv_sec;
1965	}
1966
1967	/* Compute the md5 hash and return the ISN. */
1968	MD5Init(&isn_ctx);
1969	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
1970	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
1971#if INET6
1972	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
1973		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
1974			  sizeof(struct in6_addr));
1975		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
1976			  sizeof(struct in6_addr));
1977	} else
1978#endif
1979	{
1980		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
1981			  sizeof(struct in_addr));
1982		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
1983			  sizeof(struct in_addr));
1984	}
1985	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
1986	MD5Final((u_char *) &md5_buffer, &isn_ctx);
1987	new_isn = (tcp_seq) md5_buffer[0];
1988	new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
1989	return new_isn;
1990}
1991
1992
1993/*
1994 * When a specific ICMP unreachable message is received and the
1995 * connection state is SYN-SENT, drop the connection.  This behavior
1996 * is controlled by the icmp_may_rst sysctl.
1997 */
1998void
1999tcp_drop_syn_sent(inp, errno)
2000	struct inpcb *inp;
2001	int errno;
2002{
2003	struct tcpcb *tp = intotcpcb(inp);
2004
2005	if (tp && tp->t_state == TCPS_SYN_SENT)
2006		tcp_drop(tp, errno);
2007}
2008
2009/*
2010 * When `need fragmentation' ICMP is received, update our idea of the MSS
2011 * based on the new value in the route.  Also nudge TCP to send something,
2012 * since we know the packet we just sent was dropped.
2013 * This duplicates some code in the tcp_mss() function in tcp_input.c.
2014 */
2015void
2016tcp_mtudisc(
2017	struct inpcb *inp,
2018	__unused int errno
2019)
2020{
2021	struct tcpcb *tp = intotcpcb(inp);
2022	struct rtentry *rt;
2023	struct rmxp_tao *taop;
2024	struct socket *so = inp->inp_socket;
2025	int offered;
2026	int mss;
2027#if INET6
2028	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
2029#endif /* INET6 */
2030
2031	if (tp) {
2032#if INET6
2033		if (isipv6)
2034			rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2035		else
2036#endif /* INET6 */
2037		rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2038		if (!rt || !rt->rt_rmx.rmx_mtu) {
2039			tp->t_maxopd = tp->t_maxseg =
2040#if INET6
2041				isipv6 ? tcp_v6mssdflt :
2042#endif /* INET6 */
2043				tcp_mssdflt;
2044
2045			/* Route locked during lookup above */
2046			if (rt != NULL)
2047				RT_UNLOCK(rt);
2048			return;
2049		}
2050		taop = rmx_taop(rt->rt_rmx);
2051		offered = taop->tao_mssopt;
2052		mss = rt->rt_rmx.rmx_mtu -
2053#if INET6
2054			(isipv6 ?
2055			 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
2056#endif /* INET6 */
2057			 sizeof(struct tcpiphdr)
2058#if INET6
2059			 )
2060#endif /* INET6 */
2061			;
2062
2063		/* Route locked during lookup above */
2064		RT_UNLOCK(rt);
2065
2066		if (offered)
2067			mss = min(mss, offered);
2068		/*
2069		 * XXX - The above conditional probably violates the TCP
2070		 * spec.  The problem is that, since we don't know the
2071		 * other end's MSS, we are supposed to use a conservative
2072		 * default.  But, if we do that, then MTU discovery will
2073		 * never actually take place, because the conservative
2074		 * default is much less than the MTUs typically seen
2075		 * on the Internet today.  For the moment, we'll sweep
2076		 * this under the carpet.
2077		 *
2078		 * The conservative default might not actually be a problem
2079		 * if the only case this occurs is when sending an initial
2080		 * SYN with options and data to a host we've never talked
2081		 * to before.  Then, they will reply with an MSS value which
2082		 * will get recorded and the new parameters should get
2083		 * recomputed.  For Further Study.
2084		 */
2085		if (tp->t_maxopd <= mss)
2086			return;
2087		tp->t_maxopd = mss;
2088
2089		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2090		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
2091			mss -= TCPOLEN_TSTAMP_APPA;
2092
2093		if (so->so_snd.sb_hiwat < mss)
2094			mss = so->so_snd.sb_hiwat;
2095
2096		tp->t_maxseg = mss;
2097
2098		/*
2099		 * Reset the slow-start flight size as it may depends on the new MSS
2100		 */
2101		if (CC_ALGO(tp)->cwnd_init != NULL)
2102			CC_ALGO(tp)->cwnd_init(tp);
2103		tcpstat.tcps_mturesent++;
2104		tp->t_rtttime = 0;
2105		tp->snd_nxt = tp->snd_una;
2106		tcp_output(tp);
2107	}
2108}
2109
2110/*
2111 * Look-up the routing entry to the peer of this inpcb.  If no route
2112 * is found and it cannot be allocated the return NULL.  This routine
2113 * is called by TCP routines that access the rmx structure and by tcp_mss
2114 * to get the interface MTU.  If a route is found, this routine will
2115 * hold the rtentry lock; the caller is responsible for unlocking.
2116 */
2117struct rtentry *
2118tcp_rtlookup(inp, input_ifscope)
2119	struct inpcb *inp;
2120	unsigned int input_ifscope;
2121{
2122	struct route *ro;
2123	struct rtentry *rt;
2124	struct tcpcb *tp;
2125
2126	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
2127
2128	ro = &inp->inp_route;
2129	if ((rt = ro->ro_rt) != NULL)
2130		RT_LOCK(rt);
2131
2132	if (rt == NULL || !(rt->rt_flags & RTF_UP) ||
2133	    rt->generation_id != route_generation) {
2134		/* No route yet, so try to acquire one */
2135		if (inp->inp_faddr.s_addr != INADDR_ANY) {
2136			unsigned int ifscope;
2137
2138			ro->ro_dst.sa_family = AF_INET;
2139			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
2140			((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr =
2141				inp->inp_faddr;
2142
2143			/*
2144			 * If the socket was bound to an interface, then
2145			 * the bound-to-interface takes precedence over
2146			 * the inbound interface passed in by the caller
2147			 * (if we get here as part of the output path then
2148			 * input_ifscope is IFSCOPE_NONE).
2149			 */
2150			ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2151			    inp->inp_boundifp->if_index : input_ifscope;
2152
2153			if (rt != NULL)
2154				RT_UNLOCK(rt);
2155			rtalloc_scoped(ro, ifscope);
2156			if ((rt = ro->ro_rt) != NULL)
2157				RT_LOCK(rt);
2158		}
2159	}
2160
2161	/*
2162	 * Update MTU discovery determination. Don't do it if:
2163	 *	1) it is disabled via the sysctl
2164	 *	2) the route isn't up
2165	 *	3) the MTU is locked (if it is, then discovery has been
2166	 *	   disabled)
2167	 */
2168
2169	 tp = intotcpcb(inp);
2170
2171	if (!path_mtu_discovery || ((rt != NULL) &&
2172	    (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
2173		tp->t_flags &= ~TF_PMTUD;
2174	else
2175		tp->t_flags |= TF_PMTUD;
2176
2177#if CONFIG_IFEF_NOWINDOWSCALE
2178	if (tcp_obey_ifef_nowindowscale &&
2179	    tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL &&
2180	    (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) {
2181		/* Window scaling is enabled on this interface */
2182		tp->t_flags &= ~TF_REQ_SCALE;
2183	}
2184#endif
2185
2186	if (rt != NULL && rt->rt_ifp != NULL) {
2187		somultipages(inp->inp_socket,
2188		    (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
2189		tcp_set_tso(tp, rt->rt_ifp);
2190	}
2191
2192	/*
2193	 * Caller needs to call RT_UNLOCK(rt).
2194	 */
2195	return rt;
2196}
2197
2198#if INET6
2199struct rtentry *
2200tcp_rtlookup6(inp, input_ifscope)
2201	struct inpcb *inp;
2202	unsigned int input_ifscope;
2203{
2204	struct route_in6 *ro6;
2205	struct rtentry *rt;
2206	struct tcpcb *tp;
2207
2208	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
2209
2210	ro6 = &inp->in6p_route;
2211	if ((rt = ro6->ro_rt) != NULL)
2212		RT_LOCK(rt);
2213
2214	if (rt == NULL || !(rt->rt_flags & RTF_UP) ||
2215	    rt->generation_id != route_generation) {
2216		/* No route yet, so try to acquire one */
2217		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
2218			struct sockaddr_in6 *dst6;
2219			unsigned int ifscope;
2220
2221			dst6 = (struct sockaddr_in6 *)&ro6->ro_dst;
2222			dst6->sin6_family = AF_INET6;
2223			dst6->sin6_len = sizeof(*dst6);
2224			dst6->sin6_addr = inp->in6p_faddr;
2225
2226			/*
2227			 * If the socket was bound to an interface, then
2228			 * the bound-to-interface takes precedence over
2229			 * the inbound interface passed in by the caller
2230			 * (if we get here as part of the output path then
2231			 * input_ifscope is IFSCOPE_NONE).
2232			 */
2233			ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2234			    inp->inp_boundifp->if_index : input_ifscope;
2235
2236			if (rt != NULL)
2237				RT_UNLOCK(rt);
2238			rtalloc_scoped((struct route *)ro6, ifscope);
2239			if ((rt = ro6->ro_rt) != NULL)
2240				RT_LOCK(rt);
2241		}
2242	}
2243	/*
2244	 * Update path MTU Discovery determination
2245	 * while looking up the route:
2246	 *  1) we have a valid route to the destination
2247	 *  2) the MTU is not locked (if it is, then discovery has been
2248	 *    disabled)
2249	 */
2250
2251
2252	 tp = intotcpcb(inp);
2253
2254	/*
2255	 * Update MTU discovery determination. Don't do it if:
2256	 *	1) it is disabled via the sysctl
2257	 *	2) the route isn't up
2258	 *	3) the MTU is locked (if it is, then discovery has been
2259	 *	   disabled)
2260	 */
2261
2262	if (!path_mtu_discovery || ((rt != NULL) &&
2263	    (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
2264		tp->t_flags &= ~TF_PMTUD;
2265	else
2266		tp->t_flags |= TF_PMTUD;
2267
2268#if CONFIG_IFEF_NOWINDOWSCALE
2269	if (tcp_obey_ifef_nowindowscale &&
2270	    tp->t_state == TCPS_SYN_SENT && rt != NULL && rt->rt_ifp != NULL &&
2271	    (rt->rt_ifp->if_eflags & IFEF_NOWINDOWSCALE)) {
2272		/* Window scaling is not enabled on this interface */
2273		tp->t_flags &= ~TF_REQ_SCALE;
2274	}
2275#endif
2276
2277	if (rt != NULL && rt->rt_ifp != NULL) {
2278		somultipages(inp->inp_socket,
2279		    (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
2280		tcp_set_tso(tp, rt->rt_ifp);
2281	}
2282
2283	/*
2284	 * Caller needs to call RT_UNLOCK(rt).
2285	 */
2286	return rt;
2287}
2288#endif /* INET6 */
2289
2290#if IPSEC
2291/* compute ESP/AH header size for TCP, including outer IP header. */
2292size_t
2293ipsec_hdrsiz_tcp(tp)
2294	struct tcpcb *tp;
2295{
2296	struct inpcb *inp;
2297	struct mbuf *m;
2298	size_t hdrsiz;
2299	struct ip *ip;
2300#if INET6
2301	struct ip6_hdr *ip6 = NULL;
2302#endif /* INET6 */
2303	struct tcphdr *th;
2304
2305	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
2306		return 0;
2307	MGETHDR(m, M_DONTWAIT, MT_DATA);	/* MAC-OK */
2308	if (!m)
2309		return 0;
2310
2311#if INET6
2312	if ((inp->inp_vflag & INP_IPV6) != 0) {
2313		ip6 = mtod(m, struct ip6_hdr *);
2314		th = (struct tcphdr *)(void *)(ip6 + 1);
2315		m->m_pkthdr.len = m->m_len =
2316			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
2317		tcp_fillheaders(tp, ip6, th);
2318		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
2319	} else
2320#endif /* INET6 */
2321      {
2322	ip = mtod(m, struct ip *);
2323	th = (struct tcphdr *)(ip + 1);
2324	m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
2325	tcp_fillheaders(tp, ip, th);
2326	hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
2327      }
2328	m_free(m);
2329	return hdrsiz;
2330}
2331#endif /*IPSEC*/
2332
2333/*
2334 * Return a pointer to the cached information about the remote host.
2335 * The cached information is stored in the protocol specific part of
2336 * the route metrics.
2337 */
2338struct rmxp_tao *
2339tcp_gettaocache(inp)
2340	struct inpcb *inp;
2341{
2342	struct rtentry *rt;
2343	struct rmxp_tao *taop;
2344
2345#if INET6
2346	if ((inp->inp_vflag & INP_IPV6) != 0)
2347		rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2348	else
2349#endif /* INET6 */
2350	rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2351
2352	/* Make sure this is a host route and is up. */
2353	if (rt == NULL ||
2354	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) {
2355		/* Route locked during lookup above */
2356		if (rt != NULL)
2357			RT_UNLOCK(rt);
2358		return NULL;
2359	}
2360
2361	taop = rmx_taop(rt->rt_rmx);
2362	/* Route locked during lookup above */
2363	RT_UNLOCK(rt);
2364	return (taop);
2365}
2366
2367/*
2368 * Clear all the TAO cache entries, called from tcp_init.
2369 *
2370 * XXX
2371 * This routine is just an empty one, because we assume that the routing
2372 * routing tables are initialized at the same time when TCP, so there is
2373 * nothing in the cache left over.
2374 */
2375static void
2376tcp_cleartaocache()
2377{
2378}
2379
2380int
2381tcp_lock(struct socket *so, int refcount, void *lr)
2382{
2383	void *lr_saved;
2384
2385	if (lr == NULL)
2386		lr_saved = __builtin_return_address(0);
2387	else
2388		lr_saved = lr;
2389
2390	if (so->so_pcb != NULL) {
2391		lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
2392	} else  {
2393		panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s\n",
2394		    so, lr_saved, solockhistory_nr(so));
2395		/* NOTREACHED */
2396	}
2397
2398	if (so->so_usecount < 0) {
2399		panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n",
2400		    so, so->so_pcb, lr_saved, so->so_usecount, solockhistory_nr(so));
2401		/* NOTREACHED */
2402	}
2403	if (refcount)
2404		so->so_usecount++;
2405	so->lock_lr[so->next_lock_lr] = lr_saved;
2406	so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
2407	return (0);
2408}
2409
2410int
2411tcp_unlock(struct socket *so, int refcount, void *lr)
2412{
2413	void *lr_saved;
2414
2415	if (lr == NULL)
2416		lr_saved = __builtin_return_address(0);
2417	else
2418		lr_saved = lr;
2419
2420#ifdef MORE_TCPLOCK_DEBUG
2421	printf("tcp_unlock: so=%p sopcb=%p lock=%p ref=%x lr=%p\n",
2422	    so, so->so_pcb, &((struct inpcb *)so->so_pcb)->inpcb_mtx,
2423	    so->so_usecount, lr_saved);
2424#endif
2425	if (refcount)
2426		so->so_usecount--;
2427
2428	if (so->so_usecount < 0) {
2429		panic("tcp_unlock: so=%p usecount=%x lrh= %s\n",
2430		    so, so->so_usecount, solockhistory_nr(so));
2431		/* NOTREACHED */
2432	}
2433	if (so->so_pcb == NULL) {
2434		panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s\n",
2435		    so, so->so_usecount, lr_saved, solockhistory_nr(so));
2436		/* NOTREACHED */
2437	} else {
2438		lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2439		    LCK_MTX_ASSERT_OWNED);
2440		so->unlock_lr[so->next_unlock_lr] = lr_saved;
2441		so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
2442		lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
2443	}
2444	return (0);
2445}
2446
2447lck_mtx_t *
2448tcp_getlock(
2449	struct socket *so,
2450	__unused int locktype)
2451{
2452	struct inpcb *inp = sotoinpcb(so);
2453
2454	if (so->so_pcb)  {
2455		if (so->so_usecount < 0)
2456			panic("tcp_getlock: so=%p usecount=%x lrh= %s\n",
2457			    so, so->so_usecount, solockhistory_nr(so));
2458		return(&inp->inpcb_mtx);
2459	}
2460	else {
2461		panic("tcp_getlock: so=%p NULL so_pcb %s\n",
2462		    so, solockhistory_nr(so));
2463		return (so->so_proto->pr_domain->dom_mtx);
2464	}
2465}
2466
2467/* Determine if we can grow the recieve socket buffer to avoid sending
2468 * a zero window update to the peer. We allow even socket buffers that
2469 * have fixed size (set by the application) to grow if the resource
2470 * constraints are met. They will also be trimmed after the application
2471 * reads data.
2472 */
2473static void
2474tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb) {
2475	u_int32_t rcvbufinc = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
2476	if (tcp_do_autorcvbuf == 1 &&
2477		tcp_cansbgrow(sb) &&
2478		(tp->t_flags & TF_SLOWLINK) == 0 &&
2479		(sb->sb_hiwat - sb->sb_cc) < rcvbufinc &&
2480		(sb->sb_hiwat < tcp_autorcvbuf_max)) {
2481		sbreserve(sb, (sb->sb_hiwat + rcvbufinc));
2482	}
2483}
2484
2485int32_t
2486tcp_sbspace(struct tcpcb *tp)
2487{
2488	struct sockbuf *sb = &tp->t_inpcb->inp_socket->so_rcv;
2489	int32_t space;
2490
2491	tcp_sbrcv_grow_rwin(tp, sb);
2492
2493	space =  ((int32_t) imin((sb->sb_hiwat - sb->sb_cc),
2494		(sb->sb_mbmax - sb->sb_mbcnt)));
2495	if (space < 0)
2496		space = 0;
2497
2498	/* Avoid increasing window size if the current window
2499	 * is already very low, we could be in "persist" mode and
2500	 * we could break some apps (see rdar://5409343)
2501	 */
2502
2503	if (space < tp->t_maxseg)
2504		return space;
2505
2506	/* Clip window size for slower link */
2507
2508	if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0 )
2509		return imin(space, slowlink_wsize);
2510
2511	return space;
2512}
2513/*
2514 * Checks TCP Segment Offloading capability for a given connection and interface pair.
2515 */
2516void
2517tcp_set_tso(tp, ifp)
2518	struct tcpcb *tp;
2519	struct ifnet *ifp;
2520{
2521#if INET6
2522	struct inpcb *inp = tp->t_inpcb;
2523	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
2524
2525	if (isipv6) {
2526		if (ifp && ifp->if_hwassist & IFNET_TSO_IPV6) {
2527			tp->t_flags |= TF_TSO;
2528			if (ifp->if_tso_v6_mtu != 0)
2529				tp->tso_max_segment_size = ifp->if_tso_v6_mtu;
2530			else
2531				tp->tso_max_segment_size = TCP_MAXWIN;
2532		} else
2533				tp->t_flags &= ~TF_TSO;
2534
2535	} else
2536#endif /* INET6 */
2537
2538	{
2539		if (ifp && ifp->if_hwassist & IFNET_TSO_IPV4) {
2540			tp->t_flags |= TF_TSO;
2541			if (ifp->if_tso_v4_mtu != 0)
2542				tp->tso_max_segment_size = ifp->if_tso_v4_mtu;
2543			else
2544				tp->tso_max_segment_size = TCP_MAXWIN;
2545		} else
2546				tp->t_flags &= ~TF_TSO;
2547	}
2548}
2549
2550#define TIMEVAL_TO_TCPHZ(_tv_) ((_tv_).tv_sec * TCP_RETRANSHZ + (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC)
2551
2552/* Function to calculate the tcp clock. The tcp clock will get updated
2553 * at the boundaries of the tcp layer. This is done at 3 places:
2554 * 1. Right before processing an input tcp packet
2555 * 2. Whenever a connection wants to access the network using tcp_usrreqs
2556 * 3. When a tcp timer fires or before tcp slow timeout
2557 *
2558 */
2559
2560void
2561calculate_tcp_clock()
2562{
2563	struct timeval tv = tcp_uptime;
2564	struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC};
2565	struct timeval now, hold_now;
2566	uint32_t incr = 0;
2567
2568	timevaladd(&tv, &interval);
2569	microuptime(&now);
2570	if (timevalcmp(&now, &tv, >)) {
2571		/* time to update the clock */
2572		lck_spin_lock(tcp_uptime_lock);
2573		if (timevalcmp(&tcp_uptime, &now, >=)) {
2574			/* clock got updated while we were waiting for the lock */
2575			lck_spin_unlock(tcp_uptime_lock);
2576			return;
2577			}
2578
2579		microuptime(&now);
2580		hold_now = now;
2581		tv = tcp_uptime;
2582		timevalsub(&now, &tv);
2583
2584		incr = TIMEVAL_TO_TCPHZ(now);
2585		if (incr > 0) {
2586			tcp_uptime = hold_now;
2587			tcp_now += incr;
2588		}
2589
2590                lck_spin_unlock(tcp_uptime_lock);
2591        }
2592        return;
2593}
2594
2595/* Compute receive window scaling that we are going to request
2596 * for this connection based on  sb_hiwat. Try to leave some
2597 * room to potentially increase the window size upto a maximum
2598 * defined by the constant tcp_autorcvbuf_max.
2599 */
2600void
2601tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so) {
2602	u_int32_t maxsockbufsize;
2603
2604	tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale);
2605	maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ?
2606		so->so_rcv.sb_hiwat : tcp_autorcvbuf_max;
2607
2608	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
2609		(TCP_MAXWIN << tp->request_r_scale) < maxsockbufsize)
2610		tp->request_r_scale++;
2611	tp->request_r_scale = min(tp->request_r_scale, TCP_MAX_WINSHIFT);
2612
2613}
2614
2615int
2616tcp_notsent_lowat_check(struct socket *so) {
2617	struct inpcb *inp = sotoinpcb(so);
2618	struct tcpcb *tp = NULL;
2619	int notsent = 0;
2620	if (inp != NULL) {
2621		tp = intotcpcb(inp);
2622	}
2623
2624	notsent = so->so_snd.sb_cc -
2625		(tp->snd_nxt - tp->snd_una);
2626
2627	/* When we send a FIN or SYN, not_sent can be negative.
2628	 * In that case also we need to send a write event to the
2629	 * process if it is waiting. In the FIN case, it will
2630	 * get an error from send because cantsendmore will be set.
2631	 */
2632	if (notsent <= tp->t_notsent_lowat) {
2633		return(1);
2634	}
2635
2636	/* When Nagle's algorithm is not disabled, it is better
2637	 * to wakeup the client until there is atleast one
2638	 * maxseg of data to write.
2639	 */
2640	if ((tp->t_flags & TF_NODELAY) == 0 &&
2641		notsent > 0 && notsent < tp->t_maxseg) {
2642		return(1);
2643	}
2644	return(0);
2645}
2646
2647
2648/* DSEP Review Done pl-20051213-v02 @3253,@3391,@3400 */
2649