/*
 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mcache.h>
#include <sys/queue.h>
#include <kern/locks.h>
#include <kern/cpu_number.h>	/* before tcp_seq.h, for tcp_random18() */
#include <mach/boolean.h>

#include <net/route.h>
#include <net/if_var.h>
#include <net/ntstat.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#if INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
#if INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <sys/kdebug.h>
#include <mach/sdt.h>
#include <netinet/mptcp_var.h>

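/*
 * Map a timer-list entry back to the tcpcb that embeds it, in the style of
 * a container_of() macro: the timer list iterates over tcptimerentry
 * structures embedded in each tcpcb (see tcp_run_timerlist()), and the
 * subtraction below relies on le.le_next being the first field of the
 * entry within struct tcpcb.
 */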
#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))

#define VERIFY_NEXT_LINK(elm,field) do {	\
	if (LIST_NEXT((elm),field) != NULL &&	\
	    LIST_NEXT((elm),field)->field.le_prev !=	\
		&((elm)->field.le_next))	\
		panic("Bad link elm %p next->prev != elm", (elm));	\
} while(0)

#define VERIFY_PREV_LINK(elm,field) do {	\
	if (*(elm)->field.le_prev != (elm))	\
		panic("Bad link elm %p prev->next != elm", (elm));	\
} while(0)

#define TCP_SET_TIMER_MODE(mode, i) do { \
	if (IS_TIMER_HZ_10MS(i)) \
		(mode) |= TCP_TIMERLIST_10MS_MODE; \
	else if (IS_TIMER_HZ_100MS(i)) \
		(mode) |= TCP_TIMERLIST_100MS_MODE; \
	else \
		(mode) |= TCP_TIMERLIST_500MS_MODE; \
} while(0)
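/*
 * The mode bits accumulated through TCP_SET_TIMER_MODE() are later used by
 * tcp_run_timerlist() to pick the coarsest rescan quantum (10ms, 100ms or
 * 500ms) that still services every timer armed on the list.
 */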

/* Max number of times a stretch ack can be delayed on a connection */
#define	TCP_STRETCHACK_DELAY_THRESHOLD	5

/* tcp timer list */
struct tcptimerlist tcp_timer_list;

/* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
struct tcptailq tcp_tw_tailq;

static int
sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error, s, tt;

	tt = *(int *)oidp->oid_arg1;
	s = tt * 1000 / TCP_RETRANSHZ;

	error = sysctl_handle_int(oidp, &s, 0, req);
	if (error || !req->newptr)
		return (error);

	tt = s * TCP_RETRANSHZ / 1000;
	if (tt < 1)
		return (EINVAL);

	*(int *)oidp->oid_arg1 = tt;
	return (0);
}
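/*
 * The handler above exports a tick-based tunable in milliseconds: the
 * stored value is scaled by 1000 / TCP_RETRANSHZ on read and by
 * TCP_RETRANSHZ / 1000 on write.  As an illustration, assuming
 * TCP_RETRANSHZ is 100 (10 ms ticks), writing 200 (ms) stores 20 ticks
 * and reading it back reports 200 again.
 */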

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepcnt;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_keepcnt, 0, "number of times to repeat keepalive");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

/*
 * Avoid DoS via TCP Robustness in Persist Condition
 * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
 * by allowing a system wide maximum persistence timeout value when in
 * Zero Window Probe mode.
 *
 * Expressed in milliseconds to be consistent with other timeout-related
 * values; the TCP socket option is in seconds.
 */
u_int32_t tcp_max_persist_timeout = 0;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I",
    "Maximum persistence timeout for ZWP");

static int	always_keepalive = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");

/*
 * This parameter determines how long the timer list will stay in fast or
 * quick mode even though all connections are idle. In this state, the
 * timer will run more frequently anticipating new data.
 */
int timer_fastmode_idlemax = TCP_FASTMODE_IDLERUN_MAX;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode");

/*
 * See tcp_syn_backoff[] for interval values between SYN retransmits;
 * the value set below defines the number of retransmits before we
 * disable the timestamp and window scaling options during subsequent
 * SYN retransmits.  Setting it to 0 disables the dropping of those
 * two options.
 */
static int tcp_broken_peer_syn_rxmit_thres = 7;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_broken_peer_syn_rxmit_thres, 0,
    "Number of retransmitted SYNs before "
    "TCP disables rfc1323 and rfc1644 during the remaining attempts");

/* A higher threshold on local connections for disabling RFC 1323 options */
static int tcp_broken_peer_syn_rxmit_thres_local = 10;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres_local,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres_local, 0,
    "Number of retransmitted SYNs before disabling RFC 1323 "
    "options on local connections");

static int tcp_timer_advanced = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced,
    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_timer_advanced, 0,
    "Number of times one of the timers was advanced");

static int tcp_resched_timerlist = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist,
    CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0,
    "Number of times timer list was rescheduled as part of processing a packet");

int	tcp_pmtud_black_hole_detect = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_detect, 0,
    "Path MTU Discovery Black Hole Detection");

int	tcp_pmtud_black_hole_mss = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_mss, 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");

/* performed garbage collection of "used" sockets */
static boolean_t tcp_gc_done = FALSE;

/* max idle probes */
int	tcp_maxpersistidle;

/*
 * The TCP delack timer is set to 100 ms. Since the processing of the timer
 * list in fast mode will happen no faster than every 100 ms, the delayed ack
 * timer will fire somewhere between 100 and 200 ms.
 */
int	tcp_delack = TCP_RETRANSHZ / 10;

#if MPTCP
/*
 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
 */
int	tcp_jack_rxmt = TCP_RETRANSHZ / 2;
#endif /* MPTCP */

static void tcp_remove_timer(struct tcpcb *tp);
static void tcp_sched_timerlist(uint32_t offset);
static u_int32_t tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *mode);
static void tcp_sched_timers(struct tcpcb *tp);
static inline void tcp_set_lotimer_index(struct tcpcb *);
static void tcp_rexmt_save_state(struct tcpcb *tp);
__private_extern__ void tcp_remove_from_time_wait(struct inpcb *inp);
__private_extern__ void tcp_report_stats(void);

/*
 * Function to compare two timers. If there is a reset of the sign bit,
 * it is safe to assume that the timer has wrapped around. By doing
 * signed comparison, we take care of wrap-around such that the value
 * with the sign bit reset is actually ahead of the other.
 */
inline int32_t
timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
	return (int32_t)((t1 + toff1) - (t2 + toff2));
}
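/*
 * Worked example of the wrap-around handling above: with t1 = 10 and
 * t2 = 0xfffffff0 (and both offsets 0), the unsigned subtraction yields
 * 26; as a signed 32-bit value that is positive, so t1 is treated as
 * being ahead of t2 even though t2 is numerically larger.
 */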

static	u_int64_t tcp_last_report_time;
#define	TCP_REPORT_STATS_INTERVAL	345600 /* 4 days, in seconds */

/* Returns true if the timer is on the timer list */
#define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)

/* Run the TCP timerlist at least once every hour */
#define TCP_TIMERLIST_MAX_OFFSET (60 * 60 * TCP_RETRANSHZ)
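/*
 * Assuming TCP_RETRANSHZ is 100 (10 ms ticks), TCP_TIMERLIST_MAX_OFFSET
 * works out to 60 * 60 * 100 = 360000 ticks, i.e. one hour.
 */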


static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
static boolean_t tcp_garbage_collect(struct inpcb *, int);

/*
 * Add to tcp timewait list, delay is given in milliseconds.
 */
static void
add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
{
	struct inpcbinfo *pcbinfo = &tcbinfo;
	struct inpcb *inp = tp->t_inpcb;
	uint32_t timer;

	/* pcb list should be locked when we get here */
	lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);

	/* We may get here multiple times, so check */
	if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
		pcbinfo->ipi_twcount++;
		inp->inp_flags2 |= INP2_TIMEWAIT;

		/* Remove from global inp list */
		LIST_REMOVE(inp, inp_list);
	} else {
		TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
	}

	/* Compute the time at which this socket can be closed */
	timer = tcp_now + delay;

	/* We will use the TCPT_2MSL timer for tracking this delay */

	if (TIMER_IS_ON_LIST(tp))
		tcp_remove_timer(tp);
	tp->t_timer[TCPT_2MSL] = timer;

	TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
}

void
add_to_time_wait(struct tcpcb *tp, uint32_t delay)
{
	struct inpcbinfo *pcbinfo = &tcbinfo;
	if (tp->t_inpcb->inp_socket->so_options & SO_NOWAKEFROMSLEEP)
		socket_post_kev_msg_closed(tp->t_inpcb->inp_socket);

	if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
		tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
		lck_rw_lock_exclusive(pcbinfo->ipi_lock);
		tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
	}
	add_to_time_wait_locked(tp, delay);
	lck_rw_done(pcbinfo->ipi_lock);

	inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
}
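/*
 * Illustrative use: when a connection transitions into TIME_WAIT, the
 * caller typically hands the socket to this path with a delay of two
 * maximum segment lifetimes, e.g. add_to_time_wait(tp, 2 * tcp_msl),
 * after which tcp_gc() is free to reclaim it.
 */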

/* If this is on time wait queue, remove it. */
void
tcp_remove_from_time_wait(struct inpcb *inp)
{
	struct tcpcb *tp = intotcpcb(inp);
	if (inp->inp_flags2 & INP2_TIMEWAIT)
		TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
}

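/*
 * Attempt to reclaim one pcb.  Returns TRUE when the pcb could not be
 * fully reclaimed on this pass (lock contention, outstanding references,
 * or still waiting to reach WNT_STOPUSING), so the caller should keep the
 * garbage-collection timer armed; returns FALSE once the pcb has been
 * detached and disposed of, or needs no further attention for now.
 */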
static boolean_t
tcp_garbage_collect(struct inpcb *inp, int istimewait)
{
	boolean_t active = FALSE;
	struct socket *so;
	struct tcpcb *tp;

	so = inp->inp_socket;
	tp = intotcpcb(inp);

	/*
	 * Skip if still in use or busy; it would have been more efficient
	 * if we were to test so_usecount against 0, but this isn't possible
	 * due to the current implementation of tcp_dropdropablreq(), where
	 * overflow sockets that are eligible for garbage collection have
	 * their usecounts set to 1.
	 */
	if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx))
		return (TRUE);

	/* Check again under the lock */
	if (so->so_usecount > 1) {
		if (inp->inp_wantcnt == WNT_STOPUSING)
			active = TRUE;
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (active);
	}

	if (istimewait &&
	    TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
	    tp->t_state != TCPS_CLOSED) {
		/* Become a regular mutex */
		lck_mtx_convert_spin(&inp->inpcb_mtx);
		tcp_close(tp);
	}

	/*
	 * Overflowed socket dropped from the listening queue?  Do this
	 * only if we are called to clean up the time wait slots, since
	 * tcp_dropdropablreq() considers a socket to have been fully
	 * dropped after add_to_time_wait() is finished.
	 * Also handle the case of connections getting closed by the peer
	 * while in the queue, as seen with rdar://6422317
	 */
	if (so->so_usecount == 1 &&
	    ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
	    ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
	    (so->so_head != NULL) &&
	    ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
	    (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {

		if (inp->inp_state != INPCB_STATE_DEAD) {
			/* Become a regular mutex */
			lck_mtx_convert_spin(&inp->inpcb_mtx);
#if INET6
			if (SOCK_CHECK_DOM(so, PF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
				in_pcbdetach(inp);
		}
		so->so_usecount--;
		if (inp->inp_wantcnt == WNT_STOPUSING)
			active = TRUE;
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (active);
	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (FALSE);
	}

	/*
	 * We get here because the PCB is no longer searchable
	 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead
	 * (usecount is 0).  This covers all cases, including overflow
	 * sockets and those that are considered as "embryonic",
	 * i.e. created by sonewconn() in the TCP input path, and have
	 * not yet been committed.  For the former, we reduce the usecount
	 * to 0 as done by the code above.  For the latter, the usecount
	 * would have been reduced to 0 as part of calling soabort() when
	 * the socket is dropped at the end of tcp_input().
	 */
	if (so->so_usecount == 0) {
		DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
			struct tcpcb *, tp, int32_t, TCPS_CLOSED);
		/* Become a regular mutex */
		lck_mtx_convert_spin(&inp->inpcb_mtx);

		/*
		 * If this tp still happens to be on the timer list,
		 * take it out
		 */
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}

		if (inp->inp_state != INPCB_STATE_DEAD) {
#if INET6
			if (SOCK_CHECK_DOM(so, PF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
				in_pcbdetach(inp);
		}
		in_pcbdispose(inp);
		return (FALSE);
	}

	lck_mtx_unlock(&inp->inpcb_mtx);
	return (TRUE);
}

/*
 * TCP garbage collector callback (inpcb_timer_func_t).
 *
 * Rather than returning a count, it records the number of pcbs that will
 * need to be gc-ed soon in ipi_gc_req; a non-zero count keeps the gc
 * timer active.
 */
void
tcp_gc(struct inpcbinfo *ipi)
{
	struct inpcb *inp, *nxt;
	struct tcpcb *tw_tp, *tw_ntp;
#if TCPDEBUG
	int ostate;
#endif
#if  KDEBUG
	static int tws_checked = 0;
#endif

	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Update tcp_now here as it may get used while
	 * processing the slow timer.
	 */
	calculate_tcp_clock();

	/*
	 * Garbage collect socket/tcpcb: We need to acquire the list lock
	 * exclusively to do this
	 */

	if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
		/* don't sweat it this time; cleanup was done last time */
		if (tcp_gc_done == TRUE) {
			tcp_gc_done = FALSE;
			KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
			    tws_checked, cur_tw_slot, 0, 0, 0);
			/* Couldn't get the lock, give up this round */
			atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
			return;
		}
		/* Couldn't get the lock right away; block until we can take it exclusively */
		lck_rw_lock_exclusive(ipi->ipi_lock);
	}
	tcp_gc_done = TRUE;

	LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
		if (tcp_garbage_collect(inp, 0))
			atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
	}

	/* Now cleanup the time wait ones */
	TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
		/*
		 * We check the timestamp here without holding the
		 * socket lock for better performance. If there are
		 * any pcbs in time-wait, the timer will get rescheduled.
		 * Hence some error in this check can be tolerated.
		 *
		 * Sometimes a socket on the time-wait queue can be closed if
		 * the 2MSL timer expired but the application still has a
		 * usecount on it.
		 */
		if (tw_tp->t_state == TCPS_CLOSED ||
		    TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
			if (tcp_garbage_collect(tw_tp->t_inpcb, 1))
				atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
		}
	}

	/* take into account pcbs that are still in time_wait_slots */
	atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);

	lck_rw_done(ipi->ipi_lock);

	/* Clean up the socache while we are here */
	if (so_cache_timer())
		atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);

	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
	    cur_tw_slot, 0, 0, 0);

	return;
}

/*
 * Cancel all timers for TCP tp.
 */
void
tcp_canceltimers(struct tcpcb *tp)
{
	int i;

	tcp_remove_timer(tp);
	for (i = 0; i < TCPT_NTIMERS; i++)
		tp->t_timer[i] = 0;
	tp->tentry.timer_start = tcp_now;
	tp->tentry.index = TCPT_NONE;
}

int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

static int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
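/*
 * Sanity check on the constant above: the entries of tcp_backoff[] sum to
 * (1 + 2 + 4 + 8 + 16 + 32) + 7 * 64 = 63 + 448 = 511, which is what
 * tcp_totbackoff records.  The persist timer uses it to bound the total
 * time spent sending zero window probes.
 */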

static void
tcp_rexmt_save_state(struct tcpcb *tp)
{
	u_int32_t fsize;
	if (TSTMP_SUPPORTED(tp)) {
		/*
		 * Since timestamps are supported on the connection,
		 * we can do recovery as described in RFC 4015.
		 */
		fsize = tp->snd_max - tp->snd_una;
		tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
		tp->snd_recover_prev = tp->snd_recover;
	} else {
		/*
		 * Timestamp option is not supported on this connection.
		 * Record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
	}
	tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
	tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
	tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
}

/*
 * Revert to the older segment size if there is an indication that PMTU
 * blackhole detection was not needed.
 */
void
tcp_pmtud_revert_segment_size(struct tcpcb *tp)
{
	int32_t optlen;

	VERIFY(tp->t_pmtud_saved_maxopd > 0);
	tp->t_flags |= TF_PMTUD;
	tp->t_flags &= ~TF_BLACKHOLE;
	optlen = tp->t_maxopd - tp->t_maxseg;
	tp->t_maxopd = tp->t_pmtud_saved_maxopd;
	tp->t_maxseg = tp->t_maxopd - optlen;
	/*
	 * Reset the slow-start flight size as it
	 * may depend on the new MSS
	 */
	if (CC_ALGO(tp)->cwnd_init != NULL)
		CC_ALGO(tp)->cwnd_init(tp);
	tp->t_pmtud_start_ts = 0;
	tcpstat.tcps_pmtudbh_reverted++;
}

/*
 * TCP timer processing.
 */
struct tcpcb *
tcp_timers(struct tcpcb *tp, int timer)
{
	int32_t rexmt, optlen = 0, idle_time = 0;
	struct socket *so;
	struct tcptemp *t_template;
#if TCPDEBUG
	int ostate;
#endif

#if INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
#endif /* INET6 */

	so = tp->t_inpcb->inp_socket;
	idle_time = tcp_now - tp->t_rcvtime;

	switch (timer) {

	/*
	 * 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for peer to close and connection has been idle
	 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
	 * delete connection control block.
	 * Otherwise (this case shouldn't happen), check again in a bit;
	 * we keep the socket on the main list in that case.
	 */
	case TCPT_2MSL:
		tcp_free_sackholes(tp);
		if (tp->t_state != TCPS_TIME_WAIT &&
		    tp->t_state != TCPS_FIN_WAIT_2 &&
		    ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
			tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
				(u_int32_t)TCP_CONN_KEEPINTVL(tp));
		} else {
			tp = tcp_close(tp);
			return (tp);
		}
		break;

	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	case TCPT_REXMT:
		/*
		 * Drop a connection in the retransmit timer
		 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT
		 * times
		 * 2. If the time spent in this retransmission episode is
		 * more than the time limit set with TCP_RXT_CONNDROPTIME
		 * socket option
		 * 3. If TCP_RXT_FINDROP socket option was set and
		 * we have already retransmitted the FIN 3 times without
		 * receiving an ack
		 */
		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
		    (tp->t_rxt_conndroptime > 0
		    && tp->t_rxtstart > 0 &&
		    (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime)
		    || ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
			(tp->t_flags & TF_SENTFIN) != 0 &&
			tp->t_rxtshift >= 4)) {
			if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
				tcpstat.tcps_rxtfindrop++;
			} else {
				tcpstat.tcps_timeoutdrop++;
			}
			tp->t_rxtshift = TCP_MAXRXTSHIFT;
			postevent(so, 0, EV_TIMEOUT);
			soevent(so,
			    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
			tp = tcp_drop(tp, tp->t_softerror ?
			    tp->t_softerror : ETIMEDOUT);

			break;
		}

		tcpstat.tcps_rexmttimeo++;

		if (tp->t_rxtshift == 1 &&
		    tp->t_state == TCPS_ESTABLISHED) {
			/* Set the time at which retransmission started. */
			tp->t_rxtstart = tcp_now;

			/*
			 * If this is the first retransmit timeout, save
			 * the state so that we can recover if the timeout
			 * is spurious.
			 */
			tcp_rexmt_save_state(tp);
		}
#if MPTCP
		if ((tp->t_rxtshift >= mptcp_fail_thresh) &&
		    (tp->t_state == TCPS_ESTABLISHED) &&
		    (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
			mptcp_act_on_txfail(so);
		}
#endif /* MPTCP */

		if (tp->t_adaptive_wtimo > 0 &&
		    tp->t_rxtshift > tp->t_adaptive_wtimo &&
		    TCPS_HAVEESTABLISHED(tp->t_state)) {
			/* Send an event to the application */
			soevent(so,
				(SO_FILT_HINT_LOCKED|
				SO_FILT_HINT_ADAPTIVE_WTIMO));
		}

		/*
		 * If this is a retransmit timeout after PTO, the PTO
		 * was not effective
		 */
		if (tp->t_flagsext & TF_SENT_TLPROBE) {
			tp->t_flagsext &= ~(TF_SENT_TLPROBE);
			tcpstat.tcps_rto_after_pto++;
		}

		if (tp->t_flagsext & TF_DELAY_RECOVERY) {
			/*
			 * Retransmit timer fired before entering recovery
			 * on a connection with packet re-ordering. This
			 * suggests that the reordering metrics computed
			 * are not accurate.
			 */
			tp->t_reorderwin = 0;
			tp->t_timer[TCPT_DELAYFR] = 0;
			tp->t_flagsext &= ~(TF_DELAY_RECOVERY);
		}

		if (tp->t_state == TCPS_SYN_SENT) {
			rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
			tp->t_stat.synrxtshift = tp->t_rxtshift;
		} else {
			rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
		}

		TCPT_RANGESET(tp->t_rxtcur, rexmt,
			tp->t_rttmin, TCPTV_REXMTMAX,
			TCP_ADD_REXMTSLOP(tp));
		tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);

		if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
			goto fc_output;

		tcp_free_sackholes(tp);
		/*
		 * Check for potential Path MTU Discovery Black Hole
		 */
		if (tcp_pmtud_black_hole_detect &&
		    !(tp->t_flagsext & TF_NOBLACKHOLE_DETECTION) &&
		    (tp->t_state == TCPS_ESTABLISHED)) {
			if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT))
			    == (TF_PMTUD|TF_MAXSEGSNT)) &&
			    (tp->t_rxtshift == 2)) {
				/*
				 * Enter Path MTU Black-hole Detection mechanism:
				 * - Disable Path MTU Discovery (IP "DF" bit).
				 * - Reduce MTU to a lower value than what we
				 * negotiated with the peer.
				 */
				/* Disable Path MTU Discovery for now */
				tp->t_flags &= ~TF_PMTUD;
				/* Record that we may have found a black hole */
				tp->t_flags |= TF_BLACKHOLE;
				optlen = tp->t_maxopd - tp->t_maxseg;
				/* Keep track of previous MSS */
				tp->t_pmtud_saved_maxopd = tp->t_maxopd;
				tp->t_pmtud_start_ts = tcp_now;
				if (tp->t_pmtud_start_ts == 0)
					tp->t_pmtud_start_ts++;
				/* Reduce the MSS to intermediary value */
				if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
					tp->t_maxopd = tcp_pmtud_black_hole_mss;
				} else {
					tp->t_maxopd =	/* use the default MSS */
#if INET6
						isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
							tcp_mssdflt;
				}
				tp->t_maxseg = tp->t_maxopd - optlen;

				/*
				 * Reset the slow-start flight size
				 * as it may depend on the new MSS
				 */
				if (CC_ALGO(tp)->cwnd_init != NULL)
					CC_ALGO(tp)->cwnd_init(tp);
			}
			/*
			 * If further retransmissions are still
			 * unsuccessful with a lowered MTU, maybe this
			 * isn't a Black Hole and we restore the previous
			 * MSS and blackhole detection flags.
			 */
			else {
				if ((tp->t_flags & TF_BLACKHOLE) &&
				    (tp->t_rxtshift > 4)) {
					tcp_pmtud_revert_segment_size(tp);
				}
			}
		}

		/*
		 * Disable rfc1323 and rfc1644 if we haven't got any
		 * response to our SYN (after we reach the threshold)
		 * to work around some broken terminal servers (most of
		 * which have hopefully been retired) that have bad VJ
		 * header compression code which trashes TCP segments
		 * containing unknown-to-them TCP options.
		 * Do this only on non-local connections.
		 */
		if (tp->t_state == TCPS_SYN_SENT &&
		    ((!(tp->t_flags & TF_LOCAL) &&
		    tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres) ||
		    ((tp->t_flags & TF_LOCAL) &&
		    tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres_local)))
			tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);

		/*
		 * If losing, let the lower level know and try for
		 * a better route.  Also, if we backed off this far,
		 * our srtt estimate is probably bogus.  Clobber it
		 * so we'll take the next rtt measurement as our srtt;
		 * move the current srtt into rttvar to keep the current
		 * retransmit times until then.
		 */
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#if INET6
			if (isipv6)
				in6_losing(tp->t_inpcb);
			else
#endif /* INET6 */
			in_losing(tp->t_inpcb);
			tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
			tp->t_srtt = 0;
		}
		tp->snd_nxt = tp->snd_una;
		/*
		 * Note:  We overload snd_recover to function also as the
		 * snd_last variable described in RFC 2582
		 */
		tp->snd_recover = tp->snd_max;
		/*
		 * Force a segment to be sent.
		 */
		tp->t_flags |= TF_ACKNOW;

		/* If timing a segment in this window, stop the timer */
		tp->t_rtttime = 0;

		if (!IN_FASTRECOVERY(tp) && tp->t_rxtshift == 1)
			tcpstat.tcps_tailloss_rto++;

		/*
		 * RFC 5681 says: when a TCP sender detects segment loss
		 * using the retransmission timer and the given segment has
		 * already been retransmitted by way of the retransmission
		 * timer at least once, the value of ssthresh is held constant
		 */
		if (tp->t_rxtshift == 1 &&
		    CC_ALGO(tp)->after_timeout != NULL)
			CC_ALGO(tp)->after_timeout(tp);

		EXIT_FASTRECOVERY(tp);

		/* CWR notifications are to be sent on new data right after
		 * RTOs, Fast Retransmits and ECE notification receipts.
		 */
		if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) {
			tp->ecn_flags |= TE_SENDCWR;
		}
fc_output:
		tcp_ccdbg_trace(tp, NULL, TCP_CC_REXMT_TIMEOUT);

		(void) tcp_output(tp);
		break;

	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	case TCPT_PERSIST:
		tcpstat.tcps_persisttimeo++;
		/*
		 * Hack: if the peer is dead/unreachable, we do not
		 * time out if the window is closed.  After a full
		 * backoff, drop the connection if the idle time
		 * (no responses to probes) reaches the maximum
		 * backoff that we would use if retransmitting.
		 *
		 * Drop the connection if we reached the maximum allowed time for
		 * Zero Window Probes without a non-zero update from the peer.
		 * See rdar://5805356
		 */
		if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
		    (idle_time >= tcp_maxpersistidle ||
		    idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
		    ((tp->t_persist_stop != 0) &&
		    TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
			tcpstat.tcps_persistdrop++;
			postevent(so, 0, EV_TIMEOUT);
			soevent(so,
			    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
			tp = tcp_drop(tp, ETIMEDOUT);
			break;
		}
		tcp_setpersist(tp);
		tp->t_flagsext |= TF_FORCE;
		(void) tcp_output(tp);
		tp->t_flagsext &= ~TF_FORCE;
		break;

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	case TCPT_KEEP:
		tcpstat.tcps_keeptimeo++;
#if MPTCP
		/*
		 * Regular TCP connections do not send keepalives after closing;
		 * neither must MPTCP after it has sent Data FINs.
		 */
		struct mptcb *mp_tp = tp->t_mptcb;
		if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
		    (tp->t_state > TCPS_ESTABLISHED)) {
			goto dropit;
		} else if (mp_tp != NULL) {
			if ((mptcp_ok_to_keepalive(mp_tp) == 0))
				goto dropit;
		}
#endif /* MPTCP */
		if (tp->t_state < TCPS_ESTABLISHED)
			goto dropit;
		if ((always_keepalive ||
		    (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
		    (tp->t_flagsext & TF_DETECT_READSTALL)) &&
		    (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
			if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
				goto dropit;
			/*
			 * Send a packet designed to force a response
			 * if the peer is up and reachable:
			 * either an ACK if the connection is still alive,
			 * or an RST if the peer has closed the connection
			 * due to timeout or reboot.
			 * Using sequence number tp->snd_una-1
			 * causes the transmitted zero-length segment
			 * to lie outside the receive window;
			 * by the protocol spec, this requires the
			 * correspondent TCP to respond.
			 */
			tcpstat.tcps_keepprobe++;
			t_template = tcp_maketemplate(tp);
			if (t_template) {
				struct inpcb *inp = tp->t_inpcb;
				struct tcp_respond_args tra;

				bzero(&tra, sizeof(tra));
				tra.nocell = INP_NO_CELLULAR(inp);
				tra.noexpensive = INP_NO_EXPENSIVE(inp);
				tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
				if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
					tra.ifscope = tp->t_inpcb->inp_boundifp->if_index;
				else
					tra.ifscope = IFSCOPE_NONE;
				tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0, &tra);
				(void) m_free(dtom(t_template));
				if (tp->t_flagsext & TF_DETECT_READSTALL)
					tp->t_rtimo_probes++;
			}
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
				TCP_CONN_KEEPINTVL(tp));
		} else {
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
				TCP_CONN_KEEPIDLE(tp));
		}
		if (tp->t_flagsext & TF_DETECT_READSTALL) {
			/*
			 * The keep alive packets sent to detect a read
			 * stall did not get a response from the
			 * peer. Generate more keep-alives to confirm this.
			 * If the number of probes sent reaches the limit,
			 * generate an event.
			 */
			if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
				/* Generate an event */
				soevent(so,
					(SO_FILT_HINT_LOCKED|
					SO_FILT_HINT_ADAPTIVE_RTIMO));
				tcp_keepalive_reset(tp);
			} else {
				tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
					tp, TCP_REXMTVAL(tp));
			}
		}
		break;
	case TCPT_DELACK:
		if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
			tp->t_flags &= ~TF_DELACK;
			tp->t_timer[TCPT_DELACK] = 0;
			tp->t_flags |= TF_ACKNOW;

			/*
			 * If the delayed ack timer fired while stretching
			 * acks, count the number of times the streaming
			 * detection was not correct. If this exceeds a
			 * threshold, disable stretch ack on this
			 * connection.
			 *
			 * Also, go back to acking every other packet.
			 */
			if ((tp->t_flags & TF_STRETCHACK)) {
				if (tp->t_unacksegs > 1 &&
				    tp->t_unacksegs < maxseg_unacked)
					tp->t_stretchack_delayed++;

				if (tp->t_stretchack_delayed >
					TCP_STRETCHACK_DELAY_THRESHOLD) {
					tp->t_flagsext |= TF_DISABLE_STRETCHACK;
					/*
					 * Note the time at which stretch
					 * ack was disabled automatically
					 */
					tp->rcv_nostrack_ts = tcp_now;
					tcpstat.tcps_nostretchack++;
					tp->t_stretchack_delayed = 0;
				}
				tcp_reset_stretch_ack(tp);
			}

			/*
			 * If we are measuring inter-packet arrival jitter
			 * for throttling a connection, this delayed ack
			 * might be the reason for accumulating some
			 * jitter. So let's restart the measurement.
			 */
			CLEAR_IAJ_STATE(tp);

			tcpstat.tcps_delack++;
			(void) tcp_output(tp);
		}
		break;

#if MPTCP
	case TCPT_JACK_RXMT:
		if ((tp->t_state == TCPS_ESTABLISHED) &&
		    (tp->t_mpflags & TMPF_PREESTABLISHED) &&
		    (tp->t_mpflags & TMPF_JOINED_FLOW)) {
			if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
				tcpstat.tcps_timeoutdrop++;
				postevent(so, 0, EV_TIMEOUT);
				soevent(so,
				    (SO_FILT_HINT_LOCKED|
				    SO_FILT_HINT_TIMEOUT));
				tp = tcp_drop(tp, tp->t_softerror ?
				    tp->t_softerror : ETIMEDOUT);
				break;
			}
			tcpstat.tcps_join_rxmts++;
			tp->t_flags |= TF_ACKNOW;

			/*
			 * For simplicity, no backoff is implemented for
			 * this corner case.
			 */
			(void) tcp_output(tp);
		}
		break;
#endif /* MPTCP */

	case TCPT_PTO:
	{
		tcp_seq old_snd_nxt;
		int32_t snd_len;
		boolean_t rescue_rxt = FALSE;

		tp->t_flagsext &= ~(TF_SENT_TLPROBE);

		/*
		 * Check if the connection is in the right state to
		 * send a probe
		 */
		if (tp->t_state != TCPS_ESTABLISHED ||
		    tp->t_rxtshift > 0 || tp->snd_max == tp->snd_una ||
		    !SACK_ENABLED(tp) || TAILQ_EMPTY(&tp->snd_holes) ||
		    (IN_FASTRECOVERY(tp) &&
		    (SEQ_GEQ(tp->snd_fack, tp->snd_recover) ||
		    SEQ_GT(tp->snd_nxt, tp->sack_newdata))))
			break;

		tcpstat.tcps_pto++;

		/* If timing a segment in this window, stop the timer */
		tp->t_rtttime = 0;

		if (IN_FASTRECOVERY(tp)) {
			/*
			 * Send a probe to detect tail loss in a
			 * recovery window when the connection is in
			 * fast_recovery.
			 */
			old_snd_nxt = tp->snd_nxt;
			rescue_rxt = TRUE;
			VERIFY(SEQ_GEQ(tp->snd_fack, tp->snd_una));
			snd_len = min((tp->snd_recover - tp->snd_fack),
			    tp->t_maxseg);
			tp->snd_nxt = tp->snd_recover - snd_len;
			tcpstat.tcps_pto_in_recovery++;
			tcp_ccdbg_trace(tp, NULL, TCP_CC_TLP_IN_FASTRECOVERY);
		} else {
			/*
			 * If there is no new data to send or if the
			 * connection is limited by receive window then
			 * retransmit the last segment, otherwise send
			 * new data.
			 */
			snd_len = min(so->so_snd.sb_cc, tp->snd_wnd)
			    - (tp->snd_max - tp->snd_una);
			if (snd_len > 0) {
				tp->snd_nxt = tp->snd_max;
			} else {
				snd_len = min((tp->snd_max - tp->snd_una),
				    tp->t_maxseg);
				tp->snd_nxt = tp->snd_max - snd_len;
			}
		}

		/* Note that tail loss probe is being sent */
		tp->t_flagsext |= TF_SENT_TLPROBE;
		tp->t_tlpstart = tcp_now;

		tp->snd_cwnd += tp->t_maxseg;
		(void) tcp_output(tp);
		tp->snd_cwnd -= tp->t_maxseg;

		tp->t_tlphighrxt = tp->snd_nxt;

		/*
		 * If a tail loss probe was sent after entering recovery,
		 * restore the old snd_nxt value so that other packets
		 * will get retransmitted correctly.
		 */
		if (rescue_rxt)
			tp->snd_nxt = old_snd_nxt;
		break;
	}
	case TCPT_DELAYFR:
		tp->t_flagsext &= ~TF_DELAY_RECOVERY;

		/*
		 * Don't do anything if one of the following is true:
		 * - the connection is already in recovery
		 * - the sequence space up to snd_recover has been acknowledged
		 * - the retransmit timeout has fired
		 */
		if (IN_FASTRECOVERY(tp) ||
		    SEQ_GEQ(tp->snd_una, tp->snd_recover) ||
		    tp->t_rxtshift > 0)
			break;

		VERIFY(SACK_ENABLED(tp));
		if (CC_ALGO(tp)->pre_fr != NULL)
			CC_ALGO(tp)->pre_fr(tp);
		ENTER_FASTRECOVERY(tp);
		if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON)
			tp->ecn_flags |= TE_SENDCWR;

		tp->t_timer[TCPT_REXMT] = 0;
		tcpstat.tcps_sack_recovery_episode++;
		tp->sack_newdata = tp->snd_nxt;
		tp->snd_cwnd = tp->t_maxseg;
		tcp_ccdbg_trace(tp, NULL, TCP_CC_ENTER_FASTRECOVERY);
		(void) tcp_output(tp);
		break;
	dropit:
		tcpstat.tcps_keepdrops++;
		postevent(so, 0, EV_TIMEOUT);
		soevent(so,
		    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
		tp = tcp_drop(tp, ETIMEDOUT);
		break;
	}
#if TCPDEBUG
	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	return (tp);
}

/* Remove a timer entry from timer list */
void
tcp_remove_timer(struct tcpcb *tp)
{
	struct tcptimerlist *listp = &tcp_timer_list;

	lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
	if (!(TIMER_IS_ON_LIST(tp))) {
		return;
	}
	lck_mtx_lock(listp->mtx);

	/* Check if pcb is on timer list again after acquiring the lock */
	if (!(TIMER_IS_ON_LIST(tp))) {
		lck_mtx_unlock(listp->mtx);
		return;
	}

	if (listp->next_te != NULL && listp->next_te == &tp->tentry)
		listp->next_te = LIST_NEXT(&tp->tentry, le);

	LIST_REMOVE(&tp->tentry, le);
	tp->t_flags &= ~(TF_TIMER_ONLIST);

	listp->entries--;

	tp->tentry.le.le_next = NULL;
	tp->tentry.le.le_prev = NULL;
	lck_mtx_unlock(listp->mtx);
}

/*
 * Function to check if the timerlist needs to be rescheduled to run
 * the timer entry correctly. Basically, this is to check if we can avoid
 * taking the list lock.
 */

static boolean_t
need_to_resched_timerlist(u_int32_t runtime, u_int16_t mode)
{
	struct tcptimerlist *listp = &tcp_timer_list;
	int32_t diff;

	/*
	 * If the list is being processed then the state of the list is
	 * in flux. In this case always acquire the lock and set the state
	 * correctly.
	 */
	if (listp->running)
		return (TRUE);

	if (!listp->scheduled)
		return (TRUE);

	diff = timer_diff(listp->runtime, 0, runtime, 0);
	if (diff <= 0) {
		/* The list is going to run before this timer */
		return (FALSE);
	} else {
		if (mode & TCP_TIMERLIST_10MS_MODE) {
			if (diff <= TCP_TIMER_10MS_QUANTUM)
				return (FALSE);
		} else if (mode & TCP_TIMERLIST_100MS_MODE) {
			if (diff <= TCP_TIMER_100MS_QUANTUM)
				return (FALSE);
		} else {
			if (diff <= TCP_TIMER_500MS_QUANTUM)
				return (FALSE);
		}
	}
	return (TRUE);
}
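/*
 * In other words, a run of the list that is already scheduled at or before
 * this timer's runtime, or at most one quantum of the requested mode after
 * it, is considered close enough; only a later scheduled run (or a list
 * whose state is in flux because it is currently being processed) makes
 * the caller take the list lock and reschedule.
 */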

void
tcp_sched_timerlist(uint32_t offset)
{
	uint64_t deadline = 0;
	struct tcptimerlist *listp = &tcp_timer_list;

	lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);

	offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
	listp->runtime = tcp_now + offset;
	if (listp->runtime == 0) {
		listp->runtime++;
		offset++;
	}

	clock_interval_to_deadline(offset, USEC_PER_SEC, &deadline);

	thread_call_enter_delayed(listp->call, deadline);
	listp->scheduled = TRUE;
}

/*
 * Function to run the timers for a connection.
 *
 * Returns the offset of next timer to be run for this connection which
 * can be used to reschedule the timerlist.
 *
 * te_mode is an out parameter that indicates the modes of active
 * timers for this connection.
 */
u_int32_t
tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode)
{
	struct socket *so;
	u_int16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
	u_int32_t timer_val, offset = 0, lo_timer = 0;
	int32_t diff;
	boolean_t needtorun[TCPT_NTIMERS];
	int count = 0;

	VERIFY(tp != NULL);
	bzero(needtorun, sizeof(needtorun));
	*te_mode = 0;

	tcp_lock(tp->t_inpcb->inp_socket, 1, 0);

	so = tp->t_inpcb->inp_socket;
	/* Release the want count on inp */
	if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1)
	    == WNT_STOPUSING) {
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}

		/*
		 * Looks like the TCP connection got closed while we
		 * were waiting for the lock.  Done.
		 */
		goto done;
	}

	/*
	 * Since the timer thread needs to wait for the tcp lock, it may race
	 * with another thread that can cancel or reschedule the timer
	 * that is about to run. Check if we need to run anything.
	 */
	if ((index = tp->tentry.index) == TCPT_NONE)
		goto done;

	timer_val = tp->t_timer[index];

	diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
	if (diff > 0) {
		if (tp->tentry.index != TCPT_NONE) {
			offset = diff;
			*(te_mode) = tp->tentry.mode;
		}
		goto done;
	}

	tp->t_timer[index] = 0;
	if (timer_val > 0) {
		tp = tcp_timers(tp, index);
		if (tp == NULL)
			goto done;
	}

	/*
	 * Check if there are any other timers that need to be run.
	 * While doing it, adjust the timer values with respect to tcp_now.
	 */
	tp->tentry.mode = 0;
	for (i = 0; i < TCPT_NTIMERS; ++i) {
		if (tp->t_timer[i] != 0) {
			diff = timer_diff(tp->tentry.timer_start,
				tp->t_timer[i], tcp_now, 0);
			if (diff <= 0) {
				needtorun[i] = TRUE;
				count++;
			} else {
				tp->t_timer[i] = diff;
				needtorun[i] = FALSE;
				if (lo_timer == 0 || diff < lo_timer) {
					lo_timer = diff;
					lo_index = i;
				}
				TCP_SET_TIMER_MODE(tp->tentry.mode, i);
			}
		}
	}

	tp->tentry.timer_start = tcp_now;
	tp->tentry.index = lo_index;
	VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);

	if (tp->tentry.index != TCPT_NONE) {
		tp->tentry.runtime = tp->tentry.timer_start +
			tp->t_timer[tp->tentry.index];
		if (tp->tentry.runtime == 0)
			tp->tentry.runtime++;
	}

	if (count > 0) {
		/* run any other timers outstanding at this time. */
		for (i = 0; i < TCPT_NTIMERS; ++i) {
			if (needtorun[i]) {
				tp->t_timer[i] = 0;
				tp = tcp_timers(tp, i);
				if (tp == NULL) {
					offset = 0;
					*(te_mode) = 0;
					goto done;
				}
			}
		}
		tcp_set_lotimer_index(tp);
	}

	if (tp->tentry.index < TCPT_NONE) {
		offset = tp->t_timer[tp->tentry.index];
		*(te_mode) = tp->tentry.mode;
	}

done:
	if (tp != NULL && tp->tentry.index == TCPT_NONE) {
		tcp_remove_timer(tp);
		offset = 0;
	}

	tcp_unlock(so, 1, 0);
	return (offset);
}

void
tcp_run_timerlist(void * arg1, void * arg2)
{
#pragma unused(arg1, arg2)
	struct tcptimerentry *te, *next_te;
	struct tcptimerlist *listp = &tcp_timer_list;
	struct tcpcb *tp;
	uint32_t next_timer = 0; /* offset of the next timer on the list */
	u_int16_t te_mode = 0;	/* modes of all active timers in a tcpcb */
	u_int16_t list_mode = 0; /* cumulative of modes of all tcpcbs */
	uint32_t active_count = 0;

	calculate_tcp_clock();

	lck_mtx_lock(listp->mtx);

	listp->running = TRUE;

	LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
		uint32_t offset = 0;
		uint32_t runtime = te->runtime;
		if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
			offset = timer_diff(runtime, 0, tcp_now, 0);
			if (next_timer == 0 || offset < next_timer) {
				next_timer = offset;
			}
			list_mode |= te->mode;
			continue;
		}

		tp = TIMERENTRY_TO_TP(te);

		/*
		 * Acquire an inp wantcnt on the inpcb so that the socket
		 * won't get detached even if tcp_close is called
		 */
		if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0)
		    == WNT_STOPUSING) {
			/*
			 * Somehow this pcb went into dead state while
			 * on the timer list, just take it off the list.
			 * Since the timer list entry pointers are
			 * protected by the timer list lock, we can
			 * do it here without the socket lock.
			 */
			if (TIMER_IS_ON_LIST(tp)) {
				tp->t_flags &= ~(TF_TIMER_ONLIST);
				LIST_REMOVE(&tp->tentry, le);
				listp->entries--;

				tp->tentry.le.le_next = NULL;
				tp->tentry.le.le_prev = NULL;
			}
			continue;
		}
		active_count++;

		/*
		 * Store the next timerentry pointer before releasing the
		 * list lock. If that entry has to be removed when we
		 * release the lock, this pointer will be updated to the
		 * element after that.
		 */
		listp->next_te = next_te;

		VERIFY_NEXT_LINK(&tp->tentry, le);
		VERIFY_PREV_LINK(&tp->tentry, le);

		lck_mtx_unlock(listp->mtx);

		offset = tcp_run_conn_timer(tp, &te_mode);

		lck_mtx_lock(listp->mtx);

		next_te = listp->next_te;
		listp->next_te = NULL;

		if (offset > 0 && te_mode != 0) {
			list_mode |= te_mode;

			if (next_timer == 0 || offset < next_timer)
				next_timer = offset;
		}
	}

	if (!LIST_EMPTY(&listp->lhead)) {
		u_int16_t next_mode = 0;
		if ((list_mode & TCP_TIMERLIST_10MS_MODE) ||
		    (listp->pref_mode & TCP_TIMERLIST_10MS_MODE))
			next_mode = TCP_TIMERLIST_10MS_MODE;
		else if ((list_mode & TCP_TIMERLIST_100MS_MODE) ||
		    (listp->pref_mode & TCP_TIMERLIST_100MS_MODE))
			next_mode = TCP_TIMERLIST_100MS_MODE;
		else
			next_mode = TCP_TIMERLIST_500MS_MODE;

		if (next_mode != TCP_TIMERLIST_500MS_MODE) {
			listp->idleruns = 0;
		} else {
			/*
			 * The next required mode is slow mode, but if
			 * the last one was a faster mode and we did not
			 * have enough idle runs, repeat the last mode.
			 *
			 * We try to keep the timer list in fast mode for
			 * some idle time in expectation of new data.
			 */
			if (listp->mode != next_mode &&
			    listp->idleruns < timer_fastmode_idlemax) {
				listp->idleruns++;
				next_mode = listp->mode;
				next_timer = TCP_TIMER_100MS_QUANTUM;
			} else {
				listp->idleruns = 0;
			}
		}
		listp->mode = next_mode;
		if (listp->pref_offset != 0)
			next_timer = min(listp->pref_offset, next_timer);

		if (listp->mode == TCP_TIMERLIST_500MS_MODE)
			next_timer = max(next_timer,
				TCP_TIMER_500MS_QUANTUM);

		tcp_sched_timerlist(next_timer);
	} else {
		/*
		 * No need to reschedule this timer, but always run
		 * periodically, albeit at a much coarser granularity.
		 */
		tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
	}

	listp->running = FALSE;
	listp->pref_mode = 0;
	listp->pref_offset = 0;

	lck_mtx_unlock(listp->mtx);
}

/*
 * Function to check if the timerlist needs to be rescheduled to run this
 * connection's timers correctly.
 */
void
tcp_sched_timers(struct tcpcb *tp)
{
	struct tcptimerentry *te = &tp->tentry;
	u_int16_t index = te->index;
	u_int16_t mode = te->mode;
	struct tcptimerlist *listp = &tcp_timer_list;
	int32_t offset = 0;
	boolean_t list_locked = FALSE;

	if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
		/* Just return without adding the dead pcb to the list */
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}
		return;
	}

	if (index == TCPT_NONE) {
		/* Nothing to run */
		tcp_remove_timer(tp);
		return;
	}

	/*
	 * Compute the offset at which the next timer for this connection
	 * has to run.
	 */
	offset = timer_diff(te->runtime, 0, tcp_now, 0);
	if (offset <= 0) {
		offset = 1;
		tcp_timer_advanced++;
	}

	if (!TIMER_IS_ON_LIST(tp)) {
		if (!list_locked) {
			lck_mtx_lock(listp->mtx);
			list_locked = TRUE;
		}

		LIST_INSERT_HEAD(&listp->lhead, te, le);
		tp->t_flags |= TF_TIMER_ONLIST;

		listp->entries++;
		if (listp->entries > listp->maxentries)
			listp->maxentries = listp->entries;

		/* if the list is not scheduled, just schedule it */
		if (!listp->scheduled)
			goto schedule;
	}

	/*
	 * Timer entry is currently on the list, check if the list needs
	 * to be rescheduled.
	 */
	if (need_to_resched_timerlist(te->runtime, mode)) {
		tcp_resched_timerlist++;

		if (!list_locked) {
			lck_mtx_lock(listp->mtx);
			list_locked = TRUE;
		}

		VERIFY_NEXT_LINK(te, le);
		VERIFY_PREV_LINK(te, le);

		if (listp->running) {
			listp->pref_mode |= mode;
			if (listp->pref_offset == 0 ||
			    offset < listp->pref_offset) {
				listp->pref_offset = offset;
			}
		} else {
			/*
			 * The list could have got rescheduled while
			 * this thread was waiting for the lock
			 */
			if (listp->scheduled) {
				int32_t diff;
				diff = timer_diff(listp->runtime, 0,
				    tcp_now, offset);
				if (diff <= 0)
					goto done;
				else
					goto schedule;
			} else {
				goto schedule;
			}
		}
	}
	goto done;

schedule:
	/*
	 * Since a connection with timers is getting scheduled, the timer
	 * list moves from idle to active state and that is why idleruns is
	 * reset
	 */
	if (mode & TCP_TIMERLIST_10MS_MODE) {
		listp->mode = TCP_TIMERLIST_10MS_MODE;
		listp->idleruns = 0;
		offset = min(offset, TCP_TIMER_10MS_QUANTUM);
	} else if (mode & TCP_TIMERLIST_100MS_MODE) {
		if (listp->mode > TCP_TIMERLIST_100MS_MODE)
			listp->mode = TCP_TIMERLIST_100MS_MODE;
		listp->idleruns = 0;
		offset = min(offset, TCP_TIMER_100MS_QUANTUM);
	}
	tcp_sched_timerlist(offset);

done:
	if (list_locked)
		lck_mtx_unlock(listp->mtx);

	return;
}

static inline void
tcp_set_lotimer_index(struct tcpcb *tp)
{
	uint16_t i, lo_index = TCPT_NONE, mode = 0;
	uint32_t lo_timer = 0;
	for (i = 0; i < TCPT_NTIMERS; ++i) {
		if (tp->t_timer[i] != 0) {
			TCP_SET_TIMER_MODE(mode, i);
			if (lo_timer == 0 || tp->t_timer[i] < lo_timer) {
				lo_timer = tp->t_timer[i];
				lo_index = i;
			}
		}
	}
	tp->tentry.index = lo_index;
	tp->tentry.mode = mode;
	VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);

	if (tp->tentry.index != TCPT_NONE) {
		tp->tentry.runtime = tp->tentry.timer_start
		    + tp->t_timer[tp->tentry.index];
		if (tp->tentry.runtime == 0)
			tp->tentry.runtime++;
	}
}

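/*
 * Re-evaluate which of this connection's timers fires next and make sure
 * the global timer list is scheduled soon enough to honor it.  Expects the
 * connection's inpcb lock to be held; connections in TIMEWAIT are skipped
 * because the time-wait queue and tcp_gc() handle their 2MSL expiry.
 */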
void
tcp_check_timer_state(struct tcpcb *tp)
{
	lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

	if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT)
		return;

	tcp_set_lotimer_index(tp);

	tcp_sched_timers(tp);
	return;
}

__private_extern__ void
tcp_report_stats(void)
{
	struct nstat_sysinfo_data data;
	struct sockaddr_in dst;
	struct sockaddr_in6 dst6;
	struct rtentry *rt = NULL;
	u_int64_t var, uptime;

#define	stat	data.u.tcp_stats
	if (((uptime = net_uptime()) - tcp_last_report_time) <
	    TCP_REPORT_STATS_INTERVAL)
		return;

	tcp_last_report_time = uptime;

	bzero(&data, sizeof(data));
	data.flags = NSTAT_SYSINFO_TCP_STATS;

	bzero(&dst, sizeof(dst));
	dst.sin_len = sizeof(dst);
	dst.sin_family = AF_INET;

	/* ipv4 avg rtt */
	lck_mtx_lock(rnh_lock);
	rt = rt_lookup(TRUE, (struct sockaddr *)&dst, NULL,
	    rt_tables[AF_INET], IFSCOPE_NONE);
	lck_mtx_unlock(rnh_lock);
	if (rt != NULL) {
		RT_LOCK(rt);
		if (rt_primary_default(rt, rt_key(rt)) &&
		    rt->rt_stats != NULL) {
			stat.ipv4_avgrtt = rt->rt_stats->nstat_avg_rtt;
		}
		RT_UNLOCK(rt);
		rtfree(rt);
		rt = NULL;
	}

	/* ipv6 avg rtt */
	bzero(&dst6, sizeof(dst6));
	dst6.sin6_len = sizeof(dst6);
	dst6.sin6_family = AF_INET6;

	lck_mtx_lock(rnh_lock);
	rt = rt_lookup(TRUE, (struct sockaddr *)&dst6, NULL,
	    rt_tables[AF_INET6], IFSCOPE_NONE);
	lck_mtx_unlock(rnh_lock);
	if (rt != NULL) {
		RT_LOCK(rt);
		if (rt_primary_default(rt, rt_key(rt)) &&
		    rt->rt_stats != NULL) {
			stat.ipv6_avgrtt = rt->rt_stats->nstat_avg_rtt;
		}
		RT_UNLOCK(rt);
		rtfree(rt);
		rt = NULL;
	}

	/* send packet loss rate, shift by 10 for precision */
	if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_sndrexmitpack > 0) {
		var = tcpstat.tcps_sndrexmitpack << 10;
		stat.send_plr = (var * 100) / tcpstat.tcps_sndpack;
	}

	/* recv packet loss rate, shift by 10 for precision */
	if (tcpstat.tcps_rcvpack > 0 && tcpstat.tcps_recovered_pkts > 0) {
		var = tcpstat.tcps_recovered_pkts << 10;
		stat.recv_plr = (var * 100) / tcpstat.tcps_rcvpack;
	}

	/* RTO after tail loss, shift by 10 for precision */
	if (tcpstat.tcps_sndrexmitpack > 0
	    && tcpstat.tcps_tailloss_rto > 0) {
		var = tcpstat.tcps_tailloss_rto << 10;
		stat.send_tlrto_rate =
			(var * 100) / tcpstat.tcps_sndrexmitpack;
	}

	/* packet reordering */
	if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_reordered_pkts > 0) {
		var = tcpstat.tcps_reordered_pkts << 10;
		stat.send_reorder_rate =
			(var * 100) / tcpstat.tcps_sndpack;
	}

	nstat_sysinfo_send_data(&data);

#undef	stat
}