/*
 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mcache.h>
#include <sys/queue.h>
#include <kern/locks.h>
#include <kern/cpu_number.h>	/* before tcp_seq.h, for tcp_random18() */
#include <mach/boolean.h>

#include <net/route.h>
#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#if INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
#if INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <sys/kdebug.h>
#include <mach/sdt.h>
#include <netinet/mptcp_var.h>

extern void postevent(struct socket *, struct sockbuf *, int);
#define DBG_FNC_TCP_FAST	NETDBG_CODE(DBG_NETTCP, (5 << 8))
#define DBG_FNC_TCP_SLOW	NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1)

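/*
 * Recover the tcpcb that embeds a given timer entry.  This assumes the list
 * linkage (le.le_next) is the first field of the embedded struct
 * tcptimerentry, so the entry's address coincides with that field's address.
 */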
#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))

#define VERIFY_NEXT_LINK(elm,field) do {	\
	if (LIST_NEXT((elm),field) != NULL &&	\
	    LIST_NEXT((elm),field)->field.le_prev !=	\
		&((elm)->field.le_next))	\
		panic("Bad link elm %p next->prev != elm", (elm));	\
} while(0)

#define VERIFY_PREV_LINK(elm,field) do {	\
	if (*(elm)->field.le_prev != (elm))	\
		panic("Bad link elm %p prev->next != elm", (elm));	\
} while(0)

/* tcp timer list */
struct tcptimerlist tcp_timer_list;

/* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
struct tcptailq tcp_tw_tailq;

static int	background_io_trigger = 5;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW | CTLFLAG_LOCKED,
    &background_io_trigger, 0, "Background IO Trigger Setting");

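/*
 * Sysctl handler for timer values: the kernel keeps these timers in
 * TCP_RETRANSHZ ticks, but they are exported to and imported from userland
 * in milliseconds.
 */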
static int
sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error, s, tt;

	tt = *(int *)oidp->oid_arg1;
	s = tt * 1000 / TCP_RETRANSHZ;

	error = sysctl_handle_int(oidp, &s, 0, req);
	if (error || !req->newptr)
		return (error);

	tt = s * TCP_RETRANSHZ / 1000;
	if (tt < 1)
		return (EINVAL);

	*(int *)oidp->oid_arg1 = tt;
	return (0);
}

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");

int	tcp_keepcnt;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_keepcnt, 0, "number of times to repeat keepalive");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

/*
 * Avoid DoS via TCP Robustness in Persist Condition
 * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
 * by allowing a system-wide maximum persistence timeout value when in
 * Zero Window Probe mode.  Expressed in milliseconds to be consistent with
 * the other timeout-related values; the TCP socket option is in seconds.
 */
u_int32_t tcp_max_persist_timeout = 0;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I", "Maximum persistence timeout for ZWP");

static int	always_keepalive = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
    &always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");

/*
 * This parameter determines how long the timer list will stay in fast mode
 * even though all connections are idle.  In fast mode, the timer will fire
 * more frequently, anticipating new data.
 */
int timer_fastmode_idlemax = TCP_FASTMODE_IDLEGEN_MAX;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax, CTLFLAG_RW | CTLFLAG_LOCKED,
	&timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode");

/*
 * See tcp_syn_backoff[] for interval values between SYN retransmits;
 * the value set below defines the number of retransmits, before we
 * disable the timestamp and window scaling options during subsequent
 * SYN retransmits.  Setting it to 0 disables the dropping off of those
 * two options.
 */
static int tcp_broken_peer_syn_rxmit_thres = 7;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before "
    "TCP disables rfc1323 and rfc1644 during the rest of the attempts");

/* A higher threshold on local connections for disabling RFC 1323 options */
static int tcp_broken_peer_syn_rxmit_thres_local = 10;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres_local,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres_local, 0,
	"Number of retransmitted SYNs before disabling RFC 1323 options on local connections");

static int tcp_timer_advanced = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &tcp_timer_advanced, 0, "Number of times one of the timers was advanced");

static int tcp_resched_timerlist = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist, CTLFLAG_RD | CTLFLAG_LOCKED,
    &tcp_resched_timerlist, 0,
    "Number of times timer list was rescheduled as part of processing a packet");

int	tcp_pmtud_black_hole_detect = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_pmtud_black_hole_detect, 0, "Path MTU Discovery Black Hole Detection");

int	tcp_pmtud_black_hole_mss = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS");

/* Set once garbage collection of "used" sockets has been performed */
static boolean_t tcp_gc_done = FALSE;

/* max idle probes */
int	tcp_maxpersistidle;

/*
 * The TCP delayed-ack timer is set to 100 ms.  Since the timer list in fast
 * mode is processed no more often than every 100 ms, the delayed ack timer
 * will fire somewhere between 100 and 200 ms.
 */
int	tcp_delack = TCP_RETRANSHZ / 10;

#if MPTCP
/*
 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
 */
int	tcp_jack_rxmt = TCP_RETRANSHZ / 2;
#endif /* MPTCP */

/*
 * The frequency of running through the TCP timer list in
 * fast and slow mode can be configured.
 */
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_fastquantum, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_timer_list.fast_quantum, TCP_FASTTIMER_QUANTUM,
	"Frequency of running timer list in fast mode");

SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_slowquantum, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_timer_list.slow_quantum, TCP_SLOWTIMER_QUANTUM,
	"Frequency of running timer list in slow mode");

static void tcp_remove_timer(struct tcpcb *tp);
static void tcp_sched_timerlist(uint32_t offset);
static uint32_t tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index);
static void tcp_sched_timers(struct tcpcb *tp);
static inline void tcp_set_lotimer_index(struct tcpcb *);
static void tcp_rexmt_save_state(struct tcpcb *tp);
void tcp_remove_from_time_wait(struct inpcb *inp);

/*
 * Compare two timers.  Since timer values can wrap around, a signed
 * comparison of the difference is used: a value whose sign bit was reset by
 * the wrap is still correctly treated as being ahead of the other.
 */
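/*
 * For example, if the clock has wrapped so that t1 + toff1 == 5 while
 * t2 + toff2 == 0xfffffff0, the unsigned subtraction wraps and the signed
 * result is +21, so the first timer correctly compares as 21 ticks later.
 */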
static inline int32_t
timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
	return (int32_t)((t1 + toff1) - (t2 + toff2));
}

/* Returns true if the timer is on the timer list */
#define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)

/* Run the TCP timerlist at least once every hour */
#define	TCP_TIMERLIST_MAX_OFFSET	(60 * 60 * TCP_RETRANSHZ)

static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
void	add_to_time_wait(struct tcpcb *tp, uint32_t delay);

static boolean_t tcp_garbage_collect(struct inpcb *, int);

/*
 * Add to tcp timewait list, delay is given in milliseconds.
 */
static void
add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
{
	struct inpcbinfo *pcbinfo = &tcbinfo;
	struct inpcb *inp = tp->t_inpcb;
	uint32_t timer;

	/* pcb list should be locked when we get here */
	lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);

	/* We may get here multiple times, so check */
	if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
		pcbinfo->ipi_twcount++;
		inp->inp_flags2 |= INP2_TIMEWAIT;

		/* Remove from global inp list */
		LIST_REMOVE(inp, inp_list);
	} else {
		TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
	}

	/* Compute the time at which this socket can be closed */
	timer = tcp_now + delay;

	/* We will use the TCPT_2MSL timer for tracking this delay */

	if (TIMER_IS_ON_LIST(tp))
		tcp_remove_timer(tp);
	tp->t_timer[TCPT_2MSL] = timer;

	TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
}

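/*
 * Unlocked wrapper around add_to_time_wait_locked().  The caller holds the
 * socket lock; if the pcbinfo lock cannot be taken immediately, the socket
 * lock is dropped and reacquired around it to avoid a lock-order reversal.
 */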
void
add_to_time_wait(struct tcpcb *tp, uint32_t delay)
{
	struct inpcbinfo *pcbinfo = &tcbinfo;

	if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
		tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
		lck_rw_lock_exclusive(pcbinfo->ipi_lock);
		tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
	}
	add_to_time_wait_locked(tp, delay);
	lck_rw_done(pcbinfo->ipi_lock);

	inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
}

/* If this is on time wait queue, remove it. */
void
tcp_remove_from_time_wait(struct inpcb *inp)
{
	struct tcpcb *tp = intotcpcb(inp);
	if (inp->inp_flags2 & INP2_TIMEWAIT)
		TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
}

static boolean_t
tcp_garbage_collect(struct inpcb *inp, int istimewait)
{
	boolean_t active = FALSE;
	struct socket *so;
	struct tcpcb *tp;

	so = inp->inp_socket;
	tp = intotcpcb(inp);

	/*
	 * Skip if still in use or busy; it would have been more efficient
	 * if we were to test so_usecount against 0, but this isn't possible
	 * due to the current implementation of tcp_dropdropablreq() where
	 * overflow sockets that are eligible for garbage collection have
	 * their usecounts set to 1.
	 */
	if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx))
		return (TRUE);

	/* Check again under the lock */
	if (so->so_usecount > 1) {
		if (inp->inp_wantcnt == WNT_STOPUSING)
			active = TRUE;
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (active);
	}

	if (istimewait &&
	    TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
	    tp->t_state != TCPS_CLOSED) {
		/* Become a regular mutex */
		lck_mtx_convert_spin(&inp->inpcb_mtx);
		tcp_close(tp);
	}

	/*
	 * Overflowed socket dropped from the listening queue?  Do this
	 * only if we are called to clean up the time wait slots, since
	 * tcp_dropdropablreq() considers a socket to have been fully
	 * dropped after add_to_time_wait() is finished.
	 * Also handle the case of connections getting closed by the peer
	 * while in the queue, as seen with rdar://6422317
	 */
	if (so->so_usecount == 1 &&
	    ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
	    ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
	    (so->so_head != NULL) &&
	    ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
	    (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {

		if (inp->inp_state != INPCB_STATE_DEAD) {
			/* Become a regular mutex */
			lck_mtx_convert_spin(&inp->inpcb_mtx);
#if INET6
			if (SOCK_CHECK_DOM(so, PF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
				in_pcbdetach(inp);
		}
		so->so_usecount--;
		if (inp->inp_wantcnt == WNT_STOPUSING)
			active = TRUE;
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (active);
	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
		lck_mtx_unlock(&inp->inpcb_mtx);
		return (FALSE);
	}

	/*
	 * We get here because the PCB is no longer searchable
	 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead
	 * (usecount is 0).  This covers all cases, including overflow
	 * sockets and those that are considered as "embryonic",
	 * i.e. created by sonewconn() in TCP input path, and have
	 * not yet been committed.  For the former, we reduce the usecount
	 * to 0 as done by the code above.  For the latter, the usecount
	 * would have reduced to 0 as part of calling soabort() when the
	 * socket is dropped at the end of tcp_input().
	 */
	if (so->so_usecount == 0) {
		DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
			struct tcpcb *, tp, int32_t, TCPS_CLOSED);
		/* Become a regular mutex */
		lck_mtx_convert_spin(&inp->inpcb_mtx);

		/*
		 * If this tp still happens to be on the timer list,
		 * take it out
		 */
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}

		if (inp->inp_state != INPCB_STATE_DEAD) {
#if INET6
			if (SOCK_CHECK_DOM(so, PF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
				in_pcbdetach(inp);
		}
		in_pcbdispose(inp);
		return (FALSE);
	}

	lck_mtx_unlock(&inp->inpcb_mtx);
	return (TRUE);
}

/*
 * TCP garbage collector callback (inpcb_timer_func_t).
 *
 * Returns the number of pcbs that will need to be gc-ed soon,
 * returning > 0 will keep the timer active.
 */
void
tcp_gc(struct inpcbinfo *ipi)
{
	struct inpcb *inp, *nxt;
	struct tcpcb *tw_tp, *tw_ntp;
#if TCPDEBUG
	int ostate;
#endif
#if KDEBUG
	static int tws_checked = 0;
#endif

	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Update tcp_now here as it may get used while
	 * processing the slow timer.
	 */
	calculate_tcp_clock();

	/*
	 * Garbage collect socket/tcpcb: We need to acquire the list lock
	 * exclusively to do this
	 */
	if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
		/* don't sweat it this time; cleanup was done last time */
		if (tcp_gc_done == TRUE) {
			tcp_gc_done = FALSE;
			KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
			    tws_checked, cur_tw_slot, 0, 0, 0);
			/* Lock acquisition failed, give up this round */
			atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
			return;
		}
		/* Try-lock failed; now take the lock exclusively (blocking) */
		lck_rw_lock_exclusive(ipi->ipi_lock);
	}
	tcp_gc_done = TRUE;

	LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
		if (tcp_garbage_collect(inp, 0))
			atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
	}

	/* Now cleanup the time wait ones */
	TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
		/*
		 * We check the timestamp here without holding the
		 * socket lock for better performance.  If there are
		 * any pcbs in time-wait, the timer will get rescheduled.
		 * Hence some error in this check can be tolerated.
		 *
		 * Sometimes a socket on time-wait queue can be closed if
		 * 2MSL timer expired but the application still has a
		 * usecount on it.
		 */
		if (tw_tp->t_state == TCPS_CLOSED ||
		    TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
			if (tcp_garbage_collect(tw_tp->t_inpcb, 1))
				atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
		}
	}

	/* take into account pcbs that are still in time_wait_slots */
	atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);

	lck_rw_done(ipi->ipi_lock);

	/* Clean up the socache while we are here */
	if (so_cache_timer())
		atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);

	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
	    cur_tw_slot, 0, 0, 0);

	return;
}

/*
 * Cancel all timers for TCP tp.
 */
void
tcp_canceltimers(struct tcpcb *tp)
{
	int i;

	tcp_remove_timer(tp);
	for (i = 0; i < TCPT_NTIMERS; i++)
		tp->t_timer[i] = 0;
	tp->tentry.timer_start = tcp_now;
	tp->tentry.index = TCPT_NONE;
}

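/*
 * Retransmission backoff multipliers, indexed by t_rxtshift.  SYN
 * retransmissions (tcp_syn_backoff) back off more gently at first than
 * retransmissions on established connections (tcp_backoff).
 */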
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

static int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */

static void
tcp_rexmt_save_state(struct tcpcb *tp)
{
	u_int32_t fsize;
	if (TSTMP_SUPPORTED(tp)) {
		/*
		 * Since timestamps are supported on the connection,
		 * we can do recovery as described in rfc 4015.
		 */
		fsize = tp->snd_max - tp->snd_una;
		tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
		tp->snd_recover_prev = tp->snd_recover;
	} else {
		/*
		 * Timestamp option is not supported on this connection.
		 * Record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
	}
	tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
	tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
	tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
}

/*
 * TCP timer processing.
 */
struct tcpcb *
tcp_timers(struct tcpcb *tp, int timer)
{
	int rexmt;
	struct socket *so;
	struct tcptemp *t_template;
	int optlen = 0;
	int idle_time = 0;

#if TCPDEBUG
	int ostate;
#endif

#if INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
#endif /* INET6 */

	so = tp->t_inpcb->inp_socket;
	idle_time = tcp_now - tp->t_rcvtime;

	switch (timer) {

	/*
	 * 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for peer to close and connection has been idle
	 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
	 * delete connection control block.
	 * Otherwise (this case shouldn't happen), check again in a bit;
	 * we keep the socket in the main list in that case.
	 */
	case TCPT_2MSL:
		tcp_free_sackholes(tp);
		if (tp->t_state != TCPS_TIME_WAIT &&
		    tp->t_state != TCPS_FIN_WAIT_2 &&
		    ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
			tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
				(u_int32_t)TCP_CONN_KEEPINTVL(tp));
		} else {
			tp = tcp_close(tp);
			return (tp);
		}
		break;

	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	case TCPT_REXMT:
		/*
		 * Drop a connection in the retransmit timer
		 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT times
		 * 2. If the time spent in this retransmission episode is more than
		 *    the time limit set with TCP_RXT_CONNDROPTIME socket option
		 * 3. If TCP_RXT_FINDROP socket option was set and we have already
		 *    retransmitted the FIN 3 times without receiving an ack
		 */
		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
			(tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 &&
			(tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
			((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
			(tp->t_flags & TF_SENTFIN) != 0 &&
			tp->t_rxtshift >= 4)) {

			if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
				tcpstat.tcps_rxtfindrop++;
			} else {
				tcpstat.tcps_timeoutdrop++;
			}
			tp->t_rxtshift = TCP_MAXRXTSHIFT;
			postevent(so, 0, EV_TIMEOUT);
			soevent(so,
			    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
			tp = tcp_drop(tp, tp->t_softerror ?
			    tp->t_softerror : ETIMEDOUT);

			break;
		}

		tcpstat.tcps_rexmttimeo++;

		if (tp->t_rxtshift == 1 &&
			tp->t_state == TCPS_ESTABLISHED) {
			/* Set the time at which retransmission started. */
			tp->t_rxtstart = tcp_now;

			/*
			 * if this is the first retransmit timeout, save
			 * the state so that we can recover if the timeout
			 * is spurious.
			 */
			tcp_rexmt_save_state(tp);
		}
#if MPTCP
		if ((tp->t_rxtshift == mptcp_fail_thresh) &&
		    (tp->t_state == TCPS_ESTABLISHED) &&
		    (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
			mptcp_act_on_txfail(so);
		}
#endif /* MPTCP */

		if (tp->t_adaptive_wtimo > 0 &&
			tp->t_rxtshift > tp->t_adaptive_wtimo &&
			TCPS_HAVEESTABLISHED(tp->t_state)) {
			/* Send an event to the application */
			soevent(so,
				(SO_FILT_HINT_LOCKED|
				SO_FILT_HINT_ADAPTIVE_WTIMO));
		}

		if (tp->t_state == TCPS_SYN_SENT) {
			rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
			tp->t_stat.synrxtshift = tp->t_rxtshift;
		} else
			rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
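		/*
		 * Clamp the backed-off retransmit timeout to the
		 * [t_rttmin, TCPTV_REXMTMAX] range; the slop from
		 * TCP_ADD_REXMTSLOP() is also factored in.
		 */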
		TCPT_RANGESET(tp->t_rxtcur, rexmt,
			tp->t_rttmin, TCPTV_REXMTMAX,
			TCP_ADD_REXMTSLOP(tp));
		tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);

		if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
			goto fc_output;

		tcp_free_sackholes(tp);
		/*
		 * Check for potential Path MTU Discovery Black Hole
		 */
		if (tcp_pmtud_black_hole_detect && (tp->t_state == TCPS_ESTABLISHED)) {
			if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) &&
				 (tp->t_rxtshift == 2)) {
				/*
				 * Enter Path MTU Black-hole Detection mechanism:
				 * - Disable Path MTU Discovery (IP "DF" bit).
				 * - Reduce MTU to a lower value than what we negotiated with the peer.
				 */
				/* Disable Path MTU Discovery for now */
				tp->t_flags &= ~TF_PMTUD;
				/* Record that we may have found a black hole */
				tp->t_flags |= TF_BLACKHOLE;
				optlen = tp->t_maxopd - tp->t_maxseg;
				/* Keep track of previous MSS */
				tp->t_pmtud_saved_maxopd = tp->t_maxopd;
				/* Reduce the MSS to intermediary value */
				if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
					tp->t_maxopd = tcp_pmtud_black_hole_mss;
				} else {
					tp->t_maxopd =	/* use the default MSS */
#if INET6
						isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
							tcp_mssdflt;
				}
				tp->t_maxseg = tp->t_maxopd - optlen;

				/*
				 * Reset the slow-start flight size
				 * as it may depend on the new MSS
				 */
				if (CC_ALGO(tp)->cwnd_init != NULL)
					CC_ALGO(tp)->cwnd_init(tp);
			}
			/*
			 * If further retransmissions are still unsuccessful with a lowered MTU,
			 * maybe this isn't a Black Hole; restore the previous MSS and
			 * blackhole detection flags.
			 */
			else {
				if ((tp->t_flags & TF_BLACKHOLE) && (tp->t_rxtshift > 4)) {
					tp->t_flags |= TF_PMTUD;
					tp->t_flags &= ~TF_BLACKHOLE;
					optlen = tp->t_maxopd - tp->t_maxseg;
					tp->t_maxopd = tp->t_pmtud_saved_maxopd;
					tp->t_maxseg = tp->t_maxopd - optlen;
					/*
					 * Reset the slow-start flight size as it
					 * may depend on the new MSS
					 */
					if (CC_ALGO(tp)->cwnd_init != NULL)
						CC_ALGO(tp)->cwnd_init(tp);
				}
			}
		}

		/*
		 * Disable rfc1323 and rfc1644 if we haven't got any response to
		 * our SYN (after we reach the threshold) to work around some
		 * broken terminal servers (most of which have hopefully been
		 * retired) that have bad VJ header compression code which
		 * trashes TCP segments containing unknown-to-them TCP options.
		 * Do this only on non-local connections.
		 */
		if (tp->t_state == TCPS_SYN_SENT &&
		    ((!(tp->t_flags & TF_LOCAL) &&
		    tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres) ||
		    ((tp->t_flags & TF_LOCAL) &&
		    tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres_local)))
			tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);

		/*
		 * If losing, let the lower level know and try for
		 * a better route.  Also, if we backed off this far,
		 * our srtt estimate is probably bogus.  Clobber it
		 * so we'll take the next rtt measurement as our srtt;
		 * move the current srtt into rttvar to keep the current
		 * retransmit times until then.
		 */
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#if INET6
			if (isipv6)
				in6_losing(tp->t_inpcb);
			else
#endif /* INET6 */
			in_losing(tp->t_inpcb);
			tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
			tp->t_srtt = 0;
		}
		tp->snd_nxt = tp->snd_una;
		/*
		 * Note:  We overload snd_recover to function also as the
		 * snd_last variable described in RFC 2582
		 */
		tp->snd_recover = tp->snd_max;
		/*
		 * Force a segment to be sent.
		 */
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If timing a segment in this window, stop the timer.
		 */
		tp->t_rtttime = 0;

		EXIT_FASTRECOVERY(tp);

		/*
		 * RFC 5681 says: when a TCP sender detects segment loss
		 * using the retransmit timer and the given segment has already
		 * been retransmitted by way of the retransmission timer at
		 * least once, the value of ssthresh is held constant
		 */
		if (tp->t_rxtshift == 1 &&
			CC_ALGO(tp)->after_timeout != NULL)
			CC_ALGO(tp)->after_timeout(tp);

		/*
		 * CWR notifications are to be sent on new data right after
		 * RTOs, Fast Retransmits and ECE notification receipts.
		 */
		if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) {
			tp->ecn_flags |= TE_SENDCWR;
		}
fc_output:
		DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
			struct tcpcb *, tp, struct tcphdr *, NULL,
			int32_t, TCP_CC_REXMT_TIMEOUT);

		(void) tcp_output(tp);
		break;

	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	case TCPT_PERSIST:
		tcpstat.tcps_persisttimeo++;
		/*
		 * Hack: if the peer is dead/unreachable, we do not
		 * time out if the window is closed.  After a full
		 * backoff, drop the connection if the idle time
		 * (no responses to probes) reaches the maximum
		 * backoff that we would use if retransmitting.
		 *
		 * Drop the connection if we reached the maximum allowed time for
		 * Zero Window Probes without a non-zero update from the peer.
		 * See rdar://5805356
		 */
		if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
		    (idle_time >= tcp_maxpersistidle ||
		    idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
		    ((tp->t_persist_stop != 0) &&
			TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
			tcpstat.tcps_persistdrop++;
			postevent(so, 0, EV_TIMEOUT);
			soevent(so,
			    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
			tp = tcp_drop(tp, ETIMEDOUT);
			break;
		}
		tcp_setpersist(tp);
		tp->t_force = 1;
		(void) tcp_output(tp);
		tp->t_force = 0;
		break;

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	case TCPT_KEEP:
		tcpstat.tcps_keeptimeo++;
#if MPTCP
		/*
		 * Regular TCP connections do not send keepalives after closing;
		 * MPTCP must not either, after sending Data FINs.
		 */
		struct mptcb *mp_tp = tp->t_mptcb;
		if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
		    (mp_tp == NULL)) {
			goto dropit;
		} else if (mp_tp != NULL) {
			if ((mptcp_ok_to_keepalive(mp_tp) == 0))
				goto dropit;
		}
#endif /* MPTCP */
		if (tp->t_state < TCPS_ESTABLISHED)
			goto dropit;
		if ((always_keepalive ||
		    (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
		    (tp->t_flagsext & TF_DETECT_READSTALL)) &&
		    (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
			if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
				goto dropit;
			/*
			 * Send a packet designed to force a response
			 * if the peer is up and reachable:
			 * either an ACK if the connection is still alive,
			 * or an RST if the peer has closed the connection
			 * due to timeout or reboot.
			 * Using sequence number tp->snd_una-1
			 * causes the transmitted zero-length segment
			 * to lie outside the receive window;
			 * by the protocol spec, this requires the
			 * correspondent TCP to respond.
			 */
			tcpstat.tcps_keepprobe++;
			t_template = tcp_maketemplate(tp);
			if (t_template) {
				unsigned int ifscope, nocell = 0;

				if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
					ifscope = tp->t_inpcb->inp_boundifp->if_index;
				else
					ifscope = IFSCOPE_NONE;

				/*
				 * If the socket isn't allowed to use the
				 * cellular interface, indicate it as such.
				 */
				if (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR)
					nocell = 1;

				tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0, ifscope,
				    nocell);
				(void) m_free(dtom(t_template));
				if (tp->t_flagsext & TF_DETECT_READSTALL)
					tp->t_rtimo_probes++;
			}
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
				TCP_CONN_KEEPINTVL(tp));
		} else {
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
				TCP_CONN_KEEPIDLE(tp));
		}
		if (tp->t_flagsext & TF_DETECT_READSTALL) {
			/*
			 * The keep alive packets sent to detect a read
			 * stall did not get a response from the
			 * peer.  Generate more keep-alives to confirm this.
			 * If the number of probes sent reaches the limit,
			 * generate an event.
			 */
			if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
				/* Generate an event */
				soevent(so,
					(SO_FILT_HINT_LOCKED|
					SO_FILT_HINT_ADAPTIVE_RTIMO));
				tcp_keepalive_reset(tp);
			} else {
				tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
					tp, TCP_REXMTVAL(tp));
			}
		}
		break;
	case TCPT_DELACK:
		if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
			tp->t_flags &= ~TF_DELACK;
			tp->t_timer[TCPT_DELACK] = 0;
			tp->t_flags |= TF_ACKNOW;

			/*
			 * If delayed ack timer fired while stretching acks
			 * go back to acking every other packet
			 */
			if ((tp->t_flags & TF_STRETCHACK) != 0)
				tcp_reset_stretch_ack(tp);

			/*
			 * If we are measuring inter packet arrival jitter for
			 * throttling a connection, this delayed ack might be
			 * the reason for accumulating some jitter.  So let's
			 * restart the measurement.
			 */
			CLEAR_IAJ_STATE(tp);

			tcpstat.tcps_delack++;
			(void) tcp_output(tp);
		}
		break;

#if MPTCP
	case TCPT_JACK_RXMT:
		if ((tp->t_state == TCPS_ESTABLISHED) &&
		    (tp->t_mpflags & TMPF_PREESTABLISHED) &&
		    (tp->t_mpflags & TMPF_JOINED_FLOW)) {
			if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
				tcpstat.tcps_timeoutdrop++;
				postevent(so, 0, EV_TIMEOUT);
				soevent(so,
				    (SO_FILT_HINT_LOCKED|
				    SO_FILT_HINT_TIMEOUT));
				tp = tcp_drop(tp, tp->t_softerror ?
				    tp->t_softerror : ETIMEDOUT);
				break;
			}
			tcpstat.tcps_join_rxmts++;
			tp->t_flags |= TF_ACKNOW;

			/*
			 * No backoff is implemented for simplicity for this
			 * corner case.
			 */
			(void) tcp_output(tp);
		}
		break;
#endif /* MPTCP */

#if TCPDEBUG
	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	dropit:
		tcpstat.tcps_keepdrops++;
		postevent(so, 0, EV_TIMEOUT);
		soevent(so,
		    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
		tp = tcp_drop(tp, ETIMEDOUT);
		break;
	}
	return (tp);
}

/* Remove a timer entry from timer list */
void
tcp_remove_timer(struct tcpcb *tp)
{
	struct tcptimerlist *listp = &tcp_timer_list;

	lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
	if (!(TIMER_IS_ON_LIST(tp))) {
		return;
	}
	lck_mtx_lock(listp->mtx);

	/* Check if pcb is on timer list again after acquiring the lock */
	if (!(TIMER_IS_ON_LIST(tp))) {
		lck_mtx_unlock(listp->mtx);
		return;
	}

	if (listp->next_te != NULL && listp->next_te == &tp->tentry)
		listp->next_te = LIST_NEXT(&tp->tentry, le);

	LIST_REMOVE(&tp->tentry, le);
	tp->t_flags &= ~(TF_TIMER_ONLIST);

	listp->entries--;

	tp->tentry.le.le_next = NULL;
	tp->tentry.le.le_prev = NULL;
	lck_mtx_unlock(listp->mtx);
}

/*
 * Function to check if the timerlist needs to be rescheduled to run
 * the timer entry correctly.  Basically, this is to check if we can avoid
 * taking the list lock.
 */
static boolean_t
need_to_resched_timerlist(uint32_t runtime, uint16_t index) {
	struct tcptimerlist *listp = &tcp_timer_list;
	int32_t diff;
	boolean_t is_fast;

	if (index == TCPT_NONE)
		return FALSE;
	is_fast = !(IS_TIMER_SLOW(index));

	/*
	 * If the list is being processed then the state of the list is in flux.
	 * In this case always acquire the lock and set the state correctly.
	 */
	if (listp->running)
		return TRUE;

	if (!listp->scheduled)
		return (TRUE);

	diff = timer_diff(listp->runtime, 0, runtime, 0);
	if (diff <= 0) {
		/* The list is going to run before this timer */
		return FALSE;
	} else {
		if (is_fast) {
			if (diff <= listp->fast_quantum)
				return FALSE;
		} else {
			if (diff <= listp->slow_quantum)
				return FALSE;
		}
	}
	return TRUE;
}

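/*
 * Schedule the timer list to run 'offset' ticks (TCP_RETRANSHZ units) from
 * now, capped at TCP_TIMERLIST_MAX_OFFSET.
 */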
void
tcp_sched_timerlist(uint32_t offset)
{
	uint64_t deadline = 0;
	struct tcptimerlist *listp = &tcp_timer_list;

	lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);

	offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
	listp->runtime = tcp_now + offset;
	if (listp->runtime == 0)
		listp->runtime++;

	clock_interval_to_deadline(offset, NSEC_PER_SEC / TCP_RETRANSHZ,
		&deadline);

	thread_call_enter_delayed(listp->call, deadline);
	listp->scheduled = TRUE;
}

/*
 * Function to run the timers for a connection.
 *
 * Returns the offset of the next timer to be run for this connection which
 * can be used to reschedule the timerlist.
 */
uint32_t
tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {
	struct socket *so;
	uint16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
	uint32_t timer_val, offset = 0, lo_timer = 0;
	int32_t diff;
	boolean_t needtorun[TCPT_NTIMERS];
	int count = 0;

	VERIFY(tp != NULL);
	bzero(needtorun, sizeof(needtorun));

	tcp_lock(tp->t_inpcb->inp_socket, 1, 0);

	so = tp->t_inpcb->inp_socket;
	/* Release the want count on inp */
	if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1) == WNT_STOPUSING) {
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}

		/*
		 * Looks like the TCP connection got closed while we
		 * were waiting for the lock.  Done.
		 */
		goto done;
	}

	/*
	 * Since the timer thread needs to wait for the tcp lock, it may race
	 * with another thread that can cancel or reschedule the timer that is
	 * about to run.  Check if we need to run anything.
	 */
	if ((index = tp->tentry.index) == TCPT_NONE)
		goto done;
	timer_val = tp->t_timer[index];

	diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
	if (diff > 0) {
		if (tp->tentry.index != TCPT_NONE) {
			offset = diff;
			*(next_index) = tp->tentry.index;
		}
		goto done;
	}

	tp->t_timer[index] = 0;
	if (timer_val > 0) {
		tp = tcp_timers(tp, index);
		if (tp == NULL)
			goto done;
	}

	/*
	 * Check if there are any other timers that need to be run.  While
	 * doing it, adjust the timer values with respect to tcp_now.
	 */
	for (i = 0; i < TCPT_NTIMERS; ++i) {
		if (tp->t_timer[i] != 0) {
			diff = timer_diff(tp->tentry.timer_start, tp->t_timer[i], tcp_now, 0);
			if (diff <= 0) {
				tp->t_timer[i] = 0;
				needtorun[i] = TRUE;
				count++;
			} else {
				tp->t_timer[i] = diff;
				needtorun[i] = FALSE;
				if (lo_timer == 0 || diff < lo_timer) {
					lo_timer = diff;
					lo_index = i;
				}
			}
		}
	}

	tp->tentry.timer_start = tcp_now;
	tp->tentry.index = lo_index;
	if (lo_index != TCPT_NONE) {
		tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
		if (tp->tentry.runtime == 0)
			tp->tentry.runtime++;
	}

	if (count > 0) {
		/* run any other timers that are also outstanding at this time. */
		for (i = 0; i < TCPT_NTIMERS; ++i) {
			if (needtorun[i]) {
				tp->t_timer[i] = 0;
				tp = tcp_timers(tp, i);
				if (tp == NULL) {
					offset = 0;
					*(next_index) = TCPT_NONE;
					goto done;
				}
			}
		}
		tcp_set_lotimer_index(tp);
	}

	if (tp->tentry.index < TCPT_NONE) {
		offset = tp->t_timer[tp->tentry.index];
		*(next_index) = tp->tentry.index;
	}

done:
	if (tp != NULL && tp->tentry.index == TCPT_NONE) {
		tcp_remove_timer(tp);
		offset = 0;
	}
	tcp_unlock(so, 1, 0);
	return offset;
}

void
tcp_run_timerlist(void * arg1, void * arg2) {
#pragma unused(arg1, arg2)
	struct tcptimerentry *te, *next_te;
	struct tcptimerlist *listp = &tcp_timer_list;
	struct tcpcb *tp;
	uint32_t next_timer = 0;
	uint16_t index = TCPT_NONE;
	boolean_t need_fast = FALSE;
	uint32_t active_count = 0;
	uint32_t mode = TCP_TIMERLIST_FASTMODE;

	calculate_tcp_clock();

	lck_mtx_lock(listp->mtx);

	listp->running = TRUE;

	LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
		uint32_t offset = 0;
		uint32_t runtime = te->runtime;
		if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
			offset = timer_diff(runtime, 0, tcp_now, 0);
			if (next_timer == 0 || offset < next_timer) {
				next_timer = offset;
			}
			continue;
		}
		active_count++;

		tp = TIMERENTRY_TO_TP(te);

		/*
		 * Acquire an inp wantcnt on the inpcb so that the socket won't get
		 * detached even if tcp_close is called
		 */
		if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
			/*
			 * Somehow this pcb went into dead state while on the timer list;
			 * just take it off the list.  Since the timer list entry pointers
			 * are protected by the timer list lock, we can do it here.
			 */
			if (TIMER_IS_ON_LIST(tp)) {
				tp->t_flags &= ~(TF_TIMER_ONLIST);
				LIST_REMOVE(&tp->tentry, le);
				listp->entries--;

				tp->tentry.le.le_next = NULL;
				tp->tentry.le.le_prev = NULL;
			}
			continue;
		}

		/*
		 * Store the next timerentry pointer before releasing the list lock.
		 * If that entry has to be removed when we release the lock, this
		 * pointer will be updated to the element after that.
		 */
		listp->next_te = next_te;

		VERIFY_NEXT_LINK(&tp->tentry, le);
		VERIFY_PREV_LINK(&tp->tentry, le);

		lck_mtx_unlock(listp->mtx);

		index = TCPT_NONE;
		offset = tcp_run_conn_timer(tp, &index);

		lck_mtx_lock(listp->mtx);

		next_te = listp->next_te;
		listp->next_te = NULL;

		if (offset > 0) {
			if (index < TCPT_NONE) {
				/* Check if this is a fast_timer. */
				if (!need_fast && !(IS_TIMER_SLOW(index))) {
					need_fast = TRUE;
				}

				if (next_timer == 0 || offset < next_timer) {
					next_timer = offset;
				}
			}
		}
	}

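	/*
	 * Decide whether the list should remain in fast mode: if no connection
	 * was active and no fast timer was needed for more than
	 * timer_fastmode_idlemax consecutive runs, drop back to slow mode.
	 */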
	if (!LIST_EMPTY(&listp->lhead)) {
		if (listp->mode == TCP_TIMERLIST_FASTMODE) {
			if (need_fast || active_count > 0 ||
				listp->pref_mode == TCP_TIMERLIST_FASTMODE) {
				listp->idlegen = 0;
			} else {
				listp->idlegen++;
				if (listp->idlegen > timer_fastmode_idlemax) {
					mode = TCP_TIMERLIST_SLOWMODE;
					listp->idlegen = 0;
				}
			}
		} else {
			if (!need_fast) {
				mode = TCP_TIMERLIST_SLOWMODE;
			}
		}

		if (mode == TCP_TIMERLIST_FASTMODE ||
			listp->pref_mode == TCP_TIMERLIST_FASTMODE) {
			next_timer = listp->fast_quantum;
		} else {
			if (listp->pref_offset != 0 &&
				listp->pref_offset < next_timer)
				next_timer = listp->pref_offset;
			if (next_timer < listp->slow_quantum)
				next_timer = listp->slow_quantum;
		}

		listp->mode = mode;

		tcp_sched_timerlist(next_timer);
	} else {
		/*
		 * No need to reschedule this timer, but always run
		 * periodically, at a much coarser granularity.
		 */
		tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
	}

	listp->running = FALSE;
	listp->pref_mode = 0;
	listp->pref_offset = 0;

	lck_mtx_unlock(listp->mtx);
}

/* Function to verify if a change in timer state is required for a connection */
void
tcp_sched_timers(struct tcpcb *tp)
{
	struct tcptimerentry *te = &tp->tentry;
	uint16_t index = te->index;
	struct tcptimerlist *listp = &tcp_timer_list;
	int32_t offset = 0;
	boolean_t is_fast;
	int list_locked = 0;

	if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
		/* Just return without adding the dead pcb to the list */
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}
		return;
	}

	if (index == TCPT_NONE) {
		tcp_remove_timer(tp);
		return;
	}

	is_fast = !(IS_TIMER_SLOW(index));
	offset = timer_diff(te->runtime, 0, tcp_now, 0);
	if (offset <= 0) {
		offset = 1;
		tcp_timer_advanced++;
	}
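	/*
	 * Fast timers are always scheduled one fast quantum out; the list runs
	 * at least that often while it is in fast mode.
	 */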
	if (is_fast)
		offset = listp->fast_quantum;

	if (!TIMER_IS_ON_LIST(tp)) {
		if (!list_locked) {
			lck_mtx_lock(listp->mtx);
			list_locked = 1;
		}

		LIST_INSERT_HEAD(&listp->lhead, te, le);
		tp->t_flags |= TF_TIMER_ONLIST;

		listp->entries++;
		if (listp->entries > listp->maxentries)
			listp->maxentries = listp->entries;

		/* if the list is not scheduled, just schedule it */
		if (!listp->scheduled)
			goto schedule;
	}

	/* timer entry is currently on the list */
	if (need_to_resched_timerlist(te->runtime, index)) {
		tcp_resched_timerlist++;

		if (!list_locked) {
			lck_mtx_lock(listp->mtx);
			list_locked = 1;
		}

		VERIFY_NEXT_LINK(te, le);
		VERIFY_PREV_LINK(te, le);

		if (listp->running) {
			if (is_fast) {
				listp->pref_mode = TCP_TIMERLIST_FASTMODE;
			} else if (listp->pref_offset == 0 ||
				offset < listp->pref_offset) {
				listp->pref_offset = offset;
			}
		} else {
			/*
			 * The list could have been scheduled while this
			 * thread was waiting for the lock
			 */
			if (listp->scheduled) {
				int32_t diff;
				diff = timer_diff(listp->runtime, 0,
				    tcp_now, offset);
				if (diff <= 0)
					goto done;
				else
					goto schedule;
			} else {
				goto schedule;
			}
		}
	}
	goto done;

schedule:
	if (is_fast) {
		listp->mode = TCP_TIMERLIST_FASTMODE;
		listp->idlegen = 0;
	}
	tcp_sched_timerlist(offset);

done:
	if (list_locked)
		lck_mtx_unlock(listp->mtx);

	return;
}

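/*
 * Record the soonest-firing pending timer for this connection in its timer
 * entry.  A runtime of 0 is avoided since a value of 0 is treated as
 * "not set" elsewhere.
 */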
void
tcp_set_lotimer_index(struct tcpcb *tp) {
	uint16_t i, lo_index = TCPT_NONE;
	uint32_t lo_timer = 0;
	for (i = 0; i < TCPT_NTIMERS; ++i) {
		if (tp->t_timer[i] != 0 &&
			(lo_timer == 0 || tp->t_timer[i] < lo_timer)) {
			lo_timer = tp->t_timer[i];
			lo_index = i;
		}
	}
	tp->tentry.index = lo_index;
	if (lo_index != TCPT_NONE) {
		tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
		if (tp->tentry.runtime == 0)
			tp->tentry.runtime++;
	}
}

void
tcp_check_timer_state(struct tcpcb *tp) {
	lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

	if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT)
		return;

	tcp_set_lotimer_index(tp);

	tcp_sched_timers(tp);
	return;
}