1/* 2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $ 62 */ 63 64 65#include <sys/param.h> 66#include <sys/systm.h> 67#include <sys/kernel.h> 68#include <sys/mbuf.h> 69#include <sys/sysctl.h> 70#include <sys/socket.h> 71#include <sys/socketvar.h> 72#include <sys/protosw.h> 73#include <sys/domain.h> 74#include <sys/mcache.h> 75#include <sys/queue.h> 76#include <kern/locks.h> 77#include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */ 78#include <mach/boolean.h> 79 80#include <net/route.h> 81#include <net/if_var.h> 82 83#include <netinet/in.h> 84#include <netinet/in_systm.h> 85#include <netinet/in_pcb.h> 86#if INET6 87#include <netinet6/in6_pcb.h> 88#endif 89#include <netinet/ip_var.h> 90#include <netinet/tcp.h> 91#include <netinet/tcp_fsm.h> 92#include <netinet/tcp_seq.h> 93#include <netinet/tcp_timer.h> 94#include <netinet/tcp_var.h> 95#include <netinet/tcp_cc.h> 96#if INET6 97#include <netinet6/tcp6_var.h> 98#endif 99#include <netinet/tcpip.h> 100#if TCPDEBUG 101#include <netinet/tcp_debug.h> 102#endif 103#include <sys/kdebug.h> 104#include <mach/sdt.h> 105#include <netinet/mptcp_var.h> 106 107extern void postevent(struct socket *, struct sockbuf *, 108 int); 109#define DBG_FNC_TCP_FAST NETDBG_CODE(DBG_NETTCP, (5 << 8)) 110#define DBG_FNC_TCP_SLOW NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1) 111 112#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next))) 113 114#define VERIFY_NEXT_LINK(elm,field) do { \ 115 if (LIST_NEXT((elm),field) != NULL && \ 116 LIST_NEXT((elm),field)->field.le_prev != \ 117 &((elm)->field.le_next)) \ 118 panic("Bad link elm %p next->prev != elm", (elm)); \ 119} while(0) 120 121#define VERIFY_PREV_LINK(elm,field) do { \ 122 if (*(elm)->field.le_prev != (elm)) \ 123 panic("Bad link elm %p prev->next != elm", (elm)); \ 124} while(0) 125 126/* tcp timer list */ 127struct tcptimerlist tcp_timer_list; 128 129/* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */ 130struct tcptailq tcp_tw_tailq; 131 132static int background_io_trigger = 5; 133SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW | CTLFLAG_LOCKED, 134 &background_io_trigger, 0, "Background IO Trigger Setting"); 135 136static int 137sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS 138{ 139#pragma unused(arg1, arg2) 140 int error, s, tt; 141 142 tt = *(int *)oidp->oid_arg1; 143 s = tt * 1000 / TCP_RETRANSHZ;; 144 145 error = sysctl_handle_int(oidp, &s, 0, req); 146 if (error || !req->newptr) 147 return (error); 148 149 tt = s * TCP_RETRANSHZ / 1000; 150 if (tt < 1) 151 return (EINVAL); 152 153 *(int *)oidp->oid_arg1 = tt; 154 return (0); 155} 156 157int tcp_keepinit; 158SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 159 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); 160 161int tcp_keepidle; 162SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 163 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); 164 165int tcp_keepintvl; 166SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 167 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); 168 169int tcp_keepcnt; 170SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 171 &tcp_keepcnt, 0, "number of times to repeat keepalive"); 172 173int tcp_msl; 174SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 175 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 176 177/* 178 * Avoid DoS via TCP Robustness in Persist Condition (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt) 179 * by allowing a system wide maximum persistence timeout value when in Zero Window Probe mode. 180 * Expressed in milliseconds to be consistent without timeout related values, the TCP socket option is in seconds. 181 */ 182u_int32_t tcp_max_persist_timeout = 0; 183SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 184 &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I", "Maximum persistence timout for ZWP"); 185 186static int always_keepalive = 0; 187SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW | CTLFLAG_LOCKED, 188 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 189 190/* This parameter determines how long the timer list will stay in fast mode even 191 * though all connections are idle. In fast mode, the timer will fire more frequently 192 * anticipating new data. 193 */ 194int timer_fastmode_idlemax = TCP_FASTMODE_IDLEGEN_MAX; 195SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax, CTLFLAG_RW | CTLFLAG_LOCKED, 196 &timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode"); 197 198/* 199 * See tcp_syn_backoff[] for interval values between SYN retransmits; 200 * the value set below defines the number of retransmits, before we 201 * disable the timestamp and window scaling options during subsequent 202 * SYN retransmits. Setting it to 0 disables the dropping off of those 203 * two options. 204 */ 205static int tcp_broken_peer_syn_rxmit_thres = 7; 206SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW | CTLFLAG_LOCKED, 207 &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before " 208 "TCP disables rfc1323 and rfc1644 during the rest of attempts"); 209 210/* A higher threshold on local connections for disabling RFC 1323 options */ 211static int tcp_broken_peer_syn_rxmit_thres_local = 10; 212SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres_local, 213 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres_local, 0, 214 "Number of retransmitted SYNs before disabling RFC 1323 options on local connections"); 215 216static int tcp_timer_advanced = 0; 217SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, CTLFLAG_RD | CTLFLAG_LOCKED, 218 &tcp_timer_advanced, 0, "Number of times one of the timers was advanced"); 219 220static int tcp_resched_timerlist = 0; 221SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist, CTLFLAG_RD | CTLFLAG_LOCKED, 222 &tcp_resched_timerlist, 0, 223 "Number of times timer list was rescheduled as part of processing a packet"); 224 225int tcp_pmtud_black_hole_detect = 1 ; 226SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW | CTLFLAG_LOCKED, 227 &tcp_pmtud_black_hole_detect, 0, "Path MTU Discovery Black Hole Detection"); 228 229int tcp_pmtud_black_hole_mss = 1200 ; 230SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED, 231 &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS"); 232 233/* performed garbage collection of "used" sockets */ 234static boolean_t tcp_gc_done = FALSE; 235 236 /* max idle probes */ 237int tcp_maxpersistidle; 238 239/* TCP delack timer is set to 100 ms. Since the processing of timer list in fast 240 * mode will happen no faster than 100 ms, the delayed ack timer will fire some where 241 * between 100 and 200 ms. 242 */ 243int tcp_delack = TCP_RETRANSHZ / 10; 244 245#if MPTCP 246/* 247 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff 248 */ 249int tcp_jack_rxmt = TCP_RETRANSHZ / 2; 250#endif /* MPTCP */ 251 252/* The frequency of running through the TCP timer list in 253 * fast and slow mode can be configured. 254 */ 255SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_fastquantum, CTLFLAG_RW | CTLFLAG_LOCKED, 256 &tcp_timer_list.fast_quantum, TCP_FASTTIMER_QUANTUM, 257 "Frequency of running timer list in fast mode"); 258 259SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_slowquantum, CTLFLAG_RW | CTLFLAG_LOCKED, 260 &tcp_timer_list.slow_quantum, TCP_SLOWTIMER_QUANTUM, 261 "Frequency of running timer list in slow mode"); 262 263static void tcp_remove_timer(struct tcpcb *tp); 264static void tcp_sched_timerlist(uint32_t offset); 265static uint32_t tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index); 266static void tcp_sched_timers(struct tcpcb *tp); 267static inline void tcp_set_lotimer_index(struct tcpcb *); 268static void tcp_rexmt_save_state(struct tcpcb *tp); 269void tcp_remove_from_time_wait(struct inpcb *inp); 270 271/* Macro to compare two timers. If there is a reset of the sign bit, it is 272 * safe to assume that the timer has wrapped around. By doing signed comparision, 273 * we take care of wrap around such that the value with the sign bit reset is 274 * actually ahead of the other. 275 */ 276 277static inline int32_t 278timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) { 279 return (int32_t)((t1 + toff1) - (t2 + toff2)); 280}; 281 282/* Returns true if the timer is on the timer list */ 283#define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST) 284 285/* Run the TCP timerlist atleast once every hour */ 286#define TCP_TIMERLIST_MAX_OFFSET (60 * 60 * TCP_RETRANSHZ) 287 288static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay); 289void add_to_time_wait(struct tcpcb *tp, uint32_t delay) ; 290 291static boolean_t tcp_garbage_collect(struct inpcb *, int); 292 293/* 294 * Add to tcp timewait list, delay is given in milliseconds. 295 */ 296static void 297add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay) 298{ 299 struct inpcbinfo *pcbinfo = &tcbinfo; 300 struct inpcb *inp = tp->t_inpcb; 301 uint32_t timer; 302 303 /* pcb list should be locked when we get here */ 304 lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE); 305 306 /* We may get here multiple times, so check */ 307 if (!(inp->inp_flags2 & INP2_TIMEWAIT)) { 308 pcbinfo->ipi_twcount++; 309 inp->inp_flags2 |= INP2_TIMEWAIT; 310 311 /* Remove from global inp list */ 312 LIST_REMOVE(inp, inp_list); 313 } else { 314 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry); 315 } 316 317 /* Compute the time at which this socket can be closed */ 318 timer = tcp_now + delay; 319 320 /* We will use the TCPT_2MSL timer for tracking this delay */ 321 322 if (TIMER_IS_ON_LIST(tp)) 323 tcp_remove_timer(tp); 324 tp->t_timer[TCPT_2MSL] = timer; 325 326 TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry); 327} 328 329void 330add_to_time_wait(struct tcpcb *tp, uint32_t delay) 331{ 332 struct inpcbinfo *pcbinfo = &tcbinfo; 333 334 if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) { 335 tcp_unlock(tp->t_inpcb->inp_socket, 0, 0); 336 lck_rw_lock_exclusive(pcbinfo->ipi_lock); 337 tcp_lock(tp->t_inpcb->inp_socket, 0, 0); 338 } 339 add_to_time_wait_locked(tp, delay); 340 lck_rw_done(pcbinfo->ipi_lock); 341 342 inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY); 343} 344 345/* If this is on time wait queue, remove it. */ 346void 347tcp_remove_from_time_wait(struct inpcb *inp) 348{ 349 struct tcpcb *tp = intotcpcb(inp); 350 if (inp->inp_flags2 & INP2_TIMEWAIT) 351 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry); 352} 353 354static boolean_t 355tcp_garbage_collect(struct inpcb *inp, int istimewait) 356{ 357 boolean_t active = FALSE; 358 struct socket *so; 359 struct tcpcb *tp; 360 361 so = inp->inp_socket; 362 tp = intotcpcb(inp); 363 364 /* 365 * Skip if still in use or busy; it would have been more efficient 366 * if we were to test so_usecount against 0, but this isn't possible 367 * due to the current implementation of tcp_dropdropablreq() where 368 * overflow sockets that are eligible for garbage collection have 369 * their usecounts set to 1. 370 */ 371 if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx)) 372 return (TRUE); 373 374 /* Check again under the lock */ 375 if (so->so_usecount > 1) { 376 if (inp->inp_wantcnt == WNT_STOPUSING) 377 active = TRUE; 378 lck_mtx_unlock(&inp->inpcb_mtx); 379 return (active); 380 } 381 382 if (istimewait && 383 TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) && 384 tp->t_state != TCPS_CLOSED) { 385 /* Become a regular mutex */ 386 lck_mtx_convert_spin(&inp->inpcb_mtx); 387 tcp_close(tp); 388 } 389 390 /* 391 * Overflowed socket dropped from the listening queue? Do this 392 * only if we are called to clean up the time wait slots, since 393 * tcp_dropdropablreq() considers a socket to have been fully 394 * dropped after add_to_time_wait() is finished. 395 * Also handle the case of connections getting closed by the peer 396 * while in the queue as seen with rdar://6422317 397 * 398 */ 399 if (so->so_usecount == 1 && 400 ((istimewait && (so->so_flags & SOF_OVERFLOW)) || 401 ((tp != NULL) && (tp->t_state == TCPS_CLOSED) && 402 (so->so_head != NULL) && 403 ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) == 404 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) { 405 406 if (inp->inp_state != INPCB_STATE_DEAD) { 407 /* Become a regular mutex */ 408 lck_mtx_convert_spin(&inp->inpcb_mtx); 409#if INET6 410 if (SOCK_CHECK_DOM(so, PF_INET6)) 411 in6_pcbdetach(inp); 412 else 413#endif /* INET6 */ 414 in_pcbdetach(inp); 415 } 416 so->so_usecount--; 417 if (inp->inp_wantcnt == WNT_STOPUSING) 418 active = TRUE; 419 lck_mtx_unlock(&inp->inpcb_mtx); 420 return (active); 421 } else if (inp->inp_wantcnt != WNT_STOPUSING) { 422 lck_mtx_unlock(&inp->inpcb_mtx); 423 return (FALSE); 424 } 425 426 /* 427 * We get here because the PCB is no longer searchable 428 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead 429 * (usecount is 0). This covers all cases, including overflow 430 * sockets and those that are considered as "embryonic", 431 * i.e. created by sonewconn() in TCP input path, and have 432 * not yet been committed. For the former, we reduce the usecount 433 * to 0 as done by the code above. For the latter, the usecount 434 * would have reduced to 0 as part calling soabort() when the 435 * socket is dropped at the end of tcp_input(). 436 */ 437 if (so->so_usecount == 0) { 438 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 439 struct tcpcb *, tp, int32_t, TCPS_CLOSED); 440 /* Become a regular mutex */ 441 lck_mtx_convert_spin(&inp->inpcb_mtx); 442 443 /* 444 * If this tp still happens to be on the timer list, 445 * take it out 446 */ 447 if (TIMER_IS_ON_LIST(tp)) { 448 tcp_remove_timer(tp); 449 } 450 451 if (inp->inp_state != INPCB_STATE_DEAD) { 452#if INET6 453 if (SOCK_CHECK_DOM(so, PF_INET6)) 454 in6_pcbdetach(inp); 455 else 456#endif /* INET6 */ 457 in_pcbdetach(inp); 458 } 459 in_pcbdispose(inp); 460 return (FALSE); 461 } 462 463 lck_mtx_unlock(&inp->inpcb_mtx); 464 return (TRUE); 465} 466 467/* 468 * TCP garbage collector callback (inpcb_timer_func_t). 469 * 470 * Returns the number of pcbs that will need to be gc-ed soon, 471 * returnining > 0 will keep timer active. 472 */ 473void 474tcp_gc(struct inpcbinfo *ipi) 475{ 476 struct inpcb *inp, *nxt; 477 struct tcpcb *tw_tp, *tw_ntp; 478#if TCPDEBUG 479 int ostate; 480#endif 481#if KDEBUG 482 static int tws_checked = 0; 483#endif 484 485 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0); 486 487 /* 488 * Update tcp_now here as it may get used while 489 * processing the slow timer. 490 */ 491 calculate_tcp_clock(); 492 493 /* 494 * Garbage collect socket/tcpcb: We need to acquire the list lock 495 * exclusively to do this 496 */ 497 498 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) { 499 /* don't sweat it this time; cleanup was done last time */ 500 if (tcp_gc_done == TRUE) { 501 tcp_gc_done = FALSE; 502 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, 503 tws_checked, cur_tw_slot, 0, 0, 0); 504 /* Lock upgrade failed, give up this round */ 505 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1); 506 return; 507 } 508 /* Upgrade failed, lost lock now take it again exclusive */ 509 lck_rw_lock_exclusive(ipi->ipi_lock); 510 } 511 tcp_gc_done = TRUE; 512 513 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) { 514 if (tcp_garbage_collect(inp, 0)) 515 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1); 516 } 517 518 /* Now cleanup the time wait ones */ 519 TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) { 520 /* 521 * We check the timestamp here without holding the 522 * socket lock for better performance. If there are 523 * any pcbs in time-wait, the timer will get rescheduled. 524 * Hence some error in this check can be tolerated. 525 * 526 * Sometimes a socket on time-wait queue can be closed if 527 * 2MSL timer expired but the application still has a 528 * usecount on it. 529 */ 530 if (tw_tp->t_state == TCPS_CLOSED || 531 TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) { 532 if (tcp_garbage_collect(tw_tp->t_inpcb, 1)) 533 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1); 534 } 535 } 536 537 /* take into account pcbs that are still in time_wait_slots */ 538 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount); 539 540 lck_rw_done(ipi->ipi_lock); 541 542 /* Clean up the socache while we are here */ 543 if (so_cache_timer()) 544 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1); 545 546 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, 547 cur_tw_slot, 0, 0, 0); 548 549 return; 550} 551 552/* 553 * Cancel all timers for TCP tp. 554 */ 555void 556tcp_canceltimers(tp) 557 struct tcpcb *tp; 558{ 559 register int i; 560 561 tcp_remove_timer(tp); 562 for (i = 0; i < TCPT_NTIMERS; i++) 563 tp->t_timer[i] = 0; 564 tp->tentry.timer_start = tcp_now; 565 tp->tentry.index = TCPT_NONE; 566} 567 568int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 569 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 570 571int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 572 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; 573 574static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ 575 576static void tcp_rexmt_save_state(struct tcpcb *tp) 577{ 578 u_int32_t fsize; 579 if (TSTMP_SUPPORTED(tp)) { 580 /* 581 * Since timestamps are supported on the connection, 582 * we can do recovery as described in rfc 4015. 583 */ 584 fsize = tp->snd_max - tp->snd_una; 585 tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh); 586 tp->snd_recover_prev = tp->snd_recover; 587 } else { 588 /* 589 * Timestamp option is not supported on this connection. 590 * Record ssthresh and cwnd so they can 591 * be recovered if this turns out to be a "bad" retransmit. 592 * A retransmit is considered "bad" if an ACK for this 593 * segment is received within RTT/2 interval; the assumption 594 * here is that the ACK was already in flight. See 595 * "On Estimating End-to-End Network Path Properties" by 596 * Allman and Paxson for more details. 597 */ 598 tp->snd_cwnd_prev = tp->snd_cwnd; 599 tp->snd_ssthresh_prev = tp->snd_ssthresh; 600 tp->snd_recover_prev = tp->snd_recover; 601 if (IN_FASTRECOVERY(tp)) 602 tp->t_flags |= TF_WASFRECOVERY; 603 else 604 tp->t_flags &= ~TF_WASFRECOVERY; 605 } 606 tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2; 607 tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 608 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT); 609} 610 611/* 612 * TCP timer processing. 613 */ 614struct tcpcb * 615tcp_timers(tp, timer) 616 register struct tcpcb *tp; 617 int timer; 618{ 619 register int rexmt; 620 struct socket *so; 621 struct tcptemp *t_template; 622 int optlen = 0; 623 int idle_time = 0; 624 625#if TCPDEBUG 626 int ostate; 627#endif 628 629#if INET6 630 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0; 631#endif /* INET6 */ 632 633 so = tp->t_inpcb->inp_socket; 634 idle_time = tcp_now - tp->t_rcvtime; 635 636 switch (timer) { 637 638 /* 639 * 2 MSL timeout in shutdown went off. If we're closed but 640 * still waiting for peer to close and connection has been idle 641 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2, 642 * delete connection control block. 643 * Otherwise, (this case shouldn't happen) check again in a bit 644 * we keep the socket in the main list in that case. 645 */ 646 case TCPT_2MSL: 647 tcp_free_sackholes(tp); 648 if (tp->t_state != TCPS_TIME_WAIT && 649 tp->t_state != TCPS_FIN_WAIT_2 && 650 ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) { 651 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, 652 (u_int32_t)TCP_CONN_KEEPINTVL(tp)); 653 } else { 654 tp = tcp_close(tp); 655 return(tp); 656 } 657 break; 658 659 /* 660 * Retransmission timer went off. Message has not 661 * been acked within retransmit interval. Back off 662 * to a longer retransmit interval and retransmit one segment. 663 */ 664 case TCPT_REXMT: 665 /* Drop a connection in the retransmit timer 666 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT times 667 * 2. If the time spent in this retransmission episode is more than 668 * the time limit set with TCP_RXT_CONNDROPTIME socket option 669 * 3. If TCP_RXT_FINDROP socket option was set and we have already 670 * retransmitted the FIN 3 times without receiving an ack 671 */ 672 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || 673 (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 && 674 (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) || 675 ((tp->t_flagsext & TF_RXTFINDROP) != 0 && 676 (tp->t_flags & TF_SENTFIN) != 0 && 677 tp->t_rxtshift >= 4)) { 678 679 if ((tp->t_flagsext & TF_RXTFINDROP) != 0) { 680 tcpstat.tcps_rxtfindrop++; 681 } else { 682 tcpstat.tcps_timeoutdrop++; 683 } 684 tp->t_rxtshift = TCP_MAXRXTSHIFT; 685 postevent(so, 0, EV_TIMEOUT); 686 soevent(so, 687 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT)); 688 tp = tcp_drop(tp, tp->t_softerror ? 689 tp->t_softerror : ETIMEDOUT); 690 691 break; 692 } 693 694 tcpstat.tcps_rexmttimeo++; 695 696 if (tp->t_rxtshift == 1 && 697 tp->t_state == TCPS_ESTABLISHED) { 698 /* Set the time at which retransmission started. */ 699 tp->t_rxtstart = tcp_now; 700 701 /* 702 * if this is the first retransmit timeout, save 703 * the state so that we can recover if the timeout 704 * is spurious. 705 */ 706 tcp_rexmt_save_state(tp); 707 } 708#if MPTCP 709 if ((tp->t_rxtshift == mptcp_fail_thresh) && 710 (tp->t_state == TCPS_ESTABLISHED) && 711 (tp->t_mpflags & TMPF_MPTCP_TRUE)) { 712 mptcp_act_on_txfail(so); 713 714 } 715#endif /* MPTCP */ 716 717 if (tp->t_adaptive_wtimo > 0 && 718 tp->t_rxtshift > tp->t_adaptive_wtimo && 719 TCPS_HAVEESTABLISHED(tp->t_state)) { 720 /* Send an event to the application */ 721 soevent(so, 722 (SO_FILT_HINT_LOCKED| 723 SO_FILT_HINT_ADAPTIVE_WTIMO)); 724 } 725 726 if (tp->t_state == TCPS_SYN_SENT) { 727 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; 728 tp->t_stat.synrxtshift = tp->t_rxtshift; 729 } 730 else 731 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 732 TCPT_RANGESET(tp->t_rxtcur, rexmt, 733 tp->t_rttmin, TCPTV_REXMTMAX, 734 TCP_ADD_REXMTSLOP(tp)); 735 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); 736 737 if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb)) 738 goto fc_output; 739 740 tcp_free_sackholes(tp); 741 /* 742 * Check for potential Path MTU Discovery Black Hole 743 */ 744 745 if (tcp_pmtud_black_hole_detect && (tp->t_state == TCPS_ESTABLISHED)) { 746 if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) && 747 (tp->t_rxtshift == 2)) { 748 /* 749 * Enter Path MTU Black-hole Detection mechanism: 750 * - Disable Path MTU Discovery (IP "DF" bit). 751 * - Reduce MTU to lower value than what we negociated with peer. 752 */ 753 /* Disable Path MTU Discovery for now */ 754 tp->t_flags &= ~TF_PMTUD; 755 /* Record that we may have found a black hole */ 756 tp->t_flags |= TF_BLACKHOLE; 757 optlen = tp->t_maxopd - tp->t_maxseg; 758 /* Keep track of previous MSS */ 759 tp->t_pmtud_saved_maxopd = tp->t_maxopd; 760 /* Reduce the MSS to intermediary value */ 761 if (tp->t_maxopd > tcp_pmtud_black_hole_mss) { 762 tp->t_maxopd = tcp_pmtud_black_hole_mss; 763 } else { 764 tp->t_maxopd = /* use the default MSS */ 765#if INET6 766 isipv6 ? tcp_v6mssdflt : 767#endif /* INET6 */ 768 tcp_mssdflt; 769 } 770 tp->t_maxseg = tp->t_maxopd - optlen; 771 772 /* 773 * Reset the slow-start flight size 774 * as it may depend on the new MSS 775 */ 776 if (CC_ALGO(tp)->cwnd_init != NULL) 777 CC_ALGO(tp)->cwnd_init(tp); 778 } 779 /* 780 * If further retransmissions are still unsuccessful with a lowered MTU, 781 * maybe this isn't a Black Hole and we restore the previous MSS and 782 * blackhole detection flags. 783 */ 784 else { 785 786 if ((tp->t_flags & TF_BLACKHOLE) && (tp->t_rxtshift > 4)) { 787 tp->t_flags |= TF_PMTUD; 788 tp->t_flags &= ~TF_BLACKHOLE; 789 optlen = tp->t_maxopd - tp->t_maxseg; 790 tp->t_maxopd = tp->t_pmtud_saved_maxopd; 791 tp->t_maxseg = tp->t_maxopd - optlen; 792 /* 793 * Reset the slow-start flight size as it 794 * may depend on the new MSS 795 */ 796 if (CC_ALGO(tp)->cwnd_init != NULL) 797 CC_ALGO(tp)->cwnd_init(tp); 798 } 799 } 800 } 801 802 803 /* 804 * Disable rfc1323 and rfc1644 if we haven't got any response to 805 * our SYN (after we reach the threshold) to work-around some 806 * broken terminal servers (most of which have hopefully been 807 * retired) that have bad VJ header compression code which 808 * trashes TCP segments containing unknown-to-them TCP options. 809 * Do this only on non-local connections. 810 */ 811 if (tp->t_state == TCPS_SYN_SENT && 812 ((!(tp->t_flags & TF_LOCAL) && 813 tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres) || 814 ((tp->t_flags & TF_LOCAL) && 815 tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres_local))) 816 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC); 817 818 /* 819 * If losing, let the lower level know and try for 820 * a better route. Also, if we backed off this far, 821 * our srtt estimate is probably bogus. Clobber it 822 * so we'll take the next rtt measurement as our srtt; 823 * move the current srtt into rttvar to keep the current 824 * retransmit times until then. 825 */ 826 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 827#if INET6 828 if (isipv6) 829 in6_losing(tp->t_inpcb); 830 else 831#endif /* INET6 */ 832 in_losing(tp->t_inpcb); 833 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 834 tp->t_srtt = 0; 835 } 836 tp->snd_nxt = tp->snd_una; 837 /* 838 * Note: We overload snd_recover to function also as the 839 * snd_last variable described in RFC 2582 840 */ 841 tp->snd_recover = tp->snd_max; 842 /* 843 * Force a segment to be sent. 844 */ 845 tp->t_flags |= TF_ACKNOW; 846 /* 847 * If timing a segment in this window, stop the timer. 848 */ 849 tp->t_rtttime = 0; 850 851 EXIT_FASTRECOVERY(tp); 852 853 /* RFC 5681 says: when a TCP sender detects segment loss 854 * using retransmit timer and the given segment has already 855 * been retransmitted by way of the retransmission timer at 856 * least once, the value of ssthresh is held constant 857 */ 858 if (tp->t_rxtshift == 1 && 859 CC_ALGO(tp)->after_timeout != NULL) 860 CC_ALGO(tp)->after_timeout(tp); 861 862 863 /* CWR notifications are to be sent on new data right after 864 * RTOs, Fast Retransmits and ECE notification receipts. 865 */ 866 if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) { 867 tp->ecn_flags |= TE_SENDCWR; 868 } 869fc_output: 870 DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, 871 struct tcpcb *, tp, struct tcphdr *, NULL, 872 int32_t, TCP_CC_REXMT_TIMEOUT); 873 874 (void) tcp_output(tp); 875 break; 876 877 /* 878 * Persistance timer into zero window. 879 * Force a byte to be output, if possible. 880 */ 881 case TCPT_PERSIST: 882 tcpstat.tcps_persisttimeo++; 883 /* 884 * Hack: if the peer is dead/unreachable, we do not 885 * time out if the window is closed. After a full 886 * backoff, drop the connection if the idle time 887 * (no responses to probes) reaches the maximum 888 * backoff that we would use if retransmitting. 889 * 890 * Drop the connection if we reached the maximum allowed time for 891 * Zero Window Probes without a non-zero update from the peer. 892 * See rdar://5805356 893 */ 894 if ((tp->t_rxtshift == TCP_MAXRXTSHIFT && 895 (idle_time >= tcp_maxpersistidle || 896 idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) || 897 ((tp->t_persist_stop != 0) && 898 TSTMP_LEQ(tp->t_persist_stop, tcp_now))) { 899 tcpstat.tcps_persistdrop++; 900 postevent(so, 0, EV_TIMEOUT); 901 soevent(so, 902 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT)); 903 tp = tcp_drop(tp, ETIMEDOUT); 904 break; 905 } 906 tcp_setpersist(tp); 907 tp->t_force = 1; 908 (void) tcp_output(tp); 909 tp->t_force = 0; 910 break; 911 912 /* 913 * Keep-alive timer went off; send something 914 * or drop connection if idle for too long. 915 */ 916 case TCPT_KEEP: 917 tcpstat.tcps_keeptimeo++; 918#if MPTCP 919 /* 920 * Regular TCP connections do not send keepalives after closing 921 * MPTCP must not also, after sending Data FINs. 922 */ 923 struct mptcb *mp_tp = tp->t_mptcb; 924 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && 925 (mp_tp == NULL)) { 926 goto dropit; 927 } else if (mp_tp != NULL) { 928 if ((mptcp_ok_to_keepalive(mp_tp) == 0)) 929 goto dropit; 930 } 931#endif /* MPTCP */ 932 if (tp->t_state < TCPS_ESTABLISHED) 933 goto dropit; 934 if ((always_keepalive || 935 (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) || 936 (tp->t_flagsext & TF_DETECT_READSTALL)) && 937 (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) { 938 if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp)) 939 goto dropit; 940 /* 941 * Send a packet designed to force a response 942 * if the peer is up and reachable: 943 * either an ACK if the connection is still alive, 944 * or an RST if the peer has closed the connection 945 * due to timeout or reboot. 946 * Using sequence number tp->snd_una-1 947 * causes the transmitted zero-length segment 948 * to lie outside the receive window; 949 * by the protocol spec, this requires the 950 * correspondent TCP to respond. 951 */ 952 tcpstat.tcps_keepprobe++; 953 t_template = tcp_maketemplate(tp); 954 if (t_template) { 955 unsigned int ifscope, nocell = 0; 956 957 if (tp->t_inpcb->inp_flags & INP_BOUND_IF) 958 ifscope = tp->t_inpcb->inp_boundifp->if_index; 959 else 960 ifscope = IFSCOPE_NONE; 961 962 /* 963 * If the socket isn't allowed to use the 964 * cellular interface, indicate it as such. 965 */ 966 if (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR) 967 nocell = 1; 968 969 tcp_respond(tp, t_template->tt_ipgen, 970 &t_template->tt_t, (struct mbuf *)NULL, 971 tp->rcv_nxt, tp->snd_una - 1, 0, ifscope, 972 nocell); 973 (void) m_free(dtom(t_template)); 974 if (tp->t_flagsext & TF_DETECT_READSTALL) 975 tp->t_rtimo_probes++; 976 } 977 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, 978 TCP_CONN_KEEPINTVL(tp)); 979 } else { 980 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, 981 TCP_CONN_KEEPIDLE(tp)); 982 } 983 if (tp->t_flagsext & TF_DETECT_READSTALL) { 984 /* 985 * The keep alive packets sent to detect a read 986 * stall did not get a response from the 987 * peer. Generate more keep-alives to confirm this. 988 * If the number of probes sent reaches the limit, 989 * generate an event. 990 */ 991 if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) { 992 /* Generate an event */ 993 soevent(so, 994 (SO_FILT_HINT_LOCKED| 995 SO_FILT_HINT_ADAPTIVE_RTIMO)); 996 tcp_keepalive_reset(tp); 997 } else { 998 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START( 999 tp, TCP_REXMTVAL(tp)); 1000 } 1001 } 1002 break; 1003 case TCPT_DELACK: 1004 if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) { 1005 tp->t_flags &= ~TF_DELACK; 1006 tp->t_timer[TCPT_DELACK] = 0; 1007 tp->t_flags |= TF_ACKNOW; 1008 1009 /* If delayed ack timer fired while stretching acks 1010 * go back to acking every other packet 1011 */ 1012 if ((tp->t_flags & TF_STRETCHACK) != 0) 1013 tcp_reset_stretch_ack(tp); 1014 1015 /* If we are measuring inter packet arrival jitter for 1016 * throttling a connection, this delayed ack might be 1017 * the reason for accumulating some jitter. So let's 1018 * restart the measurement. 1019 */ 1020 CLEAR_IAJ_STATE(tp); 1021 1022 tcpstat.tcps_delack++; 1023 (void) tcp_output(tp); 1024 } 1025 break; 1026 1027#if MPTCP 1028 case TCPT_JACK_RXMT: 1029 if ((tp->t_state == TCPS_ESTABLISHED) && 1030 (tp->t_mpflags & TMPF_PREESTABLISHED) && 1031 (tp->t_mpflags & TMPF_JOINED_FLOW)) { 1032 if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) { 1033 tcpstat.tcps_timeoutdrop++; 1034 postevent(so, 0, EV_TIMEOUT); 1035 soevent(so, 1036 (SO_FILT_HINT_LOCKED| 1037 SO_FILT_HINT_TIMEOUT)); 1038 tp = tcp_drop(tp, tp->t_softerror ? 1039 tp->t_softerror : ETIMEDOUT); 1040 break; 1041 } 1042 tcpstat.tcps_join_rxmts++; 1043 tp->t_flags |= TF_ACKNOW; 1044 1045 /* 1046 * No backoff is implemented for simplicity for this 1047 * corner case. 1048 */ 1049 (void) tcp_output(tp); 1050 } 1051 break; 1052#endif /* MPTCP */ 1053 1054#if TCPDEBUG 1055 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 1056 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 1057 PRU_SLOWTIMO); 1058#endif 1059 dropit: 1060 tcpstat.tcps_keepdrops++; 1061 postevent(so, 0, EV_TIMEOUT); 1062 soevent(so, 1063 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT)); 1064 tp = tcp_drop(tp, ETIMEDOUT); 1065 break; 1066 } 1067 return (tp); 1068} 1069 1070/* Remove a timer entry from timer list */ 1071void 1072tcp_remove_timer(struct tcpcb *tp) 1073{ 1074 struct tcptimerlist *listp = &tcp_timer_list; 1075 1076 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED); 1077 if (!(TIMER_IS_ON_LIST(tp))) { 1078 return; 1079 } 1080 lck_mtx_lock(listp->mtx); 1081 1082 /* Check if pcb is on timer list again after acquiring the lock */ 1083 if (!(TIMER_IS_ON_LIST(tp))) { 1084 lck_mtx_unlock(listp->mtx); 1085 return; 1086 } 1087 1088 if (listp->next_te != NULL && listp->next_te == &tp->tentry) 1089 listp->next_te = LIST_NEXT(&tp->tentry, le); 1090 1091 LIST_REMOVE(&tp->tentry, le); 1092 tp->t_flags &= ~(TF_TIMER_ONLIST); 1093 1094 listp->entries--; 1095 1096 tp->tentry.le.le_next = NULL; 1097 tp->tentry.le.le_prev = NULL; 1098 lck_mtx_unlock(listp->mtx); 1099} 1100 1101/* Function to check if the timerlist needs to be rescheduled to run 1102 * the timer entry correctly. Basically, this is to check if we can avoid 1103 * taking the list lock. 1104 */ 1105 1106static boolean_t 1107need_to_resched_timerlist(uint32_t runtime, uint16_t index) { 1108 struct tcptimerlist *listp = &tcp_timer_list; 1109 int32_t diff; 1110 boolean_t is_fast; 1111 1112 if (index == TCPT_NONE) 1113 return FALSE; 1114 is_fast = !(IS_TIMER_SLOW(index)); 1115 1116 /* If the list is being processed then the state of the list is in flux. 1117 * In this case always acquire the lock and set the state correctly. 1118 */ 1119 if (listp->running) 1120 return TRUE; 1121 1122 if (!listp->scheduled) 1123 return (TRUE); 1124 1125 diff = timer_diff(listp->runtime, 0, runtime, 0); 1126 if (diff <= 0) { 1127 /* The list is going to run before this timer */ 1128 return FALSE; 1129 } else { 1130 if (is_fast) { 1131 if (diff <= listp->fast_quantum) 1132 return FALSE; 1133 } else { 1134 if (diff <= listp->slow_quantum) 1135 return FALSE; 1136 } 1137 } 1138 return TRUE; 1139} 1140 1141void 1142tcp_sched_timerlist(uint32_t offset) 1143{ 1144 1145 uint64_t deadline = 0; 1146 struct tcptimerlist *listp = &tcp_timer_list; 1147 1148 lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED); 1149 1150 offset = min(offset, TCP_TIMERLIST_MAX_OFFSET); 1151 listp->runtime = tcp_now + offset; 1152 if (listp->runtime == 0) 1153 listp->runtime++; 1154 1155 clock_interval_to_deadline(offset, NSEC_PER_SEC / TCP_RETRANSHZ, 1156 &deadline); 1157 1158 thread_call_enter_delayed(listp->call, deadline); 1159 listp->scheduled = TRUE; 1160} 1161 1162/* Function to run the timers for a connection. 1163 * 1164 * Returns the offset of next timer to be run for this connection which 1165 * can be used to reschedule the timerlist. 1166 */ 1167uint32_t 1168tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) { 1169 1170 struct socket *so; 1171 uint16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE; 1172 uint32_t timer_val, offset = 0, lo_timer = 0; 1173 int32_t diff; 1174 boolean_t needtorun[TCPT_NTIMERS]; 1175 int count = 0; 1176 1177 VERIFY(tp != NULL); 1178 bzero(needtorun, sizeof(needtorun)); 1179 1180 tcp_lock(tp->t_inpcb->inp_socket, 1, 0); 1181 1182 so = tp->t_inpcb->inp_socket; 1183 /* Release the want count on inp */ 1184 if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1) == WNT_STOPUSING) { 1185 if (TIMER_IS_ON_LIST(tp)) { 1186 tcp_remove_timer(tp); 1187 } 1188 1189 /* Looks like the TCP connection got closed while we 1190 * were waiting for the lock.. Done 1191 */ 1192 goto done; 1193 } 1194 1195 /* Since the timer thread needs to wait for tcp lock, it may race 1196 * with another thread that can cancel or reschedule the timer that is 1197 * about to run. Check if we need to run anything. 1198 */ 1199 if ((index = tp->tentry.index) == TCPT_NONE) 1200 goto done; 1201 timer_val = tp->t_timer[index]; 1202 1203 diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0); 1204 if (diff > 0) { 1205 if (tp->tentry.index != TCPT_NONE) { 1206 offset = diff; 1207 *(next_index) = tp->tentry.index; 1208 } 1209 goto done; 1210 } 1211 1212 tp->t_timer[index] = 0; 1213 if (timer_val > 0) { 1214 tp = tcp_timers(tp, index); 1215 if (tp == NULL) 1216 goto done; 1217 } 1218 1219 /* Check if there are any other timers that need to be run. While doing it, 1220 * adjust the timer values wrt tcp_now. 1221 */ 1222 for (i = 0; i < TCPT_NTIMERS; ++i) { 1223 if (tp->t_timer[i] != 0) { 1224 diff = timer_diff(tp->tentry.timer_start, tp->t_timer[i], tcp_now, 0); 1225 if (diff <= 0) { 1226 tp->t_timer[i] = 0; 1227 needtorun[i] = TRUE; 1228 count++; 1229 } else { 1230 tp->t_timer[i] = diff; 1231 needtorun[i] = FALSE; 1232 if (lo_timer == 0 || diff < lo_timer) { 1233 lo_timer = diff; 1234 lo_index = i; 1235 } 1236 } 1237 } 1238 } 1239 1240 tp->tentry.timer_start = tcp_now; 1241 tp->tentry.index = lo_index; 1242 if (lo_index != TCPT_NONE) { 1243 tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index]; 1244 if (tp->tentry.runtime == 0) 1245 tp->tentry.runtime++; 1246 } 1247 1248 if (count > 0) { 1249 /* run any other timers that are also outstanding at this time. */ 1250 for (i = 0; i < TCPT_NTIMERS; ++i) { 1251 if (needtorun[i]) { 1252 tp->t_timer[i] = 0; 1253 tp = tcp_timers(tp, i); 1254 if (tp == NULL) { 1255 offset = 0; 1256 *(next_index) = TCPT_NONE; 1257 goto done; 1258 } 1259 } 1260 } 1261 tcp_set_lotimer_index(tp); 1262 } 1263 1264 if (tp->tentry.index < TCPT_NONE) { 1265 offset = tp->t_timer[tp->tentry.index]; 1266 *(next_index) = tp->tentry.index; 1267 } 1268 1269done: 1270 if (tp != NULL && tp->tentry.index == TCPT_NONE) { 1271 tcp_remove_timer(tp); 1272 offset = 0; 1273 } 1274 tcp_unlock(so, 1, 0); 1275 return offset; 1276} 1277 1278void 1279tcp_run_timerlist(void * arg1, void * arg2) { 1280 1281#pragma unused(arg1, arg2) 1282 1283 struct tcptimerentry *te, *next_te; 1284 struct tcptimerlist *listp = &tcp_timer_list; 1285 struct tcpcb *tp; 1286 uint32_t next_timer = 0; 1287 uint16_t index = TCPT_NONE; 1288 boolean_t need_fast = FALSE; 1289 uint32_t active_count = 0; 1290 uint32_t mode = TCP_TIMERLIST_FASTMODE; 1291 1292 calculate_tcp_clock(); 1293 1294 lck_mtx_lock(listp->mtx); 1295 1296 listp->running = TRUE; 1297 1298 LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) { 1299 uint32_t offset = 0; 1300 uint32_t runtime = te->runtime; 1301 if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) { 1302 offset = timer_diff(runtime, 0, tcp_now, 0); 1303 if (next_timer == 0 || offset < next_timer) { 1304 next_timer = offset; 1305 } 1306 continue; 1307 } 1308 active_count++; 1309 1310 tp = TIMERENTRY_TO_TP(te); 1311 1312 /* Acquire an inp wantcnt on the inpcb so that the socket won't get 1313 * detached even if tcp_close is called 1314 */ 1315 if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0) == WNT_STOPUSING) { 1316 /* Some how this pcb went into dead state while on the timer list, 1317 * just take it off the list. Since the timer list entry pointers 1318 * are protected by the timer list lock, we can do it here 1319 */ 1320 if (TIMER_IS_ON_LIST(tp)) { 1321 tp->t_flags &= ~(TF_TIMER_ONLIST); 1322 LIST_REMOVE(&tp->tentry, le); 1323 listp->entries--; 1324 1325 tp->tentry.le.le_next = NULL; 1326 tp->tentry.le.le_prev = NULL; 1327 } 1328 continue; 1329 } 1330 1331 /* Store the next timerentry pointer before releasing the list lock. 1332 * If that entry has to be removed when we release the lock, this 1333 * pointer will be updated to the element after that. 1334 */ 1335 listp->next_te = next_te; 1336 1337 VERIFY_NEXT_LINK(&tp->tentry, le); 1338 VERIFY_PREV_LINK(&tp->tentry, le); 1339 1340 lck_mtx_unlock(listp->mtx); 1341 1342 index = TCPT_NONE; 1343 offset = tcp_run_conn_timer(tp, &index); 1344 1345 lck_mtx_lock(listp->mtx); 1346 1347 next_te = listp->next_te; 1348 listp->next_te = NULL; 1349 1350 if (offset > 0) { 1351 if (index < TCPT_NONE) { 1352 /* Check if this is a fast_timer. */ 1353 if (!need_fast && !(IS_TIMER_SLOW(index))) { 1354 need_fast = TRUE; 1355 } 1356 1357 if (next_timer == 0 || offset < next_timer) { 1358 next_timer = offset; 1359 } 1360 } 1361 } 1362 } 1363 1364 if (!LIST_EMPTY(&listp->lhead)) { 1365 if (listp->mode == TCP_TIMERLIST_FASTMODE) { 1366 if (need_fast || active_count > 0 || 1367 listp->pref_mode == TCP_TIMERLIST_FASTMODE) { 1368 listp->idlegen = 0; 1369 } else { 1370 listp->idlegen++; 1371 if (listp->idlegen > timer_fastmode_idlemax) { 1372 mode = TCP_TIMERLIST_SLOWMODE; 1373 listp->idlegen = 0; 1374 } 1375 } 1376 } else { 1377 if (!need_fast) { 1378 mode = TCP_TIMERLIST_SLOWMODE; 1379 } 1380 } 1381 1382 if (mode == TCP_TIMERLIST_FASTMODE || 1383 listp->pref_mode == TCP_TIMERLIST_FASTMODE) { 1384 next_timer = listp->fast_quantum; 1385 } else { 1386 if (listp->pref_offset != 0 && 1387 listp->pref_offset < next_timer) 1388 next_timer = listp->pref_offset; 1389 if (next_timer < listp->slow_quantum) 1390 next_timer = listp->slow_quantum; 1391 } 1392 1393 listp->mode = mode; 1394 1395 tcp_sched_timerlist(next_timer); 1396 } else { 1397 /* 1398 * No need to reschedule this timer, but always run 1399 * periodically at a much higher granularity. 1400 */ 1401 tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET); 1402 } 1403 1404 listp->running = FALSE; 1405 listp->pref_mode = 0; 1406 listp->pref_offset = 0; 1407 1408 lck_mtx_unlock(listp->mtx); 1409} 1410 1411/* Function to verify if a change in timer state is required for a connection */ 1412void 1413tcp_sched_timers(struct tcpcb *tp) 1414{ 1415 struct tcptimerentry *te = &tp->tentry; 1416 uint16_t index = te->index; 1417 struct tcptimerlist *listp = &tcp_timer_list; 1418 int32_t offset = 0; 1419 boolean_t is_fast; 1420 int list_locked = 0; 1421 1422 if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) { 1423 /* Just return without adding the dead pcb to the list */ 1424 if (TIMER_IS_ON_LIST(tp)) { 1425 tcp_remove_timer(tp); 1426 } 1427 return; 1428 } 1429 1430 if (index == TCPT_NONE) { 1431 tcp_remove_timer(tp); 1432 return; 1433 } 1434 1435 is_fast = !(IS_TIMER_SLOW(index)); 1436 offset = timer_diff(te->runtime, 0, tcp_now, 0); 1437 if (offset <= 0) { 1438 offset = 1; 1439 tcp_timer_advanced++; 1440 } 1441 if (is_fast) 1442 offset = listp->fast_quantum; 1443 1444 if (!TIMER_IS_ON_LIST(tp)) { 1445 if (!list_locked) { 1446 lck_mtx_lock(listp->mtx); 1447 list_locked = 1; 1448 } 1449 1450 LIST_INSERT_HEAD(&listp->lhead, te, le); 1451 tp->t_flags |= TF_TIMER_ONLIST; 1452 1453 listp->entries++; 1454 if (listp->entries > listp->maxentries) 1455 listp->maxentries = listp->entries; 1456 1457 /* if the list is not scheduled, just schedule it */ 1458 if (!listp->scheduled) 1459 goto schedule; 1460 1461 } 1462 1463 1464 /* timer entry is currently on the list */ 1465 if (need_to_resched_timerlist(te->runtime, index)) { 1466 tcp_resched_timerlist++; 1467 1468 if (!list_locked) { 1469 lck_mtx_lock(listp->mtx); 1470 list_locked = 1; 1471 } 1472 1473 VERIFY_NEXT_LINK(te, le); 1474 VERIFY_PREV_LINK(te, le); 1475 1476 if (listp->running) { 1477 if (is_fast) { 1478 listp->pref_mode = TCP_TIMERLIST_FASTMODE; 1479 } else if (listp->pref_offset == 0 || 1480 offset < listp->pref_offset) { 1481 listp->pref_offset = offset; 1482 } 1483 } else { 1484 /* 1485 * The list could have got scheduled while this 1486 * thread was waiting for the lock 1487 */ 1488 if (listp->scheduled) { 1489 int32_t diff; 1490 diff = timer_diff(listp->runtime, 0, 1491 tcp_now, offset); 1492 if (diff <= 0) 1493 goto done; 1494 else 1495 goto schedule; 1496 } else { 1497 goto schedule; 1498 } 1499 } 1500 } 1501 goto done; 1502 1503schedule: 1504 if (is_fast) { 1505 listp->mode = TCP_TIMERLIST_FASTMODE; 1506 listp->idlegen = 0; 1507 } 1508 tcp_sched_timerlist(offset); 1509 1510done: 1511 if (list_locked) 1512 lck_mtx_unlock(listp->mtx); 1513 1514 return; 1515} 1516 1517void 1518tcp_set_lotimer_index(struct tcpcb *tp) { 1519 uint16_t i, lo_index = TCPT_NONE; 1520 uint32_t lo_timer = 0; 1521 for (i = 0; i < TCPT_NTIMERS; ++i) { 1522 if (tp->t_timer[i] != 0 && 1523 (lo_timer == 0 || tp->t_timer[i] < lo_timer)) { 1524 lo_timer = tp->t_timer[i]; 1525 lo_index = i; 1526 } 1527 } 1528 tp->tentry.index = lo_index; 1529 if (lo_index != TCPT_NONE) { 1530 tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index]; 1531 if (tp->tentry.runtime == 0) 1532 tp->tentry.runtime++; 1533 } 1534} 1535 1536void 1537tcp_check_timer_state(struct tcpcb *tp) { 1538 1539 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED); 1540 1541 if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT) 1542 return; 1543 1544 tcp_set_lotimer_index(tp); 1545 1546 tcp_sched_timers(tp); 1547 return; 1548} 1549