1/* 2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. 
Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 
59 * 60 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $ 62 */ 63 64 65#include <sys/param.h> 66#include <sys/systm.h> 67#include <sys/kernel.h> 68#include <sys/mbuf.h> 69#include <sys/sysctl.h> 70#include <sys/socket.h> 71#include <sys/socketvar.h> 72#include <sys/protosw.h> 73#include <sys/domain.h> 74#include <sys/mcache.h> 75#include <sys/queue.h> 76#include <kern/locks.h> 77 78#include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */ 79 80#include <net/route.h> 81#include <net/if_var.h> 82 83#include <netinet/in.h> 84#include <netinet/in_systm.h> 85#include <netinet/in_pcb.h> 86#if INET6 87#include <netinet6/in6_pcb.h> 88#endif 89#include <netinet/ip_var.h> 90#include <netinet/tcp.h> 91#include <netinet/tcp_fsm.h> 92#include <netinet/tcp_seq.h> 93#include <netinet/tcp_timer.h> 94#include <netinet/tcp_var.h> 95#include <netinet/tcp_cc.h> 96#if INET6 97#include <netinet6/tcp6_var.h> 98#endif 99#include <netinet/tcpip.h> 100#if TCPDEBUG 101#include <netinet/tcp_debug.h> 102#endif 103#include <sys/kdebug.h> 104#include <mach/sdt.h> 105 106extern void postevent(struct socket *, struct sockbuf *, 107 int); 108#define DBG_FNC_TCP_FAST NETDBG_CODE(DBG_NETTCP, (5 << 8)) 109#define DBG_FNC_TCP_SLOW NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1) 110 111#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next))) 112 113#define VERIFY_NEXT_LINK(elm,field) do { \ 114 if (LIST_NEXT((elm),field) != NULL && \ 115 LIST_NEXT((elm),field)->field.le_prev != \ 116 &((elm)->field.le_next)) \ 117 panic("Bad link elm %p next->prev != elm", (elm)); \ 118} while(0) 119 120#define VERIFY_PREV_LINK(elm,field) do { \ 121 if (*(elm)->field.le_prev != (elm)) \ 122 panic("Bad link elm %p prev->next != elm", (elm)); \ 123} while(0) 124 125static int background_io_trigger = 5; 126SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW | CTLFLAG_LOCKED, 
127 &background_io_trigger, 0, "Background IO Trigger Setting"); 128 129static int 130sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS 131{ 132#pragma unused(arg1, arg2) 133 int error, s, tt; 134 135 tt = *(int *)oidp->oid_arg1; 136 s = tt * 1000 / TCP_RETRANSHZ;; 137 138 error = sysctl_handle_int(oidp, &s, 0, req); 139 if (error || !req->newptr) 140 return (error); 141 142 tt = s * TCP_RETRANSHZ / 1000; 143 if (tt < 1) 144 return (EINVAL); 145 146 *(int *)oidp->oid_arg1 = tt; 147 return (0); 148} 149 150int tcp_keepinit; 151SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 152 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); 153 154int tcp_keepidle; 155SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 156 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); 157 158int tcp_keepintvl; 159SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 160 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); 161 162int tcp_msl; 163SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 164 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 165 166/* 167 * Avoid DoS via TCP Robustness in Persist Condition (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt) 168 * by allowing a system wide maximum persistence timeout value when in Zero Window Probe mode. 169 * Expressed in milliseconds to be consistent without timeout related values, the TCP socket option is in seconds. 
170 */ 171u_int32_t tcp_max_persist_timeout = 0; 172SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 173 &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I", "Maximum persistence timout for ZWP"); 174 175static int always_keepalive = 0; 176SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW | CTLFLAG_LOCKED, 177 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 178 179/* This parameter determines how long the timer list will stay in fast mode even 180 * though all connections are idle. In fast mode, the timer will fire more frequently 181 * anticipating new data. 182 */ 183int timer_fastmode_idlemax = TCP_FASTMODE_IDLEGEN_MAX; 184SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax, CTLFLAG_RW | CTLFLAG_LOCKED, 185 &timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode"); 186 187/* 188 * See tcp_syn_backoff[] for interval values between SYN retransmits; 189 * the value set below defines the number of retransmits, before we 190 * disable the timestamp and window scaling options during subsequent 191 * SYN retransmits. Setting it to 0 disables the dropping off of those 192 * two options. 
193 */ 194static int tcp_broken_peer_syn_rxmit_thres = 7; 195SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW | CTLFLAG_LOCKED, 196 &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before " 197 "TCP disables rfc1323 and rfc1644 during the rest of attempts"); 198 199static int tcp_timer_advanced = 0; 200SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, CTLFLAG_RD | CTLFLAG_LOCKED, 201 &tcp_timer_advanced, 0, "Number of times one of the timers was advanced"); 202 203static int tcp_resched_timerlist = 0; 204SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist, CTLFLAG_RD | CTLFLAG_LOCKED, 205 &tcp_resched_timerlist, 0, 206 "Number of times timer list was rescheduled as part of processing a packet"); 207 208int tcp_pmtud_black_hole_detect = 1 ; 209SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW | CTLFLAG_LOCKED, 210 &tcp_pmtud_black_hole_detect, 0, "Path MTU Discovery Black Hole Detection"); 211 212int tcp_pmtud_black_hole_mss = 1200 ; 213SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED, 214 &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS"); 215 216static int tcp_keepcnt = TCPTV_KEEPCNT; 217static int tcp_gc_done = FALSE; /* perfromed garbage collection of "used" sockets */ 218 /* max idle probes */ 219int tcp_maxpersistidle; 220 /* max idle time in persist */ 221int tcp_maxidle; 222 223/* TCP delack timer is set to 100 ms. Since the processing of timer list in fast 224 * mode will happen no faster than 100 ms, the delayed ack timer will fire some where 225 * between 100 and 200 ms. 226 */ 227int tcp_delack = TCP_RETRANSHZ / 10; 228 229struct inpcbhead time_wait_slots[N_TIME_WAIT_SLOTS]; 230int cur_tw_slot = 0; 231 232/* tcp timer list */ 233struct tcptimerlist tcp_timer_list; 234 235/* The frequency of running through the TCP timer list in 236 * fast and slow mode can be configured. 
 */
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_fastquantum, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_timer_list.fast_quantum, TCP_FASTTIMER_QUANTUM,
    "Frequency of running timer list in fast mode");

SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_slowquantum, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_timer_list.slow_quantum, TCP_SLOWTIMER_QUANTUM,
    "Frequency of running timer list in slow mode");

static void tcp_remove_timer(struct tcpcb *tp);
static void tcp_sched_timerlist(uint32_t offset);
static uint32_t tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index);
static void tcp_sched_timers(struct tcpcb *tp);
static inline void tcp_set_lotimer_index(struct tcpcb *);

/* Macro to compare two timers. If there is a reset of the sign bit, it is
 * safe to assume that the timer has wrapped around. By doing signed comparision,
 * we take care of wrap around such that the value with the sign bit reset is
 * actually ahead of the other.
 */

/* Wrap-safe comparison: returns (t1+toff1) - (t2+toff2) as a signed delta. */
static inline int32_t
timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) {
	return (int32_t)((t1 + toff1) - (t2 + toff2));
};

/* Returns true if the timer is on the timer list */
#define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)


void	add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
void	add_to_time_wait(struct tcpcb *tp, uint32_t delay) ;

static void tcp_garbage_collect(struct inpcb *, int);

/*
 * Move a connection into a TIME_WAIT slot.  Removes the inpcb from the
 * main TCP pcb list and inserts it into the time_wait_slots[] wheel at an
 * offset of 'delay' (in TCP_RETRANSHZ ticks) from cur_tw_slot.
 * Caller must hold the pcbinfo lock exclusively (asserted below).
 */
void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
{
	int		tw_slot;
	struct inpcbinfo *pcbinfo	= &tcbinfo;
	uint32_t timer;

	/* pcb list should be locked when we get here */
	lck_rw_assert(pcbinfo->mtx, LCK_RW_ASSERT_EXCLUSIVE);

	LIST_REMOVE(tp->t_inpcb, inp_list);

	/* if (tp->t_timer[TCPT_2MSL] <= 0)
	    tp->t_timer[TCPT_2MSL] = 1; */

	/*
	 * Because we're pulling this pcb out of the main TCP pcb list,
	 * we need to recalculate the TCPT_2MSL timer value for tcp_slowtimo
	 * higher timer granularity.
	 */

	/* Convert from TCP_RETRANSHZ ticks down to PR_SLOWHZ slow-timeout ticks. */
	timer = (delay / TCP_RETRANSHZ) * PR_SLOWHZ;
	tp->t_rcvtime = (tp->t_rcvtime / TCP_RETRANSHZ) * PR_SLOWHZ;

	tp->t_rcvtime += timer & (N_TIME_WAIT_SLOTS - 1);

	/* Pick the wheel slot, wrapping modulo N_TIME_WAIT_SLOTS. */
	tw_slot = (timer & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot;
	if (tw_slot >= N_TIME_WAIT_SLOTS)
		tw_slot -= N_TIME_WAIT_SLOTS;

	LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list);
}

/*
 * Locking wrapper for add_to_time_wait_locked().  If the pcbinfo lock
 * cannot be taken immediately, the socket lock is dropped before blocking
 * on it and re-taken afterwards (presumably to respect the pcbinfo-before-
 * socket lock ordering — confirm against the rest of the stack).
 */
void add_to_time_wait(struct tcpcb *tp, uint32_t delay)
{
	struct inpcbinfo *pcbinfo = &tcbinfo;

	if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
		tcp_unlock(tp->t_inpcb->inp_socket, 0, 0);
		lck_rw_lock_exclusive(pcbinfo->mtx);
		tcp_lock(tp->t_inpcb->inp_socket, 0, 0);
	}
	add_to_time_wait_locked(tp, delay);
	lck_rw_done(pcbinfo->mtx);
}

/*
 * Garbage-collect a single inpcb/tcpcb pair: detach and/or dispose of
 * sockets that are no longer in use (overflow sockets dropped from a
 * listen queue, peer-closed queued connections, and pcbs in
 * WNT_STOPUSING state).  'istimewait' is non-zero when called for the
 * time-wait slots.  Best-effort: bails out without side effects if the
 * pcb is busy or its mutex cannot be taken without spinning.
 */
static void
tcp_garbage_collect(struct inpcb *inp, int istimewait)
{
	struct socket *so;
	struct tcpcb *tp;

	so = inp->inp_socket;
	tp = intotcpcb(inp);

	/*
	 * Skip if still in use or busy; it would have been more efficient
	 * if we were to test so_usecount against 0, but this isn't possible
	 * due to the current implementation of tcp_dropdropablreq() where
	 * overflow sockets that are eligible for garbage collection have
	 * their usecounts set to 1.
	 */
	if (so->so_usecount > 1 || !lck_mtx_try_lock_spin(&inp->inpcb_mtx))
		return;

	/* Check again under the lock */
	if (so->so_usecount > 1) {
		lck_mtx_unlock(&inp->inpcb_mtx);
		return;
	}

	/*
	 * Overflowed socket dropped from the listening queue? Do this
	 * only if we are called to clean up the time wait slots, since
	 * tcp_dropdropablreq() considers a socket to have been fully
	 * dropped after add_to_time_wait() is finished.
	 * Also handle the case of connections getting closed by the peer while in the queue as
	 * seen with rdar://6422317
	 *
	 */
	if (so->so_usecount == 1 &&
	    ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
	    ((tp != NULL) && (tp->t_state == TCPS_CLOSED) && (so->so_head != NULL)
	    && ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
	    (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {

		if (inp->inp_state != INPCB_STATE_DEAD) {
			/* Become a regular mutex */
			lck_mtx_convert_spin(&inp->inpcb_mtx);
#if INET6
			if (INP_CHECK_SOCKAF(so, AF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
			in_pcbdetach(inp);
		}
		so->so_usecount--;
		lck_mtx_unlock(&inp->inpcb_mtx);
		return;
	} else if (inp->inp_wantcnt != WNT_STOPUSING) {
		lck_mtx_unlock(&inp->inpcb_mtx);
		return;
	}

	/*
	 * We get here because the PCB is no longer searchable (WNT_STOPUSING);
	 * detach (if needed) and dispose if it is dead (usecount is 0). This
	 * covers all cases, including overflow sockets and those that are
	 * considered as "embryonic", i.e. created by sonewconn() in TCP input
	 * path, and have not yet been committed. For the former, we reduce
	 * the usecount to 0 as done by the code above. For the latter, the
	 * usecount would have reduced to 0 as part calling soabort() when the
	 * socket is dropped at the end of tcp_input().
	 */
	if (so->so_usecount == 0) {
		DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
			struct tcpcb *, tp, int32_t, TCPS_CLOSED);
		/* Become a regular mutex */
		lck_mtx_convert_spin(&inp->inpcb_mtx);

		/* If this tp still happens to be on the timer list,
		 * take it out
		 */
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}

		if (inp->inp_state != INPCB_STATE_DEAD) {
#if INET6
			if (INP_CHECK_SOCKAF(so, AF_INET6))
				in6_pcbdetach(inp);
			else
#endif /* INET6 */
			in_pcbdetach(inp);
		}
		/* in_pcbdispose() frees the pcb; inpcb_mtx must not be
		 * unlocked here — disposal consumes it. */
		in_pcbdispose(inp);
	} else {
		lck_mtx_unlock(&inp->inpcb_mtx);
	}
}

/*
 * Slow timeout handler.  Walks the current time-wait wheel slot, expiring
 * 2MSL timers and closing finished connections, then garbage-collects both
 * the main pcb list and the current time-wait slot, and finally advances
 * cur_tw_slot.  Requires (and takes) the pcbinfo lock exclusively; if the
 * lock is contested and a GC pass already ran last tick, it skips this tick.
 */
void
tcp_slowtimo(void)
{
	struct inpcb *inp, *nxt;
	struct tcpcb *tp;
#if TCPDEBUG
	int ostate;
#endif

#if KDEBUG
	static int tws_checked = 0;
#endif

	struct inpcbinfo *pcbinfo	= &tcbinfo;

	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0,0,0,0,0);

	tcp_maxidle = tcp_keepcnt * tcp_keepintvl;

	/* Update tcp_now here as it may get used while processing the slow timer */
	calculate_tcp_clock();

	/* Garbage collect socket/tcpcb: We need to acquire the list lock
	 * exclusively to do this
	 */

	if (lck_rw_try_lock_exclusive(pcbinfo->mtx) == FALSE) {
		if (tcp_gc_done == TRUE) { /* don't sweat it this time. cleanup was done last time */
			tcp_gc_done = FALSE;
			KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
			return; /* Upgrade failed and lost lock - give up this time. */
		}
		lck_rw_lock_exclusive(pcbinfo->mtx); /* Upgrade failed, lost lock now take it again exclusive */
	}
	tcp_gc_done = TRUE;

	/*
	 * Process the items in the current time-wait slot
	 */
#if KDEBUG
	tws_checked = 0;
#endif
	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_NONE, tws_checked,0,0,0,0);

	LIST_FOREACH(inp, &time_wait_slots[cur_tw_slot], inp_list) {
#if KDEBUG
		tws_checked++;
#endif

		/* Take a want ref so the pcb cannot go away under us. */
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
			continue;

		tcp_lock(inp->inp_socket, 1, 0);

		if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)
			goto twunlock;

		tp = intotcpcb(inp);
		if (tp == NULL) /* tp already closed, remove from list */
			goto twunlock;

		/* Count down the 2MSL timer one wheel revolution at a time. */
		if (tp->t_timer[TCPT_2MSL] >= N_TIME_WAIT_SLOTS) {
			tp->t_timer[TCPT_2MSL] -= N_TIME_WAIT_SLOTS;
			tp->t_rcvtime += N_TIME_WAIT_SLOTS;
		}
		else
			tp->t_timer[TCPT_2MSL] = 0;

		if (tp->t_timer[TCPT_2MSL] == 0) {

			/* That pcb is ready for a close */
			tcp_free_sackholes(tp);
			tp = tcp_close(tp);
		}
twunlock:
		tcp_unlock(inp->inp_socket, 1, 0);
	}


	LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
		tcp_garbage_collect(inp, 0);
	}

	/* Now cleanup the time wait ones */
	LIST_FOREACH_SAFE(inp, &time_wait_slots[cur_tw_slot], inp_list, nxt) {
		tcp_garbage_collect(inp, 1);
	}

	if (++cur_tw_slot >= N_TIME_WAIT_SLOTS)
		cur_tw_slot = 0;

	lck_rw_done(pcbinfo->mtx);
	KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0);
}

/*
 * Cancel all timers for TCP tp.
 */
void
tcp_canceltimers(tp)
	struct tcpcb *tp;
{
	register int i;

	/* Detach from the global timer list first, then zero every slot. */
	tcp_remove_timer(tp);
	for (i = 0; i < TCPT_NTIMERS; i++)
		tp->t_timer[i] = 0;
	tp->tentry.timer_start = tcp_now;
	tp->tentry.index = TCPT_NONE;
}

/* Backoff multiplier tables, indexed by t_rxtshift; applied to
 * TCP_REXMTVAL() when computing the next retransmit interval
 * (SYN retransmits use the gentler tcp_syn_backoff[]). */
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

static int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 *
 * Runs the expired timer 'timer' for connection 'tp'.  Returns the
 * (possibly different) tcpcb, or NULL if the connection was dropped or
 * closed as a result.  Caller holds the socket lock.
 */
struct tcpcb *
tcp_timers(tp, timer)
	register struct tcpcb *tp;
	int timer;
{
	register int rexmt;
	struct socket *so;
	struct tcptemp *t_template;
	int optlen = 0;
	int idle_time = 0;

#if TCPDEBUG
	int ostate;
#endif

#if INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
#endif /* INET6 */

	so = tp->t_inpcb->inp_socket;
	idle_time = tcp_now - tp->t_rcvtime;

	switch (timer) {

	/*
	 * 2 MSL timeout in shutdown went off. If we're closed but
	 * still waiting for peer to close and connection has been idle
	 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
	 * delete connection control block.
	 * Otherwise, (this case shouldn't happen) check again in a bit
	 * we keep the socket in the main list in that case.
	 */
	case TCPT_2MSL:
		tcp_free_sackholes(tp);
		if (tp->t_state != TCPS_TIME_WAIT &&
		    tp->t_state != TCPS_FIN_WAIT_2 &&
		    ((idle_time > 0) && (idle_time < tcp_maxidle))) {
			tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, (u_int32_t)tcp_keepintvl);
		}
		else {
			tp = tcp_close(tp);
			return(tp);
		}
		break;

	/*
	 * Retransmission timer went off. Message has not
	 * been acked within retransmit interval. Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	case TCPT_REXMT:
		/* Drop a connection in the retransmit timer
		 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT times
		 * 2. If the time spent in this retransmission episode is more than
		 *    the time limit set with TCP_RXT_CONNDROPTIME socket option
		 * 3. If TCP_RXT_FINDROP socket option was set and we have already
		 *    retransmitted the FIN 3 times without receiving an ack
		 */
		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
			(tp->rxt_conndroptime > 0 && tp->rxt_start > 0 &&
			(tcp_now - tp->rxt_start) >= tp->rxt_conndroptime) ||
			((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
			(tp->t_flags & TF_SENTFIN) != 0 &&
			tp->t_rxtshift >= 4)) {

			if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
				tcpstat.tcps_rxtfindrop++;
			} else {
				tcpstat.tcps_timeoutdrop++;
			}
			tp->t_rxtshift = TCP_MAXRXTSHIFT;
			postevent(so, 0, EV_TIMEOUT);
			soevent(so,
			    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
			tp = tcp_drop(tp, tp->t_softerror ?
			    tp->t_softerror : ETIMEDOUT);

			break;
		}

		if (tp->t_rxtshift == 1) {
			/*
			 * first retransmit; record ssthresh and cwnd so they can
			 * be recovered if this turns out to be a "bad" retransmit.
			 * A retransmit is considered "bad" if an ACK for this
			 * segment is received within RTT/2 interval; the assumption
			 * here is that the ACK was already in flight. See
			 * "On Estimating End-to-End Network Path Properties" by
			 * Allman and Paxson for more details.
			 */
			tp->snd_cwnd_prev = tp->snd_cwnd;
			tp->snd_ssthresh_prev = tp->snd_ssthresh;
			tp->snd_recover_prev = tp->snd_recover;
			if (IN_FASTRECOVERY(tp))
				  tp->t_flags |= TF_WASFRECOVERY;
			else
				  tp->t_flags &= ~TF_WASFRECOVERY;
			tp->t_badrxtwin = tcp_now + (tp->t_srtt >> (TCP_RTT_SHIFT));

			/* Set the time at which retransmission on this
			 * connection started
			 */
			tp->rxt_start = tcp_now;
		}
		tcpstat.tcps_rexmttimeo++;

		/* Exponential backoff: SYNs use the gentler table. */
		if (tp->t_state == TCPS_SYN_SENT)
			rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
		else
			rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
		TCPT_RANGESET(tp->t_rxtcur, rexmt,
			tp->t_rttmin, TCPTV_REXMTMAX,
			TCP_ADD_REXMTSLOP(tp));
		tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);

		/* If the interface is flow-controlled, skip straight to output. */
		if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
			goto fc_output;

		tcp_free_sackholes(tp);
		/*
		 * Check for potential Path MTU Discovery Black Hole
		 */

		if (tcp_pmtud_black_hole_detect && (tp->t_state == TCPS_ESTABLISHED)) {
			if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) == (TF_PMTUD|TF_MAXSEGSNT)) &&
			    (tp->t_rxtshift == 2)) {
				/*
				 * Enter Path MTU Black-hole Detection mechanism:
				 * - Disable Path MTU Discovery (IP "DF" bit).
				 * - Reduce MTU to lower value than what we negociated with peer.
				 */

				tp->t_flags &= ~TF_PMTUD; /* Disable Path MTU Discovery for now */
				tp->t_flags |= TF_BLACKHOLE; /* Record that we may have found a black hole */
				optlen = tp->t_maxopd - tp->t_maxseg;
				tp->t_pmtud_saved_maxopd = tp->t_maxopd; /* Keep track of previous MSS */
				if (tp->t_maxopd > tcp_pmtud_black_hole_mss)
					tp->t_maxopd = tcp_pmtud_black_hole_mss; /* Reduce the MSS to intermediary value */
				else {
					tp->t_maxopd = /* use the default MSS */
#if INET6
						isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
							tcp_mssdflt;
				}
				tp->t_maxseg = tp->t_maxopd - optlen;

				/*
				 * Reset the slow-start flight size as it may depends on the new MSS
				 */
				if (CC_ALGO(tp)->cwnd_init != NULL)
					CC_ALGO(tp)->cwnd_init(tp);
			}
			/*
			 * If further retransmissions are still unsuccessful with a lowered MTU,
			 * maybe this isn't a Black Hole and we restore the previous MSS and
			 * blackhole detection flags.
			 */
			else {

				if ((tp->t_flags & TF_BLACKHOLE) && (tp->t_rxtshift > 4)) {
					tp->t_flags |= TF_PMTUD;
					tp->t_flags &= ~TF_BLACKHOLE;
					optlen = tp->t_maxopd - tp->t_maxseg;
					tp->t_maxopd = tp->t_pmtud_saved_maxopd;
					tp->t_maxseg = tp->t_maxopd - optlen;
					/*
					 * Reset the slow-start flight size as it may depends on the new MSS
					 */
					if (CC_ALGO(tp)->cwnd_init != NULL)
						CC_ALGO(tp)->cwnd_init(tp);
				}
			}
		}


		/*
		 * Disable rfc1323 and rfc1644 if we haven't got any response to
		 * our SYN (after we reach the threshold) to work-around some
		 * broken terminal servers (most of which have hopefully been
		 * retired) that have bad VJ header compression code which
		 * trashes TCP segments containing unknown-to-them TCP options.
		 */
		if ((tp->t_state == TCPS_SYN_SENT) &&
		    (tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres))
			tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);

		/*
		 * If losing, let the lower level know and try for
		 * a better route. Also, if we backed off this far,
		 * our srtt estimate is probably bogus. Clobber it
		 * so we'll take the next rtt measurement as our srtt;
		 * move the current srtt into rttvar to keep the current
		 * retransmit times until then.
		 */
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#if INET6
			if (isipv6)
				in6_losing(tp->t_inpcb);
			else
#endif /* INET6 */
			in_losing(tp->t_inpcb);
			tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
			tp->t_srtt = 0;
		}
		tp->snd_nxt = tp->snd_una;
		/*
		 * Note:  We overload snd_recover to function also as the
		 * snd_last variable described in RFC 2582
		 */
		tp->snd_recover = tp->snd_max;
		/*
		 * Force a segment to be sent.
		 */
		tp->t_flags |= TF_ACKNOW;
		/*
		 * If timing a segment in this window, stop the timer.
		 */
		tp->t_rtttime = 0;

		if (CC_ALGO(tp)->after_timeout != NULL)
			CC_ALGO(tp)->after_timeout(tp);

		tp->t_dupacks = 0;
		EXIT_FASTRECOVERY(tp);

		/* CWR notifications are to be sent on new data right after
		 * RTOs, Fast Retransmits and ECE notification receipts.
		 */
		if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) {
			tp->ecn_flags |= TE_SENDCWR;
		}
fc_output:
		DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
			struct tcpcb *, tp, struct tcphdr *, NULL,
			int32_t, TCP_CC_REXMT_TIMEOUT);

		(void) tcp_output(tp);
		break;

	/*
	 * Persistance timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	case TCPT_PERSIST:
		tcpstat.tcps_persisttimeo++;
		/*
		 * Hack: if the peer is dead/unreachable, we do not
		 * time out if the window is closed. After a full
		 * backoff, drop the connection if the idle time
		 * (no responses to probes) reaches the maximum
		 * backoff that we would use if retransmitting.
		 *
		 * Drop the connection if we reached the maximum allowed time for
		 * Zero Window Probes without a non-zero update from the peer.
		 * See rdar://5805356
		 */
		if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
		    (idle_time >= tcp_maxpersistidle ||
		    idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
		    ((tp->t_persist_stop != 0) &&
			TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
			tcpstat.tcps_persistdrop++;
			postevent(so, 0, EV_TIMEOUT);
			soevent(so,
			    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
			tp = tcp_drop(tp, ETIMEDOUT);
			break;
		}
		tcp_setpersist(tp);
		/* t_force makes tcp_output() send a probe even with a
		 * closed window; cleared immediately after. */
		tp->t_force = 1;
		(void) tcp_output(tp);
		tp->t_force = 0;
		break;

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	case TCPT_KEEP:
		tcpstat.tcps_keeptimeo++;
		if (tp->t_state < TCPS_ESTABLISHED)
			goto dropit;
		if ((always_keepalive ||
		    tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
		    (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
			if (idle_time >= TCP_KEEPIDLE(tp) + (u_int32_t)tcp_maxidle)
				goto dropit;
			/*
			 * Send a packet designed to force a response
			 * if the peer is up and reachable:
			 * either an ACK if the connection is still alive,
			 * or an RST if the peer has closed the connection
			 * due to timeout or reboot.
			 * Using sequence number tp->snd_una-1
			 * causes the transmitted zero-length segment
			 * to lie outside the receive window;
			 * by the protocol spec, this requires the
			 * correspondent TCP to respond.
			 */
			tcpstat.tcps_keepprobe++;
			t_template = tcp_maketemplate(tp);
			if (t_template) {
				unsigned int ifscope, nocell = 0;

				if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
					ifscope = tp->t_inpcb->inp_boundifp->if_index;
				else
					ifscope = IFSCOPE_NONE;

				/*
				 * If the socket isn't allowed to use the
				 * cellular interface, indicate it as such.
				 */
				if (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR)
					nocell = 1;

				tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0, ifscope,
				    nocell);
				(void) m_free(dtom(t_template));
			}
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, tcp_keepintvl);
		} else
			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp));
		break;
	case TCPT_DELACK:
		if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
			tp->t_flags &= ~TF_DELACK;
			tp->t_timer[TCPT_DELACK] = 0;
			tp->t_flags |= TF_ACKNOW;

			/* If delayed ack timer fired while we are stretching acks,
			 * go back to acking every other packet
			 */
			if ((tp->t_flags & TF_STRETCHACK) != 0)
				tcp_reset_stretch_ack(tp);

			/* If we are measuring inter packet arrival jitter for
			 * throttling a connection, this delayed ack might be
			 * the reason for accumulating some jitter. So let's
			 * restart the measurement.
			 */
			CLEAR_IAJ_STATE(tp);

			tcpstat.tcps_delack++;
			(void) tcp_output(tp);
		}
		break;

	/* NOTE(review): the tcp_trace() call below sits after the DELACK
	 * case's break and before the dropit label, so it appears to be
	 * unreachable (and 'ostate' is never assigned) — confirm intent. */
#if TCPDEBUG
	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	dropit:
		tcpstat.tcps_keepdrops++;
		postevent(so, 0, EV_TIMEOUT);
		soevent(so,
		    (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
		tp = tcp_drop(tp, ETIMEDOUT);
		break;
	}
	return (tp);
}

/* Remove a timer entry from timer list */
void
tcp_remove_timer(struct tcpcb *tp)
{
	struct tcptimerlist *listp = &tcp_timer_list;

	/* Caller must hold the inpcb mutex; the cheap unlocked check below
	 * avoids taking the list lock when the tp is not on the list. */
	lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
	if (!(TIMER_IS_ON_LIST(tp))) {
		return;
	}
	lck_mtx_lock(listp->mtx);

	/* Check if pcb is on timer list again after acquiring the lock */
	if (!(TIMER_IS_ON_LIST(tp))) {
		lck_mtx_unlock(listp->mtx);
		return;
	}

	/* If the list walker's cursor points at us, advance it first. */
	if (listp->next_te != NULL && listp->next_te == &tp->tentry)
		listp->next_te = LIST_NEXT(&tp->tentry, le);

	LIST_REMOVE(&tp->tentry, le);
	tp->t_flags &= ~(TF_TIMER_ONLIST);

	listp->entries--;

	tp->tentry.le.le_next = NULL;
	tp->tentry.le.le_prev = NULL;
	lck_mtx_unlock(listp->mtx);
}

/* Function to check if the timerlist needs to be rescheduled to run
 * the timer entry correctly. Basically, this is to check if we can avoid
 * taking the list lock.
 */

static boolean_t
need_to_resched_timerlist(uint32_t runtime, uint16_t index) {
	struct tcptimerlist *listp = &tcp_timer_list;
	int32_t diff;
	boolean_t is_fast;

	if (runtime == 0 || index == TCPT_NONE)
		return FALSE;
	is_fast = !(IS_TIMER_SLOW(index));

	/* If the list is being processed then the state of the list is in flux.
	 * In this case always acquire the lock and set the state correctly.
	 */
	if (listp->running) {
		return TRUE;
	}

	diff = timer_diff(listp->runtime, 0, runtime, 0);
	if (diff <= 0) {
		/* The list is going to run before this timer */
		return FALSE;
	} else {
		/* Only reschedule if the new deadline beats the list's next
		 * run by more than one quantum of the relevant mode. */
		if (is_fast) {
			if (diff <= listp->fast_quantum)
				return FALSE;
		} else {
			if (diff <= listp->slow_quantum)
				return FALSE;
		}
	}
	return TRUE;
}

/*
 * Arm the timer list's thread call to fire 'offset' TCP_RETRANSHZ ticks
 * from now.  Caller must hold the timer list mutex (asserted).
 */
void
tcp_sched_timerlist(uint32_t offset)
{

	uint64_t deadline = 0;
	struct tcptimerlist *listp = &tcp_timer_list;

	lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED);

	listp->runtime = tcp_now + offset;

	/* Convert ticks to an absolute mach deadline. */
	clock_interval_to_deadline(offset, NSEC_PER_SEC / TCP_RETRANSHZ,
		&deadline);

	thread_call_enter_delayed(listp->call, deadline);
}

/* Function to run the timers for a connection.
 *
 * Returns the offset of next timer to be run for this connection which
 * can be used to reschedule the timerlist.
 */
uint32_t
tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) {

	struct socket *so;
	uint16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
	uint32_t timer_val, offset = 0, lo_timer = 0;
	int32_t diff;
	boolean_t needtorun[TCPT_NTIMERS];
	int count = 0;

	VERIFY(tp != NULL);
	bzero(needtorun, sizeof(needtorun));

	tcp_lock(tp->t_inpcb->inp_socket, 1, 0);

	so = tp->t_inpcb->inp_socket;
	/* Release the want count on inp */
	if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1) == WNT_STOPUSING) {
		if (TIMER_IS_ON_LIST(tp)) {
			tcp_remove_timer(tp);
		}

		/* Looks like the TCP connection got closed while we
		 * were waiting for the lock.. Done
		 */
		goto done;
	}

	/* Since the timer thread needs to wait for tcp lock, it may race
	 * with another thread that can cancel or reschedule the timer that is
	 * about to run. Check if we need to run anything.
	 */
	/* NOTE(review): t_timer[index] is read before the index == TCPT_NONE
	 * guard below; if TCPT_NONE is outside the t_timer[] bounds this is
	 * an out-of-range read whose value is discarded — verify ordering. */
	index = tp->tentry.index;
	timer_val = tp->t_timer[index];

	if (index == TCPT_NONE || tp->tentry.runtime == 0)
		goto done;

	diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
	if (diff > 0) {
		/* Timer was pushed out after we were scheduled; report the
		 * remaining offset instead of running it. */
		if (tp->tentry.index != TCPT_NONE) {
			offset = diff;
			*(next_index) = tp->tentry.index;
		}
		goto done;
	}

	tp->t_timer[index] = 0;
	if (timer_val > 0) {
		tp = tcp_timers(tp, index);
		if (tp == NULL)
			goto done;
	}

	/* Check if there are any other timers that need to be run. While doing it,
	 * adjust the timer values wrt tcp_now
	 */
	for (i = 0; i < TCPT_NTIMERS; ++i) {
		if (tp->t_timer[i] != 0) {
			diff = timer_diff(tp->tentry.timer_start, tp->t_timer[i], tcp_now, 0);
			if (diff <= 0) {
				/* Already due: mark for a second pass below. */
				tp->t_timer[i] = 0;
				needtorun[i] = TRUE;
				count++;
			} else {
				/* Rebase the timer relative to tcp_now and
				 * track the soonest one. */
				tp->t_timer[i] = diff;
				needtorun[i] = FALSE;
				if (lo_timer == 0 || diff < lo_timer) {
					lo_timer = diff;
					lo_index = i;
				}
			}
		}
	}

	tp->tentry.timer_start = tcp_now;
	tp->tentry.index = lo_index;
	if (lo_index != TCPT_NONE) {
		tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index];
	} else {
		tp->tentry.runtime = 0;
	}

	if (count > 0) {
		/* run any other timers that are also outstanding at this time. */
		for (i = 0; i < TCPT_NTIMERS; ++i) {
			if (needtorun[i]) {
				tp->t_timer[i] = 0;
				tp = tcp_timers(tp, i);
				if (tp == NULL)
					goto done;
			}
		}
		/* Recompute the lowest timer after running the extras. */
		tcp_set_lotimer_index(tp);
	}

	if (tp->tentry.index < TCPT_NONE) {
		offset = tp->t_timer[tp->tentry.index];
		*(next_index) = tp->tentry.index;
	}

done:
	if (tp != NULL && tp->tentry.index == TCPT_NONE) {
		tcp_remove_timer(tp);
	}
	tcp_unlock(so, 1, 0);
	return offset;
}

void
tcp_run_timerlist(void * arg1, void * arg2) {

#pragma unused(arg1, arg2)

	struct tcptimerentry *te, *next_te;
	struct tcptimerlist *listp = &tcp_timer_list;
	struct tcpcb *tp;
	uint32_t next_timer = 0;
	uint16_t index = TCPT_NONE;
	boolean_t need_fast = FALSE;
	uint32_t active_count = 0;
	uint32_t mode = TCP_TIMERLIST_FASTMODE;

	calculate_tcp_clock();

	lck_mtx_lock(listp->mtx);

	listp->running = TRUE;

	LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
		uint32_t offset = 0;
		uint32_t runtime = te->runtime;
		if (TSTMP_GT(runtime, tcp_now)) {
			/* Not due yet: remember the nearest future deadline. */
			offset = timer_diff(runtime, 0, tcp_now, 0);
			if (next_timer == 0 || offset < next_timer) {
				next_timer = offset;
			}
			continue;
		}
		active_count++;

		tp = TIMERENTRY_TO_TP(te);

		/* Acquire an inp wantcnt on the inpcb so that the socket won't get
		 * detached even if tcp_close is called
		 */
		if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
			/* Some how this pcb went into dead state while on the timer list,
			 * just take it off the list.
Since the timer list entry pointers 1141 * are protected by the timer list lock, we can do it here 1142 */ 1143 if (TIMER_IS_ON_LIST(tp)) { 1144 tp->t_flags &= ~(TF_TIMER_ONLIST); 1145 LIST_REMOVE(&tp->tentry, le); 1146 listp->entries--; 1147 1148 tp->tentry.le.le_next = NULL; 1149 tp->tentry.le.le_prev = NULL; 1150 } 1151 continue; 1152 } 1153 1154 /* Store the next timerentry pointer before releasing the list lock. 1155 * If that entry has to be removed when we release the lock, this 1156 * pointer will be updated to the element after that. 1157 */ 1158 listp->next_te = next_te; 1159 1160 VERIFY_NEXT_LINK(&tp->tentry, le); 1161 VERIFY_PREV_LINK(&tp->tentry, le); 1162 1163 lck_mtx_unlock(listp->mtx); 1164 1165 index = TCPT_NONE; 1166 offset = tcp_run_conn_timer(tp, &index); 1167 1168 lck_mtx_lock(listp->mtx); 1169 1170 next_te = listp->next_te; 1171 listp->next_te = NULL; 1172 1173 if (offset > 0) { 1174 if (index < TCPT_NONE) { 1175 /* Check if this is a fast_timer. */ 1176 if (!need_fast && !(IS_TIMER_SLOW(index))) { 1177 need_fast = TRUE; 1178 } 1179 1180 if (next_timer == 0 || offset < next_timer) { 1181 next_timer = offset; 1182 } 1183 } 1184 } 1185 } 1186 1187 if (!LIST_EMPTY(&listp->lhead)) { 1188 if (listp->mode == TCP_TIMERLIST_FASTMODE) { 1189 if (need_fast || active_count > 0 || 1190 listp->pref_mode == TCP_TIMERLIST_FASTMODE) { 1191 listp->idlegen = 0; 1192 } else { 1193 listp->idlegen++; 1194 if (listp->idlegen > timer_fastmode_idlemax) { 1195 mode = TCP_TIMERLIST_SLOWMODE; 1196 listp->idlegen = 0; 1197 } 1198 } 1199 } else { 1200 if (!need_fast) { 1201 mode = TCP_TIMERLIST_SLOWMODE; 1202 } 1203 } 1204 1205 if (mode == TCP_TIMERLIST_FASTMODE || 1206 listp->pref_mode == TCP_TIMERLIST_FASTMODE) { 1207 next_timer = listp->fast_quantum; 1208 } else { 1209 if (listp->pref_offset != 0 && 1210 listp->pref_offset < next_timer) 1211 next_timer = listp->pref_offset; 1212 if (next_timer < listp->slow_quantum) 1213 next_timer = listp->slow_quantum; 1214 } 1215 
1216 listp->mode = mode; 1217 1218 tcp_sched_timerlist(next_timer); 1219 } else { 1220 /* No need to reschedule this timer */ 1221 listp->runtime = 0; 1222 } 1223 1224 listp->running = FALSE; 1225 listp->pref_mode = 0; 1226 listp->pref_offset = 0; 1227 1228 lck_mtx_unlock(listp->mtx); 1229} 1230 1231/* Function to verify if a change in timer state is required for a connection */ 1232void 1233tcp_sched_timers(struct tcpcb *tp) 1234{ 1235 struct tcptimerentry *te = &tp->tentry; 1236 uint16_t index = te->index; 1237 struct tcptimerlist *listp = &tcp_timer_list; 1238 uint32_t offset = 0; 1239 boolean_t is_fast; 1240 int list_locked = 0; 1241 1242 if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) { 1243 /* Just return without adding the dead pcb to the list */ 1244 if (TIMER_IS_ON_LIST(tp)) { 1245 tcp_remove_timer(tp); 1246 } 1247 return; 1248 } 1249 1250 if (index == TCPT_NONE) { 1251 tcp_remove_timer(tp); 1252 return; 1253 } 1254 1255 is_fast = !(IS_TIMER_SLOW(index)); 1256 offset = te->runtime - tcp_now; 1257 if (offset == 0) { 1258 offset = 1; 1259 tcp_timer_advanced++; 1260 } 1261 if (is_fast) 1262 offset = listp->fast_quantum; 1263 1264 if (!TIMER_IS_ON_LIST(tp)) { 1265 if (!list_locked) { 1266 lck_mtx_lock(listp->mtx); 1267 list_locked = 1; 1268 } 1269 1270 LIST_INSERT_HEAD(&listp->lhead, te, le); 1271 tp->t_flags |= TF_TIMER_ONLIST; 1272 1273 listp->entries++; 1274 if (listp->entries > listp->maxentries) 1275 listp->maxentries = listp->entries; 1276 1277 /* if the list is not scheduled, just schedule it */ 1278 if (listp->runtime == 0) 1279 goto schedule; 1280 1281 } 1282 1283 1284 /* timer entry is currently on the list */ 1285 if (need_to_resched_timerlist(te->runtime, index)) { 1286 tcp_resched_timerlist++; 1287 1288 if (!list_locked) { 1289 lck_mtx_lock(listp->mtx); 1290 list_locked = 1; 1291 } 1292 1293 VERIFY_NEXT_LINK(te, le); 1294 VERIFY_PREV_LINK(te, le); 1295 1296 if (listp->running) { 1297 if (is_fast) { 1298 listp->pref_mode = TCP_TIMERLIST_FASTMODE; 
1299 } else if (listp->pref_offset == 0 || 1300 ((int)offset) < listp->pref_offset) { 1301 listp->pref_offset = offset; 1302 } 1303 } else { 1304 int32_t diff; 1305 diff = timer_diff(listp->runtime, 0, tcp_now, offset); 1306 if (diff <= 0) { 1307 /* The list is going to run before this timer */ 1308 goto done; 1309 } else { 1310 goto schedule; 1311 } 1312 } 1313 } 1314 goto done; 1315 1316schedule: 1317 if (is_fast) { 1318 listp->mode = TCP_TIMERLIST_FASTMODE; 1319 listp->idlegen = 0; 1320 } 1321 tcp_sched_timerlist(offset); 1322 1323done: 1324 if (list_locked) 1325 lck_mtx_unlock(listp->mtx); 1326 1327 return; 1328} 1329 1330void 1331tcp_set_lotimer_index(struct tcpcb *tp) { 1332 uint16_t i, lo_index = TCPT_NONE; 1333 uint32_t lo_timer = 0; 1334 for (i = 0; i < TCPT_NTIMERS; ++i) { 1335 if (tp->t_timer[i] != 0 && 1336 (lo_timer == 0 || tp->t_timer[i] < lo_timer)) { 1337 lo_timer = tp->t_timer[i]; 1338 lo_index = i; 1339 } 1340 } 1341 tp->tentry.index = lo_index; 1342 if (lo_index != TCPT_NONE) { 1343 tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index]; 1344 } else { 1345 tp->tentry.runtime = 0; 1346 } 1347} 1348 1349void 1350tcp_check_timer_state(struct tcpcb *tp) { 1351 1352 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED); 1353 1354 tcp_set_lotimer_index(tp); 1355 1356 tcp_sched_timers(tp); 1357 return; 1358} 1359