1/* 2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $ 62 */ 63 64 65#include <sys/param.h> 66#include <sys/systm.h> 67#include <sys/kernel.h> 68#include <sys/mbuf.h> 69#include <sys/sysctl.h> 70#include <sys/socket.h> 71#include <sys/socketvar.h> 72#include <sys/protosw.h> 73#include <sys/domain.h> 74#include <sys/mcache.h> 75#include <sys/queue.h> 76#include <kern/locks.h> 77#include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */ 78#include <mach/boolean.h> 79 80#include <net/route.h> 81#include <net/if_var.h> 82#include <net/ntstat.h> 83 84#include <netinet/in.h> 85#include <netinet/in_systm.h> 86#include <netinet/in_pcb.h> 87#if INET6 88#include <netinet6/in6_pcb.h> 89#endif 90#include <netinet/ip_var.h> 91#include <netinet/tcp.h> 92#include <netinet/tcp_fsm.h> 93#include <netinet/tcp_seq.h> 94#include <netinet/tcp_timer.h> 95#include <netinet/tcp_var.h> 96#include <netinet/tcp_cc.h> 97#if INET6 98#include <netinet6/tcp6_var.h> 99#endif 100#include <netinet/tcpip.h> 101#if TCPDEBUG 102#include <netinet/tcp_debug.h> 103#endif 104#include <sys/kdebug.h> 105#include <mach/sdt.h> 106#include <netinet/mptcp_var.h> 107 108#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next))) 109 110#define VERIFY_NEXT_LINK(elm,field) do { \ 111 if (LIST_NEXT((elm),field) != NULL && \ 112 LIST_NEXT((elm),field)->field.le_prev != \ 113 &((elm)->field.le_next)) \ 114 panic("Bad link elm %p next->prev != elm", (elm)); \ 115} while(0) 116 117#define VERIFY_PREV_LINK(elm,field) do { \ 118 if (*(elm)->field.le_prev != (elm)) \ 119 panic("Bad link elm %p prev->next != elm", (elm)); \ 120} while(0) 121 122#define TCP_SET_TIMER_MODE(mode, i) do { \ 123 if (IS_TIMER_HZ_10MS(i)) \ 124 (mode) |= TCP_TIMERLIST_10MS_MODE; \ 125 else if (IS_TIMER_HZ_100MS(i)) \ 126 (mode) |= TCP_TIMERLIST_100MS_MODE; \ 127 else \ 128 (mode) |= TCP_TIMERLIST_500MS_MODE; \ 129} while(0) 130 131/* Max number of times a stretch ack can be delayed on a connection */ 132#define TCP_STRETCHACK_DELAY_THRESHOLD 5 133 134/* tcp timer list */ 135struct tcptimerlist tcp_timer_list; 136 137/* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */ 138struct tcptailq tcp_tw_tailq; 139 140static int 141sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS 142{ 143#pragma unused(arg1, arg2) 144 int error, s, tt; 145 146 tt = *(int *)oidp->oid_arg1; 147 s = tt * 1000 / TCP_RETRANSHZ;; 148 149 error = sysctl_handle_int(oidp, &s, 0, req); 150 if (error || !req->newptr) 151 return (error); 152 153 tt = s * TCP_RETRANSHZ / 1000; 154 if (tt < 1) 155 return (EINVAL); 156 157 *(int *)oidp->oid_arg1 = tt; 158 return (0); 159} 160 161int tcp_keepinit; 162SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, 163 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 164 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); 165 166int tcp_keepidle; 167SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, 168 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 169 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); 170 171int tcp_keepintvl; 172SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, 173 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 174 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); 175 176int tcp_keepcnt; 177SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, 178 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 179 &tcp_keepcnt, 0, "number of times to repeat keepalive"); 180 181int tcp_msl; 182SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, 183 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 184 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 185 186/* 187 * Avoid DoS via TCP Robustness in Persist Condition 188 * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt) 189 * by allowing a system wide maximum persistence timeout value when in 190 * Zero Window Probe mode. 191 * 192 * Expressed in milliseconds to be consistent without timeout related 193 * values, the TCP socket option is in seconds. 194 */ 195u_int32_t tcp_max_persist_timeout = 0; 196SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout, 197 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 198 &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I", 199 "Maximum persistence timeout for ZWP"); 200 201static int always_keepalive = 0; 202SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, 203 CTLFLAG_RW | CTLFLAG_LOCKED, 204 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 205 206/* 207 * This parameter determines how long the timer list will stay in fast or 208 * quick mode even though all connections are idle. In this state, the 209 * timer will run more frequently anticipating new data. 210 */ 211int timer_fastmode_idlemax = TCP_FASTMODE_IDLERUN_MAX; 212SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax, 213 CTLFLAG_RW | CTLFLAG_LOCKED, 214 &timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode"); 215 216/* 217 * See tcp_syn_backoff[] for interval values between SYN retransmits; 218 * the value set below defines the number of retransmits, before we 219 * disable the timestamp and window scaling options during subsequent 220 * SYN retransmits. Setting it to 0 disables the dropping off of those 221 * two options. 222 */ 223static int tcp_broken_peer_syn_rxmit_thres = 7; 224SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, 225 CTLFLAG_RW | CTLFLAG_LOCKED, 226 &tcp_broken_peer_syn_rxmit_thres, 0, 227 "Number of retransmitted SYNs before " 228 "TCP disables rfc1323 and rfc1644 during the rest of attempts"); 229 230/* A higher threshold on local connections for disabling RFC 1323 options */ 231static int tcp_broken_peer_syn_rxmit_thres_local = 10; 232SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rexmit_thres_local, 233 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres_local, 0, 234 "Number of retransmitted SYNs before disabling RFC 1323 " 235 "options on local connections"); 236 237static int tcp_timer_advanced = 0; 238SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, 239 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_timer_advanced, 0, 240 "Number of times one of the timers was advanced"); 241 242static int tcp_resched_timerlist = 0; 243SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist, 244 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0, 245 "Number of times timer list was rescheduled as part of processing a packet"); 246 247int tcp_pmtud_black_hole_detect = 1 ; 248SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 249 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_detect, 0, 250 "Path MTU Discovery Black Hole Detection"); 251 252int tcp_pmtud_black_hole_mss = 1200 ; 253SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 254 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_mss, 0, 255 "Path MTU Discovery Black Hole Detection lowered MSS"); 256 257/* performed garbage collection of "used" sockets */ 258static boolean_t tcp_gc_done = FALSE; 259 260/* max idle probes */ 261int tcp_maxpersistidle; 262 263/* 264 * TCP delack timer is set to 100 ms. Since the processing of timer list 265 * in fast mode will happen no faster than 100 ms, the delayed ack timer 266 * will fire some where between 100 and 200 ms. 267 */ 268int tcp_delack = TCP_RETRANSHZ / 10; 269 270#if MPTCP 271/* 272 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff 273 */ 274int tcp_jack_rxmt = TCP_RETRANSHZ / 2; 275#endif /* MPTCP */ 276 277static void tcp_remove_timer(struct tcpcb *tp); 278static void tcp_sched_timerlist(uint32_t offset); 279static u_int32_t tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *mode); 280static void tcp_sched_timers(struct tcpcb *tp); 281static inline void tcp_set_lotimer_index(struct tcpcb *); 282static void tcp_rexmt_save_state(struct tcpcb *tp); 283__private_extern__ void tcp_remove_from_time_wait(struct inpcb *inp); 284__private_extern__ void tcp_report_stats(void); 285 286/* 287 * Macro to compare two timers. If there is a reset of the sign bit, 288 * it is safe to assume that the timer has wrapped around. By doing 289 * signed comparision, we take care of wrap around such that the value 290 * with the sign bit reset is actually ahead of the other. 291 */ 292inline int32_t 293timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) { 294 return (int32_t)((t1 + toff1) - (t2 + toff2)); 295}; 296 297static u_int64_t tcp_last_report_time; 298#define TCP_REPORT_STATS_INTERVAL 345600 /* 4 days, in seconds */ 299 300/* Returns true if the timer is on the timer list */ 301#define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST) 302 303/* Run the TCP timerlist atleast once every hour */ 304#define TCP_TIMERLIST_MAX_OFFSET (60 * 60 * TCP_RETRANSHZ) 305 306 307static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay); 308static boolean_t tcp_garbage_collect(struct inpcb *, int); 309 310/* 311 * Add to tcp timewait list, delay is given in milliseconds. 312 */ 313static void 314add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay) 315{ 316 struct inpcbinfo *pcbinfo = &tcbinfo; 317 struct inpcb *inp = tp->t_inpcb; 318 uint32_t timer; 319 320 /* pcb list should be locked when we get here */ 321 lck_rw_assert(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE); 322 323 /* We may get here multiple times, so check */ 324 if (!(inp->inp_flags2 & INP2_TIMEWAIT)) { 325 pcbinfo->ipi_twcount++; 326 inp->inp_flags2 |= INP2_TIMEWAIT; 327 328 /* Remove from global inp list */ 329 LIST_REMOVE(inp, inp_list); 330 } else { 331 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry); 332 } 333 334 /* Compute the time at which this socket can be closed */ 335 timer = tcp_now + delay; 336 337 /* We will use the TCPT_2MSL timer for tracking this delay */ 338 339 if (TIMER_IS_ON_LIST(tp)) 340 tcp_remove_timer(tp); 341 tp->t_timer[TCPT_2MSL] = timer; 342 343 TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry); 344} 345 346void 347add_to_time_wait(struct tcpcb *tp, uint32_t delay) 348{ 349 struct inpcbinfo *pcbinfo = &tcbinfo; 350 if (tp->t_inpcb->inp_socket->so_options & SO_NOWAKEFROMSLEEP) 351 socket_post_kev_msg_closed(tp->t_inpcb->inp_socket); 352 353 if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) { 354 tcp_unlock(tp->t_inpcb->inp_socket, 0, 0); 355 lck_rw_lock_exclusive(pcbinfo->ipi_lock); 356 tcp_lock(tp->t_inpcb->inp_socket, 0, 0); 357 } 358 add_to_time_wait_locked(tp, delay); 359 lck_rw_done(pcbinfo->ipi_lock); 360 361 inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY); 362} 363 364/* If this is on time wait queue, remove it. */ 365void 366tcp_remove_from_time_wait(struct inpcb *inp) 367{ 368 struct tcpcb *tp = intotcpcb(inp); 369 if (inp->inp_flags2 & INP2_TIMEWAIT) 370 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry); 371} 372 373static boolean_t 374tcp_garbage_collect(struct inpcb *inp, int istimewait) 375{ 376 boolean_t active = FALSE; 377 struct socket *so; 378 struct tcpcb *tp; 379 380 so = inp->inp_socket; 381 tp = intotcpcb(inp); 382 383 /* 384 * Skip if still in use or busy; it would have been more efficient 385 * if we were to test so_usecount against 0, but this isn't possible 386 * due to the current implementation of tcp_dropdropablreq() where 387 * overflow sockets that are eligible for garbage collection have 388 * their usecounts set to 1. 389 */ 390 if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx)) 391 return (TRUE); 392 393 /* Check again under the lock */ 394 if (so->so_usecount > 1) { 395 if (inp->inp_wantcnt == WNT_STOPUSING) 396 active = TRUE; 397 lck_mtx_unlock(&inp->inpcb_mtx); 398 return (active); 399 } 400 401 if (istimewait && 402 TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) && 403 tp->t_state != TCPS_CLOSED) { 404 /* Become a regular mutex */ 405 lck_mtx_convert_spin(&inp->inpcb_mtx); 406 tcp_close(tp); 407 } 408 409 /* 410 * Overflowed socket dropped from the listening queue? Do this 411 * only if we are called to clean up the time wait slots, since 412 * tcp_dropdropablreq() considers a socket to have been fully 413 * dropped after add_to_time_wait() is finished. 414 * Also handle the case of connections getting closed by the peer 415 * while in the queue as seen with rdar://6422317 416 * 417 */ 418 if (so->so_usecount == 1 && 419 ((istimewait && (so->so_flags & SOF_OVERFLOW)) || 420 ((tp != NULL) && (tp->t_state == TCPS_CLOSED) && 421 (so->so_head != NULL) && 422 ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) == 423 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) { 424 425 if (inp->inp_state != INPCB_STATE_DEAD) { 426 /* Become a regular mutex */ 427 lck_mtx_convert_spin(&inp->inpcb_mtx); 428#if INET6 429 if (SOCK_CHECK_DOM(so, PF_INET6)) 430 in6_pcbdetach(inp); 431 else 432#endif /* INET6 */ 433 in_pcbdetach(inp); 434 } 435 so->so_usecount--; 436 if (inp->inp_wantcnt == WNT_STOPUSING) 437 active = TRUE; 438 lck_mtx_unlock(&inp->inpcb_mtx); 439 return (active); 440 } else if (inp->inp_wantcnt != WNT_STOPUSING) { 441 lck_mtx_unlock(&inp->inpcb_mtx); 442 return (FALSE); 443 } 444 445 /* 446 * We get here because the PCB is no longer searchable 447 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead 448 * (usecount is 0). This covers all cases, including overflow 449 * sockets and those that are considered as "embryonic", 450 * i.e. created by sonewconn() in TCP input path, and have 451 * not yet been committed. For the former, we reduce the usecount 452 * to 0 as done by the code above. For the latter, the usecount 453 * would have reduced to 0 as part calling soabort() when the 454 * socket is dropped at the end of tcp_input(). 455 */ 456 if (so->so_usecount == 0) { 457 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 458 struct tcpcb *, tp, int32_t, TCPS_CLOSED); 459 /* Become a regular mutex */ 460 lck_mtx_convert_spin(&inp->inpcb_mtx); 461 462 /* 463 * If this tp still happens to be on the timer list, 464 * take it out 465 */ 466 if (TIMER_IS_ON_LIST(tp)) { 467 tcp_remove_timer(tp); 468 } 469 470 if (inp->inp_state != INPCB_STATE_DEAD) { 471#if INET6 472 if (SOCK_CHECK_DOM(so, PF_INET6)) 473 in6_pcbdetach(inp); 474 else 475#endif /* INET6 */ 476 in_pcbdetach(inp); 477 } 478 in_pcbdispose(inp); 479 return (FALSE); 480 } 481 482 lck_mtx_unlock(&inp->inpcb_mtx); 483 return (TRUE); 484} 485 486/* 487 * TCP garbage collector callback (inpcb_timer_func_t). 488 * 489 * Returns the number of pcbs that will need to be gc-ed soon, 490 * returnining > 0 will keep timer active. 491 */ 492void 493tcp_gc(struct inpcbinfo *ipi) 494{ 495 struct inpcb *inp, *nxt; 496 struct tcpcb *tw_tp, *tw_ntp; 497#if TCPDEBUG 498 int ostate; 499#endif 500#if KDEBUG 501 static int tws_checked = 0; 502#endif 503 504 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0); 505 506 /* 507 * Update tcp_now here as it may get used while 508 * processing the slow timer. 509 */ 510 calculate_tcp_clock(); 511 512 /* 513 * Garbage collect socket/tcpcb: We need to acquire the list lock 514 * exclusively to do this 515 */ 516 517 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) { 518 /* don't sweat it this time; cleanup was done last time */ 519 if (tcp_gc_done == TRUE) { 520 tcp_gc_done = FALSE; 521 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, 522 tws_checked, cur_tw_slot, 0, 0, 0); 523 /* Lock upgrade failed, give up this round */ 524 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1); 525 return; 526 } 527 /* Upgrade failed, lost lock now take it again exclusive */ 528 lck_rw_lock_exclusive(ipi->ipi_lock); 529 } 530 tcp_gc_done = TRUE; 531 532 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) { 533 if (tcp_garbage_collect(inp, 0)) 534 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1); 535 } 536 537 /* Now cleanup the time wait ones */ 538 TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) { 539 /* 540 * We check the timestamp here without holding the 541 * socket lock for better performance. If there are 542 * any pcbs in time-wait, the timer will get rescheduled. 543 * Hence some error in this check can be tolerated. 544 * 545 * Sometimes a socket on time-wait queue can be closed if 546 * 2MSL timer expired but the application still has a 547 * usecount on it. 548 */ 549 if (tw_tp->t_state == TCPS_CLOSED || 550 TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) { 551 if (tcp_garbage_collect(tw_tp->t_inpcb, 1)) 552 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1); 553 } 554 } 555 556 /* take into account pcbs that are still in time_wait_slots */ 557 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount); 558 559 lck_rw_done(ipi->ipi_lock); 560 561 /* Clean up the socache while we are here */ 562 if (so_cache_timer()) 563 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1); 564 565 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, 566 cur_tw_slot, 0, 0, 0); 567 568 return; 569} 570 571/* 572 * Cancel all timers for TCP tp. 573 */ 574void 575tcp_canceltimers(tp) 576 struct tcpcb *tp; 577{ 578 register int i; 579 580 tcp_remove_timer(tp); 581 for (i = 0; i < TCPT_NTIMERS; i++) 582 tp->t_timer[i] = 0; 583 tp->tentry.timer_start = tcp_now; 584 tp->tentry.index = TCPT_NONE; 585} 586 587int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 588 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 589 590int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 591 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; 592 593static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ 594 595static void tcp_rexmt_save_state(struct tcpcb *tp) 596{ 597 u_int32_t fsize; 598 if (TSTMP_SUPPORTED(tp)) { 599 /* 600 * Since timestamps are supported on the connection, 601 * we can do recovery as described in rfc 4015. 602 */ 603 fsize = tp->snd_max - tp->snd_una; 604 tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh); 605 tp->snd_recover_prev = tp->snd_recover; 606 } else { 607 /* 608 * Timestamp option is not supported on this connection. 609 * Record ssthresh and cwnd so they can 610 * be recovered if this turns out to be a "bad" retransmit. 611 * A retransmit is considered "bad" if an ACK for this 612 * segment is received within RTT/2 interval; the assumption 613 * here is that the ACK was already in flight. See 614 * "On Estimating End-to-End Network Path Properties" by 615 * Allman and Paxson for more details. 616 */ 617 tp->snd_cwnd_prev = tp->snd_cwnd; 618 tp->snd_ssthresh_prev = tp->snd_ssthresh; 619 tp->snd_recover_prev = tp->snd_recover; 620 if (IN_FASTRECOVERY(tp)) 621 tp->t_flags |= TF_WASFRECOVERY; 622 else 623 tp->t_flags &= ~TF_WASFRECOVERY; 624 } 625 tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2; 626 tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 627 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT); 628} 629 630/* 631 * Revert to the older segment size if there is an indication that PMTU 632 * blackhole detection was not needed. 633 */ 634void tcp_pmtud_revert_segment_size(struct tcpcb *tp) 635{ 636 int32_t optlen; 637 638 VERIFY(tp->t_pmtud_saved_maxopd > 0); 639 tp->t_flags |= TF_PMTUD; 640 tp->t_flags &= ~TF_BLACKHOLE; 641 optlen = tp->t_maxopd - tp->t_maxseg; 642 tp->t_maxopd = tp->t_pmtud_saved_maxopd; 643 tp->t_maxseg = tp->t_maxopd - optlen; 644 /* 645 * Reset the slow-start flight size as it 646 * may depend on the new MSS 647 */ 648 if (CC_ALGO(tp)->cwnd_init != NULL) 649 CC_ALGO(tp)->cwnd_init(tp); 650 tp->t_pmtud_start_ts = 0; 651 tcpstat.tcps_pmtudbh_reverted++; 652} 653 654/* 655 * TCP timer processing. 656 */ 657struct tcpcb * 658tcp_timers(tp, timer) 659 register struct tcpcb *tp; 660 int timer; 661{ 662 int32_t rexmt, optlen = 0, idle_time = 0; 663 struct socket *so; 664 struct tcptemp *t_template; 665#if TCPDEBUG 666 int ostate; 667#endif 668 669#if INET6 670 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0; 671#endif /* INET6 */ 672 673 so = tp->t_inpcb->inp_socket; 674 idle_time = tcp_now - tp->t_rcvtime; 675 676 switch (timer) { 677 678 /* 679 * 2 MSL timeout in shutdown went off. If we're closed but 680 * still waiting for peer to close and connection has been idle 681 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2, 682 * delete connection control block. 683 * Otherwise, (this case shouldn't happen) check again in a bit 684 * we keep the socket in the main list in that case. 685 */ 686 case TCPT_2MSL: 687 tcp_free_sackholes(tp); 688 if (tp->t_state != TCPS_TIME_WAIT && 689 tp->t_state != TCPS_FIN_WAIT_2 && 690 ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) { 691 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, 692 (u_int32_t)TCP_CONN_KEEPINTVL(tp)); 693 } else { 694 tp = tcp_close(tp); 695 return(tp); 696 } 697 break; 698 699 /* 700 * Retransmission timer went off. Message has not 701 * been acked within retransmit interval. Back off 702 * to a longer retransmit interval and retransmit one segment. 703 */ 704 case TCPT_REXMT: 705 /* 706 * Drop a connection in the retransmit timer 707 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT 708 * times 709 * 2. If the time spent in this retransmission episode is 710 * more than the time limit set with TCP_RXT_CONNDROPTIME 711 * socket option 712 * 3. If TCP_RXT_FINDROP socket option was set and 713 * we have already retransmitted the FIN 3 times without 714 * receiving an ack 715 */ 716 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || 717 (tp->t_rxt_conndroptime > 0 718 && tp->t_rxtstart > 0 && 719 (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) 720 || ((tp->t_flagsext & TF_RXTFINDROP) != 0 && 721 (tp->t_flags & TF_SENTFIN) != 0 && 722 tp->t_rxtshift >= 4)) { 723 if ((tp->t_flagsext & TF_RXTFINDROP) != 0) { 724 tcpstat.tcps_rxtfindrop++; 725 } else { 726 tcpstat.tcps_timeoutdrop++; 727 } 728 tp->t_rxtshift = TCP_MAXRXTSHIFT; 729 postevent(so, 0, EV_TIMEOUT); 730 soevent(so, 731 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT)); 732 tp = tcp_drop(tp, tp->t_softerror ? 733 tp->t_softerror : ETIMEDOUT); 734 735 break; 736 } 737 738 tcpstat.tcps_rexmttimeo++; 739 740 if (tp->t_rxtshift == 1 && 741 tp->t_state == TCPS_ESTABLISHED) { 742 /* Set the time at which retransmission started. */ 743 tp->t_rxtstart = tcp_now; 744 745 /* 746 * if this is the first retransmit timeout, save 747 * the state so that we can recover if the timeout 748 * is spurious. 749 */ 750 tcp_rexmt_save_state(tp); 751 } 752#if MPTCP 753 if ((tp->t_rxtshift >= mptcp_fail_thresh) && 754 (tp->t_state == TCPS_ESTABLISHED) && 755 (tp->t_mpflags & TMPF_MPTCP_TRUE)) { 756 mptcp_act_on_txfail(so); 757 758 } 759#endif /* MPTCP */ 760 761 if (tp->t_adaptive_wtimo > 0 && 762 tp->t_rxtshift > tp->t_adaptive_wtimo && 763 TCPS_HAVEESTABLISHED(tp->t_state)) { 764 /* Send an event to the application */ 765 soevent(so, 766 (SO_FILT_HINT_LOCKED| 767 SO_FILT_HINT_ADAPTIVE_WTIMO)); 768 } 769 770 /* 771 * If this is a retransmit timeout after PTO, the PTO 772 * was not effective 773 */ 774 if (tp->t_flagsext & TF_SENT_TLPROBE) { 775 tp->t_flagsext &= ~(TF_SENT_TLPROBE); 776 tcpstat.tcps_rto_after_pto++; 777 } 778 779 if (tp->t_flagsext & TF_DELAY_RECOVERY) { 780 /* 781 * Retransmit timer fired before entering recovery 782 * on a connection with packet re-ordering. This 783 * suggests that the reordering metrics computed 784 * are not accurate. 785 */ 786 tp->t_reorderwin = 0; 787 tp->t_timer[TCPT_DELAYFR] = 0; 788 tp->t_flagsext &= ~(TF_DELAY_RECOVERY); 789 } 790 791 if (tp->t_state == TCPS_SYN_SENT) { 792 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; 793 tp->t_stat.synrxtshift = tp->t_rxtshift; 794 } else { 795 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 796 } 797 798 TCPT_RANGESET(tp->t_rxtcur, rexmt, 799 tp->t_rttmin, TCPTV_REXMTMAX, 800 TCP_ADD_REXMTSLOP(tp)); 801 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); 802 803 if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb)) 804 goto fc_output; 805 806 tcp_free_sackholes(tp); 807 /* 808 * Check for potential Path MTU Discovery Black Hole 809 */ 810 if (tcp_pmtud_black_hole_detect && 811 !(tp->t_flagsext & TF_NOBLACKHOLE_DETECTION) && 812 (tp->t_state == TCPS_ESTABLISHED)) { 813 if (((tp->t_flags & (TF_PMTUD|TF_MAXSEGSNT)) 814 == (TF_PMTUD|TF_MAXSEGSNT)) && 815 (tp->t_rxtshift == 2)) { 816 /* 817 * Enter Path MTU Black-hole Detection mechanism: 818 * - Disable Path MTU Discovery (IP "DF" bit). 819 * - Reduce MTU to lower value than what we 820 * negotiated with the peer. 821 */ 822 /* Disable Path MTU Discovery for now */ 823 tp->t_flags &= ~TF_PMTUD; 824 /* Record that we may have found a black hole */ 825 tp->t_flags |= TF_BLACKHOLE; 826 optlen = tp->t_maxopd - tp->t_maxseg; 827 /* Keep track of previous MSS */ 828 tp->t_pmtud_saved_maxopd = tp->t_maxopd; 829 tp->t_pmtud_start_ts = tcp_now; 830 if (tp->t_pmtud_start_ts == 0) 831 tp->t_pmtud_start_ts++; 832 /* Reduce the MSS to intermediary value */ 833 if (tp->t_maxopd > tcp_pmtud_black_hole_mss) { 834 tp->t_maxopd = tcp_pmtud_black_hole_mss; 835 } else { 836 tp->t_maxopd = /* use the default MSS */ 837#if INET6 838 isipv6 ? tcp_v6mssdflt : 839#endif /* INET6 */ 840 tcp_mssdflt; 841 } 842 tp->t_maxseg = tp->t_maxopd - optlen; 843 844 /* 845 * Reset the slow-start flight size 846 * as it may depend on the new MSS 847 */ 848 if (CC_ALGO(tp)->cwnd_init != NULL) 849 CC_ALGO(tp)->cwnd_init(tp); 850 } 851 /* 852 * If further retransmissions are still 853 * unsuccessful with a lowered MTU, maybe this 854 * isn't a Black Hole and we restore the previous 855 * MSS and blackhole detection flags. 856 */ 857 else { 858 859 if ((tp->t_flags & TF_BLACKHOLE) && 860 (tp->t_rxtshift > 4)) { 861 tcp_pmtud_revert_segment_size(tp); 862 } 863 } 864 } 865 866 867 /* 868 * Disable rfc1323 and rfc1644 if we haven't got any 869 * response to our SYN (after we reach the threshold) 870 * to work-around some broken terminal servers (most of 871 * which have hopefully been retired) that have bad VJ 872 * header compression code which trashes TCP segments 873 * containing unknown-to-them TCP options. 874 * Do this only on non-local connections. 875 */ 876 if (tp->t_state == TCPS_SYN_SENT && 877 ((!(tp->t_flags & TF_LOCAL) && 878 tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres) || 879 ((tp->t_flags & TF_LOCAL) && 880 tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres_local))) 881 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC); 882 883 /* 884 * If losing, let the lower level know and try for 885 * a better route. Also, if we backed off this far, 886 * our srtt estimate is probably bogus. Clobber it 887 * so we'll take the next rtt measurement as our srtt; 888 * move the current srtt into rttvar to keep the current 889 * retransmit times until then. 890 */ 891 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 892#if INET6 893 if (isipv6) 894 in6_losing(tp->t_inpcb); 895 else 896#endif /* INET6 */ 897 in_losing(tp->t_inpcb); 898 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 899 tp->t_srtt = 0; 900 } 901 tp->snd_nxt = tp->snd_una; 902 /* 903 * Note: We overload snd_recover to function also as the 904 * snd_last variable described in RFC 2582 905 */ 906 tp->snd_recover = tp->snd_max; 907 /* 908 * Force a segment to be sent. 909 */ 910 tp->t_flags |= TF_ACKNOW; 911 912 /* If timing a segment in this window, stop the timer */ 913 tp->t_rtttime = 0; 914 915 if (!IN_FASTRECOVERY(tp) && tp->t_rxtshift == 1) 916 tcpstat.tcps_tailloss_rto++; 917 918 919 /* 920 * RFC 5681 says: when a TCP sender detects segment loss 921 * using retransmit timer and the given segment has already 922 * been retransmitted by way of the retransmission timer at 923 * least once, the value of ssthresh is held constant 924 */ 925 if (tp->t_rxtshift == 1 && 926 CC_ALGO(tp)->after_timeout != NULL) 927 CC_ALGO(tp)->after_timeout(tp); 928 929 EXIT_FASTRECOVERY(tp); 930 931 /* CWR notifications are to be sent on new data right after 932 * RTOs, Fast Retransmits and ECE notification receipts. 933 */ 934 if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) { 935 tp->ecn_flags |= TE_SENDCWR; 936 } 937fc_output: 938 tcp_ccdbg_trace(tp, NULL, TCP_CC_REXMT_TIMEOUT); 939 940 (void) tcp_output(tp); 941 break; 942 943 /* 944 * Persistance timer into zero window. 945 * Force a byte to be output, if possible. 946 */ 947 case TCPT_PERSIST: 948 tcpstat.tcps_persisttimeo++; 949 /* 950 * Hack: if the peer is dead/unreachable, we do not 951 * time out if the window is closed. After a full 952 * backoff, drop the connection if the idle time 953 * (no responses to probes) reaches the maximum 954 * backoff that we would use if retransmitting. 955 * 956 * Drop the connection if we reached the maximum allowed time for 957 * Zero Window Probes without a non-zero update from the peer. 958 * See rdar://5805356 959 */ 960 if ((tp->t_rxtshift == TCP_MAXRXTSHIFT && 961 (idle_time >= tcp_maxpersistidle || 962 idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) || 963 ((tp->t_persist_stop != 0) && 964 TSTMP_LEQ(tp->t_persist_stop, tcp_now))) { 965 tcpstat.tcps_persistdrop++; 966 postevent(so, 0, EV_TIMEOUT); 967 soevent(so, 968 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT)); 969 tp = tcp_drop(tp, ETIMEDOUT); 970 break; 971 } 972 tcp_setpersist(tp); 973 tp->t_flagsext |= TF_FORCE; 974 (void) tcp_output(tp); 975 tp->t_flagsext &= ~TF_FORCE; 976 break; 977 978 /* 979 * Keep-alive timer went off; send something 980 * or drop connection if idle for too long. 981 */ 982 case TCPT_KEEP: 983 tcpstat.tcps_keeptimeo++; 984#if MPTCP 985 /* 986 * Regular TCP connections do not send keepalives after closing 987 * MPTCP must not also, after sending Data FINs. 988 */ 989 struct mptcb *mp_tp = tp->t_mptcb; 990 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && 991 (tp->t_state > TCPS_ESTABLISHED)) { 992 goto dropit; 993 } else if (mp_tp != NULL) { 994 if ((mptcp_ok_to_keepalive(mp_tp) == 0)) 995 goto dropit; 996 } 997#endif /* MPTCP */ 998 if (tp->t_state < TCPS_ESTABLISHED) 999 goto dropit; 1000 if ((always_keepalive || 1001 (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) || 1002 (tp->t_flagsext & TF_DETECT_READSTALL)) && 1003 (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) { 1004 if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp)) 1005 goto dropit; 1006 /* 1007 * Send a packet designed to force a response 1008 * if the peer is up and reachable: 1009 * either an ACK if the connection is still alive, 1010 * or an RST if the peer has closed the connection 1011 * due to timeout or reboot. 1012 * Using sequence number tp->snd_una-1 1013 * causes the transmitted zero-length segment 1014 * to lie outside the receive window; 1015 * by the protocol spec, this requires the 1016 * correspondent TCP to respond. 1017 */ 1018 tcpstat.tcps_keepprobe++; 1019 t_template = tcp_maketemplate(tp); 1020 if (t_template) { 1021 struct inpcb *inp = tp->t_inpcb; 1022 struct tcp_respond_args tra; 1023 1024 bzero(&tra, sizeof(tra)); 1025 tra.nocell = INP_NO_CELLULAR(inp); 1026 tra.noexpensive = INP_NO_EXPENSIVE(inp); 1027 tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp); 1028 if (tp->t_inpcb->inp_flags & INP_BOUND_IF) 1029 tra.ifscope = tp->t_inpcb->inp_boundifp->if_index; 1030 else 1031 tra.ifscope = IFSCOPE_NONE; 1032 tcp_respond(tp, t_template->tt_ipgen, 1033 &t_template->tt_t, (struct mbuf *)NULL, 1034 tp->rcv_nxt, tp->snd_una - 1, 0, &tra); 1035 (void) m_free(dtom(t_template)); 1036 if (tp->t_flagsext & TF_DETECT_READSTALL) 1037 tp->t_rtimo_probes++; 1038 } 1039 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, 1040 TCP_CONN_KEEPINTVL(tp)); 1041 } else { 1042 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, 1043 TCP_CONN_KEEPIDLE(tp)); 1044 } 1045 if (tp->t_flagsext & TF_DETECT_READSTALL) { 1046 /* 1047 * The keep alive packets sent to detect a read 1048 * stall did not get a response from the 1049 * peer. Generate more keep-alives to confirm this. 1050 * If the number of probes sent reaches the limit, 1051 * generate an event. 1052 */ 1053 if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) { 1054 /* Generate an event */ 1055 soevent(so, 1056 (SO_FILT_HINT_LOCKED| 1057 SO_FILT_HINT_ADAPTIVE_RTIMO)); 1058 tcp_keepalive_reset(tp); 1059 } else { 1060 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START( 1061 tp, TCP_REXMTVAL(tp)); 1062 } 1063 } 1064 break; 1065 case TCPT_DELACK: 1066 if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) { 1067 tp->t_flags &= ~TF_DELACK; 1068 tp->t_timer[TCPT_DELACK] = 0; 1069 tp->t_flags |= TF_ACKNOW; 1070 1071 /* 1072 * If delayed ack timer fired while stretching 1073 * acks, count the number of times the streaming 1074 * detection was not correct. If this exceeds a 1075 * threshold, disable strech ack on this 1076 * connection 1077 * 1078 * Also, go back to acking every other packet. 1079 */ 1080 if ((tp->t_flags & TF_STRETCHACK)) { 1081 if (tp->t_unacksegs > 1 && 1082 tp->t_unacksegs < maxseg_unacked) 1083 tp->t_stretchack_delayed++; 1084 1085 if (tp->t_stretchack_delayed > 1086 TCP_STRETCHACK_DELAY_THRESHOLD) { 1087 tp->t_flagsext |= TF_DISABLE_STRETCHACK; 1088 /* 1089 * Note the time at which stretch 1090 * ack was disabled automatically 1091 */ 1092 tp->rcv_nostrack_ts = tcp_now; 1093 tcpstat.tcps_nostretchack++; 1094 tp->t_stretchack_delayed = 0; 1095 } 1096 tcp_reset_stretch_ack(tp); 1097 } 1098 1099 /* 1100 * If we are measuring inter packet arrival jitter 1101 * for throttling a connection, this delayed ack 1102 * might be the reason for accumulating some 1103 * jitter. So let's restart the measurement. 1104 */ 1105 CLEAR_IAJ_STATE(tp); 1106 1107 tcpstat.tcps_delack++; 1108 (void) tcp_output(tp); 1109 } 1110 break; 1111 1112#if MPTCP 1113 case TCPT_JACK_RXMT: 1114 if ((tp->t_state == TCPS_ESTABLISHED) && 1115 (tp->t_mpflags & TMPF_PREESTABLISHED) && 1116 (tp->t_mpflags & TMPF_JOINED_FLOW)) { 1117 if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) { 1118 tcpstat.tcps_timeoutdrop++; 1119 postevent(so, 0, EV_TIMEOUT); 1120 soevent(so, 1121 (SO_FILT_HINT_LOCKED| 1122 SO_FILT_HINT_TIMEOUT)); 1123 tp = tcp_drop(tp, tp->t_softerror ? 1124 tp->t_softerror : ETIMEDOUT); 1125 break; 1126 } 1127 tcpstat.tcps_join_rxmts++; 1128 tp->t_flags |= TF_ACKNOW; 1129 1130 /* 1131 * No backoff is implemented for simplicity for this 1132 * corner case. 1133 */ 1134 (void) tcp_output(tp); 1135 } 1136 break; 1137#endif /* MPTCP */ 1138 1139 case TCPT_PTO: 1140 { 1141 tcp_seq old_snd_nxt; 1142 int32_t snd_len; 1143 boolean_t rescue_rxt = FALSE; 1144 1145 tp->t_flagsext &= ~(TF_SENT_TLPROBE); 1146 1147 /* 1148 * Check if the connection is in the right state to 1149 * send a probe 1150 */ 1151 if (tp->t_state != TCPS_ESTABLISHED || 1152 tp->t_rxtshift > 0 || tp->snd_max == tp->snd_una || 1153 !SACK_ENABLED(tp) || TAILQ_EMPTY(&tp->snd_holes) || 1154 (IN_FASTRECOVERY(tp) && 1155 (SEQ_GEQ(tp->snd_fack, tp->snd_recover) || 1156 SEQ_GT(tp->snd_nxt, tp->sack_newdata)))) 1157 break; 1158 1159 tcpstat.tcps_pto++; 1160 1161 /* If timing a segment in this window, stop the timer */ 1162 tp->t_rtttime = 0; 1163 1164 if (IN_FASTRECOVERY(tp)) { 1165 /* 1166 * Send a probe to detect tail loss in a 1167 * recovery window when the connection is in 1168 * fast_recovery. 1169 */ 1170 old_snd_nxt = tp->snd_nxt; 1171 rescue_rxt = TRUE; 1172 VERIFY(SEQ_GEQ(tp->snd_fack, tp->snd_una)); 1173 snd_len = min((tp->snd_recover - tp->snd_fack), 1174 tp->t_maxseg); 1175 tp->snd_nxt = tp->snd_recover - snd_len; 1176 tcpstat.tcps_pto_in_recovery++; 1177 tcp_ccdbg_trace(tp, NULL, TCP_CC_TLP_IN_FASTRECOVERY); 1178 } else { 1179 /* 1180 * If there is no new data to send or if the 1181 * connection is limited by receive window then 1182 * retransmit the last segment, otherwise send 1183 * new data. 1184 */ 1185 snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) 1186 - (tp->snd_max - tp->snd_una); 1187 if (snd_len > 0) { 1188 tp->snd_nxt = tp->snd_max; 1189 } else { 1190 snd_len = min((tp->snd_max - tp->snd_una), 1191 tp->t_maxseg); 1192 tp->snd_nxt = tp->snd_max - snd_len; 1193 } 1194 } 1195 1196 /* Note that tail loss probe is being sent */ 1197 tp->t_flagsext |= TF_SENT_TLPROBE; 1198 tp->t_tlpstart = tcp_now; 1199 1200 tp->snd_cwnd += tp->t_maxseg; 1201 (void )tcp_output(tp); 1202 tp->snd_cwnd -= tp->t_maxseg; 1203 1204 tp->t_tlphighrxt = tp->snd_nxt; 1205 1206 /* 1207 * If a tail loss probe was sent after entering recovery, 1208 * restore the old snd_nxt value so that other packets 1209 * will get retransmitted correctly. 1210 */ 1211 if (rescue_rxt) 1212 tp->snd_nxt = old_snd_nxt; 1213 break; 1214 } 1215 case TCPT_DELAYFR: 1216 tp->t_flagsext &= ~TF_DELAY_RECOVERY; 1217 1218 /* 1219 * Don't do anything if one of the following is true: 1220 * - the connection is already in recovery 1221 * - sequence until snd_recover has been acknowledged. 1222 * - retransmit timeout has fired 1223 */ 1224 if (IN_FASTRECOVERY(tp) || 1225 SEQ_GEQ(tp->snd_una, tp->snd_recover) || 1226 tp->t_rxtshift > 0) 1227 break; 1228 1229 VERIFY(SACK_ENABLED(tp)); 1230 if (CC_ALGO(tp)->pre_fr != NULL) 1231 CC_ALGO(tp)->pre_fr(tp); 1232 ENTER_FASTRECOVERY(tp); 1233 if ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON) 1234 tp->ecn_flags |= TE_SENDCWR; 1235 1236 tp->t_timer[TCPT_REXMT] = 0; 1237 tcpstat.tcps_sack_recovery_episode++; 1238 tp->sack_newdata = tp->snd_nxt; 1239 tp->snd_cwnd = tp->t_maxseg; 1240 tcp_ccdbg_trace(tp, NULL, TCP_CC_ENTER_FASTRECOVERY); 1241 (void) tcp_output(tp); 1242 break; 1243 dropit: 1244 tcpstat.tcps_keepdrops++; 1245 postevent(so, 0, EV_TIMEOUT); 1246 soevent(so, 1247 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT)); 1248 tp = tcp_drop(tp, ETIMEDOUT); 1249 break; 1250 } 1251#if TCPDEBUG 1252 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 1253 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 1254 PRU_SLOWTIMO); 1255#endif 1256 return (tp); 1257} 1258 1259/* Remove a timer entry from timer list */ 1260void 1261tcp_remove_timer(struct tcpcb *tp) 1262{ 1263 struct tcptimerlist *listp = &tcp_timer_list; 1264 1265 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED); 1266 if (!(TIMER_IS_ON_LIST(tp))) { 1267 return; 1268 } 1269 lck_mtx_lock(listp->mtx); 1270 1271 /* Check if pcb is on timer list again after acquiring the lock */ 1272 if (!(TIMER_IS_ON_LIST(tp))) { 1273 lck_mtx_unlock(listp->mtx); 1274 return; 1275 } 1276 1277 if (listp->next_te != NULL && listp->next_te == &tp->tentry) 1278 listp->next_te = LIST_NEXT(&tp->tentry, le); 1279 1280 LIST_REMOVE(&tp->tentry, le); 1281 tp->t_flags &= ~(TF_TIMER_ONLIST); 1282 1283 listp->entries--; 1284 1285 tp->tentry.le.le_next = NULL; 1286 tp->tentry.le.le_prev = NULL; 1287 lck_mtx_unlock(listp->mtx); 1288} 1289 1290/* 1291 * Function to check if the timerlist needs to be rescheduled to run 1292 * the timer entry correctly. Basically, this is to check if we can avoid 1293 * taking the list lock. 1294 */ 1295 1296static boolean_t 1297need_to_resched_timerlist(u_int32_t runtime, u_int16_t mode) 1298{ 1299 struct tcptimerlist *listp = &tcp_timer_list; 1300 int32_t diff; 1301 1302 /* 1303 * If the list is being processed then the state of the list is 1304 * in flux. In this case always acquire the lock and set the state 1305 * correctly. 1306 */ 1307 if (listp->running) 1308 return (TRUE); 1309 1310 if (!listp->scheduled) 1311 return (TRUE); 1312 1313 diff = timer_diff(listp->runtime, 0, runtime, 0); 1314 if (diff <= 0) { 1315 /* The list is going to run before this timer */ 1316 return (FALSE); 1317 } else { 1318 if (mode & TCP_TIMERLIST_10MS_MODE) { 1319 if (diff <= TCP_TIMER_10MS_QUANTUM) 1320 return (FALSE); 1321 } else if (mode & TCP_TIMERLIST_100MS_MODE) { 1322 if (diff <= TCP_TIMER_100MS_QUANTUM) 1323 return (FALSE); 1324 } else { 1325 if (diff <= TCP_TIMER_500MS_QUANTUM) 1326 return (FALSE); 1327 } 1328 } 1329 return (TRUE); 1330} 1331 1332void 1333tcp_sched_timerlist(uint32_t offset) 1334{ 1335 1336 uint64_t deadline = 0; 1337 struct tcptimerlist *listp = &tcp_timer_list; 1338 1339 lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED); 1340 1341 offset = min(offset, TCP_TIMERLIST_MAX_OFFSET); 1342 listp->runtime = tcp_now + offset; 1343 if (listp->runtime == 0) { 1344 listp->runtime++; 1345 offset++; 1346 } 1347 1348 clock_interval_to_deadline(offset, USEC_PER_SEC, &deadline); 1349 1350 thread_call_enter_delayed(listp->call, deadline); 1351 listp->scheduled = TRUE; 1352} 1353 1354/* 1355 * Function to run the timers for a connection. 1356 * 1357 * Returns the offset of next timer to be run for this connection which 1358 * can be used to reschedule the timerlist. 1359 * 1360 * te_mode is an out parameter that indicates the modes of active 1361 * timers for this connection. 1362 */ 1363u_int32_t 1364tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode) { 1365 1366 struct socket *so; 1367 u_int16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE; 1368 u_int32_t timer_val, offset = 0, lo_timer = 0; 1369 int32_t diff; 1370 boolean_t needtorun[TCPT_NTIMERS]; 1371 int count = 0; 1372 1373 VERIFY(tp != NULL); 1374 bzero(needtorun, sizeof(needtorun)); 1375 *te_mode = 0; 1376 1377 tcp_lock(tp->t_inpcb->inp_socket, 1, 0); 1378 1379 so = tp->t_inpcb->inp_socket; 1380 /* Release the want count on inp */ 1381 if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1) 1382 == WNT_STOPUSING) { 1383 if (TIMER_IS_ON_LIST(tp)) { 1384 tcp_remove_timer(tp); 1385 } 1386 1387 /* Looks like the TCP connection got closed while we 1388 * were waiting for the lock.. Done 1389 */ 1390 goto done; 1391 } 1392 1393 /* 1394 * Since the timer thread needs to wait for tcp lock, it may race 1395 * with another thread that can cancel or reschedule the timer 1396 * that is about to run. Check if we need to run anything. 1397 */ 1398 if ((index = tp->tentry.index) == TCPT_NONE) 1399 goto done; 1400 1401 timer_val = tp->t_timer[index]; 1402 1403 diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0); 1404 if (diff > 0) { 1405 if (tp->tentry.index != TCPT_NONE) { 1406 offset = diff; 1407 *(te_mode) = tp->tentry.mode; 1408 } 1409 goto done; 1410 } 1411 1412 tp->t_timer[index] = 0; 1413 if (timer_val > 0) { 1414 tp = tcp_timers(tp, index); 1415 if (tp == NULL) 1416 goto done; 1417 } 1418 1419 /* 1420 * Check if there are any other timers that need to be run. 1421 * While doing it, adjust the timer values wrt tcp_now. 1422 */ 1423 tp->tentry.mode = 0; 1424 for (i = 0; i < TCPT_NTIMERS; ++i) { 1425 if (tp->t_timer[i] != 0) { 1426 diff = timer_diff(tp->tentry.timer_start, 1427 tp->t_timer[i], tcp_now, 0); 1428 if (diff <= 0) { 1429 needtorun[i] = TRUE; 1430 count++; 1431 } else { 1432 tp->t_timer[i] = diff; 1433 needtorun[i] = FALSE; 1434 if (lo_timer == 0 || diff < lo_timer) { 1435 lo_timer = diff; 1436 lo_index = i; 1437 } 1438 TCP_SET_TIMER_MODE(tp->tentry.mode, i); 1439 } 1440 } 1441 } 1442 1443 tp->tentry.timer_start = tcp_now; 1444 tp->tentry.index = lo_index; 1445 VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0); 1446 1447 if (tp->tentry.index != TCPT_NONE) { 1448 tp->tentry.runtime = tp->tentry.timer_start + 1449 tp->t_timer[tp->tentry.index]; 1450 if (tp->tentry.runtime == 0) 1451 tp->tentry.runtime++; 1452 } 1453 1454 if (count > 0) { 1455 /* run any other timers outstanding at this time. */ 1456 for (i = 0; i < TCPT_NTIMERS; ++i) { 1457 if (needtorun[i]) { 1458 tp->t_timer[i] = 0; 1459 tp = tcp_timers(tp, i); 1460 if (tp == NULL) { 1461 offset = 0; 1462 *(te_mode) = 0; 1463 goto done; 1464 } 1465 } 1466 } 1467 tcp_set_lotimer_index(tp); 1468 } 1469 1470 if (tp->tentry.index < TCPT_NONE) { 1471 offset = tp->t_timer[tp->tentry.index]; 1472 *(te_mode) = tp->tentry.mode; 1473 } 1474 1475done: 1476 if (tp != NULL && tp->tentry.index == TCPT_NONE) { 1477 tcp_remove_timer(tp); 1478 offset = 0; 1479 } 1480 1481 tcp_unlock(so, 1, 0); 1482 return(offset); 1483} 1484 1485void 1486tcp_run_timerlist(void * arg1, void * arg2) { 1487#pragma unused(arg1, arg2) 1488 struct tcptimerentry *te, *next_te; 1489 struct tcptimerlist *listp = &tcp_timer_list; 1490 struct tcpcb *tp; 1491 uint32_t next_timer = 0; /* offset of the next timer on the list */ 1492 u_int16_t te_mode = 0; /* modes of all active timers in a tcpcb */ 1493 u_int16_t list_mode = 0; /* cumulative of modes of all tcpcbs */ 1494 uint32_t active_count = 0; 1495 1496 calculate_tcp_clock(); 1497 1498 lck_mtx_lock(listp->mtx); 1499 1500 listp->running = TRUE; 1501 1502 LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) { 1503 uint32_t offset = 0; 1504 uint32_t runtime = te->runtime; 1505 if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) { 1506 offset = timer_diff(runtime, 0, tcp_now, 0); 1507 if (next_timer == 0 || offset < next_timer) { 1508 next_timer = offset; 1509 } 1510 list_mode |= te->mode; 1511 continue; 1512 } 1513 1514 tp = TIMERENTRY_TO_TP(te); 1515 1516 /* 1517 * Acquire an inp wantcnt on the inpcb so that the socket 1518 * won't get detached even if tcp_close is called 1519 */ 1520 if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0) 1521 == WNT_STOPUSING) { 1522 /* 1523 * Some how this pcb went into dead state while 1524 * on the timer list, just take it off the list. 1525 * Since the timer list entry pointers are 1526 * protected by the timer list lock, we can 1527 * do it here without the socket lock. 1528 */ 1529 if (TIMER_IS_ON_LIST(tp)) { 1530 tp->t_flags &= ~(TF_TIMER_ONLIST); 1531 LIST_REMOVE(&tp->tentry, le); 1532 listp->entries--; 1533 1534 tp->tentry.le.le_next = NULL; 1535 tp->tentry.le.le_prev = NULL; 1536 } 1537 continue; 1538 } 1539 active_count++; 1540 1541 /* 1542 * Store the next timerentry pointer before releasing the 1543 * list lock. If that entry has to be removed when we 1544 * release the lock, this pointer will be updated to the 1545 * element after that. 1546 */ 1547 listp->next_te = next_te; 1548 1549 VERIFY_NEXT_LINK(&tp->tentry, le); 1550 VERIFY_PREV_LINK(&tp->tentry, le); 1551 1552 lck_mtx_unlock(listp->mtx); 1553 1554 offset = tcp_run_conn_timer(tp, &te_mode); 1555 1556 lck_mtx_lock(listp->mtx); 1557 1558 next_te = listp->next_te; 1559 listp->next_te = NULL; 1560 1561 if (offset > 0 && te_mode != 0) { 1562 list_mode |= te_mode; 1563 1564 if (next_timer == 0 || offset < next_timer) 1565 next_timer = offset; 1566 } 1567 } 1568 1569 if (!LIST_EMPTY(&listp->lhead)) { 1570 u_int16_t next_mode = 0; 1571 if ((list_mode & TCP_TIMERLIST_10MS_MODE) || 1572 (listp->pref_mode & TCP_TIMERLIST_10MS_MODE)) 1573 next_mode = TCP_TIMERLIST_10MS_MODE; 1574 else if ((list_mode & TCP_TIMERLIST_100MS_MODE) || 1575 (listp->pref_mode & TCP_TIMERLIST_100MS_MODE)) 1576 next_mode = TCP_TIMERLIST_100MS_MODE; 1577 else 1578 next_mode = TCP_TIMERLIST_500MS_MODE; 1579 1580 if (next_mode != TCP_TIMERLIST_500MS_MODE) { 1581 listp->idleruns = 0; 1582 } else { 1583 /* 1584 * the next required mode is slow mode, but if 1585 * the last one was a faster mode and we did not 1586 * have enough idle runs, repeat the last mode. 1587 * 1588 * We try to keep the timer list in fast mode for 1589 * some idle time in expectation of new data. 1590 */ 1591 if (listp->mode != next_mode && 1592 listp->idleruns < timer_fastmode_idlemax) { 1593 listp->idleruns++; 1594 next_mode = listp->mode; 1595 next_timer = TCP_TIMER_100MS_QUANTUM; 1596 } else { 1597 listp->idleruns = 0; 1598 } 1599 } 1600 listp->mode = next_mode; 1601 if (listp->pref_offset != 0) 1602 next_timer = min(listp->pref_offset, next_timer); 1603 1604 if (listp->mode == TCP_TIMERLIST_500MS_MODE) 1605 next_timer = max(next_timer, 1606 TCP_TIMER_500MS_QUANTUM); 1607 1608 tcp_sched_timerlist(next_timer); 1609 } else { 1610 /* 1611 * No need to reschedule this timer, but always run 1612 * periodically at a much higher granularity. 1613 */ 1614 tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET); 1615 } 1616 1617 listp->running = FALSE; 1618 listp->pref_mode = 0; 1619 listp->pref_offset = 0; 1620 1621 lck_mtx_unlock(listp->mtx); 1622} 1623 1624/* 1625 * Function to check if the timerlist needs to be reschduled to run this 1626 * connection's timers correctly. 1627 */ 1628void 1629tcp_sched_timers(struct tcpcb *tp) 1630{ 1631 struct tcptimerentry *te = &tp->tentry; 1632 u_int16_t index = te->index; 1633 u_int16_t mode = te->mode; 1634 struct tcptimerlist *listp = &tcp_timer_list; 1635 int32_t offset = 0; 1636 boolean_t list_locked = FALSE; 1637 1638 if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) { 1639 /* Just return without adding the dead pcb to the list */ 1640 if (TIMER_IS_ON_LIST(tp)) { 1641 tcp_remove_timer(tp); 1642 } 1643 return; 1644 } 1645 1646 if (index == TCPT_NONE) { 1647 /* Nothing to run */ 1648 tcp_remove_timer(tp); 1649 return; 1650 } 1651 1652 /* 1653 * compute the offset at which the next timer for this connection 1654 * has to run. 1655 */ 1656 offset = timer_diff(te->runtime, 0, tcp_now, 0); 1657 if (offset <= 0) { 1658 offset = 1; 1659 tcp_timer_advanced++; 1660 } 1661 1662 if (!TIMER_IS_ON_LIST(tp)) { 1663 if (!list_locked) { 1664 lck_mtx_lock(listp->mtx); 1665 list_locked = TRUE; 1666 } 1667 1668 LIST_INSERT_HEAD(&listp->lhead, te, le); 1669 tp->t_flags |= TF_TIMER_ONLIST; 1670 1671 listp->entries++; 1672 if (listp->entries > listp->maxentries) 1673 listp->maxentries = listp->entries; 1674 1675 /* if the list is not scheduled, just schedule it */ 1676 if (!listp->scheduled) 1677 goto schedule; 1678 } 1679 1680 1681 /* 1682 * Timer entry is currently on the list, check if the list needs 1683 * to be rescheduled. 1684 */ 1685 if (need_to_resched_timerlist(te->runtime, mode)) { 1686 tcp_resched_timerlist++; 1687 1688 if (!list_locked) { 1689 lck_mtx_lock(listp->mtx); 1690 list_locked = TRUE; 1691 } 1692 1693 VERIFY_NEXT_LINK(te, le); 1694 VERIFY_PREV_LINK(te, le); 1695 1696 if (listp->running) { 1697 listp->pref_mode |= mode; 1698 if (listp->pref_offset == 0 || 1699 offset < listp->pref_offset) { 1700 listp->pref_offset = offset; 1701 } 1702 } else { 1703 /* 1704 * The list could have got rescheduled while 1705 * this thread was waiting for the lock 1706 */ 1707 if (listp->scheduled) { 1708 int32_t diff; 1709 diff = timer_diff(listp->runtime, 0, 1710 tcp_now, offset); 1711 if (diff <= 0) 1712 goto done; 1713 else 1714 goto schedule; 1715 } else { 1716 goto schedule; 1717 } 1718 } 1719 } 1720 goto done; 1721 1722schedule: 1723 /* 1724 * Since a connection with timers is getting scheduled, the timer 1725 * list moves from idle to active state and that is why idlegen is 1726 * reset 1727 */ 1728 if (mode & TCP_TIMERLIST_10MS_MODE) { 1729 listp->mode = TCP_TIMERLIST_10MS_MODE; 1730 listp->idleruns = 0; 1731 offset = min(offset, TCP_TIMER_10MS_QUANTUM); 1732 } else if (mode & TCP_TIMERLIST_100MS_MODE) { 1733 if (listp->mode > TCP_TIMERLIST_100MS_MODE) 1734 listp->mode = TCP_TIMERLIST_100MS_MODE; 1735 listp->idleruns = 0; 1736 offset = min(offset, TCP_TIMER_100MS_QUANTUM); 1737 } 1738 tcp_sched_timerlist(offset); 1739 1740done: 1741 if (list_locked) 1742 lck_mtx_unlock(listp->mtx); 1743 1744 return; 1745} 1746 1747static inline void 1748tcp_set_lotimer_index(struct tcpcb *tp) { 1749 uint16_t i, lo_index = TCPT_NONE, mode = 0; 1750 uint32_t lo_timer = 0; 1751 for (i = 0; i < TCPT_NTIMERS; ++i) { 1752 if (tp->t_timer[i] != 0) { 1753 TCP_SET_TIMER_MODE(mode, i); 1754 if (lo_timer == 0 || tp->t_timer[i] < lo_timer) { 1755 lo_timer = tp->t_timer[i]; 1756 lo_index = i; 1757 } 1758 } 1759 } 1760 tp->tentry.index = lo_index; 1761 tp->tentry.mode = mode; 1762 VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0); 1763 1764 if (tp->tentry.index != TCPT_NONE) { 1765 tp->tentry.runtime = tp->tentry.timer_start 1766 + tp->t_timer[tp->tentry.index]; 1767 if (tp->tentry.runtime == 0) 1768 tp->tentry.runtime++; 1769 } 1770} 1771 1772void 1773tcp_check_timer_state(struct tcpcb *tp) { 1774 1775 lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED); 1776 1777 if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT) 1778 return; 1779 1780 tcp_set_lotimer_index(tp); 1781 1782 tcp_sched_timers(tp); 1783 return; 1784} 1785 1786__private_extern__ void 1787tcp_report_stats(void) 1788{ 1789 struct nstat_sysinfo_data data; 1790 struct sockaddr_in dst; 1791 struct sockaddr_in6 dst6; 1792 struct rtentry *rt = NULL; 1793 u_int64_t var, uptime; 1794 1795#define stat data.u.tcp_stats 1796 if (((uptime = net_uptime()) - tcp_last_report_time) < 1797 TCP_REPORT_STATS_INTERVAL) 1798 return; 1799 1800 tcp_last_report_time = uptime; 1801 1802 bzero(&data, sizeof(data)); 1803 data.flags = NSTAT_SYSINFO_TCP_STATS; 1804 1805 bzero(&dst, sizeof(dst)); 1806 dst.sin_len = sizeof(dst); 1807 dst.sin_family = AF_INET; 1808 1809 /* ipv4 avg rtt */ 1810 lck_mtx_lock(rnh_lock); 1811 rt = rt_lookup(TRUE, (struct sockaddr *)&dst, NULL, 1812 rt_tables[AF_INET], IFSCOPE_NONE); 1813 lck_mtx_unlock(rnh_lock); 1814 if (rt != NULL) { 1815 RT_LOCK(rt); 1816 if (rt_primary_default(rt, rt_key(rt)) && 1817 rt->rt_stats != NULL) { 1818 stat.ipv4_avgrtt = rt->rt_stats->nstat_avg_rtt; 1819 } 1820 RT_UNLOCK(rt); 1821 rtfree(rt); 1822 rt = NULL; 1823 } 1824 1825 /* ipv6 avg rtt */ 1826 bzero(&dst6, sizeof(dst6)); 1827 dst6.sin6_len = sizeof(dst6); 1828 dst6.sin6_family = AF_INET6; 1829 1830 lck_mtx_lock(rnh_lock); 1831 rt = rt_lookup(TRUE,(struct sockaddr *)&dst6, NULL, 1832 rt_tables[AF_INET6], IFSCOPE_NONE); 1833 lck_mtx_unlock(rnh_lock); 1834 if (rt != NULL) { 1835 RT_LOCK(rt); 1836 if (rt_primary_default(rt, rt_key(rt)) && 1837 rt->rt_stats != NULL) { 1838 stat.ipv6_avgrtt = rt->rt_stats->nstat_avg_rtt; 1839 } 1840 RT_UNLOCK(rt); 1841 rtfree(rt); 1842 rt = NULL; 1843 } 1844 1845 /* send packet loss rate, shift by 10 for precision */ 1846 if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_sndrexmitpack > 0) { 1847 var = tcpstat.tcps_sndrexmitpack << 10; 1848 stat.send_plr = (var * 100) / tcpstat.tcps_sndpack; 1849 } 1850 1851 /* recv packet loss rate, shift by 10 for precision */ 1852 if (tcpstat.tcps_rcvpack > 0 && tcpstat.tcps_recovered_pkts > 0) { 1853 var = tcpstat.tcps_recovered_pkts << 10; 1854 stat.recv_plr = (var * 100) / tcpstat.tcps_rcvpack; 1855 } 1856 1857 /* RTO after tail loss, shift by 10 for precision */ 1858 if (tcpstat.tcps_sndrexmitpack > 0 1859 && tcpstat.tcps_tailloss_rto > 0) { 1860 var = tcpstat.tcps_tailloss_rto << 10; 1861 stat.send_tlrto_rate = 1862 (var * 100) / tcpstat.tcps_sndrexmitpack; 1863 } 1864 1865 /* packet reordering */ 1866 if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_reordered_pkts > 0) { 1867 var = tcpstat.tcps_reordered_pkts << 10; 1868 stat.send_reorder_rate = 1869 (var * 100) / tcpstat.tcps_sndpack; 1870 } 1871 1872 nstat_sysinfo_send_data(&data); 1873 1874#undef stat 1875} 1876