1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/netinet/tcp_timer.c 334727 2018-06-06 19:48:39Z tuexen $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

/*
 * Timer tunables.  Each CTLTYPE_INT sysctl below is read and written by the
 * administrator in milliseconds; sysctl_msec_to_ticks converts on the way
 * through, so the kernel variables themselves hold tick counts.
 */

/* Lower bound on the persist timer interval (ticks). */
int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");

/* Upper bound on the persist timer interval (ticks). */
int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");

/* Timeout for completing the three-way handshake (ticks). */
int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

/* Idle time before the first keepalive probe is sent (ticks). */
int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");

/* Interval between successive keepalive probes (ticks). */
int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");

/* Delay before a delayed ACK is transmitted (ticks). */
int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

/* Maximum segment lifetime; 2*MSL governs TIME_WAIT duration (ticks). */
int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

/* Floor for the retransmission timeout (ticks). */
int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

/* Slop added to RTO calculations to absorb timer granularity (ticks). */
int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

/* When non-zero, keepalives are sent even without SO_KEEPALIVE. */
int	tcp_always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &tcp_always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
/* Preserve the historical symbol name for external consumers. */
__strong_reference(tcp_always_keepalive, always_keepalive);

/*
 * When enabled, a FIN_WAIT_2 connection whose receive side has been shut
 * down is dropped immediately by the 2MSL timer instead of lingering.
 */
int    tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

/* FIN_WAIT_2 timeout used when fast recycling is active (ticks). */
int    tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

/* Number of unanswered keepalive probes before the connection is dropped. */
int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

	/* max idle probes */
int	tcp_maxpersistidle;

/*
 * Work-around for broken middleboxes/terminal servers: when set, strip
 * TCP options from the third and later retransmitted SYNs.
 */
static int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

/*
 * Path MTU blackhole detection state, per VNET.  The *_activated and
 * *_failed counters below are read-only statistics exported via sysctl.
 */
static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
#define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
#define	V_tcp_pmtud_blackhole_activated \
    VNET(tcp_pmtud_blackhole_activated)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
#define	V_tcp_pmtud_blackhole_activated_min_mss \
    VNET(tcp_pmtud_blackhole_activated_min_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");

static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
#define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
    "Path MTU Discovery Black Hole Detection, Failure Count");

#ifdef INET
/* MSS to clamp to when an IPv4 PMTUD blackhole is suspected. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
#define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
/* MSS to clamp to when an IPv6 PMTUD blackhole is suspected. */
static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
#define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

/* With RSS, default to per-CPU timers so callouts follow the flow's CPU. */
#ifdef	RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers , 0, "run tcp timers on all cpus");

/* Retired pre-RSS flowid -> CPU mapping macro; kept disabled for reference. */
#if	0
#define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
#endif

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
static inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

#ifdef	RSS
	if (per_cpu_timers) {
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
	}
#else
	/* Legacy, pre-RSS behaviour */
	if (per_cpu_timers) {
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (! CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	}
#endif
	/* Default for RSS and non-RSS - cpuid 0 */
	else {
		return (0);
	}
}

/*
 * Tcp protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP
 * causes finite state machine actions if timers expire.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	/* Walk every vnet and let each scan its TIME_WAIT list. */
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

/* Per-shift RTO multipliers; SYNs back off more gently at first. */
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 *
 * Each handler below runs as a callout with the tcpcb pointer as its
 * argument.  The common prologue pattern is: enter the pcb's vnet, take the
 * inpcb write lock, then bail out if the callout was rescheduled or stopped
 * while we waited for the lock (callout_pending/!callout_active), or if the
 * inpcb has been dropped.
 */

/*
 * Delayed-ACK timer: the delayed ACK window expired, so force an ACK to be
 * sent now via the stack's output routine.
 */
void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Lost a race with a reschedule or stop while acquiring the lock. */
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
		("%s: tp %p delack callout should be running", __func__, tp));

	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tp->t_fb->tfb_tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}

/*
 * 2MSL timer: handles the FIN_WAIT_2 timeout (and, historically, 2MSL
 * expiry).  Needs the pcbinfo read lock because it may call tcp_close().
 */
void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_free_sackholes(tp);
	/* Lost a race with a reschedule or stop while acquiring the lock. */
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(tp->t_inpcb);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
		("%s: tp %p 2msl callout should be running", __func__, tp));
	/*
	 * 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for peer to close and connection has been idle
	 * too long delete connection control block.  Otherwise, check
	 * again in a bit.
	 *
	 * If in TIME_WAIT state just ignore as this timeout is handled in
	 * tcp_tw_2msl_scan().
	 *
	 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
	 * there's no point in hanging onto FIN_WAIT_2 socket.  Just close it.
	 * Ignore fact that there were recent incoming segments.
	 */
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		tp = tcp_close(tp);
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			/* Still within the idle budget; re-arm the timer. */
			if (!callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
			}
		} else
			tp = tcp_close(tp);
	}

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	/* tcp_close() may have freed the inpcb lock with tp set to NULL. */
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

/*
 * Keepalive timer: probe an idle connection, or drop it once it has been
 * unresponsive for KEEPIDLE + KEEPCNT*KEEPINTVL.
 */
void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Lost a race with a reschedule or stop while acquiring the lock. */
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
		("%s: tp %p keep callout should be running", __func__, tp));
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	/* Before ESTABLISHED this timer doubles as the connection-establish
	 * (keepinit) timeout. */
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		/* Re-arm for the next probe. */
		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}
	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	/* tcp_drop() returns NULL when it freed the tcpcb. */
	if (tp != NULL)
		INP_WUNLOCK(tp->t_inpcb);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

/*
 * Persist timer: the peer advertised a zero window; force a byte out to
 * elicit a window update, or drop the connection if probing has gone
 * unanswered for too long.
 */
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Lost a race with a reschedule or stop while acquiring the lock. */
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
		("%s: tp %p persist callout should be running", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/* Re-arm with backoff, then force one segment out. */
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

out:
#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

/*
 * Retransmission timer: back off the RTO, handle PMTUD blackhole
 * detection/recovery, signal congestion (CC_RTO) and retransmit from
 * snd_una.  Drops the connection after TCP_MAXRXTSHIFT failures.
 */
void
tcp_timer_rexmt(void * xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt;
	int headlocked;
	struct inpcb *inp;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Lost a race with a reschedule or stop while acquiring the lock. */
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
		("%s: tp %p rexmt callout should be running", __func__, tp));
	tcp_free_sackholes(tp);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);

		tp = tcp_drop(tp, ETIMEDOUT);
		/* tcp_drop() requires the pcbinfo lock, still held here. */
		headlocked = 1;
		goto out;
	}
	/* The remaining work only needs the inpcb lock. */
	INP_INFO_RUNLOCK(&V_tcbinfo);
	headlocked = 0;
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * first retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
		      tp->t_rttmin, TCPTV_REXMTMAX);

	/*
	 * We enter the path for PLMTUD if connection is established or, if
	 * connection is FIN_WAIT_1 status, reason for the last is that if
	 * amount of data we send is very small, we could send it in couple of
	 * packets and process straight to FIN.  In that case we won't catch
	 * ESTABLISHED state.
	 */
	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
#ifdef INET6
		int isipv6;
#endif

		/*
		 * Idea here is that at each stage of mtu probe (usually, 1448
		 * -> 1188 -> 524) should be given 2 chances to recover before
		 *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
		 *  take care of that.
		 */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
		    tp->t_rxtshift % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to blackhole value or to the default
			 * in an attempt to retransmit.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole and
			 * we restore the previous MSS and blackhole detection
			 * flags.
			 * The limit '6' is determined by giving each probe
			 * stage (1448, 1188, 524) 2 chances to recover.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= 6)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				V_tcp_pmtud_blackhole_failed++;
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work-around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, our srtt estimate is probably bogus.
	 * Clobber it so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tp->t_fb->tfb_tcp_output(tp);

out:
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	if (tp != NULL)
		INP_WUNLOCK(inp);
	if (headlocked)
		INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

/*
 * Arm (delta != 0) or disarm (delta == 0) one of the five TCP timers on the
 * CPU chosen by inp_to_cpuid().  Unknown timer types are delegated to the
 * function block's tfb_tcp_timer_activate hook, if any.
 */
void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);
	uint32_t f_reset;

#ifdef TCP_OFFLOAD
	/* Offloaded connections run their timers in the TOE hardware. */
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_callout = tcp_timer_delack;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_callout = tcp_timer_rexmt;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_callout = tcp_timer_persist;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_callout = tcp_timer_keep;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_callout = tcp_timer_2msl;
			f_reset = TT_2MSL_RST;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_activate) {
				tp->t_fb->tfb_tcp_timer_activate(tp,
				    timer_type, delta);
				return;
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
		}
	if (delta == 0) {
		if ((tp->t_timers->tt_flags & timer_type) &&
		    (callout_stop(t_callout) > 0) &&
		    (tp->t_timers->tt_flags & f_reset)) {
			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
		}
	} else {
		if ((tp->t_timers->tt_flags & timer_type) == 0) {
			tp->t_timers->tt_flags |= (timer_type | f_reset);
			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
		} else {
			/* Reset already running callout on the same CPU. */
			if (!callout_reset(t_callout, delta, f_callout, tp)) {
				/*
				 * Callout not cancelled, consider it as not
				 * properly restarted. */
				tp->t_timers->tt_flags &= ~f_reset;
			}
		}
	}
}

/*
 * Report whether the given timer's callout is currently active.  Unknown
 * timer types are delegated to the function block's hook, if any.
 */
int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_active) {
				return(tp->t_fb->tfb_tcp_timer_active(tp,
				    timer_type));
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
		}
	return callout_active(t_callout);
}

/*
 * Permanently stop one timer during tcpcb teardown.  Sets TT_STOPPED so no
 * handler re-arms, then asynchronously drains the callout; if the callout
 * could not be stopped synchronously, tcpcb destruction is deferred via
 * tcp_timer_discard and tt_draincnt.
 */
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t f_reset;

	tp->t_timers->tt_flags |= TT_STOPPED;

	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_reset = TT_2MSL_RST;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_stop) {
				/*
				 * XXXrrs we need to look at this with the
				 * stop case below (flags).
				 */
				tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
				return;
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
		}

	if (tp->t_timers->tt_flags & timer_type) {
		if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
			/*
			 * Can't stop the callout, defer tcpcb actual deletion
			 * to the last one.  We do this using the async drain
			 * function and incrementing the count in
			 * tt_draincnt.
			 */
			tp->t_timers->tt_draincnt++;
		}
	}
}

/* Convert a tick count to milliseconds for export to userland. */
#define	ticks_to_msecs(t)	(1000*(t) / hz)

/*
 * Fill in the exported xtcp_timer structure with the remaining time (in
 * milliseconds) of each active timer, for netstat and friends.
 */
void
tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
    struct xtcp_timer *xtimer)
{
	sbintime_t now;

	bzero(xtimer, sizeof(*xtimer));
	if (timer == NULL)
		return;
	now = getsbinuptime();
	if (callout_active(&timer->tt_delack))
		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_rexmt))
		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_persist))
		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_keep))
		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_2msl))
		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
}