tcp_timer.c revision 303389
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: stable/10/sys/netinet/tcp_timer.c 303389 2016-07-27 13:53:15Z jch $"); 34 35#include "opt_inet.h" 36#include "opt_inet6.h" 37#include "opt_tcpdebug.h" 38 39#include <sys/param.h> 40#include <sys/kernel.h> 41#include <sys/lock.h> 42#include <sys/mbuf.h> 43#include <sys/mutex.h> 44#include <sys/protosw.h> 45#include <sys/smp.h> 46#include <sys/socket.h> 47#include <sys/socketvar.h> 48#include <sys/sysctl.h> 49#include <sys/systm.h> 50 51#include <net/if.h> 52#include <net/route.h> 53#include <net/vnet.h> 54 55#include <netinet/cc.h> 56#include <netinet/in.h> 57#include <netinet/in_pcb.h> 58#include <netinet/in_systm.h> 59#ifdef INET6 60#include <netinet6/in6_pcb.h> 61#endif 62#include <netinet/ip_var.h> 63#include <netinet/tcp_fsm.h> 64#include <netinet/tcp_timer.h> 65#include <netinet/tcp_var.h> 66#ifdef INET6 67#include <netinet6/tcp6_var.h> 68#endif 69#include <netinet/tcpip.h> 70#ifdef TCPDEBUG 71#include <netinet/tcp_debug.h> 72#endif 73 74int tcp_persmin; 75SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, 76 &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); 77 78int tcp_persmax; 79SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, 80 &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); 81 82int tcp_keepinit; 83SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 84 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 85 86int tcp_keepidle; 87SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 88 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 89 90int tcp_keepintvl; 91SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 92 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 93 94int tcp_delacktime; 95SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 96 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 97 "Time before a delayed ACK is sent"); 98 99int tcp_msl; 100SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 101 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 102 103int tcp_rexmit_min; 104SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 105 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 106 "Minimum Retransmission Timeout"); 107 108int tcp_rexmit_slop; 109SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 110 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 111 "Retransmission Timer Slop"); 112 113static int always_keepalive = 1; 114SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 115 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 116 117int tcp_fast_finwait2_recycle = 0; 118SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 119 &tcp_fast_finwait2_recycle, 0, 120 "Recycle closed FIN_WAIT_2 connections faster"); 121 122int tcp_finwait2_timeout; 123SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 124 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 125 126int tcp_keepcnt = TCPTV_KEEPCNT; 127SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 128 "Number of keepalive probes to send"); 129 130 /* max idle probes */ 131int tcp_maxpersistidle; 132 133static int tcp_rexmit_drop_options = 0; 134SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 135 &tcp_rexmit_drop_options, 0, 136 "Drop TCP options from 3rd and later retransmitted SYN"); 137 138static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 139#define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) 140SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 141 CTLFLAG_RW, 142 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 143 "Path MTU Discovery Black Hole Detection Enabled"); 144 145static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); 146#define V_tcp_pmtud_blackhole_activated \ 147 VNET(tcp_pmtud_blackhole_activated) 148SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, 149 CTLFLAG_RD, 150 &VNET_NAME(tcp_pmtud_blackhole_activated), 0, 151 "Path MTU Discovery Black Hole Detection, Activation Count"); 152 153static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); 154#define V_tcp_pmtud_blackhole_activated_min_mss \ 155 VNET(tcp_pmtud_blackhole_activated_min_mss) 156SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, 157 CTLFLAG_RD, 158 &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, 159 "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); 160 161static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); 162#define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) 163SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, 164 CTLFLAG_RD, 165 &VNET_NAME(tcp_pmtud_blackhole_failed), 0, 166 "Path MTU Discovery Black Hole Detection, Failure Count"); 167 168#ifdef INET 169static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 170#define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) 171SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 172 CTLFLAG_RW, 173 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 174 "Path MTU Discovery Black Hole Detection lowered MSS"); 175#endif 176 177#ifdef INET6 178static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 179#define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) 180SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 181 CTLFLAG_RW, 182 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 183 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 184#endif 185 186static int per_cpu_timers = 0; 187SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 188 &per_cpu_timers , 0, "run tcp timers on all cpus"); 189 190#define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 191 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 192 193/* 194 * Tcp protocol timeout routine called every 500 ms. 195 * Updates timestamps used for TCP 196 * causes finite state machine actions if timers expire. 197 */ 198void 199tcp_slowtimo(void) 200{ 201 VNET_ITERATOR_DECL(vnet_iter); 202 203 VNET_LIST_RLOCK_NOSLEEP(); 204 VNET_FOREACH(vnet_iter) { 205 CURVNET_SET(vnet_iter); 206 (void) tcp_tw_2msl_scan(0); 207 CURVNET_RESTORE(); 208 } 209 VNET_LIST_RUNLOCK_NOSLEEP(); 210} 211 212int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 213 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 214 215int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 216 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 217 218static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 219 220/* 221 * TCP timer processing. 222 */ 223 224void 225tcp_timer_delack(void *xtp) 226{ 227 struct tcpcb *tp = xtp; 228 struct inpcb *inp; 229 CURVNET_SET(tp->t_vnet); 230 231 inp = tp->t_inpcb; 232 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 233 INP_WLOCK(inp); 234 if (callout_pending(&tp->t_timers->tt_delack) || 235 !callout_active(&tp->t_timers->tt_delack)) { 236 INP_WUNLOCK(inp); 237 CURVNET_RESTORE(); 238 return; 239 } 240 callout_deactivate(&tp->t_timers->tt_delack); 241 if ((inp->inp_flags & INP_DROPPED) != 0) { 242 INP_WUNLOCK(inp); 243 CURVNET_RESTORE(); 244 return; 245 } 246 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 247 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 248 KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, 249 ("%s: tp %p delack callout should be running", __func__, tp)); 250 251 tp->t_flags |= TF_ACKNOW; 252 TCPSTAT_INC(tcps_delack); 253 (void) tcp_output(tp); 254 INP_WUNLOCK(inp); 255 CURVNET_RESTORE(); 256} 257 258void 259tcp_timer_2msl(void *xtp) 260{ 261 struct tcpcb *tp = xtp; 262 struct inpcb *inp; 263 CURVNET_SET(tp->t_vnet); 264#ifdef TCPDEBUG 265 int ostate; 266 267 ostate = tp->t_state; 268#endif 269 INP_INFO_WLOCK(&V_tcbinfo); 270 inp = tp->t_inpcb; 271 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 272 INP_WLOCK(inp); 273 tcp_free_sackholes(tp); 274 if (callout_pending(&tp->t_timers->tt_2msl) || 275 !callout_active(&tp->t_timers->tt_2msl)) { 276 INP_WUNLOCK(tp->t_inpcb); 277 INP_INFO_WUNLOCK(&V_tcbinfo); 278 CURVNET_RESTORE(); 279 return; 280 } 281 callout_deactivate(&tp->t_timers->tt_2msl); 282 if ((inp->inp_flags & INP_DROPPED) != 0) { 283 INP_WUNLOCK(inp); 284 INP_INFO_WUNLOCK(&V_tcbinfo); 285 CURVNET_RESTORE(); 286 return; 287 } 288 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 289 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 290 KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, 291 ("%s: tp %p 2msl callout should be running", __func__, tp)); 292 /* 293 * 2 MSL timeout in shutdown went off. If we're closed but 294 * still waiting for peer to close and connection has been idle 295 * too long delete connection control block. Otherwise, check 296 * again in a bit. 297 * 298 * If in TIME_WAIT state just ignore as this timeout is handled in 299 * tcp_tw_2msl_scan(). 300 * 301 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 302 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 303 * Ignore fact that there were recent incoming segments. 304 */ 305 if ((inp->inp_flags & INP_TIMEWAIT) != 0) { 306 INP_WUNLOCK(inp); 307 INP_INFO_WUNLOCK(&V_tcbinfo); 308 CURVNET_RESTORE(); 309 return; 310 } 311 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 312 tp->t_inpcb && tp->t_inpcb->inp_socket && 313 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 314 TCPSTAT_INC(tcps_finwait2_drops); 315 tp = tcp_close(tp); 316 } else { 317 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { 318 if (!callout_reset(&tp->t_timers->tt_2msl, 319 TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { 320 tp->t_timers->tt_flags &= ~TT_2MSL_RST; 321 } 322 } else 323 tp = tcp_close(tp); 324 } 325 326#ifdef TCPDEBUG 327 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 328 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 329 PRU_SLOWTIMO); 330#endif 331 if (tp != NULL) 332 INP_WUNLOCK(inp); 333 INP_INFO_WUNLOCK(&V_tcbinfo); 334 CURVNET_RESTORE(); 335} 336 337void 338tcp_timer_keep(void *xtp) 339{ 340 struct tcpcb *tp = xtp; 341 struct tcptemp *t_template; 342 struct inpcb *inp; 343 CURVNET_SET(tp->t_vnet); 344#ifdef TCPDEBUG 345 int ostate; 346 347 ostate = tp->t_state; 348#endif 349 INP_INFO_WLOCK(&V_tcbinfo); 350 inp = tp->t_inpcb; 351 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 352 INP_WLOCK(inp); 353 if (callout_pending(&tp->t_timers->tt_keep) || 354 !callout_active(&tp->t_timers->tt_keep)) { 355 INP_WUNLOCK(inp); 356 INP_INFO_WUNLOCK(&V_tcbinfo); 357 CURVNET_RESTORE(); 358 return; 359 } 360 callout_deactivate(&tp->t_timers->tt_keep); 361 if ((inp->inp_flags & INP_DROPPED) != 0) { 362 INP_WUNLOCK(inp); 363 INP_INFO_WUNLOCK(&V_tcbinfo); 364 CURVNET_RESTORE(); 365 return; 366 } 367 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 368 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 369 KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, 370 ("%s: tp %p keep callout should be running", __func__, tp)); 371 /* 372 * Keep-alive timer went off; send something 373 * or drop connection if idle for too long. 374 */ 375 TCPSTAT_INC(tcps_keeptimeo); 376 if (tp->t_state < TCPS_ESTABLISHED) 377 goto dropit; 378 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 379 tp->t_state <= TCPS_CLOSING) { 380 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 381 goto dropit; 382 /* 383 * Send a packet designed to force a response 384 * if the peer is up and reachable: 385 * either an ACK if the connection is still alive, 386 * or an RST if the peer has closed the connection 387 * due to timeout or reboot. 388 * Using sequence number tp->snd_una-1 389 * causes the transmitted zero-length segment 390 * to lie outside the receive window; 391 * by the protocol spec, this requires the 392 * correspondent TCP to respond. 393 */ 394 TCPSTAT_INC(tcps_keepprobe); 395 t_template = tcpip_maketemplate(inp); 396 if (t_template) { 397 tcp_respond(tp, t_template->tt_ipgen, 398 &t_template->tt_t, (struct mbuf *)NULL, 399 tp->rcv_nxt, tp->snd_una - 1, 0); 400 free(t_template, M_TEMP); 401 } 402 if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 403 tcp_timer_keep, tp)) { 404 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 405 } 406 } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 407 tcp_timer_keep, tp)) { 408 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 409 } 410 411#ifdef TCPDEBUG 412 if (inp->inp_socket->so_options & SO_DEBUG) 413 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 414 PRU_SLOWTIMO); 415#endif 416 INP_WUNLOCK(inp); 417 INP_INFO_WUNLOCK(&V_tcbinfo); 418 CURVNET_RESTORE(); 419 return; 420 421dropit: 422 TCPSTAT_INC(tcps_keepdrops); 423 tp = tcp_drop(tp, ETIMEDOUT); 424 425#ifdef TCPDEBUG 426 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 427 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 428 PRU_SLOWTIMO); 429#endif 430 if (tp != NULL) 431 INP_WUNLOCK(tp->t_inpcb); 432 INP_INFO_WUNLOCK(&V_tcbinfo); 433 CURVNET_RESTORE(); 434} 435 436void 437tcp_timer_persist(void *xtp) 438{ 439 struct tcpcb *tp = xtp; 440 struct inpcb *inp; 441 CURVNET_SET(tp->t_vnet); 442#ifdef TCPDEBUG 443 int ostate; 444 445 ostate = tp->t_state; 446#endif 447 INP_INFO_WLOCK(&V_tcbinfo); 448 inp = tp->t_inpcb; 449 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 450 INP_WLOCK(inp); 451 if (callout_pending(&tp->t_timers->tt_persist) || 452 !callout_active(&tp->t_timers->tt_persist)) { 453 INP_WUNLOCK(inp); 454 INP_INFO_WUNLOCK(&V_tcbinfo); 455 CURVNET_RESTORE(); 456 return; 457 } 458 callout_deactivate(&tp->t_timers->tt_persist); 459 if ((inp->inp_flags & INP_DROPPED) != 0) { 460 INP_WUNLOCK(inp); 461 INP_INFO_WUNLOCK(&V_tcbinfo); 462 CURVNET_RESTORE(); 463 return; 464 } 465 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 466 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 467 KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, 468 ("%s: tp %p persist callout should be running", __func__, tp)); 469 /* 470 * Persistance timer into zero window. 471 * Force a byte to be output, if possible. 472 */ 473 TCPSTAT_INC(tcps_persisttimeo); 474 /* 475 * Hack: if the peer is dead/unreachable, we do not 476 * time out if the window is closed. After a full 477 * backoff, drop the connection if the idle time 478 * (no responses to probes) reaches the maximum 479 * backoff that we would use if retransmitting. 480 */ 481 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 482 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 483 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 484 TCPSTAT_INC(tcps_persistdrop); 485 tp = tcp_drop(tp, ETIMEDOUT); 486 goto out; 487 } 488 /* 489 * If the user has closed the socket then drop a persisting 490 * connection after a much reduced timeout. 491 */ 492 if (tp->t_state > TCPS_CLOSE_WAIT && 493 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 494 TCPSTAT_INC(tcps_persistdrop); 495 tp = tcp_drop(tp, ETIMEDOUT); 496 goto out; 497 } 498 tcp_setpersist(tp); 499 tp->t_flags |= TF_FORCEDATA; 500 (void) tcp_output(tp); 501 tp->t_flags &= ~TF_FORCEDATA; 502 503out: 504#ifdef TCPDEBUG 505 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 506 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 507#endif 508 if (tp != NULL) 509 INP_WUNLOCK(inp); 510 INP_INFO_WUNLOCK(&V_tcbinfo); 511 CURVNET_RESTORE(); 512} 513 514void 515tcp_timer_rexmt(void * xtp) 516{ 517 struct tcpcb *tp = xtp; 518 CURVNET_SET(tp->t_vnet); 519 int rexmt; 520 int headlocked; 521 struct inpcb *inp; 522#ifdef TCPDEBUG 523 int ostate; 524 525 ostate = tp->t_state; 526#endif 527 528 INP_INFO_RLOCK(&V_tcbinfo); 529 inp = tp->t_inpcb; 530 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 531 INP_WLOCK(inp); 532 if (callout_pending(&tp->t_timers->tt_rexmt) || 533 !callout_active(&tp->t_timers->tt_rexmt)) { 534 INP_WUNLOCK(inp); 535 INP_INFO_RUNLOCK(&V_tcbinfo); 536 CURVNET_RESTORE(); 537 return; 538 } 539 callout_deactivate(&tp->t_timers->tt_rexmt); 540 if ((inp->inp_flags & INP_DROPPED) != 0) { 541 INP_WUNLOCK(inp); 542 INP_INFO_RUNLOCK(&V_tcbinfo); 543 CURVNET_RESTORE(); 544 return; 545 } 546 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 547 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 548 KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, 549 ("%s: tp %p rexmt callout should be running", __func__, tp)); 550 tcp_free_sackholes(tp); 551 /* 552 * Retransmission timer went off. Message has not 553 * been acked within retransmit interval. Back off 554 * to a longer retransmit interval and retransmit one segment. 555 */ 556 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 557 tp->t_rxtshift = TCP_MAXRXTSHIFT; 558 TCPSTAT_INC(tcps_timeoutdrop); 559 in_pcbref(inp); 560 INP_INFO_RUNLOCK(&V_tcbinfo); 561 INP_WUNLOCK(inp); 562 INP_INFO_WLOCK(&V_tcbinfo); 563 INP_WLOCK(inp); 564 if (in_pcbrele_wlocked(inp)) { 565 INP_INFO_WUNLOCK(&V_tcbinfo); 566 CURVNET_RESTORE(); 567 return; 568 } 569 if (inp->inp_flags & INP_DROPPED) { 570 INP_WUNLOCK(inp); 571 INP_INFO_WUNLOCK(&V_tcbinfo); 572 CURVNET_RESTORE(); 573 return; 574 } 575 576 tp = tcp_drop(tp, tp->t_softerror ? 577 tp->t_softerror : ETIMEDOUT); 578 headlocked = 1; 579 goto out; 580 } 581 INP_INFO_RUNLOCK(&V_tcbinfo); 582 headlocked = 0; 583 if (tp->t_state == TCPS_SYN_SENT) { 584 /* 585 * If the SYN was retransmitted, indicate CWND to be 586 * limited to 1 segment in cc_conn_init(). 587 */ 588 tp->snd_cwnd = 1; 589 } else if (tp->t_rxtshift == 1) { 590 /* 591 * first retransmit; record ssthresh and cwnd so they can 592 * be recovered if this turns out to be a "bad" retransmit. 593 * A retransmit is considered "bad" if an ACK for this 594 * segment is received within RTT/2 interval; the assumption 595 * here is that the ACK was already in flight. See 596 * "On Estimating End-to-End Network Path Properties" by 597 * Allman and Paxson for more details. 598 */ 599 tp->snd_cwnd_prev = tp->snd_cwnd; 600 tp->snd_ssthresh_prev = tp->snd_ssthresh; 601 tp->snd_recover_prev = tp->snd_recover; 602 if (IN_FASTRECOVERY(tp->t_flags)) 603 tp->t_flags |= TF_WASFRECOVERY; 604 else 605 tp->t_flags &= ~TF_WASFRECOVERY; 606 if (IN_CONGRECOVERY(tp->t_flags)) 607 tp->t_flags |= TF_WASCRECOVERY; 608 else 609 tp->t_flags &= ~TF_WASCRECOVERY; 610 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 611 tp->t_flags |= TF_PREVVALID; 612 } else 613 tp->t_flags &= ~TF_PREVVALID; 614 TCPSTAT_INC(tcps_rexmttimeo); 615 if ((tp->t_state == TCPS_SYN_SENT) || 616 (tp->t_state == TCPS_SYN_RECEIVED)) 617 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 618 else 619 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 620 TCPT_RANGESET(tp->t_rxtcur, rexmt, 621 tp->t_rttmin, TCPTV_REXMTMAX); 622 623 /* 624 * We enter the path for PLMTUD if connection is established or, if 625 * connection is FIN_WAIT_1 status, reason for the last is that if 626 * amount of data we send is very small, we could send it in couple of 627 * packets and process straight to FIN. In that case we won't catch 628 * ESTABLISHED state. 629 */ 630 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 631 || (tp->t_state == TCPS_FIN_WAIT_1))) { 632 int optlen; 633#ifdef INET6 634 int isipv6; 635#endif 636 637 /* 638 * Idea here is that at each stage of mtu probe (usually, 1448 639 * -> 1188 -> 524) should be given 2 chances to recover before 640 * further clamping down. 'tp->t_rxtshift % 2 == 0' should 641 * take care of that. 642 */ 643 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 644 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 645 (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { 646 /* 647 * Enter Path MTU Black-hole Detection mechanism: 648 * - Disable Path MTU Discovery (IP "DF" bit). 649 * - Reduce MTU to lower value than what we 650 * negotiated with peer. 651 */ 652 /* Record that we may have found a black hole. */ 653 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 654 655 /* Keep track of previous MSS. */ 656 optlen = tp->t_maxopd - tp->t_maxseg; 657 tp->t_pmtud_saved_maxopd = tp->t_maxopd; 658 659 /* 660 * Reduce the MSS to blackhole value or to the default 661 * in an attempt to retransmit. 662 */ 663#ifdef INET6 664 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 665 if (isipv6 && 666 tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) { 667 /* Use the sysctl tuneable blackhole MSS. */ 668 tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss; 669 V_tcp_pmtud_blackhole_activated++; 670 } else if (isipv6) { 671 /* Use the default MSS. */ 672 tp->t_maxopd = V_tcp_v6mssdflt; 673 /* 674 * Disable Path MTU Discovery when we switch to 675 * minmss. 676 */ 677 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 678 V_tcp_pmtud_blackhole_activated_min_mss++; 679 } 680#endif 681#if defined(INET6) && defined(INET) 682 else 683#endif 684#ifdef INET 685 if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) { 686 /* Use the sysctl tuneable blackhole MSS. */ 687 tp->t_maxopd = V_tcp_pmtud_blackhole_mss; 688 V_tcp_pmtud_blackhole_activated++; 689 } else { 690 /* Use the default MSS. */ 691 tp->t_maxopd = V_tcp_mssdflt; 692 /* 693 * Disable Path MTU Discovery when we switch to 694 * minmss. 695 */ 696 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 697 V_tcp_pmtud_blackhole_activated_min_mss++; 698 } 699#endif 700 tp->t_maxseg = tp->t_maxopd - optlen; 701 /* 702 * Reset the slow-start flight size 703 * as it may depend on the new MSS. 704 */ 705 if (CC_ALGO(tp)->conn_init != NULL) 706 CC_ALGO(tp)->conn_init(tp->ccv); 707 } else { 708 /* 709 * If further retransmissions are still unsuccessful 710 * with a lowered MTU, maybe this isn't a blackhole and 711 * we restore the previous MSS and blackhole detection 712 * flags. 713 * The limit '6' is determined by giving each probe 714 * stage (1448, 1188, 524) 2 chances to recover. 715 */ 716 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 717 (tp->t_rxtshift > 6)) { 718 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 719 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 720 optlen = tp->t_maxopd - tp->t_maxseg; 721 tp->t_maxopd = tp->t_pmtud_saved_maxopd; 722 tp->t_maxseg = tp->t_maxopd - optlen; 723 V_tcp_pmtud_blackhole_failed++; 724 /* 725 * Reset the slow-start flight size as it 726 * may depend on the new MSS. 727 */ 728 if (CC_ALGO(tp)->conn_init != NULL) 729 CC_ALGO(tp)->conn_init(tp->ccv); 730 } 731 } 732 } 733 734 /* 735 * Disable RFC1323 and SACK if we haven't got any response to 736 * our third SYN to work-around some broken terminal servers 737 * (most of which have hopefully been retired) that have bad VJ 738 * header compression code which trashes TCP segments containing 739 * unknown-to-them TCP options. 740 */ 741 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 742 (tp->t_rxtshift == 3)) 743 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 744 /* 745 * If we backed off this far, our srtt estimate is probably bogus. 746 * Clobber it so we'll take the next rtt measurement as our srtt; 747 * move the current srtt into rttvar to keep the current 748 * retransmit times until then. 749 */ 750 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 751#ifdef INET6 752 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 753 in6_losing(tp->t_inpcb); 754#endif 755 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 756 tp->t_srtt = 0; 757 } 758 tp->snd_nxt = tp->snd_una; 759 tp->snd_recover = tp->snd_max; 760 /* 761 * Force a segment to be sent. 762 */ 763 tp->t_flags |= TF_ACKNOW; 764 /* 765 * If timing a segment in this window, stop the timer. 766 */ 767 tp->t_rtttime = 0; 768 769 cc_cong_signal(tp, NULL, CC_RTO); 770 771 (void) tcp_output(tp); 772 773out: 774#ifdef TCPDEBUG 775 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 776 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 777 PRU_SLOWTIMO); 778#endif 779 if (tp != NULL) 780 INP_WUNLOCK(inp); 781 if (headlocked) 782 INP_INFO_WUNLOCK(&V_tcbinfo); 783 CURVNET_RESTORE(); 784} 785 786void 787tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) 788{ 789 struct callout *t_callout; 790 timeout_t *f_callout; 791 struct inpcb *inp = tp->t_inpcb; 792 int cpu = INP_CPU(inp); 793 uint32_t f_reset; 794 795#ifdef TCP_OFFLOAD 796 if (tp->t_flags & TF_TOE) 797 return; 798#endif 799 800 if (tp->t_timers->tt_flags & TT_STOPPED) 801 return; 802 803 switch (timer_type) { 804 case TT_DELACK: 805 t_callout = &tp->t_timers->tt_delack; 806 f_callout = tcp_timer_delack; 807 f_reset = TT_DELACK_RST; 808 break; 809 case TT_REXMT: 810 t_callout = &tp->t_timers->tt_rexmt; 811 f_callout = tcp_timer_rexmt; 812 f_reset = TT_REXMT_RST; 813 break; 814 case TT_PERSIST: 815 t_callout = &tp->t_timers->tt_persist; 816 f_callout = tcp_timer_persist; 817 f_reset = TT_PERSIST_RST; 818 break; 819 case TT_KEEP: 820 t_callout = &tp->t_timers->tt_keep; 821 f_callout = tcp_timer_keep; 822 f_reset = TT_KEEP_RST; 823 break; 824 case TT_2MSL: 825 t_callout = &tp->t_timers->tt_2msl; 826 f_callout = tcp_timer_2msl; 827 f_reset = TT_2MSL_RST; 828 break; 829 default: 830 panic("tp %p bad timer_type %#x", tp, timer_type); 831 } 832 if (delta == 0) { 833 if ((tp->t_timers->tt_flags & timer_type) && 834 callout_stop(t_callout) && 835 (tp->t_timers->tt_flags & f_reset)) { 836 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 837 } 838 } else { 839 if ((tp->t_timers->tt_flags & timer_type) == 0) { 840 tp->t_timers->tt_flags |= (timer_type | f_reset); 841 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 842 } else { 843 /* Reset already running callout on the same CPU. */ 844 if (!callout_reset(t_callout, delta, f_callout, tp)) { 845 /* 846 * Callout not cancelled, consider it as not 847 * properly restarted. */ 848 tp->t_timers->tt_flags &= ~f_reset; 849 } 850 } 851 } 852} 853 854int 855tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) 856{ 857 struct callout *t_callout; 858 859 switch (timer_type) { 860 case TT_DELACK: 861 t_callout = &tp->t_timers->tt_delack; 862 break; 863 case TT_REXMT: 864 t_callout = &tp->t_timers->tt_rexmt; 865 break; 866 case TT_PERSIST: 867 t_callout = &tp->t_timers->tt_persist; 868 break; 869 case TT_KEEP: 870 t_callout = &tp->t_timers->tt_keep; 871 break; 872 case TT_2MSL: 873 t_callout = &tp->t_timers->tt_2msl; 874 break; 875 default: 876 panic("tp %p bad timer_type %#x", tp, timer_type); 877 } 878 return callout_active(t_callout); 879} 880 881void 882tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) 883{ 884 struct callout *t_callout; 885 timeout_t *f_callout; 886 uint32_t f_reset; 887 888 tp->t_timers->tt_flags |= TT_STOPPED; 889 890 switch (timer_type) { 891 case TT_DELACK: 892 t_callout = &tp->t_timers->tt_delack; 893 f_callout = tcp_timer_delack_discard; 894 f_reset = TT_DELACK_RST; 895 break; 896 case TT_REXMT: 897 t_callout = &tp->t_timers->tt_rexmt; 898 f_callout = tcp_timer_rexmt_discard; 899 f_reset = TT_REXMT_RST; 900 break; 901 case TT_PERSIST: 902 t_callout = &tp->t_timers->tt_persist; 903 f_callout = tcp_timer_persist_discard; 904 f_reset = TT_PERSIST_RST; 905 break; 906 case TT_KEEP: 907 t_callout = &tp->t_timers->tt_keep; 908 f_callout = tcp_timer_keep_discard; 909 f_reset = TT_KEEP_RST; 910 break; 911 case TT_2MSL: 912 t_callout = &tp->t_timers->tt_2msl; 913 f_callout = tcp_timer_2msl_discard; 914 f_reset = TT_2MSL_RST; 915 break; 916 default: 917 panic("tp %p bad timer_type %#x", tp, timer_type); 918 } 919 920 if (tp->t_timers->tt_flags & timer_type) { 921 if (callout_stop(t_callout) && 922 (tp->t_timers->tt_flags & f_reset)) { 923 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 924 } else { 925 /* 926 * Can't stop the callout, defer tcpcb actual deletion 927 * to the last tcp timer discard callout. 928 * The TT_STOPPED flag will ensure that no tcp timer 929 * callouts can be restarted on our behalf, and 930 * past this point currently running callouts waiting 931 * on inp lock will return right away after the 932 * classical check for callout reset/stop events: 933 * callout_pending() || !callout_active() 934 */ 935 callout_reset(t_callout, 1, f_callout, tp); 936 } 937 } 938} 939 940#define ticks_to_msecs(t) (1000*(t) / hz) 941 942void 943tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, 944 struct xtcp_timer *xtimer) 945{ 946 sbintime_t now; 947 948 bzero(xtimer, sizeof(*xtimer)); 949 if (timer == NULL) 950 return; 951 now = getsbinuptime(); 952 if (callout_active(&timer->tt_delack)) 953 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; 954 if (callout_active(&timer->tt_rexmt)) 955 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; 956 if (callout_active(&timer->tt_persist)) 957 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; 958 if (callout_active(&timer->tt_keep)) 959 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; 960 if (callout_active(&timer->tt_2msl)) 961 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; 962 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); 963} 964