/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
    "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
    "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I",
    "Maximum segment lifetime");

int	tcp_rexmit_initial;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I",
    "Initial Retransmission Timeout");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

VNET_DEFINE(int, tcp_always_keepalive) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW,
    &VNET_NAME(tcp_always_keepalive), 0,
    "Assume SO_KEEPALIVE on all TCP connections");

int	tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int	tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I",
    "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

	/* max idle probes */
int	tcp_maxpersistidle;

int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

#ifdef INET
VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef	RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

/*
 * Map the given inp to a CPU id.
 *
 * When per-CPU timers are enabled, this queries RSS if it's compiled in,
 * else it hashes the flowid directly, falling back to curcpu for unknown
 * mappings.  With per-CPU timers disabled, it defaults to CPU 0.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

#ifdef	RSS
	if (per_cpu_timers) {
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
	}
#else
	/* Legacy, pre-RSS behaviour */
	if (per_cpu_timers) {
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	}
#endif
	/* Default for RSS and non-RSS - cpuid 0 */
	else {
		return (0);
	}
}
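
/*
 * Note: the CPU id computed above is consumed by tcp_timer_activate()
 * below, which hands it to callout_reset_on() so that a connection's
 * timer callouts keep firing on the CPU its flow hashes to.
 */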

/*
 * TCP protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP and causes finite state
 * machine actions if timers expire.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
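
/*
 * Sanity check on tcp_totbackoff (a worked example, not new policy):
 * 1 + 2 + 4 + 8 + 16 + 32 + 64 + 128 + 256 = 511, and the four trailing
 * entries of 512 add 2048, giving 511 + 2048 = 2559, matching the value
 * above.
 */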

/*
 * TCP timer processing.
 */

void
tcp_timer_delack(void *xtp)
{
	struct epoch_tracker et;
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	NET_EPOCH_ENTER(et);
	(void) tp->t_fb->tfb_tcp_output(tp);
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

void
tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
{
	if (inp != NULL && tp != NULL)
		INP_WUNLOCK(inp);
}

void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If we are in TIME_WAIT state, just ignore this timeout; it is
	 * handled in tcp_tw_2msl_scan().
	 *
	 * If fast recycling of FIN_WAIT_2 connections is enabled, we are
	 * in FIN_WAIT_2, and the receiver has closed, there's no point in
	 * hanging onto this socket.  Just close it, ignoring the fact that
	 * there were recent incoming segments.
	 */
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		NET_EPOCH_ENTER(et);
		tp = tcp_close(tp);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_2msl,
				      TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
		} else {
			if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
				tcp_inpinfo_lock_del(inp, tp);
				goto out;
			}
			NET_EPOCH_ENTER(et);
			tp = tcp_close(tp);
			NET_EPOCH_EXIT(et);
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
	}

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	if (tp != NULL)
		INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));

	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet.  If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_keep,
			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
			INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			return;
		}
	}

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
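	/*
	 * Worked example (a sketch; the actual values are tunable via the
	 * net.inet.tcp.keep* sysctls): with the stock defaults TP_KEEPIDLE()
	 * is 2 hours and TP_MAXIDLE() is keepcnt * keepintvl = 8 * 75 s =
	 * 600 s, so an unresponsive peer is dropped roughly 2 hours and
	 * 10 minutes after the last segment was received.
	 */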
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((V_tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			NET_EPOCH_ENTER(et);
			tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0);
			NET_EPOCH_EXIT(et);
			free(t_template, M_TEMP);
		}
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
			      tcp_timer_keep, tp);
	} else
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
			      tcp_timer_keep, tp);

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	NET_EPOCH_ENTER(et);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	NET_EPOCH_EXIT(et);
	tcp_inpinfo_lock_del(inp, tp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
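	/*
	 * Worked example (a sketch, assuming tcp_maxpersistidle keeps its
	 * TCPTV_KEEP_IDLE default of 2 hours): with tcp_totbackoff = 2559,
	 * a fully backed-off connection whose current TCP_REXMTVAL() is
	 * 1 second is dropped after about 2559 s (~43 minutes) of probing
	 * with no response; higher-RTO paths hit the 2 hour cap first.
	 */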
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	NET_EPOCH_ENTER(et);
	(void) tp->t_fb->tfb_tcp_output(tp);
	NET_EPOCH_EXIT(et);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_rexmt(void *xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt;
	struct inpcb *inp;
	struct epoch_tracker et;
	bool isipv6;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	tcp_free_sackholes(tp);
	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
			tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		/*
		 * In the event that we've negotiated timestamps,
		 * badrxtwin will be set to the value that we set
		 * the retransmitted packet's to_tsval to by tcp_output.
		 */
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
		      tp->t_rttmin, TCPTV_REXMTMAX);
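
	/*
	 * Worked example (a sketch; the base RTO is tunable via
	 * net.inet.tcp.rexmit_initial): for a connection in SYN_SENT with
	 * a 1 second base, the timer value scales through tcp_backoff[]
	 * (1x, 2x, 4x, ...) as t_rxtshift climbs, and TCPT_RANGESET()
	 * clamps each result between t_rttmin and TCPTV_REXMTMAX (64 s).
	 */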

	/*
	 * We enter the PLMTUD path if the connection is in the ESTABLISHED
	 * or FIN_WAIT_1 state.  The latter matters because, if the amount
	 * of data we send is very small, we could send it in a couple of
	 * packets and proceed straight to FIN, so we would never observe
	 * the ESTABLISHED state.
	 */
#ifdef INET6
	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
#else
	isipv6 = false;
#endif
	if (((V_tcp_pmtud_blackhole_detect == 1) ||
	    (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
	    (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
	    ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
		if (tp->t_rxtshift == 1) {
			/*
			 * We enter blackhole detection after the first
			 * unsuccessful timer based retransmission.
			 * Then we reduce the MSS up to two times, giving
			 * each candidate two retransmission tries; a
			 * candidate gets its two tries only if it
			 * actually reduces the MSS.
			 */
			tp->t_blackhole_enter = 2;
			tp->t_blackhole_exit = tp->t_blackhole_enter;
			if (isipv6) {
#ifdef INET6
				if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_v6mssdflt &&
				    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			} else {
#ifdef INET
				if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_mssdflt &&
				    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			}
		}
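		/*
		 * Worked example (a sketch, assuming the stock 1200 byte
		 * blackhole MSS and a 536 byte V_tcp_mssdflt): an IPv4
		 * connection with t_maxseg 1460 starts at enter = exit = 2;
		 * 1460 > 1200 adds 2, and 1460 > 536 with 1200 > 536 adds
		 * 2 more, so exit becomes 6.  Shifts 2-3 then retry with
		 * the blackhole MSS and shifts 4-5 with the default MSS.
		 */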
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= tp->t_blackhole_enter &&
		    tp->t_rxtshift < tp->t_blackhole_exit &&
		    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to blackhole value or to the default
			 * in an attempt to retransmit.
			 */
#ifdef INET6
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
			    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
			    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole, so
			 * we restore the previous MSS and blackhole detection
			 * flags.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= tp->t_blackhole_exit)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, notify the L3 protocol that we're having
	 * connection problems.
	 */
	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);
	NET_EPOCH_ENTER(et);
	(void) tp->t_fb->tfb_tcp_output(tp);
	NET_EPOCH_EXIT(et);
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	callout_func_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_callout = tcp_timer_delack;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_callout = tcp_timer_rexmt;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_callout = tcp_timer_persist;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_callout = tcp_timer_keep;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_callout = tcp_timer_2msl;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_activate) {
				tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
				return;
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	if (delta == 0) {
		callout_stop(t_callout);
	} else {
		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
	}
}
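
/*
 * Usage note (descriptive, not new API): callers arm a timer by passing
 * a positive delta in ticks, e.g. tcp_timer_activate(tp, TT_REXMT,
 * tp->t_rxtcur), and cancel it by passing a delta of 0, which maps to
 * callout_stop() above.
 */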

int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_active) {
				return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}

/*
 * Stop the timer from running, and apply a flag
 * against the timer_flags that will force the
 * timer never to run.  The flag is needed to assure
 * a race does not leave it running and cause
 * the timer to possibly restart itself (keep and persist
 * especially do this).
 */
int
tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t t_flags;

	switch (timer_type) {
		case TT_DELACK:
			t_flags = TT_DELACK_SUS;
			t_callout = &tp->t_timers->tt_delack;
			break;
		case TT_REXMT:
			t_flags = TT_REXMT_SUS;
			t_callout = &tp->t_timers->tt_rexmt;
			break;
		case TT_PERSIST:
			t_flags = TT_PERSIST_SUS;
			t_callout = &tp->t_timers->tt_persist;
			break;
		case TT_KEEP:
			t_flags = TT_KEEP_SUS;
			t_callout = &tp->t_timers->tt_keep;
			break;
		case TT_2MSL:
			t_flags = TT_2MSL_SUS;
			t_callout = &tp->t_timers->tt_2msl;
			break;
		default:
			panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
	tp->t_timers->tt_flags |= t_flags;
	return (callout_stop(t_callout));
}
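
/*
 * Typical pattern (a sketch; in practice the caller is code that swaps a
 * connection between TCP function blocks): suspend each timer before the
 * switch, then unsuspend afterwards so that only the timers still relevant
 * to the connection's current state are re-armed, e.g.:
 *
 *	tcp_timer_suspend(tp, TT_REXMT);
 *	... move tp to the new stack ...
 *	tcp_timers_unsuspend(tp, TT_REXMT);
 */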

void
tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
{
	switch (timer_type) {
		case TT_DELACK:
			if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
				tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
				if (tp->t_flags & TF_DELACK) {
					/* A delayed ACK is pending; activate the timer. */
					tp->t_flags &= ~TF_DELACK;
					tcp_timer_activate(tp, TT_DELACK,
					    tcp_delacktime);
				}
			}
			break;
		case TT_REXMT:
			if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
				tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
				if (SEQ_GT(tp->snd_max, tp->snd_una) &&
				    (tcp_timer_active(tp, TT_PERSIST) == 0) &&
				    tp->snd_wnd) {
					/* We have outstanding data; activate the timer. */
					tcp_timer_activate(tp, TT_REXMT,
					    tp->t_rxtcur);
				}
			}
			break;
		case TT_PERSIST:
			if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
				tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
				if (tp->snd_wnd == 0) {
					/* Activate the persist timer. */
					tp->t_rxtshift = 0;
					tcp_setpersist(tp);
				}
			}
			break;
		case TT_KEEP:
			if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
				tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
				tcp_timer_activate(tp, TT_KEEP,
				    TCPS_HAVEESTABLISHED(tp->t_state) ?
				    TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
			}
			break;
		case TT_2MSL:
			if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
				tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
				if ((tp->t_state == TCPS_FIN_WAIT_2) &&
				    ((tp->t_inpcb->inp_socket == NULL) ||
				     (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE))) {
					/* Start the 2MSL timer. */
					tcp_timer_activate(tp, TT_2MSL,
					    (tcp_fast_finwait2_recycle) ?
					    tcp_finwait2_timeout : TP_MAXIDLE(tp));
				}
			}
			break;
		default:
			panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
}

void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	tp->t_timers->tt_flags |= TT_STOPPED;
	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_stop) {
				/*
				 * XXXrrs we need to look at this with the
				 * stop case below (flags).
				 */
				tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
				return;
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
		/*
		 * Can't stop the callout; defer the actual deletion of
		 * the tcpcb to the last timer to drain.  We do this using
		 * the async drain function and the count kept in
		 * tt_draincnt, incremented here.
		 */
		tp->t_timers->tt_draincnt++;
	}
}