/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/netinet/tcp_timer.c 334727 2018-06-06 19:48:39Z tuexen $");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

int	tcp_always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &tcp_always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");
__strong_reference(tcp_always_keepalive, always_keepalive);

int	tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int	tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

/* Maximum idle time before a persisting connection is dropped. */
int	tcp_maxpersistidle;

static int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
#define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
#define	V_tcp_pmtud_blackhole_activated \
    VNET(tcp_pmtud_blackhole_activated)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
#define	V_tcp_pmtud_blackhole_activated_min_mss \
    VNET(tcp_pmtud_blackhole_activated_min_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");

static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
#define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
    "Path MTU Discovery Black Hole Detection, Failure Count");

#ifdef INET
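/*
 * With a 1200-byte MSS, an IPv4 segment is at most 1240 bytes on the
 * wire (20-byte IP header + 20-byte TCP header, options aside), a
 * conservative size that fits through most tunnel-reduced paths.
 */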
static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
#define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
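/*
 * 1220 = 1280 (IPv6 minimum link MTU) - 40 (IPv6 header) - 20 (TCP
 * header), i.e. the largest MSS guaranteed to traverse any IPv6 path.
 */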
static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
#define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef	RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

#if 0
#define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
#endif

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it falls back to a simple
 * flowid-based mapping; in both cases CPU 0 is used unless per-CPU
 * timers are enabled via the per_cpu_timers sysctl.
 */
static inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

#ifdef	RSS
	if (per_cpu_timers) {
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
	}
#else
	/* Legacy, pre-RSS behaviour */
	if (per_cpu_timers) {
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	}
#endif
	/* Default for RSS and non-RSS: cpuid 0. */
	return (0);
}

/*
 * TCP protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP and causes finite state machine
 * actions if timers expire.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

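/*
 * Retransmit backoff multipliers, indexed by t_rxtshift.  SYN
 * retransmits keep the base timeout for the first several attempts and
 * then double, capped at 64x; established-connection retransmits double
 * on every attempt, capped at 512x the current RTO.  tcp_totbackoff
 * must equal the sum of tcp_backoff[] and bounds the total probe time
 * in tcp_timer_persist().
 */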
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 */

void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
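	/*
	 * A pending callout means the timer was rearmed after this
	 * invocation was already queued; an inactive one means it was
	 * stopped.  Either way this invocation is stale, so bail out.
	 * The same check opens every timer handler in this file.
	 */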
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
		("%s: tp %p delack callout should be running", __func__, tp));

	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tp->t_fb->tfb_tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}

void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
		("%s: tp %p 2msl callout should be running", __func__, tp));
	/*
	 * The 2 MSL shutdown timeout went off.  If we're closed but still
	 * waiting for the peer to close, and the connection has been idle
	 * too long, delete the connection control block.  Otherwise, check
	 * again in a bit.
	 *
	 * If in TIME_WAIT state, just ignore this timeout; it is handled
	 * in tcp_tw_2msl_scan().
	 *
	 * If fast recycling of FIN_WAIT_2 is enabled, we are in FIN_WAIT_2,
	 * and the receiver has closed, there is no point in hanging onto
	 * this socket; close it, ignoring the fact that there were recent
	 * incoming segments.
	 */
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		tp = tcp_close(tp);
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			if (!callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
			}
		} else
			tp = tcp_close(tp);
	}

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
		("%s: tp %p keep callout should be running", __func__, tp));
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((tcp_always_keepalive ||
	    (inp->inp_socket->so_options & SO_KEEPALIVE)) &&
	    tp->t_state <= TCPS_CLOSING) {
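		/*
		 * TP_MAXIDLE() is TP_KEEPCNT() * TP_KEEPINTVL(), so the
		 * connection is dropped only once the full budget of
		 * keepalive probes has gone unanswered.
		 */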
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}
	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
	    tcp_timer_keep, tp)) {
		tp->t_timers->tt_flags &= ~TT_KEEP_RST;
	}

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
		("%s: tp %p persist callout should be running", __func__, tp));
	/*
	 * The persist timer has fired while the send window is zero.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: the persist timer normally never gives up while the window
	 * is closed, but if the peer seems dead or unreachable we must not
	 * probe forever.  After a full backoff, drop the connection if the
	 * idle time (no responses to probes) reaches the maximum backoff
	 * that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
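	/*
	 * Still probing: tcp_setpersist() backs the persist timer off
	 * exponentially, and TF_FORCEDATA makes tcp_output() send one byte
	 * beyond the closed window so the peer must report its current
	 * window in the resulting ACK.
	 */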
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

out:
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

void
tcp_timer_rexmt(void *xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt;
	int headlocked;
	struct inpcb *inp;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
		("%s: tp %p rexmt callout should be running", __func__, tp));
	tcp_free_sackholes(tp);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);

		tp = tcp_drop(tp, ETIMEDOUT);
		headlocked = 1;
		goto out;
	}
	INP_INFO_RUNLOCK(&V_tcbinfo);
	headlocked = 0;
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, set snd_cwnd to 1 as a
		 * signal to cc_conn_init() to limit the initial congestion
		 * window to 1 segment.
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
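		/* t_srtt is scaled by 2^TCP_RTT_SHIFT, so this is RTT/2 in ticks. */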
		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
		      tp->t_rttmin, TCPTV_REXMTMAX);
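	/* t_rxtcur is now clamped to the range [t_rttmin, TCPTV_REXMTMAX]. */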

	/*
	 * We enter the PLMTUD path if the connection is established or in
	 * FIN_WAIT_1.  The latter matters because, if the amount of data we
	 * send is very small, we could send it all in a couple of packets
	 * and proceed straight to FIN, so the retransmit may fire after we
	 * have already left the ESTABLISHED state.
	 */
	if (V_tcp_pmtud_blackhole_detect && ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
#ifdef INET6
		int isipv6;
#endif

		/*
		 * The idea is that each stage of the MTU probe (usually
		 * 1448 -> 1188 -> 524) gets two chances to recover before
		 * we clamp down further; the 'tp->t_rxtshift % 2 == 0' test
		 * takes care of that.
		 */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
		    tp->t_rxtshift % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to blackhole value or to the default
			 * in an attempt to retransmit.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, this probably isn't a blackhole,
			 * so restore the previous MSS and clear the blackhole
			 * detection flags.  The limit '6' comes from giving
			 * each probe stage (1448, 1188, 524) two chances to
			 * recover.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= 6)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				V_tcp_pmtud_blackhole_failed++;
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC 1323 options and SACK if we haven't got any response
	 * to our third SYN, to work around some broken terminal servers
	 * (most of which have hopefully been retired) whose bad VJ header
	 * compression code trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, our srtt estimate is probably bogus.
	 * Clobber it so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

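	/*
	 * Signal the RTO to the congestion control algorithm; for CC_RTO
	 * the algorithm typically recomputes ssthresh and collapses cwnd
	 * to a single segment before the retransmission below.
	 */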
	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tp->t_fb->tfb_tcp_output(tp);

out:
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	if (tp != NULL)
		INP_WUNLOCK(inp);
	if (headlocked)
		INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}

void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);
	uint32_t f_reset;

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		f_callout = tcp_timer_delack;
		f_reset = TT_DELACK_RST;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		f_callout = tcp_timer_rexmt;
		f_reset = TT_REXMT_RST;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		f_callout = tcp_timer_persist;
		f_reset = TT_PERSIST_RST;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		f_callout = tcp_timer_keep;
		f_reset = TT_KEEP_RST;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		f_callout = tcp_timer_2msl;
		f_reset = TT_2MSL_RST;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_activate) {
			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
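	/*
	 * A delta of 0 stops the timer; a non-zero delta (re)arms it for
	 * delta ticks, with newly started timers placed on the CPU chosen
	 * by inp_to_cpuid().
	 */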
	if (delta == 0) {
		if ((tp->t_timers->tt_flags & timer_type) &&
		    (callout_stop(t_callout) > 0) &&
		    (tp->t_timers->tt_flags & f_reset)) {
			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
		}
	} else {
		if ((tp->t_timers->tt_flags & timer_type) == 0) {
			tp->t_timers->tt_flags |= (timer_type | f_reset);
			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
		} else {
			/* Reset already running callout on the same CPU. */
			if (!callout_reset(t_callout, delta, f_callout, tp)) {
				/*
				 * Callout not cancelled, consider it as not
				 * properly restarted.
				 */
				tp->t_timers->tt_flags &= ~f_reset;
			}
		}
	}
}

int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_active) {
			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}

void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t f_reset;

	tp->t_timers->tt_flags |= TT_STOPPED;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		f_reset = TT_DELACK_RST;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		f_reset = TT_REXMT_RST;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		f_reset = TT_PERSIST_RST;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		f_reset = TT_KEEP_RST;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		f_reset = TT_2MSL_RST;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_stop) {
			/*
			 * XXXrrs we need to look at this with the
			 * stop case below (flags).
			 */
			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (tp->t_timers->tt_flags & timer_type) {
		if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
			/*
			 * Can't stop the callout; defer the actual deletion
			 * of the tcpcb to the last timer to finish.  We use
			 * the async drain function for this and count the
			 * still-running timers in tt_draincnt.
			 */
			tp->t_timers->tt_draincnt++;
		}
	}
}

#define	ticks_to_msecs(t)	(1000*(t) / hz)

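/*
 * Fill in an xtcp_timer structure with the time remaining, in
 * milliseconds, on each active timer; used when exporting TCP
 * connection state via sysctl.
 */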
void
tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
    struct xtcp_timer *xtimer)
{
	sbintime_t now;

	bzero(xtimer, sizeof(*xtimer));
	if (timer == NULL)
		return;
	now = getsbinuptime();
	if (callout_active(&timer->tt_delack))
		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_rexmt))
		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_persist))
		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_keep))
		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_2msl))
		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
}
