tcp_timer.c revision 287304
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/netinet/tcp_timer.c 287304 2015-08-30 13:44:39Z jch $");
34
35#include "opt_inet.h"
36#include "opt_inet6.h"
37#include "opt_tcpdebug.h"
38#include "opt_rss.h"
39
40#include <sys/param.h>
41#include <sys/kernel.h>
42#include <sys/lock.h>
43#include <sys/mbuf.h>
44#include <sys/mutex.h>
45#include <sys/protosw.h>
46#include <sys/smp.h>
47#include <sys/socket.h>
48#include <sys/socketvar.h>
49#include <sys/sysctl.h>
50#include <sys/systm.h>
51
52#include <net/if.h>
53#include <net/route.h>
54#include <net/rss_config.h>
55#include <net/vnet.h>
56#include <net/netisr.h>
57
58#include <netinet/cc.h>
59#include <netinet/in.h>
60#include <netinet/in_pcb.h>
61#include <netinet/in_rss.h>
62#include <netinet/in_systm.h>
63#ifdef INET6
64#include <netinet6/in6_pcb.h>
65#endif
66#include <netinet/ip_var.h>
67#include <netinet/tcp_fsm.h>
68#include <netinet/tcp_timer.h>
69#include <netinet/tcp_var.h>
70#ifdef INET6
71#include <netinet6/tcp6_var.h>
72#endif
73#include <netinet/tcpip.h>
74#ifdef TCPDEBUG
75#include <netinet/tcp_debug.h>
76#endif
77
/*
 * Tunable TCP timer intervals.  Values are stored internally in kernel
 * ticks; the sysctl_msec_to_ticks handler converts to/from milliseconds
 * when the sysctl is read or written.
 */
int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

/* When set, keepalive probes are sent even without SO_KEEPALIVE. */
static int	always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");

/* See tcp_timer_2msl(): drop read-closed FIN_WAIT_2 connections early. */
int    tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int    tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

	/* max idle probes */
int	tcp_maxpersistidle;

static int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");
133
/*
 * Per-VNET state for Path MTU Discovery black hole detection
 * (used by tcp_timer_rexmt()).  The *_activated / *_failed variables
 * are read-only counters exported for observability.
 */
static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
#define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
#define	V_tcp_pmtud_blackhole_activated \
    VNET(tcp_pmtud_blackhole_activated)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
#define	V_tcp_pmtud_blackhole_activated_min_mss \
    VNET(tcp_pmtud_blackhole_activated_min_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");

static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
#define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
    "Path MTU Discovery Black Hole Detection, Failure Count");

#ifdef INET
/* MSS to fall back to when an IPv4 PMTUD black hole is suspected. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
#define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
/* MSS to fall back to when an IPv6 PMTUD black hole is suspected. */
static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
#define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif
181
182#ifdef	RSS
183static int	per_cpu_timers = 1;
184#else
185static int	per_cpu_timers = 0;
186#endif
187SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
188    &per_cpu_timers , 0, "run tcp timers on all cpus");
189
190#if 0
191#define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
192		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
193#endif
194
195/*
196 * Map the given inp to a CPU id.
197 *
198 * This queries RSS if it's compiled in, else it defaults to the current
199 * CPU ID.
200 */
201static inline int
202inp_to_cpuid(struct inpcb *inp)
203{
204	u_int cpuid;
205
206#ifdef	RSS
207	if (per_cpu_timers) {
208		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
209		if (cpuid == NETISR_CPUID_NONE)
210			return (curcpu);	/* XXX */
211		else
212			return (cpuid);
213	}
214#else
215	/* Legacy, pre-RSS behaviour */
216	if (per_cpu_timers) {
217		/*
218		 * We don't have a flowid -> cpuid mapping, so cheat and
219		 * just map unknown cpuids to curcpu.  Not the best, but
220		 * apparently better than defaulting to swi 0.
221		 */
222		cpuid = inp->inp_flowid % (mp_maxid + 1);
223		if (! CPU_ABSENT(cpuid))
224			return (cpuid);
225		return (curcpu);
226	}
227#endif
228	/* Default for RSS and non-RSS - cpuid 0 */
229	else {
230		return (0);
231	}
232}
233
/*
 * Tcp protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP
 * causes finite state machine actions if timers expire.
 *
 * Iterates every VNET and scans its TIME_WAIT list; actual expiry work
 * is done by tcp_tw_2msl_scan().
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		/* Argument 0: do not force-expire, honor the 2MSL deadline. */
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}
252
/* Exponential backoff multipliers indexed by t_rxtshift (0..TCP_MAXRXTSHIFT). */
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
260
261/*
262 * TCP timer processing.
263 */
264
/*
 * Delayed-ACK timer callout: force an ACK to be sent now.
 *
 * xtp is the tcpcb the callout was armed with.  Runs without the pcbinfo
 * lock; only the inp write lock is taken.
 */
void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/*
	 * Classic callout race check: if the callout was rescheduled or
	 * stopped while we waited for the inp lock, do nothing.
	 */
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	/* Connection already dropped: nothing to ACK. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
		("%s: tp %p delack callout should be running", __func__, tp));

	/* Make tcp_output() emit the pending ACK immediately. */
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}
298
299void
300tcp_timer_2msl(void *xtp)
301{
302	struct tcpcb *tp = xtp;
303	struct inpcb *inp;
304	CURVNET_SET(tp->t_vnet);
305#ifdef TCPDEBUG
306	int ostate;
307
308	ostate = tp->t_state;
309#endif
310	INP_INFO_RLOCK(&V_tcbinfo);
311	inp = tp->t_inpcb;
312	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
313	INP_WLOCK(inp);
314	tcp_free_sackholes(tp);
315	if (callout_pending(&tp->t_timers->tt_2msl) ||
316	    !callout_active(&tp->t_timers->tt_2msl)) {
317		INP_WUNLOCK(tp->t_inpcb);
318		INP_INFO_RUNLOCK(&V_tcbinfo);
319		CURVNET_RESTORE();
320		return;
321	}
322	callout_deactivate(&tp->t_timers->tt_2msl);
323	if ((inp->inp_flags & INP_DROPPED) != 0) {
324		INP_WUNLOCK(inp);
325		INP_INFO_RUNLOCK(&V_tcbinfo);
326		CURVNET_RESTORE();
327		return;
328	}
329	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
330		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
331	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
332		("%s: tp %p 2msl callout should be running", __func__, tp));
333	/*
334	 * 2 MSL timeout in shutdown went off.  If we're closed but
335	 * still waiting for peer to close and connection has been idle
336	 * too long delete connection control block.  Otherwise, check
337	 * again in a bit.
338	 *
339	 * If in TIME_WAIT state just ignore as this timeout is handled in
340	 * tcp_tw_2msl_scan().
341	 *
342	 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
343	 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
344	 * Ignore fact that there were recent incoming segments.
345	 */
346	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
347		INP_WUNLOCK(inp);
348		INP_INFO_RUNLOCK(&V_tcbinfo);
349		CURVNET_RESTORE();
350		return;
351	}
352	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
353	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
354	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
355		TCPSTAT_INC(tcps_finwait2_drops);
356		tp = tcp_close(tp);
357	} else {
358		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
359			if (!callout_reset(&tp->t_timers->tt_2msl,
360			   TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
361				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
362			}
363		} else
364		       tp = tcp_close(tp);
365       }
366
367#ifdef TCPDEBUG
368	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
369		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
370			  PRU_SLOWTIMO);
371#endif
372	if (tp != NULL)
373		INP_WUNLOCK(inp);
374	INP_INFO_RUNLOCK(&V_tcbinfo);
375	CURVNET_RESTORE();
376}
377
/*
 * Keepalive timer callout: probe an idle established connection, or drop
 * it once the probes have gone unanswered long enough.
 *
 * Takes the pcbinfo read lock (tcp_drop() may be called) and the inp
 * write lock.
 */
void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Bail if the callout was rescheduled/stopped while we slept. */
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
		("%s: tp %p keep callout should be running", __func__, tp));
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	/* Connection never fully established: the timer is keepinit. */
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Idle past keepidle plus all probe intervals: give up. */
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		/* Probe sent: next wakeup after the probe interval. */
		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}
	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	/* tcp_drop() returns NULL when it released the inp for us. */
	if (tp != NULL)
		INP_WUNLOCK(tp->t_inpcb);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
476
/*
 * Persist timer callout: the peer advertised a zero window, so force a
 * probe segment out, or drop the connection if it has been unresponsive
 * for too long.
 *
 * Takes the pcbinfo read lock (tcp_drop() may be called) and the inp
 * write lock.
 */
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Bail if the callout was rescheduled/stopped while we slept. */
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
		("%s: tp %p persist callout should be running", __func__, tp));
	/*
	 * Persistance timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/* Re-arm the persist timer and force one probe segment out. */
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

out:
#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	/* tcp_drop() returns NULL when it released the inp for us. */
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
554
/*
 * Retransmission timer callout: back off the RTO, optionally engage or
 * roll back PMTUD black hole detection, signal congestion (CC_RTO) and
 * retransmit from snd_una.
 *
 * The pcbinfo read lock is held only until we know the connection is not
 * being dropped; `headlocked` records whether it is still held on the
 * `out` path.
 */
void
tcp_timer_rexmt(void * xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt;
	int headlocked;		/* pcbinfo lock still held at `out`? */
	struct inpcb *inp;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Bail if the callout was rescheduled/stopped while we slept. */
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
		("%s: tp %p rexmt callout should be running", __func__, tp));
	tcp_free_sackholes(tp);
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		/* Too many retransmits: drop the connection. */
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);

		tp = tcp_drop(tp, tp->t_softerror ?
			      tp->t_softerror : ETIMEDOUT);
		headlocked = 1;
		goto out;
	}
	/* Drop path not taken: the pcbinfo lock is no longer needed. */
	INP_INFO_RUNLOCK(&V_tcbinfo);
	headlocked = 0;
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * first retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	/* SYN retransmits use their own (gentler) backoff table. */
	if (tp->t_state == TCPS_SYN_SENT)
		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
		      tp->t_rttmin, TCPTV_REXMTMAX);

	/*
	 * We enter the path for PLMTUD if connection is established or, if
	 * connection is FIN_WAIT_1 status, reason for the last is that if
	 * amount of data we send is very small, we could send it in couple of
	 * packets and process straight to FIN. In that case we won't catch
	 * ESTABLISHED state.
	 */
	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
		int optlen;
#ifdef INET6
		int isipv6;
#endif

		/*
		 * Engage blackhole detection on the first couple of
		 * retransmits when PMTUD is on and a max-sized segment
		 * was the last thing sent.
		 */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift <= 2)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			/* Record that we may have found a black hole. */
			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;

			/* Keep track of previous MSS. */
			optlen = tp->t_maxopd - tp->t_maxseg;
			tp->t_pmtud_saved_maxopd = tp->t_maxopd;

			/*
			 * Reduce the MSS to blackhole value or to the default
			 * in an attempt to retransmit.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxopd = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxopd = V_tcp_pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else {
				/* Use the default MSS. */
				tp->t_maxopd = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
			tp->t_maxseg = tp->t_maxopd - optlen;
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole and
			 * we restore the previous MSS and blackhole detection
			 * flags.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift > 4)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				optlen = tp->t_maxopd - tp->t_maxseg;
				tp->t_maxopd = tp->t_pmtud_saved_maxopd;
				tp->t_maxseg = tp->t_maxopd - optlen;
				V_tcp_pmtud_blackhole_failed++;
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work-around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, our srtt estimate is probably bogus.
	 * Clobber it so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
#endif
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	/* Go back to the oldest unacknowledged data (go-back-N). */
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tcp_output(tp);

out:
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	/* tcp_drop() returns NULL when it released the inp for us. */
	if (tp != NULL)
		INP_WUNLOCK(inp);
	if (headlocked)
		INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
801
/*
 * (Re)arm or stop one of the five TCP timers for tp.
 *
 * timer_type: one of TT_DELACK/TT_REXMT/TT_PERSIST/TT_KEEP/TT_2MSL.
 * delta: timeout in ticks, or 0 to stop the timer.
 *
 * The TT_*_RST flag tracks whether the callout was (re)started cleanly;
 * it is cleared when a callout_reset()/callout_stop() could not cancel a
 * concurrently-running callout.  No-op for TOE connections and once
 * tcp_timer_stop() has set TT_STOPPED.
 */
void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);
	uint32_t f_reset;

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	/* Select the callout, handler and reset-flag for this timer. */
	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_callout = tcp_timer_delack;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_callout = tcp_timer_rexmt;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_callout = tcp_timer_persist;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_callout = tcp_timer_keep;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_callout = tcp_timer_2msl;
			f_reset = TT_2MSL_RST;
			break;
		default:
			panic("tp %p bad timer_type %#x", tp, timer_type);
		}
	if (delta == 0) {
		/* Stop: clear flags only if the callout really stopped. */
		if ((tp->t_timers->tt_flags & timer_type) &&
		    callout_stop(t_callout) &&
		    (tp->t_timers->tt_flags & f_reset)) {
			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
		}
	} else {
		if ((tp->t_timers->tt_flags & timer_type) == 0) {
			/* First activation: bind the callout to a CPU. */
			tp->t_timers->tt_flags |= (timer_type | f_reset);
			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
		} else {
			/* Reset already running callout on the same CPU. */
			if (!callout_reset(t_callout, delta, f_callout, tp)) {
				/*
				 * Callout not cancelled, consider it as not
				 * properly restarted. */
				tp->t_timers->tt_flags &= ~f_reset;
			}
		}
	}
}
869
870int
871tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
872{
873	struct callout *t_callout;
874
875	switch (timer_type) {
876		case TT_DELACK:
877			t_callout = &tp->t_timers->tt_delack;
878			break;
879		case TT_REXMT:
880			t_callout = &tp->t_timers->tt_rexmt;
881			break;
882		case TT_PERSIST:
883			t_callout = &tp->t_timers->tt_persist;
884			break;
885		case TT_KEEP:
886			t_callout = &tp->t_timers->tt_keep;
887			break;
888		case TT_2MSL:
889			t_callout = &tp->t_timers->tt_2msl;
890			break;
891		default:
892			panic("tp %p bad timer_type %#x", tp, timer_type);
893		}
894	return callout_active(t_callout);
895}
896
/*
 * Permanently stop one TCP timer during tcpcb teardown.
 *
 * Sets TT_STOPPED so no timer can be re-armed via tcp_timer_activate().
 * If the callout cannot be stopped (its handler is already running), it
 * is re-armed with a *_discard handler, which performs the deferred
 * tcpcb cleanup.
 */
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	uint32_t f_reset;

	tp->t_timers->tt_flags |= TT_STOPPED;

	/* Select the callout, discard handler and reset-flag. */
	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_callout = tcp_timer_delack_discard;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_callout = tcp_timer_rexmt_discard;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_callout = tcp_timer_persist_discard;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_callout = tcp_timer_keep_discard;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_callout = tcp_timer_2msl_discard;
			f_reset = TT_2MSL_RST;
			break;
		default:
			panic("tp %p bad timer_type %#x", tp, timer_type);
		}

	if (tp->t_timers->tt_flags & timer_type) {
		if (callout_stop(t_callout) &&
		    (tp->t_timers->tt_flags & f_reset)) {
			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
		} else {
			/*
			 * Can't stop the callout, defer tcpcb actual deletion
			 * to the last tcp timer discard callout.
			 * The TT_STOPPED flag will ensure that no tcp timer
			 * callouts can be restarted on our behalf, and
			 * past this point currently running callouts waiting
			 * on inp lock will return right away after the
			 * classical check for callout reset/stop events:
			 * callout_pending() || !callout_active()
			 */
			callout_reset(t_callout, 1, f_callout, tp);
		}
	}
}
955
956#define	ticks_to_msecs(t)	(1000*(t) / hz)
957
958void
959tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
960    struct xtcp_timer *xtimer)
961{
962	sbintime_t now;
963
964	bzero(xtimer, sizeof(*xtimer));
965	if (timer == NULL)
966		return;
967	now = getsbinuptime();
968	if (callout_active(&timer->tt_delack))
969		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
970	if (callout_active(&timer->tt_rexmt))
971		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
972	if (callout_active(&timer->tt_persist))
973		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
974	if (callout_active(&timer->tt_keep))
975		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
976	if (callout_active(&timer->tt_2msl))
977		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
978	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
979}
980