tcp_timer.c revision 303389
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: stable/10/sys/netinet/tcp_timer.c 303389 2016-07-27 13:53:15Z jch $");
34
35#include "opt_inet.h"
36#include "opt_inet6.h"
37#include "opt_tcpdebug.h"
38
39#include <sys/param.h>
40#include <sys/kernel.h>
41#include <sys/lock.h>
42#include <sys/mbuf.h>
43#include <sys/mutex.h>
44#include <sys/protosw.h>
45#include <sys/smp.h>
46#include <sys/socket.h>
47#include <sys/socketvar.h>
48#include <sys/sysctl.h>
49#include <sys/systm.h>
50
51#include <net/if.h>
52#include <net/route.h>
53#include <net/vnet.h>
54
55#include <netinet/cc.h>
56#include <netinet/in.h>
57#include <netinet/in_pcb.h>
58#include <netinet/in_systm.h>
59#ifdef INET6
60#include <netinet6/in6_pcb.h>
61#endif
62#include <netinet/ip_var.h>
63#include <netinet/tcp_fsm.h>
64#include <netinet/tcp_timer.h>
65#include <netinet/tcp_var.h>
66#ifdef INET6
67#include <netinet6/tcp6_var.h>
68#endif
69#include <netinet/tcpip.h>
70#ifdef TCPDEBUG
71#include <netinet/tcp_debug.h>
72#endif
73
/*
 * Global TCP timer tunables, exported below the net.inet.tcp sysctl node.
 *
 * The SYSCTL_PROC entries pass reads and writes through the handler
 * sysctl_msec_to_ticks (defined elsewhere).  NOTE(review): the handler name
 * suggests the variables hold ticks while userland sees milliseconds --
 * confirm against the handler.  Variables declared without an initializer
 * here must receive their boot-time values somewhere else.
 */

/* Lower bound for the persist interval. */
int    tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");

/* Upper bound for the persist interval. */
int    tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");

/* Time allowed for a connection to become established. */
int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

/* Idle time before keepalive probing starts. */
int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");

/* Spacing between successive keepalive probes. */
int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");

/* Delay before a delayed ACK is emitted. */
int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

/* Maximum segment lifetime. */
int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

/* Floor for the retransmission timeout. */
int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

/* Slop added to the retransmission timer. */
int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

/*
 * When non-zero, tcp_timer_keep() probes every connection regardless of
 * the socket's SO_KEEPALIVE option.  Enabled by default.
 */
static int	always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");

/*
 * When non-zero, tcp_timer_2msl() closes a FIN_WAIT_2 connection whose
 * socket can no longer receive, without looking at how recently the peer
 * was heard from.  Disabled by default.
 */
int    tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

/* FIN_WAIT_2 timeout; not referenced by the timer handlers in this file. */
int    tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

/* Number of keepalive probes to send; a plain count, not a time value. */
int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

	/*
	 * Idle limit for a persisting connection: tcp_timer_persist() drops
	 * the connection once it is fully backed off and has been idle for
	 * at least this long.  No sysctl is attached to it here.
	 */
int	tcp_maxpersistidle;

/*
 * When non-zero, tcp_timer_rexmt() stops requesting window scaling,
 * timestamps and SACK once the retransmit shift of a SYN reaches 3.
 * Disabled by default.
 */
static int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");
137
/*
 * Per-vnet state for Path MTU Discovery black-hole detection.  All of it
 * is consumed or updated by tcp_timer_rexmt().
 */

/* Master switch (read/write): non-zero enables the detection logic. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
#define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

/* Read-only counter: times the MSS was lowered to the black-hole MSS. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
#define	V_tcp_pmtud_blackhole_activated \
    VNET(tcp_pmtud_blackhole_activated)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
    CTLFLAG_RD,
    &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count");

/* Read-only counter: times the MSS was lowered to the default MSS. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
#define	V_tcp_pmtud_blackhole_activated_min_mss \
    VNET(tcp_pmtud_blackhole_activated_min_mss)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
    CTLFLAG_RD,
    &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");

/* Read-only counter: times the lowered MSS was given up and restored. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
#define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
    CTLFLAG_RD,
    &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
    "Path MTU Discovery Black Hole Detection, Failure Count");

#ifdef INET
/* Lowered MSS tried for IPv4 connections; starts at 1200 (read/write). */
static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
#define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
/* Lowered MSS tried for IPv6 connections; starts at 1220 (read/write). */
static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
#define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif
185
/*
 * When non-zero, tcp_timer_activate() spreads newly started timers over
 * CPUs instead of always using CPU 0.  Disabled by default.
 */
static int	per_cpu_timers = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers , 0, "run tcp timers on all cpus");

/*
 * Choose the CPU for a connection's callouts:
 *  - 0 when per_cpu_timers is off;
 *  - otherwise inp_flowid modulo (mp_maxid + 1) if that CPU is present,
 *    falling back to the current CPU when it is absent.
 * The argument is evaluated more than once, so it must be free of side
 * effects.
 */
#define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
192
/*
 * Tcp protocol timeout routine called every 500 ms.
 * The body does not touch per-connection timers: for each virtual
 * network stack it makes that vnet current and runs the TIME_WAIT scan,
 * tcp_tw_2msl_scan(0), discarding its return value.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	/* The vnet list stays read-locked for the whole iteration. */
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		/* V_-prefixed state used by the scan resolves against this vnet. */
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}
211
/*
 * Retransmit backoff multipliers, indexed by tp->t_rxtshift
 * (0 .. TCP_MAXRXTSHIFT).
 *
 * tcp_syn_backoff[] scales TCPTV_RTOBASE while the connection is in
 * SYN_SENT or SYN_RECEIVED; tcp_backoff[] scales TCP_REXMTVAL(tp) in every
 * other state (see tcp_timer_rexmt()).
 */
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

/*
 * Must be kept equal to the sum of the tcp_backoff[] entries above
 * (1023 + 3 * 512 = 2559).  tcp_timer_persist() multiplies it by
 * TCP_REXMTVAL(tp) to bound how long a fully backed-off persisting
 * connection may stay idle.
 */
static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
219
220/*
221 * TCP timer processing.
222 */
223
224void
225tcp_timer_delack(void *xtp)
226{
227	struct tcpcb *tp = xtp;
228	struct inpcb *inp;
229	CURVNET_SET(tp->t_vnet);
230
231	inp = tp->t_inpcb;
232	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
233	INP_WLOCK(inp);
234	if (callout_pending(&tp->t_timers->tt_delack) ||
235	    !callout_active(&tp->t_timers->tt_delack)) {
236		INP_WUNLOCK(inp);
237		CURVNET_RESTORE();
238		return;
239	}
240	callout_deactivate(&tp->t_timers->tt_delack);
241	if ((inp->inp_flags & INP_DROPPED) != 0) {
242		INP_WUNLOCK(inp);
243		CURVNET_RESTORE();
244		return;
245	}
246	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
247		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
248	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
249		("%s: tp %p delack callout should be running", __func__, tp));
250
251	tp->t_flags |= TF_ACKNOW;
252	TCPSTAT_INC(tcps_delack);
253	(void) tcp_output(tp);
254	INP_WUNLOCK(inp);
255	CURVNET_RESTORE();
256}
257
258void
259tcp_timer_2msl(void *xtp)
260{
261	struct tcpcb *tp = xtp;
262	struct inpcb *inp;
263	CURVNET_SET(tp->t_vnet);
264#ifdef TCPDEBUG
265	int ostate;
266
267	ostate = tp->t_state;
268#endif
269	INP_INFO_WLOCK(&V_tcbinfo);
270	inp = tp->t_inpcb;
271	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
272	INP_WLOCK(inp);
273	tcp_free_sackholes(tp);
274	if (callout_pending(&tp->t_timers->tt_2msl) ||
275	    !callout_active(&tp->t_timers->tt_2msl)) {
276		INP_WUNLOCK(tp->t_inpcb);
277		INP_INFO_WUNLOCK(&V_tcbinfo);
278		CURVNET_RESTORE();
279		return;
280	}
281	callout_deactivate(&tp->t_timers->tt_2msl);
282	if ((inp->inp_flags & INP_DROPPED) != 0) {
283		INP_WUNLOCK(inp);
284		INP_INFO_WUNLOCK(&V_tcbinfo);
285		CURVNET_RESTORE();
286		return;
287	}
288	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
289		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
290	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
291		("%s: tp %p 2msl callout should be running", __func__, tp));
292	/*
293	 * 2 MSL timeout in shutdown went off.  If we're closed but
294	 * still waiting for peer to close and connection has been idle
295	 * too long delete connection control block.  Otherwise, check
296	 * again in a bit.
297	 *
298	 * If in TIME_WAIT state just ignore as this timeout is handled in
299	 * tcp_tw_2msl_scan().
300	 *
301	 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
302	 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
303	 * Ignore fact that there were recent incoming segments.
304	 */
305	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
306		INP_WUNLOCK(inp);
307		INP_INFO_WUNLOCK(&V_tcbinfo);
308		CURVNET_RESTORE();
309		return;
310	}
311	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
312	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
313	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
314		TCPSTAT_INC(tcps_finwait2_drops);
315		tp = tcp_close(tp);
316	} else {
317		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
318			if (!callout_reset(&tp->t_timers->tt_2msl,
319			   TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
320				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
321			}
322		} else
323		       tp = tcp_close(tp);
324       }
325
326#ifdef TCPDEBUG
327	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
328		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
329			  PRU_SLOWTIMO);
330#endif
331	if (tp != NULL)
332		INP_WUNLOCK(inp);
333	INP_INFO_WUNLOCK(&V_tcbinfo);
334	CURVNET_RESTORE();
335}
336
337void
338tcp_timer_keep(void *xtp)
339{
340	struct tcpcb *tp = xtp;
341	struct tcptemp *t_template;
342	struct inpcb *inp;
343	CURVNET_SET(tp->t_vnet);
344#ifdef TCPDEBUG
345	int ostate;
346
347	ostate = tp->t_state;
348#endif
349	INP_INFO_WLOCK(&V_tcbinfo);
350	inp = tp->t_inpcb;
351	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
352	INP_WLOCK(inp);
353	if (callout_pending(&tp->t_timers->tt_keep) ||
354	    !callout_active(&tp->t_timers->tt_keep)) {
355		INP_WUNLOCK(inp);
356		INP_INFO_WUNLOCK(&V_tcbinfo);
357		CURVNET_RESTORE();
358		return;
359	}
360	callout_deactivate(&tp->t_timers->tt_keep);
361	if ((inp->inp_flags & INP_DROPPED) != 0) {
362		INP_WUNLOCK(inp);
363		INP_INFO_WUNLOCK(&V_tcbinfo);
364		CURVNET_RESTORE();
365		return;
366	}
367	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
368		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
369	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
370		("%s: tp %p keep callout should be running", __func__, tp));
371	/*
372	 * Keep-alive timer went off; send something
373	 * or drop connection if idle for too long.
374	 */
375	TCPSTAT_INC(tcps_keeptimeo);
376	if (tp->t_state < TCPS_ESTABLISHED)
377		goto dropit;
378	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
379	    tp->t_state <= TCPS_CLOSING) {
380		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
381			goto dropit;
382		/*
383		 * Send a packet designed to force a response
384		 * if the peer is up and reachable:
385		 * either an ACK if the connection is still alive,
386		 * or an RST if the peer has closed the connection
387		 * due to timeout or reboot.
388		 * Using sequence number tp->snd_una-1
389		 * causes the transmitted zero-length segment
390		 * to lie outside the receive window;
391		 * by the protocol spec, this requires the
392		 * correspondent TCP to respond.
393		 */
394		TCPSTAT_INC(tcps_keepprobe);
395		t_template = tcpip_maketemplate(inp);
396		if (t_template) {
397			tcp_respond(tp, t_template->tt_ipgen,
398				    &t_template->tt_t, (struct mbuf *)NULL,
399				    tp->rcv_nxt, tp->snd_una - 1, 0);
400			free(t_template, M_TEMP);
401		}
402		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
403		    tcp_timer_keep, tp)) {
404			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
405		}
406	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
407		    tcp_timer_keep, tp)) {
408			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
409		}
410
411#ifdef TCPDEBUG
412	if (inp->inp_socket->so_options & SO_DEBUG)
413		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
414			  PRU_SLOWTIMO);
415#endif
416	INP_WUNLOCK(inp);
417	INP_INFO_WUNLOCK(&V_tcbinfo);
418	CURVNET_RESTORE();
419	return;
420
421dropit:
422	TCPSTAT_INC(tcps_keepdrops);
423	tp = tcp_drop(tp, ETIMEDOUT);
424
425#ifdef TCPDEBUG
426	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
427		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
428			  PRU_SLOWTIMO);
429#endif
430	if (tp != NULL)
431		INP_WUNLOCK(tp->t_inpcb);
432	INP_INFO_WUNLOCK(&V_tcbinfo);
433	CURVNET_RESTORE();
434}
435
436void
437tcp_timer_persist(void *xtp)
438{
439	struct tcpcb *tp = xtp;
440	struct inpcb *inp;
441	CURVNET_SET(tp->t_vnet);
442#ifdef TCPDEBUG
443	int ostate;
444
445	ostate = tp->t_state;
446#endif
447	INP_INFO_WLOCK(&V_tcbinfo);
448	inp = tp->t_inpcb;
449	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
450	INP_WLOCK(inp);
451	if (callout_pending(&tp->t_timers->tt_persist) ||
452	    !callout_active(&tp->t_timers->tt_persist)) {
453		INP_WUNLOCK(inp);
454		INP_INFO_WUNLOCK(&V_tcbinfo);
455		CURVNET_RESTORE();
456		return;
457	}
458	callout_deactivate(&tp->t_timers->tt_persist);
459	if ((inp->inp_flags & INP_DROPPED) != 0) {
460		INP_WUNLOCK(inp);
461		INP_INFO_WUNLOCK(&V_tcbinfo);
462		CURVNET_RESTORE();
463		return;
464	}
465	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
466		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
467	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
468		("%s: tp %p persist callout should be running", __func__, tp));
469	/*
470	 * Persistance timer into zero window.
471	 * Force a byte to be output, if possible.
472	 */
473	TCPSTAT_INC(tcps_persisttimeo);
474	/*
475	 * Hack: if the peer is dead/unreachable, we do not
476	 * time out if the window is closed.  After a full
477	 * backoff, drop the connection if the idle time
478	 * (no responses to probes) reaches the maximum
479	 * backoff that we would use if retransmitting.
480	 */
481	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
482	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
483	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
484		TCPSTAT_INC(tcps_persistdrop);
485		tp = tcp_drop(tp, ETIMEDOUT);
486		goto out;
487	}
488	/*
489	 * If the user has closed the socket then drop a persisting
490	 * connection after a much reduced timeout.
491	 */
492	if (tp->t_state > TCPS_CLOSE_WAIT &&
493	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
494		TCPSTAT_INC(tcps_persistdrop);
495		tp = tcp_drop(tp, ETIMEDOUT);
496		goto out;
497	}
498	tcp_setpersist(tp);
499	tp->t_flags |= TF_FORCEDATA;
500	(void) tcp_output(tp);
501	tp->t_flags &= ~TF_FORCEDATA;
502
503out:
504#ifdef TCPDEBUG
505	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
506		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
507#endif
508	if (tp != NULL)
509		INP_WUNLOCK(inp);
510	INP_INFO_WUNLOCK(&V_tcbinfo);
511	CURVNET_RESTORE();
512}
513
/*
 * Retransmission callout handler.
 *
 * xtp is the tcpcb the callout was armed for.  The handler starts with the
 * tcbinfo read lock and the inpcb write lock and backs out if the callout
 * was rescheduled/stopped or the connection was dropped.
 *
 * Each expiry increments tp->t_rxtshift.  Past TCP_MAXRXTSHIFT the
 * connection is dropped, which requires trading the tcbinfo read lock for
 * the write lock.  Otherwise the handler
 *  - remembers cwnd/ssthresh/recover on the first retransmit so a "bad"
 *    retransmit can be undone,
 *  - computes the backed-off retransmit timeout,
 *  - optionally runs PMTUD black-hole detection (lowering or restoring the
 *    MSS),
 *  - optionally stops requesting TCP options on the SYN,
 *  - discards the smoothed RTT after enough backoff,
 *  - rewinds snd_nxt to snd_una, signals CC_RTO to the congestion control
 *    module and calls tcp_output().
 */
void
tcp_timer_rexmt(void * xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt;
	/* Non-zero when the tcbinfo WRITE lock is held on arrival at "out". */
	int headlocked;
	struct inpcb *inp;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Rescheduled (pending) or cancelled (inactive) behind our back. */
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
		("%s: tp %p rexmt callout should be running", __func__, tp));
	tcp_free_sackholes(tp);
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		/* Out of retries: clamp the shift and drop the connection. */
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);
		/*
		 * Trade the tcbinfo read lock for the write lock.  Both locks
		 * are released and retaken in tcbinfo-then-inpcb order; the
		 * reference taken here is what keeps inp valid across the
		 * window in which nothing is locked.
		 */
		in_pcbref(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		INP_WUNLOCK(inp);
		INP_INFO_WLOCK(&V_tcbinfo);
		INP_WLOCK(inp);
		/*
		 * NOTE(review): a non-zero return presumably means ours was
		 * the last reference and the inpcb is gone (hence no inpcb
		 * unlock on this path) -- confirm against in_pcbrele_wlocked().
		 */
		if (in_pcbrele_wlocked(inp)) {
			INP_INFO_WUNLOCK(&V_tcbinfo);
			CURVNET_RESTORE();
			return;
		}
		/* The connection may have been dropped while unlocked. */
		if (inp->inp_flags & INP_DROPPED) {
			INP_WUNLOCK(inp);
			INP_INFO_WUNLOCK(&V_tcbinfo);
			CURVNET_RESTORE();
			return;
		}

		/* Report the recorded soft error if any, else ETIMEDOUT. */
		tp = tcp_drop(tp, tp->t_softerror ?
			      tp->t_softerror : ETIMEDOUT);
		headlocked = 1;
		goto out;
	}
	/* From here on only the inpcb lock is held. */
	INP_INFO_RUNLOCK(&V_tcbinfo);
	headlocked = 0;
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * first retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		/* End of the window in which an ACK marks this as "bad". */
		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		tp->t_flags |= TF_PREVVALID;
	} else
		/* Second or later retransmit: the saved state is stale. */
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	/*
	 * Handshake states back off from the fixed TCPTV_RTOBASE with the
	 * SYN table; every other state backs off from the measured value.
	 */
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
		      tp->t_rttmin, TCPTV_REXMTMAX);

	/*
	 * We enter the path for PLMTUD if connection is established or, if
	 * connection is FIN_WAIT_1 status, reason for the last is that if
	 * amount of data we send is very small, we could send it in couple of
	 * packets and process straight to FIN. In that case we won't catch
	 * ESTABLISHED state.
	 */
	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
		/* Option bytes per segment: t_maxopd minus t_maxseg. */
		int optlen;
#ifdef INET6
		int isipv6;
#endif

		/*
		 * Idea here is that at each stage of mtu probe (usually, 1448
		 * -> 1188 -> 524) should be given 2 chances to recover before
		 *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
		 *  take care of that.
		 */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			/* Record that we may have found a black hole. */
			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;

			/* Keep track of previous MSS. */
			optlen = tp->t_maxopd - tp->t_maxseg;
			tp->t_pmtud_saved_maxopd = tp->t_maxopd;

			/*
			 * Reduce the MSS to blackhole value or to the default
			 * in an attempt to retransmit.
			 *
			 * The preprocessor blocks below splice into a single
			 * if / else-if / else chain: the IPv6 arms come first
			 * and, when both INET and INET6 are configured, the
			 * lone "else" attaches the IPv4 arms to them.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxopd = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxopd = V_tcp_pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else {
				/* Use the default MSS. */
				tp->t_maxopd = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
			/* Keep the same option overhead on top of the new size. */
			tp->t_maxseg = tp->t_maxopd - optlen;
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole and
			 * we restore the previous MSS and blackhole detection
			 * flags.
			 * The limit '6' is determined by giving each probe
			 * stage (1448, 1188, 524) 2 chances to recover.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift > 6)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				optlen = tp->t_maxopd - tp->t_maxseg;
				tp->t_maxopd = tp->t_pmtud_saved_maxopd;
				tp->t_maxseg = tp->t_maxopd - optlen;
				V_tcp_pmtud_blackhole_failed++;
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work-around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, our srtt estimate is probably bogus.
	 * Clobber it so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
#endif
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	/* Restart sending from the oldest unacknowledged byte. */
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tcp_output(tp);

out:
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	/* tp is NULL only if tcp_drop() returned NULL on the drop path. */
	if (tp != NULL)
		INP_WUNLOCK(inp);
	if (headlocked)
		INP_INFO_WUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
785
786void
787tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
788{
789	struct callout *t_callout;
790	timeout_t *f_callout;
791	struct inpcb *inp = tp->t_inpcb;
792	int cpu = INP_CPU(inp);
793	uint32_t f_reset;
794
795#ifdef TCP_OFFLOAD
796	if (tp->t_flags & TF_TOE)
797		return;
798#endif
799
800	if (tp->t_timers->tt_flags & TT_STOPPED)
801		return;
802
803	switch (timer_type) {
804		case TT_DELACK:
805			t_callout = &tp->t_timers->tt_delack;
806			f_callout = tcp_timer_delack;
807			f_reset = TT_DELACK_RST;
808			break;
809		case TT_REXMT:
810			t_callout = &tp->t_timers->tt_rexmt;
811			f_callout = tcp_timer_rexmt;
812			f_reset = TT_REXMT_RST;
813			break;
814		case TT_PERSIST:
815			t_callout = &tp->t_timers->tt_persist;
816			f_callout = tcp_timer_persist;
817			f_reset = TT_PERSIST_RST;
818			break;
819		case TT_KEEP:
820			t_callout = &tp->t_timers->tt_keep;
821			f_callout = tcp_timer_keep;
822			f_reset = TT_KEEP_RST;
823			break;
824		case TT_2MSL:
825			t_callout = &tp->t_timers->tt_2msl;
826			f_callout = tcp_timer_2msl;
827			f_reset = TT_2MSL_RST;
828			break;
829		default:
830			panic("tp %p bad timer_type %#x", tp, timer_type);
831		}
832	if (delta == 0) {
833		if ((tp->t_timers->tt_flags & timer_type) &&
834		    callout_stop(t_callout) &&
835		    (tp->t_timers->tt_flags & f_reset)) {
836			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
837		}
838	} else {
839		if ((tp->t_timers->tt_flags & timer_type) == 0) {
840			tp->t_timers->tt_flags |= (timer_type | f_reset);
841			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
842		} else {
843			/* Reset already running callout on the same CPU. */
844			if (!callout_reset(t_callout, delta, f_callout, tp)) {
845				/*
846				 * Callout not cancelled, consider it as not
847				 * properly restarted. */
848				tp->t_timers->tt_flags &= ~f_reset;
849			}
850		}
851	}
852}
853
854int
855tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
856{
857	struct callout *t_callout;
858
859	switch (timer_type) {
860		case TT_DELACK:
861			t_callout = &tp->t_timers->tt_delack;
862			break;
863		case TT_REXMT:
864			t_callout = &tp->t_timers->tt_rexmt;
865			break;
866		case TT_PERSIST:
867			t_callout = &tp->t_timers->tt_persist;
868			break;
869		case TT_KEEP:
870			t_callout = &tp->t_timers->tt_keep;
871			break;
872		case TT_2MSL:
873			t_callout = &tp->t_timers->tt_2msl;
874			break;
875		default:
876			panic("tp %p bad timer_type %#x", tp, timer_type);
877		}
878	return callout_active(t_callout);
879}
880
881void
882tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
883{
884	struct callout *t_callout;
885	timeout_t *f_callout;
886	uint32_t f_reset;
887
888	tp->t_timers->tt_flags |= TT_STOPPED;
889
890	switch (timer_type) {
891		case TT_DELACK:
892			t_callout = &tp->t_timers->tt_delack;
893			f_callout = tcp_timer_delack_discard;
894			f_reset = TT_DELACK_RST;
895			break;
896		case TT_REXMT:
897			t_callout = &tp->t_timers->tt_rexmt;
898			f_callout = tcp_timer_rexmt_discard;
899			f_reset = TT_REXMT_RST;
900			break;
901		case TT_PERSIST:
902			t_callout = &tp->t_timers->tt_persist;
903			f_callout = tcp_timer_persist_discard;
904			f_reset = TT_PERSIST_RST;
905			break;
906		case TT_KEEP:
907			t_callout = &tp->t_timers->tt_keep;
908			f_callout = tcp_timer_keep_discard;
909			f_reset = TT_KEEP_RST;
910			break;
911		case TT_2MSL:
912			t_callout = &tp->t_timers->tt_2msl;
913			f_callout = tcp_timer_2msl_discard;
914			f_reset = TT_2MSL_RST;
915			break;
916		default:
917			panic("tp %p bad timer_type %#x", tp, timer_type);
918		}
919
920	if (tp->t_timers->tt_flags & timer_type) {
921		if (callout_stop(t_callout) &&
922		    (tp->t_timers->tt_flags & f_reset)) {
923			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
924		} else {
925			/*
926			 * Can't stop the callout, defer tcpcb actual deletion
927			 * to the last tcp timer discard callout.
928			 * The TT_STOPPED flag will ensure that no tcp timer
929			 * callouts can be restarted on our behalf, and
930			 * past this point currently running callouts waiting
931			 * on inp lock will return right away after the
932			 * classical check for callout reset/stop events:
933			 * callout_pending() || !callout_active()
934			 */
935			callout_reset(t_callout, 1, f_callout, tp);
936		}
937	}
938}
939
/*
 * Convert a tick count to milliseconds using the kernel tick rate hz.
 *
 * The argument is widened to 64 bits before the multiplication.  Forming
 * 1000 * t in the argument's own 32-bit type wraps (or, for a signed
 * argument, overflows) once t exceeds roughly 2^32 / 1000 ticks, which
 * yields a bogus idle time for long-idle connections.  The result has type
 * int64_t; the argument is evaluated exactly once.
 */
#define	ticks_to_msecs(t)	((int64_t)(t) * 1000 / hz)
941
942void
943tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
944    struct xtcp_timer *xtimer)
945{
946	sbintime_t now;
947
948	bzero(xtimer, sizeof(*xtimer));
949	if (timer == NULL)
950		return;
951	now = getsbinuptime();
952	if (callout_active(&timer->tt_delack))
953		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
954	if (callout_active(&timer->tt_rexmt))
955		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
956	if (callout_active(&timer->tt_persist))
957		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
958	if (callout_active(&timer->tt_keep))
959		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
960	if (callout_active(&timer->tt_2msl))
961		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
962	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
963}
964