tcp_timer.c revision 1.67
1/*	$NetBSD: tcp_timer.c,v 1.67 2005/01/26 21:49:27 mycroft Exp $	*/
2
3/*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*-
33 * Copyright (c) 1997, 1998, 2001 The NetBSD Foundation, Inc.
34 * All rights reserved.
35 *
36 * This code is derived from software contributed to The NetBSD Foundation
37 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
38 * Facility, NASA Ames Research Center.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 *    notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 *    notice, this list of conditions and the following disclaimer in the
47 *    documentation and/or other materials provided with the distribution.
48 * 3. All advertising materials mentioning features or use of this software
49 *    must display the following acknowledgement:
50 *	This product includes software developed by the NetBSD
51 *	Foundation, Inc. and its contributors.
52 * 4. Neither the name of The NetBSD Foundation nor the names of its
53 *    contributors may be used to endorse or promote products derived
54 *    from this software without specific prior written permission.
55 *
56 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
57 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
58 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
59 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
60 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
61 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
62 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
63 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
64 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
65 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
66 * POSSIBILITY OF SUCH DAMAGE.
67 */
68
69/*
70 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
71 *	The Regents of the University of California.  All rights reserved.
72 *
73 * Redistribution and use in source and binary forms, with or without
74 * modification, are permitted provided that the following conditions
75 * are met:
76 * 1. Redistributions of source code must retain the above copyright
77 *    notice, this list of conditions and the following disclaimer.
78 * 2. Redistributions in binary form must reproduce the above copyright
79 *    notice, this list of conditions and the following disclaimer in the
80 *    documentation and/or other materials provided with the distribution.
81 * 3. Neither the name of the University nor the names of its contributors
82 *    may be used to endorse or promote products derived from this software
83 *    without specific prior written permission.
84 *
85 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
86 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
87 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
88 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
89 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
90 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
91 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
92 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
93 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
94 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
95 * SUCH DAMAGE.
96 *
97 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
98 */
99
100#include <sys/cdefs.h>
101__KERNEL_RCSID(0, "$NetBSD: tcp_timer.c,v 1.67 2005/01/26 21:49:27 mycroft Exp $");
102
103#include "opt_inet.h"
104#include "opt_tcp_debug.h"
105
106#include <sys/param.h>
107#include <sys/systm.h>
108#include <sys/malloc.h>
109#include <sys/mbuf.h>
110#include <sys/socket.h>
111#include <sys/socketvar.h>
112#include <sys/protosw.h>
113#include <sys/errno.h>
114#include <sys/kernel.h>
115
116#include <net/if.h>
117#include <net/route.h>
118
119#include <netinet/in.h>
120#include <netinet/in_systm.h>
121#include <netinet/ip.h>
122#include <netinet/in_pcb.h>
123#include <netinet/ip_var.h>
124
125#ifdef INET6
126#ifndef INET
127#include <netinet/in.h>
128#endif
129#include <netinet/ip6.h>
130#include <netinet6/in6_pcb.h>
131#endif
132
133#include <netinet/tcp.h>
134#include <netinet/tcp_fsm.h>
135#include <netinet/tcp_seq.h>
136#include <netinet/tcp_timer.h>
137#include <netinet/tcp_var.h>
138#include <netinet/tcpip.h>
139#ifdef TCP_DEBUG
140#include <netinet/tcp_debug.h>
141#endif
142
/*
 * Tunable timer parameters.
 *
 * Every explicitly initialized tunable starts out as zero.  tcp_timer_init()
 * (called from tcp_init()) replaces a value that is still zero with its
 * compiled-in default, so a non-zero value patched in beforehand is kept.
 * The explicit "= 0" initializers are deliberate; do not drop them.
 */

/* Keepalive idle time; zero selects TCPTV_KEEP_IDLE. */
int	tcp_keepidle = 0;

/* Interval between keepalive probes; zero selects TCPTV_KEEPINTVL. */
int	tcp_keepintvl = 0;

/* Max idle probes; zero selects TCPTV_KEEPCNT. */
int	tcp_keepcnt = 0;

/* Max idle time in persist; zero selects TCPTV_KEEP_IDLE. */
int	tcp_maxpersistidle = 0;

/* tcp_keepcnt * tcp_keepintvl; recomputed in tcp_slowtimo(). */
int	tcp_maxidle;

/*
 * Time to delay the ACK.  Zero selects TCP_DELACK_TICKS in
 * tcp_timer_init(), unless it is patched.
 */
int	tcp_delack_ticks = 0;
158
159void	tcp_timer_rexmt(void *);
160void	tcp_timer_persist(void *);
161void	tcp_timer_keep(void *);
162void	tcp_timer_2msl(void *);
163
164const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS] = {
165	tcp_timer_rexmt,
166	tcp_timer_persist,
167	tcp_timer_keep,
168	tcp_timer_2msl,
169};
170
171/*
172 * Timer state initialization, called from tcp_init().
173 */
174void
175tcp_timer_init(void)
176{
177
178	if (tcp_keepidle == 0)
179		tcp_keepidle = TCPTV_KEEP_IDLE;
180
181	if (tcp_keepintvl == 0)
182		tcp_keepintvl = TCPTV_KEEPINTVL;
183
184	if (tcp_keepcnt == 0)
185		tcp_keepcnt = TCPTV_KEEPCNT;
186
187	if (tcp_maxpersistidle == 0)
188		tcp_maxpersistidle = TCPTV_KEEP_IDLE;
189
190	if (tcp_delack_ticks == 0)
191		tcp_delack_ticks = TCP_DELACK_TICKS;
192}
193
194/*
195 * Return how many timers are currently being invoked.
196 */
197int
198tcp_timers_invoking(struct tcpcb *tp)
199{
200	int i;
201	int count = 0;
202
203	for (i = 0; i < TCPT_NTIMERS; i++)
204		if (callout_invoking(&tp->t_timer[i]))
205			count++;
206	if (callout_invoking(&tp->t_delack_ch))
207		count++;
208
209	return count;
210}
211
212/*
213 * Callout to process delayed ACKs for a TCPCB.
214 */
215void
216tcp_delack(void *arg)
217{
218	struct tcpcb *tp = arg;
219	int s;
220
221	/*
222	 * If tcp_output() wasn't able to transmit the ACK
223	 * for whatever reason, it will restart the delayed
224	 * ACK callout.
225	 */
226
227	s = splsoftnet();
228	callout_ack(&tp->t_delack_ch);
229	if (tcp_isdead(tp)) {
230		splx(s);
231		return;
232	}
233
234	tp->t_flags |= TF_ACKNOW;
235	(void) tcp_output(tp);
236	splx(s);
237}
238
239/*
240 * Tcp protocol timeout routine called every 500 ms.
241 * Updates the timers in all active tcb's and
242 * causes finite state machine actions if timers expire.
243 */
244void
245tcp_slowtimo()
246{
247	int s;
248
249	s = splsoftnet();
250	tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
251	tcp_iss_seq += TCP_ISSINCR;			/* increment iss */
252	tcp_now++;					/* for timestamps */
253	splx(s);
254}
255
256/*
257 * Cancel all timers for TCP tp.
258 */
259void
260tcp_canceltimers(tp)
261	struct tcpcb *tp;
262{
263	int i;
264
265	for (i = 0; i < TCPT_NTIMERS; i++)
266		TCP_TIMER_DISARM(tp, i);
267}
268
269const int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
270    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
271
272const int	tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
273
274/*
275 * TCP timer processing.
276 */
277
278void
279tcp_timer_rexmt(void *arg)
280{
281	struct tcpcb *tp = arg;
282	uint32_t rto;
283	int s;
284#ifdef TCP_DEBUG
285	struct socket *so = NULL;
286	short ostate;
287#endif
288
289	s = splsoftnet();
290	callout_ack(&tp->t_timer[TCPT_REXMT]);
291	if (tcp_isdead(tp)) {
292		splx(s);
293		return;
294	}
295
296#ifdef TCP_DEBUG
297#ifdef INET
298	if (tp->t_inpcb)
299		so = tp->t_inpcb->inp_socket;
300#endif
301#ifdef INET6
302	if (tp->t_in6pcb)
303		so = tp->t_in6pcb->in6p_socket;
304#endif
305	ostate = tp->t_state;
306#endif /* TCP_DEBUG */
307
308	/*
309	 * Retransmission timer went off.  Message has not
310	 * been acked within retransmit interval.  Back off
311	 * to a longer retransmit interval and retransmit one segment.
312	 */
313
314	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
315		tp->t_rxtshift = TCP_MAXRXTSHIFT;
316		tcpstat.tcps_timeoutdrop++;
317		tp = tcp_drop(tp, tp->t_softerror ?
318		    tp->t_softerror : ETIMEDOUT);
319		goto out;
320	}
321	tcpstat.tcps_rexmttimeo++;
322	rto = TCP_REXMTVAL(tp);
323	if (rto < tp->t_rttmin)
324		rto = tp->t_rttmin;
325	TCPT_RANGESET(tp->t_rxtcur, rto * tcp_backoff[tp->t_rxtshift],
326	    tp->t_rttmin, TCPTV_REXMTMAX);
327	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
328
329	/*
330	 * If we are losing and we are trying path MTU discovery,
331	 * try turning it off.  This will avoid black holes in
332	 * the network which suppress or fail to send "packet
333	 * too big" ICMP messages.  We should ideally do
334	 * lots more sophisticated searching to find the right
335	 * value here...
336	 */
337	if (tp->t_mtudisc && tp->t_rxtshift > TCP_MAXRXTSHIFT / 6) {
338		tcpstat.tcps_pmtublackhole++;
339
340#ifdef INET
341		/* try turning PMTUD off */
342		if (tp->t_inpcb)
343			tp->t_mtudisc = 0;
344#endif
345#ifdef INET6
346		/* try using IPv6 minimum MTU */
347		if (tp->t_in6pcb)
348			tp->t_mtudisc = 0;
349#endif
350
351		/* XXX: more sophisticated Black hole recovery code? */
352	}
353
354	/*
355	 * If losing, let the lower level know and try for
356	 * a better route.  Also, if we backed off this far,
357	 * our srtt estimate is probably bogus.  Clobber it
358	 * so we'll take the next rtt measurement as our srtt;
359	 * move the current srtt into rttvar to keep the current
360	 * retransmit times until then.
361	 */
362	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
363#ifdef INET
364		if (tp->t_inpcb)
365			in_losing(tp->t_inpcb);
366#endif
367#ifdef INET6
368		if (tp->t_in6pcb)
369			in6_losing(tp->t_in6pcb);
370#endif
371		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
372		tp->t_srtt = 0;
373	}
374	tp->snd_nxt = tp->snd_una;
375	tp->snd_high = tp->snd_max;
376	/*
377	 * If timing a segment in this window, stop the timer.
378	 */
379	tp->t_rtttime = 0;
380	/*
381	 * Remember if we are retransmitting a SYN, because if
382	 * we do, set the initial congestion window must be set
383	 * to 1 segment.
384	 */
385	if (tp->t_state == TCPS_SYN_SENT)
386		tp->t_flags |= TF_SYN_REXMT;
387	/*
388	 * Close the congestion window down to one segment
389	 * (we'll open it by one segment for each ack we get).
390	 * Since we probably have a window's worth of unacked
391	 * data accumulated, this "slow start" keeps us from
392	 * dumping all that data as back-to-back packets (which
393	 * might overwhelm an intermediate gateway).
394	 *
395	 * There are two phases to the opening: Initially we
396	 * open by one mss on each ack.  This makes the window
397	 * size increase exponentially with time.  If the
398	 * window is larger than the path can handle, this
399	 * exponential growth results in dropped packet(s)
400	 * almost immediately.  To get more time between
401	 * drops but still "push" the network to take advantage
402	 * of improving conditions, we switch from exponential
403	 * to linear window opening at some threshhold size.
404	 * For a threshhold, we use half the current window
405	 * size, truncated to a multiple of the mss.
406	 *
407	 * (the minimum cwnd that will give us exponential
408	 * growth is 2 mss.  We don't allow the threshhold
409	 * to go below this.)
410	 */
411	{
412	u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
413	if (win < 2)
414		win = 2;
415	/* Loss Window MUST be one segment. */
416	tp->snd_cwnd = tp->t_segsz;
417	tp->snd_ssthresh = win * tp->t_segsz;
418	tp->t_dupacks = 0;
419	}
420	(void) tcp_output(tp);
421
422 out:
423#ifdef TCP_DEBUG
424	if (tp && so->so_options & SO_DEBUG)
425		tcp_trace(TA_USER, ostate, tp, NULL,
426		    PRU_SLOWTIMO | (TCPT_REXMT << 8));
427#endif
428	splx(s);
429}
430
431void
432tcp_timer_persist(void *arg)
433{
434	struct tcpcb *tp = arg;
435	uint32_t rto;
436	int s;
437#ifdef TCP_DEBUG
438	struct socket *so = NULL;
439	short ostate;
440#endif
441
442	s = splsoftnet();
443	callout_ack(&tp->t_timer[TCPT_PERSIST]);
444	if (tcp_isdead(tp)) {
445		splx(s);
446		return;
447	}
448
449#ifdef TCP_DEBUG
450#ifdef INET
451	if (tp->t_inpcb)
452		so = tp->t_inpcb->inp_socket;
453#endif
454#ifdef INET6
455	if (tp->t_in6pcb)
456		so = tp->t_in6pcb->in6p_socket;
457#endif
458
459	ostate = tp->t_state;
460#endif /* TCP_DEBUG */
461
462	/*
463	 * Persistance timer into zero window.
464	 * Force a byte to be output, if possible.
465	 */
466
467	/*
468	 * Hack: if the peer is dead/unreachable, we do not
469	 * time out if the window is closed.  After a full
470	 * backoff, drop the connection if the idle time
471	 * (no responses to probes) reaches the maximum
472	 * backoff that we would use if retransmitting.
473	 */
474	rto = TCP_REXMTVAL(tp);
475	if (rto < tp->t_rttmin)
476		rto = tp->t_rttmin;
477	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
478	    ((tcp_now - tp->t_rcvtime) >= tcp_maxpersistidle ||
479	    (tcp_now - tp->t_rcvtime) >= rto * tcp_totbackoff)) {
480		tcpstat.tcps_persistdrops++;
481		tp = tcp_drop(tp, ETIMEDOUT);
482		goto out;
483	}
484	tcpstat.tcps_persisttimeo++;
485	tcp_setpersist(tp);
486	tp->t_force = 1;
487	(void) tcp_output(tp);
488	tp->t_force = 0;
489
490 out:
491#ifdef TCP_DEBUG
492	if (tp && so->so_options & SO_DEBUG)
493		tcp_trace(TA_USER, ostate, tp, NULL,
494		    PRU_SLOWTIMO | (TCPT_PERSIST << 8));
495#endif
496	splx(s);
497}
498
499void
500tcp_timer_keep(void *arg)
501{
502	struct tcpcb *tp = arg;
503	struct socket *so = NULL;	/* Quell compiler warning */
504	int s;
505#ifdef TCP_DEBUG
506	short ostate;
507#endif
508
509	s = splsoftnet();
510	callout_ack(&tp->t_timer[TCPT_KEEP]);
511	if (tcp_isdead(tp)) {
512		splx(s);
513		return;
514	}
515
516#ifdef TCP_DEBUG
517	ostate = tp->t_state;
518#endif /* TCP_DEBUG */
519
520	/*
521	 * Keep-alive timer went off; send something
522	 * or drop connection if idle for too long.
523	 */
524
525	tcpstat.tcps_keeptimeo++;
526	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
527		goto dropit;
528#ifdef INET
529	if (tp->t_inpcb)
530		so = tp->t_inpcb->inp_socket;
531#endif
532#ifdef INET6
533	if (tp->t_in6pcb)
534		so = tp->t_in6pcb->in6p_socket;
535#endif
536	if (so->so_options & SO_KEEPALIVE &&
537	    tp->t_state <= TCPS_CLOSE_WAIT) {
538	    	if ((tcp_maxidle > 0) &&
539		    ((tcp_now - tp->t_rcvtime) >=
540		     tcp_keepidle + tcp_maxidle))
541			goto dropit;
542		/*
543		 * Send a packet designed to force a response
544		 * if the peer is up and reachable:
545		 * either an ACK if the connection is still alive,
546		 * or an RST if the peer has closed the connection
547		 * due to timeout or reboot.
548		 * Using sequence number tp->snd_una-1
549		 * causes the transmitted zero-length segment
550		 * to lie outside the receive window;
551		 * by the protocol spec, this requires the
552		 * correspondent TCP to respond.
553		 */
554		tcpstat.tcps_keepprobe++;
555		if (tcp_compat_42) {
556			/*
557			 * The keepalive packet must have nonzero
558			 * length to get a 4.2 host to respond.
559			 */
560			(void)tcp_respond(tp, tp->t_template,
561			    (struct mbuf *)NULL, NULL, tp->rcv_nxt - 1,
562			    tp->snd_una - 1, 0);
563		} else {
564			(void)tcp_respond(tp, tp->t_template,
565			    (struct mbuf *)NULL, NULL, tp->rcv_nxt,
566			    tp->snd_una - 1, 0);
567		}
568		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl);
569	} else
570		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
571
572#ifdef TCP_DEBUG
573	if (tp && so->so_options & SO_DEBUG)
574		tcp_trace(TA_USER, ostate, tp, NULL,
575		    PRU_SLOWTIMO | (TCPT_KEEP << 8));
576#endif
577	splx(s);
578	return;
579
580 dropit:
581	tcpstat.tcps_keepdrops++;
582	(void) tcp_drop(tp, ETIMEDOUT);
583	splx(s);
584}
585
586void
587tcp_timer_2msl(void *arg)
588{
589	struct tcpcb *tp = arg;
590	int s;
591#ifdef TCP_DEBUG
592	struct socket *so = NULL;
593	short ostate;
594#endif
595
596	s = splsoftnet();
597	callout_ack(&tp->t_timer[TCPT_2MSL]);
598	if (tcp_isdead(tp)) {
599		splx(s);
600		return;
601	}
602
603#ifdef TCP_DEBUG
604#ifdef INET
605	if (tp->t_inpcb)
606		so = tp->t_inpcb->inp_socket;
607#endif
608#ifdef INET6
609	if (tp->t_in6pcb)
610		so = tp->t_in6pcb->in6p_socket;
611#endif
612
613	ostate = tp->t_state;
614#endif /* TCP_DEBUG */
615
616	/*
617	 * 2 MSL timeout in shutdown went off.  If we're closed but
618	 * still waiting for peer to close and connection has been idle
619	 * too long, or if 2MSL time is up from TIME_WAIT, delete connection
620	 * control block.  Otherwise, check again in a bit.
621	 */
622	if (tp->t_state != TCPS_TIME_WAIT &&
623	    ((tcp_maxidle == 0) || ((tcp_now - tp->t_rcvtime) <= tcp_maxidle)))
624		TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl);
625	else
626		tp = tcp_close(tp);
627
628#ifdef TCP_DEBUG
629	if (tp && so->so_options & SO_DEBUG)
630		tcp_trace(TA_USER, ostate, tp, NULL,
631		    PRU_SLOWTIMO | (TCPT_2MSL << 8));
632#endif
633	splx(s);
634}
635