tcp_sack.c revision 1.16
1/* $NetBSD: tcp_sack.c,v 1.16 2006/10/07 19:26:07 yamt Exp $ */
2
3/*
4 * Copyright (c) 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Kentaro A. Kurahone.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the NetBSD
21 *	Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 *    contributors may be used to endorse or promote products derived
24 *    from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39/*
40 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
41 *	The Regents of the University of California.  All rights reserved.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 *    notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 *    notice, this list of conditions and the following disclaimer in the
50 *    documentation and/or other materials provided with the distribution.
51 * 4. Neither the name of the University nor the names of its contributors
52 *    may be used to endorse or promote products derived from this software
53 *    without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 *	@(#)tcp_sack.c	8.12 (Berkeley) 5/24/95
68 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
69 */
70
71/*
72 *	@@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
73 *
74 * NRL grants permission for redistribution and use in source and binary
75 * forms, with or without modification, of the software and documentation
76 * created at NRL provided that the following conditions are met:
77 *
78 * 1. Redistributions of source code must retain the above copyright
79 *    notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright
81 *    notice, this list of conditions and the following disclaimer in the
82 *    documentation and/or other materials provided with the distribution.
83 * 3. All advertising materials mentioning features or use of this software
84 *    must display the following acknowledgements:
85 *	This product includes software developed by the University of
86 *	California, Berkeley and its contributors.
87 *	This product includes software developed at the Information
88 *	Technology Division, US Naval Research Laboratory.
89 * 4. Neither the name of the NRL nor the names of its contributors
90 *    may be used to endorse or promote products derived from this software
91 *    without specific prior written permission.
92 *
93 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
94 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
95 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
96 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
97 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
98 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
99 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
100 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
101 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
102 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
103 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
104 *
105 * The views and conclusions contained in the software and documentation
106 * are those of the authors and should not be interpreted as representing
107 * official policies, either expressed or implied, of the US Naval
108 * Research Laboratory (NRL).
109 */
110
111#include <sys/cdefs.h>
112__KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.16 2006/10/07 19:26:07 yamt Exp $");
113
114#include "opt_inet.h"
115#include "opt_ipsec.h"
116#include "opt_inet_csum.h"
117#include "opt_tcp_debug.h"
118
119#include <sys/param.h>
120#include <sys/systm.h>
121#include <sys/malloc.h>
122#include <sys/mbuf.h>
123#include <sys/protosw.h>
124#include <sys/socket.h>
125#include <sys/socketvar.h>
126#include <sys/errno.h>
127#include <sys/syslog.h>
128#include <sys/pool.h>
129#include <sys/domain.h>
130#include <sys/kernel.h>
131
132#include <net/if.h>
133#include <net/route.h>
134#include <net/if_types.h>
135
136#include <netinet/in.h>
137#include <netinet/in_systm.h>
138#include <netinet/ip.h>
139#include <netinet/in_pcb.h>
140#include <netinet/in_var.h>
141#include <netinet/ip_var.h>
142
143#ifdef INET6
144#ifndef INET
145#include <netinet/in.h>
146#endif
147#include <netinet/ip6.h>
148#include <netinet6/ip6_var.h>
149#include <netinet6/in6_pcb.h>
150#include <netinet6/ip6_var.h>
151#include <netinet6/in6_var.h>
152#include <netinet/icmp6.h>
153#include <netinet6/nd6.h>
154#endif
155
156#ifndef INET6
157/* always need ip6.h for IP6_EXTHDR_GET */
158#include <netinet/ip6.h>
159#endif
160
161#include <netinet/tcp.h>
162#include <netinet/tcp_fsm.h>
163#include <netinet/tcp_seq.h>
164#include <netinet/tcp_timer.h>
165#include <netinet/tcp_var.h>
166#include <netinet/tcpip.h>
167#include <netinet/tcp_debug.h>
168
169#include <machine/stdarg.h>
170
171/* SACK block pool. */
172POOL_INIT(sackhole_pool, sizeof(struct sackhole), 0, 0, 0, "sackholepl", NULL);
173
174void
175tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
176{
177	if (TCP_SACK_ENABLED(tp)) {
178		tp->rcv_dsack_block.left = seq;
179		tp->rcv_dsack_block.right = seq + len;
180		tp->rcv_sack_flags |= TCPSACK_HAVED;
181	}
182}
183
184void
185tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
186{
187	struct sackblk
188	    t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
189	struct sackblk *sack = NULL;
190	struct sackhole *cur = NULL;
191	struct sackhole *tmp = NULL;
192	u_int32_t *lp = (u_int32_t *) (cp + 2);
193	int i, j, num_sack_blks, s;
194	tcp_seq left, right, acked;
195
196	/*
197	 * If we aren't processing SACK responses, this is not an ACK
198	 * or the peer sends us a sack option with invalid length, don't
199	 * update the scoreboard.
200	 */
201	if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) ||
202			(optlen % 8 != 2 || optlen < 10)) {
203		return;
204	}
205
206	/*
207	 * If we don't want any SACK holes to be allocated, just return.
208	 */
209	if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) {
210		return;
211	}
212
213	/* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
214	if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
215		return;
216
217	/*
218	 * Extract SACK blocks.
219	 *
220	 * Note that t_sack_block is sorted so that we only need to do
221	 * one pass over the sequence number space. (SACK "fast-path")
222	 */
223	num_sack_blks = optlen / 8;
224	acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
225	for (i = 0; i < num_sack_blks; i++, lp += 2) {
226		memcpy(&left, lp, sizeof(*lp));
227		memcpy(&right, lp + 1, sizeof(*lp));
228		left = ntohl(left);
229		right = ntohl(right);
230
231		if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) ||
232		    SEQ_GEQ(left, right)) {
233			/* SACK entry that's old, or invalid. */
234			i--;
235			num_sack_blks--;
236			continue;
237		}
238
239		/* Insertion sort. */
240		for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
241		    j--) {
242			t_sack_block[j].left = t_sack_block[j - 1].left;
243			t_sack_block[j].right = t_sack_block[j - 1].right;
244		}
245		t_sack_block[j].left = left;
246		t_sack_block[j].right = right;
247	}
248
249	/* XXX: Investigate making this a bit more fine-grained. */
250	s = splsoftnet();
251
252	/* Update the scoreboard. */
253	cur = TAILQ_FIRST(&tp->snd_holes);
254	for (i = 0; i < num_sack_blks; i++) {
255		sack = &t_sack_block[i];
256		/*
257		 * FACK TCP.  Update snd_fack so we can enter Fast
258		 * Recovery early.
259		 */
260		if (SEQ_GEQ(sack->right, tp->snd_fack))
261			tp->snd_fack = sack->right;
262
263		if (TAILQ_EMPTY(&tp->snd_holes)) {
264			/* First hole. */
265			if (tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
266				splx(s);
267				return;
268			}
269			cur = (struct sackhole *)
270			    pool_get(&sackhole_pool, PR_NOWAIT);
271			if (cur == NULL) {
272				/* ENOBUFS, bail out*/
273				splx(s);
274				return;
275			}
276			cur->start = th->th_ack;
277			cur->end = sack->left;
278			cur->rxmit = cur->start;
279			tp->rcv_lastsack = sack->right;
280			tp->snd_numholes++;
281			tcp_sack_globalholes++;
282			TAILQ_INSERT_HEAD(&tp->snd_holes, cur, sackhole_q);
283			continue; /* With next sack block */
284		}
285
286		/* Go through the list of holes. */
287		while (cur) {
288			if (SEQ_LEQ(sack->right, cur->start))
289				/* SACKs data before the current hole */
290				break; /* No use going through more holes */
291
292			if (SEQ_GEQ(sack->left, cur->end)) {
293				/* SACKs data beyond the current hole */
294				cur = TAILQ_NEXT(cur, sackhole_q);
295				continue;
296			}
297
298			if (SEQ_LEQ(sack->left, cur->start)) {
299				/* Data acks at least the beginning of hole */
300				if (SEQ_GEQ(sack->right, cur->end)) {
301					/* Acks entire hole, so delete hole */
302					tmp = cur;
303					cur = TAILQ_NEXT(cur, sackhole_q);
304					tp->snd_numholes--;
305					tcp_sack_globalholes--;
306					TAILQ_REMOVE(&tp->snd_holes, tmp,
307					    sackhole_q);
308					pool_put(&sackhole_pool, tmp);
309					break;
310				}
311
312				/* Otherwise, move start of hole forward */
313				cur->start = sack->right;
314				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
315				break;
316			}
317
318			if (SEQ_GEQ(sack->right, cur->end)) {
319				/* Move end of hole backward. */
320				cur->end = sack->left;
321				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
322				cur = TAILQ_NEXT(cur, sackhole_q);
323				break;
324			}
325
326			if (SEQ_LT(cur->start, sack->left) &&
327			    SEQ_GT(cur->end, sack->right)) {
328				/*
329				 * ACKs some data in middle of a hole; need to
330				 * split current hole
331				 */
332				if (tcp_sack_globalholes >=
333						tcp_sack_globalmaxholes ||
334						tp->snd_numholes >=
335						tcp_sack_tp_maxholes) {
336					splx(s);
337					return;
338				}
339				tmp = (struct sackhole *)
340				    pool_get(&sackhole_pool, PR_NOWAIT);
341				if (tmp == NULL) {
342					/* ENOBUFS, bail out. */
343					splx(s);
344					return;
345				}
346				tmp->start = sack->right;
347				tmp->end = cur->end;
348				tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
349				cur->end = sack->left;
350				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
351				tp->snd_numholes++;
352				tcp_sack_globalholes++;
353				TAILQ_INSERT_AFTER(&tp->snd_holes, cur, tmp,
354						sackhole_q);
355				cur = tmp;
356				break;
357			}
358		}
359
360		/* At this point, we have reached the tail of the list. */
361		if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
362			/*
363			 * Need to append new hole at end.
364			 */
365			if (tcp_sack_globalholes >=
366					tcp_sack_globalmaxholes ||
367					tp->snd_numholes >=
368					tcp_sack_tp_maxholes) {
369				splx(s);
370				return;
371			}
372			tmp = (struct sackhole *)
373			    pool_get(&sackhole_pool, PR_NOWAIT);
374			if (tmp == NULL)
375				continue; /* ENOBUFS */
376			tmp->start = tp->rcv_lastsack;
377			tmp->end = sack->left;
378			tmp->rxmit = tmp->start;
379			tp->snd_numholes++;
380			tcp_sack_globalholes++;
381			TAILQ_INSERT_TAIL(&tp->snd_holes, tmp, sackhole_q);
382			cur = tmp;
383		}
384		if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
385			tp->rcv_lastsack = sack->right;
386		}
387	}
388
389	splx(s);
390}
391
392void
393tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
394{
395	/* Max because this could be an older ack that just arrived. */
396	tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
397		th->th_ack : tp->snd_una;
398	struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
399	struct sackhole *tmp;
400	int s;
401
402	s = splsoftnet();
403
404	while (cur) {
405		if (SEQ_LEQ(cur->end, lastack)) {
406			tmp = cur;
407			cur = TAILQ_NEXT(cur, sackhole_q);
408			tp->snd_numholes--;
409			tcp_sack_globalholes--;
410			TAILQ_REMOVE(&tp->snd_holes, tmp, sackhole_q);
411			pool_put(&sackhole_pool, tmp);
412		} else if (SEQ_LT(cur->start, lastack)) {
413			cur->start = lastack;
414			if (SEQ_LT(cur->rxmit, cur->start))
415				cur->rxmit = cur->start;
416			break;
417		} else
418			break;
419	}
420
421	splx(s);
422}
423
424void
425tcp_free_sackholes(struct tcpcb *tp)
426{
427	struct sackhole *sack;
428	int s;
429
430	s = splsoftnet();
431
432	/* Free up the SACK hole list. */
433	while (!TAILQ_EMPTY(&tp->snd_holes)) {
434		sack = TAILQ_FIRST(&tp->snd_holes);
435		tcp_sack_globalholes--;
436		TAILQ_REMOVE(&tp->snd_holes, sack, sackhole_q);
437		pool_put(&sackhole_pool, sack);
438	}
439
440	tp->snd_numholes = 0;
441
442	splx(s);
443}
444
445/*
446 * Implements the SACK response to a new ack, checking for partial acks
447 * in fast recovery.
448 */
449void
450tcp_sack_newack(struct tcpcb *tp, struct tcphdr *th)
451{
452	if (tp->t_partialacks < 0) {
453		/*
454		 * Not in fast recovery.  Reset the duplicate ack
455		 * counter.
456		 */
457		tp->t_dupacks = 0;
458	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
459		/*
460		 * Partial ack handling within a sack recovery episode.
461		 * Keeping this very simple for now. When a partial ack
462		 * is received, force snd_cwnd to a value that will allow
463		 * the sender to transmit no more than 2 segments.
464		 * If necessary, a fancier scheme can be adopted at a
465		 * later point, but for now, the goal is to prevent the
466		 * sender from bursting a large amount of data in the midst
467		 * of sack recovery.
468		 */
469		int num_segs = 1;
470		int sack_bytes_rxmt = 0;
471
472		tp->t_partialacks++;
473		TCP_TIMER_DISARM(tp, TCPT_REXMT);
474		tp->t_rtttime = 0;
475
476	 	/*
477		 * send one or 2 segments based on how much new data was acked
478		 */
479 		if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
480 			num_segs = 2;
481	 	(void)tcp_sack_output(tp, &sack_bytes_rxmt);
482 		tp->snd_cwnd = sack_bytes_rxmt +
483		    (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_segsz;
484  		tp->t_flags |= TF_ACKNOW;
485	  	(void) tcp_output(tp);
486	} else {
487		/*
488		 * Complete ack, inflate the congestion window to
489                 * ssthresh and exit fast recovery.
490		 *
491		 * Window inflation should have left us with approx.
492		 * snd_ssthresh outstanding data.  But in case we
493		 * would be inclined to send a burst, better to do
494		 * it via the slow start mechanism.
495		 */
496		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
497			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
498			    + tp->t_segsz;
499		else
500			tp->snd_cwnd = tp->snd_ssthresh;
501		tp->t_partialacks = -1;
502		tp->t_dupacks = 0;
503		if (SEQ_GT(th->th_ack, tp->snd_fack))
504			tp->snd_fack = th->th_ack;
505	}
506}
507
508/*
509 * Returns pointer to a sackhole if there are any pending retransmissions;
510 * NULL otherwise.
511 */
512struct sackhole *
513tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
514{
515	struct sackhole *cur = NULL;
516
517	if(!TCP_SACK_ENABLED(tp))
518		return (NULL);
519
520	*sack_bytes_rexmt = 0;
521	TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
522		if (SEQ_LT(cur->rxmit, cur->end)) {
523			if (SEQ_LT(cur->rxmit, tp->snd_una)) {
524				/* old SACK hole */
525				continue;
526			}
527			*sack_bytes_rexmt += (cur->rxmit - cur->start);
528			break;
529		}
530		*sack_bytes_rexmt += (cur->rxmit - cur->start);
531	}
532
533	return (cur);
534}
535
536/*
537 * After a timeout, the SACK list may be rebuilt.  This SACK information
538 * should be used to avoid retransmitting SACKed data.  This function
539 * traverses the SACK list to see if snd_nxt should be moved forward.
540 */
541void
542tcp_sack_adjust(struct tcpcb *tp)
543{
544	struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
545	struct sackhole *n = NULL;
546
547	if (TAILQ_EMPTY(&tp->snd_holes))
548		return; /* No holes */
549	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
550		return; /* We're already beyond any SACKed blocks */
551
552	/*
553	 * Two cases for which we want to advance snd_nxt:
554	 * i) snd_nxt lies between end of one hole and beginning of another
555	 * ii) snd_nxt lies between end of last hole and rcv_lastsack
556	 */
557	while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) {
558		if (SEQ_LT(tp->snd_nxt, cur->end))
559			return;
560		if (SEQ_GEQ(tp->snd_nxt, n->start))
561			cur = n;
562		else {
563			tp->snd_nxt = n->start;
564			return;
565		}
566	}
567	if (SEQ_LT(tp->snd_nxt, cur->end))
568		return;
569	tp->snd_nxt = tp->rcv_lastsack;
570
571	return;
572}
573
574int
575tcp_sack_numblks(const struct tcpcb *tp)
576{
577	int numblks;
578
579	if (!TCP_SACK_ENABLED(tp)) {
580		return 0;
581	}
582
583	numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) +
584	    tp->t_segqlen;
585
586	if (numblks == 0) {
587		return 0;
588	}
589
590	if (numblks > TCP_SACK_MAX) {
591		numblks = TCP_SACK_MAX;
592	}
593
594	return numblks;
595}
596