tcp_sack.c revision 1.33
1/* $NetBSD: tcp_sack.c,v 1.33 2016/12/13 08:29:03 ozaki-r Exp $ */
2
3/*
4 * Copyright (c) 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Kentaro A. Kurahone.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
34 *	The Regents of the University of California.  All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	@(#)tcp_sack.c	8.12 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
62 */
63
64/*
65 *	@@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
66 *
67 * NRL grants permission for redistribution and use in source and binary
68 * forms, with or without modification, of the software and documentation
69 * created at NRL provided that the following conditions are met:
70 *
71 * 1. Redistributions of source code must retain the above copyright
72 *    notice, this list of conditions and the following disclaimer.
73 * 2. Redistributions in binary form must reproduce the above copyright
74 *    notice, this list of conditions and the following disclaimer in the
75 *    documentation and/or other materials provided with the distribution.
76 * 3. All advertising materials mentioning features or use of this software
77 *    must display the following acknowledgements:
78 *	This product includes software developed by the University of
79 *	California, Berkeley and its contributors.
80 *	This product includes software developed at the Information
81 *	Technology Division, US Naval Research Laboratory.
82 * 4. Neither the name of the NRL nor the names of its contributors
83 *    may be used to endorse or promote products derived from this software
84 *    without specific prior written permission.
85 *
86 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
87 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
88 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
89 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
90 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
91 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
92 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
93 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
94 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
95 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
96 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97 *
98 * The views and conclusions contained in the software and documentation
99 * are those of the authors and should not be interpreted as representing
100 * official policies, either expressed or implied, of the US Naval
101 * Research Laboratory (NRL).
102 */
103
104#include <sys/cdefs.h>
105__KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.33 2016/12/13 08:29:03 ozaki-r Exp $");
106
107#ifdef _KERNEL_OPT
108#include "opt_inet.h"
109#include "opt_inet_csum.h"
110#include "opt_tcp_debug.h"
111#include "opt_ddb.h"
112#endif
113
114#include <sys/param.h>
115#include <sys/systm.h>
116#include <sys/mbuf.h>
117#include <sys/protosw.h>
118#include <sys/socket.h>
119#include <sys/socketvar.h>
120#include <sys/errno.h>
121#include <sys/syslog.h>
122#include <sys/pool.h>
123#include <sys/domain.h>
124#include <sys/kernel.h>
125
126#include <net/if.h>
127#include <net/route.h>
128#include <net/if_types.h>
129
130#include <netinet/in.h>
131#include <netinet/in_systm.h>
132#include <netinet/ip.h>
133#include <netinet/in_pcb.h>
134#include <netinet/in_var.h>
135#include <netinet/ip_var.h>
136
137#ifdef INET6
138#ifndef INET
139#include <netinet/in.h>
140#endif
141#include <netinet/ip6.h>
142#include <netinet6/ip6_var.h>
143#include <netinet6/in6_pcb.h>
144#include <netinet6/ip6_var.h>
145#include <netinet6/in6_var.h>
146#include <netinet/icmp6.h>
147#endif
148
149#ifndef INET6
150/* always need ip6.h for IP6_EXTHDR_GET */
151#include <netinet/ip6.h>
152#endif
153
154#include <netinet/tcp.h>
155#include <netinet/tcp_fsm.h>
156#include <netinet/tcp_seq.h>
157#include <netinet/tcp_timer.h>
158#include <netinet/tcp_var.h>
159#include <netinet/tcpip.h>
160#include <netinet/tcp_debug.h>
161
162/* SACK block pool. */
163static struct pool sackhole_pool;
164
165void
166tcp_sack_init(void)
167{
168
169	pool_init(&sackhole_pool, sizeof(struct sackhole), 0, 0, 0,
170	    "sackholepl", NULL, IPL_SOFTNET);
171}
172
173static struct sackhole *
174sack_allochole(struct tcpcb *tp)
175{
176	struct sackhole *hole;
177
178	if (tp->snd_numholes >= tcp_sack_tp_maxholes ||
179	    tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
180		return NULL;
181	}
182	hole = pool_get(&sackhole_pool, PR_NOWAIT);
183	if (hole == NULL) {
184		return NULL;
185	}
186	tp->snd_numholes++;
187	tcp_sack_globalholes++;
188
189	return hole;
190}
191
192static struct sackhole *
193sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end,
194    struct sackhole *prev)
195{
196	struct sackhole *hole;
197
198	hole = sack_allochole(tp);
199	if (hole == NULL) {
200		return NULL;
201	}
202	hole->start = hole->rxmit = start;
203	hole->end = end;
204	if (prev != NULL) {
205		TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q);
206	} else {
207		TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q);
208	}
209	return hole;
210}
211
212static struct sackhole *
213sack_removehole(struct tcpcb *tp, struct sackhole *hole)
214{
215	struct sackhole *next;
216
217	next = TAILQ_NEXT(hole, sackhole_q);
218	tp->snd_numholes--;
219	tcp_sack_globalholes--;
220	TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q);
221	pool_put(&sackhole_pool, hole);
222
223	return next;
224}
225
226/*
227 * tcp_new_dsack: record the reception of a duplicated segment.
228 */
229
230void
231tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
232{
233
234	if (TCP_SACK_ENABLED(tp)) {
235		tp->rcv_dsack_block.left = seq;
236		tp->rcv_dsack_block.right = seq + len;
237		tp->rcv_sack_flags |= TCPSACK_HAVED;
238	}
239}
240
241/*
242 * tcp_sack_option: parse the given SACK option and update the scoreboard.
243 */
244
245void
246tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp,
247    int optlen)
248{
249	struct sackblk
250	    t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
251	struct sackblk *sack = NULL;
252	struct sackhole *cur = NULL;
253	struct sackhole *tmp = NULL;
254	const char *lp = cp + 2;
255	int i, j, num_sack_blks;
256	tcp_seq left, right, acked;
257
258	/*
259	 * If we aren't processing SACK responses, this is not an ACK
260	 * or the peer sends us a sack option with invalid length, don't
261	 * update the scoreboard.
262	 */
263	if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) ||
264			(optlen % 8 != 2 || optlen < 10)) {
265		return;
266	}
267
268	/*
269	 * If we don't want any SACK holes to be allocated, just return.
270	 */
271	if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) {
272		return;
273	}
274
275	/* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
276	if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
277		return;
278
279	/*
280	 * Extract SACK blocks.
281	 *
282	 * Note that t_sack_block is sorted so that we only need to do
283	 * one pass over the sequence number space. (SACK "fast-path")
284	 */
285	num_sack_blks = optlen / 8;
286	acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
287	for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) {
288		memcpy(&left, lp, sizeof(uint32_t));
289		memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t));
290		left = ntohl(left);
291		right = ntohl(right);
292
293		if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) ||
294		    SEQ_GEQ(left, right)) {
295			/* SACK entry that's old, or invalid. */
296			i--;
297			num_sack_blks--;
298			continue;
299		}
300
301		/* Insertion sort. */
302		for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
303		    j--) {
304			t_sack_block[j].left = t_sack_block[j - 1].left;
305			t_sack_block[j].right = t_sack_block[j - 1].right;
306		}
307		t_sack_block[j].left = left;
308		t_sack_block[j].right = right;
309	}
310
311	/* Update the scoreboard. */
312	cur = TAILQ_FIRST(&tp->snd_holes);
313	for (i = 0; i < num_sack_blks; i++) {
314		sack = &t_sack_block[i];
315		/*
316		 * FACK TCP.  Update snd_fack so we can enter Fast
317		 * Recovery early.
318		 */
319		if (SEQ_GEQ(sack->right, tp->snd_fack))
320			tp->snd_fack = sack->right;
321
322		if (TAILQ_EMPTY(&tp->snd_holes)) {
323			/* First hole. */
324			cur = sack_inserthole(tp, th->th_ack, sack->left, NULL);
325			if (cur == NULL) {
326				/* ENOBUFS, bail out*/
327				return;
328			}
329			tp->rcv_lastsack = sack->right;
330			continue; /* With next sack block */
331		}
332
333		/* Go through the list of holes. */
334		while (cur) {
335			if (SEQ_LEQ(sack->right, cur->start))
336				/* SACKs data before the current hole */
337				break; /* No use going through more holes */
338
339			if (SEQ_GEQ(sack->left, cur->end)) {
340				/* SACKs data beyond the current hole */
341				cur = TAILQ_NEXT(cur, sackhole_q);
342				continue;
343			}
344
345			if (SEQ_LEQ(sack->left, cur->start)) {
346				/* Data acks at least the beginning of hole */
347				if (SEQ_GEQ(sack->right, cur->end)) {
348					/* Acks entire hole, so delete hole */
349					cur = sack_removehole(tp, cur);
350					break;
351				}
352
353				/* Otherwise, move start of hole forward */
354				cur->start = sack->right;
355				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
356				break;
357			}
358
359			if (SEQ_GEQ(sack->right, cur->end)) {
360				/* Move end of hole backward. */
361				cur->end = sack->left;
362				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
363				cur = TAILQ_NEXT(cur, sackhole_q);
364				break;
365			}
366
367			if (SEQ_LT(cur->start, sack->left) &&
368			    SEQ_GT(cur->end, sack->right)) {
369				/*
370				 * ACKs some data in middle of a hole; need to
371				 * split current hole
372				 */
373				tmp = sack_inserthole(tp, sack->right, cur->end,
374				    cur);
375				if (tmp == NULL) {
376					return;
377				}
378				tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
379				cur->end = sack->left;
380				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
381				cur = tmp;
382				break;
383			}
384		}
385
386		/* At this point, we have reached the tail of the list. */
387		if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
388			/*
389			 * Need to append new hole at end.
390			 */
391			cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left,
392			    NULL);
393			if (cur == NULL) {
394				return;
395			}
396		}
397		if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
398			tp->rcv_lastsack = sack->right;
399		}
400	}
401}
402
403/*
404 * tcp_del_sackholes: remove holes covered by a cumulative ACK.
405 */
406
407void
408tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th)
409{
410	/* Max because this could be an older ack that just arrived. */
411	tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
412		th->th_ack : tp->snd_una;
413	struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
414
415	while (cur) {
416		if (SEQ_LEQ(cur->end, lastack)) {
417			cur = sack_removehole(tp, cur);
418		} else if (SEQ_LT(cur->start, lastack)) {
419			cur->start = lastack;
420			if (SEQ_LT(cur->rxmit, cur->start))
421				cur->rxmit = cur->start;
422			break;
423		} else
424			break;
425	}
426}
427
428/*
429 * tcp_free_sackholes: clear the scoreboard.
430 */
431
432void
433tcp_free_sackholes(struct tcpcb *tp)
434{
435	struct sackhole *sack;
436
437	/* Free up the SACK hole list. */
438	while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
439		sack_removehole(tp, sack);
440	}
441	KASSERT(tp->snd_numholes == 0);
442}
443
444/*
445 * Returns pointer to a sackhole if there are any pending retransmissions;
446 * NULL otherwise.
447 */
448struct sackhole *
449tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
450{
451	struct sackhole *cur = NULL;
452
453	if (!TCP_SACK_ENABLED(tp))
454		return (NULL);
455
456	*sack_bytes_rexmt = 0;
457	TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
458		if (SEQ_LT(cur->rxmit, cur->end)) {
459			if (SEQ_LT(cur->rxmit, tp->snd_una)) {
460				/* old SACK hole */
461				continue;
462			}
463			*sack_bytes_rexmt += (cur->rxmit - cur->start);
464			break;
465		}
466		*sack_bytes_rexmt += (cur->rxmit - cur->start);
467	}
468
469	return (cur);
470}
471
472/*
473 * After a timeout, the SACK list may be rebuilt.  This SACK information
474 * should be used to avoid retransmitting SACKed data.  This function
475 * traverses the SACK list to see if snd_nxt should be moved forward.
476 */
477void
478tcp_sack_adjust(struct tcpcb *tp)
479{
480	struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
481	struct sackhole *n = NULL;
482
483	if (TAILQ_EMPTY(&tp->snd_holes))
484		return; /* No holes */
485	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
486		return; /* We're already beyond any SACKed blocks */
487
488	/*
489	 * Two cases for which we want to advance snd_nxt:
490	 * i) snd_nxt lies between end of one hole and beginning of another
491	 * ii) snd_nxt lies between end of last hole and rcv_lastsack
492	 */
493	while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) {
494		if (SEQ_LT(tp->snd_nxt, cur->end))
495			return;
496		if (SEQ_GEQ(tp->snd_nxt, n->start))
497			cur = n;
498		else {
499			tp->snd_nxt = n->start;
500			return;
501		}
502	}
503	if (SEQ_LT(tp->snd_nxt, cur->end))
504		return;
505	tp->snd_nxt = tp->rcv_lastsack;
506
507	return;
508}
509
510/*
511 * tcp_sack_numblks: return the number of SACK blocks to send.
512 */
513
514int
515tcp_sack_numblks(const struct tcpcb *tp)
516{
517	int numblks;
518
519	if (!TCP_SACK_ENABLED(tp)) {
520		return 0;
521	}
522
523	numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) +
524	    tp->t_segqlen;
525
526	if (numblks == 0) {
527		return 0;
528	}
529
530	if (numblks > TCP_SACK_MAX) {
531		numblks = TCP_SACK_MAX;
532	}
533
534	return numblks;
535}
536
#if defined(DDB)
void sack_dump(const struct tcpcb *);

/*
 * sack_dump: print the SACK state of tp.
 *
 * Prints snd_una/snd_max, rcv_lastsack/snd_fack and the hole count, then
 * one indented line per hole on tp->snd_holes giving its start, end and
 * rxmit sequence numbers.  Only built when DDB is defined.
 */
void
sack_dump(const struct tcpcb *tp)
{
	const struct sackhole *hole;

	printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n",
	    tp->snd_una, tp->snd_max);
	printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n",
	    tp->rcv_lastsack, tp->snd_fack);
	printf("numholes=%d\n", tp->snd_numholes);

	for (hole = TAILQ_FIRST(&tp->snd_holes); hole != NULL;
	    hole = TAILQ_NEXT(hole, sackhole_q)) {
		printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n",
		    hole->start, hole->end, hole->rxmit);
	}
}
#endif /* defined(DDB) */
556