tcp_sack.c revision 1.1
1/* $NetBSD: tcp_sack.c,v 1.1 2005/02/28 16:20:59 jonathan Exp $ */ 2 3/* 4 * Copyright (c) 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Kentaro A. Kurahone. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39/* 40 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 41 * The Regents of the University of California. All rights reserved. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 4. Neither the name of the University nor the names of its contributors 52 * may be used to endorse or promote products derived from this software 53 * without specific prior written permission. 54 * 55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 65 * SUCH DAMAGE. 66 * 67 * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 68 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $ 69 */ 70 71/* 72 * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 73 * 74 * NRL grants permission for redistribution and use in source and binary 75 * forms, with or without modification, of the software and documentation 76 * created at NRL provided that the following conditions are met: 77 * 78 * 1. Redistributions of source code must retain the above copyright 79 * notice, this list of conditions and the following disclaimer. 80 * 2. Redistributions in binary form must reproduce the above copyright 81 * notice, this list of conditions and the following disclaimer in the 82 * documentation and/or other materials provided with the distribution. 83 * 3. All advertising materials mentioning features or use of this software 84 * must display the following acknowledgements: 85 * This product includes software developed by the University of 86 * California, Berkeley and its contributors. 87 * This product includes software developed at the Information 88 * Technology Division, US Naval Research Laboratory. 89 * 4. Neither the name of the NRL nor the names of its contributors 90 * may be used to endorse or promote products derived from this software 91 * without specific prior written permission. 92 * 93 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 94 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 95 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 96 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 97 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 98 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 99 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 100 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 101 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 102 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 103 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 104 * 105 * The views and conclusions contained in the software and documentation 106 * are those of the authors and should not be interpreted as representing 107 * official policies, either expressed or implied, of the US Naval 108 * Research Laboratory (NRL). 109 */ 110 111#include <sys/cdefs.h> 112__KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.1 2005/02/28 16:20:59 jonathan Exp $"); 113 114#include "opt_inet.h" 115#include "opt_ipsec.h" 116#include "opt_inet_csum.h" 117#include "opt_tcp_debug.h" 118 119#include <sys/param.h> 120#include <sys/systm.h> 121#include <sys/malloc.h> 122#include <sys/mbuf.h> 123#include <sys/protosw.h> 124#include <sys/socket.h> 125#include <sys/socketvar.h> 126#include <sys/errno.h> 127#include <sys/syslog.h> 128#include <sys/pool.h> 129#include <sys/domain.h> 130#include <sys/kernel.h> 131 132#include <net/if.h> 133#include <net/route.h> 134#include <net/if_types.h> 135 136#include <netinet/in.h> 137#include <netinet/in_systm.h> 138#include <netinet/ip.h> 139#include <netinet/in_pcb.h> 140#include <netinet/in_var.h> 141#include <netinet/ip_var.h> 142 143#ifdef INET6 144#ifndef INET 145#include <netinet/in.h> 146#endif 147#include <netinet/ip6.h> 148#include <netinet6/ip6_var.h> 149#include <netinet6/in6_pcb.h> 150#include <netinet6/ip6_var.h> 151#include <netinet6/in6_var.h> 152#include <netinet/icmp6.h> 153#include <netinet6/nd6.h> 154#endif 155 156#ifndef INET6 157/* always need ip6.h for IP6_EXTHDR_GET */ 158#include <netinet/ip6.h> 159#endif 160 161#include <netinet/tcp.h> 162#include <netinet/tcp_fsm.h> 163#include <netinet/tcp_seq.h> 164#include <netinet/tcp_timer.h> 165#include <netinet/tcp_var.h> 166#include <netinet/tcpip.h> 167#include <netinet/tcp_debug.h> 168 169#include <machine/stdarg.h> 170 171#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) 172#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b)) 173 174/* SACK block pool. */ 175POOL_INIT(sackhole_pool, sizeof(struct sackhole), 0, 0, 0, "sackholepl", NULL); 176 177void 178tcp_update_sack_list(struct tcpcb *tp) 179{ 180 int i = 0; 181 struct ipqent *tiqe = NULL; 182 183 if (!TCP_SACK_ENABLED(tp) || (tp->t_flags & TF_SIGNATURE)) { 184 /* Can't SACK this connection. */ 185 return; 186 } 187 188 /* 189 * If possible, tack on the D-SACK block. (RFC2883) 190 */ 191 if (tp->rcv_sack_flags & TCPSACK_HAVED) { 192 tp->rcv_sack_block[0].left = tp->rcv_dsack_block.left; 193 tp->rcv_sack_block[0].right = tp->rcv_dsack_block.right; 194 tp->rcv_sack_flags &= ~TCPSACK_HAVED; 195 i++; 196 } 197 198 /* 199 * Build up a list of holes in the TCP space. Note that 200 * the first SACK block is always the most recent segment 201 * received. 202 */ 203 TAILQ_FOREACH(tiqe, &tp->timeq, ipqe_timeq) { 204 tp->rcv_sack_block[i].left = tiqe->ipqe_seq; 205 tp->rcv_sack_block[i].right = tiqe->ipqe_seq + tiqe->ipqe_len; 206 i++; 207 if (i >= TCP_SACK_MAX) { 208 break; 209 } 210 } 211 212 /* If we can SACK, do so. */ 213 tp->rcv_sack_num = i; 214} 215 216void 217tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len) 218{ 219 if (TCP_SACK_ENABLED(tp)) { 220 tp->rcv_dsack_block.left = seq; 221 tp->rcv_dsack_block.right = seq + len; 222 tp->rcv_sack_flags |= TCPSACK_HAVED; 223 } 224} 225 226void 227tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 228{ 229 struct sackblk t_sack_block[TCP_SACK_MAX]; 230 struct sackblk *sack = NULL; 231 struct sackhole *cur = NULL; 232 struct sackhole *tmp = NULL; 233 u_int32_t *lp = (u_int32_t *) (cp + 2); 234 int i, j, num_sack_blks; 235 tcp_seq left, right, acked; 236 237 /* 238 * If we aren't processing SACK responses, or the peer 239 * sends us a sack option with invalid length, don't 240 * update the scoreboard. 241 */ 242 if (!TCP_SACK_ENABLED(tp) || 243 (optlen % 8 != 2 || optlen < 10)) { 244 return; 245 } 246 247 /* 248 * Extract SACK blocks. 249 * 250 * Note that t_sack_block is sorted so that we only need to do 251 * one pass over the sequence number space. (SACK "fast-path") 252 */ 253 num_sack_blks = optlen / 8; 254 acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una; 255 for (i = 0; i < num_sack_blks; i++, lp += 2) { 256 left = ntohl(*lp); 257 right = ntohl(*(lp + 1)); 258 259 if ((SEQ_LEQ(right, acked)) || 260 SEQ_GEQ(left, tp->snd_max) || 261 SEQ_GEQ(left, right)) { 262 /* SACK entry that's old, or invalid. */ 263 i--; 264 num_sack_blks--; 265 continue; 266 } 267 268 /* Insertion sort. */ 269 for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left); j--) { 270 t_sack_block[j].left = t_sack_block[j - 1].left; 271 t_sack_block[j].right = t_sack_block[j - 1].right; 272 } 273 t_sack_block[j].left = left; 274 t_sack_block[j].right = right; 275 } 276 277 /* Update the scoreboard. */ 278 cur = TAILQ_FIRST(&tp->snd_holes); 279 for (i = 0; i < num_sack_blks; i++) { 280 sack = &t_sack_block[i]; 281 /* 282 * FACK TCP. Update snd_fack so we can enter Fast 283 * Recovery early. 284 */ 285 if (SEQ_GEQ(sack->right, tp->snd_fack)) 286 tp->snd_fack = sack->right; 287 288 if (TAILQ_EMPTY(&tp->snd_holes)) { 289 /* First hole. */ 290 cur = (struct sackhole *) pool_get(&sackhole_pool, PR_NOWAIT); 291 if (cur == NULL) { 292 /* ENOBUFS, bail out*/ 293 return; 294 } 295 cur->start = th->th_ack; 296 cur->end = sack->left; 297 cur->rxmit = cur->start; 298 tp->rcv_lastsack = sack->right; 299 TAILQ_INSERT_HEAD(&tp->snd_holes, cur, sackhole_q); 300 continue; /* With next sack block */ 301 } 302 303 /* Go through the list of holes. */ 304 while (cur) { 305 if (SEQ_LEQ(sack->left, cur->start)) 306 /* SACKs data before the current hole */ 307 break; /* No use going through more holes */ 308 309 if (SEQ_GEQ(sack->left, cur->end)) { 310 /* SACKs data beyond the current hole */ 311 cur = TAILQ_NEXT(cur, sackhole_q); 312 continue; 313 } 314 315 if (SEQ_LEQ(sack->left, cur->start)) { 316 /* Data acks at least the beginning of hole */ 317 if (SEQ_GEQ(sack->right, cur->end)) { 318 /* Acks entire hole, so delete hole */ 319 tmp = cur; 320 cur = TAILQ_NEXT(cur, sackhole_q); 321 TAILQ_REMOVE(&tp->snd_holes, tmp, sackhole_q); 322 pool_put(&sackhole_pool, tmp); 323 break; 324 } 325 326 /* Otherwise, move start of hole forward */ 327 cur->start = sack->right; 328 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 329 cur = TAILQ_NEXT(cur, sackhole_q); 330 break; 331 } 332 333 if (SEQ_GEQ(sack->right, cur->end)) { 334 /* Move end of hole backward. */ 335 cur->end = sack->left; 336 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 337 cur = TAILQ_NEXT(cur, sackhole_q); 338 break; 339 } 340 341 if (SEQ_LT(cur->start, sack->left) && 342 SEQ_GT(cur->end, sack->right)) { 343 /* 344 * ACKs some data in middle of a hole; need to 345 * split current hole 346 */ 347 tmp = (struct sackhole *) 348 pool_get(&sackhole_pool, PR_NOWAIT); 349 if (tmp == NULL) { 350 /* ENOBUFS, bail out. */ 351 return; 352 } 353 tmp->start = sack->right; 354 tmp->end = cur->end; 355 tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start); 356 cur->end = sack->left; 357 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 358 TAILQ_INSERT_AFTER(&tp->snd_holes, cur, tmp, 359 sackhole_q); 360 cur = TAILQ_NEXT(tmp, sackhole_q); 361 break; 362 } 363 } 364 365 /* At this point, we have reached the tail of the list. */ 366 if (SEQ_LT(tp->rcv_lastsack, sack->left)) { 367 /* 368 * Need to append new hole at end. 369 */ 370 tmp = (struct sackhole *) 371 pool_get(&sackhole_pool, PR_NOWAIT); 372 if (tmp == NULL) 373 continue; /* ENOBUFS */ 374 tmp->start = tp->rcv_lastsack; 375 tmp->end = sack->left; 376 tmp->rxmit = tmp->start; 377 tp->rcv_lastsack = sack->right; 378 TAILQ_INSERT_TAIL(&tp->snd_holes, tmp, sackhole_q); 379 cur = tmp; 380 } 381 } 382} 383 384void 385tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 386{ 387 /* Max because this could be an older ack that just arrived. */ 388 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 389 th->th_ack : tp->snd_una; 390 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); 391 struct sackhole *tmp; 392 393 while (cur) { 394 if (SEQ_LEQ(cur->end, lastack)) { 395 tmp = cur; 396 cur = TAILQ_NEXT(cur, sackhole_q); 397 TAILQ_REMOVE(&tp->snd_holes, tmp, sackhole_q); 398 pool_put(&sackhole_pool, tmp); 399 } else if (SEQ_LT(cur->start, lastack)) { 400 cur->start = lastack; 401 if (SEQ_LT(cur->rxmit, cur->start)) 402 cur->rxmit = cur->start; 403 break; 404 } else 405 break; 406 407 } 408} 409 410void 411tcp_free_sackholes(struct tcpcb *tp) 412{ 413 struct sackhole *sack; 414 415 /* Free up the SACK hole list. */ 416 while (!TAILQ_EMPTY(&tp->snd_holes)) { 417 sack = TAILQ_FIRST(&tp->snd_holes); 418 TAILQ_REMOVE(&tp->snd_holes, sack, sackhole_q); 419 pool_put(&sackhole_pool, sack); 420 } 421} 422 423/* 424 * Implements the SACK response to a new ack, checking for partial acks 425 * in fast recovery. 426 */ 427void 428tcp_sack_newack(struct tcpcb *tp, struct tcphdr *th) 429{ 430 if (tp->t_partialacks < 0) { 431 /* 432 * Not in fast recovery. Reset the duplicate ack 433 * counter. 434 */ 435 tp->t_dupacks = 0; 436 } else if (SEQ_LT(th->th_ack, tp->snd_recover)) { 437 /* 438 * Partial ack handling within a sack recovery episode. 439 * Keeping this very simple for now. When a partial ack 440 * is received, force snd_cwnd to a value that will allow 441 * the sender to transmit no more than 2 segments. 442 * If necessary, a fancier scheme can be adopted at a 443 * later point, but for now, the goal is to prevent the 444 * sender from bursting a large amount of data in the midst 445 * of sack recovery. 446 */ 447 int num_segs = 1; 448 int sack_bytes_rxmt = 0; 449 450 tp->t_partialacks++; 451 TCP_TIMER_DISARM(tp, TCPT_REXMT); 452 tp->t_rtttime = 0; 453 454 /* send one or 2 segments based on how much new data was acked */ 455 if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2) 456 num_segs = 2; 457 (void)tcp_sack_output(tp, &sack_bytes_rxmt); 458 tp->snd_cwnd = sack_bytes_rxmt + (tp->snd_nxt - tp->sack_newdata) + 459 num_segs * tp->t_segsz; 460 tp->t_flags |= TF_ACKNOW; 461 (void) tcp_output(tp); 462 } else { 463 /* 464 * Complete ack, inflate the congestion window to 465 * ssthresh and exit fast recovery. 466 * 467 * Window inflation should have left us with approx. 468 * snd_ssthresh outstanding data. But in case we 469 * would be inclined to send a burst, better to do 470 * it via the slow start mechanism. 471 */ 472 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 473 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 474 + tp->t_segsz; 475 else 476 tp->snd_cwnd = tp->snd_ssthresh; 477 tp->t_partialacks = -1; 478 tp->t_dupacks = 0; 479 if (SEQ_GT(th->th_ack, tp->snd_fack)) 480 tp->snd_fack = th->th_ack; 481 } 482} 483 484/* 485 * Returns pointer to a sackhole if there are any pending retransmissions; 486 * NULL otherwise. 487 */ 488struct sackhole * 489tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) 490{ 491 struct sackhole *cur = NULL; 492 493 if(!TCP_SACK_ENABLED(tp)) 494 return (NULL); 495 496 *sack_bytes_rexmt = 0; 497 TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { 498 if (SEQ_LT(cur->rxmit, cur->end)) { 499 if (SEQ_LT(cur->rxmit, tp->snd_una)) {/* old SACK hole */ 500 continue; 501 } 502 *sack_bytes_rexmt += (cur->rxmit - cur->start); 503 break; 504 } 505 *sack_bytes_rexmt += (cur->rxmit - cur->start); 506 } 507 508 return (cur); 509} 510 511/* 512 * After a timeout, the SACK list may be rebuilt. This SACK information 513 * should be used to avoid retransmitting SACKed data. This function 514 * traverses the SACK list to see if snd_nxt should be moved forward. 515 */ 516void 517tcp_sack_adjust(struct tcpcb *tp) 518{ 519 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); 520 struct sackhole *n = NULL; 521 522 if (TAILQ_EMPTY(&tp->snd_holes)) 523 return; /* No holes */ 524 if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack)) 525 return; /* We're already beyond any SACKed blocks */ 526 527 /* 528 * Two cases for which we want to advance snd_nxt: 529 * i) snd_nxt lies between end of one hole and beginning of another 530 * ii) snd_nxt lies between end of last hole and rcv_lastsack 531 */ 532 while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) { 533 if (SEQ_LT(tp->snd_nxt, cur->end)) 534 return; 535 if (SEQ_GEQ(tp->snd_nxt, n->start)) 536 cur = n; 537 else { 538 tp->snd_nxt = n->start; 539 return; 540 } 541 } 542 if (SEQ_LT(tp->snd_nxt, cur->end)) 543 return; 544 tp->snd_nxt = tp->rcv_lastsack; 545 546 return; 547} 548