tcp_sack.c revision 1.33
1/* $NetBSD: tcp_sack.c,v 1.33 2016/12/13 08:29:03 ozaki-r Exp $ */ 2 3/* 4 * Copyright (c) 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Kentaro A. Kurahone. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32/* 33 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 34 * The Regents of the University of California. All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $ 62 */ 63 64/* 65 * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 66 * 67 * NRL grants permission for redistribution and use in source and binary 68 * forms, with or without modification, of the software and documentation 69 * created at NRL provided that the following conditions are met: 70 * 71 * 1. Redistributions of source code must retain the above copyright 72 * notice, this list of conditions and the following disclaimer. 73 * 2. Redistributions in binary form must reproduce the above copyright 74 * notice, this list of conditions and the following disclaimer in the 75 * documentation and/or other materials provided with the distribution. 76 * 3. All advertising materials mentioning features or use of this software 77 * must display the following acknowledgements: 78 * This product includes software developed by the University of 79 * California, Berkeley and its contributors. 80 * This product includes software developed at the Information 81 * Technology Division, US Naval Research Laboratory. 82 * 4. Neither the name of the NRL nor the names of its contributors 83 * may be used to endorse or promote products derived from this software 84 * without specific prior written permission. 85 * 86 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 87 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 88 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 89 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 90 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 91 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 92 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 93 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 94 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 95 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 96 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 97 * 98 * The views and conclusions contained in the software and documentation 99 * are those of the authors and should not be interpreted as representing 100 * official policies, either expressed or implied, of the US Naval 101 * Research Laboratory (NRL). 102 */ 103 104#include <sys/cdefs.h> 105__KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.33 2016/12/13 08:29:03 ozaki-r Exp $"); 106 107#ifdef _KERNEL_OPT 108#include "opt_inet.h" 109#include "opt_inet_csum.h" 110#include "opt_tcp_debug.h" 111#include "opt_ddb.h" 112#endif 113 114#include <sys/param.h> 115#include <sys/systm.h> 116#include <sys/mbuf.h> 117#include <sys/protosw.h> 118#include <sys/socket.h> 119#include <sys/socketvar.h> 120#include <sys/errno.h> 121#include <sys/syslog.h> 122#include <sys/pool.h> 123#include <sys/domain.h> 124#include <sys/kernel.h> 125 126#include <net/if.h> 127#include <net/route.h> 128#include <net/if_types.h> 129 130#include <netinet/in.h> 131#include <netinet/in_systm.h> 132#include <netinet/ip.h> 133#include <netinet/in_pcb.h> 134#include <netinet/in_var.h> 135#include <netinet/ip_var.h> 136 137#ifdef INET6 138#ifndef INET 139#include <netinet/in.h> 140#endif 141#include <netinet/ip6.h> 142#include <netinet6/ip6_var.h> 143#include <netinet6/in6_pcb.h> 144#include <netinet6/ip6_var.h> 145#include <netinet6/in6_var.h> 146#include <netinet/icmp6.h> 147#endif 148 149#ifndef INET6 150/* always need ip6.h for IP6_EXTHDR_GET */ 151#include <netinet/ip6.h> 152#endif 153 154#include <netinet/tcp.h> 155#include <netinet/tcp_fsm.h> 156#include <netinet/tcp_seq.h> 157#include <netinet/tcp_timer.h> 158#include <netinet/tcp_var.h> 159#include <netinet/tcpip.h> 160#include <netinet/tcp_debug.h> 161 162/* SACK block pool. */ 163static struct pool sackhole_pool; 164 165void 166tcp_sack_init(void) 167{ 168 169 pool_init(&sackhole_pool, sizeof(struct sackhole), 0, 0, 0, 170 "sackholepl", NULL, IPL_SOFTNET); 171} 172 173static struct sackhole * 174sack_allochole(struct tcpcb *tp) 175{ 176 struct sackhole *hole; 177 178 if (tp->snd_numholes >= tcp_sack_tp_maxholes || 179 tcp_sack_globalholes >= tcp_sack_globalmaxholes) { 180 return NULL; 181 } 182 hole = pool_get(&sackhole_pool, PR_NOWAIT); 183 if (hole == NULL) { 184 return NULL; 185 } 186 tp->snd_numholes++; 187 tcp_sack_globalholes++; 188 189 return hole; 190} 191 192static struct sackhole * 193sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end, 194 struct sackhole *prev) 195{ 196 struct sackhole *hole; 197 198 hole = sack_allochole(tp); 199 if (hole == NULL) { 200 return NULL; 201 } 202 hole->start = hole->rxmit = start; 203 hole->end = end; 204 if (prev != NULL) { 205 TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q); 206 } else { 207 TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q); 208 } 209 return hole; 210} 211 212static struct sackhole * 213sack_removehole(struct tcpcb *tp, struct sackhole *hole) 214{ 215 struct sackhole *next; 216 217 next = TAILQ_NEXT(hole, sackhole_q); 218 tp->snd_numholes--; 219 tcp_sack_globalholes--; 220 TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q); 221 pool_put(&sackhole_pool, hole); 222 223 return next; 224} 225 226/* 227 * tcp_new_dsack: record the reception of a duplicated segment. 228 */ 229 230void 231tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len) 232{ 233 234 if (TCP_SACK_ENABLED(tp)) { 235 tp->rcv_dsack_block.left = seq; 236 tp->rcv_dsack_block.right = seq + len; 237 tp->rcv_sack_flags |= TCPSACK_HAVED; 238 } 239} 240 241/* 242 * tcp_sack_option: parse the given SACK option and update the scoreboard. 243 */ 244 245void 246tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp, 247 int optlen) 248{ 249 struct sackblk 250 t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)]; 251 struct sackblk *sack = NULL; 252 struct sackhole *cur = NULL; 253 struct sackhole *tmp = NULL; 254 const char *lp = cp + 2; 255 int i, j, num_sack_blks; 256 tcp_seq left, right, acked; 257 258 /* 259 * If we aren't processing SACK responses, this is not an ACK 260 * or the peer sends us a sack option with invalid length, don't 261 * update the scoreboard. 262 */ 263 if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) || 264 (optlen % 8 != 2 || optlen < 10)) { 265 return; 266 } 267 268 /* 269 * If we don't want any SACK holes to be allocated, just return. 270 */ 271 if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) { 272 return; 273 } 274 275 /* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */ 276 if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max)) 277 return; 278 279 /* 280 * Extract SACK blocks. 281 * 282 * Note that t_sack_block is sorted so that we only need to do 283 * one pass over the sequence number space. (SACK "fast-path") 284 */ 285 num_sack_blks = optlen / 8; 286 acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una; 287 for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) { 288 memcpy(&left, lp, sizeof(uint32_t)); 289 memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t)); 290 left = ntohl(left); 291 right = ntohl(right); 292 293 if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) || 294 SEQ_GEQ(left, right)) { 295 /* SACK entry that's old, or invalid. */ 296 i--; 297 num_sack_blks--; 298 continue; 299 } 300 301 /* Insertion sort. */ 302 for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left); 303 j--) { 304 t_sack_block[j].left = t_sack_block[j - 1].left; 305 t_sack_block[j].right = t_sack_block[j - 1].right; 306 } 307 t_sack_block[j].left = left; 308 t_sack_block[j].right = right; 309 } 310 311 /* Update the scoreboard. */ 312 cur = TAILQ_FIRST(&tp->snd_holes); 313 for (i = 0; i < num_sack_blks; i++) { 314 sack = &t_sack_block[i]; 315 /* 316 * FACK TCP. Update snd_fack so we can enter Fast 317 * Recovery early. 318 */ 319 if (SEQ_GEQ(sack->right, tp->snd_fack)) 320 tp->snd_fack = sack->right; 321 322 if (TAILQ_EMPTY(&tp->snd_holes)) { 323 /* First hole. */ 324 cur = sack_inserthole(tp, th->th_ack, sack->left, NULL); 325 if (cur == NULL) { 326 /* ENOBUFS, bail out*/ 327 return; 328 } 329 tp->rcv_lastsack = sack->right; 330 continue; /* With next sack block */ 331 } 332 333 /* Go through the list of holes. */ 334 while (cur) { 335 if (SEQ_LEQ(sack->right, cur->start)) 336 /* SACKs data before the current hole */ 337 break; /* No use going through more holes */ 338 339 if (SEQ_GEQ(sack->left, cur->end)) { 340 /* SACKs data beyond the current hole */ 341 cur = TAILQ_NEXT(cur, sackhole_q); 342 continue; 343 } 344 345 if (SEQ_LEQ(sack->left, cur->start)) { 346 /* Data acks at least the beginning of hole */ 347 if (SEQ_GEQ(sack->right, cur->end)) { 348 /* Acks entire hole, so delete hole */ 349 cur = sack_removehole(tp, cur); 350 break; 351 } 352 353 /* Otherwise, move start of hole forward */ 354 cur->start = sack->right; 355 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 356 break; 357 } 358 359 if (SEQ_GEQ(sack->right, cur->end)) { 360 /* Move end of hole backward. */ 361 cur->end = sack->left; 362 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 363 cur = TAILQ_NEXT(cur, sackhole_q); 364 break; 365 } 366 367 if (SEQ_LT(cur->start, sack->left) && 368 SEQ_GT(cur->end, sack->right)) { 369 /* 370 * ACKs some data in middle of a hole; need to 371 * split current hole 372 */ 373 tmp = sack_inserthole(tp, sack->right, cur->end, 374 cur); 375 if (tmp == NULL) { 376 return; 377 } 378 tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start); 379 cur->end = sack->left; 380 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 381 cur = tmp; 382 break; 383 } 384 } 385 386 /* At this point, we have reached the tail of the list. */ 387 if (SEQ_LT(tp->rcv_lastsack, sack->left)) { 388 /* 389 * Need to append new hole at end. 390 */ 391 cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left, 392 NULL); 393 if (cur == NULL) { 394 return; 395 } 396 } 397 if (SEQ_LT(tp->rcv_lastsack, sack->right)) { 398 tp->rcv_lastsack = sack->right; 399 } 400 } 401} 402 403/* 404 * tcp_del_sackholes: remove holes covered by a cumulative ACK. 405 */ 406 407void 408tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th) 409{ 410 /* Max because this could be an older ack that just arrived. */ 411 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 412 th->th_ack : tp->snd_una; 413 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); 414 415 while (cur) { 416 if (SEQ_LEQ(cur->end, lastack)) { 417 cur = sack_removehole(tp, cur); 418 } else if (SEQ_LT(cur->start, lastack)) { 419 cur->start = lastack; 420 if (SEQ_LT(cur->rxmit, cur->start)) 421 cur->rxmit = cur->start; 422 break; 423 } else 424 break; 425 } 426} 427 428/* 429 * tcp_free_sackholes: clear the scoreboard. 430 */ 431 432void 433tcp_free_sackholes(struct tcpcb *tp) 434{ 435 struct sackhole *sack; 436 437 /* Free up the SACK hole list. */ 438 while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) { 439 sack_removehole(tp, sack); 440 } 441 KASSERT(tp->snd_numholes == 0); 442} 443 444/* 445 * Returns pointer to a sackhole if there are any pending retransmissions; 446 * NULL otherwise. 447 */ 448struct sackhole * 449tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) 450{ 451 struct sackhole *cur = NULL; 452 453 if (!TCP_SACK_ENABLED(tp)) 454 return (NULL); 455 456 *sack_bytes_rexmt = 0; 457 TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { 458 if (SEQ_LT(cur->rxmit, cur->end)) { 459 if (SEQ_LT(cur->rxmit, tp->snd_una)) { 460 /* old SACK hole */ 461 continue; 462 } 463 *sack_bytes_rexmt += (cur->rxmit - cur->start); 464 break; 465 } 466 *sack_bytes_rexmt += (cur->rxmit - cur->start); 467 } 468 469 return (cur); 470} 471 472/* 473 * After a timeout, the SACK list may be rebuilt. This SACK information 474 * should be used to avoid retransmitting SACKed data. This function 475 * traverses the SACK list to see if snd_nxt should be moved forward. 476 */ 477void 478tcp_sack_adjust(struct tcpcb *tp) 479{ 480 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); 481 struct sackhole *n = NULL; 482 483 if (TAILQ_EMPTY(&tp->snd_holes)) 484 return; /* No holes */ 485 if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack)) 486 return; /* We're already beyond any SACKed blocks */ 487 488 /* 489 * Two cases for which we want to advance snd_nxt: 490 * i) snd_nxt lies between end of one hole and beginning of another 491 * ii) snd_nxt lies between end of last hole and rcv_lastsack 492 */ 493 while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) { 494 if (SEQ_LT(tp->snd_nxt, cur->end)) 495 return; 496 if (SEQ_GEQ(tp->snd_nxt, n->start)) 497 cur = n; 498 else { 499 tp->snd_nxt = n->start; 500 return; 501 } 502 } 503 if (SEQ_LT(tp->snd_nxt, cur->end)) 504 return; 505 tp->snd_nxt = tp->rcv_lastsack; 506 507 return; 508} 509 510/* 511 * tcp_sack_numblks: return the number of SACK blocks to send. 512 */ 513 514int 515tcp_sack_numblks(const struct tcpcb *tp) 516{ 517 int numblks; 518 519 if (!TCP_SACK_ENABLED(tp)) { 520 return 0; 521 } 522 523 numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) + 524 tp->t_segqlen; 525 526 if (numblks == 0) { 527 return 0; 528 } 529 530 if (numblks > TCP_SACK_MAX) { 531 numblks = TCP_SACK_MAX; 532 } 533 534 return numblks; 535} 536 537#if defined(DDB) 538void sack_dump(const struct tcpcb *); 539 540void 541sack_dump(const struct tcpcb *tp) 542{ 543 const struct sackhole *cur; 544 545 printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n", 546 tp->snd_una, tp->snd_max); 547 printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n", 548 tp->rcv_lastsack, tp->snd_fack); 549 printf("numholes=%d\n", tp->snd_numholes); 550 TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { 551 printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n", 552 cur->start, cur->end, cur->rxmit); 553 } 554} 555#endif /* defined(DDB) */ 556