/*
 * Copyright (c) 2004-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
59 * 60 */ 61 62#define _IP_VHL 63 64 65#include <sys/param.h> 66#include <sys/systm.h> 67#include <sys/kernel.h> 68#include <sys/sysctl.h> 69#include <sys/mbuf.h> 70#include <sys/domain.h> 71#include <sys/protosw.h> 72#include <sys/socket.h> 73#include <sys/socketvar.h> 74 75#include <kern/zalloc.h> 76 77#include <net/route.h> 78 79#include <netinet/in.h> 80#include <netinet/in_systm.h> 81#include <netinet/ip.h> 82#include <netinet/in_pcb.h> 83#include <netinet/ip_var.h> 84#if INET6 85#include <netinet6/in6_pcb.h> 86#include <netinet/ip6.h> 87#include <netinet6/ip6_var.h> 88#endif 89#include <netinet/tcp.h> 90//#define TCPOUTFLAGS 91#include <netinet/tcp_fsm.h> 92#include <netinet/tcp_seq.h> 93#include <netinet/tcp_timer.h> 94#include <netinet/tcp_var.h> 95#include <netinet/tcpip.h> 96#if TCPDEBUG 97#include <netinet/tcp_debug.h> 98#endif 99#include <sys/kdebug.h> 100 101#if IPSEC 102#include <netinet6/ipsec.h> 103#endif /*IPSEC*/ 104 105#include <libkern/OSAtomic.h> 106 107int tcp_do_sack = 1; 108SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_sack, 0, 109 "Enable/Disable TCP SACK support"); 110static int tcp_sack_maxholes = 128; 111SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_maxholes, CTLFLAG_RW | CTLFLAG_LOCKED, 112 &tcp_sack_maxholes, 0, 113 "Maximum number of TCP SACK holes allowed per connection"); 114 115static int tcp_sack_globalmaxholes = 65536; 116SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalmaxholes, CTLFLAG_RW | CTLFLAG_LOCKED, 117 &tcp_sack_globalmaxholes, 0, 118 "Global maximum number of TCP SACK holes"); 119 120static SInt32 tcp_sack_globalholes = 0; 121SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalholes, CTLFLAG_RD | CTLFLAG_LOCKED, 122 &tcp_sack_globalholes, 0, 123 "Global number of TCP SACK holes currently allocated"); 124 125extern struct zone *sack_hole_zone; 126 127/* 128 * This function is called upon receipt of new valid data (while not in header 129 * prediction mode), and it updates the ordered list of sacks. 
130 */ 131void 132tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) 133{ 134 /* 135 * First reported block MUST be the most recent one. Subsequent 136 * blocks SHOULD be in the order in which they arrived at the 137 * receiver. These two conditions make the implementation fully 138 * compliant with RFC 2018. 139 */ 140 struct sackblk head_blk, saved_blks[MAX_SACK_BLKS]; 141 int num_head, num_saved, i; 142 143 /* SACK block for the received segment. */ 144 head_blk.start = rcv_start; 145 head_blk.end = rcv_end; 146 147 /* 148 * Merge updated SACK blocks into head_blk, and 149 * save unchanged SACK blocks into saved_blks[]. 150 * num_saved will have the number of the saved SACK blocks. 151 */ 152 num_saved = 0; 153 for (i = 0; i < tp->rcv_numsacks; i++) { 154 tcp_seq start = tp->sackblks[i].start; 155 tcp_seq end = tp->sackblks[i].end; 156 if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { 157 /* 158 * Discard this SACK block. 159 */ 160 } else if (SEQ_LEQ(head_blk.start, end) && 161 SEQ_GEQ(head_blk.end, start)) { 162 /* 163 * Merge this SACK block into head_blk. 164 * This SACK block itself will be discarded. 165 */ 166 if (SEQ_GT(head_blk.start, start)) 167 head_blk.start = start; 168 if (SEQ_LT(head_blk.end, end)) 169 head_blk.end = end; 170 } else { 171 /* 172 * Save this SACK block. 173 */ 174 saved_blks[num_saved].start = start; 175 saved_blks[num_saved].end = end; 176 num_saved++; 177 } 178 } 179 180 /* 181 * Update SACK list in tp->sackblks[]. 182 */ 183 num_head = 0; 184 if (SEQ_GT(head_blk.start, tp->rcv_nxt)) { 185 /* 186 * The received data segment is an out-of-order segment. 187 * Put head_blk at the top of SACK list. 188 */ 189 tp->sackblks[0] = head_blk; 190 num_head = 1; 191 /* 192 * If the number of saved SACK blocks exceeds its limit, 193 * discard the last SACK block. 194 */ 195 if (num_saved >= MAX_SACK_BLKS) 196 num_saved--; 197 } 198 if (num_saved > 0) { 199 /* 200 * Copy the saved SACK blocks back. 
201 */ 202 bcopy(saved_blks, &tp->sackblks[num_head], 203 sizeof(struct sackblk) * num_saved); 204 } 205 206 /* Save the number of SACK blocks. */ 207 tp->rcv_numsacks = num_head + num_saved; 208 209 /* If we are requesting SACK recovery, reset the stretch-ack state 210 * so that connection will generate more acks after recovery and 211 * sender's cwnd will open. 212 */ 213 if ((tp->t_flags & TF_STRETCHACK) != 0 && tp->rcv_numsacks > 0) 214 tcp_reset_stretch_ack(tp); 215 216#if TRAFFIC_MGT 217 if (tp->acc_iaj > 0 && tp->rcv_numsacks > 0) 218 reset_acc_iaj(tp); 219#endif /* TRAFFIC_MGT */ 220} 221 222/* 223 * Delete all receiver-side SACK information. 224 */ 225void 226tcp_clean_sackreport( struct tcpcb *tp) 227{ 228 229 tp->rcv_numsacks = 0; 230 bzero(&tp->sackblks[0], sizeof (struct sackblk) * MAX_SACK_BLKS); 231} 232 233/* 234 * Allocate struct sackhole. 235 */ 236static struct sackhole * 237tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) 238{ 239 struct sackhole *hole; 240 241 if (tp->snd_numholes >= tcp_sack_maxholes || 242 tcp_sack_globalholes >= tcp_sack_globalmaxholes) { 243 tcpstat.tcps_sack_sboverflow++; 244 return NULL; 245 } 246 247 hole = (struct sackhole *)zalloc(sack_hole_zone); 248 if (hole == NULL) 249 return NULL; 250 251 hole->start = start; 252 hole->end = end; 253 hole->rxmit = start; 254 255 tp->snd_numholes++; 256 OSIncrementAtomic(&tcp_sack_globalholes); 257 258 return hole; 259} 260 261/* 262 * Free struct sackhole. 263 */ 264static void 265tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) 266{ 267 zfree(sack_hole_zone, hole); 268 269 tp->snd_numholes--; 270 OSDecrementAtomic(&tcp_sack_globalholes); 271} 272 273/* 274 * Insert new SACK hole into scoreboard. 275 */ 276static struct sackhole * 277tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end, 278 struct sackhole *after) 279{ 280 struct sackhole *hole; 281 282 /* Allocate a new SACK hole. 
*/ 283 hole = tcp_sackhole_alloc(tp, start, end); 284 if (hole == NULL) 285 return NULL; 286 hole->rxmit_start = tcp_now; 287 /* Insert the new SACK hole into scoreboard */ 288 if (after != NULL) 289 TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink); 290 else 291 TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink); 292 293 /* Update SACK hint. */ 294 if (tp->sackhint.nexthole == NULL) 295 tp->sackhint.nexthole = hole; 296 297 return hole; 298} 299 300/* 301 * Remove SACK hole from scoreboard. 302 */ 303static void 304tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole) 305{ 306 /* Update SACK hint. */ 307 if (tp->sackhint.nexthole == hole) 308 tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink); 309 310 /* Remove this SACK hole. */ 311 TAILQ_REMOVE(&tp->snd_holes, hole, scblink); 312 313 /* Free this SACK hole. */ 314 tcp_sackhole_free(tp, hole); 315} 316/* 317 * When a new ack with SACK is received, check if it indicates packet 318 * reordering. If there is packet reordering, the socket is marked and 319 * the late time offset by which the packet was reordered with 320 * respect to its closest neighboring packets is computed. 321 */ 322static void 323tcp_sack_detect_reordering(struct tcpcb *tp, struct sackhole *s, 324 tcp_seq sacked_seq, tcp_seq snd_fack) 325{ 326 int32_t rext = 0, reordered = 0; 327 328 /* 329 * If the SACK hole is past snd_fack, this is from new SACK 330 * information, so we can ignore it. 331 */ 332 if (SEQ_GT(s->end, snd_fack)) 333 return; 334 /* 335 * If there has been a retransmit timeout, then the timestamp on 336 * the SACK segment will be newer. This might lead to a 337 * false-positive. Avoid re-ordering detection in this case. 338 */ 339 if (tp->t_rxtshift > 0) 340 return; 341 342 /* 343 * Detect reordering from SACK information by checking 344 * if recently sacked data was never retransmitted from this hole. 
345 */ 346 if (SEQ_LT(s->rxmit, sacked_seq)) { 347 reordered = 1; 348 tcpstat.tcps_avoid_rxmt++; 349 } 350 351 if (reordered) { 352 if (!(tp->t_flagsext & TF_PKTS_REORDERED)) { 353 tp->t_flagsext |= TF_PKTS_REORDERED; 354 tcpstat.tcps_detect_reordering++; 355 } 356 357 tcpstat.tcps_reordered_pkts++; 358 359 VERIFY(SEQ_GEQ(snd_fack, s->rxmit)); 360 361 if (s->rxmit_start > 0) { 362 rext = timer_diff(tcp_now, 0, s->rxmit_start, 0); 363 if (rext < 0) 364 return; 365 366 /* 367 * We take the maximum reorder window to schedule 368 * DELAYFR timer as that will take care of jitter 369 * on the network path. 370 * 371 * Computing average and standard deviation seems 372 * to cause unnecessary retransmissions when there 373 * is high jitter. 374 * 375 * We set a maximum of SRTT/2 and a minimum of 376 * 10 ms on the reorder window. 377 */ 378 tp->t_reorderwin = max(tp->t_reorderwin, rext); 379 tp->t_reorderwin = min(tp->t_reorderwin, 380 (tp->t_srtt >> (TCP_RTT_SHIFT - 1))); 381 tp->t_reorderwin = max(tp->t_reorderwin, 10); 382 } 383 } 384} 385 386/* 387 * Process cumulative ACK and the TCP SACK option to update the scoreboard. 388 * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of 389 * the sequence space). 390 */ 391void 392tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, 393 u_int32_t *newbytes_acked) 394{ 395 struct sackhole *cur, *temp; 396 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp; 397 int i, j, num_sack_blks; 398 tcp_seq old_snd_fack = 0, th_ack = th->th_ack; 399 400 num_sack_blks = 0; 401 /* 402 * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist, 403 * treat [SND.UNA, SEG.ACK) as if it is a SACK block. 404 */ 405 if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) { 406 sack_blocks[num_sack_blks].start = tp->snd_una; 407 sack_blocks[num_sack_blks++].end = th_ack; 408 } 409 /* 410 * Append received valid SACK blocks to sack_blocks[]. 
411 * Check that the SACK block range is valid. 412 */ 413 for (i = 0; i < to->to_nsacks; i++) { 414 bcopy((to->to_sacks + i * TCPOLEN_SACK), 415 &sack, sizeof(sack)); 416 sack.start = ntohl(sack.start); 417 sack.end = ntohl(sack.end); 418 if (SEQ_GT(sack.end, sack.start) && 419 SEQ_GT(sack.start, tp->snd_una) && 420 SEQ_GT(sack.start, th_ack) && 421 SEQ_LT(sack.start, tp->snd_max) && 422 SEQ_GT(sack.end, tp->snd_una) && 423 SEQ_LEQ(sack.end, tp->snd_max)) 424 sack_blocks[num_sack_blks++] = sack; 425 } 426 427 /* 428 * Return if SND.UNA is not advanced and no valid SACK block 429 * is received. 430 */ 431 if (num_sack_blks == 0) 432 return; 433 434 VERIFY(num_sack_blks <= (TCP_MAX_SACK + 1)); 435 /* 436 * Sort the SACK blocks so we can update the scoreboard 437 * with just one pass. The overhead of sorting upto 4+1 elements 438 * is less than making upto 4+1 passes over the scoreboard. 439 */ 440 for (i = 0; i < num_sack_blks; i++) { 441 for (j = i + 1; j < num_sack_blks; j++) { 442 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 443 sack = sack_blocks[i]; 444 sack_blocks[i] = sack_blocks[j]; 445 sack_blocks[j] = sack; 446 } 447 } 448 } 449 if (TAILQ_EMPTY(&tp->snd_holes)) { 450 /* 451 * Empty scoreboard. Need to initialize snd_fack (it may be 452 * uninitialized or have a bogus value). Scoreboard holes 453 * (from the sack blocks received) are created later below (in 454 * the logic that adds holes to the tail of the scoreboard). 455 */ 456 tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack); 457 *newbytes_acked += (tp->snd_fack - tp->snd_una); 458 } 459 460 old_snd_fack = tp->snd_fack; 461 /* 462 * In the while-loop below, incoming SACK blocks (sack_blocks[]) 463 * and SACK holes (snd_holes) are traversed from their tails with 464 * just one pass in order to reduce the number of compares especially 465 * when the bandwidth-delay product is large. 
466 * Note: Typically, in the first RTT of SACK recovery, the highest 467 * three or four SACK blocks with the same ack number are received. 468 * In the second RTT, if retransmitted data segments are not lost, 469 * the highest three or four SACK blocks with ack number advancing 470 * are received. 471 */ 472 sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */ 473 if (SEQ_LT(tp->snd_fack, sblkp->start)) { 474 /* 475 * The highest SACK block is beyond fack. 476 * Append new SACK hole at the tail. 477 * If the second or later highest SACK blocks are also 478 * beyond the current fack, they will be inserted by 479 * way of hole splitting in the while-loop below. 480 */ 481 temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL); 482 if (temp != NULL) { 483 tp->snd_fack = sblkp->end; 484 *newbytes_acked += (sblkp->end - sblkp->start); 485 486 /* Go to the previous sack block. */ 487 sblkp--; 488 } else { 489 /* 490 * We failed to add a new hole based on the current 491 * sack block. Skip over all the sack blocks that 492 * fall completely to the right of snd_fack and proceed 493 * to trim the scoreboard based on the remaining sack 494 * blocks. This also trims the scoreboard for th_ack 495 * (which is sack_blocks[0]). 496 */ 497 while (sblkp >= sack_blocks && 498 SEQ_LT(tp->snd_fack, sblkp->start)) 499 sblkp--; 500 if (sblkp >= sack_blocks && 501 SEQ_LT(tp->snd_fack, sblkp->end)) { 502 *newbytes_acked += (sblkp->end - tp->snd_fack); 503 tp->snd_fack = sblkp->end; 504 } 505 } 506 } else if (SEQ_LT(tp->snd_fack, sblkp->end)) { 507 /* fack is advanced. */ 508 *newbytes_acked += (sblkp->end - tp->snd_fack); 509 tp->snd_fack = sblkp->end; 510 } 511 /* We must have at least one SACK hole in scoreboard */ 512 cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole */ 513 /* 514 * Since the incoming sack blocks are sorted, we can process them 515 * making one sweep of the scoreboard. 
516 */ 517 while (sblkp >= sack_blocks && cur != NULL) { 518 if (SEQ_GEQ(sblkp->start, cur->end)) { 519 /* 520 * SACKs data beyond the current hole. 521 * Go to the previous sack block. 522 */ 523 sblkp--; 524 continue; 525 } 526 if (SEQ_LEQ(sblkp->end, cur->start)) { 527 /* 528 * SACKs data before the current hole. 529 * Go to the previous hole. 530 */ 531 cur = TAILQ_PREV(cur, sackhole_head, scblink); 532 continue; 533 } 534 tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); 535 if (SEQ_LEQ(sblkp->start, cur->start)) { 536 /* Data acks at least the beginning of hole */ 537 if (SEQ_GEQ(sblkp->end, cur->end)) { 538 /* Acks entire hole, so delete hole */ 539 *newbytes_acked += (cur->end - cur->start); 540 541 tcp_sack_detect_reordering(tp, cur, 542 cur->end, old_snd_fack); 543 temp = cur; 544 cur = TAILQ_PREV(cur, sackhole_head, scblink); 545 tcp_sackhole_remove(tp, temp); 546 /* 547 * The sack block may ack all or part of the next 548 * hole too, so continue onto the next hole. 
549 */ 550 continue; 551 } else { 552 /* Move start of hole forward */ 553 *newbytes_acked += (sblkp->end - cur->start); 554 tcp_sack_detect_reordering(tp, cur, 555 sblkp->end, old_snd_fack); 556 cur->start = sblkp->end; 557 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 558 } 559 } else { 560 /* Data acks at least the end of hole */ 561 if (SEQ_GEQ(sblkp->end, cur->end)) { 562 /* Move end of hole backward */ 563 *newbytes_acked += (cur->end - sblkp->start); 564 tcp_sack_detect_reordering(tp, cur, 565 cur->end, old_snd_fack); 566 cur->end = sblkp->start; 567 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 568 } else { 569 /* 570 * ACKs some data in the middle of a hole; 571 * need to split current hole 572 */ 573 *newbytes_acked += (sblkp->end - sblkp->start); 574 tcp_sack_detect_reordering(tp, cur, 575 sblkp->end, old_snd_fack); 576 temp = tcp_sackhole_insert(tp, sblkp->end, 577 cur->end, cur); 578 if (temp != NULL) { 579 if (SEQ_GT(cur->rxmit, temp->rxmit)) { 580 temp->rxmit = cur->rxmit; 581 tp->sackhint.sack_bytes_rexmit 582 += (temp->rxmit 583 - temp->start); 584 } 585 cur->end = sblkp->start; 586 cur->rxmit = SEQ_MIN(cur->rxmit, 587 cur->end); 588 /* 589 * Reset the rxmit_start to that of 590 * the current hole as that will 591 * help to compute the reorder 592 * window correctly 593 */ 594 temp->rxmit_start = cur->rxmit_start; 595 } 596 } 597 } 598 tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start); 599 /* 600 * Testing sblkp->start against cur->start tells us whether 601 * we're done with the sack block or the sack hole. 602 * Accordingly, we advance one or the other. 603 */ 604 if (SEQ_LEQ(sblkp->start, cur->start)) 605 cur = TAILQ_PREV(cur, sackhole_head, scblink); 606 else 607 sblkp--; 608 } 609} 610 611/* 612 * Free all SACK holes to clear the scoreboard. 
613 */ 614void 615tcp_free_sackholes(struct tcpcb *tp) 616{ 617 struct sackhole *q; 618 619 while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) 620 tcp_sackhole_remove(tp, q); 621 tp->sackhint.sack_bytes_rexmit = 0; 622 tp->sackhint.nexthole = NULL; 623 tp->sack_newdata = 0; 624 625} 626 627/* 628 * Partial ack handling within a sack recovery episode. 629 * Keeping this very simple for now. When a partial ack 630 * is received, force snd_cwnd to a value that will allow 631 * the sender to transmit no more than 2 segments. 632 * If necessary, a better scheme can be adopted at a 633 * later point, but for now, the goal is to prevent the 634 * sender from bursting a large amount of data in the midst 635 * of sack recovery. 636 */ 637void 638tcp_sack_partialack(tp, th) 639 struct tcpcb *tp; 640 struct tcphdr *th; 641{ 642 int num_segs = 1; 643 644 tp->t_timer[TCPT_REXMT] = 0; 645 tp->t_rtttime = 0; 646 /* send one or 2 segments based on how much new data was acked */ 647 if (((BYTES_ACKED(th, tp)) / tp->t_maxseg) > 2) 648 num_segs = 2; 649 tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + 650 (tp->snd_nxt - tp->sack_newdata) + 651 num_segs * tp->t_maxseg); 652 if (tp->snd_cwnd > tp->snd_ssthresh) 653 tp->snd_cwnd = tp->snd_ssthresh; 654 tp->t_flags |= TF_ACKNOW; 655 (void) tcp_output(tp); 656} 657 658/* 659 * Debug version of tcp_sack_output() that walks the scoreboard. Used for 660 * now to sanity check the hint. 
661 */ 662static struct sackhole * 663tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt) 664{ 665 struct sackhole *p; 666 667 *sack_bytes_rexmt = 0; 668 TAILQ_FOREACH(p, &tp->snd_holes, scblink) { 669 if (SEQ_LT(p->rxmit, p->end)) { 670 if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ 671 continue; 672 } 673 *sack_bytes_rexmt += (p->rxmit - p->start); 674 break; 675 } 676 *sack_bytes_rexmt += (p->rxmit - p->start); 677 } 678 return (p); 679} 680 681/* 682 * Returns the next hole to retransmit and the number of retransmitted bytes 683 * from the scoreboard. We store both the next hole and the number of 684 * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK 685 * reception). This avoids scoreboard traversals completely. 686 * 687 * The loop here will traverse *at most* one link. Here's the argument. 688 * For the loop to traverse more than 1 link before finding the next hole to 689 * retransmit, we would need to have at least 1 node following the current hint 690 * with (rxmit == end). But, for all holes following the current hint, 691 * (start == rxmit), since we have not yet retransmitted from them. Therefore, 692 * in order to traverse more 1 link in the loop below, we need to have at least 693 * one node following the current hint with (start == rxmit == end). 694 * But that can't happen, (start == end) means that all the data in that hole 695 * has been sacked, in which case, the hole would have been removed from the 696 * scoreboard. 
697 */ 698struct sackhole * 699tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) 700{ 701 struct sackhole *hole = NULL, *dbg_hole = NULL; 702 int dbg_bytes_rexmt; 703 704 dbg_hole = tcp_sack_output_debug(tp, &dbg_bytes_rexmt); 705 *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit; 706 hole = tp->sackhint.nexthole; 707 if (hole == NULL || SEQ_LT(hole->rxmit, hole->end)) 708 goto out; 709 while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) { 710 if (SEQ_LT(hole->rxmit, hole->end)) { 711 tp->sackhint.nexthole = hole; 712 break; 713 } 714 } 715out: 716 if (dbg_hole != hole) { 717 printf("%s: Computed sack hole not the same as cached value\n", __func__); 718 hole = dbg_hole; 719 } 720 if (*sack_bytes_rexmt != dbg_bytes_rexmt) { 721 printf("%s: Computed sack_bytes_retransmitted (%d) not " 722 "the same as cached value (%d)\n", 723 __func__, dbg_bytes_rexmt, *sack_bytes_rexmt); 724 *sack_bytes_rexmt = dbg_bytes_rexmt; 725 } 726 return (hole); 727} 728 729/* 730 * After a timeout, the SACK list may be rebuilt. This SACK information 731 * should be used to avoid retransmitting SACKed data. This function 732 * traverses the SACK list to see if snd_nxt should be moved forward. 
733 */ 734void 735tcp_sack_adjust(struct tcpcb *tp) 736{ 737 struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); 738 739 if (cur == NULL) 740 return; /* No holes */ 741 if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) 742 return; /* We're already beyond any SACKed blocks */ 743 /* 744 * Two cases for which we want to advance snd_nxt: 745 * i) snd_nxt lies between end of one hole and beginning of another 746 * ii) snd_nxt lies between end of last hole and snd_fack 747 */ 748 while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { 749 if (SEQ_LT(tp->snd_nxt, cur->end)) 750 return; 751 if (SEQ_GEQ(tp->snd_nxt, p->start)) 752 cur = p; 753 else { 754 tp->snd_nxt = p->start; 755 return; 756 } 757 } 758 if (SEQ_LT(tp->snd_nxt, cur->end)) 759 return; 760 tp->snd_nxt = tp->snd_fack; 761 return; 762} 763 764/* 765 * This function returns true if more than (tcprexmtthresh - 1) * SMSS 766 * bytes with sequence numbers greater than snd_una have been SACKed. 767 */ 768boolean_t 769tcp_sack_byte_islost(struct tcpcb *tp) 770{ 771 u_int32_t unacked_bytes, sndhole_bytes = 0; 772 struct sackhole *sndhole; 773 if (!SACK_ENABLED(tp) || IN_FASTRECOVERY(tp) || 774 TAILQ_EMPTY(&tp->snd_holes) || 775 (tp->t_flagsext & TF_PKTS_REORDERED)) 776 return (FALSE); 777 778 unacked_bytes = tp->snd_max - tp->snd_una; 779 780 TAILQ_FOREACH(sndhole, &tp->snd_holes, scblink) { 781 sndhole_bytes += (sndhole->end - sndhole->start); 782 } 783 784 VERIFY(unacked_bytes >= sndhole_bytes); 785 return ((unacked_bytes - sndhole_bytes) > 786 ((tcprexmtthresh - 1) * tp->t_maxseg)); 787} 788