tcp_congctl.c revision 1.21
1/* $NetBSD: tcp_congctl.c,v 1.21 2016/04/26 08:44:44 ozaki-r Exp $ */ 2 3/*- 4 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 9 * Facility, NASA Ames Research Center. 10 * This code is derived from software contributed to The NetBSD Foundation 11 * by Charles M. Hannum. 12 * This code is derived from software contributed to The NetBSD Foundation 13 * by Rui Paulo. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 26 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 27 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 28 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 30 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 31 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 32 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 33 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 * POSSIBILITY OF SUCH DAMAGE. 35 */ 36 37/* 38 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 39 * All rights reserved. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the project nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 */ 65 66/* 67 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 68 * 69 * NRL grants permission for redistribution and use in source and binary 70 * forms, with or without modification, of the software and documentation 71 * created at NRL provided that the following conditions are met: 72 * 73 * 1. Redistributions of source code must retain the above copyright 74 * notice, this list of conditions and the following disclaimer. 75 * 2. Redistributions in binary form must reproduce the above copyright 76 * notice, this list of conditions and the following disclaimer in the 77 * documentation and/or other materials provided with the distribution. 78 * 3. All advertising materials mentioning features or use of this software 79 * must display the following acknowledgements: 80 * This product includes software developed by the University of 81 * California, Berkeley and its contributors. 82 * This product includes software developed at the Information 83 * Technology Division, US Naval Research Laboratory. 84 * 4. Neither the name of the NRL nor the names of its contributors 85 * may be used to endorse or promote products derived from this software 86 * without specific prior written permission. 87 * 88 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 89 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 90 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 91 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 92 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 93 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 94 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 95 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 96 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 97 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 98 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 99 * 100 * The views and conclusions contained in the software and documentation 101 * are those of the authors and should not be interpreted as representing 102 * official policies, either expressed or implied, of the US Naval 103 * Research Laboratory (NRL). 104 */ 105 106/* 107 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 108 * The Regents of the University of California. All rights reserved. 109 * 110 * Redistribution and use in source and binary forms, with or without 111 * modification, are permitted provided that the following conditions 112 * are met: 113 * 1. Redistributions of source code must retain the above copyright 114 * notice, this list of conditions and the following disclaimer. 115 * 2. Redistributions in binary form must reproduce the above copyright 116 * notice, this list of conditions and the following disclaimer in the 117 * documentation and/or other materials provided with the distribution. 118 * 3. Neither the name of the University nor the names of its contributors 119 * may be used to endorse or promote products derived from this software 120 * without specific prior written permission. 121 * 122 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 123 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 124 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 125 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 126 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 127 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 128 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 129 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 130 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 131 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 132 * SUCH DAMAGE. 133 * 134 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 135 */ 136 137#include <sys/cdefs.h> 138__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.21 2016/04/26 08:44:44 ozaki-r Exp $"); 139 140#ifdef _KERNEL_OPT 141#include "opt_inet.h" 142#include "opt_tcp_debug.h" 143#include "opt_tcp_congctl.h" 144#endif 145 146#include <sys/param.h> 147#include <sys/systm.h> 148#include <sys/malloc.h> 149#include <sys/mbuf.h> 150#include <sys/protosw.h> 151#include <sys/socket.h> 152#include <sys/socketvar.h> 153#include <sys/errno.h> 154#include <sys/syslog.h> 155#include <sys/pool.h> 156#include <sys/domain.h> 157#include <sys/kernel.h> 158#include <sys/mutex.h> 159 160#include <net/if.h> 161 162#include <netinet/in.h> 163#include <netinet/in_systm.h> 164#include <netinet/ip.h> 165#include <netinet/in_pcb.h> 166#include <netinet/in_var.h> 167#include <netinet/ip_var.h> 168 169#ifdef INET6 170#ifndef INET 171#include <netinet/in.h> 172#endif 173#include <netinet/ip6.h> 174#include <netinet6/ip6_var.h> 175#include <netinet6/in6_pcb.h> 176#include <netinet6/ip6_var.h> 177#include <netinet6/in6_var.h> 178#include <netinet/icmp6.h> 179#include <netinet6/nd6.h> 180#endif 181 182#include <netinet/tcp.h> 183#include <netinet/tcp_fsm.h> 184#include <netinet/tcp_seq.h> 185#include <netinet/tcp_timer.h> 186#include <netinet/tcp_var.h> 187#include <netinet/tcpip.h> 188#include <netinet/tcp_congctl.h> 189#ifdef TCP_DEBUG 190#include <netinet/tcp_debug.h> 191#endif 192 193/* 194 * TODO: 195 * consider separating the actual implementations in another file. 196 */ 197 198static void tcp_common_congestion_exp(struct tcpcb *, int, int); 199 200static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *); 201static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *); 202static void tcp_reno_slow_retransmit(struct tcpcb *); 203static void tcp_reno_fast_retransmit_newack(struct tcpcb *, 204 const struct tcphdr *); 205static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *); 206static void tcp_reno_congestion_exp(struct tcpcb *tp); 207 208static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *); 209static void tcp_newreno_fast_retransmit_newack(struct tcpcb *, 210 const struct tcphdr *); 211static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *); 212 213static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *); 214static void tcp_cubic_slow_retransmit(struct tcpcb *tp); 215static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *); 216static void tcp_cubic_congestion_exp(struct tcpcb *); 217 218static void tcp_congctl_fillnames(void); 219 220extern int tcprexmtthresh; 221 222MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures"); 223 224/* currently selected global congestion control */ 225char tcp_congctl_global_name[TCPCC_MAXLEN]; 226 227/* available global congestion control algorithms */ 228char tcp_congctl_avail[10 * TCPCC_MAXLEN]; 229 230/* 231 * Used to list the available congestion control algorithms. 232 */ 233TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd = 234 TAILQ_HEAD_INITIALIZER(tcp_congctlhd); 235 236static struct tcp_congctlent * tcp_congctl_global; 237 238static kmutex_t tcp_congctl_mtx; 239 240void 241tcp_congctl_init(void) 242{ 243 int r __diagused; 244 245 mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE); 246 247 /* Base algorithms. */ 248 r = tcp_congctl_register("reno", &tcp_reno_ctl); 249 KASSERT(r == 0); 250 r = tcp_congctl_register("newreno", &tcp_newreno_ctl); 251 KASSERT(r == 0); 252 r = tcp_congctl_register("cubic", &tcp_cubic_ctl); 253 KASSERT(r == 0); 254 255 /* NewReno is the default. */ 256#ifndef TCP_CONGCTL_DEFAULT 257#define TCP_CONGCTL_DEFAULT "newreno" 258#endif 259 260 r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT); 261 KASSERT(r == 0); 262} 263 264/* 265 * Register a congestion algorithm and select it if we have none. 266 */ 267int 268tcp_congctl_register(const char *name, const struct tcp_congctl *tcc) 269{ 270 struct tcp_congctlent *ntcc, *tccp; 271 272 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) 273 if (!strcmp(name, tccp->congctl_name)) { 274 /* name already registered */ 275 return EEXIST; 276 } 277 278 ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO); 279 280 strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1); 281 ntcc->congctl_ctl = tcc; 282 283 TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent); 284 tcp_congctl_fillnames(); 285 286 if (TAILQ_FIRST(&tcp_congctlhd) == ntcc) 287 tcp_congctl_select(NULL, name); 288 289 return 0; 290} 291 292int 293tcp_congctl_unregister(const char *name) 294{ 295 struct tcp_congctlent *tccp, *rtccp; 296 unsigned int size; 297 298 rtccp = NULL; 299 size = 0; 300 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 301 if (!strcmp(name, tccp->congctl_name)) 302 rtccp = tccp; 303 size++; 304 } 305 306 if (!rtccp) 307 return ENOENT; 308 309 if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt) 310 return EBUSY; 311 312 TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent); 313 free(rtccp, M_TCPCONGCTL); 314 tcp_congctl_fillnames(); 315 316 return 0; 317} 318 319/* 320 * Select a congestion algorithm by name. 321 */ 322int 323tcp_congctl_select(struct tcpcb *tp, const char *name) 324{ 325 struct tcp_congctlent *tccp, *old_tccp, *new_tccp; 326 bool old_found, new_found; 327 328 KASSERT(name); 329 330 old_found = (tp == NULL || tp->t_congctl == NULL); 331 old_tccp = NULL; 332 new_found = false; 333 new_tccp = NULL; 334 335 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 336 if (!old_found && tccp->congctl_ctl == tp->t_congctl) { 337 old_tccp = tccp; 338 old_found = true; 339 } 340 341 if (!new_found && !strcmp(name, tccp->congctl_name)) { 342 new_tccp = tccp; 343 new_found = true; 344 } 345 346 if (new_found && old_found) { 347 if (tp) { 348 mutex_enter(&tcp_congctl_mtx); 349 if (old_tccp) 350 old_tccp->congctl_refcnt--; 351 tp->t_congctl = new_tccp->congctl_ctl; 352 new_tccp->congctl_refcnt++; 353 mutex_exit(&tcp_congctl_mtx); 354 } else { 355 tcp_congctl_global = new_tccp; 356 strlcpy(tcp_congctl_global_name, 357 new_tccp->congctl_name, 358 sizeof(tcp_congctl_global_name) - 1); 359 } 360 return 0; 361 } 362 } 363 364 return EINVAL; 365} 366 367void 368tcp_congctl_release(struct tcpcb *tp) 369{ 370 struct tcp_congctlent *tccp; 371 372 KASSERT(tp->t_congctl); 373 374 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 375 if (tccp->congctl_ctl == tp->t_congctl) { 376 tccp->congctl_refcnt--; 377 return; 378 } 379 } 380} 381 382/* 383 * Returns the name of a congestion algorithm. 384 */ 385const char * 386tcp_congctl_bystruct(const struct tcp_congctl *tcc) 387{ 388 struct tcp_congctlent *tccp; 389 390 KASSERT(tcc); 391 392 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) 393 if (tccp->congctl_ctl == tcc) 394 return tccp->congctl_name; 395 396 return NULL; 397} 398 399static void 400tcp_congctl_fillnames(void) 401{ 402 struct tcp_congctlent *tccp; 403 const char *delim = " "; 404 405 tcp_congctl_avail[0] = '\0'; 406 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 407 strlcat(tcp_congctl_avail, tccp->congctl_name, 408 sizeof(tcp_congctl_avail) - 1); 409 if (TAILQ_NEXT(tccp, congctl_ent)) 410 strlcat(tcp_congctl_avail, delim, 411 sizeof(tcp_congctl_avail) - 1); 412 } 413 414} 415 416/* ------------------------------------------------------------------------ */ 417 418/* 419 * Common stuff 420 */ 421 422/* Window reduction (1-beta) for [New]Reno: 0.5 */ 423#define RENO_BETAA 1 424#define RENO_BETAB 2 425/* Window reduction (1-beta) for Cubic: 0.8 */ 426#define CUBIC_BETAA 4 427#define CUBIC_BETAB 5 428/* Draft Rhee Section 4.1 */ 429#define CUBIC_CA 4 430#define CUBIC_CB 10 431 432static void 433tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab) 434{ 435 u_int win; 436 437 /* 438 * Reduce the congestion window and the slow start threshold. 439 */ 440 win = min(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz; 441 if (win < 2) 442 win = 2; 443 444 tp->snd_ssthresh = win * tp->t_segsz; 445 tp->snd_recover = tp->snd_max; 446 tp->snd_cwnd = tp->snd_ssthresh; 447 448 /* 449 * When using TCP ECN, notify the peer that 450 * we reduced the cwnd. 451 */ 452 if (TCP_ECN_ALLOWED(tp)) 453 tp->t_flags |= TF_ECN_SND_CWR; 454} 455 456 457/* ------------------------------------------------------------------------ */ 458 459/* 460 * TCP/Reno congestion control. 461 */ 462static void 463tcp_reno_congestion_exp(struct tcpcb *tp) 464{ 465 466 tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB); 467} 468 469static int 470tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) 471{ 472 /* 473 * Dup acks mean that packets have left the 474 * network (they're now cached at the receiver) 475 * so bump cwnd by the amount in the receiver 476 * to keep a constant cwnd packets in the 477 * network. 478 * 479 * If we are using TCP/SACK, then enter 480 * Fast Recovery if the receiver SACKs 481 * data that is tcprexmtthresh * MSS 482 * bytes past the last ACKed segment, 483 * irrespective of the number of DupAcks. 484 */ 485 486 tcp_seq onxt = tp->snd_nxt; 487 488 tp->t_partialacks = 0; 489 TCP_TIMER_DISARM(tp, TCPT_REXMT); 490 tp->t_rtttime = 0; 491 if (TCP_SACK_ENABLED(tp)) { 492 tp->t_dupacks = tcprexmtthresh; 493 tp->sack_newdata = tp->snd_nxt; 494 tp->snd_cwnd = tp->t_segsz; 495 (void) tcp_output(tp); 496 return 0; 497 } 498 tp->snd_nxt = th->th_ack; 499 tp->snd_cwnd = tp->t_segsz; 500 (void) tcp_output(tp); 501 tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks; 502 if (SEQ_GT(onxt, tp->snd_nxt)) 503 tp->snd_nxt = onxt; 504 505 return 0; 506} 507 508static int 509tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) 510{ 511 512 /* 513 * We know we're losing at the current 514 * window size so do congestion avoidance 515 * (set ssthresh to half the current window 516 * and pull our congestion window back to 517 * the new ssthresh). 518 */ 519 520 tcp_reno_congestion_exp(tp); 521 return tcp_reno_do_fast_retransmit(tp, th); 522} 523 524static void 525tcp_reno_slow_retransmit(struct tcpcb *tp) 526{ 527 u_int win; 528 529 /* 530 * Close the congestion window down to one segment 531 * (we'll open it by one segment for each ack we get). 532 * Since we probably have a window's worth of unacked 533 * data accumulated, this "slow start" keeps us from 534 * dumping all that data as back-to-back packets (which 535 * might overwhelm an intermediate gateway). 536 * 537 * There are two phases to the opening: Initially we 538 * open by one mss on each ack. This makes the window 539 * size increase exponentially with time. If the 540 * window is larger than the path can handle, this 541 * exponential growth results in dropped packet(s) 542 * almost immediately. To get more time between 543 * drops but still "push" the network to take advantage 544 * of improving conditions, we switch from exponential 545 * to linear window opening at some threshhold size. 546 * For a threshhold, we use half the current window 547 * size, truncated to a multiple of the mss. 548 * 549 * (the minimum cwnd that will give us exponential 550 * growth is 2 mss. We don't allow the threshhold 551 * to go below this.) 552 */ 553 554 win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz; 555 if (win < 2) 556 win = 2; 557 /* Loss Window MUST be one segment. */ 558 tp->snd_cwnd = tp->t_segsz; 559 tp->snd_ssthresh = win * tp->t_segsz; 560 tp->t_partialacks = -1; 561 tp->t_dupacks = 0; 562 tp->t_bytes_acked = 0; 563 564 if (TCP_ECN_ALLOWED(tp)) 565 tp->t_flags |= TF_ECN_SND_CWR; 566} 567 568static void 569tcp_reno_fast_retransmit_newack(struct tcpcb *tp, 570 const struct tcphdr *th) 571{ 572 if (tp->t_partialacks < 0) { 573 /* 574 * We were not in fast recovery. Reset the duplicate ack 575 * counter. 576 */ 577 tp->t_dupacks = 0; 578 } else { 579 /* 580 * Clamp the congestion window to the crossover point and 581 * exit fast recovery. 582 */ 583 if (tp->snd_cwnd > tp->snd_ssthresh) 584 tp->snd_cwnd = tp->snd_ssthresh; 585 tp->t_partialacks = -1; 586 tp->t_dupacks = 0; 587 tp->t_bytes_acked = 0; 588 if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack)) 589 tp->snd_fack = th->th_ack; 590 } 591} 592 593static void 594tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th) 595{ 596 /* 597 * When new data is acked, open the congestion window. 598 */ 599 600 u_int cw = tp->snd_cwnd; 601 u_int incr = tp->t_segsz; 602 603 if (tcp_do_abc) { 604 605 /* 606 * RFC 3465 Appropriate Byte Counting (ABC) 607 */ 608 609 int acked = th->th_ack - tp->snd_una; 610 611 if (cw >= tp->snd_ssthresh) { 612 tp->t_bytes_acked += acked; 613 if (tp->t_bytes_acked >= cw) { 614 /* Time to increase the window. */ 615 tp->t_bytes_acked -= cw; 616 } else { 617 /* No need to increase yet. */ 618 incr = 0; 619 } 620 } else { 621 /* 622 * use 2*SMSS or 1*SMSS for the "L" param, 623 * depending on sysctl setting. 624 * 625 * (See RFC 3465 2.3 Choosing the Limit) 626 */ 627 u_int abc_lim; 628 629 abc_lim = (tcp_abc_aggressive == 0 || 630 tp->snd_nxt != tp->snd_max) ? incr : incr * 2; 631 incr = min(acked, abc_lim); 632 } 633 } else { 634 635 /* 636 * If the window gives us less than ssthresh packets 637 * in flight, open exponentially (segsz per packet). 638 * Otherwise open linearly: segsz per window 639 * (segsz^2 / cwnd per packet). 640 */ 641 642 if (cw >= tp->snd_ssthresh) { 643 incr = incr * incr / cw; 644 } 645 } 646 647 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale); 648} 649 650const struct tcp_congctl tcp_reno_ctl = { 651 .fast_retransmit = tcp_reno_fast_retransmit, 652 .slow_retransmit = tcp_reno_slow_retransmit, 653 .fast_retransmit_newack = tcp_reno_fast_retransmit_newack, 654 .newack = tcp_reno_newack, 655 .cong_exp = tcp_reno_congestion_exp, 656}; 657 658/* 659 * TCP/NewReno Congestion control. 660 */ 661static int 662tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) 663{ 664 665 if (SEQ_LT(th->th_ack, tp->snd_high)) { 666 /* 667 * False fast retransmit after timeout. 668 * Do not enter fast recovery 669 */ 670 tp->t_dupacks = 0; 671 return 1; 672 } 673 /* 674 * Fast retransmit is same as reno. 675 */ 676 return tcp_reno_fast_retransmit(tp, th); 677} 678 679/* 680 * Implement the NewReno response to a new ack, checking for partial acks in 681 * fast recovery. 682 */ 683static void 684tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th) 685{ 686 if (tp->t_partialacks < 0) { 687 /* 688 * We were not in fast recovery. Reset the duplicate ack 689 * counter. 690 */ 691 tp->t_dupacks = 0; 692 } else if (SEQ_LT(th->th_ack, tp->snd_recover)) { 693 /* 694 * This is a partial ack. Retransmit the first unacknowledged 695 * segment and deflate the congestion window by the amount of 696 * acknowledged data. Do not exit fast recovery. 697 */ 698 tcp_seq onxt = tp->snd_nxt; 699 u_long ocwnd = tp->snd_cwnd; 700 int sack_num_segs = 1, sack_bytes_rxmt = 0; 701 702 /* 703 * snd_una has not yet been updated and the socket's send 704 * buffer has not yet drained off the ACK'd data, so we 705 * have to leave snd_una as it was to get the correct data 706 * offset in tcp_output(). 707 */ 708 tp->t_partialacks++; 709 TCP_TIMER_DISARM(tp, TCPT_REXMT); 710 tp->t_rtttime = 0; 711 tp->snd_nxt = th->th_ack; 712 713 if (TCP_SACK_ENABLED(tp)) { 714 /* 715 * Partial ack handling within a sack recovery episode. 716 * Keeping this very simple for now. When a partial ack 717 * is received, force snd_cwnd to a value that will 718 * allow the sender to transmit no more than 2 segments. 719 * If necessary, a fancier scheme can be adopted at a 720 * later point, but for now, the goal is to prevent the 721 * sender from bursting a large amount of data in the 722 * midst of sack recovery. 723 */ 724 725 /* 726 * send one or 2 segments based on how much 727 * new data was acked 728 */ 729 if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2) 730 sack_num_segs = 2; 731 (void)tcp_sack_output(tp, &sack_bytes_rxmt); 732 tp->snd_cwnd = sack_bytes_rxmt + 733 (tp->snd_nxt - tp->sack_newdata) + 734 sack_num_segs * tp->t_segsz; 735 tp->t_flags |= TF_ACKNOW; 736 (void) tcp_output(tp); 737 } else { 738 /* 739 * Set snd_cwnd to one segment beyond ACK'd offset 740 * snd_una is not yet updated when we're called 741 */ 742 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); 743 (void) tcp_output(tp); 744 tp->snd_cwnd = ocwnd; 745 if (SEQ_GT(onxt, tp->snd_nxt)) 746 tp->snd_nxt = onxt; 747 /* 748 * Partial window deflation. Relies on fact that 749 * tp->snd_una not updated yet. 750 */ 751 tp->snd_cwnd -= (th->th_ack - tp->snd_una - 752 tp->t_segsz); 753 } 754 } else { 755 /* 756 * Complete ack. Inflate the congestion window to ssthresh 757 * and exit fast recovery. 758 * 759 * Window inflation should have left us with approx. 760 * snd_ssthresh outstanding data. But in case we 761 * would be inclined to send a burst, better to do 762 * it via the slow start mechanism. 763 */ 764 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 765 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 766 + tp->t_segsz; 767 else 768 tp->snd_cwnd = tp->snd_ssthresh; 769 tp->t_partialacks = -1; 770 tp->t_dupacks = 0; 771 tp->t_bytes_acked = 0; 772 if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack)) 773 tp->snd_fack = th->th_ack; 774 } 775} 776 777static void 778tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th) 779{ 780 /* 781 * If we are still in fast recovery (meaning we are using 782 * NewReno and we have only received partial acks), do not 783 * inflate the window yet. 784 */ 785 if (tp->t_partialacks < 0) 786 tcp_reno_newack(tp, th); 787} 788 789 790const struct tcp_congctl tcp_newreno_ctl = { 791 .fast_retransmit = tcp_newreno_fast_retransmit, 792 .slow_retransmit = tcp_reno_slow_retransmit, 793 .fast_retransmit_newack = tcp_newreno_fast_retransmit_newack, 794 .newack = tcp_newreno_newack, 795 .cong_exp = tcp_reno_congestion_exp, 796}; 797 798/* 799 * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02 800 */ 801 802/* Cubic prototypes */ 803static void tcp_cubic_update_ctime(struct tcpcb *tp); 804static uint32_t tcp_cubic_diff_ctime(struct tcpcb *); 805static uint32_t tcp_cubic_cbrt(uint32_t); 806static ulong tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t); 807 808/* Cubic TIME functions - XXX I don't like using timevals and microuptime */ 809/* 810 * Set congestion timer to now 811 */ 812static void 813tcp_cubic_update_ctime(struct tcpcb *tp) 814{ 815 struct timeval now_timeval; 816 817 getmicrouptime(&now_timeval); 818 tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 + 819 now_timeval.tv_usec / 1000; 820} 821 822/* 823 * miliseconds from last congestion 824 */ 825static uint32_t 826tcp_cubic_diff_ctime(struct tcpcb *tp) 827{ 828 struct timeval now_timeval; 829 830 getmicrouptime(&now_timeval); 831 return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 - 832 tp->snd_cubic_ctime; 833} 834 835/* 836 * Approximate cubic root 837 */ 838#define CBRT_ROUNDS 30 839static uint32_t 840tcp_cubic_cbrt(uint32_t v) 841{ 842 int i, rounds = CBRT_ROUNDS; 843 uint64_t x = v / 3; 844 845 /* We fail to calculate correct for small numbers */ 846 if (v == 0) 847 return 0; 848 else if (v < 4) 849 return 1; 850 851 /* 852 * largest x that 2*x^3+3*x fits 64bit 853 * Avoid overflow for a time cost 854 */ 855 if (x > 2097151) 856 rounds += 10; 857 858 for (i = 0; i < rounds; i++) 859 if (rounds == CBRT_ROUNDS) 860 x = (v + 2 * x * x * x) / (3 * x * x); 861 else 862 /* Avoid overflow */ 863 x = v / (3 * x * x) + 2 * x / 3; 864 865 return (uint32_t)x; 866} 867 868/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */ 869static ulong 870tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt) 871{ 872 uint32_t K; 873 long tK3; 874 875 /* Section 3.1 Eq. 2 */ 876 K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB * 877 CUBIC_CB / CUBIC_CA); 878 /* (t-K)^3 - not clear why is the measure unit mattering */ 879 tK3 = (long)(ms_elapsed + rtt) - (long)K; 880 tK3 = tK3 * tK3 * tK3; 881 882 return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax; 883} 884 885static void 886tcp_cubic_congestion_exp(struct tcpcb *tp) 887{ 888 889 /* 890 * Congestion - Set WMax and shrink cwnd 891 */ 892 tcp_cubic_update_ctime(tp); 893 894 /* Section 3.6 - Fast Convergence */ 895 if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) { 896 tp->snd_cubic_wmax_last = tp->snd_cubic_wmax; 897 tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 + 898 tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2; 899 } else { 900 tp->snd_cubic_wmax_last = tp->snd_cubic_wmax; 901 tp->snd_cubic_wmax = tp->snd_cwnd; 902 } 903 904 tp->snd_cubic_wmax = max(tp->t_segsz, tp->snd_cubic_wmax); 905 906 /* Shrink CWND */ 907 tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB); 908} 909 910static int 911tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) 912{ 913 914 if (SEQ_LT(th->th_ack, tp->snd_high)) { 915 /* See newreno */ 916 tp->t_dupacks = 0; 917 return 1; 918 } 919 920 /* 921 * mark WMax 922 */ 923 tcp_cubic_congestion_exp(tp); 924 925 /* Do fast retransmit */ 926 return tcp_reno_do_fast_retransmit(tp, th); 927} 928 929static void 930tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th) 931{ 932 uint32_t ms_elapsed, rtt; 933 u_long w_tcp; 934 935 /* Congestion avoidance and not in fast recovery and usable rtt */ 936 if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 && 937 /* 938 * t_srtt is 1/32 units of slow ticks 939 * converting it in ms would be equal to 940 * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ 941 */ 942 (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) { 943 ms_elapsed = tcp_cubic_diff_ctime(tp); 944 945 /* Compute W_tcp(t) */ 946 w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB + 947 ms_elapsed / rtt / 3; 948 949 if (tp->snd_cwnd > w_tcp) { 950 /* Not in TCP friendly mode */ 951 tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) - 952 tp->snd_cwnd) / tp->snd_cwnd; 953 } else { 954 /* friendly TCP mode */ 955 tp->snd_cwnd = w_tcp; 956 } 957 958 /* Make sure we are within limits */ 959 tp->snd_cwnd = max(tp->snd_cwnd, tp->t_segsz); 960 tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale); 961 } else { 962 /* Use New Reno */ 963 tcp_newreno_newack(tp, th); 964 } 965} 966 967static void 968tcp_cubic_slow_retransmit(struct tcpcb *tp) 969{ 970 971 /* Timeout - Mark new congestion */ 972 tcp_cubic_congestion_exp(tp); 973 974 /* Loss Window MUST be one segment. */ 975 tp->snd_cwnd = tp->t_segsz; 976 tp->t_partialacks = -1; 977 tp->t_dupacks = 0; 978 tp->t_bytes_acked = 0; 979 980 if (TCP_ECN_ALLOWED(tp)) 981 tp->t_flags |= TF_ECN_SND_CWR; 982} 983 984const struct tcp_congctl tcp_cubic_ctl = { 985 .fast_retransmit = tcp_cubic_fast_retransmit, 986 .slow_retransmit = tcp_cubic_slow_retransmit, 987 .fast_retransmit_newack = tcp_newreno_fast_retransmit_newack, 988 .newack = tcp_cubic_newack, 989 .cong_exp = tcp_cubic_congestion_exp, 990}; 991