tcp_congctl.c revision 1.25
1/* $NetBSD: tcp_congctl.c,v 1.25 2018/05/03 07:13:48 maxv Exp $ */ 2 3/*- 4 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 9 * Facility, NASA Ames Research Center. 10 * This code is derived from software contributed to The NetBSD Foundation 11 * by Charles M. Hannum. 12 * This code is derived from software contributed to The NetBSD Foundation 13 * by Rui Paulo. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 26 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 27 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 28 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 30 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 31 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 32 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 33 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 * POSSIBILITY OF SUCH DAMAGE. 35 */ 36 37/* 38 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 39 * All rights reserved. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the project nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 */ 65 66/* 67 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 68 * 69 * NRL grants permission for redistribution and use in source and binary 70 * forms, with or without modification, of the software and documentation 71 * created at NRL provided that the following conditions are met: 72 * 73 * 1. Redistributions of source code must retain the above copyright 74 * notice, this list of conditions and the following disclaimer. 75 * 2. Redistributions in binary form must reproduce the above copyright 76 * notice, this list of conditions and the following disclaimer in the 77 * documentation and/or other materials provided with the distribution. 78 * 3. All advertising materials mentioning features or use of this software 79 * must display the following acknowledgements: 80 * This product includes software developed by the University of 81 * California, Berkeley and its contributors. 82 * This product includes software developed at the Information 83 * Technology Division, US Naval Research Laboratory. 84 * 4. Neither the name of the NRL nor the names of its contributors 85 * may be used to endorse or promote products derived from this software 86 * without specific prior written permission. 87 * 88 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 89 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 90 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 91 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 92 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 93 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 94 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 95 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 96 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 97 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 98 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 99 * 100 * The views and conclusions contained in the software and documentation 101 * are those of the authors and should not be interpreted as representing 102 * official policies, either expressed or implied, of the US Naval 103 * Research Laboratory (NRL). 104 */ 105 106/* 107 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 108 * The Regents of the University of California. All rights reserved. 109 * 110 * Redistribution and use in source and binary forms, with or without 111 * modification, are permitted provided that the following conditions 112 * are met: 113 * 1. Redistributions of source code must retain the above copyright 114 * notice, this list of conditions and the following disclaimer. 115 * 2. Redistributions in binary form must reproduce the above copyright 116 * notice, this list of conditions and the following disclaimer in the 117 * documentation and/or other materials provided with the distribution. 118 * 3. Neither the name of the University nor the names of its contributors 119 * may be used to endorse or promote products derived from this software 120 * without specific prior written permission. 121 * 122 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 123 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 124 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 125 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 126 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 127 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 128 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 129 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 130 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 131 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 132 * SUCH DAMAGE. 133 * 134 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 135 */ 136 137#include <sys/cdefs.h> 138__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.25 2018/05/03 07:13:48 maxv Exp $"); 139 140#ifdef _KERNEL_OPT 141#include "opt_inet.h" 142#include "opt_tcp_debug.h" 143#include "opt_tcp_congctl.h" 144#endif 145 146#include <sys/param.h> 147#include <sys/systm.h> 148#include <sys/malloc.h> 149#include <sys/mbuf.h> 150#include <sys/protosw.h> 151#include <sys/socket.h> 152#include <sys/socketvar.h> 153#include <sys/errno.h> 154#include <sys/syslog.h> 155#include <sys/pool.h> 156#include <sys/domain.h> 157#include <sys/kernel.h> 158#include <sys/mutex.h> 159 160#include <net/if.h> 161 162#include <netinet/in.h> 163#include <netinet/in_systm.h> 164#include <netinet/ip.h> 165#include <netinet/in_pcb.h> 166#include <netinet/in_var.h> 167#include <netinet/ip_var.h> 168 169#ifdef INET6 170#include <netinet/ip6.h> 171#include <netinet6/ip6_var.h> 172#include <netinet6/in6_pcb.h> 173#include <netinet6/ip6_var.h> 174#include <netinet6/in6_var.h> 175#include <netinet/icmp6.h> 176#endif 177 178#include <netinet/tcp.h> 179#include <netinet/tcp_fsm.h> 180#include <netinet/tcp_seq.h> 181#include <netinet/tcp_timer.h> 182#include <netinet/tcp_var.h> 183#include <netinet/tcp_congctl.h> 184#ifdef TCP_DEBUG 185#include <netinet/tcp_debug.h> 186#endif 187 188/* 189 * TODO: 190 * consider separating the actual implementations in another file. 191 */ 192 193static void tcp_common_congestion_exp(struct tcpcb *, int, int); 194 195static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *); 196static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *); 197static void tcp_reno_slow_retransmit(struct tcpcb *); 198static void tcp_reno_fast_retransmit_newack(struct tcpcb *, 199 const struct tcphdr *); 200static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *); 201static void tcp_reno_congestion_exp(struct tcpcb *tp); 202 203static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *); 204static void tcp_newreno_fast_retransmit_newack(struct tcpcb *, 205 const struct tcphdr *); 206static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *); 207 208static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *); 209static void tcp_cubic_slow_retransmit(struct tcpcb *tp); 210static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *); 211static void tcp_cubic_congestion_exp(struct tcpcb *); 212 213static void tcp_congctl_fillnames(void); 214 215extern int tcprexmtthresh; 216 217MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures"); 218 219/* currently selected global congestion control */ 220char tcp_congctl_global_name[TCPCC_MAXLEN]; 221 222/* available global congestion control algorithms */ 223char tcp_congctl_avail[10 * TCPCC_MAXLEN]; 224 225/* 226 * Used to list the available congestion control algorithms. 227 */ 228TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd = 229 TAILQ_HEAD_INITIALIZER(tcp_congctlhd); 230 231static struct tcp_congctlent * tcp_congctl_global; 232 233static kmutex_t tcp_congctl_mtx; 234 235void 236tcp_congctl_init(void) 237{ 238 int r __diagused; 239 240 mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE); 241 242 /* Base algorithms. */ 243 r = tcp_congctl_register("reno", &tcp_reno_ctl); 244 KASSERT(r == 0); 245 r = tcp_congctl_register("newreno", &tcp_newreno_ctl); 246 KASSERT(r == 0); 247 r = tcp_congctl_register("cubic", &tcp_cubic_ctl); 248 KASSERT(r == 0); 249 250 /* NewReno is the default. */ 251#ifndef TCP_CONGCTL_DEFAULT 252#define TCP_CONGCTL_DEFAULT "newreno" 253#endif 254 255 r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT); 256 KASSERT(r == 0); 257} 258 259/* 260 * Register a congestion algorithm and select it if we have none. 261 */ 262int 263tcp_congctl_register(const char *name, const struct tcp_congctl *tcc) 264{ 265 struct tcp_congctlent *ntcc, *tccp; 266 267 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) 268 if (!strcmp(name, tccp->congctl_name)) { 269 /* name already registered */ 270 return EEXIST; 271 } 272 273 ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO); 274 275 strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1); 276 ntcc->congctl_ctl = tcc; 277 278 TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent); 279 tcp_congctl_fillnames(); 280 281 if (TAILQ_FIRST(&tcp_congctlhd) == ntcc) 282 tcp_congctl_select(NULL, name); 283 284 return 0; 285} 286 287int 288tcp_congctl_unregister(const char *name) 289{ 290 struct tcp_congctlent *tccp, *rtccp; 291 unsigned int size; 292 293 rtccp = NULL; 294 size = 0; 295 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 296 if (!strcmp(name, tccp->congctl_name)) 297 rtccp = tccp; 298 size++; 299 } 300 301 if (!rtccp) 302 return ENOENT; 303 304 if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt) 305 return EBUSY; 306 307 TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent); 308 free(rtccp, M_TCPCONGCTL); 309 tcp_congctl_fillnames(); 310 311 return 0; 312} 313 314/* 315 * Select a congestion algorithm by name. 316 */ 317int 318tcp_congctl_select(struct tcpcb *tp, const char *name) 319{ 320 struct tcp_congctlent *tccp, *old_tccp, *new_tccp; 321 bool old_found, new_found; 322 323 KASSERT(name); 324 325 old_found = (tp == NULL || tp->t_congctl == NULL); 326 old_tccp = NULL; 327 new_found = false; 328 new_tccp = NULL; 329 330 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 331 if (!old_found && tccp->congctl_ctl == tp->t_congctl) { 332 old_tccp = tccp; 333 old_found = true; 334 } 335 336 if (!new_found && !strcmp(name, tccp->congctl_name)) { 337 new_tccp = tccp; 338 new_found = true; 339 } 340 341 if (new_found && old_found) { 342 if (tp) { 343 mutex_enter(&tcp_congctl_mtx); 344 if (old_tccp) 345 old_tccp->congctl_refcnt--; 346 tp->t_congctl = new_tccp->congctl_ctl; 347 new_tccp->congctl_refcnt++; 348 mutex_exit(&tcp_congctl_mtx); 349 } else { 350 tcp_congctl_global = new_tccp; 351 strlcpy(tcp_congctl_global_name, 352 new_tccp->congctl_name, 353 sizeof(tcp_congctl_global_name) - 1); 354 } 355 return 0; 356 } 357 } 358 359 return EINVAL; 360} 361 362void 363tcp_congctl_release(struct tcpcb *tp) 364{ 365 struct tcp_congctlent *tccp; 366 367 KASSERT(tp->t_congctl); 368 369 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 370 if (tccp->congctl_ctl == tp->t_congctl) { 371 tccp->congctl_refcnt--; 372 return; 373 } 374 } 375} 376 377/* 378 * Returns the name of a congestion algorithm. 379 */ 380const char * 381tcp_congctl_bystruct(const struct tcp_congctl *tcc) 382{ 383 struct tcp_congctlent *tccp; 384 385 KASSERT(tcc); 386 387 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) 388 if (tccp->congctl_ctl == tcc) 389 return tccp->congctl_name; 390 391 return NULL; 392} 393 394static void 395tcp_congctl_fillnames(void) 396{ 397 struct tcp_congctlent *tccp; 398 const char *delim = " "; 399 400 tcp_congctl_avail[0] = '\0'; 401 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 402 strlcat(tcp_congctl_avail, tccp->congctl_name, 403 sizeof(tcp_congctl_avail) - 1); 404 if (TAILQ_NEXT(tccp, congctl_ent)) 405 strlcat(tcp_congctl_avail, delim, 406 sizeof(tcp_congctl_avail) - 1); 407 } 408 409} 410 411/* ------------------------------------------------------------------------ */ 412 413/* 414 * Common stuff 415 */ 416 417/* Window reduction (1-beta) for [New]Reno: 0.5 */ 418#define RENO_BETAA 1 419#define RENO_BETAB 2 420/* Window reduction (1-beta) for Cubic: 0.8 */ 421#define CUBIC_BETAA 4 422#define CUBIC_BETAB 5 423/* Draft Rhee Section 4.1 */ 424#define CUBIC_CA 4 425#define CUBIC_CB 10 426 427static void 428tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab) 429{ 430 u_int win; 431 432 /* 433 * Reduce the congestion window and the slow start threshold. 434 */ 435 win = min(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz; 436 if (win < 2) 437 win = 2; 438 439 tp->snd_ssthresh = win * tp->t_segsz; 440 tp->snd_recover = tp->snd_max; 441 tp->snd_cwnd = tp->snd_ssthresh; 442 443 /* 444 * When using TCP ECN, notify the peer that 445 * we reduced the cwnd. 446 */ 447 if (TCP_ECN_ALLOWED(tp)) 448 tp->t_flags |= TF_ECN_SND_CWR; 449} 450 451 452/* ------------------------------------------------------------------------ */ 453 454/* 455 * TCP/Reno congestion control. 456 */ 457static void 458tcp_reno_congestion_exp(struct tcpcb *tp) 459{ 460 461 tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB); 462} 463 464static int 465tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) 466{ 467 /* 468 * Dup acks mean that packets have left the 469 * network (they're now cached at the receiver) 470 * so bump cwnd by the amount in the receiver 471 * to keep a constant cwnd packets in the 472 * network. 473 * 474 * If we are using TCP/SACK, then enter 475 * Fast Recovery if the receiver SACKs 476 * data that is tcprexmtthresh * MSS 477 * bytes past the last ACKed segment, 478 * irrespective of the number of DupAcks. 479 */ 480 481 tcp_seq onxt = tp->snd_nxt; 482 483 tp->t_partialacks = 0; 484 TCP_TIMER_DISARM(tp, TCPT_REXMT); 485 tp->t_rtttime = 0; 486 if (TCP_SACK_ENABLED(tp)) { 487 tp->t_dupacks = tcprexmtthresh; 488 tp->sack_newdata = tp->snd_nxt; 489 tp->snd_cwnd = tp->t_segsz; 490 (void) tcp_output(tp); 491 return 0; 492 } 493 tp->snd_nxt = th->th_ack; 494 tp->snd_cwnd = tp->t_segsz; 495 (void) tcp_output(tp); 496 tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks; 497 if (SEQ_GT(onxt, tp->snd_nxt)) 498 tp->snd_nxt = onxt; 499 500 return 0; 501} 502 503static int 504tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) 505{ 506 507 /* 508 * We know we're losing at the current 509 * window size so do congestion avoidance 510 * (set ssthresh to half the current window 511 * and pull our congestion window back to 512 * the new ssthresh). 513 */ 514 515 tcp_reno_congestion_exp(tp); 516 return tcp_reno_do_fast_retransmit(tp, th); 517} 518 519static void 520tcp_reno_slow_retransmit(struct tcpcb *tp) 521{ 522 u_int win; 523 524 /* 525 * Close the congestion window down to one segment 526 * (we'll open it by one segment for each ack we get). 527 * Since we probably have a window's worth of unacked 528 * data accumulated, this "slow start" keeps us from 529 * dumping all that data as back-to-back packets (which 530 * might overwhelm an intermediate gateway). 531 * 532 * There are two phases to the opening: Initially we 533 * open by one mss on each ack. This makes the window 534 * size increase exponentially with time. If the 535 * window is larger than the path can handle, this 536 * exponential growth results in dropped packet(s) 537 * almost immediately. To get more time between 538 * drops but still "push" the network to take advantage 539 * of improving conditions, we switch from exponential 540 * to linear window opening at some threshhold size. 541 * For a threshhold, we use half the current window 542 * size, truncated to a multiple of the mss. 543 * 544 * (the minimum cwnd that will give us exponential 545 * growth is 2 mss. We don't allow the threshhold 546 * to go below this.) 547 */ 548 549 win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz; 550 if (win < 2) 551 win = 2; 552 /* Loss Window MUST be one segment. */ 553 tp->snd_cwnd = tp->t_segsz; 554 tp->snd_ssthresh = win * tp->t_segsz; 555 tp->t_partialacks = -1; 556 tp->t_dupacks = 0; 557 tp->t_bytes_acked = 0; 558 559 if (TCP_ECN_ALLOWED(tp)) 560 tp->t_flags |= TF_ECN_SND_CWR; 561} 562 563static void 564tcp_reno_fast_retransmit_newack(struct tcpcb *tp, 565 const struct tcphdr *th) 566{ 567 if (tp->t_partialacks < 0) { 568 /* 569 * We were not in fast recovery. Reset the duplicate ack 570 * counter. 571 */ 572 tp->t_dupacks = 0; 573 } else { 574 /* 575 * Clamp the congestion window to the crossover point and 576 * exit fast recovery. 577 */ 578 if (tp->snd_cwnd > tp->snd_ssthresh) 579 tp->snd_cwnd = tp->snd_ssthresh; 580 tp->t_partialacks = -1; 581 tp->t_dupacks = 0; 582 tp->t_bytes_acked = 0; 583 if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack)) 584 tp->snd_fack = th->th_ack; 585 } 586} 587 588static void 589tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th) 590{ 591 /* 592 * When new data is acked, open the congestion window. 593 */ 594 595 u_int cw = tp->snd_cwnd; 596 u_int incr = tp->t_segsz; 597 598 if (tcp_do_abc) { 599 600 /* 601 * RFC 3465 Appropriate Byte Counting (ABC) 602 */ 603 604 int acked = th->th_ack - tp->snd_una; 605 606 if (cw >= tp->snd_ssthresh) { 607 tp->t_bytes_acked += acked; 608 if (tp->t_bytes_acked >= cw) { 609 /* Time to increase the window. */ 610 tp->t_bytes_acked -= cw; 611 } else { 612 /* No need to increase yet. */ 613 incr = 0; 614 } 615 } else { 616 /* 617 * use 2*SMSS or 1*SMSS for the "L" param, 618 * depending on sysctl setting. 619 * 620 * (See RFC 3465 2.3 Choosing the Limit) 621 */ 622 u_int abc_lim; 623 624 abc_lim = (tcp_abc_aggressive == 0 || 625 tp->snd_nxt != tp->snd_max) ? incr : incr * 2; 626 incr = min(acked, abc_lim); 627 } 628 } else { 629 630 /* 631 * If the window gives us less than ssthresh packets 632 * in flight, open exponentially (segsz per packet). 633 * Otherwise open linearly: segsz per window 634 * (segsz^2 / cwnd per packet). 635 */ 636 637 if (cw >= tp->snd_ssthresh) { 638 incr = incr * incr / cw; 639 } 640 } 641 642 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale); 643} 644 645const struct tcp_congctl tcp_reno_ctl = { 646 .fast_retransmit = tcp_reno_fast_retransmit, 647 .slow_retransmit = tcp_reno_slow_retransmit, 648 .fast_retransmit_newack = tcp_reno_fast_retransmit_newack, 649 .newack = tcp_reno_newack, 650 .cong_exp = tcp_reno_congestion_exp, 651}; 652 653/* 654 * TCP/NewReno Congestion control. 655 */ 656static int 657tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) 658{ 659 660 if (SEQ_LT(th->th_ack, tp->snd_high)) { 661 /* 662 * False fast retransmit after timeout. 663 * Do not enter fast recovery 664 */ 665 tp->t_dupacks = 0; 666 return 1; 667 } 668 /* 669 * Fast retransmit is same as reno. 670 */ 671 return tcp_reno_fast_retransmit(tp, th); 672} 673 674/* 675 * Implement the NewReno response to a new ack, checking for partial acks in 676 * fast recovery. 677 */ 678static void 679tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th) 680{ 681 if (tp->t_partialacks < 0) { 682 /* 683 * We were not in fast recovery. Reset the duplicate ack 684 * counter. 685 */ 686 tp->t_dupacks = 0; 687 } else if (SEQ_LT(th->th_ack, tp->snd_recover)) { 688 /* 689 * This is a partial ack. Retransmit the first unacknowledged 690 * segment and deflate the congestion window by the amount of 691 * acknowledged data. Do not exit fast recovery. 692 */ 693 tcp_seq onxt = tp->snd_nxt; 694 u_long ocwnd = tp->snd_cwnd; 695 int sack_num_segs = 1, sack_bytes_rxmt = 0; 696 697 /* 698 * snd_una has not yet been updated and the socket's send 699 * buffer has not yet drained off the ACK'd data, so we 700 * have to leave snd_una as it was to get the correct data 701 * offset in tcp_output(). 702 */ 703 tp->t_partialacks++; 704 TCP_TIMER_DISARM(tp, TCPT_REXMT); 705 tp->t_rtttime = 0; 706 707 if (TCP_SACK_ENABLED(tp)) { 708 /* 709 * Partial ack handling within a sack recovery episode. 710 * Keeping this very simple for now. When a partial ack 711 * is received, force snd_cwnd to a value that will 712 * allow the sender to transmit no more than 2 segments. 713 * If necessary, a fancier scheme can be adopted at a 714 * later point, but for now, the goal is to prevent the 715 * sender from bursting a large amount of data in the 716 * midst of sack recovery. 717 */ 718 719 /* 720 * send one or 2 segments based on how much 721 * new data was acked 722 */ 723 if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2) 724 sack_num_segs = 2; 725 (void)tcp_sack_output(tp, &sack_bytes_rxmt); 726 tp->snd_cwnd = sack_bytes_rxmt + 727 (tp->snd_nxt - tp->sack_newdata) + 728 sack_num_segs * tp->t_segsz; 729 tp->t_flags |= TF_ACKNOW; 730 (void) tcp_output(tp); 731 } else { 732 tp->snd_nxt = th->th_ack; 733 /* 734 * Set snd_cwnd to one segment beyond ACK'd offset 735 * snd_una is not yet updated when we're called 736 */ 737 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una); 738 (void) tcp_output(tp); 739 tp->snd_cwnd = ocwnd; 740 if (SEQ_GT(onxt, tp->snd_nxt)) 741 tp->snd_nxt = onxt; 742 /* 743 * Partial window deflation. Relies on fact that 744 * tp->snd_una not updated yet. 745 */ 746 tp->snd_cwnd -= (th->th_ack - tp->snd_una - 747 tp->t_segsz); 748 } 749 } else { 750 /* 751 * Complete ack. Inflate the congestion window to ssthresh 752 * and exit fast recovery. 753 * 754 * Window inflation should have left us with approx. 755 * snd_ssthresh outstanding data. But in case we 756 * would be inclined to send a burst, better to do 757 * it via the slow start mechanism. 758 */ 759 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 760 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 761 + tp->t_segsz; 762 else 763 tp->snd_cwnd = tp->snd_ssthresh; 764 tp->t_partialacks = -1; 765 tp->t_dupacks = 0; 766 tp->t_bytes_acked = 0; 767 if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack)) 768 tp->snd_fack = th->th_ack; 769 } 770} 771 772static void 773tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th) 774{ 775 /* 776 * If we are still in fast recovery (meaning we are using 777 * NewReno and we have only received partial acks), do not 778 * inflate the window yet. 779 */ 780 if (tp->t_partialacks < 0) 781 tcp_reno_newack(tp, th); 782} 783 784 785const struct tcp_congctl tcp_newreno_ctl = { 786 .fast_retransmit = tcp_newreno_fast_retransmit, 787 .slow_retransmit = tcp_reno_slow_retransmit, 788 .fast_retransmit_newack = tcp_newreno_fast_retransmit_newack, 789 .newack = tcp_newreno_newack, 790 .cong_exp = tcp_reno_congestion_exp, 791}; 792 793/* 794 * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02 795 */ 796 797/* Cubic prototypes */ 798static void tcp_cubic_update_ctime(struct tcpcb *tp); 799static uint32_t tcp_cubic_diff_ctime(struct tcpcb *); 800static uint32_t tcp_cubic_cbrt(uint32_t); 801static ulong tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t); 802 803/* Cubic TIME functions - XXX I don't like using timevals and microuptime */ 804/* 805 * Set congestion timer to now 806 */ 807static void 808tcp_cubic_update_ctime(struct tcpcb *tp) 809{ 810 struct timeval now_timeval; 811 812 getmicrouptime(&now_timeval); 813 tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 + 814 now_timeval.tv_usec / 1000; 815} 816 817/* 818 * miliseconds from last congestion 819 */ 820static uint32_t 821tcp_cubic_diff_ctime(struct tcpcb *tp) 822{ 823 struct timeval now_timeval; 824 825 getmicrouptime(&now_timeval); 826 return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 - 827 tp->snd_cubic_ctime; 828} 829 830/* 831 * Approximate cubic root 832 */ 833#define CBRT_ROUNDS 30 834static uint32_t 835tcp_cubic_cbrt(uint32_t v) 836{ 837 int i, rounds = CBRT_ROUNDS; 838 uint64_t x = v / 3; 839 840 /* We fail to calculate correct for small numbers */ 841 if (v == 0) 842 return 0; 843 else if (v < 4) 844 return 1; 845 846 /* 847 * largest x that 2*x^3+3*x fits 64bit 848 * Avoid overflow for a time cost 849 */ 850 if (x > 2097151) 851 rounds += 10; 852 853 for (i = 0; i < rounds; i++) 854 if (rounds == CBRT_ROUNDS) 855 x = (v + 2 * x * x * x) / (3 * x * x); 856 else 857 /* Avoid overflow */ 858 x = v / (3 * x * x) + 2 * x / 3; 859 860 return (uint32_t)x; 861} 862 863/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */ 864static ulong 865tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt) 866{ 867 uint32_t K; 868 long tK3; 869 870 /* Section 3.1 Eq. 2 */ 871 K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB * 872 CUBIC_CB / CUBIC_CA); 873 /* (t-K)^3 - not clear why is the measure unit mattering */ 874 tK3 = (long)(ms_elapsed + rtt) - (long)K; 875 tK3 = tK3 * tK3 * tK3; 876 877 return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax; 878} 879 880static void 881tcp_cubic_congestion_exp(struct tcpcb *tp) 882{ 883 884 /* 885 * Congestion - Set WMax and shrink cwnd 886 */ 887 tcp_cubic_update_ctime(tp); 888 889 /* Section 3.6 - Fast Convergence */ 890 if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) { 891 tp->snd_cubic_wmax_last = tp->snd_cubic_wmax; 892 tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 + 893 tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2; 894 } else { 895 tp->snd_cubic_wmax_last = tp->snd_cubic_wmax; 896 tp->snd_cubic_wmax = tp->snd_cwnd; 897 } 898 899 tp->snd_cubic_wmax = max(tp->t_segsz, tp->snd_cubic_wmax); 900 901 /* Shrink CWND */ 902 tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB); 903} 904 905static int 906tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th) 907{ 908 909 if (SEQ_LT(th->th_ack, tp->snd_high)) { 910 /* See newreno */ 911 tp->t_dupacks = 0; 912 return 1; 913 } 914 915 /* 916 * mark WMax 917 */ 918 tcp_cubic_congestion_exp(tp); 919 920 /* Do fast retransmit */ 921 return tcp_reno_do_fast_retransmit(tp, th); 922} 923 924static void 925tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th) 926{ 927 uint32_t ms_elapsed, rtt; 928 u_long w_tcp; 929 930 /* Congestion avoidance and not in fast recovery and usable rtt */ 931 if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 && 932 /* 933 * t_srtt is 1/32 units of slow ticks 934 * converting it in ms would be equal to 935 * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ 936 */ 937 (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) { 938 ms_elapsed = tcp_cubic_diff_ctime(tp); 939 940 /* Compute W_tcp(t) */ 941 w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB + 942 ms_elapsed / rtt / 3; 943 944 if (tp->snd_cwnd > w_tcp) { 945 /* Not in TCP friendly mode */ 946 tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) - 947 tp->snd_cwnd) / tp->snd_cwnd; 948 } else { 949 /* friendly TCP mode */ 950 tp->snd_cwnd = w_tcp; 951 } 952 953 /* Make sure we are within limits */ 954 tp->snd_cwnd = max(tp->snd_cwnd, tp->t_segsz); 955 tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale); 956 } else { 957 /* Use New Reno */ 958 tcp_newreno_newack(tp, th); 959 } 960} 961 962static void 963tcp_cubic_slow_retransmit(struct tcpcb *tp) 964{ 965 966 /* Timeout - Mark new congestion */ 967 tcp_cubic_congestion_exp(tp); 968 969 /* Loss Window MUST be one segment. */ 970 tp->snd_cwnd = tp->t_segsz; 971 tp->t_partialacks = -1; 972 tp->t_dupacks = 0; 973 tp->t_bytes_acked = 0; 974 975 if (TCP_ECN_ALLOWED(tp)) 976 tp->t_flags |= TF_ECN_SND_CWR; 977} 978 979const struct tcp_congctl tcp_cubic_ctl = { 980 .fast_retransmit = tcp_cubic_fast_retransmit, 981 .slow_retransmit = tcp_cubic_slow_retransmit, 982 .fast_retransmit_newack = tcp_newreno_fast_retransmit_newack, 983 .newack = tcp_cubic_newack, 984 .cong_exp = tcp_cubic_congestion_exp, 985}; 986