/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/vnet.h>

#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_cubic.h>
#include <netinet/cc/cc_module.h>

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static int	cubic_mod_init(void);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg);
static void	cubic_after_idle(struct cc_var *ccv);

/* Per-connection CUBIC state, hung off ccv->cc_data. */
struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in ticks. */
	int64_t		sum_rtt_ticks;
	/* cwnd at the most recent congestion event. */
	unsigned long	max_cwnd;
	/* cwnd at the previous congestion event. */
	unsigned long	prev_max_cwnd;
	/* A copy of prev_max_cwnd. Used for CC_RTO_ERR */
	unsigned long	prev_max_cwnd_cp;
	/* various flags */
	uint32_t	flags;
#define CUBICFLAG_CONG_EVENT	0x00000001	/* congestion experienced */
#define CUBICFLAG_IN_SLOWSTART	0x00000002	/* in slow start */
#define CUBICFLAG_IN_APPLIMIT	0x00000004	/* application limited */
#define CUBICFLAG_RTO_EVENT	0x00000008	/* RTO experienced */
	/* Minimum observed rtt in ticks. */
	int		min_rtt_ticks;
	/* Mean observed rtt between congestion epochs. */
	int		mean_rtt_ticks;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Timestamp (in ticks) of arriving in congestion avoidance from last
	 * congestion event.
	 */
	int		t_last_cong;
	/* Timestamp (in ticks) of a previous congestion event. Used for
	 * CC_RTO_ERR.
	 */
	int		t_last_cong_prev;
};

static MALLOC_DEFINE(M_CUBIC, "cubic data",
    "Per connection data required for the CUBIC congestion control algorithm");

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.mod_init = cubic_mod_init,
	.post_recovery = cubic_post_recovery,
	.after_idle = cubic_after_idle,
};

/*
 * Recalculate cwnd on each ACK. Delegates to NewReno while in slow start,
 * otherwise grows cwnd per the CUBIC window function or the TCP-friendly
 * (standard TCP) window estimate, whichever is appropriate.
 */
static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	unsigned long w_tf, w_cubic_next;
	int ticks_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * For a regular ACK and we're not in cong/fast recovery and
	 * we're cwnd limited, always recalculate cwnd.
	 */
	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    (ccv->flags & CCF_CWND_LIMITED)) {
		 /* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE) {
			cubic_data->flags |= CUBICFLAG_IN_SLOWSTART;
			newreno_cc_algo.ack_received(ccv, type);
		} else {
			if ((cubic_data->flags & CUBICFLAG_RTO_EVENT) &&
			    (cubic_data->flags & CUBICFLAG_IN_SLOWSTART)) {
				/* RFC8312 Section 4.7 */
				cubic_data->flags &= ~(CUBICFLAG_RTO_EVENT |
				    CUBICFLAG_IN_SLOWSTART);
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
				cubic_data->K = 0;
			} else if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART |
			    CUBICFLAG_IN_APPLIMIT)) {
				/*
				 * Leaving slow start or an app-limited period:
				 * restart the cubic epoch from now.
				 */
				cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART |
				    CUBICFLAG_IN_APPLIMIT);
				cubic_data->t_last_cong = ticks;
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, t_maxseg));
			}
			if ((ticks_since_cong =
			    ticks - cubic_data->t_last_cong) < 0) {
				/*
				 * dragging t_last_cong along
				 * (negative delta here indicates the tick
				 * counter wrapped; clamp and rebase so the
				 * elapsed time stays non-negative)
				 */
				ticks_since_cong = INT_MAX;
				cubic_data->t_last_cong = ticks - INT_MAX;
			}
			/*
			 * The mean RTT is used to best reflect the equations in
			 * the I-D. Using min_rtt in the tf_cwnd calculation
			 * causes w_tf to grow much faster than it should if the
			 * RTT is dominated by network buffering rather than
			 * propagation delay.
			 */
			w_tf = tf_cwnd(ticks_since_cong,
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg));

			/* Project the CUBIC window one mean RTT ahead. */
			w_cubic_next = cubic_cwnd(ticks_since_cong +
			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
			    CCV(ccv, t_maxseg), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

			if (w_cubic_next < w_tf) {
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				if (CCV(ccv, snd_cwnd) < w_tf)
					CCV(ccv, snd_cwnd) = ulmin(w_tf,
					    INT_MAX);
			} else if (CCV(ccv, snd_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 * Only update snd_cwnd, if it doesn't shrink.
				 */
				CCV(ccv, snd_cwnd) = ulmin(w_cubic_next,
				    INT_MAX);
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) &&
			    cubic_data->max_cwnd < CCV(ccv, snd_cwnd)) {
				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, t_maxseg));
			}
		}
	} else if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    !(ccv->flags & CCF_CWND_LIMITED)) {
		/* Not cwnd limited: remember we were application limited. */
		cubic_data->flags |= CUBICFLAG_IN_APPLIMIT;
	}
}

/*
 * This is a Cubic specific implementation of after_idle.
 *   - Reset cwnd by calling New Reno implementation of after_idle.
 *   - Reset t_last_cong.
 */
static void
cubic_after_idle(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/* Preserve the best cwnd seen so far as the plateau for K. */
	cubic_data->max_cwnd = ulmax(cubic_data->max_cwnd, CCV(ccv, snd_cwnd));
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));

	newreno_cc_algo.after_idle(ccv);
	cubic_data->t_last_cong = ticks;
}


/* Release the per-connection CUBIC state. */
static void
cubic_cb_destroy(struct cc_var *ccv)
{
	free(ccv->cc_data, M_CUBIC);
}

/*
 * Allocate and initialise per-connection CUBIC state.
 * Returns ENOMEM if allocation fails (M_NOWAIT - may be called from
 * a context where sleeping is not permitted).
 */
static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = malloc(sizeof(struct cubic), M_CUBIC, M_NOWAIT|M_ZERO);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = ticks;
	cubic_data->min_rtt_ticks = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_ticks = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;
	u_int mss;

	cubic_data = ccv->cc_data;
	mss = tcp_maxseg(ccv->ccvc.tcp);

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
				cubic_ssthresh_update(ccv, mss);
				cubic_data->flags |= CUBICFLAG_CONG_EVENT;
				cubic_data->t_last_cong = ticks;
				cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
			}
			ENTER_RECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
			cubic_ssthresh_update(ccv, mss);
			cubic_data->flags |= CUBICFLAG_CONG_EVENT;
			cubic_data->t_last_cong = ticks;
			cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_RTO:
		/* RFC8312 Section 4.7 */
		if (CCV(ccv, t_rxtshift) == 1) {
			/*
			 * First RTO: snapshot state so CC_RTO_ERR can
			 * restore it if the RTO turns out to be spurious.
			 */
			cubic_data->t_last_cong_prev = cubic_data->t_last_cong;
			cubic_data->prev_max_cwnd_cp = cubic_data->prev_max_cwnd;
		}
		cubic_data->flags |= CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT;
		cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
		CCV(ccv, snd_ssthresh) = ((uint64_t)CCV(ccv, snd_cwnd) *
					  CUBIC_BETA) >> CUBIC_SHIFT;
		CCV(ccv, snd_cwnd) = mss;
		break;

	case CC_RTO_ERR:
		/* Spurious RTO: roll back the state saved at CC_RTO. */
		cubic_data->flags &= ~(CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT);
		cubic_data->max_cwnd = cubic_data->prev_max_cwnd;
		cubic_data->prev_max_cwnd = cubic_data->prev_max_cwnd_cp;
		cubic_data->t_last_cong = cubic_data->t_last_cong_prev;
		cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for max_cwnd recorded. Without
	 * this here bad things happen when entries from the TCP hostcache
	 * get used.
	 */
	cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
}

/* Module load hook; no global state to set up. */
static int
cubic_mod_init(void)
{
	return (0);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int pipe;

	cubic_data = ccv->cc_data;
	pipe = 0;

	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 *
		 * XXXLAS: Find a way to do this without needing curack
		 */
		if (V_tcp_do_rfc6675_pipe)
			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
		else
			pipe = CCV(ccv, snd_max) - ccv->curack;

		if (pipe < CCV(ccv, snd_ssthresh))
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements RFC6582
			 */
			CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
			    CCV(ccv, t_maxseg);
		else
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, snd_cwnd) = max(((uint64_t)cubic_data->max_cwnd *
			    CUBIC_BETA) >> CUBIC_SHIFT,
			    2 * CCV(ccv, t_maxseg));
	}

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_ticks >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_ticks = (int)(cubic_data->sum_rtt_ticks /
		    cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_ticks = 0;
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_ticks;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		t_srtt_ticks = CCV(ccv, t_srtt) / TCP_RTT_SCALE;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialised
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_ticks < cubic_data->min_rtt_ticks ||
		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)) {
			/* Clamp to >= 1 tick so RTT-based divisions are safe. */
			cubic_data->min_rtt_ticks = max(1, t_srtt_ticks);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_ticks with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_ticks >
			    cubic_data->mean_rtt_ticks)
				cubic_data->mean_rtt_ticks =
				    cubic_data->min_rtt_ticks;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_ticks += t_srtt_ticks;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg)
{
	struct cubic *cubic_data;
	uint32_t ssthresh;
	uint32_t cwnd;

	cubic_data = ccv->cc_data;
	cwnd = CCV(ccv, snd_cwnd);

	/* Fast convergence heuristic. */
	if (cwnd < cubic_data->max_cwnd) {
		cwnd = ((uint64_t)cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT;
	}
	cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
	cubic_data->max_cwnd = cwnd;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5
	 * and reduce max_cwnd to cwnd * beta. This aligns the cubic concave
	 * region appropriately. On subsequent congestion events, set
	 * ssthresh to cwnd * beta.
	 */
	if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) {
		ssthresh = cwnd >> 1;
		cubic_data->max_cwnd = ((uint64_t)cwnd *
		    CUBIC_BETA) >> CUBIC_SHIFT;
	} else {
		ssthresh = ((uint64_t)cwnd *
		    CUBIC_BETA) >> CUBIC_SHIFT;
	}
	/* Never let ssthresh fall below 2 segments. */
	CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * maxseg);
}


DECLARE_CC_MODULE(cubic, &cubic_cc_algo);
MODULE_VERSION(cubic, 1);