1216114Slstewart/*- 2216114Slstewart * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org> 3216114Slstewart * Copyright (c) 2010 The FreeBSD Foundation 4216114Slstewart * All rights reserved. 5216114Slstewart * 6216114Slstewart * This software was developed by Lawrence Stewart while studying at the Centre 7220560Slstewart * for Advanced Internet Architectures, Swinburne University of Technology, made 8220560Slstewart * possible in part by a grant from the Cisco University Research Program Fund 9220560Slstewart * at Community Foundation Silicon Valley. 10216114Slstewart * 11216114Slstewart * Portions of this software were developed at the Centre for Advanced 12216114Slstewart * Internet Architectures, Swinburne University of Technology, Melbourne, 13216114Slstewart * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 14216114Slstewart * 15216114Slstewart * Redistribution and use in source and binary forms, with or without 16216114Slstewart * modification, are permitted provided that the following conditions 17216114Slstewart * are met: 18216114Slstewart * 1. Redistributions of source code must retain the above copyright 19216114Slstewart * notice, this list of conditions and the following disclaimer. 20216114Slstewart * 2. Redistributions in binary form must reproduce the above copyright 21216114Slstewart * notice, this list of conditions and the following disclaimer in the 22216114Slstewart * documentation and/or other materials provided with the distribution. 23216114Slstewart * 24216114Slstewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 25216114Slstewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26216114Slstewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27216114Slstewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 28216114Slstewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29216114Slstewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30216114Slstewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31216114Slstewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32216114Slstewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33216114Slstewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34216114Slstewart * SUCH DAMAGE. 35216114Slstewart */ 36216114Slstewart 37216114Slstewart/* 38216114Slstewart * An implementation of the CUBIC congestion control algorithm for FreeBSD, 39216114Slstewart * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha. 40216114Slstewart * Originally released as part of the NewTCP research project at Swinburne 41220560Slstewart * University of Technology's Centre for Advanced Internet Architectures, 42220560Slstewart * Melbourne, Australia, which was made possible in part by a grant from the 43220560Slstewart * Cisco University Research Program Fund at Community Foundation Silicon 44220560Slstewart * Valley. More details are available at: 45216114Slstewart * http://caia.swin.edu.au/urp/newtcp/ 46216114Slstewart */ 47216114Slstewart 48216114Slstewart#include <sys/cdefs.h> 49216114Slstewart__FBSDID("$FreeBSD$"); 50216114Slstewart 51216114Slstewart#include <sys/param.h> 52216114Slstewart#include <sys/kernel.h> 53216114Slstewart#include <sys/malloc.h> 54216114Slstewart#include <sys/module.h> 55216114Slstewart#include <sys/socket.h> 56216114Slstewart#include <sys/socketvar.h> 57216114Slstewart#include <sys/sysctl.h> 58216114Slstewart#include <sys/systm.h> 59216114Slstewart 60216114Slstewart#include <net/vnet.h> 61216114Slstewart 62216114Slstewart#include <netinet/cc.h> 63216114Slstewart#include <netinet/tcp_seq.h> 64216114Slstewart#include <netinet/tcp_timer.h> 65216114Slstewart#include <netinet/tcp_var.h> 66216114Slstewart 67216114Slstewart#include <netinet/cc/cc_cubic.h> 68216114Slstewart#include <netinet/cc/cc_module.h> 69216114Slstewart 70216114Slstewartstatic void cubic_ack_received(struct cc_var *ccv, uint16_t type); 71216114Slstewartstatic void cubic_cb_destroy(struct cc_var *ccv); 72216114Slstewartstatic int cubic_cb_init(struct cc_var *ccv); 73216114Slstewartstatic void cubic_cong_signal(struct cc_var *ccv, uint32_t type); 74216114Slstewartstatic void cubic_conn_init(struct cc_var *ccv); 75216114Slstewartstatic int cubic_mod_init(void); 76216114Slstewartstatic void cubic_post_recovery(struct cc_var *ccv); 77216114Slstewartstatic void cubic_record_rtt(struct cc_var *ccv); 78216114Slstewartstatic void cubic_ssthresh_update(struct cc_var *ccv); 79216114Slstewart 80216114Slstewartstruct cubic { 81216114Slstewart /* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */ 82216114Slstewart int64_t K; 83216114Slstewart /* Sum of RTT samples across an epoch in ticks. */ 84216114Slstewart int64_t sum_rtt_ticks; 85216114Slstewart /* cwnd at the most recent congestion event. */ 86216114Slstewart unsigned long max_cwnd; 87216114Slstewart /* cwnd at the previous congestion event. */ 88216114Slstewart unsigned long prev_max_cwnd; 89216114Slstewart /* Number of congestion events. */ 90216114Slstewart uint32_t num_cong_events; 91216114Slstewart /* Minimum observed rtt in ticks. */ 92216114Slstewart int min_rtt_ticks; 93216114Slstewart /* Mean observed rtt between congestion epochs. */ 94216114Slstewart int mean_rtt_ticks; 95216114Slstewart /* ACKs since last congestion event. */ 96216114Slstewart int epoch_ack_count; 97216114Slstewart /* Time of last congestion event in ticks. */ 98216114Slstewart int t_last_cong; 99216114Slstewart}; 100216114Slstewart 101220592Spluknetstatic MALLOC_DEFINE(M_CUBIC, "cubic data", 102216114Slstewart "Per connection data required for the CUBIC congestion control algorithm"); 103216114Slstewart 104216114Slstewartstruct cc_algo cubic_cc_algo = { 105216114Slstewart .name = "cubic", 106216114Slstewart .ack_received = cubic_ack_received, 107216114Slstewart .cb_destroy = cubic_cb_destroy, 108216114Slstewart .cb_init = cubic_cb_init, 109216114Slstewart .cong_signal = cubic_cong_signal, 110216114Slstewart .conn_init = cubic_conn_init, 111216114Slstewart .mod_init = cubic_mod_init, 112216114Slstewart .post_recovery = cubic_post_recovery, 113216114Slstewart}; 114216114Slstewart 115216114Slstewartstatic void 116216114Slstewartcubic_ack_received(struct cc_var *ccv, uint16_t type) 117216114Slstewart{ 118216114Slstewart struct cubic *cubic_data; 119216114Slstewart unsigned long w_tf, w_cubic_next; 120216114Slstewart int ticks_since_cong; 121216114Slstewart 122216114Slstewart cubic_data = ccv->cc_data; 123216114Slstewart cubic_record_rtt(ccv); 124216114Slstewart 125216114Slstewart /* 126216114Slstewart * Regular ACK and we're not in cong/fast recovery and we're cwnd 127216114Slstewart * limited and we're either not doing ABC or are slow starting or are 128216114Slstewart * doing ABC and we've sent a cwnd's worth of bytes. 129216114Slstewart */ 130216114Slstewart if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && 131216114Slstewart (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 || 132216114Slstewart CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) || 133216114Slstewart (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) { 134216114Slstewart /* Use the logic in NewReno ack_received() for slow start. */ 135216114Slstewart if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) || 136216114Slstewart cubic_data->min_rtt_ticks == TCPTV_SRTTBASE) 137216114Slstewart newreno_cc_algo.ack_received(ccv, type); 138216114Slstewart else { 139216114Slstewart ticks_since_cong = ticks - cubic_data->t_last_cong; 140216114Slstewart 141216114Slstewart /* 142216114Slstewart * The mean RTT is used to best reflect the equations in 143216114Slstewart * the I-D. Using min_rtt in the tf_cwnd calculation 144216114Slstewart * causes w_tf to grow much faster than it should if the 145216114Slstewart * RTT is dominated by network buffering rather than 146216114Slstewart * propogation delay. 147216114Slstewart */ 148216114Slstewart w_tf = tf_cwnd(ticks_since_cong, 149216114Slstewart cubic_data->mean_rtt_ticks, cubic_data->max_cwnd, 150216114Slstewart CCV(ccv, t_maxseg)); 151216114Slstewart 152216114Slstewart w_cubic_next = cubic_cwnd(ticks_since_cong + 153216114Slstewart cubic_data->mean_rtt_ticks, cubic_data->max_cwnd, 154216114Slstewart CCV(ccv, t_maxseg), cubic_data->K); 155216114Slstewart 156216114Slstewart ccv->flags &= ~CCF_ABC_SENTAWND; 157216114Slstewart 158216114Slstewart if (w_cubic_next < w_tf) 159216114Slstewart /* 160216114Slstewart * TCP-friendly region, follow tf 161216114Slstewart * cwnd growth. 162216114Slstewart */ 163216114Slstewart CCV(ccv, snd_cwnd) = w_tf; 164216114Slstewart 165216114Slstewart else if (CCV(ccv, snd_cwnd) < w_cubic_next) { 166216114Slstewart /* 167216114Slstewart * Concave or convex region, follow CUBIC 168216114Slstewart * cwnd growth. 169216114Slstewart */ 170216114Slstewart if (V_tcp_do_rfc3465) 171216114Slstewart CCV(ccv, snd_cwnd) = w_cubic_next; 172216114Slstewart else 173216114Slstewart CCV(ccv, snd_cwnd) += ((w_cubic_next - 174216114Slstewart CCV(ccv, snd_cwnd)) * 175216114Slstewart CCV(ccv, t_maxseg)) / 176216114Slstewart CCV(ccv, snd_cwnd); 177216114Slstewart } 178216114Slstewart 179216114Slstewart /* 180216114Slstewart * If we're not in slow start and we're probing for a 181216114Slstewart * new cwnd limit at the start of a connection 182216114Slstewart * (happens when hostcache has a relevant entry), 183216114Slstewart * keep updating our current estimate of the 184216114Slstewart * max_cwnd. 185216114Slstewart */ 186216114Slstewart if (cubic_data->num_cong_events == 0 && 187216114Slstewart cubic_data->max_cwnd < CCV(ccv, snd_cwnd)) 188216114Slstewart cubic_data->max_cwnd = CCV(ccv, snd_cwnd); 189216114Slstewart } 190216114Slstewart } 191216114Slstewart} 192216114Slstewart 193216114Slstewartstatic void 194216114Slstewartcubic_cb_destroy(struct cc_var *ccv) 195216114Slstewart{ 196216114Slstewart 197216114Slstewart if (ccv->cc_data != NULL) 198216114Slstewart free(ccv->cc_data, M_CUBIC); 199216114Slstewart} 200216114Slstewart 201216114Slstewartstatic int 202216114Slstewartcubic_cb_init(struct cc_var *ccv) 203216114Slstewart{ 204216114Slstewart struct cubic *cubic_data; 205216114Slstewart 206216114Slstewart cubic_data = malloc(sizeof(struct cubic), M_CUBIC, M_NOWAIT|M_ZERO); 207216114Slstewart 208216114Slstewart if (cubic_data == NULL) 209216114Slstewart return (ENOMEM); 210216114Slstewart 211216114Slstewart /* Init some key variables with sensible defaults. */ 212216114Slstewart cubic_data->t_last_cong = ticks; 213216114Slstewart cubic_data->min_rtt_ticks = TCPTV_SRTTBASE; 214217683Slstewart cubic_data->mean_rtt_ticks = 1; 215216114Slstewart 216216114Slstewart ccv->cc_data = cubic_data; 217216114Slstewart 218216114Slstewart return (0); 219216114Slstewart} 220216114Slstewart 221216114Slstewart/* 222216114Slstewart * Perform any necessary tasks before we enter congestion recovery. 223216114Slstewart */ 224216114Slstewartstatic void 225216114Slstewartcubic_cong_signal(struct cc_var *ccv, uint32_t type) 226216114Slstewart{ 227216114Slstewart struct cubic *cubic_data; 228216114Slstewart 229216114Slstewart cubic_data = ccv->cc_data; 230216114Slstewart 231216114Slstewart switch (type) { 232216114Slstewart case CC_NDUPACK: 233216114Slstewart if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { 234216114Slstewart if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { 235216114Slstewart cubic_ssthresh_update(ccv); 236216114Slstewart cubic_data->num_cong_events++; 237216114Slstewart cubic_data->prev_max_cwnd = cubic_data->max_cwnd; 238216114Slstewart cubic_data->max_cwnd = CCV(ccv, snd_cwnd); 239216114Slstewart } 240216114Slstewart ENTER_RECOVERY(CCV(ccv, t_flags)); 241216114Slstewart } 242216114Slstewart break; 243216114Slstewart 244216114Slstewart case CC_ECN: 245216114Slstewart if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { 246216114Slstewart cubic_ssthresh_update(ccv); 247216114Slstewart cubic_data->num_cong_events++; 248216114Slstewart cubic_data->prev_max_cwnd = cubic_data->max_cwnd; 249216114Slstewart cubic_data->max_cwnd = CCV(ccv, snd_cwnd); 250216114Slstewart cubic_data->t_last_cong = ticks; 251216114Slstewart CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); 252216114Slstewart ENTER_CONGRECOVERY(CCV(ccv, t_flags)); 253216114Slstewart } 254216114Slstewart break; 255216114Slstewart 256216114Slstewart case CC_RTO: 257216114Slstewart /* 258216114Slstewart * Grab the current time and record it so we know when the 259216114Slstewart * most recent congestion event was. Only record it when the 260216114Slstewart * timeout has fired more than once, as there is a reasonable 261216114Slstewart * chance the first one is a false alarm and may not indicate 262216114Slstewart * congestion. 263216114Slstewart */ 264216114Slstewart if (CCV(ccv, t_rxtshift) >= 2) 265216114Slstewart cubic_data->num_cong_events++; 266216114Slstewart cubic_data->t_last_cong = ticks; 267216114Slstewart break; 268216114Slstewart } 269216114Slstewart} 270216114Slstewart 271216114Slstewartstatic void 272216114Slstewartcubic_conn_init(struct cc_var *ccv) 273216114Slstewart{ 274216114Slstewart struct cubic *cubic_data; 275216114Slstewart 276216114Slstewart cubic_data = ccv->cc_data; 277216114Slstewart 278216114Slstewart /* 279216114Slstewart * Ensure we have a sane initial value for max_cwnd recorded. Without 280216114Slstewart * this here bad things happen when entries from the TCP hostcache 281216114Slstewart * get used. 282216114Slstewart */ 283216114Slstewart cubic_data->max_cwnd = CCV(ccv, snd_cwnd); 284216114Slstewart} 285216114Slstewart 286216114Slstewartstatic int 287216114Slstewartcubic_mod_init(void) 288216114Slstewart{ 289216114Slstewart 290216114Slstewart cubic_cc_algo.after_idle = newreno_cc_algo.after_idle; 291216114Slstewart 292216114Slstewart return (0); 293216114Slstewart} 294216114Slstewart 295216114Slstewart/* 296216114Slstewart * Perform any necessary tasks before we exit congestion recovery. 297216114Slstewart */ 298216114Slstewartstatic void 299216114Slstewartcubic_post_recovery(struct cc_var *ccv) 300216114Slstewart{ 301216114Slstewart struct cubic *cubic_data; 302293711Shiren int pipe; 303216114Slstewart 304216114Slstewart cubic_data = ccv->cc_data; 305293711Shiren pipe = 0; 306216114Slstewart 307216114Slstewart /* Fast convergence heuristic. */ 308216114Slstewart if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) 309216114Slstewart cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR) 310216114Slstewart >> CUBIC_SHIFT; 311216114Slstewart 312216114Slstewart if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { 313216114Slstewart /* 314216114Slstewart * If inflight data is less than ssthresh, set cwnd 315216114Slstewart * conservatively to avoid a burst of data, as suggested in 316216114Slstewart * the NewReno RFC. Otherwise, use the CUBIC method. 317216114Slstewart * 318216114Slstewart * XXXLAS: Find a way to do this without needing curack 319216114Slstewart */ 320293711Shiren if (V_tcp_do_rfc6675_pipe) 321293711Shiren pipe = tcp_compute_pipe(ccv->ccvc.tcp); 322216114Slstewart else 323293711Shiren pipe = CCV(ccv, snd_max) - ccv->curack; 324293711Shiren 325293711Shiren if (pipe < CCV(ccv, snd_ssthresh)) 326293711Shiren CCV(ccv, snd_cwnd) = pipe + CCV(ccv, t_maxseg); 327293711Shiren else 328216114Slstewart /* Update cwnd based on beta and adjusted max_cwnd. */ 329216114Slstewart CCV(ccv, snd_cwnd) = max(1, ((CUBIC_BETA * 330216114Slstewart cubic_data->max_cwnd) >> CUBIC_SHIFT)); 331216114Slstewart } 332216114Slstewart cubic_data->t_last_cong = ticks; 333216114Slstewart 334216114Slstewart /* Calculate the average RTT between congestion epochs. */ 335217683Slstewart if (cubic_data->epoch_ack_count > 0 && 336217683Slstewart cubic_data->sum_rtt_ticks >= cubic_data->epoch_ack_count) { 337216114Slstewart cubic_data->mean_rtt_ticks = (int)(cubic_data->sum_rtt_ticks / 338216114Slstewart cubic_data->epoch_ack_count); 339217683Slstewart } 340216114Slstewart 341216114Slstewart cubic_data->epoch_ack_count = 0; 342216114Slstewart cubic_data->sum_rtt_ticks = 0; 343216114Slstewart cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg)); 344216114Slstewart} 345216114Slstewart 346216114Slstewart/* 347216114Slstewart * Record the min RTT and sum samples for the epoch average RTT calculation. 348216114Slstewart */ 349216114Slstewartstatic void 350216114Slstewartcubic_record_rtt(struct cc_var *ccv) 351216114Slstewart{ 352216114Slstewart struct cubic *cubic_data; 353216114Slstewart int t_srtt_ticks; 354216114Slstewart 355216114Slstewart /* Ignore srtt until a min number of samples have been taken. */ 356216114Slstewart if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) { 357216114Slstewart cubic_data = ccv->cc_data; 358216114Slstewart t_srtt_ticks = CCV(ccv, t_srtt) / TCP_RTT_SCALE; 359216114Slstewart 360216114Slstewart /* 361216114Slstewart * Record the current SRTT as our minrtt if it's the smallest 362216114Slstewart * we've seen or minrtt is currently equal to its initialised 363216114Slstewart * value. 364216114Slstewart * 365216114Slstewart * XXXLAS: Should there be some hysteresis for minrtt? 366216114Slstewart */ 367216114Slstewart if ((t_srtt_ticks < cubic_data->min_rtt_ticks || 368217683Slstewart cubic_data->min_rtt_ticks == TCPTV_SRTTBASE)) { 369216114Slstewart cubic_data->min_rtt_ticks = max(1, t_srtt_ticks); 370216114Slstewart 371217683Slstewart /* 372217683Slstewart * If the connection is within its first congestion 373217683Slstewart * epoch, ensure we prime mean_rtt_ticks with a 374217683Slstewart * reasonable value until the epoch average RTT is 375217683Slstewart * calculated in cubic_post_recovery(). 376217683Slstewart */ 377217683Slstewart if (cubic_data->min_rtt_ticks > 378217683Slstewart cubic_data->mean_rtt_ticks) 379217683Slstewart cubic_data->mean_rtt_ticks = 380217683Slstewart cubic_data->min_rtt_ticks; 381217683Slstewart } 382217683Slstewart 383216114Slstewart /* Sum samples for epoch average RTT calculation. */ 384216114Slstewart cubic_data->sum_rtt_ticks += t_srtt_ticks; 385216114Slstewart cubic_data->epoch_ack_count++; 386216114Slstewart } 387216114Slstewart} 388216114Slstewart 389216114Slstewart/* 390216114Slstewart * Update the ssthresh in the event of congestion. 391216114Slstewart */ 392216114Slstewartstatic void 393216114Slstewartcubic_ssthresh_update(struct cc_var *ccv) 394216114Slstewart{ 395216114Slstewart struct cubic *cubic_data; 396216114Slstewart 397216114Slstewart cubic_data = ccv->cc_data; 398216114Slstewart 399216114Slstewart /* 400216114Slstewart * On the first congestion event, set ssthresh to cwnd * 0.5, on 401216114Slstewart * subsequent congestion events, set it to cwnd * beta. 402216114Slstewart */ 403216114Slstewart if (cubic_data->num_cong_events == 0) 404216114Slstewart CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd) >> 1; 405216114Slstewart else 406216114Slstewart CCV(ccv, snd_ssthresh) = (CCV(ccv, snd_cwnd) * CUBIC_BETA) 407216114Slstewart >> CUBIC_SHIFT; 408216114Slstewart} 409216114Slstewart 410216114Slstewart 411216114SlstewartDECLARE_CC_MODULE(cubic, &cubic_cc_algo); 412