1/* 2 * Copyright (c) 2013-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28#include <sys/param.h> 29#include <sys/systm.h> 30#include <sys/kernel.h> 31#include <sys/protosw.h> 32#include <sys/socketvar.h> 33#include <sys/syslog.h> 34 35#include <net/route.h> 36#include <netinet/in.h> 37#include <netinet/in_systm.h> 38#include <netinet/ip.h> 39 40#if INET6 41#include <netinet/ip6.h> 42#endif /* INET6 */ 43 44#include <netinet/ip_var.h> 45#include <netinet/tcp.h> 46#include <netinet/tcp_timer.h> 47#include <netinet/tcp_var.h> 48#include <netinet/tcp_fsm.h> 49#include <netinet/tcp_var.h> 50#include <netinet/tcp_cc.h> 51#include <netinet/tcpip.h> 52#include <netinet/tcp_seq.h> 53#include <kern/task.h> 54#include <libkern/OSAtomic.h> 55 56static int tcp_cubic_init(struct tcpcb *tp); 57static int tcp_cubic_cleanup(struct tcpcb *tp); 58static void tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp); 59static void tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th); 60static void tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); 61static void tcp_cubic_pre_fr(struct tcpcb *tp); 62static void tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th); 63static void tcp_cubic_after_timeout(struct tcpcb *tp); 64static int tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th); 65static void tcp_cubic_switch_cc(struct tcpcb *tp, u_int16_t old_index); 66static uint32_t tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt); 67static uint32_t tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th); 68static inline void tcp_cubic_clear_state(struct tcpcb *tp); 69 70 71extern float cbrtf(float x); 72 73struct tcp_cc_algo tcp_cc_cubic = { 74 .name = "cubic", 75 .init = tcp_cubic_init, 76 .cleanup = tcp_cubic_cleanup, 77 .cwnd_init = tcp_cubic_cwnd_init_or_reset, 78 .congestion_avd = tcp_cubic_congestion_avd, 79 .ack_rcvd = tcp_cubic_ack_rcvd, 80 .pre_fr = tcp_cubic_pre_fr, 81 .post_fr = tcp_cubic_post_fr, 82 .after_idle = tcp_cubic_cwnd_init_or_reset, 83 .after_timeout = tcp_cubic_after_timeout, 84 .delay_ack = tcp_cubic_delay_ack, 85 .switch_to = tcp_cubic_switch_cc 86}; 87 88const float tcp_cubic_backoff = 0.2; /* multiplicative decrease factor */ 89const float tcp_cubic_coeff = 0.4; 90const float tcp_cubic_fast_convergence_factor = 0.875; 91 92static int tcp_cubic_tcp_friendliness = 0; 93SYSCTL_INT(_net_inet_tcp, OID_AUTO, cubic_tcp_friendliness, 94 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_cubic_tcp_friendliness, 0, 95 "Enable TCP friendliness"); 96 97static int tcp_cubic_fast_convergence = 0; 98SYSCTL_INT(_net_inet_tcp, OID_AUTO, cubic_fast_convergence, 99 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_cubic_fast_convergence, 0, 100 "Enable fast convergence"); 101 102static int tcp_cubic_use_minrtt = 0; 103SYSCTL_INT(_net_inet_tcp, OID_AUTO, cubic_use_minrtt, 104 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_cubic_use_minrtt, 0, 105 "use a min of 5 sec rtt"); 106 107static int tcp_cubic_init(struct tcpcb *tp) 108{ 109 OSIncrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets); 110 111 VERIFY(tp->t_ccstate != NULL); 112 tcp_cubic_clear_state(tp); 113 return (0); 114} 115 116static int tcp_cubic_cleanup(struct tcpcb *tp) 117{ 118#pragma unused(tp) 119 OSDecrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets); 120 return (0); 121} 122 123/* 124 * Initialize the congestion window at the beginning of a connection or 125 * after idle time 126 */ 127static void tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp) 128{ 129 VERIFY(tp->t_ccstate != NULL); 130 131 tcp_cubic_clear_state(tp); 132 tcp_cc_cwnd_init_or_reset(tp); 133 134 /* 135 * slow start threshold could get initialized to a lower value 136 * when there is a cached value in the route metrics. In this case, 137 * the connection can enter congestion avoidance without any packet 138 * loss and Cubic will enter steady-state too early. It is better 139 * to always probe to find the initial slow-start threshold. 140 */ 141 if (tp->t_inpcb->inp_stat->txbytes <= TCP_CC_CWND_INIT_BYTES 142 && tp->snd_ssthresh < (TCP_MAXWIN << TCP_MAX_WINSHIFT)) 143 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 144 145 /* Initialize cubic last max to be same as ssthresh */ 146 tp->t_ccstate->cub_last_max = tp->snd_ssthresh; 147 148 /* If stretch ack was auto-disabled, re-evaluate it */ 149 tcp_cc_after_idle_stretchack(tp); 150} 151 152/* 153 * Compute the target congestion window for the next RTT according to 154 * cubic equation when an ack is received. 155 * 156 * W(t) = C(t-K)^3 + W(last_max) 157 */ 158static uint32_t 159tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt) 160{ 161 float K, var; 162 u_int32_t elapsed_time, win; 163 164 VERIFY(tp->t_ccstate->cub_last_max > 0); 165 win = min(tp->snd_cwnd, tp->snd_wnd); 166 if (tp->t_ccstate->cub_epoch_start == 0) { 167 /* 168 * This is the beginning of a new epoch, initialize some of 169 * the variables that we need to use for computing the 170 * congestion window later. 171 */ 172 tp->t_ccstate->cub_epoch_start = tcp_now; 173 if (tp->t_ccstate->cub_epoch_start == 0) 174 tp->t_ccstate->cub_epoch_start = 1; 175 if (win < tp->t_ccstate->cub_last_max) { 176 177 VERIFY(current_task() == kernel_task); 178 179 /* 180 * Compute cubic epoch period, this is the time 181 * period that the window will take to increase to 182 * last_max again after backoff due to loss. 183 */ 184 K = (tp->t_ccstate->cub_last_max - win) 185 / tp->t_maxseg / tcp_cubic_coeff; 186 K = cbrtf(K); 187 tp->t_ccstate->cub_epoch_period = K * TCP_RETRANSHZ; 188 /* Origin point */ 189 tp->t_ccstate->cub_origin_point = 190 tp->t_ccstate->cub_last_max; 191 } else { 192 tp->t_ccstate->cub_epoch_period = 0; 193 tp->t_ccstate->cub_origin_point = win; 194 } 195 tp->t_ccstate->cub_target_win = 0; 196 } 197 198 VERIFY(tp->t_ccstate->cub_origin_point > 0); 199 /* 200 * Compute the target window for the next RTT using smoothed RTT 201 * as an estimate for next RTT. 202 */ 203 elapsed_time = timer_diff(tcp_now, 0, 204 tp->t_ccstate->cub_epoch_start, 0); 205 206 if (tcp_cubic_use_minrtt) 207 elapsed_time += max(tcp_cubic_use_minrtt, rtt); 208 else 209 elapsed_time += rtt; 210 var = (elapsed_time - tp->t_ccstate->cub_epoch_period) / TCP_RETRANSHZ; 211 var = var * var * var * (tcp_cubic_coeff * tp->t_maxseg); 212 213 tp->t_ccstate->cub_target_win = tp->t_ccstate->cub_origin_point + var; 214 return (tp->t_ccstate->cub_target_win); 215} 216 217/* 218 * Standard TCP utilizes bandwidth well in low RTT and low BDP connections 219 * even when there is some packet loss. Enabling TCP mode will help Cubic 220 * to achieve this kind of utilization. 221 * 222 * But if there is a bottleneck link in the path with a fixed size queue 223 * and fixed bandwidth, TCP Cubic will help to reduce packet loss at this 224 * link because of the steady-state behavior. Using average and mean 225 * absolute deviation of W(lastmax), we try to detect if the congestion 226 * window is close to the bottleneck bandwidth. In that case, disabling 227 * TCP mode will help to minimize packet loss at this link. 228 * 229 * Disable TCP mode if the W(lastmax) (the window where previous packet 230 * loss happened) is within a small range from the average last max 231 * calculated. 232 */ 233#define TCP_CUBIC_ENABLE_TCPMODE(_tp_) \ 234 ((!soissrcrealtime((_tp_)->t_inpcb->inp_socket) && \ 235 (_tp_)->t_ccstate->cub_mean_dev > (tp->t_maxseg << 1)) ? 1 : 0) 236 237/* 238 * Compute the window growth if standard TCP (AIMD) was used with 239 * a backoff of 0.5 and additive increase of 1 packet per RTT. 240 * 241 * TCP window at time t can be calculated using the following equation 242 * with beta as 0.8 243 * 244 * W(t) <- Wmax * beta + 3 * ((1 - beta)/(1 + beta)) * t/RTT 245 * 246 */ 247static uint32_t 248tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th) 249{ 250 if (tp->t_ccstate->cub_tcp_win == 0) { 251 tp->t_ccstate->cub_tcp_win = min(tp->snd_cwnd, tp->snd_wnd); 252 tp->t_ccstate->cub_tcp_bytes_acked = 0; 253 } else { 254 tp->t_ccstate->cub_tcp_bytes_acked += 255 BYTES_ACKED(th, tp); 256 if (tp->t_ccstate->cub_tcp_bytes_acked >= 257 tp->t_ccstate->cub_tcp_win) { 258 tp->t_ccstate->cub_tcp_bytes_acked -= 259 tp->t_ccstate->cub_tcp_win; 260 tp->t_ccstate->cub_tcp_win += tp->t_maxseg; 261 } 262 } 263 return (tp->t_ccstate->cub_tcp_win); 264} 265 266/* 267 * Handle an in-sequence ack during congestion avoidance phase. 268 */ 269static void 270tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th) 271{ 272 u_int32_t cubic_target_win, tcp_win, rtt; 273 274 tp->t_bytes_acked += BYTES_ACKED(th, tp); 275 276 rtt = get_base_rtt(tp); 277 /* 278 * First compute cubic window. If cubic variables are not 279 * initialized (after coming out of recovery), this call will 280 * initialize them. 281 */ 282 cubic_target_win = tcp_cubic_update(tp, rtt); 283 284 /* Compute TCP window if a multiplicative decrease of 0.2 is used */ 285 tcp_win = tcp_cubic_tcpwin(tp, th); 286 287 if (tp->snd_cwnd < tcp_win && 288 (tcp_cubic_tcp_friendliness == 1 || 289 TCP_CUBIC_ENABLE_TCPMODE(tp))) { 290 /* this connection is in TCP-friendly region */ 291 if (tp->t_bytes_acked >= tp->snd_cwnd) { 292 tp->t_bytes_acked -= tp->snd_cwnd; 293 tp->snd_cwnd = min(tcp_win, TCP_MAXWIN << tp->snd_scale); 294 } 295 } else { 296 if (cubic_target_win > tp->snd_cwnd) { 297 /* 298 * The target win is computed for the next RTT. 299 * To reach this value, cwnd will have to be updated 300 * one segment at a time. Compute how many bytes 301 * need to be acknowledged before we can increase 302 * the cwnd by one segment. 303 */ 304 u_int64_t incr_win; 305 incr_win = tp->snd_cwnd * tp->t_maxseg; 306 incr_win /= (cubic_target_win - tp->snd_cwnd); 307 if (incr_win > 0 && 308 tp->t_bytes_acked >= incr_win) { 309 tp->t_bytes_acked -= incr_win; 310 tp->snd_cwnd = 311 min((tp->snd_cwnd + tp->t_maxseg), 312 TCP_MAXWIN << tp->snd_scale); 313 } 314 } 315 } 316} 317 318static void 319tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) 320{ 321 if (tp->snd_cwnd >= tp->snd_ssthresh) { 322 /* Congestion avoidance phase */ 323 tcp_cubic_congestion_avd(tp, th); 324 } else { 325 /* 326 * Use 2*SMSS as limit on increment as suggested 327 * by RFC 3465 section 2.3 328 */ 329 uint32_t acked, abc_lim, incr; 330 acked = BYTES_ACKED(th, tp); 331 abc_lim = (tcp_do_rfc3465_lim2 && 332 tp->snd_nxt == tp->snd_max) ? 333 2 * tp->t_maxseg : tp->t_maxseg; 334 incr = min(acked, abc_lim); 335 336 tp->snd_cwnd += incr; 337 tp->snd_cwnd = min(tp->snd_cwnd, 338 TCP_MAXWIN << tp->snd_scale); 339 } 340} 341 342static void 343tcp_cubic_pre_fr(struct tcpcb *tp) 344{ 345 uint32_t win, avg; 346 int32_t dev; 347 tp->t_ccstate->cub_epoch_start = 0; 348 tp->t_ccstate->cub_tcp_win = 0; 349 tp->t_ccstate->cub_target_win = 0; 350 tp->t_ccstate->cub_tcp_bytes_acked = 0; 351 352 win = min(tp->snd_cwnd, tp->snd_wnd); 353 /* 354 * Note the congestion window at which packet loss occurred as 355 * cub_last_max. 356 * 357 * If the congestion window is less than the last max window when 358 * loss occurred, it indicates that capacity available in the 359 * network has gone down. This can happen if a new flow has started 360 * and it is capturing some of the bandwidth. To reach convergence 361 * quickly, backoff a little more. Disable fast convergence to 362 * disable this behavior. 363 */ 364 if (win < tp->t_ccstate->cub_last_max && 365 tcp_cubic_fast_convergence == 1) 366 tp->t_ccstate->cub_last_max = win * 367 tcp_cubic_fast_convergence_factor; 368 else 369 tp->t_ccstate->cub_last_max = win; 370 371 if (tp->t_ccstate->cub_last_max == 0) { 372 /* 373 * If last_max is zero because snd_wnd is zero or for 374 * any other reason, initialize it to the amount of data 375 * in flight 376 */ 377 tp->t_ccstate->cub_last_max = tp->snd_max - tp->snd_una; 378 } 379 380 /* 381 * Compute average and mean absolute deviation of the 382 * window at which packet loss occurred. 383 */ 384 if (tp->t_ccstate->cub_avg_lastmax == 0) { 385 tp->t_ccstate->cub_avg_lastmax = tp->t_ccstate->cub_last_max; 386 } else { 387 /* 388 * Average is computed by taking 63 parts of 389 * history and one part of the most recent value 390 */ 391 avg = tp->t_ccstate->cub_avg_lastmax; 392 avg = (avg << 6) - avg; 393 tp->t_ccstate->cub_avg_lastmax = 394 (avg + tp->t_ccstate->cub_last_max) >> 6; 395 } 396 397 /* caluclate deviation from average */ 398 dev = tp->t_ccstate->cub_avg_lastmax - tp->t_ccstate->cub_last_max; 399 400 /* Take the absolute value */ 401 if (dev < 0) 402 dev = -dev; 403 404 if (tp->t_ccstate->cub_mean_dev == 0) { 405 tp->t_ccstate->cub_mean_dev = dev; 406 } else { 407 dev = dev + ((tp->t_ccstate->cub_mean_dev << 4) 408 - tp->t_ccstate->cub_mean_dev); 409 tp->t_ccstate->cub_mean_dev = dev >> 4; 410 } 411 412 /* Backoff congestion window by tcp_cubic_backoff factor */ 413 win = win - (win * tcp_cubic_backoff); 414 win = (win / tp->t_maxseg); 415 if (win < 2) 416 win = 2; 417 tp->snd_ssthresh = win * tp->t_maxseg; 418 tcp_cc_resize_sndbuf(tp); 419} 420 421static void 422tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th) 423{ 424 uint32_t flight_size = 0; 425 426 if (SEQ_LEQ(th->th_ack, tp->snd_max)) 427 flight_size = tp->snd_max - th->th_ack; 428 /* 429 * Complete ack. The current window was inflated for fast recovery. 430 * It has to be deflated post recovery. 431 * 432 * Window inflation should have left us with approx snd_ssthresh 433 * outstanding data. If the flight size is zero or one segment, 434 * make congestion window to be at least as big as 2 segments to 435 * avoid delayed acknowledgements. This is according to RFC 6582. 436 */ 437 if (flight_size < tp->snd_ssthresh) 438 tp->snd_cwnd = max(flight_size, tp->t_maxseg) 439 + tp->t_maxseg; 440 else 441 tp->snd_cwnd = tp->snd_ssthresh; 442 tp->t_ccstate->cub_tcp_win = 0; 443 tp->t_ccstate->cub_target_win = 0; 444 tp->t_ccstate->cub_tcp_bytes_acked = 0; 445} 446 447static void 448tcp_cubic_after_timeout(struct tcpcb *tp) 449{ 450 VERIFY(tp->t_ccstate != NULL); 451 if (!IN_FASTRECOVERY(tp)) { 452 tcp_cubic_clear_state(tp); 453 tcp_cubic_pre_fr(tp); 454 } 455 456 /* 457 * Close the congestion window down to one segment as a retransmit 458 * timeout might indicate severe congestion. 459 */ 460 tp->snd_cwnd = tp->t_maxseg; 461} 462 463static int 464tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th) 465{ 466 return (tcp_cc_delay_ack(tp, th)); 467} 468 469/* 470 * When switching from a different CC it is better for Cubic to start 471 * fresh. The state required for Cubic calculation might be stale and it 472 * might not represent the current state of the network. If it starts as 473 * a new connection it will probe and learn the existing network conditions. 474 */ 475static void 476tcp_cubic_switch_cc(struct tcpcb *tp, uint16_t old_cc_index) 477{ 478#pragma unused(old_cc_index) 479 tcp_cubic_cwnd_init_or_reset(tp); 480 /* Start counting bytes for RFC 3465 again */ 481 tp->t_bytes_acked = 0; 482 483 OSIncrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets); 484} 485 486static inline void tcp_cubic_clear_state(struct tcpcb *tp) 487{ 488 tp->t_ccstate->cub_last_max = 0; 489 tp->t_ccstate->cub_epoch_start = 0; 490 tp->t_ccstate->cub_origin_point = 0; 491 tp->t_ccstate->cub_tcp_win = 0; 492 tp->t_ccstate->cub_tcp_bytes_acked = 0; 493 tp->t_ccstate->cub_epoch_period = 0; 494 tp->t_ccstate->cub_target_win = 0; 495} 496