1/* 2 * Copyright (c) 2010-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/mcache.h>
#include <sys/sysctl.h>

#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>

#if INET6
#include <netinet/ip6.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_cc.h>

#include <libkern/OSAtomic.h>

/* This file implements an alternate TCP congestion control algorithm
 * for background transport developed by LEDBAT working group at IETF and
 * described in draft: draft-ietf-ledbat-congestion-02
 */

/* Callbacks registered with the congestion-control framework via the
 * tcp_cc_algo vector below.
 */
int tcp_ledbat_init(struct tcpcb *tp);
int tcp_ledbat_cleanup(struct tcpcb *tp);
void tcp_ledbat_cwnd_init(struct tcpcb *tp);
void tcp_ledbat_congestion_avd(struct tcpcb *tp, struct tcphdr *th);
void tcp_ledbat_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
void tcp_ledbat_pre_fr(struct tcpcb *tp);
void tcp_ledbat_post_fr(struct tcpcb *tp, struct tcphdr *th);
void tcp_ledbat_after_idle(struct tcpcb *tp);
void tcp_ledbat_after_timeout(struct tcpcb *tp);
int tcp_ledbat_delay_ack(struct tcpcb *tp, struct tcphdr *th);
void tcp_ledbat_switch_cc(struct tcpcb *tp, uint16_t old_cc_index);

struct tcp_cc_algo tcp_cc_ledbat = {
	.name = "ledbat",
	.init = tcp_ledbat_init,
	.cleanup = tcp_ledbat_cleanup,
	.cwnd_init = tcp_ledbat_cwnd_init,
	.congestion_avd = tcp_ledbat_congestion_avd,
	.ack_rcvd = tcp_ledbat_ack_rcvd,
	.pre_fr = tcp_ledbat_pre_fr,
	.post_fr = tcp_ledbat_post_fr,
	.after_idle = tcp_ledbat_after_idle,
	.after_timeout = tcp_ledbat_after_timeout,
	.delay_ack = tcp_ledbat_delay_ack,
	.switch_to = tcp_ledbat_switch_cc
};

/* Target queuing delay in milliseconds. This includes the processing
 * and scheduling delay on both of the end-hosts. A LEDBAT sender tries
 * to keep queuing delay below this limit. When the queuing delay
 * goes above this limit, a LEDBAT sender will start reducing the
 * congestion window.
 *
 * The LEDBAT draft says that target queue delay MUST be 100 ms for
 * inter-operability.
 */
int target_qdelay = 100;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED,
	&target_qdelay, 100, "Target queuing delay");

/* Allowed increase and tether are used to place an upper bound on
 * congestion window based on the amount of data that is outstanding.
 * This will limit the congestion window when the amount of data in
 * flight is little because the application is writing to the socket
 * intermittently and is preventing the connection from becoming idle.
 *
 * max_allowed_cwnd = allowed_increase + (tether * flight_size)
 * cwnd = min(cwnd, max_allowed_cwnd)
 *
 * 'Allowed_increase' parameter is set to 8. If the flight size is zero, then
 * we want the congestion window to be at least 8 packets to reduce the
 * delay induced by delayed ack. This helps when the receiver is acking
 * more than 2 packets at a time (stretching acks for better performance).
 *
 * 'Tether' is also set to 2. We do not want this to limit the growth of cwnd
 * during slow-start.
 */
int allowed_increase = 8;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_allowed_increase, CTLFLAG_RW | CTLFLAG_LOCKED,
	&allowed_increase, 1, "Additive constant used to calculate max allowed congestion window");

/* Left shift for cwnd to get tether value of 2 */
int tether_shift = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_tether_shift, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tether_shift, 1, "Tether shift for max allowed congestion window");

/* Start with an initial window of 2. This will help to get more accurate
 * minimum RTT measurement in the beginning. It will help to probe
 * the path slowly and will not add to the existing delay if the path is
 * already congested. Using 2 packets will reduce the delay induced by delayed-ack.
 */
uint32_t bg_ss_fltsz = 2;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_ss_fltsz, CTLFLAG_RW | CTLFLAG_LOCKED,
	&bg_ss_fltsz, 2, "Initial congestion window for background transport");

extern int rtt_samples_per_slot;

/* Core LEDBAT window update: estimate the queuing delay as the gap
 * between the current RTT sample and the base (minimum) RTT. While the
 * delay is at or below target_qdelay, grow cwnd by 'incr' bytes; when it
 * exceeds the target, shrink cwnd by one-eighth to yield to competing
 * traffic. In all cases, clamp cwnd to a bound derived from the current
 * flight size (see the allowed_increase/tether comment above).
 */
static void update_cwnd(struct tcpcb *tp, uint32_t incr) {
	uint32_t max_allowed_cwnd = 0, flight_size = 0;
	uint32_t qdelay, base_rtt;
	int32_t off_target;

	base_rtt = get_base_rtt(tp);

	/* If we do not have a good RTT measurement yet, increment
	 * congestion window by the default value.
	 */
	if (base_rtt == 0 || tp->t_rttcur == 0) {
		tp->snd_cwnd += incr;
		goto check_max;
	}

	qdelay = tp->t_rttcur - base_rtt;
	off_target = (int32_t)(target_qdelay - qdelay);

	if (off_target >= 0) {
		/* Delay decreased or remained the same, we can increase
		 * the congestion window according to RFC 3465.
		 *
		 * Move background slow-start threshold to current
		 * congestion window so that the next time (after some idle
		 * period), we can attempt to do slow-start till here if there
		 * is no increase in rtt
		 */
		if (tp->bg_ssthresh < tp->snd_cwnd)
			tp->bg_ssthresh = tp->snd_cwnd;
		tp->snd_cwnd += incr;

	} else {
		/* In response to an increase in rtt, reduce the congestion
		 * window by one-eighth. This will help to yield immediately
		 * to a competing stream.
		 */
		uint32_t redwin;

		redwin = tp->snd_cwnd >> 3;
		tp->snd_cwnd -= redwin;
		/* Never shrink below the initial background window. */
		if (tp->snd_cwnd < bg_ss_fltsz * tp->t_maxseg)
			tp->snd_cwnd = bg_ss_fltsz * tp->t_maxseg;

		/* Lower background slow-start threshold so that the connection
		 * will go into congestion avoidance phase
		 */
		if (tp->bg_ssthresh > tp->snd_cwnd)
			tp->bg_ssthresh = tp->snd_cwnd;
	}
check_max:
	/* Calculate the outstanding flight size and restrict the
	 * congestion window to a factor of flight size.
	 */
	flight_size = tp->snd_max - tp->snd_una;

	max_allowed_cwnd = (allowed_increase * tp->t_maxseg)
		+ (flight_size << tether_shift);
	tp->snd_cwnd = min(tp->snd_cwnd, max_allowed_cwnd);
	return;
}

/* Per-connection setup: only maintains the global count of sockets
 * using this algorithm; no per-tcpcb state is initialized here.
 */
int tcp_ledbat_init(struct tcpcb *tp) {
#pragma unused(tp)
	OSIncrementAtomic((volatile SInt32 *)&tcp_cc_ledbat.num_sockets);
	return 0;
}

/* Per-connection teardown: drops the global socket count taken in
 * tcp_ledbat_init() (or tcp_ledbat_switch_cc()).
 */
int tcp_ledbat_cleanup(struct tcpcb *tp) {
#pragma unused(tp)
	OSDecrementAtomic((volatile SInt32 *)&tcp_cc_ledbat.num_sockets);
	return 0;
}

/* Initialize the congestion window for a connection
 *
 */

void
tcp_ledbat_cwnd_init(struct tcpcb *tp) {
	tp->snd_cwnd = tp->t_maxseg * bg_ss_fltsz;
	tp->bg_ssthresh = tp->snd_ssthresh;
}

/* Function to handle an in-sequence ack which is fast-path processing
 * of an in sequence ack in tcp_input function (called as header prediction).
 * This gets called only during congestion avoidance phase.
 */
void
tcp_ledbat_congestion_avd(struct tcpcb *tp, struct tcphdr *th) {
	int acked = 0;
	u_int32_t incr = 0;

	acked = BYTES_ACKED(th, tp);
	tp->t_bytes_acked += acked;
	/* Appropriate Byte Counting: grow by one segment only once a full
	 * cwnd's worth of data has been acknowledged.
	 */
	if (tp->t_bytes_acked > tp->snd_cwnd) {
		tp->t_bytes_acked -= tp->snd_cwnd;
		incr = tp->t_maxseg;
	}

	/* Only grow while the send window, not cwnd, is the limit. */
	if (tp->snd_cwnd < tp->snd_wnd && incr > 0) {
		update_cwnd(tp, incr);
	}
}
/* Function to process an ack.
 */
void
tcp_ledbat_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) {
	/*
	 * RFC 3465 - Appropriate Byte Counting.
	 *
	 * If the window is currently less than ssthresh,
	 * open the window by the number of bytes ACKed by
	 * the last ACK, however clamp the window increase
	 * to an upper limit "L".
	 *
	 * In congestion avoidance phase, open the window by
	 * one segment each time "bytes_acked" grows to be
	 * greater than or equal to the congestion window.
	 */

	register u_int cw = tp->snd_cwnd;
	register u_int incr = tp->t_maxseg;
	int acked = 0;

	acked = BYTES_ACKED(th, tp);
	tp->t_bytes_acked += acked;
	if (cw >= tp->bg_ssthresh) {
		/* congestion-avoidance */
		if (tp->t_bytes_acked < cw) {
			/* No need to increase yet. */
			incr = 0;
		}
	} else {
		/*
		 * If the user explicitly enables RFC3465
		 * use 2*SMSS for the "L" param. Otherwise
		 * use the more conservative 1*SMSS.
		 *
		 * (See RFC 3465 2.3 Choosing the Limit)
		 */
		u_int abc_lim;

		abc_lim = (tcp_do_rfc3465_lim2 &&
			tp->snd_nxt == tp->snd_max) ? incr * 2 : incr;

		incr = lmin(acked, abc_lim);
	}
	if (tp->t_bytes_acked >= cw)
		tp->t_bytes_acked -= cw;
	if (incr > 0)
		update_cwnd(tp, incr);
}

/* Called on entering fast recovery: halve the slow-start threshold
 * (floored at 2 segments) and pull bg_ssthresh down with it so the
 * background phase does not outlive the loss signal.
 */
void
tcp_ledbat_pre_fr(struct tcpcb *tp) {
	uint32_t win;

	win = min(tp->snd_wnd, tp->snd_cwnd) /
		2 / tp->t_maxseg;
	if (win < 2)
		win = 2;
	tp->snd_ssthresh = win * tp->t_maxseg;
	if (tp->bg_ssthresh > tp->snd_ssthresh)
		tp->bg_ssthresh = tp->snd_ssthresh;

	tcp_cc_resize_sndbuf(tp);
}

/* Called on exiting fast recovery: deflate cwnd back toward ssthresh. */
void
tcp_ledbat_post_fr(struct tcpcb *tp, struct tcphdr *th) {
	int32_t ss;

	/* Amount of data still outstanding after this ack. */
	ss = tp->snd_max - th->th_ack;

	/*
	 * Complete ack. Inflate the congestion window to
	 * ssthresh and exit fast recovery.
	 *
	 * Window inflation should have left us with approx.
	 * snd_ssthresh outstanding data. But in case we
	 * would be inclined to send a burst, better to do
	 * it via the slow start mechanism.
	 *
	 * If the flight size is zero, then make congestion
	 * window to be worth at least 2 segments to avoid
	 * delayed acknowledgement (draft-ietf-tcpm-rfc3782-bis-05).
	 */
	if (ss < (int32_t)tp->snd_ssthresh)
		tp->snd_cwnd = max(ss, tp->t_maxseg) + tp->t_maxseg;
	else
		tp->snd_cwnd = tp->snd_ssthresh;
	tp->t_bytes_acked = 0;
}

/*
 * Function to handle connections that have been idle for
 * some time. Slow start to get ack "clock" running again.
 * Clear base history after idle time.
 */
void
tcp_ledbat_after_idle(struct tcpcb *tp) {
	int32_t n = N_RTT_BASE, i = (N_RTT_BASE - 1);

	/* Decide how many base history entries have to be cleared
	 * based on how long the connection has been idle.
	 */

	if (tp->t_rttcur > 0) {
		int32_t nrtt, idle_time;

		idle_time = tcp_now - tp->t_rcvtime;
		nrtt = idle_time / tp->t_rttcur;
		/* One history slot is invalidated per rtt_samples_per_slot
		 * RTTs of idle time, capped at the whole history.
		 */
		n = nrtt / rtt_samples_per_slot;
		if (n > N_RTT_BASE)
			n = N_RTT_BASE;
	}
	/* Zero the n most recent slots (from the top of the array down)... */
	for (i = (N_RTT_BASE - 1); n > 0; --i, --n) {
		tp->rtt_hist[i] = 0;
	}
	/* ...then slide the surviving older entries up to the top, zeroing
	 * the slots they vacate.
	 */
	for (n = (N_RTT_BASE - 1); i >= 0; --i, --n) {
		tp->rtt_hist[n] = tp->rtt_hist[i];
		tp->rtt_hist[i] = 0;
	}

	/* Reset the congestion window */
	tp->snd_cwnd = tp->t_maxseg * bg_ss_fltsz;

	/* If stretch ack was auto disabled, re-evaluate the situation */
	tcp_cc_after_idle_stretchack(tp);
}

/* Function to change the congestion window when the retransmit
 * timer fires. The behavior is the same as that for best-effort
 * TCP, reduce congestion window to one segment and start probing
 * the link using "slow start". The slow start threshold is set
 * to half of the current window. Lower the background slow start
 * threshold also.
 */
void
tcp_ledbat_after_timeout(struct tcpcb *tp) {
	if (tp->t_state >= TCPS_ESTABLISHED) {
		u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
		if (win < 2)
			win = 2;
		tp->snd_ssthresh = win * tp->t_maxseg;

		if (tp->bg_ssthresh > tp->snd_ssthresh)
			tp->bg_ssthresh = tp->snd_ssthresh;

		tp->snd_cwnd = tp->t_maxseg;
		tcp_cc_resize_sndbuf(tp);
	}
}

/*
 * Indicate whether this ack should be delayed.
 * We can delay the ack if:
 * - our last ack wasn't a 0-sized window.
 * - the peer hasn't sent us a TH_PUSH data packet: if he did, take this
 *   as a clue that we need to ACK without any delay. This helps higher
 *   level protocols who won't send us more data even if the window is
 *   open because their last "segment" hasn't been ACKed
 * Otherwise the receiver will ack every other full-sized segment or when the
 * delayed ack timer fires. This will help to generate better rtt estimates for
 * the other end if it is a ledbat sender.
 *
 */

int
tcp_ledbat_delay_ack(struct tcpcb *tp, struct tcphdr *th) {
	/* If any flag other than TH_ACK is set, set "end-of-write" bit */
	if (th->th_flags & ~TH_ACK)
		tp->t_flagsext |= TF_STREAMEOW;
	else
		tp->t_flagsext &= ~(TF_STREAMEOW);

	if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
		(th->th_flags & TH_PUSH) == 0 &&
		(tp->t_unacksegs == 1))
		return(1);
	return(0);
}

/* Change a connection to use ledbat. First, lower bg_ssthresh value
 * if it needs to be.
 */
void
tcp_ledbat_switch_cc(struct tcpcb *tp, uint16_t old_cc_index) {
#pragma unused(old_cc_index)
	uint32_t cwnd;

	if (tp->bg_ssthresh == 0 || tp->bg_ssthresh > tp->snd_ssthresh)
		tp->bg_ssthresh = tp->snd_ssthresh;

	cwnd = min(tp->snd_wnd, tp->snd_cwnd);

	/* Above bg_ssthresh, carry the window over in segments; below it,
	 * halve it so the connection re-enters background slow-start.
	 */
	if (tp->snd_cwnd > tp->bg_ssthresh)
		cwnd = cwnd / tp->t_maxseg;
	else
		cwnd = cwnd / 2 / tp->t_maxseg;

	if (cwnd < bg_ss_fltsz)
		cwnd = bg_ss_fltsz;

	tp->snd_cwnd = cwnd * tp->t_maxseg;
	tp->t_bytes_acked = 0;

	/* NOTE(review): the socket count is bumped here as well as in
	 * tcp_ledbat_init() — presumably the framework calls switch_to()
	 * instead of init() when changing algorithms, so only one count is
	 * ever taken per socket; confirm against the tcp_cc framework.
	 */
	OSIncrementAtomic((volatile SInt32 *)&tcp_cc_ledbat.num_sockets);
}