1/* 2 * Copyright (c) 2010-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $ 62 */ 63#include <sys/param.h> 64#include <sys/systm.h> 65#include <sys/kernel.h> 66#include <sys/protosw.h> 67#include <sys/socketvar.h> 68 69#include <net/route.h> 70#include <netinet/in.h> 71#include <netinet/in_systm.h> 72#include <netinet/ip.h> 73 74#if INET6 75#include <netinet/ip6.h> 76#endif 77#include <netinet/ip_var.h> 78#include <netinet/tcp.h> 79#include <netinet/tcp_fsm.h> 80#include <netinet/tcp_timer.h> 81#include <netinet/tcp_var.h> 82#include <netinet/tcpip.h> 83#include <netinet/tcp_cc.h> 84#include <libkern/OSAtomic.h> 85 86int tcp_newreno_init(struct tcpcb *tp); 87int tcp_newreno_cleanup(struct tcpcb *tp); 88void tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp); 89void tcp_newreno_congestion_avd(struct tcpcb *tp, struct tcphdr *th); 90void tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); 91void tcp_newreno_pre_fr(struct tcpcb *tp); 92void tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th); 93void tcp_newreno_after_idle(struct tcpcb *tp); 94void tcp_newreno_after_timeout(struct tcpcb *tp); 95int tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th); 96void tcp_newreno_switch_cc(struct tcpcb *tp, uint16_t old_index); 97 98struct tcp_cc_algo tcp_cc_newreno = { 99 .name = "newreno", 100 .init = tcp_newreno_init, 101 .cleanup = tcp_newreno_cleanup, 102 .cwnd_init = tcp_newreno_cwnd_init_or_reset, 103 .congestion_avd = tcp_newreno_congestion_avd, 104 .ack_rcvd = tcp_newreno_ack_rcvd, 105 .pre_fr = tcp_newreno_pre_fr, 106 .post_fr = tcp_newreno_post_fr, 107 .after_idle = tcp_newreno_cwnd_init_or_reset, 108 .after_timeout = tcp_newreno_after_timeout, 109 .delay_ack = tcp_newreno_delay_ack, 110 .switch_to = tcp_newreno_switch_cc 111}; 112 113int tcp_newreno_init(struct tcpcb *tp) { 114#pragma unused(tp) 115 OSIncrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets); 116 return 0; 117} 118 119int tcp_newreno_cleanup(struct tcpcb *tp) { 120#pragma unused(tp) 121 OSDecrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets); 122 return 0; 123} 124 125/* Initialize the congestion window for a connection or 126 * handles connections that have been idle for 127 * some time. In this state, no acks are 128 * expected to clock out any data we send -- 129 * slow start to get ack "clock" running again. 130 * 131 * Set the slow-start flight size depending on whether 132 * this is a local network or not. 133 */ 134void 135tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp) { 136 tcp_cc_cwnd_init_or_reset(tp); 137 138 /* If stretch ack was auto disabled, re-evaluate the situation */ 139 tcp_cc_after_idle_stretchack(tp); 140} 141 142 143/* Function to handle an in-sequence ack during congestion avoidance phase. 144 * This will get called from header prediction code. 145 */ 146void 147tcp_newreno_congestion_avd(struct tcpcb *tp, struct tcphdr *th) { 148 uint32_t acked = 0; 149 acked = BYTES_ACKED(th, tp); 150 /* 151 * Grow the congestion window, if the 152 * connection is cwnd bound. 153 */ 154 if (tp->snd_cwnd < tp->snd_wnd) { 155 tp->t_bytes_acked += acked; 156 if (tp->t_bytes_acked > tp->snd_cwnd) { 157 tp->t_bytes_acked -= tp->snd_cwnd; 158 tp->snd_cwnd += tp->t_maxseg; 159 } 160 } 161} 162/* Function to process an ack. 163 */ 164void 165tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { 166 /* 167 * RFC 3465 - Appropriate Byte Counting. 168 * 169 * If the window is currently less than ssthresh, 170 * open the window by the number of bytes ACKed by 171 * the last ACK, however clamp the window increase 172 * to an upper limit "L". 173 * 174 * In congestion avoidance phase, open the window by 175 * one segment each time "bytes_acked" grows to be 176 * greater than or equal to the congestion window. 177 */ 178 179 register u_int cw = tp->snd_cwnd; 180 register u_int incr = tp->t_maxseg; 181 int acked = 0; 182 183 acked = BYTES_ACKED(th, tp); 184 if (tcp_do_rfc3465) { 185 186 if (cw >= tp->snd_ssthresh) { 187 tp->t_bytes_acked += acked; 188 if (tp->t_bytes_acked >= cw) { 189 /* Time to increase the window. */ 190 tp->t_bytes_acked -= cw; 191 } else { 192 /* No need to increase yet. */ 193 incr = 0; 194 } 195 } else { 196 /* 197 * If the user explicitly enables RFC3465 198 * use 2*SMSS for the "L" param. Otherwise 199 * use the more conservative 1*SMSS. 200 * 201 * (See RFC 3465 2.3 Choosing the Limit) 202 */ 203 uint32_t abc_lim; 204 abc_lim = (tcp_do_rfc3465_lim2 && 205 tp->snd_nxt == tp->snd_max) ? incr * 2 206 : incr; 207 208 incr = lmin(acked, abc_lim); 209 } 210 } else { 211 /* 212 * If the window gives us less than ssthresh packets 213 * in flight, open exponentially (segsz per packet). 214 * Otherwise open linearly: segsz per window 215 * (segsz^2 / cwnd per packet). 216 */ 217 218 if (cw >= tp->snd_ssthresh) 219 incr = max((incr * incr / cw), 1); 220 } 221 tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); 222} 223 224void 225tcp_newreno_pre_fr(struct tcpcb *tp) { 226 227 uint32_t win; 228 229 win = min(tp->snd_wnd, tp->snd_cwnd) / 230 2 / tp->t_maxseg; 231 if ( win < 2 ) 232 win = 2; 233 tp->snd_ssthresh = win * tp->t_maxseg; 234 tcp_cc_resize_sndbuf(tp); 235 236} 237 238void 239tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th) { 240 int32_t ss; 241 242 ss = tp->snd_max - th->th_ack; 243 244 /* 245 * Complete ack. Inflate the congestion window to 246 * ssthresh and exit fast recovery. 247 * 248 * Window inflation should have left us with approx. 249 * snd_ssthresh outstanding data. But in case we 250 * would be inclined to send a burst, better to do 251 * it via the slow start mechanism. 252 * 253 * If the flight size is zero, then make congestion 254 * window to be worth at least 2 segments to avoid 255 * delayed acknowledgement (draft-ietf-tcpm-rfc3782-bis-05). 256 */ 257 if (ss < (int32_t)tp->snd_ssthresh) 258 tp->snd_cwnd = max(ss, tp->t_maxseg) + tp->t_maxseg; 259 else 260 tp->snd_cwnd = tp->snd_ssthresh; 261 tp->t_bytes_acked = 0; 262} 263 264/* Function to change the congestion window when the retransmit 265 * timer fires. 266 */ 267void 268tcp_newreno_after_timeout(struct tcpcb *tp) { 269 /* 270 * Close the congestion window down to one segment 271 * (we'll open it by one segment for each ack we get). 272 * Since we probably have a window's worth of unacked 273 * data accumulated, this "slow start" keeps us from 274 * dumping all that data as back-to-back packets (which 275 * might overwhelm an intermediate gateway). 276 * 277 * There are two phases to the opening: Initially we 278 * open by one mss on each ack. This makes the window 279 * size increase exponentially with time. If the 280 * window is larger than the path can handle, this 281 * exponential growth results in dropped packet(s) 282 * almost immediately. To get more time between 283 * drops but still "push" the network to take advantage 284 * of improving conditions, we switch from exponential 285 * to linear window opening at some threshhold size. 286 * For a threshhold, we use half the current window 287 * size, truncated to a multiple of the mss. 288 * 289 * (the minimum cwnd that will give us exponential 290 * growth is 2 mss. We don't allow the threshhold 291 * to go below this.) 292 */ 293 if (tp->t_state >= TCPS_ESTABLISHED) { 294 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; 295 if (win < 2) 296 win = 2; 297 tp->snd_ssthresh = win * tp->t_maxseg; 298 299 tp->snd_cwnd = tp->t_maxseg; 300 tcp_cc_resize_sndbuf(tp); 301 } 302} 303 304/* 305 * Indicate whether this ack should be delayed. 306 * We can delay the ack if: 307 * - delayed acks are enabled and set to 1, same as when value is set to 2. 308 * We kept this for binary compatibility. 309 * - delayed acks are enabled and set to 2, will "ack every other packet" 310 * - if our last ack wasn't a 0-sized window. 311 * - if the peer hasn't sent us a TH_PUSH data packet (this solves 3649245). 312 * If TH_PUSH is set, take this as a clue that we need to ACK 313 * with no delay. This helps higher level protocols who won't send 314 * us more data even if the window is open because their 315 * last "segment" hasn't been ACKed 316 * - delayed acks are enabled and set to 3, will do "streaming detection" 317 * (see the comment in tcp_input.c) and 318 * - if we receive more than "maxseg_unacked" full packets in the last 100ms 319 * - if the connection is not in slow-start or idle or loss/recovery states 320 * - if those criteria aren't met, it will ack every other packet. 321 */ 322 323int 324tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th) { 325 return (tcp_cc_delay_ack(tp, th)); 326} 327 328/* Switch to newreno from a different CC. If the connection is in 329 * congestion avoidance state, it can continue to use the current 330 * congestion window because it is going to be conservative. But 331 * if the connection is in slow-start, we will halve the congestion 332 * window and let newreno work from there. 333 */ 334void 335tcp_newreno_switch_cc(struct tcpcb *tp, uint16_t old_index) { 336#pragma unused(old_index) 337 338 uint32_t cwnd = min(tp->snd_wnd, tp->snd_cwnd); 339 if (tp->snd_cwnd >= tp->snd_ssthresh) { 340 cwnd = cwnd / tp->t_maxseg; 341 } else { 342 cwnd = cwnd / 2 / tp->t_maxseg; 343 } 344 tp->snd_cwnd = max(TCP_CC_CWND_INIT_BYTES, cwnd * tp->t_maxseg); 345 346 /* Start counting bytes for RFC 3465 again */ 347 tp->t_bytes_acked = 0; 348 349 OSIncrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets); 350} 351