1/* 2 * Copyright (c) 2010-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $ 62 */ 63#include <sys/param.h> 64#include <sys/systm.h> 65#include <sys/kernel.h> 66#include <sys/protosw.h> 67#include <sys/socketvar.h> 68 69#include <net/route.h> 70#include <netinet/in.h> 71#include <netinet/in_systm.h> 72#include <netinet/ip.h> 73 74#if INET6 75#include <netinet/ip6.h> 76#endif 77#include <netinet/ip_var.h> 78#include <netinet/tcp.h> 79#include <netinet/tcp_fsm.h> 80#include <netinet/tcp_timer.h> 81#include <netinet/tcp_var.h> 82#include <netinet/tcpip.h> 83#include <netinet/tcp_cc.h> 84#include <libkern/OSAtomic.h> 85 86int tcp_newreno_init(struct tcpcb *tp); 87int tcp_newreno_cleanup(struct tcpcb *tp); 88void tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp); 89void tcp_newreno_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); 90void tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); 91void tcp_newreno_pre_fr(struct tcpcb *tp); 92void tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th); 93void tcp_newreno_after_idle(struct tcpcb *tp); 94void tcp_newreno_after_timeout(struct tcpcb *tp); 95int tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th); 96void tcp_newreno_switch_cc(struct tcpcb *tp, uint16_t old_index); 97 98struct tcp_cc_algo tcp_cc_newreno = { 99 .name = "newreno", 100 .init = tcp_newreno_init, 101 .cleanup = tcp_newreno_cleanup, 102 .cwnd_init = tcp_newreno_cwnd_init_or_reset, 103 .inseq_ack_rcvd = tcp_newreno_inseq_ack_rcvd, 104 .ack_rcvd = tcp_newreno_ack_rcvd, 105 .pre_fr = tcp_newreno_pre_fr, 106 .post_fr = tcp_newreno_post_fr, 107 .after_idle = tcp_newreno_cwnd_init_or_reset, 108 .after_timeout = tcp_newreno_after_timeout, 109 .delay_ack = tcp_newreno_delay_ack, 110 .switch_to = tcp_newreno_switch_cc 111}; 112 113extern int tcp_do_rfc3465; 114extern int tcp_do_rfc3465_lim2; 115extern int maxseg_unacked; 116extern u_int32_t tcp_autosndbuf_max; 117 118#define SET_SNDSB_IDEAL_SIZE(sndsb, size) \ 119 sndsb->sb_idealsize = min(max(tcp_sendspace, tp->snd_ssthresh), \ 120 tcp_autosndbuf_max); 121 122void tcp_cc_resize_sndbuf(struct tcpcb *tp) { 123 struct sockbuf *sb; 124 /* If the send socket buffer size is bigger than ssthresh, 125 * it is time to trim it because we do not want to hold 126 * too many mbufs in the socket buffer 127 */ 128 sb = &(tp->t_inpcb->inp_socket->so_snd); 129 if (sb->sb_hiwat > tp->snd_ssthresh && 130 (sb->sb_flags & SB_AUTOSIZE) != 0) { 131 if (sb->sb_idealsize > tp->snd_ssthresh) { 132 SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); 133 } 134 sb->sb_flags |= SB_TRIM; 135 } 136} 137 138void tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp) { 139 struct sockbuf *sb; 140 sb = &(tp->t_inpcb->inp_socket->so_snd); 141 if ((sb->sb_flags & (SB_TRIM|SB_AUTOSIZE)) == (SB_TRIM|SB_AUTOSIZE)) { 142 /* If there was a retransmission that was not necessary 143 * then the size of socket buffer can be restored to 144 * what it was before 145 */ 146 SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); 147 if (sb->sb_hiwat <= sb->sb_idealsize) { 148 sbreserve(sb, sb->sb_idealsize); 149 sb->sb_flags &= ~SB_TRIM; 150 } 151 } 152} 153 154int tcp_newreno_init(struct tcpcb *tp) { 155#pragma unused(tp) 156 OSIncrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets); 157 return 0; 158} 159 160int tcp_newreno_cleanup(struct tcpcb *tp) { 161#pragma unused(tp) 162 OSDecrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets); 163 return 0; 164} 165 166/* Initialize the congestion window for a connection or 167 * handles connections that have been idle for 168 * some time. In this state, no acks are 169 * expected to clock out any data we send -- 170 * slow start to get ack "clock" running again. 171 * 172 * Set the slow-start flight size depending on whether 173 * this is a local network or not. 174 */ 175void 176tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp) { 177 if ( tp->t_flags & TF_LOCAL ) 178 tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; 179 else { 180 /* Calculate initial cwnd according to RFC3390, 181 * - On a standard link, this will result in a higher cwnd 182 * and improve initial transfer rate. 183 * - Keep the old ss_fltsz sysctl for ABI compabitility issues. 184 * but it will be overriden if tcp_do_rfc3390 sysctl is set. 185 */ 186 187 if (tcp_do_rfc3390) 188 tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); 189 190 else 191 tp->snd_cwnd = tp->t_maxseg * ss_fltsz; 192 } 193} 194 195 196/* Function to handle an in-sequence ack during congestion avoidance phase. 197 * This will get called from header prediction code. 198 */ 199void 200tcp_newreno_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { 201 int acked = 0; 202 acked = th->th_ack - tp->snd_una; 203 /* 204 * Grow the congestion window, if the 205 * connection is cwnd bound. 206 */ 207 if (tp->snd_cwnd < tp->snd_wnd) { 208 tp->t_bytes_acked += acked; 209 if (tp->t_bytes_acked > tp->snd_cwnd) { 210 tp->t_bytes_acked -= tp->snd_cwnd; 211 tp->snd_cwnd += tp->t_maxseg; 212 } 213 } 214} 215/* Function to process an ack. 216 */ 217void 218tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { 219 /* 220 * RFC 3465 - Appropriate Byte Counting. 221 * 222 * If the window is currently less than ssthresh, 223 * open the window by the number of bytes ACKed by 224 * the last ACK, however clamp the window increase 225 * to an upper limit "L". 226 * 227 * In congestion avoidance phase, open the window by 228 * one segment each time "bytes_acked" grows to be 229 * greater than or equal to the congestion window. 230 */ 231 232 register u_int cw = tp->snd_cwnd; 233 register u_int incr = tp->t_maxseg; 234 int acked = 0; 235 236 acked = th->th_ack - tp->snd_una; 237 if (tcp_do_rfc3465) { 238 239 if (cw >= tp->snd_ssthresh) { 240 tp->t_bytes_acked += acked; 241 if (tp->t_bytes_acked >= cw) { 242 /* Time to increase the window. */ 243 tp->t_bytes_acked -= cw; 244 } else { 245 /* No need to increase yet. */ 246 incr = 0; 247 } 248 } else { 249 /* 250 * If the user explicitly enables RFC3465 251 * use 2*SMSS for the "L" param. Otherwise 252 * use the more conservative 1*SMSS. 253 * 254 * (See RFC 3465 2.3 Choosing the Limit) 255 */ 256 u_int abc_lim; 257 258 abc_lim = (tcp_do_rfc3465_lim2 && 259 tp->snd_nxt == tp->snd_max) ? incr * 2 : incr; 260 261 incr = lmin(acked, abc_lim); 262 } 263 } else { 264 /* 265 * If the window gives us less than ssthresh packets 266 * in flight, open exponentially (segsz per packet). 267 * Otherwise open linearly: segsz per window 268 * (segsz^2 / cwnd per packet). 269 */ 270 271 if (cw >= tp->snd_ssthresh) 272 incr = max((incr * incr / cw), 1); 273 } 274 tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); 275} 276 277void 278tcp_newreno_pre_fr(struct tcpcb *tp) { 279 280 uint32_t win; 281 282 win = min(tp->snd_wnd, tp->snd_cwnd) / 283 2 / tp->t_maxseg; 284 if ( win < 2 ) 285 win = 2; 286 tp->snd_ssthresh = win * tp->t_maxseg; 287 tcp_cc_resize_sndbuf(tp); 288 289} 290 291void 292tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th) { 293 int32_t ss; 294 295 ss = tp->snd_max - th->th_ack; 296 297 /* 298 * Complete ack. Inflate the congestion window to 299 * ssthresh and exit fast recovery. 300 * 301 * Window inflation should have left us with approx. 302 * snd_ssthresh outstanding data. But in case we 303 * would be inclined to send a burst, better to do 304 * it via the slow start mechanism. 305 */ 306 if (ss < (int32_t)tp->snd_ssthresh) 307 tp->snd_cwnd = ss + tp->t_maxseg; 308 else 309 tp->snd_cwnd = tp->snd_ssthresh; 310 tp->t_bytes_acked = 0; 311} 312 313/* Function to change the congestion window when the retransmit 314 * timer fires. 315 */ 316void 317tcp_newreno_after_timeout(struct tcpcb *tp) { 318 /* 319 * Close the congestion window down to one segment 320 * (we'll open it by one segment for each ack we get). 321 * Since we probably have a window's worth of unacked 322 * data accumulated, this "slow start" keeps us from 323 * dumping all that data as back-to-back packets (which 324 * might overwhelm an intermediate gateway). 325 * 326 * There are two phases to the opening: Initially we 327 * open by one mss on each ack. This makes the window 328 * size increase exponentially with time. If the 329 * window is larger than the path can handle, this 330 * exponential growth results in dropped packet(s) 331 * almost immediately. To get more time between 332 * drops but still "push" the network to take advantage 333 * of improving conditions, we switch from exponential 334 * to linear window opening at some threshhold size. 335 * For a threshhold, we use half the current window 336 * size, truncated to a multiple of the mss. 337 * 338 * (the minimum cwnd that will give us exponential 339 * growth is 2 mss. We don't allow the threshhold 340 * to go below this.) 341 */ 342 if (tp->t_state >= TCPS_ESTABLISHED) { 343 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; 344 if (win < 2) 345 win = 2; 346 tp->snd_cwnd = tp->t_maxseg; 347 tp->snd_ssthresh = win * tp->t_maxseg; 348 tp->t_bytes_acked = 0; 349 tp->t_dupacks = 0; 350 351 tcp_cc_resize_sndbuf(tp); 352 } 353} 354 355/* 356 * Indicate whether this ack should be delayed. 357 * We can delay the ack if: 358 * - delayed acks are enabled and set to 1, same as when value is set to 2. 359 * We kept this for binary compatibility. 360 * - delayed acks are enabled and set to 2, will "ack every other packet" 361 * - if our last ack wasn't a 0-sized window. 362 * - if the peer hasn't sent us a TH_PUSH data packet (this solves 3649245). 363 * If TH_PUSH is set, take this as a clue that we need to ACK 364 * with no delay. This helps higher level protocols who won't send 365 * us more data even if the window is open because their 366 * last "segment" hasn't been ACKed 367 * - delayed acks are enabled and set to 3, will do "streaming detection" 368 * (see the comment in tcp_input.c) and 369 * - if we receive more than "maxseg_unacked" full packets in the last 100ms 370 * - if the connection is not in slow-start or idle or loss/recovery states 371 * - if those criteria aren't met, it will ack every other packet. 372 */ 373 374int 375tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th) { 376 switch (tcp_delack_enabled) { 377 case 1: 378 case 2: 379 if ((tp->t_flags & TF_RXWIN0SENT) == 0 && 380 (th->th_flags & TH_PUSH) == 0 && 381 (tp->t_unacksegs == 1)) 382 return(1); 383 break; 384 case 3: 385 if ((tp->t_flags & TF_RXWIN0SENT) == 0 && 386 (th->th_flags & TH_PUSH) == 0 && 387 ((tp->t_unacksegs == 1) || 388 ((tp->t_flags & TF_STRETCHACK) != 0 && 389 tp->t_unacksegs < (maxseg_unacked)))) 390 return(1); 391 break; 392 } 393 return(0); 394} 395 396/* Switch to newreno from a different CC. If the connection is in 397 * congestion avoidance state, it can continue to use the current 398 * congestion window because it is going to be conservative. But 399 * if the connection is in slow-start, we will halve the congestion 400 * window and let newreno work from there. 401 */ 402void 403tcp_newreno_switch_cc(struct tcpcb *tp, uint16_t old_index) { 404#pragma unused(old_index) 405 406 uint32_t cwnd = min(tp->snd_wnd, tp->snd_cwnd); 407 if (tp->snd_cwnd >= tp->snd_ssthresh) { 408 cwnd = cwnd / tp->t_maxseg; 409 } else { 410 cwnd = cwnd / 2 / tp->t_maxseg; 411 } 412 if (cwnd < 1) 413 cwnd = 1; 414 tp->snd_cwnd = cwnd * tp->t_maxseg; 415 416 /* Start counting bytes for RFC 3465 again */ 417 tp->t_bytes_acked = 0; 418 419 OSIncrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets); 420} 421