1/* 2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. 
Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $ 62 */ 63/* 64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce 65 * support for mandatory and extensible security protections. This notice 66 * is included in support of clause 2.2 (b) of the Apple Public License, 67 * Version 2.0. 
68 */ 69 70#include <sys/param.h> 71#include <sys/systm.h> 72#include <sys/kernel.h> 73#include <sys/sysctl.h> 74#include <sys/malloc.h> 75#include <sys/mbuf.h> 76#include <sys/proc.h> /* for proc0 declaration */ 77#include <sys/protosw.h> 78#include <sys/socket.h> 79#include <sys/socketvar.h> 80#include <sys/syslog.h> 81#include <sys/mcache.h> 82#include <sys/kasl.h> 83#include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */ 84 85#include <machine/endian.h> 86 87#include <net/if.h> 88#include <net/if_types.h> 89#include <net/route.h> 90#include <net/ntstat.h> 91#include <net/dlil.h> 92 93#include <netinet/in.h> 94#include <netinet/in_systm.h> 95#include <netinet/ip.h> 96#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */ 97#include <netinet/in_var.h> 98#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 99#include <netinet/in_pcb.h> 100#include <netinet/ip_var.h> 101#include <mach/sdt.h> 102#if INET6 103#include <netinet/ip6.h> 104#include <netinet/icmp6.h> 105#include <netinet6/nd6.h> 106#include <netinet6/ip6_var.h> 107#include <netinet6/in6_pcb.h> 108#endif 109#include <netinet/tcp.h> 110#include <netinet/tcp_fsm.h> 111#include <netinet/tcp_seq.h> 112#include <netinet/tcp_timer.h> 113#include <netinet/tcp_var.h> 114#include <netinet/tcp_cc.h> 115#include <dev/random/randomdev.h> 116#include <kern/zalloc.h> 117#if INET6 118#include <netinet6/tcp6_var.h> 119#endif 120#include <netinet/tcpip.h> 121#if TCPDEBUG 122#include <netinet/tcp_debug.h> 123u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */ 124struct tcphdr tcp_savetcp; 125#endif /* TCPDEBUG */ 126 127#if IPSEC 128#include <netinet6/ipsec.h> 129#if INET6 130#include <netinet6/ipsec6.h> 131#endif 132#include <netkey/key.h> 133#endif /*IPSEC*/ 134 135#if CONFIG_MACF_NET || CONFIG_MACF_SOCKET 136#include <security/mac_framework.h> 137#endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */ 138 139#include <sys/kdebug.h> 140#include <netinet/lro_ext.h> 141#if MPTCP 142#include 
<netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_opt.h>
#endif /* MPTCP */

/* KERNEL_DEBUG trace codes for the TCP layer (see KERNEL_DEBUG calls below) */
#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
#define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
#define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
#define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))

tcp_cc tcp_ccgen;

/* Global TCP statistics, exported via the tcpstat counters below */
struct tcpstat tcpstat;

/*
 * Tunables exported through the net.inet.tcp sysctl tree.  Each variable
 * is paired with the SYSCTL_* declaration that publishes it.
 */
static int log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED,
	&log_in_vain, 0, "Log all incoming TCP connections");

static int blackhole = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED,
	&blackhole, 0, "Do not send RST when dropping refused connections");

int tcp_delack_enabled = 3;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_delack_enabled, 0,
	"Delay ACK to try and piggyback it onto a data packet");

int tcp_lq_overflow = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_lq_overflow, 0,
	"Listen Queue Overflow");

int tcp_recv_bg = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_recv_bg, 0,
	"Receive background");

#if TCP_DROP_SYNFIN
static int drop_synfin = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW | CTLFLAG_LOCKED,
	&drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
#endif

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
	"TCP Segment Reassembly Queue");

static int tcp_reass_overflows = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD | CTLFLAG_LOCKED,
	&tcp_reass_overflows, 0,
	"Global number of TCP Segment Reassembly Queue Overflows");


__private_extern__ int slowlink_wsize = 8192;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED,
	&slowlink_wsize, 0,
"Maximum advertised window size for slowlink");

int maxseg_unacked = 8;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW | CTLFLAG_LOCKED,
	&maxseg_unacked, 0, "Maximum number of outstanding segments left unacked");

int tcp_do_rfc3465 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_do_rfc3465, 0, "");

int tcp_do_rfc3465_lim2 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_do_rfc3465_lim2, 0, "Appropriate bytes counting w/ L=2*SMSS");

int rtt_samples_per_slot = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_samples_per_slot, CTLFLAG_RW | CTLFLAG_LOCKED,
	&rtt_samples_per_slot, 0, "Number of RTT samples stored for rtt history");

/* Inter-packet arrival jitter (IAJ) tunables, consumed by compute_iaj_meat() */
int tcp_allowed_iaj = ALLOWED_IAJ;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_allowed_iaj, 0, "Allowed inter-packet arrival jiter");

int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_acc_iaj_high_thresh, 0, "Used in calculating maximum accumulated IAJ");

/* Receive-buffer auto-tuning knobs, consumed by tcp_sbrcv_grow()/_trim() */
u_int32_t tcp_do_autorcvbuf = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautorcvbuf, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_do_autorcvbuf, 0, "Enable automatic socket buffer tuning");

u_int32_t tcp_autorcvbuf_inc_shift = 3;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufincshift, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_autorcvbuf_inc_shift, 0, "Shift for increment in receive socket buffer size");

u_int32_t tcp_autorcvbuf_max = 512 * 1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmax, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_autorcvbuf_max, 0, "Maximum receive socket buffer size");

/* Software large-receive-offload (LRO) controls */
int sw_lro = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sw_lro, 0, "Used to coalesce TCP packets");

int lrodebug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lrodbg, CTLFLAG_RW | CTLFLAG_LOCKED,
	&lrodebug, 0, "Used to debug SW LRO");

int lro_start = 4;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_startcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
	&lro_start, 0, "Segments for starting LRO computed as power of 2");

extern int tcp_do_autosendbuf;

int limited_txmt = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, limited_transmit, CTLFLAG_RW | CTLFLAG_LOCKED,
	&limited_txmt, 0, "Enable limited transmit");

int early_rexmt = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, early_rexmt, CTLFLAG_RW | CTLFLAG_LOCKED,
	&early_rexmt, 0, "Enable Early Retransmit");

int sack_ackadv = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_ackadv, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sack_ackadv, 0, "Use SACK with cumulative ack advancement as a dupack");

#if CONFIG_IFEF_NOWINDOWSCALE
int tcp_obey_ifef_nowindowscale = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_obey_ifef_nowindowscale, 0, "");
#endif

extern int tcp_TCPTV_MIN;
extern int tcp_acc_iaj_high;
extern int tcp_acc_iaj_react_limit;
extern struct zone *tcp_reass_zone;

int tcprexmtthresh = 3;

u_int32_t tcp_now;
struct timeval tcp_uptime;	/* uptime when tcp_now was last updated */
lck_spin_t *tcp_uptime_lock;	/* Used to sychronize updates to tcp_now */

struct inpcbhead tcb;
#define tcb6 tcb  /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;

/* Forward declarations for the static helpers defined later in this file */
static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
    struct tcpopt *, unsigned int);
static void tcp_pulloutofband(struct socket *,
    struct tcphdr *, struct mbuf *, int);
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *,
    struct ifnet *);
static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
static inline unsigned int tcp_maxmtu(struct rtentry *);
static inline int
tcp_stretch_ack_enable(struct tcpcb *tp);
static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);

#if TRAFFIC_MGT
static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen,
	int reset_size);
void compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor);
static void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
#endif /* TRAFFIC_MGT */

#if INET6
static inline unsigned int tcp_maxmtu6(struct rtentry *);
#endif

static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb,
	struct tcpopt *to, u_int32_t tlen);

void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb,
	u_int32_t newsize, u_int32_t idealsize);
static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th);
static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to,
	struct tcphdr *th);
static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th);
static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th,
	struct tcpopt *to);
/*
 * Constants used for resizing receive socket buffer
 * when timestamps are not supported
 */
#define TCPTV_RCVNOTS_QUANTUM 100
#define TCP_RCVNOTS_BYTELEVEL 204800

/*
 * Constants used for limiting early retransmits
 * to 10 per minute.
 */
#define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
#define TCP_EARLY_REXMT_LIMIT 10

extern void ipfwsyslog( int level, const char *format,...);
extern int fw_verbose;

#if IPFIREWALL
extern void ipfw_stealth_stats_incr_tcp(void);

/*
 * Route "connection attempt in vain" log messages either to ipfw's
 * logging/stealth accounting (modes 3 and 4 with fw_verbose == 2) or
 * to the regular kernel log.
 */
#define log_in_vain_log( a ) { \
	if ( (log_in_vain == 3 ) && (fw_verbose == 2)) { /* Apple logging, log to ipfw.log */ \
		ipfwsyslog a ; \
	} else if ( (log_in_vain == 4 ) && (fw_verbose == 2)) { \
		ipfw_stealth_stats_incr_tcp(); \
	} \
	else log a ; \
}
#else
#define log_in_vain_log( a ) { log a; }
#endif

int tcp_rcvunackwin = TCPTV_UNACKWIN;
int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
	&tcp_rcvsspktcnt, 0, "packets to be seen before receiver stretches acks");

/* Delegate the delayed-ACK decision to the connection's CC module hook */
#define DELAY_ACK(tp, th) \
	(CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))

static int tcp_dropdropablreq(struct socket *head);
static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);

static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
void tcp_set_background_cc(struct socket *so);
void tcp_set_foreground_cc(struct socket *so);
static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
static void tcp_bwmeas_check(struct tcpcb *tp);

#if TRAFFIC_MGT
/*
 * Clear all accumulated inter-packet arrival jitter (IAJ) state on tp.
 */
void
reset_acc_iaj(struct tcpcb *tp)
{
	tp->acc_iaj = 0;
	tp->iaj_rwintop = 0;
	CLEAR_IAJ_STATE(tp);
}

/*
 * Record the size and arrival timestamp of the latest packet used for
 * IAJ tracking.  A non-zero rst_size forces the tracked size to be
 * re-learned from this packet.
 */
static inline void
update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
{
	if (rst_size > 0)
		tp->iaj_size = 0;
	/* Track only packets at least as large as the current reference size */
	if (tp->iaj_size == 0 || size >= tp->iaj_size) {
		tp->iaj_size = size;
		tp->iaj_rcv_ts = tcp_now;
		tp->iaj_small_pkt = 0;
	}
}

/* For every 32 bit unsigned integer(v), this function will find the
 * largest integer n such that (n*n <=
 v). This takes at most 16 iterations
 * irrespective of the value of v and does not involve multiplications.
 */
static inline int
isqrt(unsigned int val) {
	/* Precomputed squares 0..10 handle the common small-value case */
	unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
	unsigned int temp, g=0, b=0x8000, bshft=15;
	if ( val <= 100) {
		/* Linear scan of the cache; back up one when we overshoot */
		for (g = 0; g <= 10; ++g) {
			if (sqrt_cache[g] > val) {
				g--;
				break;
			} else if (sqrt_cache[g] == val) {
				break;
			}
		}
	} else {
		/*
		 * Bit-by-bit (restoring) square root: try each result bit
		 * b from the top down, accepting it when the candidate
		 * square still fits in the remaining value.  Uses only
		 * shifts and adds.
		 */
		do {
			temp = (((g << 1) + b) << (bshft--));
			if (val >= temp) {
				g += b;
				val -= temp;
			}
			b >>= 1;
		} while ( b > 0 && val > 0);
	}
	return(g);
}

/*
* With LRO, roughly estimate the inter arrival time between
* each sub coalesced packet as an average. Count the delay
* cur_iaj to be the delay between the last packet received
* and the first packet of the LRO stream. Due to round off errors
* cur_iaj may be the same as lro_delay_factor. Averaging has
* round off errors too. lro_delay_factor may be close to 0
* in steady state leading to lower values fed to compute_iaj_meat.
426*/ 427void 428compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor) 429{ 430 uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts; 431 uint32_t timediff = 0; 432 433 if (cur_iaj >= lro_delay_factor) { 434 cur_iaj = cur_iaj - lro_delay_factor; 435 } 436 437 compute_iaj_meat(tp, cur_iaj); 438 439 if (nlropkts <= 1) 440 return; 441 442 nlropkts--; 443 444 timediff = lro_delay_factor/nlropkts; 445 446 while (nlropkts > 0) 447 { 448 compute_iaj_meat(tp, timediff); 449 nlropkts--; 450 } 451} 452 453static 454void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj) 455{ 456 /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds, 457 * throttle the receive window to a minimum of MIN_IAJ_WIN packets 458 */ 459#define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit) 460#define IAJ_DIV_SHIFT 4 461#define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1)) 462 463 uint32_t allowed_iaj, acc_iaj = 0; 464 465 uint32_t mean, temp; 466 int32_t cur_iaj_dev; 467 468 cur_iaj_dev = (cur_iaj - tp->avg_iaj); 469 470 /* Allow a jitter of "allowed_iaj" milliseconds. Some connections 471 * may have a constant jitter more than that. We detect this by 472 * using standard deviation. 473 */ 474 allowed_iaj = tp->avg_iaj + tp->std_dev_iaj; 475 if (allowed_iaj < tcp_allowed_iaj) 476 allowed_iaj = tcp_allowed_iaj; 477 478 /* Initially when the connection starts, the senders congestion 479 * window is small. During this period we avoid throttling a 480 * connection because we do not have a good starting point for 481 * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over 482 * the first few packets. 
483 */ 484 if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) { 485 if ( cur_iaj <= allowed_iaj ) { 486 if (tp->acc_iaj >= 2) 487 acc_iaj = tp->acc_iaj - 2; 488 else 489 acc_iaj = 0; 490 491 } else { 492 acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj); 493 } 494 495 if (acc_iaj > MAX_ACC_IAJ) 496 acc_iaj = MAX_ACC_IAJ; 497 tp->acc_iaj = acc_iaj; 498 } 499 500 /* Compute weighted average where the history has a weight of 501 * 15 out of 16 and the current value has a weight of 1 out of 16. 502 * This will make the short-term measurements have more weight. 503 * 504 * The addition of 8 will help to round-up the value 505 * instead of round-down 506 */ 507 tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj) 508 + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT; 509 510 /* Compute Root-mean-square of deviation where mean is a weighted 511 * average as described above. 512 */ 513 temp = tp->std_dev_iaj * tp->std_dev_iaj; 514 mean = (((temp << IAJ_DIV_SHIFT) - temp) 515 + (cur_iaj_dev * cur_iaj_dev) 516 + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT; 517 518 tp->std_dev_iaj = isqrt(mean); 519 520 DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj, 521 uint32_t, allowed_iaj); 522 523 return; 524} 525#endif /* TRAFFIC_MGT */ 526 527/* Check if enough amount of data has been acknowledged since 528 * bw measurement was started 529 */ 530static void 531tcp_bwmeas_check(struct tcpcb *tp) 532{ 533 int32_t bw_meas_bytes; 534 uint32_t bw, bytes, elapsed_time; 535 bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start; 536 if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) != 0 && 537 bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) { 538 bytes = bw_meas_bytes; 539 elapsed_time = tcp_now - tp->t_bwmeas->bw_ts; 540 if (elapsed_time > 0) { 541 bw = bytes / elapsed_time; 542 if ( bw > 0) { 543 if (tp->t_bwmeas->bw_sndbw > 0) { 544 tp->t_bwmeas->bw_sndbw = 545 (((tp->t_bwmeas->bw_sndbw << 3) - tp->t_bwmeas->bw_sndbw) + bw) >> 3; 546 } else { 547 tp->t_bwmeas->bw_sndbw = bw; 548 } 549 } 550 } 551 
tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS); 552 } 553} 554 555static int 556tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, 557 struct ifnet *ifp) 558{ 559 struct tseg_qent *q; 560 struct tseg_qent *p = NULL; 561 struct tseg_qent *nq; 562 struct tseg_qent *te = NULL; 563 struct inpcb *inp = tp->t_inpcb; 564 struct socket *so = inp->inp_socket; 565 int flags = 0; 566 int dowakeup = 0; 567 struct mbuf *oodata = NULL; 568 int copy_oodata = 0; 569 u_int16_t qlimit; 570 boolean_t cell = IFNET_IS_CELLULAR(ifp); 571 boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp)); 572 boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp)); 573 574 /* 575 * Call with th==0 after become established to 576 * force pre-ESTABLISHED data up to user socket. 577 */ 578 if (th == NULL) 579 goto present; 580 581 /* 582 * If the reassembly queue already has entries or if we are going 583 * to add a new one, then the connection has reached a loss state. 584 * Reset the stretch-ack algorithm at this point. 585 */ 586 tcp_reset_stretch_ack(tp); 587 588#if TRAFFIC_MGT 589 if (tp->acc_iaj > 0) 590 reset_acc_iaj(tp); 591#endif /* TRAFFIC_MGT */ 592 593 /* 594 * Limit the number of segments in the reassembly queue to prevent 595 * holding on to too many segments (and thus running out of mbufs). 596 * Make sure to let the missing segment through which caused this 597 * queue. Always keep one global queue entry spare to be able to 598 * process the missing segment. 599 */ 600 qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10), 601 tcp_autorcvbuf_max >> 10); 602 if (th->th_seq != tp->rcv_nxt && 603 (tp->t_reassqlen + 1) >= qlimit) { 604 tcp_reass_overflows++; 605 tcpstat.tcps_rcvmemdrop++; 606 m_freem(m); 607 *tlenp = 0; 608 return (0); 609 } 610 611 /* Allocate a new queue entry. If we can't, just drop the pkt. 
XXX */
	te = (struct tseg_qent *) zalloc(tcp_reass_zone);
	if (te == NULL) {
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		return (0);
	}
	tp->t_reassqlen++;

	/*
	 * Find a segment which begins after this one does.
	 * p is left pointing at the last segment that begins at or
	 * before th_seq (our predecessor in the queue, if any).
	 */
	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
			break;
		p = q;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already. If so, drop the data from the incoming
	 * segment. If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		register int i;
		/* conversion to int (in i) handles seq wraparound */
		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
		if (i > 0) {
			if (i >= *tlenp) {
				/* Fully duplicated by the predecessor: count and drop */
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlenp;
				if (nstat_collect) {
					nstat_route_rx(inp->inp_route.ro_rt,
					    1, *tlenp,
					    NSTAT_RX_FLAG_DUPLICATE);
					INP_ADD_STAT(inp, cell, wifi, wired,
					    rxpackets, 1);
					INP_ADD_STAT(inp, cell, wifi, wired,
					    rxbytes, *tlenp);
					tp->t_stat.rxduplicatebytes += *tlenp;
				}
				m_freem(m);
				zfree(tcp_reass_zone, te);
				te = NULL;
				tp->t_reassqlen--;
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto present;
			}
			/* Partial overlap: trim the duplicated prefix */
			m_adj(m, i);
			*tlenp -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlenp;
	if (nstat_collect) {
		nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
		    NSTAT_RX_FLAG_OUT_OF_ORDER);
		INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
		INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp);
		tp->t_stat.rxoutoforderbytes += *tlenp;
	}

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (q) {
		register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
		if (i <= 0)
			break;
		if (i < q->tqe_len) {
			/* Successor only partially covered: trim its front */
			q->tqe_th->th_seq += i;
			q->tqe_len -= i;
			m_adj(q->tqe_m, i);
			break;
		}

		/* Successor fully covered: unlink and free it */
		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		zfree(tcp_reass_zone, q);
		tp->t_reassqlen--;
		q = nq;
	}

	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_th = th;
	te->tqe_len = *tlenp;

	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
	} else {
		LIST_INSERT_AFTER(p, te, tqe_q);
	}

	/*
	 * New out-of-order data exists, and is pointed to by
	 * queue entry te. Set copy_oodata to 1 so out-of-order data
	 * can be copied off to sockbuf after in-order data
	 * is copied off.
	 */
	if (!(so->so_state & SS_CANTRCVMORE))
		copy_oodata = 1;

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);
	q = LIST_FIRST(&tp->t_segq);
	if (!q || q->tqe_th->th_seq != tp->rcv_nxt) {
		/* Stop using LRO once out of order packets arrive */
		if (tp->t_flagsext & TF_LRO_OFFLOADED) {
			tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
			    th->th_dport, th->th_sport);
			tp->t_flagsext &= ~TF_LRO_OFFLOADED;
		}

		/*
		 * continue processing if out-of-order data
		 * can be delivered
		 */
		if (q && (so->so_flags & SOF_ENABLE_MSGS))
			goto msg_unordered_delivery;

		return (0);
	}

	/* lost packet was recovered, so ooo data can be returned */
	tcpstat.tcps_recovered_pkts++;

	/* Deliver each contiguous segment at the left window edge */
	do {
		tp->rcv_nxt += q->tqe_len;
		flags = q->tqe_th->th_flags & TH_FIN;
		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		if (so->so_state & SS_CANTRCVMORE) {
			m_freem(q->tqe_m);
		} else {
			so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
			if (so->so_flags & SOF_ENABLE_MSGS) {
				/*
				 * Append the inorder data as a message to the
				 * receive socket buffer. Also check to see if
				 * the data we are about to deliver is the same
				 * data that we wanted to pass up to the user
				 * out of order. If so, reset copy_oodata --
				 * the received data filled a gap, and
				 * is now in order!
				 */
				if (q == te)
					copy_oodata = 0;
			}
			if (sbappendstream_rcvdemux(so, q->tqe_m,
			    q->tqe_th->th_seq - (tp->irs + 1), 0))
				dowakeup = 1;
			if (tp->t_flagsext & TF_LRO_OFFLOADED) {
				tcp_update_lro_seq(tp->rcv_nxt,
				    inp->inp_laddr, inp->inp_faddr,
				    th->th_dport, th->th_sport);
			}
		}
		zfree(tcp_reass_zone, q);
		tp->t_reassqlen--;
		q = nq;
	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);

#if INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {

		KERNEL_DEBUG(DBG_LAYER_BEG,
		    ((inp->inp_fport << 16) | inp->inp_lport),
		    (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
		    (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
		    0,0,0);
	}
	else
#endif
	{
		KERNEL_DEBUG(DBG_LAYER_BEG,
		    ((inp->inp_fport << 16) | inp->inp_lport),
		    (((inp->inp_laddr.s_addr & 0xffff) << 16) |
		    (inp->inp_faddr.s_addr & 0xffff)),
		    0,0,0);
	}

msg_unordered_delivery:
	/* Deliver out-of-order data as a message */
	if (te && (so->so_flags & SOF_ENABLE_MSGS) && copy_oodata && te->tqe_len) {
		/*
		 * make a copy of the mbuf to be delivered up to
		 * the user, and add it to the sockbuf
		 */
		oodata = m_copym(te->tqe_m, 0, M_COPYALL, M_DONTWAIT);
		if (oodata != NULL) {
			if (sbappendmsgstream_rcv(&so->so_rcv, oodata,
			    te->tqe_th->th_seq - (tp->irs + 1), 1)) {
				dowakeup = 1;
				tcpstat.tcps_msg_unopkts++;
			} else {
				tcpstat.tcps_msg_unoappendfail++;
			}
		}
	}

	if (dowakeup)
		sorwakeup(so); /* done with socket lock held */
	return (flags);
}

/*
 * Reduce congestion window.
 */
static void
tcp_reduce_congestion_window(
	struct tcpcb *tp)
{
	/*
	 * If the current tcp cc module has
	 * defined a hook for tasks to run
	 * before entering FR, call it
	 */
	if (CC_ALGO(tp)->pre_fr != NULL)
		CC_ALGO(tp)->pre_fr(tp);
	/* Enter fast recovery: cancel rexmt/probe timers and shrink cwnd
	 * to ssthresh plus tcprexmtthresh segments. */
	ENTER_FASTRECOVERY(tp);
	tp->snd_recover = tp->snd_max;
	tp->t_timer[TCPT_REXMT] = 0;
	tp->t_timer[TCPT_PTO] = 0;
	tp->t_rtttime = 0;
	tp->snd_cwnd = tp->snd_ssthresh +
	    tp->t_maxseg * tcprexmtthresh;
}

/*
 * The application wants to get an event if there
 * is a stall during read. Set the initial keepalive
 * timeout to be equal to twice RTO.
 */
static inline void
tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)
{
	if (tp->t_adaptive_rtimo > 0 && tlen > 0 &&
	    tp->t_state == TCPS_ESTABLISHED) {
		tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
		    (TCP_REXMTVAL(tp) << 1));
		tp->t_flagsext |= TF_DETECT_READSTALL;
		tp->t_rtimo_probes = 0;
	}
}

/*
 * Re-arm the keepalive timer to the connection's idle interval and
 * clear any read-stall detection state set above.
 */
inline void
tcp_keepalive_reset(struct tcpcb *tp)
{
	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
	    TCP_CONN_KEEPIDLE(tp));
	tp->t_flagsext &= ~(TF_DETECT_READSTALL);
	tp->t_rtimo_probes = 0;
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
#if INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
#pragma unused(proto)
	register struct mbuf *m = *mp;
	uint32_t ia6_flags;
	struct ifnet *ifp = m->m_pkthdr.rcvif;

	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);

	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
 */
	/* Reject TCP to an anycast destination with an ICMPv6 error */
	if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
		if (ia6_flags & IN6_IFF_ANYCAST) {
			struct ip6_hdr *ip6;

			ip6 = mtod(m, struct ip6_hdr *);
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);

			IF_TCP_STATINC(ifp, icmp6unreach);

			return (IPPROTO_DONE);
		}
	}

	tcp_input(m, *offp);
	return (IPPROTO_DONE);
}
#endif

/* Depending on the usage of mbuf space in the system, this function
 * will return true or false. This is used to determine if a socket
 * buffer can take more memory from the system for auto-tuning or not.
 */
u_int8_t
tcp_cansbgrow(struct sockbuf *sb)
{
	/* Calculate the host level space limit in terms of MSIZE buffers.
	 * We can use a maximum of half of the available mbuf space for
	 * socket buffers.
	 */
	u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));

	/* Calculate per sb limit in terms of bytes. We optimize this limit
	 * for upto 16 socket buffers.
	 */

	u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);

	if ((total_sbmb_cnt < mblim) &&
	    (sb->sb_hiwat < sbspacelim)) {
		return(1);
	} else {
		/* Over budget: count the refusal for diagnostics */
		OSIncrementAtomic64(&sbmb_limreached);
	}
	return(0);
}

/*
 * Raise the receive buffer high-water mark to newsize (clamped by the
 * global maximum and the negotiated window scale) and track the ideal
 * size the auto-tuner is aiming for.
 */
static void
tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize)
{

	/* newsize should not exceed max */
	newsize = min(newsize, tcp_autorcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ?
idealsize : newsize),
		    tcp_autorcvbuf_max);

		/* Again check the limit set by the advertised
		 * window scale
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << tp->rcv_scale);
	}
}

/*
 * This function is used to grow a receive socket buffer. It
 * will take into account system-level memory usage and the
 * bandwidth available on the link to make a decision.
 */
static void
tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
	struct tcpopt *to, u_int32_t pktlen) {

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 * throttled
	 * - if there are segments in reassembly queue indicating loss,
	 * do not need to increase recv window during recovery as more
	 * data is not going to be sent. A duplicate ack sent during
	 * recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    tcp_cansbgrow(sbrcv) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (tp->t_flagsext & TF_RECV_THROTTLE) ||
	    !LIST_EMPTY(&tp->t_segq)) {
		/* Can not resize the socket buffer, just return */
		goto out;
	}

	if (TSTMP_GT(tcp_now,
	    tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) {
		/* If there has been an idle period in the
		 * connection, just restart the measurement
		 */
		goto out;
	}

	if (!TSTMP_SUPPORTED(tp)) {
		/*
		 * Timestamp option is not supported on this connection.
		 * If the connection reached a state to indicate that
		 * the receive socket buffer needs to grow, increase
		 * the high water mark.
1022 */ 1023 if (TSTMP_GEQ(tcp_now, 1024 tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) { 1025 if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) { 1026 tcp_sbrcv_reserve(tp, sbrcv, 1027 tcp_autorcvbuf_max, 0); 1028 } 1029 goto out; 1030 } else { 1031 tp->rfbuf_cnt += pktlen; 1032 return; 1033 } 1034 } else if (to->to_tsecr != 0) { 1035 /* 1036 * If the timestamp shows that one RTT has 1037 * completed, we can stop counting the 1038 * bytes. Here we consider increasing 1039 * the socket buffer if the bandwidth measured in 1040 * last rtt, is more than half of sb_hiwat, this will 1041 * help to scale the buffer according to the bandwidth 1042 * on the link. 1043 */ 1044 if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) { 1045 if (tp->rfbuf_cnt > (sbrcv->sb_hiwat - 1046 (sbrcv->sb_hiwat >> 1))) { 1047 int32_t rcvbuf_inc, min_incr; 1048 /* 1049 * Increment the receive window by a 1050 * multiple of maximum sized segments. 1051 * This will prevent a connection from 1052 * sending smaller segments on wire if it 1053 * is limited by the receive window. 1054 * 1055 * Set the ideal size based on current 1056 * bandwidth measurements. We set the 1057 * ideal size on receive socket buffer to 1058 * be twice the bandwidth delay product. 1059 */ 1060 rcvbuf_inc = (tp->rfbuf_cnt << 1) 1061 - sbrcv->sb_hiwat; 1062 1063 /* 1064 * Make the increment equal to 8 segments 1065 * at least 1066 */ 1067 min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift; 1068 if (rcvbuf_inc < min_incr) 1069 rcvbuf_inc = min_incr; 1070 1071 rcvbuf_inc = 1072 (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg; 1073 tcp_sbrcv_reserve(tp, sbrcv, 1074 sbrcv->sb_hiwat + rcvbuf_inc, 1075 (tp->rfbuf_cnt * 2)); 1076 } 1077 goto out; 1078 } else { 1079 tp->rfbuf_cnt += pktlen; 1080 return; 1081 } 1082 } 1083out: 1084 /* Restart the measurement */ 1085 tp->rfbuf_ts = 0; 1086 tp->rfbuf_cnt = 0; 1087 return; 1088} 1089 1090/* This function will trim the excess space added to the socket buffer 1091 * to help a slow-reading app. 
The ideal-size of a socket buffer depends
 * on the link bandwidth or it is set by an application and we aim to
 * reach that size.
 */
void
tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) {
	if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
	    sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
		int32_t trim;
		/* compute the difference between ideal and current sizes */
		u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;

		/* Compute the maximum advertised window for
		 * this connection.
		 */
		u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;

		/* How much can we trim the receive socket buffer?
		 * 1. it can not be trimmed beyond the max rcv win advertised
		 * 2. if possible, leave 1/16 of bandwidth*delay to
		 * avoid closing the win completely
		 */
		u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));

		/* Sometimes leave can be zero, in that case leave at least
		 * a few segments worth of space.
		 */
		if (leave == 0)
			leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;

		/* trim may come out negative when sb_cc + leave already
		 * exceeds the high-water mark; imin() against diff and the
		 * trim > 0 check below guard against shrinking too far.
		 */
		trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
		trim = imin(trim, (int32_t)diff);

		if (trim > 0)
			sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
	}
}

/* We may need to trim the send socket buffer size for two reasons:
 * 1. if the rtt seen on the connection is climbing up, we do not
 * want to fill the buffers any more.
 * 2. if the congestion win on the socket backed off, there is no need
 * to hold more mbufs for that connection than what the cwnd will allow.
1134 */ 1135void 1136tcp_sbsnd_trim(struct sockbuf *sbsnd) { 1137 if (tcp_do_autosendbuf == 1 && 1138 ((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) == 1139 (SB_AUTOSIZE | SB_TRIM)) && 1140 (sbsnd->sb_idealsize > 0) && 1141 (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) { 1142 u_int32_t trim = 0; 1143 if (sbsnd->sb_cc <= sbsnd->sb_idealsize) { 1144 trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize; 1145 } else { 1146 trim = sbsnd->sb_hiwat - sbsnd->sb_cc; 1147 } 1148 sbreserve(sbsnd, (sbsnd->sb_hiwat - trim)); 1149 } 1150 if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize) 1151 sbsnd->sb_flags &= ~(SB_TRIM); 1152} 1153 1154/* 1155 * If timestamp option was not negotiated on this connection 1156 * and this connection is on the receiving side of a stream 1157 * then we can not measure the delay on the link accurately. 1158 * Instead of enabling automatic receive socket buffer 1159 * resizing, just give more space to the receive socket buffer. 1160 */ 1161static inline void 1162tcp_sbrcv_tstmp_check(struct tcpcb *tp) { 1163 struct socket *so = tp->t_inpcb->inp_socket; 1164 u_int32_t newsize = 2 * tcp_recvspace; 1165 struct sockbuf *sbrcv = &so->so_rcv; 1166 1167 if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) != 1168 (TF_REQ_TSTMP | TF_RCVD_TSTMP) && 1169 (sbrcv->sb_flags & SB_AUTOSIZE) != 0) { 1170 tcp_sbrcv_reserve(tp, sbrcv, newsize, 0); 1171 } 1172} 1173 1174/* A receiver will evaluate the flow of packets on a connection 1175 * to see if it can reduce ack traffic. The receiver will start 1176 * stretching acks if all of the following conditions are met: 1177 * 1. tcp_delack_enabled is set to 3 1178 * 2. If the bytes received in the last 100ms is greater than a threshold 1179 * defined by maxseg_unacked 1180 * 3. If the connection has not been idle for tcp_maxrcvidle period. 1181 * 4. If the connection has seen enough packets to let the slow-start 1182 * finish after connection establishment or after some packet loss. 
1183 * 1184 * The receiver will stop stretching acks if there is congestion/reordering 1185 * as indicated by packets on reassembly queue or an ECN. If the delayed-ack 1186 * timer fires while stretching acks, it means that the packet flow has gone 1187 * below the threshold defined by maxseg_unacked and the receiver will stop 1188 * stretching acks. The receiver gets no indication when slow-start is completed 1189 * or when the connection reaches an idle state. That is why we use 1190 * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle 1191 * state. 1192 */ 1193static inline int 1194tcp_stretch_ack_enable(struct tcpcb *tp) 1195{ 1196 if (!(tp->t_flagsext & (TF_NOSTRETCHACK|TF_DISABLE_STRETCHACK)) && 1197 tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) && 1198 TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) && 1199 (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) || 1200 (tp->rcv_waitforss >= tcp_rcvsspktcnt))) { 1201 return(1); 1202 } 1203 1204 return(0); 1205} 1206 1207/* 1208 * Reset the state related to stretch-ack algorithm. This will make 1209 * the receiver generate an ack every other packet. The receiver 1210 * will start re-evaluating the rate at which packets come to decide 1211 * if it can benefit by lowering the ack traffic. 1212 */ 1213void 1214tcp_reset_stretch_ack(struct tcpcb *tp) 1215{ 1216 tp->t_flags &= ~(TF_STRETCHACK); 1217 tp->rcv_by_unackwin = 0; 1218 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; 1219 1220 /* 1221 * When there is packet loss or packet re-ordering or CWR due to 1222 * ECN, the sender's congestion window is reduced. In these states, 1223 * generate an ack for every other packet for some time to allow 1224 * the sender's congestion window to grow. 1225 */ 1226 tp->t_flagsext |= TF_RCVUNACK_WAITSS; 1227 tp->rcv_waitforss = 0; 1228} 1229 1230/* 1231 * The last packet was a retransmission, check if this ack 1232 * indicates that the retransmission was spurious. 
 *
 * If the connection supports timestamps, we could use it to
 * detect if the last retransmit was not needed. Otherwise,
 * we check if the ACK arrived within RTT/2 window, then it
 * was a mistake to do the retransmit in the first place.
 *
 * This function will return 1 if it is a spurious retransmit,
 * 0 otherwise.
 */
int
tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
	struct tcpopt *to, u_int32_t rxtime)
{
	int32_t tdiff, bad_rexmt_win;
	/* Half of the smoothed RTT: ACKs arriving sooner than this after
	 * the retransmit could not be responses to the retransmit itself.
	 */
	bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));

	/* If the ack has ECN CE bit, then cwnd has to be adjusted */
	if ((tp->ecn_flags & (TE_ECN_ON)) == (TE_ECN_ON)
	    && (th->th_flags & TH_ECE))
		return (0);
	if (TSTMP_SUPPORTED(tp)) {
		/* Echoed timestamp older than the retransmit time means the
		 * peer acked the original transmission, not the retransmit.
		 */
		if (rxtime > 0 && (to->to_flags & TOF_TS)
		    && to->to_tsecr != 0
		    && TSTMP_LT(to->to_tsecr, rxtime))
			return (1);
	} else {
		if ((tp->t_rxtshift == 1
		    || (tp->t_flagsext & TF_SENT_TLPROBE))
		    && rxtime > 0) {
			tdiff = (int32_t)(tcp_now - rxtime);
			if (tdiff < bad_rexmt_win)
				return(1);
		}
	}
	return(0);
}


/*
 * Restore congestion window state if a spurious timeout
 * was detected.
 */
static void
tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
{
	if (TSTMP_SUPPORTED(tp)) {
		u_int32_t fsize, acked;
		fsize = tp->snd_max - th->th_ack;
		acked = BYTES_ACKED(th, tp);

		/*
		 * Implement bad retransmit recovery as
		 * described in RFC 4015.
		 */
		tp->snd_ssthresh = tp->snd_ssthresh_prev;

		/* Initialize cwnd to the initial window */
		if (CC_ALGO(tp)->cwnd_init != NULL)
			CC_ALGO(tp)->cwnd_init(tp);

		tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);

	} else {
		/* No timestamps: simply roll back to the pre-timeout
		 * congestion state that was saved before the retransmit.
		 */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp);
	}
	tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
	tp->snd_recover = tp->snd_recover_prev;
	tp->snd_nxt = tp->snd_max;
	tp->t_rxtshift = 0;
	tp->t_rxtstart = 0;

	/* Fix send socket buffer to reflect the change in cwnd */
	tcp_bad_rexmt_fix_sndbuf(tp);

	/*
	 * This RTT might reflect the extra delay induced
	 * by the network. Skip using this sample for RTO
	 * calculation and mark the connection so we can
	 * recompute RTT when the next eligible sample is
	 * found.
	 */
	tp->t_flagsext |= TF_RECOMPUTE_RTT;
	tp->t_badrexmt_time = tcp_now;
	tp->t_rtttime = 0;
}

/*
 * If the previous packet was sent in retransmission timer, and it was
 * not needed, then restore the congestion window to the state before that
 * transmission.
 *
 * If the last packet was sent in tail loss probe timeout, check if that
 * recovered the last packet. If so, that will indicate a real loss and
 * the congestion window needs to be lowered.
 */
static void
tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
	if (tp->t_rxtshift > 0 &&
	    tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
		/* Spurious RTO: undo the congestion window reduction. */
		++tcpstat.tcps_sndrexmitbad;
		tcp_bad_rexmt_restore_state(tp, th);
		tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
	} else if ((tp->t_flagsext & TF_SENT_TLPROBE)
	    && tp->t_tlphighrxt > 0
	    && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
	    && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
		/*
		 * The tail loss probe recovered the last packet and
		 * we need to adjust the congestion window to take
		 * this loss into account.
		 */
		++tcpstat.tcps_tlp_recoverlastpkt;
		if (!IN_FASTRECOVERY(tp)) {
			tcp_reduce_congestion_window(tp);
			EXIT_FASTRECOVERY(tp);
		}
		tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
	}

	/* TLP bookkeeping is consumed by this ACK either way. */
	tp->t_flagsext &= ~(TF_SENT_TLPROBE);
	tp->t_tlphighrxt = 0;
	tp->t_tlpstart = 0;

	/*
	 * check if the latest ack was for a segment sent during PMTU
	 * blackhole detection. If the timestamp on the ack is before
	 * PMTU blackhole detection, then revert the size of the max
	 * segment to previous size.
	 */
	if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
	    tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
		if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
		    && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
			tcp_pmtud_revert_segment_size(tp);
		}
	}
	if (tp->t_pmtud_start_ts > 0)
		tp->t_pmtud_start_ts = 0;
}

/*
 * Check if early retransmit can be attempted according to RFC 5827.
 *
 * If packet reordering is detected on a connection, fast recovery will
 * be delayed until it is clear that the packet was lost and not reordered.
 * But reordering detection is done only when SACK is enabled.
 *
 * On connections that do not support SACK, there is a limit on the number
 * of early retransmits that can be done per minute. This limit is needed
 * to make sure that too many packets are not retransmitted when there is
 * packet reordering.
 */
static void
tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th)
{
	u_int32_t obytes, snd_off;
	int32_t snd_len;
	struct socket *so = tp->t_inpcb->inp_socket;

	if (early_rexmt && (SACK_ENABLED(tp) ||
	    tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
	    SEQ_GT(tp->snd_max, tp->snd_una) &&
	    (tp->t_dupacks == 1 ||
	    (SACK_ENABLED(tp) &&
	    !TAILQ_EMPTY(&tp->snd_holes)))) {
		/*
		 * If there are only a few outstanding
		 * segments on the connection, we might need
		 * to lower the retransmit threshold. This
		 * will allow us to do Early Retransmit as
		 * described in RFC 5827.
		 */
		if (SACK_ENABLED(tp) &&
		    !TAILQ_EMPTY(&tp->snd_holes)) {
			obytes = (tp->snd_max - tp->snd_fack) +
				tp->sackhint.sack_bytes_rexmit;
		} else {
			obytes = (tp->snd_max - tp->snd_una);
		}

		/*
		 * In order to lower retransmit threshold the
		 * following two conditions must be met.
		 * 1. the amount of outstanding data is less
		 * than 4*SMSS bytes
		 * 2. there is no unsent data ready for
		 * transmission or the advertised window
		 * will limit sending new segments.
		 */
		snd_off = tp->snd_max - tp->snd_una;
		snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
		if (obytes < (tp->t_maxseg << 2) &&
		    snd_len <= 0) {
			u_int32_t osegs;

			/* Round the outstanding byte count up to
			 * whole segments.
			 */
			osegs = obytes / tp->t_maxseg;
			if ((osegs * tp->t_maxseg) < obytes)
				osegs++;

			/*
			 * Since the connection might have already
			 * received some dupacks, we add them to
			 * to the outstanding segments count to get
			 * the correct retransmit threshold.
			 *
			 * By checking for early retransmit after
			 * receiving some duplicate acks when SACK
			 * is supported, the connection will
			 * enter fast recovery even if multiple
			 * segments are lost in the same window.
			 */
			osegs += tp->t_dupacks;
			if (osegs < 4) {
				/* Threshold is clamped to
				 * [t_dupacks, tcprexmtthresh].
				 */
				tp->t_rexmtthresh =
				    ((osegs - 1) > 1) ? (osegs - 1) : 1;
				tp->t_rexmtthresh =
				    min(tp->t_rexmtthresh, tcprexmtthresh);
				tp->t_rexmtthresh =
				    max(tp->t_rexmtthresh, tp->t_dupacks);

				if (tp->t_early_rexmt_count == 0)
					tp->t_early_rexmt_win = tcp_now;

				if (tp->t_flagsext & TF_SENT_TLPROBE) {
					tcpstat.tcps_tlp_recovery++;
					tcp_ccdbg_trace(tp, th,
					    TCP_CC_TLP_RECOVERY);
				} else {
					tcpstat.tcps_early_rexmt++;
					tp->t_early_rexmt_count++;
					tcp_ccdbg_trace(tp, th,
					    TCP_CC_EARLY_RETRANSMIT);
				}
			}
		}
	}

	/*
	 * If we ever sent a TLP probe, the acknowledgement will trigger
	 * early retransmit because the value of snd_fack will be close
	 * to snd_max. This will take care of adjustments to the
	 * congestion window. So we can reset TF_SENT_PROBE flag.
	 */
	tp->t_flagsext &= ~(TF_SENT_TLPROBE);
	tp->t_tlphighrxt = 0;
	tp->t_tlpstart = 0;
}

void
tcp_input(m, off0)
	struct mbuf *m;
	int off0;
{
	register struct tcphdr *th;
	register struct ip *ip = NULL;
	register struct inpcb *inp;
	u_char *optp = NULL;
	int optlen = 0;
	int tlen, off;
	int drop_hdrlen;
	register struct tcpcb *tp = 0;
	register int thflags;
	struct socket *so = 0;
	int todrop, acked, ourfinisacked, needoutput = 0;
	struct in_addr laddr;
#if INET6
	struct in6_addr laddr6;
#endif
	int dropsocket = 0;
	int iss = 0, nosock = 0;
	u_int32_t tiwin, sack_bytes_acked = 0;
	struct tcpopt to;		/* options in this segment */
	struct sockaddr_in *next_hop = NULL;
#if TCPDEBUG
	short ostate = 0;
#endif
	struct m_tag *fwd_tag;
	u_char ip_ecn = IPTOS_ECN_NOTECT;
	unsigned int ifscope;
	uint8_t isconnected, isdisconnected;
	struct ifnet *ifp = m->m_pkthdr.rcvif;
	int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
	int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
	int turnoff_lro = 0, win;
#if MPTCP
	struct mptcb *mp_tp = NULL;
	uint16_t mptcp_csum = 0;
#endif /* MPTCP */
	boolean_t cell = IFNET_IS_CELLULAR(ifp);
	boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
	boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
	struct tcp_respond_args tra;

#define TCP_INC_VAR(stat, npkts) do {			\
	stat += npkts;					\
} while (0)

	TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);

	/* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
*/ 1537 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) { 1538 fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, 1539 KERNEL_TAG_TYPE_IPFORWARD, NULL); 1540 } else { 1541 fwd_tag = NULL; 1542 } 1543 if (fwd_tag != NULL) { 1544 struct ip_fwd_tag *ipfwd_tag = 1545 (struct ip_fwd_tag *)(fwd_tag+1); 1546 1547 next_hop = ipfwd_tag->next_hop; 1548 m_tag_delete(m, fwd_tag); 1549 } 1550 1551#if INET6 1552 struct ip6_hdr *ip6 = NULL; 1553 int isipv6; 1554#endif /* INET6 */ 1555 int rstreason; /* For badport_bandlim accounting purposes */ 1556 struct proc *proc0=current_proc(); 1557 1558 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0); 1559 1560#if INET6 1561 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; 1562#endif 1563 bzero((char *)&to, sizeof(to)); 1564 1565#if INET6 1566 if (isipv6) { 1567 /* 1568 * Expect 32-bit aligned data pointer on 1569 * strict-align platforms 1570 */ 1571 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); 1572 1573 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ 1574 ip6 = mtod(m, struct ip6_hdr *); 1575 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; 1576 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0); 1577 1578 if (tcp_input_checksum(AF_INET6, m, th, off0, tlen)) 1579 goto dropnosock; 1580 1581 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport), 1582 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])), 1583 th->th_seq, th->th_ack, th->th_win); 1584 /* 1585 * Be proactive about unspecified IPv6 address in source. 1586 * As we use all-zero to indicate unbounded/unconnected pcb, 1587 * unspecified IPv6 address can be used to confuse us. 1588 * 1589 * Note that packets with unspecified IPv6 destination is 1590 * already dropped in ip6_input. 
1591 */ 1592 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 1593 /* XXX stat */ 1594 IF_TCP_STATINC(ifp, unspecv6); 1595 goto dropnosock; 1596 } 1597 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, 1598 struct ip6_hdr *, ip6, struct tcpcb *, NULL, 1599 struct tcphdr *, th); 1600 1601 ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; 1602 } else 1603#endif /* INET6 */ 1604 { 1605 /* 1606 * Get IP and TCP header together in first mbuf. 1607 * Note: IP leaves IP header in first mbuf. 1608 */ 1609 if (off0 > sizeof (struct ip)) { 1610 ip_stripoptions(m, (struct mbuf *)0); 1611 off0 = sizeof(struct ip); 1612 } 1613 if (m->m_len < sizeof (struct tcpiphdr)) { 1614 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { 1615 tcpstat.tcps_rcvshort++; 1616 return; 1617 } 1618 } 1619 1620 /* Expect 32-bit aligned data pointer on strict-align platforms */ 1621 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); 1622 1623 ip = mtod(m, struct ip *); 1624 th = (struct tcphdr *)(void *)((caddr_t)ip + off0); 1625 tlen = ip->ip_len; 1626 1627 if (tcp_input_checksum(AF_INET, m, th, off0, tlen)) 1628 goto dropnosock; 1629 1630#if INET6 1631 /* Re-initialization for later version check */ 1632 ip->ip_v = IPVERSION; 1633#endif 1634 ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK); 1635 1636 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, 1637 struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th); 1638 1639 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport), 1640 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), 1641 th->th_seq, th->th_ack, th->th_win); 1642 1643 } 1644 1645 /* 1646 * Check that TCP offset makes sense, 1647 * pull out TCP options and adjust length. 
XXX 1648 */ 1649 off = th->th_off << 2; 1650 if (off < sizeof (struct tcphdr) || off > tlen) { 1651 tcpstat.tcps_rcvbadoff++; 1652 IF_TCP_STATINC(ifp, badformat); 1653 goto dropnosock; 1654 } 1655 tlen -= off; /* tlen is used instead of ti->ti_len */ 1656 if (off > sizeof (struct tcphdr)) { 1657#if INET6 1658 if (isipv6) { 1659 IP6_EXTHDR_CHECK(m, off0, off, return); 1660 ip6 = mtod(m, struct ip6_hdr *); 1661 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0); 1662 } else 1663#endif /* INET6 */ 1664 { 1665 if (m->m_len < sizeof(struct ip) + off) { 1666 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { 1667 tcpstat.tcps_rcvshort++; 1668 return; 1669 } 1670 ip = mtod(m, struct ip *); 1671 th = (struct tcphdr *)(void *)((caddr_t)ip + off0); 1672 } 1673 } 1674 optlen = off - sizeof (struct tcphdr); 1675 optp = (u_char *)(th + 1); 1676 /* 1677 * Do quick retrieval of timestamp options ("options 1678 * prediction?"). If timestamp is the only option and it's 1679 * formatted as recommended in RFC 1323 appendix A, we 1680 * quickly get the values now and not bother calling 1681 * tcp_dooptions(), etc. 1682 */ 1683 if ((optlen == TCPOLEN_TSTAMP_APPA || 1684 (optlen > TCPOLEN_TSTAMP_APPA && 1685 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 1686 *(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) && 1687 (th->th_flags & TH_SYN) == 0) { 1688 to.to_flags |= TOF_TS; 1689 to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4)); 1690 to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8)); 1691 optp = NULL; /* we've parsed the options */ 1692 } 1693 } 1694 thflags = th->th_flags; 1695 1696#if TCP_DROP_SYNFIN 1697 /* 1698 * If the drop_synfin option is enabled, drop all packets with 1699 * both the SYN and FIN bits set. This prevents e.g. nmap from 1700 * identifying the TCP/IP stack. 1701 * 1702 * This is a violation of the TCP specification. 
1703 */ 1704 if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) { 1705 IF_TCP_STATINC(ifp, synfin); 1706 goto dropnosock; 1707 } 1708#endif 1709 1710 /* 1711 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options, 1712 * until after ip6_savecontrol() is called and before other functions 1713 * which don't want those proto headers. 1714 * Because ip6_savecontrol() is going to parse the mbuf to 1715 * search for data to be passed up to user-land, it wants mbuf 1716 * parameters to be unchanged. 1717 */ 1718 drop_hdrlen = off0 + off; 1719 1720 /* Since this is an entry point for input processing of tcp packets, we 1721 * can update the tcp clock here. 1722 */ 1723 calculate_tcp_clock(); 1724 1725 /* 1726 * Record the interface where this segment arrived on; this does not 1727 * affect normal data output (for non-detached TCP) as it provides a 1728 * hint about which route and interface to use for sending in the 1729 * absence of a PCB, when scoped routing (and thus source interface 1730 * selection) are enabled. 1731 */ 1732 if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL) 1733 ifscope = IFSCOPE_NONE; 1734 else 1735 ifscope = m->m_pkthdr.rcvif->if_index; 1736 1737 /* 1738 * Convert TCP protocol specific fields to host format. 1739 */ 1740 1741#if BYTE_ORDER != BIG_ENDIAN 1742 NTOHL(th->th_seq); 1743 NTOHL(th->th_ack); 1744 NTOHS(th->th_win); 1745 NTOHS(th->th_urp); 1746#endif 1747 1748 /* 1749 * Locate pcb for segment. 1750 */ 1751findpcb: 1752 1753 isconnected = FALSE; 1754 isdisconnected = FALSE; 1755 1756#if IPFIREWALL_FORWARD 1757 if (next_hop != NULL 1758#if INET6 1759 && isipv6 == 0 /* IPv6 support is not yet */ 1760#endif /* INET6 */ 1761 ) { 1762 /* 1763 * Diverted. Pretend to be the destination. 1764 * already got one like this? 1765 */ 1766 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, 1767 ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); 1768 if (!inp) { 1769 /* 1770 * No, then it's new. 
Try find the ambushing socket 1771 */ 1772 if (!next_hop->sin_port) { 1773 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, 1774 th->th_sport, next_hop->sin_addr, 1775 th->th_dport, 1, m->m_pkthdr.rcvif); 1776 } else { 1777 inp = in_pcblookup_hash(&tcbinfo, 1778 ip->ip_src, th->th_sport, 1779 next_hop->sin_addr, 1780 ntohs(next_hop->sin_port), 1, 1781 m->m_pkthdr.rcvif); 1782 } 1783 } 1784 } else 1785#endif /* IPFIREWALL_FORWARD */ 1786 { 1787#if INET6 1788 if (isipv6) 1789 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, 1790 &ip6->ip6_dst, th->th_dport, 1, 1791 m->m_pkthdr.rcvif); 1792 else 1793#endif /* INET6 */ 1794 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, 1795 ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); 1796 } 1797 1798 /* 1799 * Use the interface scope information from the PCB for outbound 1800 * segments. If the PCB isn't present and if scoped routing is 1801 * enabled, tcp_respond will use the scope of the interface where 1802 * the segment arrived on. 1803 */ 1804 if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) 1805 ifscope = inp->inp_boundifp->if_index; 1806#if NECP 1807 if (inp != NULL && ( 1808#if INET6 1809 isipv6 ? !necp_socket_is_allowed_to_send_recv_v6(inp, 1810 th->th_dport, th->th_sport, &ip6->ip6_dst, 1811 &ip6->ip6_src, ifp, NULL) : 1812#endif 1813 !necp_socket_is_allowed_to_send_recv_v4(inp, th->th_dport, 1814 th->th_sport, &ip->ip_dst, &ip->ip_src, 1815 ifp, NULL))) { 1816 if (in_pcb_checkstate(inp, WNT_RELEASE, 0) 1817 == WNT_STOPUSING) { 1818 inp = NULL; /* pretend we didn't find it */ 1819 } 1820 IF_TCP_STATINC(ifp, badformatipsec); 1821 goto dropnosock; 1822 } 1823#endif /* NECP */ 1824 1825 /* 1826 * If the state is CLOSED (i.e., TCB does not exist) then 1827 * all data in the incoming segment is discarded. 1828 * If the TCB exists but is in CLOSED state, it is embryonic, 1829 * but should either do a listen or a connect soon. 
1830 */ 1831 if (inp == NULL) { 1832 if (log_in_vain) { 1833#if INET6 1834 char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN]; 1835#else /* INET6 */ 1836 char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN]; 1837#endif /* INET6 */ 1838 1839#if INET6 1840 if (isipv6) { 1841 inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf)); 1842 inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf)); 1843 } else 1844#endif 1845 { 1846 inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf)); 1847 inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf)); 1848 } 1849 switch (log_in_vain) { 1850 case 1: 1851 if(thflags & TH_SYN) 1852 log(LOG_INFO, 1853 "Connection attempt to TCP %s:%d from %s:%d\n", 1854 dbuf, ntohs(th->th_dport), 1855 sbuf, 1856 ntohs(th->th_sport)); 1857 break; 1858 case 2: 1859 log(LOG_INFO, 1860 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", 1861 dbuf, ntohs(th->th_dport), sbuf, 1862 ntohs(th->th_sport), thflags); 1863 break; 1864 case 3: 1865 case 4: 1866 if ((thflags & TH_SYN) && !(thflags & TH_ACK) && 1867 !(m->m_flags & (M_BCAST | M_MCAST)) && 1868#if INET6 1869 ((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) || 1870 (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr)) 1871#else 1872 ip->ip_dst.s_addr != ip->ip_src.s_addr 1873#endif 1874 ) 1875 log_in_vain_log((LOG_INFO, 1876 "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n", 1877 dbuf, ntohs(th->th_dport), 1878 sbuf, 1879 ntohs(th->th_sport))); 1880 break; 1881 default: 1882 break; 1883 } 1884 } 1885 if (blackhole) { 1886 if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) 1887 1888 switch (blackhole) { 1889 case 1: 1890 if (thflags & TH_SYN) 1891 goto dropnosock; 1892 break; 1893 case 2: 1894 goto dropnosock; 1895 default: 1896 goto dropnosock; 1897 } 1898 } 1899 rstreason = BANDLIM_RST_CLOSEDPORT; 1900 IF_TCP_STATINC(ifp, noconnnolist); 1901 goto dropwithresetnosock; 1902 } 1903 so = inp->inp_socket; 1904 if (so == NULL) { 1905 /* This case shouldn't happen as the 
socket shouldn't be null 1906 * if inp_state isn't set to INPCB_STATE_DEAD 1907 * But just in case, we pretend we didn't find the socket if we hit this case 1908 * as this isn't cause for a panic (the socket might be leaked however)... 1909 */ 1910 inp = NULL; 1911#if TEMPDEBUG 1912 printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp); 1913#endif 1914 goto dropnosock; 1915 } 1916 1917 tcp_lock(so, 1, 0); 1918 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { 1919 tcp_unlock(so, 1, (void *)2); 1920 inp = NULL; // pretend we didn't find it 1921 goto dropnosock; 1922 } 1923 1924 tp = intotcpcb(inp); 1925 if (tp == 0) { 1926 rstreason = BANDLIM_RST_CLOSEDPORT; 1927 IF_TCP_STATINC(ifp, noconnlist); 1928 goto dropwithreset; 1929 } 1930 if (tp->t_state == TCPS_CLOSED) 1931 goto drop; 1932 1933 /* Unscale the window into a 32-bit value. */ 1934 if ((thflags & TH_SYN) == 0) 1935 tiwin = th->th_win << tp->snd_scale; 1936 else 1937 tiwin = th->th_win; 1938 1939#if CONFIG_MACF_NET 1940 if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM)) 1941 goto drop; 1942#endif 1943 1944 /* Avoid processing packets while closing a listen socket */ 1945 if (tp->t_state == TCPS_LISTEN && 1946 (so->so_options & SO_ACCEPTCONN) == 0) 1947 goto drop; 1948 1949 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 1950#if TCPDEBUG 1951 if (so->so_options & SO_DEBUG) { 1952 ostate = tp->t_state; 1953#if INET6 1954 if (isipv6) 1955 bcopy((char *)ip6, (char *)tcp_saveipgen, 1956 sizeof(*ip6)); 1957 else 1958#endif /* INET6 */ 1959 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); 1960 tcp_savetcp = *th; 1961 } 1962#endif 1963 if (so->so_options & SO_ACCEPTCONN) { 1964 register struct tcpcb *tp0 = tp; 1965 struct socket *so2; 1966 struct socket *oso; 1967 struct sockaddr_storage from; 1968#if INET6 1969 struct inpcb *oinp = sotoinpcb(so); 1970#endif /* INET6 */ 1971 struct ifnet *head_ifscope; 1972 unsigned int head_nocell, head_recvanyif, 1973 
head_noexpensive, head_awdl_unrestricted; 1974 1975 /* Get listener's bound-to-interface, if any */ 1976 head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? 1977 inp->inp_boundifp : NULL; 1978 /* Get listener's no-cellular information, if any */ 1979 head_nocell = INP_NO_CELLULAR(inp); 1980 /* Get listener's recv-any-interface, if any */ 1981 head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF); 1982 /* Get listener's no-expensive information, if any */ 1983 head_noexpensive = INP_NO_EXPENSIVE(inp); 1984 head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp); 1985 1986 /* 1987 * If the state is LISTEN then ignore segment if it contains an RST. 1988 * If the segment contains an ACK then it is bad and send a RST. 1989 * If it does not contain a SYN then it is not interesting; drop it. 1990 * If it is from this socket, drop it, it must be forged. 1991 */ 1992 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 1993 IF_TCP_STATINC(ifp, listbadsyn); 1994 1995 if (thflags & TH_RST) { 1996 goto drop; 1997 } 1998 if (thflags & TH_ACK) { 1999 tp = NULL; 2000 tcpstat.tcps_badsyn++; 2001 rstreason = BANDLIM_RST_OPENPORT; 2002 goto dropwithreset; 2003 } 2004 2005 /* We come here if there is no SYN set */ 2006 tcpstat.tcps_badsyn++; 2007 goto drop; 2008 } 2009 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0); 2010 if (th->th_dport == th->th_sport) { 2011#if INET6 2012 if (isipv6) { 2013 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, 2014 &ip6->ip6_src)) 2015 goto drop; 2016 } else 2017#endif /* INET6 */ 2018 if (ip->ip_dst.s_addr == ip->ip_src.s_addr) 2019 goto drop; 2020 } 2021 /* 2022 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 2023 * in_broadcast() should never return true on a received 2024 * packet with M_BCAST not set. 2025 * 2026 * Packets with a multicast source address should also 2027 * be discarded. 
2028 */ 2029 if (m->m_flags & (M_BCAST|M_MCAST)) 2030 goto drop; 2031#if INET6 2032 if (isipv6) { 2033 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 2034 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 2035 goto drop; 2036 } else 2037#endif 2038 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 2039 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 2040 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 2041 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 2042 goto drop; 2043 2044 2045#if INET6 2046 /* 2047 * If deprecated address is forbidden, 2048 * we do not accept SYN to deprecated interface 2049 * address to prevent any new inbound connection from 2050 * getting established. 2051 * When we do not accept SYN, we send a TCP RST, 2052 * with deprecated source address (instead of dropping 2053 * it). We compromise it as it is much better for peer 2054 * to send a RST, and RST will be the final packet 2055 * for the exchange. 2056 * 2057 * If we do not forbid deprecated addresses, we accept 2058 * the SYN packet. RFC 4862 forbids dropping SYN in 2059 * this case. 
2060 */ 2061 if (isipv6 && !ip6_use_deprecated) { 2062 uint32_t ia6_flags; 2063 2064 if (ip6_getdstifaddr_info(m, NULL, 2065 &ia6_flags) == 0) { 2066 if (ia6_flags & IN6_IFF_DEPRECATED) { 2067 tp = NULL; 2068 rstreason = BANDLIM_RST_OPENPORT; 2069 IF_TCP_STATINC(ifp, deprecate6); 2070 goto dropwithreset; 2071 } 2072 } 2073 } 2074#endif 2075 if (so->so_filt) { 2076#if INET6 2077 if (isipv6) { 2078 struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)&from; 2079 2080 sin6->sin6_len = sizeof(*sin6); 2081 sin6->sin6_family = AF_INET6; 2082 sin6->sin6_port = th->th_sport; 2083 sin6->sin6_flowinfo = 0; 2084 sin6->sin6_addr = ip6->ip6_src; 2085 sin6->sin6_scope_id = 0; 2086 } 2087 else 2088#endif 2089 { 2090 struct sockaddr_in *sin = (struct sockaddr_in*)&from; 2091 2092 sin->sin_len = sizeof(*sin); 2093 sin->sin_family = AF_INET; 2094 sin->sin_port = th->th_sport; 2095 sin->sin_addr = ip->ip_src; 2096 } 2097 so2 = sonewconn(so, 0, (struct sockaddr*)&from); 2098 } else { 2099 so2 = sonewconn(so, 0, NULL); 2100 } 2101 if (so2 == 0) { 2102 tcpstat.tcps_listendrop++; 2103 if (tcp_dropdropablreq(so)) { 2104 if (so->so_filt) 2105 so2 = sonewconn(so, 0, (struct sockaddr*)&from); 2106 else 2107 so2 = sonewconn(so, 0, NULL); 2108 } 2109 if (!so2) 2110 goto drop; 2111 } 2112 2113 /* Point "inp" and "tp" in tandem to new socket */ 2114 inp = (struct inpcb *)so2->so_pcb; 2115 tp = intotcpcb(inp); 2116 2117 oso = so; 2118 tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */ 2119 2120 so = so2; 2121 tcp_lock(so, 1, 0); 2122 /* 2123 * Mark socket as temporary until we're 2124 * committed to keeping it. The code at 2125 * ``drop'' and ``dropwithreset'' check the 2126 * flag dropsocket to see if the temporary 2127 * socket created here should be discarded. 2128 * We mark the socket as discardable until 2129 * we're committed to it below in TCPS_LISTEN. 2130 * There are some error conditions in which we 2131 * have to drop the temporary socket. 
2132 */ 2133 dropsocket++; 2134 /* 2135 * Inherit INP_BOUND_IF from listener; testing if 2136 * head_ifscope is non-NULL is sufficient, since it 2137 * can only be set to a non-zero value earlier if 2138 * the listener has such a flag set. 2139 */ 2140 if (head_ifscope != NULL) { 2141 inp->inp_flags |= INP_BOUND_IF; 2142 inp->inp_boundifp = head_ifscope; 2143 } else { 2144 inp->inp_flags &= ~INP_BOUND_IF; 2145 } 2146 /* 2147 * Inherit restrictions from listener. 2148 */ 2149 if (head_nocell) 2150 inp_set_nocellular(inp); 2151 if (head_noexpensive) 2152 inp_set_noexpensive(inp); 2153 if (head_awdl_unrestricted) 2154 inp_set_awdl_unrestricted(inp); 2155 /* 2156 * Inherit {IN,IN6}_RECV_ANYIF from listener. 2157 */ 2158 if (head_recvanyif) 2159 inp->inp_flags |= INP_RECV_ANYIF; 2160 else 2161 inp->inp_flags &= ~INP_RECV_ANYIF; 2162#if INET6 2163 if (isipv6) 2164 inp->in6p_laddr = ip6->ip6_dst; 2165 else { 2166 inp->inp_vflag &= ~INP_IPV6; 2167 inp->inp_vflag |= INP_IPV4; 2168#endif /* INET6 */ 2169 inp->inp_laddr = ip->ip_dst; 2170#if INET6 2171 } 2172#endif /* INET6 */ 2173 inp->inp_lport = th->th_dport; 2174 if (in_pcbinshash(inp, 0) != 0) { 2175 /* 2176 * Undo the assignments above if we failed to 2177 * put the PCB on the hash lists. 2178 */ 2179#if INET6 2180 if (isipv6) 2181 inp->in6p_laddr = in6addr_any; 2182 else 2183#endif /* INET6 */ 2184 inp->inp_laddr.s_addr = INADDR_ANY; 2185 inp->inp_lport = 0; 2186 tcp_lock(oso, 0, 0); /* release ref on parent */ 2187 tcp_unlock(oso, 1, 0); 2188 goto drop; 2189 } 2190#if INET6 2191 if (isipv6) { 2192 /* 2193 * Inherit socket options from the listening 2194 * socket. 2195 * Note that in6p_inputopts are not (even 2196 * should not be) copied, since it stores 2197 * previously received options and is used to 2198 * detect if each new option is different than 2199 * the previous one and hence should be passed 2200 * to a user. 
2201 * If we copied in6p_inputopts, a user would 2202 * not be able to receive options just after 2203 * calling the accept system call. 2204 */ 2205 inp->inp_flags |= 2206 oinp->inp_flags & INP_CONTROLOPTS; 2207 if (oinp->in6p_outputopts) 2208 inp->in6p_outputopts = 2209 ip6_copypktopts(oinp->in6p_outputopts, 2210 M_NOWAIT); 2211 } else 2212#endif /* INET6 */ 2213 inp->inp_options = ip_srcroute(); 2214 tcp_lock(oso, 0, 0); 2215#if IPSEC 2216 /* copy old policy into new socket's */ 2217 if (sotoinpcb(oso)->inp_sp) 2218 { 2219 int error = 0; 2220 /* Is it a security hole here to silently fail to copy the policy? */ 2221 if (inp->inp_sp != NULL) 2222 error = ipsec_init_policy(so, &inp->inp_sp); 2223 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) 2224 printf("tcp_input: could not copy policy\n"); 2225 } 2226#endif 2227 /* inherit states from the listener */ 2228 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 2229 struct tcpcb *, tp, int32_t, TCPS_LISTEN); 2230 tp->t_state = TCPS_LISTEN; 2231 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY); 2232 tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP|TF_NOTIMEWAIT)); 2233 tp->t_keepinit = tp0->t_keepinit; 2234 tp->t_keepcnt = tp0->t_keepcnt; 2235 tp->t_keepintvl = tp0->t_keepintvl; 2236 tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo; 2237 tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo; 2238 tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl; 2239 if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) 2240 tp->t_notsent_lowat = tp0->t_notsent_lowat; 2241 2242 /* now drop the reference on the listener */ 2243 tcp_unlock(oso, 1, 0); 2244 2245 tcp_set_max_rwinscale(tp, so); 2246 2247 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0); 2248 } 2249 } 2250 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, 2251 LCK_MTX_ASSERT_OWNED); 2252 2253 if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) { 2254 /* 2255 * Evaluate the rate of arrival of packets to see if the 2256 * receiver 
can reduce the ack traffic. The algorithm to 2257 * stretch acks will be enabled if the connection meets 2258 * certain criteria defined in tcp_stretch_ack_enable function. 2259 */ 2260 if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) { 2261 TCP_INC_VAR(tp->rcv_waitforss, nlropkts); 2262 } 2263 if (tcp_stretch_ack_enable(tp)) { 2264 tp->t_flags |= TF_STRETCHACK; 2265 tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS); 2266 tp->rcv_waitforss = 0; 2267 } else { 2268 tp->t_flags &= ~(TF_STRETCHACK); 2269 } 2270 if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) { 2271 tp->rcv_by_unackwin += (tlen + off); 2272 } else { 2273 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; 2274 tp->rcv_by_unackwin = tlen + off; 2275 } 2276 } 2277 2278 /* 2279 * Keep track of how many bytes were received in the LRO packet 2280 */ 2281 if ((pktf_sw_lro_pkt) && (nlropkts > 2)) { 2282 tp->t_lropktlen += tlen; 2283 } 2284 /* 2285 * Explicit Congestion Notification - Flag that we need to send ECT if 2286 * + The IP Congestion experienced flag was set. 2287 * + Socket is in established state 2288 * + We negotiated ECN in the TCP setup 2289 * + This isn't a pure ack (tlen > 0) 2290 * + The data is in the valid window 2291 * 2292 * TE_SENDECE will be cleared when we receive a packet with TH_CWR set. 2293 */ 2294 if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED && 2295 ((tp->ecn_flags & (TE_ECN_ON)) == (TE_ECN_ON)) && tlen > 0 && 2296 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 2297 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 2298 tp->ecn_flags |= TE_SENDECE; 2299 } 2300 2301 /* 2302 * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't 2303 * bother doing extensive checks for state and whatnot. 2304 */ 2305 if ((thflags & TH_CWR) == TH_CWR) { 2306 tp->ecn_flags &= ~TE_SENDECE; 2307 } 2308 2309 /* 2310 * If we received an explicit notification of congestion in 2311 * ip tos ecn bits or by the CWR bit in TCP header flags, reset 2312 * the ack-strteching state. 
We need to handle ECN notification if 2313 * an ECN setup SYN was sent even once. 2314 */ 2315 if (tp->t_state == TCPS_ESTABLISHED 2316 && (tp->ecn_flags & TE_SETUPSENT) 2317 && (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR))) 2318 tcp_reset_stretch_ack(tp); 2319 2320 /* 2321 * Try to determine if we are receiving a packet after a long time. 2322 * Use our own approximation of idletime to roughly measure remote 2323 * end's idle time. Since slowstart is used after an idle period 2324 * we want to avoid doing LRO if the remote end is not up to date 2325 * on initial window support and starts with 1 or 2 packets as its IW. 2326 */ 2327 if (sw_lro && (tp->t_flagsext & TF_LRO_OFFLOADED) && 2328 ((tcp_now - tp->t_rcvtime) >= (TCP_IDLETIMEOUT(tp)))) { 2329 turnoff_lro = 1; 2330 } 2331 2332 /* Update rcvtime as a new segment was received on the connection */ 2333 tp->t_rcvtime = tcp_now; 2334 2335 /* 2336 * Segment received on connection. 2337 * Reset idle time and keep-alive timer. 2338 */ 2339 if (TCPS_HAVEESTABLISHED(tp->t_state)) 2340 tcp_keepalive_reset(tp); 2341 2342 /* 2343 * Process options if not in LISTEN state, 2344 * else do it below (after getting remote address). 
2345 */ 2346 if (tp->t_state != TCPS_LISTEN && optp) { 2347 tcp_dooptions(tp, optp, optlen, th, &to, ifscope); 2348#if MPTCP 2349 mptcp_csum = mptcp_input_csum(tp, m, drop_hdrlen); 2350 if (mptcp_csum) { 2351 tp->t_mpflags |= TMPF_SND_MPFAIL; 2352 tp->t_mpflags &= ~TMPF_EMBED_DSN; 2353 mptcp_notify_mpfail(so); 2354 m_freem(m); 2355 tcpstat.tcps_mp_badcsum++; 2356 tcp_check_timer_state(tp); 2357 tcp_unlock(so, 1, 0); 2358 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | 2359 DBG_FUNC_END,0,0,0,0,0); 2360 return; 2361 } 2362 mptcp_insert_rmap(tp, m); 2363#endif /* MPTCP */ 2364 } 2365 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 2366 if (to.to_flags & TOF_TS) { 2367 tp->t_flags |= TF_RCVD_TSTMP; 2368 tp->ts_recent = to.to_tsval; 2369 tp->ts_recent_age = tcp_now; 2370 } 2371 if (to.to_flags & TOF_MSS) 2372 tcp_mss(tp, to.to_mss, ifscope); 2373 if (SACK_ENABLED(tp)) { 2374 if (!(to.to_flags & TOF_SACK)) 2375 tp->t_flagsext &= ~(TF_SACK_ENABLE); 2376 else 2377 tp->t_flags |= TF_SACK_PERMIT; 2378 } 2379 } 2380 2381#if TRAFFIC_MGT 2382 /* Compute inter-packet arrival jitter. According to RFC 3550, inter-packet 2383 * arrival jitter is defined as the difference in packet spacing at the 2384 * receiver compared to the sender for a pair of packets. When two packets 2385 * of maximum segment size come one after the other with consecutive 2386 * sequence numbers, we consider them as packets sent together at the 2387 * sender and use them as a pair to compute inter-packet arrival jitter. 2388 * This metric indicates the delay induced by the network components due 2389 * to queuing in edge/access routers. 
2390 */ 2391 if (tp->t_state == TCPS_ESTABLISHED && 2392 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK && 2393 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 2394 ((to.to_flags & TOF_TS) == 0 || 2395 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && 2396 th->th_seq == tp->rcv_nxt && 2397 LIST_EMPTY(&tp->t_segq)) { 2398 int seg_size = tlen; 2399 if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) { 2400 TCP_INC_VAR(tp->iaj_pktcnt, nlropkts); 2401 } 2402 2403 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) { 2404 seg_size = m->m_pkthdr.lro_pktlen; 2405 } 2406 if ( tp->iaj_size == 0 || seg_size > tp->iaj_size || 2407 (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) { 2408 /* State related to inter-arrival jitter is uninitialized 2409 * or we are trying to find a good first packet to start 2410 * computing the metric 2411 */ 2412 update_iaj_state(tp, seg_size, 0); 2413 } else { 2414 if (seg_size == tp->iaj_size) { 2415 /* Compute inter-arrival jitter taking this packet 2416 * as the second packet 2417 */ 2418 if (pktf_sw_lro_pkt) 2419 compute_iaj(tp, nlropkts, 2420 m->m_pkthdr.lro_elapsed); 2421 else 2422 compute_iaj(tp, 1, 0); 2423 } 2424 if (seg_size < tp->iaj_size) { 2425 /* There is a smaller packet in the stream. 2426 * Some times the maximum size supported on a path can 2427 * change if there is a new link with smaller MTU. 2428 * The receiver will not know about this change. 2429 * If there are too many packets smaller than iaj_size, 2430 * we try to learn the iaj_size again. 2431 */ 2432 TCP_INC_VAR(tp->iaj_small_pkt, nlropkts); 2433 if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) { 2434 update_iaj_state(tp, seg_size, 1); 2435 } else { 2436 CLEAR_IAJ_STATE(tp); 2437 } 2438 } else { 2439 update_iaj_state(tp, seg_size, 0); 2440 } 2441 } 2442 } else { 2443 CLEAR_IAJ_STATE(tp); 2444 } 2445#endif /* TRAFFIC_MGT */ 2446 2447 /* 2448 * Header prediction: check for the two common cases 2449 * of a uni-directional data xfer. 
If the packet has 2450 * no control flags, is in-sequence, the window didn't 2451 * change and we're not retransmitting, it's a 2452 * candidate. If the length is zero and the ack moved 2453 * forward, we're the sender side of the xfer. Just 2454 * free the data acked & wake any higher level process 2455 * that was blocked waiting for space. If the length 2456 * is non-zero and the ack didn't move, we're the 2457 * receiver side. If we're getting packets in-order 2458 * (the reassembly queue is empty), add the data to 2459 * the socket buffer and note that we need a delayed ack. 2460 * Make sure that the hidden state-flags are also off. 2461 * Since we check for TCPS_ESTABLISHED above, it can only 2462 * be TH_NEEDSYN. 2463 */ 2464 if (tp->t_state == TCPS_ESTABLISHED && 2465 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE)) == TH_ACK && 2466 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 2467 ((to.to_flags & TOF_TS) == 0 || 2468 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && 2469 th->th_seq == tp->rcv_nxt && 2470 tiwin && tiwin == tp->snd_wnd && 2471 tp->snd_nxt == tp->snd_max) { 2472 2473 /* 2474 * If last ACK falls within this segment's sequence numbers, 2475 * record the timestamp. 2476 * NOTE that the test is modified according to the latest 2477 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 
2478 */ 2479 if ((to.to_flags & TOF_TS) != 0 && 2480 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 2481 tp->ts_recent_age = tcp_now; 2482 tp->ts_recent = to.to_tsval; 2483 } 2484 2485 /* Force acknowledgment if we received a FIN */ 2486 2487 if (thflags & TH_FIN) 2488 tp->t_flags |= TF_ACKNOW; 2489 2490 if (tlen == 0) { 2491 if (SEQ_GT(th->th_ack, tp->snd_una) && 2492 SEQ_LEQ(th->th_ack, tp->snd_max) && 2493 tp->snd_cwnd >= tp->snd_ssthresh && 2494 (!IN_FASTRECOVERY(tp) && 2495 ((!(SACK_ENABLED(tp)) && 2496 tp->t_dupacks < tp->t_rexmtthresh) || 2497 (SACK_ENABLED(tp) && to.to_nsacks == 0 && 2498 TAILQ_EMPTY(&tp->snd_holes))))) { 2499 /* 2500 * this is a pure ack for outstanding data. 2501 */ 2502 ++tcpstat.tcps_predack; 2503 2504 tcp_bad_rexmt_check(tp, th, &to), 2505 2506 /* Recalculate the RTT */ 2507 tcp_compute_rtt(tp, &to, th); 2508 2509 acked = BYTES_ACKED(th, tp); 2510 tcpstat.tcps_rcvackpack++; 2511 tcpstat.tcps_rcvackbyte += acked; 2512 2513 /* Handle an ack that is in sequence during congestion 2514 * avoidance phase. The calculations in this function 2515 * assume that snd_una is not updated yet. 2516 */ 2517 if (CC_ALGO(tp)->congestion_avd != NULL) 2518 CC_ALGO(tp)->congestion_avd(tp, th); 2519 tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD); 2520 sbdrop(&so->so_snd, acked); 2521 if (so->so_flags & SOF_ENABLE_MSGS) { 2522 VERIFY(acked <= so->so_msg_state->msg_serial_bytes); 2523 so->so_msg_state->msg_serial_bytes -= acked; 2524 } 2525 tcp_sbsnd_trim(&so->so_snd); 2526 2527 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 2528 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2529 tp->snd_recover = th->th_ack - 1; 2530 tp->snd_una = th->th_ack; 2531 2532 /* 2533 * pull snd_wl2 up to prevent seq wrap relative 2534 * to th_ack. 
2535 */ 2536 tp->snd_wl2 = th->th_ack; 2537 2538 if (tp->t_dupacks > 0) { 2539 tp->t_dupacks = 0; 2540 tp->t_rexmtthresh = tcprexmtthresh; 2541 } 2542 2543 m_freem(m); 2544 2545 /* 2546 * If all outstanding data are acked, stop 2547 * retransmit timer, otherwise restart timer 2548 * using current (possibly backed-off) value. 2549 * If process is waiting for space, 2550 * wakeup/selwakeup/signal. If data 2551 * are ready to send, let tcp_output 2552 * decide between more output or persist. 2553 */ 2554 if (tp->snd_una == tp->snd_max) { 2555 tp->t_timer[TCPT_REXMT] = 0; 2556 tp->t_timer[TCPT_PTO] = 0; 2557 } else if (tp->t_timer[TCPT_PERSIST] == 0) { 2558 tp->t_timer[TCPT_REXMT] = 2559 OFFSET_FROM_START(tp, 2560 tp->t_rxtcur); 2561 } 2562 2563 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && 2564 tp->t_bwmeas != NULL) 2565 tcp_bwmeas_check(tp); 2566 sowwakeup(so); /* has to be done with socket lock held */ 2567 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) { 2568 (void) tcp_output(tp); 2569 } 2570 2571 tcp_check_timer_state(tp); 2572 tcp_unlock(so, 1, 0); 2573 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); 2574 return; 2575 } 2576 } else if (th->th_ack == tp->snd_una && 2577 LIST_EMPTY(&tp->t_segq) && 2578 tlen <= tcp_sbspace(tp)) { 2579 /* 2580 * this is a pure, in-sequence data packet 2581 * with nothing on the reassembly queue and 2582 * we have enough buffer space to take it. 2583 */ 2584 2585 /* 2586 * If this is a connection in steady state, start 2587 * coalescing packets belonging to this flow. 
2588 */ 2589 if (turnoff_lro) { 2590 tcp_lro_remove_state(tp->t_inpcb->inp_laddr, 2591 tp->t_inpcb->inp_faddr, 2592 tp->t_inpcb->inp_lport, 2593 tp->t_inpcb->inp_fport); 2594 tp->t_flagsext &= ~TF_LRO_OFFLOADED; 2595 tp->t_idleat = tp->rcv_nxt; 2596 } else if (sw_lro && !pktf_sw_lro_pkt && !isipv6 && 2597 (so->so_flags & SOF_USELRO) && 2598 !IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) && 2599 (m->m_pkthdr.rcvif->if_type != IFT_LOOP) && 2600 ((th->th_seq - tp->irs) > 2601 (tp->t_maxseg << lro_start)) && 2602 ((tp->t_idleat == 0) || ((th->th_seq - 2603 tp->t_idleat) > (tp->t_maxseg << lro_start)))) { 2604 tp->t_flagsext |= TF_LRO_OFFLOADED; 2605 tcp_start_coalescing(ip, th, tlen); 2606 tp->t_idleat = 0; 2607 } 2608 2609 /* Clean receiver SACK report if present */ 2610 if (SACK_ENABLED(tp) && tp->rcv_numsacks) 2611 tcp_clean_sackreport(tp); 2612 ++tcpstat.tcps_preddat; 2613 tp->rcv_nxt += tlen; 2614 /* 2615 * Pull snd_wl1 up to prevent seq wrap relative to 2616 * th_seq. 2617 */ 2618 tp->snd_wl1 = th->th_seq; 2619 /* 2620 * Pull rcv_up up to prevent seq wrap relative to 2621 * rcv_nxt. 2622 */ 2623 tp->rcv_up = tp->rcv_nxt; 2624 TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts); 2625 tcpstat.tcps_rcvbyte += tlen; 2626 if (nstat_collect) { 2627 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) { 2628 INP_ADD_STAT(inp, cell, wifi, wired, 2629 rxpackets, m->m_pkthdr.lro_npkts); 2630 } else { 2631 INP_ADD_STAT(inp, cell, wifi, wired, 2632 rxpackets, 1); 2633 } 2634 INP_ADD_STAT(inp, cell, wifi, wired,rxbytes, 2635 tlen); 2636 } 2637 2638 /* 2639 * Calculate the RTT on the receiver only if the 2640 * connection is in streaming mode and the last 2641 * packet was not an end-of-write 2642 */ 2643 if ((tp->t_flags & TF_STRETCHACK) && 2644 !(tp->t_flagsext & TF_STREAMEOW)) 2645 tcp_compute_rtt(tp, &to, th); 2646 2647 tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen); 2648 2649 /* 2650 * Add data to socket buffer. 
2651 */ 2652 so_recv_data_stat(so, m, 0); 2653 m_adj(m, drop_hdrlen); /* delayed header drop */ 2654 2655 /* 2656 * If message delivery (SOF_ENABLE_MSGS) is enabled on 2657 * this socket, deliver the packet received as an 2658 * in-order message with sequence number attached to it. 2659 */ 2660 if (sbappendstream_rcvdemux(so, m, 2661 th->th_seq - (tp->irs + 1), 0)) { 2662 sorwakeup(so); 2663 } 2664#if INET6 2665 if (isipv6) { 2666 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), 2667 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])), 2668 th->th_seq, th->th_ack, th->th_win); 2669 } 2670 else 2671#endif 2672 { 2673 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), 2674 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), 2675 th->th_seq, th->th_ack, th->th_win); 2676 } 2677 TCP_INC_VAR(tp->t_unacksegs, nlropkts); 2678 if (DELAY_ACK(tp, th)) { 2679 if ((tp->t_flags & TF_DELACK) == 0) { 2680 tp->t_flags |= TF_DELACK; 2681 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); 2682 } 2683 } else { 2684 tp->t_flags |= TF_ACKNOW; 2685 tcp_output(tp); 2686 } 2687 2688 tcp_adaptive_rwtimo_check(tp, tlen); 2689 2690 tcp_check_timer_state(tp); 2691 tcp_unlock(so, 1, 0); 2692 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); 2693 return; 2694 } 2695 } 2696 2697 /* 2698 * Calculate amount of space in receive window, 2699 * and then do TCP input processing. 2700 * Receive window is amount of space in rcv queue, 2701 * but not less than advertised window. 
2702 */ 2703 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, 2704 LCK_MTX_ASSERT_OWNED); 2705 win = tcp_sbspace(tp); 2706 if (win < 0) 2707 win = 0; 2708 else { /* clip rcv window to 4K for modems */ 2709 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) 2710 win = min(win, slowlink_wsize); 2711 } 2712 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 2713#if MPTCP 2714 /* 2715 * Ensure that the subflow receive window isn't greater 2716 * than the connection level receive window. 2717 */ 2718 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && 2719 (mp_tp = tptomptp(tp))) { 2720 MPT_LOCK(mp_tp); 2721 if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) { 2722 tp->rcv_wnd = mp_tp->mpt_rcvwnd; 2723 tcpstat.tcps_mp_reducedwin++; 2724 } 2725 MPT_UNLOCK(mp_tp); 2726 } 2727#endif /* MPTCP */ 2728 2729 switch (tp->t_state) { 2730 2731 /* 2732 * Initialize tp->rcv_nxt, and tp->irs, select an initial 2733 * tp->iss, and send a segment: 2734 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 2735 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. 2736 * Fill in remote peer address fields if not previously specified. 2737 * Enter SYN_RECEIVED state, and process any other fields of this 2738 * segment in this state. 
2739 */ 2740 case TCPS_LISTEN: { 2741 register struct sockaddr_in *sin; 2742#if INET6 2743 register struct sockaddr_in6 *sin6; 2744#endif 2745 2746 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, 2747 LCK_MTX_ASSERT_OWNED); 2748#if INET6 2749 if (isipv6) { 2750 MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, 2751 M_SONAME, M_NOWAIT); 2752 if (sin6 == NULL) 2753 goto drop; 2754 bzero(sin6, sizeof(*sin6)); 2755 sin6->sin6_family = AF_INET6; 2756 sin6->sin6_len = sizeof(*sin6); 2757 sin6->sin6_addr = ip6->ip6_src; 2758 sin6->sin6_port = th->th_sport; 2759 laddr6 = inp->in6p_laddr; 2760 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 2761 inp->in6p_laddr = ip6->ip6_dst; 2762 if (in6_pcbconnect(inp, (struct sockaddr *)sin6, 2763 proc0)) { 2764 inp->in6p_laddr = laddr6; 2765 FREE(sin6, M_SONAME); 2766 goto drop; 2767 } 2768 FREE(sin6, M_SONAME); 2769 } else 2770#endif 2771 { 2772 lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); 2773 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, 2774 M_NOWAIT); 2775 if (sin == NULL) 2776 goto drop; 2777 sin->sin_family = AF_INET; 2778 sin->sin_len = sizeof(*sin); 2779 sin->sin_addr = ip->ip_src; 2780 sin->sin_port = th->th_sport; 2781 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); 2782 laddr = inp->inp_laddr; 2783 if (inp->inp_laddr.s_addr == INADDR_ANY) 2784 inp->inp_laddr = ip->ip_dst; 2785 if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0, 2786 IFSCOPE_NONE, NULL)) { 2787 inp->inp_laddr = laddr; 2788 FREE(sin, M_SONAME); 2789 goto drop; 2790 } 2791 FREE(sin, M_SONAME); 2792 } 2793 2794 tcp_dooptions(tp, optp, optlen, th, &to, ifscope); 2795 2796 if (SACK_ENABLED(tp)) { 2797 if (!(to.to_flags & TOF_SACK)) 2798 tp->t_flagsext &= ~(TF_SACK_ENABLE); 2799 else 2800 tp->t_flags |= TF_SACK_PERMIT; 2801 } 2802 2803 if (iss) 2804 tp->iss = iss; 2805 else { 2806 tp->iss = tcp_new_isn(tp); 2807 } 2808 tp->irs = th->th_seq; 2809 tcp_sendseqinit(tp); 2810 tcp_rcvseqinit(tp); 2811 
tp->snd_recover = tp->snd_una; 2812 /* 2813 * Initialization of the tcpcb for transaction; 2814 * set SND.WND = SEG.WND, 2815 * initialize CCsend and CCrecv. 2816 */ 2817 tp->snd_wnd = tiwin; /* initial send-window */ 2818 tp->t_flags |= TF_ACKNOW; 2819 tp->t_unacksegs = 0; 2820 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 2821 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); 2822 tp->t_state = TCPS_SYN_RECEIVED; 2823 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, 2824 TCP_CONN_KEEPINIT(tp)); 2825 dropsocket = 0; /* committed to socket */ 2826 2827 if (inp->inp_flowhash == 0) 2828 inp->inp_flowhash = inp_calc_flowhash(inp); 2829#if INET6 2830 /* update flowinfo - RFC 6437 */ 2831 if (inp->inp_flow == 0 && 2832 inp->in6p_flags & IN6P_AUTOFLOWLABEL) { 2833 inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; 2834 inp->inp_flow |= 2835 (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK); 2836 } 2837#endif /* INET6 */ 2838 2839 /* reset the incomp processing flag */ 2840 so->so_flags &= ~(SOF_INCOMP_INPROGRESS); 2841 tcpstat.tcps_accepts++; 2842 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) { 2843 /* ECN-setup SYN */ 2844 tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT); 2845 } 2846 2847#if CONFIG_IFEF_NOWINDOWSCALE 2848 if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL && 2849 (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) { 2850 /* Window scaling is not enabled on this interface */ 2851 tp->t_flags &= ~TF_REQ_SCALE; 2852 } 2853#endif 2854 goto trimthenstep6; 2855 } 2856 2857 /* 2858 * If the state is SYN_RECEIVED: 2859 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 
2860 */ 2861 case TCPS_SYN_RECEIVED: 2862 if ((thflags & TH_ACK) && 2863 (SEQ_LEQ(th->th_ack, tp->snd_una) || 2864 SEQ_GT(th->th_ack, tp->snd_max))) { 2865 rstreason = BANDLIM_RST_OPENPORT; 2866 IF_TCP_STATINC(ifp, ooopacket); 2867 goto dropwithreset; 2868 } 2869 2870 /* 2871 * In SYN_RECEIVED state, if we recv some SYNS with 2872 * window scale and others without, window scaling should 2873 * be disabled. Otherwise the window advertised will be 2874 * lower if we assume scaling and the other end does not. 2875 */ 2876 if ((thflags & TH_SYN) && 2877 !(to.to_flags & TOF_SCALE)) 2878 tp->t_flags &= ~TF_RCVD_SCALE; 2879 break; 2880 2881 /* 2882 * If the state is SYN_SENT: 2883 * if seg contains an ACK, but not for our SYN, drop the input. 2884 * if seg contains a RST, then drop the connection. 2885 * if seg does not contain SYN, then drop it. 2886 * Otherwise this is an acceptable SYN segment 2887 * initialize tp->rcv_nxt and tp->irs 2888 * if seg contains ack then advance tp->snd_una 2889 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 2890 * arrange for segment to be acked (eventually) 2891 * continue processing rest of data/controls, beginning with URG 2892 */ 2893 case TCPS_SYN_SENT: 2894 if ((thflags & TH_ACK) && 2895 (SEQ_LEQ(th->th_ack, tp->iss) || 2896 SEQ_GT(th->th_ack, tp->snd_max))) { 2897 rstreason = BANDLIM_UNLIMITED; 2898 IF_TCP_STATINC(ifp, ooopacket); 2899 goto dropwithreset; 2900 } 2901 if (thflags & TH_RST) { 2902 if ((thflags & TH_ACK) != 0) { 2903#if MPTCP 2904 if ((so->so_flags & SOF_MPTCP_FASTJOIN) && 2905 SEQ_GT(th->th_ack, tp->iss+1)) { 2906 so->so_flags &= ~SOF_MPTCP_FASTJOIN; 2907 /* ignore the RST and retransmit SYN */ 2908 goto drop; 2909 } 2910#endif /* MPTCP */ 2911 soevent(so, 2912 (SO_FILT_HINT_LOCKED | 2913 SO_FILT_HINT_CONNRESET)); 2914 tp = tcp_drop(tp, ECONNREFUSED); 2915 postevent(so, 0, EV_RESET); 2916 } 2917 goto drop; 2918 } 2919 if ((thflags & TH_SYN) == 0) 2920 goto drop; 2921 tp->snd_wnd = th->th_win; /* 
initial send window */ 2922 2923 tp->irs = th->th_seq; 2924 tcp_rcvseqinit(tp); 2925 if (thflags & TH_ACK) { 2926 tcpstat.tcps_connects++; 2927 2928 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) { 2929 /* ECN-setup SYN-ACK */ 2930 tp->ecn_flags |= TE_SETUPRECEIVED; 2931 tcpstat.tcps_ecn_setup++; 2932 } 2933 else { 2934 /* non-ECN-setup SYN-ACK */ 2935 tp->ecn_flags &= ~TE_SENDIPECT; 2936 } 2937 2938#if CONFIG_MACF_NET && CONFIG_MACF_SOCKET 2939 /* XXXMAC: recursive lock: SOCK_LOCK(so); */ 2940 mac_socketpeer_label_associate_mbuf(m, so); 2941 /* XXXMAC: SOCK_UNLOCK(so); */ 2942#endif 2943 /* Do window scaling on this connection? */ 2944 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2945 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2946 tp->snd_scale = tp->requested_s_scale; 2947 tp->rcv_scale = tp->request_r_scale; 2948 } 2949 tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); 2950 tp->snd_una++; /* SYN is acked */ 2951 /* 2952 * If there's data, delay ACK; if there's also a FIN 2953 * ACKNOW will be turned on later. 2954 */ 2955 TCP_INC_VAR(tp->t_unacksegs, nlropkts); 2956 if (DELAY_ACK(tp, th) && tlen != 0 ) { 2957 if ((tp->t_flags & TF_DELACK) == 0) { 2958 tp->t_flags |= TF_DELACK; 2959 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); 2960 } 2961 } 2962 else { 2963 tp->t_flags |= TF_ACKNOW; 2964 } 2965 /* 2966 * Received <SYN,ACK> in SYN_SENT[*] state. 
2967 * Transitions: 2968 * SYN_SENT --> ESTABLISHED 2969 * SYN_SENT* --> FIN_WAIT_1 2970 */ 2971 tp->t_starttime = tcp_now; 2972 tcp_sbrcv_tstmp_check(tp); 2973 if (tp->t_flags & TF_NEEDFIN) { 2974 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 2975 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); 2976 tp->t_state = TCPS_FIN_WAIT_1; 2977 tp->t_flags &= ~TF_NEEDFIN; 2978 thflags &= ~TH_SYN; 2979 } else { 2980 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 2981 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED); 2982 tp->t_state = TCPS_ESTABLISHED; 2983 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, 2984 TCP_CONN_KEEPIDLE(tp)); 2985 if (nstat_collect) 2986 nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt); 2987 } 2988#if MPTCP 2989 /* 2990 * Do not send the connect notification for additional 2991 * subflows until ACK for 3-way handshake arrives. 2992 */ 2993 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) && 2994 (tp->t_mpflags & TMPF_SENT_JOIN)) { 2995 isconnected = FALSE; 2996 /* Start data xmit if fastjoin */ 2997 if (mptcp_fastjoin && (so->so_flags & SOF_MPTCP_FASTJOIN)) { 2998 soevent(so, (SO_FILT_HINT_LOCKED | 2999 SO_FILT_HINT_MPFASTJ)); 3000 } 3001 } else 3002#endif /* MPTCP */ 3003 isconnected = TRUE; 3004 } else { 3005 /* 3006 * Received initial SYN in SYN-SENT[*] state => simul- 3007 * taneous open. If segment contains CC option and there is 3008 * a cached CC, apply TAO test; if it succeeds, connection is 3009 * half-synchronized. Otherwise, do 3-way handshake: 3010 * SYN-SENT -> SYN-RECEIVED 3011 * SYN-SENT* -> SYN-RECEIVED* 3012 */ 3013 tp->t_flags |= TF_ACKNOW; 3014 tp->t_timer[TCPT_REXMT] = 0; 3015 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 3016 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); 3017 tp->t_state = TCPS_SYN_RECEIVED; 3018 3019 } 3020 3021trimthenstep6: 3022 /* 3023 * Advance th->th_seq to correspond to first data byte. 3024 * If data, trim to stay within window, 3025 * dropping FIN if necessary. 
3026 */ 3027 th->th_seq++; 3028 if (tlen > tp->rcv_wnd) { 3029 todrop = tlen - tp->rcv_wnd; 3030 m_adj(m, -todrop); 3031 tlen = tp->rcv_wnd; 3032 thflags &= ~TH_FIN; 3033 tcpstat.tcps_rcvpackafterwin++; 3034 tcpstat.tcps_rcvbyteafterwin += todrop; 3035 } 3036 tp->snd_wl1 = th->th_seq - 1; 3037 tp->rcv_up = th->th_seq; 3038 /* 3039 * Client side of transaction: already sent SYN and data. 3040 * If the remote host used T/TCP to validate the SYN, 3041 * our data will be ACK'd; if so, enter normal data segment 3042 * processing in the middle of step 5, ack processing. 3043 * Otherwise, goto step 6. 3044 */ 3045 if (thflags & TH_ACK) 3046 goto process_ACK; 3047 goto step6; 3048 /* 3049 * If the state is LAST_ACK or CLOSING or TIME_WAIT: 3050 * do normal processing. 3051 * 3052 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. 3053 */ 3054 case TCPS_LAST_ACK: 3055 case TCPS_CLOSING: 3056 case TCPS_TIME_WAIT: 3057 break; /* continue normal processing */ 3058 3059 /* Received a SYN while connection is already established. 3060 * This is a "half open connection and other anomalies" described 3061 * in RFC793 page 34, send an ACK so the remote reset the connection 3062 * or recovers by adjusting its sequence numberering 3063 */ 3064 case TCPS_ESTABLISHED: 3065 if (thflags & TH_SYN) 3066 goto dropafterack; 3067 break; 3068 } 3069 3070 /* 3071 * States other than LISTEN or SYN_SENT. 3072 * First check the RST flag and sequence number since reset segments 3073 * are exempt from the timestamp and connection count tests. This 3074 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 3075 * below which allowed reset segments in half the sequence space 3076 * to fall though and be processed (which gives forged reset 3077 * segments with a random sequence number a 50 percent chance of 3078 * killing a connection). 3079 * Then check timestamp, if present. 3080 * Then check the connection count, if present. 
3081 * Then check that at least some bytes of segment are within 3082 * receive window. If segment begins before rcv_nxt, 3083 * drop leading data (and SYN); if nothing left, just ack. 3084 * 3085 * 3086 * If the RST bit is set, check the sequence number to see 3087 * if this is a valid reset segment. 3088 * RFC 793 page 37: 3089 * In all states except SYN-SENT, all reset (RST) segments 3090 * are validated by checking their SEQ-fields. A reset is 3091 * valid if its sequence number is in the window. 3092 * Note: this does not take into account delayed ACKs, so 3093 * we should test against last_ack_sent instead of rcv_nxt. 3094 * The sequence number in the reset segment is normally an 3095 * echo of our outgoing acknowlegement numbers, but some hosts 3096 * send a reset with the sequence number at the rightmost edge 3097 * of our receive window, and we have to handle this case. 3098 * Note 2: Paul Watson's paper "Slipping in the Window" has shown 3099 * that brute force RST attacks are possible. To combat this, 3100 * we use a much stricter check while in the ESTABLISHED state, 3101 * only accepting RSTs where the sequence number is equal to 3102 * last_ack_sent. In all other states (the states in which a 3103 * RST is more likely), the more permissive check is used. 3104 * If we have multiple segments in flight, the intial reset 3105 * segment sequence numbers will be to the left of last_ack_sent, 3106 * but they will eventually catch up. 3107 * In any case, it never made sense to trim reset segments to 3108 * fit the receive window since RFC 1122 says: 3109 * 4.2.2.12 RST Segment: RFC-793 Section 3.4 3110 * 3111 * A TCP SHOULD allow a received RST segment to include data. 3112 * 3113 * DISCUSSION 3114 * It has been suggested that a RST segment could contain 3115 * ASCII text that encoded and explained the cause of the 3116 * RST. No standard has yet been established for such 3117 * data. 
3118 * 3119 * If the reset segment passes the sequence number test examine 3120 * the state: 3121 * SYN_RECEIVED STATE: 3122 * If passive open, return to LISTEN state. 3123 * If active open, inform user that connection was refused. 3124 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: 3125 * Inform user that connection was reset, and close tcb. 3126 * CLOSING, LAST_ACK STATES: 3127 * Close the tcb. 3128 * TIME_WAIT STATE: 3129 * Drop the segment - see Stevens, vol. 2, p. 964 and 3130 * RFC 1337. 3131 * 3132 * Radar 4803931: Allows for the case where we ACKed the FIN but 3133 * there is already a RST in flight from the peer. 3134 * In that case, accept the RST for non-established 3135 * state if it's one off from last_ack_sent. 3136 3137 */ 3138 if (thflags & TH_RST) { 3139 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 3140 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 3141 (tp->rcv_wnd == 0 && 3142 ((tp->last_ack_sent == th->th_seq) || 3143 ((tp->last_ack_sent -1) == th->th_seq)))) { 3144 switch (tp->t_state) { 3145 3146 case TCPS_SYN_RECEIVED: 3147 IF_TCP_STATINC(ifp, rstinsynrcv); 3148 so->so_error = ECONNREFUSED; 3149 goto close; 3150 3151 case TCPS_ESTABLISHED: 3152 if (tp->last_ack_sent != th->th_seq) { 3153 tcpstat.tcps_badrst++; 3154 goto drop; 3155 } 3156 case TCPS_FIN_WAIT_1: 3157 case TCPS_CLOSE_WAIT: 3158 /* 3159 Drop through ... 3160 */ 3161 case TCPS_FIN_WAIT_2: 3162 so->so_error = ECONNRESET; 3163 close: 3164 postevent(so, 0, EV_RESET); 3165 soevent(so, 3166 (SO_FILT_HINT_LOCKED | 3167 SO_FILT_HINT_CONNRESET)); 3168 3169 tcpstat.tcps_drops++; 3170 tp = tcp_close(tp); 3171 break; 3172 3173 case TCPS_CLOSING: 3174 case TCPS_LAST_ACK: 3175 tp = tcp_close(tp); 3176 break; 3177 3178 case TCPS_TIME_WAIT: 3179 break; 3180 } 3181 } 3182 goto drop; 3183 } 3184 3185 /* 3186 * RFC 1323 PAWS: If we have a timestamp reply on this segment 3187 * and it's less than ts_recent, drop it. 
3188 */ 3189 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 3190 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 3191 3192 /* Check to see if ts_recent is over 24 days old. */ 3193 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 3194 /* 3195 * Invalidate ts_recent. If this segment updates 3196 * ts_recent, the age will be reset later and ts_recent 3197 * will get a valid value. If it does not, setting 3198 * ts_recent to zero will at least satisfy the 3199 * requirement that zero be placed in the timestamp 3200 * echo reply when ts_recent isn't valid. The 3201 * age isn't reset until we get a valid ts_recent 3202 * because we don't want out-of-order segments to be 3203 * dropped when ts_recent is old. 3204 */ 3205 tp->ts_recent = 0; 3206 } else { 3207 tcpstat.tcps_rcvduppack++; 3208 tcpstat.tcps_rcvdupbyte += tlen; 3209 tcpstat.tcps_pawsdrop++; 3210 if (nstat_collect) { 3211 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 3212 1, tlen, NSTAT_RX_FLAG_DUPLICATE); 3213 INP_ADD_STAT(inp, cell, wifi, wired, 3214 rxpackets, 1); 3215 INP_ADD_STAT(inp, cell, wifi, wired, 3216 rxbytes, tlen); 3217 tp->t_stat.rxduplicatebytes += tlen; 3218 } 3219 if (tlen) 3220 goto dropafterack; 3221 goto drop; 3222 } 3223 } 3224 3225 /* 3226 * In the SYN-RECEIVED state, validate that the packet belongs to 3227 * this connection before trimming the data to fit the receive 3228 * window. Check the sequence number versus IRS since we know 3229 * the sequence numbers haven't wrapped. This is a partial fix 3230 * for the "LAND" DoS attack. 
3231 */ 3232 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 3233 rstreason = BANDLIM_RST_OPENPORT; 3234 IF_TCP_STATINC(ifp, dospacket); 3235 goto dropwithreset; 3236 } 3237 3238 todrop = tp->rcv_nxt - th->th_seq; 3239 if (todrop > 0) { 3240 if (thflags & TH_SYN) { 3241 thflags &= ~TH_SYN; 3242 th->th_seq++; 3243 if (th->th_urp > 1) 3244 th->th_urp--; 3245 else 3246 thflags &= ~TH_URG; 3247 todrop--; 3248 } 3249 /* 3250 * Following if statement from Stevens, vol. 2, p. 960. 3251 */ 3252 if (todrop > tlen 3253 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 3254 /* 3255 * Any valid FIN must be to the left of the window. 3256 * At this point the FIN must be a duplicate or out 3257 * of sequence; drop it. 3258 */ 3259 thflags &= ~TH_FIN; 3260 3261 /* 3262 * Send an ACK to resynchronize and drop any data. 3263 * But keep on processing for RST or ACK. 3264 */ 3265 tp->t_flags |= TF_ACKNOW; 3266 if (todrop == 1) { 3267 /* This could be a keepalive */ 3268 soevent(so, SO_FILT_HINT_LOCKED | 3269 SO_FILT_HINT_KEEPALIVE); 3270 } 3271 todrop = tlen; 3272 tcpstat.tcps_rcvduppack++; 3273 tcpstat.tcps_rcvdupbyte += todrop; 3274 } else { 3275 tcpstat.tcps_rcvpartduppack++; 3276 tcpstat.tcps_rcvpartdupbyte += todrop; 3277 } 3278 if (nstat_collect) { 3279 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, 3280 todrop, NSTAT_RX_FLAG_DUPLICATE); 3281 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1); 3282 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop); 3283 tp->t_stat.rxduplicatebytes += todrop; 3284 } 3285 drop_hdrlen += todrop; /* drop from the top afterwards */ 3286 th->th_seq += todrop; 3287 tlen -= todrop; 3288 if (th->th_urp > todrop) 3289 th->th_urp -= todrop; 3290 else { 3291 thflags &= ~TH_URG; 3292 th->th_urp = 0; 3293 } 3294 } 3295 3296 /* 3297 * If new data are received on a connection after the user processes 3298 * are gone, then RST the other end. 
Note that an MPTCP subflow socket 3299 * would have SS_NOFDREF set by default, so check to make sure that 3300 * we test for SOF_MP_SUBFLOW socket flag (which would be cleared when 3301 * the socket is closed.) 3302 */ 3303 if (!(so->so_flags & SOF_MP_SUBFLOW) && 3304 (so->so_state & SS_NOFDREF) && 3305 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 3306 tp = tcp_close(tp); 3307 tcpstat.tcps_rcvafterclose++; 3308 rstreason = BANDLIM_UNLIMITED; 3309 IF_TCP_STATINC(ifp, cleanup); 3310 goto dropwithreset; 3311 } 3312 3313 /* 3314 * If segment ends after window, drop trailing data 3315 * (and PUSH and FIN); if nothing left, just ACK. 3316 */ 3317 todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); 3318 if (todrop > 0) { 3319 tcpstat.tcps_rcvpackafterwin++; 3320 if (todrop >= tlen) { 3321 tcpstat.tcps_rcvbyteafterwin += tlen; 3322 /* 3323 * If a new connection request is received 3324 * while in TIME_WAIT, drop the old connection 3325 * and start over if the sequence numbers 3326 * are above the previous ones. 3327 */ 3328 if (thflags & TH_SYN && 3329 tp->t_state == TCPS_TIME_WAIT && 3330 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 3331 iss = tcp_new_isn(tp); 3332 tp = tcp_close(tp); 3333 tcp_unlock(so, 1, 0); 3334 goto findpcb; 3335 } 3336 /* 3337 * If window is closed can only take segments at 3338 * window edge, and have to drop data and PUSH from 3339 * incoming segments. Continue processing, but 3340 * remember to ack. Otherwise, drop segment 3341 * and ack. 3342 */ 3343 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 3344 tp->t_flags |= TF_ACKNOW; 3345 tcpstat.tcps_rcvwinprobe++; 3346 } else 3347 goto dropafterack; 3348 } else 3349 tcpstat.tcps_rcvbyteafterwin += todrop; 3350 m_adj(m, -todrop); 3351 tlen -= todrop; 3352 thflags &= ~(TH_PUSH|TH_FIN); 3353 } 3354 3355 /* 3356 * If last ACK falls within this segment's sequence numbers, 3357 * record its timestamp. 
3358 * NOTE: 3359 * 1) That the test incorporates suggestions from the latest 3360 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 3361 * 2) That updating only on newer timestamps interferes with 3362 * our earlier PAWS tests, so this check should be solely 3363 * predicated on the sequence space of this segment. 3364 * 3) That we modify the segment boundary check to be 3365 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 3366 * instead of RFC1323's 3367 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 3368 * This modified check allows us to overcome RFC1323's 3369 * limitations as described in Stevens TCP/IP Illustrated 3370 * Vol. 2 p.869. In such cases, we can still calculate the 3371 * RTT correctly when RCV.NXT == Last.ACK.Sent. 3372 */ 3373 if ((to.to_flags & TOF_TS) != 0 && 3374 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 3375 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 3376 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 3377 tp->ts_recent_age = tcp_now; 3378 tp->ts_recent = to.to_tsval; 3379 } 3380 3381 /* 3382 * If a SYN is in the window, then this is an 3383 * error and we send an RST and drop the connection. 3384 */ 3385 if (thflags & TH_SYN) { 3386 tp = tcp_drop(tp, ECONNRESET); 3387 rstreason = BANDLIM_UNLIMITED; 3388 postevent(so, 0, EV_RESET); 3389 IF_TCP_STATINC(ifp, synwindow); 3390 goto dropwithreset; 3391 } 3392 3393 /* 3394 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 3395 * flag is on (half-synchronized state), then queue data for 3396 * later processing; else drop segment and return. 3397 */ 3398 if ((thflags & TH_ACK) == 0) { 3399 if (tp->t_state == TCPS_SYN_RECEIVED || 3400 (tp->t_flags & TF_NEEDSYN)) 3401 goto step6; 3402 else if (tp->t_flags & TF_ACKNOW) 3403 goto dropafterack; 3404 else 3405 goto drop; 3406 } 3407 3408 /* 3409 * Ack processing. 3410 */ 3411 3412 switch (tp->t_state) { 3413 3414 /* 3415 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 3416 * ESTABLISHED state and continue processing. 3417 * The ACK was checked above. 
3418 */ 3419 case TCPS_SYN_RECEIVED: 3420 3421 tcpstat.tcps_connects++; 3422 3423 /* Do window scaling? */ 3424 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3425 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3426 tp->snd_scale = tp->requested_s_scale; 3427 tp->rcv_scale = tp->request_r_scale; 3428 tp->snd_wnd = th->th_win << tp->snd_scale; 3429 tiwin = tp->snd_wnd; 3430 } 3431 /* 3432 * Make transitions: 3433 * SYN-RECEIVED -> ESTABLISHED 3434 * SYN-RECEIVED* -> FIN-WAIT-1 3435 */ 3436 tp->t_starttime = tcp_now; 3437 tcp_sbrcv_tstmp_check(tp); 3438 if (tp->t_flags & TF_NEEDFIN) { 3439 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 3440 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); 3441 tp->t_state = TCPS_FIN_WAIT_1; 3442 tp->t_flags &= ~TF_NEEDFIN; 3443 } else { 3444 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 3445 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED); 3446 tp->t_state = TCPS_ESTABLISHED; 3447 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, 3448 TCP_CONN_KEEPIDLE(tp)); 3449 if (nstat_collect) 3450 nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt); 3451 } 3452 /* 3453 * If segment contains data or ACK, will call tcp_reass() 3454 * later; if not, do so now to pass queued data to user. 3455 */ 3456 if (tlen == 0 && (thflags & TH_FIN) == 0) 3457 (void) tcp_reass(tp, (struct tcphdr *)0, &tlen, 3458 NULL, ifp); 3459 tp->snd_wl1 = th->th_seq - 1; 3460 3461 /* FALLTHROUGH */ 3462#if MPTCP 3463 /* 3464 * Do not send the connect notification for additional subflows 3465 * until ACK for 3-way handshake arrives. 3466 */ 3467 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) && 3468 (tp->t_mpflags & TMPF_SENT_JOIN)) { 3469 isconnected = FALSE; 3470 } else 3471#endif /* MPTCP */ 3472 isconnected = TRUE; 3473 3474 /* 3475 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 3476 * ACKs. 
If the ack is in the range 3477 * tp->snd_una < th->th_ack <= tp->snd_max 3478 * then advance tp->snd_una to th->th_ack and drop 3479 * data from the retransmission queue. If this ACK reflects 3480 * more up to date window information we update our window information. 3481 */ 3482 case TCPS_ESTABLISHED: 3483 case TCPS_FIN_WAIT_1: 3484 case TCPS_FIN_WAIT_2: 3485 case TCPS_CLOSE_WAIT: 3486 case TCPS_CLOSING: 3487 case TCPS_LAST_ACK: 3488 case TCPS_TIME_WAIT: 3489 if (SEQ_GT(th->th_ack, tp->snd_max)) { 3490 tcpstat.tcps_rcvacktoomuch++; 3491 goto dropafterack; 3492 } 3493 if (SACK_ENABLED(tp) && 3494 (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes))) 3495 tcp_sack_doack(tp, &to, th, &sack_bytes_acked); 3496 3497#if MPTCP 3498 if ((tp->t_mpuna) && (SEQ_GEQ(th->th_ack, tp->t_mpuna))) { 3499 if (tp->t_mpflags & TMPF_PREESTABLISHED) { 3500 /* MP TCP establishment succeeded */ 3501 tp->t_mpuna = 0; 3502 if (tp->t_mpflags & TMPF_JOINED_FLOW) { 3503 if (tp->t_mpflags & TMPF_SENT_JOIN) { 3504 tp->t_mpflags &= 3505 ~TMPF_PREESTABLISHED; 3506 tp->t_mpflags |= 3507 TMPF_MPTCP_TRUE; 3508 so->so_flags |= SOF_MPTCP_TRUE; 3509 if (mptcp_dbg >= MP_ERR_DEBUG) 3510 printf("MPTCP SUCCESS" 3511 " %s \n",__func__); 3512 tp->t_timer[TCPT_JACK_RXMT] = 0; 3513 tp->t_mprxtshift = 0; 3514 isconnected = TRUE; 3515 } else { 3516 isconnected = FALSE; 3517 } 3518 } else { 3519 isconnected = TRUE; 3520 tp->t_mpflags &= ~TMPF_SENT_KEYS; 3521 } 3522 } 3523 } 3524#endif /* MPTCP */ 3525 /* 3526 * If we have outstanding data (other than 3527 * a window probe), this is a completely 3528 * duplicate ack (ie, window info didn't 3529 * change) and the ack is the biggest we've seen. 3530 */ 3531 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 3532 if (tlen == 0 && tiwin == tp->snd_wnd) { 3533 /* 3534 * If both ends send FIN at the same time, 3535 * then the ack will be a duplicate ack 3536 * but we have to process the FIN. 
Check 3537 * for this condition and process the FIN 3538 * instead of the dupack 3539 */ 3540 if ((thflags & TH_FIN) && 3541 (tp->t_flags & TF_SENTFIN) && 3542 !TCPS_HAVERCVDFIN(tp->t_state) && 3543 (th->th_ack + 1) == tp->snd_max) { 3544 break; 3545 } 3546process_dupack: 3547#if MPTCP 3548 /* 3549 * MPTCP options that are ignored must 3550 * not be treated as duplicate ACKs. 3551 */ 3552 if (to.to_flags & TOF_MPTCP) { 3553 goto drop; 3554 } 3555 3556 if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) { 3557 if (mptcp_dbg >= MP_ERR_DEBUG) 3558 printf("%s: bypass ack recovery\n",__func__); 3559 break; 3560 } 3561#endif /* MPTCP */ 3562 /* 3563 * If a duplicate acknowledgement was seen 3564 * after ECN, it indicates packet loss in 3565 * addition to ECN. Reset INRECOVERY flag 3566 * so that we can process partial acks 3567 * correctly 3568 */ 3569 if (tp->ecn_flags & TE_INRECOVERY) 3570 tp->ecn_flags &= ~TE_INRECOVERY; 3571 3572 tcpstat.tcps_rcvdupack++; 3573 ++tp->t_dupacks; 3574 3575 /* 3576 * Check if we need to reset the limit on 3577 * early retransmit 3578 */ 3579 if (tp->t_early_rexmt_count > 0 && 3580 TSTMP_GEQ(tcp_now, 3581 (tp->t_early_rexmt_win + 3582 TCP_EARLY_REXMT_WIN))) 3583 tp->t_early_rexmt_count = 0; 3584 3585 /* 3586 * Is early retransmit needed? We check for 3587 * this when the connection is waiting for 3588 * duplicate acks to enter fast recovery. 3589 */ 3590 if (!IN_FASTRECOVERY(tp)) 3591 tcp_early_rexmt_check(tp, th); 3592 3593 /* 3594 * If we've seen exactly rexmt threshold 3595 * of duplicate acks, assume a packet 3596 * has been dropped and retransmit it. 3597 * Kludge snd_nxt & the congestion 3598 * window so we send only this one 3599 * packet. 3600 * 3601 * We know we're losing at the current 3602 * window size so do congestion avoidance 3603 * (set ssthresh to half the current window 3604 * and pull our congestion window back to 3605 * the new ssthresh). 
3606 * 3607 * Dup acks mean that packets have left the 3608 * network (they're now cached at the receiver) 3609 * so bump cwnd by the amount in the receiver 3610 * to keep a constant cwnd packets in the 3611 * network. 3612 */ 3613 if (tp->t_timer[TCPT_REXMT] == 0 || 3614 (th->th_ack != tp->snd_una 3615 && sack_bytes_acked == 0)) { 3616 tp->t_dupacks = 0; 3617 tp->t_rexmtthresh = tcprexmtthresh; 3618 } else if (tp->t_dupacks > tp->t_rexmtthresh || 3619 IN_FASTRECOVERY(tp)) { 3620 3621 /* 3622 * If this connection was seeing packet 3623 * reordering, then recovery might be 3624 * delayed to disambiguate between 3625 * reordering and loss 3626 */ 3627 if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) && 3628 (tp->t_flagsext & 3629 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) == 3630 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) { 3631 /* 3632 * Since the SACK information is already 3633 * updated, this ACK will be dropped 3634 */ 3635 break; 3636 } 3637 3638 if (SACK_ENABLED(tp) 3639 && IN_FASTRECOVERY(tp)) { 3640 int awnd; 3641 3642 /* 3643 * Compute the amount of data in flight first. 3644 * We can inject new data into the pipe iff 3645 * we have less than 1/2 the original window's 3646 * worth of data in flight. 3647 */ 3648 awnd = (tp->snd_nxt - tp->snd_fack) + 3649 tp->sackhint.sack_bytes_rexmit; 3650 if (awnd < tp->snd_ssthresh) { 3651 tp->snd_cwnd += tp->t_maxseg; 3652 if (tp->snd_cwnd > tp->snd_ssthresh) 3653 tp->snd_cwnd = tp->snd_ssthresh; 3654 } 3655 } else 3656 tp->snd_cwnd += tp->t_maxseg; 3657 3658 tcp_ccdbg_trace(tp, th, TCP_CC_IN_FASTRECOVERY); 3659 3660 (void) tcp_output(tp); 3661 goto drop; 3662 } else if (tp->t_dupacks == tp->t_rexmtthresh) { 3663 tcp_seq onxt = tp->snd_nxt; 3664 3665 /* 3666 * If we're doing sack, check to 3667 * see if we're already in sack 3668 * recovery. If we're not doing sack, 3669 * check to see if we're in newreno 3670 * recovery. 
3671 */ 3672 if (SACK_ENABLED(tp)) { 3673 if (IN_FASTRECOVERY(tp)) { 3674 tp->t_dupacks = 0; 3675 break; 3676 } else if (tp->t_flagsext & TF_DELAY_RECOVERY) { 3677 break; 3678 } 3679 } else { 3680 if (SEQ_LEQ(th->th_ack, 3681 tp->snd_recover)) { 3682 tp->t_dupacks = 0; 3683 break; 3684 } 3685 } 3686 3687 tp->snd_recover = tp->snd_max; 3688 tp->t_timer[TCPT_PTO] = 0; 3689 tp->t_rtttime = 0; 3690 3691 /* 3692 * If the connection has seen pkt 3693 * reordering, delay recovery until 3694 * it is clear that the packet 3695 * was lost. 3696 */ 3697 if (SACK_ENABLED(tp) && 3698 (tp->t_flagsext & 3699 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) 3700 == TF_PKTS_REORDERED && 3701 !IN_FASTRECOVERY(tp) && 3702 tp->t_reorderwin > 0 && 3703 tp->t_state == TCPS_ESTABLISHED) { 3704 tp->t_timer[TCPT_DELAYFR] = 3705 OFFSET_FROM_START(tp, 3706 tp->t_reorderwin); 3707 tp->t_flagsext |= TF_DELAY_RECOVERY; 3708 tcpstat.tcps_delay_recovery++; 3709 tcp_ccdbg_trace(tp, th, 3710 TCP_CC_DELAY_FASTRECOVERY); 3711 break; 3712 } 3713 3714 /* 3715 * If the current tcp cc module has 3716 * defined a hook for tasks to run 3717 * before entering FR, call it 3718 */ 3719 if (CC_ALGO(tp)->pre_fr != NULL) 3720 CC_ALGO(tp)->pre_fr(tp); 3721 ENTER_FASTRECOVERY(tp); 3722 tp->t_timer[TCPT_REXMT] = 0; 3723 if ((tp->ecn_flags & TE_ECN_ON) 3724 == TE_ECN_ON) 3725 tp->ecn_flags |= TE_SENDCWR; 3726 3727 if (SACK_ENABLED(tp)) { 3728 tcpstat.tcps_sack_recovery_episode++; 3729 tp->sack_newdata = tp->snd_nxt; 3730 tp->snd_cwnd = tp->t_maxseg; 3731 3732 /* 3733 * Enable probe timeout to detect 3734 * a tail loss in the recovery 3735 * window. 
3736 */ 3737 tp->t_timer[TCPT_PTO] = 3738 OFFSET_FROM_START(tp, 3739 max(10, (tp->t_srtt >> TCP_RTT_SHIFT))); 3740 3741 tcp_ccdbg_trace(tp, th, 3742 TCP_CC_ENTER_FASTRECOVERY); 3743 3744 (void) tcp_output(tp); 3745 goto drop; 3746 } 3747 tp->snd_nxt = th->th_ack; 3748 tp->snd_cwnd = tp->t_maxseg; 3749 (void) tcp_output(tp); 3750 tp->snd_cwnd = tp->snd_ssthresh + 3751 tp->t_maxseg * tp->t_dupacks; 3752 if (SEQ_GT(onxt, tp->snd_nxt)) 3753 tp->snd_nxt = onxt; 3754 tcp_ccdbg_trace(tp, th, 3755 TCP_CC_ENTER_FASTRECOVERY); 3756 goto drop; 3757 } else if (limited_txmt && 3758 ALLOW_LIMITED_TRANSMIT(tp) && 3759 (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) && 3760 (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) { 3761 u_int32_t incr = (tp->t_maxseg * tp->t_dupacks); 3762 3763 /* Use Limited Transmit algorithm on the first two 3764 * duplicate acks when there is new data to transmit 3765 */ 3766 tp->snd_cwnd += incr; 3767 tcpstat.tcps_limited_txt++; 3768 (void) tcp_output(tp); 3769 3770 tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT); 3771 3772 /* Reset snd_cwnd back to normal */ 3773 tp->snd_cwnd -= incr; 3774 } 3775 } else { 3776 tp->t_dupacks = 0; 3777 tp->t_rexmtthresh = tcprexmtthresh; 3778 } 3779 break; 3780 } 3781 /* 3782 * If the congestion window was inflated to account 3783 * for the other side's cached packets, retract it. 3784 */ 3785 if (IN_FASTRECOVERY(tp)) { 3786 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 3787 /* 3788 * If we received an ECE and entered 3789 * recovery, the subsequent ACKs should 3790 * not be treated as partial acks. 
3791 */ 3792 if (tp->ecn_flags & TE_INRECOVERY) 3793 goto process_ACK; 3794 3795 if (SACK_ENABLED(tp)) 3796 tcp_sack_partialack(tp, th); 3797 else 3798 tcp_newreno_partial_ack(tp, th); 3799 tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK); 3800 } else { 3801 EXIT_FASTRECOVERY(tp); 3802 if (CC_ALGO(tp)->post_fr != NULL) 3803 CC_ALGO(tp)->post_fr(tp, th); 3804 3805 tcp_ccdbg_trace(tp, th, 3806 TCP_CC_EXIT_FASTRECOVERY); 3807 } 3808 } else if ((tp->t_flagsext & 3809 (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) 3810 == (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) { 3811 /* 3812 * If the ack acknowledges upto snd_recover or if 3813 * it acknowledges all the snd holes, exit 3814 * recovery and cancel the timer. Otherwise, 3815 * this is a partial ack. Wait for recovery timer 3816 * to enter recovery. The snd_holes have already 3817 * been updated. 3818 */ 3819 if (SEQ_GEQ(th->th_ack, tp->snd_recover) || 3820 TAILQ_EMPTY(&tp->snd_holes)) { 3821 tp->t_timer[TCPT_DELAYFR] = 0; 3822 tp->t_flagsext &= ~TF_DELAY_RECOVERY; 3823 EXIT_FASTRECOVERY(tp); 3824 tcp_ccdbg_trace(tp, th, 3825 TCP_CC_EXIT_FASTRECOVERY); 3826 } 3827 } else { 3828 /* 3829 * We were not in fast recovery. Reset the 3830 * duplicate ack counter. 3831 */ 3832 tp->t_dupacks = 0; 3833 tp->t_rexmtthresh = tcprexmtthresh; 3834 } 3835 3836 3837 /* 3838 * If we reach this point, ACK is not a duplicate, 3839 * i.e., it ACKs something we sent. 3840 */ 3841 if (tp->t_flags & TF_NEEDSYN) { 3842 /* 3843 * T/TCP: Connection was half-synchronized, and our 3844 * SYN has been ACK'd (so connection is now fully 3845 * synchronized). Go to non-starred state, 3846 * increment snd_una for ACK of SYN, and check if 3847 * we can do window scaling. 3848 */ 3849 tp->t_flags &= ~TF_NEEDSYN; 3850 tp->snd_una++; 3851 /* Do window scaling? 
*/ 3852 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3853 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3854 tp->snd_scale = tp->requested_s_scale; 3855 tp->rcv_scale = tp->request_r_scale; 3856 } 3857 } 3858 3859process_ACK: 3860 acked = BYTES_ACKED(th, tp); 3861 tcpstat.tcps_rcvackpack++; 3862 tcpstat.tcps_rcvackbyte += acked; 3863 3864 /* 3865 * If the last packet was a retransmit, make sure 3866 * it was not spurious. 3867 * 3868 * This will also take care of congestion window 3869 * adjustment if a last packet was recovered due to a 3870 * tail loss probe. 3871 */ 3872 tcp_bad_rexmt_check(tp, th, &to); 3873 3874 /* Recalculate the RTT */ 3875 tcp_compute_rtt(tp, &to, th); 3876 3877 /* 3878 * If all outstanding data is acked, stop retransmit 3879 * timer and remember to restart (more output or persist). 3880 * If there is more data to be acked, restart retransmit 3881 * timer, using current (possibly backed-off) value. 3882 */ 3883 if (th->th_ack == tp->snd_max) { 3884 tp->t_timer[TCPT_REXMT] = 0; 3885 tp->t_timer[TCPT_PTO] = 0; 3886 needoutput = 1; 3887 } else if (tp->t_timer[TCPT_PERSIST] == 0) 3888 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, 3889 tp->t_rxtcur); 3890 3891 /* 3892 * If no data (only SYN) was ACK'd, skip rest of ACK 3893 * processing. 3894 */ 3895 if (acked == 0) 3896 goto step6; 3897 3898 3899 if ((thflags & TH_ECE) != 0 && 3900 ((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON)) { 3901 /* 3902 * Reduce the congestion window if we haven't 3903 * done so. 3904 */ 3905 if (!IN_FASTRECOVERY(tp)) { 3906 tcp_reduce_congestion_window(tp); 3907 tp->ecn_flags |= (TE_INRECOVERY|TE_SENDCWR); 3908 tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD); 3909 } 3910 } 3911 3912 /* 3913 * When new data is acked, open the congestion window. 3914 * The specifics of how this is achieved are up to the 3915 * congestion control algorithm in use for this connection. 3916 * 3917 * The calculations in this function assume that snd_una is 3918 * not updated yet. 
3919 */ 3920 if (!IN_FASTRECOVERY(tp)) { 3921 if (CC_ALGO(tp)->ack_rcvd != NULL) 3922 CC_ALGO(tp)->ack_rcvd(tp, th); 3923 tcp_ccdbg_trace(tp, th, TCP_CC_ACK_RCVD); 3924 } 3925 if (acked > so->so_snd.sb_cc) { 3926 tp->snd_wnd -= so->so_snd.sb_cc; 3927 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 3928 if (so->so_flags & SOF_ENABLE_MSGS) { 3929 so->so_msg_state->msg_serial_bytes -= 3930 (int)so->so_snd.sb_cc; 3931 } 3932 ourfinisacked = 1; 3933 } else { 3934 sbdrop(&so->so_snd, acked); 3935 if (so->so_flags & SOF_ENABLE_MSGS) { 3936 so->so_msg_state->msg_serial_bytes -= 3937 acked; 3938 } 3939 tcp_sbsnd_trim(&so->so_snd); 3940 tp->snd_wnd -= acked; 3941 ourfinisacked = 0; 3942 } 3943 /* detect una wraparound */ 3944 if ( !IN_FASTRECOVERY(tp) && 3945 SEQ_GT(tp->snd_una, tp->snd_recover) && 3946 SEQ_LEQ(th->th_ack, tp->snd_recover)) 3947 tp->snd_recover = th->th_ack - 1; 3948 3949 if (IN_FASTRECOVERY(tp) && 3950 SEQ_GEQ(th->th_ack, tp->snd_recover)) 3951 EXIT_FASTRECOVERY(tp); 3952 3953 tp->snd_una = th->th_ack; 3954 if (SACK_ENABLED(tp)) { 3955 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 3956 tp->snd_recover = tp->snd_una; 3957 } 3958 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 3959 tp->snd_nxt = tp->snd_una; 3960 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && 3961 tp->t_bwmeas != NULL) 3962 tcp_bwmeas_check(tp); 3963 3964 /* 3965 * sowwakeup must happen after snd_una, et al. are updated so that 3966 * the sequence numbers are in sync with so_snd 3967 */ 3968 sowwakeup(so); 3969 3970 switch (tp->t_state) { 3971 3972 /* 3973 * In FIN_WAIT_1 STATE in addition to the processing 3974 * for the ESTABLISHED state if our FIN is now acknowledged 3975 * then enter FIN_WAIT_2. 3976 */ 3977 case TCPS_FIN_WAIT_1: 3978 if (ourfinisacked) { 3979 /* 3980 * If we can't receive any more 3981 * data, then closing user can proceed. 3982 * Starting the TCPT_2MSL timer is contrary to the 3983 * specification, but if we don't get a FIN 3984 * we'll hang forever. 
3985 */ 3986 if (so->so_state & SS_CANTRCVMORE) { 3987 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, 3988 TCP_CONN_MAXIDLE(tp)); 3989 isconnected = FALSE; 3990 isdisconnected = TRUE; 3991 } 3992 DTRACE_TCP4(state__change, void, NULL, 3993 struct inpcb *, inp, 3994 struct tcpcb *, tp, 3995 int32_t, TCPS_FIN_WAIT_2); 3996 tp->t_state = TCPS_FIN_WAIT_2; 3997 /* fall through and make sure we also recognize 3998 * data ACKed with the FIN 3999 */ 4000 } 4001 tp->t_flags |= TF_ACKNOW; 4002 break; 4003 4004 /* 4005 * In CLOSING STATE in addition to the processing for 4006 * the ESTABLISHED state if the ACK acknowledges our FIN 4007 * then enter the TIME-WAIT state, otherwise ignore 4008 * the segment. 4009 */ 4010 case TCPS_CLOSING: 4011 if (ourfinisacked) { 4012 DTRACE_TCP4(state__change, void, NULL, 4013 struct inpcb *, inp, 4014 struct tcpcb *, tp, 4015 int32_t, TCPS_TIME_WAIT); 4016 tp->t_state = TCPS_TIME_WAIT; 4017 tcp_canceltimers(tp); 4018 if (tp->t_flagsext & TF_NOTIMEWAIT) { 4019 tp->t_flags |= TF_CLOSING; 4020 } else { 4021 add_to_time_wait(tp, 2 * tcp_msl); 4022 } 4023 isconnected = FALSE; 4024 isdisconnected = TRUE; 4025 } 4026 tp->t_flags |= TF_ACKNOW; 4027 break; 4028 4029 /* 4030 * In LAST_ACK, we may still be waiting for data to drain 4031 * and/or to be acked, as well as for the ack of our FIN. 4032 * If our FIN is now acknowledged, delete the TCB, 4033 * enter the closed state and return. 4034 */ 4035 case TCPS_LAST_ACK: 4036 if (ourfinisacked) { 4037 tp = tcp_close(tp); 4038 goto drop; 4039 } 4040 break; 4041 4042 /* 4043 * In TIME_WAIT state the only thing that should arrive 4044 * is a retransmission of the remote FIN. Acknowledge 4045 * it and restart the finack timer. 
4046 */ 4047 case TCPS_TIME_WAIT: 4048 add_to_time_wait(tp, 2 * tcp_msl); 4049 goto dropafterack; 4050 } 4051 4052 /* 4053 * If there is a SACK option on the ACK and we 4054 * haven't seen any duplicate acks before, count 4055 * it as a duplicate ack even if the cumulative 4056 * ack is advanced. If the receiver delayed an 4057 * ack and detected loss afterwards, then the ack 4058 * will advance cumulative ack and will also have 4059 * a SACK option. So counting it as one duplicate 4060 * ack is ok. 4061 */ 4062 if (sack_ackadv == 1 && 4063 tp->t_state == TCPS_ESTABLISHED && 4064 SACK_ENABLED(tp) && sack_bytes_acked > 0 && 4065 to.to_nsacks > 0 && tp->t_dupacks == 0 && 4066 SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 && 4067 !(tp->t_flagsext & TF_PKTS_REORDERED)) { 4068 tcpstat.tcps_sack_ackadv++; 4069 goto process_dupack; 4070 } 4071 } 4072 4073step6: 4074 /* 4075 * Update window information. 4076 * Don't look at window if no ACK: TAC's send garbage on first SYN. 4077 */ 4078 if ((thflags & TH_ACK) && 4079 (SEQ_LT(tp->snd_wl1, th->th_seq) || 4080 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 4081 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 4082 /* keep track of pure window updates */ 4083 if (tlen == 0 && 4084 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 4085 tcpstat.tcps_rcvwinupd++; 4086 tp->snd_wnd = tiwin; 4087 tp->snd_wl1 = th->th_seq; 4088 tp->snd_wl2 = th->th_ack; 4089 if (tp->snd_wnd > tp->max_sndwnd) 4090 tp->max_sndwnd = tp->snd_wnd; 4091 needoutput = 1; 4092 } 4093 4094 /* 4095 * Process segments with URG. 4096 */ 4097 if ((thflags & TH_URG) && th->th_urp && 4098 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4099 /* 4100 * This is a kludge, but if we receive and accept 4101 * random urgent pointers, we'll crash in 4102 * soreceive. It's hard to imagine someone 4103 * actually wanting to send this much urgent data. 
4104 */ 4105 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 4106 th->th_urp = 0; /* XXX */ 4107 thflags &= ~TH_URG; /* XXX */ 4108 goto dodata; /* XXX */ 4109 } 4110 /* 4111 * If this segment advances the known urgent pointer, 4112 * then mark the data stream. This should not happen 4113 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 4114 * a FIN has been received from the remote side. 4115 * In these states we ignore the URG. 4116 * 4117 * According to RFC961 (Assigned Protocols), 4118 * the urgent pointer points to the last octet 4119 * of urgent data. We continue, however, 4120 * to consider it to indicate the first octet 4121 * of data past the urgent section as the original 4122 * spec states (in one of two places). 4123 */ 4124 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 4125 tp->rcv_up = th->th_seq + th->th_urp; 4126 so->so_oobmark = so->so_rcv.sb_cc + 4127 (tp->rcv_up - tp->rcv_nxt) - 1; 4128 if (so->so_oobmark == 0) { 4129 so->so_state |= SS_RCVATMARK; 4130 postevent(so, 0, EV_OOB); 4131 } 4132 sohasoutofband(so); 4133 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 4134 } 4135 /* 4136 * Remove out of band data so doesn't get presented to user. 4137 * This can happen independent of advancing the URG pointer, 4138 * but if two URG's are pending at once, some out-of-band 4139 * data may creep in... ick. 4140 */ 4141 if (th->th_urp <= (u_int32_t)tlen 4142#if SO_OOBINLINE 4143 && (so->so_options & SO_OOBINLINE) == 0 4144#endif 4145 ) 4146 tcp_pulloutofband(so, th, m, 4147 drop_hdrlen); /* hdr drop is delayed */ 4148 } else { 4149 /* 4150 * If no out of band data is expected, 4151 * pull receive urgent pointer along 4152 * with the receive window. 4153 */ 4154 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 4155 tp->rcv_up = tp->rcv_nxt; 4156 } 4157dodata: 4158 4159 /* Set socket's connect or disconnect state correcly before doing data. 4160 * The following might unlock the socket if there is an upcall or a socket 4161 * filter. 
4162 */ 4163 if (isconnected) { 4164 soisconnected(so); 4165 } else if (isdisconnected) { 4166 soisdisconnected(so); 4167 } 4168 4169 /* Let's check the state of pcb just to make sure that it did not get closed 4170 * when we unlocked above 4171 */ 4172 if (inp->inp_state == INPCB_STATE_DEAD) { 4173 /* Just drop the packet that we are processing and return */ 4174 goto drop; 4175 } 4176 4177 /* 4178 * Process the segment text, merging it into the TCP sequencing queue, 4179 * and arranging for acknowledgment of receipt if necessary. 4180 * This process logically involves adjusting tp->rcv_wnd as data 4181 * is presented to the user (this happens in tcp_usrreq.c, 4182 * case PRU_RCVD). If a FIN has already been received on this 4183 * connection then we just ignore the text. 4184 */ 4185 if ((tlen || (thflags & TH_FIN)) && 4186 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4187 tcp_seq save_start = th->th_seq; 4188 tcp_seq save_end = th->th_seq + tlen; 4189 m_adj(m, drop_hdrlen); /* delayed header drop */ 4190 /* 4191 * Insert segment which includes th into TCP reassembly queue 4192 * with control block tp. Set thflags to whether reassembly now 4193 * includes a segment with FIN. This handles the common case 4194 * inline (segment is the next to be received on an established 4195 * connection, and the queue is empty), avoiding linkage into 4196 * and removal from the queue and repetition of various 4197 * conversions. 4198 * Set DELACK for segments received in order, but ack 4199 * immediately when segments are out of order (so 4200 * fast retransmit can work). 
4201 */ 4202 if (th->th_seq == tp->rcv_nxt && 4203 LIST_EMPTY(&tp->t_segq) && 4204 TCPS_HAVEESTABLISHED(tp->t_state)) { 4205 TCP_INC_VAR(tp->t_unacksegs, nlropkts); 4206 /* 4207 * Calculate the RTT on the receiver only if the 4208 * connection is in streaming mode and the last 4209 * packet was not an end-of-write 4210 */ 4211 if ((tp->t_flags & TF_STRETCHACK) && 4212 !(tp->t_flagsext & TF_STREAMEOW)) 4213 tcp_compute_rtt(tp, &to, th); 4214 4215 if (DELAY_ACK(tp, th) && 4216 ((tp->t_flags & TF_ACKNOW) == 0) ) { 4217 if ((tp->t_flags & TF_DELACK) == 0) { 4218 tp->t_flags |= TF_DELACK; 4219 tp->t_timer[TCPT_DELACK] = 4220 OFFSET_FROM_START(tp, tcp_delack); 4221 } 4222 } 4223 else { 4224 tp->t_flags |= TF_ACKNOW; 4225 } 4226 tp->rcv_nxt += tlen; 4227 thflags = th->th_flags & TH_FIN; 4228 TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts); 4229 tcpstat.tcps_rcvbyte += tlen; 4230 if (nstat_collect) { 4231 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) { 4232 INP_ADD_STAT(inp, cell, wifi, wired, 4233 rxpackets, m->m_pkthdr.lro_npkts); 4234 } else { 4235 INP_ADD_STAT(inp, cell, wifi, wired, 4236 rxpackets, 1); 4237 } 4238 INP_ADD_STAT(inp, cell, wifi, wired, 4239 rxbytes, tlen); 4240 } 4241 tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen); 4242 so_recv_data_stat(so, m, drop_hdrlen); 4243 4244 if (sbappendstream_rcvdemux(so, m, 4245 th->th_seq - (tp->irs + 1), 0)) { 4246 sorwakeup(so); 4247 } 4248 } else { 4249 thflags = tcp_reass(tp, th, &tlen, m, ifp); 4250 tp->t_flags |= TF_ACKNOW; 4251 } 4252 4253 if (tlen > 0 && SACK_ENABLED(tp)) 4254 tcp_update_sack_list(tp, save_start, save_end); 4255 4256 tcp_adaptive_rwtimo_check(tp, tlen); 4257 4258 if (tp->t_flags & TF_DELACK) 4259 { 4260#if INET6 4261 if (isipv6) { 4262 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), 4263 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])), 4264 th->th_seq, th->th_ack, th->th_win); 4265 } 4266 else 4267#endif 4268 { 4269 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | 
th->th_sport), 4270 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), 4271 th->th_seq, th->th_ack, th->th_win); 4272 } 4273 4274 } 4275 } else { 4276 m_freem(m); 4277 thflags &= ~TH_FIN; 4278 } 4279 4280 /* 4281 * If FIN is received ACK the FIN and let the user know 4282 * that the connection is closing. 4283 */ 4284 if (thflags & TH_FIN) { 4285 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4286 socantrcvmore(so); 4287 postevent(so, 0, EV_FIN); 4288 /* 4289 * If connection is half-synchronized 4290 * (ie NEEDSYN flag on) then delay ACK, 4291 * so it may be piggybacked when SYN is sent. 4292 * Otherwise, since we received a FIN then no 4293 * more input can be expected, send ACK now. 4294 */ 4295 TCP_INC_VAR(tp->t_unacksegs, nlropkts); 4296 if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) { 4297 if ((tp->t_flags & TF_DELACK) == 0) { 4298 tp->t_flags |= TF_DELACK; 4299 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); 4300 } 4301 } 4302 else { 4303 tp->t_flags |= TF_ACKNOW; 4304 } 4305 tp->rcv_nxt++; 4306 } 4307 switch (tp->t_state) { 4308 4309 /* 4310 * In SYN_RECEIVED and ESTABLISHED STATES 4311 * enter the CLOSE_WAIT state. 4312 */ 4313 case TCPS_SYN_RECEIVED: 4314 tp->t_starttime = tcp_now; 4315 case TCPS_ESTABLISHED: 4316 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 4317 struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT); 4318 tp->t_state = TCPS_CLOSE_WAIT; 4319 break; 4320 4321 /* 4322 * If still in FIN_WAIT_1 STATE FIN has not been acked so 4323 * enter the CLOSING state. 4324 */ 4325 case TCPS_FIN_WAIT_1: 4326 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, 4327 struct tcpcb *, tp, int32_t, TCPS_CLOSING); 4328 tp->t_state = TCPS_CLOSING; 4329 break; 4330 4331 /* 4332 * In FIN_WAIT_2 state enter the TIME_WAIT state, 4333 * starting the time-wait timer, turning off the other 4334 * standard timers. 
4335 */ 4336 case TCPS_FIN_WAIT_2: 4337 DTRACE_TCP4(state__change, void, NULL, 4338 struct inpcb *, inp, 4339 struct tcpcb *, tp, 4340 int32_t, TCPS_TIME_WAIT); 4341 tp->t_state = TCPS_TIME_WAIT; 4342 tcp_canceltimers(tp); 4343 tp->t_flags |= TF_ACKNOW; 4344 if (tp->t_flagsext & TF_NOTIMEWAIT) { 4345 tp->t_flags |= TF_CLOSING; 4346 } else { 4347 add_to_time_wait(tp, 2 * tcp_msl); 4348 } 4349 soisdisconnected(so); 4350 break; 4351 4352 /* 4353 * In TIME_WAIT state restart the 2 MSL time_wait timer. 4354 */ 4355 case TCPS_TIME_WAIT: 4356 add_to_time_wait(tp, 2 * tcp_msl); 4357 break; 4358 } 4359 } 4360#if TCPDEBUG 4361 if (so->so_options & SO_DEBUG) 4362 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, 4363 &tcp_savetcp, 0); 4364#endif 4365 4366 /* 4367 * Return any desired output. 4368 */ 4369 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 4370 (void) tcp_output(tp); 4371 } 4372 4373 tcp_check_timer_state(tp); 4374 4375 4376 tcp_unlock(so, 1, 0); 4377 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); 4378 return; 4379 4380dropafterack: 4381 /* 4382 * Generate an ACK dropping incoming segment if it occupies 4383 * sequence space, where the ACK reflects our state. 4384 * 4385 * We can now skip the test for the RST flag since all 4386 * paths to this code happen after packets containing 4387 * RST have been dropped. 4388 * 4389 * In the SYN-RECEIVED state, don't send an ACK unless the 4390 * segment we received passes the SYN-RECEIVED ACK test. 4391 * If it fails send a RST. This breaks the loop in the 4392 * "LAND" DoS attack, and also prevents an ACK storm 4393 * between two listening ports that have been sent forged 4394 * SYN segments, each with the source address of the other. 
4395 */ 4396 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 4397 (SEQ_GT(tp->snd_una, th->th_ack) || 4398 SEQ_GT(th->th_ack, tp->snd_max)) ) { 4399 rstreason = BANDLIM_RST_OPENPORT; 4400 IF_TCP_STATINC(ifp, dospacket); 4401 goto dropwithreset; 4402 } 4403#if TCPDEBUG 4404 if (so->so_options & SO_DEBUG) 4405 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, 4406 &tcp_savetcp, 0); 4407#endif 4408 m_freem(m); 4409 tp->t_flags |= TF_ACKNOW; 4410 (void) tcp_output(tp); 4411 4412 /* Don't need to check timer state as we should have done it during tcp_output */ 4413 tcp_unlock(so, 1, 0); 4414 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); 4415 return; 4416dropwithresetnosock: 4417 nosock = 1; 4418dropwithreset: 4419 /* 4420 * Generate a RST, dropping incoming segment. 4421 * Make ACK acceptable to originator of segment. 4422 * Don't bother to respond if destination was broadcast/multicast. 4423 */ 4424 if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 4425 goto drop; 4426#if INET6 4427 if (isipv6) { 4428 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 4429 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 4430 goto drop; 4431 } else 4432#endif /* INET6 */ 4433 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 4434 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 4435 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 4436 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 4437 goto drop; 4438 /* IPv6 anycast check is done at tcp6_input() */ 4439 4440 /* 4441 * Perform bandwidth limiting. 
 */
#if ICMP_BANDLIM
	/* Over the RST budget: drop silently instead of responding */
	if (badport_bandlim(rstreason) < 0)
		goto drop;
#endif

#if TCPDEBUG
	if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
		    &tcp_savetcp, 0);
#endif
	bzero(&tra, sizeof(tra));
	tra.ifscope = ifscope;
	tra.awdl_unrestricted = 1;
	if (thflags & TH_ACK)
		/* mtod() below is safe as long as hdr dropping is delayed */
		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
		    TH_RST, &tra);
	else {
		if (thflags & TH_SYN)
			tlen++;
		/* mtod() below is safe as long as hdr dropping is delayed */
		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
		    (tcp_seq)0, TH_RST|TH_ACK, &tra);
	}
	/* destroy temporarily created socket */
	if (dropsocket) {
		(void) soabort(so);
		tcp_unlock(so, 1, 0);
	} else if ((inp != NULL) && (nosock == 0)) {
		tcp_unlock(so, 1, 0);
	}
	KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	return;
dropnosock:
	nosock = 1;
drop:
	/*
	 * Drop space held by incoming segment and return.
	 */
#if TCPDEBUG
	if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
		    &tcp_savetcp, 0);
#endif
	m_freem(m);
	/* destroy temporarily created socket */
	if (dropsocket) {
		(void) soabort(so);
		tcp_unlock(so, 1, 0);
	}
	else if (nosock == 0) {
		tcp_unlock(so, 1, 0);
	}
	KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	return;
}

/*
 * tcp_dooptions: parse the TCP options found in cp[0..cnt) and record
 * them in *to.  MSS, window-scale and (on SYN) timestamp options are
 * only honored on segments carrying SYN.  A malformed option (length
 * byte < 2, or an option running past the end of the option space)
 * terminates parsing.  If the segment carries SYN, finishes by calling
 * tcp_mss() so t_maxseg is set from the received (or defaulted) MSS.
 */
static void
tcp_dooptions(tp, cp, cnt, th, to, input_ifscope)
/*
 * Parse TCP options and place in tcpopt.
 */
	struct tcpcb *tp;
	u_char *cp;		/* start of the option space */
	int cnt;		/* number of option bytes */
	struct tcphdr *th;
	struct tcpopt *to;	/* out: parsed option values */
	unsigned int input_ifscope;
{
	u_short mss = 0;
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			/* Every option other than EOL/NOP has a length byte */
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
			NTOHS(mss);
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			to->to_flags |= TOF_SCALE;
			tp->t_flags |= TF_RCVD_SCALE;
			/* Clamp the peer's shift count to the protocol maximum */
			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			to->to_flags |= TOF_TS;
			bcopy((char *)cp + 2,
			    (char *)&to->to_tsval, sizeof(to->to_tsval));
			NTOHL(to->to_tsval);
			bcopy((char *)cp + 6,
			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
			NTOHL(to->to_tsecr);
			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
			if (th->th_flags & TH_SYN) {
				tp->t_flags |= TF_RCVD_TSTMP;
				tp->ts_recent = to->to_tsval;
				tp->ts_recent_age = tcp_now;
			}
			break;
		case TCPOPT_SACK_PERMITTED:
			if (!tcp_do_sack ||
			    optlen != TCPOLEN_SACK_PERMITTED)
				continue;
			if (th->th_flags & TH_SYN)
				to->to_flags |= TOF_SACK;
			break;
		case TCPOPT_SACK:
			/* Option must hold a whole number of SACK blocks */
			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
				continue;
			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
			to->to_sacks = cp + 2;
			tcpstat.tcps_sack_rcv_blocks++;

			break;

#if MPTCP
		case TCPOPT_MULTIPATH:
			tcp_do_mptcp_options(tp, cp, th, to, optlen);
			break;
#endif /* MPTCP */
		}
	}
	if (th->th_flags & TH_SYN)
		tcp_mss(tp, mss, input_ifscope);	/* sets t_maxseg */
}

/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 */
static void
tcp_pulloutofband(so, th, m, off)
	struct socket *so;
	struct tcphdr *th;
	register struct mbuf *m;
	int off;		/* delayed to be dropped hdrlen */
{
	/* Offset of the urgent byte from the start of the mbuf chain */
	int cnt = off + th->th_urp - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			/* Urgent byte is in this mbuf: stash it ... */
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			/* ... and splice it out of the in-band data */
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			if (m->m_flags & M_PKTHDR)
				m->m_pkthdr.len--;
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == 0)
			break;
	}
	/* Caller guarantees th_urp points inside the segment */
	panic("tcp_pulloutofband");
}

/*
 * Return the smallest non-zero RTT recorded in the rtt_hist[] slots
 * (one per-minute minimum per slot, see update_base_rtt()), or 0 if
 * there is no history yet.
 */
uint32_t
get_base_rtt(struct tcpcb *tp)
{
	uint32_t base_rtt = 0, i;
	for (i = 0; i < N_RTT_BASE; ++i) {
		if (tp->rtt_hist[i] != 0 &&
		    (base_rtt == 0 || tp->rtt_hist[i] < base_rtt))
			base_rtt = tp->rtt_hist[i];
	}
	return base_rtt;
}

/* Each value of RTT base represents the minimum RTT seen in a minute.
 * We keep up to N_RTT_BASE minutes worth of history.
 */
void
update_base_rtt(struct tcpcb *tp, uint32_t rtt)
{
	int32_t i, qdelay;
	u_int32_t base_rtt;

	/*
	 * After rtt_samples_per_slot samples, age the history and open
	 * a fresh slot; otherwise keep the running minimum in slot 0.
	 */
	if (++tp->rtt_count >= rtt_samples_per_slot) {
#if TRAFFIC_MGT
		/*
		 * If the recv side is being throttled, check if the
		 * current RTT is closer to the base RTT seen in
		 * first (recent) two slots. If so, unthrottle the stream.
		 */
		if (tp->t_flagsext & TF_RECV_THROTTLE) {
			base_rtt = min(tp->rtt_hist[0], tp->rtt_hist[1]);
			qdelay = tp->t_rttcur - base_rtt;
			if (qdelay < target_qdelay)
				tp->t_flagsext &= ~(TF_RECV_THROTTLE);
		}
#endif /* TRAFFIC_MGT */

		for (i = (N_RTT_BASE-1); i > 0; --i) {
			tp->rtt_hist[i] = tp->rtt_hist[i-1];
		}
		tp->rtt_hist[0] = rtt;
		tp->rtt_count = 0;
	} else {
		tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt);
	}
}

/*
 * If we have a timestamp reply, update smoothed RTT. If no timestamp is
 * present but transmit timer is running and timed sequence number was
 * acked, update smoothed RTT.
 *
 * If timestamps are supported, a receiver can update RTT even if
 * there is no outstanding data.
 *
 * Some boxes send broken timestamp replies during the SYN+ACK phase,
 * ignore timestamps of 0, or we could calculate a huge RTT and blow up
 * the retransmit timer.
 */
static void
tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
{
	VERIFY(to != NULL && th != NULL);
	if (((to->to_flags & TOF_TS) != 0) &&
	    (to->to_tsecr != 0) &&
	    TSTMP_GEQ(tcp_now, to->to_tsecr)) {
		/* Echoed timestamp gives the RTT directly */
		tcp_xmit_timer(tp, tcp_now - to->to_tsecr,
			to->to_tsecr, th->th_ack);
	} else if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) {
		/* Fall back to the one timed sequence number */
		tcp_xmit_timer(tp, tcp_now - tp->t_rtttime, 0,
			th->th_ack);
	}
}

/*
 * Collect new round-trip time estimate
 * and update averages and current timeout.
 */
static void
tcp_xmit_timer(register struct tcpcb *tp, int rtt,
	u_int32_t tsecr, tcp_seq th_ack)
{
	register int delta;

	if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
		if (SEQ_GT(th_ack, tp->snd_una) &&
		    SEQ_LEQ(th_ack, tp->snd_max) &&
		    (tsecr == 0 ||
		    TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
			/*
			 * We received a new ACK after a
			 * spurious timeout.
 Adapt retransmission
			 * timer as described in rfc 4015.
			 */
			tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
			tp->t_badrexmt_time = 0;
			/* Restore the pre-timeout estimators, scaled back up */
			tp->t_srtt = max(tp->t_srtt_prev, rtt);
			tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
			tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1));
			tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;

			if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
				tp->t_rttbest = tp->t_srtt + tp->t_rttvar;

			goto compute_rto;
		} else {
			return;
		}
	}

	tcpstat.tcps_rttupdated++;
	tp->t_rttupdated++;

	if (rtt > 0) {
		tp->t_rttcur = rtt;
		update_base_rtt(tp, rtt);
	}

	if (tp->t_srtt != 0) {
		/*
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 32). The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).
		 *
		 * Freebsd adjusts rtt to origin 0 by subtracting 1
		 * from the provided rtt value. This was required because
		 * of the way t_rtttime was initialised to 1 before.
		 * Since we changed t_rtttime to be based on
		 * tcp_now, this extra adjustment is not needed.
		 */
		delta = (rtt << TCP_DELTA_SHIFT)
			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));

		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1;

		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1;
		if (tp->t_rttbest == 0 ||
		    tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << TCP_RTT_SHIFT;
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
	}

compute_rto:
	nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt,
		tp->t_rttvar);
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;
	tp->t_rxtstart = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
		max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
		TCP_ADD_REXMTSLOP(tp));

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}

/*
 * Return the effective path MTU for rt: the route's cached rmx_mtu when
 * set (never larger than the interface MTU), otherwise the interface
 * MTU itself.  The route entry must be locked by the caller (asserted).
 */
static inline unsigned int
tcp_maxmtu(struct rtentry *rt)
{
	unsigned int maxmtu;

	RT_LOCK_ASSERT_HELD(rt);
	if (rt->rt_rmx.rmx_mtu == 0)
		maxmtu = rt->rt_ifp->if_mtu;
	else
		maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);

	return (maxmtu);
}

#if INET6
/*
 * IPv6 counterpart of tcp_maxmtu(): uses the ND link MTU of the
 * interface (IN6_LINKMTU), holding the nd_ifinfo lock while it is read.
 * The route entry must be locked by the caller (asserted).
 */
static inline unsigned int
tcp_maxmtu6(struct rtentry *rt)
{
	unsigned int maxmtu;
	struct nd_ifinfo *ndi;

	RT_LOCK_ASSERT_HELD(rt);
	lck_rw_lock_shared(nd_if_rwlock);
	if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized)
		ndi = NULL;
	if (ndi != NULL)
		lck_mtx_lock(&ndi->lock);
	if (rt->rt_rmx.rmx_mtu == 0)
		maxmtu = IN6_LINKMTU(rt->rt_ifp);
	else
		maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
	if (ndi != NULL)
		lck_mtx_unlock(&ndi->lock);
	lck_rw_done(nd_if_rwlock);

	return (maxmtu);
}
#endif

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing
 * interface without forcing IP to fragment; if bigger than
 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 * to utilize large mbufs.  If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window.  While looking at the routing entry, we also initialize
 * other path-dependent parameters from pre-set or cached values
 * in the routing entry.
 *
 * Also take into account the space needed for options that we
 * send regularly.
 Make maxseg shorter by that amount to assure
 * that we can send maxseg amount of data even when the options
 * are present.  Store the upper limit of the length of options plus
 * data in maxopd.
 *
 * NOTE that this routine is only called when we process an incoming
 * segment, for outgoing segments only tcp_mssopt is called.
 *
 */
void
tcp_mss(tp, offer, input_ifscope)
	struct tcpcb *tp;
	int offer;		/* peer's MSS offer; 0 = no MSS option, -1 = no SYN yet */
	unsigned int input_ifscope;
{
	register struct rtentry *rt;
	struct ifnet *ifp;
	register int rtt, mss;
	u_int32_t bufsize;
	struct inpcb *inp;
	struct socket *so;
	struct rmxp_tao *taop;
	int origoffer = offer;
	u_int32_t sb_max_corrected;
	int isnetlocal = 0;
#if INET6
	int isipv6;
	int min_protoh;
#endif

	inp = tp->t_inpcb;
#if INET6
	isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
			    : sizeof (struct tcpiphdr);
#else
#define min_protoh  (sizeof (struct tcpiphdr))
#endif

#if INET6
	if (isipv6) {
		rt = tcp_rtlookup6(inp, input_ifscope);
	}
	else
#endif /* INET6 */
	{
		rt = tcp_rtlookup(inp, input_ifscope);
	}
	isnetlocal = (tp->t_flags & TF_LOCAL);

	if (rt == NULL) {
		/* No route: fall back to the protocol default MSS */
		tp->t_maxopd = tp->t_maxseg =
#if INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;
		return;
	}
	ifp = rt->rt_ifp;
	/*
	 * Slower link window correction:
	 * If a value is specified for slowlink_wsize use it for
	 * PPP links believed to be on a serial modem (speed <128Kbps).
	 * Excludes 9600bps as it is the default value advertised
	 * by pseudo-devices over ppp.
	 */
	if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
	    ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
		tp->t_flags |= TF_SLOWLINK;
	}
	so = inp->inp_socket;

	taop = rmx_taop(rt->rt_rmx);
	/*
	 * Offer == -1 means that we didn't receive SYN yet,
	 * use cached value in that case;
	 */
	if (offer == -1)
		offer = taop->tao_mssopt;
	/*
	 * Offer == 0 means that there was no MSS on the SYN segment,
	 * in this case we use tcp_mssdflt.
	 */
	if (offer == 0)
		offer =
#if INET6
			isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
			tcp_mssdflt;
	else {
		/*
		 * Prevent DoS attack with too small MSS. Round up
		 * to at least minmss.
		 */
		offer = max(offer, tcp_minmss);
		/*
		 * Sanity check: make sure that maxopd will be large
		 * enough to allow some data on segments even if all
		 * the option space is used (40 bytes).  Otherwise
		 * funny things may happen in tcp_output.
		 */
		offer = max(offer, 64);
	}
	taop->tao_mssopt = offer;

	/*
	 * While we're here, check if there's an initial rtt
	 * or rttvar.  Convert from the route-table units
	 * to scaled multiples of the slow timeout timer.
	 */
	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) {
		tcp_getrt_rtt(tp, rt);
	} else {
		tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
	}

	/* Start from the path MTU, less IP+TCP header overhead */
#if INET6
	mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
#else
	mss = tcp_maxmtu(rt);
#endif
	mss -= min_protoh;

	if (rt->rt_rmx.rmx_mtu == 0) {
#if INET6
		if (isipv6) {
			if (!isnetlocal)
				mss = min(mss, tcp_v6mssdflt);
		} else
#endif /* INET6 */
		if (!isnetlocal)
			mss = min(mss, tcp_mssdflt);
	}

	mss = min(mss, offer);
	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	/*
	 * origoffer == -1 indicates that no segments were received yet.
	 * In this case we just guess.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (origoffer == -1 ||
	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
		mss -= TCPOLEN_TSTAMP_APPA;

#if MPTCP
	mss -= mptcp_adj_mss(tp, FALSE);
#endif /* MPTCP */
	tp->t_maxseg = mss;

	/*
	 * Calculate corrected value for sb_max; ensure to upgrade the
	 * numerator for large sb_max values else it will overflow.
	 */
	sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);

	/*
	 * If there's a pipesize (ie loopback), change the socket
	 * buffer to that size only if it's bigger than the current
	 * sockbuf size.  Make the socket buffers an integral
	 * number of mss units; if the mss is larger than
	 * the socket buffer, decrease the mss.
	 */
#if RTV_SPIPE
	bufsize = rt->rt_rmx.rmx_sendpipe;
	if (bufsize < so->so_snd.sb_hiwat)
#endif
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;
	else {
		/* Round the send buffer up to a whole number of segments */
		bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
		if (bufsize > sb_max_corrected)
			bufsize = sb_max_corrected;
		(void)sbreserve(&so->so_snd, bufsize);
	}
	tp->t_maxseg = mss;

#if RTV_RPIPE
	bufsize = rt->rt_rmx.rmx_recvpipe;
	if (bufsize < so->so_rcv.sb_hiwat)
#endif
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		/* Round the receive buffer up to a whole number of segments */
		bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
		if (bufsize > sb_max_corrected)
			bufsize = sb_max_corrected;
		(void)sbreserve(&so->so_rcv, bufsize);
	}

	set_tcp_stream_priority(so);

	if (rt->rt_rmx.rmx_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * slow-start threshold, but set the threshold to
		 * no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
		tcpstat.tcps_usedssthresh++;
	} else {
		tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	}

	/*
	 * Set the slow-start flight size depending on whether this
	 * is a local network or not.
	 */
	if (CC_ALGO(tp)->cwnd_init != NULL)
		CC_ALGO(tp)->cwnd_init(tp);

	tcp_ccdbg_trace(tp, NULL, TCP_CC_CWND_INIT);

	/* Route locked during lookup above */
	RT_UNLOCK(rt);
}

/*
 * Determine the MSS option to send on an outgoing SYN.
 */
int
tcp_mssopt(tp)
	struct tcpcb *tp;
{
	struct rtentry *rt;
	int mss;
#if INET6
	int isipv6;
	int min_protoh;
#endif

#if INET6
	isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	min_protoh = isipv6 ?
 sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
			    : sizeof (struct tcpiphdr);
#else
#define min_protoh  (sizeof (struct tcpiphdr))
#endif

#if INET6
	if (isipv6)
		rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
	else
#endif /* INET6 */
	rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
	if (rt == NULL) {
		/* No route: advertise the protocol default MSS */
		return (
#if INET6
			isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
			tcp_mssdflt);
	}
	/*
	 * Slower link window correction:
	 * If a value is specified for slowlink_wsize use it for PPP links
	 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
	 * it is the default value advertised by pseudo-devices over ppp.
	 */
	if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
	    rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
		tp->t_flags |= TF_SLOWLINK;
	}

#if INET6
	mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
#else
	mss = tcp_maxmtu(rt);
#endif
	/* Route locked during lookup above */
	RT_UNLOCK(rt);
	return (mss - min_protoh);
}

/*
 * When a partial ack arrives, force the retransmission of the
 * next unacknowledged segment.  Do not clear tp->t_dupacks.
 * By setting snd_nxt to th_ack, this forces retransmission timer to
 * be started again.
 */
static void
tcp_newreno_partial_ack(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	tcp_seq onxt = tp->snd_nxt;
	u_int32_t  ocwnd = tp->snd_cwnd;
	/* Stop the timers and rewind snd_nxt so the hole is resent now */
	tp->t_timer[TCPT_REXMT] = 0;
	tp->t_timer[TCPT_PTO] = 0;
	tp->t_rtttime = 0;
	tp->snd_nxt = th->th_ack;
	/*
	 * Set snd_cwnd to one segment beyond acknowledged offset
	 * (tp->snd_una has not yet been updated when this function
	 *  is called)
	 */
	tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	tp->snd_cwnd = ocwnd;
	if (SEQ_GT(onxt, tp->snd_nxt))
		tp->snd_nxt = onxt;
	/*
	 * Partial window deflation.  Relies on fact that tp->snd_una
	 * not updated yet.
	 */
	if (tp->snd_cwnd > BYTES_ACKED(th, tp))
		tp->snd_cwnd -= BYTES_ACKED(th, tp);
	else
		tp->snd_cwnd = 0;
	tp->snd_cwnd += tp->t_maxseg;

}

/*
 * Drop a random TCP connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * The listening TCP socket "head" must be locked
 */
static int
tcp_dropdropablreq(struct socket *head)
{
	struct socket *so, *sonext;
	unsigned int i, j, qlen;
	static u_int32_t rnd = 0;
	static u_int64_t old_runtime;
	static unsigned int cur_cnt, old_cnt;
	u_int64_t now_sec;
	struct inpcb *inp = NULL;
	struct tcpcb *tp;

	if ((head->so_options & SO_ACCEPTCONN) == 0)
		return (0);

	if (TAILQ_EMPTY(&head->so_incomp))
		return (0);

	/*
	 * Check if there is any socket in the incomp queue
	 * that is closed because of a reset from the peer and is
	 * waiting to be garbage collected.
If so, pick that as 5240 * the victim 5241 */ 5242 TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) { 5243 inp = sotoinpcb(so); 5244 tp = intotcpcb(inp); 5245 if (tp != NULL && tp->t_state == TCPS_CLOSED && 5246 so->so_head != NULL && 5247 (so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) == 5248 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) { 5249 /* 5250 * The listen socket is already locked but we 5251 * can lock this socket here without lock ordering 5252 * issues because it is in the incomp queue and 5253 * is not visible to others. 5254 */ 5255 if (lck_mtx_try_lock(&inp->inpcb_mtx)) { 5256 so->so_usecount++; 5257 goto found_victim; 5258 } else { 5259 continue; 5260 } 5261 } 5262 } 5263 5264 so = TAILQ_FIRST(&head->so_incomp); 5265 5266 now_sec = net_uptime(); 5267 if ((i = (now_sec - old_runtime)) != 0) { 5268 old_runtime = now_sec; 5269 old_cnt = cur_cnt / i; 5270 cur_cnt = 0; 5271 } 5272 5273 5274 qlen = head->so_incqlen; 5275 if (rnd == 0) 5276 rnd = RandomULong(); 5277 5278 if (++cur_cnt > qlen || old_cnt > qlen) { 5279 rnd = (314159 * rnd + 66329) & 0xffff; 5280 j = ((qlen + 1) * rnd) >> 16; 5281 5282 while (j-- && so) 5283 so = TAILQ_NEXT(so, so_list); 5284 } 5285 /* Find a connection that is not already closing (or being served) */ 5286 while (so) { 5287 inp = (struct inpcb *)so->so_pcb; 5288 5289 sonext = TAILQ_NEXT(so, so_list); 5290 5291 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) 5292 != WNT_STOPUSING) { 5293 /* 5294 * Avoid the issue of a socket being accepted 5295 * by one input thread and being dropped by 5296 * another input thread. If we can't get a hold 5297 * on this mutex, then grab the next socket in 5298 * line. 
5299 */ 5300 if (lck_mtx_try_lock(&inp->inpcb_mtx)) { 5301 so->so_usecount++; 5302 if ((so->so_usecount == 2) && 5303 (so->so_state & SS_INCOMP) && 5304 !(so->so_flags & SOF_INCOMP_INPROGRESS)) { 5305 break; 5306 } else { 5307 /* 5308 * don't use if being accepted or 5309 * used in any other way 5310 */ 5311 in_pcb_checkstate(inp, WNT_RELEASE, 1); 5312 tcp_unlock(so, 1, 0); 5313 } 5314 } else { 5315 /* 5316 * do not try to lock the inp in 5317 * in_pcb_checkstate because the lock 5318 * is already held in some other thread. 5319 * Only drop the inp_wntcnt reference. 5320 */ 5321 in_pcb_checkstate(inp, WNT_RELEASE, 1); 5322 } 5323 } 5324 so = sonext; 5325 5326 } 5327 if (so == NULL) { 5328 return (0); 5329 } 5330 5331 /* Makes sure socket is still in the right state to be discarded */ 5332 5333 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { 5334 tcp_unlock(so, 1, 0); 5335 return (0); 5336 } 5337 5338found_victim: 5339 if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) { 5340 /* do not discard: that socket is being accepted */ 5341 tcp_unlock(so, 1, 0); 5342 return (0); 5343 } 5344 5345 TAILQ_REMOVE(&head->so_incomp, so, so_list); 5346 tcp_unlock(head, 0, 0); 5347 5348 lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); 5349 tp = sototcpcb(so); 5350 so->so_flags |= SOF_OVERFLOW; 5351 so->so_head = NULL; 5352 5353 tcp_close(tp); 5354 if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) { 5355 /* 5356 * Some one has a wantcnt on this pcb. Since WNT_ACQUIRE 5357 * doesn't require a lock, it could have happened while 5358 * we are holding the lock. This pcb will have to 5359 * be garbage collected later. 5360 * Release the reference held for so_incomp queue 5361 */ 5362 so->so_usecount--; 5363 tcp_unlock(so, 1, 0); 5364 } else { 5365 /* 5366 * Unlock this socket and leave the reference on. 
5367 * We need to acquire the pcbinfo lock in order to 5368 * fully dispose it off 5369 */ 5370 tcp_unlock(so, 0, 0); 5371 5372 lck_rw_lock_exclusive(tcbinfo.ipi_lock); 5373 5374 tcp_lock(so, 0, 0); 5375 /* Release the reference held for so_incomp queue */ 5376 so->so_usecount--; 5377 5378 if (so->so_usecount != 1 || 5379 (inp->inp_wantcnt > 0 && 5380 inp->inp_wantcnt != WNT_STOPUSING)) { 5381 /* 5382 * There is an extra wantcount or usecount 5383 * that must have been added when the socket 5384 * was unlocked. This socket will have to be 5385 * garbage collected later 5386 */ 5387 tcp_unlock(so, 1, 0); 5388 } else { 5389 5390 /* Drop the reference held for this function */ 5391 so->so_usecount--; 5392 5393 in_pcbdispose(inp); 5394 } 5395 lck_rw_done(tcbinfo.ipi_lock); 5396 } 5397 tcpstat.tcps_drops++; 5398 5399 tcp_lock(head, 0, 0); 5400 head->so_incqlen--; 5401 head->so_qlen--; 5402 return(1); 5403} 5404 5405/* Set background congestion control on a socket */ 5406void 5407tcp_set_background_cc(struct socket *so) 5408{ 5409 tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX); 5410} 5411 5412/* Set foreground congestion control on a socket */ 5413void 5414tcp_set_foreground_cc(struct socket *so) 5415{ 5416 if (tcp_use_newreno) 5417 tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX); 5418 else 5419 tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX); 5420} 5421 5422static void 5423tcp_set_new_cc(struct socket *so, uint16_t cc_index) 5424{ 5425 struct inpcb *inp = sotoinpcb(so); 5426 struct tcpcb *tp = intotcpcb(inp); 5427 u_char old_cc_index = 0; 5428 if (tp->tcp_cc_index != cc_index) { 5429 5430 old_cc_index = tp->tcp_cc_index; 5431 5432 if (CC_ALGO(tp)->cleanup != NULL) 5433 CC_ALGO(tp)->cleanup(tp); 5434 tp->tcp_cc_index = cc_index; 5435 5436 tcp_cc_allocate_state(tp); 5437 5438 if (CC_ALGO(tp)->switch_to != NULL) 5439 CC_ALGO(tp)->switch_to(tp, old_cc_index); 5440 5441 tcp_ccdbg_trace(tp, NULL, TCP_CC_CHANGE_ALGO); 5442 } 5443} 5444 5445void 5446tcp_set_recv_bg(struct socket 
*so) 5447{ 5448 if (!IS_TCP_RECV_BG(so)) 5449 so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG; 5450 5451 /* Unset Large Receive Offload on background sockets */ 5452 so_set_lro(so, SO_TC_BK); 5453} 5454 5455void 5456tcp_clear_recv_bg(struct socket *so) 5457{ 5458 if (IS_TCP_RECV_BG(so)) 5459 so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); 5460 5461 /* 5462 * Set/unset use of Large Receive Offload depending on 5463 * the traffic class 5464 */ 5465 so_set_lro(so, so->so_traffic_class); 5466} 5467 5468void 5469inp_fc_unthrottle_tcp(struct inpcb *inp) 5470{ 5471 struct tcpcb *tp = inp->inp_ppcb; 5472 /* 5473 * Back off the slow-start threshold and enter 5474 * congestion avoidance phase 5475 */ 5476 if (CC_ALGO(tp)->pre_fr != NULL) 5477 CC_ALGO(tp)->pre_fr(tp); 5478 5479 tp->snd_cwnd = tp->snd_ssthresh; 5480 5481 /* 5482 * Restart counting for ABC as we changed the 5483 * congestion window just now. 5484 */ 5485 tp->t_bytes_acked = 0; 5486 5487 /* Reset retransmit shift as we know that the reason 5488 * for delay in sending a packet is due to flow 5489 * control on the outgoing interface. There is no need 5490 * to backoff retransmit timer. 5491 */ 5492 tp->t_rxtshift = 0; 5493 5494 /* 5495 * Start the output stream again. Since we are 5496 * not retransmitting data, do not reset the 5497 * retransmit timer or rtt calculation. 
5498 */ 5499 tcp_output(tp); 5500} 5501 5502static int 5503tcp_getstat SYSCTL_HANDLER_ARGS 5504{ 5505#pragma unused(oidp, arg1, arg2) 5506 5507 int error; 5508 5509 proc_t caller = PROC_NULL; 5510 proc_t caller_parent = PROC_NULL; 5511 char command_name[MAXCOMLEN + 1] = ""; 5512 char parent_name[MAXCOMLEN + 1] = ""; 5513 5514 if ((caller = proc_self()) != PROC_NULL) { 5515 /* get process name */ 5516 strlcpy(command_name, caller->p_comm, sizeof(command_name)); 5517 5518 /* get parent process name if possible */ 5519 if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) { 5520 strlcpy(parent_name, caller_parent->p_comm, 5521 sizeof(parent_name)); 5522 proc_rele(caller_parent); 5523 } 5524 5525 if ((escape_str(command_name, strlen(command_name), 5526 sizeof(command_name)) == 0) && 5527 (escape_str(parent_name, strlen(parent_name), 5528 sizeof(parent_name)) == 0)) { 5529 kern_asl_msg(LOG_DEBUG, "messagetracer", 5530 5, 5531 "com.apple.message.domain", 5532 "com.apple.kernel.tcpstat", /* 1 */ 5533 "com.apple.message.signature", 5534 "tcpstat", /* 2 */ 5535 "com.apple.message.signature2", command_name, /* 3 */ 5536 "com.apple.message.signature3", parent_name, /* 4 */ 5537 "com.apple.message.summarize", "YES", /* 5 */ 5538 NULL); 5539 } 5540 } 5541 if (caller != PROC_NULL) 5542 proc_rele(caller); 5543 5544 if (req->oldptr == 0) { 5545 req->oldlen= (size_t)sizeof(struct tcpstat); 5546 } 5547 5548 error = SYSCTL_OUT(req, &tcpstat, MIN(sizeof (tcpstat), req->oldlen)); 5549 5550 return (error); 5551 5552} 5553 5554/* 5555 * Checksum extended TCP header and data. 
5556 */ 5557int 5558tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen) 5559{ 5560 struct ifnet *ifp = m->m_pkthdr.rcvif; 5561 5562 switch (af) { 5563 case AF_INET: { 5564 struct ip *ip = mtod(m, struct ip *); 5565 struct ipovly *ipov = (struct ipovly *)ip; 5566 5567 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM) 5568 return (0); 5569 5570 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) || 5571 (m->m_pkthdr.pkt_flags & PKTF_LOOP)) && 5572 (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) { 5573 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { 5574 th->th_sum = m->m_pkthdr.csum_rx_val; 5575 } else { 5576 uint16_t sum = m->m_pkthdr.csum_rx_val; 5577 uint16_t start = m->m_pkthdr.csum_rx_start; 5578 5579 /* 5580 * Perform 1's complement adjustment of octets 5581 * that got included/excluded in the hardware- 5582 * calculated checksum value. Ignore cases 5583 * where the value includes or excludes the IP 5584 * header span, as the sum for those octets 5585 * would already be 0xffff and thus no-op. 
5586 */ 5587 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) && 5588 start != 0 && (off - start) != off) { 5589#if BYTE_ORDER != BIG_ENDIAN 5590 if (start < off) { 5591 HTONS(ip->ip_len); 5592 HTONS(ip->ip_off); 5593 } 5594#endif 5595 /* callee folds in sum */ 5596 sum = m_adj_sum16(m, start, off, sum); 5597#if BYTE_ORDER != BIG_ENDIAN 5598 if (start < off) { 5599 NTOHS(ip->ip_off); 5600 NTOHS(ip->ip_len); 5601 } 5602#endif 5603 } 5604 5605 /* callee folds in sum */ 5606 th->th_sum = in_pseudo(ip->ip_src.s_addr, 5607 ip->ip_dst.s_addr, 5608 sum + htonl(tlen + IPPROTO_TCP)); 5609 } 5610 th->th_sum ^= 0xffff; 5611 } else { 5612 uint16_t ip_sum; 5613 int len; 5614 char b[9]; 5615 5616 bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1)); 5617 bzero(ipov->ih_x1, sizeof (ipov->ih_x1)); 5618 ip_sum = ipov->ih_len; 5619 ipov->ih_len = (u_short)tlen; 5620#if BYTE_ORDER != BIG_ENDIAN 5621 HTONS(ipov->ih_len); 5622#endif 5623 len = sizeof (struct ip) + tlen; 5624 th->th_sum = in_cksum(m, len); 5625 bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1)); 5626 ipov->ih_len = ip_sum; 5627 5628 tcp_in_cksum_stats(len); 5629 } 5630 break; 5631 } 5632#if INET6 5633 case AF_INET6: { 5634 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); 5635 5636 if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM) 5637 return (0); 5638 5639 if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) || 5640 (m->m_pkthdr.pkt_flags & PKTF_LOOP)) && 5641 (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) { 5642 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { 5643 th->th_sum = m->m_pkthdr.csum_rx_val; 5644 } else { 5645 uint16_t sum = m->m_pkthdr.csum_rx_val; 5646 uint16_t start = m->m_pkthdr.csum_rx_start; 5647 5648 /* 5649 * Perform 1's complement adjustment of octets 5650 * that got included/excluded in the hardware- 5651 * calculated checksum value. 
5652 */ 5653 if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) && 5654 start != off) { 5655 uint16_t s, d; 5656 5657 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) { 5658 s = ip6->ip6_src.s6_addr16[1]; 5659 ip6->ip6_src.s6_addr16[1] = 0 ; 5660 } 5661 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) { 5662 d = ip6->ip6_dst.s6_addr16[1]; 5663 ip6->ip6_dst.s6_addr16[1] = 0; 5664 } 5665 5666 /* callee folds in sum */ 5667 sum = m_adj_sum16(m, start, off, sum); 5668 5669 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) 5670 ip6->ip6_src.s6_addr16[1] = s; 5671 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) 5672 ip6->ip6_dst.s6_addr16[1] = d; 5673 } 5674 5675 th->th_sum = in6_pseudo( 5676 &ip6->ip6_src, &ip6->ip6_dst, 5677 sum + htonl(tlen + IPPROTO_TCP)); 5678 } 5679 th->th_sum ^= 0xffff; 5680 } else { 5681 tcp_in6_cksum_stats(tlen); 5682 th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen); 5683 } 5684 break; 5685 } 5686#endif /* INET6 */ 5687 default: 5688 VERIFY(0); 5689 /* NOTREACHED */ 5690 } 5691 5692 if (th->th_sum != 0) { 5693 tcpstat.tcps_rcvbadsum++; 5694 IF_TCP_STATINC(ifp, badformat); 5695 return (-1); 5696 } 5697 5698 return (0); 5699} 5700 5701SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, 5702 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, 5703 tcp_getstat, "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); 5704 5705static int 5706sysctl_rexmtthresh SYSCTL_HANDLER_ARGS 5707{ 5708#pragma unused(arg1, arg2) 5709 5710 int error, val = tcprexmtthresh; 5711 5712 error = sysctl_handle_int(oidp, &val, 0, req); 5713 if (error || !req->newptr) 5714 return (error); 5715 5716 /* 5717 * Constrain the number of duplicate ACKs 5718 * to consider for TCP fast retransmit 5719 * to either 2 or 3 5720 */ 5721 5722 if (val < 2 || val > 3) 5723 return (EINVAL); 5724 5725 tcprexmtthresh = val; 5726 5727 return (0); 5728} 5729 5730SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, 5731 &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I", "Duplicate ACK Threshold for 
Fast Retransmit"); 5732