1/*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
30 *	The Regents of the University of California.  All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *	This product includes software developed by the University of
43 *	California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections.  This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/kernel.h>
73#include <sys/sysctl.h>
74#include <sys/malloc.h>
75#include <sys/mbuf.h>
76#include <sys/proc.h>		/* for proc0 declaration */
77#include <sys/protosw.h>
78#include <sys/socket.h>
79#include <sys/socketvar.h>
80#include <sys/syslog.h>
81#include <sys/mcache.h>
82#include <sys/kasl.h>
83#include <kern/cpu_number.h>	/* before tcp_seq.h, for tcp_random18() */
84
85#include <machine/endian.h>
86
87#include <net/if.h>
88#include <net/if_types.h>
89#include <net/route.h>
90#include <net/ntstat.h>
91#include <net/dlil.h>
92
93#include <netinet/in.h>
94#include <netinet/in_systm.h>
95#include <netinet/ip.h>
96#include <netinet/ip_icmp.h>    /* for ICMP_BANDLIM		*/
97#include <netinet/in_var.h>
98#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM	*/
99#include <netinet/in_pcb.h>
100#include <netinet/ip_var.h>
101#include <mach/sdt.h>
102#if INET6
103#include <netinet/ip6.h>
104#include <netinet/icmp6.h>
105#include <netinet6/nd6.h>
106#include <netinet6/ip6_var.h>
107#include <netinet6/in6_pcb.h>
108#endif
109#include <netinet/tcp.h>
110#include <netinet/tcp_fsm.h>
111#include <netinet/tcp_seq.h>
112#include <netinet/tcp_timer.h>
113#include <netinet/tcp_var.h>
114#include <netinet/tcp_cc.h>
115#include <dev/random/randomdev.h>
116#include <kern/zalloc.h>
117#if INET6
118#include <netinet6/tcp6_var.h>
119#endif
120#include <netinet/tcpip.h>
121#if TCPDEBUG
122#include <netinet/tcp_debug.h>
123u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
124struct tcphdr tcp_savetcp;
125#endif /* TCPDEBUG */
126
127#if IPSEC
128#include <netinet6/ipsec.h>
129#if INET6
130#include <netinet6/ipsec6.h>
131#endif
132#include <netkey/key.h>
133#endif /*IPSEC*/
134
135#if CONFIG_MACF_NET || CONFIG_MACF_SOCKET
136#include <security/mac_framework.h>
137#endif /* CONFIG_MACF_NET || CONFIG_MACF_SOCKET */
138
139#include <sys/kdebug.h>
140#include <netinet/lro_ext.h>
141#if MPTCP
142#include <netinet/mptcp_var.h>
143#include <netinet/mptcp.h>
144#include <netinet/mptcp_opt.h>
145#endif /* MPTCP */
146
147#define DBG_LAYER_BEG		NETDBG_CODE(DBG_NETTCP, 0)
148#define DBG_LAYER_END		NETDBG_CODE(DBG_NETTCP, 2)
149#define DBG_FNC_TCP_INPUT       NETDBG_CODE(DBG_NETTCP, (3 << 8))
150#define DBG_FNC_TCP_NEWCONN     NETDBG_CODE(DBG_NETTCP, (7 << 8))
151
152tcp_cc	tcp_ccgen;
153
154struct	tcpstat tcpstat;
155
156static int log_in_vain = 0;
157SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED,
158    &log_in_vain, 0, "Log all incoming TCP connections");
159
160static int blackhole = 0;
161SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED,
162	&blackhole, 0, "Do not send RST when dropping refused connections");
163
164int tcp_delack_enabled = 3;
165SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW | CTLFLAG_LOCKED,
166    &tcp_delack_enabled, 0,
167    "Delay ACK to try and piggyback it onto a data packet");
168
169int tcp_lq_overflow = 1;
170SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW | CTLFLAG_LOCKED,
171    &tcp_lq_overflow, 0,
172    "Listen Queue Overflow");
173
174int tcp_recv_bg = 0;
175SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
176    &tcp_recv_bg, 0,
177    "Receive background");
178
179#if TCP_DROP_SYNFIN
180static int drop_synfin = 1;
181SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW | CTLFLAG_LOCKED,
182    &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
183#endif
184
185SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
186    "TCP Segment Reassembly Queue");
187
188static int tcp_reass_overflows = 0;
189SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD | CTLFLAG_LOCKED,
190    &tcp_reass_overflows, 0,
191    "Global number of TCP Segment Reassembly Queue Overflows");
192
193
194__private_extern__ int slowlink_wsize = 8192;
195SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED,
196	&slowlink_wsize, 0, "Maximum advertised window size for slowlink");
197
198int maxseg_unacked = 8;
199SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW | CTLFLAG_LOCKED,
200	&maxseg_unacked, 0, "Maximum number of outstanding segments left unacked");
201
202int	tcp_do_rfc3465 = 1;
203SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
204	&tcp_do_rfc3465, 0, "");
205
206int	tcp_do_rfc3465_lim2 = 1;
207SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW | CTLFLAG_LOCKED,
208	&tcp_do_rfc3465_lim2, 0, "Appropriate bytes counting w/ L=2*SMSS");
209
210int	rtt_samples_per_slot = 20;
211SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_samples_per_slot, CTLFLAG_RW | CTLFLAG_LOCKED,
212	&rtt_samples_per_slot, 0, "Number of RTT samples stored for rtt history");
213
214int	tcp_allowed_iaj = ALLOWED_IAJ;
215SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj, CTLFLAG_RW | CTLFLAG_LOCKED,
216        &tcp_allowed_iaj, 0, "Allowed inter-packet arrival jiter");
217
218int	tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
219SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
220        &tcp_acc_iaj_high_thresh, 0, "Used in calculating maximum accumulated IAJ");
221
222u_int32_t tcp_do_autorcvbuf = 1;
223SYSCTL_INT(_net_inet_tcp, OID_AUTO, doautorcvbuf, CTLFLAG_RW | CTLFLAG_LOCKED,
224        &tcp_do_autorcvbuf, 0, "Enable automatic socket buffer tuning");
225
226u_int32_t tcp_autorcvbuf_inc_shift = 3;
227SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufincshift, CTLFLAG_RW | CTLFLAG_LOCKED,
228        &tcp_autorcvbuf_inc_shift, 0, "Shift for increment in receive socket buffer size");
229
230u_int32_t tcp_autorcvbuf_max = 512 * 1024;
231SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmax, CTLFLAG_RW | CTLFLAG_LOCKED,
232        &tcp_autorcvbuf_max, 0, "Maximum receive socket buffer size");
233
234int sw_lro = 0;
235SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_LOCKED,
236        &sw_lro, 0, "Used to coalesce TCP packets");
237
238int lrodebug = 0;
239SYSCTL_INT(_net_inet_tcp, OID_AUTO, lrodbg, CTLFLAG_RW | CTLFLAG_LOCKED,
240        &lrodebug, 0, "Used to debug SW LRO");
241
242int lro_start = 4;
243SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_startcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
244	&lro_start, 0, "Segments for starting LRO computed as power of 2");
245
246extern int tcp_do_autosendbuf;
247
248int limited_txmt = 1;
249SYSCTL_INT(_net_inet_tcp, OID_AUTO, limited_transmit, CTLFLAG_RW | CTLFLAG_LOCKED,
250	&limited_txmt, 0, "Enable limited transmit");
251
252int early_rexmt = 1;
253SYSCTL_INT(_net_inet_tcp, OID_AUTO, early_rexmt, CTLFLAG_RW | CTLFLAG_LOCKED,
254	&early_rexmt, 0, "Enable Early Retransmit");
255
256int sack_ackadv = 1;
257SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_ackadv, CTLFLAG_RW | CTLFLAG_LOCKED,
258	&sack_ackadv, 0, "Use SACK with cumulative ack advancement as a dupack");
259
260#if CONFIG_IFEF_NOWINDOWSCALE
261int tcp_obey_ifef_nowindowscale = 0;
262SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW | CTLFLAG_LOCKED,
263	&tcp_obey_ifef_nowindowscale, 0, "");
264#endif
265
266extern int tcp_TCPTV_MIN;
267extern int tcp_acc_iaj_high;
268extern int tcp_acc_iaj_react_limit;
269extern struct zone *tcp_reass_zone;
270
271int tcprexmtthresh = 3;
272
273u_int32_t tcp_now;
274struct timeval tcp_uptime;	/* uptime when tcp_now was last updated */
275lck_spin_t *tcp_uptime_lock;	/* Used to sychronize updates to tcp_now */
276
277struct inpcbhead tcb;
278#define	tcb6	tcb  /* for KAME src sync over BSD*'s */
279struct inpcbinfo tcbinfo;
280
281static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
282    struct tcpopt *, unsigned int);
283static void	 tcp_pulloutofband(struct socket *,
284	    struct tcphdr *, struct mbuf *, int);
285static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *,
286    struct ifnet *);
287static void	tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
288static inline unsigned int tcp_maxmtu(struct rtentry *);
289static inline int tcp_stretch_ack_enable(struct tcpcb *tp);
290static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);
291
292#if TRAFFIC_MGT
293static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen,
294	int reset_size);
295void compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor);
296static void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
297#endif /* TRAFFIC_MGT */
298
299#if INET6
300static inline unsigned int tcp_maxmtu6(struct rtentry *);
301#endif
302
303static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb,
304	struct tcpopt *to, u_int32_t tlen);
305
306void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
307static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
308static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
309static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb,
310	u_int32_t newsize, u_int32_t idealsize);
311static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th);
312static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to,
313	struct tcphdr *th);
314static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th);
315static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th,
316	struct tcpopt *to);
317/*
318 * Constants used for resizing receive socket buffer
319 * when timestamps are not supported
320 */
321#define TCPTV_RCVNOTS_QUANTUM 100
322#define TCP_RCVNOTS_BYTELEVEL 204800
323
324/*
325 * Constants used for limiting early retransmits
326 * to 10 per minute.
327 */
328#define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
329#define TCP_EARLY_REXMT_LIMIT 10
330
331extern  void    ipfwsyslog( int level, const char *format,...);
332extern int fw_verbose;
333
334#if IPFIREWALL
335extern void ipfw_stealth_stats_incr_tcp(void);
336
337#define log_in_vain_log( a ) {            \
338        if ( (log_in_vain == 3 ) && (fw_verbose == 2)) {        /* Apple logging, log to ipfw.log */ \
339                ipfwsyslog a ;  \
340        } else if ( (log_in_vain == 4 ) && (fw_verbose == 2)) {   \
341                ipfw_stealth_stats_incr_tcp();                    \
342        }                       \
343        else log a ;            \
344}
345#else
346#define log_in_vain_log( a ) { log a; }
347#endif
348
349int tcp_rcvunackwin = TCPTV_UNACKWIN;
350int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
351int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT;
352SYSCTL_INT(_net_inet_tcp, OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
353	&tcp_rcvsspktcnt, 0, "packets to be seen before receiver stretches acks");
354
355#define DELAY_ACK(tp, th) \
356	(CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
357
358static int tcp_dropdropablreq(struct socket *head);
359static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);
360
361static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
362void tcp_set_background_cc(struct socket *so);
363void tcp_set_foreground_cc(struct socket *so);
364static void tcp_set_new_cc(struct socket *so, uint16_t cc_index);
365static void tcp_bwmeas_check(struct tcpcb *tp);
366
367#if TRAFFIC_MGT
368void
369reset_acc_iaj(struct tcpcb *tp)
370{
371	tp->acc_iaj = 0;
372	tp->iaj_rwintop = 0;
373	CLEAR_IAJ_STATE(tp);
374}
375
376static inline void
377update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size)
378{
379	if (rst_size > 0)
380		tp->iaj_size = 0;
381	if (tp->iaj_size == 0 || size >= tp->iaj_size) {
382		tp->iaj_size = size;
383		tp->iaj_rcv_ts = tcp_now;
384		tp->iaj_small_pkt = 0;
385	}
386}
387
388/* For every 32 bit unsigned integer(v), this function will find the
389 * largest integer n such that (n*n <= v). This takes at most 16 iterations
390 * irrespective of the value of v and does not involve multiplications.
391 */
392static inline int
393isqrt(unsigned int val) {
394	unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
395	unsigned int temp, g=0, b=0x8000, bshft=15;
396	if ( val <= 100) {
397		for (g = 0; g <= 10; ++g) {
398			if (sqrt_cache[g] > val) {
399				g--;
400				break;
401			} else if (sqrt_cache[g] == val) {
402				break;
403			}
404		}
405	} else {
406		do {
407			temp = (((g << 1) + b) << (bshft--));
408			if (val >= temp) {
409				g += b;
410				val -= temp;
411			}
412			b >>= 1;
413		} while ( b > 0 && val > 0);
414	}
415	return(g);
416}
417
418/*
419* With LRO, roughly estimate the inter arrival time between
420* each sub coalesced packet as an average. Count the delay
421* cur_iaj to be the delay between the last packet received
422* and the first packet of the LRO stream. Due to round off errors
423* cur_iaj may be the same as lro_delay_factor. Averaging has
424* round off errors too. lro_delay_factor may be close to 0
425* in steady state leading to lower values fed to compute_iaj_meat.
426*/
427void
428compute_iaj(struct tcpcb *tp, int nlropkts, int lro_delay_factor)
429{
430	uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts;
431	uint32_t timediff = 0;
432
433	if (cur_iaj >= lro_delay_factor) {
434		cur_iaj = cur_iaj - lro_delay_factor;
435	}
436
437	compute_iaj_meat(tp, cur_iaj);
438
439	if (nlropkts <= 1)
440		return;
441
442	nlropkts--;
443
444	timediff = lro_delay_factor/nlropkts;
445
446	while (nlropkts > 0)
447	{
448		compute_iaj_meat(tp, timediff);
449		nlropkts--;
450	}
451}
452
453static
454void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)
455{
456	/* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds,
457	 * throttle the receive window to a minimum of MIN_IAJ_WIN packets
458	 */
459#define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
460#define IAJ_DIV_SHIFT 4
461#define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1))
462
463	uint32_t allowed_iaj, acc_iaj = 0;
464
465	uint32_t mean, temp;
466	int32_t cur_iaj_dev;
467
468	cur_iaj_dev = (cur_iaj - tp->avg_iaj);
469
470	/* Allow a jitter of "allowed_iaj" milliseconds. Some connections
471	 * may have a constant jitter more than that. We detect this by
472	 * using standard deviation.
473	 */
474	allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
475	if (allowed_iaj < tcp_allowed_iaj)
476		allowed_iaj = tcp_allowed_iaj;
477
478	/* Initially when the connection starts, the senders congestion
479	 * window is small. During this period we avoid throttling a
480	 * connection because we do not have a good starting point for
481	 * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over
482	 * the first few packets.
483	 */
484	if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
485		if ( cur_iaj <= allowed_iaj ) {
486			if (tp->acc_iaj >= 2)
487				acc_iaj = tp->acc_iaj - 2;
488			else
489				acc_iaj = 0;
490
491		} else {
492			acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
493		}
494
495		if (acc_iaj > MAX_ACC_IAJ)
496			acc_iaj = MAX_ACC_IAJ;
497		tp->acc_iaj = acc_iaj;
498	}
499
500	/* Compute weighted average where the history has a weight of
501	 * 15 out of 16 and the current value has a weight of 1 out of 16.
502	 * This will make the short-term measurements have more weight.
503	 *
504	 * The addition of 8 will help to round-up the value
505	 * instead of round-down
506	 */
507	tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj)
508		+ cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
509
510	/* Compute Root-mean-square of deviation where mean is a weighted
511	 * average as described above.
512	 */
513	temp = tp->std_dev_iaj * tp->std_dev_iaj;
514	mean = (((temp << IAJ_DIV_SHIFT) - temp)
515		+ (cur_iaj_dev * cur_iaj_dev)
516		+ IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
517
518	tp->std_dev_iaj = isqrt(mean);
519
520	DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj,
521		uint32_t, allowed_iaj);
522
523	return;
524}
525#endif /* TRAFFIC_MGT */
526
527/* Check if enough amount of data has been acknowledged since
528 * bw measurement was started
529 */
530static void
531tcp_bwmeas_check(struct tcpcb *tp)
532{
533	int32_t bw_meas_bytes;
534	uint32_t bw, bytes, elapsed_time;
535	bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start;
536	if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) != 0 &&
537	    bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) {
538		bytes = bw_meas_bytes;
539		elapsed_time = tcp_now - tp->t_bwmeas->bw_ts;
540		if (elapsed_time > 0) {
541			bw = bytes / elapsed_time;
542			if ( bw > 0) {
543				if (tp->t_bwmeas->bw_sndbw > 0) {
544					tp->t_bwmeas->bw_sndbw =
545					    (((tp->t_bwmeas->bw_sndbw << 3) - tp->t_bwmeas->bw_sndbw) + bw) >> 3;
546				} else {
547					tp->t_bwmeas->bw_sndbw = bw;
548				}
549			}
550		}
551		tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
552	}
553}
554
555static int
556tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
557    struct ifnet *ifp)
558{
559	struct tseg_qent *q;
560	struct tseg_qent *p = NULL;
561	struct tseg_qent *nq;
562	struct tseg_qent *te = NULL;
563	struct inpcb *inp = tp->t_inpcb;
564	struct socket *so = inp->inp_socket;
565	int flags = 0;
566	int dowakeup = 0;
567	struct mbuf *oodata = NULL;
568	int copy_oodata = 0;
569	u_int16_t qlimit;
570	boolean_t cell = IFNET_IS_CELLULAR(ifp);
571	boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
572	boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
573
574	/*
575	 * Call with th==0 after become established to
576	 * force pre-ESTABLISHED data up to user socket.
577	 */
578	if (th == NULL)
579		goto present;
580
581	/*
582	 * If the reassembly queue already has entries or if we are going
583	 * to add a new one, then the connection has reached a loss state.
584	 * Reset the stretch-ack algorithm at this point.
585	 */
586	tcp_reset_stretch_ack(tp);
587
588#if TRAFFIC_MGT
589	if (tp->acc_iaj > 0)
590		reset_acc_iaj(tp);
591#endif /* TRAFFIC_MGT */
592
593	/*
594	 * Limit the number of segments in the reassembly queue to prevent
595	 * holding on to too many segments (and thus running out of mbufs).
596	 * Make sure to let the missing segment through which caused this
597	 * queue.  Always keep one global queue entry spare to be able to
598	 * process the missing segment.
599	 */
600	qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10),
601	    tcp_autorcvbuf_max >> 10);
602	if (th->th_seq != tp->rcv_nxt &&
603	    (tp->t_reassqlen + 1) >= qlimit) {
604		tcp_reass_overflows++;
605		tcpstat.tcps_rcvmemdrop++;
606		m_freem(m);
607		*tlenp = 0;
608		return (0);
609	}
610
611	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
612	te = (struct tseg_qent *) zalloc(tcp_reass_zone);
613	if (te == NULL) {
614		tcpstat.tcps_rcvmemdrop++;
615		m_freem(m);
616		return (0);
617	}
618	tp->t_reassqlen++;
619
620	/*
621	 * Find a segment which begins after this one does.
622	 */
623	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
624		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
625			break;
626		p = q;
627	}
628
629	/*
630	 * If there is a preceding segment, it may provide some of
631	 * our data already.  If so, drop the data from the incoming
632	 * segment.  If it provides all of our data, drop us.
633	 */
634	if (p != NULL) {
635		register int i;
636		/* conversion to int (in i) handles seq wraparound */
637		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
638		if (i > 0) {
639			if (i >= *tlenp) {
640				tcpstat.tcps_rcvduppack++;
641				tcpstat.tcps_rcvdupbyte += *tlenp;
642				if (nstat_collect) {
643					nstat_route_rx(inp->inp_route.ro_rt,
644					    1, *tlenp,
645					    NSTAT_RX_FLAG_DUPLICATE);
646					INP_ADD_STAT(inp, cell, wifi, wired,
647					    rxpackets, 1);
648					INP_ADD_STAT(inp, cell, wifi, wired,
649					    rxbytes, *tlenp);
650					tp->t_stat.rxduplicatebytes += *tlenp;
651				}
652				m_freem(m);
653				zfree(tcp_reass_zone, te);
654				te = NULL;
655				tp->t_reassqlen--;
656				/*
657				 * Try to present any queued data
658				 * at the left window edge to the user.
659				 * This is needed after the 3-WHS
660				 * completes.
661				 */
662				goto present;
663			}
664			m_adj(m, i);
665			*tlenp -= i;
666			th->th_seq += i;
667		}
668	}
669	tcpstat.tcps_rcvoopack++;
670	tcpstat.tcps_rcvoobyte += *tlenp;
671	if (nstat_collect) {
672		nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
673		    NSTAT_RX_FLAG_OUT_OF_ORDER);
674		INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
675		INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp);
676		tp->t_stat.rxoutoforderbytes += *tlenp;
677	}
678
679	/*
680	 * While we overlap succeeding segments trim them or,
681	 * if they are completely covered, dequeue them.
682	 */
683	while (q) {
684		register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
685		if (i <= 0)
686			break;
687		if (i < q->tqe_len) {
688			q->tqe_th->th_seq += i;
689			q->tqe_len -= i;
690			m_adj(q->tqe_m, i);
691			break;
692		}
693
694		nq = LIST_NEXT(q, tqe_q);
695		LIST_REMOVE(q, tqe_q);
696		m_freem(q->tqe_m);
697		zfree(tcp_reass_zone, q);
698		tp->t_reassqlen--;
699		q = nq;
700	}
701
702	/* Insert the new segment queue entry into place. */
703	te->tqe_m = m;
704	te->tqe_th = th;
705	te->tqe_len = *tlenp;
706
707	if (p == NULL) {
708		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
709	} else {
710		LIST_INSERT_AFTER(p, te, tqe_q);
711	}
712
713	/*
714	 * New out-of-order data exists, and is pointed to by
715	 * queue entry te. Set copy_oodata to 1 so out-of-order data
716	 * can be copied off to sockbuf after in-order data
717	 * is copied off.
718	 */
719	if (!(so->so_state & SS_CANTRCVMORE))
720		copy_oodata = 1;
721
722present:
723	/*
724	 * Present data to user, advancing rcv_nxt through
725	 * completed sequence space.
726	 */
727	if (!TCPS_HAVEESTABLISHED(tp->t_state))
728		return (0);
729	q = LIST_FIRST(&tp->t_segq);
730	if (!q || q->tqe_th->th_seq != tp->rcv_nxt) {
731		/* Stop using LRO once out of order packets arrive */
732		if (tp->t_flagsext & TF_LRO_OFFLOADED) {
733			tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
734				th->th_dport, th->th_sport);
735			tp->t_flagsext &= ~TF_LRO_OFFLOADED;
736		}
737
738		/*
739		 * continue processing if out-of-order data
740		 * can be delivered
741		 */
742		if (q && (so->so_flags & SOF_ENABLE_MSGS))
743			goto msg_unordered_delivery;
744
745		return (0);
746	}
747
748	/* lost packet was recovered, so ooo data can be returned */
749	tcpstat.tcps_recovered_pkts++;
750
751	do {
752		tp->rcv_nxt += q->tqe_len;
753		flags = q->tqe_th->th_flags & TH_FIN;
754		nq = LIST_NEXT(q, tqe_q);
755		LIST_REMOVE(q, tqe_q);
756		if (so->so_state & SS_CANTRCVMORE) {
757			m_freem(q->tqe_m);
758		} else {
759			so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
760			if (so->so_flags & SOF_ENABLE_MSGS) {
761				/*
762				 * Append the inorder data as a message to the
763				 * receive socket buffer. Also check to see if
764				 * the data we are about to deliver is the same
765				 * data that we wanted to pass up to the user
766				 * out of order. If so, reset copy_oodata --
767				 * the received data filled a gap, and
768				 * is now in order!
769				 */
770				if (q == te)
771					copy_oodata = 0;
772			}
773			if (sbappendstream_rcvdemux(so, q->tqe_m,
774			    q->tqe_th->th_seq - (tp->irs + 1), 0))
775				dowakeup = 1;
776			if (tp->t_flagsext & TF_LRO_OFFLOADED) {
777				tcp_update_lro_seq(tp->rcv_nxt,
778				 inp->inp_laddr, inp->inp_faddr,
779				 th->th_dport, th->th_sport);
780			}
781		}
782		zfree(tcp_reass_zone, q);
783		tp->t_reassqlen--;
784		q = nq;
785	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
786
787#if INET6
788	if ((inp->inp_vflag & INP_IPV6) != 0) {
789
790		KERNEL_DEBUG(DBG_LAYER_BEG,
791		     ((inp->inp_fport << 16) | inp->inp_lport),
792		     (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
793		      (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
794		     0,0,0);
795	}
796	else
797#endif
798	{
799		KERNEL_DEBUG(DBG_LAYER_BEG,
800		     ((inp->inp_fport << 16) | inp->inp_lport),
801		     (((inp->inp_laddr.s_addr & 0xffff) << 16) |
802		      (inp->inp_faddr.s_addr & 0xffff)),
803		     0,0,0);
804	}
805
806msg_unordered_delivery:
807	/* Deliver out-of-order data as a message */
808	if (te && (so->so_flags & SOF_ENABLE_MSGS) && copy_oodata && te->tqe_len) {
809		/*
810		 * make a copy of the mbuf to be delivered up to
811		 * the user, and add it to the sockbuf
812		 */
813		oodata = m_copym(te->tqe_m, 0, M_COPYALL, M_DONTWAIT);
814		if (oodata != NULL) {
815			if (sbappendmsgstream_rcv(&so->so_rcv, oodata,
816				te->tqe_th->th_seq - (tp->irs + 1), 1)) {
817				dowakeup = 1;
818				tcpstat.tcps_msg_unopkts++;
819			} else {
820				tcpstat.tcps_msg_unoappendfail++;
821			}
822		}
823	}
824
825	if (dowakeup)
826		sorwakeup(so); /* done with socket lock held */
827	return (flags);
828}
829
830/*
831 * Reduce congestion window.
832 */
833static void
834tcp_reduce_congestion_window(
835	struct tcpcb	*tp)
836{
837	/*
838	 * If the current tcp cc module has
839	 * defined a hook for tasks to run
840	 * before entering FR, call it
841	 */
842	if (CC_ALGO(tp)->pre_fr != NULL)
843		CC_ALGO(tp)->pre_fr(tp);
844	ENTER_FASTRECOVERY(tp);
845	tp->snd_recover = tp->snd_max;
846	tp->t_timer[TCPT_REXMT] = 0;
847	tp->t_timer[TCPT_PTO] = 0;
848	tp->t_rtttime = 0;
849	tp->snd_cwnd = tp->snd_ssthresh +
850		 tp->t_maxseg * tcprexmtthresh;
851}
852
853/*
854 * The application wants to get an event if there
855 * is a stall during read. Set the initial keepalive
856 * timeout to be equal to twice RTO.
857 */
858static inline void
859tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)
860{
861	if (tp->t_adaptive_rtimo > 0 && tlen > 0 &&
862		tp->t_state == TCPS_ESTABLISHED) {
863		tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
864			(TCP_REXMTVAL(tp) << 1));
865		tp->t_flagsext |= TF_DETECT_READSTALL;
866		tp->t_rtimo_probes = 0;
867	}
868}
869
870inline void
871tcp_keepalive_reset(struct tcpcb *tp)
872{
873	tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
874		TCP_CONN_KEEPIDLE(tp));
875	tp->t_flagsext &= ~(TF_DETECT_READSTALL);
876	tp->t_rtimo_probes = 0;
877}
878
879/*
880 * TCP input routine, follows pages 65-76 of the
881 * protocol specification dated September, 1981 very closely.
882 */
883#if INET6
884int
885tcp6_input(struct mbuf **mp, int *offp, int proto)
886{
887#pragma unused(proto)
888	register struct mbuf *m = *mp;
889	uint32_t ia6_flags;
890	struct ifnet *ifp = m->m_pkthdr.rcvif;
891
892	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
893
894	/* Expect 32-bit aligned data pointer on strict-align platforms */
895	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
896
897	/*
898	 * draft-itojun-ipv6-tcp-to-anycast
899	 * better place to put this in?
900	 */
901	if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
902		if (ia6_flags & IN6_IFF_ANYCAST) {
903			struct ip6_hdr *ip6;
904
905			ip6 = mtod(m, struct ip6_hdr *);
906			icmp6_error(m, ICMP6_DST_UNREACH,
907			    ICMP6_DST_UNREACH_ADDR,
908			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
909
910			IF_TCP_STATINC(ifp, icmp6unreach);
911
912			return (IPPROTO_DONE);
913		}
914	}
915
916	tcp_input(m, *offp);
917	return (IPPROTO_DONE);
918}
919#endif
920
921/* Depending on the usage of mbuf space in the system, this function
922 * will return true or false. This is used to determine if a socket
923 * buffer can take more memory from the system for auto-tuning or not.
924 */
925u_int8_t
926tcp_cansbgrow(struct sockbuf *sb)
927{
928	/* Calculate the host level space limit in terms of MSIZE buffers.
929	 * We can use a maximum of half of the available mbuf space for
930	 * socket buffers.
931	 */
932	u_int32_t mblim = ((nmbclusters >> 1) << (MCLSHIFT - MSIZESHIFT));
933
934	/* Calculate per sb limit in terms of bytes. We optimize this limit
935	 * for upto 16 socket buffers.
936	 */
937
938	u_int32_t sbspacelim = ((nmbclusters >> 4) << MCLSHIFT);
939
940	if ((total_sbmb_cnt < mblim) &&
941		(sb->sb_hiwat < sbspacelim)) {
942		return(1);
943	} else {
944		OSIncrementAtomic64(&sbmb_limreached);
945	}
946	return(0);
947}
948
949static void
950tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
951	u_int32_t newsize, u_int32_t idealsize)
952{
953
954	/* newsize should not exceed max */
955	newsize = min(newsize, tcp_autorcvbuf_max);
956
957	/* The receive window scale negotiated at the
958	 * beginning of the connection will also set a
959	 * limit on the socket buffer size
960	 */
961	newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
962
963	/* Set new socket buffer size */
964	if (newsize > sbrcv->sb_hiwat &&
965		(sbreserve(sbrcv, newsize) == 1)) {
966		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
967			(idealsize != 0) ? idealsize : newsize),
968			tcp_autorcvbuf_max);
969
970		/* Again check the limit set by the advertised
971		 * window scale
972		 */
973		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
974			TCP_MAXWIN << tp->rcv_scale);
975	}
976}
977
978/*
979 * This function is used to grow  a receive socket buffer. It
980 * will take into account system-level memory usage and the
981 * bandwidth available on the link to make a decision.
982 */
983static void
984tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
985	struct tcpopt *to, u_int32_t pktlen) {
986
987	/*
988	 * Do not grow the receive socket buffer if
989	 * - auto resizing is disabled, globally or on this socket
990	 * - the high water mark already reached the maximum
991	 * - the stream is in background and receive side is being
992	 * throttled
993	 * - if there are segments in reassembly queue indicating loss,
994	 * do not need to increase recv window during recovery as more
995	 * data is not going to be sent. A duplicate ack sent during
996	 * recovery should not change the receive window
997	 */
998	if (tcp_do_autorcvbuf == 0 ||
999		(sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
1000		tcp_cansbgrow(sbrcv) == 0 ||
1001		sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
1002		(tp->t_flagsext & TF_RECV_THROTTLE) ||
1003		!LIST_EMPTY(&tp->t_segq)) {
1004		/* Can not resize the socket buffer, just return */
1005		goto out;
1006	}
1007
1008	if (TSTMP_GT(tcp_now,
1009		tp->rfbuf_ts + TCPTV_RCVBUFIDLE)) {
1010		/* If there has been an idle period in the
1011		 * connection, just restart the measurement
1012		 */
1013		goto out;
1014	}
1015
1016	if (!TSTMP_SUPPORTED(tp)) {
1017		/*
1018		 * Timestamp option is not supported on this connection.
1019		 * If the connection reached a state to indicate that
1020		 * the receive socket buffer needs to grow, increase
1021		 * the high water mark.
1022		 */
1023		if (TSTMP_GEQ(tcp_now,
1024			tp->rfbuf_ts + TCPTV_RCVNOTS_QUANTUM)) {
1025			if (tp->rfbuf_cnt >= TCP_RCVNOTS_BYTELEVEL) {
1026				tcp_sbrcv_reserve(tp, sbrcv,
1027					tcp_autorcvbuf_max, 0);
1028			}
1029			goto out;
1030		} else {
1031			tp->rfbuf_cnt += pktlen;
1032			return;
1033		}
1034	} else if (to->to_tsecr != 0) {
1035		/*
1036		 * If the timestamp shows that one RTT has
1037		 * completed, we can stop counting the
1038		 * bytes. Here we consider increasing
1039		 * the socket buffer if the bandwidth measured in
1040		 * last rtt, is more than half of sb_hiwat, this will
1041		 * help to scale the buffer according to the bandwidth
1042		 * on the link.
1043		 */
1044		if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
1045			if (tp->rfbuf_cnt > (sbrcv->sb_hiwat -
1046				(sbrcv->sb_hiwat >> 1))) {
1047				int32_t rcvbuf_inc, min_incr;
1048				/*
1049				 * Increment the receive window by a
1050				 * multiple of maximum sized segments.
1051				 * This will prevent a connection from
1052				 * sending smaller segments on wire if it
1053				 * is limited by the receive window.
1054				 *
1055				 * Set the ideal size based on current
1056				 * bandwidth measurements. We set the
1057				 * ideal size on receive socket buffer to
1058				 * be twice the bandwidth delay product.
1059				 */
1060				rcvbuf_inc = (tp->rfbuf_cnt << 1)
1061				    - sbrcv->sb_hiwat;
1062
1063				/*
1064				 * Make the increment equal to 8 segments
1065				 * at least
1066				 */
1067				min_incr = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1068				if (rcvbuf_inc < min_incr)
1069				    rcvbuf_inc = min_incr;
1070
1071				rcvbuf_inc =
1072				    (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
1073				tcp_sbrcv_reserve(tp, sbrcv,
1074					sbrcv->sb_hiwat + rcvbuf_inc,
1075					(tp->rfbuf_cnt * 2));
1076			}
1077			goto out;
1078		} else {
1079			tp->rfbuf_cnt += pktlen;
1080			return;
1081		}
1082	}
1083out:
1084	/* Restart the measurement */
1085	tp->rfbuf_ts = 0;
1086	tp->rfbuf_cnt = 0;
1087	return;
1088}
1089
1090/* This function will trim the excess space added to the socket buffer
1091 * to help a slow-reading app. The ideal-size of a socket buffer depends
1092 * on the link bandwidth or it is set by an application and we aim to
1093 * reach that size.
1094 */
1095void
1096tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) {
1097	if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
1098		sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
1099		int32_t trim;
1100		/* compute the difference between ideal and current sizes */
1101		u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
1102
1103		/* Compute the maximum advertised window for
1104		 * this connection.
1105		 */
1106		u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
1107
1108		/* How much can we trim the receive socket buffer?
1109		 * 1. it can not be trimmed beyond the max rcv win advertised
1110		 * 2. if possible, leave 1/16 of bandwidth*delay to
1111		 * avoid closing the win completely
1112		 */
1113		u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
1114
1115		/* Sometimes leave can be zero, in that case leave at least
1116 		 * a few segments worth of space.
1117		 */
1118		if (leave == 0)
1119			leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1120
1121		trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
1122		trim = imin(trim, (int32_t)diff);
1123
1124		if (trim > 0)
1125			sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
1126	}
1127}
1128
1129/* We may need to trim the send socket buffer size for two reasons:
1130 * 1. if the rtt seen on the connection is climbing up, we do not
1131 * want to fill the buffers any more.
1132 * 2. if the congestion win on the socket backed off, there is no need
1133 * to hold more mbufs for that connection than what the cwnd will allow.
1134 */
1135void
1136tcp_sbsnd_trim(struct sockbuf *sbsnd) {
1137	if (tcp_do_autosendbuf == 1 &&
1138		((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
1139			(SB_AUTOSIZE | SB_TRIM)) &&
1140		(sbsnd->sb_idealsize > 0) &&
1141		(sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
1142		u_int32_t trim = 0;
1143		if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
1144			trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
1145		} else {
1146			trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
1147		}
1148		sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
1149	}
1150	if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize)
1151		sbsnd->sb_flags &= ~(SB_TRIM);
1152}
1153
1154/*
1155 * If timestamp option was not negotiated on this connection
1156 * and this connection is on the receiving side of a stream
1157 * then we can not measure the delay on the link accurately.
1158 * Instead of enabling automatic receive socket buffer
1159 * resizing, just give more space to the receive socket buffer.
1160 */
1161static inline void
1162tcp_sbrcv_tstmp_check(struct tcpcb *tp) {
1163	struct socket *so = tp->t_inpcb->inp_socket;
1164	u_int32_t newsize = 2 * tcp_recvspace;
1165	struct sockbuf *sbrcv = &so->so_rcv;
1166
1167	if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
1168		(TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
1169		(sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
1170		tcp_sbrcv_reserve(tp, sbrcv, newsize, 0);
1171	}
1172}
1173
1174/* A receiver will evaluate the flow of packets on a connection
1175 * to see if it can reduce ack traffic. The receiver will start
1176 * stretching acks if all of the following conditions are met:
1177 * 1. tcp_delack_enabled is set to 3
1178 * 2. If the bytes received in the last 100ms is greater than a threshold
1179 *      defined by maxseg_unacked
1180 * 3. If the connection has not been idle for tcp_maxrcvidle period.
1181 * 4. If the connection has seen enough packets to let the slow-start
1182 *      finish after connection establishment or after some packet loss.
1183 *
1184 * The receiver will stop stretching acks if there is congestion/reordering
1185 * as indicated by packets on reassembly queue or an ECN. If the delayed-ack
1186 * timer fires while stretching acks, it means that the packet flow has gone
1187 * below the threshold defined by maxseg_unacked and the receiver will stop
1188 * stretching acks. The receiver gets no indication when slow-start is completed
1189 * or when the connection reaches an idle state. That is why we use
1190 * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
1191 * state.
1192 */
1193static inline int
1194tcp_stretch_ack_enable(struct tcpcb *tp)
1195{
1196 	if (!(tp->t_flagsext & (TF_NOSTRETCHACK|TF_DISABLE_STRETCHACK)) &&
1197		tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
1198		TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) &&
1199		(!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
1200		(tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
1201		return(1);
1202	}
1203
1204	return(0);
1205}
1206
1207/*
1208 * Reset the state related to stretch-ack algorithm. This will make
1209 * the receiver generate an ack every other packet. The receiver
1210 * will start re-evaluating the rate at which packets come to decide
1211 * if it can benefit by lowering the ack traffic.
1212 */
1213void
1214tcp_reset_stretch_ack(struct tcpcb *tp)
1215{
1216	tp->t_flags &= ~(TF_STRETCHACK);
1217	tp->rcv_by_unackwin = 0;
1218	tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
1219
1220	/*
1221	 * When there is packet loss or packet re-ordering or CWR due to
1222	 * ECN, the sender's congestion window is reduced. In these states,
1223	 * generate an ack for every other packet for some time to allow
1224	 * the sender's congestion window to grow.
1225	 */
1226	tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1227	tp->rcv_waitforss = 0;
1228}
1229
1230/*
1231 * The last packet was a retransmission, check if this ack
1232 * indicates that the retransmission was spurious.
1233 *
1234 * If the connection supports timestamps, we could use it to
1235 * detect if the last retransmit was not needed. Otherwise,
1236 * we check if the ACK arrived within RTT/2 window, then it
1237 * was a mistake to do the retransmit in the first place.
1238 *
1239 * This function will return 1 if it is a spurious retransmit,
1240 * 0 otherwise.
1241 */
1242int
1243tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
1244	struct tcpopt *to, u_int32_t rxtime)
1245{
1246	int32_t tdiff, bad_rexmt_win;
1247	bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
1248
1249	/* If the ack has ECN CE bit, then cwnd has to be adjusted */
1250	if ((tp->ecn_flags & (TE_ECN_ON)) == (TE_ECN_ON)
1251	    && (th->th_flags & TH_ECE))
1252		return (0);
1253	if (TSTMP_SUPPORTED(tp)) {
1254		if (rxtime > 0 && (to->to_flags & TOF_TS)
1255		    && to->to_tsecr != 0
1256		    && TSTMP_LT(to->to_tsecr, rxtime))
1257		    return (1);
1258	} else {
1259		if ((tp->t_rxtshift == 1
1260		    || (tp->t_flagsext & TF_SENT_TLPROBE))
1261		    && rxtime > 0) {
1262			tdiff = (int32_t)(tcp_now - rxtime);
1263			if (tdiff < bad_rexmt_win)
1264				return(1);
1265		}
1266	}
1267	return(0);
1268}
1269
1270
1271/*
1272 * Restore congestion window state if a spurious timeout
1273 * was detected.
1274 */
1275static void
1276tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
1277{
1278	if (TSTMP_SUPPORTED(tp)) {
1279		u_int32_t fsize, acked;
1280		fsize = tp->snd_max - th->th_ack;
1281		acked = BYTES_ACKED(th, tp);
1282
1283		/*
1284		 * Implement bad retransmit recovery as
1285		 * described in RFC 4015.
1286		 */
1287		tp->snd_ssthresh = tp->snd_ssthresh_prev;
1288
1289		/* Initialize cwnd to the initial window */
1290		if (CC_ALGO(tp)->cwnd_init != NULL)
1291			CC_ALGO(tp)->cwnd_init(tp);
1292
1293		tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
1294
1295	} else {
1296		tp->snd_cwnd = tp->snd_cwnd_prev;
1297		tp->snd_ssthresh = tp->snd_ssthresh_prev;
1298		if (tp->t_flags & TF_WASFRECOVERY)
1299			ENTER_FASTRECOVERY(tp);
1300	}
1301	tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES);
1302	tp->snd_recover = tp->snd_recover_prev;
1303	tp->snd_nxt = tp->snd_max;
1304	tp->t_rxtshift = 0;
1305	tp->t_rxtstart = 0;
1306
1307	/* Fix send socket buffer to reflect the change in cwnd */
1308	tcp_bad_rexmt_fix_sndbuf(tp);
1309
1310	/*
1311	 * This RTT might reflect the extra delay induced
1312	 * by the network. Skip using this sample for RTO
1313	 * calculation and mark the connection so we can
1314	 * recompute RTT when the next eligible sample is
1315	 * found.
1316	 */
1317	tp->t_flagsext |= TF_RECOMPUTE_RTT;
1318	tp->t_badrexmt_time = tcp_now;
1319	tp->t_rtttime = 0;
1320}
1321
1322/*
1323 * If the previous packet was sent in retransmission timer, and it was
1324 * not needed, then restore the congestion window to the state before that
1325 * transmission.
1326 *
1327 * If the last packet was sent in tail loss probe timeout, check if that
1328 * recovered the last packet. If so, that will indicate a real loss and
1329 * the congestion window needs to be lowered.
1330 */
1331static void
1332tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
1333{
1334	if (tp->t_rxtshift > 0 &&
1335	    tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
1336		++tcpstat.tcps_sndrexmitbad;
1337		tcp_bad_rexmt_restore_state(tp, th);
1338		tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
1339	} else if ((tp->t_flagsext & TF_SENT_TLPROBE)
1340	    && tp->t_tlphighrxt > 0
1341	    && SEQ_GEQ(th->th_ack, tp->t_tlphighrxt)
1342	    && !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
1343		/*
1344		 * The tail loss probe recovered the last packet and
1345		 * we need to adjust the congestion window to take
1346		 * this loss into account.
1347		 */
1348		++tcpstat.tcps_tlp_recoverlastpkt;
1349		if (!IN_FASTRECOVERY(tp)) {
1350			tcp_reduce_congestion_window(tp);
1351			EXIT_FASTRECOVERY(tp);
1352		}
1353		tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
1354	}
1355
1356	tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1357	tp->t_tlphighrxt = 0;
1358	tp->t_tlpstart = 0;
1359
1360	/*
1361	 * check if the latest ack was for a segment sent during PMTU
1362	 * blackhole detection. If the timestamp on the ack is before
1363	 * PMTU blackhole detection, then revert the size of the max
1364	 * segment to previous size.
1365	 */
1366	if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
1367	    tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
1368		if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
1369		    && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
1370			tcp_pmtud_revert_segment_size(tp);
1371		}
1372	}
1373	if (tp->t_pmtud_start_ts > 0)
1374		tp->t_pmtud_start_ts = 0;
1375}
1376
1377/*
1378 * Check if early retransmit can be attempted according to RFC 5827.
1379 *
1380 * If packet reordering is detected on a connection, fast recovery will
1381 * be delayed until it is clear that the packet was lost and not reordered.
1382 * But reordering detection is done only when SACK is enabled.
1383 *
1384 * On connections that do not support SACK, there is a limit on the number
1385 * of early retransmits that can be done per minute. This limit is needed
1386 * to make sure that too many packets are not retransmitted when there is
1387 * packet reordering.
1388 */
1389static void
1390tcp_early_rexmt_check (struct tcpcb *tp, struct tcphdr *th)
1391{
1392	u_int32_t obytes, snd_off;
1393	int32_t snd_len;
1394	struct socket *so = tp->t_inpcb->inp_socket;
1395
1396	if (early_rexmt && (SACK_ENABLED(tp) ||
1397	    tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
1398	    SEQ_GT(tp->snd_max, tp->snd_una) &&
1399	    (tp->t_dupacks == 1 ||
1400	    (SACK_ENABLED(tp) &&
1401	    !TAILQ_EMPTY(&tp->snd_holes)))) {
1402		/*
1403		 * If there are only a few outstanding
1404		 * segments on the connection, we might need
1405		 * to lower the retransmit threshold. This
1406		 * will allow us to do Early Retransmit as
1407		 * described in RFC 5827.
1408		 */
1409		if (SACK_ENABLED(tp) &&
1410		    !TAILQ_EMPTY(&tp->snd_holes)) {
1411			obytes = (tp->snd_max - tp->snd_fack) +
1412				tp->sackhint.sack_bytes_rexmit;
1413		} else {
1414			obytes = (tp->snd_max - tp->snd_una);
1415		}
1416
1417		/*
1418		 * In order to lower retransmit threshold the
1419		 * following two conditions must be met.
1420		 * 1. the amount of outstanding data is less
1421		 * than 4*SMSS bytes
1422		 * 2. there is no unsent data ready for
1423		 * transmission or the advertised window
1424		 * will limit sending new segments.
1425		 */
1426		snd_off = tp->snd_max - tp->snd_una;
1427		snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
1428		if (obytes < (tp->t_maxseg << 2) &&
1429		    snd_len <= 0) {
1430			u_int32_t osegs;
1431
1432			osegs = obytes / tp->t_maxseg;
1433			if ((osegs * tp->t_maxseg) < obytes)
1434				osegs++;
1435
1436			/*
1437			 * Since the connection might have already
1438			 * received some dupacks, we add them to
1439			 * to the outstanding segments count to get
1440			 * the correct retransmit threshold.
1441			 *
1442			 * By checking for early retransmit after
1443			 * receiving some duplicate acks when SACK
1444			 * is supported, the connection will
1445			 * enter fast recovery even if multiple
1446			 * segments are lost in the same window.
1447			 */
1448			osegs += tp->t_dupacks;
1449			if (osegs < 4) {
1450				tp->t_rexmtthresh =
1451				    ((osegs - 1) > 1) ? (osegs - 1) : 1;
1452				tp->t_rexmtthresh =
1453				    min(tp->t_rexmtthresh, tcprexmtthresh);
1454				tp->t_rexmtthresh =
1455				    max(tp->t_rexmtthresh, tp->t_dupacks);
1456
1457				if (tp->t_early_rexmt_count == 0)
1458					tp->t_early_rexmt_win = tcp_now;
1459
1460				if (tp->t_flagsext & TF_SENT_TLPROBE) {
1461					tcpstat.tcps_tlp_recovery++;
1462					tcp_ccdbg_trace(tp, th,
1463					    TCP_CC_TLP_RECOVERY);
1464				} else {
1465					tcpstat.tcps_early_rexmt++;
1466					tp->t_early_rexmt_count++;
1467					tcp_ccdbg_trace(tp, th,
1468					    TCP_CC_EARLY_RETRANSMIT);
1469				}
1470			}
1471		}
1472	}
1473
1474	/*
1475	 * If we ever sent a TLP probe, the acknowledgement will trigger
1476	 * early retransmit because the value of snd_fack will be close
1477	 * to snd_max. This will take care of adjustments to the
1478	 * congestion window. So we can reset TF_SENT_PROBE flag.
1479	 */
1480	tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1481	tp->t_tlphighrxt = 0;
1482	tp->t_tlpstart = 0;
1483}
1484
1485void
1486tcp_input(m, off0)
1487	struct mbuf *m;
1488	int off0;
1489{
1490	register struct tcphdr *th;
1491	register struct ip *ip = NULL;
1492	register struct inpcb *inp;
1493	u_char *optp = NULL;
1494	int optlen = 0;
1495	int tlen, off;
1496	int drop_hdrlen;
1497	register struct tcpcb *tp = 0;
1498	register int thflags;
1499	struct socket *so = 0;
1500	int todrop, acked, ourfinisacked, needoutput = 0;
1501	struct in_addr laddr;
1502#if INET6
1503	struct in6_addr laddr6;
1504#endif
1505	int dropsocket = 0;
1506	int iss = 0, nosock = 0;
1507	u_int32_t tiwin, sack_bytes_acked = 0;
1508	struct tcpopt to;		/* options in this segment */
1509	struct sockaddr_in *next_hop = NULL;
1510#if TCPDEBUG
1511	short ostate = 0;
1512#endif
1513	struct m_tag *fwd_tag;
1514	u_char ip_ecn = IPTOS_ECN_NOTECT;
1515	unsigned int ifscope;
1516	uint8_t isconnected, isdisconnected;
1517	struct ifnet *ifp = m->m_pkthdr.rcvif;
1518	int pktf_sw_lro_pkt = (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) ? 1 : 0;
1519	int nlropkts = (pktf_sw_lro_pkt == 1) ? m->m_pkthdr.lro_npkts : 1;
1520	int turnoff_lro = 0, win;
1521#if MPTCP
1522	struct mptcb *mp_tp = NULL;
1523	uint16_t mptcp_csum = 0;
1524#endif /* MPTCP */
1525	boolean_t cell = IFNET_IS_CELLULAR(ifp);
1526	boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
1527	boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
1528	struct tcp_respond_args tra;
1529
1530#define TCP_INC_VAR(stat, npkts) do {			\
1531		stat += npkts;				\
1532} while (0)
1533
1534	TCP_INC_VAR(tcpstat.tcps_rcvtotal, nlropkts);
1535
1536	/* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
1537	if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1538		fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1539		    KERNEL_TAG_TYPE_IPFORWARD, NULL);
1540	} else {
1541		fwd_tag = NULL;
1542	}
1543	if (fwd_tag != NULL) {
1544		struct ip_fwd_tag *ipfwd_tag =
1545			(struct ip_fwd_tag *)(fwd_tag+1);
1546
1547		next_hop = ipfwd_tag->next_hop;
1548		m_tag_delete(m, fwd_tag);
1549	}
1550
1551#if INET6
1552	struct ip6_hdr *ip6 = NULL;
1553	int isipv6;
1554#endif /* INET6 */
1555	int rstreason; /* For badport_bandlim accounting purposes */
1556	struct proc *proc0=current_proc();
1557
1558	KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START,0,0,0,0,0);
1559
1560#if INET6
1561	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
1562#endif
1563	bzero((char *)&to, sizeof(to));
1564
1565#if INET6
1566	if (isipv6) {
1567		/*
1568		 * Expect 32-bit aligned data pointer on
1569		 * strict-align platforms
1570		 */
1571		MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1572
1573		/* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
1574		ip6 = mtod(m, struct ip6_hdr *);
1575		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
1576		th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1577
1578		if (tcp_input_checksum(AF_INET6, m, th, off0, tlen))
1579			goto dropnosock;
1580
1581		KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1582		     (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
1583		     th->th_seq, th->th_ack, th->th_win);
1584		/*
1585		 * Be proactive about unspecified IPv6 address in source.
1586		 * As we use all-zero to indicate unbounded/unconnected pcb,
1587		 * unspecified IPv6 address can be used to confuse us.
1588		 *
1589		 * Note that packets with unspecified IPv6 destination is
1590		 * already dropped in ip6_input.
1591		 */
1592		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
1593			/* XXX stat */
1594			IF_TCP_STATINC(ifp, unspecv6);
1595			goto dropnosock;
1596		}
1597		DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1598			struct ip6_hdr *, ip6, struct tcpcb *, NULL,
1599			struct tcphdr *, th);
1600
1601		ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
1602	} else
1603#endif /* INET6 */
1604	{
1605	/*
1606	 * Get IP and TCP header together in first mbuf.
1607	 * Note: IP leaves IP header in first mbuf.
1608	 */
1609	if (off0 > sizeof (struct ip)) {
1610		ip_stripoptions(m, (struct mbuf *)0);
1611		off0 = sizeof(struct ip);
1612	}
1613	if (m->m_len < sizeof (struct tcpiphdr)) {
1614		if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
1615			tcpstat.tcps_rcvshort++;
1616			return;
1617		}
1618	}
1619
1620	/* Expect 32-bit aligned data pointer on strict-align platforms */
1621	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1622
1623	ip = mtod(m, struct ip *);
1624	th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1625	tlen = ip->ip_len;
1626
1627	if (tcp_input_checksum(AF_INET, m, th, off0, tlen))
1628		goto dropnosock;
1629
1630#if INET6
1631	/* Re-initialization for later version check */
1632	ip->ip_v = IPVERSION;
1633#endif
1634	ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
1635
1636	DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
1637		struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);
1638
1639	KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
1640		(((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
1641		  th->th_seq, th->th_ack, th->th_win);
1642
1643	}
1644
1645	/*
1646	 * Check that TCP offset makes sense,
1647	 * pull out TCP options and adjust length.		XXX
1648	 */
1649	off = th->th_off << 2;
1650	if (off < sizeof (struct tcphdr) || off > tlen) {
1651		tcpstat.tcps_rcvbadoff++;
1652		IF_TCP_STATINC(ifp, badformat);
1653		goto dropnosock;
1654	}
1655	tlen -= off;	/* tlen is used instead of ti->ti_len */
1656	if (off > sizeof (struct tcphdr)) {
1657#if INET6
1658		if (isipv6) {
1659			IP6_EXTHDR_CHECK(m, off0, off, return);
1660			ip6 = mtod(m, struct ip6_hdr *);
1661			th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1662		} else
1663#endif /* INET6 */
1664		{
1665			if (m->m_len < sizeof(struct ip) + off) {
1666				if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
1667					tcpstat.tcps_rcvshort++;
1668					return;
1669				}
1670				ip = mtod(m, struct ip *);
1671				th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
1672			}
1673		}
1674		optlen = off - sizeof (struct tcphdr);
1675		optp = (u_char *)(th + 1);
1676		/*
1677		 * Do quick retrieval of timestamp options ("options
1678		 * prediction?").  If timestamp is the only option and it's
1679		 * formatted as recommended in RFC 1323 appendix A, we
1680		 * quickly get the values now and not bother calling
1681		 * tcp_dooptions(), etc.
1682		 */
1683		if ((optlen == TCPOLEN_TSTAMP_APPA ||
1684			(optlen > TCPOLEN_TSTAMP_APPA &&
1685			optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1686			*(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
1687			(th->th_flags & TH_SYN) == 0) {
1688			to.to_flags |= TOF_TS;
1689			to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
1690			to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
1691			optp = NULL;	/* we've parsed the options */
1692		}
1693	}
1694	thflags = th->th_flags;
1695
1696#if TCP_DROP_SYNFIN
1697	/*
1698	 * If the drop_synfin option is enabled, drop all packets with
1699	 * both the SYN and FIN bits set. This prevents e.g. nmap from
1700	 * identifying the TCP/IP stack.
1701	 *
1702	 * This is a violation of the TCP specification.
1703	 */
1704	if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) {
1705		IF_TCP_STATINC(ifp, synfin);
1706		goto dropnosock;
1707	}
1708#endif
1709
1710	/*
1711	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
1712	 * until after ip6_savecontrol() is called and before other functions
1713	 * which don't want those proto headers.
1714	 * Because ip6_savecontrol() is going to parse the mbuf to
1715	 * search for data to be passed up to user-land, it wants mbuf
1716	 * parameters to be unchanged.
1717	 */
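	/*
	 * drop_hdrlen is the number of protocol header bytes in front of the
	 * TCP payload: the IP (or IPv6 plus extension) headers plus the TCP
	 * header including options.
	 */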
1718	drop_hdrlen = off0 + off;
1719
1720	/* Since this is an entry point for input processing of tcp packets, we
1721	 * can update the tcp clock here.
1722	 */
1723	calculate_tcp_clock();
1724
1725	/*
1726	 * Record the interface on which this segment arrived; this does not
1727	 * affect normal data output (for non-detached TCP) as it provides a
1728	 * hint about which route and interface to use for sending in the
1729	 * absence of a PCB, when scoped routing (and thus source interface
1730	 * selection) are enabled.
1731	 */
1732	if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL)
1733		ifscope = IFSCOPE_NONE;
1734	else
1735		ifscope = m->m_pkthdr.rcvif->if_index;
1736
1737	/*
1738	 * Convert TCP protocol specific fields to host format.
1739	 */
1740
1741#if BYTE_ORDER != BIG_ENDIAN
1742	NTOHL(th->th_seq);
1743	NTOHL(th->th_ack);
1744	NTOHS(th->th_win);
1745	NTOHS(th->th_urp);
1746#endif
1747
1748	/*
1749	 * Locate pcb for segment.
1750	 */
1751findpcb:
1752
1753	isconnected = FALSE;
1754	isdisconnected = FALSE;
1755
1756#if IPFIREWALL_FORWARD
1757	if (next_hop != NULL
1758#if INET6
1759	    && isipv6 == 0 /* IPv6 support is not yet implemented */
1760#endif /* INET6 */
1761	    ) {
1762		/*
1763		 * Diverted. Pretend to be the destination.
1764		 * already got one like this?
1765		 */
1766		inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
1767			ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
1768		if (!inp) {
1769			/*
1770			 * No, then it's new. Try to find the ambushing socket.
1771			 */
1772			if (!next_hop->sin_port) {
1773				inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
1774				    th->th_sport, next_hop->sin_addr,
1775				    th->th_dport, 1, m->m_pkthdr.rcvif);
1776			} else {
1777				inp = in_pcblookup_hash(&tcbinfo,
1778				    ip->ip_src, th->th_sport,
1779	    			    next_hop->sin_addr,
1780				    ntohs(next_hop->sin_port), 1,
1781				    m->m_pkthdr.rcvif);
1782			}
1783		}
1784	} else
1785#endif	/* IPFIREWALL_FORWARD */
1786      {
1787#if INET6
1788	if (isipv6)
1789		inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
1790					 &ip6->ip6_dst, th->th_dport, 1,
1791					 m->m_pkthdr.rcvif);
1792	else
1793#endif /* INET6 */
1794	inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
1795	    ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
1796      }
1797
1798	/*
1799	 * Use the interface scope information from the PCB for outbound
1800	 * segments.  If the PCB isn't present and if scoped routing is
1801	 * enabled, tcp_respond will use the scope of the interface on which
1802	 * the segment arrived.
1803	 */
1804	if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
1805		ifscope = inp->inp_boundifp->if_index;
1806#if NECP
1807	if (inp != NULL && (
1808#if INET6
1809		isipv6 ? !necp_socket_is_allowed_to_send_recv_v6(inp,
1810			th->th_dport, th->th_sport, &ip6->ip6_dst,
1811			&ip6->ip6_src, ifp, NULL) :
1812#endif
1813		!necp_socket_is_allowed_to_send_recv_v4(inp, th->th_dport,
1814			th->th_sport, &ip->ip_dst, &ip->ip_src,
1815			ifp, NULL))) {
1816		if (in_pcb_checkstate(inp, WNT_RELEASE, 0)
1817		    == WNT_STOPUSING) {
1818			inp = NULL;	/* pretend we didn't find it */
1819		}
1820		IF_TCP_STATINC(ifp, badformatipsec);
1821		goto dropnosock;
1822	}
1823#endif /* NECP */
1824
1825	/*
1826	 * If the state is CLOSED (i.e., TCB does not exist) then
1827	 * all data in the incoming segment is discarded.
1828	 * If the TCB exists but is in CLOSED state, it is embryonic,
1829	 * but should either do a listen or a connect soon.
1830	 */
1831	if (inp == NULL) {
1832		if (log_in_vain) {
1833#if INET6
1834			char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
1835#else /* INET6 */
1836			char dbuf[MAX_IPv4_STR_LEN], sbuf[MAX_IPv4_STR_LEN];
1837#endif /* INET6 */
1838
1839#if INET6
1840			if (isipv6) {
1841				inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
1842				inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
1843			} else
1844#endif
1845			{
1846				inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
1847				inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
1848			}
1849			switch (log_in_vain) {
1850			case 1:
1851				if (thflags & TH_SYN)
1852					log(LOG_INFO,
1853						"Connection attempt to TCP %s:%d from %s:%d\n",
1854						dbuf, ntohs(th->th_dport),
1855						sbuf,
1856						ntohs(th->th_sport));
1857				break;
1858			case 2:
1859				log(LOG_INFO,
1860					"Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
1861					dbuf, ntohs(th->th_dport), sbuf,
1862					ntohs(th->th_sport), thflags);
1863				break;
1864			case 3:
1865			case 4:
1866				if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
1867					!(m->m_flags & (M_BCAST | M_MCAST)) &&
1868#if INET6
1869					((isipv6 && !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) ||
1870					 (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))
1871#else
1872					ip->ip_dst.s_addr != ip->ip_src.s_addr
1873#endif
1874					 )
1875					log_in_vain_log((LOG_INFO,
1876						"Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
1877						dbuf, ntohs(th->th_dport),
1878						sbuf,
1879						ntohs(th->th_sport)));
1880				break;
1881			default:
1882				break;
1883			}
1884		}
1885		if (blackhole) {
1886			if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP)
1887
1888				switch (blackhole) {
1889				case 1:
1890					if (thflags & TH_SYN)
1891						goto dropnosock;
1892					break;
1893				case 2:
1894					goto dropnosock;
1895				default:
1896					goto dropnosock;
1897				}
1898		}
1899		rstreason = BANDLIM_RST_CLOSEDPORT;
1900		IF_TCP_STATINC(ifp, noconnnolist);
1901		goto dropwithresetnosock;
1902	}
1903	so = inp->inp_socket;
1904	if (so == NULL) {
1905		/* This case shouldn't happen, as the socket shouldn't be NULL
1906		 * unless inp_state is set to INPCB_STATE_DEAD.
1907		 * But just in case, pretend we didn't find the socket if we hit this case,
1908		 * as it isn't cause for a panic (the socket might be leaked, however)...
1909		 */
1910		inp = NULL;
1911#if TEMPDEBUG
1912		printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
1913#endif
1914		goto dropnosock;
1915	}
1916
1917	tcp_lock(so, 1, 0);
1918	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1919		tcp_unlock(so, 1, (void *)2);
1920		inp = NULL;	// pretend we didn't find it
1921		goto dropnosock;
1922	}
1923
1924	tp = intotcpcb(inp);
1925	if (tp == 0) {
1926		rstreason = BANDLIM_RST_CLOSEDPORT;
1927		IF_TCP_STATINC(ifp, noconnlist);
1928		goto dropwithreset;
1929	}
1930	if (tp->t_state == TCPS_CLOSED)
1931		goto drop;
1932
1933	/* Unscale the window into a 32-bit value. */
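	/* Per RFC 1323, the window field of a segment carrying SYN is never scaled. */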
1934	if ((thflags & TH_SYN) == 0)
1935		tiwin = th->th_win << tp->snd_scale;
1936	else
1937		tiwin = th->th_win;
1938
1939#if CONFIG_MACF_NET
1940	if (mac_inpcb_check_deliver(inp, m, AF_INET, SOCK_STREAM))
1941		goto drop;
1942#endif
1943
1944	/* Avoid processing packets while closing a listen socket */
1945	if (tp->t_state == TCPS_LISTEN &&
1946		(so->so_options & SO_ACCEPTCONN) == 0)
1947		goto drop;
1948
1949	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
1950#if TCPDEBUG
1951		if (so->so_options & SO_DEBUG) {
1952			ostate = tp->t_state;
1953#if INET6
1954			if (isipv6)
1955				bcopy((char *)ip6, (char *)tcp_saveipgen,
1956				      sizeof(*ip6));
1957			else
1958#endif /* INET6 */
1959			bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
1960			tcp_savetcp = *th;
1961		}
1962#endif
1963		if (so->so_options & SO_ACCEPTCONN) {
1964		    register struct tcpcb *tp0 = tp;
1965			struct socket *so2;
1966			struct socket *oso;
1967			struct sockaddr_storage from;
1968#if INET6
1969			struct inpcb *oinp = sotoinpcb(so);
1970#endif /* INET6 */
1971			struct ifnet *head_ifscope;
1972			unsigned int head_nocell, head_recvanyif,
1973				     head_noexpensive, head_awdl_unrestricted;
1974
1975			/* Get listener's bound-to-interface, if any */
1976			head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
1977			    inp->inp_boundifp : NULL;
1978			/* Get listener's no-cellular information, if any */
1979			head_nocell = INP_NO_CELLULAR(inp);
1980			/* Get listener's recv-any-interface, if any */
1981			head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
1982			/* Get listener's no-expensive information, if any */
1983			head_noexpensive = INP_NO_EXPENSIVE(inp);
1984			head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
1985
1986			/*
1987			 * If the state is LISTEN then ignore segment if it contains an RST.
1988			 * If the segment contains an ACK then it is bad; send a RST.
1989			 * If it does not contain a SYN then it is not interesting; drop it.
1990			 * If it is from this socket, drop it, it must be forged.
1991			 */
1992			if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
1993				IF_TCP_STATINC(ifp, listbadsyn);
1994
1995				if (thflags & TH_RST) {
1996					goto drop;
1997				}
1998				if (thflags & TH_ACK) {
1999					tp = NULL;
2000					tcpstat.tcps_badsyn++;
2001					rstreason = BANDLIM_RST_OPENPORT;
2002					goto dropwithreset;
2003				}
2004
2005				/* We come here if there is no SYN set */
2006				tcpstat.tcps_badsyn++;
2007				goto drop;
2008			}
2009			KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START,0,0,0,0,0);
2010			if (th->th_dport == th->th_sport) {
2011#if INET6
2012				if (isipv6) {
2013					if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
2014                                                       &ip6->ip6_src))
2015						goto drop;
2016				} else
2017#endif /* INET6 */
2018					if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
2019						goto drop;
2020			}
2021			/*
2022			 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
2023			 * in_broadcast() should never return true on a received
2024			 * packet with M_BCAST not set.
2025			 *
2026			 * Packets with a multicast source address should also
2027			 * be discarded.
2028			 */
2029			if (m->m_flags & (M_BCAST|M_MCAST))
2030				goto drop;
2031#if INET6
2032			if (isipv6) {
2033				if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2034					IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2035					goto drop;
2036			} else
2037#endif
2038			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2039				IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2040				ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2041				in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2042				goto drop;
2043
2044
2045#if INET6
2046			/*
2047			 * If deprecated address is forbidden,
2048			 * we do not accept SYN to deprecated interface
2049			 * address to prevent any new inbound connection from
2050			 * getting established.
2051			 * When we do not accept the SYN, we send a TCP RST
2052			 * with the deprecated source address (instead of dropping
2053			 * the segment).  We compromise because it is much better
2054			 * to send the peer a RST, and the RST will be the final
2055			 * packet of the exchange.
2056			 *
2057			 * If we do not forbid deprecated addresses, we accept
2058			 * the SYN packet.  RFC 4862 forbids dropping SYN in
2059			 * this case.
2060			 */
2061			if (isipv6 && !ip6_use_deprecated) {
2062				uint32_t ia6_flags;
2063
2064				if (ip6_getdstifaddr_info(m, NULL,
2065				    &ia6_flags) == 0) {
2066					if (ia6_flags & IN6_IFF_DEPRECATED) {
2067						tp = NULL;
2068						rstreason = BANDLIM_RST_OPENPORT;
2069						IF_TCP_STATINC(ifp, deprecate6);
2070						goto dropwithreset;
2071					}
2072				}
2073			}
2074#endif
2075			if (so->so_filt) {
2076#if INET6
2077				if (isipv6) {
2078					struct sockaddr_in6	*sin6 = (struct sockaddr_in6*)&from;
2079
2080					sin6->sin6_len = sizeof(*sin6);
2081					sin6->sin6_family = AF_INET6;
2082					sin6->sin6_port = th->th_sport;
2083					sin6->sin6_flowinfo = 0;
2084					sin6->sin6_addr = ip6->ip6_src;
2085					sin6->sin6_scope_id = 0;
2086 				}
2087				else
2088#endif
2089				{
2090					struct sockaddr_in *sin = (struct sockaddr_in*)&from;
2091
2092					sin->sin_len = sizeof(*sin);
2093					sin->sin_family = AF_INET;
2094					sin->sin_port = th->th_sport;
2095					sin->sin_addr = ip->ip_src;
2096				}
2097				so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2098			} else {
2099				so2 = sonewconn(so, 0, NULL);
2100			}
2101			if (so2 == 0) {
2102				tcpstat.tcps_listendrop++;
2103				if (tcp_dropdropablreq(so)) {
2104					if (so->so_filt)
2105						so2 = sonewconn(so, 0, (struct sockaddr*)&from);
2106					else
2107						so2 = sonewconn(so, 0, NULL);
2108				}
2109				if (!so2)
2110					goto drop;
2111			}
2112
2113			/* Point "inp" and "tp" in tandem to new socket */
2114			inp = (struct inpcb *)so2->so_pcb;
2115			tp = intotcpcb(inp);
2116
2117			oso = so;
2118			tcp_unlock(so, 0, 0); /* Unlock but keep a reference on listener for now */
2119
2120			so = so2;
2121			tcp_lock(so, 1, 0);
2122			/*
2123			 * Mark socket as temporary until we're
2124			 * committed to keeping it.  The code at
2125			 * ``drop'' and ``dropwithreset'' checks the
2126			 * flag dropsocket to see if the temporary
2127			 * socket created here should be discarded.
2128			 * We mark the socket as discardable until
2129			 * we're committed to it below in TCPS_LISTEN.
2130			 * There are some error conditions in which we
2131			 * have to drop the temporary socket.
2132			 */
2133			dropsocket++;
2134			/*
2135			 * Inherit INP_BOUND_IF from listener; testing if
2136			 * head_ifscope is non-NULL is sufficient, since it
2137			 * can only be set to a non-zero value earlier if
2138			 * the listener has such a flag set.
2139			 */
2140			if (head_ifscope != NULL) {
2141				inp->inp_flags |= INP_BOUND_IF;
2142				inp->inp_boundifp = head_ifscope;
2143			} else {
2144				inp->inp_flags &= ~INP_BOUND_IF;
2145			}
2146			/*
2147			 * Inherit restrictions from listener.
2148			 */
2149			if (head_nocell)
2150				inp_set_nocellular(inp);
2151			if (head_noexpensive)
2152				inp_set_noexpensive(inp);
2153			if (head_awdl_unrestricted)
2154				inp_set_awdl_unrestricted(inp);
2155			/*
2156			 * Inherit {IN,IN6}_RECV_ANYIF from listener.
2157			 */
2158			if (head_recvanyif)
2159				inp->inp_flags |= INP_RECV_ANYIF;
2160			else
2161				inp->inp_flags &= ~INP_RECV_ANYIF;
2162#if INET6
2163			if (isipv6)
2164				inp->in6p_laddr = ip6->ip6_dst;
2165			else {
2166				inp->inp_vflag &= ~INP_IPV6;
2167				inp->inp_vflag |= INP_IPV4;
2168#endif /* INET6 */
2169				inp->inp_laddr = ip->ip_dst;
2170#if INET6
2171			}
2172#endif /* INET6 */
2173			inp->inp_lport = th->th_dport;
2174			if (in_pcbinshash(inp, 0) != 0) {
2175				/*
2176				 * Undo the assignments above if we failed to
2177				 * put the PCB on the hash lists.
2178				 */
2179#if INET6
2180				if (isipv6)
2181					inp->in6p_laddr = in6addr_any;
2182				else
2183#endif /* INET6 */
2184					inp->inp_laddr.s_addr = INADDR_ANY;
2185				inp->inp_lport = 0;
2186				tcp_lock(oso, 0, 0);	/* release ref on parent */
2187				tcp_unlock(oso, 1, 0);
2188				goto drop;
2189			}
2190#if INET6
2191			if (isipv6) {
2192				/*
2193				 * Inherit socket options from the listening
2194				 * socket.
2195				 * Note that in6p_inputopts is not (and
2196				 * should not be) copied, since it stores
2197				 * previously received options and is used to
2198				 * detect if each new option is different from
2199				 * the previous one and hence should be passed
2200				 * to the user.
2201				 * If we copied in6p_inputopts, the user would
2202				 * not be able to receive options just after
2203				 * calling the accept system call.
2204				 */
2205				inp->inp_flags |=
2206					oinp->inp_flags & INP_CONTROLOPTS;
2207				if (oinp->in6p_outputopts)
2208					inp->in6p_outputopts =
2209						ip6_copypktopts(oinp->in6p_outputopts,
2210								M_NOWAIT);
2211			} else
2212#endif /* INET6 */
2213				inp->inp_options = ip_srcroute();
2214			tcp_lock(oso, 0, 0);
2215#if IPSEC
2216			/* copy old policy into new socket's */
2217			if (sotoinpcb(oso)->inp_sp)
2218			{
2219				int error = 0;
2220				/* Is it a security hole here to silently fail to copy the policy? */
2221				if (inp->inp_sp != NULL)
2222					error = ipsec_init_policy(so, &inp->inp_sp);
2223				if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
2224					printf("tcp_input: could not copy policy\n");
2225			}
2226#endif
2227			/* inherit states from the listener */
2228			DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2229				struct tcpcb *, tp, int32_t, TCPS_LISTEN);
2230			tp->t_state = TCPS_LISTEN;
2231			tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
2232			tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP|TF_NOTIMEWAIT));
2233			tp->t_keepinit = tp0->t_keepinit;
2234			tp->t_keepcnt = tp0->t_keepcnt;
2235			tp->t_keepintvl = tp0->t_keepintvl;
2236			tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo;
2237			tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo;
2238			tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
2239			if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0)
2240				tp->t_notsent_lowat = tp0->t_notsent_lowat;
2241
2242			/* now drop the reference on the listener */
2243			tcp_unlock(oso, 1, 0);
2244
2245			tcp_set_max_rwinscale(tp, so);
2246
2247			KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
2248		}
2249	}
2250	lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2251		LCK_MTX_ASSERT_OWNED);
2252
2253	if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
2254		/*
2255		 * Evaluate the rate of arrival of packets to see if the
2256		 * receiver can reduce the ack traffic. The algorithm to
2257		 * stretch acks will be enabled if the connection meets
2258		 * certain criteria defined in the tcp_stretch_ack_enable() function.
2259		 */
2260		if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
2261			TCP_INC_VAR(tp->rcv_waitforss, nlropkts);
2262		}
2263		if (tcp_stretch_ack_enable(tp)) {
2264			tp->t_flags |= TF_STRETCHACK;
2265			tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
2266			tp->rcv_waitforss = 0;
2267		} else {
2268			tp->t_flags &= ~(TF_STRETCHACK);
2269		}
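		/*
		 * Accumulate the bytes received within the current measurement
		 * window; once the window expires, start a new one.
		 */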
2270		if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) {
2271			tp->rcv_by_unackwin += (tlen + off);
2272		} else {
2273			tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
2274			tp->rcv_by_unackwin = tlen + off;
2275		}
2276	}
2277
2278	/*
2279	 * Keep track of how many bytes were received in the LRO packet
2280	 */
2281	if ((pktf_sw_lro_pkt) && (nlropkts > 2))  {
2282		tp->t_lropktlen += tlen;
2283	}
2284	/*
2285	 * Explicit Congestion Notification - Flag that we need to send an ECN echo (ECE) if
2286	 * 	+ The IP Congestion experienced flag was set.
2287	 * 	+ Socket is in established state
2288	 * 	+ We negotiated ECN in the TCP setup
2289	 * 	+ This isn't a pure ack (tlen > 0)
2290	 * 	+ The data is in the valid window
2291	 *
2292	 * 	TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
2293	 */
2294	if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2295		((tp->ecn_flags & (TE_ECN_ON)) == (TE_ECN_ON)) && tlen > 0 &&
2296		SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2297		SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
2298		tp->ecn_flags |= TE_SENDECE;
2299	}
2300
2301	/*
2302	 * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
2303	 * bother doing extensive checks for state and whatnot.
2304	 */
2305	if ((thflags & TH_CWR) == TH_CWR) {
2306		tp->ecn_flags &= ~TE_SENDECE;
2307	}
2308
2309	/*
2310	 * If we received an explicit notification of congestion, either in
2311	 * the IP TOS ECN bits or via the CWR bit in the TCP header flags, reset
2312	 * the ack-stretching state. We need to handle ECN notification if
2313	 * an ECN-setup SYN was sent even once.
2314	 */
2315	if (tp->t_state == TCPS_ESTABLISHED
2316	    && (tp->ecn_flags & TE_SETUPSENT)
2317	    && (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR)))
2318		tcp_reset_stretch_ack(tp);
2319
2320	/*
2321	 * Try to determine if we are receiving a packet after a long time.
2322	 * Use our own approximation of idle time to roughly measure the remote
2323	 * end's idle time. Since slow start is used after an idle period,
2324	 * we want to avoid doing LRO if the remote end is not up to date
2325	 * on initial window support and starts with 1 or 2 packets as its IW.
2326	 */
2327	 if (sw_lro && (tp->t_flagsext & TF_LRO_OFFLOADED) &&
2328	 	((tcp_now - tp->t_rcvtime) >= (TCP_IDLETIMEOUT(tp)))) {
2329		turnoff_lro = 1;
2330	 }
2331
2332	/* Update rcvtime as a new segment was received on the connection */
2333	tp->t_rcvtime = tcp_now;
2334
2335	/*
2336	 * Segment received on connection.
2337	 * Reset idle time and keep-alive timer.
2338	 */
2339	if (TCPS_HAVEESTABLISHED(tp->t_state))
2340		tcp_keepalive_reset(tp);
2341
2342	/*
2343	 * Process options if not in LISTEN state,
2344	 * else do it below (after getting remote address).
2345	 */
2346	if (tp->t_state != TCPS_LISTEN && optp) {
2347		tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
2348#if MPTCP
2349		mptcp_csum = mptcp_input_csum(tp, m, drop_hdrlen);
2350		if (mptcp_csum) {
2351			tp->t_mpflags |= TMPF_SND_MPFAIL;
2352			tp->t_mpflags &= ~TMPF_EMBED_DSN;
2353			mptcp_notify_mpfail(so);
2354			m_freem(m);
2355			tcpstat.tcps_mp_badcsum++;
2356			tcp_check_timer_state(tp);
2357			tcp_unlock(so, 1, 0);
2358			KERNEL_DEBUG(DBG_FNC_TCP_INPUT |
2359			    DBG_FUNC_END,0,0,0,0,0);
2360			return;
2361		}
2362		mptcp_insert_rmap(tp, m);
2363#endif /* MPTCP */
2364	}
2365	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2366		if (to.to_flags & TOF_TS) {
2367			tp->t_flags |= TF_RCVD_TSTMP;
2368			tp->ts_recent = to.to_tsval;
2369			tp->ts_recent_age = tcp_now;
2370		}
2371		if (to.to_flags & TOF_MSS)
2372			tcp_mss(tp, to.to_mss, ifscope);
2373		if (SACK_ENABLED(tp)) {
2374			if (!(to.to_flags & TOF_SACK))
2375				tp->t_flagsext &= ~(TF_SACK_ENABLE);
2376			else
2377				tp->t_flags |= TF_SACK_PERMIT;
2378		}
2379	}
2380
2381#if TRAFFIC_MGT
2382	/* Compute inter-packet arrival jitter. According to RFC 3550, inter-packet
2383	 * arrival jitter is defined as the difference in packet spacing at the
2384	 * receiver compared to the sender for a pair of packets. When two packets
2385	 * of maximum segment size come one after the other with consecutive
2386	 * sequence numbers, we consider them as packets sent together at the
2387	 * sender and use them as a pair to compute inter-packet arrival jitter.
2388	 * This metric indicates the delay induced by the network components due
2389	 * to queuing in edge/access routers.
2390	 */
2391	if (tp->t_state == TCPS_ESTABLISHED &&
2392	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK &&
2393	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2394	    ((to.to_flags & TOF_TS) == 0 ||
2395            TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2396	    th->th_seq == tp->rcv_nxt &&
2397	    LIST_EMPTY(&tp->t_segq)) {
2398		int seg_size = tlen;
2399		if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
2400			TCP_INC_VAR(tp->iaj_pktcnt, nlropkts);
2401		}
2402
2403		if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2404			seg_size = m->m_pkthdr.lro_pktlen;
2405		}
2406		if ( tp->iaj_size == 0 || seg_size > tp->iaj_size ||
2407			(seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
2408			/* State related to inter-arrival jitter is uninitialized
2409			 * or we are trying to find a good first packet to start
2410			 * computing the metric
2411			 */
2412			update_iaj_state(tp, seg_size, 0);
2413		} else {
2414			if (seg_size == tp->iaj_size) {
2415				/* Compute inter-arrival jitter taking this packet
2416				 * as the second packet
2417				 */
2418				if (pktf_sw_lro_pkt)
2419					compute_iaj(tp, nlropkts,
2420					    m->m_pkthdr.lro_elapsed);
2421				else
2422					compute_iaj(tp, 1, 0);
2423			}
2424			if (seg_size  < tp->iaj_size) {
2425				/* There is a smaller packet in the stream.
2426				 * Sometimes the maximum size supported on a path can
2427				 * change if there is a new link with smaller MTU.
2428				 * The receiver will not know about this change.
2429				 * If there are too many packets smaller than iaj_size,
2430				 * we try to learn the iaj_size again.
2431				 */
2432				TCP_INC_VAR(tp->iaj_small_pkt, nlropkts);
2433				if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
2434					update_iaj_state(tp, seg_size, 1);
2435				} else {
2436					CLEAR_IAJ_STATE(tp);
2437				}
2438			} else {
2439				update_iaj_state(tp, seg_size, 0);
2440			}
2441		}
2442	} else {
2443		CLEAR_IAJ_STATE(tp);
2444	}
2445#endif /* TRAFFIC_MGT */
2446
2447	/*
2448	 * Header prediction: check for the two common cases
2449	 * of a uni-directional data xfer.  If the packet has
2450	 * no control flags, is in-sequence, the window didn't
2451	 * change and we're not retransmitting, it's a
2452	 * candidate.  If the length is zero and the ack moved
2453	 * forward, we're the sender side of the xfer.  Just
2454	 * free the data acked & wake any higher level process
2455	 * that was blocked waiting for space.  If the length
2456	 * is non-zero and the ack didn't move, we're the
2457	 * receiver side.  If we're getting packets in-order
2458	 * (the reassembly queue is empty), add the data to
2459	 * the socket buffer and note that we need a delayed ack.
2460	 * Make sure that the hidden state-flags are also off.
2461	 * Since we check for TCPS_ESTABLISHED above, it can only
2462	 * be TF_NEEDSYN.
2463	 */
2464	if (tp->t_state == TCPS_ESTABLISHED &&
2465	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE)) == TH_ACK &&
2466	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
2467	    ((to.to_flags & TOF_TS) == 0 ||
2468	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2469	    th->th_seq == tp->rcv_nxt &&
2470	    tiwin && tiwin == tp->snd_wnd &&
2471	    tp->snd_nxt == tp->snd_max) {
2472
2473		/*
2474		 * If last ACK falls within this segment's sequence numbers,
2475		 * record the timestamp.
2476		 * NOTE that the test is modified according to the latest
2477		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2478		 */
2479		if ((to.to_flags & TOF_TS) != 0 &&
2480		   SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2481			tp->ts_recent_age = tcp_now;
2482			tp->ts_recent = to.to_tsval;
2483		}
2484
2485		/* Force acknowledgment if we received a FIN */
2486
2487		if (thflags & TH_FIN)
2488			tp->t_flags |= TF_ACKNOW;
2489
2490		if (tlen == 0) {
2491			if (SEQ_GT(th->th_ack, tp->snd_una) &&
2492			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
2493			    tp->snd_cwnd >= tp->snd_ssthresh &&
2494			    (!IN_FASTRECOVERY(tp) &&
2495			    ((!(SACK_ENABLED(tp)) &&
2496			    tp->t_dupacks < tp->t_rexmtthresh) ||
2497			    (SACK_ENABLED(tp) && to.to_nsacks == 0 &&
2498			    TAILQ_EMPTY(&tp->snd_holes))))) {
2499				/*
2500				 * this is a pure ack for outstanding data.
2501				 */
2502				++tcpstat.tcps_predack;
2503
2504				tcp_bad_rexmt_check(tp, th, &to);
2505
2506				/* Recalculate the RTT */
2507				tcp_compute_rtt(tp, &to, th);
2508
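				/* Number of bytes newly acknowledged by this segment */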
2509				acked = BYTES_ACKED(th, tp);
2510				tcpstat.tcps_rcvackpack++;
2511				tcpstat.tcps_rcvackbyte += acked;
2512
2513				/* Handle an ack that is in sequence during congestion
2514				 * avoidance phase. The calculations in this function
2515				 * assume that snd_una is not updated yet.
2516				 */
2517				if (CC_ALGO(tp)->congestion_avd != NULL)
2518					CC_ALGO(tp)->congestion_avd(tp, th);
2519				tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD);
2520				sbdrop(&so->so_snd, acked);
2521				if (so->so_flags & SOF_ENABLE_MSGS) {
2522					VERIFY(acked <= so->so_msg_state->msg_serial_bytes);
2523					so->so_msg_state->msg_serial_bytes -= acked;
2524				}
2525				tcp_sbsnd_trim(&so->so_snd);
2526
2527				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2528				    SEQ_LEQ(th->th_ack, tp->snd_recover))
2529					tp->snd_recover = th->th_ack - 1;
2530				tp->snd_una = th->th_ack;
2531
2532				/*
2533				 * pull snd_wl2 up to prevent seq wrap relative
2534				 * to th_ack.
2535				 */
2536				tp->snd_wl2 = th->th_ack;
2537
2538				if (tp->t_dupacks > 0) {
2539					tp->t_dupacks = 0;
2540					tp->t_rexmtthresh = tcprexmtthresh;
2541				}
2542
2543				m_freem(m);
2544
2545				/*
2546				 * If all outstanding data are acked, stop
2547				 * retransmit timer, otherwise restart timer
2548				 * using current (possibly backed-off) value.
2549				 * If process is waiting for space,
2550				 * wakeup/selwakeup/signal.  If data
2551				 * are ready to send, let tcp_output
2552				 * decide between more output or persist.
2553				 */
2554				if (tp->snd_una == tp->snd_max) {
2555					tp->t_timer[TCPT_REXMT] = 0;
2556					tp->t_timer[TCPT_PTO] = 0;
2557				} else if (tp->t_timer[TCPT_PERSIST] == 0) {
2558					tp->t_timer[TCPT_REXMT] =
2559					    OFFSET_FROM_START(tp,
2560					    tp->t_rxtcur);
2561				}
2562
2563				if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
2564					tp->t_bwmeas != NULL)
2565					tcp_bwmeas_check(tp);
2566				sowwakeup(so); /* has to be done with socket lock held */
2567				if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
2568					(void) tcp_output(tp);
2569				}
2570
2571				tcp_check_timer_state(tp);
2572				tcp_unlock(so, 1, 0);
2573				KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2574				return;
2575			}
2576		} else if (th->th_ack == tp->snd_una &&
2577		    LIST_EMPTY(&tp->t_segq) &&
2578		    tlen <= tcp_sbspace(tp)) {
2579			/*
2580			 * this is a pure, in-sequence data packet
2581			 * with nothing on the reassembly queue and
2582			 * we have enough buffer space to take it.
2583			 */
2584
2585			/*
2586			 * If this is a connection in steady state, start
2587			 * coalescing packets belonging to this flow.
2588			 */
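			/*
			 * Either stop LRO offload because the flow has been idle, or
			 * begin coalescing once the flow has advanced more than
			 * (t_maxseg << lro_start) bytes past the ISN and past any
			 * recorded idle point.
			 */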
2589			if (turnoff_lro) {
2590				tcp_lro_remove_state(tp->t_inpcb->inp_laddr,
2591					tp->t_inpcb->inp_faddr,
2592					tp->t_inpcb->inp_lport,
2593					tp->t_inpcb->inp_fport);
2594				tp->t_flagsext &= ~TF_LRO_OFFLOADED;
2595				tp->t_idleat = tp->rcv_nxt;
2596			} else if (sw_lro && !pktf_sw_lro_pkt && !isipv6 &&
2597			    (so->so_flags & SOF_USELRO) &&
2598			    !IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) &&
2599  			    (m->m_pkthdr.rcvif->if_type != IFT_LOOP) &&
2600			    ((th->th_seq - tp->irs) >
2601			    (tp->t_maxseg << lro_start)) &&
2602			    ((tp->t_idleat == 0) || ((th->th_seq -
2603			     tp->t_idleat) > (tp->t_maxseg << lro_start)))) {
2604				tp->t_flagsext |= TF_LRO_OFFLOADED;
2605				tcp_start_coalescing(ip, th, tlen);
2606				tp->t_idleat = 0;
2607			}
2608
2609			/* Clean receiver SACK report if present */
2610			if (SACK_ENABLED(tp) && tp->rcv_numsacks)
2611				tcp_clean_sackreport(tp);
2612			++tcpstat.tcps_preddat;
2613			tp->rcv_nxt += tlen;
2614			/*
2615			 * Pull snd_wl1 up to prevent seq wrap relative to
2616			 * th_seq.
2617			 */
2618			tp->snd_wl1 = th->th_seq;
2619			/*
2620			 * Pull rcv_up up to prevent seq wrap relative to
2621			 * rcv_nxt.
2622			 */
2623			tp->rcv_up = tp->rcv_nxt;
2624			TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
2625			tcpstat.tcps_rcvbyte += tlen;
2626			if (nstat_collect) {
2627				if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
2628					INP_ADD_STAT(inp, cell, wifi, wired,
2629					    rxpackets, m->m_pkthdr.lro_npkts);
2630				} else {
2631					INP_ADD_STAT(inp, cell, wifi, wired,
2632					    rxpackets, 1);
2633				}
2634				INP_ADD_STAT(inp, cell, wifi, wired,rxbytes,
2635				    tlen);
2636			}
2637
2638			/*
2639			 * Calculate the RTT on the receiver only if the
2640			 * connection is in streaming mode and the last
2641			 * packet was not an end-of-write
2642			 */
2643			if ((tp->t_flags & TF_STRETCHACK) &&
2644				!(tp->t_flagsext & TF_STREAMEOW))
2645				tcp_compute_rtt(tp, &to, th);
2646
2647			tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
2648
2649			/*
2650			 * Add data to socket buffer.
2651			 */
2652			so_recv_data_stat(so, m, 0);
2653			m_adj(m, drop_hdrlen);	/* delayed header drop */
2654
2655			/*
2656			 * If message delivery (SOF_ENABLE_MSGS) is enabled on
2657			 * this socket, deliver the packet received as an
2658			 * in-order message with sequence number attached to it.
2659			 */
2660			if (sbappendstream_rcvdemux(so, m,
2661			    th->th_seq - (tp->irs + 1), 0)) {
2662				sorwakeup(so);
2663			}
2664#if INET6
2665			if (isipv6) {
2666				KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2667		     			(((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2668			     		th->th_seq, th->th_ack, th->th_win);
2669			}
2670			else
2671#endif
2672			{
2673				KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
2674		     			(((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2675			     		th->th_seq, th->th_ack, th->th_win);
2676			}
2677			TCP_INC_VAR(tp->t_unacksegs, nlropkts);
2678			if (DELAY_ACK(tp, th))  {
2679				if ((tp->t_flags & TF_DELACK) == 0) {
2680			    		tp->t_flags |= TF_DELACK;
2681					tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2682				}
2683			} else {
2684				tp->t_flags |= TF_ACKNOW;
2685				tcp_output(tp);
2686			}
2687
2688			tcp_adaptive_rwtimo_check(tp, tlen);
2689
2690			tcp_check_timer_state(tp);
2691			tcp_unlock(so, 1, 0);
2692			KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
2693			return;
2694		}
2695	}
2696
2697	/*
2698	 * Calculate amount of space in receive window,
2699	 * and then do TCP input processing.
2700	 * Receive window is amount of space in rcv queue,
2701	 * but not less than advertised window.
2702	 */
2703	lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2704	    LCK_MTX_ASSERT_OWNED);
2705	win = tcp_sbspace(tp);
2706	if (win < 0)
2707		win = 0;
2708	else {	/* clip rcv window to 4K for modems */
2709		if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
2710			win = min(win, slowlink_wsize);
2711	}
2712	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
2713#if MPTCP
2714	/*
2715	 * Ensure that the subflow receive window isn't greater
2716	 * than the connection level receive window.
2717	 */
2718	if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
2719	    (mp_tp = tptomptp(tp))) {
2720		MPT_LOCK(mp_tp);
2721		if (tp->rcv_wnd > mp_tp->mpt_rcvwnd) {
2722			tp->rcv_wnd = mp_tp->mpt_rcvwnd;
2723			tcpstat.tcps_mp_reducedwin++;
2724		}
2725		MPT_UNLOCK(mp_tp);
2726	}
2727#endif /* MPTCP */
2728
2729	switch (tp->t_state) {
2730
2731	/*
2732	 * Initialize tp->rcv_nxt, and tp->irs, select an initial
2733	 * tp->iss, and send a segment:
2734	 *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
2735	 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
2736	 * Fill in remote peer address fields if not previously specified.
2737	 * Enter SYN_RECEIVED state, and process any other fields of this
2738	 * segment in this state.
2739	 */
2740	case TCPS_LISTEN: {
2741		register struct sockaddr_in *sin;
2742#if INET6
2743		register struct sockaddr_in6 *sin6;
2744#endif
2745
2746		lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
2747		    LCK_MTX_ASSERT_OWNED);
2748#if INET6
2749		if (isipv6) {
2750			MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
2751			       M_SONAME, M_NOWAIT);
2752			if (sin6 == NULL)
2753				goto drop;
2754			bzero(sin6, sizeof(*sin6));
2755			sin6->sin6_family = AF_INET6;
2756			sin6->sin6_len = sizeof(*sin6);
2757			sin6->sin6_addr = ip6->ip6_src;
2758			sin6->sin6_port = th->th_sport;
2759			laddr6 = inp->in6p_laddr;
2760			if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
2761				inp->in6p_laddr = ip6->ip6_dst;
2762			if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
2763					   proc0)) {
2764				inp->in6p_laddr = laddr6;
2765				FREE(sin6, M_SONAME);
2766				goto drop;
2767			}
2768			FREE(sin6, M_SONAME);
2769		} else
2770#endif
2771	    {
2772			lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
2773			MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
2774		       M_NOWAIT);
2775			if (sin == NULL)
2776				goto drop;
2777			sin->sin_family = AF_INET;
2778			sin->sin_len = sizeof(*sin);
2779			sin->sin_addr = ip->ip_src;
2780			sin->sin_port = th->th_sport;
2781			bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
2782			laddr = inp->inp_laddr;
2783			if (inp->inp_laddr.s_addr == INADDR_ANY)
2784				inp->inp_laddr = ip->ip_dst;
2785			if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0,
2786			    IFSCOPE_NONE, NULL)) {
2787				inp->inp_laddr = laddr;
2788				FREE(sin, M_SONAME);
2789				goto drop;
2790			}
2791			FREE(sin, M_SONAME);
2792		}
2793
2794		tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
2795
2796		if (SACK_ENABLED(tp)) {
2797			if (!(to.to_flags & TOF_SACK))
2798				tp->t_flagsext &= ~(TF_SACK_ENABLE);
2799			else
2800				tp->t_flags |= TF_SACK_PERMIT;
2801		}
2802
2803		if (iss)
2804			tp->iss = iss;
2805		else {
2806			tp->iss = tcp_new_isn(tp);
2807 		}
2808		tp->irs = th->th_seq;
2809		tcp_sendseqinit(tp);
2810		tcp_rcvseqinit(tp);
2811		tp->snd_recover = tp->snd_una;
2812		/*
2813		 * Initialization of the tcpcb for transaction;
2814		 *   set SND.WND = SEG.WND,
2815		 *   initialize CCsend and CCrecv.
2816		 */
2817		tp->snd_wnd = tiwin;	/* initial send-window */
2818		tp->t_flags |= TF_ACKNOW;
2819		tp->t_unacksegs = 0;
2820		DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2821			struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
2822		tp->t_state = TCPS_SYN_RECEIVED;
2823		tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2824			TCP_CONN_KEEPINIT(tp));
2825		dropsocket = 0;		/* committed to socket */
2826
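		/* Lazily compute the flow hash used to classify this connection's packets. */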
2827		if (inp->inp_flowhash == 0)
2828			inp->inp_flowhash = inp_calc_flowhash(inp);
2829#if INET6
2830		/* update flowinfo - RFC 6437 */
2831		if (inp->inp_flow == 0 &&
2832		    inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
2833			inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
2834			inp->inp_flow |=
2835			    (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
2836		}
2837#endif /* INET6 */
2838
2839		/* reset the incomp processing flag */
2840		so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
2841		tcpstat.tcps_accepts++;
2842		if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
2843			/* ECN-setup SYN */
2844			tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
2845		}
2846
2847#if CONFIG_IFEF_NOWINDOWSCALE
2848		if (tcp_obey_ifef_nowindowscale && m->m_pkthdr.rcvif != NULL &&
2849		    (m->m_pkthdr.rcvif->if_eflags & IFEF_NOWINDOWSCALE)) {
2850			/* Window scaling is not enabled on this interface */
2851			tp->t_flags &= ~TF_REQ_SCALE;
2852		}
2853#endif
2854		goto trimthenstep6;
2855		}
2856
2857	/*
2858	 * If the state is SYN_RECEIVED:
2859	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
2860	 */
2861	case TCPS_SYN_RECEIVED:
2862		if ((thflags & TH_ACK) &&
2863		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
2864		     SEQ_GT(th->th_ack, tp->snd_max))) {
2865				rstreason = BANDLIM_RST_OPENPORT;
2866				IF_TCP_STATINC(ifp, ooopacket);
2867				goto dropwithreset;
2868		}
2869
2870		/*
2871		 * In SYN_RECEIVED state, if we receive some SYNs with
2872		 * window scale and others without, window scaling should
2873		 * be disabled. Otherwise the window advertised will be
2874		 * lower if we assume scaling and the other end does not.
2875		 */
2876		if ((thflags & TH_SYN) &&
2877		    !(to.to_flags & TOF_SCALE))
2878			tp->t_flags &= ~TF_RCVD_SCALE;
2879		break;
2880
2881	/*
2882	 * If the state is SYN_SENT:
2883	 *	if seg contains an ACK, but not for our SYN, drop the input.
2884	 *	if seg contains a RST, then drop the connection.
2885	 *	if seg does not contain SYN, then drop it.
2886	 * Otherwise this is an acceptable SYN segment
2887	 *	initialize tp->rcv_nxt and tp->irs
2888	 *	if seg contains ack then advance tp->snd_una
2889	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
2890	 *	arrange for segment to be acked (eventually)
2891	 *	continue processing rest of data/controls, beginning with URG
2892	 */
2893	case TCPS_SYN_SENT:
2894		if ((thflags & TH_ACK) &&
2895		    (SEQ_LEQ(th->th_ack, tp->iss) ||
2896		     SEQ_GT(th->th_ack, tp->snd_max))) {
2897			rstreason = BANDLIM_UNLIMITED;
2898			IF_TCP_STATINC(ifp, ooopacket);
2899			goto dropwithreset;
2900		}
2901		if (thflags & TH_RST) {
2902			if ((thflags & TH_ACK) != 0) {
2903#if MPTCP
2904				if ((so->so_flags & SOF_MPTCP_FASTJOIN) &&
2905					SEQ_GT(th->th_ack, tp->iss+1)) {
2906					so->so_flags &= ~SOF_MPTCP_FASTJOIN;
2907					/* ignore the RST and retransmit SYN */
2908					goto drop;
2909				}
2910#endif /* MPTCP */
2911				soevent(so,
2912				    (SO_FILT_HINT_LOCKED |
2913				    SO_FILT_HINT_CONNRESET));
2914				tp = tcp_drop(tp, ECONNREFUSED);
2915				postevent(so, 0, EV_RESET);
2916			}
2917			goto drop;
2918		}
2919		if ((thflags & TH_SYN) == 0)
2920			goto drop;
2921		tp->snd_wnd = th->th_win;	/* initial send window */
2922
2923		tp->irs = th->th_seq;
2924		tcp_rcvseqinit(tp);
2925		if (thflags & TH_ACK) {
2926			tcpstat.tcps_connects++;
2927
2928			if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
2929				/* ECN-setup SYN-ACK */
2930				tp->ecn_flags |= TE_SETUPRECEIVED;
2931				tcpstat.tcps_ecn_setup++;
2932			}
2933			else {
2934				/* non-ECN-setup SYN-ACK */
2935				tp->ecn_flags &= ~TE_SENDIPECT;
2936			}
2937
2938#if CONFIG_MACF_NET && CONFIG_MACF_SOCKET
2939			/* XXXMAC: recursive lock: SOCK_LOCK(so); */
2940			mac_socketpeer_label_associate_mbuf(m, so);
2941			/* XXXMAC: SOCK_UNLOCK(so); */
2942#endif
2943			/* Do window scaling on this connection? */
2944			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2945				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
2946				tp->snd_scale = tp->requested_s_scale;
2947				tp->rcv_scale = tp->request_r_scale;
2948			}
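			/*
			 * Advance rcv_adv by the current receive window, capped at the
			 * largest window representable with the negotiated scale.
			 */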
2949			tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
2950			tp->snd_una++;		/* SYN is acked */
2951			/*
2952			 * If there's data, delay ACK; if there's also a FIN
2953			 * ACKNOW will be turned on later.
2954			 */
2955			TCP_INC_VAR(tp->t_unacksegs, nlropkts);
2956			if (DELAY_ACK(tp, th) && tlen != 0 ) {
2957				if ((tp->t_flags & TF_DELACK) == 0) {
2958					tp->t_flags |= TF_DELACK;
2959					tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
2960				}
2961			}
2962			else {
2963				tp->t_flags |= TF_ACKNOW;
2964			}
2965			/*
2966			 * Received <SYN,ACK> in SYN_SENT[*] state.
2967			 * Transitions:
2968			 *	SYN_SENT  --> ESTABLISHED
2969			 *	SYN_SENT* --> FIN_WAIT_1
2970			 */
2971			tp->t_starttime = tcp_now;
2972			tcp_sbrcv_tstmp_check(tp);
2973			if (tp->t_flags & TF_NEEDFIN) {
2974				DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2975					struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
2976				tp->t_state = TCPS_FIN_WAIT_1;
2977				tp->t_flags &= ~TF_NEEDFIN;
2978				thflags &= ~TH_SYN;
2979			} else {
2980				DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2981					struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
2982				tp->t_state = TCPS_ESTABLISHED;
2983				tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2984					TCP_CONN_KEEPIDLE(tp));
2985				if (nstat_collect)
2986					nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt);
2987			}
2988#if MPTCP
2989			/*
2990			 * Do not send the connect notification for additional
2991			 * subflows until ACK for 3-way handshake arrives.
2992			 */
2993			if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
2994			    (tp->t_mpflags & TMPF_SENT_JOIN)) {
2995				isconnected = FALSE;
2996				/* Start data xmit if fastjoin */
2997				if (mptcp_fastjoin && (so->so_flags & SOF_MPTCP_FASTJOIN)) {
2998					soevent(so, (SO_FILT_HINT_LOCKED |
2999					    SO_FILT_HINT_MPFASTJ));
3000				}
3001			} else
3002#endif /* MPTCP */
3003				isconnected = TRUE;
3004		} else {
3005			/*
3006			 *  Received initial SYN in SYN-SENT[*] state => simul-
3007		  	 *  taneous open.  If segment contains CC option and there is
3008			 *  a cached CC, apply TAO test; if it succeeds, connection is
3009			 *  half-synchronized.  Otherwise, do 3-way handshake:
3010			 *        SYN-SENT -> SYN-RECEIVED
3011			 *        SYN-SENT* -> SYN-RECEIVED*
3012			 */
3013			tp->t_flags |= TF_ACKNOW;
3014			tp->t_timer[TCPT_REXMT] = 0;
3015			DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3016				struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3017			tp->t_state = TCPS_SYN_RECEIVED;
3018
3019		}
3020
3021trimthenstep6:
3022		/*
3023		 * Advance th->th_seq to correspond to first data byte.
3024		 * If data, trim to stay within window,
3025		 * dropping FIN if necessary.
3026		 */
3027		th->th_seq++;
3028		if (tlen > tp->rcv_wnd) {
3029			todrop = tlen - tp->rcv_wnd;
3030			m_adj(m, -todrop);
3031			tlen = tp->rcv_wnd;
3032			thflags &= ~TH_FIN;
3033			tcpstat.tcps_rcvpackafterwin++;
3034			tcpstat.tcps_rcvbyteafterwin += todrop;
3035		}
3036		tp->snd_wl1 = th->th_seq - 1;
3037		tp->rcv_up = th->th_seq;
3038		/*
3039		 *  Client side of transaction: already sent SYN and data.
3040		 *  If the remote host used T/TCP to validate the SYN,
3041		 *  our data will be ACK'd; if so, enter normal data segment
3042		 *  processing in the middle of step 5, ack processing.
3043		 *  Otherwise, goto step 6.
3044		 */
3045 		if (thflags & TH_ACK)
3046			goto process_ACK;
3047		goto step6;
3048	/*
3049	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
3050	 *      do normal processing.
3051	 *
3052	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
3053	 */
3054	case TCPS_LAST_ACK:
3055	case TCPS_CLOSING:
3056	case TCPS_TIME_WAIT:
3057 		break;  /* continue normal processing */
3058
3059	/* Received a SYN while the connection is already established.
3060	 * This is the "half-open connections and other anomalies" case described
3061	 * in RFC 793, page 34: send an ACK so the remote end resets the connection
3062	 * or recovers by adjusting its sequence numbering.
3063	 */
3064	case TCPS_ESTABLISHED:
3065		if (thflags & TH_SYN)
3066			goto dropafterack;
3067		break;
3068	}
3069
3070	/*
3071	 * States other than LISTEN or SYN_SENT.
3072	 * First check the RST flag and sequence number since reset segments
3073	 * are exempt from the timestamp and connection count tests.  This
3074	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
3075	 * below which allowed reset segments in half the sequence space
3076	 * to fall through and be processed (which gives forged reset
3077	 * segments with a random sequence number a 50 percent chance of
3078	 * killing a connection).
3079	 * Then check timestamp, if present.
3080	 * Then check the connection count, if present.
3081	 * Then check that at least some bytes of segment are within
3082	 * receive window.  If segment begins before rcv_nxt,
3083	 * drop leading data (and SYN); if nothing left, just ack.
3084	 *
3085	 *
3086	 * If the RST bit is set, check the sequence number to see
3087	 * if this is a valid reset segment.
3088	 * RFC 793 page 37:
3089	 *   In all states except SYN-SENT, all reset (RST) segments
3090	 *   are validated by checking their SEQ-fields.  A reset is
3091	 *   valid if its sequence number is in the window.
3092	 * Note: this does not take into account delayed ACKs, so
3093	 *   we should test against last_ack_sent instead of rcv_nxt.
3094	 *   The sequence number in the reset segment is normally an
3095	 *   echo of our outgoing acknowledgement numbers, but some hosts
3096	 *   send a reset with the sequence number at the rightmost edge
3097	 *   of our receive window, and we have to handle this case.
3098	 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
3099	 *   that brute force RST attacks are possible.  To combat this,
3100	 *   we use a much stricter check while in the ESTABLISHED state,
3101	 *   only accepting RSTs where the sequence number is equal to
3102	 *   last_ack_sent.  In all other states (the states in which a
3103	 *   RST is more likely), the more permissive check is used.
3104	 * If we have multiple segments in flight, the initial reset
3105	 * segment sequence numbers will be to the left of last_ack_sent,
3106	 * but they will eventually catch up.
3107	 * In any case, it never made sense to trim reset segments to
3108	 * fit the receive window since RFC 1122 says:
3109	 *   4.2.2.12  RST Segment: RFC-793 Section 3.4
3110	 *
3111	 *    A TCP SHOULD allow a received RST segment to include data.
3112	 *
3113	 *    DISCUSSION
3114	 *         It has been suggested that a RST segment could contain
3115	 *         ASCII text that encoded and explained the cause of the
3116	 *         RST.  No standard has yet been established for such
3117	 *         data.
3118	 *
3119	 * If the reset segment passes the sequence number test examine
3120	 * the state:
3121	 *    SYN_RECEIVED STATE:
3122	 *	If passive open, return to LISTEN state.
3123	 *	If active open, inform user that connection was refused.
3124	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
3125	 *	Inform user that connection was reset, and close tcb.
3126	 *    CLOSING, LAST_ACK STATES:
3127	 *	Close the tcb.
3128	 *    TIME_WAIT STATE:
3129	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
3130	 *      RFC 1337.
3131	 *
3132	 *      Radar 4803931: Allows for the case where we ACKed the FIN but
3133	 *                     there is already a RST in flight from the peer.
3134	 *                     In that case, accept the RST for non-established
3135	 *                     state if it's one off from last_ack_sent.
3136	 *
3137	 */
3138	if (thflags & TH_RST) {
3139		if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
3140		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
3141		    (tp->rcv_wnd == 0 &&
3142		    ((tp->last_ack_sent == th->th_seq) ||
3143		    ((tp->last_ack_sent -1) == th->th_seq)))) {
3144			switch (tp->t_state) {
3145
3146			case TCPS_SYN_RECEIVED:
3147				IF_TCP_STATINC(ifp, rstinsynrcv);
3148				so->so_error = ECONNREFUSED;
3149				goto close;
3150
3151			case TCPS_ESTABLISHED:
3152				if (tp->last_ack_sent != th->th_seq) {
3153					tcpstat.tcps_badrst++;
3154					goto drop;
3155				}
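			/* FALLTHROUGH */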
3156			case TCPS_FIN_WAIT_1:
3157			case TCPS_CLOSE_WAIT:
3158				/*
3159				  Drop through ...
3160				*/
3161			case TCPS_FIN_WAIT_2:
3162				so->so_error = ECONNRESET;
3163			close:
3164				postevent(so, 0, EV_RESET);
3165				soevent(so,
3166				    (SO_FILT_HINT_LOCKED |
3167				    SO_FILT_HINT_CONNRESET));
3168
3169				tcpstat.tcps_drops++;
3170				tp = tcp_close(tp);
3171				break;
3172
3173			case TCPS_CLOSING:
3174			case TCPS_LAST_ACK:
3175				tp = tcp_close(tp);
3176				break;
3177
3178			case TCPS_TIME_WAIT:
3179				break;
3180			}
3181		}
3182		goto drop;
3183	}
3184
3185	/*
3186	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
3187	 * and it's less than ts_recent, drop it.
3188	 */
3189	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
3190	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
3191
3192		/* Check to see if ts_recent is over 24 days old.  */
3193		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
3194			/*
3195			 * Invalidate ts_recent.  If this segment updates
3196			 * ts_recent, the age will be reset later and ts_recent
3197			 * will get a valid value.  If it does not, setting
3198			 * ts_recent to zero will at least satisfy the
3199			 * requirement that zero be placed in the timestamp
3200			 * echo reply when ts_recent isn't valid.  The
3201			 * age isn't reset until we get a valid ts_recent
3202			 * because we don't want out-of-order segments to be
3203			 * dropped when ts_recent is old.
3204			 */
3205			tp->ts_recent = 0;
3206		} else {
3207			tcpstat.tcps_rcvduppack++;
3208			tcpstat.tcps_rcvdupbyte += tlen;
3209			tcpstat.tcps_pawsdrop++;
3210			if (nstat_collect) {
3211				nstat_route_rx(tp->t_inpcb->inp_route.ro_rt,
3212					1, tlen, NSTAT_RX_FLAG_DUPLICATE);
3213				INP_ADD_STAT(inp, cell, wifi, wired,
3214				    rxpackets, 1);
3215				INP_ADD_STAT(inp, cell, wifi, wired,
3216				    rxbytes, tlen);
3217				tp->t_stat.rxduplicatebytes += tlen;
3218			}
3219			if (tlen)
3220				goto dropafterack;
3221			goto drop;
3222		}
3223	}
3224
3225	/*
3226	 * In the SYN-RECEIVED state, validate that the packet belongs to
3227	 * this connection before trimming the data to fit the receive
3228	 * window.  Check the sequence number versus IRS since we know
3229	 * the sequence numbers haven't wrapped.  This is a partial fix
3230	 * for the "LAND" DoS attack.
3231	 */
3232	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
3233		rstreason = BANDLIM_RST_OPENPORT;
3234		IF_TCP_STATINC(ifp, dospacket);
3235		goto dropwithreset;
3236	}
3237
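	/*
	 * todrop is the number of bytes at the front of this segment that fall
	 * before rcv_nxt, i.e. data we have already received.
	 */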
3238	todrop = tp->rcv_nxt - th->th_seq;
3239	if (todrop > 0) {
3240		if (thflags & TH_SYN) {
3241			thflags &= ~TH_SYN;
3242			th->th_seq++;
3243			if (th->th_urp > 1)
3244				th->th_urp--;
3245			else
3246				thflags &= ~TH_URG;
3247			todrop--;
3248		}
3249		/*
3250		 * Following if statement from Stevens, vol. 2, p. 960.
3251		 */
3252		if (todrop > tlen
3253		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
3254			/*
3255			 * Any valid FIN must be to the left of the window.
3256			 * At this point the FIN must be a duplicate or out
3257			 * of sequence; drop it.
3258			 */
3259			thflags &= ~TH_FIN;
3260
3261			/*
3262			 * Send an ACK to resynchronize and drop any data.
3263			 * But keep on processing for RST or ACK.
3264			 */
3265			tp->t_flags |= TF_ACKNOW;
3266			if (todrop == 1) {
3267				/* This could be a keepalive */
3268				soevent(so, SO_FILT_HINT_LOCKED |
3269					SO_FILT_HINT_KEEPALIVE);
3270			}
3271			todrop = tlen;
3272			tcpstat.tcps_rcvduppack++;
3273			tcpstat.tcps_rcvdupbyte += todrop;
3274		} else {
3275			tcpstat.tcps_rcvpartduppack++;
3276			tcpstat.tcps_rcvpartdupbyte += todrop;
3277		}
3278		if (nstat_collect) {
3279			nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1,
3280				todrop, NSTAT_RX_FLAG_DUPLICATE);
3281			INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
3282			INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop);
3283			tp->t_stat.rxduplicatebytes += todrop;
3284		}
3285		drop_hdrlen += todrop;	/* drop from the top afterwards */
3286		th->th_seq += todrop;
3287		tlen -= todrop;
3288		if (th->th_urp > todrop)
3289			th->th_urp -= todrop;
3290		else {
3291			thflags &= ~TH_URG;
3292			th->th_urp = 0;
3293		}
3294	}
3295
3296	/*
3297	 * If new data are received on a connection after the user processes
3298	 * are gone, then RST the other end.  Note that an MPTCP subflow socket
3299	 * would have SS_NOFDREF set by default, so check to make sure that
3300	 * we test for SOF_MP_SUBFLOW socket flag (which would be cleared when
3301	 * the socket is closed).
3302	 */
3303	if (!(so->so_flags & SOF_MP_SUBFLOW) &&
3304	    (so->so_state & SS_NOFDREF) &&
3305	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
3306		tp = tcp_close(tp);
3307		tcpstat.tcps_rcvafterclose++;
3308		rstreason = BANDLIM_UNLIMITED;
3309		IF_TCP_STATINC(ifp, cleanup);
3310		goto dropwithreset;
3311	}
3312
3313	/*
3314	 * If segment ends after window, drop trailing data
3315	 * (and PUSH and FIN); if nothing left, just ACK.
3316	 */
3317	todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
3318	if (todrop > 0) {
3319		tcpstat.tcps_rcvpackafterwin++;
3320		if (todrop >= tlen) {
3321			tcpstat.tcps_rcvbyteafterwin += tlen;
3322			/*
3323			 * If a new connection request is received
3324			 * while in TIME_WAIT, drop the old connection
3325			 * and start over if the sequence numbers
3326			 * are above the previous ones.
3327			 */
3328			if (thflags & TH_SYN &&
3329			    tp->t_state == TCPS_TIME_WAIT &&
3330			    SEQ_GT(th->th_seq, tp->rcv_nxt)) {
3331				iss = tcp_new_isn(tp);
3332				tp = tcp_close(tp);
3333				tcp_unlock(so, 1, 0);
3334				goto findpcb;
3335			}
3336			/*
3337			 * If window is closed can only take segments at
3338			 * window edge, and have to drop data and PUSH from
3339			 * incoming segments.  Continue processing, but
3340			 * remember to ack.  Otherwise, drop segment
3341			 * and ack.
3342			 */
3343			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
3344				tp->t_flags |= TF_ACKNOW;
3345				tcpstat.tcps_rcvwinprobe++;
3346			} else
3347				goto dropafterack;
3348		} else
3349			tcpstat.tcps_rcvbyteafterwin += todrop;
3350		m_adj(m, -todrop);
3351		tlen -= todrop;
3352		thflags &= ~(TH_PUSH|TH_FIN);
3353	}
3354
3355	/*
3356	 * If last ACK falls within this segment's sequence numbers,
3357	 * record its timestamp.
3358	 * NOTE:
3359	 * 1) That the test incorporates suggestions from the latest
3360	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
3361	 * 2) That updating only on newer timestamps interferes with
3362	 *    our earlier PAWS tests, so this check should be solely
3363	 *    predicated on the sequence space of this segment.
3364	 * 3) That we modify the segment boundary check to be
3365	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
3366	 *    instead of RFC1323's
	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len.
3368	 *    This modified check allows us to overcome RFC1323's
3369	 *    limitations as described in Stevens TCP/IP Illustrated
3370	 *    Vol. 2 p.869. In such cases, we can still calculate the
3371	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
3372	 */
3373	if ((to.to_flags & TOF_TS) != 0 &&
3374	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
3375	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
3376		((thflags & (TH_SYN|TH_FIN)) != 0))) {
3377		tp->ts_recent_age = tcp_now;
3378		tp->ts_recent = to.to_tsval;
3379	}
3380
3381	/*
3382	 * If a SYN is in the window, then this is an
3383	 * error and we send an RST and drop the connection.
3384	 */
3385	if (thflags & TH_SYN) {
3386		tp = tcp_drop(tp, ECONNRESET);
3387		rstreason = BANDLIM_UNLIMITED;
3388		postevent(so, 0, EV_RESET);
3389		IF_TCP_STATINC(ifp, synwindow);
3390		goto dropwithreset;
3391	}
3392
3393	/*
3394	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
3395	 * flag is on (half-synchronized state), then queue data for
3396	 * later processing; else drop segment and return.
3397	 */
3398	if ((thflags & TH_ACK) == 0) {
3399		if (tp->t_state == TCPS_SYN_RECEIVED ||
3400		    (tp->t_flags & TF_NEEDSYN))
3401			goto step6;
3402		else if (tp->t_flags & TF_ACKNOW)
3403			goto dropafterack;
3404		else
3405			goto drop;
3406	}
3407
3408	/*
3409	 * Ack processing.
3410	 */
3411
3412	switch (tp->t_state) {
3413
3414	/*
3415	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
3416	 * ESTABLISHED state and continue processing.
3417	 * The ACK was checked above.
3418	 */
3419	case TCPS_SYN_RECEIVED:
3420
3421		tcpstat.tcps_connects++;
3422
3423		/* Do window scaling? */
3424		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3425			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
3426			tp->snd_scale = tp->requested_s_scale;
3427			tp->rcv_scale = tp->request_r_scale;
3428			tp->snd_wnd = th->th_win << tp->snd_scale;
3429			tiwin = tp->snd_wnd;
3430		}
3431		/*
3432		 * Make transitions:
3433		 *      SYN-RECEIVED  -> ESTABLISHED
3434		 *      SYN-RECEIVED* -> FIN-WAIT-1
3435		 */
3436		tp->t_starttime = tcp_now;
3437		tcp_sbrcv_tstmp_check(tp);
3438		if (tp->t_flags & TF_NEEDFIN) {
3439			DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3440				struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
3441			tp->t_state = TCPS_FIN_WAIT_1;
3442			tp->t_flags &= ~TF_NEEDFIN;
3443		} else {
3444			DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3445				struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
3446			tp->t_state = TCPS_ESTABLISHED;
3447			tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3448				TCP_CONN_KEEPIDLE(tp));
3449			if (nstat_collect)
3450				nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt);
3451		}
3452		/*
3453		 * If segment contains data or ACK, will call tcp_reass()
3454		 * later; if not, do so now to pass queued data to user.
3455		 */
3456		if (tlen == 0 && (thflags & TH_FIN) == 0)
3457			(void) tcp_reass(tp, (struct tcphdr *)0, &tlen,
3458			    NULL, ifp);
3459		tp->snd_wl1 = th->th_seq - 1;
3460
3461		/* FALLTHROUGH */
3462#if MPTCP
3463		/*
3464		 * Do not send the connect notification for additional subflows
3465		 * until ACK for 3-way handshake arrives.
3466		 */
3467		if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3468		    (tp->t_mpflags & TMPF_SENT_JOIN)) {
3469			isconnected = FALSE;
3470		} else
3471#endif /* MPTCP */
3472			isconnected = TRUE;
3473
3474	/*
3475	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
3476	 * ACKs.  If the ack is in the range
3477	 *	tp->snd_una < th->th_ack <= tp->snd_max
3478	 * then advance tp->snd_una to th->th_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up-to-date window information, we update our window information.
3481	 */
3482	case TCPS_ESTABLISHED:
3483	case TCPS_FIN_WAIT_1:
3484	case TCPS_FIN_WAIT_2:
3485	case TCPS_CLOSE_WAIT:
3486	case TCPS_CLOSING:
3487	case TCPS_LAST_ACK:
3488	case TCPS_TIME_WAIT:
3489		if (SEQ_GT(th->th_ack, tp->snd_max)) {
3490			tcpstat.tcps_rcvacktoomuch++;
3491			goto dropafterack;
3492		}
3493		if (SACK_ENABLED(tp) &&
3494		    (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes)))
3495			tcp_sack_doack(tp, &to, th, &sack_bytes_acked);
3496
3497#if MPTCP
3498		if ((tp->t_mpuna) && (SEQ_GEQ(th->th_ack, tp->t_mpuna))) {
3499			if (tp->t_mpflags & TMPF_PREESTABLISHED) {
3500				/* MP TCP establishment succeeded */
3501				tp->t_mpuna = 0;
3502				if (tp->t_mpflags & TMPF_JOINED_FLOW) {
3503					if (tp->t_mpflags & TMPF_SENT_JOIN) {
3504						tp->t_mpflags &=
3505						    ~TMPF_PREESTABLISHED;
3506						tp->t_mpflags |=
3507						    TMPF_MPTCP_TRUE;
3508						so->so_flags |= SOF_MPTCP_TRUE;
3509						if (mptcp_dbg >= MP_ERR_DEBUG)
3510							printf("MPTCP SUCCESS"
3511							    " %s \n",__func__);
3512						tp->t_timer[TCPT_JACK_RXMT] = 0;
3513						tp->t_mprxtshift = 0;
3514						isconnected = TRUE;
3515					} else {
3516						isconnected = FALSE;
3517					}
3518				} else {
3519					isconnected = TRUE;
3520					tp->t_mpflags &= ~TMPF_SENT_KEYS;
3521				}
3522			}
3523		}
3524#endif /* MPTCP */
3525		/*
3526		 * If we have outstanding data (other than
3527		 * a window probe), this is a completely
3528		 * duplicate ack (ie, window info didn't
3529		 * change) and the ack is the biggest we've seen.
3530		 */
3531		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
3532			if (tlen == 0 && tiwin == tp->snd_wnd) {
3533				/*
3534				 * If both ends send FIN at the same time,
3535				 * then the ack will be a duplicate ack
3536				 * but we have to process the FIN. Check
3537				 * for this condition and process the FIN
3538				 * instead of the dupack
3539				 */
3540				if ((thflags & TH_FIN) &&
3541					(tp->t_flags & TF_SENTFIN) &&
3542					!TCPS_HAVERCVDFIN(tp->t_state) &&
3543					(th->th_ack + 1) == tp->snd_max) {
3544					break;
3545				}
3546process_dupack:
3547#if MPTCP
3548				/*
3549				 * MPTCP options that are ignored must
3550				 * not be treated as duplicate ACKs.
3551				 */
3552				if (to.to_flags & TOF_MPTCP) {
3553					goto drop;
3554				}
3555
3556				if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) {
3557					if (mptcp_dbg >= MP_ERR_DEBUG)
3558						printf("%s:  bypass ack recovery\n",__func__);
3559					break;
3560				}
3561#endif /* MPTCP */
3562				/*
3563				 * If a duplicate acknowledgement was seen
3564				 * after ECN, it indicates packet loss in
3565				 * addition to ECN. Reset INRECOVERY flag
3566				 * so that we can process partial acks
3567				 * correctly
3568				 */
3569				if (tp->ecn_flags & TE_INRECOVERY)
3570					tp->ecn_flags &= ~TE_INRECOVERY;
3571
3572				tcpstat.tcps_rcvdupack++;
3573				++tp->t_dupacks;
3574
3575				/*
3576				 * Check if we need to reset the limit on
3577				 * early retransmit
3578				 */
3579				if (tp->t_early_rexmt_count > 0 &&
3580				    TSTMP_GEQ(tcp_now,
3581				    (tp->t_early_rexmt_win +
3582				    TCP_EARLY_REXMT_WIN)))
3583					tp->t_early_rexmt_count = 0;
3584
3585				/*
3586				 * Is early retransmit needed? We check for
3587				 * this when the connection is waiting for
3588				 * duplicate acks to enter fast recovery.
3589				 */
3590				if (!IN_FASTRECOVERY(tp))
3591					tcp_early_rexmt_check(tp, th);
3592
3593				/*
3594				 * If we've seen exactly rexmt threshold
3595				 * of duplicate acks, assume a packet
3596				 * has been dropped and retransmit it.
3597				 * Kludge snd_nxt & the congestion
3598				 * window so we send only this one
3599				 * packet.
3600				 *
3601				 * We know we're losing at the current
3602				 * window size so do congestion avoidance
3603				 * (set ssthresh to half the current window
3604				 * and pull our congestion window back to
3605				 * the new ssthresh).
3606				 *
3607				 * Dup acks mean that packets have left the
3608				 * network (they're now cached at the receiver)
3609				 * so bump cwnd by the amount in the receiver
3610				 * to keep a constant cwnd packets in the
3611				 * network.
3612				 */
3613				if (tp->t_timer[TCPT_REXMT] == 0 ||
3614				    (th->th_ack != tp->snd_una
3615				    && sack_bytes_acked == 0)) {
3616					tp->t_dupacks = 0;
3617					tp->t_rexmtthresh = tcprexmtthresh;
3618				} else if (tp->t_dupacks > tp->t_rexmtthresh ||
3619					IN_FASTRECOVERY(tp)) {
3620
3621					/*
3622					 * If this connection was seeing packet
3623					 * reordering, then recovery might be
3624					 * delayed to disambiguate between
3625					 * reordering and loss
3626					 */
3627					if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
3628					    (tp->t_flagsext &
3629					    (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) ==
3630					    (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
3631						/*
3632						 * Since the SACK information is already
3633						 * updated, this ACK will be dropped
3634						 */
3635						break;
3636					}
3637
3638					if (SACK_ENABLED(tp)
3639					    && IN_FASTRECOVERY(tp)) {
3640						int awnd;
3641
3642						/*
3643						 * Compute the amount of data in flight first.
3644						 * We can inject new data into the pipe iff
3645						 * we have less than 1/2 the original window's
3646						 * worth of data in flight.
3647						 */
3648						awnd = (tp->snd_nxt - tp->snd_fack) +
3649							tp->sackhint.sack_bytes_rexmit;
3650						if (awnd < tp->snd_ssthresh) {
3651							tp->snd_cwnd += tp->t_maxseg;
3652							if (tp->snd_cwnd > tp->snd_ssthresh)
3653								tp->snd_cwnd = tp->snd_ssthresh;
3654						}
3655					} else
3656						tp->snd_cwnd += tp->t_maxseg;
3657
3658					tcp_ccdbg_trace(tp, th, TCP_CC_IN_FASTRECOVERY);
3659
3660					(void) tcp_output(tp);
3661					goto drop;
3662				} else if (tp->t_dupacks == tp->t_rexmtthresh) {
3663					tcp_seq onxt = tp->snd_nxt;
3664
3665					/*
3666					 * If we're doing sack, check to
3667					 * see if we're already in sack
3668					 * recovery. If we're not doing sack,
3669					 * check to see if we're in newreno
3670					 * recovery.
3671					 */
3672					if (SACK_ENABLED(tp)) {
3673						if (IN_FASTRECOVERY(tp)) {
3674							tp->t_dupacks = 0;
3675							break;
3676						} else if (tp->t_flagsext & TF_DELAY_RECOVERY) {
3677							break;
3678						}
3679					} else {
3680						if (SEQ_LEQ(th->th_ack,
3681						    tp->snd_recover)) {
3682							tp->t_dupacks = 0;
3683							break;
3684						}
3685					}
3686
3687					tp->snd_recover = tp->snd_max;
3688					tp->t_timer[TCPT_PTO] = 0;
3689					tp->t_rtttime = 0;
3690
3691					/*
3692					 * If the connection has seen pkt
3693					 * reordering, delay recovery until
3694					 * it is clear that the packet
3695					 * was lost.
3696					 */
3697					if (SACK_ENABLED(tp) &&
3698					    (tp->t_flagsext &
3699					    (TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
3700					    == TF_PKTS_REORDERED &&
3701					    !IN_FASTRECOVERY(tp) &&
3702					    tp->t_reorderwin > 0 &&
3703					    tp->t_state == TCPS_ESTABLISHED) {
3704						tp->t_timer[TCPT_DELAYFR] =
3705						    OFFSET_FROM_START(tp,
3706						    tp->t_reorderwin);
3707						tp->t_flagsext |= TF_DELAY_RECOVERY;
3708						tcpstat.tcps_delay_recovery++;
3709						tcp_ccdbg_trace(tp, th,
3710						    TCP_CC_DELAY_FASTRECOVERY);
3711						break;
3712					}
3713
3714					/*
3715					 * If the current tcp cc module has
3716					 * defined a hook for tasks to run
3717					 * before entering FR, call it
3718					 */
3719					if (CC_ALGO(tp)->pre_fr != NULL)
3720						CC_ALGO(tp)->pre_fr(tp);
3721					ENTER_FASTRECOVERY(tp);
3722					tp->t_timer[TCPT_REXMT] = 0;
3723					if ((tp->ecn_flags & TE_ECN_ON)
3724					    == TE_ECN_ON)
3725						tp->ecn_flags |= TE_SENDCWR;
3726
3727					if (SACK_ENABLED(tp)) {
3728						tcpstat.tcps_sack_recovery_episode++;
3729						tp->sack_newdata = tp->snd_nxt;
3730						tp->snd_cwnd = tp->t_maxseg;
3731
3732						/*
3733						 * Enable probe timeout to detect
3734						 * a tail loss in the recovery
3735						 * window.
3736						 */
3737						tp->t_timer[TCPT_PTO] =
3738						    OFFSET_FROM_START(tp,
3739						    max(10, (tp->t_srtt >> TCP_RTT_SHIFT)));
3740
3741						tcp_ccdbg_trace(tp, th,
3742						    TCP_CC_ENTER_FASTRECOVERY);
3743
3744						(void) tcp_output(tp);
3745						goto drop;
3746					}
3747					tp->snd_nxt = th->th_ack;
3748					tp->snd_cwnd = tp->t_maxseg;
3749					(void) tcp_output(tp);
3750					tp->snd_cwnd = tp->snd_ssthresh +
3751					     tp->t_maxseg * tp->t_dupacks;
3752					if (SEQ_GT(onxt, tp->snd_nxt))
3753						tp->snd_nxt = onxt;
3754					tcp_ccdbg_trace(tp, th,
3755					    TCP_CC_ENTER_FASTRECOVERY);
3756					goto drop;
3757				} else if (limited_txmt &&
3758					ALLOW_LIMITED_TRANSMIT(tp) &&
3759					(!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) &&
3760					(so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) {
3761					u_int32_t incr = (tp->t_maxseg * tp->t_dupacks);
3762
3763					/* Use Limited Transmit algorithm on the first two
3764					 * duplicate acks when there is new data to transmit
3765					 */
3766					tp->snd_cwnd += incr;
3767					tcpstat.tcps_limited_txt++;
3768					(void) tcp_output(tp);
3769
3770					tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT);
3771
3772					/* Reset snd_cwnd back to normal */
3773					tp->snd_cwnd -= incr;
3774				}
3775			} else {
3776				tp->t_dupacks = 0;
3777				tp->t_rexmtthresh = tcprexmtthresh;
3778			}
3779			break;
3780		}
3781		/*
3782		 * If the congestion window was inflated to account
3783		 * for the other side's cached packets, retract it.
3784		 */
3785		if (IN_FASTRECOVERY(tp)) {
3786			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
3787				/*
3788				 * If we received an ECE and entered
3789				 * recovery, the subsequent ACKs should
3790				 * not be treated as partial acks.
3791				 */
3792				if (tp->ecn_flags & TE_INRECOVERY)
3793					goto process_ACK;
3794
3795				if (SACK_ENABLED(tp))
3796					tcp_sack_partialack(tp, th);
3797				else
3798					tcp_newreno_partial_ack(tp, th);
3799				tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK);
3800			} else {
3801				EXIT_FASTRECOVERY(tp);
3802				if (CC_ALGO(tp)->post_fr != NULL)
3803					CC_ALGO(tp)->post_fr(tp, th);
3804
3805				tcp_ccdbg_trace(tp, th,
3806				    TCP_CC_EXIT_FASTRECOVERY);
3807			}
3808		} else if ((tp->t_flagsext &
3809			(TF_PKTS_REORDERED|TF_DELAY_RECOVERY))
3810			== (TF_PKTS_REORDERED|TF_DELAY_RECOVERY)) {
3811			/*
3812			 * If the ack acknowledges upto snd_recover or if
3813			 * it acknowledges all the snd holes, exit
3814			 * recovery and cancel the timer. Otherwise,
3815			 * this is a partial ack. Wait for recovery timer
3816			 * to enter recovery. The snd_holes have already
3817			 * been updated.
3818			 */
3819			if (SEQ_GEQ(th->th_ack, tp->snd_recover) ||
3820			    TAILQ_EMPTY(&tp->snd_holes)) {
3821				tp->t_timer[TCPT_DELAYFR] = 0;
3822				tp->t_flagsext &= ~TF_DELAY_RECOVERY;
3823				EXIT_FASTRECOVERY(tp);
3824				tcp_ccdbg_trace(tp, th,
3825				    TCP_CC_EXIT_FASTRECOVERY);
3826			}
3827		} else {
3828			/*
3829			 * We were not in fast recovery. Reset the
3830			 * duplicate ack counter.
3831			 */
3832			tp->t_dupacks = 0;
3833			tp->t_rexmtthresh = tcprexmtthresh;
3834		}
3835
3836
3837		/*
3838		 * If we reach this point, ACK is not a duplicate,
3839		 *     i.e., it ACKs something we sent.
3840		 */
3841		if (tp->t_flags & TF_NEEDSYN) {
3842			/*
3843			 * T/TCP: Connection was half-synchronized, and our
3844			 * SYN has been ACK'd (so connection is now fully
3845			 * synchronized).  Go to non-starred state,
3846			 * increment snd_una for ACK of SYN, and check if
3847			 * we can do window scaling.
3848			 */
3849			tp->t_flags &= ~TF_NEEDSYN;
3850			tp->snd_una++;
3851			/* Do window scaling? */
3852			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3853				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
3854				tp->snd_scale = tp->requested_s_scale;
3855				tp->rcv_scale = tp->request_r_scale;
3856			}
3857		}
3858
3859process_ACK:
3860		acked = BYTES_ACKED(th, tp);
3861		tcpstat.tcps_rcvackpack++;
3862		tcpstat.tcps_rcvackbyte += acked;
3863
3864		/*
3865		 * If the last packet was a retransmit, make sure
3866		 * it was not spurious.
3867		 *
3868		 * This will also take care of congestion window
3869		 * adjustment if a last packet was recovered due to a
3870		 * tail loss probe.
3871		 */
3872		tcp_bad_rexmt_check(tp, th, &to);
3873
3874		/* Recalculate the RTT */
3875		tcp_compute_rtt(tp, &to, th);
3876
3877		/*
3878		 * If all outstanding data is acked, stop retransmit
3879		 * timer and remember to restart (more output or persist).
3880		 * If there is more data to be acked, restart retransmit
3881		 * timer, using current (possibly backed-off) value.
3882		 */
3883		if (th->th_ack == tp->snd_max) {
3884			tp->t_timer[TCPT_REXMT] = 0;
3885			tp->t_timer[TCPT_PTO] = 0;
3886			needoutput = 1;
3887		} else if (tp->t_timer[TCPT_PERSIST] == 0)
3888			tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
3889			    tp->t_rxtcur);
3890
3891		/*
3892		 * If no data (only SYN) was ACK'd, skip rest of ACK
3893		 * processing.
3894		 */
3895		if (acked == 0)
3896			goto step6;
3897
3898
3899		if ((thflags & TH_ECE) != 0 &&
3900			((tp->ecn_flags & TE_ECN_ON) == TE_ECN_ON)) {
3901			/*
3902			 * Reduce the congestion window if we haven't
3903			 * done so.
3904			 */
3905			if (!IN_FASTRECOVERY(tp)) {
3906				tcp_reduce_congestion_window(tp);
3907				tp->ecn_flags |= (TE_INRECOVERY|TE_SENDCWR);
3908				tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
3909			}
3910		}
3911
3912		/*
3913		 * When new data is acked, open the congestion window.
3914		 * The specifics of how this is achieved are up to the
3915		 * congestion control algorithm in use for this connection.
3916		 *
3917		 * The calculations in this function assume that snd_una is
3918		 * not updated yet.
3919		 */
3920		if (!IN_FASTRECOVERY(tp)) {
3921			if (CC_ALGO(tp)->ack_rcvd != NULL)
3922				CC_ALGO(tp)->ack_rcvd(tp, th);
3923			tcp_ccdbg_trace(tp, th, TCP_CC_ACK_RCVD);
3924		}
3925		if (acked > so->so_snd.sb_cc) {
3926			tp->snd_wnd -= so->so_snd.sb_cc;
3927			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
3928			if (so->so_flags & SOF_ENABLE_MSGS) {
3929				so->so_msg_state->msg_serial_bytes -=
3930					(int)so->so_snd.sb_cc;
3931			}
3932			ourfinisacked = 1;
3933		} else {
3934			sbdrop(&so->so_snd, acked);
3935			if (so->so_flags & SOF_ENABLE_MSGS) {
3936				so->so_msg_state->msg_serial_bytes -=
3937					acked;
3938			}
3939			tcp_sbsnd_trim(&so->so_snd);
3940			tp->snd_wnd -= acked;
3941			ourfinisacked = 0;
3942		}
3943		/* detect una wraparound */
3944		if ( !IN_FASTRECOVERY(tp) &&
3945		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
3946		    SEQ_LEQ(th->th_ack, tp->snd_recover))
3947			tp->snd_recover = th->th_ack - 1;
3948
3949		if (IN_FASTRECOVERY(tp) &&
3950		    SEQ_GEQ(th->th_ack, tp->snd_recover))
3951			EXIT_FASTRECOVERY(tp);
3952
3953		tp->snd_una = th->th_ack;
3954		if (SACK_ENABLED(tp)) {
3955			if (SEQ_GT(tp->snd_una, tp->snd_recover))
3956				tp->snd_recover = tp->snd_una;
3957		}
3958		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
3959			tp->snd_nxt = tp->snd_una;
3960		if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
3961			tp->t_bwmeas != NULL)
3962			tcp_bwmeas_check(tp);
3963
3964		/*
3965		 * sowwakeup must happen after snd_una, et al. are updated so that
3966		 * the sequence numbers are in sync with so_snd
3967		 */
3968		sowwakeup(so);
3969
3970		switch (tp->t_state) {
3971
3972		/*
3973		 * In FIN_WAIT_1 STATE in addition to the processing
3974		 * for the ESTABLISHED state if our FIN is now acknowledged
3975		 * then enter FIN_WAIT_2.
3976		 */
3977		case TCPS_FIN_WAIT_1:
3978			if (ourfinisacked) {
3979				/*
3980				 * If we can't receive any more
3981				 * data, then closing user can proceed.
3982				 * Starting the TCPT_2MSL timer is contrary to the
3983				 * specification, but if we don't get a FIN
3984				 * we'll hang forever.
3985				 */
3986				if (so->so_state & SS_CANTRCVMORE) {
3987					tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
3988						TCP_CONN_MAXIDLE(tp));
3989					isconnected = FALSE;
3990					isdisconnected = TRUE;
3991				}
3992				DTRACE_TCP4(state__change, void, NULL,
3993					struct inpcb *, inp,
3994					struct tcpcb *, tp,
3995					int32_t, TCPS_FIN_WAIT_2);
3996				tp->t_state = TCPS_FIN_WAIT_2;
3997				/* fall through and make sure we also recognize
3998				 * data ACKed with the FIN
3999				 */
4000			}
4001			tp->t_flags |= TF_ACKNOW;
4002			break;
4003
4004	 	/*
4005		 * In CLOSING STATE in addition to the processing for
4006		 * the ESTABLISHED state if the ACK acknowledges our FIN
4007		 * then enter the TIME-WAIT state, otherwise ignore
4008		 * the segment.
4009		 */
4010		case TCPS_CLOSING:
4011			if (ourfinisacked) {
4012				DTRACE_TCP4(state__change, void, NULL,
4013					struct inpcb *, inp,
4014					struct tcpcb *, tp,
4015					int32_t, TCPS_TIME_WAIT);
4016				tp->t_state = TCPS_TIME_WAIT;
4017				tcp_canceltimers(tp);
4018				if (tp->t_flagsext & TF_NOTIMEWAIT) {
4019					tp->t_flags |= TF_CLOSING;
4020				} else {
4021					add_to_time_wait(tp, 2 * tcp_msl);
4022				}
4023				isconnected = FALSE;
4024				isdisconnected = TRUE;
4025			}
4026			tp->t_flags |= TF_ACKNOW;
4027			break;
4028
4029		/*
4030		 * In LAST_ACK, we may still be waiting for data to drain
4031		 * and/or to be acked, as well as for the ack of our FIN.
4032		 * If our FIN is now acknowledged, delete the TCB,
4033		 * enter the closed state and return.
4034		 */
4035		case TCPS_LAST_ACK:
4036			if (ourfinisacked) {
4037				tp = tcp_close(tp);
4038				goto drop;
4039			}
4040			break;
4041
4042		/*
4043		 * In TIME_WAIT state the only thing that should arrive
4044		 * is a retransmission of the remote FIN.  Acknowledge
4045		 * it and restart the finack timer.
4046		 */
4047		case TCPS_TIME_WAIT:
4048			add_to_time_wait(tp, 2 * tcp_msl);
4049			goto dropafterack;
4050		}
4051
4052		/*
4053		 * If there is a SACK option on the ACK and we
4054		 * haven't seen any duplicate acks before, count
4055		 * it as a duplicate ack even if the cumulative
4056		 * ack is advanced. If the receiver delayed an
4057		 * ack and detected loss afterwards, then the ack
4058		 * will advance cumulative ack and will also have
4059		 * a SACK option. So counting it as one duplicate
4060		 * ack is ok.
4061		 */
4062		if (sack_ackadv == 1 &&
4063		    tp->t_state == TCPS_ESTABLISHED &&
4064		    SACK_ENABLED(tp) && sack_bytes_acked > 0 &&
4065		    to.to_nsacks > 0 && tp->t_dupacks == 0 &&
4066		    SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 &&
4067		    !(tp->t_flagsext & TF_PKTS_REORDERED)) {
4068			tcpstat.tcps_sack_ackadv++;
4069			goto process_dupack;
4070		}
4071	}
4072
4073step6:
4074	/*
4075	 * Update window information.
4076	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
4077	 */
4078	if ((thflags & TH_ACK) &&
4079	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
4080	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
4081	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
4082		/* keep track of pure window updates */
4083		if (tlen == 0 &&
4084		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
4085			tcpstat.tcps_rcvwinupd++;
4086		tp->snd_wnd = tiwin;
4087		tp->snd_wl1 = th->th_seq;
4088		tp->snd_wl2 = th->th_ack;
4089		if (tp->snd_wnd > tp->max_sndwnd)
4090			tp->max_sndwnd = tp->snd_wnd;
4091		needoutput = 1;
4092	}
4093
4094	/*
4095	 * Process segments with URG.
4096	 */
4097	if ((thflags & TH_URG) && th->th_urp &&
4098	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4099		/*
4100		 * This is a kludge, but if we receive and accept
4101		 * random urgent pointers, we'll crash in
4102		 * soreceive.  It's hard to imagine someone
4103		 * actually wanting to send this much urgent data.
4104		 */
4105		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
4106			th->th_urp = 0;			/* XXX */
4107			thflags &= ~TH_URG;		/* XXX */
4108			goto dodata;			/* XXX */
4109		}
4110		/*
4111		 * If this segment advances the known urgent pointer,
4112		 * then mark the data stream.  This should not happen
4113		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
4114		 * a FIN has been received from the remote side.
4115		 * In these states we ignore the URG.
4116		 *
4117		 * According to RFC961 (Assigned Protocols),
4118		 * the urgent pointer points to the last octet
4119		 * of urgent data.  We continue, however,
4120		 * to consider it to indicate the first octet
4121		 * of data past the urgent section as the original
4122		 * spec states (in one of two places).
4123		 */
4124		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
4125			tp->rcv_up = th->th_seq + th->th_urp;
4126			so->so_oobmark = so->so_rcv.sb_cc +
4127			    (tp->rcv_up - tp->rcv_nxt) - 1;
4128			if (so->so_oobmark == 0) {
4129				so->so_state |= SS_RCVATMARK;
4130				postevent(so, 0, EV_OOB);
4131			}
4132			sohasoutofband(so);
4133			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4134		}
4135		/*
4136		 * Remove out of band data so doesn't get presented to user.
4137		 * This can happen independent of advancing the URG pointer,
4138		 * but if two URG's are pending at once, some out-of-band
4139		 * data may creep in... ick.
4140		 */
4141		if (th->th_urp <= (u_int32_t)tlen
4142#if SO_OOBINLINE
4143		     && (so->so_options & SO_OOBINLINE) == 0
4144#endif
4145		     )
4146			tcp_pulloutofband(so, th, m,
4147				drop_hdrlen);	/* hdr drop is delayed */
4148	} else {
4149		/*
4150		 * If no out of band data is expected,
4151		 * pull receive urgent pointer along
4152		 * with the receive window.
4153		 */
4154		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4155			tp->rcv_up = tp->rcv_nxt;
4156	}
4157dodata:
4158
	/* Set the socket's connect or disconnect state correctly before
	 * processing data.  The following might unlock the socket if there
	 * is an upcall or a socket filter.
	 */
4163	if (isconnected) {
4164		soisconnected(so);
4165	} else if (isdisconnected) {
4166		soisdisconnected(so);
4167	}
4168
	/* Check the state of the pcb to make sure that it did not get
	 * closed while the socket was unlocked above
	 */
4172	if (inp->inp_state == INPCB_STATE_DEAD) {
4173		/* Just drop the packet that we are processing and return */
4174		goto drop;
4175	}
4176
4177	/*
4178	 * Process the segment text, merging it into the TCP sequencing queue,
4179	 * and arranging for acknowledgment of receipt if necessary.
4180	 * This process logically involves adjusting tp->rcv_wnd as data
4181	 * is presented to the user (this happens in tcp_usrreq.c,
4182	 * case PRU_RCVD).  If a FIN has already been received on this
4183	 * connection then we just ignore the text.
4184	 */
4185	if ((tlen || (thflags & TH_FIN)) &&
4186	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4187		tcp_seq save_start = th->th_seq;
4188		tcp_seq save_end = th->th_seq + tlen;
4189		m_adj(m, drop_hdrlen);	/* delayed header drop */
4190		/*
4191		 * Insert segment which includes th into TCP reassembly queue
4192		 * with control block tp.  Set thflags to whether reassembly now
4193		 * includes a segment with FIN.  This handles the common case
4194		 * inline (segment is the next to be received on an established
4195		 * connection, and the queue is empty), avoiding linkage into
4196		 * and removal from the queue and repetition of various
4197		 * conversions.
4198		 * Set DELACK for segments received in order, but ack
4199		 * immediately when segments are out of order (so
4200		 * fast retransmit can work).
4201		 */
4202		if (th->th_seq == tp->rcv_nxt &&
4203		    LIST_EMPTY(&tp->t_segq) &&
4204		    TCPS_HAVEESTABLISHED(tp->t_state)) {
4205			TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4206			/*
4207			 * Calculate the RTT on the receiver only if the
4208			 * connection is in streaming mode and the last
4209			 * packet was not an end-of-write
4210			 */
4211			if ((tp->t_flags & TF_STRETCHACK) &&
4212				!(tp->t_flagsext & TF_STREAMEOW))
4213				tcp_compute_rtt(tp, &to, th);
4214
4215			if (DELAY_ACK(tp, th) &&
4216				((tp->t_flags & TF_ACKNOW) == 0) ) {
4217				if ((tp->t_flags & TF_DELACK) == 0) {
4218					tp->t_flags |= TF_DELACK;
4219					tp->t_timer[TCPT_DELACK] =
4220						OFFSET_FROM_START(tp, tcp_delack);
4221				}
4222			}
4223			else {
4224				tp->t_flags |= TF_ACKNOW;
4225			}
4226			tp->rcv_nxt += tlen;
4227			thflags = th->th_flags & TH_FIN;
4228			TCP_INC_VAR(tcpstat.tcps_rcvpack, nlropkts);
4229			tcpstat.tcps_rcvbyte += tlen;
4230			if (nstat_collect) {
4231				if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_PKT) {
4232					INP_ADD_STAT(inp, cell, wifi, wired,
4233					    rxpackets, m->m_pkthdr.lro_npkts);
4234				} else {
4235					INP_ADD_STAT(inp, cell, wifi, wired,
4236					    rxpackets, 1);
4237				}
4238				INP_ADD_STAT(inp, cell, wifi, wired,
4239				    rxbytes, tlen);
4240			}
4241			tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
4242			so_recv_data_stat(so, m, drop_hdrlen);
4243
4244			if (sbappendstream_rcvdemux(so, m,
4245			    th->th_seq - (tp->irs + 1), 0)) {
4246				sorwakeup(so);
4247			}
4248		} else {
4249			thflags = tcp_reass(tp, th, &tlen, m, ifp);
4250			tp->t_flags |= TF_ACKNOW;
4251		}
4252
4253		if (tlen > 0 && SACK_ENABLED(tp))
4254			tcp_update_sack_list(tp, save_start, save_end);
4255
4256		tcp_adaptive_rwtimo_check(tp, tlen);
4257
4258		if (tp->t_flags & TF_DELACK)
4259		{
4260#if INET6
4261			if (isipv6) {
4262				KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4263		     			(((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
4264			     		th->th_seq, th->th_ack, th->th_win);
4265			}
4266			else
4267#endif
4268			{
4269				KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
4270		     			(((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
4271			     		th->th_seq, th->th_ack, th->th_win);
4272			}
4273
4274		}
4275	} else {
4276		m_freem(m);
4277		thflags &= ~TH_FIN;
4278	}
4279
4280	/*
4281	 * If FIN is received ACK the FIN and let the user know
4282	 * that the connection is closing.
4283	 */
4284	if (thflags & TH_FIN) {
4285		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4286			socantrcvmore(so);
4287			postevent(so, 0, EV_FIN);
4288			/*
4289			 * If connection is half-synchronized
4290			 * (ie NEEDSYN flag on) then delay ACK,
4291			 * so it may be piggybacked when SYN is sent.
4292			 * Otherwise, since we received a FIN then no
4293			 * more input can be expected, send ACK now.
4294			 */
4295			TCP_INC_VAR(tp->t_unacksegs, nlropkts);
4296			if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) {
4297				if ((tp->t_flags & TF_DELACK) == 0) {
4298					tp->t_flags |= TF_DELACK;
4299					tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
4300				}
4301			}
4302			else {
4303				tp->t_flags |= TF_ACKNOW;
4304			}
4305			tp->rcv_nxt++;
4306		}
4307		switch (tp->t_state) {
4308
4309	 	/*
4310		 * In SYN_RECEIVED and ESTABLISHED STATES
4311		 * enter the CLOSE_WAIT state.
4312		 */
4313		case TCPS_SYN_RECEIVED:
4314			tp->t_starttime = tcp_now;
4315		case TCPS_ESTABLISHED:
4316			DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4317				struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
4318			tp->t_state = TCPS_CLOSE_WAIT;
4319			break;
4320
4321	 	/*
4322		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
4323		 * enter the CLOSING state.
4324		 */
4325		case TCPS_FIN_WAIT_1:
4326			DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
4327				struct tcpcb *, tp, int32_t, TCPS_CLOSING);
4328			tp->t_state = TCPS_CLOSING;
4329			break;
4330
4331	 	/*
4332		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
4333		 * starting the time-wait timer, turning off the other
4334		 * standard timers.
4335		 */
4336		case TCPS_FIN_WAIT_2:
4337			DTRACE_TCP4(state__change, void, NULL,
4338				struct inpcb *, inp,
4339				struct tcpcb *, tp,
4340				int32_t, TCPS_TIME_WAIT);
4341			tp->t_state = TCPS_TIME_WAIT;
4342			tcp_canceltimers(tp);
4343			tp->t_flags |= TF_ACKNOW;
4344			if (tp->t_flagsext & TF_NOTIMEWAIT) {
4345				tp->t_flags |= TF_CLOSING;
4346			} else {
4347				add_to_time_wait(tp, 2 * tcp_msl);
4348			}
4349			soisdisconnected(so);
4350			break;
4351
4352		/*
4353		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
4354		 */
4355		case TCPS_TIME_WAIT:
4356			add_to_time_wait(tp, 2 * tcp_msl);
4357			break;
4358		}
4359	}
4360#if TCPDEBUG
4361	if (so->so_options & SO_DEBUG)
4362		tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
4363			  &tcp_savetcp, 0);
4364#endif
4365
4366	/*
4367	 * Return any desired output.
4368	 */
4369	if (needoutput || (tp->t_flags & TF_ACKNOW)) {
4370		(void) tcp_output(tp);
4371	}
4372
4373	tcp_check_timer_state(tp);
4374
4375
4376	tcp_unlock(so, 1, 0);
4377	KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4378	return;
4379
4380dropafterack:
4381	/*
4382	 * Generate an ACK dropping incoming segment if it occupies
4383	 * sequence space, where the ACK reflects our state.
4384	 *
4385	 * We can now skip the test for the RST flag since all
4386	 * paths to this code happen after packets containing
4387	 * RST have been dropped.
4388	 *
4389	 * In the SYN-RECEIVED state, don't send an ACK unless the
4390	 * segment we received passes the SYN-RECEIVED ACK test.
4391	 * If it fails send a RST.  This breaks the loop in the
4392	 * "LAND" DoS attack, and also prevents an ACK storm
4393	 * between two listening ports that have been sent forged
4394	 * SYN segments, each with the source address of the other.
4395	 */
4396	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
4397	    (SEQ_GT(tp->snd_una, th->th_ack) ||
4398	     SEQ_GT(th->th_ack, tp->snd_max)) ) {
4399		rstreason = BANDLIM_RST_OPENPORT;
4400		IF_TCP_STATINC(ifp, dospacket);
4401		goto dropwithreset;
4402	}
4403#if TCPDEBUG
4404	if (so->so_options & SO_DEBUG)
4405		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4406			  &tcp_savetcp, 0);
4407#endif
4408	m_freem(m);
4409	tp->t_flags |= TF_ACKNOW;
4410	(void) tcp_output(tp);
4411
4412	/* Don't need to check timer state as we should have done it during tcp_output */
4413	tcp_unlock(so, 1, 0);
4414	KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4415	return;
4416dropwithresetnosock:
4417	nosock = 1;
4418dropwithreset:
4419	/*
4420	 * Generate a RST, dropping incoming segment.
4421	 * Make ACK acceptable to originator of segment.
4422	 * Don't bother to respond if destination was broadcast/multicast.
4423	 */
4424	if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
4425		goto drop;
4426#if INET6
4427	if (isipv6) {
4428		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
4429		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
4430			goto drop;
4431	} else
4432#endif /* INET6 */
4433	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
4434	    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
4435	    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
4436	    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
4437		goto drop;
4438	/* IPv6 anycast check is done at tcp6_input() */
4439
4440	/*
4441	 * Perform bandwidth limiting.
4442	 */
4443#if ICMP_BANDLIM
4444	if (badport_bandlim(rstreason) < 0)
4445		goto drop;
4446#endif
4447
4448#if TCPDEBUG
4449	if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4450		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4451			  &tcp_savetcp, 0);
4452#endif
4453	bzero(&tra, sizeof(tra));
4454	tra.ifscope = ifscope;
4455	tra.awdl_unrestricted = 1;
4456	if (thflags & TH_ACK)
4457		/* mtod() below is safe as long as hdr dropping is delayed */
4458		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
4459		    TH_RST, &tra);
4460	else {
4461		if (thflags & TH_SYN)
4462			tlen++;
4463		/* mtod() below is safe as long as hdr dropping is delayed */
4464		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
4465		    (tcp_seq)0, TH_RST|TH_ACK, &tra);
4466	}
4467	/* destroy temporarily created socket */
4468	if (dropsocket) {
4469		(void) soabort(so);
4470		tcp_unlock(so, 1, 0);
4471	} else if ((inp != NULL) && (nosock == 0)) {
4472		tcp_unlock(so, 1, 0);
4473	}
4474	KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4475	return;
4476dropnosock:
4477	nosock = 1;
4478drop:
4479	/*
4480	 * Drop space held by incoming segment and return.
4481	 */
4482#if TCPDEBUG
4483	if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
4484		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
4485			  &tcp_savetcp, 0);
4486#endif
4487	m_freem(m);
4488	/* destroy temporarily created socket */
4489	if (dropsocket) {
4490		(void) soabort(so);
4491		tcp_unlock(so, 1, 0);
4492	}
4493	else if (nosock == 0) {
4494		tcp_unlock(so, 1, 0);
4495	}
4496	KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0);
4497	return;
4498}
4499
4500static void
4501tcp_dooptions(tp, cp, cnt, th, to, input_ifscope)
4502/*
4503 * Parse TCP options and place in tcpopt.
4504 */
4505	struct tcpcb *tp;
4506	u_char *cp;
4507	int cnt;
4508	struct tcphdr *th;
4509	struct tcpopt *to;
4510	unsigned int input_ifscope;
4511{
4512	u_short mss = 0;
4513	int opt, optlen;
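	/*
	 * Each option starts with a kind byte; every kind other than EOL and
	 * NOP carries a length byte that counts the kind and length bytes
	 * themselves, so an optlen of less than 2, or one larger than the
	 * remaining option space, indicates a malformed option and ends
	 * parsing.
	 */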
4514
4515	for (; cnt > 0; cnt -= optlen, cp += optlen) {
4516		opt = cp[0];
4517		if (opt == TCPOPT_EOL)
4518			break;
4519		if (opt == TCPOPT_NOP)
4520			optlen = 1;
4521		else {
4522			if (cnt < 2)
4523				break;
4524			optlen = cp[1];
4525			if (optlen < 2 || optlen > cnt)
4526				break;
4527		}
4528		switch (opt) {
4529
4530		default:
4531			continue;
4532
4533		case TCPOPT_MAXSEG:
4534			if (optlen != TCPOLEN_MAXSEG)
4535				continue;
4536			if (!(th->th_flags & TH_SYN))
4537				continue;
4538			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
4539			NTOHS(mss);
4540			break;
4541
4542		case TCPOPT_WINDOW:
4543			if (optlen != TCPOLEN_WINDOW)
4544				continue;
4545			if (!(th->th_flags & TH_SYN))
4546				continue;
4547			to->to_flags |= TOF_SCALE;
4548			tp->t_flags |= TF_RCVD_SCALE;
4549			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
4550			break;
4551
4552		case TCPOPT_TIMESTAMP:
4553			if (optlen != TCPOLEN_TIMESTAMP)
4554				continue;
4555			to->to_flags |= TOF_TS;
4556			bcopy((char *)cp + 2,
4557			    (char *)&to->to_tsval, sizeof(to->to_tsval));
4558			NTOHL(to->to_tsval);
4559			bcopy((char *)cp + 6,
4560			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
4561			NTOHL(to->to_tsecr);
4562			/*
4563			 * A timestamp received in a SYN makes
4564			 * it ok to send timestamp requests and replies.
4565			 */
4566			if (th->th_flags & TH_SYN) {
4567				tp->t_flags |= TF_RCVD_TSTMP;
4568				tp->ts_recent = to->to_tsval;
4569				tp->ts_recent_age = tcp_now;
4570			}
4571			break;
4572		case TCPOPT_SACK_PERMITTED:
4573			if (!tcp_do_sack ||
4574			    optlen != TCPOLEN_SACK_PERMITTED)
4575				continue;
4576			if (th->th_flags & TH_SYN)
4577				to->to_flags |= TOF_SACK;
4578			break;
4579		case TCPOPT_SACK:
4580			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
4581				continue;
4582			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
4583			to->to_sacks = cp + 2;
4584			tcpstat.tcps_sack_rcv_blocks++;
4585
4586			break;
4587
4588#if MPTCP
4589		case TCPOPT_MULTIPATH:
4590			tcp_do_mptcp_options(tp, cp, th, to, optlen);
4591			break;
4592#endif /* MPTCP */
4593		}
4594	}
4595	if (th->th_flags & TH_SYN)
4596		tcp_mss(tp, mss, input_ifscope);	/* sets t_maxseg */
4597}
4598
4599/*
4600 * Pull out of band byte out of a segment so
4601 * it doesn't appear in the user's data queue.
4602 * It is still reflected in the segment length for
4603 * sequencing purposes.
4604 */
4605static void
4606tcp_pulloutofband(so, th, m, off)
4607	struct socket *so;
4608	struct tcphdr *th;
4609	register struct mbuf *m;
	int off;		/* delayed-to-be-dropped hdrlen */
4611{
4612	int cnt = off + th->th_urp - 1;
4613
4614	while (cnt >= 0) {
4615		if (m->m_len > cnt) {
4616			char *cp = mtod(m, caddr_t) + cnt;
4617			struct tcpcb *tp = sototcpcb(so);
4618
4619			tp->t_iobc = *cp;
4620			tp->t_oobflags |= TCPOOB_HAVEDATA;
4621			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
4622			m->m_len--;
4623			if (m->m_flags & M_PKTHDR)
4624				m->m_pkthdr.len--;
4625			return;
4626		}
4627		cnt -= m->m_len;
4628		m = m->m_next;
4629		if (m == 0)
4630			break;
4631	}
4632	panic("tcp_pulloutofband");
4633}
4634
4635uint32_t
4636get_base_rtt(struct tcpcb *tp)
4637{
4638	uint32_t base_rtt = 0, i;
4639	for (i = 0; i < N_RTT_BASE; ++i) {
4640		if (tp->rtt_hist[i] != 0 &&
4641			(base_rtt == 0 || tp->rtt_hist[i] < base_rtt))
4642			base_rtt = tp->rtt_hist[i];
4643	}
4644	return base_rtt;
4645}
4646
/* Each value of RTT base represents the minimum RTT seen in a minute.
 * We keep up to N_RTT_BASE minutes' worth of history.
 */
4650void
4651update_base_rtt(struct tcpcb *tp, uint32_t rtt)
4652{
4653	int32_t i, qdelay;
4654	u_int32_t base_rtt;
4655
4656	if (++tp->rtt_count >= rtt_samples_per_slot) {
4657#if TRAFFIC_MGT
4658		/*
4659		 * If the recv side is being throttled, check if the
4660		 * current RTT is closer to the base RTT seen in
4661		 * first (recent) two slots. If so, unthrottle the stream.
4662		 */
4663		if (tp->t_flagsext & TF_RECV_THROTTLE) {
4664			base_rtt = min(tp->rtt_hist[0], tp->rtt_hist[1]);
4665			qdelay = tp->t_rttcur - base_rtt;
4666			if (qdelay < target_qdelay)
4667				tp->t_flagsext &= ~(TF_RECV_THROTTLE);
4668		}
4669#endif /* TRAFFIC_MGT */
4670
4671		for (i = (N_RTT_BASE-1); i > 0; --i) {
4672			tp->rtt_hist[i] = tp->rtt_hist[i-1];
4673		}
4674		tp->rtt_hist[0] = rtt;
4675		tp->rtt_count = 0;
4676	} else {
4677		tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt);
4678	}
4679}
4680
4681/*
4682 * If we have a timestamp reply, update smoothed RTT. If no timestamp is
4683 * present but transmit timer is running and timed sequence number was
4684 * acked, update smoothed RTT.
4685 *
4686 * If timestamps are supported, a receiver can update RTT even if
4687 * there is no outstanding data.
4688 *
 * Some boxes send broken timestamp replies during the SYN+ACK phase;
 * ignore timestamps of 0 or we could calculate a huge RTT and blow up
 * the retransmit timer.
4692 */
4693static void
4694tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4695{
4696	VERIFY(to != NULL && th != NULL);
4697	if (((to->to_flags & TOF_TS) != 0) &&
4698		(to->to_tsecr != 0) &&
4699		TSTMP_GEQ(tcp_now, to->to_tsecr)) {
4700		tcp_xmit_timer(tp, tcp_now - to->to_tsecr,
4701			to->to_tsecr, th->th_ack);
4702	} else if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) {
4703		tcp_xmit_timer(tp, tcp_now - tp->t_rtttime, 0,
4704			th->th_ack);
4705	}
4706}
4707
4708/*
4709 * Collect new round-trip time estimate
4710 * and update averages and current timeout.
4711 */
4712static void
4713tcp_xmit_timer(register struct tcpcb *tp, int rtt,
4714	u_int32_t tsecr, tcp_seq th_ack)
4715{
4716	register int delta;
4717
4718	if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
4719		if (SEQ_GT(th_ack, tp->snd_una) &&
4720		    SEQ_LEQ(th_ack, tp->snd_max) &&
4721		    (tsecr == 0 ||
4722		    TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
4723			/*
4724			 * We received a new ACk after a
4725			 * spurious timeout. Adapt retransmission
4726			 * timer as described in rfc 4015.
4727			 */
4728			tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
4729			tp->t_badrexmt_time = 0;
4730			tp->t_srtt = max(tp->t_srtt_prev, rtt);
4731			tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
4732			tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1));
4733			tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;
4734
4735			if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
4736				tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
4737
4738			goto compute_rto;
4739		} else {
4740			return;
4741		}
4742	}
4743
4744	tcpstat.tcps_rttupdated++;
4745	tp->t_rttupdated++;
4746
4747	if (rtt > 0) {
4748		tp->t_rttcur = rtt;
4749		update_base_rtt(tp, rtt);
4750	}
4751
4752	if (tp->t_srtt != 0) {
4753		/*
4754		 * srtt is stored as fixed point with 5 bits after the
4755		 * binary point (i.e., scaled by 32).  The following magic
4756		 * is equivalent to the smoothing algorithm in rfc793 with
4757		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
4758		 * point).
4759		 *
		 * FreeBSD adjusts rtt to origin 0 by subtracting 1
		 * from the provided rtt value. This was required because
		 * of the way t_rtttime was initialized to 1 before.
		 * Since we changed t_rtttime to be based on
		 * tcp_now, this extra adjustment is not needed.
4765		 */
4766		delta = (rtt << TCP_DELTA_SHIFT)
4767			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
4768
4769		if ((tp->t_srtt += delta) <= 0)
4770			tp->t_srtt = 1;
4771
4772		/*
4773		 * We accumulate a smoothed rtt variance (actually, a
4774		 * smoothed mean difference), then set the retransmit
4775		 * timer to smoothed rtt + 4 times the smoothed variance.
4776		 * rttvar is stored as fixed point with 4 bits after the
4777		 * binary point (scaled by 16).  The following is
4778		 * equivalent to rfc793 smoothing with an alpha of .75
4779		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
4780		 * rfc793's wired-in beta.
4781		 */
4782		if (delta < 0)
4783			delta = -delta;
4784		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
4785		if ((tp->t_rttvar += delta) <= 0)
4786			tp->t_rttvar = 1;
4787		if (tp->t_rttbest == 0  ||
4788			tp->t_rttbest > (tp->t_srtt + tp->t_rttvar))
4789			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
4790	} else {
4791		/*
4792		 * No rtt measurement yet - use the unsmoothed rtt.
4793		 * Set the variance to half the rtt (so our first
4794		 * retransmit happens at 3*rtt).
4795		 */
4796		tp->t_srtt = rtt << TCP_RTT_SHIFT;
4797		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
4798	}
4799
4800compute_rto:
4801	nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt,
4802		tp->t_rttvar);
4803	tp->t_rtttime = 0;
4804	tp->t_rxtshift = 0;
4805	tp->t_rxtstart = 0;
4806
4807	/*
4808	 * the retransmit should happen at rtt + 4 * rttvar.
4809	 * Because of the way we do the smoothing, srtt and rttvar
4810	 * will each average +1/2 tick of bias.  When we compute
4811	 * the retransmit timer, we want 1/2 tick of rounding and
4812	 * 1 extra tick because of +-1/2 tick uncertainty in the
4813	 * firing of the timer.  The bias will give us exactly the
4814	 * 1.5 tick we need.  But, because the bias is
4815	 * statistical, we have to test that we don't drop below
4816	 * the minimum feasible timer (which is 2 ticks).
4817	 */
4818	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
4819		max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
4820		TCP_ADD_REXMTSLOP(tp));
4821
4822	/*
4823	 * We received an ack for a packet that wasn't retransmitted;
4824	 * it is probably safe to discard any error indications we've
4825	 * received recently.  This isn't quite right, but close enough
4826	 * for now (a route might have failed after we sent a segment,
4827	 * and the return path might not be symmetrical).
4828	 */
4829	tp->t_softerror = 0;
4830}
4831
4832static inline unsigned int
4833tcp_maxmtu(struct rtentry *rt)
4834{
4835	unsigned int maxmtu;
4836
4837	RT_LOCK_ASSERT_HELD(rt);
4838	if (rt->rt_rmx.rmx_mtu == 0)
4839		maxmtu = rt->rt_ifp->if_mtu;
4840	else
4841		maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);
4842
4843	return (maxmtu);
4844}
4845
4846#if INET6
4847static inline unsigned int
4848tcp_maxmtu6(struct rtentry *rt)
4849{
4850	unsigned int maxmtu;
4851	struct nd_ifinfo *ndi;
4852
4853	RT_LOCK_ASSERT_HELD(rt);
4854	lck_rw_lock_shared(nd_if_rwlock);
4855	if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized)
4856		ndi = NULL;
4857	if (ndi != NULL)
4858		lck_mtx_lock(&ndi->lock);
4859	if (rt->rt_rmx.rmx_mtu == 0)
4860		maxmtu = IN6_LINKMTU(rt->rt_ifp);
4861	else
4862		maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
4863	if (ndi != NULL)
4864		lck_mtx_unlock(&ndi->lock);
4865	lck_rw_done(nd_if_rwlock);
4866
4867	return (maxmtu);
4868}
4869#endif
4870
4871/*
4872 * Determine a reasonable value for maxseg size.
4873 * If the route is known, check route for mtu.
4874 * If none, use an mss that can be handled on the outgoing
4875 * interface without forcing IP to fragment; if bigger than
4876 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
4877 * to utilize large mbufs.  If no route is found, route has no mtu,
4878 * or the destination isn't local, use a default, hopefully conservative
4879 * size (usually 512 or the default IP max size, but no more than the mtu
4880 * of the interface), as we can't discover anything about intervening
4881 * gateways or networks.  We also initialize the congestion/slow start
4882 * window. While looking at the routing entry, we also initialize
4883 * other path-dependent parameters from pre-set or cached values
4884 * in the routing entry.
4885 *
4886 * Also take into account the space needed for options that we
4887 * send regularly.  Make maxseg shorter by that amount to assure
4888 * that we can send maxseg amount of data even when the options
4889 * are present.  Store the upper limit of the length of options plus
4890 * data in maxopd.
4891 *
4892 * NOTE that this routine is only called when we process an incoming
4893 * segment, for outgoing segments only tcp_mssopt is called.
4894 *
4895 */
4896void
4897tcp_mss(tp, offer, input_ifscope)
4898	struct tcpcb *tp;
4899	int offer;
4900	unsigned int input_ifscope;
4901{
4902	register struct rtentry *rt;
4903	struct ifnet *ifp;
4904	register int rtt, mss;
4905	u_int32_t bufsize;
4906	struct inpcb *inp;
4907	struct socket *so;
4908	struct rmxp_tao *taop;
4909	int origoffer = offer;
4910	u_int32_t sb_max_corrected;
4911	int isnetlocal = 0;
4912#if INET6
4913	int isipv6;
4914	int min_protoh;
4915#endif
4916
4917	inp = tp->t_inpcb;
4918#if INET6
4919	isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
4920	min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
4921			    : sizeof (struct tcpiphdr);
4922#else
4923#define min_protoh  (sizeof (struct tcpiphdr))
4924#endif
4925
4926#if INET6
4927	if (isipv6) {
4928		rt = tcp_rtlookup6(inp, input_ifscope);
4929	}
4930	else
4931#endif /* INET6 */
4932	{
4933		rt = tcp_rtlookup(inp, input_ifscope);
4934	}
4935	isnetlocal = (tp->t_flags & TF_LOCAL);
4936
4937	if (rt == NULL) {
4938		tp->t_maxopd = tp->t_maxseg =
4939#if INET6
4940		isipv6 ? tcp_v6mssdflt :
4941#endif /* INET6 */
4942		tcp_mssdflt;
4943		return;
4944	}
4945	ifp = rt->rt_ifp;
4946	/*
4947	 * Slower link window correction:
4948	 * If a value is specificied for slowlink_wsize use it for
4949	 * PPP links believed to be on a serial modem (speed <128Kbps).
4950	 * Excludes 9600bps as it is the default value adversized
4951	 * by pseudo-devices over ppp.
4952	 */
4953	if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
4954	    ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
4955		tp->t_flags |= TF_SLOWLINK;
4956	}
4957	so = inp->inp_socket;
4958
4959	taop = rmx_taop(rt->rt_rmx);
4960	/*
4961	 * Offer == -1 means that we didn't receive SYN yet,
4962	 * use cached value in that case;
4963	 */
4964	if (offer == -1)
4965		offer = taop->tao_mssopt;
4966	/*
4967	 * Offer == 0 means that there was no MSS on the SYN segment,
4968	 * in this case we use tcp_mssdflt.
4969	 */
4970	if (offer == 0)
4971		offer =
4972#if INET6
4973			isipv6 ? tcp_v6mssdflt :
4974#endif /* INET6 */
4975			tcp_mssdflt;
4976	else {
4977		/*
4978		 * Prevent DoS attack with too small MSS. Round up
4979		 * to at least minmss.
4980		 */
4981		offer = max(offer, tcp_minmss);
4982		/*
4983		 * Sanity check: make sure that maxopd will be large
		 * enough to allow some data on segments even if all
		 * the option space is used (40 bytes).  Otherwise
4986		 * funny things may happen in tcp_output.
4987		 */
4988		offer = max(offer, 64);
4989	}
4990	taop->tao_mssopt = offer;
4991
4992	/*
4993	 * While we're here, check if there's an initial rtt
4994	 * or rttvar.  Convert from the route-table units
4995	 * to scaled multiples of the slow timeout timer.
4996	 */
4997	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) {
4998		tcp_getrt_rtt(tp, rt);
4999	} else {
5000		tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
5001	}
5002
5003#if INET6
5004	mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5005#else
5006	mss = tcp_maxmtu(rt);
5007#endif
5008	mss -= min_protoh;
5009
5010	if (rt->rt_rmx.rmx_mtu == 0) {
5011#if INET6
5012		if (isipv6) {
5013			if (!isnetlocal)
5014				mss = min(mss, tcp_v6mssdflt);
5015		} else
5016#endif /* INET6 */
5017		if (!isnetlocal)
5018			mss = min(mss, tcp_mssdflt);
5019	}
5020
5021	mss = min(mss, offer);
5022	/*
5023	 * maxopd stores the maximum length of data AND options
5024	 * in a segment; maxseg is the amount of data in a normal
5025	 * segment.  We need to store this value (maxopd) apart
5026	 * from maxseg, because now every segment carries options
5027	 * and thus we normally have somewhat less data in segments.
5028	 */
5029	tp->t_maxopd = mss;
5030
5031	/*
	 * origoffer == -1 indicates that no segments were received yet.
5033	 * In this case we just guess.
5034	 */
5035	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
5036	    (origoffer == -1 ||
5037	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
5038		mss -= TCPOLEN_TSTAMP_APPA;
5039
5040#if MPTCP
5041	mss -= mptcp_adj_mss(tp, FALSE);
5042#endif /* MPTCP */
5043	tp->t_maxseg = mss;
5044
5045	/*
	 * Calculate the corrected value for sb_max; promote the numerator
	 * to 64-bit for large sb_max values, else it will overflow.
5048	 */
5049	sb_max_corrected = (sb_max * (u_int64_t)MCLBYTES) / (MSIZE + MCLBYTES);
5050
5051	/*
5052	 * If there's a pipesize (ie loopback), change the socket
5053	 * buffer to that size only if it's bigger than the current
5054	 * sockbuf size.  Make the socket buffers an integral
5055	 * number of mss units; if the mss is larger than
5056	 * the socket buffer, decrease the mss.
5057	 */
5058#if RTV_SPIPE
5059	bufsize = rt->rt_rmx.rmx_sendpipe;
5060	if (bufsize < so->so_snd.sb_hiwat)
5061#endif
5062		bufsize = so->so_snd.sb_hiwat;
5063	if (bufsize < mss)
5064		mss = bufsize;
5065	else {
5066		bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
5067		if (bufsize > sb_max_corrected)
5068			bufsize = sb_max_corrected;
5069		(void)sbreserve(&so->so_snd, bufsize);
5070	}
5071	tp->t_maxseg = mss;
5072
5073#if RTV_RPIPE
5074	bufsize = rt->rt_rmx.rmx_recvpipe;
5075	if (bufsize < so->so_rcv.sb_hiwat)
5076#endif
5077		bufsize = so->so_rcv.sb_hiwat;
5078	if (bufsize > mss) {
5079		bufsize = (((bufsize + (u_int64_t)mss - 1) / (u_int64_t)mss) * (u_int64_t)mss);
5080		if (bufsize > sb_max_corrected)
5081			bufsize = sb_max_corrected;
5082		(void)sbreserve(&so->so_rcv, bufsize);
5083	}
5084
5085	set_tcp_stream_priority(so);
5086
5087	if (rt->rt_rmx.rmx_ssthresh) {
5088		/*
5089		 * There's some sort of gateway or interface
5090		 * buffer limit on the path.  Use this to set
5091		 * slow-start threshold, but set the threshold to
5092		 * no less than 2*mss.
5093		 */
5094		tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
5095		tcpstat.tcps_usedssthresh++;
5096	} else {
5097		tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
5098	}
5099
5100	/*
5101	 * Set the slow-start flight size depending on whether this
5102	 * is a local network or not.
5103	 */
5104	if (CC_ALGO(tp)->cwnd_init != NULL)
5105		CC_ALGO(tp)->cwnd_init(tp);
5106
5107	tcp_ccdbg_trace(tp, NULL, TCP_CC_CWND_INIT);
5108
5109	/* Route locked during lookup above */
5110	RT_UNLOCK(rt);
5111}
5112
5113/*
5114 * Determine the MSS option to send on an outgoing SYN.
5115 */
5116int
5117tcp_mssopt(tp)
5118	struct tcpcb *tp;
5119{
5120	struct rtentry *rt;
5121	int mss;
5122#if INET6
5123	int isipv6;
5124	int min_protoh;
5125#endif
5126
5127#if INET6
5128	isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
5129	min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
5130			    : sizeof (struct tcpiphdr);
5131#else
5132#define min_protoh  (sizeof (struct tcpiphdr))
5133#endif
5134
5135#if INET6
5136	if (isipv6)
5137		rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
5138	else
5139#endif /* INET6 */
5140	rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
5141	if (rt == NULL) {
5142		return (
5143#if INET6
5144			isipv6 ? tcp_v6mssdflt :
5145#endif /* INET6 */
5146			tcp_mssdflt);
5147	}
5148	/*
5149	 * Slower link window correction:
5150	 * If a value is specified for slowlink_wsize, use it for PPP links
5151	 * believed to be on a serial modem (speed < 128Kbps).  Excludes 9600bps as
5152	 * it is the default value advertised by pseudo-devices over PPP.
5153	 */
5154	if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
5155	    rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
5156		tp->t_flags |= TF_SLOWLINK;
5157	}
5158
5159#if INET6
5160	mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
5161#else
5162	mss = tcp_maxmtu(rt);
5163#endif
5164	/* Route locked during lookup above */
5165	RT_UNLOCK(rt);
5166	return (mss - min_protoh);
5167}
5168
5169/*
5170 * When a partial ack arrives, force the retransmission of the
5171 * next unacknowledged segment.  Do not clear tp->t_dupacks.
5172 * By setting snd_nxt to th_ack, this forces the retransmission timer
5173 * to be started again.
5174 */
5175static void
5176tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
5179{
5180		tcp_seq onxt = tp->snd_nxt;
5181		u_int32_t  ocwnd = tp->snd_cwnd;
5182		tp->t_timer[TCPT_REXMT] = 0;
5183		tp->t_timer[TCPT_PTO] = 0;
5184		tp->t_rtttime = 0;
5185		tp->snd_nxt = th->th_ack;
5186		/*
5187		 * Set snd_cwnd to one segment beyond acknowledged offset
5188		 * (tp->snd_una has not yet been updated when this function
5189		 *  is called)
5190		 */
5191		tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
5192		tp->t_flags |= TF_ACKNOW;
5193		(void) tcp_output(tp);
5194		tp->snd_cwnd = ocwnd;
5195		if (SEQ_GT(onxt, tp->snd_nxt))
5196			tp->snd_nxt = onxt;
5197		/*
5198		 * Partial window deflation.  Relies on fact that tp->snd_una
5199		 * not updated yet.
5200		 */
5201		if (tp->snd_cwnd > BYTES_ACKED(th, tp))
5202			tp->snd_cwnd -= BYTES_ACKED(th, tp);
5203		else
5204			tp->snd_cwnd = 0;
5205		tp->snd_cwnd += tp->t_maxseg;
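		/*
		 * Net effect: the congestion window is deflated by the
		 * amount newly acknowledged and then inflated by one
		 * maxseg, essentially the partial window deflation step
		 * of NewReno (RFC 6582).
		 */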
5206
5207}
5208
5209/*
5210 * Drop a random TCP connection that hasn't been serviced yet and
5211 * is eligible for discard.  There is a one in qlen chance that
5212 * we will return a null, saying that there are no droppable
5213 * requests.  In this case, the protocol-specific code should drop
5214 * the new request.  This ensures fairness.
5215 *
5216 * The listening TCP socket "head" must be locked
5217 */
5218static int
5219tcp_dropdropablreq(struct socket *head)
5220{
5221	struct socket *so, *sonext;
5222	unsigned int i, j, qlen;
5223	static u_int32_t rnd = 0;
5224	static u_int64_t old_runtime;
5225	static unsigned int cur_cnt, old_cnt;
5226	u_int64_t now_sec;
5227	struct inpcb *inp = NULL;
5228	struct tcpcb *tp;
5229
5230	if ((head->so_options & SO_ACCEPTCONN) == 0)
5231		return (0);
5232
5233	if (TAILQ_EMPTY(&head->so_incomp))
5234		return (0);
5235
5236	/*
5237	 * Check if there is any socket in the incomp queue
5238	 * that is closed because of a reset from the peer and is
5239	 * waiting to be garbage collected. If so, pick that as
5240	 * the victim
5241	 */
5242	TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) {
5243		inp = sotoinpcb(so);
5244		tp = intotcpcb(inp);
5245		if (tp != NULL && tp->t_state == TCPS_CLOSED &&
5246		    so->so_head != NULL &&
5247		    (so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
5248		    (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) {
5249			/*
5250			 * The listen socket is already locked but we
5251			 * can lock this socket here without lock ordering
5252			 * issues because it is in the incomp queue and
5253			 * is not visible to others.
5254			 */
5255			if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
5256				so->so_usecount++;
5257				goto found_victim;
5258			} else {
5259				continue;
5260			}
5261		}
5262	}
5263
5264	so = TAILQ_FIRST(&head->so_incomp);
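	/*
	 * The default victim is the oldest incomplete connection; if drop
	 * attempts have recently been frequent relative to the queue
	 * length, the code below instead skips ahead a pseudo-random
	 * number of entries rather than always dropping the oldest.
	 */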
5265
5266	now_sec = net_uptime();
5267	if ((i = (now_sec - old_runtime)) != 0) {
5268		old_runtime = now_sec;
5269		old_cnt = cur_cnt / i;
5270		cur_cnt = 0;
5271	}
5272
5274	qlen = head->so_incqlen;
5275	if (rnd == 0)
5276		rnd = RandomULong();
5277
5278	if (++cur_cnt > qlen || old_cnt > qlen) {
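		/*
		 * Advance a simple 16-bit linear congruential generator
		 * and scale the result into the range [0, qlen], giving
		 * the number of queue entries to skip before picking a
		 * victim.
		 */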
5279		rnd = (314159 * rnd + 66329) & 0xffff;
5280		j = ((qlen + 1) * rnd) >> 16;
5281
5282		while (j-- && so)
5283			so = TAILQ_NEXT(so, so_list);
5284	}
5285	/* Find a connection that is not already closing (or being served) */
5286	while (so) {
5287		inp = (struct inpcb *)so->so_pcb;
5288
5289		sonext = TAILQ_NEXT(so, so_list);
5290
5291		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0)
5292			!= WNT_STOPUSING) {
5293			/*
5294			 * Avoid the issue of a socket being accepted
5295			 * by one input thread and being dropped by
5296			 * another input thread. If we can't get a hold
5297			 * on this mutex, then grab the next socket in
5298			 * line.
5299			 */
5300			if (lck_mtx_try_lock(&inp->inpcb_mtx)) {
5301				so->so_usecount++;
5302				if ((so->so_usecount == 2) &&
5303				    (so->so_state & SS_INCOMP) &&
5304				    !(so->so_flags & SOF_INCOMP_INPROGRESS))  {
5305					break;
5306				} else {
5307					/*
5308					 * don't use if being accepted or
5309					 * used in any other way
5310					 */
5311					in_pcb_checkstate(inp, WNT_RELEASE, 1);
5312					tcp_unlock(so, 1, 0);
5313				}
5314			} else {
5315				/*
5316				 * do not try to lock the inp in
5317				 * in_pcb_checkstate because the lock
5318				 * is already held by some other thread.
5319				 * Only drop the inp_wantcnt reference.
5320				 */
5321				in_pcb_checkstate(inp, WNT_RELEASE, 1);
5322			}
5323		}
5324		so = sonext;
5326	}
5327	if (so == NULL) {
5328		return (0);
5329	}
5330
5331	/* Make sure the socket is still in the right state to be discarded */
5333	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
5334		tcp_unlock(so, 1, 0);
5335		return (0);
5336	}
5337
5338found_victim:
5339	if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
5340		/* do not discard: that socket is being accepted */
5341		tcp_unlock(so, 1, 0);
5342		return (0);
5343	}
5344
5345	TAILQ_REMOVE(&head->so_incomp, so, so_list);
5346	tcp_unlock(head, 0, 0);
5347
5348	lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
5349	tp = sototcpcb(so);
5350	so->so_flags |= SOF_OVERFLOW;
5351	so->so_head = NULL;
5352
5353	tcp_close(tp);
5354	if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
5355		/*
5356		 * Someone has a wantcnt on this pcb. Since WNT_ACQUIRE
5357		 * doesn't require a lock, it could have happened while
5358		 * we are holding the lock. This pcb will have to
5359		 * be garbage collected later.
5360		 * Release the reference held for the so_incomp queue.
5361		 */
5362		so->so_usecount--;
5363		tcp_unlock(so, 1, 0);
5364	} else {
5365		/*
5366		 * Unlock this socket and leave the reference on.
5367		 * We need to acquire the pcbinfo lock in order to
5368		 * fully dispose of it.
5369		 */
5370		tcp_unlock(so, 0, 0);
5371
5372		lck_rw_lock_exclusive(tcbinfo.ipi_lock);
5373
5374		tcp_lock(so, 0, 0);
5375		/* Release the reference held for so_incomp queue */
5376		so->so_usecount--;
5377
5378		if (so->so_usecount != 1 ||
5379		    (inp->inp_wantcnt > 0 &&
5380		    inp->inp_wantcnt != WNT_STOPUSING)) {
5381			/*
5382			 * There is an extra wantcount or usecount
5383			 * that must have been added when the socket
5384			 * was unlocked. This socket will have to be
5385			 * garbage collected later
5386			 */
5387			tcp_unlock(so, 1, 0);
5388		} else {
5390			/* Drop the reference held for this function */
5391			so->so_usecount--;
5392
5393			in_pcbdispose(inp);
5394		}
5395		lck_rw_done(tcbinfo.ipi_lock);
5396	}
5397	tcpstat.tcps_drops++;
5398
5399	tcp_lock(head, 0, 0);
5400	head->so_incqlen--;
5401	head->so_qlen--;
5402	return (1);
5403}
5404
5405/* Set background congestion control on a socket */
5406void
5407tcp_set_background_cc(struct socket *so)
5408{
5409	tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
5410}
5411
5412/* Set foreground congestion control on a socket */
5413void
5414tcp_set_foreground_cc(struct socket *so)
5415{
5416	if (tcp_use_newreno)
5417		tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
5418	else
5419		tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX);
5420}
5421
5422static void
5423tcp_set_new_cc(struct socket *so, uint16_t cc_index)
5424{
5425	struct inpcb *inp = sotoinpcb(so);
5426	struct tcpcb *tp = intotcpcb(inp);
5427	u_char old_cc_index = 0;
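	/*
	 * Switching algorithms: tear down the current congestion-control
	 * module's per-connection state, install the new index, allocate
	 * whatever state the new module needs, and let it take over via
	 * its switch_to callback.
	 */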
5428	if (tp->tcp_cc_index != cc_index) {
5430		old_cc_index = tp->tcp_cc_index;
5431
5432		if (CC_ALGO(tp)->cleanup != NULL)
5433			CC_ALGO(tp)->cleanup(tp);
5434		tp->tcp_cc_index = cc_index;
5435
5436		tcp_cc_allocate_state(tp);
5437
5438		if (CC_ALGO(tp)->switch_to != NULL)
5439			CC_ALGO(tp)->switch_to(tp, old_cc_index);
5440
5441		tcp_ccdbg_trace(tp, NULL, TCP_CC_CHANGE_ALGO);
5442	}
5443}
5444
5445void
5446tcp_set_recv_bg(struct socket *so)
5447{
5448	if (!IS_TCP_RECV_BG(so))
5449		so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG;
5450
5451	/* Unset Large Receive Offload on background sockets */
5452	so_set_lro(so, SO_TC_BK);
5453}
5454
5455void
5456tcp_clear_recv_bg(struct socket *so)
5457{
5458	if (IS_TCP_RECV_BG(so))
5459		so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG);
5460
5461	/*
5462	 * Set/unset use of Large Receive Offload depending on
5463	 * the traffic class
5464	 */
5465	so_set_lro(so, so->so_traffic_class);
5466}
5467
5468void
5469inp_fc_unthrottle_tcp(struct inpcb *inp)
5470{
5471	struct tcpcb *tp = inp->inp_ppcb;
5472	/*
5473	 * Back off the slow-start threshold and enter
5474	 * congestion avoidance phase
5475	 */
5476	if (CC_ALGO(tp)->pre_fr != NULL)
5477		CC_ALGO(tp)->pre_fr(tp);
5478
5479	tp->snd_cwnd = tp->snd_ssthresh;
5480
5481	/*
5482	 * Restart counting for ABC as we changed the
5483	 * congestion window just now.
5484	 */
5485	tp->t_bytes_acked = 0;
5486
5487	/*
5488	 * Reset the retransmit shift as we know the delay in sending a
5489	 * packet is due to flow control on the outgoing interface.
5490	 * There is no need to back off the retransmit timer.
5491	 */
5492	tp->t_rxtshift = 0;
5493
5494	/*
5495	 * Start the output stream again. Since we are
5496	 * not retransmitting data, do not reset the
5497	 * retransmit timer or rtt calculation.
5498	 */
5499	tcp_output(tp);
5500}
5501
5502static int
5503tcp_getstat SYSCTL_HANDLER_ARGS
5504{
5505#pragma unused(oidp, arg1, arg2)
5506
5507	int error;
5508
5509	proc_t caller = PROC_NULL;
5510	proc_t caller_parent = PROC_NULL;
5511	char command_name[MAXCOMLEN + 1] = "";
5512	char parent_name[MAXCOMLEN + 1] = "";
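	/*
	 * Before copying out the statistics, log the calling process and
	 * its parent to MessageTracer via kern_asl_msg(), which makes it
	 * possible to see which processes read net.inet.tcp.stats.
	 */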
5513
5514	if ((caller = proc_self()) != PROC_NULL) {
5515		/* get process name */
5516		strlcpy(command_name, caller->p_comm, sizeof(command_name));
5517
5518		/* get parent process name if possible */
5519		if ((caller_parent = proc_find(caller->p_ppid)) != PROC_NULL) {
5520			strlcpy(parent_name, caller_parent->p_comm,
5521			    sizeof(parent_name));
5522			proc_rele(caller_parent);
5523		}
5524
5525		if ((escape_str(command_name, strlen(command_name),
5526		    sizeof(command_name)) == 0) &&
5527		    (escape_str(parent_name, strlen(parent_name),
5528		    sizeof(parent_name)) == 0)) {
5529			kern_asl_msg(LOG_DEBUG, "messagetracer",
5530			    5,
5531			    "com.apple.message.domain",
5532			    "com.apple.kernel.tcpstat", /* 1 */
5533			    "com.apple.message.signature",
5534			    "tcpstat", /* 2 */
5535			    "com.apple.message.signature2", command_name, /* 3 */
5536			    "com.apple.message.signature3", parent_name, /* 4 */
5537			    "com.apple.message.summarize", "YES", /* 5 */
5538			    NULL);
5539		}
5540	}
5541	if (caller != PROC_NULL)
5542		proc_rele(caller);
5543
5544	if (req->oldptr == 0) {
5545		req->oldlen = (size_t)sizeof(struct tcpstat);
5546	}
5547
5548	error = SYSCTL_OUT(req, &tcpstat, MIN(sizeof (tcpstat), req->oldlen));
5549
5550	return (error);
5552}
5553
5554/*
5555 * Checksum extended TCP header and data.
5556 */
5557int
5558tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen)
5559{
5560	struct ifnet *ifp = m->m_pkthdr.rcvif;
5561
5562	switch (af) {
5563	case AF_INET: {
5564		struct ip *ip = mtod(m, struct ip *);
5565		struct ipovly *ipov = (struct ipovly *)ip;
5566
5567		if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
5568			return (0);
5569
5570		if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
5571		    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
5572		    (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
5573			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5574				th->th_sum = m->m_pkthdr.csum_rx_val;
5575			} else {
5576				uint16_t sum = m->m_pkthdr.csum_rx_val;
5577				uint16_t start = m->m_pkthdr.csum_rx_start;
5578
5579				/*
5580				 * Perform 1's complement adjustment of octets
5581				 * that got included/excluded in the hardware-
5582				 * calculated checksum value.  Ignore cases
5583				 * where the value includes or excludes the IP
5584				 * header span, as the sum for those octets
5585				 * would already be 0xffff and thus no-op.
5586				 */
5587				if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
5588				    start != 0 && (off - start) != off) {
5589#if BYTE_ORDER != BIG_ENDIAN
5590					if (start < off) {
5591						HTONS(ip->ip_len);
5592						HTONS(ip->ip_off);
5593					}
5594#endif
5595					/* callee folds in sum */
5596					sum = m_adj_sum16(m, start, off, sum);
5597#if BYTE_ORDER != BIG_ENDIAN
5598					if (start < off) {
5599						NTOHS(ip->ip_off);
5600						NTOHS(ip->ip_len);
5601					}
5602#endif
5603				}
5604
5605				/* callee folds in sum */
5606				th->th_sum = in_pseudo(ip->ip_src.s_addr,
5607				    ip->ip_dst.s_addr,
5608				    sum + htonl(tlen + IPPROTO_TCP));
5609			}
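			/*
			 * A checksum that verifies correctly sums to
			 * 0xffff at this point, so the XOR below leaves 0
			 * on success and a non-zero value for the failure
			 * test at the end of this function.
			 */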
5610			th->th_sum ^= 0xffff;
5611		} else {
5612			uint16_t ip_sum;
5613			int len;
5614			char b[9];
5615
5616			bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
5617			bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
5618			ip_sum = ipov->ih_len;
5619			ipov->ih_len = (u_short)tlen;
5620#if BYTE_ORDER != BIG_ENDIAN
5621			HTONS(ipov->ih_len);
5622#endif
5623			len = sizeof (struct ip) + tlen;
5624			th->th_sum = in_cksum(m, len);
5625			bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));
5626			ipov->ih_len = ip_sum;
5627
5628			tcp_in_cksum_stats(len);
5629		}
5630		break;
5631	}
5632#if INET6
5633	case AF_INET6: {
5634		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
5635
5636		if (m->m_pkthdr.pkt_flags & PKTF_SW_LRO_DID_CSUM)
5637			return (0);
5638
5639		if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
5640		    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
5641		    (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
5642			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5643				th->th_sum = m->m_pkthdr.csum_rx_val;
5644			} else {
5645				uint16_t sum = m->m_pkthdr.csum_rx_val;
5646				uint16_t start = m->m_pkthdr.csum_rx_start;
5647
5648				/*
5649				 * Perform 1's complement adjustment of octets
5650				 * that got included/excluded in the hardware-
5651				 * calculated checksum value.
5652				 */
5653				if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
5654				    start != off) {
5655					uint16_t s, d;
5656
5657					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
5658						s = ip6->ip6_src.s6_addr16[1];
5659						ip6->ip6_src.s6_addr16[1] = 0;
5660					}
5661					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
5662						d = ip6->ip6_dst.s6_addr16[1];
5663						ip6->ip6_dst.s6_addr16[1] = 0;
5664					}
5665
5666					/* callee folds in sum */
5667					sum = m_adj_sum16(m, start, off, sum);
5668
5669					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
5670						ip6->ip6_src.s6_addr16[1] = s;
5671					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
5672						ip6->ip6_dst.s6_addr16[1] = d;
5673				}
5674
5675				th->th_sum = in6_pseudo(
5676				    &ip6->ip6_src, &ip6->ip6_dst,
5677				    sum + htonl(tlen + IPPROTO_TCP));
5678			}
5679			th->th_sum ^= 0xffff;
5680		} else {
5681			tcp_in6_cksum_stats(tlen);
5682			th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen);
5683		}
5684		break;
5685	}
5686#endif /* INET6 */
5687	default:
5688		VERIFY(0);
5689		/* NOTREACHED */
5690	}
5691
5692	if (th->th_sum != 0) {
5693		tcpstat.tcps_rcvbadsum++;
5694		IF_TCP_STATINC(ifp, badformat);
5695		return (-1);
5696	}
5697
5698	return (0);
5699}
5700
5701SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats,
5702    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
5703    tcp_getstat, "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
5704
5705static int
5706sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
5707{
5708#pragma unused(arg1, arg2)
5709
5710	int error, val = tcprexmtthresh;
5711
5712	error = sysctl_handle_int(oidp, &val, 0, req);
5713	if (error || !req->newptr)
5714		return (error);
5715
5716	/*
5717	 * Constrain the number of duplicate ACKs
5718	 * to consider for TCP fast retransmit
5719	 * to either 2 or 3
5720	 */
5721
5722	if (val < 2 || val > 3)
5723		return (EINVAL);
5724
5725	tcprexmtthresh = val;
5726
5727	return (0);
5728}
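/*
 * The handler above is exposed as "net.inet.tcp.rexmt_thresh" by the
 * SYSCTL_PROC registration below; e.g. "sysctl -w net.inet.tcp.rexmt_thresh=2"
 * would lower the duplicate-ACK threshold for fast retransmit to 2.
 */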
5729
5730SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
5731	&tcprexmtthresh, 0, &sysctl_rexmtthresh, "I", "Duplicate ACK Threshold for Fast Retransmit");
5732