tcp_input.c revision 206456
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/netinet/tcp_input.c 206456 2010-04-10 12:47:06Z rpaulo $");
34
35#include "opt_ipfw.h"		/* for ipfw_fwd	*/
36#include "opt_inet.h"
37#include "opt_inet6.h"
38#include "opt_ipsec.h"
39#include "opt_tcpdebug.h"
40
41#include <sys/param.h>
42#include <sys/kernel.h>
43#include <sys/malloc.h>
44#include <sys/mbuf.h>
45#include <sys/proc.h>		/* for proc0 declaration */
46#include <sys/protosw.h>
47#include <sys/signalvar.h>
48#include <sys/socket.h>
49#include <sys/socketvar.h>
50#include <sys/sysctl.h>
51#include <sys/syslog.h>
52#include <sys/systm.h>
53
54#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
55
56#include <vm/uma.h>
57
58#include <net/if.h>
59#include <net/route.h>
60#include <net/vnet.h>
61
62#define TCPSTATES		/* for logging */
63
64#include <netinet/in.h>
65#include <netinet/in_pcb.h>
66#include <netinet/in_systm.h>
67#include <netinet/in_var.h>
68#include <netinet/ip.h>
69#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
70#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
71#include <netinet/ip_var.h>
72#include <netinet/ip_options.h>
73#include <netinet/ip6.h>
74#include <netinet/icmp6.h>
75#include <netinet6/in6_pcb.h>
76#include <netinet6/ip6_var.h>
77#include <netinet6/nd6.h>
78#include <netinet/tcp.h>
79#include <netinet/tcp_fsm.h>
80#include <netinet/tcp_seq.h>
81#include <netinet/tcp_timer.h>
82#include <netinet/tcp_var.h>
83#include <netinet6/tcp6_var.h>
84#include <netinet/tcpip.h>
85#include <netinet/tcp_syncache.h>
86#ifdef TCPDEBUG
87#include <netinet/tcp_debug.h>
88#endif /* TCPDEBUG */
89
90#ifdef IPSEC
91#include <netipsec/ipsec.h>
92#include <netipsec/ipsec6.h>
93#endif /*IPSEC*/
94
95#include <machine/in_cksum.h>
96
97#include <security/mac/mac_framework.h>
98
99static const int tcprexmtthresh = 3;
100
101VNET_DEFINE(struct tcpstat, tcpstat);
102VNET_DEFINE(int, blackhole);
103VNET_DEFINE(int, tcp_delack_enabled);
104VNET_DEFINE(int, drop_synfin);
105VNET_DEFINE(int, tcp_do_rfc3042);
106VNET_DEFINE(int, tcp_do_rfc3390);
107VNET_DEFINE(int, tcp_do_ecn);
108VNET_DEFINE(int, tcp_ecn_maxretries);
109VNET_DEFINE(int, tcp_insecure_rst);
110VNET_DEFINE(int, tcp_do_autorcvbuf);
111VNET_DEFINE(int, tcp_autorcvbuf_inc);
112VNET_DEFINE(int, tcp_autorcvbuf_max);
113VNET_DEFINE(int, tcp_do_rfc3465);
114VNET_DEFINE(int, tcp_abc_l_var);
115
116SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
117    &VNET_NAME(tcpstat), tcpstat,
118    "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
119
120int tcp_log_in_vain = 0;
121SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
122    &tcp_log_in_vain, 0,
123    "Log all incoming TCP segments to closed ports");
124
125SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
126    &VNET_NAME(blackhole), 0,
127    "Do not send RST on segments to closed ports");
128
129SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
130    &VNET_NAME(tcp_delack_enabled), 0,
131    "Delay ACK to try and piggyback it onto a data packet");
132
133SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
134    &VNET_NAME(drop_synfin), 0,
135    "Drop TCP packets with SYN+FIN set");
136
137SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
138    &VNET_NAME(tcp_do_rfc3042), 0,
139    "Enable RFC 3042 (Limited Transmit)");
140
141SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
142    &VNET_NAME(tcp_do_rfc3390), 0,
143    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
144
145SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
146    &VNET_NAME(tcp_do_rfc3465), 0,
147    "Enable RFC 3465 (Appropriate Byte Counting)");
148
149SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
150    &VNET_NAME(tcp_abc_l_var), 2,
151    "Cap the max cwnd increment during slow-start to this number of segments");
152
153SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
154
155SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
156    &VNET_NAME(tcp_do_ecn), 0,
157    "TCP ECN support");
158
159SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
160    &VNET_NAME(tcp_ecn_maxretries), 0,
161    "Max retries before giving up on ECN");
162
163SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
164    &VNET_NAME(tcp_insecure_rst), 0,
165    "Follow the old (insecure) criteria for accepting RST packets");
166
167SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
168    &VNET_NAME(tcp_do_autorcvbuf), 0,
169    "Enable automatic receive buffer sizing");
170
171SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
172    &VNET_NAME(tcp_autorcvbuf_inc), 0,
173    "Incrementor step size of automatic receive buffer");
174
175SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
176    &VNET_NAME(tcp_autorcvbuf_max), 0,
177    "Max size of automatic receive buffer");
178
179int	tcp_read_locking = 1;
180SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW,
181    &tcp_read_locking, 0, "Enable read locking strategy");
182
183VNET_DEFINE(struct inpcbhead, tcb);
184VNET_DEFINE(struct inpcbinfo, tcbinfo);
185#define	tcb6	tcb  /* for KAME src sync over BSD*'s */
186
187static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
188static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
189		     struct socket *, struct tcpcb *, int, int, uint8_t,
190		     int);
191static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
192		     struct tcpcb *, int, int);
193static void	 tcp_pulloutofband(struct socket *,
194		     struct tcphdr *, struct mbuf *, int);
195static void	 tcp_xmit_timer(struct tcpcb *, int);
196static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
197static void inline
198		 tcp_congestion_exp(struct tcpcb *);
199
200/*
201 * Kernel module interface for updating tcpstat.  The argument is an index
202 * into tcpstat treated as an array of u_long.  While this encodes the
203 * general layout of tcpstat into the caller, it doesn't encode its location,
204 * so that future changes to add, for example, per-CPU stats support won't
205 * cause binary compatibility problems for kernel modules.
206 */
207void
208kmod_tcpstat_inc(int statnum)
209{
210
211	(*((u_long *)&V_tcpstat + statnum))++;
212}
213
214static void inline
215tcp_congestion_exp(struct tcpcb *tp)
216{
217	u_int win;
218
219	win = min(tp->snd_wnd, tp->snd_cwnd) /
220	    2 / tp->t_maxseg;
221	if (win < 2)
222		win = 2;
223	tp->snd_ssthresh = win * tp->t_maxseg;
224	ENTER_FASTRECOVERY(tp);
225	tp->snd_recover = tp->snd_max;
226	if (tp->t_flags & TF_ECN_PERMIT)
227		tp->t_flags |= TF_ECN_SND_CWR;
228}
229
230/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
231#ifdef INET6
232#define ND6_HINT(tp) \
233do { \
234	if ((tp) && (tp)->t_inpcb && \
235	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
236		nd6_nud_hint(NULL, NULL, 0); \
237} while (0)
238#else
239#define ND6_HINT(tp)
240#endif
241
242/*
243 * Indicate whether this ack should be delayed.  We can delay the ack if
244 *	- there is no delayed ack timer in progress and
245 *	- our last ack wasn't a 0-sized window.  We never want to delay
246 *	  the ack that opens up a 0-sized window and
247 *		- delayed acks are enabled or
248 *		- this is a half-synchronized T/TCP connection.
249 */
250#define DELAY_ACK(tp)							\
251	((!tcp_timer_active(tp, TT_DELACK) &&				\
252	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
253	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
254
255/*
256 * TCP input handling is split into multiple parts:
257 *   tcp6_input is a thin wrapper around tcp_input for the extended
258 *	ip6_protox[] call format in ip6_input
259 *   tcp_input handles primary segment validation, inpcb lookup and
260 *	SYN processing on listen sockets
261 *   tcp_do_segment processes the ACK and text of the segment for
262 *	establishing, established and closing connections
263 */
264#ifdef INET6
265int
266tcp6_input(struct mbuf **mp, int *offp, int proto)
267{
268	struct mbuf *m = *mp;
269	struct in6_ifaddr *ia6;
270
271	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
272
273	/*
274	 * draft-itojun-ipv6-tcp-to-anycast
275	 * better place to put this in?
276	 */
277	ia6 = ip6_getdstifaddr(m);
278	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
279		struct ip6_hdr *ip6;
280
281		ifa_free(&ia6->ia_ifa);
282		ip6 = mtod(m, struct ip6_hdr *);
283		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
284			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
285		return IPPROTO_DONE;
286	}
287
288	tcp_input(m, *offp);
289	return IPPROTO_DONE;
290}
291#endif
292
293void
294tcp_input(struct mbuf *m, int off0)
295{
296	struct tcphdr *th;
297	struct ip *ip = NULL;
298	struct ipovly *ipov;
299	struct inpcb *inp = NULL;
300	struct tcpcb *tp = NULL;
301	struct socket *so = NULL;
302	u_char *optp = NULL;
303	int optlen = 0;
304	int len, tlen, off;
305	int drop_hdrlen;
306	int thflags;
307	int rstreason = 0;	/* For badport_bandlim accounting purposes */
308	uint8_t iptos;
309#ifdef IPFIREWALL_FORWARD
310	struct m_tag *fwd_tag;
311#endif
312#ifdef INET6
313	struct ip6_hdr *ip6 = NULL;
314	int isipv6;
315#else
316	const void *ip6 = NULL;
317	const int isipv6 = 0;
318#endif
319	struct tcpopt to;		/* options in this segment */
320	char *s = NULL;			/* address and port logging */
321	int ti_locked;
322#define	TI_UNLOCKED	1
323#define	TI_RLOCKED	2
324#define	TI_WLOCKED	3
325
326#ifdef TCPDEBUG
327	/*
328	 * The size of tcp_saveipgen must be the size of the max ip header,
329	 * now IPv6.
330	 */
331	u_char tcp_saveipgen[IP6_HDR_LEN];
332	struct tcphdr tcp_savetcp;
333	short ostate = 0;
334#endif
335
336#ifdef INET6
337	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
338#endif
339
340	to.to_flags = 0;
341	TCPSTAT_INC(tcps_rcvtotal);
342
343	if (isipv6) {
344#ifdef INET6
345		/* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */
346		ip6 = mtod(m, struct ip6_hdr *);
347		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
348		if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
349			TCPSTAT_INC(tcps_rcvbadsum);
350			goto drop;
351		}
352		th = (struct tcphdr *)((caddr_t)ip6 + off0);
353
354		/*
355		 * Be proactive about unspecified IPv6 address in source.
356		 * As we use all-zero to indicate unbounded/unconnected pcb,
357		 * unspecified IPv6 address can be used to confuse us.
358		 *
359		 * Note that packets with unspecified IPv6 destination is
360		 * already dropped in ip6_input.
361		 */
362		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
363			/* XXX stat */
364			goto drop;
365		}
366#else
367		th = NULL;		/* XXX: Avoid compiler warning. */
368#endif
369	} else {
370		/*
371		 * Get IP and TCP header together in first mbuf.
372		 * Note: IP leaves IP header in first mbuf.
373		 */
374		if (off0 > sizeof (struct ip)) {
375			ip_stripoptions(m, (struct mbuf *)0);
376			off0 = sizeof(struct ip);
377		}
378		if (m->m_len < sizeof (struct tcpiphdr)) {
379			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
380			    == NULL) {
381				TCPSTAT_INC(tcps_rcvshort);
382				return;
383			}
384		}
385		ip = mtod(m, struct ip *);
386		ipov = (struct ipovly *)ip;
387		th = (struct tcphdr *)((caddr_t)ip + off0);
388		tlen = ip->ip_len;
389
390		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
391			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
392				th->th_sum = m->m_pkthdr.csum_data;
393			else
394				th->th_sum = in_pseudo(ip->ip_src.s_addr,
395						ip->ip_dst.s_addr,
396						htonl(m->m_pkthdr.csum_data +
397							ip->ip_len +
398							IPPROTO_TCP));
399			th->th_sum ^= 0xffff;
400#ifdef TCPDEBUG
401			ipov->ih_len = (u_short)tlen;
402			ipov->ih_len = htons(ipov->ih_len);
403#endif
404		} else {
405			/*
406			 * Checksum extended TCP header and data.
407			 */
408			len = sizeof (struct ip) + tlen;
409			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
410			ipov->ih_len = (u_short)tlen;
411			ipov->ih_len = htons(ipov->ih_len);
412			th->th_sum = in_cksum(m, len);
413		}
414		if (th->th_sum) {
415			TCPSTAT_INC(tcps_rcvbadsum);
416			goto drop;
417		}
418		/* Re-initialization for later version check */
419		ip->ip_v = IPVERSION;
420	}
421
422#ifdef INET6
423	if (isipv6)
424		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
425	else
426#endif
427		iptos = ip->ip_tos;
428
429	/*
430	 * Check that TCP offset makes sense,
431	 * pull out TCP options and adjust length.		XXX
432	 */
433	off = th->th_off << 2;
434	if (off < sizeof (struct tcphdr) || off > tlen) {
435		TCPSTAT_INC(tcps_rcvbadoff);
436		goto drop;
437	}
438	tlen -= off;	/* tlen is used instead of ti->ti_len */
439	if (off > sizeof (struct tcphdr)) {
440		if (isipv6) {
441#ifdef INET6
442			IP6_EXTHDR_CHECK(m, off0, off, );
443			ip6 = mtod(m, struct ip6_hdr *);
444			th = (struct tcphdr *)((caddr_t)ip6 + off0);
445#endif
446		} else {
447			if (m->m_len < sizeof(struct ip) + off) {
448				if ((m = m_pullup(m, sizeof (struct ip) + off))
449				    == NULL) {
450					TCPSTAT_INC(tcps_rcvshort);
451					return;
452				}
453				ip = mtod(m, struct ip *);
454				ipov = (struct ipovly *)ip;
455				th = (struct tcphdr *)((caddr_t)ip + off0);
456			}
457		}
458		optlen = off - sizeof (struct tcphdr);
459		optp = (u_char *)(th + 1);
460	}
461	thflags = th->th_flags;
462
463	/*
464	 * Convert TCP protocol specific fields to host format.
465	 */
466	th->th_seq = ntohl(th->th_seq);
467	th->th_ack = ntohl(th->th_ack);
468	th->th_win = ntohs(th->th_win);
469	th->th_urp = ntohs(th->th_urp);
470
471	/*
472	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
473	 */
474	drop_hdrlen = off0 + off;
475
476	/*
477	 * Locate pcb for segment, which requires a lock on tcbinfo.
478	 * Optimisticaly acquire a global read lock rather than a write lock
479	 * unless header flags necessarily imply a state change.  There are
480	 * two cases where we might discover later we need a write lock
481	 * despite the flags: ACKs moving a connection out of the syncache,
482	 * and ACKs for a connection in TIMEWAIT.
483	 */
484	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
485	    tcp_read_locking == 0) {
486		INP_INFO_WLOCK(&V_tcbinfo);
487		ti_locked = TI_WLOCKED;
488	} else {
489		INP_INFO_RLOCK(&V_tcbinfo);
490		ti_locked = TI_RLOCKED;
491	}
492
493findpcb:
494#ifdef INVARIANTS
495	if (ti_locked == TI_RLOCKED)
496		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
497	else if (ti_locked == TI_WLOCKED)
498		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
499	else
500		panic("%s: findpcb ti_locked %d\n", __func__, ti_locked);
501#endif
502
503#ifdef IPFIREWALL_FORWARD
504	/*
505	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
506	 */
507	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
508
509	if (fwd_tag != NULL && isipv6 == 0) {	/* IPv6 support is not yet */
510		struct sockaddr_in *next_hop;
511
512		next_hop = (struct sockaddr_in *)(fwd_tag+1);
513		/*
514		 * Transparently forwarded. Pretend to be the destination.
515		 * already got one like this?
516		 */
517		inp = in_pcblookup_hash(&V_tcbinfo,
518					ip->ip_src, th->th_sport,
519					ip->ip_dst, th->th_dport,
520					0, m->m_pkthdr.rcvif);
521		if (!inp) {
522			/* It's new.  Try to find the ambushing socket. */
523			inp = in_pcblookup_hash(&V_tcbinfo,
524						ip->ip_src, th->th_sport,
525						next_hop->sin_addr,
526						next_hop->sin_port ?
527						    ntohs(next_hop->sin_port) :
528						    th->th_dport,
529						INPLOOKUP_WILDCARD,
530						m->m_pkthdr.rcvif);
531		}
532		/* Remove the tag from the packet.  We don't need it anymore. */
533		m_tag_delete(m, fwd_tag);
534	} else
535#endif /* IPFIREWALL_FORWARD */
536	{
537		if (isipv6) {
538#ifdef INET6
539			inp = in6_pcblookup_hash(&V_tcbinfo,
540						 &ip6->ip6_src, th->th_sport,
541						 &ip6->ip6_dst, th->th_dport,
542						 INPLOOKUP_WILDCARD,
543						 m->m_pkthdr.rcvif);
544#endif
545		} else
546			inp = in_pcblookup_hash(&V_tcbinfo,
547						ip->ip_src, th->th_sport,
548						ip->ip_dst, th->th_dport,
549						INPLOOKUP_WILDCARD,
550						m->m_pkthdr.rcvif);
551	}
552
553	/*
554	 * If the INPCB does not exist then all data in the incoming
555	 * segment is discarded and an appropriate RST is sent back.
556	 * XXX MRT Send RST using which routing table?
557	 */
558	if (inp == NULL) {
559		/*
560		 * Log communication attempts to ports that are not
561		 * in use.
562		 */
563		if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
564		    tcp_log_in_vain == 2) {
565			if ((s = tcp_log_addrs(NULL, th, (void *)ip, ip6)))
566				log(LOG_INFO, "%s; %s: Connection attempt "
567				    "to closed port\n", s, __func__);
568		}
569		/*
570		 * When blackholing do not respond with a RST but
571		 * completely ignore the segment and drop it.
572		 */
573		if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
574		    V_blackhole == 2)
575			goto dropunlock;
576
577		rstreason = BANDLIM_RST_CLOSEDPORT;
578		goto dropwithreset;
579	}
580	INP_WLOCK(inp);
581	if (!(inp->inp_flags & INP_HW_FLOWID)
582	    && (m->m_flags & M_FLOWID)
583	    && ((inp->inp_socket == NULL)
584		|| !(inp->inp_socket->so_options & SO_ACCEPTCONN))) {
585		inp->inp_flags |= INP_HW_FLOWID;
586		inp->inp_flags &= ~INP_SW_FLOWID;
587		inp->inp_flowid = m->m_pkthdr.flowid;
588	}
589#ifdef IPSEC
590#ifdef INET6
591	if (isipv6 && ipsec6_in_reject(m, inp)) {
592		V_ipsec6stat.in_polvio++;
593		goto dropunlock;
594	} else
595#endif /* INET6 */
596	if (ipsec4_in_reject(m, inp) != 0) {
597		V_ipsec4stat.in_polvio++;
598		goto dropunlock;
599	}
600#endif /* IPSEC */
601
602	/*
603	 * Check the minimum TTL for socket.
604	 */
605	if (inp->inp_ip_minttl != 0) {
606#ifdef INET6
607		if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim)
608			goto dropunlock;
609		else
610#endif
611		if (inp->inp_ip_minttl > ip->ip_ttl)
612			goto dropunlock;
613	}
614
615	/*
616	 * A previous connection in TIMEWAIT state is supposed to catch stray
617	 * or duplicate segments arriving late.  If this segment was a
618	 * legitimate new connection attempt the old INPCB gets removed and
619	 * we can try again to find a listening socket.
620	 *
621	 * At this point, due to earlier optimism, we may hold a read lock on
622	 * the inpcbinfo, rather than a write lock.  If so, we need to
623	 * upgrade, or if that fails, acquire a reference on the inpcb, drop
624	 * all locks, acquire a global write lock, and then re-acquire the
625	 * inpcb lock.  We may at that point discover that another thread has
626	 * tried to free the inpcb, in which case we need to loop back and
627	 * try to find a new inpcb to deliver to.
628	 */
629relocked:
630	if (inp->inp_flags & INP_TIMEWAIT) {
631		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
632		    ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked));
633
634		if (ti_locked == TI_RLOCKED) {
635			if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) {
636				in_pcbref(inp);
637				INP_WUNLOCK(inp);
638				INP_INFO_RUNLOCK(&V_tcbinfo);
639				INP_INFO_WLOCK(&V_tcbinfo);
640				ti_locked = TI_WLOCKED;
641				INP_WLOCK(inp);
642				if (in_pcbrele(inp)) {
643					inp = NULL;
644					goto findpcb;
645				}
646			} else
647				ti_locked = TI_WLOCKED;
648		}
649		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
650
651		if (thflags & TH_SYN)
652			tcp_dooptions(&to, optp, optlen, TO_SYN);
653		/*
654		 * NB: tcp_twcheck unlocks the INP and frees the mbuf.
655		 */
656		if (tcp_twcheck(inp, &to, th, m, tlen))
657			goto findpcb;
658		INP_INFO_WUNLOCK(&V_tcbinfo);
659		return;
660	}
661	/*
662	 * The TCPCB may no longer exist if the connection is winding
663	 * down or it is in the CLOSED state.  Either way we drop the
664	 * segment and send an appropriate response.
665	 */
666	tp = intotcpcb(inp);
667	if (tp == NULL || tp->t_state == TCPS_CLOSED) {
668		rstreason = BANDLIM_RST_CLOSEDPORT;
669		goto dropwithreset;
670	}
671
672	/*
673	 * We've identified a valid inpcb, but it could be that we need an
674	 * inpcbinfo write lock and have only a read lock.  In this case,
675	 * attempt to upgrade/relock using the same strategy as the TIMEWAIT
676	 * case above.  If we relock, we have to jump back to 'relocked' as
677	 * the connection might now be in TIMEWAIT.
678	 */
679	if (tp->t_state != TCPS_ESTABLISHED ||
680	    (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
681	    tcp_read_locking == 0) {
682		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
683		    ("%s: upgrade check ti_locked %d", __func__, ti_locked));
684
685		if (ti_locked == TI_RLOCKED) {
686			if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) {
687				in_pcbref(inp);
688				INP_WUNLOCK(inp);
689				INP_INFO_RUNLOCK(&V_tcbinfo);
690				INP_INFO_WLOCK(&V_tcbinfo);
691				ti_locked = TI_WLOCKED;
692				INP_WLOCK(inp);
693				if (in_pcbrele(inp)) {
694					inp = NULL;
695					goto findpcb;
696				}
697				goto relocked;
698			} else
699				ti_locked = TI_WLOCKED;
700		}
701		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
702	}
703
704#ifdef MAC
705	INP_WLOCK_ASSERT(inp);
706	if (mac_inpcb_check_deliver(inp, m))
707		goto dropunlock;
708#endif
709	so = inp->inp_socket;
710	KASSERT(so != NULL, ("%s: so == NULL", __func__));
711#ifdef TCPDEBUG
712	if (so->so_options & SO_DEBUG) {
713		ostate = tp->t_state;
714		if (isipv6) {
715#ifdef INET6
716			bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
717#endif
718		} else
719			bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
720		tcp_savetcp = *th;
721	}
722#endif
723	/*
724	 * When the socket is accepting connections (the INPCB is in LISTEN
725	 * state) we look into the SYN cache if this is a new connection
726	 * attempt or the completion of a previous one.
727	 */
728	if (so->so_options & SO_ACCEPTCONN) {
729		struct in_conninfo inc;
730
731		KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
732		    "tp not listening", __func__));
733
734		bzero(&inc, sizeof(inc));
735#ifdef INET6
736		if (isipv6) {
737			inc.inc_flags |= INC_ISIPV6;
738			inc.inc6_faddr = ip6->ip6_src;
739			inc.inc6_laddr = ip6->ip6_dst;
740		} else
741#endif
742		{
743			inc.inc_faddr = ip->ip_src;
744			inc.inc_laddr = ip->ip_dst;
745		}
746		inc.inc_fport = th->th_sport;
747		inc.inc_lport = th->th_dport;
748		inc.inc_fibnum = so->so_fibnum;
749
750		/*
751		 * Check for an existing connection attempt in syncache if
752		 * the flag is only ACK.  A successful lookup creates a new
753		 * socket appended to the listen queue in SYN_RECEIVED state.
754		 */
755		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
756			/*
757			 * Parse the TCP options here because
758			 * syncookies need access to the reflected
759			 * timestamp.
760			 */
761			tcp_dooptions(&to, optp, optlen, 0);
762			/*
763			 * NB: syncache_expand() doesn't unlock
764			 * inp and tcpinfo locks.
765			 */
766			if (!syncache_expand(&inc, &to, th, &so, m)) {
767				/*
768				 * No syncache entry or ACK was not
769				 * for our SYN/ACK.  Send a RST.
770				 * NB: syncache did its own logging
771				 * of the failure cause.
772				 */
773				rstreason = BANDLIM_RST_OPENPORT;
774				goto dropwithreset;
775			}
776			if (so == NULL) {
777				/*
778				 * We completed the 3-way handshake
779				 * but could not allocate a socket
780				 * either due to memory shortage,
781				 * listen queue length limits or
782				 * global socket limits.  Send RST
783				 * or wait and have the remote end
784				 * retransmit the ACK for another
785				 * try.
786				 */
787				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
788					log(LOG_DEBUG, "%s; %s: Listen socket: "
789					    "Socket allocation failed due to "
790					    "limits or memory shortage, %s\n",
791					    s, __func__,
792					    V_tcp_sc_rst_sock_fail ?
793					    "sending RST" : "try again");
794				if (V_tcp_sc_rst_sock_fail) {
795					rstreason = BANDLIM_UNLIMITED;
796					goto dropwithreset;
797				} else
798					goto dropunlock;
799			}
800			/*
801			 * Socket is created in state SYN_RECEIVED.
802			 * Unlock the listen socket, lock the newly
803			 * created socket and update the tp variable.
804			 */
805			INP_WUNLOCK(inp);	/* listen socket */
806			inp = sotoinpcb(so);
807			INP_WLOCK(inp);		/* new connection */
808			tp = intotcpcb(inp);
809			KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
810			    ("%s: ", __func__));
811			/*
812			 * Process the segment and the data it
813			 * contains.  tcp_do_segment() consumes
814			 * the mbuf chain and unlocks the inpcb.
815			 */
816			tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
817			    iptos, ti_locked);
818			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
819			return;
820		}
821		/*
822		 * Segment flag validation for new connection attempts:
823		 *
824		 * Our (SYN|ACK) response was rejected.
825		 * Check with syncache and remove entry to prevent
826		 * retransmits.
827		 *
828		 * NB: syncache_chkrst does its own logging of failure
829		 * causes.
830		 */
831		if (thflags & TH_RST) {
832			syncache_chkrst(&inc, th);
833			goto dropunlock;
834		}
835		/*
836		 * We can't do anything without SYN.
837		 */
838		if ((thflags & TH_SYN) == 0) {
839			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
840				log(LOG_DEBUG, "%s; %s: Listen socket: "
841				    "SYN is missing, segment ignored\n",
842				    s, __func__);
843			TCPSTAT_INC(tcps_badsyn);
844			goto dropunlock;
845		}
846		/*
847		 * (SYN|ACK) is bogus on a listen socket.
848		 */
849		if (thflags & TH_ACK) {
850			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
851				log(LOG_DEBUG, "%s; %s: Listen socket: "
852				    "SYN|ACK invalid, segment rejected\n",
853				    s, __func__);
854			syncache_badack(&inc);	/* XXX: Not needed! */
855			TCPSTAT_INC(tcps_badsyn);
856			rstreason = BANDLIM_RST_OPENPORT;
857			goto dropwithreset;
858		}
859		/*
860		 * If the drop_synfin option is enabled, drop all
861		 * segments with both the SYN and FIN bits set.
862		 * This prevents e.g. nmap from identifying the
863		 * TCP/IP stack.
864		 * XXX: Poor reasoning.  nmap has other methods
865		 * and is constantly refining its stack detection
866		 * strategies.
867		 * XXX: This is a violation of the TCP specification
868		 * and was used by RFC1644.
869		 */
870		if ((thflags & TH_FIN) && V_drop_synfin) {
871			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
872				log(LOG_DEBUG, "%s; %s: Listen socket: "
873				    "SYN|FIN segment ignored (based on "
874				    "sysctl setting)\n", s, __func__);
875			TCPSTAT_INC(tcps_badsyn);
876                	goto dropunlock;
877		}
878		/*
879		 * Segment's flags are (SYN) or (SYN|FIN).
880		 *
881		 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
882		 * as they do not affect the state of the TCP FSM.
883		 * The data pointed to by TH_URG and th_urp is ignored.
884		 */
885		KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
886		    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
887		KASSERT(thflags & (TH_SYN),
888		    ("%s: Listen socket: TH_SYN not set", __func__));
889#ifdef INET6
890		/*
891		 * If deprecated address is forbidden,
892		 * we do not accept SYN to deprecated interface
893		 * address to prevent any new inbound connection from
894		 * getting established.
895		 * When we do not accept SYN, we send a TCP RST,
896		 * with deprecated source address (instead of dropping
897		 * it).  We compromise it as it is much better for peer
898		 * to send a RST, and RST will be the final packet
899		 * for the exchange.
900		 *
901		 * If we do not forbid deprecated addresses, we accept
902		 * the SYN packet.  RFC2462 does not suggest dropping
903		 * SYN in this case.
904		 * If we decipher RFC2462 5.5.4, it says like this:
905		 * 1. use of deprecated addr with existing
906		 *    communication is okay - "SHOULD continue to be
907		 *    used"
908		 * 2. use of it with new communication:
909		 *   (2a) "SHOULD NOT be used if alternate address
910		 *        with sufficient scope is available"
911		 *   (2b) nothing mentioned otherwise.
912		 * Here we fall into (2b) case as we have no choice in
913		 * our source address selection - we must obey the peer.
914		 *
915		 * The wording in RFC2462 is confusing, and there are
916		 * multiple description text for deprecated address
917		 * handling - worse, they are not exactly the same.
918		 * I believe 5.5.4 is the best one, so we follow 5.5.4.
919		 */
920		if (isipv6 && !V_ip6_use_deprecated) {
921			struct in6_ifaddr *ia6;
922
923			ia6 = ip6_getdstifaddr(m);
924			if (ia6 != NULL &&
925			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
926				ifa_free(&ia6->ia_ifa);
927				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
928				    log(LOG_DEBUG, "%s; %s: Listen socket: "
929					"Connection attempt to deprecated "
930					"IPv6 address rejected\n",
931					s, __func__);
932				rstreason = BANDLIM_RST_OPENPORT;
933				goto dropwithreset;
934			}
935			ifa_free(&ia6->ia_ifa);
936		}
937#endif
938		/*
939		 * Basic sanity checks on incoming SYN requests:
940		 *   Don't respond if the destination is a link layer
941		 *	broadcast according to RFC1122 4.2.3.10, p. 104.
942		 *   If it is from this socket it must be forged.
943		 *   Don't respond if the source or destination is a
944		 *	global or subnet broad- or multicast address.
945		 *   Note that it is quite possible to receive unicast
946		 *	link-layer packets with a broadcast IP address. Use
947		 *	in_broadcast() to find them.
948		 */
949		if (m->m_flags & (M_BCAST|M_MCAST)) {
950			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
951			    log(LOG_DEBUG, "%s; %s: Listen socket: "
952				"Connection attempt from broad- or multicast "
953				"link layer address ignored\n", s, __func__);
954			goto dropunlock;
955		}
956		if (isipv6) {
957#ifdef INET6
958			if (th->th_dport == th->th_sport &&
959			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
960				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
961				    log(LOG_DEBUG, "%s; %s: Listen socket: "
962					"Connection attempt to/from self "
963					"ignored\n", s, __func__);
964				goto dropunlock;
965			}
966			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
967			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
968				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
969				    log(LOG_DEBUG, "%s; %s: Listen socket: "
970					"Connection attempt from/to multicast "
971					"address ignored\n", s, __func__);
972				goto dropunlock;
973			}
974#endif
975		} else {
976			if (th->th_dport == th->th_sport &&
977			    ip->ip_dst.s_addr == ip->ip_src.s_addr) {
978				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
979				    log(LOG_DEBUG, "%s; %s: Listen socket: "
980					"Connection attempt from/to self "
981					"ignored\n", s, __func__);
982				goto dropunlock;
983			}
984			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
985			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
986			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
987			    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
988				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
989				    log(LOG_DEBUG, "%s; %s: Listen socket: "
990					"Connection attempt from/to broad- "
991					"or multicast address ignored\n",
992					s, __func__);
993				goto dropunlock;
994			}
995		}
996		/*
997		 * SYN appears to be valid.  Create compressed TCP state
998		 * for syncache.
999		 */
1000#ifdef TCPDEBUG
1001		if (so->so_options & SO_DEBUG)
1002			tcp_trace(TA_INPUT, ostate, tp,
1003			    (void *)tcp_saveipgen, &tcp_savetcp, 0);
1004#endif
1005		tcp_dooptions(&to, optp, optlen, TO_SYN);
1006		syncache_add(&inc, &to, th, inp, &so, m);
1007		/*
1008		 * Entry added to syncache and mbuf consumed.
1009		 * Everything already unlocked by syncache_add().
1010		 */
1011		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1012		return;
1013	}
1014
1015	/*
1016	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
1017	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
1018	 * the inpcb, and unlocks pcbinfo.
1019	 */
1020	tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
1021	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1022	return;
1023
1024dropwithreset:
1025	if (ti_locked == TI_RLOCKED)
1026		INP_INFO_RUNLOCK(&V_tcbinfo);
1027	else if (ti_locked == TI_WLOCKED)
1028		INP_INFO_WUNLOCK(&V_tcbinfo);
1029	else
1030		panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
1031	ti_locked = TI_UNLOCKED;
1032
1033	if (inp != NULL) {
1034		tcp_dropwithreset(m, th, tp, tlen, rstreason);
1035		INP_WUNLOCK(inp);
1036	} else
1037		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1038	m = NULL;	/* mbuf chain got consumed. */
1039	goto drop;
1040
1041dropunlock:
1042	if (ti_locked == TI_RLOCKED)
1043		INP_INFO_RUNLOCK(&V_tcbinfo);
1044	else if (ti_locked == TI_WLOCKED)
1045		INP_INFO_WUNLOCK(&V_tcbinfo);
1046	else
1047		panic("%s: dropunlock ti_locked %d", __func__, ti_locked);
1048	ti_locked = TI_UNLOCKED;
1049
1050	if (inp != NULL)
1051		INP_WUNLOCK(inp);
1052
1053drop:
1054	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1055	if (s != NULL)
1056		free(s, M_TCPLOG);
1057	if (m != NULL)
1058		m_freem(m);
1059}
1060
1061static void
1062tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
1063    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
1064    int ti_locked)
1065{
1066	int thflags, acked, ourfinisacked, needoutput = 0;
1067	int rstreason, todrop, win;
1068	u_long tiwin;
1069	struct tcpopt to;
1070
1071#ifdef TCPDEBUG
1072	/*
1073	 * The size of tcp_saveipgen must be the size of the max ip header,
1074	 * now IPv6.
1075	 */
1076	u_char tcp_saveipgen[IP6_HDR_LEN];
1077	struct tcphdr tcp_savetcp;
1078	short ostate = 0;
1079#endif
1080	thflags = th->th_flags;
1081
1082	/*
1083	 * If this is either a state-changing packet or current state isn't
1084	 * established, we require a write lock on tcbinfo.  Otherwise, we
1085	 * allow either a read lock or a write lock, as we may have acquired
1086	 * a write lock due to a race.
1087	 *
1088	 * Require a global write lock for SYN/FIN/RST segments or
1089	 * non-established connections; otherwise accept either a read or
1090	 * write lock, as we may have conservatively acquired a write lock in
1091	 * certain cases in tcp_input() (is this still true?).  Currently we
1092	 * will never enter with no lock, so we try to drop it quickly in the
1093	 * common pure ack/pure data cases.
1094	 */
1095	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
1096	    tp->t_state != TCPS_ESTABLISHED) {
1097		KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
1098		    "SYN/FIN/RST/!EST", __func__, ti_locked));
1099		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
1100	} else {
1101#ifdef INVARIANTS
1102		if (ti_locked == TI_RLOCKED)
1103			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1104		else if (ti_locked == TI_WLOCKED)
1105			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
1106		else
1107			panic("%s: ti_locked %d for EST", __func__,
1108			    ti_locked);
1109#endif
1110	}
1111	INP_WLOCK_ASSERT(tp->t_inpcb);
1112	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
1113	    __func__));
1114	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
1115	    __func__));
1116
1117	/*
1118	 * Segment received on connection.
1119	 * Reset idle time and keep-alive timer.
1120	 * XXX: This should be done after segment
1121	 * validation to ignore broken/spoofed segs.
1122	 */
1123	tp->t_rcvtime = ticks;
1124	if (TCPS_HAVEESTABLISHED(tp->t_state))
1125		tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
1126
1127	/*
1128	 * Unscale the window into a 32-bit value.
1129	 * For the SYN_SENT state the scale is zero.
1130	 */
1131	tiwin = th->th_win << tp->snd_scale;
1132
1133	/*
1134	 * TCP ECN processing.
1135	 */
1136	if (tp->t_flags & TF_ECN_PERMIT) {
1137		if (thflags & TH_CWR)
1138			tp->t_flags &= ~TF_ECN_SND_ECE;
1139		switch (iptos & IPTOS_ECN_MASK) {
1140		case IPTOS_ECN_CE:
1141			tp->t_flags |= TF_ECN_SND_ECE;
1142			TCPSTAT_INC(tcps_ecn_ce);
1143			break;
1144		case IPTOS_ECN_ECT0:
1145			TCPSTAT_INC(tcps_ecn_ect0);
1146			break;
1147		case IPTOS_ECN_ECT1:
1148			TCPSTAT_INC(tcps_ecn_ect1);
1149			break;
1150		}
1151		/*
1152		 * Congestion experienced.
1153		 * Ignore if we are already trying to recover.
1154		 */
1155		if ((thflags & TH_ECE) &&
1156		    SEQ_LEQ(th->th_ack, tp->snd_recover)) {
1157			TCPSTAT_INC(tcps_ecn_rcwnd);
1158			tcp_congestion_exp(tp);
1159		}
1160	}
1161
1162	/*
1163	 * Parse options on any incoming segment.
1164	 */
1165	tcp_dooptions(&to, (u_char *)(th + 1),
1166	    (th->th_off << 2) - sizeof(struct tcphdr),
1167	    (thflags & TH_SYN) ? TO_SYN : 0);
1168
1169	/*
1170	 * If echoed timestamp is later than the current time,
1171	 * fall back to non RFC1323 RTT calculation.  Normalize
1172	 * timestamp if syncookies were used when this connection
1173	 * was established.
1174	 */
1175	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
1176		to.to_tsecr -= tp->ts_offset;
1177		if (TSTMP_GT(to.to_tsecr, ticks))
1178			to.to_tsecr = 0;
1179	}
1180
1181	/*
1182	 * Process options only when we get SYN/ACK back. The SYN case
1183	 * for incoming connections is handled in tcp_syncache.
1184	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1185	 * or <SYN,ACK>) segment itself is never scaled.
1186	 * XXX this is traditional behavior, may need to be cleaned up.
1187	 */
1188	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1189		if ((to.to_flags & TOF_SCALE) &&
1190		    (tp->t_flags & TF_REQ_SCALE)) {
1191			tp->t_flags |= TF_RCVD_SCALE;
1192			tp->snd_scale = to.to_wscale;
1193		}
1194		/*
1195		 * Initial send window.  It will be updated with
1196		 * the next incoming segment to the scaled value.
1197		 */
1198		tp->snd_wnd = th->th_win;
1199		if (to.to_flags & TOF_TS) {
1200			tp->t_flags |= TF_RCVD_TSTMP;
1201			tp->ts_recent = to.to_tsval;
1202			tp->ts_recent_age = ticks;
1203		}
1204		if (to.to_flags & TOF_MSS)
1205			tcp_mss(tp, to.to_mss);
1206		if ((tp->t_flags & TF_SACK_PERMIT) &&
1207		    (to.to_flags & TOF_SACKPERM) == 0)
1208			tp->t_flags &= ~TF_SACK_PERMIT;
1209	}
1210
1211	/*
1212	 * Header prediction: check for the two common cases
1213	 * of a uni-directional data xfer.  If the packet has
1214	 * no control flags, is in-sequence, the window didn't
1215	 * change and we're not retransmitting, it's a
1216	 * candidate.  If the length is zero and the ack moved
1217	 * forward, we're the sender side of the xfer.  Just
1218	 * free the data acked & wake any higher level process
1219	 * that was blocked waiting for space.  If the length
1220	 * is non-zero and the ack didn't move, we're the
1221	 * receiver side.  If we're getting packets in-order
1222	 * (the reassembly queue is empty), add the data to
1223	 * the socket buffer and note that we need a delayed ack.
1224	 * Make sure that the hidden state-flags are also off.
1225	 * Since we check for TCPS_ESTABLISHED first, it can only
1226	 * be TH_NEEDSYN.
1227	 */
1228	if (tp->t_state == TCPS_ESTABLISHED &&
1229	    th->th_seq == tp->rcv_nxt &&
1230	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1231	    tp->snd_nxt == tp->snd_max &&
1232	    tiwin && tiwin == tp->snd_wnd &&
1233	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1234	    LIST_EMPTY(&tp->t_segq) &&
1235	    ((to.to_flags & TOF_TS) == 0 ||
1236	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
1237
1238		/*
1239		 * If last ACK falls within this segment's sequence numbers,
1240		 * record the timestamp.
1241		 * NOTE that the test is modified according to the latest
1242		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1243		 */
1244		if ((to.to_flags & TOF_TS) != 0 &&
1245		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1246			tp->ts_recent_age = ticks;
1247			tp->ts_recent = to.to_tsval;
1248		}
1249
1250		if (tlen == 0) {
1251			if (SEQ_GT(th->th_ack, tp->snd_una) &&
1252			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
1253			    tp->snd_cwnd >= tp->snd_wnd &&
1254			    ((!V_tcp_do_newreno &&
1255			      !(tp->t_flags & TF_SACK_PERMIT) &&
1256			      tp->t_dupacks < tcprexmtthresh) ||
1257			     ((V_tcp_do_newreno ||
1258			       (tp->t_flags & TF_SACK_PERMIT)) &&
1259			      !IN_FASTRECOVERY(tp) &&
1260			      (to.to_flags & TOF_SACK) == 0 &&
1261			      TAILQ_EMPTY(&tp->snd_holes)))) {
1262				/*
1263				 * This is a pure ack for outstanding data.
1264				 */
1265				if (ti_locked == TI_RLOCKED)
1266					INP_INFO_RUNLOCK(&V_tcbinfo);
1267				else if (ti_locked == TI_WLOCKED)
1268					INP_INFO_WUNLOCK(&V_tcbinfo);
1269				else
1270					panic("%s: ti_locked %d on pure ACK",
1271					    __func__, ti_locked);
1272				ti_locked = TI_UNLOCKED;
1273
1274				TCPSTAT_INC(tcps_predack);
1275
1276				/*
1277				 * "bad retransmit" recovery.
1278				 */
1279				if (tp->t_rxtshift == 1 &&
1280				    (int)(ticks - tp->t_badrxtwin) < 0) {
1281					TCPSTAT_INC(tcps_sndrexmitbad);
1282					tp->snd_cwnd = tp->snd_cwnd_prev;
1283					tp->snd_ssthresh =
1284					    tp->snd_ssthresh_prev;
1285					tp->snd_recover = tp->snd_recover_prev;
1286					if (tp->t_flags & TF_WASFRECOVERY)
1287					    ENTER_FASTRECOVERY(tp);
1288					tp->snd_nxt = tp->snd_max;
1289					tp->t_badrxtwin = 0;
1290				}
1291
1292				/*
1293				 * Recalculate the transmit timer / rtt.
1294				 *
1295				 * Some boxes send broken timestamp replies
1296				 * during the SYN+ACK phase, ignore
1297				 * timestamps of 0 or we could calculate a
1298				 * huge RTT and blow up the retransmit timer.
1299				 */
1300				if ((to.to_flags & TOF_TS) != 0 &&
1301				    to.to_tsecr) {
1302					if (!tp->t_rttlow ||
1303					    tp->t_rttlow > ticks - to.to_tsecr)
1304						tp->t_rttlow = ticks - to.to_tsecr;
1305					tcp_xmit_timer(tp,
1306					    ticks - to.to_tsecr + 1);
1307				} else if (tp->t_rtttime &&
1308				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
1309					if (!tp->t_rttlow ||
1310					    tp->t_rttlow > ticks - tp->t_rtttime)
1311						tp->t_rttlow = ticks - tp->t_rtttime;
1312					tcp_xmit_timer(tp,
1313							ticks - tp->t_rtttime);
1314				}
1315				tcp_xmit_bandwidth_limit(tp, th->th_ack);
1316				acked = th->th_ack - tp->snd_una;
1317				TCPSTAT_INC(tcps_rcvackpack);
1318				TCPSTAT_ADD(tcps_rcvackbyte, acked);
1319				sbdrop(&so->so_snd, acked);
1320				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
1321				    SEQ_LEQ(th->th_ack, tp->snd_recover))
1322					tp->snd_recover = th->th_ack - 1;
1323				tp->snd_una = th->th_ack;
1324				/*
1325				 * Pull snd_wl2 up to prevent seq wrap relative
1326				 * to th_ack.
1327				 */
1328				tp->snd_wl2 = th->th_ack;
1329				tp->t_dupacks = 0;
1330				m_freem(m);
1331				ND6_HINT(tp); /* Some progress has been made. */
1332
1333				/*
1334				 * If all outstanding data are acked, stop
1335				 * retransmit timer, otherwise restart timer
1336				 * using current (possibly backed-off) value.
1337				 * If process is waiting for space,
1338				 * wakeup/selwakeup/signal.  If data
1339				 * are ready to send, let tcp_output
1340				 * decide between more output or persist.
1341				 */
1342#ifdef TCPDEBUG
1343				if (so->so_options & SO_DEBUG)
1344					tcp_trace(TA_INPUT, ostate, tp,
1345					    (void *)tcp_saveipgen,
1346					    &tcp_savetcp, 0);
1347#endif
1348				if (tp->snd_una == tp->snd_max)
1349					tcp_timer_activate(tp, TT_REXMT, 0);
1350				else if (!tcp_timer_active(tp, TT_PERSIST))
1351					tcp_timer_activate(tp, TT_REXMT,
1352						      tp->t_rxtcur);
1353				sowwakeup(so);
1354				if (so->so_snd.sb_cc)
1355					(void) tcp_output(tp);
1356				goto check_delack;
1357			}
1358		} else if (th->th_ack == tp->snd_una &&
1359		    tlen <= sbspace(&so->so_rcv)) {
1360			int newsize = 0;	/* automatic sockbuf scaling */
1361
1362			/*
1363			 * This is a pure, in-sequence data packet with
1364			 * nothing on the reassembly queue and we have enough
1365			 * buffer space to take it.
1366			 */
1367			if (ti_locked == TI_RLOCKED)
1368				INP_INFO_RUNLOCK(&V_tcbinfo);
1369			else if (ti_locked == TI_WLOCKED)
1370				INP_INFO_WUNLOCK(&V_tcbinfo);
1371			else
1372				panic("%s: ti_locked %d on pure data "
1373				    "segment", __func__, ti_locked);
1374			ti_locked = TI_UNLOCKED;
1375
1376			/* Clean receiver SACK report if present */
1377			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
1378				tcp_clean_sackreport(tp);
1379			TCPSTAT_INC(tcps_preddat);
1380			tp->rcv_nxt += tlen;
1381			/*
1382			 * Pull snd_wl1 up to prevent seq wrap relative to
1383			 * th_seq.
1384			 */
1385			tp->snd_wl1 = th->th_seq;
1386			/*
1387			 * Pull rcv_up up to prevent seq wrap relative to
1388			 * rcv_nxt.
1389			 */
1390			tp->rcv_up = tp->rcv_nxt;
1391			TCPSTAT_INC(tcps_rcvpack);
1392			TCPSTAT_ADD(tcps_rcvbyte, tlen);
1393			ND6_HINT(tp);	/* Some progress has been made */
1394#ifdef TCPDEBUG
1395			if (so->so_options & SO_DEBUG)
1396				tcp_trace(TA_INPUT, ostate, tp,
1397				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
1398#endif
1399		/*
1400		 * Automatic sizing of receive socket buffer.  Often the send
1401		 * buffer size is not optimally adjusted to the actual network
1402		 * conditions at hand (delay bandwidth product).  Setting the
1403		 * buffer size too small limits throughput on links with high
1404		 * bandwidth and high delay (eg. trans-continental/oceanic links).
1405		 *
1406		 * On the receive side the socket buffer memory is only rarely
1407		 * used to any significant extent.  This allows us to be much
1408		 * more aggressive in scaling the receive socket buffer.  For
1409		 * the case that the buffer space is actually used to a large
1410		 * extent and we run out of kernel memory we can simply drop
1411		 * the new segments; TCP on the sender will just retransmit it
1412		 * later.  Setting the buffer size too big may only consume too
1413		 * much kernel memory if the application doesn't read() from
1414		 * the socket or packet loss or reordering makes use of the
1415		 * reassembly queue.
1416		 *
1417		 * The criteria to step up the receive buffer one notch are:
1418		 *  1. the number of bytes received during the time it takes
1419		 *     one timestamp to be reflected back to us (the RTT);
1420		 *  2. received bytes per RTT is within seven eighth of the
1421		 *     current socket buffer size;
1422		 *  3. receive buffer size has not hit maximal automatic size;
1423		 *
1424		 * This algorithm does one step per RTT at most and only if
1425		 * we receive a bulk stream w/o packet losses or reorderings.
1426		 * Shrinking the buffer during idle times is not necessary as
1427		 * it doesn't consume any memory when idle.
1428		 *
1429		 * TODO: Only step up if the application is actually serving
1430		 * the buffer to better manage the socket buffer resources.
1431		 */
1432			if (V_tcp_do_autorcvbuf &&
1433			    to.to_tsecr &&
1434			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
1435				if (to.to_tsecr > tp->rfbuf_ts &&
1436				    to.to_tsecr - tp->rfbuf_ts < hz) {
1437					if (tp->rfbuf_cnt >
1438					    (so->so_rcv.sb_hiwat / 8 * 7) &&
1439					    so->so_rcv.sb_hiwat <
1440					    V_tcp_autorcvbuf_max) {
1441						newsize =
1442						    min(so->so_rcv.sb_hiwat +
1443						    V_tcp_autorcvbuf_inc,
1444						    V_tcp_autorcvbuf_max);
1445					}
1446					/* Start over with next RTT. */
1447					tp->rfbuf_ts = 0;
1448					tp->rfbuf_cnt = 0;
1449				} else
1450					tp->rfbuf_cnt += tlen;	/* add up */
1451			}
1452
1453			/* Add data to socket buffer. */
1454			SOCKBUF_LOCK(&so->so_rcv);
1455			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1456				m_freem(m);
1457			} else {
1458				/*
1459				 * Set new socket buffer size.
1460				 * Give up when limit is reached.
1461				 */
1462				if (newsize)
1463					if (!sbreserve_locked(&so->so_rcv,
1464					    newsize, so, NULL))
1465						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
1466				m_adj(m, drop_hdrlen);	/* delayed header drop */
1467				sbappendstream_locked(&so->so_rcv, m);
1468			}
1469			/* NB: sorwakeup_locked() does an implicit unlock. */
1470			sorwakeup_locked(so);
1471			if (DELAY_ACK(tp)) {
1472				tp->t_flags |= TF_DELACK;
1473			} else {
1474				tp->t_flags |= TF_ACKNOW;
1475				tcp_output(tp);
1476			}
1477			goto check_delack;
1478		}
1479	}
1480
1481	/*
1482	 * Calculate amount of space in receive window,
1483	 * and then do TCP input processing.
1484	 * Receive window is amount of space in rcv queue,
1485	 * but not less than advertised window.
1486	 */
1487	win = sbspace(&so->so_rcv);
1488	if (win < 0)
1489		win = 0;
1490	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1491
1492	/* Reset receive buffer auto scaling when not in bulk receive mode. */
1493	tp->rfbuf_ts = 0;
1494	tp->rfbuf_cnt = 0;
1495
1496	switch (tp->t_state) {
1497
1498	/*
1499	 * If the state is SYN_RECEIVED:
1500	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
1501	 */
1502	case TCPS_SYN_RECEIVED:
1503		if ((thflags & TH_ACK) &&
1504		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1505		     SEQ_GT(th->th_ack, tp->snd_max))) {
1506				rstreason = BANDLIM_RST_OPENPORT;
1507				goto dropwithreset;
1508		}
1509		break;
1510
1511	/*
1512	 * If the state is SYN_SENT:
1513	 *	if seg contains an ACK, but not for our SYN, drop the input.
1514	 *	if seg contains a RST, then drop the connection.
1515	 *	if seg does not contain SYN, then drop it.
1516	 * Otherwise this is an acceptable SYN segment
1517	 *	initialize tp->rcv_nxt and tp->irs
1518	 *	if seg contains ack then advance tp->snd_una
1519	 *	if seg contains an ECE and ECN support is enabled, the stream
1520	 *	    is ECN capable.
1521	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1522	 *	arrange for segment to be acked (eventually)
1523	 *	continue processing rest of data/controls, beginning with URG
1524	 */
1525	case TCPS_SYN_SENT:
1526		if ((thflags & TH_ACK) &&
1527		    (SEQ_LEQ(th->th_ack, tp->iss) ||
1528		     SEQ_GT(th->th_ack, tp->snd_max))) {
1529			rstreason = BANDLIM_UNLIMITED;
1530			goto dropwithreset;
1531		}
1532		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST))
1533			tp = tcp_drop(tp, ECONNREFUSED);
1534		if (thflags & TH_RST)
1535			goto drop;
1536		if (!(thflags & TH_SYN))
1537			goto drop;
1538
1539		tp->irs = th->th_seq;
1540		tcp_rcvseqinit(tp);
1541		if (thflags & TH_ACK) {
1542			TCPSTAT_INC(tcps_connects);
1543			soisconnected(so);
1544#ifdef MAC
1545			mac_socketpeer_set_from_mbuf(m, so);
1546#endif
1547			/* Do window scaling on this connection? */
1548			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1549				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
1550				tp->rcv_scale = tp->request_r_scale;
1551			}
1552			tp->rcv_adv += tp->rcv_wnd;
1553			tp->snd_una++;		/* SYN is acked */
1554			/*
1555			 * If there's data, delay ACK; if there's also a FIN
1556			 * ACKNOW will be turned on later.
1557			 */
1558			if (DELAY_ACK(tp) && tlen != 0)
1559				tcp_timer_activate(tp, TT_DELACK,
1560				    tcp_delacktime);
1561			else
1562				tp->t_flags |= TF_ACKNOW;
1563
1564			if ((thflags & TH_ECE) && V_tcp_do_ecn) {
1565				tp->t_flags |= TF_ECN_PERMIT;
1566				TCPSTAT_INC(tcps_ecn_shs);
1567			}
1568
1569			/*
1570			 * Received <SYN,ACK> in SYN_SENT[*] state.
1571			 * Transitions:
1572			 *	SYN_SENT  --> ESTABLISHED
1573			 *	SYN_SENT* --> FIN_WAIT_1
1574			 */
1575			tp->t_starttime = ticks;
1576			if (tp->t_flags & TF_NEEDFIN) {
1577				tp->t_state = TCPS_FIN_WAIT_1;
1578				tp->t_flags &= ~TF_NEEDFIN;
1579				thflags &= ~TH_SYN;
1580			} else {
1581				tp->t_state = TCPS_ESTABLISHED;
1582				tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
1583			}
1584		} else {
1585			/*
1586			 * Received initial SYN in SYN-SENT[*] state =>
1587			 * simultaneous open.  If segment contains CC option
1588			 * and there is a cached CC, apply TAO test.
1589			 * If it succeeds, connection is * half-synchronized.
1590			 * Otherwise, do 3-way handshake:
1591			 *        SYN-SENT -> SYN-RECEIVED
1592			 *        SYN-SENT* -> SYN-RECEIVED*
1593			 * If there was no CC option, clear cached CC value.
1594			 */
1595			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
1596			tcp_timer_activate(tp, TT_REXMT, 0);
1597			tp->t_state = TCPS_SYN_RECEIVED;
1598		}
1599
1600		KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
1601		    "ti_locked %d", __func__, ti_locked));
1602		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
1603		INP_WLOCK_ASSERT(tp->t_inpcb);
1604
1605		/*
1606		 * Advance th->th_seq to correspond to first data byte.
1607		 * If data, trim to stay within window,
1608		 * dropping FIN if necessary.
1609		 */
1610		th->th_seq++;
1611		if (tlen > tp->rcv_wnd) {
1612			todrop = tlen - tp->rcv_wnd;
1613			m_adj(m, -todrop);
1614			tlen = tp->rcv_wnd;
1615			thflags &= ~TH_FIN;
1616			TCPSTAT_INC(tcps_rcvpackafterwin);
1617			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
1618		}
1619		tp->snd_wl1 = th->th_seq - 1;
1620		tp->rcv_up = th->th_seq;
1621		/*
1622		 * Client side of transaction: already sent SYN and data.
1623		 * If the remote host used T/TCP to validate the SYN,
1624		 * our data will be ACK'd; if so, enter normal data segment
1625		 * processing in the middle of step 5, ack processing.
1626		 * Otherwise, goto step 6.
1627		 */
1628		if (thflags & TH_ACK)
1629			goto process_ACK;
1630
1631		goto step6;
1632
1633	/*
1634	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1635	 *      do normal processing.
1636	 *
1637	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
1638	 */
1639	case TCPS_LAST_ACK:
1640	case TCPS_CLOSING:
1641		break;  /* continue normal processing */
1642	}
1643
1644	/*
1645	 * States other than LISTEN or SYN_SENT.
1646	 * First check the RST flag and sequence number since reset segments
1647	 * are exempt from the timestamp and connection count tests.  This
1648	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1649	 * below which allowed reset segments in half the sequence space
1650	 * to fall though and be processed (which gives forged reset
1651	 * segments with a random sequence number a 50 percent chance of
1652	 * killing a connection).
1653	 * Then check timestamp, if present.
1654	 * Then check the connection count, if present.
1655	 * Then check that at least some bytes of segment are within
1656	 * receive window.  If segment begins before rcv_nxt,
1657	 * drop leading data (and SYN); if nothing left, just ack.
1658	 *
1659	 *
1660	 * If the RST bit is set, check the sequence number to see
1661	 * if this is a valid reset segment.
1662	 * RFC 793 page 37:
1663	 *   In all states except SYN-SENT, all reset (RST) segments
1664	 *   are validated by checking their SEQ-fields.  A reset is
1665	 *   valid if its sequence number is in the window.
1666	 * Note: this does not take into account delayed ACKs, so
1667	 *   we should test against last_ack_sent instead of rcv_nxt.
1668	 *   The sequence number in the reset segment is normally an
1669	 *   echo of our outgoing acknowlegement numbers, but some hosts
1670	 *   send a reset with the sequence number at the rightmost edge
1671	 *   of our receive window, and we have to handle this case.
1672	 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
1673	 *   that brute force RST attacks are possible.  To combat this,
1674	 *   we use a much stricter check while in the ESTABLISHED state,
1675	 *   only accepting RSTs where the sequence number is equal to
1676	 *   last_ack_sent.  In all other states (the states in which a
1677	 *   RST is more likely), the more permissive check is used.
1678	 * If we have multiple segments in flight, the initial reset
1679	 * segment sequence numbers will be to the left of last_ack_sent,
1680	 * but they will eventually catch up.
1681	 * In any case, it never made sense to trim reset segments to
1682	 * fit the receive window since RFC 1122 says:
1683	 *   4.2.2.12  RST Segment: RFC-793 Section 3.4
1684	 *
1685	 *    A TCP SHOULD allow a received RST segment to include data.
1686	 *
1687	 *    DISCUSSION
1688	 *         It has been suggested that a RST segment could contain
1689	 *         ASCII text that encoded and explained the cause of the
1690	 *         RST.  No standard has yet been established for such
1691	 *         data.
1692	 *
1693	 * If the reset segment passes the sequence number test examine
1694	 * the state:
1695	 *    SYN_RECEIVED STATE:
1696	 *	If passive open, return to LISTEN state.
1697	 *	If active open, inform user that connection was refused.
1698	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
1699	 *	Inform user that connection was reset, and close tcb.
1700	 *    CLOSING, LAST_ACK STATES:
1701	 *	Close the tcb.
1702	 *    TIME_WAIT STATE:
1703	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
1704	 *      RFC 1337.
1705	 */
1706	if (thflags & TH_RST) {
1707		if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
1708		    SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1709			switch (tp->t_state) {
1710
1711			case TCPS_SYN_RECEIVED:
1712				so->so_error = ECONNREFUSED;
1713				goto close;
1714
1715			case TCPS_ESTABLISHED:
1716				if (V_tcp_insecure_rst == 0 &&
1717				    !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
1718				    SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
1719				    !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
1720				    SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
1721					TCPSTAT_INC(tcps_badrst);
1722					goto drop;
1723				}
1724				/* FALLTHROUGH */
1725			case TCPS_FIN_WAIT_1:
1726			case TCPS_FIN_WAIT_2:
1727			case TCPS_CLOSE_WAIT:
1728				so->so_error = ECONNRESET;
1729			close:
1730				KASSERT(ti_locked == TI_WLOCKED,
1731				    ("tcp_do_segment: TH_RST 1 ti_locked %d",
1732				    ti_locked));
1733				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
1734
1735				tp->t_state = TCPS_CLOSED;
1736				TCPSTAT_INC(tcps_drops);
1737				tp = tcp_close(tp);
1738				break;
1739
1740			case TCPS_CLOSING:
1741			case TCPS_LAST_ACK:
1742				KASSERT(ti_locked == TI_WLOCKED,
1743				    ("tcp_do_segment: TH_RST 2 ti_locked %d",
1744				    ti_locked));
1745				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
1746
1747				tp = tcp_close(tp);
1748				break;
1749			}
1750		}
1751		goto drop;
1752	}
1753
1754	/*
1755	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1756	 * and it's less than ts_recent, drop it.
1757	 */
1758	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
1759	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1760
1761		/* Check to see if ts_recent is over 24 days old.  */
1762		if (ticks - tp->ts_recent_age > TCP_PAWS_IDLE) {
1763			/*
1764			 * Invalidate ts_recent.  If this segment updates
1765			 * ts_recent, the age will be reset later and ts_recent
1766			 * will get a valid value.  If it does not, setting
1767			 * ts_recent to zero will at least satisfy the
1768			 * requirement that zero be placed in the timestamp
1769			 * echo reply when ts_recent isn't valid.  The
1770			 * age isn't reset until we get a valid ts_recent
1771			 * because we don't want out-of-order segments to be
1772			 * dropped when ts_recent is old.
1773			 */
1774			tp->ts_recent = 0;
1775		} else {
1776			TCPSTAT_INC(tcps_rcvduppack);
1777			TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
1778			TCPSTAT_INC(tcps_pawsdrop);
1779			if (tlen)
1780				goto dropafterack;
1781			goto drop;
1782		}
1783	}
1784
1785	/*
1786	 * In the SYN-RECEIVED state, validate that the packet belongs to
1787	 * this connection before trimming the data to fit the receive
1788	 * window.  Check the sequence number versus IRS since we know
1789	 * the sequence numbers haven't wrapped.  This is a partial fix
1790	 * for the "LAND" DoS attack.
1791	 */
1792	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
1793		rstreason = BANDLIM_RST_OPENPORT;
1794		goto dropwithreset;
1795	}
1796
1797	todrop = tp->rcv_nxt - th->th_seq;
1798	if (todrop > 0) {
1799		/*
1800		 * If this is a duplicate SYN for our current connection,
1801		 * advance over it and pretend and it's not a SYN.
1802		 */
1803		if (thflags & TH_SYN && th->th_seq == tp->irs) {
1804			thflags &= ~TH_SYN;
1805			th->th_seq++;
1806			if (th->th_urp > 1)
1807				th->th_urp--;
1808			else
1809				thflags &= ~TH_URG;
1810			todrop--;
1811		}
1812		/*
1813		 * Following if statement from Stevens, vol. 2, p. 960.
1814		 */
1815		if (todrop > tlen
1816		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1817			/*
1818			 * Any valid FIN must be to the left of the window.
1819			 * At this point the FIN must be a duplicate or out
1820			 * of sequence; drop it.
1821			 */
1822			thflags &= ~TH_FIN;
1823
1824			/*
1825			 * Send an ACK to resynchronize and drop any data.
1826			 * But keep on processing for RST or ACK.
1827			 */
1828			tp->t_flags |= TF_ACKNOW;
1829			todrop = tlen;
1830			TCPSTAT_INC(tcps_rcvduppack);
1831			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
1832		} else {
1833			TCPSTAT_INC(tcps_rcvpartduppack);
1834			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
1835		}
1836		drop_hdrlen += todrop;	/* drop from the top afterwards */
1837		th->th_seq += todrop;
1838		tlen -= todrop;
1839		if (th->th_urp > todrop)
1840			th->th_urp -= todrop;
1841		else {
1842			thflags &= ~TH_URG;
1843			th->th_urp = 0;
1844		}
1845	}
1846
1847	/*
1848	 * If new data are received on a connection after the
1849	 * user processes are gone, then RST the other end.
1850	 */
1851	if ((so->so_state & SS_NOFDREF) &&
1852	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1853		char *s;
1854
1855		KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
1856		    "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
1857		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
1858
1859		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
1860			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket "
1861			    "was closed, sending RST and removing tcpcb\n",
1862			    s, __func__, tcpstates[tp->t_state], tlen);
1863			free(s, M_TCPLOG);
1864		}
1865		tp = tcp_close(tp);
1866		TCPSTAT_INC(tcps_rcvafterclose);
1867		rstreason = BANDLIM_UNLIMITED;
1868		goto dropwithreset;
1869	}
1870
1871	/*
1872	 * If segment ends after window, drop trailing data
1873	 * (and PUSH and FIN); if nothing left, just ACK.
1874	 */
1875	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1876	if (todrop > 0) {
1877		TCPSTAT_INC(tcps_rcvpackafterwin);
1878		if (todrop >= tlen) {
1879			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
1880			/*
1881			 * If window is closed can only take segments at
1882			 * window edge, and have to drop data and PUSH from
1883			 * incoming segments.  Continue processing, but
1884			 * remember to ack.  Otherwise, drop segment
1885			 * and ack.
1886			 */
1887			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1888				tp->t_flags |= TF_ACKNOW;
1889				TCPSTAT_INC(tcps_rcvwinprobe);
1890			} else
1891				goto dropafterack;
1892		} else
1893			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
1894		m_adj(m, -todrop);
1895		tlen -= todrop;
1896		thflags &= ~(TH_PUSH|TH_FIN);
1897	}
1898
1899	/*
1900	 * If last ACK falls within this segment's sequence numbers,
1901	 * record its timestamp.
1902	 * NOTE:
1903	 * 1) That the test incorporates suggestions from the latest
1904	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
1905	 * 2) That updating only on newer timestamps interferes with
1906	 *    our earlier PAWS tests, so this check should be solely
1907	 *    predicated on the sequence space of this segment.
1908	 * 3) That we modify the segment boundary check to be
1909	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
1910	 *    instead of RFC1323's
1911	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
1912	 *    This modified check allows us to overcome RFC1323's
1913	 *    limitations as described in Stevens TCP/IP Illustrated
1914	 *    Vol. 2 p.869. In such cases, we can still calculate the
1915	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
1916	 */
1917	if ((to.to_flags & TOF_TS) != 0 &&
1918	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1919	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
1920		((thflags & (TH_SYN|TH_FIN)) != 0))) {
1921		tp->ts_recent_age = ticks;
1922		tp->ts_recent = to.to_tsval;
1923	}
1924
1925	/*
1926	 * If a SYN is in the window, then this is an
1927	 * error and we send an RST and drop the connection.
1928	 */
1929	if (thflags & TH_SYN) {
1930		KASSERT(ti_locked == TI_WLOCKED,
1931		    ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
1932		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
1933
1934		tp = tcp_drop(tp, ECONNRESET);
1935		rstreason = BANDLIM_UNLIMITED;
1936		goto drop;
1937	}
1938
1939	/*
1940	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
1941	 * flag is on (half-synchronized state), then queue data for
1942	 * later processing; else drop segment and return.
1943	 */
1944	if ((thflags & TH_ACK) == 0) {
1945		if (tp->t_state == TCPS_SYN_RECEIVED ||
1946		    (tp->t_flags & TF_NEEDSYN))
1947			goto step6;
1948		else if (tp->t_flags & TF_ACKNOW)
1949			goto dropafterack;
1950		else
1951			goto drop;
1952	}
1953
1954	/*
1955	 * Ack processing.
1956	 */
1957	switch (tp->t_state) {
1958
1959	/*
1960	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1961	 * ESTABLISHED state and continue processing.
1962	 * The ACK was checked above.
1963	 */
1964	case TCPS_SYN_RECEIVED:
1965
1966		TCPSTAT_INC(tcps_connects);
1967		soisconnected(so);
1968		/* Do window scaling? */
1969		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1970			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
1971			tp->rcv_scale = tp->request_r_scale;
1972			tp->snd_wnd = tiwin;
1973		}
1974		/*
1975		 * Make transitions:
1976		 *      SYN-RECEIVED  -> ESTABLISHED
1977		 *      SYN-RECEIVED* -> FIN-WAIT-1
1978		 */
1979		tp->t_starttime = ticks;
1980		if (tp->t_flags & TF_NEEDFIN) {
1981			tp->t_state = TCPS_FIN_WAIT_1;
1982			tp->t_flags &= ~TF_NEEDFIN;
1983		} else {
1984			tp->t_state = TCPS_ESTABLISHED;
1985			tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
1986		}
1987		/*
1988		 * If segment contains data or ACK, will call tcp_reass()
1989		 * later; if not, do so now to pass queued data to user.
1990		 */
1991		if (tlen == 0 && (thflags & TH_FIN) == 0)
1992			(void) tcp_reass(tp, (struct tcphdr *)0, 0,
1993			    (struct mbuf *)0);
1994		tp->snd_wl1 = th->th_seq - 1;
1995		/* FALLTHROUGH */
1996
1997	/*
1998	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1999	 * ACKs.  If the ack is in the range
2000	 *	tp->snd_una < th->th_ack <= tp->snd_max
2001	 * then advance tp->snd_una to th->th_ack and drop
2002	 * data from the retransmission queue.  If this ACK reflects
2003	 * more up to date window information we update our window information.
2004	 */
2005	case TCPS_ESTABLISHED:
2006	case TCPS_FIN_WAIT_1:
2007	case TCPS_FIN_WAIT_2:
2008	case TCPS_CLOSE_WAIT:
2009	case TCPS_CLOSING:
2010	case TCPS_LAST_ACK:
2011		if (SEQ_GT(th->th_ack, tp->snd_max)) {
2012			TCPSTAT_INC(tcps_rcvacktoomuch);
2013			goto dropafterack;
2014		}
2015		if ((tp->t_flags & TF_SACK_PERMIT) &&
2016		    ((to.to_flags & TOF_SACK) ||
2017		     !TAILQ_EMPTY(&tp->snd_holes)))
2018			tcp_sack_doack(tp, &to, th->th_ack);
2019		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2020			if (tlen == 0 && tiwin == tp->snd_wnd) {
2021				TCPSTAT_INC(tcps_rcvdupack);
2022				/*
2023				 * If we have outstanding data (other than
2024				 * a window probe), this is a completely
2025				 * duplicate ack (ie, window info didn't
2026				 * change), the ack is the biggest we've
2027				 * seen and we've seen exactly our rexmt
2028				 * threshhold of them, assume a packet
2029				 * has been dropped and retransmit it.
2030				 * Kludge snd_nxt & the congestion
2031				 * window so we send only this one
2032				 * packet.
2033				 *
2034				 * We know we're losing at the current
2035				 * window size so do congestion avoidance
2036				 * (set ssthresh to half the current window
2037				 * and pull our congestion window back to
2038				 * the new ssthresh).
2039				 *
2040				 * Dup acks mean that packets have left the
2041				 * network (they're now cached at the receiver)
2042				 * so bump cwnd by the amount in the receiver
2043				 * to keep a constant cwnd packets in the
2044				 * network.
2045				 *
2046				 * When using TCP ECN, notify the peer that
2047				 * we reduced the cwnd.
2048				 */
2049				if (!tcp_timer_active(tp, TT_REXMT) ||
2050				    th->th_ack != tp->snd_una)
2051					tp->t_dupacks = 0;
2052				else if (++tp->t_dupacks > tcprexmtthresh ||
2053				    ((V_tcp_do_newreno ||
2054				      (tp->t_flags & TF_SACK_PERMIT)) &&
2055				     IN_FASTRECOVERY(tp))) {
2056					if ((tp->t_flags & TF_SACK_PERMIT) &&
2057					    IN_FASTRECOVERY(tp)) {
2058						int awnd;
2059
2060						/*
2061						 * Compute the amount of data in flight first.
2062						 * We can inject new data into the pipe iff
2063						 * we have less than 1/2 the original window's
2064						 * worth of data in flight.
2065						 */
2066						awnd = (tp->snd_nxt - tp->snd_fack) +
2067							tp->sackhint.sack_bytes_rexmit;
2068						if (awnd < tp->snd_ssthresh) {
2069							tp->snd_cwnd += tp->t_maxseg;
2070							if (tp->snd_cwnd > tp->snd_ssthresh)
2071								tp->snd_cwnd = tp->snd_ssthresh;
2072						}
2073					} else
2074						tp->snd_cwnd += tp->t_maxseg;
2075					(void) tcp_output(tp);
2076					goto drop;
2077				} else if (tp->t_dupacks == tcprexmtthresh) {
2078					tcp_seq onxt = tp->snd_nxt;
2079
2080					/*
2081					 * If we're doing sack, check to
2082					 * see if we're already in sack
2083					 * recovery. If we're not doing sack,
2084					 * check to see if we're in newreno
2085					 * recovery.
2086					 */
2087					if (tp->t_flags & TF_SACK_PERMIT) {
2088						if (IN_FASTRECOVERY(tp)) {
2089							tp->t_dupacks = 0;
2090							break;
2091						}
2092					} else if (V_tcp_do_newreno ||
2093					    V_tcp_do_ecn) {
2094						if (SEQ_LEQ(th->th_ack,
2095						    tp->snd_recover)) {
2096							tp->t_dupacks = 0;
2097							break;
2098						}
2099					}
2100					tcp_congestion_exp(tp);
2101					tcp_timer_activate(tp, TT_REXMT, 0);
2102					tp->t_rtttime = 0;
2103					if (tp->t_flags & TF_SACK_PERMIT) {
2104						TCPSTAT_INC(
2105						    tcps_sack_recovery_episode);
2106						tp->sack_newdata = tp->snd_nxt;
2107						tp->snd_cwnd = tp->t_maxseg;
2108						(void) tcp_output(tp);
2109						goto drop;
2110					}
2111					tp->snd_nxt = th->th_ack;
2112					tp->snd_cwnd = tp->t_maxseg;
2113					(void) tcp_output(tp);
2114					KASSERT(tp->snd_limited <= 2,
2115					    ("%s: tp->snd_limited too big",
2116					    __func__));
2117					tp->snd_cwnd = tp->snd_ssthresh +
2118					     tp->t_maxseg *
2119					     (tp->t_dupacks - tp->snd_limited);
2120					if (SEQ_GT(onxt, tp->snd_nxt))
2121						tp->snd_nxt = onxt;
2122					goto drop;
2123				} else if (V_tcp_do_rfc3042) {
2124					u_long oldcwnd = tp->snd_cwnd;
2125					tcp_seq oldsndmax = tp->snd_max;
2126					u_int sent;
2127
2128					KASSERT(tp->t_dupacks == 1 ||
2129					    tp->t_dupacks == 2,
2130					    ("%s: dupacks not 1 or 2",
2131					    __func__));
2132					if (tp->t_dupacks == 1)
2133						tp->snd_limited = 0;
2134					tp->snd_cwnd =
2135					    (tp->snd_nxt - tp->snd_una) +
2136					    (tp->t_dupacks - tp->snd_limited) *
2137					    tp->t_maxseg;
2138					(void) tcp_output(tp);
2139					sent = tp->snd_max - oldsndmax;
2140					if (sent > tp->t_maxseg) {
2141						KASSERT((tp->t_dupacks == 2 &&
2142						    tp->snd_limited == 0) ||
2143						   (sent == tp->t_maxseg + 1 &&
2144						    tp->t_flags & TF_SENTFIN),
2145						    ("%s: sent too much",
2146						    __func__));
2147						tp->snd_limited = 2;
2148					} else if (sent > 0)
2149						++tp->snd_limited;
2150					tp->snd_cwnd = oldcwnd;
2151					goto drop;
2152				}
2153			} else
2154				tp->t_dupacks = 0;
2155			break;
2156		}
2157
2158		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
2159		    ("%s: th_ack <= snd_una", __func__));
2160
2161		/*
2162		 * If the congestion window was inflated to account
2163		 * for the other side's cached packets, retract it.
2164		 */
2165		if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
2166			if (IN_FASTRECOVERY(tp)) {
2167				if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2168					if (tp->t_flags & TF_SACK_PERMIT)
2169						tcp_sack_partialack(tp, th);
2170					else
2171						tcp_newreno_partial_ack(tp, th);
2172				} else {
2173					/*
2174					 * Out of fast recovery.
2175					 * Window inflation should have left us
2176					 * with approximately snd_ssthresh
2177					 * outstanding data.
2178					 * But in case we would be inclined to
2179					 * send a burst, better to do it via
2180					 * the slow start mechanism.
2181					 */
2182					if (SEQ_GT(th->th_ack +
2183							tp->snd_ssthresh,
2184						   tp->snd_max))
2185						tp->snd_cwnd = tp->snd_max -
2186								th->th_ack +
2187								tp->t_maxseg;
2188					else
2189						tp->snd_cwnd = tp->snd_ssthresh;
2190				}
2191			}
2192		} else {
2193			if (tp->t_dupacks >= tcprexmtthresh &&
2194			    tp->snd_cwnd > tp->snd_ssthresh)
2195				tp->snd_cwnd = tp->snd_ssthresh;
2196		}
2197		tp->t_dupacks = 0;
2198		/*
2199		 * If we reach this point, ACK is not a duplicate,
2200		 *     i.e., it ACKs something we sent.
2201		 */
2202		if (tp->t_flags & TF_NEEDSYN) {
2203			/*
2204			 * T/TCP: Connection was half-synchronized, and our
2205			 * SYN has been ACK'd (so connection is now fully
2206			 * synchronized).  Go to non-starred state,
2207			 * increment snd_una for ACK of SYN, and check if
2208			 * we can do window scaling.
2209			 */
2210			tp->t_flags &= ~TF_NEEDSYN;
2211			tp->snd_una++;
2212			/* Do window scaling? */
2213			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2214				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
2215				tp->rcv_scale = tp->request_r_scale;
2216				/* Send window already scaled. */
2217			}
2218		}
2219
2220process_ACK:
2221		INP_INFO_LOCK_ASSERT(&V_tcbinfo);
2222		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
2223		    ("tcp_input: process_ACK ti_locked %d", ti_locked));
2224		INP_WLOCK_ASSERT(tp->t_inpcb);
2225
2226		acked = th->th_ack - tp->snd_una;
2227		TCPSTAT_INC(tcps_rcvackpack);
2228		TCPSTAT_ADD(tcps_rcvackbyte, acked);
2229
2230		/*
2231		 * If we just performed our first retransmit, and the ACK
2232		 * arrives within our recovery window, then it was a mistake
2233		 * to do the retransmit in the first place.  Recover our
2234		 * original cwnd and ssthresh, and proceed to transmit where
2235		 * we left off.
2236		 */
2237		if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) {
2238			TCPSTAT_INC(tcps_sndrexmitbad);
2239			tp->snd_cwnd = tp->snd_cwnd_prev;
2240			tp->snd_ssthresh = tp->snd_ssthresh_prev;
2241			tp->snd_recover = tp->snd_recover_prev;
2242			if (tp->t_flags & TF_WASFRECOVERY)
2243				ENTER_FASTRECOVERY(tp);
2244			tp->snd_nxt = tp->snd_max;
2245			tp->t_badrxtwin = 0;	/* XXX probably not required */
2246		}
2247
2248		/*
2249		 * If we have a timestamp reply, update smoothed
2250		 * round trip time.  If no timestamp is present but
2251		 * transmit timer is running and timed sequence
2252		 * number was acked, update smoothed round trip time.
2253		 * Since we now have an rtt measurement, cancel the
2254		 * timer backoff (cf., Phil Karn's retransmit alg.).
2255		 * Recompute the initial retransmit timer.
2256		 *
2257		 * Some boxes send broken timestamp replies
2258		 * during the SYN+ACK phase, ignore
2259		 * timestamps of 0 or we could calculate a
2260		 * huge RTT and blow up the retransmit timer.
2261		 */
2262		if ((to.to_flags & TOF_TS) != 0 &&
2263		    to.to_tsecr) {
2264			if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr)
2265				tp->t_rttlow = ticks - to.to_tsecr;
2266			tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
2267		} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
2268			if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
2269				tp->t_rttlow = ticks - tp->t_rtttime;
2270			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
2271		}
2272		tcp_xmit_bandwidth_limit(tp, th->th_ack);
2273
2274		/*
2275		 * If all outstanding data is acked, stop retransmit
2276		 * timer and remember to restart (more output or persist).
2277		 * If there is more data to be acked, restart retransmit
2278		 * timer, using current (possibly backed-off) value.
2279		 */
2280		if (th->th_ack == tp->snd_max) {
2281			tcp_timer_activate(tp, TT_REXMT, 0);
2282			needoutput = 1;
2283		} else if (!tcp_timer_active(tp, TT_PERSIST))
2284			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
2285
2286		/*
2287		 * If no data (only SYN) was ACK'd,
2288		 *    skip rest of ACK processing.
2289		 */
2290		if (acked == 0)
2291			goto step6;
2292
2293		/*
2294		 * When new data is acked, open the congestion window.
2295		 * Method depends on which congestion control state we're
2296		 * in (slow start or cong avoid) and if ABC (RFC 3465) is
2297		 * enabled.
2298		 *
2299		 * slow start: cwnd <= ssthresh
2300		 * cong avoid: cwnd > ssthresh
2301		 *
2302		 * slow start and ABC (RFC 3465):
2303		 *   Grow cwnd exponentially by the amount of data
2304		 *   ACKed capping the max increment per ACK to
2305		 *   (abc_l_var * maxseg) bytes.
2306		 *
2307		 * slow start without ABC (RFC 2581):
2308		 *   Grow cwnd exponentially by maxseg per ACK.
2309		 *
2310		 * cong avoid and ABC (RFC 3465):
2311		 *   Grow cwnd linearly by maxseg per RTT for each
2312		 *   cwnd worth of ACKed data.
2313		 *
2314		 * cong avoid without ABC (RFC 2581):
2315		 *   Grow cwnd linearly by approximately maxseg per RTT using
2316		 *   maxseg^2 / cwnd per ACK as the increment.
2317		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
2318		 *   avoid capping cwnd.
2319		 */
2320		if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
2321		    !IN_FASTRECOVERY(tp)) {
2322			u_int cw = tp->snd_cwnd;
2323			u_int incr = tp->t_maxseg;
2324			/* In congestion avoidance? */
2325			if (cw > tp->snd_ssthresh) {
2326				if (V_tcp_do_rfc3465) {
2327					tp->t_bytes_acked += acked;
2328					if (tp->t_bytes_acked >= tp->snd_cwnd)
2329						tp->t_bytes_acked -= cw;
2330					else
2331						incr = 0;
2332				}
2333				else
2334					incr = max((incr * incr / cw), 1);
2335			/*
2336			 * In slow-start with ABC enabled and no RTO in sight?
2337			 * (Must not use abc_l_var > 1 if slow starting after an
2338			 * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt ==
2339			 * snd_max check is sufficient to handle this).
2340			 */
2341			} else if (V_tcp_do_rfc3465 &&
2342			    tp->snd_nxt == tp->snd_max)
2343				incr = min(acked,
2344				    V_tcp_abc_l_var * tp->t_maxseg);
2345			/* ABC is on by default, so (incr == 0) frequently. */
2346			if (incr > 0)
2347				tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
2348		}
2349		SOCKBUF_LOCK(&so->so_snd);
2350		if (acked > so->so_snd.sb_cc) {
2351			tp->snd_wnd -= so->so_snd.sb_cc;
2352			sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc);
2353			ourfinisacked = 1;
2354		} else {
2355			sbdrop_locked(&so->so_snd, acked);
2356			tp->snd_wnd -= acked;
2357			ourfinisacked = 0;
2358		}
2359		/* NB: sowwakeup_locked() does an implicit unlock. */
2360		sowwakeup_locked(so);
2361		/* Detect una wraparound. */
2362		if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
2363		    !IN_FASTRECOVERY(tp) &&
2364		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
2365		    SEQ_LEQ(th->th_ack, tp->snd_recover))
2366			tp->snd_recover = th->th_ack - 1;
2367		if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
2368		    IN_FASTRECOVERY(tp) &&
2369		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
2370			EXIT_FASTRECOVERY(tp);
2371			tp->t_bytes_acked = 0;
2372		}
2373		tp->snd_una = th->th_ack;
2374		if (tp->t_flags & TF_SACK_PERMIT) {
2375			if (SEQ_GT(tp->snd_una, tp->snd_recover))
2376				tp->snd_recover = tp->snd_una;
2377		}
2378		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2379			tp->snd_nxt = tp->snd_una;
2380
2381		switch (tp->t_state) {
2382
2383		/*
2384		 * In FIN_WAIT_1 STATE in addition to the processing
2385		 * for the ESTABLISHED state if our FIN is now acknowledged
2386		 * then enter FIN_WAIT_2.
2387		 */
2388		case TCPS_FIN_WAIT_1:
2389			if (ourfinisacked) {
2390				/*
2391				 * If we can't receive any more
2392				 * data, then closing user can proceed.
2393				 * Starting the timer is contrary to the
2394				 * specification, but if we don't get a FIN
2395				 * we'll hang forever.
2396				 *
2397				 * XXXjl:
2398				 * we should release the tp also, and use a
2399				 * compressed state.
2400				 */
2401				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2402					int timeout;
2403
2404					soisdisconnected(so);
2405					timeout = (tcp_fast_finwait2_recycle) ?
2406						tcp_finwait2_timeout : tcp_maxidle;
2407					tcp_timer_activate(tp, TT_2MSL, timeout);
2408				}
2409				tp->t_state = TCPS_FIN_WAIT_2;
2410			}
2411			break;
2412
2413		/*
2414		 * In CLOSING STATE in addition to the processing for
2415		 * the ESTABLISHED state if the ACK acknowledges our FIN
2416		 * then enter the TIME-WAIT state, otherwise ignore
2417		 * the segment.
2418		 */
2419		case TCPS_CLOSING:
2420			if (ourfinisacked) {
2421				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
2422				tcp_twstart(tp);
2423				INP_INFO_WUNLOCK(&V_tcbinfo);
2424				m_freem(m);
2425				return;
2426			}
2427			break;
2428
2429		/*
2430		 * In LAST_ACK, we may still be waiting for data to drain
2431		 * and/or to be acked, as well as for the ack of our FIN.
2432		 * If our FIN is now acknowledged, delete the TCB,
2433		 * enter the closed state and return.
2434		 */
2435		case TCPS_LAST_ACK:
2436			if (ourfinisacked) {
2437				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
2438				tp = tcp_close(tp);
2439				goto drop;
2440			}
2441			break;
2442		}
2443	}
2444
2445step6:
2446	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
2447	KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
2448	    ("tcp_do_segment: step6 ti_locked %d", ti_locked));
2449	INP_WLOCK_ASSERT(tp->t_inpcb);
2450
2451	/*
2452	 * Update window information.
2453	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2454	 */
2455	if ((thflags & TH_ACK) &&
2456	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2457	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2458	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2459		/* keep track of pure window updates */
2460		if (tlen == 0 &&
2461		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2462			TCPSTAT_INC(tcps_rcvwinupd);
2463		tp->snd_wnd = tiwin;
2464		tp->snd_wl1 = th->th_seq;
2465		tp->snd_wl2 = th->th_ack;
2466		if (tp->snd_wnd > tp->max_sndwnd)
2467			tp->max_sndwnd = tp->snd_wnd;
2468		needoutput = 1;
2469	}
2470
2471	/*
2472	 * Process segments with URG.
2473	 */
2474	if ((thflags & TH_URG) && th->th_urp &&
2475	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2476		/*
2477		 * This is a kludge, but if we receive and accept
2478		 * random urgent pointers, we'll crash in
2479		 * soreceive.  It's hard to imagine someone
2480		 * actually wanting to send this much urgent data.
2481		 */
2482		SOCKBUF_LOCK(&so->so_rcv);
2483		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2484			th->th_urp = 0;			/* XXX */
2485			thflags &= ~TH_URG;		/* XXX */
2486			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
2487			goto dodata;			/* XXX */
2488		}
2489		/*
2490		 * If this segment advances the known urgent pointer,
2491		 * then mark the data stream.  This should not happen
2492		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2493		 * a FIN has been received from the remote side.
2494		 * In these states we ignore the URG.
2495		 *
2496		 * According to RFC961 (Assigned Protocols),
2497		 * the urgent pointer points to the last octet
2498		 * of urgent data.  We continue, however,
2499		 * to consider it to indicate the first octet
2500		 * of data past the urgent section as the original
2501		 * spec states (in one of two places).
2502		 */
2503		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2504			tp->rcv_up = th->th_seq + th->th_urp;
2505			so->so_oobmark = so->so_rcv.sb_cc +
2506			    (tp->rcv_up - tp->rcv_nxt) - 1;
2507			if (so->so_oobmark == 0)
2508				so->so_rcv.sb_state |= SBS_RCVATMARK;
2509			sohasoutofband(so);
2510			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2511		}
2512		SOCKBUF_UNLOCK(&so->so_rcv);
2513		/*
2514		 * Remove out of band data so doesn't get presented to user.
2515		 * This can happen independent of advancing the URG pointer,
2516		 * but if two URG's are pending at once, some out-of-band
2517		 * data may creep in... ick.
2518		 */
2519		if (th->th_urp <= (u_long)tlen &&
2520		    !(so->so_options & SO_OOBINLINE)) {
2521			/* hdr drop is delayed */
2522			tcp_pulloutofband(so, th, m, drop_hdrlen);
2523		}
2524	} else {
2525		/*
2526		 * If no out of band data is expected,
2527		 * pull receive urgent pointer along
2528		 * with the receive window.
2529		 */
2530		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2531			tp->rcv_up = tp->rcv_nxt;
2532	}
2533dodata:							/* XXX */
2534	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
2535	KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
2536	    ("tcp_do_segment: dodata ti_locked %d", ti_locked));
2537	INP_WLOCK_ASSERT(tp->t_inpcb);
2538
2539	/*
2540	 * Process the segment text, merging it into the TCP sequencing queue,
2541	 * and arranging for acknowledgment of receipt if necessary.
2542	 * This process logically involves adjusting tp->rcv_wnd as data
2543	 * is presented to the user (this happens in tcp_usrreq.c,
2544	 * case PRU_RCVD).  If a FIN has already been received on this
2545	 * connection then we just ignore the text.
2546	 */
2547	if ((tlen || (thflags & TH_FIN)) &&
2548	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2549		tcp_seq save_start = th->th_seq;
2550		m_adj(m, drop_hdrlen);	/* delayed header drop */
2551		/*
2552		 * Insert segment which includes th into TCP reassembly queue
2553		 * with control block tp.  Set thflags to whether reassembly now
2554		 * includes a segment with FIN.  This handles the common case
2555		 * inline (segment is the next to be received on an established
2556		 * connection, and the queue is empty), avoiding linkage into
2557		 * and removal from the queue and repetition of various
2558		 * conversions.
2559		 * Set DELACK for segments received in order, but ack
2560		 * immediately when segments are out of order (so
2561		 * fast retransmit can work).
2562		 */
2563		if (th->th_seq == tp->rcv_nxt &&
2564		    LIST_EMPTY(&tp->t_segq) &&
2565		    TCPS_HAVEESTABLISHED(tp->t_state)) {
2566			if (DELAY_ACK(tp))
2567				tp->t_flags |= TF_DELACK;
2568			else
2569				tp->t_flags |= TF_ACKNOW;
2570			tp->rcv_nxt += tlen;
2571			thflags = th->th_flags & TH_FIN;
2572			TCPSTAT_INC(tcps_rcvpack);
2573			TCPSTAT_ADD(tcps_rcvbyte, tlen);
2574			ND6_HINT(tp);
2575			SOCKBUF_LOCK(&so->so_rcv);
2576			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
2577				m_freem(m);
2578			else
2579				sbappendstream_locked(&so->so_rcv, m);
2580			/* NB: sorwakeup_locked() does an implicit unlock. */
2581			sorwakeup_locked(so);
2582		} else {
2583			/*
2584			 * XXX: Due to the header drop above "th" is
2585			 * theoretically invalid by now.  Fortunately
2586			 * m_adj() doesn't actually frees any mbufs
2587			 * when trimming from the head.
2588			 */
2589			thflags = tcp_reass(tp, th, &tlen, m);
2590			tp->t_flags |= TF_ACKNOW;
2591		}
2592		if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
2593			tcp_update_sack_list(tp, save_start, save_start + tlen);
2594#if 0
2595		/*
2596		 * Note the amount of data that peer has sent into
2597		 * our window, in order to estimate the sender's
2598		 * buffer size.
2599		 * XXX: Unused.
2600		 */
2601		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2602#endif
2603	} else {
2604		m_freem(m);
2605		thflags &= ~TH_FIN;
2606	}
2607
2608	/*
2609	 * If FIN is received ACK the FIN and let the user know
2610	 * that the connection is closing.
2611	 */
2612	if (thflags & TH_FIN) {
2613		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2614			socantrcvmore(so);
2615			/*
2616			 * If connection is half-synchronized
2617			 * (ie NEEDSYN flag on) then delay ACK,
2618			 * so it may be piggybacked when SYN is sent.
2619			 * Otherwise, since we received a FIN then no
2620			 * more input can be expected, send ACK now.
2621			 */
2622			if (tp->t_flags & TF_NEEDSYN)
2623				tp->t_flags |= TF_DELACK;
2624			else
2625				tp->t_flags |= TF_ACKNOW;
2626			tp->rcv_nxt++;
2627		}
2628		switch (tp->t_state) {
2629
2630		/*
2631		 * In SYN_RECEIVED and ESTABLISHED STATES
2632		 * enter the CLOSE_WAIT state.
2633		 */
2634		case TCPS_SYN_RECEIVED:
2635			tp->t_starttime = ticks;
2636			/* FALLTHROUGH */
2637		case TCPS_ESTABLISHED:
2638			tp->t_state = TCPS_CLOSE_WAIT;
2639			break;
2640
2641		/*
2642		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2643		 * enter the CLOSING state.
2644		 */
2645		case TCPS_FIN_WAIT_1:
2646			tp->t_state = TCPS_CLOSING;
2647			break;
2648
2649		/*
2650		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2651		 * starting the time-wait timer, turning off the other
2652		 * standard timers.
2653		 */
2654		case TCPS_FIN_WAIT_2:
2655			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
2656			KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
2657			    "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
2658			    ti_locked));
2659
2660			tcp_twstart(tp);
2661			INP_INFO_WUNLOCK(&V_tcbinfo);
2662			return;
2663		}
2664	}
2665	if (ti_locked == TI_RLOCKED)
2666		INP_INFO_RUNLOCK(&V_tcbinfo);
2667	else if (ti_locked == TI_WLOCKED)
2668		INP_INFO_WUNLOCK(&V_tcbinfo);
2669	else
2670		panic("%s: dodata epilogue ti_locked %d", __func__,
2671		    ti_locked);
2672	ti_locked = TI_UNLOCKED;
2673
2674#ifdef TCPDEBUG
2675	if (so->so_options & SO_DEBUG)
2676		tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
2677			  &tcp_savetcp, 0);
2678#endif
2679
2680	/*
2681	 * Return any desired output.
2682	 */
2683	if (needoutput || (tp->t_flags & TF_ACKNOW))
2684		(void) tcp_output(tp);
2685
2686check_delack:
2687	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
2688	    __func__, ti_locked));
2689	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2690	INP_WLOCK_ASSERT(tp->t_inpcb);
2691
2692	if (tp->t_flags & TF_DELACK) {
2693		tp->t_flags &= ~TF_DELACK;
2694		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
2695	}
2696	INP_WUNLOCK(tp->t_inpcb);
2697	return;
2698
2699dropafterack:
2700	KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
2701	    ("tcp_do_segment: dropafterack ti_locked %d", ti_locked));
2702
2703	/*
2704	 * Generate an ACK dropping incoming segment if it occupies
2705	 * sequence space, where the ACK reflects our state.
2706	 *
2707	 * We can now skip the test for the RST flag since all
2708	 * paths to this code happen after packets containing
2709	 * RST have been dropped.
2710	 *
2711	 * In the SYN-RECEIVED state, don't send an ACK unless the
2712	 * segment we received passes the SYN-RECEIVED ACK test.
2713	 * If it fails send a RST.  This breaks the loop in the
2714	 * "LAND" DoS attack, and also prevents an ACK storm
2715	 * between two listening ports that have been sent forged
2716	 * SYN segments, each with the source address of the other.
2717	 */
2718	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2719	    (SEQ_GT(tp->snd_una, th->th_ack) ||
2720	     SEQ_GT(th->th_ack, tp->snd_max)) ) {
2721		rstreason = BANDLIM_RST_OPENPORT;
2722		goto dropwithreset;
2723	}
2724#ifdef TCPDEBUG
2725	if (so->so_options & SO_DEBUG)
2726		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2727			  &tcp_savetcp, 0);
2728#endif
2729	if (ti_locked == TI_RLOCKED)
2730		INP_INFO_RUNLOCK(&V_tcbinfo);
2731	else if (ti_locked == TI_WLOCKED)
2732		INP_INFO_WUNLOCK(&V_tcbinfo);
2733	else
2734		panic("%s: dropafterack epilogue ti_locked %d", __func__,
2735		    ti_locked);
2736	ti_locked = TI_UNLOCKED;
2737
2738	tp->t_flags |= TF_ACKNOW;
2739	(void) tcp_output(tp);
2740	INP_WUNLOCK(tp->t_inpcb);
2741	m_freem(m);
2742	return;
2743
2744dropwithreset:
2745	if (ti_locked == TI_RLOCKED)
2746		INP_INFO_RUNLOCK(&V_tcbinfo);
2747	else if (ti_locked == TI_WLOCKED)
2748		INP_INFO_WUNLOCK(&V_tcbinfo);
2749	else
2750		panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
2751	ti_locked = TI_UNLOCKED;
2752
2753	if (tp != NULL) {
2754		tcp_dropwithreset(m, th, tp, tlen, rstreason);
2755		INP_WUNLOCK(tp->t_inpcb);
2756	} else
2757		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
2758	return;
2759
2760drop:
2761	if (ti_locked == TI_RLOCKED)
2762		INP_INFO_RUNLOCK(&V_tcbinfo);
2763	else if (ti_locked == TI_WLOCKED)
2764		INP_INFO_WUNLOCK(&V_tcbinfo);
2765#ifdef INVARIANTS
2766	else
2767		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2768#endif
2769	ti_locked = TI_UNLOCKED;
2770
2771	/*
2772	 * Drop space held by incoming segment and return.
2773	 */
2774#ifdef TCPDEBUG
2775	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2776		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2777			  &tcp_savetcp, 0);
2778#endif
2779	if (tp != NULL)
2780		INP_WUNLOCK(tp->t_inpcb);
2781	m_freem(m);
2782}
2783
2784/*
2785 * Issue RST and make ACK acceptable to originator of segment.
2786 * The mbuf must still include the original packet header.
2787 * tp may be NULL.
2788 */
2789static void
2790tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
2791    int tlen, int rstreason)
2792{
2793	struct ip *ip;
2794#ifdef INET6
2795	struct ip6_hdr *ip6;
2796#endif
2797
2798	if (tp != NULL) {
2799		INP_WLOCK_ASSERT(tp->t_inpcb);
2800	}
2801
2802	/* Don't bother if destination was broadcast/multicast. */
2803	if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
2804		goto drop;
2805#ifdef INET6
2806	if (mtod(m, struct ip *)->ip_v == 6) {
2807		ip6 = mtod(m, struct ip6_hdr *);
2808		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2809		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2810			goto drop;
2811		/* IPv6 anycast check is done at tcp6_input() */
2812	} else
2813#endif
2814	{
2815		ip = mtod(m, struct ip *);
2816		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2817		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2818		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2819		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2820			goto drop;
2821	}
2822
2823	/* Perform bandwidth limiting. */
2824	if (badport_bandlim(rstreason) < 0)
2825		goto drop;
2826
2827	/* tcp_respond consumes the mbuf chain. */
2828	if (th->th_flags & TH_ACK) {
2829		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
2830		    th->th_ack, TH_RST);
2831	} else {
2832		if (th->th_flags & TH_SYN)
2833			tlen++;
2834		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
2835		    (tcp_seq)0, TH_RST|TH_ACK);
2836	}
2837	return;
2838drop:
2839	m_freem(m);
2840}
2841
2842/*
2843 * Parse TCP options and place in tcpopt.
2844 */
2845static void
2846tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
2847{
2848	int opt, optlen;
2849
2850	to->to_flags = 0;
2851	for (; cnt > 0; cnt -= optlen, cp += optlen) {
2852		opt = cp[0];
2853		if (opt == TCPOPT_EOL)
2854			break;
2855		if (opt == TCPOPT_NOP)
2856			optlen = 1;
2857		else {
2858			if (cnt < 2)
2859				break;
2860			optlen = cp[1];
2861			if (optlen < 2 || optlen > cnt)
2862				break;
2863		}
2864		switch (opt) {
2865		case TCPOPT_MAXSEG:
2866			if (optlen != TCPOLEN_MAXSEG)
2867				continue;
2868			if (!(flags & TO_SYN))
2869				continue;
2870			to->to_flags |= TOF_MSS;
2871			bcopy((char *)cp + 2,
2872			    (char *)&to->to_mss, sizeof(to->to_mss));
2873			to->to_mss = ntohs(to->to_mss);
2874			break;
2875		case TCPOPT_WINDOW:
2876			if (optlen != TCPOLEN_WINDOW)
2877				continue;
2878			if (!(flags & TO_SYN))
2879				continue;
2880			to->to_flags |= TOF_SCALE;
2881			to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
2882			break;
2883		case TCPOPT_TIMESTAMP:
2884			if (optlen != TCPOLEN_TIMESTAMP)
2885				continue;
2886			to->to_flags |= TOF_TS;
2887			bcopy((char *)cp + 2,
2888			    (char *)&to->to_tsval, sizeof(to->to_tsval));
2889			to->to_tsval = ntohl(to->to_tsval);
2890			bcopy((char *)cp + 6,
2891			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
2892			to->to_tsecr = ntohl(to->to_tsecr);
2893			break;
2894#ifdef TCP_SIGNATURE
2895		/*
2896		 * XXX In order to reply to a host which has set the
2897		 * TCP_SIGNATURE option in its initial SYN, we have to
2898		 * record the fact that the option was observed here
2899		 * for the syncache code to perform the correct response.
2900		 */
2901		case TCPOPT_SIGNATURE:
2902			if (optlen != TCPOLEN_SIGNATURE)
2903				continue;
2904			to->to_flags |= TOF_SIGNATURE;
2905			to->to_signature = cp + 2;
2906			break;
2907#endif
2908		case TCPOPT_SACK_PERMITTED:
2909			if (optlen != TCPOLEN_SACK_PERMITTED)
2910				continue;
2911			if (!(flags & TO_SYN))
2912				continue;
2913			if (!V_tcp_do_sack)
2914				continue;
2915			to->to_flags |= TOF_SACKPERM;
2916			break;
2917		case TCPOPT_SACK:
2918			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2919				continue;
2920			if (flags & TO_SYN)
2921				continue;
2922			to->to_flags |= TOF_SACK;
2923			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
2924			to->to_sacks = cp + 2;
2925			TCPSTAT_INC(tcps_sack_rcv_blocks);
2926			break;
2927		default:
2928			continue;
2929		}
2930	}
2931}
2932
2933/*
2934 * Pull out of band byte out of a segment so
2935 * it doesn't appear in the user's data queue.
2936 * It is still reflected in the segment length for
2937 * sequencing purposes.
2938 */
2939static void
2940tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
2941    int off)
2942{
2943	int cnt = off + th->th_urp - 1;
2944
2945	while (cnt >= 0) {
2946		if (m->m_len > cnt) {
2947			char *cp = mtod(m, caddr_t) + cnt;
2948			struct tcpcb *tp = sototcpcb(so);
2949
2950			INP_WLOCK_ASSERT(tp->t_inpcb);
2951
2952			tp->t_iobc = *cp;
2953			tp->t_oobflags |= TCPOOB_HAVEDATA;
2954			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2955			m->m_len--;
2956			if (m->m_flags & M_PKTHDR)
2957				m->m_pkthdr.len--;
2958			return;
2959		}
2960		cnt -= m->m_len;
2961		m = m->m_next;
2962		if (m == NULL)
2963			break;
2964	}
2965	panic("tcp_pulloutofband");
2966}
2967
2968/*
2969 * Collect new round-trip time estimate
2970 * and update averages and current timeout.
2971 */
2972static void
2973tcp_xmit_timer(struct tcpcb *tp, int rtt)
2974{
2975	int delta;
2976
2977	INP_WLOCK_ASSERT(tp->t_inpcb);
2978
2979	TCPSTAT_INC(tcps_rttupdated);
2980	tp->t_rttupdated++;
2981	if (tp->t_srtt != 0) {
2982		/*
2983		 * srtt is stored as fixed point with 5 bits after the
2984		 * binary point (i.e., scaled by 8).  The following magic
2985		 * is equivalent to the smoothing algorithm in rfc793 with
2986		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2987		 * point).  Adjust rtt to origin 0.
2988		 */
2989		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2990			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2991
2992		if ((tp->t_srtt += delta) <= 0)
2993			tp->t_srtt = 1;
2994
2995		/*
2996		 * We accumulate a smoothed rtt variance (actually, a
2997		 * smoothed mean difference), then set the retransmit
2998		 * timer to smoothed rtt + 4 times the smoothed variance.
2999		 * rttvar is stored as fixed point with 4 bits after the
3000		 * binary point (scaled by 16).  The following is
3001		 * equivalent to rfc793 smoothing with an alpha of .75
3002		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
3003		 * rfc793's wired-in beta.
3004		 */
3005		if (delta < 0)
3006			delta = -delta;
3007		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3008		if ((tp->t_rttvar += delta) <= 0)
3009			tp->t_rttvar = 1;
3010		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3011		    tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3012	} else {
3013		/*
3014		 * No rtt measurement yet - use the unsmoothed rtt.
3015		 * Set the variance to half the rtt (so our first
3016		 * retransmit happens at 3*rtt).
3017		 */
3018		tp->t_srtt = rtt << TCP_RTT_SHIFT;
3019		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3020		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3021	}
3022	tp->t_rtttime = 0;
3023	tp->t_rxtshift = 0;
3024
3025	/*
3026	 * the retransmit should happen at rtt + 4 * rttvar.
3027	 * Because of the way we do the smoothing, srtt and rttvar
3028	 * will each average +1/2 tick of bias.  When we compute
3029	 * the retransmit timer, we want 1/2 tick of rounding and
3030	 * 1 extra tick because of +-1/2 tick uncertainty in the
3031	 * firing of the timer.  The bias will give us exactly the
3032	 * 1.5 tick we need.  But, because the bias is
3033	 * statistical, we have to test that we don't drop below
3034	 * the minimum feasible timer (which is 2 ticks).
3035	 */
3036	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3037		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
3038
3039	/*
3040	 * We received an ack for a packet that wasn't retransmitted;
3041	 * it is probably safe to discard any error indications we've
3042	 * received recently.  This isn't quite right, but close enough
3043	 * for now (a route might have failed after we sent a segment,
3044	 * and the return path might not be symmetrical).
3045	 */
3046	tp->t_softerror = 0;
3047}
3048
3049/*
3050 * Determine a reasonable value for maxseg size.
3051 * If the route is known, check route for mtu.
3052 * If none, use an mss that can be handled on the outgoing
3053 * interface without forcing IP to fragment; if bigger than
3054 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
3055 * to utilize large mbufs.  If no route is found, route has no mtu,
3056 * or the destination isn't local, use a default, hopefully conservative
3057 * size (usually 512 or the default IP max size, but no more than the mtu
3058 * of the interface), as we can't discover anything about intervening
3059 * gateways or networks.  We also initialize the congestion/slow start
3060 * window to be a single segment if the destination isn't local.
3061 * While looking at the routing entry, we also initialize other path-dependent
3062 * parameters from pre-set or cached values in the routing entry.
3063 *
3064 * Also take into account the space needed for options that we
3065 * send regularly.  Make maxseg shorter by that amount to assure
3066 * that we can send maxseg amount of data even when the options
3067 * are present.  Store the upper limit of the length of options plus
3068 * data in maxopd.
3069 *
3070 * In case of T/TCP, we call this routine during implicit connection
3071 * setup as well (offer = -1), to initialize maxseg from the cached
3072 * MSS of our peer.
3073 *
3074 * NOTE that this routine is only called when we process an incoming
3075 * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
3076 */
3077void
3078tcp_mss_update(struct tcpcb *tp, int offer,
3079    struct hc_metrics_lite *metricptr, int *mtuflags)
3080{
3081	int mss;
3082	u_long maxmtu;
3083	struct inpcb *inp = tp->t_inpcb;
3084	struct hc_metrics_lite metrics;
3085	int origoffer = offer;
3086#ifdef INET6
3087	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3088	size_t min_protoh = isipv6 ?
3089			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
3090			    sizeof (struct tcpiphdr);
3091#else
3092	const size_t min_protoh = sizeof(struct tcpiphdr);
3093#endif
3094
3095	INP_WLOCK_ASSERT(tp->t_inpcb);
3096
3097	/* Initialize. */
3098#ifdef INET6
3099	if (isipv6) {
3100		maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags);
3101		tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
3102	} else
3103#endif
3104	{
3105		maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags);
3106		tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
3107	}
3108
3109	/*
3110	 * No route to sender, stay with default mss and return.
3111	 */
3112	if (maxmtu == 0) {
3113		/*
3114		 * In case we return early we need to initialize metrics
3115		 * to a defined state as tcp_hc_get() would do for us
3116		 * if there was no cache hit.
3117		 */
3118		if (metricptr != NULL)
3119			bzero(metricptr, sizeof(struct hc_metrics_lite));
3120		return;
3121	}
3122
3123	/* What have we got? */
3124	switch (offer) {
3125		case 0:
3126			/*
3127			 * Offer == 0 means that there was no MSS on the SYN
3128			 * segment, in this case we use tcp_mssdflt as
3129			 * already assigned to t_maxopd above.
3130			 */
3131			offer = tp->t_maxopd;
3132			break;
3133
3134		case -1:
3135			/*
3136			 * Offer == -1 means that we didn't receive SYN yet.
3137			 */
3138			/* FALLTHROUGH */
3139
3140		default:
3141			/*
3142			 * Prevent DoS attack with too small MSS. Round up
3143			 * to at least minmss.
3144			 */
3145			offer = max(offer, V_tcp_minmss);
3146	}
3147
3148	/*
3149	 * rmx information is now retrieved from tcp_hostcache.
3150	 */
3151	tcp_hc_get(&inp->inp_inc, &metrics);
3152	if (metricptr != NULL)
3153		bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
3154
3155	/*
3156	 * If there's a discovered mtu int tcp hostcache, use it
3157	 * else, use the link mtu.
3158	 */
3159	if (metrics.rmx_mtu)
3160		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
3161	else {
3162#ifdef INET6
3163		if (isipv6) {
3164			mss = maxmtu - min_protoh;
3165			if (!V_path_mtu_discovery &&
3166			    !in6_localaddr(&inp->in6p_faddr))
3167				mss = min(mss, V_tcp_v6mssdflt);
3168		} else
3169#endif
3170		{
3171			mss = maxmtu - min_protoh;
3172			if (!V_path_mtu_discovery &&
3173			    !in_localaddr(inp->inp_faddr))
3174				mss = min(mss, V_tcp_mssdflt);
3175		}
3176		/*
3177		 * XXX - The above conditional (mss = maxmtu - min_protoh)
3178		 * probably violates the TCP spec.
3179		 * The problem is that, since we don't know the
3180		 * other end's MSS, we are supposed to use a conservative
3181		 * default.  But, if we do that, then MTU discovery will
3182		 * never actually take place, because the conservative
3183		 * default is much less than the MTUs typically seen
3184		 * on the Internet today.  For the moment, we'll sweep
3185		 * this under the carpet.
3186		 *
3187		 * The conservative default might not actually be a problem
3188		 * if the only case this occurs is when sending an initial
3189		 * SYN with options and data to a host we've never talked
3190		 * to before.  Then, they will reply with an MSS value which
3191		 * will get recorded and the new parameters should get
3192		 * recomputed.  For Further Study.
3193		 */
3194	}
3195	mss = min(mss, offer);
3196
3197	/*
3198	 * Sanity check: make sure that maxopd will be large
3199	 * enough to allow some data on segments even if the
3200	 * all the option space is used (40bytes).  Otherwise
3201	 * funny things may happen in tcp_output.
3202	 */
3203	mss = max(mss, 64);
3204
3205	/*
3206	 * maxopd stores the maximum length of data AND options
3207	 * in a segment; maxseg is the amount of data in a normal
3208	 * segment.  We need to store this value (maxopd) apart
3209	 * from maxseg, because now every segment carries options
3210	 * and thus we normally have somewhat less data in segments.
3211	 */
3212	tp->t_maxopd = mss;
3213
3214	/*
3215	 * origoffer==-1 indicates that no segments were received yet.
3216	 * In this case we just guess.
3217	 */
3218	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
3219	    (origoffer == -1 ||
3220	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
3221		mss -= TCPOLEN_TSTAMP_APPA;
3222
3223#if	(MCLBYTES & (MCLBYTES - 1)) == 0
3224	if (mss > MCLBYTES)
3225		mss &= ~(MCLBYTES-1);
3226#else
3227	if (mss > MCLBYTES)
3228		mss = mss / MCLBYTES * MCLBYTES;
3229#endif
3230	tp->t_maxseg = mss;
3231}
3232
3233void
3234tcp_mss(struct tcpcb *tp, int offer)
3235{
3236	int rtt, mss;
3237	u_long bufsize;
3238	struct inpcb *inp;
3239	struct socket *so;
3240	struct hc_metrics_lite metrics;
3241	int mtuflags = 0;
3242#ifdef INET6
3243	int isipv6;
3244#endif
3245	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
3246
3247	tcp_mss_update(tp, offer, &metrics, &mtuflags);
3248
3249	mss = tp->t_maxseg;
3250	inp = tp->t_inpcb;
3251#ifdef INET6
3252	isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
3253#endif
3254
3255	/*
3256	 * If there's a pipesize, change the socket buffer to that size,
3257	 * don't change if sb_hiwat is different than default (then it
3258	 * has been changed on purpose with setsockopt).
3259	 * Make the socket buffers an integral number of mss units;
3260	 * if the mss is larger than the socket buffer, decrease the mss.
3261	 */
3262	so = inp->inp_socket;
3263	SOCKBUF_LOCK(&so->so_snd);
3264	if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
3265		bufsize = metrics.rmx_sendpipe;
3266	else
3267		bufsize = so->so_snd.sb_hiwat;
3268	if (bufsize < mss)
3269		mss = bufsize;
3270	else {
3271		bufsize = roundup(bufsize, mss);
3272		if (bufsize > sb_max)
3273			bufsize = sb_max;
3274		if (bufsize > so->so_snd.sb_hiwat)
3275			(void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
3276	}
3277	SOCKBUF_UNLOCK(&so->so_snd);
3278	tp->t_maxseg = mss;
3279
3280	SOCKBUF_LOCK(&so->so_rcv);
3281	if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
3282		bufsize = metrics.rmx_recvpipe;
3283	else
3284		bufsize = so->so_rcv.sb_hiwat;
3285	if (bufsize > mss) {
3286		bufsize = roundup(bufsize, mss);
3287		if (bufsize > sb_max)
3288			bufsize = sb_max;
3289		if (bufsize > so->so_rcv.sb_hiwat)
3290			(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
3291	}
3292	SOCKBUF_UNLOCK(&so->so_rcv);
3293	/*
3294	 * While we're here, check the others too.
3295	 */
3296	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
3297		tp->t_srtt = rtt;
3298		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
3299		TCPSTAT_INC(tcps_usedrtt);
3300		if (metrics.rmx_rttvar) {
3301			tp->t_rttvar = metrics.rmx_rttvar;
3302			TCPSTAT_INC(tcps_usedrttvar);
3303		} else {
3304			/* default variation is +- 1 rtt */
3305			tp->t_rttvar =
3306			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
3307		}
3308		TCPT_RANGESET(tp->t_rxtcur,
3309			      ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
3310			      tp->t_rttmin, TCPTV_REXMTMAX);
3311	}
3312	if (metrics.rmx_ssthresh) {
3313		/*
3314		 * There's some sort of gateway or interface
3315		 * buffer limit on the path.  Use this to set
3316		 * the slow start threshhold, but set the
3317		 * threshold to no less than 2*mss.
3318		 */
3319		tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
3320		TCPSTAT_INC(tcps_usedssthresh);
3321	}
3322	if (metrics.rmx_bandwidth)
3323		tp->snd_bandwidth = metrics.rmx_bandwidth;
3324
3325	/*
3326	 * Set the slow-start flight size depending on whether this
3327	 * is a local network or not.
3328	 *
3329	 * Extend this so we cache the cwnd too and retrieve it here.
3330	 * Make cwnd even bigger than RFC3390 suggests but only if we
3331	 * have previous experience with the remote host. Be careful
3332	 * not make cwnd bigger than remote receive window or our own
3333	 * send socket buffer. Maybe put some additional upper bound
3334	 * on the retrieved cwnd. Should do incremental updates to
3335	 * hostcache when cwnd collapses so next connection doesn't
3336	 * overloads the path again.
3337	 *
3338	 * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
3339	 * We currently check only in syncache_socket for that.
3340	 */
3341#define TCP_METRICS_CWND
3342#ifdef TCP_METRICS_CWND
3343	if (metrics.rmx_cwnd)
3344		tp->snd_cwnd = max(mss,
3345				min(metrics.rmx_cwnd / 2,
3346				 min(tp->snd_wnd, so->so_snd.sb_hiwat)));
3347	else
3348#endif
3349	if (V_tcp_do_rfc3390)
3350		tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
3351#ifdef INET6
3352	else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
3353		 (!isipv6 && in_localaddr(inp->inp_faddr)))
3354#else
3355	else if (in_localaddr(inp->inp_faddr))
3356#endif
3357		tp->snd_cwnd = mss * V_ss_fltsz_local;
3358	else
3359		tp->snd_cwnd = mss * V_ss_fltsz;
3360
3361	/* Check the interface for TSO capabilities. */
3362	if (mtuflags & CSUM_TSO)
3363		tp->t_flags |= TF_TSO;
3364}
3365
3366/*
3367 * Determine the MSS option to send on an outgoing SYN.
3368 */
3369int
3370tcp_mssopt(struct in_conninfo *inc)
3371{
3372	int mss = 0;
3373	u_long maxmtu = 0;
3374	u_long thcmtu = 0;
3375	size_t min_protoh;
3376
3377	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
3378
3379#ifdef INET6
3380	if (inc->inc_flags & INC_ISIPV6) {
3381		mss = V_tcp_v6mssdflt;
3382		maxmtu = tcp_maxmtu6(inc, NULL);
3383		thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
3384		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
3385	} else
3386#endif
3387	{
3388		mss = V_tcp_mssdflt;
3389		maxmtu = tcp_maxmtu(inc, NULL);
3390		thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
3391		min_protoh = sizeof(struct tcpiphdr);
3392	}
3393	if (maxmtu && thcmtu)
3394		mss = min(maxmtu, thcmtu) - min_protoh;
3395	else if (maxmtu || thcmtu)
3396		mss = max(maxmtu, thcmtu) - min_protoh;
3397
3398	return (mss);
3399}
3400
3401
3402/*
3403 * On a partial ack arrives, force the retransmission of the
3404 * next unacknowledged segment.  Do not clear tp->t_dupacks.
3405 * By setting snd_nxt to ti_ack, this forces retransmission timer to
3406 * be started again.
3407 */
3408static void
3409tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
3410{
3411	tcp_seq onxt = tp->snd_nxt;
3412	u_long  ocwnd = tp->snd_cwnd;
3413
3414	INP_WLOCK_ASSERT(tp->t_inpcb);
3415
3416	tcp_timer_activate(tp, TT_REXMT, 0);
3417	tp->t_rtttime = 0;
3418	tp->snd_nxt = th->th_ack;
3419	/*
3420	 * Set snd_cwnd to one segment beyond acknowledged offset.
3421	 * (tp->snd_una has not yet been updated when this function is called.)
3422	 */
3423	tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3424	tp->t_flags |= TF_ACKNOW;
3425	(void) tcp_output(tp);
3426	tp->snd_cwnd = ocwnd;
3427	if (SEQ_GT(onxt, tp->snd_nxt))
3428		tp->snd_nxt = onxt;
3429	/*
3430	 * Partial window deflation.  Relies on fact that tp->snd_una
3431	 * not updated yet.
3432	 */
3433	if (tp->snd_cwnd > th->th_ack - tp->snd_una)
3434		tp->snd_cwnd -= th->th_ack - tp->snd_una;
3435	else
3436		tp->snd_cwnd = 0;
3437	tp->snd_cwnd += tp->t_maxseg;
3438}
3439