/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 * $FreeBSD: head/sys/netinet/tcp_input.c 168615 2007-04-11 09:45:16Z andre $
 */

#include "opt_ipfw.h"		/* for ipfw_fwd */
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */

#include <vm/uma.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */

#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /*FAST_IPSEC*/

#ifdef IPSEC
#include <netinet6/ipsec.h>
#include <netinet6/ipsec6.h>
#include <netkey/key.h>
#endif /*IPSEC*/

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

static const int tcprexmtthresh = 3;

struct	tcpstat tcpstat;
SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
    &tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");

static int tcp_log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
    &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports");

static int blackhole = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
    &blackhole, 0, "Do not send RST on segments to closed ports");

int tcp_delack_enabled = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
    &tcp_delack_enabled, 0,
    "Delay ACK to try and piggyback it onto a data packet");

static int drop_synfin = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
    &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");

static int tcp_do_rfc3042 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
    &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)");

static int tcp_do_rfc3390 = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
    &tcp_do_rfc3390, 0,
    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");

static int tcp_insecure_rst = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
    &tcp_insecure_rst, 0,
    "Follow the old (insecure) criteria for accepting RST packets");

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
    "TCP Segment Reassembly Queue");

static int tcp_reass_maxseg = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
    &tcp_reass_maxseg, 0,
    "Global maximum number of TCP Segments in Reassembly Queue");

int tcp_reass_qsize = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
    &tcp_reass_qsize, 0,
    "Global number of TCP Segments currently in Reassembly Queue");

static int tcp_reass_maxqlen = 48;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW,
    &tcp_reass_maxqlen, 0,
    "Maximum number of TCP Segments per individual Reassembly Queue");

static int tcp_reass_overflows = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
    &tcp_reass_overflows, 0,
    "Global number of TCP Segment Reassembly Queue Overflows");

int	tcp_do_autorcvbuf = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
    &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");

int	tcp_autorcvbuf_inc = 16*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
    &tcp_autorcvbuf_inc, 0,
167    "Incrementor step size of automatic receive buffer");

int	tcp_autorcvbuf_max = 256*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
    &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");

struct inpcbhead tcb;
#define	tcb6	tcb  /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
struct mtx	*tcbinfo_mtx;

static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
static int	 tcp_do_segment(struct mbuf *, struct tcphdr *,
		     struct socket *, struct tcpcb *, int, int);
static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
		     struct tcpcb *, int, int);
static void	 tcp_pulloutofband(struct socket *,
		     struct tcphdr *, struct mbuf *, int);
static int	 tcp_reass(struct tcpcb *, struct tcphdr *, int *,
		     struct mbuf *);
static void	 tcp_xmit_timer(struct tcpcb *, int);
static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
static int	 tcp_timewait(struct inpcb *, struct tcpopt *,
		     struct tcphdr *, struct mbuf *, int);

/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) && (tp)->t_inpcb && \
	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
		nd6_nud_hint(NULL, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
#endif
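/*
 * The hint gives IPv6 Neighbor Discovery an upper-layer confirmation of
 * reachability: forward progress on an established TCP connection lets
 * nd6 suppress otherwise redundant Neighbor Unreachability Detection
 * probes (RFC 2461, section 7.3.1).
 */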

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 *	- there is no delayed ack timer in progress and
 *	- our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window and
 *		- delayed acks are enabled or
 *		- this is a half-synchronized T/TCP connection.
 */
#define DELAY_ACK(tp)							\
	((!tcp_timer_active(tp, TT_DELACK) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
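/*
 * Example: a lone in-order data segment arriving on an idle connection
 * normally just sets TF_DELACK; the ACK then rides on the next outbound
 * segment or goes out when the delayed-ACK timer fires.  If the last ACK
 * we sent advertised a zero window, TF_RXWIN0SENT forces an immediate
 * ACK so the window update announcing new space is never held back.
 */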

/* Adjust the TCP segment reassembly zone limit when nmbclusters changes. */
static void
tcp_reass_zone_change(void *tag)
{

	tcp_reass_maxseg = nmbclusters / 16;
	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg);
}

uma_zone_t	tcp_reass_zone;

/* Initialize the TCP segment reassembly queue. */
void
tcp_reass_init(void)
{
	tcp_reass_maxseg = nmbclusters / 16;
	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
	    &tcp_reass_maxseg);
	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg);
	EVENTHANDLER_REGISTER(nmbclusters_change,
	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
}

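/*
 * Insert the segment into the connection's reassembly queue, trimming
 * any overlap with data already queued, and hand any in-order run
 * starting at rcv_nxt up to the socket buffer.  The mbuf is consumed in
 * all cases.  Returns TH_FIN if a queued FIN was delivered, else 0.
 */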
static int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
{
	struct tseg_qent *q;
	struct tseg_qent *p = NULL;
	struct tseg_qent *nq;
	struct tseg_qent *te = NULL;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	INP_LOCK_ASSERT(tp->t_inpcb);

	/*
	 * XXX: tcp_reass() is rather inefficient with its data structures
	 * and should be rewritten (see NetBSD for optimizations).  While
	 * doing that it should move to its own file, tcp_reass.c.
	 */

	/*
	 * Call with th==NULL after becoming established to
	 * force pre-ESTABLISHED data up to the user socket.
	 */
	if (th == NULL)
		goto present;

	/*
	 * Limit the number of segments in the reassembly queue to prevent
	 * holding on to too many segments (and thus running out of mbufs).
	 * Make sure to let through the missing segment that caused this
	 * queue to form.  Always keep one global queue entry spare to be
	 * able to process that missing segment.
	 */
	if (th->th_seq != tp->rcv_nxt &&
	    (tcp_reass_qsize + 1 >= tcp_reass_maxseg ||
	     tp->t_segqlen >= tcp_reass_maxqlen)) {
		tcp_reass_overflows++;
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		*tlenp = 0;
		return (0);
	}

	/*
	 * Allocate a new queue entry.  If we can't, or we hit the zone
	 * limit, just drop the packet.
	 */
	te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
	if (te == NULL) {
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		*tlenp = 0;
		return (0);
	}
	tp->t_segqlen++;
	tcp_reass_qsize++;

	/*
	 * Find a segment which begins after this one does.
	 */
	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
			break;
		p = q;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		int i;
		/* conversion to int (in i) handles seq wraparound */
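		/*
		 * Example: if the preceding segment starts at 1000 with
		 * length 500 and we start at 1200, then
		 * i = 1000 + 500 - 1200 = 300 bytes of our head are
		 * already covered and get trimmed below.
		 */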
		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
		if (i > 0) {
			if (i >= *tlenp) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlenp;
				m_freem(m);
				uma_zfree(tcp_reass_zone, te);
				tp->t_segqlen--;
				tcp_reass_qsize--;
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto present;	/* ??? */
			}
			m_adj(m, i);
			*tlenp -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlenp;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (q) {
		int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
		if (i <= 0)
			break;
		if (i < q->tqe_len) {
			q->tqe_th->th_seq += i;
			q->tqe_len -= i;
			m_adj(q->tqe_m, i);
			break;
		}

		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		uma_zfree(tcp_reass_zone, q);
		tp->t_segqlen--;
		tcp_reass_qsize--;
		q = nq;
	}

	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_th = th;
	te->tqe_len = *tlenp;

	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
	} else {
		LIST_INSERT_AFTER(p, te, tqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);
	q = LIST_FIRST(&tp->t_segq);
	if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
		return (0);
	SOCKBUF_LOCK(&so->so_rcv);
	do {
		tp->rcv_nxt += q->tqe_len;
		flags = q->tqe_th->th_flags & TH_FIN;
		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			m_freem(q->tqe_m);
		else
			sbappendstream_locked(&so->so_rcv, q->tqe_m);
		uma_zfree(tcp_reass_zone, q);
		tp->t_segqlen--;
		tcp_reass_qsize--;
		q = nq;
	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
	ND6_HINT(tp);
	sorwakeup_locked(so);
	return (flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct in6_ifaddr *ia6;

	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);

	/*
	 * Reject TCP segments sent to an IPv6 anycast destination, as
	 * proposed in draft-itojun-ipv6-tcp-to-anycast.
	 * XXX: Is there a better place to put this check?
	 */
	ia6 = ip6_getdstifaddr(m);
	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
		struct ip6_hdr *ip6;

		ip6 = mtod(m, struct ip6_hdr *);
		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		return (IPPROTO_DONE);
	}

	tcp_input(m, *offp);
	return (IPPROTO_DONE);
}
#endif

void
tcp_input(struct mbuf *m, int off0)
{
	struct tcphdr *th;
	struct ip *ip = NULL;
	struct ipovly *ipov;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	u_char *optp = NULL;
	int optlen = 0;
	int len, tlen, off;
	int drop_hdrlen;
	int thflags;
	int rstreason = 0;	/* For badport_bandlim accounting purposes */
#ifdef IPFIREWALL_FORWARD
	struct m_tag *fwd_tag;
#endif
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;
	char ip6buf[INET6_ADDRSTRLEN];
#else
	const int isipv6 = 0;
#endif
	struct tcpopt to;		/* options in this segment */

#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be that of the largest IP header,
	 * currently the IPv6 header.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif

#ifdef INET6
	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif

	to.to_flags = 0;
	tcpstat.tcps_rcvtotal++;

	if (isipv6) {
#ifdef INET6
		/* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
		ip6 = mtod(m, struct ip6_hdr *);
		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
		if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		th = (struct tcphdr *)((caddr_t)ip6 + off0);

		/*
		 * Be proactive about an unspecified IPv6 source address.
		 * As we use all-zero to indicate an unbound/unconnected pcb,
		 * an unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			/* XXX stat */
			goto drop;
		}
#else
		th = NULL;		/* XXX: avoid compiler warning */
#endif
	} else {
		/*
		 * Get IP and TCP header together in first mbuf.
		 * Note: IP leaves IP header in first mbuf.
		 */
		if (off0 > sizeof (struct ip)) {
			ip_stripoptions(m, (struct mbuf *)0);
			off0 = sizeof(struct ip);
		}
		if (m->m_len < sizeof (struct tcpiphdr)) {
			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
			    == NULL) {
				tcpstat.tcps_rcvshort++;
				return;
			}
		}
		ip = mtod(m, struct ip *);
		ipov = (struct ipovly *)ip;
		th = (struct tcphdr *)((caddr_t)ip + off0);
		tlen = ip->ip_len;

		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
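			/*
			 * The NIC has already verified the checksum.  If it
			 * covered the pseudo-header too, csum_data holds the
			 * final sum; otherwise fold the pseudo-header in by
			 * hand.  Either way a valid segment leaves th_sum
			 * equal to 0 after the xor below, matching the
			 * software in_cksum() path and the check that
			 * follows.
			 */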
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
						ip->ip_dst.s_addr,
						htonl(m->m_pkthdr.csum_data +
							ip->ip_len +
							IPPROTO_TCP));
			th->th_sum ^= 0xffff;
#ifdef TCPDEBUG
			ipov->ih_len = (u_short)tlen;
			ipov->ih_len = htons(ipov->ih_len);
#endif
		} else {
			/*
			 * Checksum extended TCP header and data.
			 */
			len = sizeof (struct ip) + tlen;
			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
			ipov->ih_len = (u_short)tlen;
			ipov->ih_len = htons(ipov->ih_len);
			th->th_sum = in_cksum(m, len);
		}
		if (th->th_sum) {
			tcpstat.tcps_rcvbadsum++;
			goto drop;
		}
		/* Re-initialization for later version check */
		ip->ip_v = IPVERSION;
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;	/* tlen is used instead of ti->ti_len */
	if (off > sizeof (struct tcphdr)) {
		if (isipv6) {
#ifdef INET6
			IP6_EXTHDR_CHECK(m, off0, off, );
			ip6 = mtod(m, struct ip6_hdr *);
			th = (struct tcphdr *)((caddr_t)ip6 + off0);
#endif
		} else {
			if (m->m_len < sizeof(struct ip) + off) {
				if ((m = m_pullup(m, sizeof (struct ip) + off))
				    == NULL) {
					tcpstat.tcps_rcvshort++;
					return;
				}
				ip = mtod(m, struct ip *);
				ipov = (struct ipovly *)ip;
				th = (struct tcphdr *)((caddr_t)ip + off0);
			}
		}
		optlen = off - sizeof (struct tcphdr);
		optp = (u_char *)(th + 1);
	}
	thflags = th->th_flags;

	/*
	 * If the drop_synfin option is enabled, drop all packets with
	 * both the SYN and FIN bits set.  This prevents e.g. nmap from
	 * identifying the TCP/IP stack.
	 *
	 * This is a violation of the TCP specification.
	 */
	if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
		goto drop;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	th->th_seq = ntohl(th->th_seq);
	th->th_ack = ntohl(th->th_ack);
	th->th_win = ntohs(th->th_win);
	th->th_urp = ntohs(th->th_urp);

	/*
	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
	 */
	drop_hdrlen = off0 + off;
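	/*
	 * E.g. a 20 byte IPv4 header plus a 32 byte TCP header with options
	 * gives drop_hdrlen = 52; the headers are m_adj()'ed off only once
	 * the payload is about to be appended to the socket buffer.
	 */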

	/*
	 * Locate pcb for segment.
	 */
	INP_INFO_WLOCK(&tcbinfo);
findpcb:
	INP_INFO_WLOCK_ASSERT(&tcbinfo);
#ifdef IPFIREWALL_FORWARD
	/* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
	fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);

	if (fwd_tag != NULL && isipv6 == 0) {	/* No IPv6 support yet. */
		struct sockaddr_in *next_hop;

		next_hop = (struct sockaddr_in *)(fwd_tag+1);
		/*
		 * Transparently forwarded.  Pretend to be the destination.
		 * Do we already have a connection set up like this?
		 */
		inp = in_pcblookup_hash(&tcbinfo,
					ip->ip_src, th->th_sport,
					ip->ip_dst, th->th_dport,
					0, m->m_pkthdr.rcvif);
		if (!inp) {
			/* It's new.  Try to find the ambushing socket. */
			inp = in_pcblookup_hash(&tcbinfo,
						ip->ip_src, th->th_sport,
						next_hop->sin_addr,
						next_hop->sin_port ?
						    ntohs(next_hop->sin_port) :
						    th->th_dport,
						INPLOOKUP_WILDCARD,
						m->m_pkthdr.rcvif);
		}
		/* Remove the tag from the packet.  We don't need it anymore. */
		m_tag_delete(m, fwd_tag);
	} else
#endif /* IPFIREWALL_FORWARD */
	{
		if (isipv6) {
#ifdef INET6
			inp = in6_pcblookup_hash(&tcbinfo,
						 &ip6->ip6_src, th->th_sport,
						 &ip6->ip6_dst, th->th_dport,
						 INPLOOKUP_WILDCARD,
						 m->m_pkthdr.rcvif);
#endif
		} else
			inp = in_pcblookup_hash(&tcbinfo,
						ip->ip_src, th->th_sport,
						ip->ip_dst, th->th_dport,
						INPLOOKUP_WILDCARD,
						m->m_pkthdr.rcvif);
	}

#if defined(IPSEC) || defined(FAST_IPSEC)
#ifdef INET6
	if (isipv6 && inp != NULL && ipsec6_in_reject(m, inp)) {
#ifdef IPSEC
		ipsec6stat.in_polvio++;
#endif
		goto dropunlock;
	} else
#endif /* INET6 */
	if (inp != NULL && ipsec4_in_reject(m, inp)) {
#ifdef IPSEC
		ipsecstat.in_polvio++;
#endif
		goto dropunlock;
	}
#endif /*IPSEC || FAST_IPSEC*/

	/*
	 * If the INPCB does not exist then all data in the incoming
	 * segment is discarded and an appropriate RST is sent back.
	 */
	if (inp == NULL) {
		/*
		 * Log communication attempts to ports that are not
		 * in use.
		 */
		if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
		    tcp_log_in_vain == 2) {
#ifndef INET6
			char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"];
#else
			char dbuf[INET6_ADDRSTRLEN+2], sbuf[INET6_ADDRSTRLEN+2];
			if (isipv6) {
				strcpy(dbuf, "[");
				strcat(dbuf,
				    ip6_sprintf(ip6buf, &ip6->ip6_dst));
				strcat(dbuf, "]");
				strcpy(sbuf, "[");
				strcat(sbuf,
				    ip6_sprintf(ip6buf, &ip6->ip6_src));
				strcat(sbuf, "]");
			} else
#endif /* INET6 */
			{
				strcpy(dbuf, inet_ntoa(ip->ip_dst));
				strcpy(sbuf, inet_ntoa(ip->ip_src));
			}
			log(LOG_INFO,
			    "Connection attempt to TCP %s:%d "
			    "from %s:%d flags:0x%02x\n",
			    dbuf, ntohs(th->th_dport), sbuf,
			    ntohs(th->th_sport), thflags);
		}
		/*
		 * When blackholing do not respond with a RST but
		 * completely ignore the segment and drop it.
		 */
		if ((blackhole == 1 && (thflags & TH_SYN)) ||
		    blackhole == 2)
			goto dropunlock;

		rstreason = BANDLIM_RST_CLOSEDPORT;
		goto dropwithreset;
	}
	INP_LOCK(inp);

	/* Check the minimum TTL for socket. */
	if (inp->inp_ip_minttl != 0) {
#ifdef INET6
		if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim)
			goto dropunlock;
		else
#endif
		if (inp->inp_ip_minttl > ip->ip_ttl)
			goto dropunlock;
	}

	/*
	 * A previous connection in TIMEWAIT state is supposed to catch
	 * stray or duplicate segments arriving late.  If this segment
	 * was a legitimate new connection attempt the old INPCB gets
	 * removed and we can try again to find a listening socket.
	 */
	if (inp->inp_vflag & INP_TIMEWAIT) {
		if (thflags & TH_SYN)
			tcp_dooptions(&to, optp, optlen, TO_SYN);
		/* NB: tcp_timewait() unlocks the inp in both cases. */
		if (tcp_timewait(inp, &to, th, m, tlen))
			goto findpcb;
		INP_INFO_WUNLOCK(&tcbinfo);
		return;
	}
	/*
	 * The TCPCB may no longer exist if the connection is winding
	 * down or it is in the CLOSED state.  Either way we drop the
	 * segment and send an appropriate response.
	 */
	tp = intotcpcb(inp);
	if (tp == NULL) {
		INP_UNLOCK(inp);
		rstreason = BANDLIM_RST_CLOSEDPORT;
		goto dropwithreset;
	}
	if (tp->t_state == TCPS_CLOSED)
		goto dropunlock;	/* XXX: dropwithreset??? */

#ifdef MAC
	INP_LOCK_ASSERT(inp);
	if (mac_check_inpcb_deliver(inp, m))
		goto dropunlock;
#endif
	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: so == NULL", __func__));
#ifdef TCPDEBUG
	if (so->so_options & SO_DEBUG) {
		ostate = tp->t_state;
		if (isipv6)
			bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
		else
			bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
		tcp_savetcp = *th;
	}
#endif
	/*
	 * When the socket is accepting connections (the INPCB is in LISTEN
	 * state) we look into the SYN cache if this is a new connection
	 * attempt or the completion of a previous one.
	 */
	if (so->so_options & SO_ACCEPTCONN) {
		struct in_conninfo inc;

		bzero(&inc, sizeof(inc));
		inc.inc_isipv6 = isipv6;
#ifdef INET6
		if (isipv6) {
			inc.inc6_faddr = ip6->ip6_src;
			inc.inc6_laddr = ip6->ip6_dst;
		} else
#endif
		{
			inc.inc_faddr = ip->ip_src;
			inc.inc_laddr = ip->ip_dst;
		}
		inc.inc_fport = th->th_sport;
		inc.inc_lport = th->th_dport;

		/*
		 * If the state is LISTEN then ignore segment if it contains
		 * a RST.  If the segment contains an ACK then it is bad and
		 * send a RST.  If it does not contain a SYN then it is not
		 * interesting; drop it.
		 *
		 * If the state is SYN_RECEIVED (syncache) and seg contains
		 * an ACK, but not for our SYN/ACK, send a RST.  If the seg
		 * contains a RST, check the sequence number to see if it
		 * is a valid reset segment.
		 */
		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
			if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
				/*
				 * Parse the TCP options here because
				 * syncookies need access to the reflected
				 * timestamp.
				 */
				tcp_dooptions(&to, optp, optlen, 0);
				if (!syncache_expand(&inc, &to, th, &so, m)) {
					/*
					 * No syncache entry, or ACK was not
					 * for our SYN/ACK.  Send a RST.
					 */
					tcpstat.tcps_badsyn++;
					rstreason = BANDLIM_RST_OPENPORT;
					goto dropwithreset;
				}
				if (so == NULL) {
					/*
					 * Could not complete the 3-way
					 * handshake, the connection is being
					 * closed down, and the syncache has
					 * freed the mbuf.
					 */
					INP_UNLOCK(inp);
					INP_INFO_WUNLOCK(&tcbinfo);
					return;
				}
				/*
				 * Socket is created in state SYN_RECEIVED.
				 * Continue processing segment.
				 */
				INP_UNLOCK(inp);	/* listen socket */
				inp = sotoinpcb(so);
				INP_LOCK(inp);		/* new connection */
				tp = intotcpcb(inp);
				/*
				 * Process the segment and the data it
				 * contains.  tcp_do_segment() consumes
				 * the mbuf chain and unlocks the inpcb.
				 * XXX: A non-zero return (a TIME_WAIT
				 * connection was nuked) is supposed to
				 * have been handled above already.
				 */
				if (tcp_do_segment(m, th, so, tp,
						   drop_hdrlen, tlen))
					goto findpcb;	/* TIME_WAIT nuked */
				return;
			}
			if (thflags & TH_RST) {
				syncache_chkrst(&inc, th);
				goto dropunlock;
			}
			if (thflags & TH_ACK) {
				syncache_badack(&inc);
				tcpstat.tcps_badsyn++;
				rstreason = BANDLIM_RST_OPENPORT;
				goto dropwithreset;
			}
			goto dropunlock;
		}

		/*
		 * Segment's flags are (SYN) or (SYN|FIN).
		 */
#ifdef INET6
		/*
		 * If deprecated addresses are forbidden, we do not accept
		 * a SYN to a deprecated interface address, to prevent any
		 * new inbound connection from getting established.
		 * Rather than silently dropping such a SYN we answer it
		 * with a TCP RST (sourced from the deprecated address).
		 * This is a compromise: it is much better for the peer to
		 * receive a RST, and the RST will be the final packet of
		 * the exchange.
		 *
		 * If we do not forbid deprecated addresses, we accept
		 * the SYN packet.  RFC 2462 does not suggest dropping a
		 * SYN in this case.
		 * RFC 2462 5.5.4 boils down to this:
		 * 1. use of a deprecated addr with existing
		 *    communication is okay - "SHOULD continue to be
		 *    used"
		 * 2. use of it with new communication:
		 *   (2a) "SHOULD NOT be used if alternate address
		 *        with sufficient scope is available"
		 *   (2b) nothing mentioned otherwise.
		 * Here we fall into the (2b) case as we have no choice in
		 * our source address selection - we must obey the peer.
		 *
		 * The wording in RFC 2462 is confusing; it describes
		 * deprecated-address handling in several places and the
		 * descriptions are not exactly the same.  We believe
		 * 5.5.4 is the best one, so we follow 5.5.4.
		 */
		if (isipv6 && !ip6_use_deprecated) {
			struct in6_ifaddr *ia6;

			if ((ia6 = ip6_getdstifaddr(m)) &&
			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
				INP_UNLOCK(inp);
				tp = NULL;
				rstreason = BANDLIM_RST_OPENPORT;
				goto dropwithreset;
			}
		}
#endif
		/*
		 * Basic sanity checks on incoming SYN requests:
		 *
		 * Don't bother responding if the destination was a
		 * broadcast according to RFC1122 4.2.3.10, p. 104.
		 *
		 * If it is from this socket, drop it, it must be forged.
		 *
		 * Note that it is quite possible to receive unicast
		 * link-layer packets with a broadcast IP address.  Use
		 * in_broadcast() to find them.
		 */
		if (m->m_flags & (M_BCAST|M_MCAST))
			goto dropunlock;
		if (isipv6) {
#ifdef INET6
			if (th->th_dport == th->th_sport &&
			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src))
				goto dropunlock;
			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
				goto dropunlock;
#endif
		} else {
			if (th->th_dport == th->th_sport &&
			    ip->ip_dst.s_addr == ip->ip_src.s_addr)
				goto dropunlock;
			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
			    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
				goto dropunlock;
		}
		/*
		 * SYN appears to be valid.  Create compressed TCP state
		 * for syncache.
		 */
		if (so->so_qlen <= so->so_qlimit) {
#ifdef TCPDEBUG
			if (so->so_options & SO_DEBUG)
				tcp_trace(TA_INPUT, ostate, tp,
				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
			tcp_dooptions(&to, optp, optlen, TO_SYN);
			if (!syncache_add(&inc, &to, th, inp, &so, m))
				goto dropunlock;
			/*
			 * Entry added to syncache, mbuf used to
			 * send SYN-ACK packet.  Everything unlocked
			 * already.
			 */
			return;
		}
		/* Catch all.  Everything that makes it down here is junk. */
		goto dropunlock;
	}

	/*
	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or
	 * later state.  tcp_do_segment() always consumes the mbuf chain
	 * and unlocks the inpcb.
	 */
	if (tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen))
		goto findpcb;	/* XXX: TIME_WAIT was nuked. */
	return;

dropwithreset:
	INP_INFO_WLOCK_ASSERT(&tcbinfo);
	tcp_dropwithreset(m, th, tp, tlen, rstreason);
	m = NULL;	/* mbuf chain got consumed. */
dropunlock:
	INP_INFO_WLOCK_ASSERT(&tcbinfo);
	if (tp != NULL)
		INP_UNLOCK(inp);
	INP_INFO_WUNLOCK(&tcbinfo);
drop:
	INP_INFO_UNLOCK_ASSERT(&tcbinfo);
	if (m != NULL)
		m_freem(m);
	return;
}

static int
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int drop_hdrlen, int tlen)
{
	int thflags, acked, ourfinisacked, needoutput = 0;
	int headlocked = 1;
	int rstreason, todrop, win;
	u_long tiwin;
	struct tcpopt to;

#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be that of the largest IP header,
	 * currently the IPv6 header.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif
	thflags = th->th_flags;

	INP_INFO_WLOCK_ASSERT(&tcbinfo);
	INP_LOCK_ASSERT(tp->t_inpcb);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__));

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_rcvtime = ticks;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);

	/*
	 * Unscale the window into a 32-bit value.
	 * This value is bogus for the TCPS_SYN_SENT state
	 * and is overwritten later.
	 */
	tiwin = th->th_win << tp->snd_scale;

	/*
	 * Parse options on any incoming segment.
	 */
	tcp_dooptions(&to, (u_char *)(th + 1),
	    (th->th_off << 2) - sizeof(struct tcphdr),
	    (thflags & TH_SYN) ? TO_SYN : 0);

	/*
	 * If echoed timestamp is later than the current time,
	 * fall back to non RFC1323 RTT calculation.  Normalize
	 * timestamp if syncookies were used when this connection
	 * was established.
	 */
	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
		to.to_tsecr -= tp->ts_offset;
		if (TSTMP_GT(to.to_tsecr, ticks))
			to.to_tsecr = 0;
	}

	/*
	 * Process options only when we get SYN/ACK back.  The SYN case
	 * for incoming connections is handled in tcp_syncache.
	 * XXX this is traditional behavior, may need to be cleaned up.
	 */
	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
		if ((to.to_flags & TOF_SCALE) &&
		    (tp->t_flags & TF_REQ_SCALE)) {
			tp->t_flags |= TF_RCVD_SCALE;
			tp->snd_scale = to.to_wscale;
			tp->snd_wnd = th->th_win << tp->snd_scale;
			tiwin = tp->snd_wnd;
		}
		if (to.to_flags & TOF_TS) {
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = to.to_tsval;
			tp->ts_recent_age = ticks;
		}
		/*
		 * Initial send window.  Per RFC 1323 the window field of
		 * a segment carrying the SYN bit is never scaled, so no
		 * further scaling is needed here.
		 */
		tp->snd_wnd = th->th_win;
		if (to.to_flags & TOF_MSS)
			tcp_mss(tp, to.to_mss);
		if (tp->sack_enable) {
			if (!(to.to_flags & TOF_SACKPERM))
				tp->sack_enable = 0;
			else
				tp->t_flags |= TF_SACK_PERMIT;
		}
	}

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 * Make sure that the hidden state-flags are also off.
	 * Since we check for TCPS_ESTABLISHED above, it can only
	 * be TF_NEEDSYN.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
	    ((to.to_flags & TOF_TS) == 0 ||
	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
	     th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd &&
	     tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * NOTE that the test is modified according to the latest
		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
		 */
		if ((to.to_flags & TOF_TS) != 0 &&
		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = ticks;
			tp->ts_recent = to.to_tsval;
		}

		if (tlen == 0) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    ((!tcp_do_newreno && !tp->sack_enable &&
			      tp->t_dupacks < tcprexmtthresh) ||
			     ((tcp_do_newreno || tp->sack_enable) &&
			      !IN_FASTRECOVERY(tp) &&
			      (to.to_flags & TOF_SACK) == 0 &&
			      TAILQ_EMPTY(&tp->snd_holes)))) {
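				/*
				 * A pure ACK that advances snd_una while
				 * cwnd has fully opened and no dup-ACK or
				 * SACK recovery is in progress: handle it
				 * inline and release the global pcbinfo
				 * lock early.
				 */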
				KASSERT(headlocked,
				    ("%s: headlocked", __func__));
				INP_INFO_WUNLOCK(&tcbinfo);
				headlocked = 0;
				/*
				 * This is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				/*
				 * "Bad retransmit" recovery: if the ACK
				 * arrives within t_badrxtwin of our first
				 * retransmit, the timeout was probably
				 * spurious, so roll cwnd, ssthresh and
				 * snd_recover back to their saved values.
				 */
				if (tp->t_rxtshift == 1 &&
				    ticks < tp->t_badrxtwin) {
					++tcpstat.tcps_sndrexmitbad;
					tp->snd_cwnd = tp->snd_cwnd_prev;
					tp->snd_ssthresh =
					    tp->snd_ssthresh_prev;
					tp->snd_recover = tp->snd_recover_prev;
					if (tp->t_flags & TF_WASFRECOVERY)
					    ENTER_FASTRECOVERY(tp);
					tp->snd_nxt = tp->snd_max;
					tp->t_badrxtwin = 0;
				}

				/*
				 * Recalculate the transmit timer / rtt.
				 *
				 * Some boxes send broken timestamp replies
				 * during the SYN+ACK phase, ignore
				 * timestamps of 0 or we could calculate a
				 * huge RTT and blow up the retransmit timer.
				 */
				if ((to.to_flags & TOF_TS) != 0 &&
				    to.to_tsecr) {
					if (!tp->t_rttlow ||
					    tp->t_rttlow > ticks - to.to_tsecr)
						tp->t_rttlow = ticks - to.to_tsecr;
					tcp_xmit_timer(tp,
					    ticks - to.to_tsecr + 1);
				} else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
					if (!tp->t_rttlow ||
					    tp->t_rttlow > ticks - tp->t_rtttime)
						tp->t_rttlow = ticks - tp->t_rtttime;
					tcp_xmit_timer(tp,
							ticks - tp->t_rtttime);
				}
				tcp_xmit_bandwidth_limit(tp, th->th_ack);
				acked = th->th_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				sbdrop(&so->so_snd, acked);
				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
				    SEQ_LEQ(th->th_ack, tp->snd_recover))
					tp->snd_recover = th->th_ack - 1;
				tp->snd_una = th->th_ack;
				/*
				 * Pull snd_wl2 up to prevent seq wrap
				 * relative to th_ack.
				 */
				tp->snd_wl2 = th->th_ack;
				tp->t_dupacks = 0;
				m_freem(m);
				ND6_HINT(tp); /* some progress has been done */

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
#if 0	/* Long-disabled debug trace, kept for reference. */
#ifdef TCPDEBUG
				if (so->so_options & SO_DEBUG)
					tcp_trace(TA_INPUT, ostate, tp,
					    (void *)tcp_saveipgen,
					    &tcp_savetcp, 0);
#endif
#endif
				if (tp->snd_una == tp->snd_max)
					tcp_timer_activate(tp, TT_REXMT, 0);
				else if (!tcp_timer_active(tp, TT_PERSIST))
					tcp_timer_activate(tp, TT_REXMT,
						      tp->t_rxtcur);

				sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				goto check_delack;
			}
		} else if (th->th_ack == tp->snd_una &&
		    LIST_EMPTY(&tp->t_segq) &&
		    tlen <= sbspace(&so->so_rcv)) {
			int newsize = 0;	/* automatic sockbuf scaling */

			KASSERT(headlocked, ("%s: headlocked", __func__));
			INP_INFO_WUNLOCK(&tcbinfo);
			headlocked = 0;
			/*
			 * This is a pure, in-sequence data packet with
			 * nothing on the reassembly queue and we have
			 * enough buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if (tp->sack_enable && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += tlen;
			/*
			 * Pull snd_wl1 up to prevent seq wrap relative to
			 * th_seq.
			 */
			tp->snd_wl1 = th->th_seq;
			/*
			 * Pull rcv_up up to prevent seq wrap relative to
			 * rcv_nxt.
			 */
			tp->rcv_up = tp->rcv_nxt;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += tlen;
			ND6_HINT(tp);	/* some progress has been done */
#ifdef TCPDEBUG
			if (so->so_options & SO_DEBUG)
				tcp_trace(TA_INPUT, ostate, tp,
				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
		/*
		 * Automatic sizing of receive socket buffer.  Often the send
		 * buffer size is not optimally adjusted to the actual network
		 * conditions at hand (delay bandwidth product).  Setting the
		 * buffer size too small limits throughput on links with high
		 * bandwidth and high delay (e.g. transcontinental/oceanic
		 * links).
		 *
		 * On the receive side the socket buffer memory is only rarely
		 * used to any significant extent.  This allows us to be much
		 * more aggressive in scaling the receive socket buffer.  For
		 * the case that the buffer space is actually used to a large
		 * extent and we run out of kernel memory we can simply drop
		 * the new segments; TCP on the sender will just retransmit it
		 * later.  Setting the buffer size too big may only consume too
		 * much kernel memory if the application doesn't read() from
		 * the socket or packet loss or reordering makes use of the
		 * reassembly queue.
		 *
		 * The criteria to step up the receive buffer one notch are:
		 *  1. the number of bytes received during the time it takes
		 *     one timestamp to be reflected back to us (the RTT);
		 *  2. received bytes per RTT is within seven eighths of the
		 *     current socket buffer size;
		 *  3. receive buffer size has not hit maximal automatic size;
		 *
		 * This algorithm does one step per RTT at most and only if
		 * we receive a bulk stream w/o packet losses or reorderings.
		 * Shrinking the buffer during idle times is not necessary as
		 * it doesn't consume any memory when idle.
		 *
		 * TODO: Only step up if the application is actually draining
		 * the buffer, to better manage the socket buffer resources.
		 */
			if (tcp_do_autorcvbuf &&
			    to.to_tsecr &&
			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
				if (to.to_tsecr > tp->rfbuf_ts &&
				    to.to_tsecr - tp->rfbuf_ts < hz) {
					if (tp->rfbuf_cnt >
					    (so->so_rcv.sb_hiwat / 8 * 7) &&
					    so->so_rcv.sb_hiwat <
					    tcp_autorcvbuf_max) {
						newsize =
						    min(so->so_rcv.sb_hiwat +
						    tcp_autorcvbuf_inc,
						    tcp_autorcvbuf_max);
					}
					/* Start over with next RTT. */
					tp->rfbuf_ts = 0;
					tp->rfbuf_cnt = 0;
				} else
					tp->rfbuf_cnt += tlen;	/* add up */
			}
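			/*
			 * Worked example: with sb_hiwat at 64k, receiving
			 * more than 56k (seven eighths) within one
			 * reflected-timestamp RTT grows the buffer by
			 * tcp_autorcvbuf_inc (16k by default) toward the
			 * tcp_autorcvbuf_max cap (256k by default).
			 */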

			/* Add data to socket buffer. */
			SOCKBUF_LOCK(&so->so_rcv);
			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
				m_freem(m);
			} else {
				/*
				 * Set new socket buffer size.
				 * Give up when limit is reached.
				 */
				if (newsize)
					if (!sbreserve_locked(&so->so_rcv,
					    newsize, so, curthread))
						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
				m_adj(m, drop_hdrlen);	/* delayed header drop */
				sbappendstream_locked(&so->so_rcv, m);
			}
			sorwakeup_locked(so);
			if (DELAY_ACK(tp)) {
				tp->t_flags |= TF_DELACK;
			} else {
				tp->t_flags |= TF_ACKNOW;
				tcp_output(tp);
			}
			goto check_delack;
		}
	}
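	/*
	 * Header prediction did not apply; take the full slow path below.
	 */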

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
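	/*
	 * The imax() above keeps us from taking back window space we have
	 * already advertised: rcv_adv - rcv_nxt is what the peer may still
	 * send under the last advertisement, even if the socket buffer has
	 * since filled up.
	 */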

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	tp->rfbuf_ts = 0;
	tp->rfbuf_cnt = 0;

	switch (tp->t_state) {

	/*
	 * If the state is SYN_RECEIVED:
	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
	 */
	case TCPS_SYN_RECEIVED:
		if ((thflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
		     SEQ_GT(th->th_ack, tp->snd_max))) {
			rstreason = BANDLIM_RST_OPENPORT;
			goto dropwithreset;
		}
		break;

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((thflags & TH_ACK) &&
		    (SEQ_LEQ(th->th_ack, tp->iss) ||
		     SEQ_GT(th->th_ack, tp->snd_max))) {
			rstreason = BANDLIM_UNLIMITED;
			goto dropwithreset;
		}
		if (thflags & TH_RST) {
			if (thflags & TH_ACK) {
				KASSERT(headlocked, ("%s: after_listen: "
				    "tcp_drop.2: head not locked", __func__));
				tp = tcp_drop(tp, ECONNREFUSED);
			}
			goto drop;
		}
		if ((thflags & TH_SYN) == 0)
			goto drop;

		tp->irs = th->th_seq;
		tcp_rcvseqinit(tp);
		if (thflags & TH_ACK) {
			tcpstat.tcps_connects++;
			soisconnected(so);
#ifdef MAC
			SOCK_LOCK(so);
			mac_set_socket_peer_from_mbuf(m, so);
			SOCK_UNLOCK(so);
#endif
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->rcv_scale = tp->request_r_scale;
			}
			tp->rcv_adv += tp->rcv_wnd;
			tp->snd_una++;		/* SYN is acked */
			/*
			 * If there's data, delay ACK; if there's also a FIN
			 * ACKNOW will be turned on later.
			 */
			if (DELAY_ACK(tp) && tlen != 0)
				tcp_timer_activate(tp, TT_DELACK,
				    tcp_delacktime);
			else
				tp->t_flags |= TF_ACKNOW;
			/*
			 * Received <SYN,ACK> in SYN_SENT[*] state.
			 * Transitions:
			 *	SYN_SENT  --> ESTABLISHED
			 *	SYN_SENT* --> FIN_WAIT_1
			 */
			tp->t_starttime = ticks;
			if (tp->t_flags & TF_NEEDFIN) {
				tp->t_state = TCPS_FIN_WAIT_1;
				tp->t_flags &= ~TF_NEEDFIN;
				thflags &= ~TH_SYN;
			} else {
				tp->t_state = TCPS_ESTABLISHED;
				tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
			}
		} else {
			/*
			 * Received initial SYN in SYN-SENT[*] state =>
			 * simultaneous open.  If segment contains CC option
			 * and there is a cached CC, apply TAO test.
			 * If it succeeds, the connection is half-synchronized.
			 * Otherwise, do 3-way handshake:
			 *        SYN-SENT -> SYN-RECEIVED
			 *        SYN-SENT* -> SYN-RECEIVED*
			 * If there was no CC option, clear cached CC value.
			 */
			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
			tcp_timer_activate(tp, TT_REXMT, 0);
			tp->t_state = TCPS_SYN_RECEIVED;
		}

		KASSERT(headlocked, ("%s: trimthenstep6: head not locked",
		    __func__));
		INP_LOCK_ASSERT(tp->t_inpcb);

		/*
		 * Advance th->th_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		th->th_seq++;
		if (tlen > tp->rcv_wnd) {
			todrop = tlen - tp->rcv_wnd;
			m_adj(m, -todrop);
			tlen = tp->rcv_wnd;
			thflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = th->th_seq - 1;
		tp->rcv_up = th->th_seq;
		/*
		 * Client side of transaction: already sent SYN and data.
		 * If the remote host used T/TCP to validate the SYN,
		 * our data will be ACK'd; if so, enter normal data segment
		 * processing in the middle of step 5, ack processing.
		 * Otherwise, goto step 6.
		 */
		if (thflags & TH_ACK)
			goto process_ACK;

		goto step6;

	/*
	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
	 *      do normal processing.
	 *
	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
	 */
	case TCPS_LAST_ACK:
	case TCPS_CLOSING:
	case TCPS_TIME_WAIT:
		KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: timewait",
		    __func__));
		break;  /* continue normal processing */
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check the RST flag and sequence number since reset segments
	 * are exempt from the timestamp and connection count tests.  This
	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
	 * below which allowed reset segments in half the sequence space
	 * to fall through and be processed (which gives forged reset
	 * segments with a random sequence number a 50 percent chance of
	 * killing a connection).
	 * Then check timestamp, if present.
	 * Then check the connection count, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 *
	 * If the RST bit is set, check the sequence number to see
	 * if this is a valid reset segment.
	 * RFC 793 page 37:
	 *   In all states except SYN-SENT, all reset (RST) segments
	 *   are validated by checking their SEQ-fields.  A reset is
	 *   valid if its sequence number is in the window.
	 * Note: this does not take into account delayed ACKs, so
	 *   we should test against last_ack_sent instead of rcv_nxt.
	 *   The sequence number in the reset segment is normally an
	 *   echo of our outgoing acknowledgment numbers, but some hosts
	 *   send a reset with the sequence number at the rightmost edge
	 *   of our receive window, and we have to handle this case.
	 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
	 *   that brute force RST attacks are possible.  To combat this,
	 *   we use a much stricter check while in the ESTABLISHED state,
	 *   only accepting RSTs where the sequence number is equal to
	 *   last_ack_sent.  In all other states (the states in which a
	 *   RST is more likely), the more permissive check is used.
	 * If we have multiple segments in flight, the initial reset
	 * segment sequence numbers will be to the left of last_ack_sent,
	 * but they will eventually catch up.
	 * In any case, it never made sense to trim reset segments to
	 * fit the receive window since RFC 1122 says:
	 *   4.2.2.12  RST Segment: RFC-793 Section 3.4
	 *
	 *    A TCP SHOULD allow a received RST segment to include data.
	 *
	 *    DISCUSSION
	 *         It has been suggested that a RST segment could contain
	 *         ASCII text that encoded and explained the cause of the
	 *         RST.  No standard has yet been established for such
	 *         data.
	 *
	 * If the reset segment passes the sequence number test examine
	 * the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK STATES:
	 *	Close the tcb.
	 *    TIME_WAIT STATE:
	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
	 *      RFC 1337.
	 */
	if (thflags & TH_RST) {
		if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
		    SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
			switch (tp->t_state) {

			case TCPS_SYN_RECEIVED:
				so->so_error = ECONNREFUSED;
				goto close;

			case TCPS_ESTABLISHED:
				if (tcp_insecure_rst == 0 &&
				    !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
				    SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
				    !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
				    SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
					tcpstat.tcps_badrst++;
					goto drop;
				}
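				/* FALLTHROUGH */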
			case TCPS_FIN_WAIT_1:
			case TCPS_FIN_WAIT_2:
			case TCPS_CLOSE_WAIT:
				so->so_error = ECONNRESET;
			close:
				tp->t_state = TCPS_CLOSED;
				tcpstat.tcps_drops++;
				KASSERT(headlocked, ("%s: trimthenstep6: "
				    "tcp_close: head not locked", __func__));
				tp = tcp_close(tp);
				break;

			case TCPS_CLOSING:
			case TCPS_LAST_ACK:
				KASSERT(headlocked, ("%s: trimthenstep6: "
				    "tcp_close.2: head not locked", __func__));
				tp = tcp_close(tp);
				break;

			case TCPS_TIME_WAIT:
				KASSERT(tp->t_state != TCPS_TIME_WAIT,
				    ("%s: timewait", __func__));
				break;
			}
		}
		goto drop;
	}

	/*
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than ts_recent, drop it.
	 */
	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += tlen;
			tcpstat.tcps_pawsdrop++;
			if (tlen)
				goto dropafterack;
			goto drop;
		}
	}

	/*
	 * In the SYN-RECEIVED state, validate that the packet belongs to
	 * this connection before trimming the data to fit the receive
	 * window.  Check the sequence number versus IRS since we know
	 * the sequence numbers haven't wrapped.  This is a partial fix
	 * for the "LAND" DoS attack.
	 */
	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
		rstreason = BANDLIM_RST_OPENPORT;
		goto dropwithreset;
	}

	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (thflags & TH_SYN) {
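			/*
			 * The SYN occupies the sequence slot just before
			 * the data, so consume it here and pull the urgent
			 * pointer along before trimming the payload below.
			 */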
			thflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				thflags &= ~TH_URG;
			todrop--;
		}
		/*
		 * Following if statement from Stevens, vol. 2, p. 960.
		 */
		if (todrop > tlen
		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the window.
			 * At this point the FIN must be a duplicate or out
			 * of sequence; drop it.
			 */
			thflags &= ~TH_FIN;

			/*
			 * Send an ACK to resynchronize and drop any data.
			 * But keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = tlen;
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += todrop;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		drop_hdrlen += todrop;	/* drop from the top afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			thflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}
1720
1721	/*
1722	 * If new data are received on a connection after the
1723	 * user processes are gone, then RST the other end.
1724	 */
1725	if ((so->so_state & SS_NOFDREF) &&
1726	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1727		KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head "
1728		    "not locked", __func__));
1729		tp = tcp_close(tp);
1730		tcpstat.tcps_rcvafterclose++;
1731		rstreason = BANDLIM_UNLIMITED;
1732		goto dropwithreset;
1733	}
1734
1735	/*
1736	 * If segment ends after window, drop trailing data
1737	 * (and PUSH and FIN); if nothing left, just ACK.
1738	 */
1739	todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1740	if (todrop > 0) {
1741		tcpstat.tcps_rcvpackafterwin++;
1742		if (todrop >= tlen) {
1743			tcpstat.tcps_rcvbyteafterwin += tlen;
1744			/*
1745			 * If a new connection request is received
1746			 * while in TIME_WAIT, drop the old connection
1747			 * and start over if the sequence numbers
1748			 * are above the previous ones.
1749			 */
1750			KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: timewait",
1751			    __func__));
1752			if (thflags & TH_SYN &&
1753			    tp->t_state == TCPS_TIME_WAIT &&
1754			    SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1755				KASSERT(headlocked, ("%s: trimthenstep6: "
1756				    "tcp_close.4: head not locked", __func__));
1757				tp = tcp_close(tp);
1758				/* XXX: Shouldn't be possible. */
1759				return (1);
1760			}
1761			/*
1762			 * If window is closed can only take segments at
1763			 * window edge, and have to drop data and PUSH from
1764			 * incoming segments.  Continue processing, but
1765			 * remember to ack.  Otherwise, drop segment
1766			 * and ack.
1767			 */
1768			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1769				tp->t_flags |= TF_ACKNOW;
1770				tcpstat.tcps_rcvwinprobe++;
1771			} else
1772				goto dropafterack;
1773		} else
1774			tcpstat.tcps_rcvbyteafterwin += todrop;
1775		m_adj(m, -todrop);
1776		tlen -= todrop;
1777		thflags &= ~(TH_PUSH|TH_FIN);
1778	}
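
	/*
	 * Example with hypothetical numbers: rcv_nxt = 1000 and
	 * rcv_wnd = 500 put the right window edge at 1500.  A segment
	 * with th_seq = 1300 and tlen = 400 ends at 1700, so
	 * todrop = 200: m_adj() cuts the last 200 bytes and PUSH/FIN
	 * are cleared, since a FIN beyond the window cannot be
	 * processed yet.
	 */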
1779
1780	/*
1781	 * If last ACK falls within this segment's sequence numbers,
1782	 * record its timestamp.
1783	 * NOTE:
1784	 * 1) That the test incorporates suggestions from the latest
1785	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
1786	 * 2) That updating only on newer timestamps interferes with
1787	 *    our earlier PAWS tests, so this check should be solely
1788	 *    predicated on the sequence space of this segment.
1789	 * 3) That we modify the segment boundary check to be
1790	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
1791	 *    instead of RFC1323's
1792	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len.
1793	 *    This modified check allows us to overcome RFC1323's
1794	 *    limitations as described in Stevens TCP/IP Illustrated
1795	 *    Vol. 2 p.869. In such cases, we can still calculate the
1796	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
1797	 */
1798	if ((to.to_flags & TOF_TS) != 0 &&
1799	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1800	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
1801		((thflags & (TH_SYN|TH_FIN)) != 0))) {
1802		tp->ts_recent_age = ticks;
1803		tp->ts_recent = to.to_tsval;
1804	}
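
	/*
	 * Example of why "<=" matters (hypothetical numbers): with
	 * last_ack_sent = 100, a pure ACK with th_seq = 100, tlen = 0
	 * and no SYN/FIN fails RFC1323's strict test (100 < 100 + 0)
	 * but passes the relaxed test above (100 <= 100 + 0), so
	 * ts_recent is still refreshed and the RTT remains measurable
	 * when RCV.NXT == Last.ACK.Sent.
	 */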
1805
1806	/*
1807	 * If a SYN is in the window, then this is an
1808	 * error and we send an RST and drop the connection.
1809	 */
1810	if (thflags & TH_SYN) {
1811		KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: "
1812		    "head not locked", __func__));
1813		tp = tcp_drop(tp, ECONNRESET);
1814		rstreason = BANDLIM_UNLIMITED;
1815		goto drop;
1816	}
1817
1818	/*
1819	 * If the ACK bit is off:  if in SYN-RECEIVED state or the TF_NEEDSYN
1820	 * flag is on (half-synchronized state), then queue data for
1821	 * later processing; else drop segment and return.
1822	 */
1823	if ((thflags & TH_ACK) == 0) {
1824		if (tp->t_state == TCPS_SYN_RECEIVED ||
1825		    (tp->t_flags & TF_NEEDSYN))
1826			goto step6;
1827		else if (tp->t_flags & TF_ACKNOW)
1828			goto dropafterack;
1829		else
1830			goto drop;
1831	}
1832
1833	/*
1834	 * Ack processing.
1835	 */
1836	switch (tp->t_state) {
1837
1838	/*
1839	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1840	 * ESTABLISHED state and continue processing.
1841	 * The ACK was checked above.
1842	 */
1843	case TCPS_SYN_RECEIVED:
1844
1845		tcpstat.tcps_connects++;
1846		soisconnected(so);
1847		/* Do window scaling? */
1848		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1849			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
1850			tp->rcv_scale = tp->request_r_scale;
1851			tp->snd_wnd = tiwin;
1852		}
1853		/*
1854		 * Make transitions:
1855		 *      SYN-RECEIVED  -> ESTABLISHED
1856		 *      SYN-RECEIVED* -> FIN-WAIT-1
1857		 */
1858		tp->t_starttime = ticks;
1859		if (tp->t_flags & TF_NEEDFIN) {
1860			tp->t_state = TCPS_FIN_WAIT_1;
1861			tp->t_flags &= ~TF_NEEDFIN;
1862		} else {
1863			tp->t_state = TCPS_ESTABLISHED;
1864			tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
1865		}
1866		/*
1867		 * If segment contains data or ACK, will call tcp_reass()
1868		 * later; if not, do so now to pass queued data to user.
1869		 */
1870		if (tlen == 0 && (thflags & TH_FIN) == 0)
1871			(void) tcp_reass(tp, (struct tcphdr *)0, 0,
1872			    (struct mbuf *)0);
1873		tp->snd_wl1 = th->th_seq - 1;
1874		/* FALLTHROUGH */
1875
1876	/*
1877	 * In ESTABLISHED state: drop duplicate ACKs; ACK out-of-range
1878	 * ACKs.  If the ack is in the range
1879	 *	tp->snd_una < th->th_ack <= tp->snd_max
1880	 * then advance tp->snd_una to th->th_ack and drop
1881	 * data from the retransmission queue.  If this ACK reflects
1882	 * more up-to-date window information, we update our window state.
1883	 */
1884	case TCPS_ESTABLISHED:
1885	case TCPS_FIN_WAIT_1:
1886	case TCPS_FIN_WAIT_2:
1887	case TCPS_CLOSE_WAIT:
1888	case TCPS_CLOSING:
1889	case TCPS_LAST_ACK:
1890	case TCPS_TIME_WAIT:
1891		KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: timewait",
1892		    __func__));
1893		if (SEQ_GT(th->th_ack, tp->snd_max)) {
1894			tcpstat.tcps_rcvacktoomuch++;
1895			goto dropafterack;
1896		}
1897		if (tp->sack_enable &&
1898		    ((to.to_flags & TOF_SACK) ||
1899		     !TAILQ_EMPTY(&tp->snd_holes)))
1900			tcp_sack_doack(tp, &to, th->th_ack);
1901		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1902			if (tlen == 0 && tiwin == tp->snd_wnd) {
1903				tcpstat.tcps_rcvdupack++;
1904				/*
1905				 * If we have outstanding data (other than
1906				 * a window probe), this is a completely
1907				 * duplicate ack (i.e., window info didn't
1908				 * change), the ack is the biggest we've
1909				 * seen and we've seen exactly our rexmt
1910				 * threshold of them, assume a packet
1911				 * has been dropped and retransmit it.
1912				 * Kludge snd_nxt & the congestion
1913				 * window so we send only this one
1914				 * packet.
1915				 *
1916				 * We know we're losing at the current
1917				 * window size so do congestion avoidance
1918				 * (set ssthresh to half the current window
1919				 * and pull our congestion window back to
1920				 * the new ssthresh).
1921				 *
1922				 * Dup acks mean that packets have left the
1923				 * network (they're now cached at the receiver)
1924				 * so bump cwnd by the amount in the receiver
1925				 * to keep a constant cwnd packets in the
1926				 * network.
1927				 */
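
				/*
				 * Worked example with hypothetical numbers:
				 * t_maxseg = 1460 and min(snd_wnd, snd_cwnd)
				 * = 29200 (20 segments).  The third duplicate
				 * ACK sets snd_ssthresh to
				 * 29200 / 2 / 1460 * 1460 = 14600 (10
				 * segments), retransmits one segment with
				 * snd_cwnd pinched to a single maxseg, and
				 * every further duplicate ACK inflates
				 * snd_cwnd by one maxseg as described above.
				 */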
1928				if (!tcp_timer_active(tp, TT_REXMT) ||
1929				    th->th_ack != tp->snd_una)
1930					tp->t_dupacks = 0;
1931				else if (++tp->t_dupacks > tcprexmtthresh ||
1932				    ((tcp_do_newreno || tp->sack_enable) &&
1933				     IN_FASTRECOVERY(tp))) {
1934					if (tp->sack_enable && IN_FASTRECOVERY(tp)) {
1935						int awnd;
1936
1937						/*
1938						 * Compute the amount of data in flight first.
1939						 * We can inject new data into the pipe iff
1940						 * we have less than 1/2 the original window's
1941						 * worth of data in flight.
1942						 */
1943						awnd = (tp->snd_nxt - tp->snd_fack) +
1944							tp->sackhint.sack_bytes_rexmit;
1945						if (awnd < tp->snd_ssthresh) {
1946							tp->snd_cwnd += tp->t_maxseg;
1947							if (tp->snd_cwnd > tp->snd_ssthresh)
1948								tp->snd_cwnd = tp->snd_ssthresh;
1949						}
1950					} else
1951						tp->snd_cwnd += tp->t_maxseg;
1952					(void) tcp_output(tp);
1953					goto drop;
1954				} else if (tp->t_dupacks == tcprexmtthresh) {
1955					tcp_seq onxt = tp->snd_nxt;
1956					u_int win;
1957
1958					/*
1959					 * If we're doing sack, check to
1960					 * see if we're already in sack
1961					 * recovery. If we're not doing sack,
1962					 * check to see if we're in newreno
1963					 * recovery.
1964					 */
1965					if (tp->sack_enable) {
1966						if (IN_FASTRECOVERY(tp)) {
1967							tp->t_dupacks = 0;
1968							break;
1969						}
1970					} else if (tcp_do_newreno) {
1971						if (SEQ_LEQ(th->th_ack,
1972						    tp->snd_recover)) {
1973							tp->t_dupacks = 0;
1974							break;
1975						}
1976					}
1977					win = min(tp->snd_wnd, tp->snd_cwnd) /
1978					    2 / tp->t_maxseg;
1979					if (win < 2)
1980						win = 2;
1981					tp->snd_ssthresh = win * tp->t_maxseg;
1982					ENTER_FASTRECOVERY(tp);
1983					tp->snd_recover = tp->snd_max;
1984					tcp_timer_activate(tp, TT_REXMT, 0);
1985					tp->t_rtttime = 0;
1986					if (tp->sack_enable) {
1987						tcpstat.tcps_sack_recovery_episode++;
1988						tp->sack_newdata = tp->snd_nxt;
1989						tp->snd_cwnd = tp->t_maxseg;
1990						(void) tcp_output(tp);
1991						goto drop;
1992					}
1993					tp->snd_nxt = th->th_ack;
1994					tp->snd_cwnd = tp->t_maxseg;
1995					(void) tcp_output(tp);
1996					KASSERT(tp->snd_limited <= 2,
1997					    ("%s: tp->snd_limited too big",
1998					    __func__));
1999					tp->snd_cwnd = tp->snd_ssthresh +
2000					     tp->t_maxseg *
2001					     (tp->t_dupacks - tp->snd_limited);
2002					if (SEQ_GT(onxt, tp->snd_nxt))
2003						tp->snd_nxt = onxt;
2004					goto drop;
2005				} else if (tcp_do_rfc3042) {
2006					u_long oldcwnd = tp->snd_cwnd;
2007					tcp_seq oldsndmax = tp->snd_max;
2008					u_int sent;
2009
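					/*
					 * Limited transmit sketch
					 * (hypothetical numbers): on the
					 * first duplicate ACK, snd_limited
					 * is reset to 0 and, with
					 * snd_nxt - snd_una = 2 * maxseg in
					 * flight, cwnd is briefly set to
					 * 3 * maxseg so tcp_output() may
					 * emit exactly one previously unsent
					 * segment; cwnd is then restored
					 * below.
					 */
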
2010					KASSERT(tp->t_dupacks == 1 ||
2011					    tp->t_dupacks == 2,
2012					    ("%s: dupacks not 1 or 2",
2013					    __func__));
2014					if (tp->t_dupacks == 1)
2015						tp->snd_limited = 0;
2016					tp->snd_cwnd =
2017					    (tp->snd_nxt - tp->snd_una) +
2018					    (tp->t_dupacks - tp->snd_limited) *
2019					    tp->t_maxseg;
2020					(void) tcp_output(tp);
2021					sent = tp->snd_max - oldsndmax;
2022					if (sent > tp->t_maxseg) {
2023						KASSERT((tp->t_dupacks == 2 &&
2024						    tp->snd_limited == 0) ||
2025						   (sent == tp->t_maxseg + 1 &&
2026						    tp->t_flags & TF_SENTFIN),
2027						    ("%s: sent too much",
2028						    __func__));
2029						tp->snd_limited = 2;
2030					} else if (sent > 0)
2031						++tp->snd_limited;
2032					tp->snd_cwnd = oldcwnd;
2033					goto drop;
2034				}
2035			} else
2036				tp->t_dupacks = 0;
2037			break;
2038		}
2039
2040		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
2041		    ("%s: th_ack <= snd_una", __func__));
2042
2043		/*
2044		 * If the congestion window was inflated to account
2045		 * for the other side's cached packets, retract it.
2046		 */
2047		if (tcp_do_newreno || tp->sack_enable) {
2048			if (IN_FASTRECOVERY(tp)) {
2049				if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2050					if (tp->sack_enable)
2051						tcp_sack_partialack(tp, th);
2052					else
2053						tcp_newreno_partial_ack(tp, th);
2054				} else {
2055					/*
2056					 * Out of fast recovery.
2057					 * Window inflation should have left us
2058					 * with approximately snd_ssthresh
2059					 * worth of data outstanding.
2060					 * But in case we would be inclined to
2061					 * send a burst, better to do it via
2062					 * the slow start mechanism.
2063					 */
2064					if (SEQ_GT(th->th_ack +
2065							tp->snd_ssthresh,
2066						   tp->snd_max))
2067						tp->snd_cwnd = tp->snd_max -
2068								th->th_ack +
2069								tp->t_maxseg;
2070					else
2071						tp->snd_cwnd = tp->snd_ssthresh;
2072				}
2073			}
2074		} else {
2075			if (tp->t_dupacks >= tcprexmtthresh &&
2076			    tp->snd_cwnd > tp->snd_ssthresh)
2077				tp->snd_cwnd = tp->snd_ssthresh;
2078		}
2079		tp->t_dupacks = 0;
2080		/*
2081		 * If we reach this point, ACK is not a duplicate,
2082		 *     i.e., it ACKs something we sent.
2083		 */
2084		if (tp->t_flags & TF_NEEDSYN) {
2085			/*
2086			 * T/TCP: Connection was half-synchronized, and our
2087			 * SYN has been ACK'd (so connection is now fully
2088			 * synchronized).  Go to non-starred state,
2089			 * increment snd_una for ACK of SYN, and check if
2090			 * we can do window scaling.
2091			 */
2092			tp->t_flags &= ~TF_NEEDSYN;
2093			tp->snd_una++;
2094			/* Do window scaling? */
2095			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2096				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
2097				tp->rcv_scale = tp->request_r_scale;
2098				/* Send window already scaled. */
2099			}
2100		}
2101
2102process_ACK:
2103		KASSERT(headlocked, ("%s: process_ACK: head not locked",
2104		    __func__));
2105		INP_LOCK_ASSERT(tp->t_inpcb);
2106
2107		acked = th->th_ack - tp->snd_una;
2108		tcpstat.tcps_rcvackpack++;
2109		tcpstat.tcps_rcvackbyte += acked;
2110
2111		/*
2112		 * If we just performed our first retransmit, and the ACK
2113		 * arrives within our recovery window, then it was a mistake
2114		 * to do the retransmit in the first place.  Recover our
2115		 * original cwnd and ssthresh, and proceed to transmit where
2116		 * we left off.
2117		 */
2118		if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
2119			++tcpstat.tcps_sndrexmitbad;
2120			tp->snd_cwnd = tp->snd_cwnd_prev;
2121			tp->snd_ssthresh = tp->snd_ssthresh_prev;
2122			tp->snd_recover = tp->snd_recover_prev;
2123			if (tp->t_flags & TF_WASFRECOVERY)
2124				ENTER_FASTRECOVERY(tp);
2125			tp->snd_nxt = tp->snd_max;
2126			tp->t_badrxtwin = 0;	/* XXX probably not required */
2127		}
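
		/*
		 * Hypothetical timeline: the retransmit timer fires once
		 * (t_rxtshift == 1) and sets t_badrxtwin to a short window
		 * (on the order of half the smoothed RTT).  If the ACK for
		 * the original transmission then arrives inside that window,
		 * the retransmit was spurious, and the cwnd, ssthresh and
		 * recovery state saved beforehand are restored here.
		 */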
2128
2129		/*
2130		 * If we have a timestamp reply, update smoothed
2131		 * round trip time.  If no timestamp is present but
2132		 * transmit timer is running and timed sequence
2133		 * number was acked, update smoothed round trip time.
2134		 * Since we now have an rtt measurement, cancel the
2135		 * timer backoff (cf., Phil Karn's retransmit alg.).
2136		 * Recompute the initial retransmit timer.
2137		 *
2138		 * Some boxes send broken timestamp replies
2139		 * during the SYN+ACK phase, ignore
2140		 * timestamps of 0 or we could calculate a
2141		 * huge RTT and blow up the retransmit timer.
2142		 */
2143		if ((to.to_flags & TOF_TS) != 0 &&
2144		    to.to_tsecr) {
2145			if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr)
2146				tp->t_rttlow = ticks - to.to_tsecr;
2147			tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
2148		} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
2149			if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
2150				tp->t_rttlow = ticks - tp->t_rtttime;
2151			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
2152		}
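
		/*
		 * Sampling example (hypothetical numbers): if ticks = 5120
		 * and the peer echoes a timestamp of 5000, the sample fed to
		 * tcp_xmit_timer() is 5120 - 5000 + 1 = 121 ticks; the +1
		 * keeps a same-tick echo from producing a zero sample.  A
		 * tsecr of 0 is treated as broken and ignored entirely.
		 */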
2153		tcp_xmit_bandwidth_limit(tp, th->th_ack);
2154
2155		/*
2156		 * If all outstanding data is acked, stop retransmit
2157		 * timer and remember to restart (more output or persist).
2158		 * If there is more data to be acked, restart retransmit
2159		 * timer, using current (possibly backed-off) value.
2160		 */
2161		if (th->th_ack == tp->snd_max) {
2162			tcp_timer_activate(tp, TT_REXMT, 0);
2163			needoutput = 1;
2164		} else if (!tcp_timer_active(tp, TT_PERSIST))
2165			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
2166
2167		/*
2168		 * If no data (only SYN) was ACK'd,
2169		 *    skip rest of ACK processing.
2170		 */
2171		if (acked == 0)
2172			goto step6;
2173
2174		/*
2175		 * When new data is acked, open the congestion window.
2176		 * If the window gives us less than ssthresh packets
2177		 * in flight, open exponentially (maxseg per packet).
2178		 * Otherwise open linearly: maxseg per window
2179		 * (maxseg^2 / cwnd per packet).
2180		 */
2181		if ((!tcp_do_newreno && !tp->sack_enable) ||
2182		    !IN_FASTRECOVERY(tp)) {
2183			u_int cw = tp->snd_cwnd;
2184			u_int incr = tp->t_maxseg;
2185			if (cw > tp->snd_ssthresh)
2186				incr = incr * incr / cw;
2187			tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
2188		}
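
		/*
		 * Growth example (hypothetical numbers, maxseg = 1460): in
		 * slow start with cw <= ssthresh, each ACK adds a full
		 * maxseg.  In congestion avoidance with cw = 29200, each
		 * ACK adds 1460 * 1460 / 29200 = 73 bytes, i.e. roughly one
		 * maxseg per window's worth of ACKs.
		 */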
2189		SOCKBUF_LOCK(&so->so_snd);
2190		if (acked > so->so_snd.sb_cc) {
2191			tp->snd_wnd -= so->so_snd.sb_cc;
2192			sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc);
2193			ourfinisacked = 1;
2194		} else {
2195			sbdrop_locked(&so->so_snd, acked);
2196			tp->snd_wnd -= acked;
2197			ourfinisacked = 0;
2198		}
2199		sowwakeup_locked(so);
2200		/* detect una wraparound */
2201		if ((tcp_do_newreno || tp->sack_enable) &&
2202		    !IN_FASTRECOVERY(tp) &&
2203		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
2204		    SEQ_LEQ(th->th_ack, tp->snd_recover))
2205			tp->snd_recover = th->th_ack - 1;
2206		if ((tcp_do_newreno || tp->sack_enable) &&
2207		    IN_FASTRECOVERY(tp) &&
2208		    SEQ_GEQ(th->th_ack, tp->snd_recover))
2209			EXIT_FASTRECOVERY(tp);
2210		tp->snd_una = th->th_ack;
2211		if (tp->sack_enable) {
2212			if (SEQ_GT(tp->snd_una, tp->snd_recover))
2213				tp->snd_recover = tp->snd_una;
2214		}
2215		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2216			tp->snd_nxt = tp->snd_una;
2217
2218		switch (tp->t_state) {
2219
2220		/*
2221		 * In FIN_WAIT_1 STATE, in addition to the processing
2222		 * for the ESTABLISHED state, if our FIN is now acknowledged
2223		 * then enter FIN_WAIT_2.
2224		 */
2225		case TCPS_FIN_WAIT_1:
2226			if (ourfinisacked) {
2227				/*
2228				 * If we can't receive any more
2229				 * data, then closing user can proceed.
2230				 * Starting the timer is contrary to the
2231				 * specification, but if we don't get a FIN
2232				 * we'll hang forever.
2233				 */
2234		/* XXXjl
2235		 * we should release the tp also, and use a
2236		 * compressed state.
2237		 */
2238				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
2239					int timeout;
2240
2241					soisdisconnected(so);
2242					timeout = (tcp_fast_finwait2_recycle) ?
2243						tcp_finwait2_timeout : tcp_maxidle;
2244					tcp_timer_activate(tp, TT_2MSL, timeout);
2245				}
2246				tp->t_state = TCPS_FIN_WAIT_2;
2247			}
2248			break;
2249
2250		/*
2251		 * In CLOSING STATE, in addition to the processing for
2252		 * the ESTABLISHED state, if the ACK acknowledges our FIN,
2253		 * then enter the TIME-WAIT state; otherwise ignore
2254		 * the segment.
2255		 */
2256		case TCPS_CLOSING:
2257			if (ourfinisacked) {
2258				KASSERT(headlocked, ("%s: process_ACK: "
2259				    "head not locked", __func__));
2260				tcp_twstart(tp);
2261				INP_INFO_WUNLOCK(&tcbinfo);
2262				headlocked = 0;
2263				m_freem(m);
2264				return (0);
2265			}
2266			break;
2267
2268		/*
2269		 * In LAST_ACK, we may still be waiting for data to drain
2270		 * and/or to be acked, as well as for the ack of our FIN.
2271		 * If our FIN is now acknowledged, delete the TCB,
2272		 * enter the closed state and return.
2273		 */
2274		case TCPS_LAST_ACK:
2275			if (ourfinisacked) {
2276				KASSERT(headlocked, ("%s: process_ACK: "
2277				    "tcp_close: head not locked", __func__));
2278				tp = tcp_close(tp);
2279				goto drop;
2280			}
2281			break;
2282
2283		/*
2284		 * In TIME_WAIT state the only thing that should arrive
2285		 * is a retransmission of the remote FIN.  Acknowledge
2286		 * it and restart the finack timer.
2287		 */
2288		case TCPS_TIME_WAIT:
2289			KASSERT(tp->t_state != TCPS_TIME_WAIT,
2290			    ("%s: timewait", __func__));
2291			tcp_timer_activate(tp, TT_2MSL, 2 * tcp_msl);
2292			goto dropafterack;
2293		}
2294	}
2295
2296step6:
2297	KASSERT(headlocked, ("%s: step6: head not locked", __func__));
2298	INP_LOCK_ASSERT(tp->t_inpcb);
2299
2300	/*
2301	 * Update window information.
2302	 * Don't look at window if no ACK: TACs send garbage on first SYN.
2303	 */
2304	if ((thflags & TH_ACK) &&
2305	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2306	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2307	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2308		/* keep track of pure window updates */
2309		if (tlen == 0 &&
2310		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2311			tcpstat.tcps_rcvwinupd++;
2312		tp->snd_wnd = tiwin;
2313		tp->snd_wl1 = th->th_seq;
2314		tp->snd_wl2 = th->th_ack;
2315		if (tp->snd_wnd > tp->max_sndwnd)
2316			tp->max_sndwnd = tp->snd_wnd;
2317		needoutput = 1;
2318	}
2319
2320	/*
2321	 * Process segments with URG.
2322	 */
2323	if ((thflags & TH_URG) && th->th_urp &&
2324	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2325		/*
2326		 * This is a kludge, but if we receive and accept
2327		 * random urgent pointers, we'll crash in
2328		 * soreceive.  It's hard to imagine someone
2329		 * actually wanting to send this much urgent data.
2330		 */
2331		SOCKBUF_LOCK(&so->so_rcv);
2332		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2333			th->th_urp = 0;			/* XXX */
2334			thflags &= ~TH_URG;		/* XXX */
2335			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
2336			goto dodata;			/* XXX */
2337		}
2338		/*
2339		 * If this segment advances the known urgent pointer,
2340		 * then mark the data stream.  This should not happen
2341		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2342		 * a FIN has been received from the remote side.
2343		 * In these states we ignore the URG.
2344		 *
2345		 * According to RFC 961 (Assigned Protocols),
2346		 * the urgent pointer points to the last octet
2347		 * of urgent data.  We continue, however,
2348		 * to consider it to indicate the first octet
2349		 * of data past the urgent section as the original
2350		 * spec states (in one of two places).
2351		 */
2352		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2353			tp->rcv_up = th->th_seq + th->th_urp;
2354			so->so_oobmark = so->so_rcv.sb_cc +
2355			    (tp->rcv_up - tp->rcv_nxt) - 1;
2356			if (so->so_oobmark == 0)
2357				so->so_rcv.sb_state |= SBS_RCVATMARK;
2358			sohasoutofband(so);
2359			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2360		}
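
		/*
		 * Mark placement example (hypothetical numbers): with an
		 * empty receive buffer (sb_cc = 0) and rcv_up one past
		 * rcv_nxt, so_oobmark = 0 + 1 - 1 = 0, meaning the next
		 * byte the user reads is the urgent byte, so the socket is
		 * flagged as being at the mark (SBS_RCVATMARK).
		 */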
2361		SOCKBUF_UNLOCK(&so->so_rcv);
2362		/*
2363		 * Remove out-of-band data so it doesn't get presented to the user.
2364		 * This can happen independently of advancing the URG pointer,
2365		 * but if two URGs are pending at once, some out-of-band
2366		 * data may creep in... ick.
2367		 */
2368		if (th->th_urp <= (u_long)tlen &&
2369		    !(so->so_options & SO_OOBINLINE)) {
2370			/* hdr drop is delayed */
2371			tcp_pulloutofband(so, th, m, drop_hdrlen);
2372		}
2373	} else {
2374		/*
2375		 * If no out of band data is expected,
2376		 * pull receive urgent pointer along
2377		 * with the receive window.
2378		 */
2379		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2380			tp->rcv_up = tp->rcv_nxt;
2381	}
2382dodata:							/* XXX */
2383	KASSERT(headlocked, ("%s: dodata: head not locked", __func__));
2384	INP_LOCK_ASSERT(tp->t_inpcb);
2385
2386	/*
2387	 * Process the segment text, merging it into the TCP sequencing queue,
2388	 * and arranging for acknowledgment of receipt if necessary.
2389	 * This process logically involves adjusting tp->rcv_wnd as data
2390	 * is presented to the user (this happens in tcp_usrreq.c,
2391	 * case PRU_RCVD).  If a FIN has already been received on this
2392	 * connection then we just ignore the text.
2393	 */
2394	if ((tlen || (thflags & TH_FIN)) &&
2395	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2396		tcp_seq save_start = th->th_seq;
2397		tcp_seq save_end = th->th_seq + tlen;
2398		m_adj(m, drop_hdrlen);	/* delayed header drop */
2399		/*
2400		 * Insert segment which includes th into TCP reassembly queue
2401		 * with control block tp.  Set thflags to whether reassembly now
2402		 * includes a segment with FIN.  This handles the common case
2403		 * inline (segment is the next to be received on an established
2404		 * connection, and the queue is empty), avoiding linkage into
2405		 * and removal from the queue and repetition of various
2406		 * conversions.
2407		 * Set DELACK for segments received in order, but ack
2408		 * immediately when segments are out of order (so
2409		 * fast retransmit can work).
2410		 */
2411		if (th->th_seq == tp->rcv_nxt &&
2412		    LIST_EMPTY(&tp->t_segq) &&
2413		    TCPS_HAVEESTABLISHED(tp->t_state)) {
2414			if (DELAY_ACK(tp))
2415				tp->t_flags |= TF_DELACK;
2416			else
2417				tp->t_flags |= TF_ACKNOW;
2418			tp->rcv_nxt += tlen;
2419			thflags = th->th_flags & TH_FIN;
2420			tcpstat.tcps_rcvpack++;
2421			tcpstat.tcps_rcvbyte += tlen;
2422			ND6_HINT(tp);
2423			SOCKBUF_LOCK(&so->so_rcv);
2424			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
2425				m_freem(m);
2426			else
2427				sbappendstream_locked(&so->so_rcv, m);
2428			sorwakeup_locked(so);
2429		} else {
2430			thflags = tcp_reass(tp, th, &tlen, m);
2431			tp->t_flags |= TF_ACKNOW;
2432		}
2433		if (tlen > 0 && tp->sack_enable)
2434			tcp_update_sack_list(tp, save_start, save_end);
2435#if 0
2436		/*
2437		 * Note the amount of data that peer has sent into
2438		 * our window, in order to estimate the sender's
2439		 * buffer size.
2440		 * XXX: Unused.
2441		 */
2442		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2443#endif
2444	} else {
2445		m_freem(m);
2446		thflags &= ~TH_FIN;
2447	}
2448
2449	/*
2450	 * If a FIN is received, ACK the FIN and let the user know
2451	 * that the connection is closing.
2452	 */
2453	if (thflags & TH_FIN) {
2454		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2455			socantrcvmore(so);
2456			/*
2457			 * If connection is half-synchronized
2458			 * (i.e., NEEDSYN flag on) then delay ACK,
2459			 * so it may be piggybacked when SYN is sent.
2460			 * Otherwise, since we received a FIN then no
2461			 * more input can be expected, send ACK now.
2462			 */
2463			if (tp->t_flags & TF_NEEDSYN)
2464				tp->t_flags |= TF_DELACK;
2465			else
2466				tp->t_flags |= TF_ACKNOW;
2467			tp->rcv_nxt++;
2468		}
2469		switch (tp->t_state) {
2470
2471		/*
2472		 * In SYN_RECEIVED and ESTABLISHED STATES
2473		 * enter the CLOSE_WAIT state.
2474		 */
2475		case TCPS_SYN_RECEIVED:
2476			tp->t_starttime = ticks;
2477			/*FALLTHROUGH*/
2478		case TCPS_ESTABLISHED:
2479			tp->t_state = TCPS_CLOSE_WAIT;
2480			break;
2481
2482		/*
2483		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2484		 * enter the CLOSING state.
2485		 */
2486		case TCPS_FIN_WAIT_1:
2487			tp->t_state = TCPS_CLOSING;
2488			break;
2489
2490		/*
2491		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2492		 * starting the time-wait timer, turning off the other
2493		 * standard timers.
2494		 */
2495		case TCPS_FIN_WAIT_2:
2496			KASSERT(headlocked == 1, ("%s: dodata: "
2497			    "TCP_FIN_WAIT_2: head not locked", __func__));
2498			tcp_twstart(tp);
2499			INP_INFO_WUNLOCK(&tcbinfo);
2500			return (0);
2501
2502		/*
2503		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2504		 */
2505		case TCPS_TIME_WAIT:
2506			KASSERT(tp->t_state != TCPS_TIME_WAIT,
2507			    ("%s: timewait", __func__));
2508			tcp_timer_activate(tp, TT_2MSL, 2 * tcp_msl);
2509			break;
2510		}
2511	}
2512	INP_INFO_WUNLOCK(&tcbinfo);
2513	headlocked = 0;
2514#ifdef TCPDEBUG
2515	if (so->so_options & SO_DEBUG)
2516		tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
2517			  &tcp_savetcp, 0);
2518#endif
2519
2520	/*
2521	 * Return any desired output.
2522	 */
2523	if (needoutput || (tp->t_flags & TF_ACKNOW))
2524		(void) tcp_output(tp);
2525
2526check_delack:
2527	KASSERT(headlocked == 0, ("%s: check_delack: head locked",
2528	    __func__));
2529	INP_LOCK_ASSERT(tp->t_inpcb);
2530	if (tp->t_flags & TF_DELACK) {
2531		tp->t_flags &= ~TF_DELACK;
2532		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
2533	}
2534	INP_UNLOCK(tp->t_inpcb);
2535	return (0);
2536
2537dropafterack:
2538	KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__));
2539	/*
2540	 * Generate an ACK dropping incoming segment if it occupies
2541	 * sequence space, where the ACK reflects our state.
2542	 *
2543	 * We can now skip the test for the RST flag since all
2544	 * paths to this code happen after packets containing
2545	 * RST have been dropped.
2546	 *
2547	 * In the SYN-RECEIVED state, don't send an ACK unless the
2548	 * segment we received passes the SYN-RECEIVED ACK test.
2549	 * If it fails send a RST.  This breaks the loop in the
2550	 * "LAND" DoS attack, and also prevents an ACK storm
2551	 * between two listening ports that have been sent forged
2552	 * SYN segments, each with the source address of the other.
2553	 */
2554	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2555	    (SEQ_GT(tp->snd_una, th->th_ack) ||
2556	     SEQ_GT(th->th_ack, tp->snd_max)) ) {
2557		rstreason = BANDLIM_RST_OPENPORT;
2558		goto dropwithreset;
2559	}
2560#ifdef TCPDEBUG
2561	if (so->so_options & SO_DEBUG)
2562		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2563			  &tcp_savetcp, 0);
2564#endif
2565	KASSERT(headlocked, ("%s: headlocked should be 1", __func__));
2566	INP_INFO_WUNLOCK(&tcbinfo);
2567	tp->t_flags |= TF_ACKNOW;
2568	(void) tcp_output(tp);
2569	INP_UNLOCK(tp->t_inpcb);
2570	m_freem(m);
2571	return (0);
2572
2573dropwithreset:
2574	KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__));
2575
2576	tcp_dropwithreset(m, th, tp, tlen, rstreason);
2577
2578	if (tp != NULL)
2579		INP_UNLOCK(tp->t_inpcb);
2580	if (headlocked)
2581		INP_INFO_WUNLOCK(&tcbinfo);
2582	return (0);
2583
2584drop:
2585	/*
2586	 * Drop space held by incoming segment and return.
2587	 */
2588#ifdef TCPDEBUG
2589	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2590		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2591			  &tcp_savetcp, 0);
2592#endif
2593	if (tp != NULL)
2594		INP_UNLOCK(tp->t_inpcb);
2595	if (headlocked)
2596		INP_INFO_WUNLOCK(&tcbinfo);
2597	m_freem(m);
2598	return (0);
2599}
2600
2602/*
2603 * Issue RST on TCP segment.  The mbuf must still include the original
2604 * packet header.
2605 */
2606static void
2607tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
2608    int tlen, int rstreason)
2609{
2610	struct ip *ip;
2611#ifdef INET6
2612	struct ip6_hdr *ip6;
2613#endif
2614	/*
2615	 * Generate a RST, dropping incoming segment.
2616	 * Make ACK acceptable to originator of segment.
2617	 * Don't bother to respond if destination was broadcast/multicast.
2618	 * tp may be NULL.
2619	 */
2620	if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
2621		goto drop;
2622#ifdef INET6
2623	if (mtod(m, struct ip *)->ip_v == 6) {
2624		ip6 = mtod(m, struct ip6_hdr *);
2625		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2626		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2627			goto drop;
2628		/* IPv6 anycast check is done at tcp6_input() */
2629	} else
2630#endif
2631	{
2632		ip = mtod(m, struct ip *);
2633		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2634		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2635		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2636		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2637			goto drop;
2638	}
2639
2640	/* Perform bandwidth limiting. */
2641	if (badport_bandlim(rstreason) < 0)
2642		goto drop;
2643
2644	/* tcp_respond consumes the mbuf chain. */
2645	if (th->th_flags & TH_ACK) {
2646		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
2647		    th->th_ack, TH_RST);
2648	} else {
2649		if (th->th_flags & TH_SYN)
2650			tlen++;
2651		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
2652		    (tcp_seq)0, TH_RST|TH_ACK);
2653	}
2654	return;
2655drop:
2656	m_freem(m);
2657	return;
2658}
2659
2660/*
2661 * Parse TCP options and place in tcpopt.
2662 */
2663static void
2664tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
2665{
2666	int opt, optlen;
2667
2668	to->to_flags = 0;
2669	for (; cnt > 0; cnt -= optlen, cp += optlen) {
2670		opt = cp[0];
2671		if (opt == TCPOPT_EOL)
2672			break;
2673		if (opt == TCPOPT_NOP)
2674			optlen = 1;
2675		else {
2676			if (cnt < 2)
2677				break;
2678			optlen = cp[1];
2679			if (optlen < 2 || optlen > cnt)
2680				break;
2681		}
2682		switch (opt) {
2683		case TCPOPT_MAXSEG:
2684			if (optlen != TCPOLEN_MAXSEG)
2685				continue;
2686			if (!(flags & TO_SYN))
2687				continue;
2688			to->to_flags |= TOF_MSS;
2689			bcopy((char *)cp + 2,
2690			    (char *)&to->to_mss, sizeof(to->to_mss));
2691			to->to_mss = ntohs(to->to_mss);
2692			break;
2693		case TCPOPT_WINDOW:
2694			if (optlen != TCPOLEN_WINDOW)
2695				continue;
2696			if (!(flags & TO_SYN))
2697				continue;
2698			to->to_flags |= TOF_SCALE;
2699			to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
2700			break;
2701		case TCPOPT_TIMESTAMP:
2702			if (optlen != TCPOLEN_TIMESTAMP)
2703				continue;
2704			to->to_flags |= TOF_TS;
2705			bcopy((char *)cp + 2,
2706			    (char *)&to->to_tsval, sizeof(to->to_tsval));
2707			to->to_tsval = ntohl(to->to_tsval);
2708			bcopy((char *)cp + 6,
2709			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
2710			to->to_tsecr = ntohl(to->to_tsecr);
2711			break;
2712#ifdef TCP_SIGNATURE
2713		/*
2714		 * XXX In order to reply to a host which has set the
2715		 * TCP_SIGNATURE option in its initial SYN, we have to
2716		 * record the fact that the option was observed here
2717		 * for the syncache code to perform the correct response.
2718		 */
2719		case TCPOPT_SIGNATURE:
2720			if (optlen != TCPOLEN_SIGNATURE)
2721				continue;
2722			to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
2723			break;
2724#endif
2725		case TCPOPT_SACK_PERMITTED:
2726			if (optlen != TCPOLEN_SACK_PERMITTED)
2727				continue;
2728			if (!(flags & TO_SYN))
2729				continue;
2730			if (!tcp_do_sack)
2731				continue;
2732			to->to_flags |= TOF_SACKPERM;
2733			break;
2734		case TCPOPT_SACK:
2735			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2736				continue;
2737			if (flags & TO_SYN)
2738				continue;
2739			to->to_flags |= TOF_SACK;
2740			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
2741			to->to_sacks = cp + 2;
2742			tcpstat.tcps_sack_rcv_blocks++;
2743			break;
2744		default:
2745			continue;
2746		}
2747	}
2748}
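
/*
 * Wire-format example of an option block this parser might see on a
 * SYN (hypothetical values):
 *
 *	02 04 05 b4		MSS, length 4, value 1460
 *	01			NOP padding
 *	03 03 06		window scale, length 3, shift 6
 *	04 02			SACK permitted, length 2
 *	08 0a tttt tttt eeee eeee	timestamps, length 10
 *
 * The loop above advances cp by optlen on every iteration, and a
 * malformed length (optlen < 2 or larger than the bytes remaining)
 * simply terminates parsing instead of running past the header.
 */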
2749
2750/*
2751 * Pull out of band byte out of a segment so
2752 * Pull the out-of-band byte out of a segment so
2753 * It is still reflected in the segment length for
2754 * sequencing purposes.
2755 */
2756static void
2757tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
2758    int off)
2759{
2760	int cnt = off + th->th_urp - 1;
2761
2762	while (cnt >= 0) {
2763		if (m->m_len > cnt) {
2764			char *cp = mtod(m, caddr_t) + cnt;
2765			struct tcpcb *tp = sototcpcb(so);
2766
2767			tp->t_iobc = *cp;
2768			tp->t_oobflags |= TCPOOB_HAVEDATA;
2769			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2770			m->m_len--;
2771			if (m->m_flags & M_PKTHDR)
2772				m->m_pkthdr.len--;
2773			return;
2774		}
2775		cnt -= m->m_len;
2776		m = m->m_next;
2777		if (m == NULL)
2778			break;
2779	}
2780	panic("tcp_pulloutofband");
2781}
2782
2783/*
2784 * Collect new round-trip time estimate
2785 * and update averages and current timeout.
2786 */
2787static void
2788tcp_xmit_timer(struct tcpcb *tp, int rtt)
2789{
2790	int delta;
2791
2792	INP_LOCK_ASSERT(tp->t_inpcb);
2793
2794	tcpstat.tcps_rttupdated++;
2795	tp->t_rttupdated++;
2796	if (tp->t_srtt != 0) {
2797		/*
2798		 * srtt is stored as fixed point with 5 bits after the
2799		 * binary point (i.e., scaled by 32).  The following magic
2800		 * is equivalent to the smoothing algorithm in rfc793 with
2801		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2802		 * point).  Adjust rtt to origin 0.
2803		 */
2804		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2805			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2806
2807		if ((tp->t_srtt += delta) <= 0)
2808			tp->t_srtt = 1;
2809
2810		/*
2811		 * We accumulate a smoothed rtt variance (actually, a
2812		 * smoothed mean difference), then set the retransmit
2813		 * timer to smoothed rtt + 4 times the smoothed variance.
2814		 * rttvar is stored as fixed point with 4 bits after the
2815		 * binary point (scaled by 16).  The following is
2816		 * equivalent to rfc793 smoothing with an alpha of .75
2817		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
2818		 * rfc793's wired-in beta.
2819		 */
2820		if (delta < 0)
2821			delta = -delta;
2822		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2823		if ((tp->t_rttvar += delta) <= 0)
2824			tp->t_rttvar = 1;
2825		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
2826		    tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
2827	} else {
2828		/*
2829		 * No rtt measurement yet - use the unsmoothed rtt.
2830		 * Set the variance to half the rtt (so our first
2831		 * retransmit happens at 3*rtt).
2832		 */
2833		tp->t_srtt = rtt << TCP_RTT_SHIFT;
2834		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2835		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
2836	}
2837	tp->t_rtttime = 0;
2838	tp->t_rxtshift = 0;
2839
2840	/*
2841	 * The retransmit should happen at rtt + 4 * rttvar.
2842	 * Because of the way we do the smoothing, srtt and rttvar
2843	 * will each average +1/2 tick of bias.  When we compute
2844	 * the retransmit timer, we want 1/2 tick of rounding and
2845	 * 1 extra tick because of +-1/2 tick uncertainty in the
2846	 * firing of the timer.  The bias will give us exactly the
2847	 * 1.5 tick we need.  But, because the bias is
2848	 * statistical, we have to test that we don't drop below
2849	 * the minimum feasible timer (which is 2 ticks).
2850	 */
2851	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2852		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2853
2854	/*
2855	 * We received an ack for a packet that wasn't retransmitted;
2856	 * it is probably safe to discard any error indications we've
2857	 * received recently.  This isn't quite right, but close enough
2858	 * for now (a route might have failed after we sent a segment,
2859	 * and the return path might not be symmetrical).
2860	 */
2861	tp->t_softerror = 0;
2862}
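
/*
 * Worked fixed-point example for the smoothing above, assuming the
 * historical constants TCP_RTT_SHIFT = 5, TCP_RTTVAR_SHIFT = 4 and
 * TCP_DELTA_SHIFT = 2 (hypothetical sample values):
 *
 *	t_srtt = 320 (10 ticks << 5), new rtt sample = 14 ticks:
 *	delta  = ((14 - 1) << 2) - (320 >> 3) = 52 - 40 = 12
 *	t_srtt = 332, i.e. about 10.4 ticks.
 *
 *	t_rttvar = 32 (2 ticks << 4):
 *	delta    = 12 - (32 >> 2) = 4, so t_rttvar = 36 (2.25 ticks).
 *
 * The resulting retransmit value, srtt + 4 * rttvar, then works out
 * to about 10.4 + 9 = 19.4 ticks, clamped by TCPT_RANGESET() above.
 */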
2863
2864/*
2865 * Determine a reasonable value for maxseg size.
2866 * If the route is known, check route for mtu.
2867 * If none, use an mss that can be handled on the outgoing
2868 * interface without forcing IP to fragment; if bigger than
2869 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2870 * to utilize large mbufs.  If no route is found, route has no mtu,
2871 * or the destination isn't local, use a default, hopefully conservative
2872 * size (usually 512 or the default IP max size, but no more than the mtu
2873 * of the interface), as we can't discover anything about intervening
2874 * gateways or networks.  We also initialize the congestion/slow start
2875 * window to be a single segment if the destination isn't local.
2876 * While looking at the routing entry, we also initialize other path-dependent
2877 * parameters from pre-set or cached values in the routing entry.
2878 *
2879 * Also take into account the space needed for options that we
2880 * send regularly.  Make maxseg shorter by that amount to assure
2881 * that we can send maxseg amount of data even when the options
2882 * are present.  Store the upper limit of the length of options plus
2883 * data in maxopd.
2884 *
2886 * In case of T/TCP, we call this routine during implicit connection
2887 * setup as well (offer = -1), to initialize maxseg from the cached
2888 * MSS of our peer.
2889 *
2890 * NOTE that this routine is only called when we process an incoming
2891 * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
2892 */
2893void
2894tcp_mss(struct tcpcb *tp, int offer)
2895{
2896	int rtt, mss;
2897	u_long bufsize;
2898	u_long maxmtu;
2899	struct inpcb *inp = tp->t_inpcb;
2900	struct socket *so;
2901	struct hc_metrics_lite metrics;
2902	int origoffer = offer;
2903	int mtuflags = 0;
2904#ifdef INET6
2905	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
2906	size_t min_protoh = isipv6 ?
2907			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
2908			    sizeof (struct tcpiphdr);
2909#else
2910	const size_t min_protoh = sizeof(struct tcpiphdr);
2911#endif
2912
2913	/* initialize */
2914#ifdef INET6
2915	if (isipv6) {
2916		maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags);
2917		tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt;
2918	} else
2919#endif
2920	{
2921		maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags);
2922		tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
2923	}
2924	so = inp->inp_socket;
2925
2926	/*
2927	 * no route to sender, stay with default mss and return
2928	 */
2929	if (maxmtu == 0)
2930		return;
2931
2932	/* what have we got? */
2933	switch (offer) {
2934		case 0:
2935			/*
2936			 * Offer == 0 means that there was no MSS on the SYN
2937			 * segment; in this case we use tcp_mssdflt.
2938			 */
2939			offer =
2940#ifdef INET6
2941				isipv6 ? tcp_v6mssdflt :
2942#endif
2943				tcp_mssdflt;
2944			break;
2945
2946		case -1:
2947			/*
2948			 * Offer == -1 means that we didn't receive a SYN yet.
2949			 */
2950			/* FALLTHROUGH */
2951
2952		default:
2953			/*
2954			 * Prevent a DoS attack with a too-small MSS.  Round up
2955			 * to at least minmss.
2956			 */
2957			offer = max(offer, tcp_minmss);
2958			/*
2959			 * Sanity check: make sure that maxopd will be large
2960			 * enough to allow some data on segments even if
2961			 * all the option space is used (40 bytes).  Otherwise
2962			 * funny things may happen in tcp_output.
2963			 */
2964			offer = max(offer, 64);
2965	}
2966
2967	/*
2968	 * rmx information is now retrieved from tcp_hostcache
2969	 */
2970	tcp_hc_get(&inp->inp_inc, &metrics);
2971
2972	/*
2973	 * if there's a discovered mtu in the tcp hostcache, use it;
2974	 * else, use the link mtu.
2975	 */
2976	if (metrics.rmx_mtu)
2977		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
2978	else {
2979#ifdef INET6
2980		if (isipv6) {
2981			mss = maxmtu - min_protoh;
2982			if (!path_mtu_discovery &&
2983			    !in6_localaddr(&inp->in6p_faddr))
2984				mss = min(mss, tcp_v6mssdflt);
2985		} else
2986#endif
2987		{
2988			mss = maxmtu - min_protoh;
2989			if (!path_mtu_discovery &&
2990			    !in_localaddr(inp->inp_faddr))
2991				mss = min(mss, tcp_mssdflt);
2992		}
2993	}
2994	mss = min(mss, offer);
2995
2996	/*
2997	 * maxopd stores the maximum length of data AND options
2998	 * in a segment; maxseg is the amount of data in a normal
2999	 * segment.  We need to store this value (maxopd) apart
3000	 * from maxseg, because now every segment carries options
3001	 * and thus we normally have somewhat less data in segments.
3002	 */
3003	tp->t_maxopd = mss;
3004
3005	/*
3006	 * origoffer == -1 indicates that no segments were received yet.
3007	 * In this case we just guess.
3008	 */
3009	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
3010	    (origoffer == -1 ||
3011	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
3012		mss -= TCPOLEN_TSTAMP_APPA;
3013	tp->t_maxseg = mss;
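
	/*
	 * Example with hypothetical values: an Ethernet IPv4 path gives
	 * maxmtu = 1500 and min_protoh = 40, so mss = 1460 and
	 * t_maxopd = 1460.  With timestamps in use, the
	 * TCPOLEN_TSTAMP_APPA (12 byte) deduction above leaves
	 * t_maxseg = 1448, which the MCLBYTES rounding below does not
	 * change when MCLBYTES is 2048.
	 */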
3014
3015#if	(MCLBYTES & (MCLBYTES - 1)) == 0
3016		if (mss > MCLBYTES)
3017			mss &= ~(MCLBYTES-1);
3018#else
3019		if (mss > MCLBYTES)
3020			mss = mss / MCLBYTES * MCLBYTES;
3021#endif
3022	tp->t_maxseg = mss;
3023
3024	/*
3025	 * If there's a pipesize, change the socket buffer to that size;
3026	 * don't change it if sb_hiwat is different from the default (then it
3027	 * has been changed on purpose with setsockopt).
3028	 * Make the socket buffers an integral number of mss units;
3029	 * if the mss is larger than the socket buffer, decrease the mss.
3030	 */
3031	SOCKBUF_LOCK(&so->so_snd);
3032	if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
3033		bufsize = metrics.rmx_sendpipe;
3034	else
3035		bufsize = so->so_snd.sb_hiwat;
3036	if (bufsize < mss)
3037		mss = bufsize;
3038	else {
3039		bufsize = roundup(bufsize, mss);
3040		if (bufsize > sb_max)
3041			bufsize = sb_max;
3042		if (bufsize > so->so_snd.sb_hiwat)
3043			(void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
3044	}
3045	SOCKBUF_UNLOCK(&so->so_snd);
3046	tp->t_maxseg = mss;
3047
3048	SOCKBUF_LOCK(&so->so_rcv);
3049	if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
3050		bufsize = metrics.rmx_recvpipe;
3051	else
3052		bufsize = so->so_rcv.sb_hiwat;
3053	if (bufsize > mss) {
3054		bufsize = roundup(bufsize, mss);
3055		if (bufsize > sb_max)
3056			bufsize = sb_max;
3057		if (bufsize > so->so_rcv.sb_hiwat)
3058			(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
3059	}
3060	SOCKBUF_UNLOCK(&so->so_rcv);
3061	/*
3062	 * While we're here, check the others too
3063	 */
3064	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
3065		tp->t_srtt = rtt;
3066		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
3067		tcpstat.tcps_usedrtt++;
3068		if (metrics.rmx_rttvar) {
3069			tp->t_rttvar = metrics.rmx_rttvar;
3070			tcpstat.tcps_usedrttvar++;
3071		} else {
3072			/* default variation is +- 1 rtt */
3073			tp->t_rttvar =
3074			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
3075		}
3076		TCPT_RANGESET(tp->t_rxtcur,
3077			      ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
3078			      tp->t_rttmin, TCPTV_REXMTMAX);
3079	}
3080	if (metrics.rmx_ssthresh) {
3081		/*
3082		 * There's some sort of gateway or interface
3083		 * buffer limit on the path.  Use this to set
3084		 * the slow start threshold, but set the
3085		 * threshold to no less than 2*mss.
3086		 */
3087		tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
3088		tcpstat.tcps_usedssthresh++;
3089	}
3090	if (metrics.rmx_bandwidth)
3091		tp->snd_bandwidth = metrics.rmx_bandwidth;
3092
3093	/*
3094	 * Set the slow-start flight size depending on whether this
3095	 * is a local network or not.
3096	 *
3097	 * Extend this so we cache the cwnd too and retrieve it here.
3098	 * Make cwnd even bigger than RFC3390 suggests but only if we
3099	 * have previous experience with the remote host. Be careful
3100	 * not to make cwnd bigger than the remote receive window or our
3101	 * own send socket buffer.  Maybe put some additional upper bound
3102	 * on the retrieved cwnd.  Should do incremental updates to the
3103	 * hostcache when cwnd collapses so the next connection doesn't
3104	 * overload the path again.
3105	 *
3106	 * RFC3390 says only do this if the SYN or SYN/ACK didn't get lost.
3107	 * We currently check for that only in syncache_socket.
3108	 */
3109#define TCP_METRICS_CWND
3110#ifdef TCP_METRICS_CWND
3111	if (metrics.rmx_cwnd)
3112		tp->snd_cwnd = max(mss,
3113				min(metrics.rmx_cwnd / 2,
3114				 min(tp->snd_wnd, so->so_snd.sb_hiwat)));
3115	else
3116#endif
3117	if (tcp_do_rfc3390)
3118		tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
3119#ifdef INET6
3120	else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
3121		 (!isipv6 && in_localaddr(inp->inp_faddr)))
3122#else
3123	else if (in_localaddr(inp->inp_faddr))
3124#endif
3125		tp->snd_cwnd = mss * ss_fltsz_local;
3126	else
3127		tp->snd_cwnd = mss * ss_fltsz;
3128
3129	/* Check the interface for TSO capabilities. */
3130	if (mtuflags & CSUM_TSO)
3131		tp->t_flags |= TF_TSO;
3132}
3133
3134/*
3135 * Determine the MSS option to send on an outgoing SYN.
3136 */
3137int
3138tcp_mssopt(struct in_conninfo *inc)
3139{
3140	int mss = 0;
3141	u_long maxmtu = 0;
3142	u_long thcmtu = 0;
3143	size_t min_protoh;
3144#ifdef INET6
3145	int isipv6 = inc->inc_isipv6 ? 1 : 0;
3146#endif
3147
3148	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
3149
3150#ifdef INET6
3151	if (isipv6) {
3152		mss = tcp_v6mssdflt;
3153		maxmtu = tcp_maxmtu6(inc, NULL);
3154		thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
3155		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
3156	} else
3157#endif
3158	{
3159		mss = tcp_mssdflt;
3160		maxmtu = tcp_maxmtu(inc, NULL);
3161		thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
3162		min_protoh = sizeof(struct tcpiphdr);
3163	}
3164	if (maxmtu && thcmtu)
3165		mss = min(maxmtu, thcmtu) - min_protoh;
3166	else if (maxmtu || thcmtu)
3167		mss = max(maxmtu, thcmtu) - min_protoh;
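
	/*
	 * Example with hypothetical values: for an IPv4 Ethernet path
	 * with maxmtu = 1500 and no hostcache entry (thcmtu = 0), the
	 * max() branch gives mss = 1500 - 40 = 1460.  If the hostcache
	 * had recorded a discovered path MTU of 1400, the min() branch
	 * would give 1400 - 40 = 1360 instead.
	 */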
3168
3169	return (mss);
3170}
3171
3173/*
3174 * When a partial ack arrives, force the retransmission of the
3175 * next unacknowledged segment.  Do not clear tp->t_dupacks.
3176 * By setting snd_nxt to th_ack, this forces the retransmission timer to
3177 * be started again.
3178 */
3179static void
3180tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
3181{
3182	tcp_seq onxt = tp->snd_nxt;
3183	u_long  ocwnd = tp->snd_cwnd;
3184
3185	tcp_timer_activate(tp, TT_REXMT, 0);
3186	tp->t_rtttime = 0;
3187	tp->snd_nxt = th->th_ack;
3188	/*
3189	 * Set snd_cwnd to one segment beyond acknowledged offset.
3190	 * (tp->snd_una has not yet been updated when this function is called.)
3191	 */
3192	tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3193	tp->t_flags |= TF_ACKNOW;
3194	(void) tcp_output(tp);
3195	tp->snd_cwnd = ocwnd;
3196	if (SEQ_GT(onxt, tp->snd_nxt))
3197		tp->snd_nxt = onxt;
3198	/*
3199	 * Partial window deflation.  Relies on fact that tp->snd_una
3200	 * not updated yet.
3201	 */
3202	if (tp->snd_cwnd > th->th_ack - tp->snd_una)
3203		tp->snd_cwnd -= th->th_ack - tp->snd_una;
3204	else
3205		tp->snd_cwnd = 0;
3206	tp->snd_cwnd += tp->t_maxseg;
3207}
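
/*
 * Deflation example with hypothetical numbers: maxseg = 1460,
 * snd_una = 1000, ocwnd = 14600, and a partial ACK for
 * th_ack = 3920 (2920 bytes).  The forced output above runs with
 * cwnd = 1460 + 2920 = 4380 so exactly one segment is retransmitted;
 * afterwards cwnd becomes 14600 - 2920 + 1460 = 13140, deflating the
 * window by the amount acked while allowing one new segment out.
 */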
3208
3209/*
3210 * Returns 1 if the TIME_WAIT state was killed and we should start over,
3211 * looking for a pcb in the listen state.  Returns 0 otherwise.
3212 */
3213static int
3214tcp_timewait(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
3215    struct mbuf *m, int tlen)
3216{
3217	struct tcptw *tw;
3218	int thflags;
3219	tcp_seq seq;
3220#ifdef INET6
3221	int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
3222#else
3223	const int isipv6 = 0;
3224#endif
3225
3226	/* tcbinfo lock required for tcp_twclose(), tcp_timer_2msl_reset(). */
3227	INP_INFO_WLOCK_ASSERT(&tcbinfo);
3228	INP_LOCK_ASSERT(inp);
3229
3230	/*
3231	 * XXXRW: Time wait state for inpcb has been recycled, but inpcb is
3232	 * still present.  This is undesirable, but temporarily necessary
3233	 * until we work out how to handle inpcbs whose timewait state has
3234	 * been removed.
3235	 */
3236	tw = intotw(inp);
3237	if (tw == NULL)
3238		goto drop;
3239
3240	thflags = th->th_flags;
3241
3242	/*
3243	 * NOTE: for FIN_WAIT_2 (to be added later),
3244	 * must validate sequence number before accepting RST
3245	 */
3246
3247	/*
3248	 * If the segment contains RST:
3249	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
3250	 *      RFC 1337.
3251	 */
3252	if (thflags & TH_RST)
3253		goto drop;
3254
3255#if 0
3256/* PAWS not needed at the moment */
3257	/*
3258	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
3259	 * and it's less than ts_recent, drop it.
3260	 */
3261	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
3262	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
3263		if ((thflags & TH_ACK) == 0)
3264			goto drop;
3265		goto ack;
3266	}
3267	/*
3268	 * ts_recent is never updated because we never accept new segments.
3269	 */
3270#endif
3271
3272	/*
3273	 * If a new connection request is received
3274	 * while in TIME_WAIT, drop the old connection
3275	 * and start over if the sequence numbers
3276	 * are above the previous ones.
3277	 */
3278	if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
3279		tcp_twclose(tw, 0);
3280		return (1);
3281	}
3282
3283	/*
3284	 * Drop the segment if it does not contain an ACK.
3285	 */
3286	if ((thflags & TH_ACK) == 0)
3287		goto drop;
3288
3289	/*
3290	 * Reset the 2MSL timer if this is a duplicate FIN.
3291	 */
3292	if (thflags & TH_FIN) {
3293		seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
3294		if (seq + 1 == tw->rcv_nxt)
3295			tcp_timer_2msl_reset(tw, 1);
3296	}
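
	/*
	 * Duplicate-FIN example (hypothetical numbers): if rcv_nxt is
	 * 501 because we already consumed a FIN at sequence 500, a
	 * retransmitted FIN with th_seq = 500 and tlen = 0 gives
	 * seq = 500, and seq + 1 == rcv_nxt identifies it as the same
	 * FIN, so the 2MSL timer is restarted.
	 */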
3297
3298	/*
3299	 * Acknowledge the segment if it has data or is not a duplicate ACK.
3300	 */
3301	if (thflags != TH_ACK || tlen != 0 ||
3302	    th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt)
3303		tcp_twrespond(tw, TH_ACK);
3304	goto drop;
3305
3306	/*
3307	 * Generate a RST, dropping incoming segment.
3308	 * Make ACK acceptable to originator of segment.
3309	 * Don't bother to respond if destination was broadcast/multicast.
3310	 */
3311	if (m->m_flags & (M_BCAST|M_MCAST))
3312		goto drop;
3313	if (isipv6) {
3314		struct ip6_hdr *ip6;
3315
3316		/* IPv6 anycast check is done at tcp6_input() */
3317		ip6 = mtod(m, struct ip6_hdr *);
3318		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
3319		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
3320			goto drop;
3321	} else {
3322		struct ip *ip;
3323
3324		ip = mtod(m, struct ip *);
3325		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
3326		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
3327		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
3328		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
3329			goto drop;
3330	}
3331	if (thflags & TH_ACK) {
3332		tcp_respond(NULL,
3333		    mtod(m, void *), th, m, 0, th->th_ack, TH_RST);
3334	} else {
3335		seq = th->th_seq + (thflags & TH_SYN ? 1 : 0);
3336		tcp_respond(NULL,
3337		    mtod(m, void *), th, m, seq, 0, TH_RST|TH_ACK);
3338	}
3339	INP_UNLOCK(inp);
3340	return (0);
3341
3342drop:
3343	INP_UNLOCK(inp);
3344	m_freem(m);
3345	return (0);
3346}
3347