tcp_input.c revision 62587
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 *    must display the following acknowledgement:
15 *	This product includes software developed by the University of
16 *	California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
34 * $FreeBSD: head/sys/netinet/tcp_input.c 62587 2000-07-04 16:35:15Z itojun $
35 */
36
37#include "opt_ipfw.h"		/* for ipfw_fwd		*/
38#include "opt_inet6.h"
39#include "opt_ipsec.h"
40#include "opt_tcpdebug.h"
41#include "opt_tcp_input.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/kernel.h>
46#include <sys/sysctl.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#include <sys/proc.h>		/* for proc0 declaration */
50#include <sys/protosw.h>
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/syslog.h>
54
55#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
56
57#include <net/if.h>
58#include <net/route.h>
59
60#include <netinet/in.h>
61#include <netinet/in_systm.h>
62#include <netinet/ip.h>
63#include <netinet/ip_icmp.h>	/* for ICMP_BANDLIM		*/
64#include <netinet/in_var.h>
65#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM		*/
66#include <netinet/in_pcb.h>
67#include <netinet/ip_var.h>
68#ifdef INET6
69#include <netinet/ip6.h>
70#include <netinet/icmp6.h>
71#include <netinet6/nd6.h>
72#include <netinet6/ip6_var.h>
73#include <netinet6/in6_pcb.h>
74#endif
75#include <netinet/tcp.h>
76#include <netinet/tcp_fsm.h>
77#include <netinet/tcp_seq.h>
78#include <netinet/tcp_timer.h>
79#include <netinet/tcp_var.h>
80#ifdef INET6
81#include <netinet6/tcp6_var.h>
82#endif
83#include <netinet/tcpip.h>
84#ifdef TCPDEBUG
85#include <netinet/tcp_debug.h>
86
87u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
88struct tcphdr tcp_savetcp;
89#endif /* TCPDEBUG */
90
91#ifdef IPSEC
92#include <netinet6/ipsec.h>
93#ifdef INET6
94#include <netinet6/ipsec6.h>
95#endif
96#include <netkey/key.h>
97#endif /*IPSEC*/
98
99#include <machine/in_cksum.h>
100
101MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
102
103static int	tcprexmtthresh = 3;
104tcp_seq	tcp_iss;
105tcp_cc	tcp_ccgen;
106
107struct	tcpstat tcpstat;
108SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD,
109    &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
110
111static int log_in_vain = 0;
112SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
113    &log_in_vain, 0, "Log all incoming TCP connections");
114
115static int blackhole = 0;
116SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
117	&blackhole, 0, "Do not send RST when dropping refused connections");
118
119int tcp_delack_enabled = 1;
120SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
121    &tcp_delack_enabled, 0,
122    "Delay ACK to try and piggyback it onto a data packet");
123
124#ifdef TCP_DROP_SYNFIN
125static int drop_synfin = 0;
126SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
127    &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
128#endif
129
130#ifdef TCP_RESTRICT_RST
131static int restrict_rst = 0;
132SYSCTL_INT(_net_inet_tcp, OID_AUTO, restrict_rst, CTLFLAG_RW,
133    &restrict_rst, 0, "Restrict RST emission");
134#endif
135
136struct inpcbhead tcb;
137#define	tcb6	tcb  /* for KAME src sync over BSD*'s */
138struct inpcbinfo tcbinfo;
139
140static void	 tcp_dooptions __P((struct tcpcb *,
141	    u_char *, int, struct tcphdr *, struct tcpopt *));
142static void	 tcp_pulloutofband __P((struct socket *,
143	    struct tcphdr *, struct mbuf *, int));
144static int	 tcp_reass __P((struct tcpcb *, struct tcphdr *, int *,
145				struct mbuf *));
146static void	 tcp_xmit_timer __P((struct tcpcb *, int));
147static int	 tcp_newreno __P((struct tcpcb *, struct tcphdr *));
148
/*
 * Upper-layer reachability hint for IPv6 Neighbor Discovery / Neighbor
 * Unreachability Detection.  When tp is non-NULL, has a PCB flagged
 * INP_IPV6 and that PCB holds a cached route, the route is passed to
 * nd6_nud_hint().  Without INET6 the macro expands to nothing.
 *
 * The argument is evaluated several times; pass a plain pointer.
 */
#ifdef INET6
#define	ND6_HINT(tp)							\
do {									\
	if ((tp) != NULL && (tp)->t_inpcb != NULL &&			\
	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 &&		\
	    (tp)->t_inpcb->in6p_route.ro_rt != NULL)			\
		nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0);	\
} while (0)
#else
#define	ND6_HINT(tp)
#endif
161
/*
 * Insert the segment which includes th into the reassembly queue of the
 * connection with control block tp, and store in `flags' TH_FIN if the
 * data handed to the socket ended with a segment carrying FIN (0
 * otherwise).
 *
 * The macro form does the common case inline (segment is the next to be
 * received on an established connection, and the queue is empty),
 * avoiding linkage into and removal from the queue and repetition of
 * various conversions.  Every other case is passed to tcp_reass().
 *
 * Set DELACK for segments received in order (when tcp_delack_enabled is
 * set, otherwise ack now), but ack immediately when segments are out of
 * order (so fast retransmit can work).
 *
 * Arguments:
 *	tp	- struct tcpcb * of the connection
 *	th	- struct tcphdr * of the segment
 *	tlenp	- int *; payload length, may be reduced by tcp_reass()
 *	m	- mbuf chain holding the payload; always consumed
 *	so	- socket that receives in-order data
 *	flags	- lvalue that is overwritten with the result
 *
 * Every argument is parenthesized in the expansion, but most are
 * evaluated more than once, so they must be free of side effects.
 * The expansion is a bare brace block rather than do { } while (0);
 * do not use it as the unbraced body of an `if' that has an `else'.
 */
#define	TCP_REASS(tp, th, tlenp, m, so, flags) { \
	if ((th)->th_seq == (tp)->rcv_nxt && \
	    LIST_EMPTY(&(tp)->t_segq) && \
	    (tp)->t_state == TCPS_ESTABLISHED) { \
		if (tcp_delack_enabled) \
			callout_reset((tp)->tt_delack, tcp_delacktime, \
			    tcp_timer_delack, (tp)); \
		else \
			(tp)->t_flags |= TF_ACKNOW; \
		(tp)->rcv_nxt += *(tlenp); \
		(flags) = (th)->th_flags & TH_FIN; \
		tcpstat.tcps_rcvpack++; \
		tcpstat.tcps_rcvbyte += *(tlenp); \
		ND6_HINT(tp); \
		sbappend(&(so)->so_rcv, (m)); \
		sorwakeup(so); \
	} else { \
		(flags) = tcp_reass((tp), (th), (tlenp), (m)); \
		(tp)->t_flags |= TF_ACKNOW; \
	} \
}
193
194static int
195tcp_reass(tp, th, tlenp, m)
196	register struct tcpcb *tp;
197	register struct tcphdr *th;
198	int *tlenp;
199	struct mbuf *m;
200{
201	struct tseg_qent *q;
202	struct tseg_qent *p = NULL;
203	struct tseg_qent *nq;
204	struct tseg_qent *te;
205	struct socket *so = tp->t_inpcb->inp_socket;
206	int flags;
207
208	/*
209	 * Call with th==0 after become established to
210	 * force pre-ESTABLISHED data up to user socket.
211	 */
212	if (th == 0)
213		goto present;
214
215	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
216	MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ,
217	       M_NOWAIT);
218	if (te == NULL) {
219		tcpstat.tcps_rcvmemdrop++;
220		m_freem(m);
221		return (0);
222	}
223
224	/*
225	 * Find a segment which begins after this one does.
226	 */
227	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
228		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
229			break;
230		p = q;
231	}
232
233	/*
234	 * If there is a preceding segment, it may provide some of
235	 * our data already.  If so, drop the data from the incoming
236	 * segment.  If it provides all of our data, drop us.
237	 */
238	if (p != NULL) {
239		register int i;
240		/* conversion to int (in i) handles seq wraparound */
241		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
242		if (i > 0) {
243			if (i >= *tlenp) {
244				tcpstat.tcps_rcvduppack++;
245				tcpstat.tcps_rcvdupbyte += *tlenp;
246				m_freem(m);
247				FREE(te, M_TSEGQ);
248				/*
249				 * Try to present any queued data
250				 * at the left window edge to the user.
251				 * This is needed after the 3-WHS
252				 * completes.
253				 */
254				goto present;	/* ??? */
255			}
256			m_adj(m, i);
257			*tlenp -= i;
258			th->th_seq += i;
259		}
260	}
261	tcpstat.tcps_rcvoopack++;
262	tcpstat.tcps_rcvoobyte += *tlenp;
263
264	/*
265	 * While we overlap succeeding segments trim them or,
266	 * if they are completely covered, dequeue them.
267	 */
268	while (q) {
269		register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
270		if (i <= 0)
271			break;
272		if (i < q->tqe_len) {
273			q->tqe_th->th_seq += i;
274			q->tqe_len -= i;
275			m_adj(q->tqe_m, i);
276			break;
277		}
278
279		nq = LIST_NEXT(q, tqe_q);
280		LIST_REMOVE(q, tqe_q);
281		m_freem(q->tqe_m);
282		FREE(q, M_TSEGQ);
283		q = nq;
284	}
285
286	/* Insert the new segment queue entry into place. */
287	te->tqe_m = m;
288	te->tqe_th = th;
289	te->tqe_len = *tlenp;
290
291	if (p == NULL) {
292		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
293	} else {
294		LIST_INSERT_AFTER(p, te, tqe_q);
295	}
296
297present:
298	/*
299	 * Present data to user, advancing rcv_nxt through
300	 * completed sequence space.
301	 */
302	if (!TCPS_HAVEESTABLISHED(tp->t_state))
303		return (0);
304	q = LIST_FIRST(&tp->t_segq);
305	if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
306		return (0);
307	do {
308		tp->rcv_nxt += q->tqe_len;
309		flags = q->tqe_th->th_flags & TH_FIN;
310		nq = LIST_NEXT(q, tqe_q);
311		LIST_REMOVE(q, tqe_q);
312		if (so->so_state & SS_CANTRCVMORE)
313			m_freem(q->tqe_m);
314		else
315			sbappend(&so->so_rcv, q->tqe_m);
316		FREE(q, M_TSEGQ);
317		q = nq;
318	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
319	ND6_HINT(tp);
320	sorwakeup(so);
321	return (flags);
322}
323
324/*
325 * TCP input routine, follows pages 65-76 of the
326 * protocol specification dated September, 1981 very closely.
327 */
#ifdef INET6
/*
 * IPv6 entry point for TCP segments.
 *
 *	mp	- address of the mbuf chain holding the packet.
 *	offp	- address of the offset handed on to tcp_input() as off0.
 *	proto	- protocol number, passed through to tcp_input().
 *
 * Packets flagged M_ANYCAST6 are answered with an ICMPv6 "destination
 * unreachable, address" error that points at the destination address
 * field of the IPv6 header (draft-itojun-ipv6-tcp-to-anycast); all other
 * packets go to tcp_input().  IPPROTO_DONE is returned in both cases.
 */
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	register struct mbuf *m = *mp;
	struct ip6_hdr *ip6;

	/*
	 * NOTE(review): IPPROTO_DONE is presumably what this macro returns
	 * from here when the check fails -- confirm against its definition.
	 */
	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);

	if ((m->m_flags & M_ANYCAST6) == 0) {
		tcp_input(m, *offp, proto);
		return (IPPROTO_DONE);
	}

	/* Anycast destination.  XXX better place to put this in? */
	ip6 = mtod(m, struct ip6_hdr *);
	icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
	    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
	return (IPPROTO_DONE);
}
#endif
355
356void
357tcp_input(m, off0, proto)
358	register struct mbuf *m;
359	int off0, proto;
360{
361	register struct tcphdr *th;
362	register struct ip *ip = NULL;
363	register struct ipovly *ipov;
364	register struct inpcb *inp;
365	u_char *optp = NULL;
366	int optlen = 0;
367	int len, tlen, off;
368	int drop_hdrlen;
369	register struct tcpcb *tp = 0;
370	register int thflags;
371	struct socket *so = 0;
372	int todrop, acked, ourfinisacked, needoutput = 0;
373	struct in_addr laddr;
374#ifdef INET6
375	struct in6_addr laddr6;
376#endif
377	int dropsocket = 0;
378	int iss = 0;
379	u_long tiwin;
380	struct tcpopt to;		/* options in this segment */
381	struct rmxp_tao *taop;		/* pointer to our TAO cache entry */
382	struct rmxp_tao	tao_noncached;	/* in case there's no cached entry */
383#ifdef TCPDEBUG
384	short ostate = 0;
385#endif
386#ifdef INET6
387	struct ip6_hdr *ip6 = NULL;
388	int isipv6;
389#endif /* INET6 */
390
391#ifdef INET6
392	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
393#endif
394	bzero((char *)&to, sizeof(to));
395
396	tcpstat.tcps_rcvtotal++;
397
398#ifdef INET6
399	if (isipv6) {
400		/* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
401		ip6 = mtod(m, struct ip6_hdr *);
402		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
403		if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
404			tcpstat.tcps_rcvbadsum++;
405			goto drop;
406		}
407		th = (struct tcphdr *)((caddr_t)ip6 + off0);
408	} else
409#endif /* INET6 */
410      {
411	/*
412	 * Get IP and TCP header together in first mbuf.
413	 * Note: IP leaves IP header in first mbuf.
414	 */
415	if (off0 > sizeof (struct ip)) {
416		ip_stripoptions(m, (struct mbuf *)0);
417		off0 = sizeof(struct ip);
418	}
419	if (m->m_len < sizeof (struct tcpiphdr)) {
420		if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
421			tcpstat.tcps_rcvshort++;
422			return;
423		}
424	}
425	ip = mtod(m, struct ip *);
426	ipov = (struct ipovly *)ip;
427	th = (struct tcphdr *)((caddr_t)ip + off0);
428	tlen = ip->ip_len;
429
430	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
431		if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
432                	th->th_sum = m->m_pkthdr.csum_data;
433		else
434	                th->th_sum = in_pseudo(ip->ip_src.s_addr,
435			    ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
436			    ip->ip_len + IPPROTO_TCP));
437		th->th_sum ^= 0xffff;
438	} else {
439		/*
440		 * Checksum extended TCP header and data.
441		 */
442		len = sizeof (struct ip) + tlen;
443		bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
444		ipov->ih_len = (u_short)tlen;
445		HTONS(ipov->ih_len);
446		th->th_sum = in_cksum(m, len);
447	}
448	if (th->th_sum) {
449		tcpstat.tcps_rcvbadsum++;
450		goto drop;
451	}
452#ifdef INET6
453	/* Re-initialization for later version check */
454	ip->ip_v = IPVERSION;
455#endif
456      }
457
458	/*
459	 * Check that TCP offset makes sense,
460	 * pull out TCP options and adjust length.		XXX
461	 */
462	off = th->th_off << 2;
463	if (off < sizeof (struct tcphdr) || off > tlen) {
464		tcpstat.tcps_rcvbadoff++;
465		goto drop;
466	}
467	tlen -= off;	/* tlen is used instead of ti->ti_len */
468	if (off > sizeof (struct tcphdr)) {
469#ifdef INET6
470		if (isipv6) {
471			IP6_EXTHDR_CHECK(m, off0, off, );
472			ip6 = mtod(m, struct ip6_hdr *);
473			th = (struct tcphdr *)((caddr_t)ip6 + off0);
474		} else
475#endif /* INET6 */
476	      {
477		if (m->m_len < sizeof(struct ip) + off) {
478			if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
479				tcpstat.tcps_rcvshort++;
480				return;
481			}
482			ip = mtod(m, struct ip *);
483			ipov = (struct ipovly *)ip;
484			th = (struct tcphdr *)((caddr_t)ip + off0);
485		}
486	      }
487		optlen = off - sizeof (struct tcphdr);
488		optp = (u_char *)(th + 1);
489	}
490	thflags = th->th_flags;
491
492#ifdef TCP_DROP_SYNFIN
493	/*
494	 * If the drop_synfin option is enabled, drop all packets with
495	 * both the SYN and FIN bits set. This prevents e.g. nmap from
496	 * identifying the TCP/IP stack.
497	 *
498	 * This is incompatible with RFC1644 extensions (T/TCP).
499	 */
500	if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
501		goto drop;
502#endif
503
504	/*
505	 * Convert TCP protocol specific fields to host format.
506	 */
507	NTOHL(th->th_seq);
508	NTOHL(th->th_ack);
509	NTOHS(th->th_win);
510	NTOHS(th->th_urp);
511
512	/*
513	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
514	 * until after ip6_savecontrol() is called and before other functions
515	 * which don't want those proto headers.
516	 * Because ip6_savecontrol() is going to parse the mbuf to
517	 * search for data to be passed up to user-land, it wants mbuf
518	 * parameters to be unchanged.
519	 */
520	drop_hdrlen = off0 + off;
521
522	/*
523	 * Locate pcb for segment.
524	 */
525findpcb:
526#ifdef IPFIREWALL_FORWARD
527	if (ip_fw_fwd_addr != NULL
528#ifdef INET6
529	    && isipv6 == NULL /* IPv6 support is not yet */
530#endif /* INET6 */
531	    ) {
532		/*
533		 * Diverted. Pretend to be the destination.
534		 * already got one like this?
535		 */
536		inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
537			ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
538		if (!inp) {
539			/*
540			 * No, then it's new. Try find the ambushing socket
541			 */
542			if (!ip_fw_fwd_addr->sin_port) {
543				inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
544				    th->th_sport, ip_fw_fwd_addr->sin_addr,
545				    th->th_dport, 1, m->m_pkthdr.rcvif);
546			} else {
547				inp = in_pcblookup_hash(&tcbinfo,
548				    ip->ip_src, th->th_sport,
549	    			    ip_fw_fwd_addr->sin_addr,
550				    ntohs(ip_fw_fwd_addr->sin_port), 1,
551				    m->m_pkthdr.rcvif);
552			}
553		}
554		ip_fw_fwd_addr = NULL;
555	} else
556#endif	/* IPFIREWALL_FORWARD */
557      {
558#ifdef INET6
559	if (isipv6)
560		inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
561					 &ip6->ip6_dst, th->th_dport, 1,
562					 m->m_pkthdr.rcvif);
563	else
564#endif /* INET6 */
565	inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
566	    ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
567      }
568
569#ifdef IPSEC
570#ifdef INET6
571	if (isipv6) {
572		if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
573			ipsec6stat.in_polvio++;
574			goto drop;
575		}
576	} else
577#endif /* INET6 */
578	if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
579		ipsecstat.in_polvio++;
580		goto drop;
581	}
582#endif /*IPSEC*/
583
584	/*
585	 * If the state is CLOSED (i.e., TCB does not exist) then
586	 * all data in the incoming segment is discarded.
587	 * If the TCB exists but is in CLOSED state, it is embryonic,
588	 * but should either do a listen or a connect soon.
589	 */
590	if (inp == NULL) {
591		if (log_in_vain) {
592#ifdef INET6
593			char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN];
594#else /* INET6 */
595			char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"];
596#endif /* INET6 */
597
598#ifdef INET6
599			if (isipv6) {
600				strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst));
601				strcpy(sbuf, ip6_sprintf(&ip6->ip6_src));
602			} else
603#endif
604		      {
605			strcpy(dbuf, inet_ntoa(ip->ip_dst));
606			strcpy(sbuf, inet_ntoa(ip->ip_src));
607		      }
608			switch (log_in_vain) {
609			case 1:
610				if(thflags & TH_SYN)
611					log(LOG_INFO,
612			    		"Connection attempt to TCP %s:%d from %s:%d\n",
613			    		dbuf, ntohs(th->th_dport),
614					sbuf,
615					ntohs(th->th_sport));
616				break;
617			case 2:
618				log(LOG_INFO,
619			    	"Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
620			    	dbuf, ntohs(th->th_dport), sbuf,
621			    	ntohs(th->th_sport), thflags);
622				break;
623			default:
624				break;
625			}
626		}
627		if (blackhole) {
628			switch (blackhole) {
629			case 1:
630				if (thflags & TH_SYN)
631					goto drop;
632				break;
633			case 2:
634				goto drop;
635			default:
636				goto drop;
637			}
638		}
639		goto maybedropwithreset;
640	}
641	tp = intotcpcb(inp);
642	if (tp == 0)
643		goto maybedropwithreset;
644	if (tp->t_state == TCPS_CLOSED)
645		goto drop;
646
647	/* Unscale the window into a 32-bit value. */
648	if ((thflags & TH_SYN) == 0)
649		tiwin = th->th_win << tp->snd_scale;
650	else
651		tiwin = th->th_win;
652
653#ifdef INET6
654	/* save packet options if user wanted */
655	if (isipv6 && inp->in6p_flags & INP_CONTROLOPTS) {
656		if (inp->in6p_options) {
657			m_freem(inp->in6p_options);
658			inp->in6p_options = 0;
659		}
660		ip6_savecontrol(inp, &inp->in6p_options, ip6, m);
661	}
662        /* else, should also do ip_srcroute() here? */
663#endif /* INET6 */
664
665	so = inp->inp_socket;
666	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
667#ifdef TCPDEBUG
668		if (so->so_options & SO_DEBUG) {
669			ostate = tp->t_state;
670#ifdef INET6
671			if (isipv6)
672				bcopy((char *)ip6, (char *)tcp_saveipgen,
673				      sizeof(*ip6));
674			else
675#endif /* INET6 */
676			bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
677			tcp_savetcp = *th;
678		}
679#endif
680		if (so->so_options & SO_ACCEPTCONN) {
681			register struct tcpcb *tp0 = tp;
682			struct socket *so2;
683#ifdef IPSEC
684			struct socket *oso;
685#endif
686#ifdef INET6
687			struct inpcb *oinp = sotoinpcb(so);
688#endif /* INET6 */
689
690#ifndef IPSEC
691			/*
692			 * Current IPsec implementation makes incorrect IPsec
693			 * cache if this check is done here.
694			 * So delay this until duplicated socket is created.
695			 */
696			if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
697				/*
698				 * Note: dropwithreset makes sure we don't
699				 * send a RST in response to a RST.
700				 */
701				if (thflags & TH_ACK) {
702					tcpstat.tcps_badsyn++;
703					goto maybedropwithreset;
704				}
705				goto drop;
706			}
707#endif
708			so2 = sonewconn(so, 0);
709			if (so2 == 0) {
710				tcpstat.tcps_listendrop++;
711				so2 = sodropablereq(so);
712				if (so2) {
713					tcp_drop(sototcpcb(so2), ETIMEDOUT);
714					so2 = sonewconn(so, 0);
715				}
716				if (!so2)
717					goto drop;
718			}
719#ifdef IPSEC
720			oso = so;
721#endif
722			so = so2;
723			/*
724			 * This is ugly, but ....
725			 *
726			 * Mark socket as temporary until we're
727			 * committed to keeping it.  The code at
728			 * ``drop'' and ``dropwithreset'' check the
729			 * flag dropsocket to see if the temporary
730			 * socket created here should be discarded.
731			 * We mark the socket as discardable until
732			 * we're committed to it below in TCPS_LISTEN.
733			 */
734			dropsocket++;
735			inp = (struct inpcb *)so->so_pcb;
736#ifdef INET6
737			if (isipv6)
738				inp->in6p_laddr = ip6->ip6_dst;
739			else {
740				if ((inp->inp_flags & IN6P_BINDV6ONLY) == 0) {
741					inp->inp_vflag &= ~INP_IPV6;
742					inp->inp_vflag |= INP_IPV4;
743				}
744#endif /* INET6 */
745			inp->inp_laddr = ip->ip_dst;
746#ifdef INET6
747			}
748#endif /* INET6 */
749			inp->inp_lport = th->th_dport;
750			if (in_pcbinshash(inp) != 0) {
751				/*
752				 * Undo the assignments above if we failed to
753				 * put the PCB on the hash lists.
754				 */
755#ifdef INET6
756				if (isipv6)
757					inp->in6p_laddr = in6addr_any;
758				else
759#endif /* INET6 */
760				inp->inp_laddr.s_addr = INADDR_ANY;
761				inp->inp_lport = 0;
762				goto drop;
763			}
764#ifdef IPSEC
765			/*
766			 * To avoid creating incorrectly cached IPsec
767			 * association, this needs to be done here.
768			 *
769			 * Subject: (KAME-snap 748)
770			 * From: Wayne Knowles <w.knowles@niwa.cri.nz>
771			 * ftp://ftp.kame.net/pub/mail-list/snap-users/748
772			 */
773			if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
774				/*
775				 * Note: dropwithreset makes sure we don't
776				 * send a RST in response to a RST.
777				 */
778				if (thflags & TH_ACK) {
779					tcpstat.tcps_badsyn++;
780					goto maybedropwithreset;
781				}
782				goto drop;
783			}
784#endif
785#ifdef INET6
786			if (isipv6) {
787				/*
788				 * inherit socket options from the listening
789				 * socket.
790				 */
791				inp->inp_flags |=
792					oinp->inp_flags & INP_CONTROLOPTS;
793				if (inp->inp_flags & INP_CONTROLOPTS) {
794					if (inp->in6p_options) {
795						m_freem(inp->in6p_options);
796						inp->in6p_options = 0;
797					}
798					ip6_savecontrol(inp,
799							&inp->in6p_options,
800							ip6, m);
801				}
802			} else
803#endif /* INET6 */
804			inp->inp_options = ip_srcroute();
805#ifdef IPSEC
806			/* copy old policy into new socket's */
807			if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp,
808			                      inp->inp_sp))
809				printf("tcp_input: could not copy policy\n");
810#endif
811			tp = intotcpcb(inp);
812			tp->t_state = TCPS_LISTEN;
813			tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT);
814
815			/* Compute proper scaling value from buffer space */
816			while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
817			   TCP_MAXWIN << tp->request_r_scale <
818			   so->so_rcv.sb_hiwat)
819				tp->request_r_scale++;
820		}
821	}
822
823	/*
824	 * Segment received on connection.
825	 * Reset idle time and keep-alive timer.
826	 */
827	tp->t_rcvtime = ticks;
828	if (TCPS_HAVEESTABLISHED(tp->t_state))
829		callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
830
831	/*
832	 * Process options if not in LISTEN state,
833	 * else do it below (after getting remote address).
834	 */
835	if (tp->t_state != TCPS_LISTEN)
836		tcp_dooptions(tp, optp, optlen, th, &to);
837
838	/*
839	 * Header prediction: check for the two common cases
840	 * of a uni-directional data xfer.  If the packet has
841	 * no control flags, is in-sequence, the window didn't
842	 * change and we're not retransmitting, it's a
843	 * candidate.  If the length is zero and the ack moved
844	 * forward, we're the sender side of the xfer.  Just
845	 * free the data acked & wake any higher level process
846	 * that was blocked waiting for space.  If the length
847	 * is non-zero and the ack didn't move, we're the
848	 * receiver side.  If we're getting packets in-order
849	 * (the reassembly queue is empty), add the data to
850	 * the socket buffer and note that we need a delayed ack.
851	 * Make sure that the hidden state-flags are also off.
852	 * Since we check for TCPS_ESTABLISHED above, it can only
853	 * be TH_NEEDSYN.
854	 */
855	if (tp->t_state == TCPS_ESTABLISHED &&
856	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
857	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
858	    ((to.to_flag & TOF_TS) == 0 ||
859	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
860	    /*
861	     * Using the CC option is compulsory if once started:
862	     *   the segment is OK if no T/TCP was negotiated or
863	     *   if the segment has a CC option equal to CCrecv
864	     */
865	    ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) ||
866	     ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) &&
867	    th->th_seq == tp->rcv_nxt &&
868	    tiwin && tiwin == tp->snd_wnd &&
869	    tp->snd_nxt == tp->snd_max) {
870
871		/*
872		 * If last ACK falls within this segment's sequence numbers,
873		 * record the timestamp.
874		 * NOTE that the test is modified according to the latest
875		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
876		 */
877		if ((to.to_flag & TOF_TS) != 0 &&
878		   SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
879			tp->ts_recent_age = ticks;
880			tp->ts_recent = to.to_tsval;
881		}
882
883		if (tlen == 0) {
884			if (SEQ_GT(th->th_ack, tp->snd_una) &&
885			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
886			    tp->snd_cwnd >= tp->snd_wnd &&
887			    tp->t_dupacks < tcprexmtthresh) {
888				/*
889				 * this is a pure ack for outstanding data.
890				 */
891				++tcpstat.tcps_predack;
892				/*
893				 * "bad retransmit" recovery
894				 */
895				if (tp->t_rxtshift == 1 &&
896				    ticks < tp->t_badrxtwin) {
897					tp->snd_cwnd = tp->snd_cwnd_prev;
898					tp->snd_ssthresh =
899					    tp->snd_ssthresh_prev;
900					tp->snd_nxt = tp->snd_max;
901					tp->t_badrxtwin = 0;
902				}
903				if ((to.to_flag & TOF_TS) != 0)
904					tcp_xmit_timer(tp,
905					    ticks - to.to_tsecr + 1);
906				else if (tp->t_rtttime &&
907					    SEQ_GT(th->th_ack, tp->t_rtseq))
908					tcp_xmit_timer(tp, ticks - tp->t_rtttime);
909				acked = th->th_ack - tp->snd_una;
910				tcpstat.tcps_rcvackpack++;
911				tcpstat.tcps_rcvackbyte += acked;
912				sbdrop(&so->so_snd, acked);
913				tp->snd_una = th->th_ack;
914				m_freem(m);
915				ND6_HINT(tp); /* some progress has been done */
916
917				/*
918				 * If all outstanding data are acked, stop
919				 * retransmit timer, otherwise restart timer
920				 * using current (possibly backed-off) value.
921				 * If process is waiting for space,
922				 * wakeup/selwakeup/signal.  If data
923				 * are ready to send, let tcp_output
924				 * decide between more output or persist.
925				 */
926				if (tp->snd_una == tp->snd_max)
927					callout_stop(tp->tt_rexmt);
928				else if (!callout_active(tp->tt_persist))
929					callout_reset(tp->tt_rexmt,
930						      tp->t_rxtcur,
931						      tcp_timer_rexmt, tp);
932
933				sowwakeup(so);
934				if (so->so_snd.sb_cc)
935					(void) tcp_output(tp);
936				return;
937			}
938		} else if (th->th_ack == tp->snd_una &&
939		    LIST_EMPTY(&tp->t_segq) &&
940		    tlen <= sbspace(&so->so_rcv)) {
941			/*
942			 * this is a pure, in-sequence data packet
943			 * with nothing on the reassembly queue and
944			 * we have enough buffer space to take it.
945			 */
946			++tcpstat.tcps_preddat;
947			tp->rcv_nxt += tlen;
948			tcpstat.tcps_rcvpack++;
949			tcpstat.tcps_rcvbyte += tlen;
950			ND6_HINT(tp);	/* some progress has been done */
951			/*
952			 * Add data to socket buffer.
953			 */
954			m_adj(m, drop_hdrlen);	/* delayed header drop */
955			sbappend(&so->so_rcv, m);
956			sorwakeup(so);
957			if (tcp_delack_enabled) {
958	                        callout_reset(tp->tt_delack, tcp_delacktime,
959	                            tcp_timer_delack, tp);
960			} else {
961				tp->t_flags |= TF_ACKNOW;
962				tcp_output(tp);
963			}
964			return;
965		}
966	}
967
968	/*
969	 * Calculate amount of space in receive window,
970	 * and then do TCP input processing.
971	 * Receive window is amount of space in rcv queue,
972	 * but not less than advertised window.
973	 */
974	{ int win;
975
976	win = sbspace(&so->so_rcv);
977	if (win < 0)
978		win = 0;
979	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
980	}
981
982	switch (tp->t_state) {
983
984	/*
985	 * If the state is LISTEN then ignore segment if it contains an RST.
986	 * If the segment contains an ACK then it is bad and send a RST.
987	 * If it does not contain a SYN then it is not interesting; drop it.
988	 * If it is from this socket, drop it, it must be forged.
989	 * Don't bother responding if the destination was a broadcast.
990	 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
991	 * tp->iss, and send a segment:
992	 *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
993	 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
994	 * Fill in remote peer address fields if not previously specified.
995	 * Enter SYN_RECEIVED state, and process any other fields of this
996	 * segment in this state.
997	 */
998	case TCPS_LISTEN: {
999		register struct sockaddr_in *sin;
1000#ifdef INET6
1001		register struct sockaddr_in6 *sin6;
1002#endif
1003
1004		if (thflags & TH_RST)
1005			goto drop;
1006		if (thflags & TH_ACK)
1007			goto maybedropwithreset;
1008		if ((thflags & TH_SYN) == 0)
1009			goto drop;
1010		if (th->th_dport == th->th_sport) {
1011#ifdef INET6
1012			if (isipv6) {
1013				if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
1014						       &ip6->ip6_src))
1015					goto drop;
1016			} else
1017#endif /* INET6 */
1018			if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
1019				goto drop;
1020		}
1021		/*
1022		 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1023		 * in_broadcast() should never return true on a received
1024		 * packet with M_BCAST not set.
1025 		 *
1026 		 * Packets with a multicast source address should also
1027 		 * be discarded.
1028		 */
1029		if (m->m_flags & (M_BCAST|M_MCAST))
1030			goto drop;
1031#ifdef INET6
1032		if (isipv6) {
1033			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
1034			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
1035				goto drop;
1036		} else
1037#endif
1038		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
1039		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
1040		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST))
1041			goto drop;
1042#ifdef INET6
1043		if (isipv6) {
1044			MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
1045			       M_SONAME, M_NOWAIT);
1046			if (sin6 == NULL)
1047				goto drop;
1048			bzero(sin6, sizeof(*sin6));
1049			sin6->sin6_family = AF_INET6;
1050			sin6->sin6_len = sizeof(*sin6);
1051			sin6->sin6_addr = ip6->ip6_src;
1052			sin6->sin6_port = th->th_sport;
1053			laddr6 = inp->in6p_laddr;
1054			if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
1055				inp->in6p_laddr = ip6->ip6_dst;
1056			if (in6_pcbconnect(inp, (struct sockaddr *)sin6,
1057					   &proc0)) {
1058				inp->in6p_laddr = laddr6;
1059				FREE(sin6, M_SONAME);
1060				goto drop;
1061			}
1062			FREE(sin6, M_SONAME);
1063		} else
1064#endif
1065	      {
1066		MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
1067		       M_NOWAIT);
1068		if (sin == NULL)
1069			goto drop;
1070		sin->sin_family = AF_INET;
1071		sin->sin_len = sizeof(*sin);
1072		sin->sin_addr = ip->ip_src;
1073		sin->sin_port = th->th_sport;
1074		bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
1075		laddr = inp->inp_laddr;
1076		if (inp->inp_laddr.s_addr == INADDR_ANY)
1077			inp->inp_laddr = ip->ip_dst;
1078		if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) {
1079			inp->inp_laddr = laddr;
1080			FREE(sin, M_SONAME);
1081			goto drop;
1082		}
1083		FREE(sin, M_SONAME);
1084	      }
1085		tp->t_template = tcp_template(tp);
1086		if (tp->t_template == 0) {
1087			tp = tcp_drop(tp, ENOBUFS);
1088			dropsocket = 0;		/* socket is already gone */
1089			goto drop;
1090		}
1091		if ((taop = tcp_gettaocache(inp)) == NULL) {
1092			taop = &tao_noncached;
1093			bzero(taop, sizeof(*taop));
1094		}
1095		tcp_dooptions(tp, optp, optlen, th, &to);
1096		if (iss)
1097			tp->iss = iss;
1098		else
1099			tp->iss = tcp_iss;
1100		tcp_iss += TCP_ISSINCR/4;
1101		tp->irs = th->th_seq;
1102		tcp_sendseqinit(tp);
1103		tcp_rcvseqinit(tp);
1104		tp->snd_recover = tp->snd_una;
1105		/*
1106		 * Initialization of the tcpcb for transaction;
1107		 *   set SND.WND = SEG.WND,
1108		 *   initialize CCsend and CCrecv.
1109		 */
1110		tp->snd_wnd = tiwin;	/* initial send-window */
1111		tp->cc_send = CC_INC(tcp_ccgen);
1112		tp->cc_recv = to.to_cc;
1113		/*
1114		 * Perform TAO test on incoming CC (SEG.CC) option, if any.
1115		 * - compare SEG.CC against cached CC from the same host,
1116		 *	if any.
1117		 * - if SEG.CC > cached value, SYN must be new and is accepted
1118		 *	immediately: save new CC in the cache, mark the socket
1119		 *	connected, enter ESTABLISHED state, turn on flag to
1120		 *	send a SYN in the next segment.
1121		 *	A virtual advertised window is set in rcv_adv to
1122		 *	initialize SWS prevention.  Then enter normal segment
1123		 *	processing: drop SYN, process data and FIN.
1124		 * - otherwise do a normal 3-way handshake.
1125		 */
1126		if ((to.to_flag & TOF_CC) != 0) {
1127		    if (((tp->t_flags & TF_NOPUSH) != 0) &&
1128			taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) {
1129
1130			taop->tao_cc = to.to_cc;
1131			tp->t_starttime = ticks;
1132			tp->t_state = TCPS_ESTABLISHED;
1133
1134			/*
1135			 * If there is a FIN, or if there is data and the
1136			 * connection is local, then delay SYN,ACK(SYN) in
1137			 * the hope of piggy-backing it on a response
1138			 * segment.  Otherwise must send ACK now in case
1139			 * the other side is slow starting.
1140			 */
1141			if (tcp_delack_enabled && ((thflags & TH_FIN) ||
1142			    (tlen != 0 &&
1143#ifdef INET6
1144			      ((isipv6 && in6_localaddr(&inp->in6p_faddr))
1145			      ||
1146			      (!isipv6 &&
1147#endif
1148			    in_localaddr(inp->inp_faddr)
1149#ifdef INET6
1150			       ))
1151#endif
1152			     ))) {
1153                                callout_reset(tp->tt_delack, tcp_delacktime,
1154                                    tcp_timer_delack, tp);
1155				tp->t_flags |= TF_NEEDSYN;
1156			} else
1157				tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
1158
1159			/*
1160			 * Limit the `virtual advertised window' to TCP_MAXWIN
1161			 * here.  Even if we requested window scaling, it will
1162			 * become effective only later when our SYN is acked.
1163			 */
1164			tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN);
1165			tcpstat.tcps_connects++;
1166			soisconnected(so);
1167			callout_reset(tp->tt_keep, tcp_keepinit,
1168				      tcp_timer_keep, tp);
1169			dropsocket = 0;		/* committed to socket */
1170			tcpstat.tcps_accepts++;
1171			goto trimthenstep6;
1172		    }
1173		/* else do standard 3-way handshake */
1174		} else {
1175		    /*
1176		     * No CC option, but maybe CC.NEW:
1177		     *   invalidate cached value.
1178		     */
1179		     taop->tao_cc = 0;
1180		}
1181		/*
1182		 * TAO test failed or there was no CC option,
1183		 *    do a standard 3-way handshake.
1184		 */
1185		tp->t_flags |= TF_ACKNOW;
1186		tp->t_state = TCPS_SYN_RECEIVED;
1187		callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
1188		dropsocket = 0;		/* committed to socket */
1189		tcpstat.tcps_accepts++;
1190		goto trimthenstep6;
1191		}
1192
1193	/*
1194	 * If the state is SYN_RECEIVED:
1195	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
1196	 */
1197	case TCPS_SYN_RECEIVED:
1198		if ((thflags & TH_ACK) &&
1199		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1200		     SEQ_GT(th->th_ack, tp->snd_max)))
1201				goto maybedropwithreset;
1202		break;
1203
1204	/*
1205	 * If the state is SYN_SENT:
1206	 *	if seg contains an ACK, but not for our SYN, drop the input.
1207	 *	if seg contains a RST, then drop the connection.
1208	 *	if seg does not contain SYN, then drop it.
1209	 * Otherwise this is an acceptable SYN segment
1210	 *	initialize tp->rcv_nxt and tp->irs
1211	 *	if seg contains ack then advance tp->snd_una
1212	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1213	 *	arrange for segment to be acked (eventually)
1214	 *	continue processing rest of data/controls, beginning with URG
1215	 */
1216	case TCPS_SYN_SENT:
1217		if ((taop = tcp_gettaocache(inp)) == NULL) {
1218			taop = &tao_noncached;
1219			bzero(taop, sizeof(*taop));
1220		}
1221
1222		if ((thflags & TH_ACK) &&
1223		    (SEQ_LEQ(th->th_ack, tp->iss) ||
1224		     SEQ_GT(th->th_ack, tp->snd_max))) {
1225			/*
1226			 * If we have a cached CCsent for the remote host,
1227			 * hence we haven't just crashed and restarted,
1228			 * do not send a RST.  This may be a retransmission
1229			 * from the other side after our earlier ACK was lost.
1230			 * Our new SYN, when it arrives, will serve as the
1231			 * needed ACK.
1232			 */
1233			if (taop->tao_ccsent != 0)
1234				goto drop;
1235			else
1236				goto dropwithreset;
1237		}
1238		if (thflags & TH_RST) {
1239			if (thflags & TH_ACK)
1240				tp = tcp_drop(tp, ECONNREFUSED);
1241			goto drop;
1242		}
1243		if ((thflags & TH_SYN) == 0)
1244			goto drop;
1245		tp->snd_wnd = th->th_win;	/* initial send window */
1246		tp->cc_recv = to.to_cc;		/* foreign CC */
1247
1248		tp->irs = th->th_seq;
1249		tcp_rcvseqinit(tp);
1250		if (thflags & TH_ACK) {
1251			/*
1252			 * Our SYN was acked.  If segment contains CC.ECHO
1253			 * option, check it to make sure this segment really
1254			 * matches our SYN.  If not, just drop it as old
1255			 * duplicate, but send an RST if we're still playing
1256			 * by the old rules.  If no CC.ECHO option, make sure
1257			 * we don't get fooled into using T/TCP.
1258			 */
1259			if (to.to_flag & TOF_CCECHO) {
1260				if (tp->cc_send != to.to_ccecho) {
1261					if (taop->tao_ccsent != 0)
1262						goto drop;
1263					else
1264						goto dropwithreset;
1265				}
1266			} else
1267				tp->t_flags &= ~TF_RCVD_CC;
1268			tcpstat.tcps_connects++;
1269			soisconnected(so);
1270			/* Do window scaling on this connection? */
1271			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1272				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
1273				tp->snd_scale = tp->requested_s_scale;
1274				tp->rcv_scale = tp->request_r_scale;
1275			}
1276			/* Segment is acceptable, update cache if undefined. */
1277			if (taop->tao_ccsent == 0)
1278				taop->tao_ccsent = to.to_ccecho;
1279
1280			tp->rcv_adv += tp->rcv_wnd;
1281			tp->snd_una++;		/* SYN is acked */
1282			/*
1283			 * If there's data, delay ACK; if there's also a FIN
1284			 * ACKNOW will be turned on later.
1285			 */
1286			if (tcp_delack_enabled && tlen != 0)
1287                                callout_reset(tp->tt_delack, tcp_delacktime,
1288                                    tcp_timer_delack, tp);
1289			else
1290				tp->t_flags |= TF_ACKNOW;
1291			/*
1292			 * Received <SYN,ACK> in SYN_SENT[*] state.
1293			 * Transitions:
1294			 *	SYN_SENT  --> ESTABLISHED
1295			 *	SYN_SENT* --> FIN_WAIT_1
1296			 */
1297			tp->t_starttime = ticks;
1298			if (tp->t_flags & TF_NEEDFIN) {
1299				tp->t_state = TCPS_FIN_WAIT_1;
1300				tp->t_flags &= ~TF_NEEDFIN;
1301				thflags &= ~TH_SYN;
1302			} else {
1303				tp->t_state = TCPS_ESTABLISHED;
1304				callout_reset(tp->tt_keep, tcp_keepidle,
1305					      tcp_timer_keep, tp);
1306			}
1307		} else {
1308		/*
1309		 *  Received initial SYN in SYN-SENT[*] state => simul-
1310		 *  taneous open.  If segment contains CC option and there is
1311		 *  a cached CC, apply TAO test; if it succeeds, connection is
1312		 *  half-synchronized.  Otherwise, do 3-way handshake:
1313		 *        SYN-SENT -> SYN-RECEIVED
1314		 *        SYN-SENT* -> SYN-RECEIVED*
1315		 *  If there was no CC option, clear cached CC value.
1316		 */
1317			tp->t_flags |= TF_ACKNOW;
1318			callout_stop(tp->tt_rexmt);
1319			if (to.to_flag & TOF_CC) {
1320				if (taop->tao_cc != 0 &&
1321				    CC_GT(to.to_cc, taop->tao_cc)) {
1322					/*
1323					 * update cache and make transition:
1324					 *        SYN-SENT -> ESTABLISHED*
1325					 *        SYN-SENT* -> FIN-WAIT-1*
1326					 */
1327					taop->tao_cc = to.to_cc;
1328					tp->t_starttime = ticks;
1329					if (tp->t_flags & TF_NEEDFIN) {
1330						tp->t_state = TCPS_FIN_WAIT_1;
1331						tp->t_flags &= ~TF_NEEDFIN;
1332					} else {
1333						tp->t_state = TCPS_ESTABLISHED;
1334						callout_reset(tp->tt_keep,
1335							      tcp_keepidle,
1336							      tcp_timer_keep,
1337							      tp);
1338					}
1339					tp->t_flags |= TF_NEEDSYN;
1340				} else
1341					tp->t_state = TCPS_SYN_RECEIVED;
1342			} else {
1343				/* CC.NEW or no option => invalidate cache */
1344				taop->tao_cc = 0;
1345				tp->t_state = TCPS_SYN_RECEIVED;
1346			}
1347		}
1348
1349trimthenstep6:
1350		/*
1351		 * Advance th->th_seq to correspond to first data byte.
1352		 * If data, trim to stay within window,
1353		 * dropping FIN if necessary.
1354		 */
1355		th->th_seq++;
1356		if (tlen > tp->rcv_wnd) {
1357			todrop = tlen - tp->rcv_wnd;
1358			m_adj(m, -todrop);
1359			tlen = tp->rcv_wnd;
1360			thflags &= ~TH_FIN;
1361			tcpstat.tcps_rcvpackafterwin++;
1362			tcpstat.tcps_rcvbyteafterwin += todrop;
1363		}
1364		tp->snd_wl1 = th->th_seq - 1;
1365		tp->rcv_up = th->th_seq;
1366		/*
1367		 *  Client side of transaction: already sent SYN and data.
1368		 *  If the remote host used T/TCP to validate the SYN,
1369		 *  our data will be ACK'd; if so, enter normal data segment
1370		 *  processing in the middle of step 5, ack processing.
1371		 *  Otherwise, goto step 6.
1372		 */
1373 		if (thflags & TH_ACK)
1374			goto process_ACK;
1375		goto step6;
1376	/*
1377	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1378	 *	if segment contains a SYN and CC [not CC.NEW] option:
1379	 *              if state == TIME_WAIT and connection duration > MSL,
1380	 *                  drop packet and send RST;
1381	 *
1382	 *		if SEG.CC > CCrecv then is new SYN, and can implicitly
1383	 *		    ack the FIN (and data) in retransmission queue.
1384	 *                  Complete close and delete TCPCB.  Then reprocess
1385	 *                  segment, hoping to find new TCPCB in LISTEN state;
1386	 *
1387	 *		else must be old SYN; drop it.
1388	 *      else do normal processing.
1389	 */
1390	case TCPS_LAST_ACK:
1391	case TCPS_CLOSING:
1392	case TCPS_TIME_WAIT:
1393		if ((thflags & TH_SYN) &&
1394		    (to.to_flag & TOF_CC) && tp->cc_recv != 0) {
1395			if (tp->t_state == TCPS_TIME_WAIT &&
1396					(ticks - tp->t_starttime) > tcp_msl)
1397				goto dropwithreset;
1398			if (CC_GT(to.to_cc, tp->cc_recv)) {
1399				tp = tcp_close(tp);
1400				goto findpcb;
1401			}
1402			else
1403				goto drop;
1404		}
1405 		break;  /* continue normal processing */
1406	}
1407
1408	/*
1409	 * States other than LISTEN or SYN_SENT.
1410	 * First check the RST flag and sequence number since reset segments
1411	 * are exempt from the timestamp and connection count tests.  This
1412	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1413	 * below which allowed reset segments in half the sequence space
1414	 * to fall through and be processed (which gives forged reset
1415	 * segments with a random sequence number a 50 percent chance of
1416	 * killing a connection).
1417	 * Then check timestamp, if present.
1418	 * Then check the connection count, if present.
1419	 * Then check that at least some bytes of segment are within
1420	 * receive window.  If segment begins before rcv_nxt,
1421	 * drop leading data (and SYN); if nothing left, just ack.
1422	 *
1423	 *
1424	 * If the RST bit is set, check the sequence number to see
1425	 * if this is a valid reset segment.
1426	 * RFC 793 page 37:
1427	 *   In all states except SYN-SENT, all reset (RST) segments
1428	 *   are validated by checking their SEQ-fields.  A reset is
1429	 *   valid if its sequence number is in the window.
1430	 * Note: this does not take into account delayed ACKs, so
1431	 *   we should test against last_ack_sent instead of rcv_nxt.
1432	 *   The sequence number in the reset segment is normally an
1433	 *   echo of our outgoing acknowledgement numbers, but some hosts
1434	 *   send a reset with the sequence number at the rightmost edge
1435	 *   of our receive window, and we have to handle this case.
1436	 * If we have multiple segments in flight, the initial reset
1437	 * segment sequence numbers will be to the left of last_ack_sent,
1438	 * but they will eventually catch up.
1439	 * In any case, it never made sense to trim reset segments to
1440	 * fit the receive window since RFC 1122 says:
1441	 *   4.2.2.12  RST Segment: RFC-793 Section 3.4
1442	 *
1443	 *    A TCP SHOULD allow a received RST segment to include data.
1444	 *
1445	 *    DISCUSSION
1446	 *         It has been suggested that a RST segment could contain
1447	 *         ASCII text that encoded and explained the cause of the
1448	 *         RST.  No standard has yet been established for such
1449	 *         data.
1450	 *
1451	 * If the reset segment passes the sequence number test examine
1452	 * the state:
1453	 *    SYN_RECEIVED STATE:
1454	 *	If passive open, return to LISTEN state.
1455	 *	If active open, inform user that connection was refused.
1456	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1457	 *	Inform user that connection was reset, and close tcb.
1458	 *    CLOSING, LAST_ACK STATES:
1459	 *	Close the tcb.
1460	 *    TIME_WAIT STATE:
1461	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
1462	 *      RFC 1337.
1463	 */
1464	if (thflags & TH_RST) {
1465		if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1466		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1467			switch (tp->t_state) {
1468
1469			case TCPS_SYN_RECEIVED:
1470				so->so_error = ECONNREFUSED;
1471				goto close;
1472
1473			case TCPS_ESTABLISHED:
1474			case TCPS_FIN_WAIT_1:
1475			case TCPS_FIN_WAIT_2:
1476			case TCPS_CLOSE_WAIT:
1477				so->so_error = ECONNRESET;
1478			close:
1479				tp->t_state = TCPS_CLOSED;
1480				tcpstat.tcps_drops++;
1481				tp = tcp_close(tp);
1482				break;
1483
1484			case TCPS_CLOSING:
1485			case TCPS_LAST_ACK:
1486				tp = tcp_close(tp);
1487				break;
1488
1489			case TCPS_TIME_WAIT:
1490				break;
1491			}
1492		}
1493		goto drop;
1494	}
1495
1496	/*
1497	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1498	 * and it's less than ts_recent, drop it.
1499	 */
1500	if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent &&
1501	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1502
1503		/* Check to see if ts_recent is over 24 days old.  */
1504		if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1505			/*
1506			 * Invalidate ts_recent.  If this segment updates
1507			 * ts_recent, the age will be reset later and ts_recent
1508			 * will get a valid value.  If it does not, setting
1509			 * ts_recent to zero will at least satisfy the
1510			 * requirement that zero be placed in the timestamp
1511			 * echo reply when ts_recent isn't valid.  The
1512			 * age isn't reset until we get a valid ts_recent
1513			 * because we don't want out-of-order segments to be
1514			 * dropped when ts_recent is old.
1515			 */
1516			tp->ts_recent = 0;
1517		} else {
1518			tcpstat.tcps_rcvduppack++;
1519			tcpstat.tcps_rcvdupbyte += tlen;
1520			tcpstat.tcps_pawsdrop++;
1521			goto dropafterack;
1522		}
1523	}
1524
1525	/*
1526	 * T/TCP mechanism
1527	 *   If T/TCP was negotiated and the segment doesn't have CC,
1528	 *   or if its CC is wrong then drop the segment.
1529	 *   RST segments do not have to comply with this.
1530	 */
1531	if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) &&
1532	    ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc))
1533 		goto dropafterack;
1534
1535	/*
1536	 * In the SYN-RECEIVED state, validate that the packet belongs to
1537	 * this connection before trimming the data to fit the receive
1538	 * window.  Check the sequence number versus IRS since we know
1539	 * the sequence numbers haven't wrapped.  This is a partial fix
1540	 * for the "LAND" DoS attack.
1541	 */
1542	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs))
1543		goto maybedropwithreset;
1544
1545	todrop = tp->rcv_nxt - th->th_seq;
1546	if (todrop > 0) {
1547		if (thflags & TH_SYN) {
1548			thflags &= ~TH_SYN;
1549			th->th_seq++;
1550			if (th->th_urp > 1)
1551				th->th_urp--;
1552			else
1553				thflags &= ~TH_URG;
1554			todrop--;
1555		}
1556		/*
1557		 * Following if statement from Stevens, vol. 2, p. 960.
1558		 */
1559		if (todrop > tlen
1560		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1561			/*
1562			 * Any valid FIN must be to the left of the window.
1563			 * At this point the FIN must be a duplicate or out
1564			 * of sequence; drop it.
1565			 */
1566			thflags &= ~TH_FIN;
1567
1568			/*
1569			 * Send an ACK to resynchronize and drop any data.
1570			 * But keep on processing for RST or ACK.
1571			 */
1572			tp->t_flags |= TF_ACKNOW;
1573			todrop = tlen;
1574			tcpstat.tcps_rcvduppack++;
1575			tcpstat.tcps_rcvdupbyte += todrop;
1576		} else {
1577			tcpstat.tcps_rcvpartduppack++;
1578			tcpstat.tcps_rcvpartdupbyte += todrop;
1579		}
1580		drop_hdrlen += todrop;	/* drop from the top afterwards */
1581		th->th_seq += todrop;
1582		tlen -= todrop;
1583		if (th->th_urp > todrop)
1584			th->th_urp -= todrop;
1585		else {
1586			thflags &= ~TH_URG;
1587			th->th_urp = 0;
1588		}
1589	}
1590
1591	/*
1592	 * If new data are received on a connection after the
1593	 * user processes are gone, then RST the other end.
1594	 */
1595	if ((so->so_state & SS_NOFDREF) &&
1596	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1597		tp = tcp_close(tp);
1598		tcpstat.tcps_rcvafterclose++;
1599		goto dropwithreset;
1600	}
1601
1602	/*
1603	 * If segment ends after window, drop trailing data
1604	 * (and PUSH and FIN); if nothing left, just ACK.
1605	 */
1606	todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1607	if (todrop > 0) {
1608		tcpstat.tcps_rcvpackafterwin++;
1609		if (todrop >= tlen) {
1610			tcpstat.tcps_rcvbyteafterwin += tlen;
1611			/*
1612			 * If a new connection request is received
1613			 * while in TIME_WAIT, drop the old connection
1614			 * and start over if the sequence numbers
1615			 * are above the previous ones.
1616			 */
1617			if (thflags & TH_SYN &&
1618			    tp->t_state == TCPS_TIME_WAIT &&
1619			    SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1620				iss = tp->snd_nxt + TCP_ISSINCR;
1621				tp = tcp_close(tp);
1622				goto findpcb;
1623			}
1624			/*
1625			 * If window is closed can only take segments at
1626			 * window edge, and have to drop data and PUSH from
1627			 * incoming segments.  Continue processing, but
1628			 * remember to ack.  Otherwise, drop segment
1629			 * and ack.
1630			 */
1631			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1632				tp->t_flags |= TF_ACKNOW;
1633				tcpstat.tcps_rcvwinprobe++;
1634			} else
1635				goto dropafterack;
1636		} else
1637			tcpstat.tcps_rcvbyteafterwin += todrop;
1638		m_adj(m, -todrop);
1639		tlen -= todrop;
1640		thflags &= ~(TH_PUSH|TH_FIN);
1641	}
1642
1643	/*
1644	 * If last ACK falls within this segment's sequence numbers,
1645	 * record its timestamp.
1646	 * NOTE that the test is modified according to the latest
1647	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1648	 */
1649	if ((to.to_flag & TOF_TS) != 0 &&
1650	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1651		tp->ts_recent_age = ticks;
1652		tp->ts_recent = to.to_tsval;
1653	}
1654
1655	/*
1656	 * If a SYN is in the window, then this is an
1657	 * error and we send an RST and drop the connection.
1658	 */
1659	if (thflags & TH_SYN) {
1660		tp = tcp_drop(tp, ECONNRESET);
1661		goto dropwithreset;
1662	}
1663
1664	/*
1665	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
1666	 * flag is on (half-synchronized state), then queue data for
1667	 * later processing; else drop segment and return.
1668	 */
1669	if ((thflags & TH_ACK) == 0) {
1670		if (tp->t_state == TCPS_SYN_RECEIVED ||
1671		    (tp->t_flags & TF_NEEDSYN))
1672			goto step6;
1673		else
1674			goto drop;
1675	}
1676
1677	/*
1678	 * Ack processing.
1679	 */
1680	switch (tp->t_state) {
1681
1682	/*
1683	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1684	 * ESTABLISHED state and continue processing.
1685	 * The ACK was checked above.
1686	 */
1687	case TCPS_SYN_RECEIVED:
1688
1689		tcpstat.tcps_connects++;
1690		soisconnected(so);
1691		/* Do window scaling? */
1692		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1693			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
1694			tp->snd_scale = tp->requested_s_scale;
1695			tp->rcv_scale = tp->request_r_scale;
1696		}
1697		/*
1698		 * Upon successful completion of 3-way handshake,
1699		 * update cache.CC if it was undefined, pass any queued
1700		 * data to the user, and advance state appropriately.
1701		 */
1702		if ((taop = tcp_gettaocache(inp)) != NULL &&
1703		    taop->tao_cc == 0)
1704			taop->tao_cc = tp->cc_recv;
1705
1706		/*
1707		 * Make transitions:
1708		 *      SYN-RECEIVED  -> ESTABLISHED
1709		 *      SYN-RECEIVED* -> FIN-WAIT-1
1710		 */
1711		tp->t_starttime = ticks;
1712		if (tp->t_flags & TF_NEEDFIN) {
1713			tp->t_state = TCPS_FIN_WAIT_1;
1714			tp->t_flags &= ~TF_NEEDFIN;
1715		} else {
1716			tp->t_state = TCPS_ESTABLISHED;
1717			callout_reset(tp->tt_keep, tcp_keepidle,
1718				      tcp_timer_keep, tp);
1719		}
1720		/*
1721		 * If segment contains data or FIN, will call tcp_reass()
1722		 * later; if not, do so now to pass queued data to user.
1723		 */
1724		if (tlen == 0 && (thflags & TH_FIN) == 0)
1725			(void) tcp_reass(tp, (struct tcphdr *)0, 0,
1726			    (struct mbuf *)0);
1727		tp->snd_wl1 = th->th_seq - 1;
1728		/* fall into ... */
1729
1730	/*
1731	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1732	 * ACKs.  If the ack is in the range
1733	 *	tp->snd_una < th->th_ack <= tp->snd_max
1734	 * then advance tp->snd_una to th->th_ack and drop
1735	 * data from the retransmission queue.  If this ACK reflects
1736	 * more up to date window information we update our window information.
1737	 */
1738	case TCPS_ESTABLISHED:
1739	case TCPS_FIN_WAIT_1:
1740	case TCPS_FIN_WAIT_2:
1741	case TCPS_CLOSE_WAIT:
1742	case TCPS_CLOSING:
1743	case TCPS_LAST_ACK:
1744	case TCPS_TIME_WAIT:
1745
1746		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1747			if (tlen == 0 && tiwin == tp->snd_wnd) {
1748				tcpstat.tcps_rcvdupack++;
1749				/*
1750				 * If we have outstanding data (other than
1751				 * a window probe), this is a completely
1752				 * duplicate ack (ie, window info didn't
1753				 * change), the ack is the biggest we've
1754				 * seen and we've seen exactly our rexmt
1755				 * threshold of them, assume a packet
1756				 * has been dropped and retransmit it.
1757				 * Kludge snd_nxt & the congestion
1758				 * window so we send only this one
1759				 * packet.
1760				 *
1761				 * We know we're losing at the current
1762				 * window size so do congestion avoidance
1763				 * (set ssthresh to half the current window
1764				 * and pull our congestion window back to
1765				 * the new ssthresh).
1766				 *
1767				 * Dup acks mean that packets have left the
1768				 * network (they're now cached at the receiver)
1769				 * so bump cwnd by the amount in the receiver
1770				 * to keep a constant cwnd packets in the
1771				 * network.
1772				 */
1773				if (!callout_active(tp->tt_rexmt) ||
1774				    th->th_ack != tp->snd_una)
1775					tp->t_dupacks = 0;
1776				else if (++tp->t_dupacks == tcprexmtthresh) {
1777					tcp_seq onxt = tp->snd_nxt;
1778					u_int win =
1779					    min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1780						tp->t_maxseg;
1781					if (tcp_do_newreno && SEQ_LT(th->th_ack,
1782					    tp->snd_recover)) {
1783						/* False retransmit, should not
1784						 * cut window
1785						 */
1786						tp->snd_cwnd += tp->t_maxseg;
1787						tp->t_dupacks = 0;
1788						(void) tcp_output(tp);
1789						goto drop;
1790					}
1791					if (win < 2)
1792						win = 2;
1793					tp->snd_ssthresh = win * tp->t_maxseg;
1794					tp->snd_recover = tp->snd_max;
1795					callout_stop(tp->tt_rexmt);
1796					tp->t_rtttime = 0;
1797					tp->snd_nxt = th->th_ack;
1798					tp->snd_cwnd = tp->t_maxseg;
1799					(void) tcp_output(tp);
1800					tp->snd_cwnd = tp->snd_ssthresh +
1801					       tp->t_maxseg * tp->t_dupacks;
1802					if (SEQ_GT(onxt, tp->snd_nxt))
1803						tp->snd_nxt = onxt;
1804					goto drop;
1805				} else if (tp->t_dupacks > tcprexmtthresh) {
1806					tp->snd_cwnd += tp->t_maxseg;
1807					(void) tcp_output(tp);
1808					goto drop;
1809				}
1810			} else
1811				tp->t_dupacks = 0;
1812			break;
1813		}
1814		/*
1815		 * If the congestion window was inflated to account
1816		 * for the other side's cached packets, retract it.
1817		 */
1818		if (tcp_do_newreno == 0) {
1819                        if (tp->t_dupacks >= tcprexmtthresh &&
1820                                tp->snd_cwnd > tp->snd_ssthresh)
1821                                tp->snd_cwnd = tp->snd_ssthresh;
1822                        tp->t_dupacks = 0;
1823                } else if (tp->t_dupacks >= tcprexmtthresh &&
1824		    !tcp_newreno(tp, th)) {
1825                        /*
1826                         * Window inflation should have left us with approx.
1827                         * snd_ssthresh outstanding data.  But in case we
1828                         * would be inclined to send a burst, better to do
1829                         * it via the slow start mechanism.
1830                         */
1831			if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
1832                                tp->snd_cwnd =
1833				    tp->snd_max - th->th_ack + tp->t_maxseg;
1834			else
1835                        	tp->snd_cwnd = tp->snd_ssthresh;
1836                        tp->t_dupacks = 0;
1837                }
1838		if (SEQ_GT(th->th_ack, tp->snd_max)) {
1839			tcpstat.tcps_rcvacktoomuch++;
1840			goto dropafterack;
1841		}
1842		/*
1843		 *  If we reach this point, ACK is not a duplicate,
1844		 *     i.e., it ACKs something we sent.
1845		 */
1846		if (tp->t_flags & TF_NEEDSYN) {
1847			/*
1848			 * T/TCP: Connection was half-synchronized, and our
1849			 * SYN has been ACK'd (so connection is now fully
1850			 * synchronized).  Go to non-starred state,
1851			 * increment snd_una for ACK of SYN, and check if
1852			 * we can do window scaling.
1853			 */
1854			tp->t_flags &= ~TF_NEEDSYN;
1855			tp->snd_una++;
1856			/* Do window scaling? */
1857			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1858				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
1859				tp->snd_scale = tp->requested_s_scale;
1860				tp->rcv_scale = tp->request_r_scale;
1861			}
1862		}
1863
1864process_ACK:
1865		acked = th->th_ack - tp->snd_una;
1866		tcpstat.tcps_rcvackpack++;
1867		tcpstat.tcps_rcvackbyte += acked;
1868
1869		/*
1870		 * If we just performed our first retransmit, and the ACK
1871		 * arrives within our recovery window, then it was a mistake
1872		 * to do the retransmit in the first place.  Recover our
1873		 * original cwnd and ssthresh, and proceed to transmit where
1874		 * we left off.
1875		 */
1876		if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
1877			tp->snd_cwnd = tp->snd_cwnd_prev;
1878			tp->snd_ssthresh = tp->snd_ssthresh_prev;
1879			tp->snd_nxt = tp->snd_max;
1880			tp->t_badrxtwin = 0;	/* XXX probably not required */
1881		}
1882
1883		/*
1884		 * If we have a timestamp reply, update smoothed
1885		 * round trip time.  If no timestamp is present but
1886		 * transmit timer is running and timed sequence
1887		 * number was acked, update smoothed round trip time.
1888		 * Since we now have an rtt measurement, cancel the
1889		 * timer backoff (cf., Phil Karn's retransmit alg.).
1890		 * Recompute the initial retransmit timer.
1891		 */
1892		if (to.to_flag & TOF_TS)
1893			tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
1894		else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
1895			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
1896
1897		/*
1898		 * If all outstanding data is acked, stop retransmit
1899		 * timer and remember to restart (more output or persist).
1900		 * If there is more data to be acked, restart retransmit
1901		 * timer, using current (possibly backed-off) value.
1902		 */
1903		if (th->th_ack == tp->snd_max) {
1904			callout_stop(tp->tt_rexmt);
1905			needoutput = 1;
1906		} else if (!callout_active(tp->tt_persist))
1907			callout_reset(tp->tt_rexmt, tp->t_rxtcur,
1908				      tcp_timer_rexmt, tp);
1909
1910		/*
1911		 * If no data (only SYN) was ACK'd,
1912		 *    skip rest of ACK processing.
1913		 */
1914		if (acked == 0)
1915			goto step6;
1916
1917		/*
1918		 * When new data is acked, open the congestion window.
1919		 * If the window gives us less than ssthresh packets
1920		 * in flight, open exponentially (maxseg per packet).
1921		 * Otherwise open linearly: maxseg per window
1922		 * (maxseg^2 / cwnd per packet).
1923		 */
1924		{
1925		register u_int cw = tp->snd_cwnd;
1926		register u_int incr = tp->t_maxseg;
1927
1928		if (cw > tp->snd_ssthresh)
1929			incr = incr * incr / cw;
1930		if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover))
1931			tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
1932		}
1933		if (acked > so->so_snd.sb_cc) {
1934			tp->snd_wnd -= so->so_snd.sb_cc;
1935			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1936			ourfinisacked = 1;
1937		} else {
1938			sbdrop(&so->so_snd, acked);
1939			tp->snd_wnd -= acked;
1940			ourfinisacked = 0;
1941		}
1942		sowwakeup(so);
1943		tp->snd_una = th->th_ack;
1944		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1945			tp->snd_nxt = tp->snd_una;
1946
1947		switch (tp->t_state) {
1948
1949		/*
1950		 * In FIN_WAIT_1 STATE in addition to the processing
1951		 * for the ESTABLISHED state if our FIN is now acknowledged
1952		 * then enter FIN_WAIT_2.
1953		 */
1954		case TCPS_FIN_WAIT_1:
1955			if (ourfinisacked) {
1956				/*
1957				 * If we can't receive any more
1958				 * data, then closing user can proceed.
1959				 * Starting the timer is contrary to the
1960				 * specification, but if we don't get a FIN
1961				 * we'll hang forever.
1962				 */
1963				if (so->so_state & SS_CANTRCVMORE) {
1964					soisdisconnected(so);
1965					callout_reset(tp->tt_2msl, tcp_maxidle,
1966						      tcp_timer_2msl, tp);
1967				}
1968				tp->t_state = TCPS_FIN_WAIT_2;
1969			}
1970			break;
1971
1972	 	/*
1973		 * In CLOSING STATE in addition to the processing for
1974		 * the ESTABLISHED state if the ACK acknowledges our FIN
1975		 * then enter the TIME-WAIT state, otherwise ignore
1976		 * the segment.
1977		 */
1978		case TCPS_CLOSING:
1979			if (ourfinisacked) {
1980				tp->t_state = TCPS_TIME_WAIT;
1981				tcp_canceltimers(tp);
1982				/* Shorten TIME_WAIT [RFC-1644, p.28] */
1983				if (tp->cc_recv != 0 &&
1984				    (ticks - tp->t_starttime) < tcp_msl)
1985					callout_reset(tp->tt_2msl,
1986						      tp->t_rxtcur *
1987						      TCPTV_TWTRUNC,
1988						      tcp_timer_2msl, tp);
1989				else
1990					callout_reset(tp->tt_2msl, 2 * tcp_msl,
1991						      tcp_timer_2msl, tp);
1992				soisdisconnected(so);
1993			}
1994			break;
1995
1996		/*
1997		 * In LAST_ACK, we may still be waiting for data to drain
1998		 * and/or to be acked, as well as for the ack of our FIN.
1999		 * If our FIN is now acknowledged, delete the TCB,
2000		 * enter the closed state and return.
2001		 */
2002		case TCPS_LAST_ACK:
2003			if (ourfinisacked) {
2004				tp = tcp_close(tp);
2005				goto drop;
2006			}
2007			break;
2008
2009		/*
2010		 * In TIME_WAIT state the only thing that should arrive
2011		 * is a retransmission of the remote FIN.  Acknowledge
2012		 * it and restart the finack timer.
2013		 */
2014		case TCPS_TIME_WAIT:
2015			callout_reset(tp->tt_2msl, 2 * tcp_msl,
2016				      tcp_timer_2msl, tp);
2017			goto dropafterack;
2018		}
2019	}
2020
2021step6:
2022	/*
2023	 * Update window information.
2024	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2025	 */
2026	if ((thflags & TH_ACK) &&
2027	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2028	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2029	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2030		/* keep track of pure window updates */
2031		if (tlen == 0 &&
2032		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2033			tcpstat.tcps_rcvwinupd++;
2034		tp->snd_wnd = tiwin;
2035		tp->snd_wl1 = th->th_seq;
2036		tp->snd_wl2 = th->th_ack;
2037		if (tp->snd_wnd > tp->max_sndwnd)
2038			tp->max_sndwnd = tp->snd_wnd;
2039		needoutput = 1;
2040	}
2041
2042	/*
2043	 * Process segments with URG.
2044	 */
2045	if ((thflags & TH_URG) && th->th_urp &&
2046	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2047		/*
2048		 * This is a kludge, but if we receive and accept
2049		 * random urgent pointers, we'll crash in
2050		 * soreceive.  It's hard to imagine someone
2051		 * actually wanting to send this much urgent data.
2052		 */
2053		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2054			th->th_urp = 0;			/* XXX */
2055			thflags &= ~TH_URG;		/* XXX */
2056			goto dodata;			/* XXX */
2057		}
2058		/*
2059		 * If this segment advances the known urgent pointer,
2060		 * then mark the data stream.  This should not happen
2061		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2062		 * a FIN has been received from the remote side.
2063		 * In these states we ignore the URG.
2064		 *
2065		 * According to RFC961 (Assigned Protocols),
2066		 * the urgent pointer points to the last octet
2067		 * of urgent data.  We continue, however,
2068		 * to consider it to indicate the first octet
2069		 * of data past the urgent section as the original
2070		 * spec states (in one of two places).
2071		 */
2072		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2073			tp->rcv_up = th->th_seq + th->th_urp;
2074			so->so_oobmark = so->so_rcv.sb_cc +
2075			    (tp->rcv_up - tp->rcv_nxt) - 1;
2076			if (so->so_oobmark == 0)
2077				so->so_state |= SS_RCVATMARK;
2078			sohasoutofband(so);
2079			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2080		}
2081		/*
2082		 * Remove out of band data so doesn't get presented to user.
2083		 * This can happen independent of advancing the URG pointer,
2084		 * but if two URG's are pending at once, some out-of-band
2085		 * data may creep in... ick.
2086		 */
2087		if (th->th_urp <= (u_long)tlen
2088#ifdef SO_OOBINLINE
2089		     && (so->so_options & SO_OOBINLINE) == 0
2090#endif
2091		     )
2092			tcp_pulloutofband(so, th, m,
2093				drop_hdrlen);	/* hdr drop is delayed */
2094	} else
2095		/*
2096		 * If no out of band data is expected,
2097		 * pull receive urgent pointer along
2098		 * with the receive window.
2099		 */
2100		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2101			tp->rcv_up = tp->rcv_nxt;
2102dodata:							/* XXX */
2103
2104	/*
2105	 * Process the segment text, merging it into the TCP sequencing queue,
2106	 * and arranging for acknowledgment of receipt if necessary.
2107	 * This process logically involves adjusting tp->rcv_wnd as data
2108	 * is presented to the user (this happens in tcp_usrreq.c,
2109	 * case PRU_RCVD).  If a FIN has already been received on this
2110	 * connection then we just ignore the text.
2111	 */
2112	if ((tlen || (thflags&TH_FIN)) &&
2113	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2114		m_adj(m, drop_hdrlen);	/* delayed header drop */
2115		TCP_REASS(tp, th, &tlen, m, so, thflags);
2116		/*
2117		 * Note the amount of data that peer has sent into
2118		 * our window, in order to estimate the sender's
2119		 * buffer size.
2120		 */
2121		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2122	} else {
2123		m_freem(m);
2124		thflags &= ~TH_FIN;
2125	}
2126
2127	/*
2128	 * If FIN is received ACK the FIN and let the user know
2129	 * that the connection is closing.
2130	 */
2131	if (thflags & TH_FIN) {
2132		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2133			socantrcvmore(so);
2134			/*
2135			 *  If connection is half-synchronized
2136			 *  (ie NEEDSYN flag on) then delay ACK,
2137			 *  so it may be piggybacked when SYN is sent.
2138			 *  Otherwise, since we received a FIN then no
2139			 *  more input can be expected, send ACK now.
2140			 */
2141			if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN))
2142                                callout_reset(tp->tt_delack, tcp_delacktime,
2143                                    tcp_timer_delack, tp);
2144			else
2145				tp->t_flags |= TF_ACKNOW;
2146			tp->rcv_nxt++;
2147		}
2148		switch (tp->t_state) {
2149
2150	 	/*
2151		 * In SYN_RECEIVED and ESTABLISHED STATES
2152		 * enter the CLOSE_WAIT state.
2153		 */
2154		case TCPS_SYN_RECEIVED:
2155			tp->t_starttime = ticks;
2156			/*FALLTHROUGH*/
2157		case TCPS_ESTABLISHED:
2158			tp->t_state = TCPS_CLOSE_WAIT;
2159			break;
2160
2161	 	/*
2162		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2163		 * enter the CLOSING state.
2164		 */
2165		case TCPS_FIN_WAIT_1:
2166			tp->t_state = TCPS_CLOSING;
2167			break;
2168
2169	 	/*
2170		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2171		 * starting the time-wait timer, turning off the other
2172		 * standard timers.
2173		 */
2174		case TCPS_FIN_WAIT_2:
2175			tp->t_state = TCPS_TIME_WAIT;
2176			tcp_canceltimers(tp);
2177			/* Shorten TIME_WAIT [RFC-1644, p.28] */
2178			if (tp->cc_recv != 0 &&
2179			    (ticks - tp->t_starttime) < tcp_msl) {
2180				callout_reset(tp->tt_2msl,
2181					      tp->t_rxtcur * TCPTV_TWTRUNC,
2182					      tcp_timer_2msl, tp);
2183				/* For transaction client, force ACK now. */
2184				tp->t_flags |= TF_ACKNOW;
2185			}
2186			else
2187				callout_reset(tp->tt_2msl, 2 * tcp_msl,
2188					      tcp_timer_2msl, tp);
2189			soisdisconnected(so);
2190			break;
2191
2192		/*
2193		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2194		 */
2195		case TCPS_TIME_WAIT:
2196			callout_reset(tp->tt_2msl, 2 * tcp_msl,
2197				      tcp_timer_2msl, tp);
2198			break;
2199		}
2200	}
2201#ifdef TCPDEBUG
2202	if (so->so_options & SO_DEBUG)
2203		tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
2204			  &tcp_savetcp, 0);
2205#endif
2206
2207	/*
2208	 * Return any desired output.
2209	 */
2210	if (needoutput || (tp->t_flags & TF_ACKNOW))
2211		(void) tcp_output(tp);
2212	return;
2213
2214dropafterack:
2215	/*
2216	 * Generate an ACK dropping incoming segment if it occupies
2217	 * sequence space, where the ACK reflects our state.
2218	 *
2219	 * We can now skip the test for the RST flag since all
2220	 * paths to this code happen after packets containing
2221	 * RST have been dropped.
2222	 *
2223	 * In the SYN-RECEIVED state, don't send an ACK unless the
2224	 * segment we received passes the SYN-RECEIVED ACK test.
2225	 * If it fails send a RST.  This breaks the loop in the
2226	 * "LAND" DoS attack, and also prevents an ACK storm
2227	 * between two listening ports that have been sent forged
2228	 * SYN segments, each with the source address of the other.
2229	 */
2230	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2231	    (SEQ_GT(tp->snd_una, th->th_ack) ||
2232	     SEQ_GT(th->th_ack, tp->snd_max)) )
2233		goto maybedropwithreset;
2234#ifdef TCPDEBUG
2235	if (so->so_options & SO_DEBUG)
2236		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2237			  &tcp_savetcp, 0);
2238#endif
2239	m_freem(m);
2240	tp->t_flags |= TF_ACKNOW;
2241	(void) tcp_output(tp);
2242	return;
2243
2244
2245	/*
2246	 * Conditionally drop with reset or just drop depending on whether
2247	 * we think we are under attack or not.
2248	 */
2249maybedropwithreset:
2250	if (badport_bandlim(1) < 0)
2251		goto drop;
2252	/* fall through */
2253dropwithreset:
2254#ifdef TCP_RESTRICT_RST
2255	if (restrict_rst)
2256		goto drop;
2257#endif
2258	/*
2259	 * Generate a RST, dropping incoming segment.
2260	 * Make ACK acceptable to originator of segment.
2261	 * Don't bother to respond if destination was broadcast/multicast.
2262	 */
2263	if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
2264		goto drop;
2265#ifdef INET6
2266	if (isipv6) {
2267		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2268		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
2269			goto drop;
2270	} else
2271#endif /* INET6 */
2272	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2273	    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2274	    ip->ip_src.s_addr == htonl(INADDR_BROADCAST))
2275		goto drop;
2276	/* IPv6 anycast check is done at tcp6_input() */
2277#ifdef TCPDEBUG
2278	if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2279		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2280			  &tcp_savetcp, 0);
2281#endif
2282	if (thflags & TH_ACK)
2283		/* mtod() below is safe as long as hdr dropping is delayed */
2284		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
2285			    TH_RST);
2286	else {
2287		if (thflags & TH_SYN)
2288			tlen++;
2289		/* mtod() below is safe as long as hdr dropping is delayed */
2290		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
2291			    (tcp_seq)0, TH_RST|TH_ACK);
2292	}
2293	/* destroy temporarily created socket */
2294	if (dropsocket)
2295		(void) soabort(so);
2296	return;
2297
2298drop:
2299	/*
2300	 * Drop space held by incoming segment and return.
2301	 */
2302#ifdef TCPDEBUG
2303	if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
2304		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
2305			  &tcp_savetcp, 0);
2306#endif
2307	m_freem(m);
2308	/* destroy temporarily created socket */
2309	if (dropsocket)
2310		(void) soabort(so);
2311	return;
2312}
2313
2314static void
2315tcp_dooptions(tp, cp, cnt, th, to)
2316	struct tcpcb *tp;
2317	u_char *cp;
2318	int cnt;
2319	struct tcphdr *th;
2320	struct tcpopt *to;
2321{
2322	u_short mss = 0;
2323	int opt, optlen;
2324
2325	for (; cnt > 0; cnt -= optlen, cp += optlen) {
2326		opt = cp[0];
2327		if (opt == TCPOPT_EOL)
2328			break;
2329		if (opt == TCPOPT_NOP)
2330			optlen = 1;
2331		else {
2332			optlen = cp[1];
2333			if (optlen <= 0)
2334				break;
2335		}
2336		switch (opt) {
2337
2338		default:
2339			continue;
2340
2341		case TCPOPT_MAXSEG:
2342			if (optlen != TCPOLEN_MAXSEG)
2343				continue;
2344			if (!(th->th_flags & TH_SYN))
2345				continue;
2346			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
2347			NTOHS(mss);
2348			break;
2349
2350		case TCPOPT_WINDOW:
2351			if (optlen != TCPOLEN_WINDOW)
2352				continue;
2353			if (!(th->th_flags & TH_SYN))
2354				continue;
2355			tp->t_flags |= TF_RCVD_SCALE;
2356			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
2357			break;
2358
2359		case TCPOPT_TIMESTAMP:
2360			if (optlen != TCPOLEN_TIMESTAMP)
2361				continue;
2362			to->to_flag |= TOF_TS;
2363			bcopy((char *)cp + 2,
2364			    (char *)&to->to_tsval, sizeof(to->to_tsval));
2365			NTOHL(to->to_tsval);
2366			bcopy((char *)cp + 6,
2367			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
2368			NTOHL(to->to_tsecr);
2369
2370			/*
2371			 * A timestamp received in a SYN makes
2372			 * it ok to send timestamp requests and replies.
2373			 */
2374			if (th->th_flags & TH_SYN) {
2375				tp->t_flags |= TF_RCVD_TSTMP;
2376				tp->ts_recent = to->to_tsval;
2377				tp->ts_recent_age = ticks;
2378			}
2379			break;
2380		case TCPOPT_CC:
2381			if (optlen != TCPOLEN_CC)
2382				continue;
2383			to->to_flag |= TOF_CC;
2384			bcopy((char *)cp + 2,
2385			    (char *)&to->to_cc, sizeof(to->to_cc));
2386			NTOHL(to->to_cc);
2387			/*
2388			 * A CC or CC.new option received in a SYN makes
2389			 * it ok to send CC in subsequent segments.
2390			 */
2391			if (th->th_flags & TH_SYN)
2392				tp->t_flags |= TF_RCVD_CC;
2393			break;
2394		case TCPOPT_CCNEW:
2395			if (optlen != TCPOLEN_CC)
2396				continue;
2397			if (!(th->th_flags & TH_SYN))
2398				continue;
2399			to->to_flag |= TOF_CCNEW;
2400			bcopy((char *)cp + 2,
2401			    (char *)&to->to_cc, sizeof(to->to_cc));
2402			NTOHL(to->to_cc);
2403			/*
2404			 * A CC or CC.new option received in a SYN makes
2405			 * it ok to send CC in subsequent segments.
2406			 */
2407			tp->t_flags |= TF_RCVD_CC;
2408			break;
2409		case TCPOPT_CCECHO:
2410			if (optlen != TCPOLEN_CC)
2411				continue;
2412			if (!(th->th_flags & TH_SYN))
2413				continue;
2414			to->to_flag |= TOF_CCECHO;
2415			bcopy((char *)cp + 2,
2416			    (char *)&to->to_ccecho, sizeof(to->to_ccecho));
2417			NTOHL(to->to_ccecho);
2418			break;
2419		}
2420	}
2421	if (th->th_flags & TH_SYN)
2422		tcp_mss(tp, mss);	/* sets t_maxseg */
2423}
2424
2425/*
2426 * Pull out of band byte out of a segment so
2427 * it doesn't appear in the user's data queue.
2428 * It is still reflected in the segment length for
2429 * sequencing purposes.
2430 */
2431static void
2432tcp_pulloutofband(so, th, m, off)
2433	struct socket *so;
2434	struct tcphdr *th;
2435	register struct mbuf *m;
2436	int off;		/* delayed to be droped hdrlen */
2437{
2438	int cnt = off + th->th_urp - 1;
2439
2440	while (cnt >= 0) {
2441		if (m->m_len > cnt) {
2442			char *cp = mtod(m, caddr_t) + cnt;
2443			struct tcpcb *tp = sototcpcb(so);
2444
2445			tp->t_iobc = *cp;
2446			tp->t_oobflags |= TCPOOB_HAVEDATA;
2447			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2448			m->m_len--;
2449			if (m->m_flags & M_PKTHDR)
2450				m->m_pkthdr.len--;
2451			return;
2452		}
2453		cnt -= m->m_len;
2454		m = m->m_next;
2455		if (m == 0)
2456			break;
2457	}
2458	panic("tcp_pulloutofband");
2459}
2460
2461/*
2462 * Collect new round-trip time estimate
2463 * and update averages and current timeout.
2464 */
2465static void
2466tcp_xmit_timer(tp, rtt)
2467	register struct tcpcb *tp;
2468	int rtt;
2469{
2470	register int delta;
2471
2472	tcpstat.tcps_rttupdated++;
2473	tp->t_rttupdated++;
2474	if (tp->t_srtt != 0) {
2475		/*
2476		 * srtt is stored as fixed point with 5 bits after the
2477		 * binary point (i.e., scaled by 8).  The following magic
2478		 * is equivalent to the smoothing algorithm in rfc793 with
2479		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2480		 * point).  Adjust rtt to origin 0.
2481		 */
2482		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2483			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2484
2485		if ((tp->t_srtt += delta) <= 0)
2486			tp->t_srtt = 1;
2487
2488		/*
2489		 * We accumulate a smoothed rtt variance (actually, a
2490		 * smoothed mean difference), then set the retransmit
2491		 * timer to smoothed rtt + 4 times the smoothed variance.
2492		 * rttvar is stored as fixed point with 4 bits after the
2493		 * binary point (scaled by 16).  The following is
2494		 * equivalent to rfc793 smoothing with an alpha of .75
2495		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
2496		 * rfc793's wired-in beta.
2497		 */
2498		if (delta < 0)
2499			delta = -delta;
2500		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2501		if ((tp->t_rttvar += delta) <= 0)
2502			tp->t_rttvar = 1;
2503	} else {
2504		/*
2505		 * No rtt measurement yet - use the unsmoothed rtt.
2506		 * Set the variance to half the rtt (so our first
2507		 * retransmit happens at 3*rtt).
2508		 */
2509		tp->t_srtt = rtt << TCP_RTT_SHIFT;
2510		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2511	}
2512	tp->t_rtttime = 0;
2513	tp->t_rxtshift = 0;
2514
2515	/*
2516	 * the retransmit should happen at rtt + 4 * rttvar.
2517	 * Because of the way we do the smoothing, srtt and rttvar
2518	 * will each average +1/2 tick of bias.  When we compute
2519	 * the retransmit timer, we want 1/2 tick of rounding and
2520	 * 1 extra tick because of +-1/2 tick uncertainty in the
2521	 * firing of the timer.  The bias will give us exactly the
2522	 * 1.5 tick we need.  But, because the bias is
2523	 * statistical, we have to test that we don't drop below
2524	 * the minimum feasible timer (which is 2 ticks).
2525	 */
2526	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2527		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2528
2529	/*
2530	 * We received an ack for a packet that wasn't retransmitted;
2531	 * it is probably safe to discard any error indications we've
2532	 * received recently.  This isn't quite right, but close enough
2533	 * for now (a route might have failed after we sent a segment,
2534	 * and the return path might not be symmetrical).
2535	 */
2536	tp->t_softerror = 0;
2537}
2538
2539/*
2540 * Determine a reasonable value for maxseg size.
2541 * If the route is known, check route for mtu.
2542 * If none, use an mss that can be handled on the outgoing
2543 * interface without forcing IP to fragment; if bigger than
2544 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2545 * to utilize large mbufs.  If no route is found, route has no mtu,
2546 * or the destination isn't local, use a default, hopefully conservative
2547 * size (usually 512 or the default IP max size, but no more than the mtu
2548 * of the interface), as we can't discover anything about intervening
2549 * gateways or networks.  We also initialize the congestion/slow start
2550 * window to be a single segment if the destination isn't local.
2551 * While looking at the routing entry, we also initialize other path-dependent
2552 * parameters from pre-set or cached values in the routing entry.
2553 *
2554 * Also take into account the space needed for options that we
2555 * send regularly.  Make maxseg shorter by that amount to assure
2556 * that we can send maxseg amount of data even when the options
2557 * are present.  Store the upper limit of the length of options plus
2558 * data in maxopd.
2559 *
2560 * NOTE that this routine is only called when we process an incoming
2561 * segment, for outgoing segments only tcp_mssopt is called.
2562 *
2563 * In case of T/TCP, we call this routine during implicit connection
2564 * setup as well (offer = -1), to initialize maxseg from the cached
2565 * MSS of our peer.
2566 */
2567void
2568tcp_mss(tp, offer)
2569	struct tcpcb *tp;
2570	int offer;
2571{
2572	register struct rtentry *rt;
2573	struct ifnet *ifp;
2574	register int rtt, mss;
2575	u_long bufsize;
2576	struct inpcb *inp;
2577	struct socket *so;
2578	struct rmxp_tao *taop;
2579	int origoffer = offer;
2580#ifdef INET6
2581	int isipv6;
2582	int min_protoh;
2583#endif
2584
2585	inp = tp->t_inpcb;
2586#ifdef INET6
2587	isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
2588	min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
2589			    : sizeof (struct tcpiphdr);
2590#else
2591#define min_protoh  (sizeof (struct tcpiphdr))
2592#endif
2593#ifdef INET6
2594	if (isipv6)
2595		rt = tcp_rtlookup6(inp);
2596	else
2597#endif
2598	rt = tcp_rtlookup(inp);
2599	if (rt == NULL) {
2600		tp->t_maxopd = tp->t_maxseg =
2601#ifdef INET6
2602		isipv6 ? tcp_v6mssdflt :
2603#endif /* INET6 */
2604		tcp_mssdflt;
2605		return;
2606	}
2607	ifp = rt->rt_ifp;
2608	so = inp->inp_socket;
2609
2610	taop = rmx_taop(rt->rt_rmx);
2611	/*
2612	 * Offer == -1 means that we didn't receive SYN yet,
2613	 * use cached value in that case;
2614	 */
2615	if (offer == -1)
2616		offer = taop->tao_mssopt;
2617	/*
2618	 * Offer == 0 means that there was no MSS on the SYN segment,
2619	 * in this case we use tcp_mssdflt.
2620	 */
2621	if (offer == 0)
2622		offer =
2623#ifdef INET6
2624			isipv6 ? tcp_v6mssdflt :
2625#endif /* INET6 */
2626			tcp_mssdflt;
2627	else
2628		/*
2629		 * Sanity check: make sure that maxopd will be large
2630		 * enough to allow some data on segments even is the
2631		 * all the option space is used (40bytes).  Otherwise
2632		 * funny things may happen in tcp_output.
2633		 */
2634		offer = max(offer, 64);
2635	taop->tao_mssopt = offer;
2636
2637	/*
2638	 * While we're here, check if there's an initial rtt
2639	 * or rttvar.  Convert from the route-table units
2640	 * to scaled multiples of the slow timeout timer.
2641	 */
2642	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
2643		/*
2644		 * XXX the lock bit for RTT indicates that the value
2645		 * is also a minimum value; this is subject to time.
2646		 */
2647		if (rt->rt_rmx.rmx_locks & RTV_RTT)
2648			tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
2649		tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
2650		tcpstat.tcps_usedrtt++;
2651		if (rt->rt_rmx.rmx_rttvar) {
2652			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
2653			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
2654			tcpstat.tcps_usedrttvar++;
2655		} else {
2656			/* default variation is +- 1 rtt */
2657			tp->t_rttvar =
2658			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
2659		}
2660		TCPT_RANGESET(tp->t_rxtcur,
2661			      ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
2662			      tp->t_rttmin, TCPTV_REXMTMAX);
2663	}
2664	/*
2665	 * if there's an mtu associated with the route, use it
2666	 * else, use the link mtu.
2667	 */
2668	if (rt->rt_rmx.rmx_mtu)
2669		mss = rt->rt_rmx.rmx_mtu - min_protoh;
2670	else
2671	{
2672		mss =
2673#ifdef INET6
2674			(isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu :
2675#endif
2676			 ifp->if_mtu
2677#ifdef INET6
2678			 )
2679#endif
2680			- min_protoh;
2681#ifdef INET6
2682		if (isipv6) {
2683			if (!in6_localaddr(&inp->in6p_faddr))
2684				mss = min(mss, tcp_v6mssdflt);
2685		} else
2686#endif
2687		if (!in_localaddr(inp->inp_faddr))
2688			mss = min(mss, tcp_mssdflt);
2689	}
2690	mss = min(mss, offer);
2691	/*
2692	 * maxopd stores the maximum length of data AND options
2693	 * in a segment; maxseg is the amount of data in a normal
2694	 * segment.  We need to store this value (maxopd) apart
2695	 * from maxseg, because now every segment carries options
2696	 * and thus we normally have somewhat less data in segments.
2697	 */
2698	tp->t_maxopd = mss;
2699
2700	/*
2701	 * In case of T/TCP, origoffer==-1 indicates, that no segments
2702	 * were received yet.  In this case we just guess, otherwise
2703	 * we do the same as before T/TCP.
2704	 */
2705 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2706	    (origoffer == -1 ||
2707	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
2708		mss -= TCPOLEN_TSTAMP_APPA;
2709 	if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
2710	    (origoffer == -1 ||
2711	     (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
2712		mss -= TCPOLEN_CC_APPA;
2713
2714#if	(MCLBYTES & (MCLBYTES - 1)) == 0
2715		if (mss > MCLBYTES)
2716			mss &= ~(MCLBYTES-1);
2717#else
2718		if (mss > MCLBYTES)
2719			mss = mss / MCLBYTES * MCLBYTES;
2720#endif
2721	/*
2722	 * If there's a pipesize, change the socket buffer
2723	 * to that size.  Make the socket buffers an integral
2724	 * number of mss units; if the mss is larger than
2725	 * the socket buffer, decrease the mss.
2726	 */
2727#ifdef RTV_SPIPE
2728	if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
2729#endif
2730		bufsize = so->so_snd.sb_hiwat;
2731	if (bufsize < mss)
2732		mss = bufsize;
2733	else {
2734		bufsize = roundup(bufsize, mss);
2735		if (bufsize > sb_max)
2736			bufsize = sb_max;
2737		(void)sbreserve(&so->so_snd, bufsize, so, NULL);
2738	}
2739	tp->t_maxseg = mss;
2740
2741#ifdef RTV_RPIPE
2742	if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
2743#endif
2744		bufsize = so->so_rcv.sb_hiwat;
2745	if (bufsize > mss) {
2746		bufsize = roundup(bufsize, mss);
2747		if (bufsize > sb_max)
2748			bufsize = sb_max;
2749		(void)sbreserve(&so->so_rcv, bufsize, so, NULL);
2750	}
2751
2752	/*
2753	 * Set the slow-start flight size depending on whether this
2754	 * is a local network or not.
2755	 */
2756	if (
2757#ifdef INET6
2758	    (isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
2759	    (!isipv6 &&
2760#endif
2761	     in_localaddr(inp->inp_faddr)
2762#ifdef INET6
2763	     )
2764#endif
2765	    )
2766		tp->snd_cwnd = mss * ss_fltsz_local;
2767	else
2768		tp->snd_cwnd = mss * ss_fltsz;
2769
2770	if (rt->rt_rmx.rmx_ssthresh) {
2771		/*
2772		 * There's some sort of gateway or interface
2773		 * buffer limit on the path.  Use this to set
2774		 * the slow start threshhold, but set the
2775		 * threshold to no less than 2*mss.
2776		 */
2777		tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
2778		tcpstat.tcps_usedssthresh++;
2779	}
2780}
2781
2782/*
2783 * Determine the MSS option to send on an outgoing SYN.
2784 */
2785int
2786tcp_mssopt(tp)
2787	struct tcpcb *tp;
2788{
2789	struct rtentry *rt;
2790#ifdef INET6
2791	int isipv6;
2792	int min_protoh;
2793#endif
2794
2795#ifdef INET6
2796	isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
2797	min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
2798			    : sizeof (struct tcpiphdr);
2799#else
2800#define min_protoh  (sizeof (struct tcpiphdr))
2801#endif
2802#ifdef INET6
2803	if (isipv6)
2804		rt = tcp_rtlookup6(tp->t_inpcb);
2805	else
2806#endif /* INET6 */
2807	rt = tcp_rtlookup(tp->t_inpcb);
2808	if (rt == NULL)
2809		return
2810#ifdef INET6
2811			isipv6 ? tcp_v6mssdflt :
2812#endif /* INET6 */
2813			tcp_mssdflt;
2814
2815	return rt->rt_ifp->if_mtu - min_protoh;
2816}
2817
2818
2819/*
2820 * Checks for partial ack.  If partial ack arrives, force the retransmission
2821 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
2822 * 1.  By setting snd_nxt to ti_ack, this forces retransmission timer to
2823 * be started again.  If the ack advances at least to tp->snd_recover, return 0.
2824 */
2825static int
2826tcp_newreno(tp, th)
2827	struct tcpcb *tp;
2828	struct tcphdr *th;
2829{
2830	if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2831		tcp_seq onxt = tp->snd_nxt;
2832		u_long  ocwnd = tp->snd_cwnd;
2833
2834		callout_stop(tp->tt_rexmt);
2835		tp->t_rtttime = 0;
2836		tp->snd_nxt = th->th_ack;
2837		/*
2838		 * Set snd_cwnd to one segment beyond acknowledged offset
2839		 * (tp->snd_una has not yet been updated when this function
2840		 *  is called)
2841		 */
2842		tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
2843		(void) tcp_output(tp);
2844		tp->snd_cwnd = ocwnd;
2845		if (SEQ_GT(onxt, tp->snd_nxt))
2846			tp->snd_nxt = onxt;
2847		/*
2848		 * Partial window deflation.  Relies on fact that tp->snd_una
2849		 * not updated yet.
2850		 */
2851		tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
2852		return (1);
2853	}
2854	return (0);
2855}
2856