tcp_reass.c (54601) tcp_reass.c (55009)
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
34 * $FreeBSD: head/sys/netinet/tcp_reass.c 54601 1999-12-14 15:43:56Z jlemon $
34 * $FreeBSD: head/sys/netinet/tcp_reass.c 55009 1999-12-22 19:13:38Z shin $
35 */
36
37#include "opt_ipfw.h" /* for ipfw_fwd */
38#include "opt_tcpdebug.h"
39#include "opt_tcp_input.h"
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/kernel.h>
44#include <sys/sysctl.h>
45#include <sys/malloc.h>
46#include <sys/mbuf.h>
47#include <sys/proc.h> /* for proc0 declaration */
48#include <sys/protosw.h>
49#include <sys/socket.h>
50#include <sys/socketvar.h>
51#include <sys/syslog.h>
52
53#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
54
55#include <net/if.h>
56#include <net/route.h>
57
58#include <netinet/in.h>
59#include <netinet/in_systm.h>
60#include <netinet/ip.h>
61#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
62#include <netinet/in_pcb.h>
63#include <netinet/ip_var.h>
64#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
65#include <netinet/tcp.h>
66#include <netinet/tcp_fsm.h>
67#include <netinet/tcp_seq.h>
68#include <netinet/tcp_timer.h>
69#include <netinet/tcp_var.h>
70#include <netinet/tcpip.h>
71#ifdef TCPDEBUG
72#include <netinet/tcp_debug.h>
73static struct tcpiphdr tcp_saveti;
74#endif
75
76static int tcprexmtthresh = 3;
77tcp_seq tcp_iss;
78tcp_cc tcp_ccgen;
79
80struct tcpstat tcpstat;
81SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD,
82 &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
83
84static int log_in_vain = 0;
85SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
86 &log_in_vain, 0, "Log all incoming TCP connections");
87
88static int blackhole = 0;
89SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
90 &blackhole, 0, "Do not send RST when dropping refused connections");
91
92int tcp_delack_enabled = 1;
93SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
94 &tcp_delack_enabled, 0,
95 "Delay ACK to try and piggyback it onto a data packet");
96
97#ifdef TCP_DROP_SYNFIN
98static int drop_synfin = 0;
99SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
100 &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
101#endif
102
103#ifdef TCP_RESTRICT_RST
104static int restrict_rst = 0;
105SYSCTL_INT(_net_inet_tcp, OID_AUTO, restrict_rst, CTLFLAG_RW,
106 &restrict_rst, 0, "Restrict RST emission");
107#endif
108
109struct inpcbhead tcb;
110struct inpcbinfo tcbinfo;
111
112static void tcp_dooptions __P((struct tcpcb *,
113 u_char *, int, struct tcpiphdr *, struct tcpopt *));
114static void tcp_pulloutofband __P((struct socket *,
115 struct tcpiphdr *, struct mbuf *));
116static int tcp_reass __P((struct tcpcb *, struct tcpiphdr *, struct mbuf *));
117static void tcp_xmit_timer __P((struct tcpcb *, int));
118
119
120/*
121 * Insert segment ti into reassembly queue of tcp with
122 * control block tp. Return TH_FIN if reassembly now includes
123 * a segment with FIN. The macro form does the common case inline
124 * (segment is the next to be received on an established connection,
125 * and the queue is empty), avoiding linkage into and removal
126 * from the queue and repetition of various conversions.
127 * Set DELACK for segments received in order, but ack immediately
128 * when segments are out of order (so fast retransmit can work).
129 */
130#define TCP_REASS(tp, ti, m, so, flags) { \
131 if ((ti)->ti_seq == (tp)->rcv_nxt && \
132 (tp)->t_segq == NULL && \
133 (tp)->t_state == TCPS_ESTABLISHED) { \
134 if (tcp_delack_enabled) \
135 callout_reset(tp->tt_delack, tcp_delacktime, \
136 tcp_timer_delack, tp); \
137 else \
138 tp->t_flags |= TF_ACKNOW; \
139 (tp)->rcv_nxt += (ti)->ti_len; \
140 flags = (ti)->ti_flags & TH_FIN; \
141 tcpstat.tcps_rcvpack++;\
142 tcpstat.tcps_rcvbyte += (ti)->ti_len;\
143 sbappend(&(so)->so_rcv, (m)); \
144 sorwakeup(so); \
145 } else { \
146 (flags) = tcp_reass((tp), (ti), (m)); \
147 tp->t_flags |= TF_ACKNOW; \
148 } \
149}
150
151static int
152tcp_reass(tp, ti, m)
153 register struct tcpcb *tp;
154 register struct tcpiphdr *ti;
155 struct mbuf *m;
156{
157 struct mbuf *q;
158 struct mbuf *p;
159 struct mbuf *nq;
160 struct socket *so = tp->t_inpcb->inp_socket;
161 int flags;
162
163#define GETTCP(m) ((struct tcpiphdr *)m->m_pkthdr.header)
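/*
 * While a segment sits on the reassembly queue its TCP/IP header
 * pointer is stashed in m_pkthdr.header (see the assignment below);
 * GETTCP() recovers that header for a queued mbuf.
 */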
164
165 /*
166	 * Call with ti==0 after the connection becomes established to
167 * force pre-ESTABLISHED data up to user socket.
168 */
169 if (ti == 0)
170 goto present;
171
172 m->m_pkthdr.header = ti;
173
174 /*
175 * Find a segment which begins after this one does.
176 */
177 for (q = tp->t_segq, p = NULL; q; p = q, q = q->m_nextpkt)
178 if (SEQ_GT(GETTCP(q)->ti_seq, ti->ti_seq))
179 break;
180
181 /*
182 * If there is a preceding segment, it may provide some of
183 * our data already. If so, drop the data from the incoming
184 * segment. If it provides all of our data, drop us.
185 */
186 if (p != NULL) {
187 register int i;
188 /* conversion to int (in i) handles seq wraparound */
189 i = GETTCP(p)->ti_seq + GETTCP(p)->ti_len - ti->ti_seq;
190 if (i > 0) {
191 if (i >= ti->ti_len) {
192 tcpstat.tcps_rcvduppack++;
193 tcpstat.tcps_rcvdupbyte += ti->ti_len;
194 m_freem(m);
195 /*
196 * Try to present any queued data
197 * at the left window edge to the user.
198 * This is needed after the 3-WHS
199 * completes.
200 */
201 goto present; /* ??? */
202 }
203 m_adj(m, i);
204 ti->ti_len -= i;
205 ti->ti_seq += i;
206 }
207 }
208 tcpstat.tcps_rcvoopack++;
209 tcpstat.tcps_rcvoobyte += ti->ti_len;
210
211 /*
212 * While we overlap succeeding segments trim them or,
213 * if they are completely covered, dequeue them.
214 */
215 while (q) {
216 register int i = (ti->ti_seq + ti->ti_len) - GETTCP(q)->ti_seq;
217 if (i <= 0)
218 break;
219 if (i < GETTCP(q)->ti_len) {
220 GETTCP(q)->ti_seq += i;
221 GETTCP(q)->ti_len -= i;
222 m_adj(q, i);
223 break;
224 }
225
226 nq = q->m_nextpkt;
227 if (p)
228 p->m_nextpkt = nq;
229 else
230 tp->t_segq = nq;
231 m_freem(q);
232 q = nq;
233 }
234
235 if (p == NULL) {
236 m->m_nextpkt = tp->t_segq;
237 tp->t_segq = m;
238 } else {
239 m->m_nextpkt = p->m_nextpkt;
240 p->m_nextpkt = m;
241 }
242
243present:
244 /*
245 * Present data to user, advancing rcv_nxt through
246 * completed sequence space.
247 */
248 if (!TCPS_HAVEESTABLISHED(tp->t_state))
249 return (0);
250 q = tp->t_segq;
251 if (!q || GETTCP(q)->ti_seq != tp->rcv_nxt)
252 return (0);
253 do {
254 tp->rcv_nxt += GETTCP(q)->ti_len;
255 flags = GETTCP(q)->ti_flags & TH_FIN;
256 nq = q->m_nextpkt;
257 tp->t_segq = nq;
258 q->m_nextpkt = NULL;
259 if (so->so_state & SS_CANTRCVMORE)
260 m_freem(q);
261 else
262 sbappend(&so->so_rcv, q);
263 q = nq;
264 } while (q && GETTCP(q)->ti_seq == tp->rcv_nxt);
265 sorwakeup(so);
266 return (flags);
267
268#undef GETTCP
269}
270
271/*
272 * TCP input routine, follows pages 65-76 of the
273 * protocol specification dated September, 1981 very closely.
274 */
275void
276tcp_input(m, iphlen)
276tcp_input(m, off0, proto)
277 register struct mbuf *m;
278 int iphlen;
278 int off0, proto;
279{
280 int iphlen = off0;
280 register struct tcpiphdr *ti;
281 register struct inpcb *inp;
282 u_char *optp = NULL;
283 int optlen = 0;
284 int len, tlen, off;
285 register struct tcpcb *tp = 0;
286 register int tiflags;
287 struct socket *so = 0;
288 int todrop, acked, ourfinisacked, needoutput = 0;
289 struct in_addr laddr;
290 int dropsocket = 0;
291 int iss = 0;
292 u_long tiwin;
293 struct tcpopt to; /* options in this segment */
294 struct rmxp_tao *taop; /* pointer to our TAO cache entry */
295 struct rmxp_tao tao_noncached; /* in case there's no cached entry */
296#ifdef TCPDEBUG
297 short ostate = 0;
298#endif
299
300 bzero((char *)&to, sizeof(to));
301
302 tcpstat.tcps_rcvtotal++;
303 /*
304 * Get IP and TCP header together in first mbuf.
305 * Note: IP leaves IP header in first mbuf.
306 */
307 ti = mtod(m, struct tcpiphdr *);
308 if (iphlen > sizeof (struct ip))
309 ip_stripoptions(m, (struct mbuf *)0);
310 if (m->m_len < sizeof (struct tcpiphdr)) {
311 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
312 tcpstat.tcps_rcvshort++;
313 return;
314 }
315 ti = mtod(m, struct tcpiphdr *);
316 }
317
318 /*
319 * Checksum extended TCP header and data.
320 */
321 tlen = ((struct ip *)ti)->ip_len;
322 len = sizeof (struct ip) + tlen;
323 bzero(ti->ti_x1, sizeof(ti->ti_x1));
324 ti->ti_len = (u_short)tlen;
325 HTONS(ti->ti_len);
326 ti->ti_sum = in_cksum(m, len);
327 if (ti->ti_sum) {
328 tcpstat.tcps_rcvbadsum++;
329 goto drop;
330 }
331
332 /*
333 * Check that TCP offset makes sense,
334 * pull out TCP options and adjust length. XXX
335 */
336 off = ti->ti_off << 2;
337 if (off < sizeof (struct tcphdr) || off > tlen) {
338 tcpstat.tcps_rcvbadoff++;
339 goto drop;
340 }
341 tlen -= off;
342 ti->ti_len = tlen;
343 if (off > sizeof (struct tcphdr)) {
344 if (m->m_len < sizeof(struct ip) + off) {
345 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
346 tcpstat.tcps_rcvshort++;
347 return;
348 }
349 ti = mtod(m, struct tcpiphdr *);
350 }
351 optlen = off - sizeof (struct tcphdr);
352 optp = mtod(m, u_char *) + sizeof (struct tcpiphdr);
353 }
354 tiflags = ti->ti_flags;
355
356#ifdef TCP_DROP_SYNFIN
357 /*
358 * If the drop_synfin option is enabled, drop all packets with
359 * both the SYN and FIN bits set. This prevents e.g. nmap from
360 * identifying the TCP/IP stack.
361 *
362 * This is incompatible with RFC1644 extensions (T/TCP).
363 */
364 if (drop_synfin && (tiflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
365 goto drop;
366#endif
367
368 /*
369 * Convert TCP protocol specific fields to host format.
370 */
371 NTOHL(ti->ti_seq);
372 NTOHL(ti->ti_ack);
373 NTOHS(ti->ti_win);
374 NTOHS(ti->ti_urp);
375
376 /*
377 * Drop TCP, IP headers and TCP options.
378 */
379 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
380 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
381
382 /*
383 * Locate pcb for segment.
384 */
385findpcb:
386#ifdef IPFIREWALL_FORWARD
387 if (ip_fw_fwd_addr != NULL) {
388 /*
389 * Diverted. Pretend to be the destination.
390 * already got one like this?
391 */
392 inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport,
393 ti->ti_dst, ti->ti_dport, 0, m->m_pkthdr.rcvif);
394 if (!inp) {
395 /*
396			 * No, then it's new. Try to find the ambushing socket
397 */
398 if (!ip_fw_fwd_addr->sin_port) {
399 inp = in_pcblookup_hash(&tcbinfo, ti->ti_src,
400 ti->ti_sport, ip_fw_fwd_addr->sin_addr,
401 ti->ti_dport, 1, m->m_pkthdr.rcvif);
402 } else {
403 inp = in_pcblookup_hash(&tcbinfo,
404 ti->ti_src, ti->ti_sport,
405 ip_fw_fwd_addr->sin_addr,
406 ntohs(ip_fw_fwd_addr->sin_port), 1,
407 m->m_pkthdr.rcvif);
408 }
409 }
410 ip_fw_fwd_addr = NULL;
411 } else
412#endif /* IPFIREWALL_FORWARD */
413
414 inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport,
415 ti->ti_dst, ti->ti_dport, 1, m->m_pkthdr.rcvif);
416
417 /*
418 * If the state is CLOSED (i.e., TCB does not exist) then
419 * all data in the incoming segment is discarded.
420 * If the TCB exists but is in CLOSED state, it is embryonic,
421 * but should either do a listen or a connect soon.
422 */
423 if (inp == NULL) {
424 if (log_in_vain) {
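			/*
			 * 4 * sizeof "123" is 16 bytes: room for a full
			 * dotted-quad address ("255.255.255.255") plus the
			 * terminating NUL copied in by strcpy() below.
			 */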
425 char buf[4*sizeof "123"];
426
427 strcpy(buf, inet_ntoa(ti->ti_dst));
428 switch (log_in_vain) {
429 case 1:
430 if(tiflags & TH_SYN)
431 log(LOG_INFO,
432 "Connection attempt to TCP %s:%d from %s:%d\n",
433 buf, ntohs(ti->ti_dport),
434 inet_ntoa(ti->ti_src),
435 ntohs(ti->ti_sport));
436 break;
437 case 2:
438 log(LOG_INFO,
439 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
440 buf, ntohs(ti->ti_dport), inet_ntoa(ti->ti_src),
441 ntohs(ti->ti_sport), tiflags);
442 break;
443 default:
444 break;
445 }
446 }
447#ifdef ICMP_BANDLIM
448 if (badport_bandlim(1) < 0)
449 goto drop;
450#endif
451 if (blackhole) {
452 switch (blackhole) {
453 case 1:
454 if (tiflags & TH_SYN)
455 goto drop;
456 break;
457 case 2:
458 goto drop;
459 default:
460 goto drop;
461 }
462 }
463 goto dropwithreset;
464 }
465 tp = intotcpcb(inp);
466 if (tp == 0)
467 goto dropwithreset;
468 if (tp->t_state == TCPS_CLOSED)
469 goto drop;
470
471 /* Unscale the window into a 32-bit value. */
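	/*
	 * Note that the window field of a SYN segment is never scaled
	 * (RFC 1323), which is why the SYN case below uses the raw
	 * 16-bit value.
	 */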
472 if ((tiflags & TH_SYN) == 0)
473 tiwin = ti->ti_win << tp->snd_scale;
474 else
475 tiwin = ti->ti_win;
476
477 so = inp->inp_socket;
478 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
479#ifdef TCPDEBUG
480 if (so->so_options & SO_DEBUG) {
481 ostate = tp->t_state;
482 tcp_saveti = *ti;
483 }
484#endif
485 if (so->so_options & SO_ACCEPTCONN) {
486 register struct tcpcb *tp0 = tp;
487 struct socket *so2;
488 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
489 /*
490 * Note: dropwithreset makes sure we don't
491 * send a RST in response to a RST.
492 */
493 if (tiflags & TH_ACK) {
494 tcpstat.tcps_badsyn++;
495 goto dropwithreset;
496 }
497 goto drop;
498 }
499 so2 = sonewconn(so, 0);
500 if (so2 == 0) {
501 tcpstat.tcps_listendrop++;
502 so2 = sodropablereq(so);
503 if (so2) {
504 tcp_drop(sototcpcb(so2), ETIMEDOUT);
505 so2 = sonewconn(so, 0);
506 }
507 if (!so2)
508 goto drop;
509 }
510 so = so2;
511 /*
512 * This is ugly, but ....
513 *
514 * Mark socket as temporary until we're
515 * committed to keeping it. The code at
516 * ``drop'' and ``dropwithreset'' check the
517 * flag dropsocket to see if the temporary
518 * socket created here should be discarded.
519 * We mark the socket as discardable until
520 * we're committed to it below in TCPS_LISTEN.
521 */
522 dropsocket++;
523 inp = (struct inpcb *)so->so_pcb;
524 inp->inp_laddr = ti->ti_dst;
525 inp->inp_lport = ti->ti_dport;
526 if (in_pcbinshash(inp) != 0) {
527 /*
528 * Undo the assignments above if we failed to
529 * put the PCB on the hash lists.
530 */
531 inp->inp_laddr.s_addr = INADDR_ANY;
532 inp->inp_lport = 0;
533 goto drop;
534 }
535 inp->inp_options = ip_srcroute();
536 tp = intotcpcb(inp);
537 tp->t_state = TCPS_LISTEN;
538 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT);
539
540 /* Compute proper scaling value from buffer space */
541 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
542 TCP_MAXWIN << tp->request_r_scale <
543 so->so_rcv.sb_hiwat)
544 tp->request_r_scale++;
545 }
546 }
547
548 /*
549 * Segment received on connection.
550 * Reset idle time and keep-alive timer.
551 */
552 tp->t_rcvtime = ticks;
553 if (TCPS_HAVEESTABLISHED(tp->t_state))
554 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
555
556 /*
557 * Process options if not in LISTEN state,
558 * else do it below (after getting remote address).
559 */
560 if (tp->t_state != TCPS_LISTEN)
561 tcp_dooptions(tp, optp, optlen, ti, &to);
562
563 /*
564 * Header prediction: check for the two common cases
565 * of a uni-directional data xfer. If the packet has
566 * no control flags, is in-sequence, the window didn't
567 * change and we're not retransmitting, it's a
568 * candidate. If the length is zero and the ack moved
569 * forward, we're the sender side of the xfer. Just
570 * free the data acked & wake any higher level process
571 * that was blocked waiting for space. If the length
572 * is non-zero and the ack didn't move, we're the
573 * receiver side. If we're getting packets in-order
574 * (the reassembly queue is empty), add the data to
575 * the socket buffer and note that we need a delayed ack.
576 * Make sure that the hidden state-flags are also off.
577 * Since we check for TCPS_ESTABLISHED above, it can only
578 * be TH_NEEDSYN.
579 */
580 if (tp->t_state == TCPS_ESTABLISHED &&
581 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
582 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
583 ((to.to_flag & TOF_TS) == 0 ||
584 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
585 /*
586 * Using the CC option is compulsory if once started:
587 * the segment is OK if no T/TCP was negotiated or
588 * if the segment has a CC option equal to CCrecv
589 */
590 ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) ||
591 ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) &&
592 ti->ti_seq == tp->rcv_nxt &&
593 tiwin && tiwin == tp->snd_wnd &&
594 tp->snd_nxt == tp->snd_max) {
595
596 /*
597 * If last ACK falls within this segment's sequence numbers,
598 * record the timestamp.
599 * NOTE that the test is modified according to the latest
600 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
601 */
602 if ((to.to_flag & TOF_TS) != 0 &&
603 SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) {
604 tp->ts_recent_age = ticks;
605 tp->ts_recent = to.to_tsval;
606 }
607
608 if (ti->ti_len == 0) {
609 if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
610 SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
611 tp->snd_cwnd >= tp->snd_wnd &&
612 tp->t_dupacks < tcprexmtthresh) {
613 /*
614 * this is a pure ack for outstanding data.
615 */
616 ++tcpstat.tcps_predack;
617 /*
618 * "bad retransmit" recovery
619 */
620 if (tp->t_rxtshift == 1 &&
621 ticks < tp->t_badrxtwin) {
622 tp->snd_cwnd = tp->snd_cwnd_prev;
623 tp->snd_ssthresh =
624 tp->snd_ssthresh_prev;
625 tp->snd_nxt = tp->snd_max;
626 tp->t_badrxtwin = 0;
627 }
628 if ((to.to_flag & TOF_TS) != 0)
629 tcp_xmit_timer(tp,
630 ticks - to.to_tsecr + 1);
631 else if (tp->t_rtttime &&
632 SEQ_GT(ti->ti_ack, tp->t_rtseq))
633 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
634 acked = ti->ti_ack - tp->snd_una;
635 tcpstat.tcps_rcvackpack++;
636 tcpstat.tcps_rcvackbyte += acked;
637 sbdrop(&so->so_snd, acked);
638 tp->snd_una = ti->ti_ack;
639 m_freem(m);
640
641 /*
642 * If all outstanding data are acked, stop
643 * retransmit timer, otherwise restart timer
644 * using current (possibly backed-off) value.
645 * If process is waiting for space,
646 * wakeup/selwakeup/signal. If data
647 * are ready to send, let tcp_output
648 * decide between more output or persist.
649 */
650 if (tp->snd_una == tp->snd_max)
651 callout_stop(tp->tt_rexmt);
652 else if (!callout_active(tp->tt_persist))
653 callout_reset(tp->tt_rexmt,
654 tp->t_rxtcur,
655 tcp_timer_rexmt, tp);
656
657 sowwakeup(so);
658 if (so->so_snd.sb_cc)
659 (void) tcp_output(tp);
660 return;
661 }
662 } else if (ti->ti_ack == tp->snd_una &&
663 tp->t_segq == NULL &&
664 ti->ti_len <= sbspace(&so->so_rcv)) {
665 /*
666 * this is a pure, in-sequence data packet
667 * with nothing on the reassembly queue and
668 * we have enough buffer space to take it.
669 */
670 ++tcpstat.tcps_preddat;
671 tp->rcv_nxt += ti->ti_len;
672 tcpstat.tcps_rcvpack++;
673 tcpstat.tcps_rcvbyte += ti->ti_len;
674 /*
675 * Add data to socket buffer.
676 */
677 sbappend(&so->so_rcv, m);
678 sorwakeup(so);
679 if (tcp_delack_enabled) {
680 callout_reset(tp->tt_delack, tcp_delacktime,
681 tcp_timer_delack, tp);
682 } else {
683 tp->t_flags |= TF_ACKNOW;
684 tcp_output(tp);
685 }
686 return;
687 }
688 }
689
690 /*
691 * Calculate amount of space in receive window,
692 * and then do TCP input processing.
693 * Receive window is amount of space in rcv queue,
694 * but not less than advertised window.
695 */
696 { int win;
697
698 win = sbspace(&so->so_rcv);
699 if (win < 0)
700 win = 0;
701 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
702 }
703
704 switch (tp->t_state) {
705
706 /*
707 * If the state is LISTEN then ignore segment if it contains an RST.
708 * If the segment contains an ACK then it is bad and send a RST.
709 * If it does not contain a SYN then it is not interesting; drop it.
710 * If it is from this socket, drop it, it must be forged.
711 * Don't bother responding if the destination was a broadcast.
712 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
713 * tp->iss, and send a segment:
714 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
715 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
716 * Fill in remote peer address fields if not previously specified.
717 * Enter SYN_RECEIVED state, and process any other fields of this
718 * segment in this state.
719 */
720 case TCPS_LISTEN: {
721 register struct sockaddr_in *sin;
722
723 if (tiflags & TH_RST)
724 goto drop;
725 if (tiflags & TH_ACK)
726 goto dropwithreset;
727 if ((tiflags & TH_SYN) == 0)
728 goto drop;
729 if ((ti->ti_dport == ti->ti_sport) &&
730 (ti->ti_dst.s_addr == ti->ti_src.s_addr))
731 goto drop;
732 /*
733 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
734 * in_broadcast() should never return true on a received
735 * packet with M_BCAST not set.
736 */
737 if (m->m_flags & (M_BCAST|M_MCAST) ||
738 IN_MULTICAST(ntohl(ti->ti_dst.s_addr)))
739 goto drop;
740 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
741 M_NOWAIT);
742 if (sin == NULL)
743 goto drop;
744 sin->sin_family = AF_INET;
745 sin->sin_len = sizeof(*sin);
746 sin->sin_addr = ti->ti_src;
747 sin->sin_port = ti->ti_sport;
748 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
749 laddr = inp->inp_laddr;
750 if (inp->inp_laddr.s_addr == INADDR_ANY)
751 inp->inp_laddr = ti->ti_dst;
752 if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) {
753 inp->inp_laddr = laddr;
754 FREE(sin, M_SONAME);
755 goto drop;
756 }
757 FREE(sin, M_SONAME);
758 tp->t_template = tcp_template(tp);
759 if (tp->t_template == 0) {
760 tp = tcp_drop(tp, ENOBUFS);
761 dropsocket = 0; /* socket is already gone */
762 goto drop;
763 }
764 if ((taop = tcp_gettaocache(inp)) == NULL) {
765 taop = &tao_noncached;
766 bzero(taop, sizeof(*taop));
767 }
768 tcp_dooptions(tp, optp, optlen, ti, &to);
769 if (iss)
770 tp->iss = iss;
771 else
772 tp->iss = tcp_iss;
773 tcp_iss += TCP_ISSINCR/4;
774 tp->irs = ti->ti_seq;
775 tcp_sendseqinit(tp);
776 tcp_rcvseqinit(tp);
777 /*
778 * Initialization of the tcpcb for transaction;
779 * set SND.WND = SEG.WND,
780 * initialize CCsend and CCrecv.
781 */
782 tp->snd_wnd = tiwin; /* initial send-window */
783 tp->cc_send = CC_INC(tcp_ccgen);
784 tp->cc_recv = to.to_cc;
785 /*
786 * Perform TAO test on incoming CC (SEG.CC) option, if any.
787 * - compare SEG.CC against cached CC from the same host,
788 * if any.
789	 *   - if SEG.CC > cached value, SYN must be new and is accepted
790 * immediately: save new CC in the cache, mark the socket
791 * connected, enter ESTABLISHED state, turn on flag to
792 * send a SYN in the next segment.
793 * A virtual advertised window is set in rcv_adv to
794 * initialize SWS prevention. Then enter normal segment
795 * processing: drop SYN, process data and FIN.
796 * - otherwise do a normal 3-way handshake.
797 */
798 if ((to.to_flag & TOF_CC) != 0) {
799 if (((tp->t_flags & TF_NOPUSH) != 0) &&
800 taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) {
801
802 taop->tao_cc = to.to_cc;
803 tp->t_starttime = ticks;
804 tp->t_state = TCPS_ESTABLISHED;
805
806 /*
807 * If there is a FIN, or if there is data and the
808 * connection is local, then delay SYN,ACK(SYN) in
809 * the hope of piggy-backing it on a response
810 * segment. Otherwise must send ACK now in case
811 * the other side is slow starting.
812 */
813 if (tcp_delack_enabled && ((tiflags & TH_FIN) ||
814 (ti->ti_len != 0 &&
815 in_localaddr(inp->inp_faddr)))) {
816 callout_reset(tp->tt_delack, tcp_delacktime,
817 tcp_timer_delack, tp);
818 tp->t_flags |= TF_NEEDSYN;
819 } else
820 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
821
822 /*
823 * Limit the `virtual advertised window' to TCP_MAXWIN
824 * here. Even if we requested window scaling, it will
825 * become effective only later when our SYN is acked.
826 */
827 tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN);
828 tcpstat.tcps_connects++;
829 soisconnected(so);
830 callout_reset(tp->tt_keep, tcp_keepinit,
831 tcp_timer_keep, tp);
832 dropsocket = 0; /* committed to socket */
833 tcpstat.tcps_accepts++;
834 goto trimthenstep6;
835 }
836 /* else do standard 3-way handshake */
837 } else {
838 /*
839 * No CC option, but maybe CC.NEW:
840 * invalidate cached value.
841 */
842 taop->tao_cc = 0;
843 }
844 /*
845 * TAO test failed or there was no CC option,
846 * do a standard 3-way handshake.
847 */
848 tp->t_flags |= TF_ACKNOW;
849 tp->t_state = TCPS_SYN_RECEIVED;
850 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
851 dropsocket = 0; /* committed to socket */
852 tcpstat.tcps_accepts++;
853 goto trimthenstep6;
854 }
855
856 /*
857 * If the state is SYN_RECEIVED:
858 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
859 */
860 case TCPS_SYN_RECEIVED:
861 if ((tiflags & TH_ACK) &&
862 (SEQ_LEQ(ti->ti_ack, tp->snd_una) ||
863 SEQ_GT(ti->ti_ack, tp->snd_max)))
864 goto dropwithreset;
865 break;
866
867 /*
868 * If the state is SYN_SENT:
869 * if seg contains an ACK, but not for our SYN, drop the input.
870 * if seg contains a RST, then drop the connection.
871 * if seg does not contain SYN, then drop it.
872 * Otherwise this is an acceptable SYN segment
873 * initialize tp->rcv_nxt and tp->irs
874 * if seg contains ack then advance tp->snd_una
875 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
876 * arrange for segment to be acked (eventually)
877 * continue processing rest of data/controls, beginning with URG
878 */
879 case TCPS_SYN_SENT:
880 if ((taop = tcp_gettaocache(inp)) == NULL) {
881 taop = &tao_noncached;
882 bzero(taop, sizeof(*taop));
883 }
884
885 if ((tiflags & TH_ACK) &&
886 (SEQ_LEQ(ti->ti_ack, tp->iss) ||
887 SEQ_GT(ti->ti_ack, tp->snd_max))) {
888 /*
889 * If we have a cached CCsent for the remote host,
890 * hence we haven't just crashed and restarted,
891 * do not send a RST. This may be a retransmission
892 * from the other side after our earlier ACK was lost.
893 * Our new SYN, when it arrives, will serve as the
894 * needed ACK.
895 */
896 if (taop->tao_ccsent != 0)
897 goto drop;
898 else
899 goto dropwithreset;
900 }
901 if (tiflags & TH_RST) {
902 if (tiflags & TH_ACK)
903 tp = tcp_drop(tp, ECONNREFUSED);
904 goto drop;
905 }
906 if ((tiflags & TH_SYN) == 0)
907 goto drop;
908 tp->snd_wnd = ti->ti_win; /* initial send window */
909 tp->cc_recv = to.to_cc; /* foreign CC */
910
911 tp->irs = ti->ti_seq;
912 tcp_rcvseqinit(tp);
913 if (tiflags & TH_ACK) {
914 /*
915 * Our SYN was acked. If segment contains CC.ECHO
916 * option, check it to make sure this segment really
917 * matches our SYN. If not, just drop it as old
918 * duplicate, but send an RST if we're still playing
919 * by the old rules. If no CC.ECHO option, make sure
920 * we don't get fooled into using T/TCP.
921 */
922 if (to.to_flag & TOF_CCECHO) {
923 if (tp->cc_send != to.to_ccecho) {
924 if (taop->tao_ccsent != 0)
925 goto drop;
926 else
927 goto dropwithreset;
928 }
929 } else
930 tp->t_flags &= ~TF_RCVD_CC;
931 tcpstat.tcps_connects++;
932 soisconnected(so);
933 /* Do window scaling on this connection? */
934 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
935 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
936 tp->snd_scale = tp->requested_s_scale;
937 tp->rcv_scale = tp->request_r_scale;
938 }
939 /* Segment is acceptable, update cache if undefined. */
940 if (taop->tao_ccsent == 0)
941 taop->tao_ccsent = to.to_ccecho;
942
943 tp->rcv_adv += tp->rcv_wnd;
944 tp->snd_una++; /* SYN is acked */
945 /*
946 * If there's data, delay ACK; if there's also a FIN
947 * ACKNOW will be turned on later.
948 */
949 if (tcp_delack_enabled && ti->ti_len != 0)
950 callout_reset(tp->tt_delack, tcp_delacktime,
951 tcp_timer_delack, tp);
952 else
953 tp->t_flags |= TF_ACKNOW;
954 /*
955 * Received <SYN,ACK> in SYN_SENT[*] state.
956 * Transitions:
957 * SYN_SENT --> ESTABLISHED
958 * SYN_SENT* --> FIN_WAIT_1
959 */
960 tp->t_starttime = ticks;
961 if (tp->t_flags & TF_NEEDFIN) {
962 tp->t_state = TCPS_FIN_WAIT_1;
963 tp->t_flags &= ~TF_NEEDFIN;
964 tiflags &= ~TH_SYN;
965 } else {
966 tp->t_state = TCPS_ESTABLISHED;
967 callout_reset(tp->tt_keep, tcp_keepidle,
968 tcp_timer_keep, tp);
969 }
970 } else {
971 /*
972 * Received initial SYN in SYN-SENT[*] state => simul-
973 * taneous open. If segment contains CC option and there is
974 * a cached CC, apply TAO test; if it succeeds, connection is
975 * half-synchronized. Otherwise, do 3-way handshake:
976 * SYN-SENT -> SYN-RECEIVED
977 * SYN-SENT* -> SYN-RECEIVED*
978 * If there was no CC option, clear cached CC value.
979 */
980 tp->t_flags |= TF_ACKNOW;
981 callout_stop(tp->tt_rexmt);
982 if (to.to_flag & TOF_CC) {
983 if (taop->tao_cc != 0 &&
984 CC_GT(to.to_cc, taop->tao_cc)) {
985 /*
986 * update cache and make transition:
987 * SYN-SENT -> ESTABLISHED*
988 * SYN-SENT* -> FIN-WAIT-1*
989 */
990 taop->tao_cc = to.to_cc;
991 tp->t_starttime = ticks;
992 if (tp->t_flags & TF_NEEDFIN) {
993 tp->t_state = TCPS_FIN_WAIT_1;
994 tp->t_flags &= ~TF_NEEDFIN;
995 } else {
996 tp->t_state = TCPS_ESTABLISHED;
997 callout_reset(tp->tt_keep,
998 tcp_keepidle,
999 tcp_timer_keep,
1000 tp);
1001 }
1002 tp->t_flags |= TF_NEEDSYN;
1003 } else
1004 tp->t_state = TCPS_SYN_RECEIVED;
1005 } else {
1006 /* CC.NEW or no option => invalidate cache */
1007 taop->tao_cc = 0;
1008 tp->t_state = TCPS_SYN_RECEIVED;
1009 }
1010 }
1011
1012trimthenstep6:
1013 /*
1014 * Advance ti->ti_seq to correspond to first data byte.
1015 * If data, trim to stay within window,
1016 * dropping FIN if necessary.
1017 */
1018 ti->ti_seq++;
1019 if (ti->ti_len > tp->rcv_wnd) {
1020 todrop = ti->ti_len - tp->rcv_wnd;
1021 m_adj(m, -todrop);
1022 ti->ti_len = tp->rcv_wnd;
1023 tiflags &= ~TH_FIN;
1024 tcpstat.tcps_rcvpackafterwin++;
1025 tcpstat.tcps_rcvbyteafterwin += todrop;
1026 }
1027 tp->snd_wl1 = ti->ti_seq - 1;
1028 tp->rcv_up = ti->ti_seq;
1029 /*
1030 * Client side of transaction: already sent SYN and data.
1031 * If the remote host used T/TCP to validate the SYN,
1032 * our data will be ACK'd; if so, enter normal data segment
1033 * processing in the middle of step 5, ack processing.
1034 * Otherwise, goto step 6.
1035 */
1036 if (tiflags & TH_ACK)
1037 goto process_ACK;
1038 goto step6;
1039 /*
1040 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1041 * if segment contains a SYN and CC [not CC.NEW] option:
1042 * if state == TIME_WAIT and connection duration > MSL,
1043 * drop packet and send RST;
1044 *
1045 * if SEG.CC > CCrecv then is new SYN, and can implicitly
1046 * ack the FIN (and data) in retransmission queue.
1047 * Complete close and delete TCPCB. Then reprocess
1048 * segment, hoping to find new TCPCB in LISTEN state;
1049 *
1050 * else must be old SYN; drop it.
1051 * else do normal processing.
1052 */
1053 case TCPS_LAST_ACK:
1054 case TCPS_CLOSING:
1055 case TCPS_TIME_WAIT:
1056 if ((tiflags & TH_SYN) &&
1057 (to.to_flag & TOF_CC) && tp->cc_recv != 0) {
1058 if (tp->t_state == TCPS_TIME_WAIT &&
1059 (ticks - tp->t_starttime) > tcp_msl)
1060 goto dropwithreset;
1061 if (CC_GT(to.to_cc, tp->cc_recv)) {
1062 tp = tcp_close(tp);
1063 goto findpcb;
1064 }
1065 else
1066 goto drop;
1067 }
1068 break; /* continue normal processing */
1069 }
1070
1071 /*
1072 * States other than LISTEN or SYN_SENT.
1073 * First check the RST flag and sequence number since reset segments
1074 * are exempt from the timestamp and connection count tests. This
1075 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1076 * below which allowed reset segments in half the sequence space
1077	 * to fall through and be processed (which gives forged reset
1078 * segments with a random sequence number a 50 percent chance of
1079 * killing a connection).
1080 * Then check timestamp, if present.
1081 * Then check the connection count, if present.
1082 * Then check that at least some bytes of segment are within
1083 * receive window. If segment begins before rcv_nxt,
1084 * drop leading data (and SYN); if nothing left, just ack.
1085 *
1086 *
1087 * If the RST bit is set, check the sequence number to see
1088 * if this is a valid reset segment.
1089 * RFC 793 page 37:
1090 * In all states except SYN-SENT, all reset (RST) segments
1091 * are validated by checking their SEQ-fields. A reset is
1092 * valid if its sequence number is in the window.
1093 * Note: this does not take into account delayed ACKs, so
1094 * we should test against last_ack_sent instead of rcv_nxt.
1095 * The sequence number in the reset segment is normally an
1096	 * echo of our outgoing acknowledgement numbers, but some hosts
1097 * send a reset with the sequence number at the rightmost edge
1098 * of our receive window, and we have to handle this case.
1099	 * If we have multiple segments in flight, the initial reset
1100 * segment sequence numbers will be to the left of last_ack_sent,
1101 * but they will eventually catch up.
1102 * In any case, it never made sense to trim reset segments to
1103 * fit the receive window since RFC 1122 says:
1104 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
1105 *
1106 * A TCP SHOULD allow a received RST segment to include data.
1107 *
1108 * DISCUSSION
1109 * It has been suggested that a RST segment could contain
1110 * ASCII text that encoded and explained the cause of the
1111 * RST. No standard has yet been established for such
1112 * data.
1113 *
1114 * If the reset segment passes the sequence number test examine
1115 * the state:
1116 * SYN_RECEIVED STATE:
1117 * If passive open, return to LISTEN state.
1118 * If active open, inform user that connection was refused.
1119 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1120 * Inform user that connection was reset, and close tcb.
1121 * CLOSING, LAST_ACK STATES:
1122 * Close the tcb.
1123 * TIME_WAIT STATE:
1124 * Drop the segment - see Stevens, vol. 2, p. 964 and
1125 * RFC 1337.
1126 */
1127 if (tiflags & TH_RST) {
1128 if (SEQ_GEQ(ti->ti_seq, tp->last_ack_sent) &&
1129 SEQ_LT(ti->ti_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1130 switch (tp->t_state) {
1131
1132 case TCPS_SYN_RECEIVED:
1133 so->so_error = ECONNREFUSED;
1134 goto close;
1135
1136 case TCPS_ESTABLISHED:
1137 case TCPS_FIN_WAIT_1:
1138 case TCPS_FIN_WAIT_2:
1139 case TCPS_CLOSE_WAIT:
1140 so->so_error = ECONNRESET;
1141 close:
1142 tp->t_state = TCPS_CLOSED;
1143 tcpstat.tcps_drops++;
1144 tp = tcp_close(tp);
1145 break;
1146
1147 case TCPS_CLOSING:
1148 case TCPS_LAST_ACK:
1149 tp = tcp_close(tp);
1150 break;
1151
1152 case TCPS_TIME_WAIT:
1153 break;
1154 }
1155 }
1156 goto drop;
1157 }
1158
1159 /*
1160 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1161 * and it's less than ts_recent, drop it.
1162 */
1163 if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent &&
1164 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1165
1166 /* Check to see if ts_recent is over 24 days old. */
1167 if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1168 /*
1169 * Invalidate ts_recent. If this segment updates
1170 * ts_recent, the age will be reset later and ts_recent
1171 * will get a valid value. If it does not, setting
1172 * ts_recent to zero will at least satisfy the
1173 * requirement that zero be placed in the timestamp
1174 * echo reply when ts_recent isn't valid. The
1175 * age isn't reset until we get a valid ts_recent
1176 * because we don't want out-of-order segments to be
1177 * dropped when ts_recent is old.
1178 */
1179 tp->ts_recent = 0;
1180 } else {
1181 tcpstat.tcps_rcvduppack++;
1182 tcpstat.tcps_rcvdupbyte += ti->ti_len;
1183 tcpstat.tcps_pawsdrop++;
1184 goto dropafterack;
1185 }
1186 }
1187
1188 /*
1189 * T/TCP mechanism
1190 * If T/TCP was negotiated and the segment doesn't have CC,
1191 * or if its CC is wrong then drop the segment.
1192 * RST segments do not have to comply with this.
1193 */
1194 if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) &&
1195 ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc))
1196 goto dropafterack;
1197
1198 /*
1199 * In the SYN-RECEIVED state, validate that the packet belongs to
1200 * this connection before trimming the data to fit the receive
1201 * window. Check the sequence number versus IRS since we know
1202 * the sequence numbers haven't wrapped. This is a partial fix
1203 * for the "LAND" DoS attack.
1204 */
1205 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(ti->ti_seq, tp->irs))
1206 goto dropwithreset;
1207
1208 todrop = tp->rcv_nxt - ti->ti_seq;
1209 if (todrop > 0) {
1210 if (tiflags & TH_SYN) {
1211 tiflags &= ~TH_SYN;
1212 ti->ti_seq++;
1213 if (ti->ti_urp > 1)
1214 ti->ti_urp--;
1215 else
1216 tiflags &= ~TH_URG;
1217 todrop--;
1218 }
1219 /*
1220 * Following if statement from Stevens, vol. 2, p. 960.
1221 */
1222 if (todrop > ti->ti_len
1223 || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) {
1224 /*
1225 * Any valid FIN must be to the left of the window.
1226 * At this point the FIN must be a duplicate or out
1227 * of sequence; drop it.
1228 */
1229 tiflags &= ~TH_FIN;
1230
1231 /*
1232 * Send an ACK to resynchronize and drop any data.
1233 * But keep on processing for RST or ACK.
1234 */
1235 tp->t_flags |= TF_ACKNOW;
1236 todrop = ti->ti_len;
1237 tcpstat.tcps_rcvduppack++;
1238 tcpstat.tcps_rcvdupbyte += todrop;
1239 } else {
1240 tcpstat.tcps_rcvpartduppack++;
1241 tcpstat.tcps_rcvpartdupbyte += todrop;
1242 }
1243 m_adj(m, todrop);
1244 ti->ti_seq += todrop;
1245 ti->ti_len -= todrop;
1246 if (ti->ti_urp > todrop)
1247 ti->ti_urp -= todrop;
1248 else {
1249 tiflags &= ~TH_URG;
1250 ti->ti_urp = 0;
1251 }
1252 }
1253
1254 /*
1255 * If new data are received on a connection after the
1256 * user processes are gone, then RST the other end.
1257 */
1258 if ((so->so_state & SS_NOFDREF) &&
1259 tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
1260 tp = tcp_close(tp);
1261 tcpstat.tcps_rcvafterclose++;
1262 goto dropwithreset;
1263 }
1264
1265 /*
1266 * If segment ends after window, drop trailing data
1267 * (and PUSH and FIN); if nothing left, just ACK.
1268 */
1269 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
1270 if (todrop > 0) {
1271 tcpstat.tcps_rcvpackafterwin++;
1272 if (todrop >= ti->ti_len) {
1273 tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
1274 /*
1275 * If a new connection request is received
1276 * while in TIME_WAIT, drop the old connection
1277 * and start over if the sequence numbers
1278 * are above the previous ones.
1279 */
1280 if (tiflags & TH_SYN &&
1281 tp->t_state == TCPS_TIME_WAIT &&
1282 SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
1283 iss = tp->snd_nxt + TCP_ISSINCR;
1284 tp = tcp_close(tp);
1285 goto findpcb;
1286 }
1287 /*
1288 * If window is closed can only take segments at
1289 * window edge, and have to drop data and PUSH from
1290 * incoming segments. Continue processing, but
1291 * remember to ack. Otherwise, drop segment
1292 * and ack.
1293 */
1294 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
1295 tp->t_flags |= TF_ACKNOW;
1296 tcpstat.tcps_rcvwinprobe++;
1297 } else
1298 goto dropafterack;
1299 } else
1300 tcpstat.tcps_rcvbyteafterwin += todrop;
1301 m_adj(m, -todrop);
1302 ti->ti_len -= todrop;
1303 tiflags &= ~(TH_PUSH|TH_FIN);
1304 }
1305
1306 /*
1307 * If last ACK falls within this segment's sequence numbers,
1308 * record its timestamp.
1309 * NOTE that the test is modified according to the latest
1310 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1311 */
1312 if ((to.to_flag & TOF_TS) != 0 &&
1313 SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) {
1314 tp->ts_recent_age = ticks;
1315 tp->ts_recent = to.to_tsval;
1316 }
1317
1318 /*
1319 * If a SYN is in the window, then this is an
1320 * error and we send an RST and drop the connection.
1321 */
1322 if (tiflags & TH_SYN) {
1323 tp = tcp_drop(tp, ECONNRESET);
1324 goto dropwithreset;
1325 }
1326
1327 /*
1328 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
1329 * flag is on (half-synchronized state), then queue data for
1330 * later processing; else drop segment and return.
1331 */
1332 if ((tiflags & TH_ACK) == 0) {
1333 if (tp->t_state == TCPS_SYN_RECEIVED ||
1334 (tp->t_flags & TF_NEEDSYN))
1335 goto step6;
1336 else
1337 goto drop;
1338 }
1339
1340 /*
1341 * Ack processing.
1342 */
1343 switch (tp->t_state) {
1344
1345 /*
1346 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1347 * ESTABLISHED state and continue processing.
1348 * The ACK was checked above.
1349 */
1350 case TCPS_SYN_RECEIVED:
1351
1352 tcpstat.tcps_connects++;
1353 soisconnected(so);
1354 /* Do window scaling? */
1355 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1356 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1357 tp->snd_scale = tp->requested_s_scale;
1358 tp->rcv_scale = tp->request_r_scale;
1359 }
1360 /*
1361 * Upon successful completion of 3-way handshake,
1362 * update cache.CC if it was undefined, pass any queued
1363 * data to the user, and advance state appropriately.
1364 */
1365 if ((taop = tcp_gettaocache(inp)) != NULL &&
1366 taop->tao_cc == 0)
1367 taop->tao_cc = tp->cc_recv;
1368
1369 /*
1370 * Make transitions:
1371 * SYN-RECEIVED -> ESTABLISHED
1372 * SYN-RECEIVED* -> FIN-WAIT-1
1373 */
1374 tp->t_starttime = ticks;
1375 if (tp->t_flags & TF_NEEDFIN) {
1376 tp->t_state = TCPS_FIN_WAIT_1;
1377 tp->t_flags &= ~TF_NEEDFIN;
1378 } else {
1379 tp->t_state = TCPS_ESTABLISHED;
1380 callout_reset(tp->tt_keep, tcp_keepidle,
1381 tcp_timer_keep, tp);
1382 }
1383 /*
1384 * If segment contains data or ACK, will call tcp_reass()
1385 * later; if not, do so now to pass queued data to user.
1386 */
1387 if (ti->ti_len == 0 && (tiflags & TH_FIN) == 0)
1388 (void) tcp_reass(tp, (struct tcpiphdr *)0,
1389 (struct mbuf *)0);
1390 tp->snd_wl1 = ti->ti_seq - 1;
1391 /* fall into ... */
1392
1393 /*
1394 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1395 * ACKs. If the ack is in the range
1396 * tp->snd_una < ti->ti_ack <= tp->snd_max
1397 * then advance tp->snd_una to ti->ti_ack and drop
1398 * data from the retransmission queue. If this ACK reflects
1399 * more up to date window information we update our window information.
1400 */
1401 case TCPS_ESTABLISHED:
1402 case TCPS_FIN_WAIT_1:
1403 case TCPS_FIN_WAIT_2:
1404 case TCPS_CLOSE_WAIT:
1405 case TCPS_CLOSING:
1406 case TCPS_LAST_ACK:
1407 case TCPS_TIME_WAIT:
1408
1409 if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
1410 if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
1411 tcpstat.tcps_rcvdupack++;
1412 /*
1413 * If we have outstanding data (other than
1414 * a window probe), this is a completely
1415 * duplicate ack (ie, window info didn't
1416 * change), the ack is the biggest we've
1417 * seen and we've seen exactly our rexmt
1418			 * threshold of them, assume a packet
1419 * has been dropped and retransmit it.
1420 * Kludge snd_nxt & the congestion
1421 * window so we send only this one
1422 * packet.
1423 *
1424 * We know we're losing at the current
1425 * window size so do congestion avoidance
1426 * (set ssthresh to half the current window
1427 * and pull our congestion window back to
1428 * the new ssthresh).
1429 *
1430 * Dup acks mean that packets have left the
1431 * network (they're now cached at the receiver)
1432 * so bump cwnd by the amount in the receiver
1433 * to keep a constant cwnd packets in the
1434 * network.
1435 */
1436 if (!callout_active(tp->tt_rexmt) ||
1437 ti->ti_ack != tp->snd_una)
1438 tp->t_dupacks = 0;
1439 else if (++tp->t_dupacks == tcprexmtthresh) {
1440 tcp_seq onxt = tp->snd_nxt;
1441 u_int win =
1442 min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1443 tp->t_maxseg;
1444
1445 if (win < 2)
1446 win = 2;
1447 tp->snd_ssthresh = win * tp->t_maxseg;
1448 callout_stop(tp->tt_rexmt);
1449 tp->t_rtttime = 0;
1450 tp->snd_nxt = ti->ti_ack;
1451 tp->snd_cwnd = tp->t_maxseg;
1452 (void) tcp_output(tp);
1453 tp->snd_cwnd = tp->snd_ssthresh +
1454 tp->t_maxseg * tp->t_dupacks;
1455 if (SEQ_GT(onxt, tp->snd_nxt))
1456 tp->snd_nxt = onxt;
1457 goto drop;
1458 } else if (tp->t_dupacks > tcprexmtthresh) {
1459 tp->snd_cwnd += tp->t_maxseg;
1460 (void) tcp_output(tp);
1461 goto drop;
1462 }
1463 } else
1464 tp->t_dupacks = 0;
1465 break;
1466 }
1467 /*
1468 * If the congestion window was inflated to account
1469 * for the other side's cached packets, retract it.
1470 */
1471 if (tp->t_dupacks >= tcprexmtthresh &&
1472 tp->snd_cwnd > tp->snd_ssthresh)
1473 tp->snd_cwnd = tp->snd_ssthresh;
1474 tp->t_dupacks = 0;
1475 if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
1476 tcpstat.tcps_rcvacktoomuch++;
1477 goto dropafterack;
1478 }
1479 /*
1480 * If we reach this point, ACK is not a duplicate,
1481 * i.e., it ACKs something we sent.
1482 */
1483 if (tp->t_flags & TF_NEEDSYN) {
1484 /*
1485 * T/TCP: Connection was half-synchronized, and our
1486 * SYN has been ACK'd (so connection is now fully
1487 * synchronized). Go to non-starred state,
1488 * increment snd_una for ACK of SYN, and check if
1489 * we can do window scaling.
1490 */
1491 tp->t_flags &= ~TF_NEEDSYN;
1492 tp->snd_una++;
1493 /* Do window scaling? */
1494 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1495 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1496 tp->snd_scale = tp->requested_s_scale;
1497 tp->rcv_scale = tp->request_r_scale;
1498 }
1499 }
1500
1501process_ACK:
1502 acked = ti->ti_ack - tp->snd_una;
1503 tcpstat.tcps_rcvackpack++;
1504 tcpstat.tcps_rcvackbyte += acked;
1505
1506 /*
1507 * If we just performed our first retransmit, and the ACK
1508 * arrives within our recovery window, then it was a mistake
1509 * to do the retransmit in the first place. Recover our
1510 * original cwnd and ssthresh, and proceed to transmit where
1511 * we left off.
1512 */
1513 if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
1514 tp->snd_cwnd = tp->snd_cwnd_prev;
1515 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1516 tp->snd_nxt = tp->snd_max;
1517 tp->t_badrxtwin = 0; /* XXX probably not required */
1518 }
1519
1520 /*
1521 * If we have a timestamp reply, update smoothed
1522 * round trip time. If no timestamp is present but
1523 * transmit timer is running and timed sequence
1524 * number was acked, update smoothed round trip time.
1525 * Since we now have an rtt measurement, cancel the
1526 * timer backoff (cf., Phil Karn's retransmit alg.).
1527 * Recompute the initial retransmit timer.
1528 */
1529 if (to.to_flag & TOF_TS)
1530 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
1531 else if (tp->t_rtttime && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1532 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
1533
1534 /*
1535 * If all outstanding data is acked, stop retransmit
1536 * timer and remember to restart (more output or persist).
1537 * If there is more data to be acked, restart retransmit
1538 * timer, using current (possibly backed-off) value.
1539 */
1540 if (ti->ti_ack == tp->snd_max) {
1541 callout_stop(tp->tt_rexmt);
1542 needoutput = 1;
1543 } else if (!callout_active(tp->tt_persist))
1544 callout_reset(tp->tt_rexmt, tp->t_rxtcur,
1545 tcp_timer_rexmt, tp);
1546
1547 /*
1548 * If no data (only SYN) was ACK'd,
1549 * skip rest of ACK processing.
1550 */
1551 if (acked == 0)
1552 goto step6;
1553
1554 /*
1555 * When new data is acked, open the congestion window.
1556 * If the window gives us less than ssthresh packets
1557 * in flight, open exponentially (maxseg per packet).
1558 * Otherwise open linearly: maxseg per window
1559 * (maxseg^2 / cwnd per packet).
1560 */
1561 {
1562 register u_int cw = tp->snd_cwnd;
1563 register u_int incr = tp->t_maxseg;
1564
1565 if (cw > tp->snd_ssthresh)
1566 incr = incr * incr / cw;
1567 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale);
1568 }
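	/*
	 * Example of the increase above (illustrative numbers): with
	 * t_maxseg = 1460 and snd_cwnd = 14600 above ssthresh, incr is
	 * 1460 * 1460 / 14600 = 146 bytes per ACK, i.e. roughly one
	 * additional segment per round trip; below ssthresh the full
	 * maxseg is added per ACK, doubling the window each round trip.
	 */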
1569 if (acked > so->so_snd.sb_cc) {
1570 tp->snd_wnd -= so->so_snd.sb_cc;
1571 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1572 ourfinisacked = 1;
1573 } else {
1574 sbdrop(&so->so_snd, acked);
1575 tp->snd_wnd -= acked;
1576 ourfinisacked = 0;
1577 }
1578 sowwakeup(so);
1579 tp->snd_una = ti->ti_ack;
1580 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1581 tp->snd_nxt = tp->snd_una;
1582
1583 switch (tp->t_state) {
1584
1585 /*
1586 * In FIN_WAIT_1 STATE in addition to the processing
1587 * for the ESTABLISHED state if our FIN is now acknowledged
1588 * then enter FIN_WAIT_2.
1589 */
1590 case TCPS_FIN_WAIT_1:
1591 if (ourfinisacked) {
1592 /*
1593 * If we can't receive any more
1594 * data, then closing user can proceed.
1595 * Starting the timer is contrary to the
1596 * specification, but if we don't get a FIN
1597 * we'll hang forever.
1598 */
1599 if (so->so_state & SS_CANTRCVMORE) {
1600 soisdisconnected(so);
1601 callout_reset(tp->tt_2msl, tcp_maxidle,
1602 tcp_timer_2msl, tp);
1603 }
1604 tp->t_state = TCPS_FIN_WAIT_2;
1605 }
1606 break;
1607
1608 /*
1609 * In CLOSING STATE in addition to the processing for
1610 * the ESTABLISHED state if the ACK acknowledges our FIN
1611 * then enter the TIME-WAIT state, otherwise ignore
1612 * the segment.
1613 */
1614 case TCPS_CLOSING:
1615 if (ourfinisacked) {
1616 tp->t_state = TCPS_TIME_WAIT;
1617 tcp_canceltimers(tp);
1618 /* Shorten TIME_WAIT [RFC-1644, p.28] */
1619 if (tp->cc_recv != 0 &&
1620 (ticks - tp->t_starttime) < tcp_msl)
1621 callout_reset(tp->tt_2msl,
1622 tp->t_rxtcur *
1623 TCPTV_TWTRUNC,
1624 tcp_timer_2msl, tp);
1625 else
1626 callout_reset(tp->tt_2msl, 2 * tcp_msl,
1627 tcp_timer_2msl, tp);
1628 soisdisconnected(so);
1629 }
1630 break;
1631
1632 /*
1633 * In LAST_ACK, we may still be waiting for data to drain
1634 * and/or to be acked, as well as for the ack of our FIN.
1635 * If our FIN is now acknowledged, delete the TCB,
1636 * enter the closed state and return.
1637 */
1638 case TCPS_LAST_ACK:
1639 if (ourfinisacked) {
1640 tp = tcp_close(tp);
1641 goto drop;
1642 }
1643 break;
1644
1645 /*
1646 * In TIME_WAIT state the only thing that should arrive
1647 * is a retransmission of the remote FIN. Acknowledge
1648 * it and restart the finack timer.
1649 */
1650 case TCPS_TIME_WAIT:
1651 callout_reset(tp->tt_2msl, 2 * tcp_msl,
1652 tcp_timer_2msl, tp);
1653 goto dropafterack;
1654 }
1655 }
1656
1657step6:
1658 /*
1659 * Update window information.
1660 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1661 */
1662 if ((tiflags & TH_ACK) &&
1663 (SEQ_LT(tp->snd_wl1, ti->ti_seq) ||
1664 (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) ||
1665 (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) {
1666 /* keep track of pure window updates */
1667 if (ti->ti_len == 0 &&
1668 tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)
1669 tcpstat.tcps_rcvwinupd++;
1670 tp->snd_wnd = tiwin;
1671 tp->snd_wl1 = ti->ti_seq;
1672 tp->snd_wl2 = ti->ti_ack;
1673 if (tp->snd_wnd > tp->max_sndwnd)
1674 tp->max_sndwnd = tp->snd_wnd;
1675 needoutput = 1;
1676 }
1677
1678 /*
1679 * Process segments with URG.
1680 */
1681 if ((tiflags & TH_URG) && ti->ti_urp &&
1682 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1683 /*
1684 * This is a kludge, but if we receive and accept
1685 * random urgent pointers, we'll crash in
1686 * soreceive. It's hard to imagine someone
1687 * actually wanting to send this much urgent data.
1688 */
1689 if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) {
1690 ti->ti_urp = 0; /* XXX */
1691 tiflags &= ~TH_URG; /* XXX */
1692 goto dodata; /* XXX */
1693 }
1694 /*
1695 * If this segment advances the known urgent pointer,
1696 * then mark the data stream. This should not happen
1697 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1698 * a FIN has been received from the remote side.
1699 * In these states we ignore the URG.
1700 *
1701 * According to RFC961 (Assigned Protocols),
1702 * the urgent pointer points to the last octet
1703 * of urgent data. We continue, however,
1704 * to consider it to indicate the first octet
1705 * of data past the urgent section as the original
1706 * spec states (in one of two places).
1707 */
1708 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
1709 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1710 so->so_oobmark = so->so_rcv.sb_cc +
1711 (tp->rcv_up - tp->rcv_nxt) - 1;
1712 if (so->so_oobmark == 0)
1713 so->so_state |= SS_RCVATMARK;
1714 sohasoutofband(so);
1715 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1716 }
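		/*
		 * Example (illustrative numbers): with 100 bytes already
		 * queued in so_rcv, ti_seq == rcv_nxt and ti_urp = 5,
		 * rcv_up advances by 5 and so_oobmark becomes
		 * 100 + 5 - 1 = 104, the count of in-band bytes ahead of
		 * the out-of-band byte.
		 */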
1717 /*
1718	 * Remove out-of-band data so it doesn't get presented to the user.
1719	 * This can happen independently of advancing the URG pointer,
1720 * but if two URG's are pending at once, some out-of-band
1721 * data may creep in... ick.
1722 */
1723 if (ti->ti_urp <= (u_long)ti->ti_len
1724#ifdef SO_OOBINLINE
1725 && (so->so_options & SO_OOBINLINE) == 0
1726#endif
1727 )
1728 tcp_pulloutofband(so, ti, m);
1729 } else
1730 /*
1731 * If no out of band data is expected,
1732 * pull receive urgent pointer along
1733 * with the receive window.
1734 */
1735 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1736 tp->rcv_up = tp->rcv_nxt;
1737dodata: /* XXX */
1738
1739 /*
1740 * Process the segment text, merging it into the TCP sequencing queue,
1741 * and arranging for acknowledgment of receipt if necessary.
1742 * This process logically involves adjusting tp->rcv_wnd as data
1743 * is presented to the user (this happens in tcp_usrreq.c,
1744 * case PRU_RCVD). If a FIN has already been received on this
1745 * connection then we just ignore the text.
1746 */
1747 if ((ti->ti_len || (tiflags&TH_FIN)) &&
1748 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1749 TCP_REASS(tp, ti, m, so, tiflags);
1750 /*
1751 * Note the amount of data that peer has sent into
1752 * our window, in order to estimate the sender's
1753 * buffer size.
1754 */
1755 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
1756 } else {
1757 m_freem(m);
1758 tiflags &= ~TH_FIN;
1759 }
1760
1761 /*
1762 * If FIN is received ACK the FIN and let the user know
1763 * that the connection is closing.
1764 */
1765 if (tiflags & TH_FIN) {
1766 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1767 socantrcvmore(so);
1768 /*
1769 * If connection is half-synchronized
1770 * (ie NEEDSYN flag on) then delay ACK,
1771 * so it may be piggybacked when SYN is sent.
1772 * Otherwise, since we received a FIN then no
1773 * more input can be expected, send ACK now.
1774 */
1775 if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN))
1776 callout_reset(tp->tt_delack, tcp_delacktime,
1777 tcp_timer_delack, tp);
1778 else
1779 tp->t_flags |= TF_ACKNOW;
1780 tp->rcv_nxt++;
1781 }
1782 switch (tp->t_state) {
1783
1784 /*
1785 * In SYN_RECEIVED and ESTABLISHED STATES
1786 * enter the CLOSE_WAIT state.
1787 */
1788 case TCPS_SYN_RECEIVED:
1789 tp->t_starttime = ticks;
1790 /*FALLTHROUGH*/
1791 case TCPS_ESTABLISHED:
1792 tp->t_state = TCPS_CLOSE_WAIT;
1793 break;
1794
1795 /*
1796 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1797 * enter the CLOSING state.
1798 */
1799 case TCPS_FIN_WAIT_1:
1800 tp->t_state = TCPS_CLOSING;
1801 break;
1802
1803 /*
1804 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1805 * starting the time-wait timer, turning off the other
1806 * standard timers.
1807 */
1808 case TCPS_FIN_WAIT_2:
1809 tp->t_state = TCPS_TIME_WAIT;
1810 tcp_canceltimers(tp);
1811 /* Shorten TIME_WAIT [RFC-1644, p.28] */
1812 if (tp->cc_recv != 0 &&
1813 (ticks - tp->t_starttime) < tcp_msl) {
1814 callout_reset(tp->tt_2msl,
1815 tp->t_rxtcur * TCPTV_TWTRUNC,
1816 tcp_timer_2msl, tp);
1817 /* For transaction client, force ACK now. */
1818 tp->t_flags |= TF_ACKNOW;
1819 }
1820 else
1821 callout_reset(tp->tt_2msl, 2 * tcp_msl,
1822 tcp_timer_2msl, tp);
1823 soisdisconnected(so);
1824 break;
1825
1826 /*
1827 * In TIME_WAIT state restart the 2 MSL time_wait timer.
1828 */
1829 case TCPS_TIME_WAIT:
1830 callout_reset(tp->tt_2msl, 2 * tcp_msl,
1831 tcp_timer_2msl, tp);
1832 break;
1833 }
1834 }
1835#ifdef TCPDEBUG
1836 if (so->so_options & SO_DEBUG)
1837 tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0);
1838#endif
1839
1840 /*
1841 * Return any desired output.
1842 */
1843 if (needoutput || (tp->t_flags & TF_ACKNOW))
1844 (void) tcp_output(tp);
1845 return;
1846
1847dropafterack:
1848 /*
1849 * Generate an ACK dropping incoming segment if it occupies
1850 * sequence space, where the ACK reflects our state.
1851 *
1852 * We can now skip the test for the RST flag since all
1853 * paths to this code happen after packets containing
1854 * RST have been dropped.
1855 *
1856 * In the SYN-RECEIVED state, don't send an ACK unless the
1857 * segment we received passes the SYN-RECEIVED ACK test.
1858 * If it fails send a RST. This breaks the loop in the
1859 * "LAND" DoS attack, and also prevents an ACK storm
1860 * between two listening ports that have been sent forged
1861 * SYN segments, each with the source address of the other.
1862 */
1863 if (tp->t_state == TCPS_SYN_RECEIVED && (tiflags & TH_ACK) &&
1864 (SEQ_GT(tp->snd_una, ti->ti_ack) ||
1865 SEQ_GT(ti->ti_ack, tp->snd_max)) )
1866 goto dropwithreset;
1867#ifdef TCPDEBUG
1868 if (so->so_options & SO_DEBUG)
1869 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0);
1870#endif
1871 m_freem(m);
1872 tp->t_flags |= TF_ACKNOW;
1873 (void) tcp_output(tp);
1874 return;
1875
1876dropwithreset:
1877#ifdef TCP_RESTRICT_RST
1878 if (restrict_rst)
1879 goto drop;
1880#endif
1881 /*
1882 * Generate a RST, dropping incoming segment.
1883 * Make ACK acceptable to originator of segment.
1884 * Don't bother to respond if destination was broadcast/multicast.
1885 */
1886 if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) ||
1887 IN_MULTICAST(ntohl(ti->ti_dst.s_addr)))
1888 goto drop;
1889#ifdef TCPDEBUG
1890 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1891 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0);
1892#endif
1893 if (tiflags & TH_ACK)
1894 tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1895 else {
1896 if (tiflags & TH_SYN)
1897 ti->ti_len++;
1898 tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1899 TH_RST|TH_ACK);
1900 }
1901 /* destroy temporarily created socket */
1902 if (dropsocket)
1903 (void) soabort(so);
1904 return;
1905
1906drop:
1907 /*
1908 * Drop space held by incoming segment and return.
1909 */
1910#ifdef TCPDEBUG
1911 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1912 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0);
1913#endif
1914 m_freem(m);
1915 /* destroy temporarily created socket */
1916 if (dropsocket)
1917 (void) soabort(so);
1918 return;
1919}
1920
1921static void
1922tcp_dooptions(tp, cp, cnt, ti, to)
1923 struct tcpcb *tp;
1924 u_char *cp;
1925 int cnt;
1926 struct tcpiphdr *ti;
1927 struct tcpopt *to;
1928{
1929 u_short mss = 0;
1930 int opt, optlen;
1931
1932 for (; cnt > 0; cnt -= optlen, cp += optlen) {
1933 opt = cp[0];
1934 if (opt == TCPOPT_EOL)
1935 break;
1936 if (opt == TCPOPT_NOP)
1937 optlen = 1;
1938 else {
1939 optlen = cp[1];
1940 if (optlen <= 0)
1941 break;
1942 }
1943 switch (opt) {
1944
1945 default:
1946 continue;
1947
1948 case TCPOPT_MAXSEG:
1949 if (optlen != TCPOLEN_MAXSEG)
1950 continue;
1951 if (!(ti->ti_flags & TH_SYN))
1952 continue;
1953 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
1954 NTOHS(mss);
1955 break;
1956
1957 case TCPOPT_WINDOW:
1958 if (optlen != TCPOLEN_WINDOW)
1959 continue;
1960 if (!(ti->ti_flags & TH_SYN))
1961 continue;
1962 tp->t_flags |= TF_RCVD_SCALE;
1963 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1964 break;
1965
1966 case TCPOPT_TIMESTAMP:
1967 if (optlen != TCPOLEN_TIMESTAMP)
1968 continue;
1969 to->to_flag |= TOF_TS;
1970 bcopy((char *)cp + 2,
1971 (char *)&to->to_tsval, sizeof(to->to_tsval));
1972 NTOHL(to->to_tsval);
1973 bcopy((char *)cp + 6,
1974 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
1975 NTOHL(to->to_tsecr);
1976
1977 /*
1978 * A timestamp received in a SYN makes
1979 * it ok to send timestamp requests and replies.
1980 */
1981 if (ti->ti_flags & TH_SYN) {
1982 tp->t_flags |= TF_RCVD_TSTMP;
1983 tp->ts_recent = to->to_tsval;
1984 tp->ts_recent_age = ticks;
1985 }
1986 break;
1987 case TCPOPT_CC:
1988 if (optlen != TCPOLEN_CC)
1989 continue;
1990 to->to_flag |= TOF_CC;
1991 bcopy((char *)cp + 2,
1992 (char *)&to->to_cc, sizeof(to->to_cc));
1993 NTOHL(to->to_cc);
1994 /*
1995 * A CC or CC.new option received in a SYN makes
1996 * it ok to send CC in subsequent segments.
1997 */
1998 if (ti->ti_flags & TH_SYN)
1999 tp->t_flags |= TF_RCVD_CC;
2000 break;
2001 case TCPOPT_CCNEW:
2002 if (optlen != TCPOLEN_CC)
2003 continue;
2004 if (!(ti->ti_flags & TH_SYN))
2005 continue;
2006 to->to_flag |= TOF_CCNEW;
2007 bcopy((char *)cp + 2,
2008 (char *)&to->to_cc, sizeof(to->to_cc));
2009 NTOHL(to->to_cc);
2010 /*
2011 * A CC or CC.new option received in a SYN makes
2012 * it ok to send CC in subsequent segments.
2013 */
2014 tp->t_flags |= TF_RCVD_CC;
2015 break;
2016 case TCPOPT_CCECHO:
2017 if (optlen != TCPOLEN_CC)
2018 continue;
2019 if (!(ti->ti_flags & TH_SYN))
2020 continue;
2021 to->to_flag |= TOF_CCECHO;
2022 bcopy((char *)cp + 2,
2023 (char *)&to->to_ccecho, sizeof(to->to_ccecho));
2024 NTOHL(to->to_ccecho);
2025 break;
2026 }
2027 }
2028 if (ti->ti_flags & TH_SYN)
2029 tcp_mss(tp, mss); /* sets t_maxseg */
2030}
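
/*
 * A minimal stand-alone sketch of the same kind/length walk done by
 * tcp_dooptions() above (illustrative only; it is not used anywhere in
 * this file and "count_tcp_options" is a made-up name).  It returns the
 * number of non-NOP options in a raw option block, bailing out on a
 * malformed length much as the loop above stops on optlen <= 0.
 */
static int
count_tcp_options(cp, cnt)
	u_char *cp;
	int cnt;
{
	int opt, optlen, n = 0;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen <= 0 || optlen > cnt)
				break;
			n++;
		}
	}
	return (n);
}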
2031
2032/*
2033 * Pull out of band byte out of a segment so
2034 * it doesn't appear in the user's data queue.
2035 * It is still reflected in the segment length for
2036 * sequencing purposes.
2037 */
2038static void
2039tcp_pulloutofband(so, ti, m)
2040 struct socket *so;
2041 struct tcpiphdr *ti;
2042 register struct mbuf *m;
2043{
2044 int cnt = ti->ti_urp - 1;
2045
2046 while (cnt >= 0) {
2047 if (m->m_len > cnt) {
2048 char *cp = mtod(m, caddr_t) + cnt;
2049 struct tcpcb *tp = sototcpcb(so);
2050
2051 tp->t_iobc = *cp;
2052 tp->t_oobflags |= TCPOOB_HAVEDATA;
2053 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2054 m->m_len--;
2055 return;
2056 }
2057 cnt -= m->m_len;
2058 m = m->m_next;
2059 if (m == 0)
2060 break;
2061 }
2062 panic("tcp_pulloutofband");
2063}
2064
2065/*
2066 * Collect new round-trip time estimate
2067 * and update averages and current timeout.
2068 */
2069static void
2070tcp_xmit_timer(tp, rtt)
2071 register struct tcpcb *tp;
2072 int rtt;
2073{
2074 register int delta;
2075
2076 tcpstat.tcps_rttupdated++;
2077 tp->t_rttupdated++;
2078 if (tp->t_srtt != 0) {
2079 /*
2080 * srtt is stored as fixed point with 5 bits after the
2081	 * binary point (i.e., scaled by 32).  The following magic
2082 * is equivalent to the smoothing algorithm in rfc793 with
2083 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2084 * point). Adjust rtt to origin 0.
2085 */
2086 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2087 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2088
2089 if ((tp->t_srtt += delta) <= 0)
2090 tp->t_srtt = 1;
2091
2092 /*
2093 * We accumulate a smoothed rtt variance (actually, a
2094 * smoothed mean difference), then set the retransmit
2095 * timer to smoothed rtt + 4 times the smoothed variance.
2096 * rttvar is stored as fixed point with 4 bits after the
2097 * binary point (scaled by 16). The following is
2098 * equivalent to rfc793 smoothing with an alpha of .75
2099 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
2100 * rfc793's wired-in beta.
2101 */
2102 if (delta < 0)
2103 delta = -delta;
2104 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2105 if ((tp->t_rttvar += delta) <= 0)
2106 tp->t_rttvar = 1;
2107 } else {
2108 /*
2109 * No rtt measurement yet - use the unsmoothed rtt.
2110 * Set the variance to half the rtt (so our first
2111 * retransmit happens at 3*rtt).
2112 */
2113 tp->t_srtt = rtt << TCP_RTT_SHIFT;
2114 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2115 }
2116 tp->t_rtttime = 0;
2117 tp->t_rxtshift = 0;
2118
2119 /*
2120 * the retransmit should happen at rtt + 4 * rttvar.
2121 * Because of the way we do the smoothing, srtt and rttvar
2122 * will each average +1/2 tick of bias. When we compute
2123 * the retransmit timer, we want 1/2 tick of rounding and
2124 * 1 extra tick because of +-1/2 tick uncertainty in the
2125 * firing of the timer. The bias will give us exactly the
2126 * 1.5 tick we need. But, because the bias is
2127 * statistical, we have to test that we don't drop below
2128 * the minimum feasible timer (which is 2 ticks).
2129 */
2130 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2131 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2132
2133 /*
2134 * We received an ack for a packet that wasn't retransmitted;
2135 * it is probably safe to discard any error indications we've
2136 * received recently. This isn't quite right, but close enough
2137 * for now (a route might have failed after we sent a segment,
2138 * and the return path might not be symmetrical).
2139 */
2140 tp->t_softerror = 0;
2141}
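
/*
 * For reference, a minimal sketch of the classic integer smoothing the
 * comments above describe, using the traditional scale factors of 8 for
 * srtt and 4 for rttvar (an assumption made for readability here; the
 * kernel code above uses the TCP_RTT_SHIFT/TCP_RTTVAR_SHIFT
 * representation instead).  Illustrative only, never called.
 */
static void
rtt_update_sketch(rtt, srtt8, rttvar4, rto)
	int rtt;
	int *srtt8, *rttvar4, *rto;
{
	int delta;

	if (*srtt8 != 0) {
		delta = rtt - (*srtt8 >> 3);	/* error vs. smoothed rtt */
		*srtt8 += delta;		/* gain 1/8: srtt = 7/8 srtt + 1/8 rtt */
		if (delta < 0)
			delta = -delta;
		*rttvar4 += delta - (*rttvar4 >> 2); /* gain 1/4 on |error| */
	} else {
		*srtt8 = rtt << 3;		/* first sample: srtt = rtt */
		*rttvar4 = rtt << 1;		/* rttvar = rtt / 2 */
	}
	*rto = (*srtt8 >> 3) + *rttvar4;	/* srtt + 4 * rttvar */
}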
2142
2143/*
2144 * Determine a reasonable value for maxseg size.
2145 * If the route is known, check route for mtu.
2146 * If none, use an mss that can be handled on the outgoing
2147 * interface without forcing IP to fragment; if bigger than
2148 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2149 * to utilize large mbufs. If no route is found, route has no mtu,
2150 * or the destination isn't local, use a default, hopefully conservative
2151 * size (usually 512 or the default IP max size, but no more than the mtu
2152 * of the interface), as we can't discover anything about intervening
2153 * gateways or networks. We also initialize the congestion/slow start
2154 * window to be a single segment if the destination isn't local.
2155 * While looking at the routing entry, we also initialize other path-dependent
2156 * parameters from pre-set or cached values in the routing entry.
2157 *
2158 * Also take into account the space needed for options that we
2159 * send regularly. Make maxseg shorter by that amount to assure
2160 * that we can send maxseg amount of data even when the options
2161 * are present. Store the upper limit of the length of options plus
2162 * data in maxopd.
2163 *
2164 * NOTE that this routine is only called when we process an incoming
2165 * segment, for outgoing segments only tcp_mssopt is called.
2166 *
2167 * In case of T/TCP, we call this routine during implicit connection
2168 * setup as well (offer = -1), to initialize maxseg from the cached
2169 * MSS of our peer.
2170 */
2171void
2172tcp_mss(tp, offer)
2173 struct tcpcb *tp;
2174 int offer;
2175{
2176 register struct rtentry *rt;
2177 struct ifnet *ifp;
2178 register int rtt, mss;
2179 u_long bufsize;
2180 struct inpcb *inp;
2181 struct socket *so;
2182 struct rmxp_tao *taop;
2183 int origoffer = offer;
2184
2185 inp = tp->t_inpcb;
2186 if ((rt = tcp_rtlookup(inp)) == NULL) {
2187 tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
2188 return;
2189 }
2190 ifp = rt->rt_ifp;
2191 so = inp->inp_socket;
2192
2193 taop = rmx_taop(rt->rt_rmx);
2194 /*
2195 * Offer == -1 means that we didn't receive SYN yet,
2196 * use cached value in that case;
2197 */
2198 if (offer == -1)
2199 offer = taop->tao_mssopt;
2200 /*
2201 * Offer == 0 means that there was no MSS on the SYN segment,
2202 * in this case we use tcp_mssdflt.
2203 */
2204 if (offer == 0)
2205 offer = tcp_mssdflt;
2206 else
2207 /*
2208 * Sanity check: make sure that maxopd will be large
2209		 * enough to allow some data on segments even if all
2210		 * the option space is used (40 bytes).  Otherwise
2211 * funny things may happen in tcp_output.
2212 */
2213 offer = max(offer, 64);
2214 taop->tao_mssopt = offer;
2215
2216 /*
2217 * While we're here, check if there's an initial rtt
2218 * or rttvar. Convert from the route-table units
2219 * to scaled multiples of the slow timeout timer.
2220 */
2221 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
2222 /*
2223 * XXX the lock bit for RTT indicates that the value
2224 * is also a minimum value; this is subject to time.
2225 */
2226 if (rt->rt_rmx.rmx_locks & RTV_RTT)
2227 tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
2228 tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
2229 tcpstat.tcps_usedrtt++;
2230 if (rt->rt_rmx.rmx_rttvar) {
2231 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
2232 (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
2233 tcpstat.tcps_usedrttvar++;
2234 } else {
2235 /* default variation is +- 1 rtt */
2236 tp->t_rttvar =
2237 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
2238 }
2239 TCPT_RANGESET(tp->t_rxtcur,
2240 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
2241 tp->t_rttmin, TCPTV_REXMTMAX);
2242 }
2243 /*
2244 * if there's an mtu associated with the route, use it
2245 */
2246 if (rt->rt_rmx.rmx_mtu)
2247 mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr);
2248 else
2249 {
2250 mss = ifp->if_mtu - sizeof(struct tcpiphdr);
2251 if (!in_localaddr(inp->inp_faddr))
2252 mss = min(mss, tcp_mssdflt);
2253 }
2254 mss = min(mss, offer);
2255 /*
2256 * maxopd stores the maximum length of data AND options
2257 * in a segment; maxseg is the amount of data in a normal
2258 * segment. We need to store this value (maxopd) apart
2259 * from maxseg, because now every segment carries options
2260 * and thus we normally have somewhat less data in segments.
2261 */
2262 tp->t_maxopd = mss;
2263
2264 /*
2265	 * In case of T/TCP, origoffer==-1 indicates that no segments
2266 * were received yet. In this case we just guess, otherwise
2267 * we do the same as before T/TCP.
2268 */
2269 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2270 (origoffer == -1 ||
2271 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
2272 mss -= TCPOLEN_TSTAMP_APPA;
2273 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
2274 (origoffer == -1 ||
2275 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
2276 mss -= TCPOLEN_CC_APPA;
2277
2278#if (MCLBYTES & (MCLBYTES - 1)) == 0
2279 if (mss > MCLBYTES)
2280 mss &= ~(MCLBYTES-1);
2281#else
2282 if (mss > MCLBYTES)
2283 mss = mss / MCLBYTES * MCLBYTES;
2284#endif
2285 /*
2286 * If there's a pipesize, change the socket buffer
2287 * to that size. Make the socket buffers an integral
2288 * number of mss units; if the mss is larger than
2289 * the socket buffer, decrease the mss.
2290 */
2291#ifdef RTV_SPIPE
2292 if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
2293#endif
2294 bufsize = so->so_snd.sb_hiwat;
2295 if (bufsize < mss)
2296 mss = bufsize;
2297 else {
2298 bufsize = roundup(bufsize, mss);
2299 if (bufsize > sb_max)
2300 bufsize = sb_max;
2301 (void)sbreserve(&so->so_snd, bufsize, so, NULL);
2302 }
2303 tp->t_maxseg = mss;
2304
2305#ifdef RTV_RPIPE
2306 if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
2307#endif
2308 bufsize = so->so_rcv.sb_hiwat;
2309 if (bufsize > mss) {
2310 bufsize = roundup(bufsize, mss);
2311 if (bufsize > sb_max)
2312 bufsize = sb_max;
2313 (void)sbreserve(&so->so_rcv, bufsize, so, NULL);
2314 }
2315
2316 /*
2317 * Set the slow-start flight size depending on whether this
2318 * is a local network or not.
2319 */
2320 if (in_localaddr(inp->inp_faddr))
2321 tp->snd_cwnd = mss * ss_fltsz_local;
2322 else
2323 tp->snd_cwnd = mss * ss_fltsz;
2324
2325 if (rt->rt_rmx.rmx_ssthresh) {
2326 /*
2327 * There's some sort of gateway or interface
2328 * buffer limit on the path. Use this to set
2329		 * the slow start threshold, but set the
2330 * threshold to no less than 2*mss.
2331 */
2332 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
2333 tcpstat.tcps_usedssthresh++;
2334 }
2335}
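
/*
 * Worked example for tcp_mss() (illustrative numbers): over local
 * Ethernet with an interface MTU of 1500, no route MTU, and a peer that
 * offered an MSS of at least 1460, mss starts at
 * 1500 - sizeof(struct tcpiphdr) = 1460, which becomes t_maxopd.  If
 * both sides negotiated timestamps, TCPOLEN_TSTAMP_APPA (12) is
 * subtracted, leaving t_maxseg = 1448 bytes of data per segment, and
 * the socket buffers are rounded up to a multiple of that mss.
 */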
2336
2337/*
2338 * Determine the MSS option to send on an outgoing SYN.
2339 */
2340int
2341tcp_mssopt(tp)
2342 struct tcpcb *tp;
2343{
2344 struct rtentry *rt;
2345
2346 rt = tcp_rtlookup(tp->t_inpcb);
2347 if (rt == NULL)
2348 return tcp_mssdflt;
2349
2350 return rt->rt_ifp->if_mtu - sizeof(struct tcpiphdr);
2351}
281 register struct tcpiphdr *ti;
282 register struct inpcb *inp;
283 u_char *optp = NULL;
284 int optlen = 0;
285 int len, tlen, off;
286 register struct tcpcb *tp = 0;
287 register int tiflags;
288 struct socket *so = 0;
289 int todrop, acked, ourfinisacked, needoutput = 0;
290 struct in_addr laddr;
291 int dropsocket = 0;
292 int iss = 0;
293 u_long tiwin;
294 struct tcpopt to; /* options in this segment */
295 struct rmxp_tao *taop; /* pointer to our TAO cache entry */
296 struct rmxp_tao tao_noncached; /* in case there's no cached entry */
297#ifdef TCPDEBUG
298 short ostate = 0;
299#endif
300
301 bzero((char *)&to, sizeof(to));
302
303 tcpstat.tcps_rcvtotal++;
304 /*
305 * Get IP and TCP header together in first mbuf.
306 * Note: IP leaves IP header in first mbuf.
307 */
308 ti = mtod(m, struct tcpiphdr *);
309 if (iphlen > sizeof (struct ip))
310 ip_stripoptions(m, (struct mbuf *)0);
311 if (m->m_len < sizeof (struct tcpiphdr)) {
312 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
313 tcpstat.tcps_rcvshort++;
314 return;
315 }
316 ti = mtod(m, struct tcpiphdr *);
317 }
318
319 /*
320 * Checksum extended TCP header and data.
321 */
322 tlen = ((struct ip *)ti)->ip_len;
323 len = sizeof (struct ip) + tlen;
324 bzero(ti->ti_x1, sizeof(ti->ti_x1));
325 ti->ti_len = (u_short)tlen;
326 HTONS(ti->ti_len);
327 ti->ti_sum = in_cksum(m, len);
328 if (ti->ti_sum) {
329 tcpstat.tcps_rcvbadsum++;
330 goto drop;
331 }
332
333 /*
334 * Check that TCP offset makes sense,
335 * pull out TCP options and adjust length. XXX
336 */
337 off = ti->ti_off << 2;
338 if (off < sizeof (struct tcphdr) || off > tlen) {
339 tcpstat.tcps_rcvbadoff++;
340 goto drop;
341 }
342 tlen -= off;
343 ti->ti_len = tlen;
344 if (off > sizeof (struct tcphdr)) {
345 if (m->m_len < sizeof(struct ip) + off) {
346 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
347 tcpstat.tcps_rcvshort++;
348 return;
349 }
350 ti = mtod(m, struct tcpiphdr *);
351 }
352 optlen = off - sizeof (struct tcphdr);
353 optp = mtod(m, u_char *) + sizeof (struct tcpiphdr);
354 }
355 tiflags = ti->ti_flags;
356
357#ifdef TCP_DROP_SYNFIN
358 /*
359 * If the drop_synfin option is enabled, drop all packets with
360 * both the SYN and FIN bits set. This prevents e.g. nmap from
361 * identifying the TCP/IP stack.
362 *
363 * This is incompatible with RFC1644 extensions (T/TCP).
364 */
365 if (drop_synfin && (tiflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
366 goto drop;
367#endif
368
369 /*
370 * Convert TCP protocol specific fields to host format.
371 */
372 NTOHL(ti->ti_seq);
373 NTOHL(ti->ti_ack);
374 NTOHS(ti->ti_win);
375 NTOHS(ti->ti_urp);
376
377 /*
378 * Drop TCP, IP headers and TCP options.
379 */
380 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
381 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
382
383 /*
384 * Locate pcb for segment.
385 */
386findpcb:
387#ifdef IPFIREWALL_FORWARD
388 if (ip_fw_fwd_addr != NULL) {
389 /*
390 * Diverted. Pretend to be the destination.
391 * already got one like this?
392 */
393 inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport,
394 ti->ti_dst, ti->ti_dport, 0, m->m_pkthdr.rcvif);
395 if (!inp) {
396 /*
397			 * No, then it's new.  Try to find the ambushing socket.
398 */
399 if (!ip_fw_fwd_addr->sin_port) {
400 inp = in_pcblookup_hash(&tcbinfo, ti->ti_src,
401 ti->ti_sport, ip_fw_fwd_addr->sin_addr,
402 ti->ti_dport, 1, m->m_pkthdr.rcvif);
403 } else {
404 inp = in_pcblookup_hash(&tcbinfo,
405 ti->ti_src, ti->ti_sport,
406 ip_fw_fwd_addr->sin_addr,
407 ntohs(ip_fw_fwd_addr->sin_port), 1,
408 m->m_pkthdr.rcvif);
409 }
410 }
411 ip_fw_fwd_addr = NULL;
412 } else
413#endif /* IPFIREWALL_FORWARD */
414
415 inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport,
416 ti->ti_dst, ti->ti_dport, 1, m->m_pkthdr.rcvif);
417
418 /*
419 * If the state is CLOSED (i.e., TCB does not exist) then
420 * all data in the incoming segment is discarded.
421 * If the TCB exists but is in CLOSED state, it is embryonic,
422 * but should either do a listen or a connect soon.
423 */
424 if (inp == NULL) {
425 if (log_in_vain) {
426 char buf[4*sizeof "123"];
427
428 strcpy(buf, inet_ntoa(ti->ti_dst));
429 switch (log_in_vain) {
430 case 1:
431 if(tiflags & TH_SYN)
432 log(LOG_INFO,
433 "Connection attempt to TCP %s:%d from %s:%d\n",
434 buf, ntohs(ti->ti_dport),
435 inet_ntoa(ti->ti_src),
436 ntohs(ti->ti_sport));
437 break;
438 case 2:
439 log(LOG_INFO,
440 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
441 buf, ntohs(ti->ti_dport), inet_ntoa(ti->ti_src),
442 ntohs(ti->ti_sport), tiflags);
443 break;
444 default:
445 break;
446 }
447 }
448#ifdef ICMP_BANDLIM
449 if (badport_bandlim(1) < 0)
450 goto drop;
451#endif
452 if (blackhole) {
453 switch (blackhole) {
454 case 1:
455 if (tiflags & TH_SYN)
456 goto drop;
457 break;
458 case 2:
459 goto drop;
460 default:
461 goto drop;
462 }
463 }
464 goto dropwithreset;
465 }
466 tp = intotcpcb(inp);
467 if (tp == 0)
468 goto dropwithreset;
469 if (tp->t_state == TCPS_CLOSED)
470 goto drop;
471
472 /* Unscale the window into a 32-bit value. */
473 if ((tiflags & TH_SYN) == 0)
474 tiwin = ti->ti_win << tp->snd_scale;
475 else
476 tiwin = ti->ti_win;
477
478 so = inp->inp_socket;
479 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
480#ifdef TCPDEBUG
481 if (so->so_options & SO_DEBUG) {
482 ostate = tp->t_state;
483 tcp_saveti = *ti;
484 }
485#endif
486 if (so->so_options & SO_ACCEPTCONN) {
487 register struct tcpcb *tp0 = tp;
488 struct socket *so2;
489 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
490 /*
491 * Note: dropwithreset makes sure we don't
492 * send a RST in response to a RST.
493 */
494 if (tiflags & TH_ACK) {
495 tcpstat.tcps_badsyn++;
496 goto dropwithreset;
497 }
498 goto drop;
499 }
500 so2 = sonewconn(so, 0);
501 if (so2 == 0) {
502 tcpstat.tcps_listendrop++;
503 so2 = sodropablereq(so);
504 if (so2) {
505 tcp_drop(sototcpcb(so2), ETIMEDOUT);
506 so2 = sonewconn(so, 0);
507 }
508 if (!so2)
509 goto drop;
510 }
511 so = so2;
512 /*
513 * This is ugly, but ....
514 *
515 * Mark socket as temporary until we're
516 * committed to keeping it. The code at
517 * ``drop'' and ``dropwithreset'' check the
518 * flag dropsocket to see if the temporary
519 * socket created here should be discarded.
520 * We mark the socket as discardable until
521 * we're committed to it below in TCPS_LISTEN.
522 */
523 dropsocket++;
524 inp = (struct inpcb *)so->so_pcb;
525 inp->inp_laddr = ti->ti_dst;
526 inp->inp_lport = ti->ti_dport;
527 if (in_pcbinshash(inp) != 0) {
528 /*
529 * Undo the assignments above if we failed to
530 * put the PCB on the hash lists.
531 */
532 inp->inp_laddr.s_addr = INADDR_ANY;
533 inp->inp_lport = 0;
534 goto drop;
535 }
536 inp->inp_options = ip_srcroute();
537 tp = intotcpcb(inp);
538 tp->t_state = TCPS_LISTEN;
539 tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT);
540
541 /* Compute proper scaling value from buffer space */
542 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
543 TCP_MAXWIN << tp->request_r_scale <
544 so->so_rcv.sb_hiwat)
545 tp->request_r_scale++;
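			/*
			 * Example (illustrative): with a 256 KB receive
			 * buffer (sb_hiwat = 262144) the loop stops at
			 * request_r_scale = 3, the first shift for which
			 * TCP_MAXWIN << scale covers the buffer.
			 */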
546 }
547 }
548
549 /*
550 * Segment received on connection.
551 * Reset idle time and keep-alive timer.
552 */
553 tp->t_rcvtime = ticks;
554 if (TCPS_HAVEESTABLISHED(tp->t_state))
555 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
556
557 /*
558 * Process options if not in LISTEN state,
559 * else do it below (after getting remote address).
560 */
561 if (tp->t_state != TCPS_LISTEN)
562 tcp_dooptions(tp, optp, optlen, ti, &to);
563
564 /*
565 * Header prediction: check for the two common cases
566 * of a uni-directional data xfer. If the packet has
567 * no control flags, is in-sequence, the window didn't
568 * change and we're not retransmitting, it's a
569 * candidate. If the length is zero and the ack moved
570 * forward, we're the sender side of the xfer. Just
571 * free the data acked & wake any higher level process
572 * that was blocked waiting for space. If the length
573 * is non-zero and the ack didn't move, we're the
574 * receiver side. If we're getting packets in-order
575 * (the reassembly queue is empty), add the data to
576 * the socket buffer and note that we need a delayed ack.
577 * Make sure that the hidden state-flags are also off.
578 * Since we check for TCPS_ESTABLISHED above, it can only
579 * be TH_NEEDSYN.
580 */
581 if (tp->t_state == TCPS_ESTABLISHED &&
582 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
583 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
584 ((to.to_flag & TOF_TS) == 0 ||
585 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
586 /*
587	 * Using the CC option is compulsory once it has been started:
588 * the segment is OK if no T/TCP was negotiated or
589 * if the segment has a CC option equal to CCrecv
590 */
591 ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) ||
592 ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) &&
593 ti->ti_seq == tp->rcv_nxt &&
594 tiwin && tiwin == tp->snd_wnd &&
595 tp->snd_nxt == tp->snd_max) {
596
597 /*
598 * If last ACK falls within this segment's sequence numbers,
599 * record the timestamp.
600 * NOTE that the test is modified according to the latest
601 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
602 */
603 if ((to.to_flag & TOF_TS) != 0 &&
604 SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) {
605 tp->ts_recent_age = ticks;
606 tp->ts_recent = to.to_tsval;
607 }
608
609 if (ti->ti_len == 0) {
610 if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
611 SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
612 tp->snd_cwnd >= tp->snd_wnd &&
613 tp->t_dupacks < tcprexmtthresh) {
614 /*
615 * this is a pure ack for outstanding data.
616 */
617 ++tcpstat.tcps_predack;
618 /*
619 * "bad retransmit" recovery
620 */
621 if (tp->t_rxtshift == 1 &&
622 ticks < tp->t_badrxtwin) {
623 tp->snd_cwnd = tp->snd_cwnd_prev;
624 tp->snd_ssthresh =
625 tp->snd_ssthresh_prev;
626 tp->snd_nxt = tp->snd_max;
627 tp->t_badrxtwin = 0;
628 }
629 if ((to.to_flag & TOF_TS) != 0)
630 tcp_xmit_timer(tp,
631 ticks - to.to_tsecr + 1);
632 else if (tp->t_rtttime &&
633 SEQ_GT(ti->ti_ack, tp->t_rtseq))
634 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
635 acked = ti->ti_ack - tp->snd_una;
636 tcpstat.tcps_rcvackpack++;
637 tcpstat.tcps_rcvackbyte += acked;
638 sbdrop(&so->so_snd, acked);
639 tp->snd_una = ti->ti_ack;
640 m_freem(m);
641
642 /*
643 * If all outstanding data are acked, stop
644 * retransmit timer, otherwise restart timer
645 * using current (possibly backed-off) value.
646 * If process is waiting for space,
647 * wakeup/selwakeup/signal. If data
648 * are ready to send, let tcp_output
649 * decide between more output or persist.
650 */
651 if (tp->snd_una == tp->snd_max)
652 callout_stop(tp->tt_rexmt);
653 else if (!callout_active(tp->tt_persist))
654 callout_reset(tp->tt_rexmt,
655 tp->t_rxtcur,
656 tcp_timer_rexmt, tp);
657
658 sowwakeup(so);
659 if (so->so_snd.sb_cc)
660 (void) tcp_output(tp);
661 return;
662 }
663 } else if (ti->ti_ack == tp->snd_una &&
664 tp->t_segq == NULL &&
665 ti->ti_len <= sbspace(&so->so_rcv)) {
666 /*
667 * this is a pure, in-sequence data packet
668 * with nothing on the reassembly queue and
669 * we have enough buffer space to take it.
670 */
671 ++tcpstat.tcps_preddat;
672 tp->rcv_nxt += ti->ti_len;
673 tcpstat.tcps_rcvpack++;
674 tcpstat.tcps_rcvbyte += ti->ti_len;
675 /*
676 * Add data to socket buffer.
677 */
678 sbappend(&so->so_rcv, m);
679 sorwakeup(so);
680 if (tcp_delack_enabled) {
681 callout_reset(tp->tt_delack, tcp_delacktime,
682 tcp_timer_delack, tp);
683 } else {
684 tp->t_flags |= TF_ACKNOW;
685 tcp_output(tp);
686 }
687 return;
688 }
689 }
690
691 /*
692 * Calculate amount of space in receive window,
693 * and then do TCP input processing.
694 * Receive window is amount of space in rcv queue,
695 * but not less than advertised window.
696 */
697 { int win;
698
699 win = sbspace(&so->so_rcv);
700 if (win < 0)
701 win = 0;
702 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
703 }
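	/*
	 * Example (illustrative numbers): if sbspace() reports 8192 bytes
	 * free but we have previously advertised up to rcv_adv with
	 * rcv_adv - rcv_nxt = 16384, rcv_wnd stays at 16384 so that the
	 * window we offer the peer is never pulled back below what has
	 * already been advertised.
	 */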
704
705 switch (tp->t_state) {
706
707 /*
708 * If the state is LISTEN then ignore segment if it contains an RST.
709 * If the segment contains an ACK then it is bad and send a RST.
710 * If it does not contain a SYN then it is not interesting; drop it.
711 * If it is from this socket, drop it, it must be forged.
712 * Don't bother responding if the destination was a broadcast.
713 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
714 * tp->iss, and send a segment:
715 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
716 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
717 * Fill in remote peer address fields if not previously specified.
718 * Enter SYN_RECEIVED state, and process any other fields of this
719 * segment in this state.
720 */
721 case TCPS_LISTEN: {
722 register struct sockaddr_in *sin;
723
724 if (tiflags & TH_RST)
725 goto drop;
726 if (tiflags & TH_ACK)
727 goto dropwithreset;
728 if ((tiflags & TH_SYN) == 0)
729 goto drop;
730 if ((ti->ti_dport == ti->ti_sport) &&
731 (ti->ti_dst.s_addr == ti->ti_src.s_addr))
732 goto drop;
733 /*
734 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
735 * in_broadcast() should never return true on a received
736 * packet with M_BCAST not set.
737 */
738 if (m->m_flags & (M_BCAST|M_MCAST) ||
739 IN_MULTICAST(ntohl(ti->ti_dst.s_addr)))
740 goto drop;
741 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
742 M_NOWAIT);
743 if (sin == NULL)
744 goto drop;
745 sin->sin_family = AF_INET;
746 sin->sin_len = sizeof(*sin);
747 sin->sin_addr = ti->ti_src;
748 sin->sin_port = ti->ti_sport;
749 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
750 laddr = inp->inp_laddr;
751 if (inp->inp_laddr.s_addr == INADDR_ANY)
752 inp->inp_laddr = ti->ti_dst;
753 if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) {
754 inp->inp_laddr = laddr;
755 FREE(sin, M_SONAME);
756 goto drop;
757 }
758 FREE(sin, M_SONAME);
759 tp->t_template = tcp_template(tp);
760 if (tp->t_template == 0) {
761 tp = tcp_drop(tp, ENOBUFS);
762 dropsocket = 0; /* socket is already gone */
763 goto drop;
764 }
765 if ((taop = tcp_gettaocache(inp)) == NULL) {
766 taop = &tao_noncached;
767 bzero(taop, sizeof(*taop));
768 }
769 tcp_dooptions(tp, optp, optlen, ti, &to);
770 if (iss)
771 tp->iss = iss;
772 else
773 tp->iss = tcp_iss;
774 tcp_iss += TCP_ISSINCR/4;
775 tp->irs = ti->ti_seq;
776 tcp_sendseqinit(tp);
777 tcp_rcvseqinit(tp);
778 /*
779 * Initialization of the tcpcb for transaction;
780 * set SND.WND = SEG.WND,
781 * initialize CCsend and CCrecv.
782 */
783 tp->snd_wnd = tiwin; /* initial send-window */
784 tp->cc_send = CC_INC(tcp_ccgen);
785 tp->cc_recv = to.to_cc;
786 /*
787 * Perform TAO test on incoming CC (SEG.CC) option, if any.
788 * - compare SEG.CC against cached CC from the same host,
789 * if any.
790	 *   - if SEG.CC > cached value, SYN must be new and is accepted
791 * immediately: save new CC in the cache, mark the socket
792 * connected, enter ESTABLISHED state, turn on flag to
793 * send a SYN in the next segment.
794 * A virtual advertised window is set in rcv_adv to
795 * initialize SWS prevention. Then enter normal segment
796 * processing: drop SYN, process data and FIN.
797 * - otherwise do a normal 3-way handshake.
798 */
799 if ((to.to_flag & TOF_CC) != 0) {
800 if (((tp->t_flags & TF_NOPUSH) != 0) &&
801 taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) {
802
803 taop->tao_cc = to.to_cc;
804 tp->t_starttime = ticks;
805 tp->t_state = TCPS_ESTABLISHED;
806
807 /*
808 * If there is a FIN, or if there is data and the
809 * connection is local, then delay SYN,ACK(SYN) in
810 * the hope of piggy-backing it on a response
811 * segment. Otherwise must send ACK now in case
812 * the other side is slow starting.
813 */
814 if (tcp_delack_enabled && ((tiflags & TH_FIN) ||
815 (ti->ti_len != 0 &&
816 in_localaddr(inp->inp_faddr)))) {
817 callout_reset(tp->tt_delack, tcp_delacktime,
818 tcp_timer_delack, tp);
819 tp->t_flags |= TF_NEEDSYN;
820 } else
821 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
822
823 /*
824 * Limit the `virtual advertised window' to TCP_MAXWIN
825 * here. Even if we requested window scaling, it will
826 * become effective only later when our SYN is acked.
827 */
828 tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN);
829 tcpstat.tcps_connects++;
830 soisconnected(so);
831 callout_reset(tp->tt_keep, tcp_keepinit,
832 tcp_timer_keep, tp);
833 dropsocket = 0; /* committed to socket */
834 tcpstat.tcps_accepts++;
835 goto trimthenstep6;
836 }
837 /* else do standard 3-way handshake */
838 } else {
839 /*
840 * No CC option, but maybe CC.NEW:
841 * invalidate cached value.
842 */
843 taop->tao_cc = 0;
844 }
845 /*
846 * TAO test failed or there was no CC option,
847 * do a standard 3-way handshake.
848 */
849 tp->t_flags |= TF_ACKNOW;
850 tp->t_state = TCPS_SYN_RECEIVED;
851 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
852 dropsocket = 0; /* committed to socket */
853 tcpstat.tcps_accepts++;
854 goto trimthenstep6;
855 }
856
857 /*
858 * If the state is SYN_RECEIVED:
859 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
860 */
861 case TCPS_SYN_RECEIVED:
862 if ((tiflags & TH_ACK) &&
863 (SEQ_LEQ(ti->ti_ack, tp->snd_una) ||
864 SEQ_GT(ti->ti_ack, tp->snd_max)))
865 goto dropwithreset;
866 break;
867
868 /*
869 * If the state is SYN_SENT:
870 * if seg contains an ACK, but not for our SYN, drop the input.
871 * if seg contains a RST, then drop the connection.
872 * if seg does not contain SYN, then drop it.
873 * Otherwise this is an acceptable SYN segment
874 * initialize tp->rcv_nxt and tp->irs
875 * if seg contains ack then advance tp->snd_una
876 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
877 * arrange for segment to be acked (eventually)
878 * continue processing rest of data/controls, beginning with URG
879 */
880 case TCPS_SYN_SENT:
881 if ((taop = tcp_gettaocache(inp)) == NULL) {
882 taop = &tao_noncached;
883 bzero(taop, sizeof(*taop));
884 }
885
886 if ((tiflags & TH_ACK) &&
887 (SEQ_LEQ(ti->ti_ack, tp->iss) ||
888 SEQ_GT(ti->ti_ack, tp->snd_max))) {
889 /*
890 * If we have a cached CCsent for the remote host,
891 * hence we haven't just crashed and restarted,
892 * do not send a RST. This may be a retransmission
893 * from the other side after our earlier ACK was lost.
894 * Our new SYN, when it arrives, will serve as the
895 * needed ACK.
896 */
897 if (taop->tao_ccsent != 0)
898 goto drop;
899 else
900 goto dropwithreset;
901 }
902 if (tiflags & TH_RST) {
903 if (tiflags & TH_ACK)
904 tp = tcp_drop(tp, ECONNREFUSED);
905 goto drop;
906 }
907 if ((tiflags & TH_SYN) == 0)
908 goto drop;
909 tp->snd_wnd = ti->ti_win; /* initial send window */
910 tp->cc_recv = to.to_cc; /* foreign CC */
911
912 tp->irs = ti->ti_seq;
913 tcp_rcvseqinit(tp);
914 if (tiflags & TH_ACK) {
915 /*
916 * Our SYN was acked. If segment contains CC.ECHO
917 * option, check it to make sure this segment really
918 * matches our SYN. If not, just drop it as old
919 * duplicate, but send an RST if we're still playing
920 * by the old rules. If no CC.ECHO option, make sure
921 * we don't get fooled into using T/TCP.
922 */
923 if (to.to_flag & TOF_CCECHO) {
924 if (tp->cc_send != to.to_ccecho) {
925 if (taop->tao_ccsent != 0)
926 goto drop;
927 else
928 goto dropwithreset;
929 }
930 } else
931 tp->t_flags &= ~TF_RCVD_CC;
932 tcpstat.tcps_connects++;
933 soisconnected(so);
934 /* Do window scaling on this connection? */
935 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
936 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
937 tp->snd_scale = tp->requested_s_scale;
938 tp->rcv_scale = tp->request_r_scale;
939 }
940 /* Segment is acceptable, update cache if undefined. */
941 if (taop->tao_ccsent == 0)
942 taop->tao_ccsent = to.to_ccecho;
943
944 tp->rcv_adv += tp->rcv_wnd;
945 tp->snd_una++; /* SYN is acked */
946 /*
947 * If there's data, delay ACK; if there's also a FIN
948 * ACKNOW will be turned on later.
949 */
950 if (tcp_delack_enabled && ti->ti_len != 0)
951 callout_reset(tp->tt_delack, tcp_delacktime,
952 tcp_timer_delack, tp);
953 else
954 tp->t_flags |= TF_ACKNOW;
955 /*
956 * Received <SYN,ACK> in SYN_SENT[*] state.
957 * Transitions:
958 * SYN_SENT --> ESTABLISHED
959 * SYN_SENT* --> FIN_WAIT_1
960 */
961 tp->t_starttime = ticks;
962 if (tp->t_flags & TF_NEEDFIN) {
963 tp->t_state = TCPS_FIN_WAIT_1;
964 tp->t_flags &= ~TF_NEEDFIN;
965 tiflags &= ~TH_SYN;
966 } else {
967 tp->t_state = TCPS_ESTABLISHED;
968 callout_reset(tp->tt_keep, tcp_keepidle,
969 tcp_timer_keep, tp);
970 }
971 } else {
972 /*
973 * Received initial SYN in SYN-SENT[*] state => simul-
974 * taneous open. If segment contains CC option and there is
975 * a cached CC, apply TAO test; if it succeeds, connection is
976 * half-synchronized. Otherwise, do 3-way handshake:
977 * SYN-SENT -> SYN-RECEIVED
978 * SYN-SENT* -> SYN-RECEIVED*
979 * If there was no CC option, clear cached CC value.
980 */
981 tp->t_flags |= TF_ACKNOW;
982 callout_stop(tp->tt_rexmt);
983 if (to.to_flag & TOF_CC) {
984 if (taop->tao_cc != 0 &&
985 CC_GT(to.to_cc, taop->tao_cc)) {
986 /*
987 * update cache and make transition:
988 * SYN-SENT -> ESTABLISHED*
989 * SYN-SENT* -> FIN-WAIT-1*
990 */
991 taop->tao_cc = to.to_cc;
992 tp->t_starttime = ticks;
993 if (tp->t_flags & TF_NEEDFIN) {
994 tp->t_state = TCPS_FIN_WAIT_1;
995 tp->t_flags &= ~TF_NEEDFIN;
996 } else {
997 tp->t_state = TCPS_ESTABLISHED;
998 callout_reset(tp->tt_keep,
999 tcp_keepidle,
1000 tcp_timer_keep,
1001 tp);
1002 }
1003 tp->t_flags |= TF_NEEDSYN;
1004 } else
1005 tp->t_state = TCPS_SYN_RECEIVED;
1006 } else {
1007 /* CC.NEW or no option => invalidate cache */
1008 taop->tao_cc = 0;
1009 tp->t_state = TCPS_SYN_RECEIVED;
1010 }
1011 }
1012
1013trimthenstep6:
1014 /*
1015 * Advance ti->ti_seq to correspond to first data byte.
1016 * If data, trim to stay within window,
1017 * dropping FIN if necessary.
1018 */
1019 ti->ti_seq++;
1020 if (ti->ti_len > tp->rcv_wnd) {
1021 todrop = ti->ti_len - tp->rcv_wnd;
1022 m_adj(m, -todrop);
1023 ti->ti_len = tp->rcv_wnd;
1024 tiflags &= ~TH_FIN;
1025 tcpstat.tcps_rcvpackafterwin++;
1026 tcpstat.tcps_rcvbyteafterwin += todrop;
1027 }
1028 tp->snd_wl1 = ti->ti_seq - 1;
1029 tp->rcv_up = ti->ti_seq;
1030 /*
1031 * Client side of transaction: already sent SYN and data.
1032 * If the remote host used T/TCP to validate the SYN,
1033 * our data will be ACK'd; if so, enter normal data segment
1034 * processing in the middle of step 5, ack processing.
1035 * Otherwise, goto step 6.
1036 */
1037 if (tiflags & TH_ACK)
1038 goto process_ACK;
1039 goto step6;
1040 /*
1041 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1042 * if segment contains a SYN and CC [not CC.NEW] option:
1043 * if state == TIME_WAIT and connection duration > MSL,
1044 * drop packet and send RST;
1045 *
1046 * if SEG.CC > CCrecv then is new SYN, and can implicitly
1047 * ack the FIN (and data) in retransmission queue.
1048 * Complete close and delete TCPCB. Then reprocess
1049 * segment, hoping to find new TCPCB in LISTEN state;
1050 *
1051 * else must be old SYN; drop it.
1052 * else do normal processing.
1053 */
1054 case TCPS_LAST_ACK:
1055 case TCPS_CLOSING:
1056 case TCPS_TIME_WAIT:
1057 if ((tiflags & TH_SYN) &&
1058 (to.to_flag & TOF_CC) && tp->cc_recv != 0) {
1059 if (tp->t_state == TCPS_TIME_WAIT &&
1060 (ticks - tp->t_starttime) > tcp_msl)
1061 goto dropwithreset;
1062 if (CC_GT(to.to_cc, tp->cc_recv)) {
1063 tp = tcp_close(tp);
1064 goto findpcb;
1065 }
1066 else
1067 goto drop;
1068 }
1069 break; /* continue normal processing */
1070 }
1071
1072 /*
1073 * States other than LISTEN or SYN_SENT.
1074 * First check the RST flag and sequence number since reset segments
1075 * are exempt from the timestamp and connection count tests. This
1076 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1077 * below which allowed reset segments in half the sequence space
1078	 * to fall through and be processed (which gives forged reset
1079 * segments with a random sequence number a 50 percent chance of
1080 * killing a connection).
1081 * Then check timestamp, if present.
1082 * Then check the connection count, if present.
1083 * Then check that at least some bytes of segment are within
1084 * receive window. If segment begins before rcv_nxt,
1085 * drop leading data (and SYN); if nothing left, just ack.
1086 *
1087 *
1088 * If the RST bit is set, check the sequence number to see
1089 * if this is a valid reset segment.
1090 * RFC 793 page 37:
1091 * In all states except SYN-SENT, all reset (RST) segments
1092 * are validated by checking their SEQ-fields. A reset is
1093 * valid if its sequence number is in the window.
1094 * Note: this does not take into account delayed ACKs, so
1095 * we should test against last_ack_sent instead of rcv_nxt.
1096 * The sequence number in the reset segment is normally an
1097	 * echo of our outgoing acknowledgement numbers, but some hosts
1098 * send a reset with the sequence number at the rightmost edge
1099 * of our receive window, and we have to handle this case.
1100	 * If we have multiple segments in flight, the initial reset
1101 * segment sequence numbers will be to the left of last_ack_sent,
1102 * but they will eventually catch up.
1103 * In any case, it never made sense to trim reset segments to
1104 * fit the receive window since RFC 1122 says:
1105 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
1106 *
1107 * A TCP SHOULD allow a received RST segment to include data.
1108 *
1109 * DISCUSSION
1110 * It has been suggested that a RST segment could contain
1111 * ASCII text that encoded and explained the cause of the
1112 * RST. No standard has yet been established for such
1113 * data.
1114 *
1115 * If the reset segment passes the sequence number test examine
1116 * the state:
1117 * SYN_RECEIVED STATE:
1118 * If passive open, return to LISTEN state.
1119 * If active open, inform user that connection was refused.
1120 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1121 * Inform user that connection was reset, and close tcb.
1122 * CLOSING, LAST_ACK STATES:
1123 * Close the tcb.
1124 * TIME_WAIT STATE:
1125 * Drop the segment - see Stevens, vol. 2, p. 964 and
1126 * RFC 1337.
1127 */
1128 if (tiflags & TH_RST) {
1129 if (SEQ_GEQ(ti->ti_seq, tp->last_ack_sent) &&
1130 SEQ_LT(ti->ti_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1131 switch (tp->t_state) {
1132
1133 case TCPS_SYN_RECEIVED:
1134 so->so_error = ECONNREFUSED;
1135 goto close;
1136
1137 case TCPS_ESTABLISHED:
1138 case TCPS_FIN_WAIT_1:
1139 case TCPS_FIN_WAIT_2:
1140 case TCPS_CLOSE_WAIT:
1141 so->so_error = ECONNRESET;
1142 close:
1143 tp->t_state = TCPS_CLOSED;
1144 tcpstat.tcps_drops++;
1145 tp = tcp_close(tp);
1146 break;
1147
1148 case TCPS_CLOSING:
1149 case TCPS_LAST_ACK:
1150 tp = tcp_close(tp);
1151 break;
1152
1153 case TCPS_TIME_WAIT:
1154 break;
1155 }
1156 }
1157 goto drop;
1158 }
1159
1160 /*
1161 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1162 * and it's less than ts_recent, drop it.
1163 */
1164 if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent &&
1165 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1166
1167 /* Check to see if ts_recent is over 24 days old. */
1168 if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1169 /*
1170 * Invalidate ts_recent. If this segment updates
1171 * ts_recent, the age will be reset later and ts_recent
1172 * will get a valid value. If it does not, setting
1173 * ts_recent to zero will at least satisfy the
1174 * requirement that zero be placed in the timestamp
1175 * echo reply when ts_recent isn't valid. The
1176 * age isn't reset until we get a valid ts_recent
1177 * because we don't want out-of-order segments to be
1178 * dropped when ts_recent is old.
1179 */
1180 tp->ts_recent = 0;
1181 } else {
1182 tcpstat.tcps_rcvduppack++;
1183 tcpstat.tcps_rcvdupbyte += ti->ti_len;
1184 tcpstat.tcps_pawsdrop++;
1185 goto dropafterack;
1186 }
1187 }
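	/*
	 * PAWS example (illustrative numbers): with ts_recent = 1000 and
	 * an arriving TSval of 900, TSTMP_LT(900, 1000) holds and the
	 * segment is counted as a duplicate and dropped after ACKing,
	 * unless ts_recent is more than TCP_PAWS_IDLE (roughly 24 days of
	 * ticks) old, in which case the stale value is only invalidated.
	 */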
1188
1189 /*
1190 * T/TCP mechanism
1191 * If T/TCP was negotiated and the segment doesn't have CC,
1192 * or if its CC is wrong then drop the segment.
1193 * RST segments do not have to comply with this.
1194 */
1195 if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) &&
1196 ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc))
1197 goto dropafterack;
1198
1199 /*
1200 * In the SYN-RECEIVED state, validate that the packet belongs to
1201 * this connection before trimming the data to fit the receive
1202 * window. Check the sequence number versus IRS since we know
1203 * the sequence numbers haven't wrapped. This is a partial fix
1204 * for the "LAND" DoS attack.
1205 */
1206 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(ti->ti_seq, tp->irs))
1207 goto dropwithreset;
1208
1209 todrop = tp->rcv_nxt - ti->ti_seq;
1210 if (todrop > 0) {
1211 if (tiflags & TH_SYN) {
1212 tiflags &= ~TH_SYN;
1213 ti->ti_seq++;
1214 if (ti->ti_urp > 1)
1215 ti->ti_urp--;
1216 else
1217 tiflags &= ~TH_URG;
1218 todrop--;
1219 }
1220 /*
1221 * Following if statement from Stevens, vol. 2, p. 960.
1222 */
1223 if (todrop > ti->ti_len
1224 || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) {
1225 /*
1226 * Any valid FIN must be to the left of the window.
1227 * At this point the FIN must be a duplicate or out
1228 * of sequence; drop it.
1229 */
1230 tiflags &= ~TH_FIN;
1231
1232 /*
1233 * Send an ACK to resynchronize and drop any data.
1234 * But keep on processing for RST or ACK.
1235 */
1236 tp->t_flags |= TF_ACKNOW;
1237 todrop = ti->ti_len;
1238 tcpstat.tcps_rcvduppack++;
1239 tcpstat.tcps_rcvdupbyte += todrop;
1240 } else {
1241 tcpstat.tcps_rcvpartduppack++;
1242 tcpstat.tcps_rcvpartdupbyte += todrop;
1243 }
1244 m_adj(m, todrop);
1245 ti->ti_seq += todrop;
1246 ti->ti_len -= todrop;
1247 if (ti->ti_urp > todrop)
1248 ti->ti_urp -= todrop;
1249 else {
1250 tiflags &= ~TH_URG;
1251 ti->ti_urp = 0;
1252 }
1253 }
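	/*
	 * Trimming example (illustrative numbers): with rcv_nxt = 1000 and
	 * a retransmitted segment of seq = 900, len = 300, todrop = 100,
	 * so the first 100 bytes are m_adj()'d away, ti_seq becomes 1000,
	 * ti_len becomes 200, and only the new 200 bytes reach the
	 * reassembly code.  If the segment were entirely old (seq = 900,
	 * len = 100), all of its data is dropped but TF_ACKNOW is set to
	 * resynchronize the sender.
	 */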
1254
1255 /*
1256 * If new data are received on a connection after the
1257 * user processes are gone, then RST the other end.
1258 */
1259 if ((so->so_state & SS_NOFDREF) &&
1260 tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
1261 tp = tcp_close(tp);
1262 tcpstat.tcps_rcvafterclose++;
1263 goto dropwithreset;
1264 }
1265
1266 /*
1267 * If segment ends after window, drop trailing data
1268 * (and PUSH and FIN); if nothing left, just ACK.
1269 */
1270 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
1271 if (todrop > 0) {
1272 tcpstat.tcps_rcvpackafterwin++;
1273 if (todrop >= ti->ti_len) {
1274 tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
1275 /*
1276 * If a new connection request is received
1277 * while in TIME_WAIT, drop the old connection
1278 * and start over if the sequence numbers
1279 * are above the previous ones.
1280 */
1281 if (tiflags & TH_SYN &&
1282 tp->t_state == TCPS_TIME_WAIT &&
1283 SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
1284 iss = tp->snd_nxt + TCP_ISSINCR;
1285 tp = tcp_close(tp);
1286 goto findpcb;
1287 }
1288 /*
1289 * If window is closed can only take segments at
1290 * window edge, and have to drop data and PUSH from
1291 * incoming segments. Continue processing, but
1292 * remember to ack. Otherwise, drop segment
1293 * and ack.
1294 */
1295 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
1296 tp->t_flags |= TF_ACKNOW;
1297 tcpstat.tcps_rcvwinprobe++;
1298 } else
1299 goto dropafterack;
1300 } else
1301 tcpstat.tcps_rcvbyteafterwin += todrop;
1302 m_adj(m, -todrop);
1303 ti->ti_len -= todrop;
1304 tiflags &= ~(TH_PUSH|TH_FIN);
1305 }
1306
1307 /*
1308 * If last ACK falls within this segment's sequence numbers,
1309 * record its timestamp.
1310 * NOTE that the test is modified according to the latest
1311 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1312 */
1313 if ((to.to_flag & TOF_TS) != 0 &&
1314 SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) {
1315 tp->ts_recent_age = ticks;
1316 tp->ts_recent = to.to_tsval;
1317 }
1318
1319 /*
1320 * If a SYN is in the window, then this is an
1321 * error and we send an RST and drop the connection.
1322 */
1323 if (tiflags & TH_SYN) {
1324 tp = tcp_drop(tp, ECONNRESET);
1325 goto dropwithreset;
1326 }
1327
1328 /*
1329 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
1330 * flag is on (half-synchronized state), then queue data for
1331 * later processing; else drop segment and return.
1332 */
1333 if ((tiflags & TH_ACK) == 0) {
1334 if (tp->t_state == TCPS_SYN_RECEIVED ||
1335 (tp->t_flags & TF_NEEDSYN))
1336 goto step6;
1337 else
1338 goto drop;
1339 }
1340
1341 /*
1342 * Ack processing.
1343 */
1344 switch (tp->t_state) {
1345
1346 /*
1347 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1348 * ESTABLISHED state and continue processing.
1349 * The ACK was checked above.
1350 */
1351 case TCPS_SYN_RECEIVED:
1352
1353 tcpstat.tcps_connects++;
1354 soisconnected(so);
1355 /* Do window scaling? */
1356 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1357 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1358 tp->snd_scale = tp->requested_s_scale;
1359 tp->rcv_scale = tp->request_r_scale;
1360 }
1361 /*
1362 * Upon successful completion of 3-way handshake,
1363 * update cache.CC if it was undefined, pass any queued
1364 * data to the user, and advance state appropriately.
1365 */
1366 if ((taop = tcp_gettaocache(inp)) != NULL &&
1367 taop->tao_cc == 0)
1368 taop->tao_cc = tp->cc_recv;
1369
1370 /*
1371 * Make transitions:
1372 * SYN-RECEIVED -> ESTABLISHED
1373 * SYN-RECEIVED* -> FIN-WAIT-1
1374 */
1375 tp->t_starttime = ticks;
1376 if (tp->t_flags & TF_NEEDFIN) {
1377 tp->t_state = TCPS_FIN_WAIT_1;
1378 tp->t_flags &= ~TF_NEEDFIN;
1379 } else {
1380 tp->t_state = TCPS_ESTABLISHED;
1381 callout_reset(tp->tt_keep, tcp_keepidle,
1382 tcp_timer_keep, tp);
1383 }
1384 /*
1385		 * If segment contains data or FIN, will call tcp_reass()
1386 * later; if not, do so now to pass queued data to user.
1387 */
1388 if (ti->ti_len == 0 && (tiflags & TH_FIN) == 0)
1389 (void) tcp_reass(tp, (struct tcpiphdr *)0,
1390 (struct mbuf *)0);
1391 tp->snd_wl1 = ti->ti_seq - 1;
1392 /* fall into ... */
1393
1394 /*
1395 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1396 * ACKs. If the ack is in the range
1397 * tp->snd_una < ti->ti_ack <= tp->snd_max
1398 * then advance tp->snd_una to ti->ti_ack and drop
1399 * data from the retransmission queue. If this ACK reflects
1400	 * more up-to-date window information, we update our window information.
1401 */
1402 case TCPS_ESTABLISHED:
1403 case TCPS_FIN_WAIT_1:
1404 case TCPS_FIN_WAIT_2:
1405 case TCPS_CLOSE_WAIT:
1406 case TCPS_CLOSING:
1407 case TCPS_LAST_ACK:
1408 case TCPS_TIME_WAIT:
1409
1410 if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
1411 if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
1412 tcpstat.tcps_rcvdupack++;
1413 /*
1414 * If we have outstanding data (other than
1415 * a window probe), this is a completely
1416 * duplicate ack (ie, window info didn't
1417 * change), the ack is the biggest we've
1418 * seen and we've seen exactly our rexmt
1419				 * threshold of them, assume a packet
1420 * has been dropped and retransmit it.
1421 * Kludge snd_nxt & the congestion
1422 * window so we send only this one
1423 * packet.
1424 *
1425 * We know we're losing at the current
1426 * window size so do congestion avoidance
1427 * (set ssthresh to half the current window
1428 * and pull our congestion window back to
1429 * the new ssthresh).
1430 *
1431 * Dup acks mean that packets have left the
1432 * network (they're now cached at the receiver)
1433				 * so bump cwnd by one maxseg per dup ack
1434				 * to keep a constant cwnd's worth of packets
1435				 * in the network.
1436 */
1437 if (!callout_active(tp->tt_rexmt) ||
1438 ti->ti_ack != tp->snd_una)
1439 tp->t_dupacks = 0;
1440 else if (++tp->t_dupacks == tcprexmtthresh) {
1441 tcp_seq onxt = tp->snd_nxt;
1442 u_int win =
1443 min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1444 tp->t_maxseg;
1445
1446 if (win < 2)
1447 win = 2;
1448 tp->snd_ssthresh = win * tp->t_maxseg;
1449 callout_stop(tp->tt_rexmt);
1450 tp->t_rtttime = 0;
1451 tp->snd_nxt = ti->ti_ack;
1452 tp->snd_cwnd = tp->t_maxseg;
1453 (void) tcp_output(tp);
1454 tp->snd_cwnd = tp->snd_ssthresh +
1455 tp->t_maxseg * tp->t_dupacks;
1456 if (SEQ_GT(onxt, tp->snd_nxt))
1457 tp->snd_nxt = onxt;
1458 goto drop;
1459 } else if (tp->t_dupacks > tcprexmtthresh) {
1460 tp->snd_cwnd += tp->t_maxseg;
1461 (void) tcp_output(tp);
1462 goto drop;
1463 }
1464 } else
1465 tp->t_dupacks = 0;
1466 break;
1467 }
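			/*
			 * Illustrative numbers (added note, not in the
			 * original source): with t_maxseg = 1460,
			 * snd_cwnd = 16060 and snd_wnd = 32120, the third
			 * duplicate ACK computes
			 * win = min(32120, 16060) / 2 / 1460 = 5, so
			 * snd_ssthresh becomes 5 * 1460 = 7300; cwnd is
			 * collapsed to a single segment for the retransmit
			 * and then reinflated to ssthresh + 3 * maxseg =
			 * 11680.
			 */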
1468 /*
1469 * If the congestion window was inflated to account
1470 * for the other side's cached packets, retract it.
1471 */
1472 if (tp->t_dupacks >= tcprexmtthresh &&
1473 tp->snd_cwnd > tp->snd_ssthresh)
1474 tp->snd_cwnd = tp->snd_ssthresh;
1475 tp->t_dupacks = 0;
1476 if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
1477 tcpstat.tcps_rcvacktoomuch++;
1478 goto dropafterack;
1479 }
1480 /*
1481 * If we reach this point, ACK is not a duplicate,
1482 * i.e., it ACKs something we sent.
1483 */
1484 if (tp->t_flags & TF_NEEDSYN) {
1485 /*
1486 * T/TCP: Connection was half-synchronized, and our
1487 * SYN has been ACK'd (so connection is now fully
1488 * synchronized). Go to non-starred state,
1489 * increment snd_una for ACK of SYN, and check if
1490 * we can do window scaling.
1491 */
1492 tp->t_flags &= ~TF_NEEDSYN;
1493 tp->snd_una++;
1494 /* Do window scaling? */
1495 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1496 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1497 tp->snd_scale = tp->requested_s_scale;
1498 tp->rcv_scale = tp->request_r_scale;
1499 }
1500 }
1501
1502process_ACK:
1503 acked = ti->ti_ack - tp->snd_una;
1504 tcpstat.tcps_rcvackpack++;
1505 tcpstat.tcps_rcvackbyte += acked;
1506
1507 /*
1508 * If we just performed our first retransmit, and the ACK
1509 * arrives within our recovery window, then it was a mistake
1510 * to do the retransmit in the first place. Recover our
1511 * original cwnd and ssthresh, and proceed to transmit where
1512 * we left off.
1513 */
1514 if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
1515 tp->snd_cwnd = tp->snd_cwnd_prev;
1516 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1517 tp->snd_nxt = tp->snd_max;
1518 tp->t_badrxtwin = 0; /* XXX probably not required */
1519 }
1520
1521 /*
1522 * If we have a timestamp reply, update smoothed
1523 * round trip time. If no timestamp is present but
1524 * transmit timer is running and timed sequence
1525 * number was acked, update smoothed round trip time.
1526 * Since we now have an rtt measurement, cancel the
1527 * timer backoff (cf., Phil Karn's retransmit alg.).
1528 * Recompute the initial retransmit timer.
1529 */
1530 if (to.to_flag & TOF_TS)
1531 tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
1532 else if (tp->t_rtttime && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1533 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
1534
1535 /*
1536 * If all outstanding data is acked, stop retransmit
1537 * timer and remember to restart (more output or persist).
1538 * If there is more data to be acked, restart retransmit
1539 * timer, using current (possibly backed-off) value.
1540 */
1541 if (ti->ti_ack == tp->snd_max) {
1542 callout_stop(tp->tt_rexmt);
1543 needoutput = 1;
1544 } else if (!callout_active(tp->tt_persist))
1545 callout_reset(tp->tt_rexmt, tp->t_rxtcur,
1546 tcp_timer_rexmt, tp);
1547
1548 /*
1549 * If no data (only SYN) was ACK'd,
1550 * skip rest of ACK processing.
1551 */
1552 if (acked == 0)
1553 goto step6;
1554
1555 /*
1556 * When new data is acked, open the congestion window.
1557 * If the window gives us less than ssthresh packets
1558 * in flight, open exponentially (maxseg per packet).
1559 * Otherwise open linearly: maxseg per window
1560 * (maxseg^2 / cwnd per packet).
1561 */
1562 {
1563 register u_int cw = tp->snd_cwnd;
1564 register u_int incr = tp->t_maxseg;
1565
1566 if (cw > tp->snd_ssthresh)
1567 incr = incr * incr / cw;
1568 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale);
1569 }
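		/*
		 * Illustrative numbers (added note, not in the original
		 * source): in slow start (cw <= ssthresh) each ACK grows
		 * cwnd by a full maxseg, e.g. 1460 bytes.  Once cw exceeds
		 * ssthresh, say cw = 29200 with maxseg = 1460, the
		 * increment drops to 1460 * 1460 / 29200 = 73 bytes per
		 * ACK, i.e. roughly one maxseg per window of ACKs.
		 */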
1570 if (acked > so->so_snd.sb_cc) {
1571 tp->snd_wnd -= so->so_snd.sb_cc;
1572 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1573 ourfinisacked = 1;
1574 } else {
1575 sbdrop(&so->so_snd, acked);
1576 tp->snd_wnd -= acked;
1577 ourfinisacked = 0;
1578 }
1579 sowwakeup(so);
1580 tp->snd_una = ti->ti_ack;
1581 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1582 tp->snd_nxt = tp->snd_una;
1583
1584 switch (tp->t_state) {
1585
1586 /*
1587 * In FIN_WAIT_1 STATE in addition to the processing
1588 * for the ESTABLISHED state if our FIN is now acknowledged
1589 * then enter FIN_WAIT_2.
1590 */
1591 case TCPS_FIN_WAIT_1:
1592 if (ourfinisacked) {
1593 /*
1594 * If we can't receive any more
1595 * data, then closing user can proceed.
1596 * Starting the timer is contrary to the
1597 * specification, but if we don't get a FIN
1598 * we'll hang forever.
1599 */
1600 if (so->so_state & SS_CANTRCVMORE) {
1601 soisdisconnected(so);
1602 callout_reset(tp->tt_2msl, tcp_maxidle,
1603 tcp_timer_2msl, tp);
1604 }
1605 tp->t_state = TCPS_FIN_WAIT_2;
1606 }
1607 break;
1608
1609 /*
1610 * In CLOSING STATE in addition to the processing for
1611 * the ESTABLISHED state if the ACK acknowledges our FIN
1612 * then enter the TIME-WAIT state, otherwise ignore
1613 * the segment.
1614 */
1615 case TCPS_CLOSING:
1616 if (ourfinisacked) {
1617 tp->t_state = TCPS_TIME_WAIT;
1618 tcp_canceltimers(tp);
1619 /* Shorten TIME_WAIT [RFC-1644, p.28] */
1620 if (tp->cc_recv != 0 &&
1621 (ticks - tp->t_starttime) < tcp_msl)
1622 callout_reset(tp->tt_2msl,
1623 tp->t_rxtcur *
1624 TCPTV_TWTRUNC,
1625 tcp_timer_2msl, tp);
1626 else
1627 callout_reset(tp->tt_2msl, 2 * tcp_msl,
1628 tcp_timer_2msl, tp);
1629 soisdisconnected(so);
1630 }
1631 break;
1632
1633 /*
1634 * In LAST_ACK, we may still be waiting for data to drain
1635 * and/or to be acked, as well as for the ack of our FIN.
1636 * If our FIN is now acknowledged, delete the TCB,
1637 * enter the closed state and return.
1638 */
1639 case TCPS_LAST_ACK:
1640 if (ourfinisacked) {
1641 tp = tcp_close(tp);
1642 goto drop;
1643 }
1644 break;
1645
1646 /*
1647 * In TIME_WAIT state the only thing that should arrive
1648 * is a retransmission of the remote FIN. Acknowledge
1649 * it and restart the finack timer.
1650 */
1651 case TCPS_TIME_WAIT:
1652 callout_reset(tp->tt_2msl, 2 * tcp_msl,
1653 tcp_timer_2msl, tp);
1654 goto dropafterack;
1655 }
1656 }
1657
1658step6:
1659 /*
1660 * Update window information.
1661 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1662 */
1663 if ((tiflags & TH_ACK) &&
1664 (SEQ_LT(tp->snd_wl1, ti->ti_seq) ||
1665 (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) ||
1666 (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) {
1667 /* keep track of pure window updates */
1668 if (ti->ti_len == 0 &&
1669 tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)
1670 tcpstat.tcps_rcvwinupd++;
1671 tp->snd_wnd = tiwin;
1672 tp->snd_wl1 = ti->ti_seq;
1673 tp->snd_wl2 = ti->ti_ack;
1674 if (tp->snd_wnd > tp->max_sndwnd)
1675 tp->max_sndwnd = tp->snd_wnd;
1676 needoutput = 1;
1677 }
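	/*
	 * Added note (not in the original source): snd_wl1/snd_wl2
	 * record the sequence and ack numbers of the segment that last
	 * updated the window, so only a newer segment (or the same one
	 * advertising a larger window) may change snd_wnd; this keeps
	 * old, reordered segments from applying stale window updates.
	 */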
1678
1679 /*
1680 * Process segments with URG.
1681 */
1682 if ((tiflags & TH_URG) && ti->ti_urp &&
1683 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1684 /*
1685 * This is a kludge, but if we receive and accept
1686 * random urgent pointers, we'll crash in
1687 * soreceive. It's hard to imagine someone
1688 * actually wanting to send this much urgent data.
1689 */
1690 if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) {
1691 ti->ti_urp = 0; /* XXX */
1692 tiflags &= ~TH_URG; /* XXX */
1693 goto dodata; /* XXX */
1694 }
1695 /*
1696 * If this segment advances the known urgent pointer,
1697 * then mark the data stream. This should not happen
1698 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1699 * a FIN has been received from the remote side.
1700 * In these states we ignore the URG.
1701 *
1702 * According to RFC961 (Assigned Protocols),
1703 * the urgent pointer points to the last octet
1704 * of urgent data. We continue, however,
1705 * to consider it to indicate the first octet
1706 * of data past the urgent section as the original
1707 * spec states (in one of two places).
1708 */
1709 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
1710 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1711 so->so_oobmark = so->so_rcv.sb_cc +
1712 (tp->rcv_up - tp->rcv_nxt) - 1;
1713 if (so->so_oobmark == 0)
1714 so->so_state |= SS_RCVATMARK;
1715 sohasoutofband(so);
1716 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1717 }
1718 /*
1719		 * Remove out-of-band data so it doesn't get presented to the user.
1720 * This can happen independent of advancing the URG pointer,
1721 * but if two URG's are pending at once, some out-of-band
1722 * data may creep in... ick.
1723 */
1724 if (ti->ti_urp <= (u_long)ti->ti_len
1725#ifdef SO_OOBINLINE
1726 && (so->so_options & SO_OOBINLINE) == 0
1727#endif
1728 )
1729 tcp_pulloutofband(so, ti, m);
1730 } else
1731 /*
1732 * If no out of band data is expected,
1733 * pull receive urgent pointer along
1734 * with the receive window.
1735 */
1736 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1737 tp->rcv_up = tp->rcv_nxt;
1738dodata: /* XXX */
1739
1740 /*
1741 * Process the segment text, merging it into the TCP sequencing queue,
1742 * and arranging for acknowledgment of receipt if necessary.
1743 * This process logically involves adjusting tp->rcv_wnd as data
1744 * is presented to the user (this happens in tcp_usrreq.c,
1745 * case PRU_RCVD). If a FIN has already been received on this
1746 * connection then we just ignore the text.
1747 */
1748 if ((ti->ti_len || (tiflags&TH_FIN)) &&
1749 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1750 TCP_REASS(tp, ti, m, so, tiflags);
1751 /*
1752		 * Note the amount of data that the peer has sent into
1753 * our window, in order to estimate the sender's
1754 * buffer size.
1755 */
1756 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
1757 } else {
1758 m_freem(m);
1759 tiflags &= ~TH_FIN;
1760 }
1761
1762 /*
1763 * If FIN is received ACK the FIN and let the user know
1764 * that the connection is closing.
1765 */
1766 if (tiflags & TH_FIN) {
1767 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1768 socantrcvmore(so);
1769 /*
1770 * If connection is half-synchronized
1771 * (ie NEEDSYN flag on) then delay ACK,
1772 * so it may be piggybacked when SYN is sent.
1773			 * Otherwise, since we have received a FIN, no
1774			 * more input can be expected; send the ACK now.
1775 */
1776 if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN))
1777 callout_reset(tp->tt_delack, tcp_delacktime,
1778 tcp_timer_delack, tp);
1779 else
1780 tp->t_flags |= TF_ACKNOW;
1781 tp->rcv_nxt++;
1782 }
1783 switch (tp->t_state) {
1784
1785 /*
1786 * In SYN_RECEIVED and ESTABLISHED STATES
1787 * enter the CLOSE_WAIT state.
1788 */
1789 case TCPS_SYN_RECEIVED:
1790 tp->t_starttime = ticks;
1791 /*FALLTHROUGH*/
1792 case TCPS_ESTABLISHED:
1793 tp->t_state = TCPS_CLOSE_WAIT;
1794 break;
1795
1796 /*
1797		 * If still in FIN_WAIT_1 STATE, our FIN has not been acked, so
1798 * enter the CLOSING state.
1799 */
1800 case TCPS_FIN_WAIT_1:
1801 tp->t_state = TCPS_CLOSING;
1802 break;
1803
1804 /*
1805 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1806 * starting the time-wait timer, turning off the other
1807 * standard timers.
1808 */
1809 case TCPS_FIN_WAIT_2:
1810 tp->t_state = TCPS_TIME_WAIT;
1811 tcp_canceltimers(tp);
1812 /* Shorten TIME_WAIT [RFC-1644, p.28] */
1813 if (tp->cc_recv != 0 &&
1814 (ticks - tp->t_starttime) < tcp_msl) {
1815 callout_reset(tp->tt_2msl,
1816 tp->t_rxtcur * TCPTV_TWTRUNC,
1817 tcp_timer_2msl, tp);
1818 /* For transaction client, force ACK now. */
1819 tp->t_flags |= TF_ACKNOW;
1820 }
1821 else
1822 callout_reset(tp->tt_2msl, 2 * tcp_msl,
1823 tcp_timer_2msl, tp);
1824 soisdisconnected(so);
1825 break;
1826
1827 /*
1828 * In TIME_WAIT state restart the 2 MSL time_wait timer.
1829 */
1830 case TCPS_TIME_WAIT:
1831 callout_reset(tp->tt_2msl, 2 * tcp_msl,
1832 tcp_timer_2msl, tp);
1833 break;
1834 }
1835 }
1836#ifdef TCPDEBUG
1837 if (so->so_options & SO_DEBUG)
1838 tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0);
1839#endif
1840
1841 /*
1842 * Return any desired output.
1843 */
1844 if (needoutput || (tp->t_flags & TF_ACKNOW))
1845 (void) tcp_output(tp);
1846 return;
1847
1848dropafterack:
1849 /*
1850	 * Generate an ACK dropping the incoming segment if it occupies
1851	 * sequence space; the ACK reflects our state.
1852 *
1853 * We can now skip the test for the RST flag since all
1854 * paths to this code happen after packets containing
1855 * RST have been dropped.
1856 *
1857 * In the SYN-RECEIVED state, don't send an ACK unless the
1858 * segment we received passes the SYN-RECEIVED ACK test.
1859 * If it fails send a RST. This breaks the loop in the
1860 * "LAND" DoS attack, and also prevents an ACK storm
1861 * between two listening ports that have been sent forged
1862 * SYN segments, each with the source address of the other.
1863 */
1864 if (tp->t_state == TCPS_SYN_RECEIVED && (tiflags & TH_ACK) &&
1865 (SEQ_GT(tp->snd_una, ti->ti_ack) ||
1866 SEQ_GT(ti->ti_ack, tp->snd_max)) )
1867 goto dropwithreset;
1868#ifdef TCPDEBUG
1869 if (so->so_options & SO_DEBUG)
1870 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0);
1871#endif
1872 m_freem(m);
1873 tp->t_flags |= TF_ACKNOW;
1874 (void) tcp_output(tp);
1875 return;
1876
1877dropwithreset:
1878#ifdef TCP_RESTRICT_RST
1879 if (restrict_rst)
1880 goto drop;
1881#endif
1882 /*
1883 * Generate a RST, dropping incoming segment.
1884 * Make ACK acceptable to originator of segment.
1885 * Don't bother to respond if destination was broadcast/multicast.
1886 */
1887 if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) ||
1888 IN_MULTICAST(ntohl(ti->ti_dst.s_addr)))
1889 goto drop;
1890#ifdef TCPDEBUG
1891 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1892 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0);
1893#endif
1894 if (tiflags & TH_ACK)
1895 tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1896 else {
1897 if (tiflags & TH_SYN)
1898 ti->ti_len++;
1899 tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1900 TH_RST|TH_ACK);
1901 }
1902 /* destroy temporarily created socket */
1903 if (dropsocket)
1904 (void) soabort(so);
1905 return;
1906
1907drop:
1908 /*
1909 * Drop space held by incoming segment and return.
1910 */
1911#ifdef TCPDEBUG
1912 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1913 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0);
1914#endif
1915 m_freem(m);
1916 /* destroy temporarily created socket */
1917 if (dropsocket)
1918 (void) soabort(so);
1919 return;
1920}
1921
1922static void
1923tcp_dooptions(tp, cp, cnt, ti, to)
1924 struct tcpcb *tp;
1925 u_char *cp;
1926 int cnt;
1927 struct tcpiphdr *ti;
1928 struct tcpopt *to;
1929{
1930 u_short mss = 0;
1931 int opt, optlen;
1932
1933 for (; cnt > 0; cnt -= optlen, cp += optlen) {
1934 opt = cp[0];
1935 if (opt == TCPOPT_EOL)
1936 break;
1937 if (opt == TCPOPT_NOP)
1938 optlen = 1;
1939 else {
1940 optlen = cp[1];
1941			if (optlen < 2 || optlen > cnt)
1942 break;
1943 }
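		/*
		 * Added note (not in the original source): options other
		 * than EOL and NOP are encoded as <kind, length, data>,
		 * where length counts the kind and length octets as well;
		 * e.g. an MSS option of 1460 appears on the wire as the
		 * four octets 02 04 05 b4.
		 */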
1944 switch (opt) {
1945
1946 default:
1947 continue;
1948
1949 case TCPOPT_MAXSEG:
1950 if (optlen != TCPOLEN_MAXSEG)
1951 continue;
1952 if (!(ti->ti_flags & TH_SYN))
1953 continue;
1954 bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
1955 NTOHS(mss);
1956 break;
1957
1958 case TCPOPT_WINDOW:
1959 if (optlen != TCPOLEN_WINDOW)
1960 continue;
1961 if (!(ti->ti_flags & TH_SYN))
1962 continue;
1963 tp->t_flags |= TF_RCVD_SCALE;
1964 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1965 break;
1966
1967 case TCPOPT_TIMESTAMP:
1968 if (optlen != TCPOLEN_TIMESTAMP)
1969 continue;
1970 to->to_flag |= TOF_TS;
1971 bcopy((char *)cp + 2,
1972 (char *)&to->to_tsval, sizeof(to->to_tsval));
1973 NTOHL(to->to_tsval);
1974 bcopy((char *)cp + 6,
1975 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
1976 NTOHL(to->to_tsecr);
1977
1978 /*
1979 * A timestamp received in a SYN makes
1980 * it ok to send timestamp requests and replies.
1981 */
1982 if (ti->ti_flags & TH_SYN) {
1983 tp->t_flags |= TF_RCVD_TSTMP;
1984 tp->ts_recent = to->to_tsval;
1985 tp->ts_recent_age = ticks;
1986 }
1987 break;
1988 case TCPOPT_CC:
1989 if (optlen != TCPOLEN_CC)
1990 continue;
1991 to->to_flag |= TOF_CC;
1992 bcopy((char *)cp + 2,
1993 (char *)&to->to_cc, sizeof(to->to_cc));
1994 NTOHL(to->to_cc);
1995 /*
1996 * A CC or CC.new option received in a SYN makes
1997 * it ok to send CC in subsequent segments.
1998 */
1999 if (ti->ti_flags & TH_SYN)
2000 tp->t_flags |= TF_RCVD_CC;
2001 break;
2002 case TCPOPT_CCNEW:
2003 if (optlen != TCPOLEN_CC)
2004 continue;
2005 if (!(ti->ti_flags & TH_SYN))
2006 continue;
2007 to->to_flag |= TOF_CCNEW;
2008 bcopy((char *)cp + 2,
2009 (char *)&to->to_cc, sizeof(to->to_cc));
2010 NTOHL(to->to_cc);
2011 /*
2012 * A CC or CC.new option received in a SYN makes
2013 * it ok to send CC in subsequent segments.
2014 */
2015 tp->t_flags |= TF_RCVD_CC;
2016 break;
2017 case TCPOPT_CCECHO:
2018 if (optlen != TCPOLEN_CC)
2019 continue;
2020 if (!(ti->ti_flags & TH_SYN))
2021 continue;
2022 to->to_flag |= TOF_CCECHO;
2023 bcopy((char *)cp + 2,
2024 (char *)&to->to_ccecho, sizeof(to->to_ccecho));
2025 NTOHL(to->to_ccecho);
2026 break;
2027 }
2028 }
2029 if (ti->ti_flags & TH_SYN)
2030 tcp_mss(tp, mss); /* sets t_maxseg */
2031}
2032
2033/*
2034 * Pull the out-of-band byte out of a segment so
2035 * it doesn't appear in the user's data queue.
2036 * It is still reflected in the segment length for
2037 * sequencing purposes.
2038 */
2039static void
2040tcp_pulloutofband(so, ti, m)
2041 struct socket *so;
2042 struct tcpiphdr *ti;
2043 register struct mbuf *m;
2044{
2045 int cnt = ti->ti_urp - 1;
2046
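	/*
	 * Added note (not in the original source): ti_urp is treated as
	 * pointing just past the urgent byte, so cnt is the zero-based
	 * offset of the out-of-band byte within the segment data; the
	 * loop below walks the mbuf chain out to that offset.
	 */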
2047 while (cnt >= 0) {
2048 if (m->m_len > cnt) {
2049 char *cp = mtod(m, caddr_t) + cnt;
2050 struct tcpcb *tp = sototcpcb(so);
2051
2052 tp->t_iobc = *cp;
2053 tp->t_oobflags |= TCPOOB_HAVEDATA;
2054 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2055 m->m_len--;
2056 return;
2057 }
2058 cnt -= m->m_len;
2059 m = m->m_next;
2060 if (m == 0)
2061 break;
2062 }
2063 panic("tcp_pulloutofband");
2064}
2065
2066/*
2067 * Collect new round-trip time estimate
2068 * and update averages and current timeout.
2069 */
2070static void
2071tcp_xmit_timer(tp, rtt)
2072 register struct tcpcb *tp;
2073 int rtt;
2074{
2075 register int delta;
2076
2077 tcpstat.tcps_rttupdated++;
2078 tp->t_rttupdated++;
2079 if (tp->t_srtt != 0) {
2080 /*
2081 * srtt is stored as fixed point with 5 bits after the
2082 * binary point (i.e., scaled by 8). The following magic
2083 * is equivalent to the smoothing algorithm in rfc793 with
2084 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2085 * point). Adjust rtt to origin 0.
2086 */
2087 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2088 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2089
2090 if ((tp->t_srtt += delta) <= 0)
2091 tp->t_srtt = 1;
2092
2093 /*
2094 * We accumulate a smoothed rtt variance (actually, a
2095 * smoothed mean difference), then set the retransmit
2096 * timer to smoothed rtt + 4 times the smoothed variance.
2097 * rttvar is stored as fixed point with 4 bits after the
2098 * binary point (scaled by 16). The following is
2099 * equivalent to rfc793 smoothing with an alpha of .75
2100 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
2101 * rfc793's wired-in beta.
2102 */
2103 if (delta < 0)
2104 delta = -delta;
2105 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2106 if ((tp->t_rttvar += delta) <= 0)
2107 tp->t_rttvar = 1;
2108 } else {
2109 /*
2110 * No rtt measurement yet - use the unsmoothed rtt.
2111 * Set the variance to half the rtt (so our first
2112 * retransmit happens at 3*rtt).
2113 */
2114 tp->t_srtt = rtt << TCP_RTT_SHIFT;
2115 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2116 }
2117 tp->t_rtttime = 0;
2118 tp->t_rxtshift = 0;
2119
2120 /*
2121 * the retransmit should happen at rtt + 4 * rttvar.
2122 * Because of the way we do the smoothing, srtt and rttvar
2123 * will each average +1/2 tick of bias. When we compute
2124 * the retransmit timer, we want 1/2 tick of rounding and
2125 * 1 extra tick because of +-1/2 tick uncertainty in the
2126 * firing of the timer. The bias will give us exactly the
2127 * 1.5 tick we need. But, because the bias is
2128 * statistical, we have to test that we don't drop below
2129 * the minimum feasible timer (which is 2 ticks).
2130 */
2131 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2132 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2133
2134 /*
2135 * We received an ack for a packet that wasn't retransmitted;
2136 * it is probably safe to discard any error indications we've
2137 * received recently. This isn't quite right, but close enough
2138 * for now (a route might have failed after we sent a segment,
2139 * and the return path might not be symmetrical).
2140 */
2141 tp->t_softerror = 0;
2142}
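
/*
 * Added sketch, not part of the original file: a floating-point
 * rendering of the smoothing done above, for readers who find the
 * fixed-point shifts hard to follow.  The struct rtt_est and
 * rtt_est_update() below are hypothetical illustration only; they
 * assume srtt, rttvar and rto are all kept in ticks.
 */
struct rtt_est {
	double	srtt;		/* smoothed round-trip time */
	double	rttvar;	/* smoothed mean deviation */
	double	rto;		/* current retransmit timeout */
};

static void
rtt_est_update(struct rtt_est *re, double rtt, double rttmin, double rtomax)
{
	double delta;

	if (re->srtt != 0.0) {
		/* srtt = 7/8 srtt + 1/8 rtt (alpha = .875) */
		delta = rtt - re->srtt;
		re->srtt += delta / 8.0;
		if (delta < 0.0)
			delta = -delta;
		/* rttvar = 3/4 rttvar + 1/4 |delta| (beta = .75) */
		re->rttvar += (delta - re->rttvar) / 4.0;
	} else {
		/* First sample: variance of rtt/2 makes rto about 3*rtt. */
		re->srtt = rtt;
		re->rttvar = rtt / 2.0;
	}
	/* Retransmit at srtt + 4 * rttvar, clamped like TCPT_RANGESET. */
	re->rto = re->srtt + 4.0 * re->rttvar;
	if (re->rto < rttmin)
		re->rto = rttmin;
	else if (re->rto > rtomax)
		re->rto = rtomax;
}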
2143
2144/*
2145 * Determine a reasonable value for maxseg size.
2146 * If the route is known, check route for mtu.
2147 * If none, use an mss that can be handled on the outgoing
2148 * interface without forcing IP to fragment; if bigger than
2149 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2150 * to utilize large mbufs. If no route is found, the route has no mtu,
2151 * or the destination isn't local, use a default, hopefully conservative
2152 * size (usually 512 or the default IP max size, but no more than the mtu
2153 * of the interface), as we can't discover anything about intervening
2154 * gateways or networks. We also initialize the congestion/slow start
2155 * window to be a single segment if the destination isn't local.
2156 * While looking at the routing entry, we also initialize other path-dependent
2157 * parameters from pre-set or cached values in the routing entry.
2158 *
2159 * Also take into account the space needed for options that we
2160 * send regularly. Make maxseg shorter by that amount to assure
2161 * that we can send maxseg amount of data even when the options
2162 * are present. Store the upper limit of the length of options plus
2163 * data in maxopd.
2164 *
2165 * NOTE that this routine is only called when we process an incoming
2166 * segment; for outgoing segments, only tcp_mssopt is called.
2167 *
2168 * In case of T/TCP, we call this routine during implicit connection
2169 * setup as well (offer = -1), to initialize maxseg from the cached
2170 * MSS of our peer.
2171 */
2172void
2173tcp_mss(tp, offer)
2174 struct tcpcb *tp;
2175 int offer;
2176{
2177 register struct rtentry *rt;
2178 struct ifnet *ifp;
2179 register int rtt, mss;
2180 u_long bufsize;
2181 struct inpcb *inp;
2182 struct socket *so;
2183 struct rmxp_tao *taop;
2184 int origoffer = offer;
2185
2186 inp = tp->t_inpcb;
2187 if ((rt = tcp_rtlookup(inp)) == NULL) {
2188 tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
2189 return;
2190 }
2191 ifp = rt->rt_ifp;
2192 so = inp->inp_socket;
2193
2194 taop = rmx_taop(rt->rt_rmx);
2195 /*
2196	 * Offer == -1 means that we didn't receive a SYN yet;
2197	 * use the cached value in that case.
2198 */
2199 if (offer == -1)
2200 offer = taop->tao_mssopt;
2201 /*
2202 * Offer == 0 means that there was no MSS on the SYN segment,
2203 * in this case we use tcp_mssdflt.
2204 */
2205 if (offer == 0)
2206 offer = tcp_mssdflt;
2207 else
2208 /*
2209 * Sanity check: make sure that maxopd will be large
2210		 * enough to allow some data on segments even if
2211		 * all the option space is used (40 bytes). Otherwise
2212 * funny things may happen in tcp_output.
2213 */
2214 offer = max(offer, 64);
2215 taop->tao_mssopt = offer;
2216
2217 /*
2218 * While we're here, check if there's an initial rtt
2219 * or rttvar. Convert from the route-table units
2220 * to scaled multiples of the slow timeout timer.
2221 */
2222 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
2223 /*
2224 * XXX the lock bit for RTT indicates that the value
2225 * is also a minimum value; this is subject to time.
2226 */
2227 if (rt->rt_rmx.rmx_locks & RTV_RTT)
2228 tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
2229 tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
2230 tcpstat.tcps_usedrtt++;
2231 if (rt->rt_rmx.rmx_rttvar) {
2232 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
2233 (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
2234 tcpstat.tcps_usedrttvar++;
2235 } else {
2236 /* default variation is +- 1 rtt */
2237 tp->t_rttvar =
2238 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
2239 }
2240 TCPT_RANGESET(tp->t_rxtcur,
2241 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
2242 tp->t_rttmin, TCPTV_REXMTMAX);
2243 }
2244 /*
2245 * if there's an mtu associated with the route, use it
2246 */
2247 if (rt->rt_rmx.rmx_mtu)
2248 mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr);
2249 else
2250 {
2251 mss = ifp->if_mtu - sizeof(struct tcpiphdr);
2252 if (!in_localaddr(inp->inp_faddr))
2253 mss = min(mss, tcp_mssdflt);
2254 }
2255 mss = min(mss, offer);
2256 /*
2257 * maxopd stores the maximum length of data AND options
2258 * in a segment; maxseg is the amount of data in a normal
2259 * segment. We need to store this value (maxopd) apart
2260 * from maxseg, because now every segment carries options
2261 * and thus we normally have somewhat less data in segments.
2262 */
2263 tp->t_maxopd = mss;
2264
2265 /*
2266 * In case of T/TCP, origoffer == -1 indicates that no segments
2267 * have been received yet. In this case we just guess; otherwise
2268 * we do the same as before T/TCP.
2269 */
2270 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2271 (origoffer == -1 ||
2272 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
2273 mss -= TCPOLEN_TSTAMP_APPA;
2274 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
2275 (origoffer == -1 ||
2276 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
2277 mss -= TCPOLEN_CC_APPA;
2278
2279#if (MCLBYTES & (MCLBYTES - 1)) == 0
2280 if (mss > MCLBYTES)
2281 mss &= ~(MCLBYTES-1);
2282#else
2283 if (mss > MCLBYTES)
2284 mss = mss / MCLBYTES * MCLBYTES;
2285#endif
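	/*
	 * Illustrative numbers (added note, not in the original source):
	 * on an Ethernet path with an mtu of 1500 the mss starts at
	 * 1500 - 40 = 1460; with timestamps in use the 12-byte
	 * TCPOLEN_TSTAMP_APPA adjustment brings it to 1448, which is
	 * below a typical 2048-byte MCLBYTES and is therefore left
	 * unrounded by the clamp above.
	 */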
2286 /*
2287 * If there's a pipesize, change the socket buffer
2288 * to that size. Make the socket buffers an integral
2289 * number of mss units; if the mss is larger than
2290 * the socket buffer, decrease the mss.
2291 */
2292#ifdef RTV_SPIPE
2293 if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
2294#endif
2295 bufsize = so->so_snd.sb_hiwat;
2296 if (bufsize < mss)
2297 mss = bufsize;
2298 else {
2299 bufsize = roundup(bufsize, mss);
2300 if (bufsize > sb_max)
2301 bufsize = sb_max;
2302 (void)sbreserve(&so->so_snd, bufsize, so, NULL);
2303 }
2304 tp->t_maxseg = mss;
2305
2306#ifdef RTV_RPIPE
2307 if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
2308#endif
2309 bufsize = so->so_rcv.sb_hiwat;
2310 if (bufsize > mss) {
2311 bufsize = roundup(bufsize, mss);
2312 if (bufsize > sb_max)
2313 bufsize = sb_max;
2314 (void)sbreserve(&so->so_rcv, bufsize, so, NULL);
2315 }
2316
2317 /*
2318 * Set the slow-start flight size depending on whether this
2319 * is a local network or not.
2320 */
2321 if (in_localaddr(inp->inp_faddr))
2322 tp->snd_cwnd = mss * ss_fltsz_local;
2323 else
2324 tp->snd_cwnd = mss * ss_fltsz;
2325
2326 if (rt->rt_rmx.rmx_ssthresh) {
2327 /*
2328 * There's some sort of gateway or interface
2329 * buffer limit on the path. Use this to set
2330		 * the slow start threshold, but set the
2331 * threshold to no less than 2*mss.
2332 */
2333 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
2334 tcpstat.tcps_usedssthresh++;
2335 }
2336}
2337
2338/*
2339 * Determine the MSS option to send on an outgoing SYN.
2340 */
2341int
2342tcp_mssopt(tp)
2343 struct tcpcb *tp;
2344{
2345 struct rtentry *rt;
2346
2347 rt = tcp_rtlookup(tp->t_inpcb);
2348 if (rt == NULL)
2349 return tcp_mssdflt;
2350
2351 return rt->rt_ifp->if_mtu - sizeof(struct tcpiphdr);
2352}