fastpath.c (r294535) → fastpath.c (r294931)
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 * The Regents of the University of California. All rights reserved.
4 * Copyright (c) 2007-2008,2010
5 * Swinburne University of Technology, Melbourne, Australia.
6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010 The FreeBSD Foundation
8 * Copyright (c) 2010-2011 Juniper Networks, Inc.
9 * Copyright (c) 2015 Netflix Inc.
10 * All rights reserved.
11 *
12 * Portions of this software were developed at the Centre for Advanced Internet
13 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
14 * James Healy and David Hayes, made possible in part by a grant from the Cisco
15 * University Research Program Fund at Community Foundation Silicon Valley.
16 *
17 * Portions of this software were developed at the Centre for Advanced
18 * Internet Architectures, Swinburne University of Technology, Melbourne,
19 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
20 *
21 * Portions of this software were developed by Robert N. M. Watson under
22 * contract to Juniper Networks, Inc.
23 *
24 * Portions of this software were developed by Randall R. Stewart while
25 * working for Netflix Inc.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 4. Neither the name of the University nor the names of its contributors
36 * may be used to endorse or promote products derived from this software
37 * without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
40 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
42 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
43 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
44 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
45 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
47 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
48 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49 * SUCH DAMAGE.
50 *
51 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
52 */
53
54#include <sys/cdefs.h>
55__FBSDID("$FreeBSD: head/sys/netinet/tcp_stacks/fastpath.c 294535 2016-01-21 22:34:51Z glebius $");
55__FBSDID("$FreeBSD: head/sys/netinet/tcp_stacks/fastpath.c 294931 2016-01-27 17:59:39Z glebius $");
56
57#include "opt_ipfw.h" /* for ipfw_fwd */
58#include "opt_inet.h"
59#include "opt_inet6.h"
60#include "opt_ipsec.h"
61#include "opt_kdtrace.h"
62#include "opt_tcpdebug.h"
63
64#include <sys/param.h>
65#include <sys/module.h>
66#include <sys/kernel.h>
67#include <sys/hhook.h>
68#include <sys/malloc.h>
69#include <sys/mbuf.h>
70#include <sys/proc.h> /* for proc0 declaration */
71#include <sys/protosw.h>
72#include <sys/sdt.h>
73#include <sys/signalvar.h>
74#include <sys/socket.h>
75#include <sys/socketvar.h>
76#include <sys/sysctl.h>
77#include <sys/syslog.h>
78#include <sys/systm.h>
79
80#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
81
82#include <vm/uma.h>
83
84#include <net/route.h>
85#include <net/vnet.h>
86
87#define TCPSTATES /* for logging */
88
89#include <netinet/in.h>
90#include <netinet/in_kdtrace.h>
91#include <netinet/in_pcb.h>
92#include <netinet/in_systm.h>
93#include <netinet/ip.h>
94#include <netinet/ip_icmp.h> /* required for icmp_var.h */
95#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
96#include <netinet/ip_var.h>
97#include <netinet/ip_options.h>
98#include <netinet/ip6.h>
99#include <netinet/icmp6.h>
100#include <netinet6/in6_pcb.h>
101#include <netinet6/ip6_var.h>
102#include <netinet/tcp.h>
103#include <netinet/tcp_fsm.h>
104#include <netinet/tcp_seq.h>
105#include <netinet/tcp_timer.h>
106#include <netinet/tcp_var.h>
107#include <netinet6/tcp6_var.h>
108#include <netinet/tcpip.h>
109#include <netinet/tcp_syncache.h>
110#include <netinet/tcp_cc.h>
110#include <netinet/cc/cc.h>
111#ifdef TCPDEBUG
112#include <netinet/tcp_debug.h>
113#endif /* TCPDEBUG */
114#ifdef TCP_OFFLOAD
115#include <netinet/tcp_offload.h>
116#endif
117
118#ifdef IPSEC
119#include <netipsec/ipsec.h>
120#include <netipsec/ipsec6.h>
121#endif /*IPSEC*/
122
123#include <machine/in_cksum.h>
124
125#include <security/mac/mac_framework.h>
126
127 extern const int tcprexmtthresh;	/* = 3; defined in tcp_input.c */
128
129VNET_DECLARE(int, tcp_autorcvbuf_inc);
130#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
131VNET_DECLARE(int, tcp_autorcvbuf_max);
132#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
133VNET_DECLARE(int, tcp_do_rfc3042);
134#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
135VNET_DECLARE(int, tcp_do_autorcvbuf);
136#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
137VNET_DECLARE(int, tcp_insecure_rst);
138#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
139VNET_DECLARE(int, tcp_insecure_syn);
140#define V_tcp_insecure_syn VNET(tcp_insecure_syn)
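/*
 * A note on the VNET_DECLARE()/V_ pairs above: they pull in per-VIMAGE
 * (virtualized network stack) variables defined elsewhere in the TCP
 * code (tcp_input.c and friends); each V_ macro resolves to the copy
 * belonging to the current vnet, so every jail/vnet can tune these
 * knobs independently via sysctl.
 */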
141
142static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
143 struct socket *, struct tcpcb *, int, int, uint8_t,
144 int);
145
146static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
147 struct socket *, struct tcpcb *, int, int, uint8_t,
148 int);
149
150/*
151 * Indicate whether this ack should be delayed. We can delay the ack if
152 * following conditions are met:
153 * - There is no delayed ack timer in progress.
154 * - Our last ack wasn't a 0-sized window. We never want to delay
155 * the ack that opens up a 0-sized window.
156 * - LRO wasn't used for this segment. We make sure by checking that the
157 * segment size is not larger than the MSS.
158 */
159#define DELAY_ACK(tp, tlen) \
160 ((!tcp_timer_active(tp, TT_DELACK) && \
161 (tp->t_flags & TF_RXWIN0SENT) == 0) && \
162 (tlen <= tp->t_maxseg) && \
163 (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
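/*
 * A concrete reading of the test above: with t_maxseg == 1460, a single
 * in-order 1448-byte segment arriving while no delayed-ack timer is
 * pending and the last advertised window was non-zero gets its ACK
 * delayed (provided net.inet.tcp.delayed_ack is on); a 2896-byte chunk
 * merged by LRO fails the (tlen <= t_maxseg) check and is ACKed
 * immediately.
 */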
164
165/*
166 * So how is this faster than the normal fast ack?
167 * It basically allows us to also stay in the fastpath
168 * when a window-update ack also arrives. In testing
169 * we saw only 25-30% of connections taking the fastpath,
170 * because along with moving forward in sequence the
171 * window was also being updated.
172 */
173static void
174tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
175 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
176 int ti_locked, u_long tiwin)
177{
178 int acked;
179 int winup_only = 0;
180#ifdef TCPDEBUG
181 /*
182 * The size of tcp_saveipgen must be the size of the max ip header,
183 * now IPv6.
184 */
185 u_char tcp_saveipgen[IP6_HDR_LEN];
186 struct tcphdr tcp_savetcp;
187 short ostate = 0;
188#endif
189 /*
190 * The following if statement will be true if
191 * we are doing the win_up_in_fp <and>
192 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
193 * - No more new data, but we have an ack for new data
194 * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
195 * - No more new data, the same ack point but the window grew
196 * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
197 */
198 if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
199 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
200 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
201 /* keep track of pure window updates */
202 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
203 winup_only = 1;
204 TCPSTAT_INC(tcps_rcvwinupd);
205 }
206 tp->snd_wnd = tiwin;
207 tp->snd_wl1 = th->th_seq;
208 tp->snd_wl2 = th->th_ack;
209 if (tp->snd_wnd > tp->max_sndwnd)
210 tp->max_sndwnd = tp->snd_wnd;
211 }
212 /*
213 * If last ACK falls within this segment's sequence numbers,
214 * record the timestamp.
215 * NOTE that the test is modified according to the latest
216 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
217 */
218 if ((to->to_flags & TOF_TS) != 0 &&
219 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
220 tp->ts_recent_age = tcp_ts_getticks();
221 tp->ts_recent = to->to_tsval;
222 }
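	/*
	 * Put differently: last_ack_sent is the RCV.NXT value we most
	 * recently advertised, so any segment starting at or before it
	 * touches the next expected byte and may safely refresh
	 * ts_recent; this is the Braden (1993/04/26) relaxation
	 * referenced above.
	 */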
223 /*
224 * This is a pure ack for outstanding data.
225 */
226 if (ti_locked == TI_RLOCKED) {
227 INP_INFO_RUNLOCK(&V_tcbinfo);
228 }
229 ti_locked = TI_UNLOCKED;
230
231 TCPSTAT_INC(tcps_predack);
232
233 /*
234 * "bad retransmit" recovery.
235 */
236 if (tp->t_rxtshift == 1 &&
237 tp->t_flags & TF_PREVVALID &&
238 (int)(ticks - tp->t_badrxtwin) < 0) {
239 cc_cong_signal(tp, th, CC_RTO_ERR);
240 }
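	/*
	 * t_badrxtwin was armed when the first retransmission fired; an
	 * ACK arriving before it expires indicates the original
	 * transmission made it after all, so the RTO was spurious and
	 * CC_RTO_ERR asks the congestion control module to restore the
	 * pre-RTO cwnd/ssthresh.
	 */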
241
242 /*
243 * Recalculate the transmit timer / rtt.
244 *
245 * Some boxes send broken timestamp replies
246 * during the SYN+ACK phase, ignore
247 * timestamps of 0 or we could calculate a
248 * huge RTT and blow up the retransmit timer.
249 */
250 if ((to->to_flags & TOF_TS) != 0 &&
251 to->to_tsecr) {
252 u_int t;
253
254 t = tcp_ts_getticks() - to->to_tsecr;
255 if (!tp->t_rttlow || tp->t_rttlow > t)
256 tp->t_rttlow = t;
257 tcp_xmit_timer(tp,
258 TCP_TS_TO_TICKS(t) + 1);
259 } else if (tp->t_rtttime &&
260 SEQ_GT(th->th_ack, tp->t_rtseq)) {
261 if (!tp->t_rttlow ||
262 tp->t_rttlow > ticks - tp->t_rtttime)
263 tp->t_rttlow = ticks - tp->t_rtttime;
264 tcp_xmit_timer(tp,
265 ticks - tp->t_rtttime);
266 }
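	/*
	 * Note the "+ 1" on the timestamp-based sample: it keeps the
	 * measured RTT from ever being zero ticks when the echo returns
	 * within a single tick. The non-timestamp fallback can only time
	 * one segment per window (the one recorded in t_rtseq), which is
	 * why the echoed timestamp is preferred when available.
	 */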
267 if (winup_only == 0) {
268 acked = BYTES_THIS_ACK(tp, th);
269
270 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
271 hhook_run_tcp_est_in(tp, th, to);
272
273 TCPSTAT_ADD(tcps_rcvackbyte, acked);
274 sbdrop(&so->so_snd, acked);
275 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
276 SEQ_LEQ(th->th_ack, tp->snd_recover))
277 tp->snd_recover = th->th_ack - 1;
278
279 /*
280 * Let the congestion control algorithm update
281 * congestion control related information. This
282 * typically means increasing the congestion
283 * window.
284 */
285 cc_ack_received(tp, th, CC_ACK);
286
287 tp->snd_una = th->th_ack;
288 /*
289 * Pull snd_wl2 up to prevent seq wrap relative
290 * to th_ack.
291 */
292 tp->snd_wl2 = th->th_ack;
293 tp->t_dupacks = 0;
294 m_freem(m);
295
296 /*
297 * If all outstanding data are acked, stop
298 * retransmit timer, otherwise restart timer
299 * using current (possibly backed-off) value.
300 * If process is waiting for space,
301 * wakeup/selwakeup/signal. If data
302 * are ready to send, let tcp_output
303 * decide between more output or persist.
304 */
305#ifdef TCPDEBUG
306 if (so->so_options & SO_DEBUG)
307 tcp_trace(TA_INPUT, ostate, tp,
308 (void *)tcp_saveipgen,
309 &tcp_savetcp, 0);
310#endif
311 if (tp->snd_una == tp->snd_max)
312 tcp_timer_activate(tp, TT_REXMT, 0);
313 else if (!tcp_timer_active(tp, TT_PERSIST))
314 tcp_timer_activate(tp, TT_REXMT,
315 tp->t_rxtcur);
316 } else {
317 /*
318 * Window update only, just free the mbufs and
319 * send out whatever we can.
320 */
321 m_freem(m);
322 }
323 sowwakeup(so);
324 if (sbavail(&so->so_snd))
325 (void) tcp_output(tp);
326 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
327 __func__, ti_locked));
328 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
329 INP_WLOCK_ASSERT(tp->t_inpcb);
330
331 if (tp->t_flags & TF_DELACK) {
332 tp->t_flags &= ~TF_DELACK;
333 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
334 }
335 INP_WUNLOCK(tp->t_inpcb);
336}
337
338/*
339 * Here nothing is really faster, it's just that we
340 * have broken out the fast-data path, just like
341 * the fast-ack.
342 */
343static void
344tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
345 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
346 int ti_locked, u_long tiwin)
347{
348 int newsize = 0; /* automatic sockbuf scaling */
349#ifdef TCPDEBUG
350 /*
351 * The size of tcp_saveipgen must be the size of the max ip header,
352 * now IPv6.
353 */
354 u_char tcp_saveipgen[IP6_HDR_LEN];
355 struct tcphdr tcp_savetcp;
356 short ostate = 0;
357#endif
358 /*
359 * If last ACK falls within this segment's sequence numbers,
360 * record the timestamp.
361 * NOTE that the test is modified according to the latest
362 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
363 */
364 if ((to->to_flags & TOF_TS) != 0 &&
365 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
366 tp->ts_recent_age = tcp_ts_getticks();
367 tp->ts_recent = to->to_tsval;
368 }
369
370 /*
371 * This is a pure, in-sequence data packet with
372 * nothing on the reassembly queue and we have enough
373 * buffer space to take it.
374 */
375 if (ti_locked == TI_RLOCKED) {
376 INP_INFO_RUNLOCK(&V_tcbinfo);
377 }
378 ti_locked = TI_UNLOCKED;
379
380 /* Clean receiver SACK report if present */
381 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
382 tcp_clean_sackreport(tp);
383 TCPSTAT_INC(tcps_preddat);
384 tp->rcv_nxt += tlen;
385 /*
386 * Pull snd_wl1 up to prevent seq wrap relative to
387 * th_seq.
388 */
389 tp->snd_wl1 = th->th_seq;
390 /*
391 * Pull rcv_up up to prevent seq wrap relative to
392 * rcv_nxt.
393 */
394 tp->rcv_up = tp->rcv_nxt;
395 TCPSTAT_ADD(tcps_rcvbyte, tlen);
396#ifdef TCPDEBUG
397 if (so->so_options & SO_DEBUG)
398 tcp_trace(TA_INPUT, ostate, tp,
399 (void *)tcp_saveipgen, &tcp_savetcp, 0);
400#endif
401 /*
402 * Automatic sizing of receive socket buffer. Often the send
403 * buffer size is not optimally adjusted to the actual network
404 * conditions at hand (delay bandwidth product). Setting the
405 * buffer size too small limits throughput on links with high
406 * bandwidth and high delay (e.g. trans-continental/oceanic links).
407 *
408 * On the receive side the socket buffer memory is only rarely
409 * used to any significant extent. This allows us to be much
410 * more aggressive in scaling the receive socket buffer. For
411 * the case that the buffer space is actually used to a large
412 * extent and we run out of kernel memory we can simply drop
413 * the new segments; TCP on the sender will just retransmit them
414 * later. Setting the buffer size too big may only consume too
415 * much kernel memory if the application doesn't read() from
416 * the socket or packet loss or reordering makes use of the
417 * reassembly queue.
418 *
419 * The criteria to step up the receive buffer one notch are:
420 * 1. Application has not set receive buffer size with
421 * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
422 * 2. the number of bytes received during the time it takes
423 * one timestamp to be reflected back to us (the RTT);
424 * 3. received bytes per RTT is within seven eighths of the
425 * current socket buffer size;
426 * 4. receive buffer size has not hit maximal automatic size;
427 *
428 * This algorithm does one step per RTT at most and only if
429 * we receive a bulk stream w/o packet losses or reorderings.
430 * Shrinking the buffer during idle times is not necessary as
431 * it doesn't consume any memory when idle.
432 *
433 * TODO: Only step up if the application is actually serving
434 * the buffer to better manage the socket buffer resources.
435 */
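	/*
	 * A worked example, assuming the defaults of this era
	 * (net.inet.tcp.recvbuf_inc = 16K, net.inet.tcp.recvbuf_max = 2M):
	 * a 64K sb_hiwat that saw more than 56K (7/8 of 64K) arrive
	 * within one echoed-timestamp RTT steps up to
	 * min(64K + 16K, 2M) = 80K on this pass.
	 */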
436 if (V_tcp_do_autorcvbuf &&
437 (to->to_flags & TOF_TS) &&
438 to->to_tsecr &&
439 (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
440 if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
441 to->to_tsecr - tp->rfbuf_ts < hz) {
442 if (tp->rfbuf_cnt >
443 (so->so_rcv.sb_hiwat / 8 * 7) &&
444 so->so_rcv.sb_hiwat <
445 V_tcp_autorcvbuf_max) {
446 newsize =
447 min(so->so_rcv.sb_hiwat +
448 V_tcp_autorcvbuf_inc,
449 V_tcp_autorcvbuf_max);
450 }
451 /* Start over with next RTT. */
452 tp->rfbuf_ts = 0;
453 tp->rfbuf_cnt = 0;
454 } else
455 tp->rfbuf_cnt += tlen; /* add up */
456 }
457
458 /* Add data to socket buffer. */
459 SOCKBUF_LOCK(&so->so_rcv);
460 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
461 m_freem(m);
462 } else {
463 /*
464 * Set new socket buffer size.
465 * Give up when limit is reached.
466 */
467 if (newsize)
468 if (!sbreserve_locked(&so->so_rcv,
469 newsize, so, NULL))
470 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
471 m_adj(m, drop_hdrlen); /* delayed header drop */
472 sbappendstream_locked(&so->so_rcv, m, 0);
473 }
474 /* NB: sorwakeup_locked() does an implicit unlock. */
475 sorwakeup_locked(so);
476 if (DELAY_ACK(tp, tlen)) {
477 tp->t_flags |= TF_DELACK;
478 } else {
479 tp->t_flags |= TF_ACKNOW;
480 tcp_output(tp);
481 }
482 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
483 __func__, ti_locked));
484 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
485 INP_WLOCK_ASSERT(tp->t_inpcb);
486
487 if (tp->t_flags & TF_DELACK) {
488 tp->t_flags &= ~TF_DELACK;
489 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
490 }
491 INP_WUNLOCK(tp->t_inpcb);
492}
493
494/*
495 * The slow-path is a clone of the long tail of
496 * tcp_do_segment past all the fast-path stuff. It is
497 * shared here by two different callers, the fast/slow
498 * path and the fastack-only path.
499 */
500static void
501tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
502 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
503 int ti_locked, u_long tiwin, int thflags)
504{
505 int acked, ourfinisacked, needoutput = 0;
506 int rstreason, todrop, win;
507 char *s;
508 struct in_conninfo *inc;
509 struct mbuf *mfree = NULL;
510#ifdef TCPDEBUG
511 /*
512 * The size of tcp_saveipgen must be the size of the max ip header,
513 * now IPv6.
514 */
515 u_char tcp_saveipgen[IP6_HDR_LEN];
516 struct tcphdr tcp_savetcp;
517 short ostate = 0;
518#endif
519 /*
520 * Calculate amount of space in receive window,
521 * and then do TCP input processing.
522 * Receive window is amount of space in rcv queue,
523 * but not less than advertised window.
524 */
525 inc = &tp->t_inpcb->inp_inc;
526 win = sbspace(&so->so_rcv);
527 if (win < 0)
528 win = 0;
529 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
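	/*
	 * The imax() keeps the computed window from shrinking below what
	 * was already advertised (rcv_adv - rcv_nxt); retracting an
	 * already-advertised window would break a sender that has
	 * transmitted into it, which RFC 793/1122 warn against.
	 */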
530
531 /* Reset receive buffer auto scaling when not in bulk receive mode. */
532 tp->rfbuf_ts = 0;
533 tp->rfbuf_cnt = 0;
534
535 switch (tp->t_state) {
536
537 /*
538 * If the state is SYN_RECEIVED:
539 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
540 */
541 case TCPS_SYN_RECEIVED:
542 if ((thflags & TH_ACK) &&
543 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
544 SEQ_GT(th->th_ack, tp->snd_max))) {
545 rstreason = BANDLIM_RST_OPENPORT;
546 goto dropwithreset;
547 }
548 break;
549
550 /*
551 * If the state is SYN_SENT:
552 * if seg contains an ACK, but not for our SYN, drop the input.
553 * if seg contains a RST, then drop the connection.
554 * if seg does not contain SYN, then drop it.
555 * Otherwise this is an acceptable SYN segment
556 * initialize tp->rcv_nxt and tp->irs
557 * if seg contains ack then advance tp->snd_una
558 * if seg contains an ECE and ECN support is enabled, the stream
559 * is ECN capable.
560 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
561 * arrange for segment to be acked (eventually)
562 * continue processing rest of data/controls, beginning with URG
563 */
564 case TCPS_SYN_SENT:
565 if ((thflags & TH_ACK) &&
566 (SEQ_LEQ(th->th_ack, tp->iss) ||
567 SEQ_GT(th->th_ack, tp->snd_max))) {
568 rstreason = BANDLIM_UNLIMITED;
569 goto dropwithreset;
570 }
571 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
572 TCP_PROBE5(connect__refused, NULL, tp,
573 mtod(m, const char *), tp, th);
574 tp = tcp_drop(tp, ECONNREFUSED);
575 }
576 if (thflags & TH_RST)
577 goto drop;
578 if (!(thflags & TH_SYN))
579 goto drop;
580
581 tp->irs = th->th_seq;
582 tcp_rcvseqinit(tp);
583 if (thflags & TH_ACK) {
584 TCPSTAT_INC(tcps_connects);
585 soisconnected(so);
586#ifdef MAC
587 mac_socketpeer_set_from_mbuf(m, so);
588#endif
589 /* Do window scaling on this connection? */
590 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
591 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
592 tp->rcv_scale = tp->request_r_scale;
593 }
594 tp->rcv_adv += imin(tp->rcv_wnd,
595 TCP_MAXWIN << tp->rcv_scale);
596 tp->snd_una++; /* SYN is acked */
597 /*
598 * If there's data, delay ACK; if there's also a FIN
599 * ACKNOW will be turned on later.
600 */
601 if (DELAY_ACK(tp, tlen) && tlen != 0)
602 tcp_timer_activate(tp, TT_DELACK,
603 tcp_delacktime);
604 else
605 tp->t_flags |= TF_ACKNOW;
606
607 if ((thflags & TH_ECE) && V_tcp_do_ecn) {
608 tp->t_flags |= TF_ECN_PERMIT;
609 TCPSTAT_INC(tcps_ecn_shs);
610 }
611
612 /*
613 * Received <SYN,ACK> in SYN_SENT[*] state.
614 * Transitions:
615 * SYN_SENT --> ESTABLISHED
616 * SYN_SENT* --> FIN_WAIT_1
617 */
618 tp->t_starttime = ticks;
619 if (tp->t_flags & TF_NEEDFIN) {
620 tcp_state_change(tp, TCPS_FIN_WAIT_1);
621 tp->t_flags &= ~TF_NEEDFIN;
622 thflags &= ~TH_SYN;
623 } else {
624 tcp_state_change(tp, TCPS_ESTABLISHED);
625 TCP_PROBE5(connect__established, NULL, tp,
626 mtod(m, const char *), tp, th);
627 cc_conn_init(tp);
628 tcp_timer_activate(tp, TT_KEEP,
629 TP_KEEPIDLE(tp));
630 }
631 } else {
632 /*
633 * Received initial SYN in SYN-SENT[*] state =>
634 * simultaneous open.
635 * If it succeeds, the connection is half-synchronized.
636 * Otherwise, do 3-way handshake:
637 * SYN-SENT -> SYN-RECEIVED
638 * SYN-SENT* -> SYN-RECEIVED*
639 */
640 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
641 tcp_timer_activate(tp, TT_REXMT, 0);
642 tcp_state_change(tp, TCPS_SYN_RECEIVED);
643 }
644
645 KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
646 "ti_locked %d", __func__, ti_locked));
647 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
648 INP_WLOCK_ASSERT(tp->t_inpcb);
649
650 /*
651 * Advance th->th_seq to correspond to first data byte.
652 * If data, trim to stay within window,
653 * dropping FIN if necessary.
654 */
655 th->th_seq++;
656 if (tlen > tp->rcv_wnd) {
657 todrop = tlen - tp->rcv_wnd;
658 m_adj(m, -todrop);
659 tlen = tp->rcv_wnd;
660 thflags &= ~TH_FIN;
661 TCPSTAT_INC(tcps_rcvpackafterwin);
662 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
663 }
664 tp->snd_wl1 = th->th_seq - 1;
665 tp->rcv_up = th->th_seq;
666 /*
667 * Client side of transaction: already sent SYN and data.
668 * If the remote host used T/TCP to validate the SYN,
669 * our data will be ACK'd; if so, enter normal data segment
670 * processing in the middle of step 5, ack processing.
671 * Otherwise, goto step 6.
672 */
673 if (thflags & TH_ACK)
674 goto process_ACK;
675
676 goto step6;
677
678 /*
679 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
680 * do normal processing.
681 *
682 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
683 */
684 case TCPS_LAST_ACK:
685 case TCPS_CLOSING:
686 break; /* continue normal processing */
687 }
688
689 /*
690 * States other than LISTEN or SYN_SENT.
691 * First check the RST flag and sequence number since reset segments
692 * are exempt from the timestamp and connection count tests. This
693 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
694 * below which allowed reset segments in half the sequence space
695 * to fall through and be processed (which gives forged reset
696 * segments with a random sequence number a 50 percent chance of
697 * killing a connection).
698 * Then check timestamp, if present.
699 * Then check the connection count, if present.
700 * Then check that at least some bytes of segment are within
701 * receive window. If segment begins before rcv_nxt,
702 * drop leading data (and SYN); if nothing left, just ack.
703 */
704 if (thflags & TH_RST) {
705 /*
706 * RFC5961 Section 3.2
707 *
708 * - RST drops connection only if SEG.SEQ == RCV.NXT.
709 * - If RST is in window, we send challenge ACK.
710 *
711 * Note: to take into account delayed ACKs, we should
712 * test against last_ack_sent instead of rcv_nxt.
713 * Note 2: we handle special case of closed window, not
714 * covered by the RFC.
715 */
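		/*
		 * Example: a blind attacker who lands a forged RST
		 * in-window but not exactly at RCV.NXT only elicits the
		 * challenge ACK below; a legitimate peer that really has
		 * lost state answers that ACK with a correctly-sequenced
		 * RST, which then passes the strict test.
		 */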
716 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
717 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
718 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
719 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
720 KASSERT(ti_locked == TI_RLOCKED,
721 ("%s: TH_RST ti_locked %d, th %p tp %p",
722 __func__, ti_locked, th, tp));
723 KASSERT(tp->t_state != TCPS_SYN_SENT,
724 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
725 __func__, th, tp));
726
727 if (V_tcp_insecure_rst ||
728 tp->last_ack_sent == th->th_seq) {
729 TCPSTAT_INC(tcps_drops);
730 /* Drop the connection. */
731 switch (tp->t_state) {
732 case TCPS_SYN_RECEIVED:
733 so->so_error = ECONNREFUSED;
734 goto close;
735 case TCPS_ESTABLISHED:
736 case TCPS_FIN_WAIT_1:
737 case TCPS_FIN_WAIT_2:
738 case TCPS_CLOSE_WAIT:
739 so->so_error = ECONNRESET;
740 close:
741 tcp_state_change(tp, TCPS_CLOSED);
742 /* FALLTHROUGH */
743 default:
744 tp = tcp_close(tp);
745 }
746 } else {
747 TCPSTAT_INC(tcps_badrst);
748 /* Send challenge ACK. */
749 tcp_respond(tp, mtod(m, void *), th, m,
750 tp->rcv_nxt, tp->snd_nxt, TH_ACK);
751 tp->last_ack_sent = tp->rcv_nxt;
752 m = NULL;
753 }
754 }
755 goto drop;
756 }
757
758 /*
759 * RFC5961 Section 4.2
760 * Send challenge ACK for any SYN in synchronized state.
761 */
762 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) {
763 KASSERT(ti_locked == TI_RLOCKED,
764 ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
765 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
766
767 TCPSTAT_INC(tcps_badsyn);
768 if (V_tcp_insecure_syn &&
769 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
770 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
771 tp = tcp_drop(tp, ECONNRESET);
772 rstreason = BANDLIM_UNLIMITED;
773 } else {
774 /* Send challenge ACK. */
775 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
776 tp->snd_nxt, TH_ACK);
777 tp->last_ack_sent = tp->rcv_nxt;
778 m = NULL;
779 }
780 goto drop;
781 }
782
783 /*
784 * RFC 1323 PAWS: If we have a timestamp reply on this segment
785 * and it's less than ts_recent, drop it.
786 */
787 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
788 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
789
790 /* Check to see if ts_recent is over 24 days old. */
791 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
792 /*
793 * Invalidate ts_recent. If this segment updates
794 * ts_recent, the age will be reset later and ts_recent
795 * will get a valid value. If it does not, setting
796 * ts_recent to zero will at least satisfy the
797 * requirement that zero be placed in the timestamp
798 * echo reply when ts_recent isn't valid. The
799 * age isn't reset until we get a valid ts_recent
800 * because we don't want out-of-order segments to be
801 * dropped when ts_recent is old.
802 */
803 tp->ts_recent = 0;
804 } else {
805 TCPSTAT_INC(tcps_rcvduppack);
806 TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
807 TCPSTAT_INC(tcps_pawsdrop);
808 if (tlen)
809 goto dropafterack;
810 goto drop;
811 }
812 }
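	/*
	 * The 24-day horizon (TCP_PAWS_IDLE) exists because the 32-bit
	 * timestamp clock eventually wraps: after long silence a stale
	 * ts_recent could make every fresh segment look "old", so it is
	 * invalidated here rather than trusted (RFC 1323 PAWS).
	 */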
813
814 /*
815 * In the SYN-RECEIVED state, validate that the packet belongs to
816 * this connection before trimming the data to fit the receive
817 * window. Check the sequence number versus IRS since we know
818 * the sequence numbers haven't wrapped. This is a partial fix
819 * for the "LAND" DoS attack.
820 */
821 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
822 rstreason = BANDLIM_RST_OPENPORT;
823 goto dropwithreset;
824 }
825
826 todrop = tp->rcv_nxt - th->th_seq;
827 if (todrop > 0) {
828 if (thflags & TH_SYN) {
829 thflags &= ~TH_SYN;
830 th->th_seq++;
831 if (th->th_urp > 1)
832 th->th_urp--;
833 else
834 thflags &= ~TH_URG;
835 todrop--;
836 }
837 /*
838 * Following if statement from Stevens, vol. 2, p. 960.
839 */
840 if (todrop > tlen
841 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
842 /*
843 * Any valid FIN must be to the left of the window.
844 * At this point the FIN must be a duplicate or out
845 * of sequence; drop it.
846 */
847 thflags &= ~TH_FIN;
848
849 /*
850 * Send an ACK to resynchronize and drop any data.
851 * But keep on processing for RST or ACK.
852 */
853 tp->t_flags |= TF_ACKNOW;
854 todrop = tlen;
855 TCPSTAT_INC(tcps_rcvduppack);
856 TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
857 } else {
858 TCPSTAT_INC(tcps_rcvpartduppack);
859 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
860 }
861 drop_hdrlen += todrop; /* drop from the top afterwards */
862 th->th_seq += todrop;
863 tlen -= todrop;
864 if (th->th_urp > todrop)
865 th->th_urp -= todrop;
866 else {
867 thflags &= ~TH_URG;
868 th->th_urp = 0;
869 }
870 }
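	/*
	 * Trimming example: with rcv_nxt = 1000 and a retransmission
	 * arriving as seq 900 / tlen 300, todrop = 100. The first 100
	 * bytes are already-received duplicates, so drop_hdrlen is
	 * advanced to strip them along with the headers later, and
	 * th_seq/tlen are adjusted so only bytes 1000-1199 remain.
	 */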
871
872 /*
873 * If new data are received on a connection after the
874 * user processes are gone, then RST the other end.
875 */
876 if ((so->so_state & SS_NOFDREF) &&
877 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
878 KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDREF && "
879 "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
880 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
881
882 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
883 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
884 "after socket was closed, "
885 "sending RST and removing tcpcb\n",
886 s, __func__, tcpstates[tp->t_state], tlen);
887 free(s, M_TCPLOG);
888 }
889 tp = tcp_close(tp);
890 TCPSTAT_INC(tcps_rcvafterclose);
891 rstreason = BANDLIM_UNLIMITED;
892 goto dropwithreset;
893 }
894
895 /*
896 * If segment ends after window, drop trailing data
897 * (and PUSH and FIN); if nothing left, just ACK.
898 */
899 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
900 if (todrop > 0) {
901 TCPSTAT_INC(tcps_rcvpackafterwin);
902 if (todrop >= tlen) {
903 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
904 /*
905 * If window is closed can only take segments at
906 * window edge, and have to drop data and PUSH from
907 * incoming segments. Continue processing, but
908 * remember to ack. Otherwise, drop segment
909 * and ack.
910 */
911 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
912 tp->t_flags |= TF_ACKNOW;
913 TCPSTAT_INC(tcps_rcvwinprobe);
914 } else
915 goto dropafterack;
916 } else
917 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
918 m_adj(m, -todrop);
919 tlen -= todrop;
920 thflags &= ~(TH_PUSH|TH_FIN);
921 }
922
923 /*
924 * If last ACK falls within this segment's sequence numbers,
925 * record its timestamp.
926 * NOTE:
927 * 1) That the test incorporates suggestions from the latest
928 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
929 * 2) That updating only on newer timestamps interferes with
930 * our earlier PAWS tests, so this check should be solely
931 * predicated on the sequence space of this segment.
932 * 3) That we modify the segment boundary check to be
933 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
934 * instead of RFC1323's
935 * Last.ACK.Sent < SEG.SEQ + SEG.Len.
936 * This modified check allows us to overcome RFC1323's
937 * limitations as described in Stevens TCP/IP Illustrated
938 * Vol. 2 p.869. In such cases, we can still calculate the
939 * RTT correctly when RCV.NXT == Last.ACK.Sent.
940 */
941 if ((to->to_flags & TOF_TS) != 0 &&
942 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
943 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
944 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
945 tp->ts_recent_age = tcp_ts_getticks();
946 tp->ts_recent = to->to_tsval;
947 }
948
949 /*
950 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
951 * flag is on (half-synchronized state), then queue data for
952 * later processing; else drop segment and return.
953 */
954 if ((thflags & TH_ACK) == 0) {
955 if (tp->t_state == TCPS_SYN_RECEIVED ||
956 (tp->t_flags & TF_NEEDSYN))
957 goto step6;
958 else if (tp->t_flags & TF_ACKNOW)
959 goto dropafterack;
960 else
961 goto drop;
962 }
963
964 /*
965 * Ack processing.
966 */
967 switch (tp->t_state) {
968
969 /*
970 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
971 * ESTABLISHED state and continue processing.
972 * The ACK was checked above.
973 */
974 case TCPS_SYN_RECEIVED:
975
976 TCPSTAT_INC(tcps_connects);
977 soisconnected(so);
978 /* Do window scaling? */
979 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
980 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
981 tp->rcv_scale = tp->request_r_scale;
982 tp->snd_wnd = tiwin;
983 }
984 /*
985 * Make transitions:
986 * SYN-RECEIVED -> ESTABLISHED
987 * SYN-RECEIVED* -> FIN-WAIT-1
988 */
989 tp->t_starttime = ticks;
990 if (tp->t_flags & TF_NEEDFIN) {
991 tcp_state_change(tp, TCPS_FIN_WAIT_1);
992 tp->t_flags &= ~TF_NEEDFIN;
993 } else {
994 tcp_state_change(tp, TCPS_ESTABLISHED);
995 TCP_PROBE5(accept__established, NULL, tp,
996 mtod(m, const char *), tp, th);
997 cc_conn_init(tp);
998 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
999 }
1000 /*
1001 * If segment contains data or ACK, will call tcp_reass()
1002 * later; if not, do so now to pass queued data to user.
1003 */
1004 if (tlen == 0 && (thflags & TH_FIN) == 0)
1005 (void) tcp_reass(tp, (struct tcphdr *)0, 0,
1006 (struct mbuf *)0);
1007 tp->snd_wl1 = th->th_seq - 1;
1008 /* FALLTHROUGH */
1009
1010 /*
1011 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1012 * ACKs. If the ack is in the range
1013 * tp->snd_una < th->th_ack <= tp->snd_max
1014 * then advance tp->snd_una to th->th_ack and drop
1015 * data from the retransmission queue. If this ACK reflects
1016 * more up-to-date window information, we update our window.
1017 */
1018 case TCPS_ESTABLISHED:
1019 case TCPS_FIN_WAIT_1:
1020 case TCPS_FIN_WAIT_2:
1021 case TCPS_CLOSE_WAIT:
1022 case TCPS_CLOSING:
1023 case TCPS_LAST_ACK:
1024 if (SEQ_GT(th->th_ack, tp->snd_max)) {
1025 TCPSTAT_INC(tcps_rcvacktoomuch);
1026 goto dropafterack;
1027 }
1028 if ((tp->t_flags & TF_SACK_PERMIT) &&
1029 ((to->to_flags & TOF_SACK) ||
1030 !TAILQ_EMPTY(&tp->snd_holes)))
1031 tcp_sack_doack(tp, to, th->th_ack);
1032 else
1033 /*
1034 * Reset the value so that previous (valid) value
1035 * from the last ack with SACK doesn't get used.
1036 */
1037 tp->sackhint.sacked_bytes = 0;
1038
1039 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
1040 hhook_run_tcp_est_in(tp, th, to);
1041
1042 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1043 if (tlen == 0 && tiwin == tp->snd_wnd) {
1044 /*
1045 * If this is the first time we've seen a
1046 * FIN from the remote, this is not a
1047 * duplicate and it needs to be processed
1048 * normally. This happens during a
1049 * simultaneous close.
1050 */
1051 if ((thflags & TH_FIN) &&
1052 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
1053 tp->t_dupacks = 0;
1054 break;
1055 }
1056 TCPSTAT_INC(tcps_rcvdupack);
1057 /*
1058 * If we have outstanding data (other than
1059 * a window probe), this is a completely
1060 * duplicate ack (ie, window info didn't
1061 * change and FIN isn't set),
1062 * the ack is the biggest we've
1063 * seen and we've seen exactly our rexmt
1064 * threshold of them, assume a packet
1065 * has been dropped and retransmit it.
1066 * Kludge snd_nxt & the congestion
1067 * window so we send only this one
1068 * packet.
1069 *
1070 * We know we're losing at the current
1071 * window size so do congestion avoidance
1072 * (set ssthresh to half the current window
1073 * and pull our congestion window back to
1074 * the new ssthresh).
1075 *
1076 * Dup acks mean that packets have left the
1077 * network (they're now cached at the receiver)
1078 * so bump cwnd by the amount in the receiver
1079 * to keep a constant cwnd packets in the
1080 * network.
1081 *
1082 * When using TCP ECN, notify the peer that
1083 * we reduced the cwnd.
1084 */
1085 if (!tcp_timer_active(tp, TT_REXMT) ||
1086 th->th_ack != tp->snd_una)
1087 tp->t_dupacks = 0;
1088 else if (++tp->t_dupacks > tcprexmtthresh ||
1089 IN_FASTRECOVERY(tp->t_flags)) {
1090 cc_ack_received(tp, th, CC_DUPACK);
1091 if ((tp->t_flags & TF_SACK_PERMIT) &&
1092 IN_FASTRECOVERY(tp->t_flags)) {
1093 int awnd;
1094
1095 /*
1096 * Compute the amount of data in flight first.
1097 * We can inject new data into the pipe iff
1098 * we have less than 1/2 the original window's
1099 * worth of data in flight.
1100 */
1101 if (V_tcp_do_rfc6675_pipe)
1102 awnd = tcp_compute_pipe(tp);
1103 else
1104 awnd = (tp->snd_nxt - tp->snd_fack) +
1105 tp->sackhint.sack_bytes_rexmit;
1106
1107 if (awnd < tp->snd_ssthresh) {
1108 tp->snd_cwnd += tp->t_maxseg;
1109 if (tp->snd_cwnd > tp->snd_ssthresh)
1110 tp->snd_cwnd = tp->snd_ssthresh;
1111 }
1112 } else
1113 tp->snd_cwnd += tp->t_maxseg;
1114 (void) tp->t_fb->tfb_tcp_output(tp);
1115 goto drop;
1116 } else if (tp->t_dupacks == tcprexmtthresh) {
1117 tcp_seq onxt = tp->snd_nxt;
1118
1119 /*
1120 * If we're doing sack, check to
1121 * see if we're already in sack
1122 * recovery. If we're not doing sack,
1123 * check to see if we're in newreno
1124 * recovery.
1125 */
1126 if (tp->t_flags & TF_SACK_PERMIT) {
1127 if (IN_FASTRECOVERY(tp->t_flags)) {
1128 tp->t_dupacks = 0;
1129 break;
1130 }
1131 } else {
1132 if (SEQ_LEQ(th->th_ack,
1133 tp->snd_recover)) {
1134 tp->t_dupacks = 0;
1135 break;
1136 }
1137 }
1138 /* Congestion signal before ack. */
1139 cc_cong_signal(tp, th, CC_NDUPACK);
1140 cc_ack_received(tp, th, CC_DUPACK);
1141 tcp_timer_activate(tp, TT_REXMT, 0);
1142 tp->t_rtttime = 0;
1143 if (tp->t_flags & TF_SACK_PERMIT) {
1144 TCPSTAT_INC(
1145 tcps_sack_recovery_episode);
1146 tp->sack_newdata = tp->snd_nxt;
1147 tp->snd_cwnd = tp->t_maxseg;
1148 (void) tp->t_fb->tfb_tcp_output(tp);
1149 goto drop;
1150 }
1151 tp->snd_nxt = th->th_ack;
1152 tp->snd_cwnd = tp->t_maxseg;
1153 (void) tp->t_fb->tfb_tcp_output(tp);
1154 KASSERT(tp->snd_limited <= 2,
1155 ("%s: tp->snd_limited too big",
1156 __func__));
1157 tp->snd_cwnd = tp->snd_ssthresh +
1158 tp->t_maxseg *
1159 (tp->t_dupacks - tp->snd_limited);
1160 if (SEQ_GT(onxt, tp->snd_nxt))
1161 tp->snd_nxt = onxt;
1162 goto drop;
1163 } else if (V_tcp_do_rfc3042) {
1164 /*
1165 * Process first and second duplicate
1166 * ACKs. Each indicates a segment
1167 * leaving the network, creating room
1168 * for more. Make sure we can send a
1169 * packet on reception of each duplicate
1170 * ACK by increasing snd_cwnd by one
1171 * segment. Restore the original
1172 * snd_cwnd after packet transmission.
1173 */
1174 cc_ack_received(tp, th, CC_DUPACK);
1175 u_long oldcwnd = tp->snd_cwnd;
1176 tcp_seq oldsndmax = tp->snd_max;
1177 u_int sent;
1178 int avail;
1179
1180 KASSERT(tp->t_dupacks == 1 ||
1181 tp->t_dupacks == 2,
1182 ("%s: dupacks not 1 or 2",
1183 __func__));
1184 if (tp->t_dupacks == 1)
1185 tp->snd_limited = 0;
1186 tp->snd_cwnd =
1187 (tp->snd_nxt - tp->snd_una) +
1188 (tp->t_dupacks - tp->snd_limited) *
1189 tp->t_maxseg;
1190 /*
1191 * Only call tcp_output when there
1192 * is new data available to be sent.
1193 * Otherwise we would send pure ACKs.
1194 */
1195 SOCKBUF_LOCK(&so->so_snd);
1196 avail = sbavail(&so->so_snd) -
1197 (tp->snd_nxt - tp->snd_una);
1198 SOCKBUF_UNLOCK(&so->so_snd);
1199 if (avail > 0)
1200 (void) tp->t_fb->tfb_tcp_output(tp);
1201 sent = tp->snd_max - oldsndmax;
1202 if (sent > tp->t_maxseg) {
1203 KASSERT((tp->t_dupacks == 2 &&
1204 tp->snd_limited == 0) ||
1205 (sent == tp->t_maxseg + 1 &&
1206 tp->t_flags & TF_SENTFIN),
1207 ("%s: sent too much",
1208 __func__));
1209 tp->snd_limited = 2;
1210 } else if (sent > 0)
1211 ++tp->snd_limited;
1212 tp->snd_cwnd = oldcwnd;
1213 goto drop;
1214 }
1215 } else
1216 tp->t_dupacks = 0;
1217 break;
1218 }
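		/*
		 * Recapping the dupack arithmetic above: on the third
		 * duplicate ACK, cc_cong_signal(CC_NDUPACK) typically
		 * halves ssthresh, cwnd is pinned to one t_maxseg so
		 * exactly the presumed-lost segment goes out, and cwnd
		 * is then reinflated to ssthresh + (t_dupacks -
		 * snd_limited) * t_maxseg to credit the segments that
		 * the dupacks say have left the network.
		 */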
1219
1220 KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
1221 ("%s: th_ack <= snd_una", __func__));
1222
1223 /*
1224 * If the congestion window was inflated to account
1225 * for the other side's cached packets, retract it.
1226 */
1227 if (IN_FASTRECOVERY(tp->t_flags)) {
1228 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
1229 if (tp->t_flags & TF_SACK_PERMIT)
1230 tcp_sack_partialack(tp, th);
1231 else
1232 tcp_newreno_partial_ack(tp, th);
1233 } else
1234 cc_post_recovery(tp, th);
1235 }
1236 tp->t_dupacks = 0;
1237 /*
1238 * If we reach this point, ACK is not a duplicate,
1239 * i.e., it ACKs something we sent.
1240 */
1241 if (tp->t_flags & TF_NEEDSYN) {
1242 /*
1243 * T/TCP: Connection was half-synchronized, and our
1244 * SYN has been ACK'd (so connection is now fully
1245 * synchronized). Go to non-starred state,
1246 * increment snd_una for ACK of SYN, and check if
1247 * we can do window scaling.
1248 */
1249 tp->t_flags &= ~TF_NEEDSYN;
1250 tp->snd_una++;
1251 /* Do window scaling? */
1252 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1253 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1254 tp->rcv_scale = tp->request_r_scale;
1255 /* Send window already scaled. */
1256 }
1257 }
1258
1259process_ACK:
1260 INP_WLOCK_ASSERT(tp->t_inpcb);
1261
1262 acked = BYTES_THIS_ACK(tp, th);
1263 TCPSTAT_INC(tcps_rcvackpack);
1264 TCPSTAT_ADD(tcps_rcvackbyte, acked);
1265
1266 /*
1267 * If we just performed our first retransmit, and the ACK
1268 * arrives within our recovery window, then it was a mistake
1269 * to do the retransmit in the first place. Recover our
1270 * original cwnd and ssthresh, and proceed to transmit where
1271 * we left off.
1272 */
1273 if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
1274 (int)(ticks - tp->t_badrxtwin) < 0)
1275 cc_cong_signal(tp, th, CC_RTO_ERR);
1276
1277 /*
1278 * If we have a timestamp reply, update smoothed
1279 * round trip time. If no timestamp is present but
1280 * transmit timer is running and timed sequence
1281 * number was acked, update smoothed round trip time.
1282 * Since we now have an rtt measurement, cancel the
1283 * timer backoff (cf., Phil Karn's retransmit alg.).
1284 * Recompute the initial retransmit timer.
1285 *
1286 * Some boxes send broken timestamp replies
1287 * during the SYN+ACK phase, ignore
1288 * timestamps of 0 or we could calculate a
1289 * huge RTT and blow up the retransmit timer.
1290 */
1291 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
1292 u_int t;
1293
1294 t = tcp_ts_getticks() - to->to_tsecr;
1295 if (!tp->t_rttlow || tp->t_rttlow > t)
1296 tp->t_rttlow = t;
1297 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
1298 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
1299 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
1300 tp->t_rttlow = ticks - tp->t_rtttime;
1301 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
1302 }
1303
1304 /*
1305 * If all outstanding data is acked, stop retransmit
1306 * timer and remember to restart (more output or persist).
1307 * If there is more data to be acked, restart retransmit
1308 * timer, using current (possibly backed-off) value.
1309 */
1310 if (th->th_ack == tp->snd_max) {
1311 tcp_timer_activate(tp, TT_REXMT, 0);
1312 needoutput = 1;
1313 } else if (!tcp_timer_active(tp, TT_PERSIST))
1314 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1315
1316 /*
1317 * If no data (only SYN) was ACK'd,
1318 * skip rest of ACK processing.
1319 */
1320 if (acked == 0)
1321 goto step6;
1322
1323 /*
1324 * Let the congestion control algorithm update congestion
1325 * control related information. This typically means increasing
1326 * the congestion window.
1327 */
1328 cc_ack_received(tp, th, CC_ACK);
1329
1330 SOCKBUF_LOCK(&so->so_snd);
1331 if (acked > sbavail(&so->so_snd)) {
1332 tp->snd_wnd -= sbavail(&so->so_snd);
1333 mfree = sbcut_locked(&so->so_snd,
1334 (int)sbavail(&so->so_snd));
1335 ourfinisacked = 1;
1336 } else {
1337 mfree = sbcut_locked(&so->so_snd, acked);
1338 tp->snd_wnd -= acked;
1339 ourfinisacked = 0;
1340 }
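		/*
		 * acked can exceed what sits in the send buffer only
		 * because our FIN occupies one sequence number beyond
		 * the data; in that case everything buffered is cut and
		 * ourfinisacked is set so the FIN_WAIT_1/CLOSING/
		 * LAST_ACK transitions below can proceed.
		 */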
1341 /* NB: sowwakeup_locked() does an implicit unlock. */
1342 sowwakeup_locked(so);
1343 m_freem(mfree);
1344 /* Detect una wraparound. */
1345 if (!IN_RECOVERY(tp->t_flags) &&
1346 SEQ_GT(tp->snd_una, tp->snd_recover) &&
1347 SEQ_LEQ(th->th_ack, tp->snd_recover))
1348 tp->snd_recover = th->th_ack - 1;
1349 /* XXXLAS: Can this be moved up into cc_post_recovery? */
1350 if (IN_RECOVERY(tp->t_flags) &&
1351 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
1352 EXIT_RECOVERY(tp->t_flags);
1353 }
1354 tp->snd_una = th->th_ack;
1355 if (tp->t_flags & TF_SACK_PERMIT) {
1356 if (SEQ_GT(tp->snd_una, tp->snd_recover))
1357 tp->snd_recover = tp->snd_una;
1358 }
1359 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1360 tp->snd_nxt = tp->snd_una;
1361
1362 switch (tp->t_state) {
1363
1364 /*
1365 * In FIN_WAIT_1 STATE in addition to the processing
1366 * for the ESTABLISHED state if our FIN is now acknowledged
1367 * then enter FIN_WAIT_2.
1368 */
1369 case TCPS_FIN_WAIT_1:
1370 if (ourfinisacked) {
1371 /*
1372 * If we can't receive any more
1373 * data, then closing user can proceed.
1374 * Starting the timer is contrary to the
1375 * specification, but if we don't get a FIN
1376 * we'll hang forever.
1377 *
1378 * XXXjl:
1379 * we should release the tp also, and use a
1380 * compressed state.
1381 */
1382 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1383 soisdisconnected(so);
1384 tcp_timer_activate(tp, TT_2MSL,
1385 (tcp_fast_finwait2_recycle ?
1386 tcp_finwait2_timeout :
1387 TP_MAXIDLE(tp)));
1388 }
1389 tcp_state_change(tp, TCPS_FIN_WAIT_2);
1390 }
1391 break;
1392
1393 /*
1394 * In CLOSING STATE in addition to the processing for
1395 * the ESTABLISHED state if the ACK acknowledges our FIN
1396 * then enter the TIME-WAIT state, otherwise ignore
1397 * the segment.
1398 */
1399 case TCPS_CLOSING:
1400 if (ourfinisacked) {
1401 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1402 tcp_twstart(tp);
1403 INP_INFO_RUNLOCK(&V_tcbinfo);
1404 m_freem(m);
1405 return;
1406 }
1407 break;
1408
1409 /*
1410 * In LAST_ACK, we may still be waiting for data to drain
1411 * and/or to be acked, as well as for the ack of our FIN.
1412 * If our FIN is now acknowledged, delete the TCB,
1413 * enter the closed state and return.
1414 */
1415 case TCPS_LAST_ACK:
1416 if (ourfinisacked) {
1417 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1418 tp = tcp_close(tp);
1419 goto drop;
1420 }
1421 break;
1422 }
1423 }
1424
1425step6:
1426 INP_WLOCK_ASSERT(tp->t_inpcb);
1427
1428 /*
1429 * Update window information.
1430 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1431 */
1432 if ((thflags & TH_ACK) &&
1433 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
1434 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
1435 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
1436 /* keep track of pure window updates */
1437 if (tlen == 0 &&
1438 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1439 TCPSTAT_INC(tcps_rcvwinupd);
1440 tp->snd_wnd = tiwin;
1441 tp->snd_wl1 = th->th_seq;
1442 tp->snd_wl2 = th->th_ack;
1443 if (tp->snd_wnd > tp->max_sndwnd)
1444 tp->max_sndwnd = tp->snd_wnd;
1445 needoutput = 1;
1446 }
1447
1448 /*
1449 * Process segments with URG.
1450 */
1451 if ((thflags & TH_URG) && th->th_urp &&
1452 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1453 /*
1454 * This is a kludge, but if we receive and accept
1455 * random urgent pointers, we'll crash in
1456 * soreceive. It's hard to imagine someone
1457 * actually wanting to send this much urgent data.
1458 */
1459 SOCKBUF_LOCK(&so->so_rcv);
1460 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
1461 th->th_urp = 0; /* XXX */
1462 thflags &= ~TH_URG; /* XXX */
1463 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
1464 goto dodata; /* XXX */
1465 }
1466 /*
1467 * If this segment advances the known urgent pointer,
1468 * then mark the data stream. This should not happen
1469 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1470 * a FIN has been received from the remote side.
1471 * In these states we ignore the URG.
1472 *
1473 * According to RFC961 (Assigned Protocols),
1474 * the urgent pointer points to the last octet
1475 * of urgent data. We continue, however,
1476 * to consider it to indicate the first octet
1477 * of data past the urgent section as the original
1478 * spec states (in one of two places).
1479 */
1480 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
1481 tp->rcv_up = th->th_seq + th->th_urp;
1482 so->so_oobmark = sbavail(&so->so_rcv) +
1483 (tp->rcv_up - tp->rcv_nxt) - 1;
1484 if (so->so_oobmark == 0)
1485 so->so_rcv.sb_state |= SBS_RCVATMARK;
1486 sohasoutofband(so);
1487 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1488 }
1489 SOCKBUF_UNLOCK(&so->so_rcv);
1490 /*
1491 * Remove out-of-band data so it doesn't get presented to the user.
1492 * This can happen independent of advancing the URG pointer,
1493 * but if two URG's are pending at once, some out-of-band
1494 * data may creep in... ick.
1495 */
1496 if (th->th_urp <= (u_long)tlen &&
1497 !(so->so_options & SO_OOBINLINE)) {
1498 /* hdr drop is delayed */
1499 tcp_pulloutofband(so, th, m, drop_hdrlen);
1500 }
1501 } else {
1502 /*
1503 * If no out of band data is expected,
1504 * pull receive urgent pointer along
1505 * with the receive window.
1506 */
1507 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1508 tp->rcv_up = tp->rcv_nxt;
1509 }
1510dodata: /* XXX */
1511 INP_WLOCK_ASSERT(tp->t_inpcb);
1512
1513 /*
1514 * Process the segment text, merging it into the TCP sequencing queue,
1515 * and arranging for acknowledgment of receipt if necessary.
1516 * This process logically involves adjusting tp->rcv_wnd as data
1517 * is presented to the user (this happens in tcp_usrreq.c,
1518 * case PRU_RCVD). If a FIN has already been received on this
1519 * connection then we just ignore the text.
1520 */
1521 if ((tlen || (thflags & TH_FIN)) &&
1522 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1523 tcp_seq save_start = th->th_seq;
1524 m_adj(m, drop_hdrlen); /* delayed header drop */
1525 /*
1526 * Insert segment which includes th into TCP reassembly queue
1527 * with control block tp. Set thflags to whether reassembly now
1528 * includes a segment with FIN. This handles the common case
1529 * inline (segment is the next to be received on an established
1530 * connection, and the queue is empty), avoiding linkage into
1531 * and removal from the queue and repetition of various
1532 * conversions.
1533 * Set DELACK for segments received in order, but ack
1534 * immediately when segments are out of order (so
1535 * fast retransmit can work).
1536 */
1537 if (th->th_seq == tp->rcv_nxt &&
1538 LIST_EMPTY(&tp->t_segq) &&
1539 TCPS_HAVEESTABLISHED(tp->t_state)) {
1540 if (DELAY_ACK(tp, tlen))
1541 tp->t_flags |= TF_DELACK;
1542 else
1543 tp->t_flags |= TF_ACKNOW;
1544 tp->rcv_nxt += tlen;
1545 thflags = th->th_flags & TH_FIN;
1546 TCPSTAT_INC(tcps_rcvpack);
1547 TCPSTAT_ADD(tcps_rcvbyte, tlen);
1548 SOCKBUF_LOCK(&so->so_rcv);
1549 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1550 m_freem(m);
1551 else
1552 sbappendstream_locked(&so->so_rcv, m, 0);
1553 /* NB: sorwakeup_locked() does an implicit unlock. */
1554 sorwakeup_locked(so);
1555 } else {
1556 /*
1557 * XXX: Due to the header drop above "th" is
1558 * theoretically invalid by now. Fortunately
1559 * m_adj() doesn't actually free any mbufs
1560 * when trimming from the head.
1561 */
1562 thflags = tcp_reass(tp, th, &tlen, m);
1563 tp->t_flags |= TF_ACKNOW;
1564 }
1565 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
1566 tcp_update_sack_list(tp, save_start, save_start + tlen);
1567#if 0
1568 /*
1569 * Note the amount of data that peer has sent into
1570 * our window, in order to estimate the sender's
1571 * buffer size.
1572 * XXX: Unused.
1573 */
1574 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
1575 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
1576 else
1577 len = so->so_rcv.sb_hiwat;
1578#endif
1579 } else {
1580 m_freem(m);
1581 thflags &= ~TH_FIN;
1582 }
1583
1584 /*
1585 * If FIN is received ACK the FIN and let the user know
1586 * that the connection is closing.
1587 */
1588 if (thflags & TH_FIN) {
1589 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1590 socantrcvmore(so);
1591 /*
1592 * If connection is half-synchronized
1593 * (ie NEEDSYN flag on) then delay ACK,
1594 * so it may be piggybacked when SYN is sent.
1595 * Otherwise, since we received a FIN then no
1596 * more input can be expected, send ACK now.
1597 */
1598 if (tp->t_flags & TF_NEEDSYN)
1599 tp->t_flags |= TF_DELACK;
1600 else
1601 tp->t_flags |= TF_ACKNOW;
1602 tp->rcv_nxt++;
1603 }
1604 switch (tp->t_state) {
1605
1606 /*
1607 * In SYN_RECEIVED and ESTABLISHED STATES
1608 * enter the CLOSE_WAIT state.
1609 */
1610 case TCPS_SYN_RECEIVED:
1611 tp->t_starttime = ticks;
1612 /* FALLTHROUGH */
1613 case TCPS_ESTABLISHED:
1614 tcp_state_change(tp, TCPS_CLOSE_WAIT);
1615 break;
1616
1617 /*
1618 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1619 * enter the CLOSING state.
1620 */
1621 case TCPS_FIN_WAIT_1:
1622 tcp_state_change(tp, TCPS_CLOSING);
1623 break;
1624
1625 /*
1626 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1627 * starting the time-wait timer, turning off the other
1628 * standard timers.
1629 */
1630 case TCPS_FIN_WAIT_2:
1631 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1632 KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
1633 "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
1634 ti_locked));
1635
1636 tcp_twstart(tp);
1637 INP_INFO_RUNLOCK(&V_tcbinfo);
1638 return;
1639 }
1640 }
1641 if (ti_locked == TI_RLOCKED) {
1642 INP_INFO_RUNLOCK(&V_tcbinfo);
1643 }
1644 ti_locked = TI_UNLOCKED;
1645
1646#ifdef TCPDEBUG
1647 if (so->so_options & SO_DEBUG)
1648 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
1649 &tcp_savetcp, 0);
1650#endif
1651 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
1652
1653 /*
1654 * Return any desired output.
1655 */
1656 if (needoutput || (tp->t_flags & TF_ACKNOW))
1657 (void) tp->t_fb->tfb_tcp_output(tp);
1658
1659 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
1660 __func__, ti_locked));
1661 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1662 INP_WLOCK_ASSERT(tp->t_inpcb);
1663
1664 if (tp->t_flags & TF_DELACK) {
1665 tp->t_flags &= ~TF_DELACK;
1666 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
1667 }
1668 INP_WUNLOCK(tp->t_inpcb);
1669 return;
1670
1671dropafterack:
1672 /*
1673 * Generate an ACK dropping incoming segment if it occupies
1674 * sequence space, where the ACK reflects our state.
1675 *
1676 * We can now skip the test for the RST flag since all
1677 * paths to this code happen after packets containing
1678 * RST have been dropped.
1679 *
1680 * In the SYN-RECEIVED state, don't send an ACK unless the
1681 * segment we received passes the SYN-RECEIVED ACK test.
1682 * If it fails send a RST. This breaks the loop in the
1683 * "LAND" DoS attack, and also prevents an ACK storm
1684 * between two listening ports that have been sent forged
1685 * SYN segments, each with the source address of the other.
1686 */
1687 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
1688 (SEQ_GT(tp->snd_una, th->th_ack) ||
1689 SEQ_GT(th->th_ack, tp->snd_max)) ) {
1690 rstreason = BANDLIM_RST_OPENPORT;
1691 goto dropwithreset;
1692 }
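	/*
	 * Editorial worked example for the SYN-RECEIVED ACK test above
	 * (illustrative numbers only): with snd_una = 1000 and
	 * snd_max = 1500, an ack of 900 (below snd_una) or 2000 (above
	 * snd_max) earns a RST instead of an ACK, which is what breaks
	 * the forged-SYN ACK storm between the two listeners.
	 */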
1693#ifdef TCPDEBUG
1694 if (so->so_options & SO_DEBUG)
1695 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1696 &tcp_savetcp, 0);
1697#endif
1698 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
1699 if (ti_locked == TI_RLOCKED) {
1700 INP_INFO_RUNLOCK(&V_tcbinfo);
1701 }
1702 ti_locked = TI_UNLOCKED;
1703
1704 tp->t_flags |= TF_ACKNOW;
1705 (void) tp->t_fb->tfb_tcp_output(tp);
1706 INP_WUNLOCK(tp->t_inpcb);
1707 m_freem(m);
1708 return;
1709
1710dropwithreset:
1711 if (ti_locked == TI_RLOCKED) {
1712 INP_INFO_RUNLOCK(&V_tcbinfo);
1713 }
1714 ti_locked = TI_UNLOCKED;
1715
1716 if (tp != NULL) {
1717 tcp_dropwithreset(m, th, tp, tlen, rstreason);
1718 INP_WUNLOCK(tp->t_inpcb);
1719 } else
1720 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1721 return;
1722
1723drop:
1724 if (ti_locked == TI_RLOCKED) {
1725 INP_INFO_RUNLOCK(&V_tcbinfo);
1726 ti_locked = TI_UNLOCKED;
1727 }
1728#ifdef INVARIANTS
1729 else
1730 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1731#endif
1732
1733 /*
1734 * Drop space held by incoming segment and return.
1735 */
1736#ifdef TCPDEBUG
1737 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1738 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1739 &tcp_savetcp, 0);
1740#endif
1741 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
1742 if (tp != NULL)
1743 INP_WUNLOCK(tp->t_inpcb);
1744 m_freem(m);
1745}
1746
1747
1748/*
1749 * "Fastslow" is a combination of the original
1750 * tcp_do_segment and a split fastpath: one function
1751 * for the fast-ack case, which also allows acks that
1752 * advance the window in sequence to use the fastpath,
1753 * and a sub-function that handles the in-sequence data.
1754 */
1755void
1756tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so,
1757 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
1758 int ti_locked)
1759{
1760 int thflags;
1761 u_long tiwin;
1762 char *s;
1763 int can_enter;
1764 struct in_conninfo *inc;
1765 struct tcpopt to;
1766
1767 thflags = th->th_flags;
1768 tp->sackhint.last_sack_ack = 0;
1769 inc = &tp->t_inpcb->inp_inc;
1770 /*
1771 * If this is either a state-changing packet or current state isn't
1772 * established, we require a write lock on tcbinfo. Otherwise, we
1773 * allow the tcbinfo to be in either a locked or unlocked state, as the
1774 * caller may have unnecessarily acquired a write lock due to a race.
1775 */
1776 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
1777 tp->t_state != TCPS_ESTABLISHED) {
1778 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
1779 "SYN/FIN/RST/!EST", __func__, ti_locked));
1780 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1781 } else {
1782#ifdef INVARIANTS
1783 if (ti_locked == TI_RLOCKED) {
1784 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1785 } else {
1786 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
1787 "ti_locked: %d", __func__, ti_locked));
1788 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1789 }
1790#endif
1791 }
1792 INP_WLOCK_ASSERT(tp->t_inpcb);
1793 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
1794 __func__));
1795 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
1796 __func__));
1797
1798 /*
1799 * Segment received on connection.
1800 * Reset idle time and keep-alive timer.
1801 * XXX: This should be done after segment
1802 * validation to ignore broken/spoofed segs.
1803 */
1804 tp->t_rcvtime = ticks;
1805 if (TCPS_HAVEESTABLISHED(tp->t_state))
1806 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
1807
1808 /*
1809 * Unscale the window into a 32-bit value.
1810 * For the SYN_SENT state the scale is zero.
1811 */
1812 tiwin = th->th_win << tp->snd_scale;
1813
1814 /*
1815 * TCP ECN processing.
1816 */
1817 if (tp->t_flags & TF_ECN_PERMIT) {
1818 if (thflags & TH_CWR)
1819 tp->t_flags &= ~TF_ECN_SND_ECE;
1820 switch (iptos & IPTOS_ECN_MASK) {
1821 case IPTOS_ECN_CE:
1822 tp->t_flags |= TF_ECN_SND_ECE;
1823 TCPSTAT_INC(tcps_ecn_ce);
1824 break;
1825 case IPTOS_ECN_ECT0:
1826 TCPSTAT_INC(tcps_ecn_ect0);
1827 break;
1828 case IPTOS_ECN_ECT1:
1829 TCPSTAT_INC(tcps_ecn_ect1);
1830 break;
1831 }
1832 /* Congestion experienced. */
1833 if (thflags & TH_ECE) {
1834 cc_cong_signal(tp, th, CC_ECN);
1835 }
1836 }
1837
1838 /*
1839 * Parse options on any incoming segment.
1840 */
1841 tcp_dooptions(&to, (u_char *)(th + 1),
1842 (th->th_off << 2) - sizeof(struct tcphdr),
1843 (thflags & TH_SYN) ? TO_SYN : 0);
1844
1845 /*
1846 * If echoed timestamp is later than the current time,
1847 * fall back to non RFC1323 RTT calculation. Normalize
1848 * timestamp if syncookies were used when this connection
1849 * was established.
1850 */
1851 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
1852 to.to_tsecr -= tp->ts_offset;
1853 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
1854 to.to_tsecr = 0;
1855 }
1856 /*
1857 * If timestamps were negotiated during SYN/ACK they should
1858 * appear on every segment during this session and vice versa.
1859 */
1860 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
1861 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1862 log(LOG_DEBUG, "%s; %s: Timestamp missing, "
1863 "no action\n", s, __func__);
1864 free(s, M_TCPLOG);
1865 }
1866 }
1867 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
1868 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1869 log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
1870 "no action\n", s, __func__);
1871 free(s, M_TCPLOG);
1872 }
1873 }
1874
1875 /*
1876 * Process options only when we get SYN/ACK back. The SYN case
1877 * for incoming connections is handled in tcp_syncache.
1878 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1879 * or <SYN,ACK>) segment itself is never scaled.
1880 * XXX this is traditional behavior, may need to be cleaned up.
1881 */
1882 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1883 if ((to.to_flags & TOF_SCALE) &&
1884 (tp->t_flags & TF_REQ_SCALE)) {
1885 tp->t_flags |= TF_RCVD_SCALE;
1886 tp->snd_scale = to.to_wscale;
1887 }
1888 /*
1889 * Initial send window. It will be updated with
1890 * the next incoming segment to the scaled value.
1891 */
1892 tp->snd_wnd = th->th_win;
1893 if (to.to_flags & TOF_TS) {
1894 tp->t_flags |= TF_RCVD_TSTMP;
1895 tp->ts_recent = to.to_tsval;
1896 tp->ts_recent_age = tcp_ts_getticks();
1897 }
1898 if (to.to_flags & TOF_MSS)
1899 tcp_mss(tp, to.to_mss);
1900 if ((tp->t_flags & TF_SACK_PERMIT) &&
1901 (to.to_flags & TOF_SACKPERM) == 0)
1902 tp->t_flags &= ~TF_SACK_PERMIT;
1903 }
1904 can_enter = 0;
1905 if (__predict_true(tlen == 0)) {
1906 /*
1907 * The ack moved forward and we have a window (non-zero)
1908 * <or>
1909 * The ack did not move forward, but the window increased.
1910 */
1911 if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) ||
1912 ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) {
1913 can_enter = 1;
1914 }
1915 } else {
1916 /*
1917 * Data incoming, use the old entry criteria
1918 * for fast-path with data.
1919 */
1920 if (tiwin && tiwin == tp->snd_wnd) {
1921 can_enter = 1;
1922 }
1923 }
1924 /*
1925 * Header prediction: check for the two common cases
1926 * of a uni-directional data xfer. If the packet has
1927 * no control flags, is in-sequence, the window didn't
1928 * change and we're not retransmitting, it's a
1929 * candidate. If the length is zero and the ack moved
1930 * forward, we're the sender side of the xfer. Just
1931 * free the data acked & wake any higher level process
1932 * that was blocked waiting for space. If the length
1933 * is non-zero and the ack didn't move, we're the
1934 * receiver side. If we're getting packets in-order
1935 * (the reassembly queue is empty), add the data to
1936 * the socket buffer and note that we need a delayed ack.
1937 * Make sure that the hidden state-flags are also off.
1938 * Since we check for TCPS_ESTABLISHED first, it can only
1939 * be TH_NEEDSYN.
1940 */
1941 if (__predict_true(tp->t_state == TCPS_ESTABLISHED &&
1942 th->th_seq == tp->rcv_nxt &&
1943 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1944 tp->snd_nxt == tp->snd_max &&
1945 can_enter &&
1946 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1947 LIST_EMPTY(&tp->t_segq) &&
1948 ((to.to_flags & TOF_TS) == 0 ||
1949 TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) {
1950 if (__predict_true((tlen == 0) &&
1951 (SEQ_LEQ(th->th_ack, tp->snd_max) &&
1952 !IN_RECOVERY(tp->t_flags) &&
1953 (to.to_flags & TOF_SACK) == 0 &&
1954 TAILQ_EMPTY(&tp->snd_holes)))) {
1955 /* We are done */
1956 tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
1957 ti_locked, tiwin);
1958 return;
1959 } else if ((tlen) &&
1960 (th->th_ack == tp->snd_una &&
1961 tlen <= sbspace(&so->so_rcv))) {
1962 tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen,
1963 ti_locked, tiwin);
1964 /* We are done */
1965 return;
1966 }
1967 }
1968 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
1969 ti_locked, tiwin, thflags);
1970}
1971
1972
1973/*
1974 * This subfunction tries to highly optimize the
1975 * fast path. We again allow window updates that are
1976 * in sequence to remain in the fast-path. We also add
1977 * in the __predict hints to help the compiler.
1978 * Note that if we return 0, then we could *not* process
1979 * the segment and the caller should push the packet into the
1980 * slow-path.
1981 */
1982static int
1983tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
1984 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
1985 int ti_locked, u_long tiwin)
1986{
1987 int acked;
1988 int winup_only = 0;
1989#ifdef TCPDEBUG
1990 /*
1991 * The size of tcp_saveipgen must be the size of the max ip header,
1992 * now IPv6.
1993 */
1994 u_char tcp_saveipgen[IP6_HDR_LEN];
1995 struct tcphdr tcp_savetcp;
1996 short ostate = 0;
1997#endif
1998
1999
2000 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
2001 /* Old ack, behind (or duplicate to) the last one rcv'd */
2002 return (0);
2003 }
2004 if (__predict_false(th->th_ack == tp->snd_una) &&
2005 __predict_false(tiwin <= tp->snd_wnd)) {
2006 /* Duplicate ack, or a dup ack whose window did not grow */
2007 return (0);
2008 }
2009 if (__predict_false(tiwin == 0)) {
2010 /* zero window */
2011 return (0);
2012 }
2013 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
2014 /* Above what we have sent? */
2015 return (0);
2016 }
2017 if (__predict_false(tp->snd_nxt != tp->snd_max)) {
2018 /* We are retransmitting */
2019 return (0);
2020 }
2021 if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) {
2022 /* We need a SYN or a FIN, unlikely.. */
2023 return (0);
2024 }
2025 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
2026 /* Timestamp is behind .. old ack with seq wrap? */
2027 return (0);
2028 }
2029 if (__predict_false(IN_RECOVERY(tp->t_flags))) {
2030 /* Still recovering */
2031 return (0);
2032 }
2033 if (__predict_false(to->to_flags & TOF_SACK)) {
2034 /* Sack included in the ack.. */
2035 return (0);
2036 }
2037 if (!TAILQ_EMPTY(&tp->snd_holes)) {
2038 /* We have sack holes on our scoreboard */
2039 return (0);
2040 }
2041 /* Ok if we reach here, we can process a fast-ack */
2042
2043 /* Did the window get updated? */
2044 if (tiwin != tp->snd_wnd) {
2045 /* keep track of pure window updates */
2046 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
2047 winup_only = 1;
2048 TCPSTAT_INC(tcps_rcvwinupd);
2049 }
2050 tp->snd_wnd = tiwin;
2051 tp->snd_wl1 = th->th_seq;
2052 if (tp->snd_wnd > tp->max_sndwnd)
2053 tp->max_sndwnd = tp->snd_wnd;
2054 }
2055 /*
2056 * Pull snd_wl2 up to prevent seq wrap relative
2057 * to th_ack.
2058 */
2059 tp->snd_wl2 = th->th_ack;
2060 /*
2061 * If last ACK falls within this segment's sequence numbers,
2062 * record the timestamp.
2063 * NOTE that the test is modified according to the latest
2064 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2065 */
2066 if ((to->to_flags & TOF_TS) != 0 &&
2067 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2068 tp->ts_recent_age = tcp_ts_getticks();
2069 tp->ts_recent = to->to_tsval;
2070 }
2071 /*
2072 * This is a pure ack for outstanding data.
2073 */
2074 if (ti_locked == TI_RLOCKED) {
2075 INP_INFO_RUNLOCK(&V_tcbinfo);
2076 }
2077 ti_locked = TI_UNLOCKED;
2078
2079 TCPSTAT_INC(tcps_predack);
2080
2081 /*
2082 * "bad retransmit" recovery.
2083 */
2084 if (tp->t_rxtshift == 1 &&
2085 tp->t_flags & TF_PREVVALID &&
2086 (int)(ticks - tp->t_badrxtwin) < 0) {
2087 cc_cong_signal(tp, th, CC_RTO_ERR);
2088 }
2089
2090 /*
2091 * Recalculate the transmit timer / rtt.
2092 *
2093 * Some boxes send broken timestamp replies
2094 * during the SYN+ACK phase, ignore
2095 * timestamps of 0 or we could calculate a
2096 * huge RTT and blow up the retransmit timer.
2097 */
2098 if ((to->to_flags & TOF_TS) != 0 &&
2099 to->to_tsecr) {
2100 u_int t;
2101
2102 t = tcp_ts_getticks() - to->to_tsecr;
2103 if (!tp->t_rttlow || tp->t_rttlow > t)
2104 tp->t_rttlow = t;
2105 tcp_xmit_timer(tp,
2106 TCP_TS_TO_TICKS(t) + 1);
2107 } else if (tp->t_rtttime &&
2108 SEQ_GT(th->th_ack, tp->t_rtseq)) {
2109 if (!tp->t_rttlow ||
2110 tp->t_rttlow > ticks - tp->t_rtttime)
2111 tp->t_rttlow = ticks - tp->t_rtttime;
2112 tcp_xmit_timer(tp,
2113 ticks - tp->t_rtttime);
2114 }
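	/*
	 * Editorial worked example (illustrative numbers only): if
	 * tcp_ts_getticks() returns 100050 and the echoed to_tsecr is
	 * 100010, then t = 40 and the RTT estimator above is fed
	 * TCP_TS_TO_TICKS(40) + 1, the + 1 guarding against a zero
	 * sample from a very fast reply.
	 */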
2115 if (winup_only == 0) {
2116 acked = BYTES_THIS_ACK(tp, th);
2117
2118 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
2119 hhook_run_tcp_est_in(tp, th, to);
2120
2121 TCPSTAT_ADD(tcps_rcvackbyte, acked);
2122 sbdrop(&so->so_snd, acked);
2123 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2124 SEQ_LEQ(th->th_ack, tp->snd_recover))
2125 tp->snd_recover = th->th_ack - 1;
2126
2127 /*
2128 * Let the congestion control algorithm update
2129 * congestion control related information. This
2130 * typically means increasing the congestion
2131 * window.
2132 */
2133 cc_ack_received(tp, th, CC_ACK);
2134
2135 tp->snd_una = th->th_ack;
2136 tp->t_dupacks = 0;
2137 m_freem(m);
2138
2139 /*
2140 * If all outstanding data are acked, stop
2141 * retransmit timer, otherwise restart timer
2142 * using current (possibly backed-off) value.
2143 * If process is waiting for space,
2144 * wakeup/selwakeup/signal. If data
2145 * are ready to send, let tcp_output
2146 * decide between more output or persist.
2147 */
2148#ifdef TCPDEBUG
2149 if (so->so_options & SO_DEBUG)
2150 tcp_trace(TA_INPUT, ostate, tp,
2151 (void *)tcp_saveipgen,
2152 &tcp_savetcp, 0);
2153#endif
2154 if (tp->snd_una == tp->snd_max)
2155 tcp_timer_activate(tp, TT_REXMT, 0);
2156 else if (!tcp_timer_active(tp, TT_PERSIST))
2157 tcp_timer_activate(tp, TT_REXMT,
2158 tp->t_rxtcur);
2159 /* Wake up the socket if we have room to write more */
2160 sowwakeup(so);
2161 } else {
2162 /*
2163 * Window update only, just free the mbufs and
2164 * send out whatever we can.
2165 */
2166 m_freem(m);
2167 }
2168 if (sbavail(&so->so_snd))
2169 (void) tcp_output(tp);
2170 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
2171 __func__, ti_locked));
2172 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2173 INP_WLOCK_ASSERT(tp->t_inpcb);
2174
2175 if (tp->t_flags & TF_DELACK) {
2176 tp->t_flags &= ~TF_DELACK;
2177 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
2178 }
2179 INP_WUNLOCK(tp->t_inpcb);
2180 return (1);
2181}
2182
2183/*
2184 * This tcp_do_segment variant concentrates on making the fastest
2185 * possible ack processing path. It does not have a fast-path for
2186 * data (it possibly could, which would then eliminate the
2187 * need for fast-slow above). For a content distributor with
2188 * large outgoing elephant flows and very little incoming data,
2189 * having no fastpath for data does not really hurt (since
2190 * little data comes in). The most important thing is
2191 * processing acks quickly and getting the rest of the data
2192 * output to the peer as quickly as possible. This routine
2193 * seems to be about 3% faster overall than the old
2194 * tcp_do_segment, and it keeps us in the fast-path for many
2195 * more packets (by allowing window updates to also stay in the fastpath).
2196 */
2197void
2198tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
2199 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
2200 int ti_locked)
2201{
2202 int thflags;
2203 u_long tiwin;
2204 char *s;
2205 struct in_conninfo *inc;
2206 struct tcpopt to;
2207
2208 thflags = th->th_flags;
2209 tp->sackhint.last_sack_ack = 0;
2210 inc = &tp->t_inpcb->inp_inc;
2211 /*
2212 * If this is either a state-changing packet or current state isn't
2213 * established, we require a write lock on tcbinfo. Otherwise, we
2214 * allow the tcbinfo to be in either a locked or unlocked state, as the
2215 * caller may have unnecessarily acquired a write lock due to a race.
2216 */
2217 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
2218 tp->t_state != TCPS_ESTABLISHED) {
2219 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
2220 "SYN/FIN/RST/!EST", __func__, ti_locked));
2221 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
2222 } else {
2223#ifdef INVARIANTS
2224 if (ti_locked == TI_RLOCKED) {
2225 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
2226 } else {
2227 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
2228 "ti_locked: %d", __func__, ti_locked));
2229 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2230 }
2231#endif
2232 }
2233 INP_WLOCK_ASSERT(tp->t_inpcb);
2234 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
2235 __func__));
2236 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
2237 __func__));
2238
2239 /*
2240 * Segment received on connection.
2241 * Reset idle time and keep-alive timer.
2242 * XXX: This should be done after segment
2243 * validation to ignore broken/spoofed segs.
2244 */
2245 tp->t_rcvtime = ticks;
2246 if (TCPS_HAVEESTABLISHED(tp->t_state))
2247 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
2248
2249 /*
2250 * Unscale the window into a 32-bit value.
2251 * For the SYN_SENT state the scale is zero.
2252 */
2253 tiwin = th->th_win << tp->snd_scale;
2254
2255 /*
2256 * TCP ECN processing.
2257 */
2258 if (tp->t_flags & TF_ECN_PERMIT) {
2259 if (thflags & TH_CWR)
2260 tp->t_flags &= ~TF_ECN_SND_ECE;
2261 switch (iptos & IPTOS_ECN_MASK) {
2262 case IPTOS_ECN_CE:
2263 tp->t_flags |= TF_ECN_SND_ECE;
2264 TCPSTAT_INC(tcps_ecn_ce);
2265 break;
2266 case IPTOS_ECN_ECT0:
2267 TCPSTAT_INC(tcps_ecn_ect0);
2268 break;
2269 case IPTOS_ECN_ECT1:
2270 TCPSTAT_INC(tcps_ecn_ect1);
2271 break;
2272 }
2273 /* Congestion experienced. */
2274 if (thflags & TH_ECE) {
2275 cc_cong_signal(tp, th, CC_ECN);
2276 }
2277 }
2278
2279 /*
2280 * Parse options on any incoming segment.
2281 */
2282 tcp_dooptions(&to, (u_char *)(th + 1),
2283 (th->th_off << 2) - sizeof(struct tcphdr),
2284 (thflags & TH_SYN) ? TO_SYN : 0);
2285
2286 /*
2287 * If echoed timestamp is later than the current time,
2288 * fall back to non RFC1323 RTT calculation. Normalize
2289 * timestamp if syncookies were used when this connection
2290 * was established.
2291 */
2292 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
2293 to.to_tsecr -= tp->ts_offset;
2294 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
2295 to.to_tsecr = 0;
2296 }
2297 /*
2298 * If timestamps were negotiated during SYN/ACK they should
2299 * appear on every segment during this session and vice versa.
2300 */
2301 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
2302 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2303 log(LOG_DEBUG, "%s; %s: Timestamp missing, "
2304 "no action\n", s, __func__);
2305 free(s, M_TCPLOG);
2306 }
2307 }
2308 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
2309 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2310 log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
2311 "no action\n", s, __func__);
2312 free(s, M_TCPLOG);
2313 }
2314 }
2315
2316 /*
2317 * Process options only when we get SYN/ACK back. The SYN case
2318 * for incoming connections is handled in tcp_syncache.
2319 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
2320 * or <SYN,ACK>) segment itself is never scaled.
2321 * XXX this is traditional behavior, may need to be cleaned up.
2322 */
2323 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2324 if ((to.to_flags & TOF_SCALE) &&
2325 (tp->t_flags & TF_REQ_SCALE)) {
2326 tp->t_flags |= TF_RCVD_SCALE;
2327 tp->snd_scale = to.to_wscale;
2328 }
2329 /*
2330 * Initial send window. It will be updated with
2331 * the next incoming segment to the scaled value.
2332 */
2333 tp->snd_wnd = th->th_win;
2334 if (to.to_flags & TOF_TS) {
2335 tp->t_flags |= TF_RCVD_TSTMP;
2336 tp->ts_recent = to.to_tsval;
2337 tp->ts_recent_age = tcp_ts_getticks();
2338 }
2339 if (to.to_flags & TOF_MSS)
2340 tcp_mss(tp, to.to_mss);
2341 if ((tp->t_flags & TF_SACK_PERMIT) &&
2342 (to.to_flags & TOF_SACKPERM) == 0)
2343 tp->t_flags &= ~TF_SACK_PERMIT;
2344 }
2345 /*
2346 * Header prediction: check for the two common cases
2347 * of a uni-directional data xfer. If the packet has
2348 * no control flags, is in-sequence, the window didn't
2349 * change and we're not retransmitting, it's a
2350 * candidate. If the length is zero and the ack moved
2351 * forward, we're the sender side of the xfer. Just
2352 * free the data acked & wake any higher level process
2353 * that was blocked waiting for space. If the length
2354 * is non-zero and the ack didn't move, we're the
2355 * receiver side. If we're getting packets in-order
2356 * (the reassembly queue is empty), add the data to
2357 * the socket buffer and note that we need a delayed ack.
2358 * Make sure that the hidden state-flags are also off.
2359 * Since we check for TCPS_ESTABLISHED first, it can only
2360 * be TH_NEEDSYN.
2361 */
2362 if (__predict_true(tp->t_state == TCPS_ESTABLISHED) &&
2363 __predict_true(((to.to_flags & TOF_SACK) == 0)) &&
2364 __predict_true(tlen == 0) &&
2365 __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) &&
2366 __predict_true(LIST_EMPTY(&tp->t_segq)) &&
2367 __predict_true(th->th_seq == tp->rcv_nxt)) {
2368 if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
2369 ti_locked, tiwin)) {
2370 return;
2371 }
2372 }
2373 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
2374 ti_locked, tiwin, thflags);
2375}
2376
2377struct tcp_function_block __tcp_fastslow = {
2378 "fastslow",
2379 tcp_output,
2380 tcp_do_segment_fastslow,
2381 tcp_default_ctloutput,
2382 NULL,
2383 NULL,
2384 NULL,
2385 NULL,
2386 NULL,
2387 NULL,
2388 NULL,
2389 0,
2390 0
2391
2392};
2393
2394struct tcp_function_block __tcp_fastack = {
2395 "fastack",
2396 tcp_output,
2397 tcp_do_segment_fastack,
2398 tcp_default_ctloutput,
2399 NULL,
2400 NULL,
2401 NULL,
2402 NULL,
2403 NULL,
2404 NULL,
2405 NULL,
2406 0,
2407 0
2408};
2409
2410static int
2411tcp_addfastpaths(module_t mod, int type, void *data)
2412{
2413 int err = 0;
2414
2415 switch (type) {
2416 case MOD_LOAD:
2417 err = register_tcp_functions(&__tcp_fastack, M_WAITOK);
2418 if (err) {
2419 printf("Failed to register fastack module -- err:%d\n", err);
2420 return (err);
2421 }
2422 err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
2423 if (err) {
2424 printf("Failed to register fastslow module -- err:%d\n", err);
2425 deregister_tcp_functions(&__tcp_fastack);
2426 return (err);
2427 }
2428 break;
2429 case MOD_QUIESCE:
2430 if (__tcp_fastslow.tfb_refcnt || __tcp_fastack.tfb_refcnt) {
2431 return (EBUSY);
2432 }
2433 break;
2434 case MOD_UNLOAD:
2435 err = deregister_tcp_functions(&__tcp_fastack);
2436 if (err == EBUSY)
2437 break;
2438 err = deregister_tcp_functions(&__tcp_fastslow);
2439 if (err == EBUSY)
2440 break;
2441 err = 0;
2442 break;
2443 default:
2444 return (EOPNOTSUPP);
2445 }
2446 return (err);
2447}
2448
2449static moduledata_t new_tcp_fastpaths = {
2450 .name = "tcp_fastpaths",
2451 .evhand = tcp_addfastpaths,
2452 .priv = 0
2453};
2454
2455MODULE_VERSION(kern_tcpfastpaths, 1);
2456DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PSEUDO, SI_ORDER_ANY);
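/*
 * Editorial usage note (not from the original file): once this module
 * is loaded, the two stacks register under the names "fastslow" and
 * "fastack". Assuming the pluggable-TCP-stack sysctl interface of this
 * era, they can be listed and selected roughly as follows:
 *
 *	sysctl net.inet.tcp.functions_available
 *	sysctl net.inet.tcp.functions_default=fastack
 */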
111#ifdef TCPDEBUG
112#include <netinet/tcp_debug.h>
113#endif /* TCPDEBUG */
114#ifdef TCP_OFFLOAD
115#include <netinet/tcp_offload.h>
116#endif
117
118#ifdef IPSEC
119#include <netipsec/ipsec.h>
120#include <netipsec/ipsec6.h>
121#endif /*IPSEC*/
122
123#include <machine/in_cksum.h>
124
125#include <security/mac/mac_framework.h>
126
127 const int tcprexmtthresh = 3;	/* dup acks before fast retransmit */
128
129VNET_DECLARE(int, tcp_autorcvbuf_inc);
130#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
131VNET_DECLARE(int, tcp_autorcvbuf_max);
132#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
133VNET_DECLARE(int, tcp_do_rfc3042);
134#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
135VNET_DECLARE(int, tcp_do_autorcvbuf);
136#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
137VNET_DECLARE(int, tcp_insecure_rst);
138#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
139VNET_DECLARE(int, tcp_insecure_syn);
140#define V_tcp_insecure_syn VNET(tcp_insecure_syn)
141
142static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
143 struct socket *, struct tcpcb *, int, int, uint8_t,
144 int);
145
146static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
147 struct socket *, struct tcpcb *, int, int, uint8_t,
148 int);
149
150/*
151 * Indicate whether this ack should be delayed. We can delay the ack if
152 * following conditions are met:
153 * - There is no delayed ack timer in progress.
154 * - Our last ack wasn't a 0-sized window. We never want to delay
155 * the ack that opens up a 0-sized window.
156 * - LRO wasn't used for this segment. We make sure by checking that the
157 * segment size is not larger than the MSS.
158 */
159#define DELAY_ACK(tp, tlen) \
160 ((!tcp_timer_active(tp, TT_DELACK) && \
161 (tp->t_flags & TF_RXWIN0SENT) == 0) && \
162 (tlen <= tp->t_maxseg) && \
163 (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
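/*
 * Editorial sketch (not from the original file): the DELAY_ACK()
 * macro above, written out as a plain predicate for readability.
 * The names mirror the macro's inputs; the block is illustrative
 * only and is kept out of the build, following the file's own
 * "#if 0" convention.
 */
#if 0
static int
delay_ack_ok(struct tcpcb *tp, int tlen)
{
	/* A delayed-ack timer must not already be running. */
	if (tcp_timer_active(tp, TT_DELACK))
		return (0);
	/* Never delay the ack that opens up a 0-sized window. */
	if (tp->t_flags & TF_RXWIN0SENT)
		return (0);
	/* A segment larger than the MSS suggests LRO; ack at once. */
	if (tlen > tp->t_maxseg)
		return (0);
	/* Delayed acks must be enabled, or a SYN must still be owed. */
	return (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN));
}
#endif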
164
165/*
166 * So how is this faster than the normal fast ack?
167 * It basically allows us to also stay in the fastpath
168 * when a window-update ack also arrives. In testing
169 * we saw only 25-30% of connections doing fastpath
170 * due to the fact that along with moving forward
171 * in sequence the window was also updated.
172 */
173static void
174tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
175 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
176 int ti_locked, u_long tiwin)
177{
178 int acked;
179 int winup_only = 0;
180#ifdef TCPDEBUG
181 /*
182 * The size of tcp_saveipgen must be the size of the max ip header,
183 * now IPv6.
184 */
185 u_char tcp_saveipgen[IP6_HDR_LEN];
186 struct tcphdr tcp_savetcp;
187 short ostate = 0;
188#endif
189 /*
190 * The following if statment will be true if
191 * we are doing the win_up_in_fp <and>
192 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
193 * - No more new data, but we have an ack for new data
194 * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
195 * - No more new data, the same ack point but the window grew
196 * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd)
197 */
198 if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
199 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
200 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
201 /* keep track of pure window updates */
202 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
203 winup_only = 1;
204 TCPSTAT_INC(tcps_rcvwinupd);
205 }
206 tp->snd_wnd = tiwin;
207 tp->snd_wl1 = th->th_seq;
208 tp->snd_wl2 = th->th_ack;
209 if (tp->snd_wnd > tp->max_sndwnd)
210 tp->max_sndwnd = tp->snd_wnd;
211 }
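	/*
	 * Editorial worked example (illustrative numbers only): with
	 * snd_wl1 = 100, snd_wl2 = 500 and snd_wnd = 8192, the test
	 * above accepts th_seq = 101 (newer data seen), or th_seq = 100
	 * with th_ack = 501 (newer data acked), or th_seq = 100,
	 * th_ack = 500 with tiwin = 16384 (pure window growth), while a
	 * reordered segment with th_seq = 99 is rejected so that its
	 * stale window advertisement cannot overwrite snd_wnd.
	 */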
212 /*
213 * If last ACK falls within this segment's sequence numbers,
214 * record the timestamp.
215 * NOTE that the test is modified according to the latest
216 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
217 */
218 if ((to->to_flags & TOF_TS) != 0 &&
219 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
220 tp->ts_recent_age = tcp_ts_getticks();
221 tp->ts_recent = to->to_tsval;
222 }
223 /*
224 * This is a pure ack for outstanding data.
225 */
226 if (ti_locked == TI_RLOCKED) {
227 INP_INFO_RUNLOCK(&V_tcbinfo);
228 }
229 ti_locked = TI_UNLOCKED;
230
231 TCPSTAT_INC(tcps_predack);
232
233 /*
234 * "bad retransmit" recovery.
235 */
236 if (tp->t_rxtshift == 1 &&
237 tp->t_flags & TF_PREVVALID &&
238 (int)(ticks - tp->t_badrxtwin) < 0) {
239 cc_cong_signal(tp, th, CC_RTO_ERR);
240 }
241
242 /*
243 * Recalculate the transmit timer / rtt.
244 *
245 * Some boxes send broken timestamp replies
246 * during the SYN+ACK phase, ignore
247 * timestamps of 0 or we could calculate a
248 * huge RTT and blow up the retransmit timer.
249 */
250 if ((to->to_flags & TOF_TS) != 0 &&
251 to->to_tsecr) {
252 u_int t;
253
254 t = tcp_ts_getticks() - to->to_tsecr;
255 if (!tp->t_rttlow || tp->t_rttlow > t)
256 tp->t_rttlow = t;
257 tcp_xmit_timer(tp,
258 TCP_TS_TO_TICKS(t) + 1);
259 } else if (tp->t_rtttime &&
260 SEQ_GT(th->th_ack, tp->t_rtseq)) {
261 if (!tp->t_rttlow ||
262 tp->t_rttlow > ticks - tp->t_rtttime)
263 tp->t_rttlow = ticks - tp->t_rtttime;
264 tcp_xmit_timer(tp,
265 ticks - tp->t_rtttime);
266 }
267 if (winup_only == 0) {
268 acked = BYTES_THIS_ACK(tp, th);
269
270 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
271 hhook_run_tcp_est_in(tp, th, to);
272
273 TCPSTAT_ADD(tcps_rcvackbyte, acked);
274 sbdrop(&so->so_snd, acked);
275 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
276 SEQ_LEQ(th->th_ack, tp->snd_recover))
277 tp->snd_recover = th->th_ack - 1;
278
279 /*
280 * Let the congestion control algorithm update
281 * congestion control related information. This
282 * typically means increasing the congestion
283 * window.
284 */
285 cc_ack_received(tp, th, CC_ACK);
286
287 tp->snd_una = th->th_ack;
288 /*
289 * Pull snd_wl2 up to prevent seq wrap relative
290 * to th_ack.
291 */
292 tp->snd_wl2 = th->th_ack;
293 tp->t_dupacks = 0;
294 m_freem(m);
295
296 /*
297 * If all outstanding data are acked, stop
298 * retransmit timer, otherwise restart timer
299 * using current (possibly backed-off) value.
300 * If process is waiting for space,
301 * wakeup/selwakeup/signal. If data
302 * are ready to send, let tcp_output
303 * decide between more output or persist.
304 */
305#ifdef TCPDEBUG
306 if (so->so_options & SO_DEBUG)
307 tcp_trace(TA_INPUT, ostate, tp,
308 (void *)tcp_saveipgen,
309 &tcp_savetcp, 0);
310#endif
311 if (tp->snd_una == tp->snd_max)
312 tcp_timer_activate(tp, TT_REXMT, 0);
313 else if (!tcp_timer_active(tp, TT_PERSIST))
314 tcp_timer_activate(tp, TT_REXMT,
315 tp->t_rxtcur);
316 } else {
317 /*
318 * Window update only, just free the mbufs and
319 * send out whatever we can.
320 */
321 m_freem(m);
322 }
323 sowwakeup(so);
324 if (sbavail(&so->so_snd))
325 (void) tcp_output(tp);
326 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
327 __func__, ti_locked));
328 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
329 INP_WLOCK_ASSERT(tp->t_inpcb);
330
331 if (tp->t_flags & TF_DELACK) {
332 tp->t_flags &= ~TF_DELACK;
333 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
334 }
335 INP_WUNLOCK(tp->t_inpcb);
336}
337
338/*
339 * Here nothing is really faster; it's just that we
340 * have broken out the fast-data path as well, just like
341 * the fast-ack.
342 */
343static void
344tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
345 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
346 int ti_locked, u_long tiwin)
347{
348 int newsize = 0; /* automatic sockbuf scaling */
349#ifdef TCPDEBUG
350 /*
351 * The size of tcp_saveipgen must be the size of the max ip header,
352 * now IPv6.
353 */
354 u_char tcp_saveipgen[IP6_HDR_LEN];
355 struct tcphdr tcp_savetcp;
356 short ostate = 0;
357#endif
358 /*
359 * If last ACK falls within this segment's sequence numbers,
360 * record the timestamp.
361 * NOTE that the test is modified according to the latest
362 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
363 */
364 if ((to->to_flags & TOF_TS) != 0 &&
365 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
366 tp->ts_recent_age = tcp_ts_getticks();
367 tp->ts_recent = to->to_tsval;
368 }
369
370 /*
371 * This is a pure, in-sequence data packet with
372 * nothing on the reassembly queue and we have enough
373 * buffer space to take it.
374 */
375 if (ti_locked == TI_RLOCKED) {
376 INP_INFO_RUNLOCK(&V_tcbinfo);
377 }
378 ti_locked = TI_UNLOCKED;
379
380 /* Clean receiver SACK report if present */
381 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
382 tcp_clean_sackreport(tp);
383 TCPSTAT_INC(tcps_preddat);
384 tp->rcv_nxt += tlen;
385 /*
386 * Pull snd_wl1 up to prevent seq wrap relative to
387 * th_seq.
388 */
389 tp->snd_wl1 = th->th_seq;
390 /*
391 * Pull rcv_up up to prevent seq wrap relative to
392 * rcv_nxt.
393 */
394 tp->rcv_up = tp->rcv_nxt;
395 TCPSTAT_ADD(tcps_rcvbyte, tlen);
396#ifdef TCPDEBUG
397 if (so->so_options & SO_DEBUG)
398 tcp_trace(TA_INPUT, ostate, tp,
399 (void *)tcp_saveipgen, &tcp_savetcp, 0);
400#endif
401 /*
402 * Automatic sizing of receive socket buffer. Often the send
403 * buffer size is not optimally adjusted to the actual network
404 * conditions at hand (delay bandwidth product). Setting the
405 * buffer size too small limits throughput on links with high
406 * bandwidth and high delay (eg. trans-continental/oceanic links).
407 *
408 * On the receive side the socket buffer memory is only rarely
409 * used to any significant extent. This allows us to be much
410 * more aggressive in scaling the receive socket buffer. For
411 * the case that the buffer space is actually used to a large
412 * extent and we run out of kernel memory we can simply drop
413 * the new segments; TCP on the sender will just retransmit it
414 * later. Setting the buffer size too big may only consume too
415 * much kernel memory if the application doesn't read() from
416 * the socket or packet loss or reordering makes use of the
417 * reassembly queue.
418 *
419 * The criteria to step up the receive buffer one notch are:
420 * 1. Application has not set receive buffer size with
421 * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
422 * 2. the number of bytes received during the time it takes
423 * one timestamp to be reflected back to us (the RTT);
424 * 3. received bytes per RTT is within seven eighth of the
425 * current socket buffer size;
426 * 4. receive buffer size has not hit maximal automatic size;
427 *
428 * This algorithm does one step per RTT at most and only if
429 * we receive a bulk stream w/o packet losses or reorderings.
430 * Shrinking the buffer during idle times is not necessary as
431 * it doesn't consume any memory when idle.
432 *
433 * TODO: Only step up if the application is actually serving
434 * the buffer to better manage the socket buffer resources.
435 */
436 if (V_tcp_do_autorcvbuf &&
437 (to->to_flags & TOF_TS) &&
438 to->to_tsecr &&
439 (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
440 if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
441 to->to_tsecr - tp->rfbuf_ts < hz) {
442 if (tp->rfbuf_cnt >
443 (so->so_rcv.sb_hiwat / 8 * 7) &&
444 so->so_rcv.sb_hiwat <
445 V_tcp_autorcvbuf_max) {
446 newsize =
447 min(so->so_rcv.sb_hiwat +
448 V_tcp_autorcvbuf_inc,
449 V_tcp_autorcvbuf_max);
450 }
451 /* Start over with next RTT. */
452 tp->rfbuf_ts = 0;
453 tp->rfbuf_cnt = 0;
454 } else
455 tp->rfbuf_cnt += tlen; /* add up */
456 }
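	/*
	 * Editorial worked example (illustrative numbers only): with
	 * sb_hiwat at 64 KB the 7/8 threshold above is 57344 bytes. If
	 * more than that arrives within one reflected-timestamp RTT and
	 * sb_hiwat is still below V_tcp_autorcvbuf_max, newsize steps
	 * the buffer up by V_tcp_autorcvbuf_inc; the counters are then
	 * reset so the step happens at most once per RTT.
	 */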
457
458 /* Add data to socket buffer. */
459 SOCKBUF_LOCK(&so->so_rcv);
460 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
461 m_freem(m);
462 } else {
463 /*
464 * Set new socket buffer size.
465 * Give up when limit is reached.
466 */
467 if (newsize)
468 if (!sbreserve_locked(&so->so_rcv,
469 newsize, so, NULL))
470 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
471 m_adj(m, drop_hdrlen); /* delayed header drop */
472 sbappendstream_locked(&so->so_rcv, m, 0);
473 }
474 /* NB: sorwakeup_locked() does an implicit unlock. */
475 sorwakeup_locked(so);
476 if (DELAY_ACK(tp, tlen)) {
477 tp->t_flags |= TF_DELACK;
478 } else {
479 tp->t_flags |= TF_ACKNOW;
480 tcp_output(tp);
481 }
482 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
483 __func__, ti_locked));
484 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
485 INP_WLOCK_ASSERT(tp->t_inpcb);
486
487 if (tp->t_flags & TF_DELACK) {
488 tp->t_flags &= ~TF_DELACK;
489 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
490 }
491 INP_WUNLOCK(tp->t_inpcb);
492}
493
494/*
495 * The slow-path is a clone of the long part
496 * of tcp_do_segment past all the fast-path stuff. It
497 * is used here by two different callers: the fast/slow and
498 * the fastack-only variants.
499 */
500static void
501tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
502 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
503 int ti_locked, u_long tiwin, int thflags)
504{
505 int acked, ourfinisacked, needoutput = 0;
506 int rstreason, todrop, win;
507 char *s;
508 struct in_conninfo *inc;
509 struct mbuf *mfree = NULL;
510#ifdef TCPDEBUG
511 /*
512 * The size of tcp_saveipgen must be the size of the max ip header,
513 * now IPv6.
514 */
515 u_char tcp_saveipgen[IP6_HDR_LEN];
516 struct tcphdr tcp_savetcp;
517 short ostate = 0;
518#endif
519 /*
520 * Calculate amount of space in receive window,
521 * and then do TCP input processing.
522 * Receive window is amount of space in rcv queue,
523 * but not less than advertised window.
524 */
525 inc = &tp->t_inpcb->inp_inc;
526 win = sbspace(&so->so_rcv);
527 if (win < 0)
528 win = 0;
529 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
530
531 /* Reset receive buffer auto scaling when not in bulk receive mode. */
532 tp->rfbuf_ts = 0;
533 tp->rfbuf_cnt = 0;
534
535 switch (tp->t_state) {
536
537 /*
538 * If the state is SYN_RECEIVED:
539 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
540 */
541 case TCPS_SYN_RECEIVED:
542 if ((thflags & TH_ACK) &&
543 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
544 SEQ_GT(th->th_ack, tp->snd_max))) {
545 rstreason = BANDLIM_RST_OPENPORT;
546 goto dropwithreset;
547 }
548 break;
549
550 /*
551 * If the state is SYN_SENT:
552 * if seg contains an ACK, but not for our SYN, drop the input.
553 * if seg contains a RST, then drop the connection.
554 * if seg does not contain SYN, then drop it.
555 * Otherwise this is an acceptable SYN segment
556 * initialize tp->rcv_nxt and tp->irs
557 * if seg contains ack then advance tp->snd_una
558 * if seg contains an ECE and ECN support is enabled, the stream
559 * is ECN capable.
560 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
561 * arrange for segment to be acked (eventually)
562 * continue processing rest of data/controls, beginning with URG
563 */
564 case TCPS_SYN_SENT:
565 if ((thflags & TH_ACK) &&
566 (SEQ_LEQ(th->th_ack, tp->iss) ||
567 SEQ_GT(th->th_ack, tp->snd_max))) {
568 rstreason = BANDLIM_UNLIMITED;
569 goto dropwithreset;
570 }
571 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
572 TCP_PROBE5(connect__refused, NULL, tp,
573 mtod(m, const char *), tp, th);
574 tp = tcp_drop(tp, ECONNREFUSED);
575 }
576 if (thflags & TH_RST)
577 goto drop;
578 if (!(thflags & TH_SYN))
579 goto drop;
580
581 tp->irs = th->th_seq;
582 tcp_rcvseqinit(tp);
583 if (thflags & TH_ACK) {
584 TCPSTAT_INC(tcps_connects);
585 soisconnected(so);
586#ifdef MAC
587 mac_socketpeer_set_from_mbuf(m, so);
588#endif
589 /* Do window scaling on this connection? */
590 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
591 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
592 tp->rcv_scale = tp->request_r_scale;
593 }
594 tp->rcv_adv += imin(tp->rcv_wnd,
595 TCP_MAXWIN << tp->rcv_scale);
596 tp->snd_una++; /* SYN is acked */
597 /*
598 * If there's data, delay ACK; if there's also a FIN
599 * ACKNOW will be turned on later.
600 */
601 if (DELAY_ACK(tp, tlen) && tlen != 0)
602 tcp_timer_activate(tp, TT_DELACK,
603 tcp_delacktime);
604 else
605 tp->t_flags |= TF_ACKNOW;
606
607 if ((thflags & TH_ECE) && V_tcp_do_ecn) {
608 tp->t_flags |= TF_ECN_PERMIT;
609 TCPSTAT_INC(tcps_ecn_shs);
610 }
611
612 /*
613 * Received <SYN,ACK> in SYN_SENT[*] state.
614 * Transitions:
615 * SYN_SENT --> ESTABLISHED
616 * SYN_SENT* --> FIN_WAIT_1
617 */
618 tp->t_starttime = ticks;
619 if (tp->t_flags & TF_NEEDFIN) {
620 tcp_state_change(tp, TCPS_FIN_WAIT_1);
621 tp->t_flags &= ~TF_NEEDFIN;
622 thflags &= ~TH_SYN;
623 } else {
624 tcp_state_change(tp, TCPS_ESTABLISHED);
625 TCP_PROBE5(connect__established, NULL, tp,
626 mtod(m, const char *), tp, th);
627 cc_conn_init(tp);
628 tcp_timer_activate(tp, TT_KEEP,
629 TP_KEEPIDLE(tp));
630 }
631 } else {
632 /*
633 * Received initial SYN in SYN-SENT[*] state =>
634 * simultaneous open.
635 * If it succeeds, connection is half-synchronized.
636 * Otherwise, do 3-way handshake:
637 * SYN-SENT -> SYN-RECEIVED
638 * SYN-SENT* -> SYN-RECEIVED*
639 */
640 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
641 tcp_timer_activate(tp, TT_REXMT, 0);
642 tcp_state_change(tp, TCPS_SYN_RECEIVED);
643 }
644
645 KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
646 "ti_locked %d", __func__, ti_locked));
647 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
648 INP_WLOCK_ASSERT(tp->t_inpcb);
649
650 /*
651 * Advance th->th_seq to correspond to first data byte.
652 * If data, trim to stay within window,
653 * dropping FIN if necessary.
654 */
655 th->th_seq++;
656 if (tlen > tp->rcv_wnd) {
657 todrop = tlen - tp->rcv_wnd;
658 m_adj(m, -todrop);
659 tlen = tp->rcv_wnd;
660 thflags &= ~TH_FIN;
661 TCPSTAT_INC(tcps_rcvpackafterwin);
662 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
663 }
664 tp->snd_wl1 = th->th_seq - 1;
665 tp->rcv_up = th->th_seq;
666 /*
667 * Client side of transaction: already sent SYN and data.
668 * If the remote host used T/TCP to validate the SYN,
669 * our data will be ACK'd; if so, enter normal data segment
670 * processing in the middle of step 5, ack processing.
671 * Otherwise, goto step 6.
672 */
673 if (thflags & TH_ACK)
674 goto process_ACK;
675
676 goto step6;
677
678 /*
679 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
680 * do normal processing.
681 *
682 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
683 */
684 case TCPS_LAST_ACK:
685 case TCPS_CLOSING:
686 break; /* continue normal processing */
687 }
688
689 /*
690 * States other than LISTEN or SYN_SENT.
691 * First check the RST flag and sequence number since reset segments
692 * are exempt from the timestamp and connection count tests. This
693 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
694 * below which allowed reset segments in half the sequence space
695 * to fall through and be processed (which gives forged reset
696 * segments with a random sequence number a 50 percent chance of
697 * killing a connection).
698 * Then check timestamp, if present.
699 * Then check the connection count, if present.
700 * Then check that at least some bytes of segment are within
701 * receive window. If segment begins before rcv_nxt,
702 * drop leading data (and SYN); if nothing left, just ack.
703 */
704 if (thflags & TH_RST) {
705 /*
706 * RFC5961 Section 3.2
707 *
708 * - RST drops connection only if SEG.SEQ == RCV.NXT.
709 * - If RST is in window, we send challenge ACK.
710 *
711 * Note: to take into account delayed ACKs, we should
712 * test against last_ack_sent instead of rcv_nxt.
713 * Note 2: we handle special case of closed window, not
714 * covered by the RFC.
715 */
716 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
717 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
718 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
719 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
720 KASSERT(ti_locked == TI_RLOCKED,
721 ("%s: TH_RST ti_locked %d, th %p tp %p",
722 __func__, ti_locked, th, tp));
723 KASSERT(tp->t_state != TCPS_SYN_SENT,
724 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
725 __func__, th, tp));
726
727 if (V_tcp_insecure_rst ||
728 tp->last_ack_sent == th->th_seq) {
729 TCPSTAT_INC(tcps_drops);
730 /* Drop the connection. */
731 switch (tp->t_state) {
732 case TCPS_SYN_RECEIVED:
733 so->so_error = ECONNREFUSED;
734 goto close;
735 case TCPS_ESTABLISHED:
736 case TCPS_FIN_WAIT_1:
737 case TCPS_FIN_WAIT_2:
738 case TCPS_CLOSE_WAIT:
739 so->so_error = ECONNRESET;
740 close:
741 tcp_state_change(tp, TCPS_CLOSED);
742 /* FALLTHROUGH */
743 default:
744 tp = tcp_close(tp);
745 }
746 } else {
747 TCPSTAT_INC(tcps_badrst);
748 /* Send challenge ACK. */
749 tcp_respond(tp, mtod(m, void *), th, m,
750 tp->rcv_nxt, tp->snd_nxt, TH_ACK);
751 tp->last_ack_sent = tp->rcv_nxt;
752 m = NULL;
753 }
754 }
755 goto drop;
756 }
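	/*
	 * Editorial worked example for the RFC5961 RST handling above
	 * (illustrative numbers only, V_tcp_insecure_rst off): with
	 * last_ack_sent = 1000 and rcv_wnd = 8192, a RST at seq 1000
	 * drops the connection, a RST at seq 1500 is in-window and only
	 * elicits a challenge ACK, and a RST at seq 20000 is silently
	 * dropped.
	 */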
757
758 /*
759 * RFC5961 Section 4.2
760 * Send challenge ACK for any SYN in synchronized state.
761 */
762 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) {
763 KASSERT(ti_locked == TI_RLOCKED,
764 ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
765 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
766
767 TCPSTAT_INC(tcps_badsyn);
768 if (V_tcp_insecure_syn &&
769 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
770 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
771 tp = tcp_drop(tp, ECONNRESET);
772 rstreason = BANDLIM_UNLIMITED;
773 } else {
774 /* Send challenge ACK. */
775 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
776 tp->snd_nxt, TH_ACK);
777 tp->last_ack_sent = tp->rcv_nxt;
778 m = NULL;
779 }
780 goto drop;
781 }
782
783 /*
784 * RFC 1323 PAWS: If we have a timestamp reply on this segment
785 * and it's less than ts_recent, drop it.
786 */
787 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
788 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
789
790 /* Check to see if ts_recent is over 24 days old. */
791 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
792 /*
793 * Invalidate ts_recent. If this segment updates
794 * ts_recent, the age will be reset later and ts_recent
795 * will get a valid value. If it does not, setting
796 * ts_recent to zero will at least satisfy the
797 * requirement that zero be placed in the timestamp
798 * echo reply when ts_recent isn't valid. The
799 * age isn't reset until we get a valid ts_recent
800 * because we don't want out-of-order segments to be
801 * dropped when ts_recent is old.
802 */
803 tp->ts_recent = 0;
804 } else {
805 TCPSTAT_INC(tcps_rcvduppack);
806 TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
807 TCPSTAT_INC(tcps_pawsdrop);
808 if (tlen)
809 goto dropafterack;
810 goto drop;
811 }
812 }
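	/*
	 * Editorial worked example for the PAWS test above (illustrative
	 * numbers only): with ts_recent = 5000000, a segment echoing
	 * to_tsval = 4999000 is counted as a duplicate and dropped
	 * (acked first if it carried data), unless ts_recent_age shows
	 * the value is over 24 days stale, in which case ts_recent is
	 * merely invalidated and the segment proceeds.
	 */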
813
814 /*
815 * In the SYN-RECEIVED state, validate that the packet belongs to
816 * this connection before trimming the data to fit the receive
817 * window. Check the sequence number versus IRS since we know
818 * the sequence numbers haven't wrapped. This is a partial fix
819 * for the "LAND" DoS attack.
820 */
821 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
822 rstreason = BANDLIM_RST_OPENPORT;
823 goto dropwithreset;
824 }
825
826 todrop = tp->rcv_nxt - th->th_seq;
827 if (todrop > 0) {
828 if (thflags & TH_SYN) {
829 thflags &= ~TH_SYN;
830 th->th_seq++;
831 if (th->th_urp > 1)
832 th->th_urp--;
833 else
834 thflags &= ~TH_URG;
835 todrop--;
836 }
837 /*
838 * Following if statement from Stevens, vol. 2, p. 960.
839 */
840 if (todrop > tlen
841 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
842 /*
843 * Any valid FIN must be to the left of the window.
844 * At this point the FIN must be a duplicate or out
845 * of sequence; drop it.
846 */
847 thflags &= ~TH_FIN;
848
849 /*
850 * Send an ACK to resynchronize and drop any data.
851 * But keep on processing for RST or ACK.
852 */
853 tp->t_flags |= TF_ACKNOW;
854 todrop = tlen;
855 TCPSTAT_INC(tcps_rcvduppack);
856 TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
857 } else {
858 TCPSTAT_INC(tcps_rcvpartduppack);
859 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
860 }
861 drop_hdrlen += todrop; /* drop from the top afterwards */
862 th->th_seq += todrop;
863 tlen -= todrop;
864 if (th->th_urp > todrop)
865 th->th_urp -= todrop;
866 else {
867 thflags &= ~TH_URG;
868 th->th_urp = 0;
869 }
870 }
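	/*
	 * Editorial worked example for the leading-overlap trim above
	 * (illustrative numbers only): with rcv_nxt = 1000, a segment
	 * with th_seq = 900 and tlen = 300 yields todrop = 100; the
	 * first 100 bytes are marked for removal via drop_hdrlen,
	 * th_seq advances to 1000 and tlen shrinks to 200, so only the
	 * genuinely new bytes are processed.
	 */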
871
872 /*
873 * If new data are received on a connection after the
874 * user processes are gone, then RST the other end.
875 */
876 if ((so->so_state & SS_NOFDREF) &&
877 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
878 KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
879 "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
880 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
881
882 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
883 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
884 "after socket was closed, "
885 "sending RST and removing tcpcb\n",
886 s, __func__, tcpstates[tp->t_state], tlen);
887 free(s, M_TCPLOG);
888 }
889 tp = tcp_close(tp);
890 TCPSTAT_INC(tcps_rcvafterclose);
891 rstreason = BANDLIM_UNLIMITED;
892 goto dropwithreset;
893 }
894
895 /*
896 * If segment ends after window, drop trailing data
897 * (and PUSH and FIN); if nothing left, just ACK.
898 */
899 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
900 if (todrop > 0) {
901 TCPSTAT_INC(tcps_rcvpackafterwin);
902 if (todrop >= tlen) {
903 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
904 /*
905 * If window is closed can only take segments at
906 * window edge, and have to drop data and PUSH from
907 * incoming segments. Continue processing, but
908 * remember to ack. Otherwise, drop segment
909 * and ack.
910 */
911 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
912 tp->t_flags |= TF_ACKNOW;
913 TCPSTAT_INC(tcps_rcvwinprobe);
914 } else
915 goto dropafterack;
916 } else
917 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
918 m_adj(m, -todrop);
919 tlen -= todrop;
920 thflags &= ~(TH_PUSH|TH_FIN);
921 }
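	/*
	 * Editorial worked example for the trailing trim above
	 * (illustrative numbers only): with rcv_nxt + rcv_wnd = 3000, a
	 * segment with th_seq = 2000 and tlen = 1500 ends 500 bytes past
	 * the window; m_adj(m, -500) trims them, tlen becomes 1000 and
	 * any PUSH/FIN that rode on the trimmed bytes is cleared.
	 */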
922
923 /*
924 * If last ACK falls within this segment's sequence numbers,
925 * record its timestamp.
926 * NOTE:
927 * 1) That the test incorporates suggestions from the latest
928 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
929 * 2) That updating only on newer timestamps interferes with
930 * our earlier PAWS tests, so this check should be solely
931 * predicated on the sequence space of this segment.
932 * 3) That we modify the segment boundary check to be
933 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
934 * instead of RFC1323's
935 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
936 * This modified check allows us to overcome RFC1323's
937 * limitations as described in Stevens TCP/IP Illustrated
938 * Vol. 2 p.869. In such cases, we can still calculate the
939 * RTT correctly when RCV.NXT == Last.ACK.Sent.
940 */
941 if ((to->to_flags & TOF_TS) != 0 &&
942 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
943 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
944 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
945 tp->ts_recent_age = tcp_ts_getticks();
946 tp->ts_recent = to->to_tsval;
947 }
948
949 /*
950 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
951 * flag is on (half-synchronized state), then queue data for
952 * later processing; else drop segment and return.
953 */
954 if ((thflags & TH_ACK) == 0) {
955 if (tp->t_state == TCPS_SYN_RECEIVED ||
956 (tp->t_flags & TF_NEEDSYN))
957 goto step6;
958 else if (tp->t_flags & TF_ACKNOW)
959 goto dropafterack;
960 else
961 goto drop;
962 }
963
964 /*
965 * Ack processing.
966 */
967 switch (tp->t_state) {
968
969 /*
970 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
971 * ESTABLISHED state and continue processing.
972 * The ACK was checked above.
973 */
974 case TCPS_SYN_RECEIVED:
975
976 TCPSTAT_INC(tcps_connects);
977 soisconnected(so);
978 /* Do window scaling? */
979 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
980 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
981 tp->rcv_scale = tp->request_r_scale;
982 tp->snd_wnd = tiwin;
983 }
984 /*
985 * Make transitions:
986 * SYN-RECEIVED -> ESTABLISHED
987 * SYN-RECEIVED* -> FIN-WAIT-1
988 */
989 tp->t_starttime = ticks;
990 if (tp->t_flags & TF_NEEDFIN) {
991 tcp_state_change(tp, TCPS_FIN_WAIT_1);
992 tp->t_flags &= ~TF_NEEDFIN;
993 } else {
994 tcp_state_change(tp, TCPS_ESTABLISHED);
995 TCP_PROBE5(accept__established, NULL, tp,
996 mtod(m, const char *), tp, th);
997 cc_conn_init(tp);
998 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
999 }
1000 /*
1001 * If segment contains data or ACK, will call tcp_reass()
1002 * later; if not, do so now to pass queued data to user.
1003 */
1004 if (tlen == 0 && (thflags & TH_FIN) == 0)
1005 (void) tcp_reass(tp, (struct tcphdr *)0, 0,
1006 (struct mbuf *)0);
1007 tp->snd_wl1 = th->th_seq - 1;
1008 /* FALLTHROUGH */
1009
1010 /*
1011 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1012 * ACKs. If the ack is in the range
1013 * tp->snd_una < th->th_ack <= tp->snd_max
1014 * then advance tp->snd_una to th->th_ack and drop
1015 * data from the retransmission queue. If this ACK reflects
1016 * more up to date window information we update our window information.
1017 */
1018 case TCPS_ESTABLISHED:
1019 case TCPS_FIN_WAIT_1:
1020 case TCPS_FIN_WAIT_2:
1021 case TCPS_CLOSE_WAIT:
1022 case TCPS_CLOSING:
1023 case TCPS_LAST_ACK:
1024 if (SEQ_GT(th->th_ack, tp->snd_max)) {
1025 TCPSTAT_INC(tcps_rcvacktoomuch);
1026 goto dropafterack;
1027 }
1028 if ((tp->t_flags & TF_SACK_PERMIT) &&
1029 ((to->to_flags & TOF_SACK) ||
1030 !TAILQ_EMPTY(&tp->snd_holes)))
1031 tcp_sack_doack(tp, to, th->th_ack);
1032 else
1033 /*
1034 * Reset the value so that previous (valid) value
1035 * from the last ack with SACK doesn't get used.
1036 */
1037 tp->sackhint.sacked_bytes = 0;
1038
1039 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
1040 hhook_run_tcp_est_in(tp, th, to);
1041
1042 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1043 if (tlen == 0 && tiwin == tp->snd_wnd) {
1044 /*
1045 * If this is the first time we've seen a
1046 * FIN from the remote, this is not a
1047 * duplicate and it needs to be processed
1048 * normally. This happens during a
1049 * simultaneous close.
1050 */
1051 if ((thflags & TH_FIN) &&
1052 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
1053 tp->t_dupacks = 0;
1054 break;
1055 }
1056 TCPSTAT_INC(tcps_rcvdupack);
1057 /*
1058 * If we have outstanding data (other than
1059 * a window probe), this is a completely
1060 * duplicate ack (ie, window info didn't
1061 * change and FIN isn't set),
1062 * the ack is the biggest we've
1063 * seen and we've seen exactly our rexmt
1064 * threshold of them, assume a packet
1065 * has been dropped and retransmit it.
1066 * Kludge snd_nxt & the congestion
1067 * window so we send only this one
1068 * packet.
1069 *
1070 * We know we're losing at the current
1071 * window size so do congestion avoidance
1072 * (set ssthresh to half the current window
1073 * and pull our congestion window back to
1074 * the new ssthresh).
1075 *
1076 * Dup acks mean that packets have left the
1077 * network (they're now cached at the receiver)
1078 * so bump cwnd by the amount in the receiver
1079 * to keep a constant cwnd packets in the
1080 * network.
1081 *
1082 * When using TCP ECN, notify the peer that
1083 * we reduced the cwnd.
1084 */
1085 if (!tcp_timer_active(tp, TT_REXMT) ||
1086 th->th_ack != tp->snd_una)
1087 tp->t_dupacks = 0;
1088 else if (++tp->t_dupacks > tcprexmtthresh ||
1089 IN_FASTRECOVERY(tp->t_flags)) {
1090 cc_ack_received(tp, th, CC_DUPACK);
1091 if ((tp->t_flags & TF_SACK_PERMIT) &&
1092 IN_FASTRECOVERY(tp->t_flags)) {
1093 int awnd;
1094
1095 /*
1096 * Compute the amount of data in flight first.
1097 * We can inject new data into the pipe iff
1098 * we have less than 1/2 the original window's
1099 * worth of data in flight.
1100 */
1101 if (V_tcp_do_rfc6675_pipe)
1102 awnd = tcp_compute_pipe(tp);
1103 else
1104 awnd = (tp->snd_nxt - tp->snd_fack) +
1105 tp->sackhint.sack_bytes_rexmit;
1106
1107 if (awnd < tp->snd_ssthresh) {
1108 tp->snd_cwnd += tp->t_maxseg;
1109 if (tp->snd_cwnd > tp->snd_ssthresh)
1110 tp->snd_cwnd = tp->snd_ssthresh;
1111 }
1112 } else
1113 tp->snd_cwnd += tp->t_maxseg;
1114 (void) tp->t_fb->tfb_tcp_output(tp);
1115 goto drop;
1116 } else if (tp->t_dupacks == tcprexmtthresh) {
1117 tcp_seq onxt = tp->snd_nxt;
1118
1119 /*
1120 * If we're doing sack, check to
1121 * see if we're already in sack
1122 * recovery. If we're not doing sack,
1123 * check to see if we're in newreno
1124 * recovery.
1125 */
1126 if (tp->t_flags & TF_SACK_PERMIT) {
1127 if (IN_FASTRECOVERY(tp->t_flags)) {
1128 tp->t_dupacks = 0;
1129 break;
1130 }
1131 } else {
1132 if (SEQ_LEQ(th->th_ack,
1133 tp->snd_recover)) {
1134 tp->t_dupacks = 0;
1135 break;
1136 }
1137 }
1138 /* Congestion signal before ack. */
1139 cc_cong_signal(tp, th, CC_NDUPACK);
1140 cc_ack_received(tp, th, CC_DUPACK);
1141 tcp_timer_activate(tp, TT_REXMT, 0);
1142 tp->t_rtttime = 0;
1143 if (tp->t_flags & TF_SACK_PERMIT) {
1144 TCPSTAT_INC(
1145 tcps_sack_recovery_episode);
1146 tp->sack_newdata = tp->snd_nxt;
1147 tp->snd_cwnd = tp->t_maxseg;
1148 (void) tp->t_fb->tfb_tcp_output(tp);
1149 goto drop;
1150 }
1151 tp->snd_nxt = th->th_ack;
1152 tp->snd_cwnd = tp->t_maxseg;
1153 (void) tp->t_fb->tfb_tcp_output(tp);
1154 KASSERT(tp->snd_limited <= 2,
1155 ("%s: tp->snd_limited too big",
1156 __func__));
1157 tp->snd_cwnd = tp->snd_ssthresh +
1158 tp->t_maxseg *
1159 (tp->t_dupacks - tp->snd_limited);
1160 if (SEQ_GT(onxt, tp->snd_nxt))
1161 tp->snd_nxt = onxt;
1162 goto drop;
1163 } else if (V_tcp_do_rfc3042) {
1164 /*
1165 * Process first and second duplicate
1166 * ACKs. Each indicates a segment
1167 * leaving the network, creating room
1168 * for more. Make sure we can send a
1169 * packet on reception of each duplicate
1170 * ACK by increasing snd_cwnd by one
1171 * segment. Restore the original
1172 * snd_cwnd after packet transmission.
1173 */
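/*
 * Sketch of the limited transmit arithmetic used below: with
 * flight = snd_nxt - snd_una, cwnd is temporarily set to
 * flight + (t_dupacks - snd_limited) * t_maxseg, which opens
 * room for at most one previously unsent segment per duplicate
 * ACK (snd_limited records how many such segments have already
 * been sent); the original cwnd is restored once tcp_output()
 * has run.
 */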
1174 cc_ack_received(tp, th, CC_DUPACK);
1175 u_long oldcwnd = tp->snd_cwnd;
1176 tcp_seq oldsndmax = tp->snd_max;
1177 u_int sent;
1178 int avail;
1179
1180 KASSERT(tp->t_dupacks == 1 ||
1181 tp->t_dupacks == 2,
1182 ("%s: dupacks not 1 or 2",
1183 __func__));
1184 if (tp->t_dupacks == 1)
1185 tp->snd_limited = 0;
1186 tp->snd_cwnd =
1187 (tp->snd_nxt - tp->snd_una) +
1188 (tp->t_dupacks - tp->snd_limited) *
1189 tp->t_maxseg;
1190 /*
1191 * Only call tcp_output when there
1192 * is new data available to be sent.
1193 * Otherwise we would send pure ACKs.
1194 */
1195 SOCKBUF_LOCK(&so->so_snd);
1196 avail = sbavail(&so->so_snd) -
1197 (tp->snd_nxt - tp->snd_una);
1198 SOCKBUF_UNLOCK(&so->so_snd);
1199 if (avail > 0)
1200 (void) tp->t_fb->tfb_tcp_output(tp);
1201 sent = tp->snd_max - oldsndmax;
1202 if (sent > tp->t_maxseg) {
1203 KASSERT((tp->t_dupacks == 2 &&
1204 tp->snd_limited == 0) ||
1205 (sent == tp->t_maxseg + 1 &&
1206 tp->t_flags & TF_SENTFIN),
1207 ("%s: sent too much",
1208 __func__));
1209 tp->snd_limited = 2;
1210 } else if (sent > 0)
1211 ++tp->snd_limited;
1212 tp->snd_cwnd = oldcwnd;
1213 goto drop;
1214 }
1215 } else
1216 tp->t_dupacks = 0;
1217 break;
1218 }
1219
1220 KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
1221 ("%s: th_ack <= snd_una", __func__));
1222
1223 /*
1224 * If the congestion window was inflated to account
1225 * for the other side's cached packets, retract it.
1226 */
1227 if (IN_FASTRECOVERY(tp->t_flags)) {
1228 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
1229 if (tp->t_flags & TF_SACK_PERMIT)
1230 tcp_sack_partialack(tp, th);
1231 else
1232 tcp_newreno_partial_ack(tp, th);
1233 } else
1234 cc_post_recovery(tp, th);
1235 }
1236 tp->t_dupacks = 0;
1237 /*
1238 * If we reach this point, ACK is not a duplicate,
1239 * i.e., it ACKs something we sent.
1240 */
1241 if (tp->t_flags & TF_NEEDSYN) {
1242 /*
1243 * T/TCP: Connection was half-synchronized, and our
1244 * SYN has been ACK'd (so connection is now fully
1245 * synchronized). Go to non-starred state,
1246 * increment snd_una for ACK of SYN, and check if
1247 * we can do window scaling.
1248 */
1249 tp->t_flags &= ~TF_NEEDSYN;
1250 tp->snd_una++;
1251 /* Do window scaling? */
1252 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1253 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1254 tp->rcv_scale = tp->request_r_scale;
1255 /* Send window already scaled. */
1256 }
1257 }
1258
1259process_ACK:
1260 INP_WLOCK_ASSERT(tp->t_inpcb);
1261
1262 acked = BYTES_THIS_ACK(tp, th);
1263 TCPSTAT_INC(tcps_rcvackpack);
1264 TCPSTAT_ADD(tcps_rcvackbyte, acked);
1265
1266 /*
1267 * If we just performed our first retransmit, and the ACK
1268 * arrives within our recovery window, then it was a mistake
1269 * to do the retransmit in the first place. Recover our
1270 * original cwnd and ssthresh, and proceed to transmit where
1271 * we left off.
1272 */
1273 if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
1274 (int)(ticks - tp->t_badrxtwin) < 0)
1275 cc_cong_signal(tp, th, CC_RTO_ERR);
1276
1277 /*
1278 * If we have a timestamp reply, update smoothed
1279 * round trip time. If no timestamp is present but
1280 * transmit timer is running and timed sequence
1281 * number was acked, update smoothed round trip time.
1282 * Since we now have an rtt measurement, cancel the
1283 * timer backoff (cf., Phil Karn's retransmit alg.).
1284 * Recompute the initial retransmit timer.
1285 *
1286 * Some boxes send broken timestamp replies
1287 * during the SYN+ACK phase; ignore timestamps
1288 * of 0, or we could calculate a huge RTT and
1289 * blow up the retransmit timer.
1290 */
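/*
 * Example with illustrative values: if the peer echoed
 * to_tsecr = 1000 and tcp_ts_getticks() now returns 1042, the
 * sample is t = 42 timestamp ticks; TCP_TS_TO_TICKS() converts
 * that into kernel ticks, and the +1 below keeps a very fast
 * RTT sample from rounding down to zero.
 */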
1291 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
1292 u_int t;
1293
1294 t = tcp_ts_getticks() - to->to_tsecr;
1295 if (!tp->t_rttlow || tp->t_rttlow > t)
1296 tp->t_rttlow = t;
1297 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
1298 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
1299 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
1300 tp->t_rttlow = ticks - tp->t_rtttime;
1301 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
1302 }
1303
1304 /*
1305 * If all outstanding data is acked, stop retransmit
1306 * timer and remember to restart (more output or persist).
1307 * If there is more data to be acked, restart retransmit
1308 * timer, using current (possibly backed-off) value.
1309 */
1310 if (th->th_ack == tp->snd_max) {
1311 tcp_timer_activate(tp, TT_REXMT, 0);
1312 needoutput = 1;
1313 } else if (!tcp_timer_active(tp, TT_PERSIST))
1314 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1315
1316 /*
1317 * If no data (only SYN) was ACK'd,
1318 * skip rest of ACK processing.
1319 */
1320 if (acked == 0)
1321 goto step6;
1322
1323 /*
1324 * Let the congestion control algorithm update congestion
1325 * control related information. This typically means increasing
1326 * the congestion window.
1327 */
1328 cc_ack_received(tp, th, CC_ACK);
1329
1330 SOCKBUF_LOCK(&so->so_snd);
1331 if (acked > sbavail(&so->so_snd)) {
1332 tp->snd_wnd -= sbavail(&so->so_snd);
1333 mfree = sbcut_locked(&so->so_snd,
1334 (int)sbavail(&so->so_snd));
1335 ourfinisacked = 1;
1336 } else {
1337 mfree = sbcut_locked(&so->so_snd, acked);
1338 tp->snd_wnd -= acked;
1339 ourfinisacked = 0;
1340 }
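/*
 * Note: acked can exceed what the send buffer holds only when
 * our FIN has been acked, since the FIN occupies one sequence
 * number but no buffer space; that is exactly what
 * ourfinisacked records for the state handling below.
 */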
1341 /* NB: sowwakeup_locked() does an implicit unlock. */
1342 sowwakeup_locked(so);
1343 m_freem(mfree);
1344 /* Detect una wraparound. */
1345 if (!IN_RECOVERY(tp->t_flags) &&
1346 SEQ_GT(tp->snd_una, tp->snd_recover) &&
1347 SEQ_LEQ(th->th_ack, tp->snd_recover))
1348 tp->snd_recover = th->th_ack - 1;
1349 /* XXXLAS: Can this be moved up into cc_post_recovery? */
1350 if (IN_RECOVERY(tp->t_flags) &&
1351 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
1352 EXIT_RECOVERY(tp->t_flags);
1353 }
1354 tp->snd_una = th->th_ack;
1355 if (tp->t_flags & TF_SACK_PERMIT) {
1356 if (SEQ_GT(tp->snd_una, tp->snd_recover))
1357 tp->snd_recover = tp->snd_una;
1358 }
1359 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1360 tp->snd_nxt = tp->snd_una;
1361
1362 switch (tp->t_state) {
1363
1364 /*
1365 * In FIN_WAIT_1 STATE in addition to the processing
1366 * for the ESTABLISHED state if our FIN is now acknowledged
1367 * then enter FIN_WAIT_2.
1368 */
1369 case TCPS_FIN_WAIT_1:
1370 if (ourfinisacked) {
1371 /*
1372 * If we can't receive any more
1373 * data, then closing user can proceed.
1374 * Starting the timer is contrary to the
1375 * specification, but if we don't get a FIN
1376 * we'll hang forever.
1377 *
1378 * XXXjl:
1379 * we should release the tp also, and use a
1380 * compressed state.
1381 */
1382 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1383 soisdisconnected(so);
1384 tcp_timer_activate(tp, TT_2MSL,
1385 (tcp_fast_finwait2_recycle ?
1386 tcp_finwait2_timeout :
1387 TP_MAXIDLE(tp)));
1388 }
1389 tcp_state_change(tp, TCPS_FIN_WAIT_2);
1390 }
1391 break;
1392
1393 /*
1394 * In CLOSING STATE in addition to the processing for
1395 * the ESTABLISHED state if the ACK acknowledges our FIN
1396 * then enter the TIME-WAIT state, otherwise ignore
1397 * the segment.
1398 */
1399 case TCPS_CLOSING:
1400 if (ourfinisacked) {
1401 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1402 tcp_twstart(tp);
1403 INP_INFO_RUNLOCK(&V_tcbinfo);
1404 m_freem(m);
1405 return;
1406 }
1407 break;
1408
1409 /*
1410 * In LAST_ACK, we may still be waiting for data to drain
1411 * and/or to be acked, as well as for the ack of our FIN.
1412 * If our FIN is now acknowledged, delete the TCB,
1413 * enter the closed state and return.
1414 */
1415 case TCPS_LAST_ACK:
1416 if (ourfinisacked) {
1417 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1418 tp = tcp_close(tp);
1419 goto drop;
1420 }
1421 break;
1422 }
1423 }
1424
1425step6:
1426 INP_WLOCK_ASSERT(tp->t_inpcb);
1427
1428 /*
1429 * Update window information.
1430 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1431 */
1432 if ((thflags & TH_ACK) &&
1433 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
1434 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
1435 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
1436 /* keep track of pure window updates */
1437 if (tlen == 0 &&
1438 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1439 TCPSTAT_INC(tcps_rcvwinupd);
1440 tp->snd_wnd = tiwin;
1441 tp->snd_wl1 = th->th_seq;
1442 tp->snd_wl2 = th->th_ack;
1443 if (tp->snd_wnd > tp->max_sndwnd)
1444 tp->max_sndwnd = tp->snd_wnd;
1445 needoutput = 1;
1446 }
1447
1448 /*
1449 * Process segments with URG.
1450 */
1451 if ((thflags & TH_URG) && th->th_urp &&
1452 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1453 /*
1454 * This is a kludge, but if we receive and accept
1455 * random urgent pointers, we'll crash in
1456 * soreceive. It's hard to imagine someone
1457 * actually wanting to send this much urgent data.
1458 */
1459 SOCKBUF_LOCK(&so->so_rcv);
1460 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
1461 th->th_urp = 0; /* XXX */
1462 thflags &= ~TH_URG; /* XXX */
1463 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
1464 goto dodata; /* XXX */
1465 }
1466 /*
1467 * If this segment advances the known urgent pointer,
1468 * then mark the data stream. This should not happen
1469 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1470 * a FIN has been received from the remote side.
1471 * In these states we ignore the URG.
1472 *
1473 * According to RFC961 (Assigned Protocols),
1474 * the urgent pointer points to the last octet
1475 * of urgent data. We continue, however,
1476 * to consider it to indicate the first octet
1477 * of data past the urgent section as the original
1478 * spec states (in one of two places).
1479 */
1480 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
1481 tp->rcv_up = th->th_seq + th->th_urp;
1482 so->so_oobmark = sbavail(&so->so_rcv) +
1483 (tp->rcv_up - tp->rcv_nxt) - 1;
1484 if (so->so_oobmark == 0)
1485 so->so_rcv.sb_state |= SBS_RCVATMARK;
1486 sohasoutofband(so);
1487 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1488 }
1489 SOCKBUF_UNLOCK(&so->so_rcv);
1490 /*
1491 * Remove out-of-band data so it doesn't get presented to the user.
1492 * This can happen independent of advancing the URG pointer,
1493 * but if two URGs are pending at once, some out-of-band
1494 * data may creep in... ick.
1495 */
1496 if (th->th_urp <= (u_long)tlen &&
1497 !(so->so_options & SO_OOBINLINE)) {
1498 /* hdr drop is delayed */
1499 tcp_pulloutofband(so, th, m, drop_hdrlen);
1500 }
1501 } else {
1502 /*
1503 * If no out of band data is expected,
1504 * pull receive urgent pointer along
1505 * with the receive window.
1506 */
1507 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1508 tp->rcv_up = tp->rcv_nxt;
1509 }
1510dodata: /* XXX */
1511 INP_WLOCK_ASSERT(tp->t_inpcb);
1512
1513 /*
1514 * Process the segment text, merging it into the TCP sequencing queue,
1515 * and arranging for acknowledgment of receipt if necessary.
1516 * This process logically involves adjusting tp->rcv_wnd as data
1517 * is presented to the user (this happens in tcp_usrreq.c,
1518 * case PRU_RCVD). If a FIN has already been received on this
1519 * connection then we just ignore the text.
1520 */
1521 if ((tlen || (thflags & TH_FIN)) &&
1522 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1523 tcp_seq save_start = th->th_seq;
1524 m_adj(m, drop_hdrlen); /* delayed header drop */
1525 /*
1526 * Insert segment which includes th into TCP reassembly queue
1527 * with control block tp. Set thflags to whether reassembly now
1528 * includes a segment with FIN. This handles the common case
1529 * inline (segment is the next to be received on an established
1530 * connection, and the queue is empty), avoiding linkage into
1531 * and removal from the queue and repetition of various
1532 * conversions.
1533 * Set DELACK for segments received in order, but ack
1534 * immediately when segments are out of order (so
1535 * fast retransmit can work).
1536 */
1537 if (th->th_seq == tp->rcv_nxt &&
1538 LIST_EMPTY(&tp->t_segq) &&
1539 TCPS_HAVEESTABLISHED(tp->t_state)) {
1540 if (DELAY_ACK(tp, tlen))
1541 tp->t_flags |= TF_DELACK;
1542 else
1543 tp->t_flags |= TF_ACKNOW;
1544 tp->rcv_nxt += tlen;
1545 thflags = th->th_flags & TH_FIN;
1546 TCPSTAT_INC(tcps_rcvpack);
1547 TCPSTAT_ADD(tcps_rcvbyte, tlen);
1548 SOCKBUF_LOCK(&so->so_rcv);
1549 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1550 m_freem(m);
1551 else
1552 sbappendstream_locked(&so->so_rcv, m, 0);
1553 /* NB: sorwakeup_locked() does an implicit unlock. */
1554 sorwakeup_locked(so);
1555 } else {
1556 /*
1557 * XXX: Due to the header drop above, "th" is
1558 * theoretically invalid by now.  Fortunately
1559 * m_adj() doesn't actually free any mbufs
1560 * when trimming from the head.
1561 */
1562 thflags = tcp_reass(tp, th, &tlen, m);
1563 tp->t_flags |= TF_ACKNOW;
1564 }
1565 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
1566 tcp_update_sack_list(tp, save_start, save_start + tlen);
1567#if 0
1568 /*
1569 * Note the amount of data that peer has sent into
1570 * our window, in order to estimate the sender's
1571 * buffer size.
1572 * XXX: Unused.
1573 */
1574 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
1575 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
1576 else
1577 len = so->so_rcv.sb_hiwat;
1578#endif
1579 } else {
1580 m_freem(m);
1581 thflags &= ~TH_FIN;
1582 }
1583
1584 /*
1585 * If FIN is received ACK the FIN and let the user know
1586 * that the connection is closing.
1587 */
1588 if (thflags & TH_FIN) {
1589 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1590 socantrcvmore(so);
1591 /*
1592 * If connection is half-synchronized
1593 * (ie NEEDSYN flag on) then delay ACK,
1594 * so it may be piggybacked when SYN is sent.
1595 * Otherwise, since we received a FIN, no more
1596 * input can be expected; send ACK now.
1597 */
1598 if (tp->t_flags & TF_NEEDSYN)
1599 tp->t_flags |= TF_DELACK;
1600 else
1601 tp->t_flags |= TF_ACKNOW;
1602 tp->rcv_nxt++;
1603 }
1604 switch (tp->t_state) {
1605
1606 /*
1607 * In SYN_RECEIVED and ESTABLISHED STATES
1608 * enter the CLOSE_WAIT state.
1609 */
1610 case TCPS_SYN_RECEIVED:
1611 tp->t_starttime = ticks;
1612 /* FALLTHROUGH */
1613 case TCPS_ESTABLISHED:
1614 tcp_state_change(tp, TCPS_CLOSE_WAIT);
1615 break;
1616
1617 /*
1618 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1619 * enter the CLOSING state.
1620 */
1621 case TCPS_FIN_WAIT_1:
1622 tcp_state_change(tp, TCPS_CLOSING);
1623 break;
1624
1625 /*
1626 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1627 * starting the time-wait timer, turning off the other
1628 * standard timers.
1629 */
1630 case TCPS_FIN_WAIT_2:
1631 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1632 KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
1633 "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
1634 ti_locked));
1635
1636 tcp_twstart(tp);
1637 INP_INFO_RUNLOCK(&V_tcbinfo);
1638 return;
1639 }
1640 }
1641 if (ti_locked == TI_RLOCKED) {
1642 INP_INFO_RUNLOCK(&V_tcbinfo);
1643 }
1644 ti_locked = TI_UNLOCKED;
1645
1646#ifdef TCPDEBUG
1647 if (so->so_options & SO_DEBUG)
1648 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
1649 &tcp_savetcp, 0);
1650#endif
1651 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
1652
1653 /*
1654 * Return any desired output.
1655 */
1656 if (needoutput || (tp->t_flags & TF_ACKNOW))
1657 (void) tp->t_fb->tfb_tcp_output(tp);
1658
1659 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
1660 __func__, ti_locked));
1661 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1662 INP_WLOCK_ASSERT(tp->t_inpcb);
1663
1664 if (tp->t_flags & TF_DELACK) {
1665 tp->t_flags &= ~TF_DELACK;
1666 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
1667 }
1668 INP_WUNLOCK(tp->t_inpcb);
1669 return;
1670
1671dropafterack:
1672 /*
1673 * Generate an ACK dropping incoming segment if it occupies
1674 * sequence space, where the ACK reflects our state.
1675 *
1676 * We can now skip the test for the RST flag since all
1677 * paths to this code happen after packets containing
1678 * RST have been dropped.
1679 *
1680 * In the SYN-RECEIVED state, don't send an ACK unless the
1681 * segment we received passes the SYN-RECEIVED ACK test.
1682 * If it fails send a RST. This breaks the loop in the
1683 * "LAND" DoS attack, and also prevents an ACK storm
1684 * between two listening ports that have been sent forged
1685 * SYN segments, each with the source address of the other.
1686 */
1687 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
1688 (SEQ_GT(tp->snd_una, th->th_ack) ||
1689 SEQ_GT(th->th_ack, tp->snd_max)) ) {
1690 rstreason = BANDLIM_RST_OPENPORT;
1691 goto dropwithreset;
1692 }
1693#ifdef TCPDEBUG
1694 if (so->so_options & SO_DEBUG)
1695 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1696 &tcp_savetcp, 0);
1697#endif
1698 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
1699 if (ti_locked == TI_RLOCKED) {
1700 INP_INFO_RUNLOCK(&V_tcbinfo);
1701 }
1702 ti_locked = TI_UNLOCKED;
1703
1704 tp->t_flags |= TF_ACKNOW;
1705 (void) tp->t_fb->tfb_tcp_output(tp);
1706 INP_WUNLOCK(tp->t_inpcb);
1707 m_freem(m);
1708 return;
1709
1710dropwithreset:
1711 if (ti_locked == TI_RLOCKED) {
1712 INP_INFO_RUNLOCK(&V_tcbinfo);
1713 }
1714 ti_locked = TI_UNLOCKED;
1715
1716 if (tp != NULL) {
1717 tcp_dropwithreset(m, th, tp, tlen, rstreason);
1718 INP_WUNLOCK(tp->t_inpcb);
1719 } else
1720 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1721 return;
1722
1723drop:
1724 if (ti_locked == TI_RLOCKED) {
1725 INP_INFO_RUNLOCK(&V_tcbinfo);
1726 ti_locked = TI_UNLOCKED;
1727 }
1728#ifdef INVARIANTS
1729 else
1730 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1731#endif
1732
1733 /*
1734 * Drop space held by incoming segment and return.
1735 */
1736#ifdef TCPDEBUG
1737 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1738 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
1739 &tcp_savetcp, 0);
1740#endif
1741 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
1742 if (tp != NULL)
1743 INP_WUNLOCK(tp->t_inpcb);
1744 m_freem(m);
1745}
1746
1747
1748/*
1749 * "Fast-slow" is a combination of the original tcp_do_segment
1750 * and a split fast path: one function for the fast-ack case,
1751 * which also allows acks that advance the window in sequence
1752 * to stay on the fast path, and a sub-function that handles
1753 * the in-sequence data case.
1754 */
1755void
1756tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so,
1757 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
1758 int ti_locked)
1759{
1760 int thflags;
1761 u_long tiwin;
1762 char *s;
1763 int can_enter;
1764 struct in_conninfo *inc;
1765 struct tcpopt to;
1766
1767 thflags = th->th_flags;
1768 tp->sackhint.last_sack_ack = 0;
1769 inc = &tp->t_inpcb->inp_inc;
1770 /*
1771 * If this is either a state-changing packet or current state isn't
1772 * established, we require a write lock on tcbinfo. Otherwise, we
1773 * allow the tcbinfo to be either locked or unlocked, as the
1774 * caller may have unnecessarily acquired a write lock due to a race.
1775 */
1776 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
1777 tp->t_state != TCPS_ESTABLISHED) {
1778 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
1779 "SYN/FIN/RST/!EST", __func__, ti_locked));
1780 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1781 } else {
1782#ifdef INVARIANTS
1783 if (ti_locked == TI_RLOCKED) {
1784 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1785 } else {
1786 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
1787 "ti_locked: %d", __func__, ti_locked));
1788 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1789 }
1790#endif
1791 }
1792 INP_WLOCK_ASSERT(tp->t_inpcb);
1793 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
1794 __func__));
1795 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
1796 __func__));
1797
1798 /*
1799 * Segment received on connection.
1800 * Reset idle time and keep-alive timer.
1801 * XXX: This should be done after segment
1802 * validation to ignore broken/spoofed segs.
1803 */
1804 tp->t_rcvtime = ticks;
1805 if (TCPS_HAVEESTABLISHED(tp->t_state))
1806 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
1807
1808 /*
1809 * Unscale the window into a 32-bit value.
1810 * For the SYN_SENT state the scale is zero.
1811 */
1812 tiwin = th->th_win << tp->snd_scale;
1813
1814 /*
1815 * TCP ECN processing.
1816 */
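/*
 * In short: an arriving CE mark sets TF_ECN_SND_ECE so that
 * subsequent ACKs echo ECE until the peer answers with CWR,
 * while an inbound ECE is handed to the congestion control
 * module as a CC_ECN congestion signal.
 */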
1817 if (tp->t_flags & TF_ECN_PERMIT) {
1818 if (thflags & TH_CWR)
1819 tp->t_flags &= ~TF_ECN_SND_ECE;
1820 switch (iptos & IPTOS_ECN_MASK) {
1821 case IPTOS_ECN_CE:
1822 tp->t_flags |= TF_ECN_SND_ECE;
1823 TCPSTAT_INC(tcps_ecn_ce);
1824 break;
1825 case IPTOS_ECN_ECT0:
1826 TCPSTAT_INC(tcps_ecn_ect0);
1827 break;
1828 case IPTOS_ECN_ECT1:
1829 TCPSTAT_INC(tcps_ecn_ect1);
1830 break;
1831 }
1832 /* Congestion experienced. */
1833 if (thflags & TH_ECE) {
1834 cc_cong_signal(tp, th, CC_ECN);
1835 }
1836 }
1837
1838 /*
1839 * Parse options on any incoming segment.
1840 */
1841 tcp_dooptions(&to, (u_char *)(th + 1),
1842 (th->th_off << 2) - sizeof(struct tcphdr),
1843 (thflags & TH_SYN) ? TO_SYN : 0);
1844
1845 /*
1846 * If echoed timestamp is later than the current time,
1847 * fall back to non RFC1323 RTT calculation. Normalize
1848 * timestamp if syncookies were used when this connection
1849 * was established.
1850 */
1851 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
1852 to.to_tsecr -= tp->ts_offset;
1853 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
1854 to.to_tsecr = 0;
1855 }
1856 /*
1857 * If timestamps were negotiated during SYN/ACK they should
1858 * appear on every segment during this session and vice versa.
1859 */
1860 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
1861 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1862 log(LOG_DEBUG, "%s; %s: Timestamp missing, "
1863 "no action\n", s, __func__);
1864 free(s, M_TCPLOG);
1865 }
1866 }
1867 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
1868 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
1869 log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
1870 "no action\n", s, __func__);
1871 free(s, M_TCPLOG);
1872 }
1873 }
1874
1875 /*
1876 * Process options only when we get SYN/ACK back. The SYN case
1877 * for incoming connections is handled in tcp_syncache.
1878 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1879 * or <SYN,ACK>) segment itself is never scaled.
1880 * XXX this is traditional behavior, may need to be cleaned up.
1881 */
1882 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1883 if ((to.to_flags & TOF_SCALE) &&
1884 (tp->t_flags & TF_REQ_SCALE)) {
1885 tp->t_flags |= TF_RCVD_SCALE;
1886 tp->snd_scale = to.to_wscale;
1887 }
1888 /*
1889 * Initial send window. It will be updated with
1890 * the next incoming segment to the scaled value.
1891 */
1892 tp->snd_wnd = th->th_win;
1893 if (to.to_flags & TOF_TS) {
1894 tp->t_flags |= TF_RCVD_TSTMP;
1895 tp->ts_recent = to.to_tsval;
1896 tp->ts_recent_age = tcp_ts_getticks();
1897 }
1898 if (to.to_flags & TOF_MSS)
1899 tcp_mss(tp, to.to_mss);
1900 if ((tp->t_flags & TF_SACK_PERMIT) &&
1901 (to.to_flags & TOF_SACKPERM) == 0)
1902 tp->t_flags &= ~TF_SACK_PERMIT;
1903 }
1904 can_enter = 0;
1905 if (__predict_true(tlen == 0)) {
1906 /*
1907 * The ack moved forward and we have a window (non-zero)
1908 * <or>
1909 * The ack did not move forward, but the window increased.
1910 */
1911 if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) ||
1912 ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) {
1913 can_enter = 1;
1914 }
1915 } else {
1916 /*
1917 * Data incoming, use the old entry criteria
1918 * for fast-path with data.
1919 */
1920 if (tiwin && tiwin == tp->snd_wnd) {
1921 can_enter = 1;
1922 }
1923 }
1924 /*
1925 * Header prediction: check for the two common cases
1926 * of a uni-directional data xfer. If the packet has
1927 * no control flags, is in-sequence, the window didn't
1928 * change and we're not retransmitting, it's a
1929 * candidate. If the length is zero and the ack moved
1930 * forward, we're the sender side of the xfer. Just
1931 * free the data acked & wake any higher level process
1932 * that was blocked waiting for space. If the length
1933 * is non-zero and the ack didn't move, we're the
1934 * receiver side. If we're getting packets in-order
1935 * (the reassembly queue is empty), add the data to
1936 * the socket buffer and note that we need a delayed ack.
1937 * Make sure that the hidden state-flags are also off.
1938 * Since we check for TCPS_ESTABLISHED first, it can only
1939 * be TF_NEEDSYN.
1940 */
1941 if (__predict_true(tp->t_state == TCPS_ESTABLISHED &&
1942 th->th_seq == tp->rcv_nxt &&
1943 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1944 tp->snd_nxt == tp->snd_max &&
1945 can_enter &&
1946 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1947 LIST_EMPTY(&tp->t_segq) &&
1948 ((to.to_flags & TOF_TS) == 0 ||
1949 TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) {
1950 if (__predict_true((tlen == 0) &&
1951 (SEQ_LEQ(th->th_ack, tp->snd_max) &&
1952 !IN_RECOVERY(tp->t_flags) &&
1953 (to.to_flags & TOF_SACK) == 0 &&
1954 TAILQ_EMPTY(&tp->snd_holes)))) {
1955 /* We are done */
1956 tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
1957 ti_locked, tiwin);
1958 return;
1959 } else if ((tlen) &&
1960 (th->th_ack == tp->snd_una &&
1961 tlen <= sbspace(&so->so_rcv))) {
1962 tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen,
1963 ti_locked, tiwin);
1964 /* We are done */
1965 return;
1966 }
1967 }
1968 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
1969 ti_locked, tiwin, thflags);
1970}
1971
1972
1973/*
1974 * This subfunction is used to try to highly optimize the
1975 * fast path.  We again allow window updates that are in
1976 * sequence to remain in the fast path, and we add
1977 * __predict_false() hints to help the compiler.  Note that
1978 * if we return 0, we could *not* process the segment and
1979 * the caller should push the packet into the
1980 * slow path.
1981 */
1982static int
1983tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
1984 struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
1985 int ti_locked, u_long tiwin)
1986{
1987 int acked;
1988 int winup_only = 0;
1989#ifdef TCPDEBUG
1990 /*
1991 * The size of tcp_saveipgen must be the size of the max ip header,
1992 * now IPv6.
1993 */
1994 u_char tcp_saveipgen[IP6_HDR_LEN];
1995 struct tcphdr tcp_savetcp;
1996 short ostate = 0;
1997#endif
1998
1999
2000 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
2001 /* Old ack, behind (or a duplicate of) the last one received */
2002 return (0);
2003 }
2004 if (__predict_false(th->th_ack == tp->snd_una) &&
2005 __predict_false(tiwin <= tp->snd_wnd)) {
2006 /* duplicate ack, or an ack whose window did not grow */
2007 return (0);
2008 }
2009 if (__predict_false(tiwin == 0)) {
2010 /* zero window */
2011 return (0);
2012 }
2013 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
2014 /* Above what we have sent? */
2015 return (0);
2016 }
2017 if (__predict_false(tp->snd_nxt != tp->snd_max)) {
2018 /* We are retransmitting */
2019 return (0);
2020 }
2021 if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) {
2022 /* We need a SYN or a FIN, unlikely.. */
2023 return (0);
2024 }
2025 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
2026 /* Timestamp is behind; an old ack after sequence wrap? */
2027 return (0);
2028 }
2029 if (__predict_false(IN_RECOVERY(tp->t_flags))) {
2030 /* Still recovering */
2031 return (0);
2032 }
2033 if (__predict_false(to->to_flags & TOF_SACK)) {
2034 /* Sack included in the ack.. */
2035 return (0);
2036 }
2037 if (!TAILQ_EMPTY(&tp->snd_holes)) {
2038 /* We have sack holes on our scoreboard */
2039 return (0);
2040 }
2041 /* OK, if we reach here, we can process a fast-ack. */
2042
2043 /* Did the window get updated? */
2044 if (tiwin != tp->snd_wnd) {
2045 /* keep track of pure window updates */
2046 if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
2047 winup_only = 1;
2048 TCPSTAT_INC(tcps_rcvwinupd);
2049 }
2050 tp->snd_wnd = tiwin;
2051 tp->snd_wl1 = th->th_seq;
2052 if (tp->snd_wnd > tp->max_sndwnd)
2053 tp->max_sndwnd = tp->snd_wnd;
2054 }
2055 /*
2056 * Pull snd_wl2 up to prevent seq wrap relative
2057 * to th_ack.
2058 */
2059 tp->snd_wl2 = th->th_ack;
2060 /*
2061 * If last ACK falls within this segment's sequence numbers,
2062 * record the timestamp.
2063 * NOTE that the test is modified according to the latest
2064 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2065 */
2066 if ((to->to_flags & TOF_TS) != 0 &&
2067 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
2068 tp->ts_recent_age = tcp_ts_getticks();
2069 tp->ts_recent = to->to_tsval;
2070 }
2071 /*
2072 * This is a pure ack for outstanding data.
2073 */
2074 if (ti_locked == TI_RLOCKED) {
2075 INP_INFO_RUNLOCK(&V_tcbinfo);
2076 }
2077 ti_locked = TI_UNLOCKED;
2078
2079 TCPSTAT_INC(tcps_predack);
2080
2081 /*
2082 * "bad retransmit" recovery.
2083 */
2084 if (tp->t_rxtshift == 1 &&
2085 tp->t_flags & TF_PREVVALID &&
2086 (int)(ticks - tp->t_badrxtwin) < 0) {
2087 cc_cong_signal(tp, th, CC_RTO_ERR);
2088 }
2089
2090 /*
2091 * Recalculate the transmit timer / rtt.
2092 *
2093 * Some boxes send broken timestamp replies
2094 * during the SYN+ACK phase; ignore timestamps
2095 * of 0, or we could calculate a huge RTT and
2096 * blow up the retransmit timer.
2097 */
2098 if ((to->to_flags & TOF_TS) != 0 &&
2099 to->to_tsecr) {
2100 u_int t;
2101
2102 t = tcp_ts_getticks() - to->to_tsecr;
2103 if (!tp->t_rttlow || tp->t_rttlow > t)
2104 tp->t_rttlow = t;
2105 tcp_xmit_timer(tp,
2106 TCP_TS_TO_TICKS(t) + 1);
2107 } else if (tp->t_rtttime &&
2108 SEQ_GT(th->th_ack, tp->t_rtseq)) {
2109 if (!tp->t_rttlow ||
2110 tp->t_rttlow > ticks - tp->t_rtttime)
2111 tp->t_rttlow = ticks - tp->t_rtttime;
2112 tcp_xmit_timer(tp,
2113 ticks - tp->t_rtttime);
2114 }
2115 if (winup_only == 0) {
2116 acked = BYTES_THIS_ACK(tp, th);
2117
2118 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
2119 hhook_run_tcp_est_in(tp, th, to);
2120
2121 TCPSTAT_ADD(tcps_rcvackbyte, acked);
2122 sbdrop(&so->so_snd, acked);
2123 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
2124 SEQ_LEQ(th->th_ack, tp->snd_recover))
2125 tp->snd_recover = th->th_ack - 1;
2126
2127 /*
2128 * Let the congestion control algorithm update
2129 * congestion control related information. This
2130 * typically means increasing the congestion
2131 * window.
2132 */
2133 cc_ack_received(tp, th, CC_ACK);
2134
2135 tp->snd_una = th->th_ack;
2136 tp->t_dupacks = 0;
2137 m_freem(m);
2138
2139 /*
2140 * If all outstanding data are acked, stop
2141 * retransmit timer, otherwise restart timer
2142 * using current (possibly backed-off) value.
2143 * If process is waiting for space,
2144 * wakeup/selwakeup/signal. If data
2145 * are ready to send, let tcp_output
2146 * decide between more output or persist.
2147 */
2148#ifdef TCPDEBUG
2149 if (so->so_options & SO_DEBUG)
2150 tcp_trace(TA_INPUT, ostate, tp,
2151 (void *)tcp_saveipgen,
2152 &tcp_savetcp, 0);
2153#endif
2154 if (tp->snd_una == tp->snd_max)
2155 tcp_timer_activate(tp, TT_REXMT, 0);
2156 else if (!tcp_timer_active(tp, TT_PERSIST))
2157 tcp_timer_activate(tp, TT_REXMT,
2158 tp->t_rxtcur);
2159 /* Wake up the socket if we have room to write more */
2160 sowwakeup(so);
2161 } else {
2162 /*
2163 * Window update only, just free the mbufs and
2164 * send out whatever we can.
2165 */
2166 m_freem(m);
2167 }
2168 if (sbavail(&so->so_snd))
2169 (void) tcp_output(tp);
2170 KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
2171 __func__, ti_locked));
2172 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2173 INP_WLOCK_ASSERT(tp->t_inpcb);
2174
2175 if (tp->t_flags & TF_DELACK) {
2176 tp->t_flags &= ~TF_DELACK;
2177 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
2178 }
2179 INP_WUNLOCK(tp->t_inpcb);
2180 return (1);
2181}
2182
2183/*
2184 * This tcp_do_segment variant concentrates on making the
2185 * ack processing path as fast as possible. It does not have
2186 * a fast path for data (it possibly could, which would then
2187 * eliminate the need for fast-slow above). For a content
2188 * distributor with large outgoing elephant flows and very
2189 * little incoming traffic, a fast path for data does not
2190 * really help (since little data comes in). The most important
2191 * thing is processing ACKs quickly and getting the rest of the
2192 * data output to the peer as quickly as possible. This routine
2193 * appears to be about 3% faster overall than the old
2194 * tcp_do_segment and keeps us on the fast path for many more
2195 * packets (by allowing window updates to also stay in the fast path).
2196 */
2197void
2198tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
2199 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
2200 int ti_locked)
2201{
2202 int thflags;
2203 u_long tiwin;
2204 char *s;
2205 struct in_conninfo *inc;
2206 struct tcpopt to;
2207
2208 thflags = th->th_flags;
2209 tp->sackhint.last_sack_ack = 0;
2210 inc = &tp->t_inpcb->inp_inc;
2211 /*
2212 * If this is either a state-changing packet or current state isn't
2213 * established, we require a write lock on tcbinfo. Otherwise, we
2214 * allow the tcbinfo to be either locked or unlocked, as the
2215 * caller may have unnecessarily acquired a write lock due to a race.
2216 */
2217 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
2218 tp->t_state != TCPS_ESTABLISHED) {
2219 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
2220 "SYN/FIN/RST/!EST", __func__, ti_locked));
2221 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
2222 } else {
2223#ifdef INVARIANTS
2224 if (ti_locked == TI_RLOCKED) {
2225 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
2226 } else {
2227 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
2228 "ti_locked: %d", __func__, ti_locked));
2229 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
2230 }
2231#endif
2232 }
2233 INP_WLOCK_ASSERT(tp->t_inpcb);
2234 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
2235 __func__));
2236 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
2237 __func__));
2238
2239 /*
2240 * Segment received on connection.
2241 * Reset idle time and keep-alive timer.
2242 * XXX: This should be done after segment
2243 * validation to ignore broken/spoofed segs.
2244 */
2245 tp->t_rcvtime = ticks;
2246 if (TCPS_HAVEESTABLISHED(tp->t_state))
2247 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
2248
2249 /*
2250 * Unscale the window into a 32-bit value.
2251 * For the SYN_SENT state the scale is zero.
2252 */
2253 tiwin = th->th_win << tp->snd_scale;
2254
2255 /*
2256 * TCP ECN processing.
2257 */
2258 if (tp->t_flags & TF_ECN_PERMIT) {
2259 if (thflags & TH_CWR)
2260 tp->t_flags &= ~TF_ECN_SND_ECE;
2261 switch (iptos & IPTOS_ECN_MASK) {
2262 case IPTOS_ECN_CE:
2263 tp->t_flags |= TF_ECN_SND_ECE;
2264 TCPSTAT_INC(tcps_ecn_ce);
2265 break;
2266 case IPTOS_ECN_ECT0:
2267 TCPSTAT_INC(tcps_ecn_ect0);
2268 break;
2269 case IPTOS_ECN_ECT1:
2270 TCPSTAT_INC(tcps_ecn_ect1);
2271 break;
2272 }
2273 /* Congestion experienced. */
2274 if (thflags & TH_ECE) {
2275 cc_cong_signal(tp, th, CC_ECN);
2276 }
2277 }
2278
2279 /*
2280 * Parse options on any incoming segment.
2281 */
2282 tcp_dooptions(&to, (u_char *)(th + 1),
2283 (th->th_off << 2) - sizeof(struct tcphdr),
2284 (thflags & TH_SYN) ? TO_SYN : 0);
2285
2286 /*
2287 * If echoed timestamp is later than the current time,
2288 * fall back to non RFC1323 RTT calculation. Normalize
2289 * timestamp if syncookies were used when this connection
2290 * was established.
2291 */
2292 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
2293 to.to_tsecr -= tp->ts_offset;
2294 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
2295 to.to_tsecr = 0;
2296 }
2297 /*
2298 * If timestamps were negotiated during SYN/ACK they should
2299 * appear on every segment during this session and vice versa.
2300 */
2301 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
2302 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2303 log(LOG_DEBUG, "%s; %s: Timestamp missing, "
2304 "no action\n", s, __func__);
2305 free(s, M_TCPLOG);
2306 }
2307 }
2308 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
2309 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2310 log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
2311 "no action\n", s, __func__);
2312 free(s, M_TCPLOG);
2313 }
2314 }
2315
2316 /*
2317 * Process options only when we get SYN/ACK back. The SYN case
2318 * for incoming connections is handled in tcp_syncache.
2319 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
2320 * or <SYN,ACK>) segment itself is never scaled.
2321 * XXX this is traditional behavior, may need to be cleaned up.
2322 */
2323 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2324 if ((to.to_flags & TOF_SCALE) &&
2325 (tp->t_flags & TF_REQ_SCALE)) {
2326 tp->t_flags |= TF_RCVD_SCALE;
2327 tp->snd_scale = to.to_wscale;
2328 }
2329 /*
2330 * Initial send window. It will be updated with
2331 * the next incoming segment to the scaled value.
2332 */
2333 tp->snd_wnd = th->th_win;
2334 if (to.to_flags & TOF_TS) {
2335 tp->t_flags |= TF_RCVD_TSTMP;
2336 tp->ts_recent = to.to_tsval;
2337 tp->ts_recent_age = tcp_ts_getticks();
2338 }
2339 if (to.to_flags & TOF_MSS)
2340 tcp_mss(tp, to.to_mss);
2341 if ((tp->t_flags & TF_SACK_PERMIT) &&
2342 (to.to_flags & TOF_SACKPERM) == 0)
2343 tp->t_flags &= ~TF_SACK_PERMIT;
2344 }
2345 /*
2346 * Header prediction: check for the two common cases
2347 * of a uni-directional data xfer. If the packet has
2348 * no control flags, is in-sequence, the window didn't
2349 * change and we're not retransmitting, it's a
2350 * candidate. If the length is zero and the ack moved
2351 * forward, we're the sender side of the xfer. Just
2352 * free the data acked & wake any higher level process
2353 * that was blocked waiting for space. If the length
2354 * is non-zero and the ack didn't move, we're the
2355 * receiver side. If we're getting packets in-order
2356 * (the reassembly queue is empty), add the data to
2357 * the socket buffer and note that we need a delayed ack.
2358 * Make sure that the hidden state-flags are also off.
2359 * Since we check for TCPS_ESTABLISHED first, it can only
2360 * be TF_NEEDSYN.
2361 */
2362 if (__predict_true(tp->t_state == TCPS_ESTABLISHED) &&
2363 __predict_true(((to.to_flags & TOF_SACK) == 0)) &&
2364 __predict_true(tlen == 0) &&
2365 __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) &&
2366 __predict_true(LIST_EMPTY(&tp->t_segq)) &&
2367 __predict_true(th->th_seq == tp->rcv_nxt)) {
2368 if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
2369 ti_locked, tiwin)) {
2370 return;
2371 }
2372 }
2373 tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
2374 ti_locked, tiwin, thflags);
2375}
2376
2377struct tcp_function_block __tcp_fastslow = {
2378 "fastslow",
2379 tcp_output,
2380 tcp_do_segment_fastslow,
2381 tcp_default_ctloutput,
2382 NULL,
2383 NULL,
2384 NULL,
2385 NULL,
2386 NULL,
2387 NULL,
2388 NULL,
2389 0,
2390 0
2391
2392};
2393
2394struct tcp_function_block __tcp_fastack = {
2395 "fastack",
2396 tcp_output,
2397 tcp_do_segment_fastack,
2398 tcp_default_ctloutput,
2399 NULL,
2400 NULL,
2401 NULL,
2402 NULL,
2403 NULL,
2404 NULL,
2405 NULL,
2406 0,
2407 0
2408};
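/*
 * A minimal sketch of what a third function block could look
 * like, assuming it reuses the stock entry points; the name
 * "example" and the block itself are hypothetical and not part
 * of this file:
 *
 *	static struct tcp_function_block __tcp_example = {
 *		"example",
 *		tcp_output,
 *		tcp_do_segment,
 *		tcp_default_ctloutput,
 *		NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 *		0, 0
 *	};
 *
 * It would then be registered with
 * register_tcp_functions(&__tcp_example, M_WAITOK) from a
 * module event handler, mirroring tcp_addfastpaths() below.
 */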
2409
2410static int
2411tcp_addfastpaths(module_t mod, int type, void *data)
2412{
2413 int err = 0;
2414
2415 switch (type) {
2416 case MOD_LOAD:
2417 err = register_tcp_functions(&__tcp_fastack, M_WAITOK);
2418 if (err) {
2419 printf("Failed to register fastack module -- err:%d\n", err);
2420 return (err);
2421 }
2422 err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
2423 if (err) {
2424 printf("Failed to register fastslow module -- err:%d\n", err);
2425 deregister_tcp_functions(&__tcp_fastack);
2426 return (err);
2427 }
2428 break;
2429 case MOD_QUIESCE:
2430 if (__tcp_fastslow.tfb_refcnt || __tcp_fastack.tfb_refcnt) {
2431 return (EBUSY);
2432 }
2433 break;
2434 case MOD_UNLOAD:
2435 err = deregister_tcp_functions(&__tcp_fastack);
2436 if (err == EBUSY)
2437 break;
2438 err = deregister_tcp_functions(&__tcp_fastslow);
2439 if (err == EBUSY)
2440 break;
2441 err = 0;
2442 break;
2443 default:
2444 return (EOPNOTSUPP);
2445 }
2446 return (err);
2447}
2448
2449static moduledata_t new_tcp_fastpaths = {
2450 .name = "tcp_fastpaths",
2451 .evhand = tcp_addfastpaths,
2452 .priv = 0
2453};
2454
2455MODULE_VERSION(kern_tcpfastpaths, 1);
2456DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PSEUDO, SI_ORDER_ANY);
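/*
 * Usage sketch, assuming the function-block sysctls provided by
 * the TCP stack selection framework: once this module is loaded,
 * the new stacks appear in net.inet.tcp.functions_available and
 * either one can be made the default for new connections, e.g.:
 *
 *	sysctl net.inet.tcp.functions_default=fastack
 */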