Deleted Added
full compact
tcp_output.c (61179) tcp_output.c (62587)
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
34 * $FreeBSD: head/sys/netinet/tcp_output.c 61179 2000-06-02 17:38:45Z jlemon $
34 * $FreeBSD: head/sys/netinet/tcp_output.c 62587 2000-07-04 16:35:15Z itojun $
35 */
36
37#include "opt_inet6.h"
38#include "opt_ipsec.h"
39#include "opt_tcpdebug.h"
40
41#include <stddef.h>
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/kernel.h>
46#include <sys/sysctl.h>
47#include <sys/mbuf.h>
35 */
36
37#include "opt_inet6.h"
38#include "opt_ipsec.h"
39#include "opt_tcpdebug.h"
40
41#include <stddef.h>
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/kernel.h>
46#include <sys/sysctl.h>
47#include <sys/mbuf.h>
48#include <sys/domain.h>
48#include <sys/protosw.h>
49#include <sys/socket.h>
50#include <sys/socketvar.h>
51
52#include <net/route.h>
53
54#include <netinet/in.h>
55#include <netinet/in_systm.h>
56#include <netinet/ip.h>
49#include <sys/protosw.h>
50#include <sys/socket.h>
51#include <sys/socketvar.h>
52
53#include <net/route.h>
54
55#include <netinet/in.h>
56#include <netinet/in_systm.h>
57#include <netinet/ip.h>
57#ifdef INET6
58#include <netinet/ip6.h>
59#endif
60#include <netinet/in_pcb.h>
58#include <netinet/in_pcb.h>
61#ifdef INET6
62#include <netinet6/in6_pcb.h>
63#endif
64#include <netinet/ip_var.h>
65#ifdef INET6
59#include <netinet/ip_var.h>
60#ifdef INET6
61#include <netinet6/in6_pcb.h>
62#include <netinet/ip6.h>
66#include <netinet6/ip6_var.h>
67#endif
68#include <netinet/tcp.h>
69#define TCPOUTFLAGS
70#include <netinet/tcp_fsm.h>
71#include <netinet/tcp_seq.h>
72#include <netinet/tcp_timer.h>
73#include <netinet/tcp_var.h>
74#include <netinet/tcpip.h>
75#ifdef TCPDEBUG
76#include <netinet/tcp_debug.h>
77#endif
78
79#ifdef IPSEC
80#include <netinet6/ipsec.h>
81#endif /*IPSEC*/
82
83#include <machine/in_cksum.h>
84
85#ifdef notyet
86extern struct mbuf *m_copypack();
87#endif
88
/*
 * Run-time tunables.  Each variable below is registered with SYSCTL_INT
 * under the _net_inet_tcp node using CTLFLAG_RW and OID_AUTO.
 */

/* Path MTU discovery switch; initialised to 1. */
89static int path_mtu_discovery = 1;
90SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
91 &path_mtu_discovery, 1, "Enable Path MTU Discovery");
92
/*
 * Slow-start flight size.  tcp_output() sets snd_cwnd to
 * t_maxseg * ss_fltsz when the connection has been idle for at least
 * t_rxtcur ticks and the foreign address is not reported as local by
 * in_localaddr()/in6_localaddr().
 */
93int ss_fltsz = 1;
94SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
95 &ss_fltsz, 1, "Slow start flight size");
96
/*
 * Slow-start flight size used in the same idle-restart path when the
 * foreign address is local; snd_cwnd becomes t_maxseg * ss_fltsz_local.
 */
97int ss_fltsz_local = TCP_MAXWIN; /* something large */
98SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
99 &ss_fltsz_local, 1, "Slow start flight size for local networks");
100
/*
 * NewReno switch; initialised to 0.  Not referenced in the visible part
 * of this file -- NOTE(review): presumably consumed by other TCP code.
 */
101int tcp_do_newreno = 0;
102SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
103 0, "Enable NewReno Algorithms");
104/*
105 * Tcp output routine: figure out what should be sent and send it.
106 */
107int
108tcp_output(tp)
109 register struct tcpcb *tp;
110{
111 register struct socket *so = tp->t_inpcb->inp_socket;
112 register long len, win;
113 int off, flags, error;
114 register struct mbuf *m;
115 struct ip *ip = NULL;
116 register struct ipovly *ipov = NULL;
117#ifdef INET6
118 struct ip6_hdr *ip6 = NULL;
119#endif /* INET6 */
120 register struct tcphdr *th;
121 u_char opt[TCP_MAXOLEN];
122 unsigned ipoptlen, optlen, hdrlen;
123 int idle, sendalot;
124 int maxburst = TCP_MAXBURST;
125 struct rmxp_tao *taop;
126 struct rmxp_tao tao_noncached;
127#ifdef INET6
128 int isipv6;
129#endif
130
131#ifdef INET6
132 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
133#endif
134
135 /*
136 * Determine length of data that should be transmitted,
137 * and flags that will be used.
138 * If there is some data or critical controls (SYN, RST)
139 * to send, then transmit; otherwise, investigate further.
140 */
141 idle = (tp->snd_max == tp->snd_una);
142 if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
143 /*
144 * We have been idle for "a while" and no acks are
145 * expected to clock out any data we send --
146 * slow start to get ack "clock" running again.
147 *
148 * Set the slow-start flight size depending on whether
149 * this is a local network or not.
150 */
151 if (
152#ifdef INET6
153 (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) ||
154 (!isipv6 &&
155#endif
156 in_localaddr(tp->t_inpcb->inp_faddr)
157#ifdef INET6
158 )
159#endif
160 )
161 tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
162 else
163 tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
164 }
165again:
166 sendalot = 0;
167 off = tp->snd_nxt - tp->snd_una;
168 win = min(tp->snd_wnd, tp->snd_cwnd);
169
170 flags = tcp_outflags[tp->t_state];
171 /*
172 * Get standard flags, and add SYN or FIN if requested by 'hidden'
173 * state flags.
174 */
175 if (tp->t_flags & TF_NEEDFIN)
176 flags |= TH_FIN;
177 if (tp->t_flags & TF_NEEDSYN)
178 flags |= TH_SYN;
179
180 /*
181 * If in persist timeout with window of 0, send 1 byte.
182 * Otherwise, if window is small but nonzero
183 * and timer expired, we will send what we can
184 * and go to transmit state.
185 */
186 if (tp->t_force) {
187 if (win == 0) {
188 /*
189 * If we still have some data to send, then
190 * clear the FIN bit. Usually this would
191 * happen below when it realizes that we
192 * aren't sending all the data. However,
193 * if we have exactly 1 byte of unsent data,
194 * then it won't clear the FIN bit below,
195 * and if we are in persist state, we wind
196 * up sending the packet without recording
197 * that we sent the FIN bit.
198 *
199 * We can't just blindly clear the FIN bit,
200 * because if we don't have any more data
201 * to send then the probe will be the FIN
202 * itself.
203 */
204 if (off < so->so_snd.sb_cc)
205 flags &= ~TH_FIN;
206 win = 1;
207 } else {
208 callout_stop(tp->tt_persist);
209 tp->t_rxtshift = 0;
210 }
211 }
212
213 len = (long)ulmin(so->so_snd.sb_cc, win) - off;
214
215 if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
216 taop = &tao_noncached;
217 bzero(taop, sizeof(*taop));
218 }
219
220 /*
221 * Lop off SYN bit if it has already been sent. However, if this
222 * is SYN-SENT state and if segment contains data and if we don't
223 * know that foreign host supports TAO, suppress sending segment.
224 */
225 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
226 flags &= ~TH_SYN;
227 off--, len++;
228 if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
229 taop->tao_ccsent == 0)
230 return 0;
231 }
232
233 /*
234 * Be careful not to send data and/or FIN on SYN segments
235 * in cases when no CC option will be sent.
236 * This measure is needed to prevent interoperability problems
237 * with not fully conformant TCP implementations.
238 */
239 if ((flags & TH_SYN) &&
240 ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
241 ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
242 len = 0;
243 flags &= ~TH_FIN;
244 }
245
246 if (len < 0) {
247 /*
248 * If FIN has been sent but not acked,
249 * but we haven't been called to retransmit,
250 * len will be -1. Otherwise, window shrank
251 * after we sent into it. If window shrank to 0,
252 * cancel pending retransmit, pull snd_nxt back
253 * to (closed) window, and set the persist timer
254 * if it isn't already going. If the window didn't
255 * close completely, just wait for an ACK.
256 */
257 len = 0;
258 if (win == 0) {
259 callout_stop(tp->tt_rexmt);
260 tp->t_rxtshift = 0;
261 tp->snd_nxt = tp->snd_una;
262 if (!callout_active(tp->tt_persist))
263 tcp_setpersist(tp);
264 }
265 }
266 if (len > tp->t_maxseg) {
267 len = tp->t_maxseg;
268 sendalot = 1;
269 }
270 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
271 flags &= ~TH_FIN;
272
273 win = sbspace(&so->so_rcv);
274
275 /*
276 * Sender silly window avoidance. If connection is idle
277 * and can send all data, a maximum segment,
278 * at least a maximum default-size segment do it,
279 * or are forced, do it; otherwise don't bother.
280 * If peer's buffer is tiny, then send
281 * when window is at least half open.
282 * If retransmitting (possibly after persist timer forced us
283 * to send into a small window), then must resend.
284 */
285 if (len) {
286 if (len == tp->t_maxseg)
287 goto send;
288 if (!(tp->t_flags & TF_MORETOCOME) &&
289 (idle || tp->t_flags & TF_NODELAY) &&
290 (tp->t_flags & TF_NOPUSH) == 0 &&
291 len + off >= so->so_snd.sb_cc)
292 goto send;
293 if (tp->t_force)
294 goto send;
295 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
296 goto send;
297 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
298 goto send;
299 }
300
301 /*
302 * Compare available window to amount of window
303 * known to peer (as advertised window less
304 * next expected input). If the difference is at least two
305 * max size segments, or at least 50% of the maximum possible
306 * window, then want to send a window update to peer.
307 */
308 if (win > 0) {
309 /*
310 * "adv" is the amount we can increase the window,
311 * taking into account that we are limited by
312 * TCP_MAXWIN << tp->rcv_scale.
313 */
314 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
315 (tp->rcv_adv - tp->rcv_nxt);
316
317 if (adv >= (long) (2 * tp->t_maxseg))
318 goto send;
319 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
320 goto send;
321 }
322
323 /*
324 * Send if we owe peer an ACK.
325 */
326 if (tp->t_flags & TF_ACKNOW)
327 goto send;
328 if ((flags & TH_RST) ||
329 ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
330 goto send;
331 if (SEQ_GT(tp->snd_up, tp->snd_una))
332 goto send;
333 /*
334 * If our state indicates that FIN should be sent
335 * and we have not yet done so, or we're retransmitting the FIN,
336 * then we need to send.
337 */
338 if (flags & TH_FIN &&
339 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
340 goto send;
341
342 /*
343 * TCP window updates are not reliable, rather a polling protocol
344 * using ``persist'' packets is used to insure receipt of window
345 * updates. The three ``states'' for the output side are:
346 * idle not doing retransmits or persists
347 * persisting to move a small or zero window
348 * (re)transmitting and thereby not persisting
349 *
350 * callout_active(tp->tt_persist)
351 * is true when we are in persist state.
352 * tp->t_force
353 * is set when we are called to send a persist packet.
354 * callout_active(tp->tt_rexmt)
355 * is set when we are retransmitting
356 * The output side is idle when both timers are zero.
357 *
358 * If send window is too small, there is data to transmit, and no
359 * retransmit or persist is pending, then go to persist state.
360 * If nothing happens soon, send when timer expires:
361 * if window is nonzero, transmit what we can,
362 * otherwise force out a byte.
363 */
364 if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) &&
365 !callout_active(tp->tt_persist)) {
366 tp->t_rxtshift = 0;
367 tcp_setpersist(tp);
368 }
369
370 /*
371 * No reason to send a segment, just return.
372 */
373 return (0);
374
375send:
376 /*
377 * Before ESTABLISHED, force sending of initial options
378 * unless TCP set not to do any options.
379 * NOTE: we assume that the IP/TCP header plus TCP options
380 * always fit in a single mbuf, leaving room for a maximum
381 * link header, i.e.
382 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
383 */
384 optlen = 0;
385#ifdef INET6
386 if (isipv6)
387 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
388 else
389#endif
390 hdrlen = sizeof (struct tcpiphdr);
391 if (flags & TH_SYN) {
392 tp->snd_nxt = tp->iss;
393 if ((tp->t_flags & TF_NOOPT) == 0) {
394 u_short mss;
395
396 opt[0] = TCPOPT_MAXSEG;
397 opt[1] = TCPOLEN_MAXSEG;
398 mss = htons((u_short) tcp_mssopt(tp));
399 (void)memcpy(opt + 2, &mss, sizeof(mss));
400 optlen = TCPOLEN_MAXSEG;
401
402 if ((tp->t_flags & TF_REQ_SCALE) &&
403 ((flags & TH_ACK) == 0 ||
404 (tp->t_flags & TF_RCVD_SCALE))) {
405 *((u_int32_t *)(opt + optlen)) = htonl(
406 TCPOPT_NOP << 24 |
407 TCPOPT_WINDOW << 16 |
408 TCPOLEN_WINDOW << 8 |
409 tp->request_r_scale);
410 optlen += 4;
411 }
412 }
413 }
414
415 /*
416 * Send a timestamp and echo-reply if this is a SYN and our side
417 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
418 * and our peer have sent timestamps in our SYN's.
419 */
420 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
421 (flags & TH_RST) == 0 &&
422 ((flags & TH_ACK) == 0 ||
423 (tp->t_flags & TF_RCVD_TSTMP))) {
424 u_int32_t *lp = (u_int32_t *)(opt + optlen);
425
426 /* Form timestamp option as shown in appendix A of RFC 1323. */
427 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
428 *lp++ = htonl(ticks);
429 *lp = htonl(tp->ts_recent);
430 optlen += TCPOLEN_TSTAMP_APPA;
431 }
432
433 /*
434 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
435 * options are allowed (!TF_NOOPT) and it's not a RST.
436 */
437 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
438 (flags & TH_RST) == 0) {
439 switch (flags & (TH_SYN|TH_ACK)) {
440 /*
441 * This is a normal ACK, send CC if we received CC before
442 * from our peer.
443 */
444 case TH_ACK:
445 if (!(tp->t_flags & TF_RCVD_CC))
446 break;
447 /*FALLTHROUGH*/
448
449 /*
450 * We can only get here in T/TCP's SYN_SENT* state, when
451 * we're a sending a non-SYN segment without waiting for
452 * the ACK of our SYN. A check above assures that we only
453 * do this if our peer understands T/TCP.
454 */
455 case 0:
456 opt[optlen++] = TCPOPT_NOP;
457 opt[optlen++] = TCPOPT_NOP;
458 opt[optlen++] = TCPOPT_CC;
459 opt[optlen++] = TCPOLEN_CC;
460 *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
461
462 optlen += 4;
463 break;
464
465 /*
466 * This is our initial SYN, check whether we have to use
467 * CC or CC.new.
468 */
469 case TH_SYN:
470 opt[optlen++] = TCPOPT_NOP;
471 opt[optlen++] = TCPOPT_NOP;
472 opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
473 TCPOPT_CCNEW : TCPOPT_CC;
474 opt[optlen++] = TCPOLEN_CC;
475 *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
476 optlen += 4;
477 break;
478
479 /*
480 * This is a SYN,ACK; send CC and CC.echo if we received
481 * CC from our peer.
482 */
483 case (TH_SYN|TH_ACK):
484 if (tp->t_flags & TF_RCVD_CC) {
485 opt[optlen++] = TCPOPT_NOP;
486 opt[optlen++] = TCPOPT_NOP;
487 opt[optlen++] = TCPOPT_CC;
488 opt[optlen++] = TCPOLEN_CC;
489 *(u_int32_t *)&opt[optlen] =
490 htonl(tp->cc_send);
491 optlen += 4;
492 opt[optlen++] = TCPOPT_NOP;
493 opt[optlen++] = TCPOPT_NOP;
494 opt[optlen++] = TCPOPT_CCECHO;
495 opt[optlen++] = TCPOLEN_CC;
496 *(u_int32_t *)&opt[optlen] =
497 htonl(tp->cc_recv);
498 optlen += 4;
499 }
500 break;
501 }
502 }
503
504 hdrlen += optlen;
505
506#ifdef INET6
507 if (isipv6)
508 ipoptlen = ip6_optlen(tp->t_inpcb);
509 else
510#endif
511 {
512 if (tp->t_inpcb->inp_options) {
513 ipoptlen = tp->t_inpcb->inp_options->m_len -
514 offsetof(struct ipoption, ipopt_list);
515 } else {
516 ipoptlen = 0;
517 }
518 }
519#ifdef IPSEC
520 ipoptlen += ipsec_hdrsiz_tcp(tp);
521#endif
522
523 /*
524 * Adjust data length if insertion of options will
525 * bump the packet length beyond the t_maxopd length.
526 * Clear the FIN bit because we cut off the tail of
527 * the segment.
528 */
529 if (len + optlen + ipoptlen > tp->t_maxopd) {
530 /*
531 * If there is still more to send, don't close the connection.
532 */
533 flags &= ~TH_FIN;
534 len = tp->t_maxopd - optlen - ipoptlen;
535 sendalot = 1;
536 }
537
538/*#ifdef DIAGNOSTIC*/
539#ifdef INET6
540 if (max_linkhdr + hdrlen > MCLBYTES)
541 panic("tcphdr too big");
542#else
543 if (max_linkhdr + hdrlen > MHLEN)
544 panic("tcphdr too big");
545#endif
546/*#endif*/
547
548 /*
549 * Grab a header mbuf, attaching a copy of data to
550 * be transmitted, and initialize the header from
551 * the template for sends on this connection.
552 */
553 if (len) {
554 if (tp->t_force && len == 1)
555 tcpstat.tcps_sndprobe++;
556 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
557 tcpstat.tcps_sndrexmitpack++;
558 tcpstat.tcps_sndrexmitbyte += len;
559 } else {
560 tcpstat.tcps_sndpack++;
561 tcpstat.tcps_sndbyte += len;
562 }
563#ifdef notyet
564 if ((m = m_copypack(so->so_snd.sb_mb, off,
565 (int)len, max_linkhdr + hdrlen)) == 0) {
566 error = ENOBUFS;
567 goto out;
568 }
569 /*
570 * m_copypack left space for our hdr; use it.
571 */
572 m->m_len += hdrlen;
573 m->m_data -= hdrlen;
574#else
575 MGETHDR(m, M_DONTWAIT, MT_HEADER);
576 if (m == NULL) {
577 error = ENOBUFS;
578 goto out;
579 }
580#ifdef INET6
581 if (MHLEN < hdrlen + max_linkhdr) {
582 MCLGET(m, M_DONTWAIT);
583 if ((m->m_flags & M_EXT) == 0) {
584 m_freem(m);
585 error = ENOBUFS;
586 goto out;
587 }
588 }
589#endif
590 m->m_data += max_linkhdr;
591 m->m_len = hdrlen;
592 if (len <= MHLEN - hdrlen - max_linkhdr) {
593 m_copydata(so->so_snd.sb_mb, off, (int) len,
594 mtod(m, caddr_t) + hdrlen);
595 m->m_len += len;
596 } else {
597 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
598 if (m->m_next == 0) {
599 (void) m_free(m);
600 error = ENOBUFS;
601 goto out;
602 }
603 }
604#endif
605 /*
606 * If we're sending everything we've got, set PUSH.
607 * (This will keep happy those implementations which only
608 * give data to the user when a buffer fills or
609 * a PUSH comes in.)
610 */
611 if (off + len == so->so_snd.sb_cc)
612 flags |= TH_PUSH;
613 } else {
614 if (tp->t_flags & TF_ACKNOW)
615 tcpstat.tcps_sndacks++;
616 else if (flags & (TH_SYN|TH_FIN|TH_RST))
617 tcpstat.tcps_sndctrl++;
618 else if (SEQ_GT(tp->snd_up, tp->snd_una))
619 tcpstat.tcps_sndurg++;
620 else
621 tcpstat.tcps_sndwinup++;
622
623 MGETHDR(m, M_DONTWAIT, MT_HEADER);
624 if (m == NULL) {
625 error = ENOBUFS;
626 goto out;
627 }
628#ifdef INET6
629 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
630 MHLEN >= hdrlen) {
631 MH_ALIGN(m, hdrlen);
632 } else
633#endif
634 m->m_data += max_linkhdr;
635 m->m_len = hdrlen;
636 }
637 m->m_pkthdr.rcvif = (struct ifnet *)0;
638 if (tp->t_template == 0)
639 panic("tcp_output");
640#ifdef INET6
641 if (isipv6) {
642 ip6 = mtod(m, struct ip6_hdr *);
643 th = (struct tcphdr *)(ip6 + 1);
644 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6,
645 sizeof(struct ip6_hdr));
646 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
647 sizeof(struct tcphdr));
648 } else
649#endif /* INET6 */
650 {
651 ip = mtod(m, struct ip *);
652 ipov = (struct ipovly *)ip;
653 th = (struct tcphdr *)(ip + 1);
654 /* this picks up the pseudo header (w/o the length) */
655 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip,
656 sizeof(struct ip));
657 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
658 sizeof(struct tcphdr));
659 }
660
661 /*
662 * Fill in fields, remembering maximum advertised
663 * window for use in delaying messages about window sizes.
664 * If resending a FIN, be sure not to use a new sequence number.
665 */
666 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
667 tp->snd_nxt == tp->snd_max)
668 tp->snd_nxt--;
669 /*
670 * If we are doing retransmissions, then snd_nxt will
671 * not reflect the first unsent octet. For ACK only
672 * packets, we do not want the sequence number of the
673 * retransmitted packet, we want the sequence number
674 * of the next unsent octet. So, if there is no data
675 * (and no SYN or FIN), use snd_max instead of snd_nxt
676 * when filling in ti_seq. But if we are in persist
677 * state, snd_max might reflect one byte beyond the
678 * right edge of the window, so use snd_nxt in that
679 * case, since we know we aren't doing a retransmission.
680 * (retransmit and persist are mutually exclusive...)
681 */
682 if (len || (flags & (TH_SYN|TH_FIN))
683 || callout_active(tp->tt_persist))
684 th->th_seq = htonl(tp->snd_nxt);
685 else
686 th->th_seq = htonl(tp->snd_max);
687 th->th_ack = htonl(tp->rcv_nxt);
688 if (optlen) {
689 bcopy(opt, th + 1, optlen);
690 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
691 }
692 th->th_flags = flags;
693 /*
694 * Calculate receive window. Don't shrink window,
695 * but avoid silly window syndrome.
696 */
697 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
698 win = 0;
699 if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
700 win = (long)(tp->rcv_adv - tp->rcv_nxt);
701 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
702 win = (long)TCP_MAXWIN << tp->rcv_scale;
703 th->th_win = htons((u_short) (win>>tp->rcv_scale));
704 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
705 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
706 th->th_flags |= TH_URG;
707 } else
708 /*
709 * If no urgent pointer to send, then we pull
710 * the urgent pointer to the left edge of the send window
711 * so that it doesn't drift into the send window on sequence
712 * number wraparound.
713 */
714 tp->snd_up = tp->snd_una; /* drag it along */
715
716 /*
717 * Put TCP length in extended header, and then
718 * checksum extended header and data.
719 */
720 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
721#ifdef INET6
722 if (isipv6)
723 /*
724 * ip6_plen does not need to be filled in now; it will be filled
725 * in by ip6_output.
726 */
727 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
728 sizeof(struct tcphdr) + optlen + len);
729 else
730#endif /* INET6 */
731 {
732 m->m_pkthdr.csum_flags = CSUM_TCP;
733 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
734 if (len + optlen)
735 th->th_sum = in_addword(th->th_sum,
736 htons((u_short)(optlen + len)));
737
738 /* IP version must be set here for ipv4/ipv6 checking later */
739 KASSERT(ip->ip_v == IPVERSION,
740 ("%s: IP version incorrect: %d", __FUNCTION__, ip->ip_v));
741 }
742
743 /*
744 * In transmit state, time the transmission and arrange for
745 * the retransmit. In persist state, just set snd_max.
746 */
747 if (tp->t_force == 0 || !callout_active(tp->tt_persist)) {
748 tcp_seq startseq = tp->snd_nxt;
749
750 /*
751 * Advance snd_nxt over sequence space of this segment.
752 */
753 if (flags & (TH_SYN|TH_FIN)) {
754 if (flags & TH_SYN)
755 tp->snd_nxt++;
756 if (flags & TH_FIN) {
757 tp->snd_nxt++;
758 tp->t_flags |= TF_SENTFIN;
759 }
760 }
761 tp->snd_nxt += len;
762 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
763 tp->snd_max = tp->snd_nxt;
764 /*
765 * Time this transmission if not a retransmission and
766 * not currently timing anything.
767 */
768 if (tp->t_rtttime == 0) {
769 tp->t_rtttime = ticks;
770 tp->t_rtseq = startseq;
771 tcpstat.tcps_segstimed++;
772 }
773 }
774
775 /*
776 * Set retransmit timer if not currently set,
777 * and not doing an ack or a keep-alive probe.
778 * Initial value for retransmit timer is smoothed
779 * round-trip time + 2 * round-trip time variance.
780 * Initialize shift counter which is used for backoff
781 * of retransmit time.
782 */
783 if (!callout_active(tp->tt_rexmt) &&
784 tp->snd_nxt != tp->snd_una) {
785 if (callout_active(tp->tt_persist)) {
786 callout_stop(tp->tt_persist);
787 tp->t_rxtshift = 0;
788 }
789 callout_reset(tp->tt_rexmt, tp->t_rxtcur,
790 tcp_timer_rexmt, tp);
791 }
792 } else
793 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
794 tp->snd_max = tp->snd_nxt + len;
795
796#ifdef TCPDEBUG
797 /*
798 * Trace.
799 */
800 if (so->so_options & SO_DEBUG)
801 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
802#endif
803
804 /*
805 * Fill in IP length and desired time to live and
806 * send to IP level. There should be a better way
807 * to handle ttl and tos; we could keep them in
808 * the template, but need a way to checksum without them.
809 */
810 /*
811 * m->m_pkthdr.len should have been set before checksum calculation,
812 * because in6_cksum() needs it.
813 */
814#ifdef INET6
815 if (isipv6) {
63#include <netinet6/ip6_var.h>
64#endif
65#include <netinet/tcp.h>
66#define TCPOUTFLAGS
67#include <netinet/tcp_fsm.h>
68#include <netinet/tcp_seq.h>
69#include <netinet/tcp_timer.h>
70#include <netinet/tcp_var.h>
71#include <netinet/tcpip.h>
72#ifdef TCPDEBUG
73#include <netinet/tcp_debug.h>
74#endif
75
76#ifdef IPSEC
77#include <netinet6/ipsec.h>
78#endif /*IPSEC*/
79
80#include <machine/in_cksum.h>
81
82#ifdef notyet
83extern struct mbuf *m_copypack();
84#endif
85
/*
 * Run-time tunables.  Each variable below is registered with SYSCTL_INT
 * under the _net_inet_tcp node using CTLFLAG_RW and OID_AUTO.
 */

/* Path MTU discovery switch; initialised to 1. */
86static int path_mtu_discovery = 1;
87SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
88 &path_mtu_discovery, 1, "Enable Path MTU Discovery");
89
/*
 * Slow-start flight size.  tcp_output() sets snd_cwnd to
 * t_maxseg * ss_fltsz when the connection has been idle for at least
 * t_rxtcur ticks and the foreign address is not reported as local by
 * in_localaddr()/in6_localaddr().
 */
90int ss_fltsz = 1;
91SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
92 &ss_fltsz, 1, "Slow start flight size");
93
/*
 * Slow-start flight size used in the same idle-restart path when the
 * foreign address is local; snd_cwnd becomes t_maxseg * ss_fltsz_local.
 */
94int ss_fltsz_local = TCP_MAXWIN; /* something large */
95SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
96 &ss_fltsz_local, 1, "Slow start flight size for local networks");
97
/*
 * NewReno switch; initialised to 0.  Not referenced in the visible part
 * of this file -- NOTE(review): presumably consumed by other TCP code.
 */
98int tcp_do_newreno = 0;
99SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
100 0, "Enable NewReno Algorithms");
101/*
102 * Tcp output routine: figure out what should be sent and send it.
103 */
104int
105tcp_output(tp)
106 register struct tcpcb *tp;
107{
108 register struct socket *so = tp->t_inpcb->inp_socket;
109 register long len, win;
110 int off, flags, error;
111 register struct mbuf *m;
112 struct ip *ip = NULL;
113 register struct ipovly *ipov = NULL;
114#ifdef INET6
115 struct ip6_hdr *ip6 = NULL;
116#endif /* INET6 */
117 register struct tcphdr *th;
118 u_char opt[TCP_MAXOLEN];
119 unsigned ipoptlen, optlen, hdrlen;
120 int idle, sendalot;
121 int maxburst = TCP_MAXBURST;
122 struct rmxp_tao *taop;
123 struct rmxp_tao tao_noncached;
124#ifdef INET6
125 int isipv6;
126#endif
127
128#ifdef INET6
129 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
130#endif
131
132 /*
133 * Determine length of data that should be transmitted,
134 * and flags that will be used.
135 * If there is some data or critical controls (SYN, RST)
136 * to send, then transmit; otherwise, investigate further.
137 */
138 idle = (tp->snd_max == tp->snd_una);
139 if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
140 /*
141 * We have been idle for "a while" and no acks are
142 * expected to clock out any data we send --
143 * slow start to get ack "clock" running again.
144 *
145 * Set the slow-start flight size depending on whether
146 * this is a local network or not.
147 */
148 if (
149#ifdef INET6
150 (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) ||
151 (!isipv6 &&
152#endif
153 in_localaddr(tp->t_inpcb->inp_faddr)
154#ifdef INET6
155 )
156#endif
157 )
158 tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
159 else
160 tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
161 }
162again:
163 sendalot = 0;
164 off = tp->snd_nxt - tp->snd_una;
165 win = min(tp->snd_wnd, tp->snd_cwnd);
166
167 flags = tcp_outflags[tp->t_state];
168 /*
169 * Get standard flags, and add SYN or FIN if requested by 'hidden'
170 * state flags.
171 */
172 if (tp->t_flags & TF_NEEDFIN)
173 flags |= TH_FIN;
174 if (tp->t_flags & TF_NEEDSYN)
175 flags |= TH_SYN;
176
177 /*
178 * If in persist timeout with window of 0, send 1 byte.
179 * Otherwise, if window is small but nonzero
180 * and timer expired, we will send what we can
181 * and go to transmit state.
182 */
183 if (tp->t_force) {
184 if (win == 0) {
185 /*
186 * If we still have some data to send, then
187 * clear the FIN bit. Usually this would
188 * happen below when it realizes that we
189 * aren't sending all the data. However,
190 * if we have exactly 1 byte of unsent data,
191 * then it won't clear the FIN bit below,
192 * and if we are in persist state, we wind
193 * up sending the packet without recording
194 * that we sent the FIN bit.
195 *
196 * We can't just blindly clear the FIN bit,
197 * because if we don't have any more data
198 * to send then the probe will be the FIN
199 * itself.
200 */
201 if (off < so->so_snd.sb_cc)
202 flags &= ~TH_FIN;
203 win = 1;
204 } else {
205 callout_stop(tp->tt_persist);
206 tp->t_rxtshift = 0;
207 }
208 }
209
210 len = (long)ulmin(so->so_snd.sb_cc, win) - off;
211
212 if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
213 taop = &tao_noncached;
214 bzero(taop, sizeof(*taop));
215 }
216
217 /*
218 * Lop off SYN bit if it has already been sent. However, if this
219 * is SYN-SENT state and if segment contains data and if we don't
220 * know that foreign host supports TAO, suppress sending segment.
221 */
222 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
223 flags &= ~TH_SYN;
224 off--, len++;
225 if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
226 taop->tao_ccsent == 0)
227 return 0;
228 }
229
230 /*
231 * Be careful not to send data and/or FIN on SYN segments
232 * in cases when no CC option will be sent.
233 * This measure is needed to prevent interoperability problems
234 * with not fully conformant TCP implementations.
235 */
236 if ((flags & TH_SYN) &&
237 ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
238 ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
239 len = 0;
240 flags &= ~TH_FIN;
241 }
242
243 if (len < 0) {
244 /*
245 * If FIN has been sent but not acked,
246 * but we haven't been called to retransmit,
247 * len will be -1. Otherwise, window shrank
248 * after we sent into it. If window shrank to 0,
249 * cancel pending retransmit, pull snd_nxt back
250 * to (closed) window, and set the persist timer
251 * if it isn't already going. If the window didn't
252 * close completely, just wait for an ACK.
253 */
254 len = 0;
255 if (win == 0) {
256 callout_stop(tp->tt_rexmt);
257 tp->t_rxtshift = 0;
258 tp->snd_nxt = tp->snd_una;
259 if (!callout_active(tp->tt_persist))
260 tcp_setpersist(tp);
261 }
262 }
263 if (len > tp->t_maxseg) {
264 len = tp->t_maxseg;
265 sendalot = 1;
266 }
267 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
268 flags &= ~TH_FIN;
269
270 win = sbspace(&so->so_rcv);
271
272 /*
273 * Sender silly window avoidance. If connection is idle
274 * and can send all data, a maximum segment,
275 * at least a maximum default-size segment do it,
276 * or are forced, do it; otherwise don't bother.
277 * If peer's buffer is tiny, then send
278 * when window is at least half open.
279 * If retransmitting (possibly after persist timer forced us
280 * to send into a small window), then must resend.
281 */
282 if (len) {
283 if (len == tp->t_maxseg)
284 goto send;
285 if (!(tp->t_flags & TF_MORETOCOME) &&
286 (idle || tp->t_flags & TF_NODELAY) &&
287 (tp->t_flags & TF_NOPUSH) == 0 &&
288 len + off >= so->so_snd.sb_cc)
289 goto send;
290 if (tp->t_force)
291 goto send;
292 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
293 goto send;
294 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
295 goto send;
296 }
297
298 /*
299 * Compare available window to amount of window
300 * known to peer (as advertised window less
301 * next expected input). If the difference is at least two
302 * max size segments, or at least 50% of the maximum possible
303 * window, then want to send a window update to peer.
304 */
305 if (win > 0) {
306 /*
307 * "adv" is the amount we can increase the window,
308 * taking into account that we are limited by
309 * TCP_MAXWIN << tp->rcv_scale.
310 */
311 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
312 (tp->rcv_adv - tp->rcv_nxt);
313
314 if (adv >= (long) (2 * tp->t_maxseg))
315 goto send;
316 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
317 goto send;
318 }
319
320 /*
321 * Send if we owe peer an ACK.
322 */
323 if (tp->t_flags & TF_ACKNOW)
324 goto send;
325 if ((flags & TH_RST) ||
326 ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
327 goto send;
328 if (SEQ_GT(tp->snd_up, tp->snd_una))
329 goto send;
330 /*
331 * If our state indicates that FIN should be sent
332 * and we have not yet done so, or we're retransmitting the FIN,
333 * then we need to send.
334 */
335 if (flags & TH_FIN &&
336 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
337 goto send;
338
339 /*
340 * TCP window updates are not reliable, rather a polling protocol
341 * using ``persist'' packets is used to insure receipt of window
342 * updates. The three ``states'' for the output side are:
343 * idle not doing retransmits or persists
344 * persisting to move a small or zero window
345 * (re)transmitting and thereby not persisting
346 *
347 * callout_active(tp->tt_persist)
348 * is true when we are in persist state.
349 * tp->t_force
350 * is set when we are called to send a persist packet.
351 * callout_active(tp->tt_rexmt)
352 * is set when we are retransmitting
353 * The output side is idle when both timers are zero.
354 *
355 * If send window is too small, there is data to transmit, and no
356 * retransmit or persist is pending, then go to persist state.
357 * If nothing happens soon, send when timer expires:
358 * if window is nonzero, transmit what we can,
359 * otherwise force out a byte.
360 */
361 if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) &&
362 !callout_active(tp->tt_persist)) {
363 tp->t_rxtshift = 0;
364 tcp_setpersist(tp);
365 }
366
367 /*
368 * No reason to send a segment, just return.
369 */
370 return (0);
371
372send:
373 /*
374 * Before ESTABLISHED, force sending of initial options
375 * unless TCP set not to do any options.
376 * NOTE: we assume that the IP/TCP header plus TCP options
377 * always fit in a single mbuf, leaving room for a maximum
378 * link header, i.e.
379 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
380 */
381 optlen = 0;
382#ifdef INET6
383 if (isipv6)
384 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
385 else
386#endif
387 hdrlen = sizeof (struct tcpiphdr);
388 if (flags & TH_SYN) {
389 tp->snd_nxt = tp->iss;
390 if ((tp->t_flags & TF_NOOPT) == 0) {
391 u_short mss;
392
393 opt[0] = TCPOPT_MAXSEG;
394 opt[1] = TCPOLEN_MAXSEG;
395 mss = htons((u_short) tcp_mssopt(tp));
396 (void)memcpy(opt + 2, &mss, sizeof(mss));
397 optlen = TCPOLEN_MAXSEG;
398
399 if ((tp->t_flags & TF_REQ_SCALE) &&
400 ((flags & TH_ACK) == 0 ||
401 (tp->t_flags & TF_RCVD_SCALE))) {
402 *((u_int32_t *)(opt + optlen)) = htonl(
403 TCPOPT_NOP << 24 |
404 TCPOPT_WINDOW << 16 |
405 TCPOLEN_WINDOW << 8 |
406 tp->request_r_scale);
407 optlen += 4;
408 }
409 }
410 }
411
412 /*
413 * Send a timestamp and echo-reply if this is a SYN and our side
414 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
415 * and our peer have sent timestamps in our SYN's.
416 */
417 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
418 (flags & TH_RST) == 0 &&
419 ((flags & TH_ACK) == 0 ||
420 (tp->t_flags & TF_RCVD_TSTMP))) {
421 u_int32_t *lp = (u_int32_t *)(opt + optlen);
422
423 /* Form timestamp option as shown in appendix A of RFC 1323. */
424 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
425 *lp++ = htonl(ticks);
426 *lp = htonl(tp->ts_recent);
427 optlen += TCPOLEN_TSTAMP_APPA;
428 }
429
430 /*
431 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
432 * options are allowed (!TF_NOOPT) and it's not a RST.
433 */
434 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
435 (flags & TH_RST) == 0) {
436 switch (flags & (TH_SYN|TH_ACK)) {
437 /*
438 * This is a normal ACK, send CC if we received CC before
439 * from our peer.
440 */
441 case TH_ACK:
442 if (!(tp->t_flags & TF_RCVD_CC))
443 break;
444 /*FALLTHROUGH*/
445
446 /*
447 * We can only get here in T/TCP's SYN_SENT* state, when
448 * we're a sending a non-SYN segment without waiting for
449 * the ACK of our SYN. A check above assures that we only
450 * do this if our peer understands T/TCP.
451 */
452 case 0:
453 opt[optlen++] = TCPOPT_NOP;
454 opt[optlen++] = TCPOPT_NOP;
455 opt[optlen++] = TCPOPT_CC;
456 opt[optlen++] = TCPOLEN_CC;
457 *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
458
459 optlen += 4;
460 break;
461
462 /*
463 * This is our initial SYN, check whether we have to use
464 * CC or CC.new.
465 */
466 case TH_SYN:
467 opt[optlen++] = TCPOPT_NOP;
468 opt[optlen++] = TCPOPT_NOP;
469 opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
470 TCPOPT_CCNEW : TCPOPT_CC;
471 opt[optlen++] = TCPOLEN_CC;
472 *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
473 optlen += 4;
474 break;
475
476 /*
477 * This is a SYN,ACK; send CC and CC.echo if we received
478 * CC from our peer.
479 */
480 case (TH_SYN|TH_ACK):
481 if (tp->t_flags & TF_RCVD_CC) {
482 opt[optlen++] = TCPOPT_NOP;
483 opt[optlen++] = TCPOPT_NOP;
484 opt[optlen++] = TCPOPT_CC;
485 opt[optlen++] = TCPOLEN_CC;
486 *(u_int32_t *)&opt[optlen] =
487 htonl(tp->cc_send);
488 optlen += 4;
489 opt[optlen++] = TCPOPT_NOP;
490 opt[optlen++] = TCPOPT_NOP;
491 opt[optlen++] = TCPOPT_CCECHO;
492 opt[optlen++] = TCPOLEN_CC;
493 *(u_int32_t *)&opt[optlen] =
494 htonl(tp->cc_recv);
495 optlen += 4;
496 }
497 break;
498 }
499 }
500
501 hdrlen += optlen;
502
503#ifdef INET6
504 if (isipv6)
505 ipoptlen = ip6_optlen(tp->t_inpcb);
506 else
507#endif
508 {
509 if (tp->t_inpcb->inp_options) {
510 ipoptlen = tp->t_inpcb->inp_options->m_len -
511 offsetof(struct ipoption, ipopt_list);
512 } else {
513 ipoptlen = 0;
514 }
515 }
516#ifdef IPSEC
517 ipoptlen += ipsec_hdrsiz_tcp(tp);
518#endif
519
520 /*
521 * Adjust data length if insertion of options will
522 * bump the packet length beyond the t_maxopd length.
523 * Clear the FIN bit because we cut off the tail of
524 * the segment.
525 */
526 if (len + optlen + ipoptlen > tp->t_maxopd) {
527 /*
528 * If there is still more to send, don't close the connection.
529 */
530 flags &= ~TH_FIN;
531 len = tp->t_maxopd - optlen - ipoptlen;
532 sendalot = 1;
533 }
534
535/*#ifdef DIAGNOSTIC*/
536#ifdef INET6
537 if (max_linkhdr + hdrlen > MCLBYTES)
538 panic("tcphdr too big");
539#else
540 if (max_linkhdr + hdrlen > MHLEN)
541 panic("tcphdr too big");
542#endif
543/*#endif*/
544
545 /*
546 * Grab a header mbuf, attaching a copy of data to
547 * be transmitted, and initialize the header from
548 * the template for sends on this connection.
549 */
550 if (len) {
551 if (tp->t_force && len == 1)
552 tcpstat.tcps_sndprobe++;
553 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
554 tcpstat.tcps_sndrexmitpack++;
555 tcpstat.tcps_sndrexmitbyte += len;
556 } else {
557 tcpstat.tcps_sndpack++;
558 tcpstat.tcps_sndbyte += len;
559 }
560#ifdef notyet
561 if ((m = m_copypack(so->so_snd.sb_mb, off,
562 (int)len, max_linkhdr + hdrlen)) == 0) {
563 error = ENOBUFS;
564 goto out;
565 }
566 /*
567 * m_copypack left space for our hdr; use it.
568 */
569 m->m_len += hdrlen;
570 m->m_data -= hdrlen;
571#else
572 MGETHDR(m, M_DONTWAIT, MT_HEADER);
573 if (m == NULL) {
574 error = ENOBUFS;
575 goto out;
576 }
577#ifdef INET6
578 if (MHLEN < hdrlen + max_linkhdr) {
579 MCLGET(m, M_DONTWAIT);
580 if ((m->m_flags & M_EXT) == 0) {
581 m_freem(m);
582 error = ENOBUFS;
583 goto out;
584 }
585 }
586#endif
587 m->m_data += max_linkhdr;
588 m->m_len = hdrlen;
589 if (len <= MHLEN - hdrlen - max_linkhdr) {
590 m_copydata(so->so_snd.sb_mb, off, (int) len,
591 mtod(m, caddr_t) + hdrlen);
592 m->m_len += len;
593 } else {
594 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
595 if (m->m_next == 0) {
596 (void) m_free(m);
597 error = ENOBUFS;
598 goto out;
599 }
600 }
601#endif
602 /*
603 * If we're sending everything we've got, set PUSH.
604 * (This will keep happy those implementations which only
605 * give data to the user when a buffer fills or
606 * a PUSH comes in.)
607 */
608 if (off + len == so->so_snd.sb_cc)
609 flags |= TH_PUSH;
610 } else {
611 if (tp->t_flags & TF_ACKNOW)
612 tcpstat.tcps_sndacks++;
613 else if (flags & (TH_SYN|TH_FIN|TH_RST))
614 tcpstat.tcps_sndctrl++;
615 else if (SEQ_GT(tp->snd_up, tp->snd_una))
616 tcpstat.tcps_sndurg++;
617 else
618 tcpstat.tcps_sndwinup++;
619
620 MGETHDR(m, M_DONTWAIT, MT_HEADER);
621 if (m == NULL) {
622 error = ENOBUFS;
623 goto out;
624 }
625#ifdef INET6
626 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
627 MHLEN >= hdrlen) {
628 MH_ALIGN(m, hdrlen);
629 } else
630#endif
631 m->m_data += max_linkhdr;
632 m->m_len = hdrlen;
633 }
634 m->m_pkthdr.rcvif = (struct ifnet *)0;
635 if (tp->t_template == 0)
636 panic("tcp_output");
637#ifdef INET6
638 if (isipv6) {
639 ip6 = mtod(m, struct ip6_hdr *);
640 th = (struct tcphdr *)(ip6 + 1);
641 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6,
642 sizeof(struct ip6_hdr));
643 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
644 sizeof(struct tcphdr));
645 } else
646#endif /* INET6 */
647 {
648 ip = mtod(m, struct ip *);
649 ipov = (struct ipovly *)ip;
650 th = (struct tcphdr *)(ip + 1);
651 /* this picks up the pseudo header (w/o the length) */
652 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip,
653 sizeof(struct ip));
654 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
655 sizeof(struct tcphdr));
656 }
657
658 /*
659 * Fill in fields, remembering maximum advertised
660 * window for use in delaying messages about window sizes.
661 * If resending a FIN, be sure not to use a new sequence number.
662 */
663 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
664 tp->snd_nxt == tp->snd_max)
665 tp->snd_nxt--;
666 /*
667 * If we are doing retransmissions, then snd_nxt will
668 * not reflect the first unsent octet. For ACK only
669 * packets, we do not want the sequence number of the
670 * retransmitted packet, we want the sequence number
671 * of the next unsent octet. So, if there is no data
672 * (and no SYN or FIN), use snd_max instead of snd_nxt
673 * when filling in ti_seq. But if we are in persist
674 * state, snd_max might reflect one byte beyond the
675 * right edge of the window, so use snd_nxt in that
676 * case, since we know we aren't doing a retransmission.
677 * (retransmit and persist are mutually exclusive...)
678 */
679 if (len || (flags & (TH_SYN|TH_FIN))
680 || callout_active(tp->tt_persist))
681 th->th_seq = htonl(tp->snd_nxt);
682 else
683 th->th_seq = htonl(tp->snd_max);
684 th->th_ack = htonl(tp->rcv_nxt);
685 if (optlen) {
686 bcopy(opt, th + 1, optlen);
687 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
688 }
689 th->th_flags = flags;
690 /*
691 * Calculate receive window. Don't shrink window,
692 * but avoid silly window syndrome.
693 */
694 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
695 win = 0;
696 if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
697 win = (long)(tp->rcv_adv - tp->rcv_nxt);
698 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
699 win = (long)TCP_MAXWIN << tp->rcv_scale;
700 th->th_win = htons((u_short) (win>>tp->rcv_scale));
701 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
702 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
703 th->th_flags |= TH_URG;
704 } else
705 /*
706 * If no urgent pointer to send, then we pull
707 * the urgent pointer to the left edge of the send window
708 * so that it doesn't drift into the send window on sequence
709 * number wraparound.
710 */
711 tp->snd_up = tp->snd_una; /* drag it along */
712
713 /*
714 * Put TCP length in extended header, and then
715 * checksum extended header and data.
716 */
717 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
718#ifdef INET6
719 if (isipv6)
720 /*
721 * ip6_plen is not need to be filled now, and will be filled
722 * in ip6_output.
723 */
724 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
725 sizeof(struct tcphdr) + optlen + len);
726 else
727#endif /* INET6 */
728 {
729 m->m_pkthdr.csum_flags = CSUM_TCP;
730 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
731 if (len + optlen)
732 th->th_sum = in_addword(th->th_sum,
733 htons((u_short)(optlen + len)));
734
735 /* IP version must be set here for ipv4/ipv6 checking later */
736 KASSERT(ip->ip_v == IPVERSION,
737 ("%s: IP version incorrect: %d", __FUNCTION__, ip->ip_v));
738 }
739
740 /*
741 * In transmit state, time the transmission and arrange for
742 * the retransmit. In persist state, just set snd_max.
743 */
744 if (tp->t_force == 0 || !callout_active(tp->tt_persist)) {
745 tcp_seq startseq = tp->snd_nxt;
746
747 /*
748 * Advance snd_nxt over sequence space of this segment.
749 */
750 if (flags & (TH_SYN|TH_FIN)) {
751 if (flags & TH_SYN)
752 tp->snd_nxt++;
753 if (flags & TH_FIN) {
754 tp->snd_nxt++;
755 tp->t_flags |= TF_SENTFIN;
756 }
757 }
758 tp->snd_nxt += len;
759 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
760 tp->snd_max = tp->snd_nxt;
761 /*
762 * Time this transmission if not a retransmission and
763 * not currently timing anything.
764 */
765 if (tp->t_rtttime == 0) {
766 tp->t_rtttime = ticks;
767 tp->t_rtseq = startseq;
768 tcpstat.tcps_segstimed++;
769 }
770 }
771
772 /*
773 * Set retransmit timer if not currently set,
774 * and not doing an ack or a keep-alive probe.
775 * Initial value for retransmit timer is smoothed
776 * round-trip time + 2 * round-trip time variance.
777 * Initialize shift counter which is used for backoff
778 * of retransmit time.
779 */
780 if (!callout_active(tp->tt_rexmt) &&
781 tp->snd_nxt != tp->snd_una) {
782 if (callout_active(tp->tt_persist)) {
783 callout_stop(tp->tt_persist);
784 tp->t_rxtshift = 0;
785 }
786 callout_reset(tp->tt_rexmt, tp->t_rxtcur,
787 tcp_timer_rexmt, tp);
788 }
789 } else
790 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
791 tp->snd_max = tp->snd_nxt + len;
792
793#ifdef TCPDEBUG
794 /*
795 * Trace.
796 */
797 if (so->so_options & SO_DEBUG)
798 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
799#endif
800
801 /*
802 * Fill in IP length and desired time to live and
803 * send to IP level. There should be a better way
804 * to handle ttl and tos; we could keep them in
805 * the template, but need a way to checksum without them.
806 */
807 /*
808 * m->m_pkthdr.len should have been set before cksum calcuration,
809 * because in6_cksum() need it.
810 */
811#ifdef INET6
812 if (isipv6) {
816 /*
813 /*
817 * we separately set hoplimit for every segment, since the
818 * user might want to change the value via setsockopt.
819 * Also, desired default hop limit might be changed via
814 * we separately set hoplimit for every segment, since the
815 * user might want to change the value via setsockopt.
816 * Also, desired default hop limit might be changed via
820 * Neighbor Discovery.
821 */
822 ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
823 tp->t_inpcb->in6p_route.ro_rt ?
824 tp->t_inpcb->in6p_route.ro_rt->rt_ifp
825 : NULL);
817 * Neighbor Discovery.
818 */
819 ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
820 tp->t_inpcb->in6p_route.ro_rt ?
821 tp->t_inpcb->in6p_route.ro_rt->rt_ifp
822 : NULL);
826
827 /* TODO: IPv6 IP6TOS_ECT bit on */
828#ifdef IPSEC
823
824 /* TODO: IPv6 IP6TOS_ECT bit on */
825#ifdef IPSEC
829 m->m_pkthdr.rcvif = (struct ifnet *)so;
826 ipsec_setsocket(m, so);
830#endif /*IPSEC*/
831 error = ip6_output(m,
832 tp->t_inpcb->in6p_outputopts,
833 &tp->t_inpcb->in6p_route,
827#endif /*IPSEC*/
828 error = ip6_output(m,
829 tp->t_inpcb->in6p_outputopts,
830 &tp->t_inpcb->in6p_route,
834 (so->so_options & SO_DONTROUTE)|IPV6_SOCKINMRCVIF,
835 NULL, NULL);
831 (so->so_options & SO_DONTROUTE), NULL, NULL);
836 } else
837#endif /* INET6 */
838 {
839 struct rtentry *rt;
840 ip->ip_len = m->m_pkthdr.len;
832 } else
833#endif /* INET6 */
834 {
835 struct rtentry *rt;
836 ip->ip_len = m->m_pkthdr.len;
837#ifdef INET6
838 if (INP_CHECK_SOCKAF(so, AF_INET6))
839 ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
840 tp->t_inpcb->in6p_route.ro_rt ?
841 tp->t_inpcb->in6p_route.ro_rt->rt_ifp
842 : NULL);
843 else
844#endif /* INET6 */
841 ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */
842 ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */
843 /*
844 * See if we should do MTU discovery. We do it only if the following
845 * are true:
846 * 1) we have a valid route to the destination
847 * 2) the MTU is not locked (if it is, then discovery has been
848 * disabled)
849 */
850 if (path_mtu_discovery
851 && (rt = tp->t_inpcb->inp_route.ro_rt)
852 && rt->rt_flags & RTF_UP
853 && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
854 ip->ip_off |= IP_DF;
855 }
845 ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */
846 ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */
847 /*
848 * See if we should do MTU discovery. We do it only if the following
849 * are true:
850 * 1) we have a valid route to the destination
851 * 2) the MTU is not locked (if it is, then discovery has been
852 * disabled)
853 */
854 if (path_mtu_discovery
855 && (rt = tp->t_inpcb->inp_route.ro_rt)
856 && rt->rt_flags & RTF_UP
857 && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
858 ip->ip_off |= IP_DF;
859 }
860#ifdef IPSEC
861 ipsec_setsocket(m, so);
862#endif /*IPSEC*/
856 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
863 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
857 (so->so_options & SO_DONTROUTE)|IP_SOCKINMRCVIF, 0);
864 (so->so_options & SO_DONTROUTE), 0);
858 }
859 if (error) {
860out:
861 if (error == ENOBUFS) {
862 if (!callout_active(tp->tt_rexmt) &&
863 !callout_active(tp->tt_persist))
864 callout_reset(tp->tt_rexmt, tp->t_rxtcur,
865 tcp_timer_rexmt, tp);
866 tcp_quench(tp->t_inpcb, 0);
867 return (0);
868 }
869 if (error == EMSGSIZE) {
870 /*
871 * ip_output() will have already fixed the route
872 * for us. tcp_mtudisc() will, as its last action,
873 * initiate retransmission, so it is important to
874 * not do so here.
875 */
876 tcp_mtudisc(tp->t_inpcb, 0);
877 return 0;
878 }
879 if ((error == EHOSTUNREACH || error == ENETDOWN)
880 && TCPS_HAVERCVDSYN(tp->t_state)) {
881 tp->t_softerror = error;
882 return (0);
883 }
884 return (error);
885 }
886 tcpstat.tcps_sndtotal++;
887
888 /*
889 * Data sent (as far as we can tell).
890 * If this advertises a larger window than any other segment,
891 * then remember the size of the advertised window.
892 * Any pending ACK has now been sent.
893 */
894 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
895 tp->rcv_adv = tp->rcv_nxt + win;
896 tp->last_ack_sent = tp->rcv_nxt;
897 tp->t_flags &= ~TF_ACKNOW;
898 if (tcp_delack_enabled)
899 callout_stop(tp->tt_delack);
900 if (sendalot && (!tcp_do_newreno || --maxburst))
901 goto again;
902 return (0);
903}
904
/*
 * tcp_setpersist: start or restart the persist timer for connection tp.
 *
 * tp - TCP control block; its t_srtt, t_rttvar and t_rxtshift fields are
 *      read, and t_rxtshift may be incremented.
 *
 * Panics if the retransmit timer is active when called.  Returns nothing.
 */
905void
906tcp_setpersist(tp)
907 register struct tcpcb *tp;
908{
/* Base interval derived from the smoothed RTT and RTT variance fields. */
909 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
910 int tt;
911
/* The persist timer must not be armed while a retransmit is pending. */
912 if (callout_active(tp->tt_rexmt))
913 panic("tcp_setpersist: retransmit pending");
914 /*
915 * Start/restart persistence timer.
916 */
/*
 * Scale the base interval by the backoff table entry for the current
 * shift count and store the result in tt.
 * NOTE(review): TCPT_RANGESET presumably clamps the value to
 * [TCPTV_PERSMIN, TCPTV_PERSMAX] -- confirm against its definition.
 */
917 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
918 TCPTV_PERSMIN, TCPTV_PERSMAX);
/* (Re)arm the persist callout to invoke tcp_timer_persist(tp) after tt. */
919 callout_reset(tp->tt_persist, tt, tcp_timer_persist, tp);
/* Advance the backoff index for the next call, capped at TCP_MAXRXTSHIFT. */
920 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
921 tp->t_rxtshift++;
922}
865 }
866 if (error) {
867out:
868 if (error == ENOBUFS) {
869 if (!callout_active(tp->tt_rexmt) &&
870 !callout_active(tp->tt_persist))
871 callout_reset(tp->tt_rexmt, tp->t_rxtcur,
872 tcp_timer_rexmt, tp);
873 tcp_quench(tp->t_inpcb, 0);
874 return (0);
875 }
876 if (error == EMSGSIZE) {
877 /*
878 * ip_output() will have already fixed the route
879 * for us. tcp_mtudisc() will, as its last action,
880 * initiate retransmission, so it is important to
881 * not do so here.
882 */
883 tcp_mtudisc(tp->t_inpcb, 0);
884 return 0;
885 }
886 if ((error == EHOSTUNREACH || error == ENETDOWN)
887 && TCPS_HAVERCVDSYN(tp->t_state)) {
888 tp->t_softerror = error;
889 return (0);
890 }
891 return (error);
892 }
893 tcpstat.tcps_sndtotal++;
894
895 /*
896 * Data sent (as far as we can tell).
897 * If this advertises a larger window than any other segment,
898 * then remember the size of the advertised window.
899 * Any pending ACK has now been sent.
900 */
901 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
902 tp->rcv_adv = tp->rcv_nxt + win;
903 tp->last_ack_sent = tp->rcv_nxt;
904 tp->t_flags &= ~TF_ACKNOW;
905 if (tcp_delack_enabled)
906 callout_stop(tp->tt_delack);
907 if (sendalot && (!tcp_do_newreno || --maxburst))
908 goto again;
909 return (0);
910}
911
/*
 * tcp_setpersist: start or restart the persist timer for connection tp.
 *
 * tp - TCP control block; its t_srtt, t_rttvar and t_rxtshift fields are
 *      read, and t_rxtshift may be incremented.
 *
 * Panics if the retransmit timer is active when called.  Returns nothing.
 */
912void
913tcp_setpersist(tp)
914 register struct tcpcb *tp;
915{
/* Base interval derived from the smoothed RTT and RTT variance fields. */
916 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
917 int tt;
918
/* The persist timer must not be armed while a retransmit is pending. */
919 if (callout_active(tp->tt_rexmt))
920 panic("tcp_setpersist: retransmit pending");
921 /*
922 * Start/restart persistence timer.
923 */
/*
 * Scale the base interval by the backoff table entry for the current
 * shift count and store the result in tt.
 * NOTE(review): TCPT_RANGESET presumably clamps the value to
 * [TCPTV_PERSMIN, TCPTV_PERSMAX] -- confirm against its definition.
 */
924 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
925 TCPTV_PERSMIN, TCPTV_PERSMAX);
/* (Re)arm the persist callout to invoke tcp_timer_persist(tp) after tt. */
926 callout_reset(tp->tt_persist, tt, tcp_timer_persist, tp);
/* Advance the backoff index for the next call, capped at TCP_MAXRXTSHIFT. */
927 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
928 tp->t_rxtshift++;
929}