Deleted Added
full compact
tcp_output.c (293910) tcp_output.c (294535)
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
30 */
31
32#include <sys/cdefs.h>
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/netinet/tcp_output.c 293910 2016-01-14 10:22:45Z glebius $");
33__FBSDID("$FreeBSD: head/sys/netinet/tcp_output.c 294535 2016-01-21 22:34:51Z glebius $");
34
35#include "opt_inet.h"
36#include "opt_inet6.h"
37#include "opt_ipsec.h"
38#include "opt_tcpdebug.h"
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/domain.h>
43#include <sys/hhook.h>
44#include <sys/kernel.h>
45#include <sys/lock.h>
46#include <sys/mbuf.h>
47#include <sys/mutex.h>
48#include <sys/protosw.h>
49#include <sys/sdt.h>
50#include <sys/socket.h>
51#include <sys/socketvar.h>
52#include <sys/sysctl.h>
53
54#include <net/if.h>
55#include <net/route.h>
56#include <net/vnet.h>
57
34
35#include "opt_inet.h"
36#include "opt_inet6.h"
37#include "opt_ipsec.h"
38#include "opt_tcpdebug.h"
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/domain.h>
43#include <sys/hhook.h>
44#include <sys/kernel.h>
45#include <sys/lock.h>
46#include <sys/mbuf.h>
47#include <sys/mutex.h>
48#include <sys/protosw.h>
49#include <sys/sdt.h>
50#include <sys/socket.h>
51#include <sys/socketvar.h>
52#include <sys/sysctl.h>
53
54#include <net/if.h>
55#include <net/route.h>
56#include <net/vnet.h>
57
58#include <netinet/cc.h>
59#include <netinet/in.h>
60#include <netinet/in_kdtrace.h>
61#include <netinet/in_systm.h>
62#include <netinet/ip.h>
63#include <netinet/in_pcb.h>
64#include <netinet/ip_var.h>
65#include <netinet/ip_options.h>
66#ifdef INET6
67#include <netinet6/in6_pcb.h>
68#include <netinet/ip6.h>
69#include <netinet6/ip6_var.h>
70#endif
71#ifdef TCP_RFC7413
72#include <netinet/tcp_fastopen.h>
73#endif
58#include <netinet/in.h>
59#include <netinet/in_kdtrace.h>
60#include <netinet/in_systm.h>
61#include <netinet/ip.h>
62#include <netinet/in_pcb.h>
63#include <netinet/ip_var.h>
64#include <netinet/ip_options.h>
65#ifdef INET6
66#include <netinet6/in6_pcb.h>
67#include <netinet/ip6.h>
68#include <netinet6/ip6_var.h>
69#endif
70#ifdef TCP_RFC7413
71#include <netinet/tcp_fastopen.h>
72#endif
73#include <netinet/tcp.h>
74#define TCPOUTFLAGS
75#include <netinet/tcp_fsm.h>
76#include <netinet/tcp_seq.h>
77#include <netinet/tcp_timer.h>
78#include <netinet/tcp_var.h>
79#include <netinet/tcpip.h>
74#define TCPOUTFLAGS
75#include <netinet/tcp_fsm.h>
76#include <netinet/tcp_seq.h>
77#include <netinet/tcp_timer.h>
78#include <netinet/tcp_var.h>
79#include <netinet/tcpip.h>
80#include <netinet/tcp_cc.h>
80#ifdef TCPPCAP
81#include <netinet/tcp_pcap.h>
82#endif
83#ifdef TCPDEBUG
84#include <netinet/tcp_debug.h>
85#endif
86#ifdef TCP_OFFLOAD
87#include <netinet/tcp_offload.h>
88#endif
89
90#ifdef IPSEC
91#include <netipsec/ipsec.h>
92#endif /*IPSEC*/
93
94#include <machine/in_cksum.h>
95
96#include <security/mac/mac_framework.h>
97
98VNET_DEFINE(int, path_mtu_discovery) = 1;
99SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW,
100 &VNET_NAME(path_mtu_discovery), 1,
101 "Enable Path MTU Discovery");
102
103VNET_DEFINE(int, tcp_do_tso) = 1;
104#define V_tcp_do_tso VNET(tcp_do_tso)
105SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
106 &VNET_NAME(tcp_do_tso), 0,
107 "Enable TCP Segmentation Offload");
108
109VNET_DEFINE(int, tcp_sendspace) = 1024*32;
110#define V_tcp_sendspace VNET(tcp_sendspace)
111SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW,
112 &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
113
114VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
115#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
116SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
117 &VNET_NAME(tcp_do_autosndbuf), 0,
118 "Enable automatic send buffer sizing");
119
120VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
121#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
122SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
123 &VNET_NAME(tcp_autosndbuf_inc), 0,
124 "Incrementor step size of automatic send buffer");
125
126VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
127#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
128SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
129 &VNET_NAME(tcp_autosndbuf_max), 0,
130 "Max size of automatic send buffer");
131
/*
 * Forward declarations of the file-local inline helpers defined below.
 * "inline" is placed ahead of the return type so that the declaration
 * specifiers are in conventional order (avoids -Wold-style-declaration).
 */
static inline void	hhook_run_tcp_est_out(struct tcpcb *tp,
			    struct tcphdr *th, struct tcpopt *to,
			    long len, int tso);
static inline void	cc_after_idle(struct tcpcb *tp);
136
137/*
138 * Wrapper for the TCP established output helper hook.
139 */
140static void inline
141hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
142 struct tcpopt *to, long len, int tso)
143{
144 struct tcp_hhook_data hhook_data;
145
146 if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
147 hhook_data.tp = tp;
148 hhook_data.th = th;
149 hhook_data.to = to;
150 hhook_data.len = len;
151 hhook_data.tso = tso;
152
153 hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
154 tp->osd);
155 }
156}
157
158/*
159 * CC wrapper hook functions
160 */
161static void inline
162cc_after_idle(struct tcpcb *tp)
163{
164 INP_WLOCK_ASSERT(tp->t_inpcb);
165
166 if (CC_ALGO(tp)->after_idle != NULL)
167 CC_ALGO(tp)->after_idle(tp->ccv);
168}
169
170/*
171 * Tcp output routine: figure out what should be sent and send it.
172 */
173int
174tcp_output(struct tcpcb *tp)
175{
176 struct socket *so = tp->t_inpcb->inp_socket;
177 long len, recwin, sendwin;
178 int off, flags, error = 0; /* Keep compiler happy */
179 struct mbuf *m;
180 struct ip *ip = NULL;
181 struct ipovly *ipov = NULL;
182 struct tcphdr *th;
183 u_char opt[TCP_MAXOLEN];
184 unsigned ipoptlen, optlen, hdrlen;
185#ifdef IPSEC
186 unsigned ipsec_optlen = 0;
187#endif
188 int idle, sendalot;
189 int sack_rxmit, sack_bytes_rxmt;
190 struct sackhole *p;
191 int tso, mtu;
192 struct tcpopt to;
193#if 0
194 int maxburst = TCP_MAXBURST;
195#endif
196#ifdef INET6
197 struct ip6_hdr *ip6 = NULL;
198 int isipv6;
199
200 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
201#endif
202
203 INP_WLOCK_ASSERT(tp->t_inpcb);
204
205#ifdef TCP_OFFLOAD
206 if (tp->t_flags & TF_TOE)
207 return (tcp_offload_output(tp));
208#endif
209
210#ifdef TCP_RFC7413
211 /*
212 * For TFO connections in SYN_RECEIVED, only allow the initial
213 * SYN|ACK and those sent by the retransmit timer.
214 */
215 if ((tp->t_flags & TF_FASTOPEN) &&
216 (tp->t_state == TCPS_SYN_RECEIVED) &&
217 SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */
218 (tp->snd_nxt != tp->snd_una)) /* not a retransmit */
219 return (0);
220#endif
221 /*
222 * Determine length of data that should be transmitted,
223 * and flags that will be used.
224 * If there is some data or critical controls (SYN, RST)
225 * to send, then transmit; otherwise, investigate further.
226 */
227 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
228 if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
229 cc_after_idle(tp);
230 tp->t_flags &= ~TF_LASTIDLE;
231 if (idle) {
232 if (tp->t_flags & TF_MORETOCOME) {
233 tp->t_flags |= TF_LASTIDLE;
234 idle = 0;
235 }
236 }
237again:
238 /*
239 * If we've recently taken a timeout, snd_max will be greater than
240 * snd_nxt. There may be SACK information that allows us to avoid
241 * resending already delivered data. Adjust snd_nxt accordingly.
242 */
243 if ((tp->t_flags & TF_SACK_PERMIT) &&
244 SEQ_LT(tp->snd_nxt, tp->snd_max))
245 tcp_sack_adjust(tp);
246 sendalot = 0;
247 tso = 0;
248 mtu = 0;
249 off = tp->snd_nxt - tp->snd_una;
250 sendwin = min(tp->snd_wnd, tp->snd_cwnd);
251
252 flags = tcp_outflags[tp->t_state];
253 /*
254 * Send any SACK-generated retransmissions. If we're explicitly trying
255 * to send out new data (when sendalot is 1), bypass this function.
256 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
257 * we're replacing a (future) new transmission with a retransmission
258 * now, and we previously incremented snd_cwnd in tcp_input().
259 */
260 /*
261 * Still in sack recovery, reset rxmit flag to zero.
262 */
263 sack_rxmit = 0;
264 sack_bytes_rxmt = 0;
265 len = 0;
266 p = NULL;
267 if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
268 (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
269 long cwin;
270
271 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
272 if (cwin < 0)
273 cwin = 0;
274 /* Do not retransmit SACK segments beyond snd_recover */
275 if (SEQ_GT(p->end, tp->snd_recover)) {
276 /*
277 * (At least) part of sack hole extends beyond
278 * snd_recover. Check to see if we can rexmit data
279 * for this hole.
280 */
281 if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
282 /*
283 * Can't rexmit any more data for this hole.
284 * That data will be rexmitted in the next
285 * sack recovery episode, when snd_recover
286 * moves past p->rxmit.
287 */
288 p = NULL;
289 goto after_sack_rexmit;
290 } else
291 /* Can rexmit part of the current hole */
292 len = ((long)ulmin(cwin,
293 tp->snd_recover - p->rxmit));
294 } else
295 len = ((long)ulmin(cwin, p->end - p->rxmit));
296 off = p->rxmit - tp->snd_una;
297 KASSERT(off >= 0,("%s: sack block to the left of una : %d",
298 __func__, off));
299 if (len > 0) {
300 sack_rxmit = 1;
301 sendalot = 1;
302 TCPSTAT_INC(tcps_sack_rexmits);
303 TCPSTAT_ADD(tcps_sack_rexmit_bytes,
304 min(len, tp->t_maxseg));
305 }
306 }
307after_sack_rexmit:
308 /*
309 * Get standard flags, and add SYN or FIN if requested by 'hidden'
310 * state flags.
311 */
312 if (tp->t_flags & TF_NEEDFIN)
313 flags |= TH_FIN;
314 if (tp->t_flags & TF_NEEDSYN)
315 flags |= TH_SYN;
316
317 SOCKBUF_LOCK(&so->so_snd);
318 /*
319 * If in persist timeout with window of 0, send 1 byte.
320 * Otherwise, if window is small but nonzero
321 * and timer expired, we will send what we can
322 * and go to transmit state.
323 */
324 if (tp->t_flags & TF_FORCEDATA) {
325 if (sendwin == 0) {
326 /*
327 * If we still have some data to send, then
328 * clear the FIN bit. Usually this would
329 * happen below when it realizes that we
330 * aren't sending all the data. However,
331 * if we have exactly 1 byte of unsent data,
332 * then it won't clear the FIN bit below,
333 * and if we are in persist state, we wind
334 * up sending the packet without recording
335 * that we sent the FIN bit.
336 *
337 * We can't just blindly clear the FIN bit,
338 * because if we don't have any more data
339 * to send then the probe will be the FIN
340 * itself.
341 */
342 if (off < sbused(&so->so_snd))
343 flags &= ~TH_FIN;
344 sendwin = 1;
345 } else {
346 tcp_timer_activate(tp, TT_PERSIST, 0);
347 tp->t_rxtshift = 0;
348 }
349 }
350
351 /*
352 * If snd_nxt == snd_max and we have transmitted a FIN, the
353 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
354 * a negative length. This can also occur when TCP opens up
355 * its congestion window while receiving additional duplicate
356 * acks after fast-retransmit because TCP will reset snd_nxt
357 * to snd_max after the fast-retransmit.
358 *
359 * In the normal retransmit-FIN-only case, however, snd_nxt will
360 * be set to snd_una, the offset will be 0, and the length may
361 * wind up 0.
362 *
363 * If sack_rxmit is true we are retransmitting from the scoreboard
364 * in which case len is already set.
365 */
366 if (sack_rxmit == 0) {
367 if (sack_bytes_rxmt == 0)
368 len = ((long)ulmin(sbavail(&so->so_snd), sendwin) -
369 off);
370 else {
371 long cwin;
372
373 /*
374 * We are inside of a SACK recovery episode and are
375 * sending new data, having retransmitted all the
376 * data possible in the scoreboard.
377 */
378 len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) -
379 off);
380 /*
381 * Don't remove this (len > 0) check !
382 * We explicitly check for len > 0 here (although it
383 * isn't really necessary), to work around a gcc
384 * optimization issue - to force gcc to compute
385 * len above. Without this check, the computation
386 * of len is bungled by the optimizer.
387 */
388 if (len > 0) {
389 cwin = tp->snd_cwnd -
390 (tp->snd_nxt - tp->sack_newdata) -
391 sack_bytes_rxmt;
392 if (cwin < 0)
393 cwin = 0;
394 len = lmin(len, cwin);
395 }
396 }
397 }
398
399 /*
400 * Lop off SYN bit if it has already been sent. However, if this
401 * is SYN-SENT state and if segment contains data and if we don't
402 * know that foreign host supports TAO, suppress sending segment.
403 */
404 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
405 if (tp->t_state != TCPS_SYN_RECEIVED)
406 flags &= ~TH_SYN;
407#ifdef TCP_RFC7413
408 /*
409 * When sending additional segments following a TFO SYN|ACK,
410 * do not include the SYN bit.
411 */
412 if ((tp->t_flags & TF_FASTOPEN) &&
413 (tp->t_state == TCPS_SYN_RECEIVED))
414 flags &= ~TH_SYN;
415#endif
416 off--, len++;
417 }
418
419 /*
420 * Be careful not to send data and/or FIN on SYN segments.
421 * This measure is needed to prevent interoperability problems
422 * with not fully conformant TCP implementations.
423 */
424 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
425 len = 0;
426 flags &= ~TH_FIN;
427 }
428
429#ifdef TCP_RFC7413
430 /*
431 * When retransmitting SYN|ACK on a passively-created TFO socket,
432 * don't include data, as the presence of data may have caused the
433 * original SYN|ACK to have been dropped by a middlebox.
434 */
435 if ((tp->t_flags & TF_FASTOPEN) &&
436 (((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)) ||
437 (flags & TH_RST)))
438 len = 0;
439#endif
440 if (len <= 0) {
441 /*
442 * If FIN has been sent but not acked,
443 * but we haven't been called to retransmit,
444 * len will be < 0. Otherwise, window shrank
445 * after we sent into it. If window shrank to 0,
446 * cancel pending retransmit, pull snd_nxt back
447 * to (closed) window, and set the persist timer
448 * if it isn't already going. If the window didn't
449 * close completely, just wait for an ACK.
450 *
451 * We also do a general check here to ensure that
452 * we will set the persist timer when we have data
453 * to send, but a 0-byte window. This makes sure
454 * the persist timer is set even if the packet
455 * hits one of the "goto send" lines below.
456 */
457 len = 0;
458 if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) &&
459 (off < (int) sbavail(&so->so_snd))) {
460 tcp_timer_activate(tp, TT_REXMT, 0);
461 tp->t_rxtshift = 0;
462 tp->snd_nxt = tp->snd_una;
463 if (!tcp_timer_active(tp, TT_PERSIST))
464 tcp_setpersist(tp);
465 }
466 }
467
468 /* len will be >= 0 after this point. */
469 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
470
471 /*
472 * Automatic sizing of send socket buffer. Often the send buffer
473 * size is not optimally adjusted to the actual network conditions
474 * at hand (delay bandwidth product). Setting the buffer size too
475 * small limits throughput on links with high bandwidth and high
476 * delay (eg. trans-continental/oceanic links). Setting the
477 * buffer size too big consumes too much real kernel memory,
478 * especially with many connections on busy servers.
479 *
480 * The criteria to step up the send buffer one notch are:
481 * 1. receive window of remote host is larger than send buffer
482 * (with a fudge factor of 5/4th);
483 * 2. send buffer is filled to 7/8th with data (so we actually
484 * have data to make use of it);
485 * 3. send buffer fill has not hit maximal automatic size;
486 * 4. our send window (slow start and congestion controlled) is
487 * larger than sent but unacknowledged data in send buffer.
488 *
489 * The remote host receive window scaling factor may limit the
490 * growing of the send buffer before it reaches its allowed
491 * maximum.
492 *
493 * It scales directly with slow start or congestion window
494 * and does at most one step per received ACK. This fast
495 * scaling has the drawback of growing the send buffer beyond
496 * what is strictly necessary to make full use of a given
497 * delay*bandwidth product. However testing has shown this not
498 * to be much of a problem. At worst we are trading wasting
499 * of available bandwidth (the non-use of it) for wasting some
500 * socket buffer memory.
501 *
502 * TODO: Shrink send buffer during idle periods together
503 * with congestion window. Requires another timer. Has to
504 * wait for upcoming tcp timer rewrite.
505 *
506 * XXXGL: should there be used sbused() or sbavail()?
507 */
508 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
509 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
510 sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) &&
511 sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
512 sendwin >= (sbused(&so->so_snd) -
513 (tp->snd_nxt - tp->snd_una))) {
514 if (!sbreserve_locked(&so->so_snd,
515 min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
516 V_tcp_autosndbuf_max), so, curthread))
517 so->so_snd.sb_flags &= ~SB_AUTOSIZE;
518 }
519 }
520
521 /*
522 * Decide if we can use TCP Segmentation Offloading (if supported by
523 * hardware).
524 *
525 * TSO may only be used if we are in a pure bulk sending state. The
526 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
527 * IP options prevent using TSO. With TSO the TCP header is the same
528 * (except for the sequence number) for all generated packets. This
529 * makes it impossible to transmit any options which vary per generated
530 * segment or packet.
531 */
532#ifdef IPSEC
533 /*
534 * Pre-calculate here as we save another lookup into the darknesses
535 * of IPsec that way and can actually decide if TSO is ok.
536 */
537 ipsec_optlen = ipsec_hdrsiz_tcp(tp);
538#endif
539 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
540 ((tp->t_flags & TF_SIGNATURE) == 0) &&
541 tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
542#ifdef IPSEC
543 ipsec_optlen == 0 &&
544#endif
545 tp->t_inpcb->inp_options == NULL &&
546 tp->t_inpcb->in6p_options == NULL)
547 tso = 1;
548
549 if (sack_rxmit) {
550 if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd)))
551 flags &= ~TH_FIN;
552 } else {
553 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
554 sbused(&so->so_snd)))
555 flags &= ~TH_FIN;
556 }
557
558 recwin = sbspace(&so->so_rcv);
559
560 /*
561 * Sender silly window avoidance. We transmit under the following
562 * conditions when len is non-zero:
563 *
564 * - We have a full segment (or more with TSO)
565 * - This is the last buffer in a write()/send() and we are
566 * either idle or running NODELAY
567 * - we've timed out (e.g. persist timer)
568 * - we have more than 1/2 the maximum send window's worth of
569 * data (receiver may have limited the window size)
570 * - we need to retransmit
571 */
572 if (len) {
573 if (len >= tp->t_maxseg)
574 goto send;
575 /*
576 * NOTE! on localhost connections an 'ack' from the remote
577 * end may occur synchronously with the output and cause
578 * us to flush a buffer queued with moretocome. XXX
579 *
580 * note: the len + off check is almost certainly unnecessary.
581 */
582 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
583 (idle || (tp->t_flags & TF_NODELAY)) &&
584 len + off >= sbavail(&so->so_snd) &&
585 (tp->t_flags & TF_NOPUSH) == 0) {
586 goto send;
587 }
588 if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */
589 goto send;
590 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
591 goto send;
592 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
593 goto send;
594 if (sack_rxmit)
595 goto send;
596 }
597
598 /*
599 * Sending of standalone window updates.
600 *
601 * Window updates are important when we close our window due to a
602 * full socket buffer and are opening it again after the application
603 * reads data from it. Once the window has opened again and the
604 * remote end starts to send again the ACK clock takes over and
605 * provides the most current window information.
606 *
607 * We must avoid the silly window syndrome whereas every read
608 * from the receive buffer, no matter how small, causes a window
609 * update to be sent. We also should avoid sending a flurry of
610 * window updates when the socket buffer had queued a lot of data
611 * and the application is doing small reads.
612 *
613 * Prevent a flurry of pointless window updates by only sending
614 * an update when we can increase the advertized window by more
615 * than 1/4th of the socket buffer capacity. When the buffer is
616 * getting full or is very small be more aggressive and send an
617 * update whenever we can increase by two mss sized segments.
618 * In all other situations the ACK's to new incoming data will
619 * carry further window increases.
620 *
621 * Don't send an independent window update if a delayed
622 * ACK is pending (it will get piggy-backed on it) or the
623 * remote side already has done a half-close and won't send
624 * more data. Skip this if the connection is in T/TCP
625 * half-open state.
626 */
627 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
628 !(tp->t_flags & TF_DELACK) &&
629 !TCPS_HAVERCVDFIN(tp->t_state)) {
630 /*
631 * "adv" is the amount we could increase the window,
632 * taking into account that we are limited by
633 * TCP_MAXWIN << tp->rcv_scale.
634 */
635 long adv;
636 int oldwin;
637
638 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
639 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
640 oldwin = (tp->rcv_adv - tp->rcv_nxt);
641 adv -= oldwin;
642 } else
643 oldwin = 0;
644
645 /*
646 * If the new window size ends up being the same as the old
647 * size when it is scaled, then don't force a window update.
648 */
649 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
650 goto dontupdate;
651
652 if (adv >= (long)(2 * tp->t_maxseg) &&
653 (adv >= (long)(so->so_rcv.sb_hiwat / 4) ||
654 recwin <= (long)(so->so_rcv.sb_hiwat / 8) ||
655 so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))
656 goto send;
657 }
658dontupdate:
659
660 /*
661 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
662 * is also a catch-all for the retransmit timer timeout case.
663 */
664 if (tp->t_flags & TF_ACKNOW)
665 goto send;
666 if ((flags & TH_RST) ||
667 ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
668 goto send;
669 if (SEQ_GT(tp->snd_up, tp->snd_una))
670 goto send;
671 /*
672 * If our state indicates that FIN should be sent
673 * and we have not yet done so, then we need to send.
674 */
675 if (flags & TH_FIN &&
676 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
677 goto send;
678 /*
679 * In SACK, it is possible for tcp_output to fail to send a segment
680 * after the retransmission timer has been turned off. Make sure
681 * that the retransmission timer is set.
682 */
683 if ((tp->t_flags & TF_SACK_PERMIT) &&
684 SEQ_GT(tp->snd_max, tp->snd_una) &&
685 !tcp_timer_active(tp, TT_REXMT) &&
686 !tcp_timer_active(tp, TT_PERSIST)) {
687 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
688 goto just_return;
689 }
690 /*
691 * TCP window updates are not reliable, rather a polling protocol
692 * using ``persist'' packets is used to insure receipt of window
693 * updates. The three ``states'' for the output side are:
694 * idle not doing retransmits or persists
695 * persisting to move a small or zero window
696 * (re)transmitting and thereby not persisting
697 *
698 * tcp_timer_active(tp, TT_PERSIST)
699 * is true when we are in persist state.
700 * (tp->t_flags & TF_FORCEDATA)
701 * is set when we are called to send a persist packet.
702 * tcp_timer_active(tp, TT_REXMT)
703 * is set when we are retransmitting
704 * The output side is idle when both timers are zero.
705 *
706 * If send window is too small, there is data to transmit, and no
707 * retransmit or persist is pending, then go to persist state.
708 * If nothing happens soon, send when timer expires:
709 * if window is nonzero, transmit what we can,
710 * otherwise force out a byte.
711 */
712 if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) &&
713 !tcp_timer_active(tp, TT_PERSIST)) {
714 tp->t_rxtshift = 0;
715 tcp_setpersist(tp);
716 }
717
718 /*
719 * No reason to send a segment, just return.
720 */
721just_return:
722 SOCKBUF_UNLOCK(&so->so_snd);
723 return (0);
724
725send:
726 SOCKBUF_LOCK_ASSERT(&so->so_snd);
727 if (len > 0) {
728 if (len >= tp->t_maxseg)
729 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
730 else
731 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
732 }
733 /*
734 * Before ESTABLISHED, force sending of initial options
735 * unless TCP set not to do any options.
736 * NOTE: we assume that the IP/TCP header plus TCP options
737 * always fit in a single mbuf, leaving room for a maximum
738 * link header, i.e.
739 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
740 */
741 optlen = 0;
742#ifdef INET6
743 if (isipv6)
744 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
745 else
746#endif
747 hdrlen = sizeof (struct tcpiphdr);
748
749 /*
750 * Compute options for segment.
751 * We only have to care about SYN and established connection
752 * segments. Options for SYN-ACK segments are handled in TCP
753 * syncache.
754 */
755 to.to_flags = 0;
756 if ((tp->t_flags & TF_NOOPT) == 0) {
757 /* Maximum segment size. */
758 if (flags & TH_SYN) {
759 tp->snd_nxt = tp->iss;
760 to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
761 to.to_flags |= TOF_MSS;
762#ifdef TCP_RFC7413
763 /*
764 * Only include the TFO option on the first
765 * transmission of the SYN|ACK on a
766 * passively-created TFO socket, as the presence of
767 * the TFO option may have caused the original
768 * SYN|ACK to have been dropped by a middlebox.
769 */
770 if ((tp->t_flags & TF_FASTOPEN) &&
771 (tp->t_state == TCPS_SYN_RECEIVED) &&
772 (tp->t_rxtshift == 0)) {
773 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
774 to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
775 to.to_flags |= TOF_FASTOPEN;
776 }
777#endif
778 }
779 /* Window scaling. */
780 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
781 to.to_wscale = tp->request_r_scale;
782 to.to_flags |= TOF_SCALE;
783 }
784 /* Timestamps. */
785 if ((tp->t_flags & TF_RCVD_TSTMP) ||
786 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
787 to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
788 to.to_tsecr = tp->ts_recent;
789 to.to_flags |= TOF_TS;
790 /* Set receive buffer autosizing timestamp. */
791 if (tp->rfbuf_ts == 0 &&
792 (so->so_rcv.sb_flags & SB_AUTOSIZE))
793 tp->rfbuf_ts = tcp_ts_getticks();
794 }
795 /* Selective ACK's. */
796 if (tp->t_flags & TF_SACK_PERMIT) {
797 if (flags & TH_SYN)
798 to.to_flags |= TOF_SACKPERM;
799 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
800 (tp->t_flags & TF_SACK_PERMIT) &&
801 tp->rcv_numsacks > 0) {
802 to.to_flags |= TOF_SACK;
803 to.to_nsacks = tp->rcv_numsacks;
804 to.to_sacks = (u_char *)tp->sackblks;
805 }
806 }
807#ifdef TCP_SIGNATURE
808 /* TCP-MD5 (RFC2385). */
809 if (tp->t_flags & TF_SIGNATURE)
810 to.to_flags |= TOF_SIGNATURE;
811#endif /* TCP_SIGNATURE */
812
813 /* Processing the options. */
814 hdrlen += optlen = tcp_addoptions(&to, opt);
815 }
816
817#ifdef INET6
818 if (isipv6)
819 ipoptlen = ip6_optlen(tp->t_inpcb);
820 else
821#endif
822 if (tp->t_inpcb->inp_options)
823 ipoptlen = tp->t_inpcb->inp_options->m_len -
824 offsetof(struct ipoption, ipopt_list);
825 else
826 ipoptlen = 0;
827#ifdef IPSEC
828 ipoptlen += ipsec_optlen;
829#endif
830
831 /*
832 * Adjust data length if insertion of options will
833 * bump the packet length beyond the t_maxseg length.
834 * Clear the FIN bit because we cut off the tail of
835 * the segment.
836 */
837 if (len + optlen + ipoptlen > tp->t_maxseg) {
838 flags &= ~TH_FIN;
839
840 if (tso) {
841 u_int if_hw_tsomax;
842 u_int if_hw_tsomaxsegcount;
843 u_int if_hw_tsomaxsegsize;
844 struct mbuf *mb;
845 u_int moff;
846 int max_len;
847
848 /* extract TSO information */
849 if_hw_tsomax = tp->t_tsomax;
850 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
851 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
852
853 /*
854 * Limit a TSO burst to prevent it from
855 * overflowing or exceeding the maximum length
856 * allowed by the network interface:
857 */
858 KASSERT(ipoptlen == 0,
859 ("%s: TSO can't do IP options", __func__));
860
861 /*
862 * Check if we should limit by maximum payload
863 * length:
864 */
865 if (if_hw_tsomax != 0) {
866 /* compute maximum TSO length */
867 max_len = (if_hw_tsomax - hdrlen -
868 max_linkhdr);
869 if (max_len <= 0) {
870 len = 0;
871 } else if (len > max_len) {
872 sendalot = 1;
873 len = max_len;
874 }
875 }
876
877 /*
878 * Check if we should limit by maximum segment
879 * size and count:
880 */
881 if (if_hw_tsomaxsegcount != 0 &&
882 if_hw_tsomaxsegsize != 0) {
883 /*
884 * Subtract one segment for the LINK
885 * and TCP/IP headers mbuf that will
886 * be prepended to this mbuf chain
887 * after the code in this section
888 * limits the number of mbufs in the
889 * chain to if_hw_tsomaxsegcount.
890 */
891 if_hw_tsomaxsegcount -= 1;
892 max_len = 0;
893 mb = sbsndmbuf(&so->so_snd, off, &moff);
894
895 while (mb != NULL && max_len < len) {
896 u_int mlen;
897 u_int frags;
898
899 /*
900 * Get length of mbuf fragment
901 * and how many hardware frags,
902 * rounded up, it would use:
903 */
904 mlen = (mb->m_len - moff);
905 frags = howmany(mlen,
906 if_hw_tsomaxsegsize);
907
908 /* Handle special case: Zero Length Mbuf */
909 if (frags == 0)
910 frags = 1;
911
912 /*
913 * Check if the fragment limit
914 * will be reached or exceeded:
915 */
916 if (frags >= if_hw_tsomaxsegcount) {
917 max_len += min(mlen,
918 if_hw_tsomaxsegcount *
919 if_hw_tsomaxsegsize);
920 break;
921 }
922 max_len += mlen;
923 if_hw_tsomaxsegcount -= frags;
924 moff = 0;
925 mb = mb->m_next;
926 }
927 if (max_len <= 0) {
928 len = 0;
929 } else if (len > max_len) {
930 sendalot = 1;
931 len = max_len;
932 }
933 }
934
935 /*
936 * Prevent the last segment from being
937 * fractional unless the send sockbuf can be
938 * emptied:
939 */
940 max_len = (tp->t_maxseg - optlen);
941 if ((off + len) < sbavail(&so->so_snd)) {
942 moff = len % max_len;
943 if (moff != 0) {
944 len -= moff;
945 sendalot = 1;
946 }
947 }
948
949 /*
950 * In case there are too many small fragments
951 * don't use TSO:
952 */
953 if (len <= max_len) {
954 len = max_len;
955 sendalot = 1;
956 tso = 0;
957 }
958
959 /*
960 * Send the FIN in a separate segment
961 * after the bulk sending is done.
962 * We don't trust the TSO implementations
963 * to clear the FIN flag on all but the
964 * last segment.
965 */
966 if (tp->t_flags & TF_NEEDFIN)
967 sendalot = 1;
968
969 } else {
970 len = tp->t_maxseg - optlen - ipoptlen;
971 sendalot = 1;
972 }
973 } else
974 tso = 0;
975
976 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
977 ("%s: len > IP_MAXPACKET", __func__));
978
979/*#ifdef DIAGNOSTIC*/
980#ifdef INET6
981 if (max_linkhdr + hdrlen > MCLBYTES)
982#else
983 if (max_linkhdr + hdrlen > MHLEN)
984#endif
985 panic("tcphdr too big");
986/*#endif*/
987
988 /*
989 * This KASSERT is here to catch edge cases at a well defined place.
990 * Before, those had triggered (random) panic conditions further down.
991 */
992 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
993
994 /*
995 * Grab a header mbuf, attaching a copy of data to
996 * be transmitted, and initialize the header from
997 * the template for sends on this connection.
998 */
999 if (len) {
1000 struct mbuf *mb;
1001 u_int moff;
1002
1003 if ((tp->t_flags & TF_FORCEDATA) && len == 1)
1004 TCPSTAT_INC(tcps_sndprobe);
1005 else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
1006 tp->t_sndrexmitpack++;
1007 TCPSTAT_INC(tcps_sndrexmitpack);
1008 TCPSTAT_ADD(tcps_sndrexmitbyte, len);
1009 } else {
1010 TCPSTAT_INC(tcps_sndpack);
1011 TCPSTAT_ADD(tcps_sndbyte, len);
1012 }
1013#ifdef INET6
1014 if (MHLEN < hdrlen + max_linkhdr)
1015 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
1016 else
1017#endif
1018 m = m_gethdr(M_NOWAIT, MT_DATA);
1019
1020 if (m == NULL) {
1021 SOCKBUF_UNLOCK(&so->so_snd);
1022 error = ENOBUFS;
1023 sack_rxmit = 0;
1024 goto out;
1025 }
1026
1027 m->m_data += max_linkhdr;
1028 m->m_len = hdrlen;
1029
1030 /*
1031 * Start the m_copy functions from the closest mbuf
1032 * to the offset in the socket buffer chain.
1033 */
1034 mb = sbsndptr(&so->so_snd, off, len, &moff);
1035
1036 if (len <= MHLEN - hdrlen - max_linkhdr) {
1037 m_copydata(mb, moff, (int)len,
1038 mtod(m, caddr_t) + hdrlen);
1039 m->m_len += len;
1040 } else {
1041 m->m_next = m_copy(mb, moff, (int)len);
1042 if (m->m_next == NULL) {
1043 SOCKBUF_UNLOCK(&so->so_snd);
1044 (void) m_free(m);
1045 error = ENOBUFS;
1046 sack_rxmit = 0;
1047 goto out;
1048 }
1049 }
1050
1051 /*
1052 * If we're sending everything we've got, set PUSH.
1053 * (This will keep happy those implementations which only
1054 * give data to the user when a buffer fills or
1055 * a PUSH comes in.)
1056 */
1057 if ((off + len == sbused(&so->so_snd)) && !(flags & TH_SYN))
1058 flags |= TH_PUSH;
1059 SOCKBUF_UNLOCK(&so->so_snd);
1060 } else {
1061 SOCKBUF_UNLOCK(&so->so_snd);
1062 if (tp->t_flags & TF_ACKNOW)
1063 TCPSTAT_INC(tcps_sndacks);
1064 else if (flags & (TH_SYN|TH_FIN|TH_RST))
1065 TCPSTAT_INC(tcps_sndctrl);
1066 else if (SEQ_GT(tp->snd_up, tp->snd_una))
1067 TCPSTAT_INC(tcps_sndurg);
1068 else
1069 TCPSTAT_INC(tcps_sndwinup);
1070
1071 m = m_gethdr(M_NOWAIT, MT_DATA);
1072 if (m == NULL) {
1073 error = ENOBUFS;
1074 sack_rxmit = 0;
1075 goto out;
1076 }
1077#ifdef INET6
1078 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
1079 MHLEN >= hdrlen) {
1080 M_ALIGN(m, hdrlen);
1081 } else
1082#endif
1083 m->m_data += max_linkhdr;
1084 m->m_len = hdrlen;
1085 }
1086 SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
1087 m->m_pkthdr.rcvif = (struct ifnet *)0;
1088#ifdef MAC
1089 mac_inpcb_create_mbuf(tp->t_inpcb, m);
1090#endif
1091#ifdef INET6
1092 if (isipv6) {
1093 ip6 = mtod(m, struct ip6_hdr *);
1094 th = (struct tcphdr *)(ip6 + 1);
1095 tcpip_fillheaders(tp->t_inpcb, ip6, th);
1096 } else
1097#endif /* INET6 */
1098 {
1099 ip = mtod(m, struct ip *);
1100 ipov = (struct ipovly *)ip;
1101 th = (struct tcphdr *)(ip + 1);
1102 tcpip_fillheaders(tp->t_inpcb, ip, th);
1103 }
1104
1105 /*
1106 * Fill in fields, remembering maximum advertised
1107 * window for use in delaying messages about window sizes.
1108 * If resending a FIN, be sure not to use a new sequence number.
1109 */
1110 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
1111 tp->snd_nxt == tp->snd_max)
1112 tp->snd_nxt--;
1113 /*
1114 * If we are starting a connection, send ECN setup
1115 * SYN packet. If we are on a retransmit, we may
1116 * resend those bits a number of times as per
1117 * RFC 3168.
1118 */
1119 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
1120 if (tp->t_rxtshift >= 1) {
1121 if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
1122 flags |= TH_ECE|TH_CWR;
1123 } else
1124 flags |= TH_ECE|TH_CWR;
1125 }
1126
1127 if (tp->t_state == TCPS_ESTABLISHED &&
1128 (tp->t_flags & TF_ECN_PERMIT)) {
1129 /*
1130 * If the peer has ECN, mark data packets with
1131 * ECN capable transmission (ECT).
1132 * Ignore pure ack packets, retransmissions and window probes.
1133 */
1134 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
1135 !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
1136#ifdef INET6
1137 if (isipv6)
1138 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
1139 else
1140#endif
1141 ip->ip_tos |= IPTOS_ECN_ECT0;
1142 TCPSTAT_INC(tcps_ecn_ect0);
1143 }
1144
1145 /*
1146 * Reply with proper ECN notifications.
1147 */
1148 if (tp->t_flags & TF_ECN_SND_CWR) {
1149 flags |= TH_CWR;
1150 tp->t_flags &= ~TF_ECN_SND_CWR;
1151 }
1152 if (tp->t_flags & TF_ECN_SND_ECE)
1153 flags |= TH_ECE;
1154 }
1155
1156 /*
1157 * If we are doing retransmissions, then snd_nxt will
1158 * not reflect the first unsent octet. For ACK only
1159 * packets, we do not want the sequence number of the
1160 * retransmitted packet, we want the sequence number
1161 * of the next unsent octet. So, if there is no data
1162 * (and no SYN or FIN), use snd_max instead of snd_nxt
1163 * when filling in ti_seq. But if we are in persist
1164 * state, snd_max might reflect one byte beyond the
1165 * right edge of the window, so use snd_nxt in that
1166 * case, since we know we aren't doing a retransmission.
1167 * (retransmit and persist are mutually exclusive...)
1168 */
1169 if (sack_rxmit == 0) {
1170 if (len || (flags & (TH_SYN|TH_FIN)) ||
1171 tcp_timer_active(tp, TT_PERSIST))
1172 th->th_seq = htonl(tp->snd_nxt);
1173 else
1174 th->th_seq = htonl(tp->snd_max);
1175 } else {
1176 th->th_seq = htonl(p->rxmit);
1177 p->rxmit += len;
1178 tp->sackhint.sack_bytes_rexmit += len;
1179 }
1180 th->th_ack = htonl(tp->rcv_nxt);
1181 if (optlen) {
1182 bcopy(opt, th + 1, optlen);
1183 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1184 }
1185 th->th_flags = flags;
1186 /*
1187 * Calculate receive window. Don't shrink window,
1188 * but avoid silly window syndrome.
1189 */
1190 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
1191 recwin < (long)tp->t_maxseg)
1192 recwin = 0;
1193 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
1194 recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
1195 recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
1196 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
1197 recwin = (long)TCP_MAXWIN << tp->rcv_scale;
1198
1199 /*
1200 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1201 * or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK>
1202 * case is handled in syncache.
1203 */
1204 if (flags & TH_SYN)
1205 th->th_win = htons((u_short)
1206 (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
1207 else
1208 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
1209
1210 /*
1211 * Adjust the RXWIN0SENT flag - indicate that we have advertised
1212 * a 0 window. This may cause the remote transmitter to stall. This
1213 * flag tells soreceive() to disable delayed acknowledgements when
1214 * draining the buffer. This can occur if the receiver is attempting
1215 * to read more data than can be buffered prior to transmitting on
1216 * the connection.
1217 */
1218 if (th->th_win == 0) {
1219 tp->t_sndzerowin++;
1220 tp->t_flags |= TF_RXWIN0SENT;
1221 } else
1222 tp->t_flags &= ~TF_RXWIN0SENT;
1223 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1224 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
1225 th->th_flags |= TH_URG;
1226 } else
1227 /*
1228 * If no urgent pointer to send, then we pull
1229 * the urgent pointer to the left edge of the send window
1230 * so that it doesn't drift into the send window on sequence
1231 * number wraparound.
1232 */
1233 tp->snd_up = tp->snd_una; /* drag it along */
1234
1235#ifdef TCP_SIGNATURE
1236 if (to.to_flags & TOF_SIGNATURE) {
1237 int sigoff = to.to_signature - opt;
1238 tcp_signature_compute(m, 0, len, optlen,
1239 (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
1240 }
1241#endif
1242
1243 /*
1244 * Put TCP length in extended header, and then
1245 * checksum extended header and data.
1246 */
1247 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
1248 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1249#ifdef INET6
1250 if (isipv6) {
1251 /*
1252 * ip6_plen does not need to be filled in now; it will be filled
1253 * in ip6_output().
1254 */
1255 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
1256 th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
1257 optlen + len, IPPROTO_TCP, 0);
1258 }
1259#endif
1260#if defined(INET6) && defined(INET)
1261 else
1262#endif
1263#ifdef INET
1264 {
1265 m->m_pkthdr.csum_flags = CSUM_TCP;
1266 th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1267 htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
1268
1269 /* IP version must be set here for ipv4/ipv6 checking later */
1270 KASSERT(ip->ip_v == IPVERSION,
1271 ("%s: IP version incorrect: %d", __func__, ip->ip_v));
1272 }
1273#endif
1274
1275 /*
1276 * Enable TSO and specify the size of the segments.
1277 * The TCP pseudo header checksum is always provided.
1278 */
1279 if (tso) {
1280 KASSERT(len > tp->t_maxseg - optlen,
1281 ("%s: len <= tso_segsz", __func__));
1282 m->m_pkthdr.csum_flags |= CSUM_TSO;
1283 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
1284 }
1285
1286#ifdef IPSEC
1287 KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
1288 ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u",
1289 __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
1290#else
1291 KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
1292 ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u",
1293 __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
1294#endif
1295
1296 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
1297 hhook_run_tcp_est_out(tp, th, &to, len, tso);
1298
1299#ifdef TCPDEBUG
1300 /*
1301 * Trace.
1302 */
1303 if (so->so_options & SO_DEBUG) {
1304 u_short save = 0;
1305#ifdef INET6
1306 if (!isipv6)
1307#endif
1308 {
1309 save = ipov->ih_len;
1310 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
1311 }
1312 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
1313#ifdef INET6
1314 if (!isipv6)
1315#endif
1316 ipov->ih_len = save;
1317 }
1318#endif /* TCPDEBUG */
1319 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
1320
1321 /*
1322 * Fill in IP length and desired time to live and
1323 * send to IP level. There should be a better way
1324 * to handle ttl and tos; we could keep them in
1325 * the template, but need a way to checksum without them.
1326 */
1327 /*
1328 * m->m_pkthdr.len should have been set before checksum calculation,
1329 * because in6_cksum() need it.
1330 */
1331#ifdef INET6
1332 if (isipv6) {
1333 struct route_in6 ro;
1334
1335 bzero(&ro, sizeof(ro));
1336 /*
1337 * we separately set hoplimit for every segment, since the
1338 * user might want to change the value via setsockopt.
1339 * Also, desired default hop limit might be changed via
1340 * Neighbor Discovery.
1341 */
1342 ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
1343
1344 /*
1345 * Set the packet size here for the benefit of DTrace probes.
1346 * ip6_output() will set it properly; it's supposed to include
1347 * the option header lengths as well.
1348 */
1349 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
1350
1351 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
1352 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
1353 else
1354 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
1355
1356 if (tp->t_state == TCPS_SYN_SENT)
1357 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
1358
1359 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
1360
1361#ifdef TCPPCAP
1362 /* Save packet, if requested. */
1363 tcp_pcap_add(th, m, &(tp->t_outpkts));
1364#endif
1365
1366 /* TODO: IPv6 IP6TOS_ECT bit on */
1367 error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro,
1368 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
1369 NULL, NULL, tp->t_inpcb);
1370
1371 if (error == EMSGSIZE && ro.ro_rt != NULL)
1372 mtu = ro.ro_rt->rt_mtu;
1373 RO_RTFREE(&ro);
1374 }
1375#endif /* INET6 */
1376#if defined(INET) && defined(INET6)
1377 else
1378#endif
1379#ifdef INET
1380 {
1381 struct route ro;
1382
1383 bzero(&ro, sizeof(ro));
1384 ip->ip_len = htons(m->m_pkthdr.len);
1385#ifdef INET6
1386 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
1387 ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
1388#endif /* INET6 */
1389 /*
1390 * If we do path MTU discovery, then we set DF on every packet.
1391 * This might not be the best thing to do according to RFC3390
1392 * Section 2. However the tcp hostcache mitigates the problem
1393 * so it affects only the first tcp connection with a host.
1394 *
1395 * NB: Don't set DF on small MTU/MSS to have a safe fallback.
1396 */
1397 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
1398 ip->ip_off |= htons(IP_DF);
1399 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
1400 } else {
1401 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
1402 }
1403
1404 if (tp->t_state == TCPS_SYN_SENT)
1405 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
1406
1407 TCP_PROBE5(send, NULL, tp, ip, tp, th);
1408
1409#ifdef TCPPCAP
1410 /* Save packet, if requested. */
1411 tcp_pcap_add(th, m, &(tp->t_outpkts));
1412#endif
1413
1414 error = ip_output(m, tp->t_inpcb->inp_options, &ro,
1415 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
1416 tp->t_inpcb);
1417
1418 if (error == EMSGSIZE && ro.ro_rt != NULL)
1419 mtu = ro.ro_rt->rt_mtu;
1420 RO_RTFREE(&ro);
1421 }
1422#endif /* INET */
1423
1424out:
1425 /*
1426 * In transmit state, time the transmission and arrange for
1427 * the retransmit. In persist state, just set snd_max.
1428 */
1429 if ((tp->t_flags & TF_FORCEDATA) == 0 ||
1430 !tcp_timer_active(tp, TT_PERSIST)) {
1431 tcp_seq startseq = tp->snd_nxt;
1432
1433 /*
1434 * Advance snd_nxt over sequence space of this segment.
1435 */
1436 if (flags & (TH_SYN|TH_FIN)) {
1437 if (flags & TH_SYN)
1438 tp->snd_nxt++;
1439 if (flags & TH_FIN) {
1440 tp->snd_nxt++;
1441 tp->t_flags |= TF_SENTFIN;
1442 }
1443 }
1444 if (sack_rxmit)
1445 goto timer;
1446 tp->snd_nxt += len;
1447 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1448 tp->snd_max = tp->snd_nxt;
1449 /*
1450 * Time this transmission if not a retransmission and
1451 * not currently timing anything.
1452 */
1453 if (tp->t_rtttime == 0) {
1454 tp->t_rtttime = ticks;
1455 tp->t_rtseq = startseq;
1456 TCPSTAT_INC(tcps_segstimed);
1457 }
1458 }
1459
1460 /*
1461 * Set retransmit timer if not currently set,
1462 * and not doing a pure ack or a keep-alive probe.
1463 * Initial value for retransmit timer is smoothed
1464 * round-trip time + 2 * round-trip time variance.
1465 * Initialize shift counter which is used for backoff
1466 * of retransmit time.
1467 */
1468timer:
1469 if (!tcp_timer_active(tp, TT_REXMT) &&
1470 ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
1471 (tp->snd_nxt != tp->snd_una))) {
1472 if (tcp_timer_active(tp, TT_PERSIST)) {
1473 tcp_timer_activate(tp, TT_PERSIST, 0);
1474 tp->t_rxtshift = 0;
1475 }
1476 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1477 } else if (len == 0 && sbavail(&so->so_snd) &&
1478 !tcp_timer_active(tp, TT_REXMT) &&
1479 !tcp_timer_active(tp, TT_PERSIST)) {
1480 /*
1481 * Avoid a situation where we do not set persist timer
1482 * after a zero window condition. For example:
1483 * 1) A -> B: packet with enough data to fill the window
1484 * 2) B -> A: ACK for #1 + new data (0 window
1485 * advertisement)
1486 * 3) A -> B: ACK for #2, 0 len packet
1487 *
1488 * In this case, A will not activate the persist timer,
1489 * because it chose to send a packet. Unless tcp_output
1490 * is called for some other reason (delayed ack timer,
1491 * another input packet from B, socket syscall), A will
1492 * not send zero window probes.
1493 *
1494 * So, if you send a 0-length packet, but there is data
1495 * in the socket buffer, and neither the rexmt or
1496 * persist timer is already set, then activate the
1497 * persist timer.
1498 */
1499 tp->t_rxtshift = 0;
1500 tcp_setpersist(tp);
1501 }
1502 } else {
1503 /*
1504 * Persist case, update snd_max but since we are in
1505 * persist mode (no window) we do not update snd_nxt.
1506 */
1507 int xlen = len;
1508 if (flags & TH_SYN)
1509 ++xlen;
1510 if (flags & TH_FIN) {
1511 ++xlen;
1512 tp->t_flags |= TF_SENTFIN;
1513 }
1514 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
1515 tp->snd_max = tp->snd_nxt + len;
1516 }
1517
1518 if (error) {
1519
1520 /*
1521 * We know that the packet was lost, so back out the
1522 * sequence number advance, if any.
1523 *
1524 * If the error is EPERM the packet got blocked by the
1525 * local firewall. Normally we should terminate the
1526 * connection but the blocking may have been spurious
1527 * due to a firewall reconfiguration cycle. So we treat
1528 * it like a packet loss and let the retransmit timer and
1529 * timeouts do their work over time.
1530 * XXX: It is a POLA question whether calling tcp_drop right
1531 * away would be the really correct behavior instead.
1532 */
1533 if (((tp->t_flags & TF_FORCEDATA) == 0 ||
1534 !tcp_timer_active(tp, TT_PERSIST)) &&
1535 ((flags & TH_SYN) == 0) &&
1536 (error != EPERM)) {
1537 if (sack_rxmit) {
1538 p->rxmit -= len;
1539 tp->sackhint.sack_bytes_rexmit -= len;
1540 KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
1541 ("sackhint bytes rtx >= 0"));
1542 } else
1543 tp->snd_nxt -= len;
1544 }
1545 SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */
1546 switch (error) {
1547 case EPERM:
1548 tp->t_softerror = error;
1549 return (error);
1550 case ENOBUFS:
1551 if (!tcp_timer_active(tp, TT_REXMT) &&
1552 !tcp_timer_active(tp, TT_PERSIST))
1553 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1554 tp->snd_cwnd = tp->t_maxseg;
1555 return (0);
1556 case EMSGSIZE:
1557 /*
1558 * For some reason the interface we used initially
1559 * to send segments changed to another or lowered
1560 * its MTU.
1561 * If TSO was active we either got an interface
1562 * without TSO capabilities or TSO was turned off.
1563 * If we obtained mtu from ip_output() then update
1564 * it and try again.
1565 */
1566 if (tso)
1567 tp->t_flags &= ~TF_TSO;
1568 if (mtu != 0) {
1569 tcp_mss_update(tp, -1, mtu, NULL, NULL);
1570 goto again;
1571 }
1572 return (error);
1573 case EHOSTDOWN:
1574 case EHOSTUNREACH:
1575 case ENETDOWN:
1576 case ENETUNREACH:
1577 if (TCPS_HAVERCVDSYN(tp->t_state)) {
1578 tp->t_softerror = error;
1579 return (0);
1580 }
1581 /* FALLTHROUGH */
1582 default:
1583 return (error);
1584 }
1585 }
1586 TCPSTAT_INC(tcps_sndtotal);
1587
1588 /*
1589 * Data sent (as far as we can tell).
1590 * If this advertises a larger window than any other segment,
1591 * then remember the size of the advertised window.
1592 * Any pending ACK has now been sent.
1593 */
1594 if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
1595 tp->rcv_adv = tp->rcv_nxt + recwin;
1596 tp->last_ack_sent = tp->rcv_nxt;
1597 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
1598 if (tcp_timer_active(tp, TT_DELACK))
1599 tcp_timer_activate(tp, TT_DELACK, 0);
1600#if 0
1601 /*
1602 * This completely breaks TCP if newreno is turned on. What happens
1603 * is that if delayed-acks are turned on on the receiver, this code
1604 * on the transmitter effectively destroys the TCP window, forcing
1605 * it to four packets (1.5Kx4 = 6K window).
1606 */
1607 if (sendalot && --maxburst)
1608 goto again;
1609#endif
1610 if (sendalot)
1611 goto again;
1612 return (0);
1613}
1614
1615void
1616tcp_setpersist(struct tcpcb *tp)
1617{
1618 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1619 int tt;
1620
1621 tp->t_flags &= ~TF_PREVVALID;
1622 if (tcp_timer_active(tp, TT_REXMT))
1623 panic("tcp_setpersist: retransmit pending");
1624 /*
1625 * Start/restart persistance timer.
1626 */
1627 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
1628 TCPTV_PERSMIN, TCPTV_PERSMAX);
1629 tcp_timer_activate(tp, TT_PERSIST, tt);
1630 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1631 tp->t_rxtshift++;
1632}
1633
1634/*
1635 * Insert TCP options according to the supplied parameters to the place
1636 * optp in a consistent way. Can handle unaligned destinations.
1637 *
1638 * The order of the option processing is crucial for optimal packing and
1639 * alignment for the scarce option space.
1640 *
1641 * The optimal order for a SYN/SYN-ACK segment is:
1642 * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
1643 * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
1644 *
1645 * The SACK options should be last. SACK blocks consume 8*n+2 bytes.
1646 * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
1647 * At minimum we need 10 bytes (to generate 1 SACK block). If both
1648 * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
1649 * we only have 10 bytes for SACK options (40 - (12 + 18)).
1650 */
1651int
1652tcp_addoptions(struct tcpopt *to, u_char *optp)
1653{
1654 u_int mask, optlen = 0;
1655
1656 for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
1657 if ((to->to_flags & mask) != mask)
1658 continue;
1659 if (optlen == TCP_MAXOLEN)
1660 break;
1661 switch (to->to_flags & mask) {
1662 case TOF_MSS:
1663 while (optlen % 4) {
1664 optlen += TCPOLEN_NOP;
1665 *optp++ = TCPOPT_NOP;
1666 }
1667 if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
1668 continue;
1669 optlen += TCPOLEN_MAXSEG;
1670 *optp++ = TCPOPT_MAXSEG;
1671 *optp++ = TCPOLEN_MAXSEG;
1672 to->to_mss = htons(to->to_mss);
1673 bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
1674 optp += sizeof(to->to_mss);
1675 break;
1676 case TOF_SCALE:
1677 while (!optlen || optlen % 2 != 1) {
1678 optlen += TCPOLEN_NOP;
1679 *optp++ = TCPOPT_NOP;
1680 }
1681 if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
1682 continue;
1683 optlen += TCPOLEN_WINDOW;
1684 *optp++ = TCPOPT_WINDOW;
1685 *optp++ = TCPOLEN_WINDOW;
1686 *optp++ = to->to_wscale;
1687 break;
1688 case TOF_SACKPERM:
1689 while (optlen % 2) {
1690 optlen += TCPOLEN_NOP;
1691 *optp++ = TCPOPT_NOP;
1692 }
1693 if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
1694 continue;
1695 optlen += TCPOLEN_SACK_PERMITTED;
1696 *optp++ = TCPOPT_SACK_PERMITTED;
1697 *optp++ = TCPOLEN_SACK_PERMITTED;
1698 break;
1699 case TOF_TS:
1700 while (!optlen || optlen % 4 != 2) {
1701 optlen += TCPOLEN_NOP;
1702 *optp++ = TCPOPT_NOP;
1703 }
1704 if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
1705 continue;
1706 optlen += TCPOLEN_TIMESTAMP;
1707 *optp++ = TCPOPT_TIMESTAMP;
1708 *optp++ = TCPOLEN_TIMESTAMP;
1709 to->to_tsval = htonl(to->to_tsval);
1710 to->to_tsecr = htonl(to->to_tsecr);
1711 bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
1712 optp += sizeof(to->to_tsval);
1713 bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
1714 optp += sizeof(to->to_tsecr);
1715 break;
1716#ifdef TCP_SIGNATURE
1717 case TOF_SIGNATURE:
1718 {
1719 int siglen = TCPOLEN_SIGNATURE - 2;
1720
1721 while (!optlen || optlen % 4 != 2) {
1722 optlen += TCPOLEN_NOP;
1723 *optp++ = TCPOPT_NOP;
1724 }
1725 if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE)
1726 continue;
1727 optlen += TCPOLEN_SIGNATURE;
1728 *optp++ = TCPOPT_SIGNATURE;
1729 *optp++ = TCPOLEN_SIGNATURE;
1730 to->to_signature = optp;
1731 while (siglen--)
1732 *optp++ = 0;
1733 break;
1734 }
1735#endif
1736 case TOF_SACK:
1737 {
1738 int sackblks = 0;
1739 struct sackblk *sack = (struct sackblk *)to->to_sacks;
1740 tcp_seq sack_seq;
1741
1742 while (!optlen || optlen % 4 != 2) {
1743 optlen += TCPOLEN_NOP;
1744 *optp++ = TCPOPT_NOP;
1745 }
1746 if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
1747 continue;
1748 optlen += TCPOLEN_SACKHDR;
1749 *optp++ = TCPOPT_SACK;
1750 sackblks = min(to->to_nsacks,
1751 (TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
1752 *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
1753 while (sackblks--) {
1754 sack_seq = htonl(sack->start);
1755 bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
1756 optp += sizeof(sack_seq);
1757 sack_seq = htonl(sack->end);
1758 bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
1759 optp += sizeof(sack_seq);
1760 optlen += TCPOLEN_SACK;
1761 sack++;
1762 }
1763 TCPSTAT_INC(tcps_sack_send_blocks);
1764 break;
1765 }
1766#ifdef TCP_RFC7413
1767 case TOF_FASTOPEN:
1768 {
1769 int total_len;
1770
1771 /* XXX is there any point to aligning this option? */
1772 total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len;
1773 if (TCP_MAXOLEN - optlen < total_len)
1774 continue;
1775 *optp++ = TCPOPT_FAST_OPEN;
1776 *optp++ = total_len;
1777 if (to->to_tfo_len > 0) {
1778 bcopy(to->to_tfo_cookie, optp, to->to_tfo_len);
1779 optp += to->to_tfo_len;
1780 }
1781 optlen += total_len;
1782 break;
1783 }
1784#endif
1785 default:
1786 panic("%s: unknown TCP option type", __func__);
1787 break;
1788 }
1789 }
1790
1791 /* Terminate and pad TCP options to a 4 byte boundary. */
1792 if (optlen % 4) {
1793 optlen += TCPOLEN_EOL;
1794 *optp++ = TCPOPT_EOL;
1795 }
1796 /*
1797 * According to RFC 793 (STD0007):
1798 * "The content of the header beyond the End-of-Option option
1799 * must be header padding (i.e., zero)."
1800 * and later: "The padding is composed of zeros."
1801 */
1802 while (optlen % 4) {
1803 optlen += TCPOLEN_PAD;
1804 *optp++ = TCPOPT_PAD;
1805 }
1806
1807 KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
1808 return (optlen);
1809}
81#ifdef TCPPCAP
82#include <netinet/tcp_pcap.h>
83#endif
84#ifdef TCPDEBUG
85#include <netinet/tcp_debug.h>
86#endif
87#ifdef TCP_OFFLOAD
88#include <netinet/tcp_offload.h>
89#endif
90
91#ifdef IPSEC
92#include <netipsec/ipsec.h>
93#endif /*IPSEC*/
94
95#include <machine/in_cksum.h>
96
97#include <security/mac/mac_framework.h>
98
99VNET_DEFINE(int, path_mtu_discovery) = 1;
100SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW,
101 &VNET_NAME(path_mtu_discovery), 1,
102 "Enable Path MTU Discovery");
103
104VNET_DEFINE(int, tcp_do_tso) = 1;
105#define V_tcp_do_tso VNET(tcp_do_tso)
106SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
107 &VNET_NAME(tcp_do_tso), 0,
108 "Enable TCP Segmentation Offload");
109
110VNET_DEFINE(int, tcp_sendspace) = 1024*32;
111#define V_tcp_sendspace VNET(tcp_sendspace)
112SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW,
113 &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
114
115VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
116#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
117SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
118 &VNET_NAME(tcp_do_autosndbuf), 0,
119 "Enable automatic send buffer sizing");
120
121VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
122#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
123SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
124 &VNET_NAME(tcp_autosndbuf_inc), 0,
125 "Incrementor step size of automatic send buffer");
126
127VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
128#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
129SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
130 &VNET_NAME(tcp_autosndbuf_max), 0,
131 "Max size of automatic send buffer");
132
133static void inline hhook_run_tcp_est_out(struct tcpcb *tp,
134 struct tcphdr *th, struct tcpopt *to,
135 long len, int tso);
136static void inline cc_after_idle(struct tcpcb *tp);
137
138/*
139 * Wrapper for the TCP established output helper hook.
140 */
141static void inline
142hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
143 struct tcpopt *to, long len, int tso)
144{
145 struct tcp_hhook_data hhook_data;
146
147 if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
148 hhook_data.tp = tp;
149 hhook_data.th = th;
150 hhook_data.to = to;
151 hhook_data.len = len;
152 hhook_data.tso = tso;
153
154 hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
155 tp->osd);
156 }
157}
158
159/*
160 * CC wrapper hook functions
161 */
162static void inline
163cc_after_idle(struct tcpcb *tp)
164{
165 INP_WLOCK_ASSERT(tp->t_inpcb);
166
167 if (CC_ALGO(tp)->after_idle != NULL)
168 CC_ALGO(tp)->after_idle(tp->ccv);
169}
170
171/*
172 * Tcp output routine: figure out what should be sent and send it.
173 */
174int
175tcp_output(struct tcpcb *tp)
176{
177 struct socket *so = tp->t_inpcb->inp_socket;
178 long len, recwin, sendwin;
179 int off, flags, error = 0; /* Keep compiler happy */
180 struct mbuf *m;
181 struct ip *ip = NULL;
182 struct ipovly *ipov = NULL;
183 struct tcphdr *th;
184 u_char opt[TCP_MAXOLEN];
185 unsigned ipoptlen, optlen, hdrlen;
186#ifdef IPSEC
187 unsigned ipsec_optlen = 0;
188#endif
189 int idle, sendalot;
190 int sack_rxmit, sack_bytes_rxmt;
191 struct sackhole *p;
192 int tso, mtu;
193 struct tcpopt to;
194#if 0
195 int maxburst = TCP_MAXBURST;
196#endif
197#ifdef INET6
198 struct ip6_hdr *ip6 = NULL;
199 int isipv6;
200
201 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
202#endif
203
204 INP_WLOCK_ASSERT(tp->t_inpcb);
205
206#ifdef TCP_OFFLOAD
207 if (tp->t_flags & TF_TOE)
208 return (tcp_offload_output(tp));
209#endif
210
211#ifdef TCP_RFC7413
212 /*
213 * For TFO connections in SYN_RECEIVED, only allow the initial
214 * SYN|ACK and those sent by the retransmit timer.
215 */
216 if ((tp->t_flags & TF_FASTOPEN) &&
217 (tp->t_state == TCPS_SYN_RECEIVED) &&
218 SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */
219 (tp->snd_nxt != tp->snd_una)) /* not a retransmit */
220 return (0);
221#endif
222 /*
223 * Determine length of data that should be transmitted,
224 * and flags that will be used.
225 * If there is some data or critical controls (SYN, RST)
226 * to send, then transmit; otherwise, investigate further.
227 */
228 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
229 if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
230 cc_after_idle(tp);
231 tp->t_flags &= ~TF_LASTIDLE;
232 if (idle) {
233 if (tp->t_flags & TF_MORETOCOME) {
234 tp->t_flags |= TF_LASTIDLE;
235 idle = 0;
236 }
237 }
238again:
239 /*
240 * If we've recently taken a timeout, snd_max will be greater than
241 * snd_nxt. There may be SACK information that allows us to avoid
242 * resending already delivered data. Adjust snd_nxt accordingly.
243 */
244 if ((tp->t_flags & TF_SACK_PERMIT) &&
245 SEQ_LT(tp->snd_nxt, tp->snd_max))
246 tcp_sack_adjust(tp);
247 sendalot = 0;
248 tso = 0;
249 mtu = 0;
250 off = tp->snd_nxt - tp->snd_una;
251 sendwin = min(tp->snd_wnd, tp->snd_cwnd);
252
253 flags = tcp_outflags[tp->t_state];
254 /*
255 * Send any SACK-generated retransmissions. If we're explicitly trying
256 * to send out new data (when sendalot is 1), bypass this function.
257 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
258 * we're replacing a (future) new transmission with a retransmission
259 * now, and we previously incremented snd_cwnd in tcp_input().
260 */
261 /*
262 * Still in sack recovery, reset rxmit flag to zero.
263 */
264 sack_rxmit = 0;
265 sack_bytes_rxmt = 0;
266 len = 0;
267 p = NULL;
268 if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
269 (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
270 long cwin;
271
272 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
273 if (cwin < 0)
274 cwin = 0;
275 /* Do not retransmit SACK segments beyond snd_recover */
276 if (SEQ_GT(p->end, tp->snd_recover)) {
277 /*
278 * (At least) part of sack hole extends beyond
279 * snd_recover. Check to see if we can rexmit data
280 * for this hole.
281 */
282 if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
283 /*
284 * Can't rexmit any more data for this hole.
285 * That data will be rexmitted in the next
286 * sack recovery episode, when snd_recover
287 * moves past p->rxmit.
288 */
289 p = NULL;
290 goto after_sack_rexmit;
291 } else
292 /* Can rexmit part of the current hole */
293 len = ((long)ulmin(cwin,
294 tp->snd_recover - p->rxmit));
295 } else
296 len = ((long)ulmin(cwin, p->end - p->rxmit));
297 off = p->rxmit - tp->snd_una;
298 KASSERT(off >= 0,("%s: sack block to the left of una : %d",
299 __func__, off));
300 if (len > 0) {
301 sack_rxmit = 1;
302 sendalot = 1;
303 TCPSTAT_INC(tcps_sack_rexmits);
304 TCPSTAT_ADD(tcps_sack_rexmit_bytes,
305 min(len, tp->t_maxseg));
306 }
307 }
308after_sack_rexmit:
309 /*
310 * Get standard flags, and add SYN or FIN if requested by 'hidden'
311 * state flags.
312 */
313 if (tp->t_flags & TF_NEEDFIN)
314 flags |= TH_FIN;
315 if (tp->t_flags & TF_NEEDSYN)
316 flags |= TH_SYN;
317
318 SOCKBUF_LOCK(&so->so_snd);
319 /*
320 * If in persist timeout with window of 0, send 1 byte.
321 * Otherwise, if window is small but nonzero
322 * and timer expired, we will send what we can
323 * and go to transmit state.
324 */
325 if (tp->t_flags & TF_FORCEDATA) {
326 if (sendwin == 0) {
327 /*
328 * If we still have some data to send, then
329 * clear the FIN bit. Usually this would
330 * happen below when it realizes that we
331 * aren't sending all the data. However,
332 * if we have exactly 1 byte of unsent data,
333 * then it won't clear the FIN bit below,
334 * and if we are in persist state, we wind
335 * up sending the packet without recording
336 * that we sent the FIN bit.
337 *
338 * We can't just blindly clear the FIN bit,
339 * because if we don't have any more data
340 * to send then the probe will be the FIN
341 * itself.
342 */
343 if (off < sbused(&so->so_snd))
344 flags &= ~TH_FIN;
345 sendwin = 1;
346 } else {
347 tcp_timer_activate(tp, TT_PERSIST, 0);
348 tp->t_rxtshift = 0;
349 }
350 }
351
352 /*
353 * If snd_nxt == snd_max and we have transmitted a FIN, the
354 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
355 * a negative length. This can also occur when TCP opens up
356 * its congestion window while receiving additional duplicate
357 * acks after fast-retransmit because TCP will reset snd_nxt
358 * to snd_max after the fast-retransmit.
359 *
360 * In the normal retransmit-FIN-only case, however, snd_nxt will
361 * be set to snd_una, the offset will be 0, and the length may
362 * wind up 0.
363 *
364 * If sack_rxmit is true we are retransmitting from the scoreboard
365 * in which case len is already set.
366 */
367 if (sack_rxmit == 0) {
368 if (sack_bytes_rxmt == 0)
369 len = ((long)ulmin(sbavail(&so->so_snd), sendwin) -
370 off);
371 else {
372 long cwin;
373
374 /*
375 * We are inside of a SACK recovery episode and are
376 * sending new data, having retransmitted all the
377 * data possible in the scoreboard.
378 */
379 len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) -
380 off);
381 /*
382 * Don't remove this (len > 0) check !
383 * We explicitly check for len > 0 here (although it
384 * isn't really necessary), to work around a gcc
385 * optimization issue - to force gcc to compute
386 * len above. Without this check, the computation
387 * of len is bungled by the optimizer.
388 */
389 if (len > 0) {
390 cwin = tp->snd_cwnd -
391 (tp->snd_nxt - tp->sack_newdata) -
392 sack_bytes_rxmt;
393 if (cwin < 0)
394 cwin = 0;
395 len = lmin(len, cwin);
396 }
397 }
398 }
399
400 /*
401 * Lop off SYN bit if it has already been sent. However, if this
402 * is SYN-SENT state and if segment contains data and if we don't
403 * know that foreign host supports TAO, suppress sending segment.
404 */
405 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
406 if (tp->t_state != TCPS_SYN_RECEIVED)
407 flags &= ~TH_SYN;
408#ifdef TCP_RFC7413
409 /*
410 * When sending additional segments following a TFO SYN|ACK,
411 * do not include the SYN bit.
412 */
413 if ((tp->t_flags & TF_FASTOPEN) &&
414 (tp->t_state == TCPS_SYN_RECEIVED))
415 flags &= ~TH_SYN;
416#endif
417 off--, len++;
418 }
419
420 /*
421 * Be careful not to send data and/or FIN on SYN segments.
422 * This measure is needed to prevent interoperability problems
423 * with not fully conformant TCP implementations.
424 */
425 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
426 len = 0;
427 flags &= ~TH_FIN;
428 }
429
430#ifdef TCP_RFC7413
431 /*
432 * When retransmitting SYN|ACK on a passively-created TFO socket,
433 * don't include data, as the presence of data may have caused the
434 * original SYN|ACK to have been dropped by a middlebox.
435 */
436 if ((tp->t_flags & TF_FASTOPEN) &&
437 (((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)) ||
438 (flags & TH_RST)))
439 len = 0;
440#endif
441 if (len <= 0) {
442 /*
443 * If FIN has been sent but not acked,
444 * but we haven't been called to retransmit,
445 * len will be < 0. Otherwise, window shrank
446 * after we sent into it. If window shrank to 0,
447 * cancel pending retransmit, pull snd_nxt back
448 * to (closed) window, and set the persist timer
449 * if it isn't already going. If the window didn't
450 * close completely, just wait for an ACK.
451 *
452 * We also do a general check here to ensure that
453 * we will set the persist timer when we have data
454 * to send, but a 0-byte window. This makes sure
455 * the persist timer is set even if the packet
456 * hits one of the "goto send" lines below.
457 */
458 len = 0;
459 if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) &&
460 (off < (int) sbavail(&so->so_snd))) {
461 tcp_timer_activate(tp, TT_REXMT, 0);
462 tp->t_rxtshift = 0;
463 tp->snd_nxt = tp->snd_una;
464 if (!tcp_timer_active(tp, TT_PERSIST))
465 tcp_setpersist(tp);
466 }
467 }
468
469 /* len will be >= 0 after this point. */
470 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
471
472 /*
473 * Automatic sizing of send socket buffer. Often the send buffer
474 * size is not optimally adjusted to the actual network conditions
475 * at hand (delay bandwidth product). Setting the buffer size too
476 * small limits throughput on links with high bandwidth and high
477 * delay (eg. trans-continental/oceanic links). Setting the
478 * buffer size too big consumes too much real kernel memory,
479 * especially with many connections on busy servers.
480 *
481 * The criteria to step up the send buffer one notch are:
482 * 1. receive window of remote host is larger than send buffer
483 * (with a fudge factor of 5/4th);
484 * 2. send buffer is filled to 7/8th with data (so we actually
485 * have data to make use of it);
486 * 3. send buffer fill has not hit maximal automatic size;
487 * 4. our send window (slow start and congestion controlled) is
488 * larger than sent but unacknowledged data in send buffer.
489 *
490 * The remote host receive window scaling factor may limit the
491 * growing of the send buffer before it reaches its allowed
492 * maximum.
493 *
494 * It scales directly with slow start or congestion window
495 * and does at most one step per received ACK. This fast
496 * scaling has the drawback of growing the send buffer beyond
497 * what is strictly necessary to make full use of a given
498 * delay*bandwidth product. However testing has shown this not
499 * to be much of a problem. At worst we are trading wasting
500 * of available bandwidth (the non-use of it) for wasting some
501 * socket buffer memory.
502 *
503 * TODO: Shrink send buffer during idle periods together
504 * with congestion window. Requires another timer. Has to
505 * wait for upcoming tcp timer rewrite.
506 *
507 * XXXGL: should there be used sbused() or sbavail()?
508 */
509 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
510 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
511 sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) &&
512 sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
513 sendwin >= (sbused(&so->so_snd) -
514 (tp->snd_nxt - tp->snd_una))) {
515 if (!sbreserve_locked(&so->so_snd,
516 min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
517 V_tcp_autosndbuf_max), so, curthread))
518 so->so_snd.sb_flags &= ~SB_AUTOSIZE;
519 }
520 }
521
522 /*
523 * Decide if we can use TCP Segmentation Offloading (if supported by
524 * hardware).
525 *
526 * TSO may only be used if we are in a pure bulk sending state. The
527 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
528 * IP options prevent using TSO. With TSO the TCP header is the same
529 * (except for the sequence number) for all generated packets. This
530 * makes it impossible to transmit any options which vary per generated
531 * segment or packet.
532 */
533#ifdef IPSEC
534 /*
535 * Pre-calculate here as we save another lookup into the darknesses
536 * of IPsec that way and can actually decide if TSO is ok.
537 */
538 ipsec_optlen = ipsec_hdrsiz_tcp(tp);
539#endif
540 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
541 ((tp->t_flags & TF_SIGNATURE) == 0) &&
542 tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
543#ifdef IPSEC
544 ipsec_optlen == 0 &&
545#endif
546 tp->t_inpcb->inp_options == NULL &&
547 tp->t_inpcb->in6p_options == NULL)
548 tso = 1;
549
550 if (sack_rxmit) {
551 if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd)))
552 flags &= ~TH_FIN;
553 } else {
554 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
555 sbused(&so->so_snd)))
556 flags &= ~TH_FIN;
557 }
558
559 recwin = sbspace(&so->so_rcv);
560
561 /*
562 * Sender silly window avoidance. We transmit under the following
563 * conditions when len is non-zero:
564 *
565 * - We have a full segment (or more with TSO)
566 * - This is the last buffer in a write()/send() and we are
567 * either idle or running NODELAY
568 * - we've timed out (e.g. persist timer)
569 * - we have more than 1/2 the maximum send window's worth of
570 * data (receiver may be limiting the window size)
571 * - we need to retransmit
572 */
573 if (len) {
574 if (len >= tp->t_maxseg)
575 goto send;
576 /*
577 * NOTE! on localhost connections an 'ack' from the remote
578 * end may occur synchronously with the output and cause
579 * us to flush a buffer queued with moretocome. XXX
580 *
581 * note: the len + off check is almost certainly unnecessary.
582 */
583 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
584 (idle || (tp->t_flags & TF_NODELAY)) &&
585 len + off >= sbavail(&so->so_snd) &&
586 (tp->t_flags & TF_NOPUSH) == 0) {
587 goto send;
588 }
589 if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */
590 goto send;
591 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
592 goto send;
593 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
594 goto send;
595 if (sack_rxmit)
596 goto send;
597 }
598
599 /*
600 * Sending of standalone window updates.
601 *
602 * Window updates are important when we close our window due to a
603 * full socket buffer and are opening it again after the application
604 * reads data from it. Once the window has opened again and the
605 * remote end starts to send again the ACK clock takes over and
606 * provides the most current window information.
607 *
608 * We must avoid the silly window syndrome whereas every read
609 * from the receive buffer, no matter how small, causes a window
610 * update to be sent. We also should avoid sending a flurry of
611 * window updates when the socket buffer had queued a lot of data
612 * and the application is doing small reads.
613 *
614 * Prevent a flurry of pointless window updates by only sending
615 * an update when we can increase the advertized window by more
616 * than 1/4th of the socket buffer capacity. When the buffer is
617 * getting full or is very small be more aggressive and send an
618 * update whenever we can increase by two mss sized segments.
619 * In all other situations the ACK's to new incoming data will
620 * carry further window increases.
621 *
622 * Don't send an independent window update if a delayed
623 * ACK is pending (it will get piggy-backed on it) or the
624 * remote side already has done a half-close and won't send
625 * more data. Skip this if the connection is in T/TCP
626 * half-open state.
627 */
628 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
629 !(tp->t_flags & TF_DELACK) &&
630 !TCPS_HAVERCVDFIN(tp->t_state)) {
631 /*
632 * "adv" is the amount we could increase the window,
633 * taking into account that we are limited by
634 * TCP_MAXWIN << tp->rcv_scale.
635 */
636 long adv;
637 int oldwin;
638
639 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
640 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
641 oldwin = (tp->rcv_adv - tp->rcv_nxt);
642 adv -= oldwin;
643 } else
644 oldwin = 0;
645
646 /*
647 * If the new window size ends up being the same as the old
648 * size when it is scaled, then don't force a window update.
649 */
650 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
651 goto dontupdate;
652
653 if (adv >= (long)(2 * tp->t_maxseg) &&
654 (adv >= (long)(so->so_rcv.sb_hiwat / 4) ||
655 recwin <= (long)(so->so_rcv.sb_hiwat / 8) ||
656 so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))
657 goto send;
658 }
659dontupdate:
660
661 /*
662 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
663 * is also a catch-all for the retransmit timer timeout case.
664 */
665 if (tp->t_flags & TF_ACKNOW)
666 goto send;
667 if ((flags & TH_RST) ||
668 ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
669 goto send;
670 if (SEQ_GT(tp->snd_up, tp->snd_una))
671 goto send;
672 /*
673 * If our state indicates that FIN should be sent
674 * and we have not yet done so, then we need to send.
675 */
676 if (flags & TH_FIN &&
677 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
678 goto send;
679 /*
680 * In SACK, it is possible for tcp_output to fail to send a segment
681 * after the retransmission timer has been turned off. Make sure
682 * that the retransmission timer is set.
683 */
684 if ((tp->t_flags & TF_SACK_PERMIT) &&
685 SEQ_GT(tp->snd_max, tp->snd_una) &&
686 !tcp_timer_active(tp, TT_REXMT) &&
687 !tcp_timer_active(tp, TT_PERSIST)) {
688 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
689 goto just_return;
690 }
691 /*
692 * TCP window updates are not reliable, rather a polling protocol
693 * using ``persist'' packets is used to insure receipt of window
694 * updates. The three ``states'' for the output side are:
695 * idle not doing retransmits or persists
696 * persisting to move a small or zero window
697 * (re)transmitting and thereby not persisting
698 *
699 * tcp_timer_active(tp, TT_PERSIST)
700 * is true when we are in persist state.
701 * (tp->t_flags & TF_FORCEDATA)
702 * is set when we are called to send a persist packet.
703 * tcp_timer_active(tp, TT_REXMT)
704 * is set when we are retransmitting
705 * The output side is idle when both timers are zero.
706 *
707 * If send window is too small, there is data to transmit, and no
708 * retransmit or persist is pending, then go to persist state.
709 * If nothing happens soon, send when timer expires:
710 * if window is nonzero, transmit what we can,
711 * otherwise force out a byte.
712 */
713 if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) &&
714 !tcp_timer_active(tp, TT_PERSIST)) {
715 tp->t_rxtshift = 0;
716 tcp_setpersist(tp);
717 }
718
719 /*
720 * No reason to send a segment, just return.
721 */
722just_return:
723 SOCKBUF_UNLOCK(&so->so_snd);
724 return (0);
725
726send:
727 SOCKBUF_LOCK_ASSERT(&so->so_snd);
728 if (len > 0) {
729 if (len >= tp->t_maxseg)
730 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
731 else
732 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
733 }
734 /*
735 * Before ESTABLISHED, force sending of initial options
736 * unless TCP set not to do any options.
737 * NOTE: we assume that the IP/TCP header plus TCP options
738 * always fit in a single mbuf, leaving room for a maximum
739 * link header, i.e.
740 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
741 */
742 optlen = 0;
743#ifdef INET6
744 if (isipv6)
745 hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
746 else
747#endif
748 hdrlen = sizeof (struct tcpiphdr);
749
750 /*
751 * Compute options for segment.
752 * We only have to care about SYN and established connection
753 * segments. Options for SYN-ACK segments are handled in TCP
754 * syncache.
755 */
756 to.to_flags = 0;
757 if ((tp->t_flags & TF_NOOPT) == 0) {
758 /* Maximum segment size. */
759 if (flags & TH_SYN) {
760 tp->snd_nxt = tp->iss;
761 to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
762 to.to_flags |= TOF_MSS;
763#ifdef TCP_RFC7413
764 /*
765 * Only include the TFO option on the first
766 * transmission of the SYN|ACK on a
767 * passively-created TFO socket, as the presence of
768 * the TFO option may have caused the original
769 * SYN|ACK to have been dropped by a middlebox.
770 */
771 if ((tp->t_flags & TF_FASTOPEN) &&
772 (tp->t_state == TCPS_SYN_RECEIVED) &&
773 (tp->t_rxtshift == 0)) {
774 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
775 to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
776 to.to_flags |= TOF_FASTOPEN;
777 }
778#endif
779 }
780 /* Window scaling. */
781 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
782 to.to_wscale = tp->request_r_scale;
783 to.to_flags |= TOF_SCALE;
784 }
785 /* Timestamps. */
786 if ((tp->t_flags & TF_RCVD_TSTMP) ||
787 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
788 to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
789 to.to_tsecr = tp->ts_recent;
790 to.to_flags |= TOF_TS;
791 /* Set receive buffer autosizing timestamp. */
792 if (tp->rfbuf_ts == 0 &&
793 (so->so_rcv.sb_flags & SB_AUTOSIZE))
794 tp->rfbuf_ts = tcp_ts_getticks();
795 }
796 /* Selective ACK's. */
797 if (tp->t_flags & TF_SACK_PERMIT) {
798 if (flags & TH_SYN)
799 to.to_flags |= TOF_SACKPERM;
800 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
801 (tp->t_flags & TF_SACK_PERMIT) &&
802 tp->rcv_numsacks > 0) {
803 to.to_flags |= TOF_SACK;
804 to.to_nsacks = tp->rcv_numsacks;
805 to.to_sacks = (u_char *)tp->sackblks;
806 }
807 }
808#ifdef TCP_SIGNATURE
809 /* TCP-MD5 (RFC2385). */
810 if (tp->t_flags & TF_SIGNATURE)
811 to.to_flags |= TOF_SIGNATURE;
812#endif /* TCP_SIGNATURE */
813
814 /* Processing the options. */
815 hdrlen += optlen = tcp_addoptions(&to, opt);
816 }
817
818#ifdef INET6
819 if (isipv6)
820 ipoptlen = ip6_optlen(tp->t_inpcb);
821 else
822#endif
823 if (tp->t_inpcb->inp_options)
824 ipoptlen = tp->t_inpcb->inp_options->m_len -
825 offsetof(struct ipoption, ipopt_list);
826 else
827 ipoptlen = 0;
828#ifdef IPSEC
829 ipoptlen += ipsec_optlen;
830#endif
831
832 /*
833 * Adjust data length if insertion of options will
834 * bump the packet length beyond the t_maxseg length.
835 * Clear the FIN bit because we cut off the tail of
836 * the segment.
837 */
838 if (len + optlen + ipoptlen > tp->t_maxseg) {
839 flags &= ~TH_FIN;
840
841 if (tso) {
842 u_int if_hw_tsomax;
843 u_int if_hw_tsomaxsegcount;
844 u_int if_hw_tsomaxsegsize;
845 struct mbuf *mb;
846 u_int moff;
847 int max_len;
848
849 /* extract TSO information */
850 if_hw_tsomax = tp->t_tsomax;
851 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
852 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
853
854 /*
855 * Limit a TSO burst to prevent it from
856 * overflowing or exceeding the maximum length
857 * allowed by the network interface:
858 */
859 KASSERT(ipoptlen == 0,
860 ("%s: TSO can't do IP options", __func__));
861
862 /*
863 * Check if we should limit by maximum payload
864 * length:
865 */
866 if (if_hw_tsomax != 0) {
867 /* compute maximum TSO length */
868 max_len = (if_hw_tsomax - hdrlen -
869 max_linkhdr);
870 if (max_len <= 0) {
871 len = 0;
872 } else if (len > max_len) {
873 sendalot = 1;
874 len = max_len;
875 }
876 }
877
878 /*
879 * Check if we should limit by maximum segment
880 * size and count:
881 */
882 if (if_hw_tsomaxsegcount != 0 &&
883 if_hw_tsomaxsegsize != 0) {
884 /*
885 * Subtract one segment for the LINK
886 * and TCP/IP headers mbuf that will
887 * be prepended to this mbuf chain
888 * after the code in this section
889 * limits the number of mbufs in the
890 * chain to if_hw_tsomaxsegcount.
891 */
892 if_hw_tsomaxsegcount -= 1;
893 max_len = 0;
894 mb = sbsndmbuf(&so->so_snd, off, &moff);
895
896 while (mb != NULL && max_len < len) {
897 u_int mlen;
898 u_int frags;
899
900 /*
901 * Get length of mbuf fragment
902 * and how many hardware frags,
903 * rounded up, it would use:
904 */
905 mlen = (mb->m_len - moff);
906 frags = howmany(mlen,
907 if_hw_tsomaxsegsize);
908
909 /* Handle special case: Zero Length Mbuf */
910 if (frags == 0)
911 frags = 1;
912
913 /*
914 * Check if the fragment limit
915 * will be reached or exceeded:
916 */
917 if (frags >= if_hw_tsomaxsegcount) {
918 max_len += min(mlen,
919 if_hw_tsomaxsegcount *
920 if_hw_tsomaxsegsize);
921 break;
922 }
923 max_len += mlen;
924 if_hw_tsomaxsegcount -= frags;
925 moff = 0;
926 mb = mb->m_next;
927 }
928 if (max_len <= 0) {
929 len = 0;
930 } else if (len > max_len) {
931 sendalot = 1;
932 len = max_len;
933 }
934 }
935
936 /*
937 * Prevent the last segment from being
938 * fractional unless the send sockbuf can be
939 * emptied:
940 */
941 max_len = (tp->t_maxseg - optlen);
942 if ((off + len) < sbavail(&so->so_snd)) {
943 moff = len % max_len;
944 if (moff != 0) {
945 len -= moff;
946 sendalot = 1;
947 }
948 }
949
950 /*
951 * In case there are too many small fragments
952 * don't use TSO:
953 */
954 if (len <= max_len) {
955 len = max_len;
956 sendalot = 1;
957 tso = 0;
958 }
959
960 /*
961 * Send the FIN in a separate segment
962 * after the bulk sending is done.
963 * We don't trust the TSO implementations
964 * to clear the FIN flag on all but the
965 * last segment.
966 */
967 if (tp->t_flags & TF_NEEDFIN)
968 sendalot = 1;
969
970 } else {
971 len = tp->t_maxseg - optlen - ipoptlen;
972 sendalot = 1;
973 }
974 } else
975 tso = 0;
976
977 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
978 ("%s: len > IP_MAXPACKET", __func__));
979
980/*#ifdef DIAGNOSTIC*/
981#ifdef INET6
982 if (max_linkhdr + hdrlen > MCLBYTES)
983#else
984 if (max_linkhdr + hdrlen > MHLEN)
985#endif
986 panic("tcphdr too big");
987/*#endif*/
988
989 /*
990 * This KASSERT is here to catch edge cases at a well defined place.
991 * Before, those had triggered (random) panic conditions further down.
992 */
993 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
994
995 /*
996 * Grab a header mbuf, attaching a copy of data to
997 * be transmitted, and initialize the header from
998 * the template for sends on this connection.
999 */
1000 if (len) {
1001 struct mbuf *mb;
1002 u_int moff;
1003
1004 if ((tp->t_flags & TF_FORCEDATA) && len == 1)
1005 TCPSTAT_INC(tcps_sndprobe);
1006 else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
1007 tp->t_sndrexmitpack++;
1008 TCPSTAT_INC(tcps_sndrexmitpack);
1009 TCPSTAT_ADD(tcps_sndrexmitbyte, len);
1010 } else {
1011 TCPSTAT_INC(tcps_sndpack);
1012 TCPSTAT_ADD(tcps_sndbyte, len);
1013 }
1014#ifdef INET6
1015 if (MHLEN < hdrlen + max_linkhdr)
1016 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
1017 else
1018#endif
1019 m = m_gethdr(M_NOWAIT, MT_DATA);
1020
1021 if (m == NULL) {
1022 SOCKBUF_UNLOCK(&so->so_snd);
1023 error = ENOBUFS;
1024 sack_rxmit = 0;
1025 goto out;
1026 }
1027
1028 m->m_data += max_linkhdr;
1029 m->m_len = hdrlen;
1030
1031 /*
1032 * Start the m_copy functions from the closest mbuf
1033 * to the offset in the socket buffer chain.
1034 */
1035 mb = sbsndptr(&so->so_snd, off, len, &moff);
1036
1037 if (len <= MHLEN - hdrlen - max_linkhdr) {
1038 m_copydata(mb, moff, (int)len,
1039 mtod(m, caddr_t) + hdrlen);
1040 m->m_len += len;
1041 } else {
1042 m->m_next = m_copy(mb, moff, (int)len);
1043 if (m->m_next == NULL) {
1044 SOCKBUF_UNLOCK(&so->so_snd);
1045 (void) m_free(m);
1046 error = ENOBUFS;
1047 sack_rxmit = 0;
1048 goto out;
1049 }
1050 }
1051
1052 /*
1053 * If we're sending everything we've got, set PUSH.
1054 * (This will keep happy those implementations which only
1055 * give data to the user when a buffer fills or
1056 * a PUSH comes in.)
1057 */
1058 if ((off + len == sbused(&so->so_snd)) && !(flags & TH_SYN))
1059 flags |= TH_PUSH;
1060 SOCKBUF_UNLOCK(&so->so_snd);
1061 } else {
1062 SOCKBUF_UNLOCK(&so->so_snd);
1063 if (tp->t_flags & TF_ACKNOW)
1064 TCPSTAT_INC(tcps_sndacks);
1065 else if (flags & (TH_SYN|TH_FIN|TH_RST))
1066 TCPSTAT_INC(tcps_sndctrl);
1067 else if (SEQ_GT(tp->snd_up, tp->snd_una))
1068 TCPSTAT_INC(tcps_sndurg);
1069 else
1070 TCPSTAT_INC(tcps_sndwinup);
1071
1072 m = m_gethdr(M_NOWAIT, MT_DATA);
1073 if (m == NULL) {
1074 error = ENOBUFS;
1075 sack_rxmit = 0;
1076 goto out;
1077 }
1078#ifdef INET6
1079 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
1080 MHLEN >= hdrlen) {
1081 M_ALIGN(m, hdrlen);
1082 } else
1083#endif
1084 m->m_data += max_linkhdr;
1085 m->m_len = hdrlen;
1086 }
1087 SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
1088 m->m_pkthdr.rcvif = (struct ifnet *)0;
1089#ifdef MAC
1090 mac_inpcb_create_mbuf(tp->t_inpcb, m);
1091#endif
1092#ifdef INET6
1093 if (isipv6) {
1094 ip6 = mtod(m, struct ip6_hdr *);
1095 th = (struct tcphdr *)(ip6 + 1);
1096 tcpip_fillheaders(tp->t_inpcb, ip6, th);
1097 } else
1098#endif /* INET6 */
1099 {
1100 ip = mtod(m, struct ip *);
1101 ipov = (struct ipovly *)ip;
1102 th = (struct tcphdr *)(ip + 1);
1103 tcpip_fillheaders(tp->t_inpcb, ip, th);
1104 }
1105
1106 /*
1107 * Fill in fields, remembering maximum advertised
1108 * window for use in delaying messages about window sizes.
1109 * If resending a FIN, be sure not to use a new sequence number.
1110 */
1111 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
1112 tp->snd_nxt == tp->snd_max)
1113 tp->snd_nxt--;
1114 /*
1115 * If we are starting a connection, send ECN setup
1116 * SYN packet. If we are on a retransmit, we may
1117 * resend those bits a number of times as per
1118 * RFC 3168.
1119 */
1120 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
1121 if (tp->t_rxtshift >= 1) {
1122 if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
1123 flags |= TH_ECE|TH_CWR;
1124 } else
1125 flags |= TH_ECE|TH_CWR;
1126 }
1127
1128 if (tp->t_state == TCPS_ESTABLISHED &&
1129 (tp->t_flags & TF_ECN_PERMIT)) {
1130 /*
1131 * If the peer has ECN, mark data packets with
1132 * ECN capable transmission (ECT).
1133 * Ignore pure ack packets, retransmissions and window probes.
1134 */
1135 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
1136 !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
1137#ifdef INET6
1138 if (isipv6)
1139 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
1140 else
1141#endif
1142 ip->ip_tos |= IPTOS_ECN_ECT0;
1143 TCPSTAT_INC(tcps_ecn_ect0);
1144 }
1145
1146 /*
1147 * Reply with proper ECN notifications.
1148 */
1149 if (tp->t_flags & TF_ECN_SND_CWR) {
1150 flags |= TH_CWR;
1151 tp->t_flags &= ~TF_ECN_SND_CWR;
1152 }
1153 if (tp->t_flags & TF_ECN_SND_ECE)
1154 flags |= TH_ECE;
1155 }
1156
1157 /*
1158 * If we are doing retransmissions, then snd_nxt will
1159 * not reflect the first unsent octet. For ACK only
1160 * packets, we do not want the sequence number of the
1161 * retransmitted packet, we want the sequence number
1162 * of the next unsent octet. So, if there is no data
1163 * (and no SYN or FIN), use snd_max instead of snd_nxt
1164 * when filling in ti_seq. But if we are in persist
1165 * state, snd_max might reflect one byte beyond the
1166 * right edge of the window, so use snd_nxt in that
1167 * case, since we know we aren't doing a retransmission.
1168 * (retransmit and persist are mutually exclusive...)
1169 */
1170 if (sack_rxmit == 0) {
1171 if (len || (flags & (TH_SYN|TH_FIN)) ||
1172 tcp_timer_active(tp, TT_PERSIST))
1173 th->th_seq = htonl(tp->snd_nxt);
1174 else
1175 th->th_seq = htonl(tp->snd_max);
1176 } else {
1177 th->th_seq = htonl(p->rxmit);
1178 p->rxmit += len;
1179 tp->sackhint.sack_bytes_rexmit += len;
1180 }
1181 th->th_ack = htonl(tp->rcv_nxt);
1182 if (optlen) {
1183 bcopy(opt, th + 1, optlen);
1184 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1185 }
1186 th->th_flags = flags;
1187 /*
1188 * Calculate receive window. Don't shrink window,
1189 * but avoid silly window syndrome.
1190 */
1191 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
1192 recwin < (long)tp->t_maxseg)
1193 recwin = 0;
1194 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
1195 recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
1196 recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
1197 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
1198 recwin = (long)TCP_MAXWIN << tp->rcv_scale;
1199
1200 /*
1201 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1202 * or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK>
1203 * case is handled in syncache.
1204 */
1205 if (flags & TH_SYN)
1206 th->th_win = htons((u_short)
1207 (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
1208 else
1209 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
1210
1211 /*
1212 * Adjust the RXWIN0SENT flag - indicate that we have advertised
1213 * a 0 window. This may cause the remote transmitter to stall. This
1214 * flag tells soreceive() to disable delayed acknowledgements when
1215 * draining the buffer. This can occur if the receiver is attempting
1216 * to read more data than can be buffered prior to transmitting on
1217 * the connection.
1218 */
1219 if (th->th_win == 0) {
1220 tp->t_sndzerowin++;
1221 tp->t_flags |= TF_RXWIN0SENT;
1222 } else
1223 tp->t_flags &= ~TF_RXWIN0SENT;
1224 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1225 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
1226 th->th_flags |= TH_URG;
1227 } else
1228 /*
1229 * If no urgent pointer to send, then we pull
1230 * the urgent pointer to the left edge of the send window
1231 * so that it doesn't drift into the send window on sequence
1232 * number wraparound.
1233 */
1234 tp->snd_up = tp->snd_una; /* drag it along */
1235
1236#ifdef TCP_SIGNATURE
1237 if (to.to_flags & TOF_SIGNATURE) {
1238 int sigoff = to.to_signature - opt;
1239 tcp_signature_compute(m, 0, len, optlen,
1240 (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
1241 }
1242#endif
1243
1244 /*
1245 * Put TCP length in extended header, and then
1246 * checksum extended header and data.
1247 */
1248 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
1249 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1250#ifdef INET6
1251 if (isipv6) {
1252 /*
1253 * ip6_plen does not need to be filled in now; it will be filled
1254 * in ip6_output.
1255 */
1256 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
1257 th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
1258 optlen + len, IPPROTO_TCP, 0);
1259 }
1260#endif
1261#if defined(INET6) && defined(INET)
1262 else
1263#endif
1264#ifdef INET
1265 {
1266 m->m_pkthdr.csum_flags = CSUM_TCP;
1267 th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1268 htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
1269
1270 /* IP version must be set here for ipv4/ipv6 checking later */
1271 KASSERT(ip->ip_v == IPVERSION,
1272 ("%s: IP version incorrect: %d", __func__, ip->ip_v));
1273 }
1274#endif
1275
1276 /*
1277 * Enable TSO and specify the size of the segments.
1278 * The TCP pseudo header checksum is always provided.
1279 */
1280 if (tso) {
1281 KASSERT(len > tp->t_maxseg - optlen,
1282 ("%s: len <= tso_segsz", __func__));
1283 m->m_pkthdr.csum_flags |= CSUM_TSO;
1284 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
1285 }
1286
1287#ifdef IPSEC
1288 KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
1289 ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u",
1290 __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
1291#else
1292 KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
1293 ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u",
1294 __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
1295#endif
1296
1297 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
1298 hhook_run_tcp_est_out(tp, th, &to, len, tso);
1299
1300#ifdef TCPDEBUG
1301 /*
1302 * Trace.
1303 */
1304 if (so->so_options & SO_DEBUG) {
1305 u_short save = 0;
1306#ifdef INET6
1307 if (!isipv6)
1308#endif
1309 {
1310 save = ipov->ih_len;
1311 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
1312 }
1313 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
1314#ifdef INET6
1315 if (!isipv6)
1316#endif
1317 ipov->ih_len = save;
1318 }
1319#endif /* TCPDEBUG */
1320 TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
1321
1322 /*
1323 * Fill in IP length and desired time to live and
1324 * send to IP level. There should be a better way
1325 * to handle ttl and tos; we could keep them in
1326 * the template, but need a way to checksum without them.
1327 */
1328 /*
1329 * m->m_pkthdr.len should have been set before checksum calculation,
1330 * because in6_cksum() needs it.
1331 */
1332#ifdef INET6
1333 if (isipv6) {
1334 struct route_in6 ro;
1335
1336 bzero(&ro, sizeof(ro));
1337 /*
1338 * we separately set hoplimit for every segment, since the
1339 * user might want to change the value via setsockopt.
1340 * Also, desired default hop limit might be changed via
1341 * Neighbor Discovery.
1342 */
1343 ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
1344
1345 /*
1346 * Set the packet size here for the benefit of DTrace probes.
1347 * ip6_output() will set it properly; it's supposed to include
1348 * the option header lengths as well.
1349 */
1350 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
1351
1352 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
1353 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
1354 else
1355 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
1356
1357 if (tp->t_state == TCPS_SYN_SENT)
1358 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
1359
1360 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
1361
1362#ifdef TCPPCAP
1363 /* Save packet, if requested. */
1364 tcp_pcap_add(th, m, &(tp->t_outpkts));
1365#endif
1366
1367 /* TODO: IPv6 IP6TOS_ECT bit on */
1368 error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro,
1369 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
1370 NULL, NULL, tp->t_inpcb);
1371
1372 if (error == EMSGSIZE && ro.ro_rt != NULL)
1373 mtu = ro.ro_rt->rt_mtu;
1374 RO_RTFREE(&ro);
1375 }
1376#endif /* INET6 */
1377#if defined(INET) && defined(INET6)
1378 else
1379#endif
1380#ifdef INET
1381 {
1382 struct route ro;
1383
1384 bzero(&ro, sizeof(ro));
1385 ip->ip_len = htons(m->m_pkthdr.len);
1386#ifdef INET6
1387 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
1388 ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
1389#endif /* INET6 */
1390 /*
1391 * If we do path MTU discovery, then we set DF on every packet.
1392 * This might not be the best thing to do according to RFC3390
1393 * Section 2. However the tcp hostcache mitigates the problem
1394 * so it affects only the first tcp connection with a host.
1395 *
1396 * NB: Don't set DF on small MTU/MSS to have a safe fallback.
1397 */
1398 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
1399 ip->ip_off |= htons(IP_DF);
1400 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
1401 } else {
1402 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
1403 }
1404
1405 if (tp->t_state == TCPS_SYN_SENT)
1406 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
1407
1408 TCP_PROBE5(send, NULL, tp, ip, tp, th);
1409
1410#ifdef TCPPCAP
1411 /* Save packet, if requested. */
1412 tcp_pcap_add(th, m, &(tp->t_outpkts));
1413#endif
1414
1415 error = ip_output(m, tp->t_inpcb->inp_options, &ro,
1416 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
1417 tp->t_inpcb);
1418
1419 if (error == EMSGSIZE && ro.ro_rt != NULL)
1420 mtu = ro.ro_rt->rt_mtu;
1421 RO_RTFREE(&ro);
1422 }
1423#endif /* INET */
1424
1425out:
1426 /*
1427 * In transmit state, time the transmission and arrange for
1428 * the retransmit. In persist state, just set snd_max.
1429 */
1430 if ((tp->t_flags & TF_FORCEDATA) == 0 ||
1431 !tcp_timer_active(tp, TT_PERSIST)) {
1432 tcp_seq startseq = tp->snd_nxt;
1433
1434 /*
1435 * Advance snd_nxt over sequence space of this segment.
1436 */
1437 if (flags & (TH_SYN|TH_FIN)) {
1438 if (flags & TH_SYN)
1439 tp->snd_nxt++;
1440 if (flags & TH_FIN) {
1441 tp->snd_nxt++;
1442 tp->t_flags |= TF_SENTFIN;
1443 }
1444 }
1445 if (sack_rxmit)
1446 goto timer;
1447 tp->snd_nxt += len;
1448 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1449 tp->snd_max = tp->snd_nxt;
1450 /*
1451 * Time this transmission if not a retransmission and
1452 * not currently timing anything.
1453 */
1454 if (tp->t_rtttime == 0) {
1455 tp->t_rtttime = ticks;
1456 tp->t_rtseq = startseq;
1457 TCPSTAT_INC(tcps_segstimed);
1458 }
1459 }
1460
1461 /*
1462 * Set retransmit timer if not currently set,
1463 * and not doing a pure ack or a keep-alive probe.
1464 * Initial value for retransmit timer is smoothed
1465 * round-trip time + 2 * round-trip time variance.
1466 * Initialize shift counter which is used for backoff
1467 * of retransmit time.
1468 */
1469timer:
1470 if (!tcp_timer_active(tp, TT_REXMT) &&
1471 ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
1472 (tp->snd_nxt != tp->snd_una))) {
1473 if (tcp_timer_active(tp, TT_PERSIST)) {
1474 tcp_timer_activate(tp, TT_PERSIST, 0);
1475 tp->t_rxtshift = 0;
1476 }
1477 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1478 } else if (len == 0 && sbavail(&so->so_snd) &&
1479 !tcp_timer_active(tp, TT_REXMT) &&
1480 !tcp_timer_active(tp, TT_PERSIST)) {
1481 /*
1482 * Avoid a situation where we do not set persist timer
1483 * after a zero window condition. For example:
1484 * 1) A -> B: packet with enough data to fill the window
1485 * 2) B -> A: ACK for #1 + new data (0 window
1486 * advertisement)
1487 * 3) A -> B: ACK for #2, 0 len packet
1488 *
1489 * In this case, A will not activate the persist timer,
1490 * because it chose to send a packet. Unless tcp_output
1491 * is called for some other reason (delayed ack timer,
1492 * another input packet from B, socket syscall), A will
1493 * not send zero window probes.
1494 *
1495 * So, if you send a 0-length packet, but there is data
1496 * in the socket buffer, and neither the rexmt or
1497 * persist timer is already set, then activate the
1498 * persist timer.
1499 */
1500 tp->t_rxtshift = 0;
1501 tcp_setpersist(tp);
1502 }
1503 } else {
1504 /*
1505 * Persist case, update snd_max but since we are in
1506 * persist mode (no window) we do not update snd_nxt.
1507 */
1508 int xlen = len;
1509 if (flags & TH_SYN)
1510 ++xlen;
1511 if (flags & TH_FIN) {
1512 ++xlen;
1513 tp->t_flags |= TF_SENTFIN;
1514 }
1515 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
1516 tp->snd_max = tp->snd_nxt + len;
1517 }
1518
1519 if (error) {
1520
1521 /*
1522 * We know that the packet was lost, so back out the
1523 * sequence number advance, if any.
1524 *
1525 * If the error is EPERM the packet got blocked by the
1526 * local firewall. Normally we should terminate the
1527 * connection but the blocking may have been spurious
1528 * due to a firewall reconfiguration cycle. So we treat
1529 * it like a packet loss and let the retransmit timer and
1530 * timeouts do their work over time.
1531 * XXX: It is a POLA question whether calling tcp_drop right
1532 * away would be the really correct behavior instead.
1533 */
1534 if (((tp->t_flags & TF_FORCEDATA) == 0 ||
1535 !tcp_timer_active(tp, TT_PERSIST)) &&
1536 ((flags & TH_SYN) == 0) &&
1537 (error != EPERM)) {
1538 if (sack_rxmit) {
1539 p->rxmit -= len;
1540 tp->sackhint.sack_bytes_rexmit -= len;
1541 KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
1542 ("sackhint bytes rtx >= 0"));
1543 } else
1544 tp->snd_nxt -= len;
1545 }
1546 SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */
1547 switch (error) {
1548 case EPERM:
1549 tp->t_softerror = error;
1550 return (error);
1551 case ENOBUFS:
1552 if (!tcp_timer_active(tp, TT_REXMT) &&
1553 !tcp_timer_active(tp, TT_PERSIST))
1554 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
1555 tp->snd_cwnd = tp->t_maxseg;
1556 return (0);
1557 case EMSGSIZE:
1558 /*
1559 * For some reason the interface we used initially
1560 * to send segments changed to another or lowered
1561 * its MTU.
1562 * If TSO was active we either got an interface
1563 * without TSO capabilities or TSO was turned off.
1564 * If we obtained mtu from ip_output() then update
1565 * it and try again.
1566 */
1567 if (tso)
1568 tp->t_flags &= ~TF_TSO;
1569 if (mtu != 0) {
1570 tcp_mss_update(tp, -1, mtu, NULL, NULL);
1571 goto again;
1572 }
1573 return (error);
1574 case EHOSTDOWN:
1575 case EHOSTUNREACH:
1576 case ENETDOWN:
1577 case ENETUNREACH:
1578 if (TCPS_HAVERCVDSYN(tp->t_state)) {
1579 tp->t_softerror = error;
1580 return (0);
1581 }
1582 /* FALLTHROUGH */
1583 default:
1584 return (error);
1585 }
1586 }
1587 TCPSTAT_INC(tcps_sndtotal);
1588
1589 /*
1590 * Data sent (as far as we can tell).
1591 * If this advertises a larger window than any other segment,
1592 * then remember the size of the advertised window.
1593 * Any pending ACK has now been sent.
1594 */
1595 if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
1596 tp->rcv_adv = tp->rcv_nxt + recwin;
1597 tp->last_ack_sent = tp->rcv_nxt;
1598 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
1599 if (tcp_timer_active(tp, TT_DELACK))
1600 tcp_timer_activate(tp, TT_DELACK, 0);
1601#if 0
1602 /*
1603 * This completely breaks TCP if newreno is turned on. What happens
1604 * is that if delayed-acks are turned on on the receiver, this code
1605 * on the transmitter effectively destroys the TCP window, forcing
1606 * it to four packets (1.5Kx4 = 6K window).
1607 */
1608 if (sendalot && --maxburst)
1609 goto again;
1610#endif
1611 if (sendalot)
1612 goto again;
1613 return (0);
1614}
1615
1616void
1617tcp_setpersist(struct tcpcb *tp)
1618{
1619 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1620 int tt;
1621
1622 tp->t_flags &= ~TF_PREVVALID;
1623 if (tcp_timer_active(tp, TT_REXMT))
1624 panic("tcp_setpersist: retransmit pending");
1625 /*
1626 * Start/restart persistance timer.
1627 */
1628 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
1629 TCPTV_PERSMIN, TCPTV_PERSMAX);
1630 tcp_timer_activate(tp, TT_PERSIST, tt);
1631 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1632 tp->t_rxtshift++;
1633}
1634
1635/*
1636 * Insert TCP options according to the supplied parameters to the place
1637 * optp in a consistent way. Can handle unaligned destinations.
1638 *
1639 * The order of the option processing is crucial for optimal packing and
1640 * alignment for the scarce option space.
1641 *
1642 * The optimal order for a SYN/SYN-ACK segment is:
1643 * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
1644 * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
1645 *
1646 * The SACK options should be last. SACK blocks consume 8*n+2 bytes.
1647 * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
1648 * At minimum we need 10 bytes (to generate 1 SACK block). If both
1649 * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
1650 * we only have 10 bytes for SACK options (40 - (12 + 18)).
1651 */
1652int
1653tcp_addoptions(struct tcpopt *to, u_char *optp)
1654{
1655 u_int mask, optlen = 0;
1656
1657 for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
1658 if ((to->to_flags & mask) != mask)
1659 continue;
1660 if (optlen == TCP_MAXOLEN)
1661 break;
1662 switch (to->to_flags & mask) {
1663 case TOF_MSS:
1664 while (optlen % 4) {
1665 optlen += TCPOLEN_NOP;
1666 *optp++ = TCPOPT_NOP;
1667 }
1668 if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
1669 continue;
1670 optlen += TCPOLEN_MAXSEG;
1671 *optp++ = TCPOPT_MAXSEG;
1672 *optp++ = TCPOLEN_MAXSEG;
1673 to->to_mss = htons(to->to_mss);
1674 bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
1675 optp += sizeof(to->to_mss);
1676 break;
1677 case TOF_SCALE:
1678 while (!optlen || optlen % 2 != 1) {
1679 optlen += TCPOLEN_NOP;
1680 *optp++ = TCPOPT_NOP;
1681 }
1682 if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
1683 continue;
1684 optlen += TCPOLEN_WINDOW;
1685 *optp++ = TCPOPT_WINDOW;
1686 *optp++ = TCPOLEN_WINDOW;
1687 *optp++ = to->to_wscale;
1688 break;
1689 case TOF_SACKPERM:
1690 while (optlen % 2) {
1691 optlen += TCPOLEN_NOP;
1692 *optp++ = TCPOPT_NOP;
1693 }
1694 if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
1695 continue;
1696 optlen += TCPOLEN_SACK_PERMITTED;
1697 *optp++ = TCPOPT_SACK_PERMITTED;
1698 *optp++ = TCPOLEN_SACK_PERMITTED;
1699 break;
1700 case TOF_TS:
1701 while (!optlen || optlen % 4 != 2) {
1702 optlen += TCPOLEN_NOP;
1703 *optp++ = TCPOPT_NOP;
1704 }
1705 if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
1706 continue;
1707 optlen += TCPOLEN_TIMESTAMP;
1708 *optp++ = TCPOPT_TIMESTAMP;
1709 *optp++ = TCPOLEN_TIMESTAMP;
1710 to->to_tsval = htonl(to->to_tsval);
1711 to->to_tsecr = htonl(to->to_tsecr);
1712 bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
1713 optp += sizeof(to->to_tsval);
1714 bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
1715 optp += sizeof(to->to_tsecr);
1716 break;
1717#ifdef TCP_SIGNATURE
1718 case TOF_SIGNATURE:
1719 {
1720 int siglen = TCPOLEN_SIGNATURE - 2;
1721
1722 while (!optlen || optlen % 4 != 2) {
1723 optlen += TCPOLEN_NOP;
1724 *optp++ = TCPOPT_NOP;
1725 }
1726 if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE)
1727 continue;
1728 optlen += TCPOLEN_SIGNATURE;
1729 *optp++ = TCPOPT_SIGNATURE;
1730 *optp++ = TCPOLEN_SIGNATURE;
1731 to->to_signature = optp;
1732 while (siglen--)
1733 *optp++ = 0;
1734 break;
1735 }
1736#endif
1737 case TOF_SACK:
1738 {
1739 int sackblks = 0;
1740 struct sackblk *sack = (struct sackblk *)to->to_sacks;
1741 tcp_seq sack_seq;
1742
1743 while (!optlen || optlen % 4 != 2) {
1744 optlen += TCPOLEN_NOP;
1745 *optp++ = TCPOPT_NOP;
1746 }
1747 if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
1748 continue;
1749 optlen += TCPOLEN_SACKHDR;
1750 *optp++ = TCPOPT_SACK;
1751 sackblks = min(to->to_nsacks,
1752 (TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
1753 *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
1754 while (sackblks--) {
1755 sack_seq = htonl(sack->start);
1756 bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
1757 optp += sizeof(sack_seq);
1758 sack_seq = htonl(sack->end);
1759 bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
1760 optp += sizeof(sack_seq);
1761 optlen += TCPOLEN_SACK;
1762 sack++;
1763 }
1764 TCPSTAT_INC(tcps_sack_send_blocks);
1765 break;
1766 }
1767#ifdef TCP_RFC7413
1768 case TOF_FASTOPEN:
1769 {
1770 int total_len;
1771
1772 /* XXX is there any point to aligning this option? */
1773 total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len;
1774 if (TCP_MAXOLEN - optlen < total_len)
1775 continue;
1776 *optp++ = TCPOPT_FAST_OPEN;
1777 *optp++ = total_len;
1778 if (to->to_tfo_len > 0) {
1779 bcopy(to->to_tfo_cookie, optp, to->to_tfo_len);
1780 optp += to->to_tfo_len;
1781 }
1782 optlen += total_len;
1783 break;
1784 }
1785#endif
1786 default:
1787 panic("%s: unknown TCP option type", __func__);
1788 break;
1789 }
1790 }
1791
1792 /* Terminate and pad TCP options to a 4 byte boundary. */
1793 if (optlen % 4) {
1794 optlen += TCPOLEN_EOL;
1795 *optp++ = TCPOPT_EOL;
1796 }
1797 /*
1798 * According to RFC 793 (STD0007):
1799 * "The content of the header beyond the End-of-Option option
1800 * must be header padding (i.e., zero)."
1801 * and later: "The padding is composed of zeros."
1802 */
1803 while (optlen % 4) {
1804 optlen += TCPOLEN_PAD;
1805 *optp++ = TCPOPT_PAD;
1806 }
1807
1808 KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
1809 return (optlen);
1810}