Deleted: tcp_subr.c (205251)    Added: tcp_subr.c (207369)
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
30 */
31
32#include <sys/cdefs.h>
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/netinet/tcp_subr.c 205251 2010-03-17 18:28:27Z bz $");
33__FBSDID("$FreeBSD: head/sys/netinet/tcp_subr.c 207369 2010-04-29 11:52:42Z bz $");
34
35#include "opt_compat.h"
36#include "opt_inet.h"
37#include "opt_inet6.h"
38#include "opt_ipsec.h"
39#include "opt_tcpdebug.h"
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/callout.h>
44#include <sys/kernel.h>
45#include <sys/sysctl.h>
46#include <sys/jail.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#ifdef INET6
50#include <sys/domain.h>
51#endif
52#include <sys/priv.h>
53#include <sys/proc.h>
54#include <sys/socket.h>
55#include <sys/socketvar.h>
56#include <sys/protosw.h>
57#include <sys/random.h>
58
59#include <vm/uma.h>
60
61#include <net/route.h>
62#include <net/if.h>
63#include <net/vnet.h>
64
65#include <netinet/in.h>
66#include <netinet/in_systm.h>
67#include <netinet/ip.h>
68#ifdef INET6
69#include <netinet/ip6.h>
70#endif
71#include <netinet/in_pcb.h>
72#ifdef INET6
73#include <netinet6/in6_pcb.h>
74#endif
75#include <netinet/in_var.h>
76#include <netinet/ip_var.h>
77#ifdef INET6
78#include <netinet6/ip6_var.h>
79#include <netinet6/scope6_var.h>
80#include <netinet6/nd6.h>
81#endif
82#include <netinet/ip_icmp.h>
83#include <netinet/tcp.h>
84#include <netinet/tcp_fsm.h>
85#include <netinet/tcp_seq.h>
86#include <netinet/tcp_timer.h>
87#include <netinet/tcp_var.h>
88#include <netinet/tcp_syncache.h>
89#include <netinet/tcp_offload.h>
90#ifdef INET6
91#include <netinet6/tcp6_var.h>
92#endif
93#include <netinet/tcpip.h>
94#ifdef TCPDEBUG
95#include <netinet/tcp_debug.h>
96#endif
97#include <netinet6/ip6protosw.h>
98
99#ifdef IPSEC
100#include <netipsec/ipsec.h>
101#include <netipsec/xform.h>
102#ifdef INET6
103#include <netipsec/ipsec6.h>
104#endif
105#include <netipsec/key.h>
106#include <sys/syslog.h>
107#endif /*IPSEC*/
108
109#include <machine/in_cksum.h>
110#include <sys/md5.h>
111
112#include <security/mac/mac_framework.h>
113
34
35#include "opt_compat.h"
36#include "opt_inet.h"
37#include "opt_inet6.h"
38#include "opt_ipsec.h"
39#include "opt_tcpdebug.h"
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/callout.h>
44#include <sys/kernel.h>
45#include <sys/sysctl.h>
46#include <sys/jail.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#ifdef INET6
50#include <sys/domain.h>
51#endif
52#include <sys/priv.h>
53#include <sys/proc.h>
54#include <sys/socket.h>
55#include <sys/socketvar.h>
56#include <sys/protosw.h>
57#include <sys/random.h>
58
59#include <vm/uma.h>
60
61#include <net/route.h>
62#include <net/if.h>
63#include <net/vnet.h>
64
65#include <netinet/in.h>
66#include <netinet/in_systm.h>
67#include <netinet/ip.h>
68#ifdef INET6
69#include <netinet/ip6.h>
70#endif
71#include <netinet/in_pcb.h>
72#ifdef INET6
73#include <netinet6/in6_pcb.h>
74#endif
75#include <netinet/in_var.h>
76#include <netinet/ip_var.h>
77#ifdef INET6
78#include <netinet6/ip6_var.h>
79#include <netinet6/scope6_var.h>
80#include <netinet6/nd6.h>
81#endif
82#include <netinet/ip_icmp.h>
83#include <netinet/tcp.h>
84#include <netinet/tcp_fsm.h>
85#include <netinet/tcp_seq.h>
86#include <netinet/tcp_timer.h>
87#include <netinet/tcp_var.h>
88#include <netinet/tcp_syncache.h>
89#include <netinet/tcp_offload.h>
90#ifdef INET6
91#include <netinet6/tcp6_var.h>
92#endif
93#include <netinet/tcpip.h>
94#ifdef TCPDEBUG
95#include <netinet/tcp_debug.h>
96#endif
97#include <netinet6/ip6protosw.h>
98
99#ifdef IPSEC
100#include <netipsec/ipsec.h>
101#include <netipsec/xform.h>
102#ifdef INET6
103#include <netipsec/ipsec6.h>
104#endif
105#include <netipsec/key.h>
106#include <sys/syslog.h>
107#endif /*IPSEC*/
108
109#include <machine/in_cksum.h>
110#include <sys/md5.h>
111
112#include <security/mac/mac_framework.h>
113
114VNET_DEFINE(int, tcp_mssdflt);
114VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
115#ifdef INET6
115#ifdef INET6
116VNET_DEFINE(int, tcp_v6mssdflt);
116VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
117#endif
117#endif
118VNET_DEFINE(int, tcp_minmss);
119VNET_DEFINE(int, tcp_do_rfc1323);
120
118
121static VNET_DEFINE(int, icmp_may_rst);
122static VNET_DEFINE(int, tcp_isn_reseed_interval);
123static VNET_DEFINE(int, tcp_inflight_enable);
124static VNET_DEFINE(int, tcp_inflight_rttthresh);
125static VNET_DEFINE(int, tcp_inflight_min);
126static VNET_DEFINE(int, tcp_inflight_max);
127static VNET_DEFINE(int, tcp_inflight_stab);
128
129#define V_icmp_may_rst VNET(icmp_may_rst)
130#define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval)
131#define V_tcp_inflight_enable VNET(tcp_inflight_enable)
132#define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh)
133#define V_tcp_inflight_min VNET(tcp_inflight_min)
134#define V_tcp_inflight_max VNET(tcp_inflight_max)
135#define V_tcp_inflight_stab VNET(tcp_inflight_stab)
136
137static int
138sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
139{
140 int error, new;
141
142 new = V_tcp_mssdflt;
143 error = sysctl_handle_int(oidp, &new, 0, req);
144 if (error == 0 && req->newptr) {
145 if (new < TCP_MINMSS)
146 error = EINVAL;
147 else
148 V_tcp_mssdflt = new;
149 }
150 return (error);
151}
152
153SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
154 CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0,
155 &sysctl_net_inet_tcp_mss_check, "I",
156 "Default TCP Maximum Segment Size");
157
158#ifdef INET6
159static int
160sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
161{
162 int error, new;
163
164 new = V_tcp_v6mssdflt;
165 error = sysctl_handle_int(oidp, &new, 0, req);
166 if (error == 0 && req->newptr) {
167 if (new < TCP_MINMSS)
168 error = EINVAL;
169 else
170 V_tcp_v6mssdflt = new;
171 }
172 return (error);
173}
174
175SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
176 CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0,
177 &sysctl_net_inet_tcp_mss_v6_check, "I",
178 "Default TCP Maximum Segment Size for IPv6");
179#endif
180
181static int
182vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
183{
184
185 VNET_SYSCTL_ARG(req, arg1);
186 return (sysctl_msec_to_ticks(oidp, arg1, arg2, req));
187}
188
189/*
190 * Minimum MSS we accept and use. This prevents DoS attacks where
 191 * we are forced to a ridiculously low MSS like 20 and send hundreds
192 * of packets instead of one. The effect scales with the available
193 * bandwidth and quickly saturates the CPU and network interface
194 * with packet generation and sending. Set to zero to disable MINMSS
195 * checking. This setting prevents us from sending too small packets.
196 */
119static int
120sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
121{
122 int error, new;
123
124 new = V_tcp_mssdflt;
125 error = sysctl_handle_int(oidp, &new, 0, req);
126 if (error == 0 && req->newptr) {
127 if (new < TCP_MINMSS)
128 error = EINVAL;
129 else
130 V_tcp_mssdflt = new;
131 }
132 return (error);
133}
134
135SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
136 CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0,
137 &sysctl_net_inet_tcp_mss_check, "I",
138 "Default TCP Maximum Segment Size");
139
140#ifdef INET6
141static int
142sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
143{
144 int error, new;
145
146 new = V_tcp_v6mssdflt;
147 error = sysctl_handle_int(oidp, &new, 0, req);
148 if (error == 0 && req->newptr) {
149 if (new < TCP_MINMSS)
150 error = EINVAL;
151 else
152 V_tcp_v6mssdflt = new;
153 }
154 return (error);
155}
156
157SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
158 CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0,
159 &sysctl_net_inet_tcp_mss_v6_check, "I",
160 "Default TCP Maximum Segment Size for IPv6");
161#endif
162
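Both handlers above follow the same pattern: copy the current value out, let sysctl_handle_int() stage the new one, and commit it only if it clears the TCP_MINMSS floor. A minimal userland sketch of exercising the OID, assuming a FreeBSD libc with sysctlbyname(3); the 1460 value is only an illustration:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int mss;
        size_t len = sizeof(mss);

        /* Read the current default MSS. */
        if (sysctlbyname("net.inet.tcp.mssdflt", &mss, &len, NULL, 0) == 0)
                printf("net.inet.tcp.mssdflt: %d\n", mss);

        /*
         * Writes below TCP_MINMSS are rejected with EINVAL by the
         * handler above; 1460 is a typical Ethernet value.
         */
        mss = 1460;
        if (sysctlbyname("net.inet.tcp.mssdflt", NULL, NULL, &mss,
            sizeof(mss)) != 0)
                perror("sysctlbyname");
        return (0);
}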
163static int
164vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
165{
166
167 VNET_SYSCTL_ARG(req, arg1);
168 return (sysctl_msec_to_ticks(oidp, arg1, arg2, req));
169}
170
171/*
172 * Minimum MSS we accept and use. This prevents DoS attacks where
 173 * we are forced to a ridiculously low MSS like 20 and send hundreds
174 * of packets instead of one. The effect scales with the available
175 * bandwidth and quickly saturates the CPU and network interface
176 * with packet generation and sending. Set to zero to disable MINMSS
177 * checking. This setting prevents us from sending too small packets.
178 */
179VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
197SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
198 &VNET_NAME(tcp_minmss), 0,
199 "Minmum TCP Maximum Segment Size");
200
180SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
181 &VNET_NAME(tcp_minmss), 0,
182 "Minmum TCP Maximum Segment Size");
183
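To put numbers on the comment above, a standalone worked example of the per-window segment count at a pathological versus a normal MSS (the 64 KB window is illustrative only): moving one window at MSS 20 costs roughly 70x more packets than at MSS 1460, which is the CPU/interface saturation effect the comment warns about.

#include <stdio.h>

int
main(void)
{
        int window = 65535;     /* an unscaled maximum receive window */

        /* ceil(window / mss): segments needed to move one window */
        printf("segments at MSS 20:   %d\n", (window + 19) / 20);       /* 3277 */
        printf("segments at MSS 1460: %d\n", (window + 1459) / 1460);   /* 45 */
        return (0);
}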
184VNET_DEFINE(int, tcp_do_rfc1323) = 1;
201SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
202 &VNET_NAME(tcp_do_rfc1323), 0,
203 "Enable rfc1323 (high performance TCP) extensions");
204
205static int tcp_log_debug = 0;
206SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
207 &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
208
209static int tcp_tcbhashsize = 0;
210SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
211 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
212
213static int do_tcpdrain = 1;
214SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
215 "Enable tcp_drain routine for extra help when low on mbufs");
216
217SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
218 &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
219
185SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
186 &VNET_NAME(tcp_do_rfc1323), 0,
187 "Enable rfc1323 (high performance TCP) extensions");
188
189static int tcp_log_debug = 0;
190SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
191 &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
192
193static int tcp_tcbhashsize = 0;
194SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
195 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
196
197static int do_tcpdrain = 1;
198SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
199 "Enable tcp_drain routine for extra help when low on mbufs");
200
201SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
202 &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
203
204static VNET_DEFINE(int, icmp_may_rst) = 1;
205#define V_icmp_may_rst VNET(icmp_may_rst)
220SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
221 &VNET_NAME(icmp_may_rst), 0,
222 "Certain ICMP unreachable messages may abort connections in SYN_SENT");
223
206SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
207 &VNET_NAME(icmp_may_rst), 0,
208 "Certain ICMP unreachable messages may abort connections in SYN_SENT");
209
210static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0;
211#define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval)
224SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
225 &VNET_NAME(tcp_isn_reseed_interval), 0,
226 "Seconds between reseeding of ISN secret");
227
228/*
229 * TCP bandwidth limiting sysctls. Note that the default lower bound of
230 * 1024 exists only for debugging. A good production default would be
231 * something like 6100.
232 */
233SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
234 "TCP inflight data limiting");
235
212SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
213 &VNET_NAME(tcp_isn_reseed_interval), 0,
214 "Seconds between reseeding of ISN secret");
215
216/*
217 * TCP bandwidth limiting sysctls. Note that the default lower bound of
218 * 1024 exists only for debugging. A good production default would be
219 * something like 6100.
220 */
221SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
222 "TCP inflight data limiting");
223
224static VNET_DEFINE(int, tcp_inflight_enable) = 1;
225#define V_tcp_inflight_enable VNET(tcp_inflight_enable)
236SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
237 &VNET_NAME(tcp_inflight_enable), 0,
238 "Enable automatic TCP inflight data limiting");
239
240static int tcp_inflight_debug = 0;
241SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
242 &tcp_inflight_debug, 0,
243 "Debug TCP inflight calculations");
244
226SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
227 &VNET_NAME(tcp_inflight_enable), 0,
228 "Enable automatic TCP inflight data limiting");
229
230static int tcp_inflight_debug = 0;
231SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
232 &tcp_inflight_debug, 0,
233 "Debug TCP inflight calculations");
234
235static VNET_DEFINE(int, tcp_inflight_rttthresh);
236#define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh)
245SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh,
246 CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0,
247 vnet_sysctl_msec_to_ticks, "I",
248 "RTT threshold below which inflight will deactivate itself");
249
237SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh,
238 CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0,
239 vnet_sysctl_msec_to_ticks, "I",
240 "RTT threshold below which inflight will deactivate itself");
241
242static VNET_DEFINE(int, tcp_inflight_min) = 6144;
243#define V_tcp_inflight_min VNET(tcp_inflight_min)
250SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
251 &VNET_NAME(tcp_inflight_min), 0,
252 "Lower-bound for TCP inflight window");
253
244SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
245 &VNET_NAME(tcp_inflight_min), 0,
246 "Lower-bound for TCP inflight window");
247
248static VNET_DEFINE(int, tcp_inflight_max) = TCP_MAXWIN << TCP_MAX_WINSHIFT;
249#define V_tcp_inflight_max VNET(tcp_inflight_max)
254SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
255 &VNET_NAME(tcp_inflight_max), 0,
256 "Upper-bound for TCP inflight window");
257
250SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
251 &VNET_NAME(tcp_inflight_max), 0,
252 "Upper-bound for TCP inflight window");
253
254static VNET_DEFINE(int, tcp_inflight_stab) = 20;
255#define V_tcp_inflight_stab VNET(tcp_inflight_stab)
258SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
259 &VNET_NAME(tcp_inflight_stab), 0,
260 "Inflight Algorithm Stabilization 20 = 2 packets");
261
262#ifdef TCP_SORECEIVE_STREAM
263static int tcp_soreceive_stream = 0;
264SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
265 &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");
266#endif
267
268VNET_DEFINE(uma_zone_t, sack_hole_zone);
269#define V_sack_hole_zone VNET(sack_hole_zone)
270
271static struct inpcb *tcp_notify(struct inpcb *, int);
272static void tcp_isn_tick(void *);
273
274/*
275 * Target size of TCP PCB hash tables. Must be a power of two.
276 *
277 * Note that this can be overridden by the kernel environment
278 * variable net.inet.tcp.tcbhashsize
279 */
280#ifndef TCBHASHSIZE
281#define TCBHASHSIZE 512
282#endif
283
284/*
285 * XXX
286 * Callouts should be moved into struct tcp directly. They are currently
287 * separate because the tcpcb structure is exported to userland for sysctl
288 * parsing purposes, which do not know about callouts.
289 */
290struct tcpcb_mem {
291 struct tcpcb tcb;
292 struct tcp_timer tt;
293};
294
295static VNET_DEFINE(uma_zone_t, tcpcb_zone);
296#define V_tcpcb_zone VNET(tcpcb_zone)
297
298MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
299struct callout isn_callout;
300static struct mtx isn_mtx;
301
302#define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
303#define ISN_LOCK() mtx_lock(&isn_mtx)
304#define ISN_UNLOCK() mtx_unlock(&isn_mtx)
305
306/*
307 * TCP initialization.
308 */
309static void
310tcp_zone_change(void *tag)
311{
312
313 uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
314 uma_zone_set_max(V_tcpcb_zone, maxsockets);
315 tcp_tw_zone_change();
316}
317
318static int
319tcp_inpcb_init(void *mem, int size, int flags)
320{
321 struct inpcb *inp = mem;
322
323 INP_LOCK_INIT(inp, "inp", "tcpinp");
324 return (0);
325}
326
327void
328tcp_init(void)
329{
330 int hashsize;
331
256SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
257 &VNET_NAME(tcp_inflight_stab), 0,
258 "Inflight Algorithm Stabilization 20 = 2 packets");
259
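How these knobs combine is easier to see in one place. A hedged sketch, modeled on the bandwidth limiter implemented later in this file; the function and parameter names are local stand-ins, not kernel symbols, and the scaling in the real code differs slightly:

#include <sys/types.h>

static int64_t
inflight_bwnd(int64_t bw, int rtt_ticks, int hz, int maxseg,
    int stab, int bwnd_min, int bwnd_max)
{
        int64_t bwnd;

        /* bandwidth-delay product plus 'stab' tenths of a segment */
        bwnd = bw * rtt_ticks / hz + (int64_t)stab * maxseg / 10;
        if (bwnd < bwnd_min)            /* net.inet.tcp.inflight.min */
                bwnd = bwnd_min;
        if (bwnd > bwnd_max)            /* net.inet.tcp.inflight.max */
                bwnd = bwnd_max;
        return (bwnd);
}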
260#ifdef TCP_SORECEIVE_STREAM
261static int tcp_soreceive_stream = 0;
262SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
263 &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");
264#endif
265
266VNET_DEFINE(uma_zone_t, sack_hole_zone);
267#define V_sack_hole_zone VNET(sack_hole_zone)
268
269static struct inpcb *tcp_notify(struct inpcb *, int);
270static void tcp_isn_tick(void *);
271
272/*
273 * Target size of TCP PCB hash tables. Must be a power of two.
274 *
275 * Note that this can be overridden by the kernel environment
276 * variable net.inet.tcp.tcbhashsize
277 */
278#ifndef TCBHASHSIZE
279#define TCBHASHSIZE 512
280#endif
281
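tcp_init() below simply falls back to 512 when the tunable is not a power of two. A small sketch of the alternative of rounding up instead (hypothetical helper, not part of this file); round_pow2(1000) yields 1024, preserving the administrator's intent rather than discarding it:

static int
round_pow2(int n)
{
        int p = 1;

        while (p < n && p > 0)  /* guard against shifting into the sign bit */
                p <<= 1;
        return (p > 0 ? p : 512);
}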
282/*
283 * XXX
284 * Callouts should be moved into struct tcp directly. They are currently
285 * separate because the tcpcb structure is exported to userland for sysctl
286 * parsing purposes, which do not know about callouts.
287 */
288struct tcpcb_mem {
289 struct tcpcb tcb;
290 struct tcp_timer tt;
291};
292
293static VNET_DEFINE(uma_zone_t, tcpcb_zone);
294#define V_tcpcb_zone VNET(tcpcb_zone)
295
296MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
297struct callout isn_callout;
298static struct mtx isn_mtx;
299
300#define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
301#define ISN_LOCK() mtx_lock(&isn_mtx)
302#define ISN_UNLOCK() mtx_unlock(&isn_mtx)
303
304/*
305 * TCP initialization.
306 */
307static void
308tcp_zone_change(void *tag)
309{
310
311 uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
312 uma_zone_set_max(V_tcpcb_zone, maxsockets);
313 tcp_tw_zone_change();
314}
315
316static int
317tcp_inpcb_init(void *mem, int size, int flags)
318{
319 struct inpcb *inp = mem;
320
321 INP_LOCK_INIT(inp, "inp", "tcpinp");
322 return (0);
323}
324
325void
326tcp_init(void)
327{
328 int hashsize;
329
332 V_blackhole = 0;
333 V_tcp_delack_enabled = 1;
334 V_drop_synfin = 0;
335 V_tcp_do_rfc3042 = 1;
336 V_tcp_do_rfc3390 = 1;
337 V_tcp_do_ecn = 0;
338 V_tcp_ecn_maxretries = 1;
339 V_tcp_insecure_rst = 0;
340 V_tcp_do_autorcvbuf = 1;
341 V_tcp_autorcvbuf_inc = 16*1024;
342 V_tcp_autorcvbuf_max = 256*1024;
343 V_tcp_do_rfc3465 = 1;
344 V_tcp_abc_l_var = 2;
345
346 V_tcp_mssdflt = TCP_MSS;
347#ifdef INET6
348 V_tcp_v6mssdflt = TCP6_MSS;
349#endif
350 V_tcp_minmss = TCP_MINMSS;
351 V_tcp_do_rfc1323 = 1;
352 V_icmp_may_rst = 1;
353 V_tcp_isn_reseed_interval = 0;
354 V_tcp_inflight_enable = 1;
355 V_tcp_inflight_min = 6144;
356 V_tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
357 V_tcp_inflight_stab = 20;
358
359 V_path_mtu_discovery = 1;
360 V_ss_fltsz = 1;
361 V_ss_fltsz_local = 4;
362 V_tcp_do_newreno = 1;
363 V_tcp_do_tso = 1;
364 V_tcp_do_autosndbuf = 1;
365 V_tcp_autosndbuf_inc = 8*1024;
366 V_tcp_autosndbuf_max = 256*1024;
367
368 V_nolocaltimewait = 0;
369
370 V_tcp_do_sack = 1;
371 V_tcp_sack_maxholes = 128;
372 V_tcp_sack_globalmaxholes = 65536;
373 V_tcp_sack_globalholes = 0;
374
375 V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
376
377 TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
378
379 hashsize = TCBHASHSIZE;
380 TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
381 if (!powerof2(hashsize)) {
382 printf("WARNING: TCB hash size not a power of 2\n");
383 hashsize = 512; /* safe default */
384 }
385 in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
386 "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE);
387
330 hashsize = TCBHASHSIZE;
331 TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
332 if (!powerof2(hashsize)) {
333 printf("WARNING: TCB hash size not a power of 2\n");
334 hashsize = 512; /* safe default */
335 }
336 in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
337 "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE);
338
339 V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
340
388 /*
389 * These have to be type stable for the benefit of the timers.
390 */
391 V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
392 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
393 uma_zone_set_max(V_tcpcb_zone, maxsockets);
341 /*
342 * These have to be type stable for the benefit of the timers.
343 */
344 V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
345 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
346 uma_zone_set_max(V_tcpcb_zone, maxsockets);
347
394 tcp_tw_init();
395 syncache_init();
396 tcp_hc_init();
397 tcp_reass_init();
348 tcp_tw_init();
349 syncache_init();
350 tcp_hc_init();
351 tcp_reass_init();
352
353 TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
398 V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
399 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
400
401 /* Skip initialization of globals for non-default instances. */
402 if (!IS_DEFAULT_VNET(curvnet))
403 return;
404
 405 /* XXX virtualize those below? */
406 tcp_delacktime = TCPTV_DELACK;
407 tcp_keepinit = TCPTV_KEEP_INIT;
408 tcp_keepidle = TCPTV_KEEP_IDLE;
409 tcp_keepintvl = TCPTV_KEEPINTVL;
410 tcp_maxpersistidle = TCPTV_KEEP_IDLE;
411 tcp_msl = TCPTV_MSL;
412 tcp_rexmit_min = TCPTV_MIN;
413 if (tcp_rexmit_min < 1)
414 tcp_rexmit_min = 1;
415 tcp_rexmit_slop = TCPTV_CPU_VAR;
416 tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
417 tcp_tcbhashsize = hashsize;
418
419#ifdef TCP_SORECEIVE_STREAM
420 TUNABLE_INT_FETCH("net.inet.tcp.soreceive_stream", &tcp_soreceive_stream);
421 if (tcp_soreceive_stream) {
422 tcp_usrreqs.pru_soreceive = soreceive_stream;
423 tcp6_usrreqs.pru_soreceive = soreceive_stream;
424 }
425#endif
426
427#ifdef INET6
428#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
429#else /* INET6 */
430#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
431#endif /* INET6 */
432 if (max_protohdr < TCP_MINPROTOHDR)
433 max_protohdr = TCP_MINPROTOHDR;
434 if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
435 panic("tcp_init");
436#undef TCP_MINPROTOHDR
437
438 ISN_LOCK_INIT();
439 callout_init(&isn_callout, CALLOUT_MPSAFE);
440 callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
441 EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
442 SHUTDOWN_PRI_DEFAULT);
443 EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
444 EVENTHANDLER_PRI_ANY);
445}
446
447#ifdef VIMAGE
448void
449tcp_destroy(void)
450{
451
452 tcp_reass_destroy();
453 tcp_hc_destroy();
454 syncache_destroy();
455 tcp_tw_destroy();
456 in_pcbinfo_destroy(&V_tcbinfo);
457 uma_zdestroy(V_sack_hole_zone);
458 uma_zdestroy(V_tcpcb_zone);
459}
460#endif
461
462void
463tcp_fini(void *xtp)
464{
465
466 callout_stop(&isn_callout);
467}
468
469/*
470 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
471 * tcp_template used to store this data in mbufs, but we now recopy it out
472 * of the tcpcb each time to conserve mbufs.
473 */
474void
475tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
476{
477 struct tcphdr *th = (struct tcphdr *)tcp_ptr;
478
479 INP_WLOCK_ASSERT(inp);
480
481#ifdef INET6
482 if ((inp->inp_vflag & INP_IPV6) != 0) {
483 struct ip6_hdr *ip6;
484
485 ip6 = (struct ip6_hdr *)ip_ptr;
486 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
487 (inp->inp_flow & IPV6_FLOWINFO_MASK);
488 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
489 (IPV6_VERSION & IPV6_VERSION_MASK);
490 ip6->ip6_nxt = IPPROTO_TCP;
491 ip6->ip6_plen = htons(sizeof(struct tcphdr));
492 ip6->ip6_src = inp->in6p_laddr;
493 ip6->ip6_dst = inp->in6p_faddr;
494 } else
495#endif
496 {
497 struct ip *ip;
498
499 ip = (struct ip *)ip_ptr;
500 ip->ip_v = IPVERSION;
501 ip->ip_hl = 5;
502 ip->ip_tos = inp->inp_ip_tos;
503 ip->ip_len = 0;
504 ip->ip_id = 0;
505 ip->ip_off = 0;
506 ip->ip_ttl = inp->inp_ip_ttl;
507 ip->ip_sum = 0;
508 ip->ip_p = IPPROTO_TCP;
509 ip->ip_src = inp->inp_laddr;
510 ip->ip_dst = inp->inp_faddr;
511 }
512 th->th_sport = inp->inp_lport;
513 th->th_dport = inp->inp_fport;
514 th->th_seq = 0;
515 th->th_ack = 0;
516 th->th_x2 = 0;
517 th->th_off = 5;
518 th->th_flags = 0;
519 th->th_win = 0;
520 th->th_urp = 0;
521 th->th_sum = 0; /* in_pseudo() is called later for ipv4 */
522}
523
524/*
525 * Create template to be used to send tcp packets on a connection.
526 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
527 * use for this function is in keepalives, which use tcp_respond.
528 */
529struct tcptemp *
530tcpip_maketemplate(struct inpcb *inp)
531{
532 struct tcptemp *t;
533
534 t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
535 if (t == NULL)
536 return (NULL);
537 tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
538 return (t);
539}
540
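A hedged sketch of that keepalive path, modeled on the keep timer in tcp_timer.c; kernel context and locks are assumed, and only the names from the functions above are used:

static void
send_keepalive_probe(struct tcpcb *tp, struct inpcb *inp)
{
        struct tcptemp *t;

        t = tcpip_maketemplate(inp);
        if (t == NULL)
                return;
        /*
         * A bare ACK for one byte below snd_una elicits an ACK from a
         * live peer without carrying new data.
         */
        tcp_respond(tp, t->tt_ipgen, &t->tt_t, (struct mbuf *)NULL,
            tp->rcv_nxt, tp->snd_una - 1, 0);
        free(t, M_TEMP);
}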
541/*
542 * Send a single message to the TCP at address specified by
543 * the given TCP/IP header. If m == NULL, then we make a copy
544 * of the tcpiphdr at ti and send directly to the addressed host.
545 * This is used to force keep alive messages out using the TCP
546 * template for a connection. If flags are given then we send
 547 * a message back to the TCP which originated the segment ti,
548 * and discard the mbuf containing it and any other attached mbufs.
549 *
550 * In any case the ack and sequence number of the transmitted
551 * segment are as specified by the parameters.
552 *
553 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
554 */
555void
556tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
557 tcp_seq ack, tcp_seq seq, int flags)
558{
559 int tlen;
560 int win = 0;
561 struct ip *ip;
562 struct tcphdr *nth;
563#ifdef INET6
564 struct ip6_hdr *ip6;
565 int isipv6;
566#endif /* INET6 */
567 int ipflags = 0;
568 struct inpcb *inp;
569
570 KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
571
572#ifdef INET6
573 isipv6 = ((struct ip *)ipgen)->ip_v == 6;
574 ip6 = ipgen;
575#endif /* INET6 */
576 ip = ipgen;
577
578 if (tp != NULL) {
579 inp = tp->t_inpcb;
580 KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
581 INP_WLOCK_ASSERT(inp);
582 } else
583 inp = NULL;
584
585 if (tp != NULL) {
586 if (!(flags & TH_RST)) {
587 win = sbspace(&inp->inp_socket->so_rcv);
588 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
589 win = (long)TCP_MAXWIN << tp->rcv_scale;
590 }
591 }
592 if (m == NULL) {
593 m = m_gethdr(M_DONTWAIT, MT_DATA);
594 if (m == NULL)
595 return;
596 tlen = 0;
597 m->m_data += max_linkhdr;
598#ifdef INET6
599 if (isipv6) {
600 bcopy((caddr_t)ip6, mtod(m, caddr_t),
601 sizeof(struct ip6_hdr));
602 ip6 = mtod(m, struct ip6_hdr *);
603 nth = (struct tcphdr *)(ip6 + 1);
604 } else
605#endif /* INET6 */
606 {
607 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
608 ip = mtod(m, struct ip *);
609 nth = (struct tcphdr *)(ip + 1);
610 }
611 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
612 flags = TH_ACK;
613 } else {
614 /*
615 * reuse the mbuf.
 616 * XXX MRT We inherit the FIB, which is lucky.
617 */
618 m_freem(m->m_next);
619 m->m_next = NULL;
620 m->m_data = (caddr_t)ipgen;
621 /* m_len is set later */
622 tlen = 0;
623#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
624#ifdef INET6
625 if (isipv6) {
626 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
627 nth = (struct tcphdr *)(ip6 + 1);
628 } else
629#endif /* INET6 */
630 {
631 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
632 nth = (struct tcphdr *)(ip + 1);
633 }
634 if (th != nth) {
635 /*
636 * this is usually a case when an extension header
637 * exists between the IPv6 header and the
638 * TCP header.
639 */
640 nth->th_sport = th->th_sport;
641 nth->th_dport = th->th_dport;
642 }
643 xchg(nth->th_dport, nth->th_sport, uint16_t);
644#undef xchg
645 }
646#ifdef INET6
647 if (isipv6) {
648 ip6->ip6_flow = 0;
649 ip6->ip6_vfc = IPV6_VERSION;
650 ip6->ip6_nxt = IPPROTO_TCP;
651 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
652 tlen));
653 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
654 } else
655#endif
656 {
657 tlen += sizeof (struct tcpiphdr);
658 ip->ip_len = tlen;
659 ip->ip_ttl = V_ip_defttl;
660 if (V_path_mtu_discovery)
661 ip->ip_off |= IP_DF;
662 }
663 m->m_len = tlen;
664 m->m_pkthdr.len = tlen;
665 m->m_pkthdr.rcvif = NULL;
666#ifdef MAC
667 if (inp != NULL) {
668 /*
669 * Packet is associated with a socket, so allow the
670 * label of the response to reflect the socket label.
671 */
672 INP_WLOCK_ASSERT(inp);
673 mac_inpcb_create_mbuf(inp, m);
674 } else {
675 /*
676 * Packet is not associated with a socket, so possibly
677 * update the label in place.
678 */
679 mac_netinet_tcp_reply(m);
680 }
681#endif
682 nth->th_seq = htonl(seq);
683 nth->th_ack = htonl(ack);
684 nth->th_x2 = 0;
685 nth->th_off = sizeof (struct tcphdr) >> 2;
686 nth->th_flags = flags;
687 if (tp != NULL)
688 nth->th_win = htons((u_short) (win >> tp->rcv_scale));
689 else
690 nth->th_win = htons((u_short)win);
691 nth->th_urp = 0;
692#ifdef INET6
693 if (isipv6) {
694 nth->th_sum = 0;
695 nth->th_sum = in6_cksum(m, IPPROTO_TCP,
696 sizeof(struct ip6_hdr),
697 tlen - sizeof(struct ip6_hdr));
698 ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
699 NULL, NULL);
700 } else
701#endif /* INET6 */
702 {
703 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
704 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
705 m->m_pkthdr.csum_flags = CSUM_TCP;
706 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
707 }
708#ifdef TCPDEBUG
709 if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
710 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
711#endif
712#ifdef INET6
713 if (isipv6)
714 (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
715 else
716#endif /* INET6 */
717 (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
718}
719
720/*
721 * Create a new TCP control block, making an
722 * empty reassembly queue and hooking it to the argument
723 * protocol control block. The `inp' parameter must have
724 * come from the zone allocator set up in tcp_init().
725 */
726struct tcpcb *
727tcp_newtcpcb(struct inpcb *inp)
728{
729 struct tcpcb_mem *tm;
730 struct tcpcb *tp;
731#ifdef INET6
732 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
733#endif /* INET6 */
734
735 tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
736 if (tm == NULL)
737 return (NULL);
738 tp = &tm->tcb;
739#ifdef VIMAGE
740 tp->t_vnet = inp->inp_vnet;
741#endif
742 tp->t_timers = &tm->tt;
743 /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */
744 tp->t_maxseg = tp->t_maxopd =
745#ifdef INET6
746 isipv6 ? V_tcp_v6mssdflt :
747#endif /* INET6 */
748 V_tcp_mssdflt;
749
750 /* Set up our timeouts. */
751 callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE);
752 callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE);
753 callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
754 callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
755 callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
756
757 if (V_tcp_do_rfc1323)
758 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
759 if (V_tcp_do_sack)
760 tp->t_flags |= TF_SACK_PERMIT;
761 TAILQ_INIT(&tp->snd_holes);
762 tp->t_inpcb = inp; /* XXX */
763 /*
764 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
765 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
766 * reasonable initial retransmit time.
767 */
768 tp->t_srtt = TCPTV_SRTTBASE;
769 tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
770 tp->t_rttmin = tcp_rexmit_min;
771 tp->t_rxtcur = TCPTV_RTOBASE;
772 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
773 tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
774 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
775 tp->t_rcvtime = ticks;
776 tp->t_bw_rtttime = ticks;
777 /*
778 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
779 * because the socket may be bound to an IPv6 wildcard address,
780 * which may match an IPv4-mapped IPv6 address.
781 */
782 inp->inp_ip_ttl = V_ip_defttl;
783 inp->inp_ppcb = tp;
784 return (tp); /* XXX */
785}
786
787/*
788 * Drop a TCP connection, reporting
789 * the specified error. If connection is synchronized,
790 * then send a RST to peer.
791 */
792struct tcpcb *
793tcp_drop(struct tcpcb *tp, int errno)
794{
795 struct socket *so = tp->t_inpcb->inp_socket;
796
797 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
798 INP_WLOCK_ASSERT(tp->t_inpcb);
799
800 if (TCPS_HAVERCVDSYN(tp->t_state)) {
801 tp->t_state = TCPS_CLOSED;
802 (void) tcp_output_reset(tp);
803 TCPSTAT_INC(tcps_drops);
804 } else
805 TCPSTAT_INC(tcps_conndrops);
806 if (errno == ETIMEDOUT && tp->t_softerror)
807 errno = tp->t_softerror;
808 so->so_error = errno;
809 return (tcp_close(tp));
810}
811
812void
813tcp_discardcb(struct tcpcb *tp)
814{
815 struct tseg_qent *q;
816 struct inpcb *inp = tp->t_inpcb;
817 struct socket *so = inp->inp_socket;
818#ifdef INET6
819 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
820#endif /* INET6 */
821
822 INP_WLOCK_ASSERT(inp);
823
824 /*
825 * Make sure that all of our timers are stopped before we delete the
826 * PCB.
827 *
828 * XXXRW: Really, we would like to use callout_drain() here in order
829 * to avoid races experienced in tcp_timer.c where a timer is already
830 * executing at this point. However, we can't, both because we're
831 * running in a context where we can't sleep, and also because we
832 * hold locks required by the timers. What we instead need to do is
833 * test to see if callout_drain() is required, and if so, defer some
834 * portion of the remainder of tcp_discardcb() to an asynchronous
835 * context that can callout_drain() and then continue. Some care
836 * will be required to ensure that no further processing takes place
837 * on the tcpcb, even though it hasn't been freed (a flag?).
838 */
839 callout_stop(&tp->t_timers->tt_rexmt);
840 callout_stop(&tp->t_timers->tt_persist);
841 callout_stop(&tp->t_timers->tt_keep);
842 callout_stop(&tp->t_timers->tt_2msl);
843 callout_stop(&tp->t_timers->tt_delack);
844
845 /*
846 * If we got enough samples through the srtt filter,
847 * save the rtt and rttvar in the routing entry.
848 * 'Enough' is arbitrarily defined as 4 rtt samples.
849 * 4 samples is enough for the srtt filter to converge
850 * to within enough % of the correct value; fewer samples
851 * and we could save a bogus rtt. The danger is not high
852 * as tcp quickly recovers from everything.
853 * XXX: Works very well but needs some more statistics!
854 */
855 if (tp->t_rttupdated >= 4) {
856 struct hc_metrics_lite metrics;
857 u_long ssthresh;
858
859 bzero(&metrics, sizeof(metrics));
860 /*
861 * Update the ssthresh always when the conditions below
862 * are satisfied. This gives us better new start value
863 * for the congestion avoidance for new connections.
 864 * ssthresh is only set if packet loss occurred on a session.
865 *
866 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
867 * being torn down. Ideally this code would not use 'so'.
868 */
869 ssthresh = tp->snd_ssthresh;
870 if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
871 /*
872 * convert the limit from user data bytes to
873 * packets then to packet data bytes.
874 */
875 ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
876 if (ssthresh < 2)
877 ssthresh = 2;
878 ssthresh *= (u_long)(tp->t_maxseg +
879#ifdef INET6
880 (isipv6 ? sizeof (struct ip6_hdr) +
881 sizeof (struct tcphdr) :
882#endif
883 sizeof (struct tcpiphdr)
884#ifdef INET6
885 )
886#endif
887 );
888 } else
889 ssthresh = 0;
890 metrics.rmx_ssthresh = ssthresh;
891
892 metrics.rmx_rtt = tp->t_srtt;
893 metrics.rmx_rttvar = tp->t_rttvar;
894 /* XXX: This wraps if the pipe is more than 4 Gbit per second */
895 metrics.rmx_bandwidth = tp->snd_bandwidth;
896 metrics.rmx_cwnd = tp->snd_cwnd;
897 metrics.rmx_sendpipe = 0;
898 metrics.rmx_recvpipe = 0;
899
900 tcp_hc_update(&inp->inp_inc, &metrics);
901 }
902
903 /* free the reassembly queue, if any */
904 while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
905 LIST_REMOVE(q, tqe_q);
906 m_freem(q->tqe_m);
907 uma_zfree(V_tcp_reass_zone, q);
908 tp->t_segqlen--;
909 V_tcp_reass_qsize--;
910 }
911 /* Disconnect offload device, if any. */
912 tcp_offload_detach(tp);
913
914 tcp_free_sackholes(tp);
915 inp->inp_ppcb = NULL;
916 tp->t_inpcb = NULL;
917 uma_zfree(V_tcpcb_zone, tp);
918}
919
920/*
921 * Attempt to close a TCP control block, marking it as dropped, and freeing
922 * the socket if we hold the only reference.
923 */
924struct tcpcb *
925tcp_close(struct tcpcb *tp)
926{
927 struct inpcb *inp = tp->t_inpcb;
928 struct socket *so;
929
930 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
931 INP_WLOCK_ASSERT(inp);
932
933 /* Notify any offload devices of listener close */
934 if (tp->t_state == TCPS_LISTEN)
935 tcp_offload_listen_close(tp);
936 in_pcbdrop(inp);
937 TCPSTAT_INC(tcps_closed);
938 KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
939 so = inp->inp_socket;
940 soisdisconnected(so);
941 if (inp->inp_flags & INP_SOCKREF) {
942 KASSERT(so->so_state & SS_PROTOREF,
943 ("tcp_close: !SS_PROTOREF"));
944 inp->inp_flags &= ~INP_SOCKREF;
945 INP_WUNLOCK(inp);
946 ACCEPT_LOCK();
947 SOCK_LOCK(so);
948 so->so_state &= ~SS_PROTOREF;
949 sofree(so);
950 return (NULL);
951 }
952 return (tp);
953}
954
955void
956tcp_drain(void)
957{
958 VNET_ITERATOR_DECL(vnet_iter);
959
960 if (!do_tcpdrain)
961 return;
962
963 VNET_LIST_RLOCK_NOSLEEP();
964 VNET_FOREACH(vnet_iter) {
965 CURVNET_SET(vnet_iter);
966 struct inpcb *inpb;
967 struct tcpcb *tcpb;
968 struct tseg_qent *te;
969
970 /*
971 * Walk the tcpbs, if existing, and flush the reassembly queue,
972 * if there is one...
973 * XXX: The "Net/3" implementation doesn't imply that the TCP
974 * reassembly queue should be flushed, but in a situation
975 * where we're really low on mbufs, this is potentially
 976 * useful.
977 */
978 INP_INFO_RLOCK(&V_tcbinfo);
979 LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
980 if (inpb->inp_flags & INP_TIMEWAIT)
981 continue;
982 INP_WLOCK(inpb);
983 if ((tcpb = intotcpcb(inpb)) != NULL) {
984 while ((te = LIST_FIRST(&tcpb->t_segq))
985 != NULL) {
986 LIST_REMOVE(te, tqe_q);
987 m_freem(te->tqe_m);
988 uma_zfree(V_tcp_reass_zone, te);
989 tcpb->t_segqlen--;
990 V_tcp_reass_qsize--;
991 }
992 tcp_clean_sackreport(tcpb);
993 }
994 INP_WUNLOCK(inpb);
995 }
996 INP_INFO_RUNLOCK(&V_tcbinfo);
997 CURVNET_RESTORE();
998 }
999 VNET_LIST_RUNLOCK_NOSLEEP();
1000}
1001
1002/*
1003 * Notify a tcp user of an asynchronous error;
1004 * store error as soft error, but wake up user
1005 * (for now, won't do anything until can select for soft error).
1006 *
1007 * Do not wake up user since there currently is no mechanism for
1008 * reporting soft errors (yet - a kqueue filter may be added).
1009 */
1010static struct inpcb *
1011tcp_notify(struct inpcb *inp, int error)
1012{
1013 struct tcpcb *tp;
1014
1015 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
1016 INP_WLOCK_ASSERT(inp);
1017
1018 if ((inp->inp_flags & INP_TIMEWAIT) ||
1019 (inp->inp_flags & INP_DROPPED))
1020 return (inp);
1021
1022 tp = intotcpcb(inp);
1023 KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
1024
1025 /*
1026 * Ignore some errors if we are hooked up.
1027 * If connection hasn't completed, has retransmitted several times,
1028 * and receives a second error, give up now. This is better
1029 * than waiting a long time to establish a connection that
1030 * can never complete.
1031 */
1032 if (tp->t_state == TCPS_ESTABLISHED &&
1033 (error == EHOSTUNREACH || error == ENETUNREACH ||
1034 error == EHOSTDOWN)) {
1035 return (inp);
1036 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
1037 tp->t_softerror) {
1038 tp = tcp_drop(tp, error);
1039 if (tp != NULL)
1040 return (inp);
1041 else
1042 return (NULL);
1043 } else {
1044 tp->t_softerror = error;
1045 return (inp);
1046 }
1047#if 0
1048 wakeup( &so->so_timeo);
1049 sorwakeup(so);
1050 sowwakeup(so);
1051#endif
1052}
1053
1054static int
1055tcp_pcblist(SYSCTL_HANDLER_ARGS)
1056{
1057 int error, i, m, n, pcb_count;
1058 struct inpcb *inp, **inp_list;
1059 inp_gen_t gencnt;
1060 struct xinpgen xig;
1061
1062 /*
1063 * The process of preparing the TCB list is too time-consuming and
1064 * resource-intensive to repeat twice on every request.
1065 */
1066 if (req->oldptr == NULL) {
1067 m = syncache_pcbcount();
1068 n = V_tcbinfo.ipi_count;
1069 req->oldidx = 2 * (sizeof xig)
1070 + ((m + n) + n/8) * sizeof(struct xtcpcb);
1071 return (0);
1072 }
1073
1074 if (req->newptr != NULL)
1075 return (EPERM);
1076
1077 /*
1078 * OK, now we're committed to doing something.
1079 */
1080 INP_INFO_RLOCK(&V_tcbinfo);
1081 gencnt = V_tcbinfo.ipi_gencnt;
1082 n = V_tcbinfo.ipi_count;
1083 INP_INFO_RUNLOCK(&V_tcbinfo);
1084
1085 m = syncache_pcbcount();
1086
1087 error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
1088 + (n + m) * sizeof(struct xtcpcb));
1089 if (error != 0)
1090 return (error);
1091
1092 xig.xig_len = sizeof xig;
1093 xig.xig_count = n + m;
1094 xig.xig_gen = gencnt;
1095 xig.xig_sogen = so_gencnt;
1096 error = SYSCTL_OUT(req, &xig, sizeof xig);
1097 if (error)
1098 return (error);
1099
1100 error = syncache_pcblist(req, m, &pcb_count);
1101 if (error)
1102 return (error);
1103
1104 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
1105 if (inp_list == NULL)
1106 return (ENOMEM);
1107
1108 INP_INFO_RLOCK(&V_tcbinfo);
1109 for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
1110 inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
1111 INP_WLOCK(inp);
1112 if (inp->inp_gencnt <= gencnt) {
1113 /*
1114 * XXX: This use of cr_cansee(), introduced with
1115 * TCP state changes, is not quite right, but for
1116 * now, better than nothing.
1117 */
1118 if (inp->inp_flags & INP_TIMEWAIT) {
1119 if (intotw(inp) != NULL)
1120 error = cr_cansee(req->td->td_ucred,
1121 intotw(inp)->tw_cred);
1122 else
1123 error = EINVAL; /* Skip this inp. */
1124 } else
1125 error = cr_canseeinpcb(req->td->td_ucred, inp);
1126 if (error == 0) {
1127 in_pcbref(inp);
1128 inp_list[i++] = inp;
1129 }
1130 }
1131 INP_WUNLOCK(inp);
1132 }
1133 INP_INFO_RUNLOCK(&V_tcbinfo);
1134 n = i;
1135
1136 error = 0;
1137 for (i = 0; i < n; i++) {
1138 inp = inp_list[i];
1139 INP_RLOCK(inp);
1140 if (inp->inp_gencnt <= gencnt) {
1141 struct xtcpcb xt;
1142 void *inp_ppcb;
1143
1144 bzero(&xt, sizeof(xt));
1145 xt.xt_len = sizeof xt;
1146 /* XXX should avoid extra copy */
1147 bcopy(inp, &xt.xt_inp, sizeof *inp);
1148 inp_ppcb = inp->inp_ppcb;
1149 if (inp_ppcb == NULL)
1150 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
1151 else if (inp->inp_flags & INP_TIMEWAIT) {
1152 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
1153 xt.xt_tp.t_state = TCPS_TIME_WAIT;
1154 } else {
1155 bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
1156 if (xt.xt_tp.t_timers)
1157 tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer);
1158 }
1159 if (inp->inp_socket != NULL)
1160 sotoxsocket(inp->inp_socket, &xt.xt_socket);
1161 else {
1162 bzero(&xt.xt_socket, sizeof xt.xt_socket);
1163 xt.xt_socket.xso_protocol = IPPROTO_TCP;
1164 }
1165 xt.xt_inp.inp_gencnt = inp->inp_gencnt;
1166 INP_RUNLOCK(inp);
1167 error = SYSCTL_OUT(req, &xt, sizeof xt);
1168 } else
1169 INP_RUNLOCK(inp);
1170 }
1171 INP_INFO_WLOCK(&V_tcbinfo);
1172 for (i = 0; i < n; i++) {
1173 inp = inp_list[i];
1174 INP_WLOCK(inp);
1175 if (!in_pcbrele(inp))
1176 INP_WUNLOCK(inp);
1177 }
1178 INP_INFO_WUNLOCK(&V_tcbinfo);
1179
1180 if (!error) {
1181 /*
1182 * Give the user an updated idea of our state.
1183 * If the generation differs from what we told
1184 * her before, she knows that something happened
1185 * while we were processing this request, and it
1186 * might be necessary to retry.
1187 */
1188 INP_INFO_RLOCK(&V_tcbinfo);
1189 xig.xig_gen = V_tcbinfo.ipi_gencnt;
1190 xig.xig_sogen = so_gencnt;
1191 xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
1192 INP_INFO_RUNLOCK(&V_tcbinfo);
1193 error = SYSCTL_OUT(req, &xig, sizeof xig);
1194 }
1195 free(inp_list, M_TEMP);
1196 return (error);
1197}
1198
1199SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
1200 tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
1201
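The exported format is consumed from userland the way netstat(1) does: a leading struct xinpgen, the xtcpcb records, then a trailing xinpgen with the updated generation and count. A minimal sketch that fetches the blob and reads the leading header (the header list is approximate; the structs come from the same headers this file includes):

#include <sys/types.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        struct xinpgen *xig;
        size_t len = 0;
        char *buf;

        if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) != 0)
                return (1);
        if ((buf = malloc(len)) == NULL ||
            sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0) != 0)
                return (1);
        xig = (struct xinpgen *)buf;
        printf("%u PCBs at generation %ju\n", xig->xig_count,
            (uintmax_t)xig->xig_gen);
        free(buf);
        return (0);
}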
1202static int
1203tcp_getcred(SYSCTL_HANDLER_ARGS)
1204{
1205 struct xucred xuc;
1206 struct sockaddr_in addrs[2];
1207 struct inpcb *inp;
1208 int error;
1209
1210 error = priv_check(req->td, PRIV_NETINET_GETCRED);
1211 if (error)
1212 return (error);
1213 error = SYSCTL_IN(req, addrs, sizeof(addrs));
1214 if (error)
1215 return (error);
1216 INP_INFO_RLOCK(&V_tcbinfo);
1217 inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr,
1218 addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
1219 if (inp != NULL) {
1220 INP_RLOCK(inp);
1221 INP_INFO_RUNLOCK(&V_tcbinfo);
1222 if (inp->inp_socket == NULL)
1223 error = ENOENT;
1224 if (error == 0)
1225 error = cr_canseeinpcb(req->td->td_ucred, inp);
1226 if (error == 0)
1227 cru2x(inp->inp_cred, &xuc);
1228 INP_RUNLOCK(inp);
1229 } else {
1230 INP_INFO_RUNLOCK(&V_tcbinfo);
1231 error = ENOENT;
1232 }
1233 if (error == 0)
1234 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
1235 return (error);
1236}
1237
1238SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
1239 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
1240 tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
1241
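A userland sketch of driving this OID, the identd-style use case; per the in_pcblookup_hash() call above, addrs[0] carries the local endpoint and addrs[1] the foreign one, both in network byte order. The address values are placeholders to be filled in:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/ucred.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        struct sockaddr_in addrs[2];
        struct xucred xuc;
        size_t len = sizeof(xuc);

        memset(addrs, 0, sizeof(addrs));
        addrs[0].sin_len = addrs[1].sin_len = sizeof(struct sockaddr_in);
        addrs[0].sin_family = addrs[1].sin_family = AF_INET;
        /* addrs[0]: local addr/port; addrs[1]: foreign addr/port */

        if (sysctlbyname("net.inet.tcp.getcred", &xuc, &len,
            addrs, sizeof(addrs)) == 0)
                printf("connection owner uid: %u\n", (unsigned)xuc.cr_uid);
        else
                perror("net.inet.tcp.getcred");
        return (0);
}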
1242#ifdef INET6
1243static int
1244tcp6_getcred(SYSCTL_HANDLER_ARGS)
1245{
1246 struct xucred xuc;
1247 struct sockaddr_in6 addrs[2];
1248 struct inpcb *inp;
1249 int error, mapped = 0;
1250
1251 error = priv_check(req->td, PRIV_NETINET_GETCRED);
1252 if (error)
1253 return (error);
1254 error = SYSCTL_IN(req, addrs, sizeof(addrs));
1255 if (error)
1256 return (error);
1257 if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
1258 (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
1259 return (error);
1260 }
1261 if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
1262 if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
1263 mapped = 1;
1264 else
1265 return (EINVAL);
1266 }
1267
1268 INP_INFO_RLOCK(&V_tcbinfo);
1269 if (mapped == 1)
1270 inp = in_pcblookup_hash(&V_tcbinfo,
1271 *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
1272 addrs[1].sin6_port,
1273 *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
1274 addrs[0].sin6_port,
1275 0, NULL);
1276 else
1277 inp = in6_pcblookup_hash(&V_tcbinfo,
1278 &addrs[1].sin6_addr, addrs[1].sin6_port,
1279 &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL);
1280 if (inp != NULL) {
1281 INP_RLOCK(inp);
1282 INP_INFO_RUNLOCK(&V_tcbinfo);
1283 if (inp->inp_socket == NULL)
1284 error = ENOENT;
1285 if (error == 0)
1286 error = cr_canseeinpcb(req->td->td_ucred, inp);
1287 if (error == 0)
1288 cru2x(inp->inp_cred, &xuc);
1289 INP_RUNLOCK(inp);
1290 } else {
1291 INP_INFO_RUNLOCK(&V_tcbinfo);
1292 error = ENOENT;
1293 }
1294 if (error == 0)
1295 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
1296 return (error);
1297}
1298
1299SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
1300 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
1301 tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
1302#endif
1303
1304
1305void
1306tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
1307{
1308 struct ip *ip = vip;
1309 struct tcphdr *th;
1310 struct in_addr faddr;
1311 struct inpcb *inp;
1312 struct tcpcb *tp;
1313 struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
1314 struct icmp *icp;
1315 struct in_conninfo inc;
1316 tcp_seq icmp_tcp_seq;
1317 int mtu;
1318
1319 faddr = ((struct sockaddr_in *)sa)->sin_addr;
1320 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1321 return;
1322
1323 if (cmd == PRC_MSGSIZE)
1324 notify = tcp_mtudisc;
1325 else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
1326 cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
1327 notify = tcp_drop_syn_sent;
1328 /*
1329 * Redirects don't need to be handled up here.
1330 */
1331 else if (PRC_IS_REDIRECT(cmd))
1332 return;
1333 /*
 1334 * Source quench is deprecated.
1335 */
1336 else if (cmd == PRC_QUENCH)
1337 return;
1338 /*
1339 * Hostdead is ugly because it goes linearly through all PCBs.
1340 * XXX: We never get this from ICMP, otherwise it makes an
1341 * excellent DoS attack on machines with many connections.
1342 */
1343 else if (cmd == PRC_HOSTDEAD)
1344 ip = NULL;
1345 else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
1346 return;
1347 if (ip != NULL) {
1348 icp = (struct icmp *)((caddr_t)ip
1349 - offsetof(struct icmp, icmp_ip));
1350 th = (struct tcphdr *)((caddr_t)ip
1351 + (ip->ip_hl << 2));
1352 INP_INFO_WLOCK(&V_tcbinfo);
1353 inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport,
1354 ip->ip_src, th->th_sport, 0, NULL);
1355 if (inp != NULL) {
1356 INP_WLOCK(inp);
1357 if (!(inp->inp_flags & INP_TIMEWAIT) &&
1358 !(inp->inp_flags & INP_DROPPED) &&
1359 !(inp->inp_socket == NULL)) {
1360 icmp_tcp_seq = htonl(th->th_seq);
1361 tp = intotcpcb(inp);
1362 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
1363 SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
1364 if (cmd == PRC_MSGSIZE) {
1365 /*
1366 * MTU discovery:
1367 * If we got a needfrag set the MTU
1368 * in the route to the suggested new
1369 * value (if given) and then notify.
1370 */
1371 bzero(&inc, sizeof(inc));
1372 inc.inc_faddr = faddr;
1373 inc.inc_fibnum =
1374 inp->inp_inc.inc_fibnum;
1375
1376 mtu = ntohs(icp->icmp_nextmtu);
1377 /*
1378 * If no alternative MTU was
1379 * proposed, try the next smaller
1380 * one. ip->ip_len has already
1381 * been swapped in icmp_input().
1382 */
1383 if (!mtu)
1384 mtu = ip_next_mtu(ip->ip_len,
1385 1);
1386 if (mtu < max(296, V_tcp_minmss
1387 + sizeof(struct tcpiphdr)))
1388 mtu = 0;
1389 if (!mtu)
1390 mtu = V_tcp_mssdflt
1391 + sizeof(struct tcpiphdr);
1392 /*
 1393 * Only cache the MTU if it
 1394 * is smaller than the interface
 1395 * or route MTU. tcp_mtudisc()
 1396 * will do the right thing by itself.
1397 */
1398 if (mtu <= tcp_maxmtu(&inc, NULL))
1399 tcp_hc_updatemtu(&inc, mtu);
1400 }
1401
1402 inp = (*notify)(inp, inetctlerrmap[cmd]);
1403 }
1404 }
1405 if (inp != NULL)
1406 INP_WUNLOCK(inp);
1407 } else {
1408 bzero(&inc, sizeof(inc));
1409 inc.inc_fport = th->th_dport;
1410 inc.inc_lport = th->th_sport;
1411 inc.inc_faddr = faddr;
1412 inc.inc_laddr = ip->ip_src;
1413 syncache_unreach(&inc, th);
1414 }
1415 INP_INFO_WUNLOCK(&V_tcbinfo);
1416 } else
1417 in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
1418}
1419
1420#ifdef INET6
1421void
1422tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
1423{
1424 struct tcphdr th;
1425 struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
1426 struct ip6_hdr *ip6;
1427 struct mbuf *m;
1428 struct ip6ctlparam *ip6cp = NULL;
1429 const struct sockaddr_in6 *sa6_src = NULL;
1430 int off;
1431 struct tcp_portonly {
1432 u_int16_t th_sport;
1433 u_int16_t th_dport;
1434 } *thp;
1435
1436 if (sa->sa_family != AF_INET6 ||
1437 sa->sa_len != sizeof(struct sockaddr_in6))
1438 return;
1439
1440 if (cmd == PRC_MSGSIZE)
1441 notify = tcp_mtudisc;
1442 else if (!PRC_IS_REDIRECT(cmd) &&
1443 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
1444 return;
 1445 /* Source quench is deprecated. */
1446 else if (cmd == PRC_QUENCH)
1447 return;
1448
1449 /* if the parameter is from icmp6, decode it. */
1450 if (d != NULL) {
1451 ip6cp = (struct ip6ctlparam *)d;
1452 m = ip6cp->ip6c_m;
1453 ip6 = ip6cp->ip6c_ip6;
1454 off = ip6cp->ip6c_off;
1455 sa6_src = ip6cp->ip6c_src;
1456 } else {
1457 m = NULL;
1458 ip6 = NULL;
1459 off = 0; /* fool gcc */
1460 sa6_src = &sa6_any;
1461 }
1462
1463 if (ip6 != NULL) {
1464 struct in_conninfo inc;
1465 /*
 1466 * XXX: We assume that when IPV6 is non-NULL,
1467 * M and OFF are valid.
1468 */
1469
1470 /* check if we can safely examine src and dst ports */
1471 if (m->m_pkthdr.len < off + sizeof(*thp))
1472 return;
1473
1474 bzero(&th, sizeof(th));
1475 m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
1476
1477 in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
1478 (struct sockaddr *)ip6cp->ip6c_src,
1479 th.th_sport, cmd, NULL, notify);
1480
1481 bzero(&inc, sizeof(inc));
1482 inc.inc_fport = th.th_dport;
1483 inc.inc_lport = th.th_sport;
1484 inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
1485 inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
1486 inc.inc_flags |= INC_ISIPV6;
1487 INP_INFO_WLOCK(&V_tcbinfo);
1488 syncache_unreach(&inc, &th);
1489 INP_INFO_WUNLOCK(&V_tcbinfo);
1490 } else
1491 in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
1492 0, cmd, NULL, notify);
1493}
1494#endif /* INET6 */
1495
1496
1497/*
1498 * Following is where TCP initial sequence number generation occurs.
1499 *
1500 * There are two places where we must use initial sequence numbers:
1501 * 1. In SYN-ACK packets.
1502 * 2. In SYN packets.
1503 *
1504 * All ISNs for SYN-ACK packets are generated by the syncache. See
1505 * tcp_syncache.c for details.
1506 *
1507 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
1508 * depends on this property. In addition, these ISNs should be
1509 * unguessable so as to prevent connection hijacking. To satisfy
1510 * the requirements of this situation, the algorithm outlined in
1511 * RFC 1948 is used, with only small modifications.
1512 *
1513 * Implementation details:
1514 *
1515 * Time is based on the system timer, and is corrected so that the ISN
1516 * offset increases by one megabyte per second. This allows for proper
1517 * recycling on high speed LANs while still leaving over an hour
1518 * before rollover.
1519 *
1520 * As reading the *exact* system time is too expensive to be done
1521 * whenever setting up a TCP connection, we increment the time
1522 * offset in two ways. First, a small random positive increment
1523 * is added to isn_offset for each connection that is set up.
1524 * Second, the function tcp_isn_tick fires once per clock tick
1525 * and increments isn_offset as necessary so that sequence numbers
1526 * are incremented at approximately ISN_BYTES_PER_SECOND. The
1527 * random positive increments serve only to ensure that the same
1528 * exact sequence number is never sent out twice (as could otherwise
1529 * happen when a port is recycled in less than the system tick
1530 * interval.)
1531 *
1532 * net.inet.tcp.isn_reseed_interval controls the number of seconds
1533 * between seeding of isn_secret. This is normally set to zero,
1534 * as reseeding should not be necessary.
1535 *
1536 * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
1537 * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In
1538 * general, this means holding an exclusive (write) lock.
1539 */
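
/*
 * In summary, a sketch of the scheme implemented below: the ISN for a
 * new connection is
 *
 *	ISN = MD5(fport, lport, faddr, laddr, isn_secret)[first 32 bits]
 *	    + isn_offset
 *
 * where isn_offset advances by ISN_STATIC_INCREMENT plus a small random
 * increment for every connection, and tcp_isn_tick() (run 100 times per
 * second) pulls it forward so that overall the offset advances at
 * approximately ISN_BYTES_PER_SECOND.
 */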
1540
1541#define ISN_BYTES_PER_SECOND 1048576
1542#define ISN_STATIC_INCREMENT 4096
1543#define ISN_RANDOM_INCREMENT (4096 - 1)
1544
1545static VNET_DEFINE(u_char, isn_secret[32]);
1546static VNET_DEFINE(int, isn_last_reseed);
1547static VNET_DEFINE(u_int32_t, isn_offset);
1548static VNET_DEFINE(u_int32_t, isn_offset_old);
1549
1550#define V_isn_secret VNET(isn_secret)
1551#define V_isn_last_reseed VNET(isn_last_reseed)
1552#define V_isn_offset VNET(isn_offset)
1553#define V_isn_offset_old VNET(isn_offset_old)
1554
1555tcp_seq
1556tcp_new_isn(struct tcpcb *tp)
1557{
1558 MD5_CTX isn_ctx;
1559 u_int32_t md5_buffer[4];
1560 tcp_seq new_isn;
1561
1562 INP_WLOCK_ASSERT(tp->t_inpcb);
1563
1564 ISN_LOCK();
1565 /* Seed if this is the first use, reseed if requested. */
1566 if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
1567 (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
1568 < (u_int)ticks))) {
1569 read_random(&V_isn_secret, sizeof(V_isn_secret));
1570 V_isn_last_reseed = ticks;
1571 }
1572
1573 /* Compute the md5 hash and return the ISN. */
1574 MD5Init(&isn_ctx);
1575 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
1576 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
1577#ifdef INET6
1578 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
1579 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
1580 sizeof(struct in6_addr));
1581 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
1582 sizeof(struct in6_addr));
1583 } else
1584#endif
1585 {
1586 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
1587 sizeof(struct in_addr));
1588 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
1589 sizeof(struct in_addr));
1590 }
1591 MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret));
1592 MD5Final((u_char *) &md5_buffer, &isn_ctx);
1593 new_isn = (tcp_seq) md5_buffer[0];
1594 V_isn_offset += ISN_STATIC_INCREMENT +
1595 (arc4random() & ISN_RANDOM_INCREMENT);
1596 new_isn += V_isn_offset;
1597 ISN_UNLOCK();
1598 return (new_isn);
1599}
1600
1601/*
1602 * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary
1603 * to keep time flowing at a relatively constant rate. If the random
1604 * increments have already pushed us past the projected offset, do nothing.
1605 */
1606static void
1607tcp_isn_tick(void *xtp)
1608{
1609 VNET_ITERATOR_DECL(vnet_iter);
1610 u_int32_t projected_offset;
1611
1612 VNET_LIST_RLOCK_NOSLEEP();
1613 ISN_LOCK();
1614 VNET_FOREACH(vnet_iter) {
1615 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */
1616 projected_offset =
1617 V_isn_offset_old + ISN_BYTES_PER_SECOND / 100;
1618
1619 if (SEQ_GT(projected_offset, V_isn_offset))
1620 V_isn_offset = projected_offset;
1621
1622 V_isn_offset_old = V_isn_offset;
1623 CURVNET_RESTORE();
1624 }
1625 ISN_UNLOCK();
1626 VNET_LIST_RUNLOCK_NOSLEEP();
1627 callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
1628}
1629
1630/*
1631 * When a specific ICMP unreachable message is received and the
1632 * connection state is SYN-SENT, drop the connection. This behavior
1633 * is controlled by the icmp_may_rst sysctl.
1634 */
1635struct inpcb *
1636tcp_drop_syn_sent(struct inpcb *inp, int errno)
1637{
1638 struct tcpcb *tp;
1639
1640 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
1641 INP_WLOCK_ASSERT(inp);
1642
1643 if ((inp->inp_flags & INP_TIMEWAIT) ||
1644 (inp->inp_flags & INP_DROPPED))
1645 return (inp);
1646
1647 tp = intotcpcb(inp);
1648 if (tp->t_state != TCPS_SYN_SENT)
1649 return (inp);
1650
1651 tp = tcp_drop(tp, errno);
1652 if (tp != NULL)
1653 return (inp);
1654 else
1655 return (NULL);
1656}
1657
1658/*
1659 * When `need fragmentation' ICMP is received, update our idea of the MSS
1660 * based on the new value in the route. Also nudge TCP to send something,
1661 * since we know the packet we just sent was dropped.
1662 * This duplicates some code in the tcp_mss() function in tcp_input.c.
1663 */
1664struct inpcb *
1665tcp_mtudisc(struct inpcb *inp, int errno)
1666{
1667 struct tcpcb *tp;
1668 struct socket *so;
1669
1670 INP_WLOCK_ASSERT(inp);
1671 if ((inp->inp_flags & INP_TIMEWAIT) ||
1672 (inp->inp_flags & INP_DROPPED))
1673 return (inp);
1674
1675 tp = intotcpcb(inp);
1676 KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
1677
1678 tcp_mss_update(tp, -1, NULL, NULL);
1679
1680 so = inp->inp_socket;
1681 SOCKBUF_LOCK(&so->so_snd);
1682 /* If the mss is larger than the socket buffer, decrease the mss. */
1683 if (so->so_snd.sb_hiwat < tp->t_maxseg)
1684 tp->t_maxseg = so->so_snd.sb_hiwat;
1685 SOCKBUF_UNLOCK(&so->so_snd);
1686
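	/*
	 * Nudge the connection to retransmit immediately with the new,
	 * smaller MSS: rewind snd_nxt to snd_una, reset the RTT timer,
	 * discard any SACK scoreboard state, and push out a segment.
	 */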
1687 TCPSTAT_INC(tcps_mturesent);
1688 tp->t_rtttime = 0;
1689 tp->snd_nxt = tp->snd_una;
1690 tcp_free_sackholes(tp);
1691 tp->snd_recover = tp->snd_max;
1692 if (tp->t_flags & TF_SACK_PERMIT)
1693 EXIT_FASTRECOVERY(tp);
1694 tcp_output_send(tp);
1695 return (inp);
1696}
1697
1698/*
1699 * Look up the routing entry to the peer of this inpcb. If no route
1700 * is found and it cannot be allocated, then return 0. This routine
1701 * is called by TCP routines that access the rmx structure and by
1702 * tcp_mss_update to get the peer/interface MTU.
1703 */
1704u_long
1705tcp_maxmtu(struct in_conninfo *inc, int *flags)
1706{
1707 struct route sro;
1708 struct sockaddr_in *dst;
1709 struct ifnet *ifp;
1710 u_long maxmtu = 0;
1711
1712 KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
1713
1714 bzero(&sro, sizeof(sro));
1715 if (inc->inc_faddr.s_addr != INADDR_ANY) {
1716 dst = (struct sockaddr_in *)&sro.ro_dst;
1717 dst->sin_family = AF_INET;
1718 dst->sin_len = sizeof(*dst);
1719 dst->sin_addr = inc->inc_faddr;
1720 in_rtalloc_ign(&sro, 0, inc->inc_fibnum);
1721 }
1722 if (sro.ro_rt != NULL) {
1723 ifp = sro.ro_rt->rt_ifp;
1724 if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
1725 maxmtu = ifp->if_mtu;
1726 else
1727 maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
1728
1729 /* Report additional interface capabilities. */
1730 if (flags != NULL) {
1731 if (ifp->if_capenable & IFCAP_TSO4 &&
1732 ifp->if_hwassist & CSUM_TSO)
1733 *flags |= CSUM_TSO;
1734 }
1735 RTFREE(sro.ro_rt);
1736 }
1737 return (maxmtu);
1738}
1739
1740#ifdef INET6
1741u_long
1742tcp_maxmtu6(struct in_conninfo *inc, int *flags)
1743{
1744 struct route_in6 sro6;
1745 struct ifnet *ifp;
1746 u_long maxmtu = 0;
1747
1748 KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
1749
1750 bzero(&sro6, sizeof(sro6));
1751 if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
1752 sro6.ro_dst.sin6_family = AF_INET6;
1753 sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
1754 sro6.ro_dst.sin6_addr = inc->inc6_faddr;
1755 rtalloc_ign((struct route *)&sro6, 0);
1756 }
1757 if (sro6.ro_rt != NULL) {
1758 ifp = sro6.ro_rt->rt_ifp;
1759 if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
1760 maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
1761 else
1762 maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
1763 IN6_LINKMTU(sro6.ro_rt->rt_ifp));
1764
1765 /* Report additional interface capabilities. */
1766 if (flags != NULL) {
1767 if (ifp->if_capenable & IFCAP_TSO6 &&
1768 ifp->if_hwassist & CSUM_TSO)
1769 *flags |= CSUM_TSO;
1770 }
1771 RTFREE(sro6.ro_rt);
1772 }
1773
1774 return (maxmtu);
1775}
1776#endif /* INET6 */
1777
1778#ifdef IPSEC
1779/* compute ESP/AH header size for TCP, including outer IP header. */
1780size_t
1781ipsec_hdrsiz_tcp(struct tcpcb *tp)
1782{
1783 struct inpcb *inp;
1784 struct mbuf *m;
1785 size_t hdrsiz;
1786 struct ip *ip;
1787#ifdef INET6
1788 struct ip6_hdr *ip6;
1789#endif
1790 struct tcphdr *th;
1791
1792 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
1793 return (0);
1794 MGETHDR(m, M_DONTWAIT, MT_DATA);
1795 if (!m)
1796 return (0);
1797
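	/*
	 * Fill the scratch mbuf with a skeletal TCP/IP(v6) header for this
	 * connection so that ipsec_hdrsiz() can size the ESP/AH headers
	 * that outbound policy would prepend.
	 */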
1798#ifdef INET6
1799 if ((inp->inp_vflag & INP_IPV6) != 0) {
1800 ip6 = mtod(m, struct ip6_hdr *);
1801 th = (struct tcphdr *)(ip6 + 1);
1802 m->m_pkthdr.len = m->m_len =
1803 sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1804 tcpip_fillheaders(inp, ip6, th);
1805 hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1806 } else
1807#endif /* INET6 */
1808 {
1809 ip = mtod(m, struct ip *);
1810 th = (struct tcphdr *)(ip + 1);
1811 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
1812 tcpip_fillheaders(inp, ip, th);
1813 hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1814 }
1815
1816 m_free(m);
1817 return (hdrsiz);
1818}
1819#endif /* IPSEC */
1820
1821/*
1822 * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
1823 *
1824 * This code attempts to calculate the bandwidth-delay product as a
1825 * means of determining the optimal window size to maximize bandwidth,
1826 * minimize RTT, and avoid the over-allocation of buffers on interfaces and
1827 * routers. This code also does a fairly good job keeping RTTs in check
1828 * across slow links like modems. We implement an algorithm which is very
1829 * similar to (but not meant to be) TCP/Vegas. The code operates on the
1830 * transmitter side of a TCP connection and so only affects the transmit
1831 * side of the connection.
1832 *
1833 * BACKGROUND: TCP makes no provision for the management of buffer space
1834 * at the end points or at the intermediate routers and switches. A TCP
1835 * stream, whether using NewReno or not, will eventually buffer as
1836 * many packets as it is able and the only reason this typically works is
1837 * due to the fairly small default buffers made available for a connection
1838 * (typically 16K or 32K). As machines use larger windows and/or window
1839 * scaling it is now fairly easy for even a single TCP connection to blow-out
1840 * all available buffer space not only on the local interface, but on
1841 * intermediate routers and switches as well. NewReno makes a misguided
1842 * attempt to 'solve' this problem by waiting for an actual failure to occur,
1843 * then backing off, then steadily increasing the window again until another
1844 * failure occurs, ad infinitum. This results in terrible oscillation that
1845 * is only made worse as network loads increase and the idea of intentionally
1846 * blowing out network buffers is, frankly, a terrible way to manage network
1847 * resources.
1848 *
1849 * It is far better to limit the transmit window prior to the failure
1850 * condition being achieved. There are two general ways to do this: First
1851 * you can 'scan' through different transmit window sizes and locate the
1852 * point where the RTT stops increasing, indicating that you have filled the
1853 * pipe, then scan backwards until you note that RTT stops decreasing, then
1854 * repeat ad infinitum. This method works in principle but has severe
1855 * implementation issues due to RTT variances, timer granularity, and
1856 * instability in the algorithm which can lead to many false positives and
1857 * create oscillations as well as interact badly with other TCP streams
1858 * implementing the same algorithm.
1859 *
1860 * The second method is to limit the window to the bandwidth delay product
1861 * of the link. This is the method we implement. RTT variances and our
1862 * own manipulation of the congestion window, bwnd, can potentially
1863 * destabilize the algorithm. For this reason we have to stabilize the
1864 * elements used to calculate the window. We do this by using the minimum
1865 * observed RTT, the long term average of the observed bandwidth, and
1866 * by adding two segments worth of slop. It isn't perfect but it is able
1867 * to react to changing conditions and gives us a very stable basis on
1868 * which to extend the algorithm.
1869 */
1870void
1871tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
1872{
1873 u_long bw;
1874 u_long bwnd;
1875 int save_ticks;
1876
1877 INP_WLOCK_ASSERT(tp->t_inpcb);
1878
1879 /*
1880 * If inflight_enable is disabled in the middle of a tcp connection,
1881 * make sure snd_bwnd is effectively disabled.
1882 */
1883 if (V_tcp_inflight_enable == 0 ||
1884 tp->t_rttlow < V_tcp_inflight_rttthresh) {
1885 tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1886 tp->snd_bandwidth = 0;
1887 return;
1888 }
1889
1890 /*
1891 * Figure out the bandwidth. Due to the tick granularity this
1892 * is a very rough number and it MUST be averaged over a fairly
1893 * long period of time. XXX we need to take into account a link
1894 * that is not using all available bandwidth, but for now our
1895 * slop will ramp us up if this case occurs and the bandwidth later
1896 * increases.
1897 *
1898	 * Note: if ticks rolls over, 'bw' may wind up negative. We must
1899 * effectively reset t_bw_rtttime for this case.
1900 */
1901 save_ticks = ticks;
1902 if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
1903 return;
1904
1905 bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
1906 (save_ticks - tp->t_bw_rtttime);
1907 tp->t_bw_rtttime = save_ticks;
1908 tp->t_bw_rtseq = ack_seq;
1909 if (tp->t_bw_rtttime == 0 || (int)bw < 0)
1910 return;
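	/*
	 * Fold the new sample into the long term average with an
	 * exponentially weighted moving average: 15/16 previous estimate,
	 * 1/16 new sample.
	 */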
1911 bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
1912
1913 tp->snd_bandwidth = bw;
1914
1915 /*
1916 * Calculate the semi-static bandwidth delay product, plus two maximal
1917 * segments. The additional slop puts us squarely in the sweet
1918 * spot and also handles the bandwidth run-up case and stabilization.
1919 * Without the slop we could be locking ourselves into a lower
1920 * bandwidth.
1921 *
1922 * Situations Handled:
1923 * (1) Prevents over-queueing of packets on LANs, especially on
1924 * high speed LANs, allowing larger TCP buffers to be
1925 * specified, and also does a good job preventing
1926 * over-queueing of packets over choke points like modems
1927 * (at least for the transmit side).
1928 *
1929 * (2) Is able to handle changing network loads (bandwidth
1930 * drops so bwnd drops, bandwidth increases so bwnd
1931 * increases).
1932 *
1933 * (3) Theoretically should stabilize in the face of multiple
1934 * connections implementing the same algorithm (this may need
1935 * a little work).
1936 *
1937 * (4) Stability value (defaults to 20 = 2 maximal packets) can
1938	 *	    be adjusted with a sysctl but typically only needs to be
1939	 *	    changed on very slow connections. A value no smaller than 5
1940 * should be used, but only reduce this default if you have
1941 * no other choice.
1942 */
1943#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
1944 bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10;
1945#undef USERTT
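	/*
	 * Worked example with hypothetical numbers: at bw = 1,000,000
	 * bytes/sec and a smoothed RTT of 50 ms, the bandwidth delay
	 * product term is 50,000 bytes; the default stability value of 20
	 * with a 1460-byte t_maxseg adds 2 * 1460 = 2,920 bytes of slop,
	 * giving bwnd = 52,920 bytes.
	 */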
1946
1947 if (tcp_inflight_debug > 0) {
1948 static int ltime;
1949 if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
1950 ltime = ticks;
1951 printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
1952 tp,
1953 bw,
1954 tp->t_rttbest,
1955 tp->t_srtt,
1956 bwnd
1957 );
1958 }
1959 }
1960 if ((long)bwnd < V_tcp_inflight_min)
1961 bwnd = V_tcp_inflight_min;
1962 if (bwnd > V_tcp_inflight_max)
1963 bwnd = V_tcp_inflight_max;
1964 if ((long)bwnd < tp->t_maxseg * 2)
1965 bwnd = tp->t_maxseg * 2;
1966 tp->snd_bwnd = bwnd;
1967}
1968
1969#ifdef TCP_SIGNATURE
1970/*
1971 * Callback function invoked by m_apply() to digest TCP segment data
1972 * contained within an mbuf chain.
1973 */
1974static int
1975tcp_signature_apply(void *fstate, void *data, u_int len)
1976{
1977
1978 MD5Update(fstate, (u_char *)data, len);
1979 return (0);
1980}
1981
1982/*
1983 * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
1984 *
1985 * Parameters:
1986 * m pointer to head of mbuf chain
1987 * _unused
1988 * len length of TCP segment data, excluding options
1989 * optlen length of TCP segment options
1990 * buf pointer to storage for computed MD5 digest
1991 * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
1992 *
1993 * We do this over ip, tcphdr, segment data, and the key in the SADB.
1994 * When called from tcp_input(), we can be sure that th_sum has been
1995 * zeroed out and verified already.
1996 *
1997 * Return 0 if successful, otherwise return an errno value (e.g. EINVAL).
1998 *
1999 * XXX The key is retrieved from the system's PF_KEY SADB, by keying a
2000 * search with the destination IP address, and a 'magic SPI' to be
2001 * determined by the application. This is hardcoded elsewhere to 1179
2002 * right now. Another branch of this code exists which uses the SPD to
2003 * specify per-application flows but it is unstable.
2004 */
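
/*
 * In outline, the RFC 2385 digest computed below covers, in order:
 *
 *	1. the IP or IPv6 pseudo-header
 *	2. the TCP header, with th_sum zeroed
 *	3. the TCP segment data, if any
 *	4. the connection's key from the SADB
 */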
2005int
2006tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
2007 u_char *buf, u_int direction)
2008{
2009 union sockaddr_union dst;
2010 struct ippseudo ippseudo;
2011 MD5_CTX ctx;
2012 int doff;
2013 struct ip *ip;
2014 struct ipovly *ipovly;
2015 struct secasvar *sav;
2016 struct tcphdr *th;
2017#ifdef INET6
2018 struct ip6_hdr *ip6;
2019 struct in6_addr in6;
2020 char ip6buf[INET6_ADDRSTRLEN];
2021 uint32_t plen;
2022 uint16_t nhdr;
2023#endif
2024 u_short savecsum;
2025
2026 KASSERT(m != NULL, ("NULL mbuf chain"));
2027 KASSERT(buf != NULL, ("NULL signature pointer"));
2028
2029 /* Extract the destination from the IP header in the mbuf. */
2030 bzero(&dst, sizeof(union sockaddr_union));
2031 ip = mtod(m, struct ip *);
2032#ifdef INET6
2033 ip6 = NULL; /* Make the compiler happy. */
2034#endif
2035 switch (ip->ip_v) {
2036 case IPVERSION:
2037 dst.sa.sa_len = sizeof(struct sockaddr_in);
2038 dst.sa.sa_family = AF_INET;
2039 dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
2040 ip->ip_src : ip->ip_dst;
2041 break;
2042#ifdef INET6
2043 case (IPV6_VERSION >> 4):
2044 ip6 = mtod(m, struct ip6_hdr *);
2045 dst.sa.sa_len = sizeof(struct sockaddr_in6);
2046 dst.sa.sa_family = AF_INET6;
2047 dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ?
2048 ip6->ip6_src : ip6->ip6_dst;
2049 break;
2050#endif
2051 default:
2052 return (EINVAL);
2053 /* NOTREACHED */
2054 break;
2055 }
2056
2057 /* Look up an SADB entry which matches the address of the peer. */
2058 sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
2059 if (sav == NULL) {
2060 ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__,
2061 (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) :
2062#ifdef INET6
2063 (ip->ip_v == (IPV6_VERSION >> 4)) ?
2064 ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) :
2065#endif
2066 "(unsupported)"));
2067 return (EINVAL);
2068 }
2069
2070 MD5Init(&ctx);
2071 /*
2072 * Step 1: Update MD5 hash with IP(v6) pseudo-header.
2073 *
2074 * XXX The ippseudo header MUST be digested in network byte order,
2075 * or else we'll fail the regression test. Assume all fields we've
2076 * been doing arithmetic on have been in host byte order.
2077 * XXX One cannot depend on ipovly->ih_len here. When called from
2078 * tcp_output(), the underlying ip_len member has not yet been set.
2079 */
2080 switch (ip->ip_v) {
2081 case IPVERSION:
2082 ipovly = (struct ipovly *)ip;
2083 ippseudo.ippseudo_src = ipovly->ih_src;
2084 ippseudo.ippseudo_dst = ipovly->ih_dst;
2085 ippseudo.ippseudo_pad = 0;
2086 ippseudo.ippseudo_p = IPPROTO_TCP;
2087 ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) +
2088 optlen);
2089 MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));
2090
2091 th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip));
2092 doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen;
2093 break;
2094#ifdef INET6
2095 /*
2096 * RFC 2385, 2.0 Proposal
2097 * For IPv6, the pseudo-header is as described in RFC 2460, namely the
2098 * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero-
2099 * extended next header value (to form 32 bits), and 32-bit segment
2100 * length.
2101 * Note: Upper-Layer Packet Length comes before Next Header.
2102 */
2103 case (IPV6_VERSION >> 4):
2104 in6 = ip6->ip6_src;
2105 in6_clearscope(&in6);
2106 MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
2107 in6 = ip6->ip6_dst;
2108 in6_clearscope(&in6);
2109 MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
2110 plen = htonl(len + sizeof(struct tcphdr) + optlen);
2111 MD5Update(&ctx, (char *)&plen, sizeof(uint32_t));
2112 nhdr = 0;
2113 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
2114 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
2115 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
2116 nhdr = IPPROTO_TCP;
2117 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
2118
2119 th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr));
2120 doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen;
2121 break;
2122#endif
2123 default:
2124 return (EINVAL);
2125 /* NOTREACHED */
2126 break;
2127 }
2128
2129
2130 /*
2131 * Step 2: Update MD5 hash with TCP header, excluding options.
2132 * The TCP checksum must be set to zero.
2133 */
2134 savecsum = th->th_sum;
2135 th->th_sum = 0;
2136 MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
2137 th->th_sum = savecsum;
2138
2139 /*
2140 * Step 3: Update MD5 hash with TCP segment data.
2141 * Use m_apply() to avoid an early m_pullup().
2142 */
2143 if (len > 0)
2144 m_apply(m, doff, len, tcp_signature_apply, &ctx);
2145
2146 /*
2147 * Step 4: Update MD5 hash with shared secret.
2148 */
2149 MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth));
2150 MD5Final(buf, &ctx);
2151
2152 key_sa_recordxfer(sav, m);
2153 KEY_FREESAV(&sav);
2154 return (0);
2155}
2156#endif /* TCP_SIGNATURE */
2157
2158static int
2159sysctl_drop(SYSCTL_HANDLER_ARGS)
2160{
2161 /* addrs[0] is a foreign socket, addrs[1] is a local one. */
2162 struct sockaddr_storage addrs[2];
2163 struct inpcb *inp;
2164 struct tcpcb *tp;
2165 struct tcptw *tw;
2166 struct sockaddr_in *fin, *lin;
2167#ifdef INET6
2168 struct sockaddr_in6 *fin6, *lin6;
2169#endif
2170 int error;
2171
2172 inp = NULL;
2173 fin = lin = NULL;
2174#ifdef INET6
2175 fin6 = lin6 = NULL;
2176#endif
2177 error = 0;
2178
2179 if (req->oldptr != NULL || req->oldlen != 0)
2180 return (EINVAL);
2181 if (req->newptr == NULL)
2182 return (EPERM);
2183 if (req->newlen < sizeof(addrs))
2184 return (ENOMEM);
2185 error = SYSCTL_IN(req, &addrs, sizeof(addrs));
2186 if (error)
2187 return (error);
2188
2189 switch (addrs[0].ss_family) {
2190#ifdef INET6
2191 case AF_INET6:
2192 fin6 = (struct sockaddr_in6 *)&addrs[0];
2193 lin6 = (struct sockaddr_in6 *)&addrs[1];
2194 if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
2195 lin6->sin6_len != sizeof(struct sockaddr_in6))
2196 return (EINVAL);
2197 if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
2198 if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
2199 return (EINVAL);
2200 in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
2201 in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
2202 fin = (struct sockaddr_in *)&addrs[0];
2203 lin = (struct sockaddr_in *)&addrs[1];
2204 break;
2205 }
2206 error = sa6_embedscope(fin6, V_ip6_use_defzone);
2207 if (error)
2208 return (error);
2209 error = sa6_embedscope(lin6, V_ip6_use_defzone);
2210 if (error)
2211 return (error);
2212 break;
2213#endif
2214 case AF_INET:
2215 fin = (struct sockaddr_in *)&addrs[0];
2216 lin = (struct sockaddr_in *)&addrs[1];
2217 if (fin->sin_len != sizeof(struct sockaddr_in) ||
2218 lin->sin_len != sizeof(struct sockaddr_in))
2219 return (EINVAL);
2220 break;
2221 default:
2222 return (EINVAL);
2223 }
2224 INP_INFO_WLOCK(&V_tcbinfo);
2225 switch (addrs[0].ss_family) {
2226#ifdef INET6
2227 case AF_INET6:
2228 inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr,
2229 fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0,
2230 NULL);
2231 break;
2232#endif
2233 case AF_INET:
2234 inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr,
2235 fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL);
2236 break;
2237 }
2238 if (inp != NULL) {
2239 INP_WLOCK(inp);
2240 if (inp->inp_flags & INP_TIMEWAIT) {
2241 /*
2242 * XXXRW: There currently exists a state where an
2243 * inpcb is present, but its timewait state has been
2244 * discarded. For now, don't allow dropping of this
2245 * type of inpcb.
2246 */
2247 tw = intotw(inp);
2248 if (tw != NULL)
2249 tcp_twclose(tw, 0);
2250 else
2251 INP_WUNLOCK(inp);
2252 } else if (!(inp->inp_flags & INP_DROPPED) &&
2253 !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
2254 tp = intotcpcb(inp);
2255 tp = tcp_drop(tp, ECONNABORTED);
2256 if (tp != NULL)
2257 INP_WUNLOCK(inp);
2258 } else
2259 INP_WUNLOCK(inp);
2260 } else
2261 error = ESRCH;
2262 INP_INFO_WUNLOCK(&V_tcbinfo);
2263 return (error);
2264}
2265
2266SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
2267 CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
2268 0, sysctl_drop, "", "Drop TCP connection");
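
/*
 * A sketch of how userland (e.g. tcpdrop(8)) drives the handler above:
 * write two sockaddrs, the foreign endpoint first, into the write-only
 * net.inet.tcp.drop sysctl.
 *
 *	struct sockaddr_storage addrs[2];
 *
 *	... fill addrs[0] (foreign) and addrs[1] (local) ...
 *	if (sysctlbyname("net.inet.tcp.drop", NULL, NULL,
 *	    addrs, sizeof(addrs)) == -1)
 *		err(1, "sysctlbyname");
 */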
2269
2270/*
2271 * Generate a standardized TCP log line for use throughout the
2272 * tcp subsystem. Memory allocation is done with M_NOWAIT to
2273 * allow use in the interrupt context.
2274 *
2275 * NB: The caller MUST free(s, M_TCPLOG) the returned string.
2276 * NB: The function may return NULL if memory allocation failed.
2277 *
2278 * Due to header inclusion and ordering limitations the struct ip
2279 * and ip6_hdr pointers have to be passed as void pointers.
2280 */
2281char *
2282tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
2283 const void *ip6hdr)
2284{
2285 char *s, *sp;
2286 size_t size;
2287 struct ip *ip;
2288#ifdef INET6
2289 const struct ip6_hdr *ip6;
2290
2291 ip6 = (const struct ip6_hdr *)ip6hdr;
2292#endif /* INET6 */
2293 ip = (struct ip *)ip4hdr;
2294
2295 /*
2296 * The log line looks like this:
2297 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
2298 */
2299 size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
2300 sizeof(PRINT_TH_FLAGS) + 1 +
2301#ifdef INET6
2302 2 * INET6_ADDRSTRLEN;
2303#else
2304 2 * INET_ADDRSTRLEN;
2305#endif /* INET6 */
2306
2307 /* Is logging enabled? */
2308 if (tcp_log_debug == 0 && tcp_log_in_vain == 0)
2309 return (NULL);
2310
2311 s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
2312 if (s == NULL)
2313 return (NULL);
2314
2315 strcat(s, "TCP: [");
2316 sp = s + strlen(s);
2317
2318 if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) {
2319 inet_ntoa_r(inc->inc_faddr, sp);
2320 sp = s + strlen(s);
2321 sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
2322 sp = s + strlen(s);
2323 inet_ntoa_r(inc->inc_laddr, sp);
2324 sp = s + strlen(s);
2325 sprintf(sp, "]:%i", ntohs(inc->inc_lport));
2326#ifdef INET6
2327 } else if (inc) {
2328 ip6_sprintf(sp, &inc->inc6_faddr);
2329 sp = s + strlen(s);
2330 sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
2331 sp = s + strlen(s);
2332 ip6_sprintf(sp, &inc->inc6_laddr);
2333 sp = s + strlen(s);
2334 sprintf(sp, "]:%i", ntohs(inc->inc_lport));
2335 } else if (ip6 && th) {
2336 ip6_sprintf(sp, &ip6->ip6_src);
2337 sp = s + strlen(s);
2338 sprintf(sp, "]:%i to [", ntohs(th->th_sport));
2339 sp = s + strlen(s);
2340 ip6_sprintf(sp, &ip6->ip6_dst);
2341 sp = s + strlen(s);
2342 sprintf(sp, "]:%i", ntohs(th->th_dport));
2343#endif /* INET6 */
2344 } else if (ip && th) {
2345 inet_ntoa_r(ip->ip_src, sp);
2346 sp = s + strlen(s);
2347 sprintf(sp, "]:%i to [", ntohs(th->th_sport));
2348 sp = s + strlen(s);
2349 inet_ntoa_r(ip->ip_dst, sp);
2350 sp = s + strlen(s);
2351 sprintf(sp, "]:%i", ntohs(th->th_dport));
2352 } else {
2353 free(s, M_TCPLOG);
2354 return (NULL);
2355 }
2356 sp = s + strlen(s);
2357 if (th)
2358 sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS);
2359 if (*(s + size - 1) != '\0')
2360 panic("%s: string too long", __func__);
2361 return (s);
2362}
354 V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
355 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
356
357 /* Skip initialization of globals for non-default instances. */
358 if (!IS_DEFAULT_VNET(curvnet))
359 return;
360
361	/* XXX virtualize those below? */
362 tcp_delacktime = TCPTV_DELACK;
363 tcp_keepinit = TCPTV_KEEP_INIT;
364 tcp_keepidle = TCPTV_KEEP_IDLE;
365 tcp_keepintvl = TCPTV_KEEPINTVL;
366 tcp_maxpersistidle = TCPTV_KEEP_IDLE;
367 tcp_msl = TCPTV_MSL;
368 tcp_rexmit_min = TCPTV_MIN;
369 if (tcp_rexmit_min < 1)
370 tcp_rexmit_min = 1;
371 tcp_rexmit_slop = TCPTV_CPU_VAR;
372 tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
373 tcp_tcbhashsize = hashsize;
374
375#ifdef TCP_SORECEIVE_STREAM
376 TUNABLE_INT_FETCH("net.inet.tcp.soreceive_stream", &tcp_soreceive_stream);
377 if (tcp_soreceive_stream) {
378 tcp_usrreqs.pru_soreceive = soreceive_stream;
379 tcp6_usrreqs.pru_soreceive = soreceive_stream;
380 }
381#endif
382
383#ifdef INET6
384#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
385#else /* INET6 */
386#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
387#endif /* INET6 */
388 if (max_protohdr < TCP_MINPROTOHDR)
389 max_protohdr = TCP_MINPROTOHDR;
390 if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
391 panic("tcp_init");
392#undef TCP_MINPROTOHDR
393
394 ISN_LOCK_INIT();
395 callout_init(&isn_callout, CALLOUT_MPSAFE);
396 callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
397 EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
398 SHUTDOWN_PRI_DEFAULT);
399 EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
400 EVENTHANDLER_PRI_ANY);
401}
402
403#ifdef VIMAGE
404void
405tcp_destroy(void)
406{
407
408 tcp_reass_destroy();
409 tcp_hc_destroy();
410 syncache_destroy();
411 tcp_tw_destroy();
412 in_pcbinfo_destroy(&V_tcbinfo);
413 uma_zdestroy(V_sack_hole_zone);
414 uma_zdestroy(V_tcpcb_zone);
415}
416#endif
417
418void
419tcp_fini(void *xtp)
420{
421
422 callout_stop(&isn_callout);
423}
424
425/*
426 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
427 * tcp_template used to store this data in mbufs, but we now recopy it out
428 * of the tcpcb each time to conserve mbufs.
429 */
430void
431tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
432{
433 struct tcphdr *th = (struct tcphdr *)tcp_ptr;
434
435 INP_WLOCK_ASSERT(inp);
436
437#ifdef INET6
438 if ((inp->inp_vflag & INP_IPV6) != 0) {
439 struct ip6_hdr *ip6;
440
441 ip6 = (struct ip6_hdr *)ip_ptr;
442 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
443 (inp->inp_flow & IPV6_FLOWINFO_MASK);
444 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
445 (IPV6_VERSION & IPV6_VERSION_MASK);
446 ip6->ip6_nxt = IPPROTO_TCP;
447 ip6->ip6_plen = htons(sizeof(struct tcphdr));
448 ip6->ip6_src = inp->in6p_laddr;
449 ip6->ip6_dst = inp->in6p_faddr;
450 } else
451#endif
452 {
453 struct ip *ip;
454
455 ip = (struct ip *)ip_ptr;
456 ip->ip_v = IPVERSION;
457 ip->ip_hl = 5;
458 ip->ip_tos = inp->inp_ip_tos;
459 ip->ip_len = 0;
460 ip->ip_id = 0;
461 ip->ip_off = 0;
462 ip->ip_ttl = inp->inp_ip_ttl;
463 ip->ip_sum = 0;
464 ip->ip_p = IPPROTO_TCP;
465 ip->ip_src = inp->inp_laddr;
466 ip->ip_dst = inp->inp_faddr;
467 }
468 th->th_sport = inp->inp_lport;
469 th->th_dport = inp->inp_fport;
470 th->th_seq = 0;
471 th->th_ack = 0;
472 th->th_x2 = 0;
473 th->th_off = 5;
474 th->th_flags = 0;
475 th->th_win = 0;
476 th->th_urp = 0;
477 th->th_sum = 0; /* in_pseudo() is called later for ipv4 */
478}
479
480/*
481 * Create template to be used to send tcp packets on a connection.
482 * Allocates a tcptemp structure and fills in a skeletal tcp/ip header. The only
483 * use for this function is in keepalives, which use tcp_respond.
484 */
485struct tcptemp *
486tcpip_maketemplate(struct inpcb *inp)
487{
488 struct tcptemp *t;
489
490 t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
491 if (t == NULL)
492 return (NULL);
493 tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
494 return (t);
495}
496
497/*
498 * Send a single message to the TCP at address specified by
499 * the given TCP/IP header. If m == NULL, then we make a copy
500 * of the tcpiphdr at ti and send directly to the addressed host.
501 * This is used to force keep alive messages out using the TCP
502 * template for a connection. If flags are given then we send
503 * a message back to the TCP which originated the segment ti,
504 * and discard the mbuf containing it and any other attached mbufs.
505 *
506 * In any case the ack and sequence number of the transmitted
507 * segment are as specified by the parameters.
508 *
509 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
510 */
511void
512tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
513 tcp_seq ack, tcp_seq seq, int flags)
514{
515 int tlen;
516 int win = 0;
517 struct ip *ip;
518 struct tcphdr *nth;
519#ifdef INET6
520 struct ip6_hdr *ip6;
521 int isipv6;
522#endif /* INET6 */
523 int ipflags = 0;
524 struct inpcb *inp;
525
526 KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
527
528#ifdef INET6
529 isipv6 = ((struct ip *)ipgen)->ip_v == 6;
530 ip6 = ipgen;
531#endif /* INET6 */
532 ip = ipgen;
533
534 if (tp != NULL) {
535 inp = tp->t_inpcb;
536 KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
537 INP_WLOCK_ASSERT(inp);
538 } else
539 inp = NULL;
540
541 if (tp != NULL) {
542 if (!(flags & TH_RST)) {
543 win = sbspace(&inp->inp_socket->so_rcv);
544 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
545 win = (long)TCP_MAXWIN << tp->rcv_scale;
546 }
547 }
548 if (m == NULL) {
549 m = m_gethdr(M_DONTWAIT, MT_DATA);
550 if (m == NULL)
551 return;
552 tlen = 0;
553 m->m_data += max_linkhdr;
554#ifdef INET6
555 if (isipv6) {
556 bcopy((caddr_t)ip6, mtod(m, caddr_t),
557 sizeof(struct ip6_hdr));
558 ip6 = mtod(m, struct ip6_hdr *);
559 nth = (struct tcphdr *)(ip6 + 1);
560 } else
561#endif /* INET6 */
562 {
563 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
564 ip = mtod(m, struct ip *);
565 nth = (struct tcphdr *)(ip + 1);
566 }
567 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
568 flags = TH_ACK;
569 } else {
570 /*
571		 * Reuse the mbuf.
572		 * XXX MRT We inherit the FIB, which is lucky.
573 */
574 m_freem(m->m_next);
575 m->m_next = NULL;
576 m->m_data = (caddr_t)ipgen;
577 /* m_len is set later */
578 tlen = 0;
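		/*
		 * Swap the source and destination addresses and ports in
		 * place so the reply is addressed back to the originator.
		 */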
579#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
580#ifdef INET6
581 if (isipv6) {
582 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
583 nth = (struct tcphdr *)(ip6 + 1);
584 } else
585#endif /* INET6 */
586 {
587 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
588 nth = (struct tcphdr *)(ip + 1);
589 }
590 if (th != nth) {
591 /*
592 * this is usually a case when an extension header
593 * exists between the IPv6 header and the
594 * TCP header.
595 */
596 nth->th_sport = th->th_sport;
597 nth->th_dport = th->th_dport;
598 }
599 xchg(nth->th_dport, nth->th_sport, uint16_t);
600#undef xchg
601 }
602#ifdef INET6
603 if (isipv6) {
604 ip6->ip6_flow = 0;
605 ip6->ip6_vfc = IPV6_VERSION;
606 ip6->ip6_nxt = IPPROTO_TCP;
607 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
608 tlen));
609 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
610 } else
611#endif
612 {
613 tlen += sizeof (struct tcpiphdr);
614 ip->ip_len = tlen;
615 ip->ip_ttl = V_ip_defttl;
616 if (V_path_mtu_discovery)
617 ip->ip_off |= IP_DF;
618 }
619 m->m_len = tlen;
620 m->m_pkthdr.len = tlen;
621 m->m_pkthdr.rcvif = NULL;
622#ifdef MAC
623 if (inp != NULL) {
624 /*
625 * Packet is associated with a socket, so allow the
626 * label of the response to reflect the socket label.
627 */
628 INP_WLOCK_ASSERT(inp);
629 mac_inpcb_create_mbuf(inp, m);
630 } else {
631 /*
632 * Packet is not associated with a socket, so possibly
633 * update the label in place.
634 */
635 mac_netinet_tcp_reply(m);
636 }
637#endif
638 nth->th_seq = htonl(seq);
639 nth->th_ack = htonl(ack);
640 nth->th_x2 = 0;
641 nth->th_off = sizeof (struct tcphdr) >> 2;
642 nth->th_flags = flags;
643 if (tp != NULL)
644 nth->th_win = htons((u_short) (win >> tp->rcv_scale));
645 else
646 nth->th_win = htons((u_short)win);
647 nth->th_urp = 0;
648#ifdef INET6
649 if (isipv6) {
650 nth->th_sum = 0;
651 nth->th_sum = in6_cksum(m, IPPROTO_TCP,
652 sizeof(struct ip6_hdr),
653 tlen - sizeof(struct ip6_hdr));
654 ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
655 NULL, NULL);
656 } else
657#endif /* INET6 */
658 {
659 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
660 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
661 m->m_pkthdr.csum_flags = CSUM_TCP;
662 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
663 }
664#ifdef TCPDEBUG
665 if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
666 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
667#endif
668#ifdef INET6
669 if (isipv6)
670 (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
671 else
672#endif /* INET6 */
673 (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
674}
675
676/*
677 * Create a new TCP control block, making an
678 * empty reassembly queue and hooking it to the argument
679 * protocol control block. The `inp' parameter must have
680 * come from the zone allocator set up in tcp_init().
681 */
682struct tcpcb *
683tcp_newtcpcb(struct inpcb *inp)
684{
685 struct tcpcb_mem *tm;
686 struct tcpcb *tp;
687#ifdef INET6
688 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
689#endif /* INET6 */
690
691 tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
692 if (tm == NULL)
693 return (NULL);
694 tp = &tm->tcb;
695#ifdef VIMAGE
696 tp->t_vnet = inp->inp_vnet;
697#endif
698 tp->t_timers = &tm->tt;
699 /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */
700 tp->t_maxseg = tp->t_maxopd =
701#ifdef INET6
702 isipv6 ? V_tcp_v6mssdflt :
703#endif /* INET6 */
704 V_tcp_mssdflt;
705
706 /* Set up our timeouts. */
707 callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE);
708 callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE);
709 callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
710 callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
711 callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
712
713 if (V_tcp_do_rfc1323)
714 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
715 if (V_tcp_do_sack)
716 tp->t_flags |= TF_SACK_PERMIT;
717 TAILQ_INIT(&tp->snd_holes);
718 tp->t_inpcb = inp; /* XXX */
719 /*
720 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
721 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
722 * reasonable initial retransmit time.
723 */
724 tp->t_srtt = TCPTV_SRTTBASE;
725 tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
726 tp->t_rttmin = tcp_rexmit_min;
727 tp->t_rxtcur = TCPTV_RTOBASE;
728 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
729 tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
730 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
731 tp->t_rcvtime = ticks;
732 tp->t_bw_rtttime = ticks;
733 /*
734 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
735 * because the socket may be bound to an IPv6 wildcard address,
736 * which may match an IPv4-mapped IPv6 address.
737 */
738 inp->inp_ip_ttl = V_ip_defttl;
739 inp->inp_ppcb = tp;
740 return (tp); /* XXX */
741}
742
743/*
744 * Drop a TCP connection, reporting
745 * the specified error. If connection is synchronized,
746 * then send a RST to peer.
747 */
748struct tcpcb *
749tcp_drop(struct tcpcb *tp, int errno)
750{
751 struct socket *so = tp->t_inpcb->inp_socket;
752
753 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
754 INP_WLOCK_ASSERT(tp->t_inpcb);
755
756 if (TCPS_HAVERCVDSYN(tp->t_state)) {
757 tp->t_state = TCPS_CLOSED;
758 (void) tcp_output_reset(tp);
759 TCPSTAT_INC(tcps_drops);
760 } else
761 TCPSTAT_INC(tcps_conndrops);
762 if (errno == ETIMEDOUT && tp->t_softerror)
763 errno = tp->t_softerror;
764 so->so_error = errno;
765 return (tcp_close(tp));
766}
767
768void
769tcp_discardcb(struct tcpcb *tp)
770{
771 struct tseg_qent *q;
772 struct inpcb *inp = tp->t_inpcb;
773 struct socket *so = inp->inp_socket;
774#ifdef INET6
775 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
776#endif /* INET6 */
777
778 INP_WLOCK_ASSERT(inp);
779
780 /*
781 * Make sure that all of our timers are stopped before we delete the
782 * PCB.
783 *
784 * XXXRW: Really, we would like to use callout_drain() here in order
785 * to avoid races experienced in tcp_timer.c where a timer is already
786 * executing at this point. However, we can't, both because we're
787 * running in a context where we can't sleep, and also because we
788 * hold locks required by the timers. What we instead need to do is
789 * test to see if callout_drain() is required, and if so, defer some
790 * portion of the remainder of tcp_discardcb() to an asynchronous
791 * context that can callout_drain() and then continue. Some care
792 * will be required to ensure that no further processing takes place
793 * on the tcpcb, even though it hasn't been freed (a flag?).
794 */
795 callout_stop(&tp->t_timers->tt_rexmt);
796 callout_stop(&tp->t_timers->tt_persist);
797 callout_stop(&tp->t_timers->tt_keep);
798 callout_stop(&tp->t_timers->tt_2msl);
799 callout_stop(&tp->t_timers->tt_delack);
800
801 /*
802 * If we got enough samples through the srtt filter,
803 * save the rtt and rttvar in the routing entry.
804 * 'Enough' is arbitrarily defined as 4 rtt samples.
805	 * Four samples is enough for the srtt filter to converge
806	 * close to the correct value; with fewer samples
807	 * we could save a bogus rtt. The danger is not high
808 * as tcp quickly recovers from everything.
809 * XXX: Works very well but needs some more statistics!
810 */
811 if (tp->t_rttupdated >= 4) {
812 struct hc_metrics_lite metrics;
813 u_long ssthresh;
814
815 bzero(&metrics, sizeof(metrics));
816 /*
817		 * Always update the ssthresh when the conditions below
818		 * are satisfied. This gives us a better starting value
819		 * for congestion avoidance on new connections.
820		 * ssthresh is only set if packet loss occurred on a session.
821 *
822 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
823 * being torn down. Ideally this code would not use 'so'.
824 */
825 ssthresh = tp->snd_ssthresh;
826 if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
827 /*
828 * convert the limit from user data bytes to
829 * packets then to packet data bytes.
830 */
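			/*
			 * Example with hypothetical numbers: ssthresh =
			 * 32768 bytes with t_maxseg = 1460 rounds to 22
			 * packets, which for IPv4 (40 bytes of tcpiphdr)
			 * is stored as 22 * 1500 = 33000 bytes.
			 */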
831 ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
832 if (ssthresh < 2)
833 ssthresh = 2;
834 ssthresh *= (u_long)(tp->t_maxseg +
835#ifdef INET6
836 (isipv6 ? sizeof (struct ip6_hdr) +
837 sizeof (struct tcphdr) :
838#endif
839 sizeof (struct tcpiphdr)
840#ifdef INET6
841 )
842#endif
843 );
844 } else
845 ssthresh = 0;
846 metrics.rmx_ssthresh = ssthresh;
847
848 metrics.rmx_rtt = tp->t_srtt;
849 metrics.rmx_rttvar = tp->t_rttvar;
850 /* XXX: This wraps if the pipe is more than 4 Gbit per second */
851 metrics.rmx_bandwidth = tp->snd_bandwidth;
852 metrics.rmx_cwnd = tp->snd_cwnd;
853 metrics.rmx_sendpipe = 0;
854 metrics.rmx_recvpipe = 0;
855
856 tcp_hc_update(&inp->inp_inc, &metrics);
857 }
858
859 /* free the reassembly queue, if any */
860 while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
861 LIST_REMOVE(q, tqe_q);
862 m_freem(q->tqe_m);
863 uma_zfree(V_tcp_reass_zone, q);
864 tp->t_segqlen--;
865 V_tcp_reass_qsize--;
866 }
867 /* Disconnect offload device, if any. */
868 tcp_offload_detach(tp);
869
870 tcp_free_sackholes(tp);
871 inp->inp_ppcb = NULL;
872 tp->t_inpcb = NULL;
873 uma_zfree(V_tcpcb_zone, tp);
874}
875
876/*
877 * Attempt to close a TCP control block, marking it as dropped, and freeing
878 * the socket if we hold the only reference.
879 */
880struct tcpcb *
881tcp_close(struct tcpcb *tp)
882{
883 struct inpcb *inp = tp->t_inpcb;
884 struct socket *so;
885
886 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
887 INP_WLOCK_ASSERT(inp);
888
889 /* Notify any offload devices of listener close */
890 if (tp->t_state == TCPS_LISTEN)
891 tcp_offload_listen_close(tp);
892 in_pcbdrop(inp);
893 TCPSTAT_INC(tcps_closed);
894 KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
895 so = inp->inp_socket;
896 soisdisconnected(so);
897 if (inp->inp_flags & INP_SOCKREF) {
898 KASSERT(so->so_state & SS_PROTOREF,
899 ("tcp_close: !SS_PROTOREF"));
900 inp->inp_flags &= ~INP_SOCKREF;
901 INP_WUNLOCK(inp);
902 ACCEPT_LOCK();
903 SOCK_LOCK(so);
904 so->so_state &= ~SS_PROTOREF;
905 sofree(so);
906 return (NULL);
907 }
908 return (tp);
909}
910
911void
912tcp_drain(void)
913{
914 VNET_ITERATOR_DECL(vnet_iter);
915
916 if (!do_tcpdrain)
917 return;
918
919 VNET_LIST_RLOCK_NOSLEEP();
920 VNET_FOREACH(vnet_iter) {
921 CURVNET_SET(vnet_iter);
922 struct inpcb *inpb;
923 struct tcpcb *tcpb;
924 struct tseg_qent *te;
925
926 /*
927		 * Walk the tcpcbs, if any exist, and flush each reassembly
928		 * queue, if there is one...
929		 * XXX: The "Net/3" implementation doesn't imply that the TCP
930		 * reassembly queue should be flushed, but in a situation
931		 * where we're really low on mbufs, this is potentially
932		 * useful.
933 */
934 INP_INFO_RLOCK(&V_tcbinfo);
935 LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
936 if (inpb->inp_flags & INP_TIMEWAIT)
937 continue;
938 INP_WLOCK(inpb);
939 if ((tcpb = intotcpcb(inpb)) != NULL) {
940 while ((te = LIST_FIRST(&tcpb->t_segq))
941 != NULL) {
942 LIST_REMOVE(te, tqe_q);
943 m_freem(te->tqe_m);
944 uma_zfree(V_tcp_reass_zone, te);
945 tcpb->t_segqlen--;
946 V_tcp_reass_qsize--;
947 }
948 tcp_clean_sackreport(tcpb);
949 }
950 INP_WUNLOCK(inpb);
951 }
952 INP_INFO_RUNLOCK(&V_tcbinfo);
953 CURVNET_RESTORE();
954 }
955 VNET_LIST_RUNLOCK_NOSLEEP();
956}
957
958/*
959 * Notify a tcp user of an asynchronous error:
960 * store the error as a soft error so that it can be
961 * reported later, once a reporting mechanism exists.
962 *
963 * Do not wake up the user, since there currently is no mechanism for
964 * reporting soft errors (yet - a kqueue filter may be added).
965 */
966static struct inpcb *
967tcp_notify(struct inpcb *inp, int error)
968{
969 struct tcpcb *tp;
970
971 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
972 INP_WLOCK_ASSERT(inp);
973
974 if ((inp->inp_flags & INP_TIMEWAIT) ||
975 (inp->inp_flags & INP_DROPPED))
976 return (inp);
977
978 tp = intotcpcb(inp);
979 KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
980
981 /*
982 * Ignore some errors if we are hooked up.
983 * If connection hasn't completed, has retransmitted several times,
984 * and receives a second error, give up now. This is better
985 * than waiting a long time to establish a connection that
986 * can never complete.
987 */
988 if (tp->t_state == TCPS_ESTABLISHED &&
989 (error == EHOSTUNREACH || error == ENETUNREACH ||
990 error == EHOSTDOWN)) {
991 return (inp);
992 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
993 tp->t_softerror) {
994 tp = tcp_drop(tp, error);
995 if (tp != NULL)
996 return (inp);
997 else
998 return (NULL);
999 } else {
1000 tp->t_softerror = error;
1001 return (inp);
1002 }
1003#if 0
1004 wakeup( &so->so_timeo);
1005 sorwakeup(so);
1006 sowwakeup(so);
1007#endif
1008}
1009
1010static int
1011tcp_pcblist(SYSCTL_HANDLER_ARGS)
1012{
1013 int error, i, m, n, pcb_count;
1014 struct inpcb *inp, **inp_list;
1015 inp_gen_t gencnt;
1016 struct xinpgen xig;
1017
1018 /*
1019 * The process of preparing the TCB list is too time-consuming and
1020 * resource-intensive to repeat twice on every request.
1021 */
1022 if (req->oldptr == NULL) {
1023 m = syncache_pcbcount();
1024 n = V_tcbinfo.ipi_count;
1025 req->oldidx = 2 * (sizeof xig)
1026 + ((m + n) + n/8) * sizeof(struct xtcpcb);
1027 return (0);
1028 }
1029
1030 if (req->newptr != NULL)
1031 return (EPERM);
1032
1033 /*
1034 * OK, now we're committed to doing something.
1035 */
1036 INP_INFO_RLOCK(&V_tcbinfo);
1037 gencnt = V_tcbinfo.ipi_gencnt;
1038 n = V_tcbinfo.ipi_count;
1039 INP_INFO_RUNLOCK(&V_tcbinfo);
1040
1041 m = syncache_pcbcount();
1042
1043 error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
1044 + (n + m) * sizeof(struct xtcpcb));
1045 if (error != 0)
1046 return (error);
1047
1048 xig.xig_len = sizeof xig;
1049 xig.xig_count = n + m;
1050 xig.xig_gen = gencnt;
1051 xig.xig_sogen = so_gencnt;
1052 error = SYSCTL_OUT(req, &xig, sizeof xig);
1053 if (error)
1054 return (error);
1055
1056 error = syncache_pcblist(req, m, &pcb_count);
1057 if (error)
1058 return (error);
1059
1060 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
1061 if (inp_list == NULL)
1062 return (ENOMEM);
1063
1064 INP_INFO_RLOCK(&V_tcbinfo);
1065 for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
1066 inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
1067 INP_WLOCK(inp);
1068 if (inp->inp_gencnt <= gencnt) {
1069 /*
1070 * XXX: This use of cr_cansee(), introduced with
1071 * TCP state changes, is not quite right, but for
1072 * now, better than nothing.
1073 */
1074 if (inp->inp_flags & INP_TIMEWAIT) {
1075 if (intotw(inp) != NULL)
1076 error = cr_cansee(req->td->td_ucred,
1077 intotw(inp)->tw_cred);
1078 else
1079 error = EINVAL; /* Skip this inp. */
1080 } else
1081 error = cr_canseeinpcb(req->td->td_ucred, inp);
1082 if (error == 0) {
1083 in_pcbref(inp);
1084 inp_list[i++] = inp;
1085 }
1086 }
1087 INP_WUNLOCK(inp);
1088 }
1089 INP_INFO_RUNLOCK(&V_tcbinfo);
1090 n = i;
1091
1092 error = 0;
1093 for (i = 0; i < n; i++) {
1094 inp = inp_list[i];
1095 INP_RLOCK(inp);
1096 if (inp->inp_gencnt <= gencnt) {
1097 struct xtcpcb xt;
1098 void *inp_ppcb;
1099
1100 bzero(&xt, sizeof(xt));
1101 xt.xt_len = sizeof xt;
1102 /* XXX should avoid extra copy */
1103 bcopy(inp, &xt.xt_inp, sizeof *inp);
1104 inp_ppcb = inp->inp_ppcb;
1105 if (inp_ppcb == NULL)
1106 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
1107 else if (inp->inp_flags & INP_TIMEWAIT) {
1108 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
1109 xt.xt_tp.t_state = TCPS_TIME_WAIT;
1110 } else {
1111 bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
1112 if (xt.xt_tp.t_timers)
1113 tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer);
1114 }
1115 if (inp->inp_socket != NULL)
1116 sotoxsocket(inp->inp_socket, &xt.xt_socket);
1117 else {
1118 bzero(&xt.xt_socket, sizeof xt.xt_socket);
1119 xt.xt_socket.xso_protocol = IPPROTO_TCP;
1120 }
1121 xt.xt_inp.inp_gencnt = inp->inp_gencnt;
1122 INP_RUNLOCK(inp);
1123 error = SYSCTL_OUT(req, &xt, sizeof xt);
1124 } else
1125 INP_RUNLOCK(inp);
1126 }
1127 INP_INFO_WLOCK(&V_tcbinfo);
1128 for (i = 0; i < n; i++) {
1129 inp = inp_list[i];
1130 INP_WLOCK(inp);
1131 if (!in_pcbrele(inp))
1132 INP_WUNLOCK(inp);
1133 }
1134 INP_INFO_WUNLOCK(&V_tcbinfo);
1135
1136 if (!error) {
1137 /*
1138 * Give the user an updated idea of our state.
1139 * If the generation differs from what we told
1140 * her before, she knows that something happened
1141 * while we were processing this request, and it
1142 * might be necessary to retry.
1143 */
1144 INP_INFO_RLOCK(&V_tcbinfo);
1145 xig.xig_gen = V_tcbinfo.ipi_gencnt;
1146 xig.xig_sogen = so_gencnt;
1147 xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
1148 INP_INFO_RUNLOCK(&V_tcbinfo);
1149 error = SYSCTL_OUT(req, &xig, sizeof xig);
1150 }
1151 free(inp_list, M_TEMP);
1152 return (error);
1153}
1154
1155SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
1156 tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
1157
1158static int
1159tcp_getcred(SYSCTL_HANDLER_ARGS)
1160{
1161 struct xucred xuc;
1162 struct sockaddr_in addrs[2];
1163 struct inpcb *inp;
1164 int error;
1165
1166 error = priv_check(req->td, PRIV_NETINET_GETCRED);
1167 if (error)
1168 return (error);
1169 error = SYSCTL_IN(req, addrs, sizeof(addrs));
1170 if (error)
1171 return (error);
1172 INP_INFO_RLOCK(&V_tcbinfo);
1173 inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr,
1174 addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
1175 if (inp != NULL) {
1176 INP_RLOCK(inp);
1177 INP_INFO_RUNLOCK(&V_tcbinfo);
1178 if (inp->inp_socket == NULL)
1179 error = ENOENT;
1180 if (error == 0)
1181 error = cr_canseeinpcb(req->td->td_ucred, inp);
1182 if (error == 0)
1183 cru2x(inp->inp_cred, &xuc);
1184 INP_RUNLOCK(inp);
1185 } else {
1186 INP_INFO_RUNLOCK(&V_tcbinfo);
1187 error = ENOENT;
1188 }
1189 if (error == 0)
1190 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
1191 return (error);
1192}
1193
1194SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
1195 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
1196 tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
1197
1198#ifdef INET6
1199static int
1200tcp6_getcred(SYSCTL_HANDLER_ARGS)
1201{
1202 struct xucred xuc;
1203 struct sockaddr_in6 addrs[2];
1204 struct inpcb *inp;
1205 int error, mapped = 0;
1206
1207 error = priv_check(req->td, PRIV_NETINET_GETCRED);
1208 if (error)
1209 return (error);
1210 error = SYSCTL_IN(req, addrs, sizeof(addrs));
1211 if (error)
1212 return (error);
1213 if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
1214 (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
1215 return (error);
1216 }
1217 if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
1218 if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
1219 mapped = 1;
1220 else
1221 return (EINVAL);
1222 }
1223
1224 INP_INFO_RLOCK(&V_tcbinfo);
1225 if (mapped == 1)
1226 inp = in_pcblookup_hash(&V_tcbinfo,
1227 *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
1228 addrs[1].sin6_port,
1229 *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
1230 addrs[0].sin6_port,
1231 0, NULL);
1232 else
1233 inp = in6_pcblookup_hash(&V_tcbinfo,
1234 &addrs[1].sin6_addr, addrs[1].sin6_port,
1235 &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL);
1236 if (inp != NULL) {
1237 INP_RLOCK(inp);
1238 INP_INFO_RUNLOCK(&V_tcbinfo);
1239 if (inp->inp_socket == NULL)
1240 error = ENOENT;
1241 if (error == 0)
1242 error = cr_canseeinpcb(req->td->td_ucred, inp);
1243 if (error == 0)
1244 cru2x(inp->inp_cred, &xuc);
1245 INP_RUNLOCK(inp);
1246 } else {
1247 INP_INFO_RUNLOCK(&V_tcbinfo);
1248 error = ENOENT;
1249 }
1250 if (error == 0)
1251 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
1252 return (error);
1253}
1254
1255SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
1256 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
1257 tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
1258#endif
1259
1260
1261void
1262tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
1263{
1264 struct ip *ip = vip;
1265 struct tcphdr *th;
1266 struct in_addr faddr;
1267 struct inpcb *inp;
1268 struct tcpcb *tp;
1269 struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
1270 struct icmp *icp;
1271 struct in_conninfo inc;
1272 tcp_seq icmp_tcp_seq;
1273 int mtu;
1274
1275 faddr = ((struct sockaddr_in *)sa)->sin_addr;
1276 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1277 return;
1278
1279 if (cmd == PRC_MSGSIZE)
1280 notify = tcp_mtudisc;
1281 else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
1282 cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
1283 notify = tcp_drop_syn_sent;
1284 /*
1285 * Redirects don't need to be handled up here.
1286 */
1287 else if (PRC_IS_REDIRECT(cmd))
1288 return;
1289 /*
1290	 * Source quench is deprecated.
1291 */
1292 else if (cmd == PRC_QUENCH)
1293 return;
1294 /*
1295 * Hostdead is ugly because it goes linearly through all PCBs.
1296 * XXX: We never get this from ICMP, otherwise it makes an
1297 * excellent DoS attack on machines with many connections.
1298 */
1299 else if (cmd == PRC_HOSTDEAD)
1300 ip = NULL;
1301 else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
1302 return;
1303 if (ip != NULL) {
1304 icp = (struct icmp *)((caddr_t)ip
1305 - offsetof(struct icmp, icmp_ip));
1306 th = (struct tcphdr *)((caddr_t)ip
1307 + (ip->ip_hl << 2));
1308 INP_INFO_WLOCK(&V_tcbinfo);
1309 inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport,
1310 ip->ip_src, th->th_sport, 0, NULL);
1311 if (inp != NULL) {
1312 INP_WLOCK(inp);
1313 if (!(inp->inp_flags & INP_TIMEWAIT) &&
1314 !(inp->inp_flags & INP_DROPPED) &&
1315 !(inp->inp_socket == NULL)) {
1316 icmp_tcp_seq = htonl(th->th_seq);
1317 tp = intotcpcb(inp);
1318 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
1319 SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
1320 if (cmd == PRC_MSGSIZE) {
1321 /*
1322 * MTU discovery:
1323 * If we got a needfrag set the MTU
1324 * in the route to the suggested new
1325 * value (if given) and then notify.
1326 */
1327 bzero(&inc, sizeof(inc));
1328 inc.inc_faddr = faddr;
1329 inc.inc_fibnum =
1330 inp->inp_inc.inc_fibnum;
1331
1332 mtu = ntohs(icp->icmp_nextmtu);
1333 /*
1334 * If no alternative MTU was
1335 * proposed, try the next smaller
1336 * one. ip->ip_len has already
1337 * been swapped in icmp_input().
1338 */
1339 if (!mtu)
1340 mtu = ip_next_mtu(ip->ip_len,
1341 1);
1342 if (mtu < max(296, V_tcp_minmss
1343 + sizeof(struct tcpiphdr)))
1344 mtu = 0;
1345 if (!mtu)
1346 mtu = V_tcp_mssdflt
1347 + sizeof(struct tcpiphdr);
1348 /*
1349					 * Only cache the MTU if it
1350 * is smaller than the interface
1351 * or route MTU. tcp_mtudisc()
1352					 * will do the right thing by itself.
1353 */
1354 if (mtu <= tcp_maxmtu(&inc, NULL))
1355 tcp_hc_updatemtu(&inc, mtu);
1356 }
1357
1358 inp = (*notify)(inp, inetctlerrmap[cmd]);
1359 }
1360 }
1361 if (inp != NULL)
1362 INP_WUNLOCK(inp);
1363 } else {
1364 bzero(&inc, sizeof(inc));
1365 inc.inc_fport = th->th_dport;
1366 inc.inc_lport = th->th_sport;
1367 inc.inc_faddr = faddr;
1368 inc.inc_laddr = ip->ip_src;
1369 syncache_unreach(&inc, th);
1370 }
1371 INP_INFO_WUNLOCK(&V_tcbinfo);
1372 } else
1373 in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
1374}
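
/*
 * Editor's note: a standalone sketch of the PRC_MSGSIZE MTU selection
 * performed above, under stated assumptions: the plateau table is an
 * RFC 1191-style stand-in for ip_next_mtu(), and minmss/mssdflt are
 * illustrative defaults for V_tcp_minmss and V_tcp_mssdflt, not
 * authoritative values.  next_mtu is icmp_nextmtu from the needfrag
 * message (0 if the router proposed nothing) and ip_len the size of the
 * datagram that was dropped.  The caller would then cache the result only
 * if it does not exceed the interface/route MTU, as tcp_ctlinput() does
 * via tcp_maxmtu().
 */
#if 0
static unsigned int
pmtu_choose(unsigned int next_mtu, unsigned int ip_len)
{
	static const unsigned int plateau[] =
	    { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296 };
	const unsigned int minmss = 216;	/* stand-in for V_tcp_minmss */
	const unsigned int mssdflt = 512;	/* stand-in for V_tcp_mssdflt */
	const unsigned int hdrs = 40;		/* sizeof(struct tcpiphdr) */
	unsigned int mtu = next_mtu;
	unsigned int i;

	if (mtu == 0) {
		/* No proposal: fall back to the next smaller plateau. */
		for (i = 0; i < sizeof(plateau) / sizeof(plateau[0]); i++) {
			if (plateau[i] < ip_len) {
				mtu = plateau[i];
				break;
			}
		}
	}
	/* Distrust implausibly small values... */
	if (mtu < (minmss + hdrs > 296 ? minmss + hdrs : 296))
		mtu = 0;
	/* ...and fall back to a conservative default. */
	if (mtu == 0)
		mtu = mssdflt + hdrs;
	return (mtu);
}
#endif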
1375
1376#ifdef INET6
1377void
1378tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
1379{
1380 struct tcphdr th;
1381 struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
1382 struct ip6_hdr *ip6;
1383 struct mbuf *m;
1384 struct ip6ctlparam *ip6cp = NULL;
1385 const struct sockaddr_in6 *sa6_src = NULL;
1386 int off;
1387 struct tcp_portonly {
1388 u_int16_t th_sport;
1389 u_int16_t th_dport;
1390 } *thp;
1391
1392 if (sa->sa_family != AF_INET6 ||
1393 sa->sa_len != sizeof(struct sockaddr_in6))
1394 return;
1395
1396 if (cmd == PRC_MSGSIZE)
1397 notify = tcp_mtudisc;
1398 else if (!PRC_IS_REDIRECT(cmd) &&
1399 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
1400 return;
1401	/* Source quench is deprecated. */
1402 else if (cmd == PRC_QUENCH)
1403 return;
1404
1405 /* if the parameter is from icmp6, decode it. */
1406 if (d != NULL) {
1407 ip6cp = (struct ip6ctlparam *)d;
1408 m = ip6cp->ip6c_m;
1409 ip6 = ip6cp->ip6c_ip6;
1410 off = ip6cp->ip6c_off;
1411 sa6_src = ip6cp->ip6c_src;
1412 } else {
1413 m = NULL;
1414 ip6 = NULL;
1415 off = 0; /* fool gcc */
1416 sa6_src = &sa6_any;
1417 }
1418
1419 if (ip6 != NULL) {
1420 struct in_conninfo inc;
1421 /*
1422		 * XXX: We assume that when IPV6 is non-NULL,
1423 * M and OFF are valid.
1424 */
1425
1426 /* check if we can safely examine src and dst ports */
1427 if (m->m_pkthdr.len < off + sizeof(*thp))
1428 return;
1429
1430 bzero(&th, sizeof(th));
1431 m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
1432
1433 in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
1434 (struct sockaddr *)ip6cp->ip6c_src,
1435 th.th_sport, cmd, NULL, notify);
1436
1437 bzero(&inc, sizeof(inc));
1438 inc.inc_fport = th.th_dport;
1439 inc.inc_lport = th.th_sport;
1440 inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
1441 inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
1442 inc.inc_flags |= INC_ISIPV6;
1443 INP_INFO_WLOCK(&V_tcbinfo);
1444 syncache_unreach(&inc, &th);
1445 INP_INFO_WUNLOCK(&V_tcbinfo);
1446 } else
1447 in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
1448 0, cmd, NULL, notify);
1449}
1450#endif /* INET6 */
1451
1452
1453/*
1454 * Following is where TCP initial sequence number generation occurs.
1455 *
1456 * There are two places where we must use initial sequence numbers:
1457 * 1. In SYN-ACK packets.
1458 * 2. In SYN packets.
1459 *
1460 * All ISNs for SYN-ACK packets are generated by the syncache. See
1461 * tcp_syncache.c for details.
1462 *
1463 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
1464 * depends on this property. In addition, these ISNs should be
1465 * unguessable so as to prevent connection hijacking. To satisfy
1466 * the requirements of this situation, the algorithm outlined in
1467 * RFC 1948 is used, with only small modifications.
1468 *
1469 * Implementation details:
1470 *
1471 * Time is based on the system timer, and is corrected so that it
1472 * increases by one megabyte per second. This allows for proper
1473 * recycling on high speed LANs while still leaving over an hour
1474 * before rollover.
1475 *
1476 * As reading the *exact* system time is too expensive to be done
1477 * whenever setting up a TCP connection, we increment the time
1478 * offset in two ways. First, a small random positive increment
1479 * is added to isn_offset for each connection that is set up.
1480 * Second, the function tcp_isn_tick fires once per clock tick
1481 * and increments isn_offset as necessary so that sequence numbers
1482 * are incremented at approximately ISN_BYTES_PER_SECOND. The
1483 * random positive increments serve only to ensure that the same
1484 * exact sequence number is never sent out twice (as could otherwise
1485 * happen when a port is recycled in less than the system tick
1486 * interval).
1487 *
1488 * net.inet.tcp.isn_reseed_interval controls the number of seconds
1489 * between seeding of isn_secret. This is normally set to zero,
1490 * as reseeding should not be necessary.
1491 *
1492 * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
1493 * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In
1494 * general, this means holding an exclusive (write) lock.
1495 */
1496
1497#define ISN_BYTES_PER_SECOND 1048576
1498#define ISN_STATIC_INCREMENT 4096
1499#define ISN_RANDOM_INCREMENT (4096 - 1)
1500
1501static VNET_DEFINE(u_char, isn_secret[32]);
1502static VNET_DEFINE(int, isn_last_reseed);
1503static VNET_DEFINE(u_int32_t, isn_offset);
1504static VNET_DEFINE(u_int32_t, isn_offset_old);
1505
1506#define V_isn_secret VNET(isn_secret)
1507#define V_isn_last_reseed VNET(isn_last_reseed)
1508#define V_isn_offset VNET(isn_offset)
1509#define V_isn_offset_old VNET(isn_offset_old)
1510
1511tcp_seq
1512tcp_new_isn(struct tcpcb *tp)
1513{
1514 MD5_CTX isn_ctx;
1515 u_int32_t md5_buffer[4];
1516 tcp_seq new_isn;
1517
1518 INP_WLOCK_ASSERT(tp->t_inpcb);
1519
1520 ISN_LOCK();
1521 /* Seed if this is the first use, reseed if requested. */
1522 if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
1523 (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
1524 < (u_int)ticks))) {
1525 read_random(&V_isn_secret, sizeof(V_isn_secret));
1526 V_isn_last_reseed = ticks;
1527 }
1528
1529 /* Compute the md5 hash and return the ISN. */
1530 MD5Init(&isn_ctx);
1531 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
1532 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
1533#ifdef INET6
1534 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
1535 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
1536 sizeof(struct in6_addr));
1537 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
1538 sizeof(struct in6_addr));
1539 } else
1540#endif
1541 {
1542 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
1543 sizeof(struct in_addr));
1544 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
1545 sizeof(struct in_addr));
1546 }
1547 MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret));
1548 MD5Final((u_char *) &md5_buffer, &isn_ctx);
1549 new_isn = (tcp_seq) md5_buffer[0];
1550 V_isn_offset += ISN_STATIC_INCREMENT +
1551 (arc4random() & ISN_RANDOM_INCREMENT);
1552 new_isn += V_isn_offset;
1553 ISN_UNLOCK();
1554 return (new_isn);
1555}
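
/*
 * Editor's note: a hypothetical userland sketch of the RFC 1948
 * construction used by tcp_new_isn() above, assuming FreeBSD's libmd
 * <md5.h> interface (link with -lmd); isn_sketch is an illustrative name.
 * The kernel version additionally paces the offset via tcp_isn_tick() and
 * adds a small random increment per connection.
 */
#if 0
#include <sys/types.h>
#include <md5.h>
#include <stdint.h>

static uint32_t
isn_sketch(const void *laddr, const void *faddr, size_t alen,
    uint16_t lport, uint16_t fport, const uint8_t secret[32],
    uint32_t offset)
{
	MD5_CTX ctx;
	uint32_t digest[4];

	/* Same digest order as tcp_new_isn(): ports, addresses, secret. */
	MD5Init(&ctx);
	MD5Update(&ctx, &fport, sizeof(fport));
	MD5Update(&ctx, &lport, sizeof(lport));
	MD5Update(&ctx, faddr, alen);		/* 4 (IPv4) or 16 (IPv6) bytes */
	MD5Update(&ctx, laddr, alen);
	MD5Update(&ctx, secret, 32);
	MD5Final((unsigned char *)digest, &ctx);
	/* ISN = F(4-tuple, secret) + monotonically increasing offset. */
	return (digest[0] + offset);
}
#endif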
1556
1557/*
1558 * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary
1559 * to keep time flowing at a relatively constant rate. If the random
1560 * increments have already pushed us past the projected offset, do nothing.
1561 */
1562static void
1563tcp_isn_tick(void *xtp)
1564{
1565 VNET_ITERATOR_DECL(vnet_iter);
1566 u_int32_t projected_offset;
1567
1568 VNET_LIST_RLOCK_NOSLEEP();
1569 ISN_LOCK();
1570 VNET_FOREACH(vnet_iter) {
1571 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */
1572 projected_offset =
1573 V_isn_offset_old + ISN_BYTES_PER_SECOND / 100;
1574
1575 if (SEQ_GT(projected_offset, V_isn_offset))
1576 V_isn_offset = projected_offset;
1577
1578 V_isn_offset_old = V_isn_offset;
1579 CURVNET_RESTORE();
1580 }
1581 ISN_UNLOCK();
1582 VNET_LIST_RUNLOCK_NOSLEEP();
1583 callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
1584}
1585
1586/*
1587 * When a specific ICMP unreachable message is received and the
1588 * connection state is SYN-SENT, drop the connection. This behavior
1589 * is controlled by the icmp_may_rst sysctl.
1590 */
1591struct inpcb *
1592tcp_drop_syn_sent(struct inpcb *inp, int errno)
1593{
1594 struct tcpcb *tp;
1595
1596 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
1597 INP_WLOCK_ASSERT(inp);
1598
1599 if ((inp->inp_flags & INP_TIMEWAIT) ||
1600 (inp->inp_flags & INP_DROPPED))
1601 return (inp);
1602
1603 tp = intotcpcb(inp);
1604 if (tp->t_state != TCPS_SYN_SENT)
1605 return (inp);
1606
1607 tp = tcp_drop(tp, errno);
1608 if (tp != NULL)
1609 return (inp);
1610 else
1611 return (NULL);
1612}
1613
1614/*
1615 * When `need fragmentation' ICMP is received, update our idea of the MSS
1616 * based on the new value in the route. Also nudge TCP to send something,
1617 * since we know the packet we just sent was dropped.
1618 * This duplicates some code in the tcp_mss() function in tcp_input.c.
1619 */
1620struct inpcb *
1621tcp_mtudisc(struct inpcb *inp, int errno)
1622{
1623 struct tcpcb *tp;
1624 struct socket *so;
1625
1626 INP_WLOCK_ASSERT(inp);
1627 if ((inp->inp_flags & INP_TIMEWAIT) ||
1628 (inp->inp_flags & INP_DROPPED))
1629 return (inp);
1630
1631 tp = intotcpcb(inp);
1632 KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
1633
1634 tcp_mss_update(tp, -1, NULL, NULL);
1635
1636 so = inp->inp_socket;
1637 SOCKBUF_LOCK(&so->so_snd);
1638 /* If the mss is larger than the socket buffer, decrease the mss. */
1639 if (so->so_snd.sb_hiwat < tp->t_maxseg)
1640 tp->t_maxseg = so->so_snd.sb_hiwat;
1641 SOCKBUF_UNLOCK(&so->so_snd);
1642
1643 TCPSTAT_INC(tcps_mturesent);
1644 tp->t_rtttime = 0;
1645 tp->snd_nxt = tp->snd_una;
1646 tcp_free_sackholes(tp);
1647 tp->snd_recover = tp->snd_max;
1648 if (tp->t_flags & TF_SACK_PERMIT)
1649 EXIT_FASTRECOVERY(tp);
1650 tcp_output_send(tp);
1651 return (inp);
1652}
1653
1654/*
1655 * Look up the routing entry to the peer of this inpcb. If no route
1656 * is found and it cannot be allocated, then return 0. This routine
1657 * is called by TCP routines that access the rmx structure and by
1658 * tcp_mss_update to get the peer/interface MTU.
1659 */
1660u_long
1661tcp_maxmtu(struct in_conninfo *inc, int *flags)
1662{
1663 struct route sro;
1664 struct sockaddr_in *dst;
1665 struct ifnet *ifp;
1666 u_long maxmtu = 0;
1667
1668 KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
1669
1670 bzero(&sro, sizeof(sro));
1671 if (inc->inc_faddr.s_addr != INADDR_ANY) {
1672 dst = (struct sockaddr_in *)&sro.ro_dst;
1673 dst->sin_family = AF_INET;
1674 dst->sin_len = sizeof(*dst);
1675 dst->sin_addr = inc->inc_faddr;
1676 in_rtalloc_ign(&sro, 0, inc->inc_fibnum);
1677 }
1678 if (sro.ro_rt != NULL) {
1679 ifp = sro.ro_rt->rt_ifp;
1680 if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
1681 maxmtu = ifp->if_mtu;
1682 else
1683 maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
1684
1685 /* Report additional interface capabilities. */
1686 if (flags != NULL) {
1687 if (ifp->if_capenable & IFCAP_TSO4 &&
1688 ifp->if_hwassist & CSUM_TSO)
1689 *flags |= CSUM_TSO;
1690 }
1691 RTFREE(sro.ro_rt);
1692 }
1693 return (maxmtu);
1694}
1695
1696#ifdef INET6
1697u_long
1698tcp_maxmtu6(struct in_conninfo *inc, int *flags)
1699{
1700 struct route_in6 sro6;
1701 struct ifnet *ifp;
1702 u_long maxmtu = 0;
1703
1704 KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
1705
1706 bzero(&sro6, sizeof(sro6));
1707 if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
1708 sro6.ro_dst.sin6_family = AF_INET6;
1709 sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
1710 sro6.ro_dst.sin6_addr = inc->inc6_faddr;
1711 rtalloc_ign((struct route *)&sro6, 0);
1712 }
1713 if (sro6.ro_rt != NULL) {
1714 ifp = sro6.ro_rt->rt_ifp;
1715 if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
1716 maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
1717 else
1718 maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
1719 IN6_LINKMTU(sro6.ro_rt->rt_ifp));
1720
1721 /* Report additional interface capabilities. */
1722 if (flags != NULL) {
1723 if (ifp->if_capenable & IFCAP_TSO6 &&
1724 ifp->if_hwassist & CSUM_TSO)
1725 *flags |= CSUM_TSO;
1726 }
1727 RTFREE(sro6.ro_rt);
1728 }
1729
1730 return (maxmtu);
1731}
1732#endif /* INET6 */
1733
1734#ifdef IPSEC
1735/* compute ESP/AH header size for TCP, including outer IP header. */
1736size_t
1737ipsec_hdrsiz_tcp(struct tcpcb *tp)
1738{
1739 struct inpcb *inp;
1740 struct mbuf *m;
1741 size_t hdrsiz;
1742 struct ip *ip;
1743#ifdef INET6
1744 struct ip6_hdr *ip6;
1745#endif
1746 struct tcphdr *th;
1747
1748 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
1749 return (0);
1750 MGETHDR(m, M_DONTWAIT, MT_DATA);
1751 if (!m)
1752 return (0);
1753
1754#ifdef INET6
1755 if ((inp->inp_vflag & INP_IPV6) != 0) {
1756 ip6 = mtod(m, struct ip6_hdr *);
1757 th = (struct tcphdr *)(ip6 + 1);
1758 m->m_pkthdr.len = m->m_len =
1759 sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1760 tcpip_fillheaders(inp, ip6, th);
1761 hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1762 } else
1763#endif /* INET6 */
1764 {
1765 ip = mtod(m, struct ip *);
1766 th = (struct tcphdr *)(ip + 1);
1767 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
1768 tcpip_fillheaders(inp, ip, th);
1769 hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1770 }
1771
1772 m_free(m);
1773 return (hdrsiz);
1774}
1775#endif /* IPSEC */
1776
1777/*
1778 * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
1779 *
1780 * This code attempts to calculate the bandwidth-delay product as a
1781 * means of determining the optimal window size to maximize bandwidth,
1782 * minimize RTT, and avoid the over-allocation of buffers on interfaces and
1783 * routers. This code also does a fairly good job keeping RTTs in check
1784 * across slow links like modems. We implement an algorithm which is very
1785 * similar to (but not meant to be) TCP/Vegas. The code operates on the
1786 * transmitter side of a TCP connection and so only affects the transmit
1787 * side of the connection.
1788 *
1789 * BACKGROUND: TCP makes no provision for the management of buffer space
1790 * at the end points or at the intermediate routers and switches. A TCP
1791 * stream, whether using NewReno or not, will eventually buffer as
1792 * many packets as it is able and the only reason this typically works is
1793 * due to the fairly small default buffers made available for a connection
1794 * (typically 16K or 32K). As machines use larger windows and/or window
1795 * scaling it is now fairly easy for even a single TCP connection to blow-out
1796 * all available buffer space not only on the local interface, but on
1797 * intermediate routers and switches as well. NewReno makes a misguided
1798 * attempt to 'solve' this problem by waiting for an actual failure to occur,
1799 * then backing off, then steadily increasing the window again until another
1800 * failure occurs, ad-infinitum. This results in terrible oscillation that
1801 * is only made worse as network loads increase and the idea of intentionally
1802 * blowing out network buffers is, frankly, a terrible way to manage network
1803 * resources.
1804 *
1805 * It is far better to limit the transmit window prior to the failure
1806 * condition being achieved. There are two general ways to do this: First
1807 * you can 'scan' through different transmit window sizes and locate the
1808 * point where the RTT stops increasing, indicating that you have filled the
1809 * pipe, then scan backwards until you note that RTT stops decreasing, then
1810 * repeat ad-infinitum. This method works in principle but has severe
1811 * implementation issues due to RTT variances, timer granularity, and
1812 * instability in the algorithm which can lead to many false positives and
1813 * create oscillations as well as interact badly with other TCP streams
1814 * implementing the same algorithm.
1815 *
1816 * The second method is to limit the window to the bandwidth delay product
1817 * of the link. This is the method we implement. RTT variances and our
1818 * own manipulation of the congestion window, bwnd, can potentially
1819 * destabilize the algorithm. For this reason we have to stabilize the
1820 * elements used to calculate the window. We do this by using the minimum
1821 * observed RTT, the long term average of the observed bandwidth, and
1822 * by adding two segments worth of slop. It isn't perfect but it is able
1823 * to react to changing conditions and gives us a very stable basis on
1824 * which to extend the algorithm.
1825 */
1826void
1827tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
1828{
1829 u_long bw;
1830 u_long bwnd;
1831 int save_ticks;
1832
1833 INP_WLOCK_ASSERT(tp->t_inpcb);
1834
1835 /*
1836 * If inflight_enable is disabled in the middle of a tcp connection,
1837 * make sure snd_bwnd is effectively disabled.
1838 */
1839 if (V_tcp_inflight_enable == 0 ||
1840 tp->t_rttlow < V_tcp_inflight_rttthresh) {
1841 tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1842 tp->snd_bandwidth = 0;
1843 return;
1844 }
1845
1846 /*
1847 * Figure out the bandwidth. Due to the tick granularity this
1848 * is a very rough number and it MUST be averaged over a fairly
1849 * long period of time. XXX we need to take into account a link
1850 * that is not using all available bandwidth, but for now our
1851 * slop will ramp us up if this case occurs and the bandwidth later
1852 * increases.
1853 *
1854 * Note: if ticks rollover 'bw' may wind up negative. We must
1855 * effectively reset t_bw_rtttime for this case.
1856 */
1857 save_ticks = ticks;
1858 if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
1859 return;
1860
1861 bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
1862 (save_ticks - tp->t_bw_rtttime);
1863 tp->t_bw_rtttime = save_ticks;
1864 tp->t_bw_rtseq = ack_seq;
1865 if (tp->t_bw_rtttime == 0 || (int)bw < 0)
1866 return;
1867 bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
1868
1869 tp->snd_bandwidth = bw;
1870
1871 /*
1872 * Calculate the semi-static bandwidth delay product, plus two maximal
1873 * segments. The additional slop puts us squarely in the sweet
1874 * spot and also handles the bandwidth run-up case and stabilization.
1875 * Without the slop we could be locking ourselves into a lower
1876 * bandwidth.
1877 *
1878 * Situations Handled:
1879 * (1) Prevents over-queueing of packets on LANs, especially on
1880 * high speed LANs, allowing larger TCP buffers to be
1881 * specified, and also does a good job preventing
1882 * over-queueing of packets over choke points like modems
1883 * (at least for the transmit side).
1884 *
1885 * (2) Is able to handle changing network loads (bandwidth
1886 * drops so bwnd drops, bandwidth increases so bwnd
1887 * increases).
1888 *
1889 * (3) Theoretically should stabilize in the face of multiple
1890 * connections implementing the same algorithm (this may need
1891 * a little work).
1892 *
1893 * (4) Stability value (defaults to 20 = 2 maximal packets) can
1894 *	    be adjusted with a sysctl, but typically only needs to be
1895 *	    changed on very slow connections. A value no smaller than 5
1896 * should be used, but only reduce this default if you have
1897 * no other choice.
1898 */
1899#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
1900	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) +
	    V_tcp_inflight_stab * tp->t_maxseg / 10;
1901#undef USERTT
1902
1903 if (tcp_inflight_debug > 0) {
1904 static int ltime;
1905 if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
1906 ltime = ticks;
1907 printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
1908 tp,
1909 bw,
1910 tp->t_rttbest,
1911 tp->t_srtt,
1912 bwnd
1913 );
1914 }
1915 }
1916 if ((long)bwnd < V_tcp_inflight_min)
1917 bwnd = V_tcp_inflight_min;
1918 if (bwnd > V_tcp_inflight_max)
1919 bwnd = V_tcp_inflight_max;
1920 if ((long)bwnd < tp->t_maxseg * 2)
1921 bwnd = tp->t_maxseg * 2;
1922 tp->snd_bwnd = bwnd;
1923}
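
/*
 * Editor's note: a minimal standalone sketch of the arithmetic above,
 * under stated assumptions: srtt_ticks is an RTT already descaled to
 * ticks (the code above averages t_srtt and t_rttbest, which carry a
 * TCP_RTT_SHIFT scale factor), hz is an illustrative tick rate, and the
 * inflight_min/inflight_max clamps are omitted.  stab is in tenths of a
 * segment (default 20 = 2 maximal packets), matching the sysctl above.
 */
#if 0
#include <stdint.h>

static uint64_t
bwnd_sketch(uint64_t bw_avg, uint64_t bytes_acked, uint64_t elapsed_ticks,
    uint64_t srtt_ticks, unsigned int maxseg, unsigned int stab)
{
	const uint64_t hz = 1000;	/* illustrative tick rate */
	uint64_t bw, bwnd;

	/* Instantaneous estimate; caller guarantees elapsed_ticks >= 1. */
	bw = bytes_acked * hz / elapsed_ticks;
	/* Long-term 15/16 exponential moving average, as above. */
	bw = (bw_avg * 15 + bw) >> 4;
	/* Bandwidth-delay product plus stab/10 segments of slop. */
	bwnd = bw * srtt_ticks / hz + (uint64_t)stab * maxseg / 10;
	if (bwnd < 2 * (uint64_t)maxseg)
		bwnd = 2 * (uint64_t)maxseg;	/* floor of two segments */
	return (bwnd);
}
#endif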
1924
1925#ifdef TCP_SIGNATURE
1926/*
1927 * Callback function invoked by m_apply() to digest TCP segment data
1928 * contained within an mbuf chain.
1929 */
1930static int
1931tcp_signature_apply(void *fstate, void *data, u_int len)
1932{
1933
1934 MD5Update(fstate, (u_char *)data, len);
1935 return (0);
1936}
1937
1938/*
1939 * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
1940 *
1941 * Parameters:
1942 * m pointer to head of mbuf chain
1943 * _unused
1944 * len length of TCP segment data, excluding options
1945 * optlen length of TCP segment options
1946 * buf pointer to storage for computed MD5 digest
1947 * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
1948 *
1949 * We do this over ip, tcphdr, segment data, and the key in the SADB.
1950 * When called from tcp_input(), we can be sure that th_sum has been
1951 * zeroed out and verified already.
1952 *
1953 * Return 0 if successful, otherwise return -1.
1954 *
1955 * XXX The key is retrieved from the system's PF_KEY SADB, by keying a
1956 * search with the destination IP address, and a 'magic SPI' to be
1957 * determined by the application. This is hardcoded elsewhere to 1179
1958 * right now. Another branch of this code exists which uses the SPD to
1959 * specify per-application flows but it is unstable.
1960 */
1961int
1962tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
1963 u_char *buf, u_int direction)
1964{
1965 union sockaddr_union dst;
1966 struct ippseudo ippseudo;
1967 MD5_CTX ctx;
1968 int doff;
1969 struct ip *ip;
1970 struct ipovly *ipovly;
1971 struct secasvar *sav;
1972 struct tcphdr *th;
1973#ifdef INET6
1974 struct ip6_hdr *ip6;
1975 struct in6_addr in6;
1976 char ip6buf[INET6_ADDRSTRLEN];
1977 uint32_t plen;
1978 uint16_t nhdr;
1979#endif
1980 u_short savecsum;
1981
1982 KASSERT(m != NULL, ("NULL mbuf chain"));
1983 KASSERT(buf != NULL, ("NULL signature pointer"));
1984
1985 /* Extract the destination from the IP header in the mbuf. */
1986 bzero(&dst, sizeof(union sockaddr_union));
1987 ip = mtod(m, struct ip *);
1988#ifdef INET6
1989 ip6 = NULL; /* Make the compiler happy. */
1990#endif
1991 switch (ip->ip_v) {
1992 case IPVERSION:
1993 dst.sa.sa_len = sizeof(struct sockaddr_in);
1994 dst.sa.sa_family = AF_INET;
1995 dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
1996 ip->ip_src : ip->ip_dst;
1997 break;
1998#ifdef INET6
1999 case (IPV6_VERSION >> 4):
2000 ip6 = mtod(m, struct ip6_hdr *);
2001 dst.sa.sa_len = sizeof(struct sockaddr_in6);
2002 dst.sa.sa_family = AF_INET6;
2003 dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ?
2004 ip6->ip6_src : ip6->ip6_dst;
2005 break;
2006#endif
2007 default:
2008 return (EINVAL);
2009 /* NOTREACHED */
2010 break;
2011 }
2012
2013 /* Look up an SADB entry which matches the address of the peer. */
2014 sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
2015 if (sav == NULL) {
2016 ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__,
2017 (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) :
2018#ifdef INET6
2019 (ip->ip_v == (IPV6_VERSION >> 4)) ?
2020 ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) :
2021#endif
2022 "(unsupported)"));
2023 return (EINVAL);
2024 }
2025
2026 MD5Init(&ctx);
2027 /*
2028 * Step 1: Update MD5 hash with IP(v6) pseudo-header.
2029 *
2030 * XXX The ippseudo header MUST be digested in network byte order,
2031 * or else we'll fail the regression test. Assume all fields we've
2032 * been doing arithmetic on have been in host byte order.
2033 * XXX One cannot depend on ipovly->ih_len here. When called from
2034 * tcp_output(), the underlying ip_len member has not yet been set.
2035 */
2036 switch (ip->ip_v) {
2037 case IPVERSION:
2038 ipovly = (struct ipovly *)ip;
2039 ippseudo.ippseudo_src = ipovly->ih_src;
2040 ippseudo.ippseudo_dst = ipovly->ih_dst;
2041 ippseudo.ippseudo_pad = 0;
2042 ippseudo.ippseudo_p = IPPROTO_TCP;
2043 ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) +
2044 optlen);
2045 MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));
2046
2047 th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip));
2048 doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen;
2049 break;
2050#ifdef INET6
2051 /*
2052 * RFC 2385, 2.0 Proposal
2053 * For IPv6, the pseudo-header is as described in RFC 2460, namely the
2054 * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero-
2055 * extended next header value (to form 32 bits), and 32-bit segment
2056 * length.
2057 * Note: Upper-Layer Packet Length comes before Next Header.
2058 */
2059 case (IPV6_VERSION >> 4):
2060 in6 = ip6->ip6_src;
2061 in6_clearscope(&in6);
2062 MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
2063 in6 = ip6->ip6_dst;
2064 in6_clearscope(&in6);
2065 MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
2066 plen = htonl(len + sizeof(struct tcphdr) + optlen);
2067 MD5Update(&ctx, (char *)&plen, sizeof(uint32_t));
2068 nhdr = 0;
2069 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
2070 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
2071 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
2072 nhdr = IPPROTO_TCP;
2073 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
2074
2075 th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr));
2076 doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen;
2077 break;
2078#endif
2079 default:
2080 return (EINVAL);
2081 /* NOTREACHED */
2082 break;
2083 }
2084
2085
2086 /*
2087 * Step 2: Update MD5 hash with TCP header, excluding options.
2088 * The TCP checksum must be set to zero.
2089 */
2090 savecsum = th->th_sum;
2091 th->th_sum = 0;
2092 MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
2093 th->th_sum = savecsum;
2094
2095 /*
2096 * Step 3: Update MD5 hash with TCP segment data.
2097 * Use m_apply() to avoid an early m_pullup().
2098 */
2099 if (len > 0)
2100 m_apply(m, doff, len, tcp_signature_apply, &ctx);
2101
2102 /*
2103 * Step 4: Update MD5 hash with shared secret.
2104 */
2105 MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth));
2106 MD5Final(buf, &ctx);
2107
2108 key_sa_recordxfer(sav, m);
2109 KEY_FREESAV(&sav);
2110 return (0);
2111}
2112#endif /* TCP_SIGNATURE */
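
/*
 * Editor's note: a hypothetical userland sketch of the RFC 2385 digest
 * sequence that tcp_signature_compute() implements for the IPv4 case,
 * assuming libmd's <md5.h> (link with -lmd); tcpmd5_sketch is an
 * illustrative name.  As in the code above, th->th_sum must already be
 * zero, all header fields must be in network byte order, and TCP options
 * are counted in the pseudo-header length but are not themselves
 * digested.  The key is passed in directly instead of being fetched from
 * the SADB.
 */
#if 0
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <md5.h>
#include <stdint.h>

static void
tcpmd5_sketch(const struct ip *ip, const struct tcphdr *th, int optlen,
    const void *data, uint16_t datalen, const void *key, size_t keylen,
    unsigned char digest[16])
{
	MD5_CTX ctx;
	struct {			/* cf. struct ippseudo */
		struct in_addr ph_src, ph_dst;
		uint8_t ph_pad, ph_p;
		uint16_t ph_len;
	} ph;

	ph.ph_src = ip->ip_src;
	ph.ph_dst = ip->ip_dst;
	ph.ph_pad = 0;
	ph.ph_p = IPPROTO_TCP;
	ph.ph_len = htons(sizeof(*th) + optlen + datalen);

	MD5Init(&ctx);
	MD5Update(&ctx, &ph, sizeof(ph));	/* 1: pseudo-header */
	MD5Update(&ctx, th, sizeof(*th));	/* 2: TCP header, csum zeroed */
	MD5Update(&ctx, data, datalen);		/* 3: segment data */
	MD5Update(&ctx, key, keylen);		/* 4: shared secret */
	MD5Final(digest, &ctx);
}
#endif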
2113
2114static int
2115sysctl_drop(SYSCTL_HANDLER_ARGS)
2116{
2117 /* addrs[0] is a foreign socket, addrs[1] is a local one. */
2118 struct sockaddr_storage addrs[2];
2119 struct inpcb *inp;
2120 struct tcpcb *tp;
2121 struct tcptw *tw;
2122 struct sockaddr_in *fin, *lin;
2123#ifdef INET6
2124 struct sockaddr_in6 *fin6, *lin6;
2125#endif
2126 int error;
2127
2128 inp = NULL;
2129 fin = lin = NULL;
2130#ifdef INET6
2131 fin6 = lin6 = NULL;
2132#endif
2133 error = 0;
2134
2135 if (req->oldptr != NULL || req->oldlen != 0)
2136 return (EINVAL);
2137 if (req->newptr == NULL)
2138 return (EPERM);
2139 if (req->newlen < sizeof(addrs))
2140 return (ENOMEM);
2141 error = SYSCTL_IN(req, &addrs, sizeof(addrs));
2142 if (error)
2143 return (error);
2144
2145 switch (addrs[0].ss_family) {
2146#ifdef INET6
2147 case AF_INET6:
2148 fin6 = (struct sockaddr_in6 *)&addrs[0];
2149 lin6 = (struct sockaddr_in6 *)&addrs[1];
2150 if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
2151 lin6->sin6_len != sizeof(struct sockaddr_in6))
2152 return (EINVAL);
2153 if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
2154 if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
2155 return (EINVAL);
2156 in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
2157 in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
2158 fin = (struct sockaddr_in *)&addrs[0];
2159 lin = (struct sockaddr_in *)&addrs[1];
2160 break;
2161 }
2162 error = sa6_embedscope(fin6, V_ip6_use_defzone);
2163 if (error)
2164 return (error);
2165 error = sa6_embedscope(lin6, V_ip6_use_defzone);
2166 if (error)
2167 return (error);
2168 break;
2169#endif
2170 case AF_INET:
2171 fin = (struct sockaddr_in *)&addrs[0];
2172 lin = (struct sockaddr_in *)&addrs[1];
2173 if (fin->sin_len != sizeof(struct sockaddr_in) ||
2174 lin->sin_len != sizeof(struct sockaddr_in))
2175 return (EINVAL);
2176 break;
2177 default:
2178 return (EINVAL);
2179 }
2180 INP_INFO_WLOCK(&V_tcbinfo);
2181 switch (addrs[0].ss_family) {
2182#ifdef INET6
2183 case AF_INET6:
2184 inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr,
2185 fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0,
2186 NULL);
2187 break;
2188#endif
2189 case AF_INET:
2190 inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr,
2191 fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL);
2192 break;
2193 }
2194 if (inp != NULL) {
2195 INP_WLOCK(inp);
2196 if (inp->inp_flags & INP_TIMEWAIT) {
2197 /*
2198 * XXXRW: There currently exists a state where an
2199 * inpcb is present, but its timewait state has been
2200 * discarded. For now, don't allow dropping of this
2201 * type of inpcb.
2202 */
2203 tw = intotw(inp);
2204 if (tw != NULL)
2205 tcp_twclose(tw, 0);
2206 else
2207 INP_WUNLOCK(inp);
2208 } else if (!(inp->inp_flags & INP_DROPPED) &&
2209 !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
2210 tp = intotcpcb(inp);
2211 tp = tcp_drop(tp, ECONNABORTED);
2212 if (tp != NULL)
2213 INP_WUNLOCK(inp);
2214 } else
2215 INP_WUNLOCK(inp);
2216 } else
2217 error = ESRCH;
2218 INP_INFO_WUNLOCK(&V_tcbinfo);
2219 return (error);
2220}
2221
2222SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
2223 CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
2224 0, sysctl_drop, "", "Drop TCP connection");
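
/*
 * Editor's note: a userland sketch of driving the net.inet.tcp.drop
 * sysctl above, in the style of tcpdrop(8); tcp_drop_conn is an
 * illustrative name.  Per the comment in sysctl_drop(), addrs[0] names
 * the foreign endpoint and addrs[1] the local one, and both sockaddrs
 * must carry a correct sa_len.  sysctl_drop() also requires that no old
 * data be requested, hence the NULL oldp/oldlenp.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <string.h>

static int
tcp_drop_conn(const struct sockaddr *foreign, const struct sockaddr *local)
{
	struct sockaddr_storage addrs[2];

	memset(addrs, 0, sizeof(addrs));
	memcpy(&addrs[0], foreign, foreign->sa_len);	/* foreign first */
	memcpy(&addrs[1], local, local->sa_len);
	return (sysctlbyname("net.inet.tcp.drop", NULL, NULL,
	    addrs, sizeof(addrs)));
}
#endif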
2225
2226/*
2227 * Generate a standardized TCP log line for use throughout the
2228 * tcp subsystem. Memory allocation is done with M_NOWAIT to
2229 * allow use in the interrupt context.
2230 *
2231 * NB: The caller MUST free(s, M_TCPLOG) the returned string.
2232 * NB: The function may return NULL if memory allocation failed.
2233 *
2234 * Due to header inclusion and ordering limitations the struct ip
2235 * and ip6_hdr pointers have to be passed as void pointers.
2236 */
2237char *
2238tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
2239 const void *ip6hdr)
2240{
2241 char *s, *sp;
2242 size_t size;
2243 struct ip *ip;
2244#ifdef INET6
2245 const struct ip6_hdr *ip6;
2246
2247 ip6 = (const struct ip6_hdr *)ip6hdr;
2248#endif /* INET6 */
2249 ip = (struct ip *)ip4hdr;
2250
2251 /*
2252 * The log line looks like this:
2253 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
2254 */
2255 size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
2256 sizeof(PRINT_TH_FLAGS) + 1 +
2257#ifdef INET6
2258 2 * INET6_ADDRSTRLEN;
2259#else
2260 2 * INET_ADDRSTRLEN;
2261#endif /* INET6 */
2262
2263 /* Is logging enabled? */
2264 if (tcp_log_debug == 0 && tcp_log_in_vain == 0)
2265 return (NULL);
2266
2267 s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
2268 if (s == NULL)
2269 return (NULL);
2270
2271 strcat(s, "TCP: [");
2272 sp = s + strlen(s);
2273
2274 if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) {
2275 inet_ntoa_r(inc->inc_faddr, sp);
2276 sp = s + strlen(s);
2277 sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
2278 sp = s + strlen(s);
2279 inet_ntoa_r(inc->inc_laddr, sp);
2280 sp = s + strlen(s);
2281 sprintf(sp, "]:%i", ntohs(inc->inc_lport));
2282#ifdef INET6
2283 } else if (inc) {
2284 ip6_sprintf(sp, &inc->inc6_faddr);
2285 sp = s + strlen(s);
2286 sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
2287 sp = s + strlen(s);
2288 ip6_sprintf(sp, &inc->inc6_laddr);
2289 sp = s + strlen(s);
2290 sprintf(sp, "]:%i", ntohs(inc->inc_lport));
2291 } else if (ip6 && th) {
2292 ip6_sprintf(sp, &ip6->ip6_src);
2293 sp = s + strlen(s);
2294 sprintf(sp, "]:%i to [", ntohs(th->th_sport));
2295 sp = s + strlen(s);
2296 ip6_sprintf(sp, &ip6->ip6_dst);
2297 sp = s + strlen(s);
2298 sprintf(sp, "]:%i", ntohs(th->th_dport));
2299#endif /* INET6 */
2300 } else if (ip && th) {
2301 inet_ntoa_r(ip->ip_src, sp);
2302 sp = s + strlen(s);
2303 sprintf(sp, "]:%i to [", ntohs(th->th_sport));
2304 sp = s + strlen(s);
2305 inet_ntoa_r(ip->ip_dst, sp);
2306 sp = s + strlen(s);
2307 sprintf(sp, "]:%i", ntohs(th->th_dport));
2308 } else {
2309 free(s, M_TCPLOG);
2310 return (NULL);
2311 }
2312 sp = s + strlen(s);
2313 if (th)
2314 sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS);
2315 if (*(s + size - 1) != '\0')
2316 panic("%s: string too long", __func__);
2317 return (s);
2318}
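
/*
 * Editor's note: a sketch of the canonical in-kernel caller pattern for
 * tcp_log_addrs().  Both the NULL check and the free(s, M_TCPLOG) are
 * mandatory, per the NB above; "<event>" stands for a caller-supplied
 * message.
 */
#if 0
	char *s;

	if ((s = tcp_log_addrs(&inp->inp_inc, th, NULL, NULL)) != NULL) {
		log(LOG_DEBUG, "%s; %s: <event>\n", s, __func__);
		free(s, M_TCPLOG);
	}
#endif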