Deleted Added
full compact
tcp_timewait.c (77900) tcp_timewait.c (78064)
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
34 * $FreeBSD: head/sys/netinet/tcp_timewait.c 77900 2001-06-08 05:24:21Z peter $
34 * $FreeBSD: head/sys/netinet/tcp_timewait.c 78064 2001-06-11 12:39:29Z ume $
35 */
36
37#include "opt_compat.h"
38#include "opt_inet6.h"
39#include "opt_ipsec.h"
40#include "opt_tcpdebug.h"
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/callout.h>
45#include <sys/kernel.h>
46#include <sys/sysctl.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#ifdef INET6
50#include <sys/domain.h>
51#endif
52#include <sys/proc.h>
53#include <sys/socket.h>
54#include <sys/socketvar.h>
55#include <sys/protosw.h>
56#include <sys/random.h>
57
58#include <vm/vm_zone.h>
59
60#include <net/route.h>
61#include <net/if.h>
62
63#define _IP_VHL
64#include <netinet/in.h>
65#include <netinet/in_systm.h>
66#include <netinet/ip.h>
67#ifdef INET6
68#include <netinet/ip6.h>
69#endif
70#include <netinet/in_pcb.h>
71#ifdef INET6
72#include <netinet6/in6_pcb.h>
73#endif
74#include <netinet/in_var.h>
75#include <netinet/ip_var.h>
76#ifdef INET6
77#include <netinet6/ip6_var.h>
78#endif
79#include <netinet/tcp.h>
80#include <netinet/tcp_fsm.h>
81#include <netinet/tcp_seq.h>
82#include <netinet/tcp_timer.h>
83#include <netinet/tcp_var.h>
84#ifdef INET6
85#include <netinet6/tcp6_var.h>
86#endif
87#include <netinet/tcpip.h>
88#ifdef TCPDEBUG
89#include <netinet/tcp_debug.h>
90#endif
91#include <netinet6/ip6protosw.h>
92
93#ifdef IPSEC
94#include <netinet6/ipsec.h>
95#ifdef INET6
96#include <netinet6/ipsec6.h>
97#endif
98#endif /*IPSEC*/
99
100#include <machine/in_cksum.h>
101
/*
 * Tunables and counters exported through SYSCTL_INT under the
 * _net_inet_tcp node.  CTLFLAG_RW entries are writable, CTLFLAG_RD
 * entries are read-only.
 */
/* Default MSS; tcp_newtcpcb() copies it into t_maxseg/t_maxopd for non-IPv6 PCBs. */
102int tcp_mssdflt = TCP_MSS;
103SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
104 &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
105
/* IPv6 counterpart of tcp_mssdflt; only present in INET6 kernels. */
106#ifdef INET6
107int tcp_v6mssdflt = TCP6_MSS;
108SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
109 CTLFLAG_RW, &tcp_v6mssdflt , 0,
110 "Default TCP Maximum Segment Size for IPv6");
111#endif
112
/* Compiled out by the #if 0 below. */
113#if 0
114static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
115SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
116 &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
117#endif
118
/* When non-zero, tcp_newtcpcb() sets TF_REQ_SCALE|TF_REQ_TSTMP on new PCBs. */
119static int tcp_do_rfc1323 = 1;
120SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
121 &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
122
/* When non-zero, tcp_newtcpcb() additionally sets TF_REQ_CC on new PCBs. */
123static int tcp_do_rfc1644 = 0;
124SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
125 &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
126
/* Read-only; tcp_init() stores the hash size it actually used here. */
127static int tcp_tcbhashsize = 0;
128SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
129 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
130
131static int do_tcpdrain = 1;
132SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
133 "Enable tcp_drain routine for extra help when low on mbufs");
134
/* Read-only view of tcbinfo.ipi_count. */
135SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
136 &tcbinfo.ipi_count, 0, "Number of active PCBs");
137
138static int icmp_may_rst = 1;
139SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
140 "Certain ICMP unreachable messages may abort connections in SYN_SENT");
141
/* Forward declarations of file-local helpers. */
142static void tcp_cleartaocache __P((void));
143static void tcp_notify __P((struct inpcb *, int));
144
145/*
146 * Target size of TCP PCB hash tables. Must be a power of two.
147 *
148 * Note that this can be overridden by the kernel environment
149 * variable net.inet.tcp.tcbhashsize
 *
 * tcp_init() prints a warning and falls back to 512 when the value
 * it ends up with is not a power of two.
150 */
151#ifndef TCBHASHSIZE
152#define TCBHASHSIZE 512
153#endif
154
155/*
156 * This is the actual shape of what we allocate using the zone
157 * allocator. Doing it this way allows us to protect both structures
158 * using the same generation count, and also eliminates the overhead
159 * of allocating tcpcbs separately. By hiding the structure here,
160 * we avoid changing most of the rest of the code (although it needs
161 * to be changed, eventually, for greater efficiency).
162 */
/* Helper macros for rounding; both are #undef'd right after the struct. */
163#define ALIGNMENT 32
164#define ALIGNM1 (ALIGNMENT - 1)
/*
 * Combined allocation unit for one TCP connection.  tcp_init() sizes
 * the "tcpcb" zone with sizeof(struct inp_tp), and tcp_newtcpcb()
 * casts a struct inpcb pointer to struct inp_tp *, so the inpcb has
 * to stay the first member.
 */
165struct inp_tp {
166 union {
167 struct inpcb inp;
/* sizeof(struct inpcb) rounded up to a multiple of ALIGNMENT bytes. */
168 char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
169 } inp_tp_u;
/* The TCP control block that tcp_newtcpcb() zeroes and initializes. */
170 struct tcpcb tcb;
/*
 * Storage for the connection's callouts; tcp_newtcpcb() points
 * tt_rexmt, tt_persist, tt_keep, tt_2msl and tt_delack at these.
 */
171 struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
172 struct callout inp_tp_delack;
173};
174#undef ALIGNMENT
175#undef ALIGNM1
176
177/*
178 * Tcp initialization
179 */
180void
181tcp_init()
182{
183 int hashsize = TCBHASHSIZE;
184
185 tcp_ccgen = 1;
186 tcp_cleartaocache();
187
188 tcp_delacktime = TCPTV_DELACK;
189 tcp_keepinit = TCPTV_KEEP_INIT;
190 tcp_keepidle = TCPTV_KEEP_IDLE;
191 tcp_keepintvl = TCPTV_KEEPINTVL;
192 tcp_maxpersistidle = TCPTV_KEEP_IDLE;
193 tcp_msl = TCPTV_MSL;
194
195 LIST_INIT(&tcb);
196 tcbinfo.listhead = &tcb;
197 TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
198 if (!powerof2(hashsize)) {
199 printf("WARNING: TCB hash size not a power of 2\n");
200 hashsize = 512; /* safe default */
201 }
202 tcp_tcbhashsize = hashsize;
203 tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
204 tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
205 &tcbinfo.porthashmask);
206 tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets,
207 ZONE_INTERRUPT, 0);
208#ifdef INET6
209#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
210#else /* INET6 */
211#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
212#endif /* INET6 */
213 if (max_protohdr < TCP_MINPROTOHDR)
214 max_protohdr = TCP_MINPROTOHDR;
215 if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
216 panic("tcp_init");
217#undef TCP_MINPROTOHDR
218}
219
220/*
221 * Create template to be used to send tcp packets on a connection.
222 * Call after host entry created, allocates an mbuf and fills
223 * in a skeletal tcp/ip header, minimizing the amount of work
224 * necessary when the connection is used.
225 */
226struct tcptemp *
227tcp_template(tp)
228 struct tcpcb *tp;
229{
230 register struct inpcb *inp = tp->t_inpcb;
231 register struct mbuf *m;
232 register struct tcptemp *n;
233
234 if ((n = tp->t_template) == 0) {
235 m = m_get(M_DONTWAIT, MT_HEADER);
236 if (m == NULL)
237 return (0);
238 m->m_len = sizeof (struct tcptemp);
239 n = mtod(m, struct tcptemp *);
240 }
241#ifdef INET6
242 if ((inp->inp_vflag & INP_IPV6) != 0) {
243 register struct ip6_hdr *ip6;
244
245 ip6 = (struct ip6_hdr *)n->tt_ipgen;
246 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
247 (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
248 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
249 (IPV6_VERSION & IPV6_VERSION_MASK);
250 ip6->ip6_nxt = IPPROTO_TCP;
251 ip6->ip6_plen = sizeof(struct tcphdr);
252 ip6->ip6_src = inp->in6p_laddr;
253 ip6->ip6_dst = inp->in6p_faddr;
254 n->tt_t.th_sum = 0;
255 } else
256#endif
257 {
258 struct ip *ip = (struct ip *)n->tt_ipgen;
259
260 bzero(ip, sizeof(struct ip)); /* XXX overkill? */
261 ip->ip_vhl = IP_VHL_BORING;
262 ip->ip_p = IPPROTO_TCP;
263 ip->ip_src = inp->inp_laddr;
264 ip->ip_dst = inp->inp_faddr;
265 n->tt_t.th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
266 htons(sizeof(struct tcphdr) + IPPROTO_TCP));
267 }
268 n->tt_t.th_sport = inp->inp_lport;
269 n->tt_t.th_dport = inp->inp_fport;
270 n->tt_t.th_seq = 0;
271 n->tt_t.th_ack = 0;
272 n->tt_t.th_x2 = 0;
273 n->tt_t.th_off = 5;
274 n->tt_t.th_flags = 0;
275 n->tt_t.th_win = 0;
276 n->tt_t.th_urp = 0;
277 return (n);
278}
279
280/*
281 * Send a single message to the TCP at address specified by
282 * the given TCP/IP header. If m == 0, then we make a copy
283 * of the IP header at ipgen and the TCP header at th and send
284 * directly to the addressed host. This is used to force keep alive
285 * messages out using the TCP template for a connection
286 * tp->t_template. If flags are given then we send a message back
287 * to the TCP which originated the segment at ipgen/th, and discard
288 * the mbuf containing it and any other attached mbufs.
289 *
290 * In any case the ack and sequence number of the transmitted
291 * segment are as specified by the parameters.
292 *
293 * NOTE: If m != NULL, then ipgen must point to *inside* the mbuf.
294 */
295void
296tcp_respond(tp, ipgen, th, m, ack, seq, flags)
297 struct tcpcb *tp;
298 void *ipgen;
299 register struct tcphdr *th;
300 register struct mbuf *m;
301 tcp_seq ack, seq;
302 int flags;
303{
304 register int tlen;
305 int win = 0;
306 struct route *ro = 0;
307 struct route sro;
308 struct ip *ip;
309 struct tcphdr *nth;
310#ifdef INET6
311 struct route_in6 *ro6 = 0;
312 struct route_in6 sro6;
313 struct ip6_hdr *ip6;
314 int isipv6;
315#endif /* INET6 */
316 int ipflags = 0;
317
318#ifdef INET6
319 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
320 ip6 = ipgen;
321#endif /* INET6 */
322 ip = ipgen;
323
324 if (tp) {
325 if (!(flags & TH_RST)) {
326 win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
327 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
328 win = (long)TCP_MAXWIN << tp->rcv_scale;
329 }
330#ifdef INET6
331 if (isipv6)
332 ro6 = &tp->t_inpcb->in6p_route;
333 else
334#endif /* INET6 */
335 ro = &tp->t_inpcb->inp_route;
336 } else {
337#ifdef INET6
338 if (isipv6) {
339 ro6 = &sro6;
340 bzero(ro6, sizeof *ro6);
341 } else
342#endif /* INET6 */
343 {
344 ro = &sro;
345 bzero(ro, sizeof *ro);
346 }
347 }
348 if (m == 0) {
349 m = m_gethdr(M_DONTWAIT, MT_HEADER);
350 if (m == NULL)
351 return;
352 tlen = 0;
353 m->m_data += max_linkhdr;
354#ifdef INET6
355 if (isipv6) {
356 bcopy((caddr_t)ip6, mtod(m, caddr_t),
357 sizeof(struct ip6_hdr));
358 ip6 = mtod(m, struct ip6_hdr *);
359 nth = (struct tcphdr *)(ip6 + 1);
360 } else
361#endif /* INET6 */
362 {
363 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
364 ip = mtod(m, struct ip *);
365 nth = (struct tcphdr *)(ip + 1);
366 }
367 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
368 flags = TH_ACK;
369 } else {
370 m_freem(m->m_next);
371 m->m_next = 0;
372 m->m_data = (caddr_t)ipgen;
373 /* m_len is set later */
374 tlen = 0;
375#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
376#ifdef INET6
377 if (isipv6) {
378 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
379 nth = (struct tcphdr *)(ip6 + 1);
380 } else
381#endif /* INET6 */
382 {
383 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
384 nth = (struct tcphdr *)(ip + 1);
385 }
386 if (th != nth) {
387 /*
388 * this is usually a case when an extension header
389 * exists between the IPv6 header and the
390 * TCP header.
391 */
392 nth->th_sport = th->th_sport;
393 nth->th_dport = th->th_dport;
394 }
395 xchg(nth->th_dport, nth->th_sport, n_short);
396#undef xchg
397 }
398#ifdef INET6
399 if (isipv6) {
400 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
401 tlen));
402 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
403 } else
404#endif
405 {
406 tlen += sizeof (struct tcpiphdr);
407 ip->ip_len = tlen;
408 ip->ip_ttl = ip_defttl;
409 }
410 m->m_len = tlen;
411 m->m_pkthdr.len = tlen;
412 m->m_pkthdr.rcvif = (struct ifnet *) 0;
413 nth->th_seq = htonl(seq);
414 nth->th_ack = htonl(ack);
415 nth->th_x2 = 0;
416 nth->th_off = sizeof (struct tcphdr) >> 2;
417 nth->th_flags = flags;
418 if (tp)
419 nth->th_win = htons((u_short) (win >> tp->rcv_scale));
420 else
421 nth->th_win = htons((u_short)win);
422 nth->th_urp = 0;
423#ifdef INET6
424 if (isipv6) {
425 nth->th_sum = 0;
426 nth->th_sum = in6_cksum(m, IPPROTO_TCP,
427 sizeof(struct ip6_hdr),
428 tlen - sizeof(struct ip6_hdr));
429 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
430 ro6 && ro6->ro_rt ?
431 ro6->ro_rt->rt_ifp :
432 NULL);
433 } else
434#endif /* INET6 */
435 {
436 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
437 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
438 m->m_pkthdr.csum_flags = CSUM_TCP;
439 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
440 }
441#ifdef TCPDEBUG
442 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
443 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
444#endif
445#ifdef IPSEC
35 */
36
37#include "opt_compat.h"
38#include "opt_inet6.h"
39#include "opt_ipsec.h"
40#include "opt_tcpdebug.h"
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/callout.h>
45#include <sys/kernel.h>
46#include <sys/sysctl.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#ifdef INET6
50#include <sys/domain.h>
51#endif
52#include <sys/proc.h>
53#include <sys/socket.h>
54#include <sys/socketvar.h>
55#include <sys/protosw.h>
56#include <sys/random.h>
57
58#include <vm/vm_zone.h>
59
60#include <net/route.h>
61#include <net/if.h>
62
63#define _IP_VHL
64#include <netinet/in.h>
65#include <netinet/in_systm.h>
66#include <netinet/ip.h>
67#ifdef INET6
68#include <netinet/ip6.h>
69#endif
70#include <netinet/in_pcb.h>
71#ifdef INET6
72#include <netinet6/in6_pcb.h>
73#endif
74#include <netinet/in_var.h>
75#include <netinet/ip_var.h>
76#ifdef INET6
77#include <netinet6/ip6_var.h>
78#endif
79#include <netinet/tcp.h>
80#include <netinet/tcp_fsm.h>
81#include <netinet/tcp_seq.h>
82#include <netinet/tcp_timer.h>
83#include <netinet/tcp_var.h>
84#ifdef INET6
85#include <netinet6/tcp6_var.h>
86#endif
87#include <netinet/tcpip.h>
88#ifdef TCPDEBUG
89#include <netinet/tcp_debug.h>
90#endif
91#include <netinet6/ip6protosw.h>
92
93#ifdef IPSEC
94#include <netinet6/ipsec.h>
95#ifdef INET6
96#include <netinet6/ipsec6.h>
97#endif
98#endif /*IPSEC*/
99
100#include <machine/in_cksum.h>
101
/*
 * Tunables and counters exported through SYSCTL_INT under the
 * _net_inet_tcp node.  CTLFLAG_RW entries are writable, CTLFLAG_RD
 * entries are read-only.
 */
/* Default MSS; tcp_newtcpcb() copies it into t_maxseg/t_maxopd for non-IPv6 PCBs. */
102int tcp_mssdflt = TCP_MSS;
103SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
104 &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
105
/* IPv6 counterpart of tcp_mssdflt; only present in INET6 kernels. */
106#ifdef INET6
107int tcp_v6mssdflt = TCP6_MSS;
108SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
109 CTLFLAG_RW, &tcp_v6mssdflt , 0,
110 "Default TCP Maximum Segment Size for IPv6");
111#endif
112
/* Compiled out by the #if 0 below. */
113#if 0
114static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
115SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
116 &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
117#endif
118
/* When non-zero, tcp_newtcpcb() sets TF_REQ_SCALE|TF_REQ_TSTMP on new PCBs. */
119static int tcp_do_rfc1323 = 1;
120SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
121 &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
122
/* When non-zero, tcp_newtcpcb() additionally sets TF_REQ_CC on new PCBs. */
123static int tcp_do_rfc1644 = 0;
124SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
125 &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
126
/* Read-only; tcp_init() stores the hash size it actually used here. */
127static int tcp_tcbhashsize = 0;
128SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
129 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
130
131static int do_tcpdrain = 1;
132SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
133 "Enable tcp_drain routine for extra help when low on mbufs");
134
/* Read-only view of tcbinfo.ipi_count. */
135SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
136 &tcbinfo.ipi_count, 0, "Number of active PCBs");
137
138static int icmp_may_rst = 1;
139SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
140 "Certain ICMP unreachable messages may abort connections in SYN_SENT");
141
/* Forward declarations of file-local helpers. */
142static void tcp_cleartaocache __P((void));
143static void tcp_notify __P((struct inpcb *, int));
144
145/*
146 * Target size of TCP PCB hash tables. Must be a power of two.
147 *
148 * Note that this can be overridden by the kernel environment
149 * variable net.inet.tcp.tcbhashsize
 *
 * tcp_init() prints a warning and falls back to 512 when the value
 * it ends up with is not a power of two.
150 */
151#ifndef TCBHASHSIZE
152#define TCBHASHSIZE 512
153#endif
154
155/*
156 * This is the actual shape of what we allocate using the zone
157 * allocator. Doing it this way allows us to protect both structures
158 * using the same generation count, and also eliminates the overhead
159 * of allocating tcpcbs separately. By hiding the structure here,
160 * we avoid changing most of the rest of the code (although it needs
161 * to be changed, eventually, for greater efficiency).
162 */
/* Helper macros for rounding; both are #undef'd right after the struct. */
163#define ALIGNMENT 32
164#define ALIGNM1 (ALIGNMENT - 1)
/*
 * Combined allocation unit for one TCP connection.  tcp_init() sizes
 * the "tcpcb" zone with sizeof(struct inp_tp), and tcp_newtcpcb()
 * casts a struct inpcb pointer to struct inp_tp *, so the inpcb has
 * to stay the first member.
 */
165struct inp_tp {
166 union {
167 struct inpcb inp;
/* sizeof(struct inpcb) rounded up to a multiple of ALIGNMENT bytes. */
168 char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
169 } inp_tp_u;
/* The TCP control block that tcp_newtcpcb() zeroes and initializes. */
170 struct tcpcb tcb;
/*
 * Storage for the connection's callouts; tcp_newtcpcb() points
 * tt_rexmt, tt_persist, tt_keep, tt_2msl and tt_delack at these.
 */
171 struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
172 struct callout inp_tp_delack;
173};
174#undef ALIGNMENT
175#undef ALIGNM1
176
177/*
178 * Tcp initialization
179 */
180void
181tcp_init()
182{
183 int hashsize = TCBHASHSIZE;
184
185 tcp_ccgen = 1;
186 tcp_cleartaocache();
187
188 tcp_delacktime = TCPTV_DELACK;
189 tcp_keepinit = TCPTV_KEEP_INIT;
190 tcp_keepidle = TCPTV_KEEP_IDLE;
191 tcp_keepintvl = TCPTV_KEEPINTVL;
192 tcp_maxpersistidle = TCPTV_KEEP_IDLE;
193 tcp_msl = TCPTV_MSL;
194
195 LIST_INIT(&tcb);
196 tcbinfo.listhead = &tcb;
197 TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
198 if (!powerof2(hashsize)) {
199 printf("WARNING: TCB hash size not a power of 2\n");
200 hashsize = 512; /* safe default */
201 }
202 tcp_tcbhashsize = hashsize;
203 tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
204 tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
205 &tcbinfo.porthashmask);
206 tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets,
207 ZONE_INTERRUPT, 0);
208#ifdef INET6
209#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
210#else /* INET6 */
211#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
212#endif /* INET6 */
213 if (max_protohdr < TCP_MINPROTOHDR)
214 max_protohdr = TCP_MINPROTOHDR;
215 if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
216 panic("tcp_init");
217#undef TCP_MINPROTOHDR
218}
219
220/*
221 * Create template to be used to send tcp packets on a connection.
222 * Call after host entry created, allocates an mbuf and fills
223 * in a skeletal tcp/ip header, minimizing the amount of work
224 * necessary when the connection is used.
225 */
226struct tcptemp *
227tcp_template(tp)
228 struct tcpcb *tp;
229{
230 register struct inpcb *inp = tp->t_inpcb;
231 register struct mbuf *m;
232 register struct tcptemp *n;
233
234 if ((n = tp->t_template) == 0) {
235 m = m_get(M_DONTWAIT, MT_HEADER);
236 if (m == NULL)
237 return (0);
238 m->m_len = sizeof (struct tcptemp);
239 n = mtod(m, struct tcptemp *);
240 }
241#ifdef INET6
242 if ((inp->inp_vflag & INP_IPV6) != 0) {
243 register struct ip6_hdr *ip6;
244
245 ip6 = (struct ip6_hdr *)n->tt_ipgen;
246 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
247 (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
248 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
249 (IPV6_VERSION & IPV6_VERSION_MASK);
250 ip6->ip6_nxt = IPPROTO_TCP;
251 ip6->ip6_plen = sizeof(struct tcphdr);
252 ip6->ip6_src = inp->in6p_laddr;
253 ip6->ip6_dst = inp->in6p_faddr;
254 n->tt_t.th_sum = 0;
255 } else
256#endif
257 {
258 struct ip *ip = (struct ip *)n->tt_ipgen;
259
260 bzero(ip, sizeof(struct ip)); /* XXX overkill? */
261 ip->ip_vhl = IP_VHL_BORING;
262 ip->ip_p = IPPROTO_TCP;
263 ip->ip_src = inp->inp_laddr;
264 ip->ip_dst = inp->inp_faddr;
265 n->tt_t.th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
266 htons(sizeof(struct tcphdr) + IPPROTO_TCP));
267 }
268 n->tt_t.th_sport = inp->inp_lport;
269 n->tt_t.th_dport = inp->inp_fport;
270 n->tt_t.th_seq = 0;
271 n->tt_t.th_ack = 0;
272 n->tt_t.th_x2 = 0;
273 n->tt_t.th_off = 5;
274 n->tt_t.th_flags = 0;
275 n->tt_t.th_win = 0;
276 n->tt_t.th_urp = 0;
277 return (n);
278}
279
280/*
281 * Send a single message to the TCP at address specified by
282 * the given TCP/IP header. If m == 0, then we make a copy
283 * of the tcpiphdr at ti and send directly to the addressed host.
284 * This is used to force keep alive messages out using the TCP
285 * template for a connection tp->t_template. If flags are given
286 * then we send a message back to the TCP which originated the
287 * segment ti, and discard the mbuf containing it and any other
288 * attached mbufs.
289 *
290 * In any case the ack and sequence number of the transmitted
291 * segment are as specified by the parameters.
292 *
293 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
294 */
295void
296tcp_respond(tp, ipgen, th, m, ack, seq, flags)
297 struct tcpcb *tp;
298 void *ipgen;
299 register struct tcphdr *th;
300 register struct mbuf *m;
301 tcp_seq ack, seq;
302 int flags;
303{
304 register int tlen;
305 int win = 0;
306 struct route *ro = 0;
307 struct route sro;
308 struct ip *ip;
309 struct tcphdr *nth;
310#ifdef INET6
311 struct route_in6 *ro6 = 0;
312 struct route_in6 sro6;
313 struct ip6_hdr *ip6;
314 int isipv6;
315#endif /* INET6 */
316 int ipflags = 0;
317
318#ifdef INET6
319 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
320 ip6 = ipgen;
321#endif /* INET6 */
322 ip = ipgen;
323
324 if (tp) {
325 if (!(flags & TH_RST)) {
326 win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
327 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
328 win = (long)TCP_MAXWIN << tp->rcv_scale;
329 }
330#ifdef INET6
331 if (isipv6)
332 ro6 = &tp->t_inpcb->in6p_route;
333 else
334#endif /* INET6 */
335 ro = &tp->t_inpcb->inp_route;
336 } else {
337#ifdef INET6
338 if (isipv6) {
339 ro6 = &sro6;
340 bzero(ro6, sizeof *ro6);
341 } else
342#endif /* INET6 */
343 {
344 ro = &sro;
345 bzero(ro, sizeof *ro);
346 }
347 }
348 if (m == 0) {
349 m = m_gethdr(M_DONTWAIT, MT_HEADER);
350 if (m == NULL)
351 return;
352 tlen = 0;
353 m->m_data += max_linkhdr;
354#ifdef INET6
355 if (isipv6) {
356 bcopy((caddr_t)ip6, mtod(m, caddr_t),
357 sizeof(struct ip6_hdr));
358 ip6 = mtod(m, struct ip6_hdr *);
359 nth = (struct tcphdr *)(ip6 + 1);
360 } else
361#endif /* INET6 */
362 {
363 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
364 ip = mtod(m, struct ip *);
365 nth = (struct tcphdr *)(ip + 1);
366 }
367 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
368 flags = TH_ACK;
369 } else {
370 m_freem(m->m_next);
371 m->m_next = 0;
372 m->m_data = (caddr_t)ipgen;
373 /* m_len is set later */
374 tlen = 0;
375#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
376#ifdef INET6
377 if (isipv6) {
378 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
379 nth = (struct tcphdr *)(ip6 + 1);
380 } else
381#endif /* INET6 */
382 {
383 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
384 nth = (struct tcphdr *)(ip + 1);
385 }
386 if (th != nth) {
387 /*
388 * this is usually a case when an extension header
389 * exists between the IPv6 header and the
390 * TCP header.
391 */
392 nth->th_sport = th->th_sport;
393 nth->th_dport = th->th_dport;
394 }
395 xchg(nth->th_dport, nth->th_sport, n_short);
396#undef xchg
397 }
398#ifdef INET6
399 if (isipv6) {
400 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
401 tlen));
402 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
403 } else
404#endif
405 {
406 tlen += sizeof (struct tcpiphdr);
407 ip->ip_len = tlen;
408 ip->ip_ttl = ip_defttl;
409 }
410 m->m_len = tlen;
411 m->m_pkthdr.len = tlen;
412 m->m_pkthdr.rcvif = (struct ifnet *) 0;
413 nth->th_seq = htonl(seq);
414 nth->th_ack = htonl(ack);
415 nth->th_x2 = 0;
416 nth->th_off = sizeof (struct tcphdr) >> 2;
417 nth->th_flags = flags;
418 if (tp)
419 nth->th_win = htons((u_short) (win >> tp->rcv_scale));
420 else
421 nth->th_win = htons((u_short)win);
422 nth->th_urp = 0;
423#ifdef INET6
424 if (isipv6) {
425 nth->th_sum = 0;
426 nth->th_sum = in6_cksum(m, IPPROTO_TCP,
427 sizeof(struct ip6_hdr),
428 tlen - sizeof(struct ip6_hdr));
429 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
430 ro6 && ro6->ro_rt ?
431 ro6->ro_rt->rt_ifp :
432 NULL);
433 } else
434#endif /* INET6 */
435 {
436 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
437 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
438 m->m_pkthdr.csum_flags = CSUM_TCP;
439 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
440 }
441#ifdef TCPDEBUG
442 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
443 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
444#endif
445#ifdef IPSEC
446 ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL);
446 if (ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
447 m_freem(m);
448 return;
449 }
447#endif
448#ifdef INET6
449 if (isipv6) {
450 (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL);
451 if (ro6 == &sro6 && ro6->ro_rt) {
452 RTFREE(ro6->ro_rt);
453 ro6->ro_rt = NULL;
454 }
455 } else
456#endif /* INET6 */
457 {
458 (void) ip_output(m, NULL, ro, ipflags, NULL);
459 if (ro == &sro && ro->ro_rt) {
460 RTFREE(ro->ro_rt);
461 ro->ro_rt = NULL;
462 }
463 }
464}
465
466/*
467 * Create a new TCP control block, making an
468 * empty reassembly queue and hooking it to the argument
469 * protocol control block. The `inp' parameter must have
470 * come from the zone allocator set up in tcp_init().
471 */
472struct tcpcb *
473tcp_newtcpcb(inp)
474 struct inpcb *inp;
475{
476 struct inp_tp *it;
477 register struct tcpcb *tp;
478#ifdef INET6
479 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
480#endif /* INET6 */
481
482 it = (struct inp_tp *)inp;
483 tp = &it->tcb;
484 bzero((char *) tp, sizeof(struct tcpcb));
485 LIST_INIT(&tp->t_segq);
486 tp->t_maxseg = tp->t_maxopd =
487#ifdef INET6
488 isipv6 ? tcp_v6mssdflt :
489#endif /* INET6 */
490 tcp_mssdflt;
491
492 /* Set up our timeouts. */
493 callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0);
494 callout_init(tp->tt_persist = &it->inp_tp_persist, 0);
495 callout_init(tp->tt_keep = &it->inp_tp_keep, 0);
496 callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0);
497 callout_init(tp->tt_delack = &it->inp_tp_delack, 0);
498
499 if (tcp_do_rfc1323)
500 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
501 if (tcp_do_rfc1644)
502 tp->t_flags |= TF_REQ_CC;
503 tp->t_inpcb = inp; /* XXX */
504 /*
505 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
506 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
507 * reasonable initial retransmit time.
508 */
509 tp->t_srtt = TCPTV_SRTTBASE;
510 tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
511 tp->t_rttmin = TCPTV_MIN;
512 tp->t_rxtcur = TCPTV_RTOBASE;
513 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
514 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
515 tp->t_rcvtime = ticks;
516 /*
517 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
518 * because the socket may be bound to an IPv6 wildcard address,
519 * which may match an IPv4-mapped IPv6 address.
520 */
521 inp->inp_ip_ttl = ip_defttl;
522 inp->inp_ppcb = (caddr_t)tp;
523 return (tp); /* XXX */
524}
525
526/*
527 * Drop a TCP connection, reporting
528 * the specified error. If connection is synchronized,
529 * then send a RST to peer.
530 */
531struct tcpcb *
532tcp_drop(tp, errno)
533 register struct tcpcb *tp;
534 int errno;
535{
536 struct socket *so = tp->t_inpcb->inp_socket;
537
538 if (TCPS_HAVERCVDSYN(tp->t_state)) {
539 tp->t_state = TCPS_CLOSED;
540 (void) tcp_output(tp);
541 tcpstat.tcps_drops++;
542 } else
543 tcpstat.tcps_conndrops++;
544 if (errno == ETIMEDOUT && tp->t_softerror)
545 errno = tp->t_softerror;
546 so->so_error = errno;
547 return (tcp_close(tp));
548}
549
550/*
551 * Close a TCP control block:
552 * discard all space held by the tcp
553 * discard internet protocol block
554 * wake up any sleepers
555 */
556struct tcpcb *
557tcp_close(tp)
558 register struct tcpcb *tp;
559{
560 register struct tseg_qent *q;
561 struct inpcb *inp = tp->t_inpcb;
562 struct socket *so = inp->inp_socket;
563#ifdef INET6
564 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
565#endif /* INET6 */
566 register struct rtentry *rt;
567 int dosavessthresh;
	/*
	 * Overview: tp's timers are stopped, route metrics are optionally
	 * updated from tp's estimators, the reassembly queue and template
	 * are freed, the socket is marked disconnected and the inpcb is
	 * detached.  The return value is always a null tcpcb pointer.
	 */
568
569 /*
570 * Make sure that all of our timers are stopped before we
571 * delete the PCB.
572 */
573 callout_stop(tp->tt_rexmt);
574 callout_stop(tp->tt_persist);
575 callout_stop(tp->tt_keep);
576 callout_stop(tp->tt_2msl);
577 callout_stop(tp->tt_delack);
578
579 /*
580 * If we got enough samples through the srtt filter,
581 * save the rtt and rttvar in the routing entry.
582 * 'Enough' is arbitrarily defined as the 16 samples.
583 * 16 samples is enough for the srtt filter to converge
584 * to within 5% of the correct value; fewer samples and
585 * we could save a very bogus rtt.
586 *
587 * Don't update the default route's characteristics and don't
588 * update anything that the user "locked".
589 */
590 if (tp->t_rttupdated >= 16) {
591 register u_long i = 0;
592#ifdef INET6
593 if (isipv6) {
594 struct sockaddr_in6 *sin6;
595
596 if ((rt = inp->in6p_route.ro_rt) == NULL)
597 goto no_valid_rt;
598 sin6 = (struct sockaddr_in6 *)rt_key(rt);
599 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
600 goto no_valid_rt;
601 }
602 else
603#endif /* INET6 */
604 if ((rt = inp->inp_route.ro_rt) == NULL ||
605 ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
606 == INADDR_ANY)
607 goto no_valid_rt;
608
609 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
610 i = tp->t_srtt *
611 (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
612 if (rt->rt_rmx.rmx_rtt && i)
613 /*
614 * filter this update to half the old & half
615 * the new values, converting scale.
616 * See route.h and tcp_var.h for a
617 * description of the scaling constants.
618 */
619 rt->rt_rmx.rmx_rtt =
620 (rt->rt_rmx.rmx_rtt + i) / 2;
621 else
622 rt->rt_rmx.rmx_rtt = i;
623 tcpstat.tcps_cachedrtt++;
624 }
625 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
626 i = tp->t_rttvar *
627 (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
628 if (rt->rt_rmx.rmx_rttvar && i)
629 rt->rt_rmx.rmx_rttvar =
630 (rt->rt_rmx.rmx_rttvar + i) / 2;
631 else
632 rt->rt_rmx.rmx_rttvar = i;
633 tcpstat.tcps_cachedrttvar++;
634 }
635 /*
636 * The old comment here said:
637 * update the pipelimit (ssthresh) if it has been updated
638 * already or if a pipesize was specified & the threshhold
639 * got below half the pipesize. I.e., wait for bad news
640 * before we start updating, then update on both good
641 * and bad news.
642 *
643 * But we want to save the ssthresh even if no pipesize is
644 * specified explicitly in the route, because such
645 * connections still have an implicit pipesize specified
646 * by the global tcp_sendspace. In the absence of a reliable
647 * way to calculate the pipesize, it will have to do.
648 */
649 i = tp->snd_ssthresh;
650 if (rt->rt_rmx.rmx_sendpipe != 0)
651 dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
652 else
653 dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		/*
		 * NOTE(review): as parenthesised below, dosavessthresh alone
		 * is sufficient to enter the update, independent of the
		 * RTV_SSTHRESH lock test -- confirm this is intended.
		 */
654 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
655 i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
656 || dosavessthresh) {
657 /*
658 * convert the limit from user data bytes to
659 * packets then to packet data bytes.
660 */
661 i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
662 if (i < 2)
663 i = 2;
664 i *= (u_long)(tp->t_maxseg +
665#ifdef INET6
666 (isipv6 ? sizeof (struct ip6_hdr) +
667 sizeof (struct tcphdr) :
668#endif
669 sizeof (struct tcpiphdr)
670#ifdef INET6
671 )
672#endif
673 );
674 if (rt->rt_rmx.rmx_ssthresh)
675 rt->rt_rmx.rmx_ssthresh =
676 (rt->rt_rmx.rmx_ssthresh + i) / 2;
677 else
678 rt->rt_rmx.rmx_ssthresh = i;
679 tcpstat.tcps_cachedssthresh++;
680 }
681 }
	/*
	 * rt is assigned above only when at least 16 RTT samples were
	 * collected, so it is (re)loaded here from inp_route before use.
	 */
682 rt = inp->inp_route.ro_rt;
683 if (rt) {
684 /*
685 * mark route for deletion if no information is
686 * cached.
687 */
688 if ((tp->t_flags & TF_LQ_OVERFLOW) &&
689 ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0)){
690 if (rt->rt_rmx.rmx_rtt == 0)
691 rt->rt_flags |= RTF_DELCLONE;
692 }
693 }
	/*
	 * The gotos above land here, bypassing the RTF_DELCLONE marking
	 * as well as the metric updates.
	 */
694 no_valid_rt:
695 /* free the reassembly queue, if any */
696 while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
697 LIST_REMOVE(q, tqe_q);
698 m_freem(q->tqe_m);
699 FREE(q, M_TSEGQ);
700 }
701 if (tp->t_template)
702 (void) m_free(dtom(tp->t_template));
	/* Clear the inpcb's back pointer to tp before the inpcb is detached. */
703 inp->inp_ppcb = NULL;
704 soisdisconnected(so);
705#ifdef INET6
706 if (INP_CHECK_SOCKAF(so, AF_INET6))
707 in6_pcbdetach(inp);
708 else
709#endif /* INET6 */
710 in_pcbdetach(inp);
711 tcpstat.tcps_closed++;
712 return ((struct tcpcb *)0);
713}
714
715void
716tcp_drain()
717{
718 if (do_tcpdrain)
719 {
720 struct inpcb *inpb;
721 struct tcpcb *tcpb;
722 struct tseg_qent *te;
723
724 /*
725 * Walk the tcpbs, if existing, and flush the reassembly queue,
726 * if there is one...
727 * XXX: The "Net/3" implementation doesn't imply that the TCP
728 * reassembly queue should be flushed, but in a situation
729 * where we're really low on mbufs, this is potentially
730 * usefull.
731 */
732 LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
733 if ((tcpb = intotcpcb(inpb))) {
734 while ((te = LIST_FIRST(&tcpb->t_segq))
735 != NULL) {
736 LIST_REMOVE(te, tqe_q);
737 m_freem(te->tqe_m);
738 FREE(te, M_TSEGQ);
739 }
740 }
741 }
742 }
743}
744
745/*
746 * Notify a tcp user of an asynchronous error;
747 * store error as soft error, but wake up user
748 * (for now, won't do anything until can select for soft error).
749 *
750 * Do not wake up user since there currently is no mechanism for
751 * reporting soft errors (yet - a kqueue filter may be added).
752 */
753static void
754tcp_notify(inp, error)
755 struct inpcb *inp;
756 int error;
757{
758 struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
759
760 /*
761 * Ignore some errors if we are hooked up.
762 * If connection hasn't completed, has retransmitted several times,
763 * and receives a second error, give up now. This is better
764 * than waiting a long time to establish a connection that
765 * can never complete.
766 */
767 if (tp->t_state == TCPS_ESTABLISHED &&
768 (error == EHOSTUNREACH || error == ENETUNREACH ||
769 error == EHOSTDOWN)) {
770 return;
771 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
772 tp->t_softerror)
773 tcp_drop(tp, error);
774 else
775 tp->t_softerror = error;
776#if 0
777 wakeup((caddr_t) &so->so_timeo);
778 sorwakeup(so);
779 sowwakeup(so);
780#endif
781}
782
783static int
784tcp_pcblist(SYSCTL_HANDLER_ARGS)
785{
786 int error, i, n, s;
787 struct inpcb *inp, **inp_list;
788 inp_gen_t gencnt;
789 struct xinpgen xig;
790
791 /*
792 * The process of preparing the TCB list is too time-consuming and
793 * resource-intensive to repeat twice on every request.
794 */
795 if (req->oldptr == 0) {
796 n = tcbinfo.ipi_count;
797 req->oldidx = 2 * (sizeof xig)
798 + (n + n/8) * sizeof(struct xtcpcb);
799 return 0;
800 }
801
802 if (req->newptr != 0)
803 return EPERM;
804
805 /*
806 * OK, now we're committed to doing something.
807 */
808 s = splnet();
809 gencnt = tcbinfo.ipi_gencnt;
810 n = tcbinfo.ipi_count;
811 splx(s);
812
813 xig.xig_len = sizeof xig;
814 xig.xig_count = n;
815 xig.xig_gen = gencnt;
816 xig.xig_sogen = so_gencnt;
817 error = SYSCTL_OUT(req, &xig, sizeof xig);
818 if (error)
819 return error;
820
821 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
822 if (inp_list == 0)
823 return ENOMEM;
824
825 s = splnet();
826 for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
827 inp = LIST_NEXT(inp, inp_list)) {
828 if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
829 inp_list[i++] = inp;
830 }
831 splx(s);
832 n = i;
833
834 error = 0;
835 for (i = 0; i < n; i++) {
836 inp = inp_list[i];
837 if (inp->inp_gencnt <= gencnt) {
838 struct xtcpcb xt;
839 caddr_t inp_ppcb;
840 xt.xt_len = sizeof xt;
841 /* XXX should avoid extra copy */
842 bcopy(inp, &xt.xt_inp, sizeof *inp);
843 inp_ppcb = inp->inp_ppcb;
844 if (inp_ppcb != NULL)
845 bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
846 else
847 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
848 if (inp->inp_socket)
849 sotoxsocket(inp->inp_socket, &xt.xt_socket);
850 error = SYSCTL_OUT(req, &xt, sizeof xt);
851 }
852 }
853 if (!error) {
854 /*
855 * Give the user an updated idea of our state.
856 * If the generation differs from what we told
857 * her before, she knows that something happened
858 * while we were processing this request, and it
859 * might be necessary to retry.
860 */
861 s = splnet();
862 xig.xig_gen = tcbinfo.ipi_gencnt;
863 xig.xig_sogen = so_gencnt;
864 xig.xig_count = tcbinfo.ipi_count;
865 splx(s);
866 error = SYSCTL_OUT(req, &xig, sizeof xig);
867 }
868 free(inp_list, M_TEMP);
869 return error;
870}
871
872SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
873 tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
874
875static int
876tcp_getcred(SYSCTL_HANDLER_ARGS)
877{
878 struct xucred xuc;
879 struct sockaddr_in addrs[2];
880 struct inpcb *inp;
881 int error, s;
882
883 error = suser(req->p);
884 if (error)
885 return (error);
886 error = SYSCTL_IN(req, addrs, sizeof(addrs));
887 if (error)
888 return (error);
889 s = splnet();
890 inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
891 addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
892 if (inp == NULL || inp->inp_socket == NULL) {
893 error = ENOENT;
894 goto out;
895 }
896 bzero(&xuc, sizeof(xuc));
897 xuc.cr_uid = inp->inp_socket->so_cred->cr_uid;
898 xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups;
899 bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups,
900 sizeof(xuc.cr_groups));
901 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
902out:
903 splx(s);
904 return (error);
905}
906
907SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
908 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
909
910#ifdef INET6
911static int
912tcp6_getcred(SYSCTL_HANDLER_ARGS)
913{
914 struct xucred xuc;
915 struct sockaddr_in6 addrs[2];
916 struct inpcb *inp;
917 int error, s, mapped = 0;
918
919 error = suser(req->p);
920 if (error)
921 return (error);
922 error = SYSCTL_IN(req, addrs, sizeof(addrs));
923 if (error)
924 return (error);
925 if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
926 if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
927 mapped = 1;
928 else
929 return (EINVAL);
930 }
931 s = splnet();
932 if (mapped == 1)
933 inp = in_pcblookup_hash(&tcbinfo,
934 *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
935 addrs[1].sin6_port,
936 *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
937 addrs[0].sin6_port,
938 0, NULL);
939 else
940 inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
941 addrs[1].sin6_port,
942 &addrs[0].sin6_addr, addrs[0].sin6_port,
943 0, NULL);
944 if (inp == NULL || inp->inp_socket == NULL) {
945 error = ENOENT;
946 goto out;
947 }
948 bzero(&xuc, sizeof(xuc));
949 xuc.cr_uid = inp->inp_socket->so_cred->cr_uid;
950 xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups;
951 bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups,
952 sizeof(xuc.cr_groups));
953 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
954out:
955 splx(s);
956 return (error);
957}
958
959SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
960 0, 0,
961 tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
962#endif
963
964
965void
966tcp_ctlinput(cmd, sa, vip)
967 int cmd;
968 struct sockaddr *sa;
969 void *vip;
970{
971 struct ip *ip = vip;
972 struct tcphdr *th;
973 struct in_addr faddr;
974 struct inpcb *inp;
975 struct tcpcb *tp;
976 void (*notify) __P((struct inpcb *, int)) = tcp_notify;
977 tcp_seq icmp_seq;
978 int s;
979
980 faddr = ((struct sockaddr_in *)sa)->sin_addr;
981 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
982 return;
983
984 if (cmd == PRC_QUENCH)
985 notify = tcp_quench;
986 else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
987 cmd == PRC_UNREACH_PORT) && ip)
988 notify = tcp_drop_syn_sent;
989 else if (cmd == PRC_MSGSIZE)
990 notify = tcp_mtudisc;
991 else if (PRC_IS_REDIRECT(cmd)) {
992 ip = 0;
993 notify = in_rtchange;
994 } else if (cmd == PRC_HOSTDEAD)
995 ip = 0;
996 else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
997 return;
998 if (ip) {
999 s = splnet();
1000 th = (struct tcphdr *)((caddr_t)ip
1001 + (IP_VHL_HL(ip->ip_vhl) << 2));
1002 inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
1003 ip->ip_src, th->th_sport, 0, NULL);
1004 if (inp != NULL && inp->inp_socket != NULL) {
1005 icmp_seq = htonl(th->th_seq);
1006 tp = intotcpcb(inp);
1007 if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
1008 SEQ_LT(icmp_seq, tp->snd_max))
1009 (*notify)(inp, inetctlerrmap[cmd]);
1010 }
1011 splx(s);
1012 } else
1013 in_pcbnotifyall(&tcb, faddr, inetctlerrmap[cmd], notify);
1014}
1015
1016#ifdef INET6
1017void
1018tcp6_ctlinput(cmd, sa, d)
1019 int cmd;
1020 struct sockaddr *sa;
1021 void *d;
1022{
450#endif
451#ifdef INET6
452 if (isipv6) {
453 (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL);
454 if (ro6 == &sro6 && ro6->ro_rt) {
455 RTFREE(ro6->ro_rt);
456 ro6->ro_rt = NULL;
457 }
458 } else
459#endif /* INET6 */
460 {
461 (void) ip_output(m, NULL, ro, ipflags, NULL);
462 if (ro == &sro && ro->ro_rt) {
463 RTFREE(ro->ro_rt);
464 ro->ro_rt = NULL;
465 }
466 }
467}
468
469/*
470 * Create a new TCP control block, making an
471 * empty reassembly queue and hooking it to the argument
472 * protocol control block. The `inp' parameter must have
473 * come from the zone allocator set up in tcp_init().
474 */
475struct tcpcb *
476tcp_newtcpcb(inp)
477 struct inpcb *inp;
478{
479 struct inp_tp *it;
480 register struct tcpcb *tp;
481#ifdef INET6
482 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
483#endif /* INET6 */
484
485 it = (struct inp_tp *)inp;
486 tp = &it->tcb;
487 bzero((char *) tp, sizeof(struct tcpcb));
488 LIST_INIT(&tp->t_segq);
489 tp->t_maxseg = tp->t_maxopd =
490#ifdef INET6
491 isipv6 ? tcp_v6mssdflt :
492#endif /* INET6 */
493 tcp_mssdflt;
494
495 /* Set up our timeouts. */
496 callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0);
497 callout_init(tp->tt_persist = &it->inp_tp_persist, 0);
498 callout_init(tp->tt_keep = &it->inp_tp_keep, 0);
499 callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0);
500 callout_init(tp->tt_delack = &it->inp_tp_delack, 0);
501
502 if (tcp_do_rfc1323)
503 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
504 if (tcp_do_rfc1644)
505 tp->t_flags |= TF_REQ_CC;
506 tp->t_inpcb = inp; /* XXX */
507 /*
508 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
509 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
510 * reasonable initial retransmit time.
511 */
512 tp->t_srtt = TCPTV_SRTTBASE;
513 tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
514 tp->t_rttmin = TCPTV_MIN;
515 tp->t_rxtcur = TCPTV_RTOBASE;
516 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
517 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
518 tp->t_rcvtime = ticks;
519 /*
520 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
521 * because the socket may be bound to an IPv6 wildcard address,
522 * which may match an IPv4-mapped IPv6 address.
523 */
524 inp->inp_ip_ttl = ip_defttl;
525 inp->inp_ppcb = (caddr_t)tp;
526 return (tp); /* XXX */
527}
528
529/*
530 * Drop a TCP connection, reporting
531 * the specified error. If connection is synchronized,
532 * then send a RST to peer.
533 */
534struct tcpcb *
535tcp_drop(tp, errno)
536 register struct tcpcb *tp;
537 int errno;
538{
539 struct socket *so = tp->t_inpcb->inp_socket;
540
541 if (TCPS_HAVERCVDSYN(tp->t_state)) {
542 tp->t_state = TCPS_CLOSED;
543 (void) tcp_output(tp);
544 tcpstat.tcps_drops++;
545 } else
546 tcpstat.tcps_conndrops++;
547 if (errno == ETIMEDOUT && tp->t_softerror)
548 errno = tp->t_softerror;
549 so->so_error = errno;
550 return (tcp_close(tp));
551}
552
553/*
554 * Close a TCP control block:
555 * discard all space held by the tcp
556 * discard internet protocol block
557 * wake up any sleepers
558 */
559struct tcpcb *
560tcp_close(tp)
561 register struct tcpcb *tp;
562{
563 register struct tseg_qent *q;
564 struct inpcb *inp = tp->t_inpcb;
565 struct socket *so = inp->inp_socket;
566#ifdef INET6
567 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
568#endif /* INET6 */
569 register struct rtentry *rt;
570 int dosavessthresh;
	/*
	 * Overview: tp's timers are stopped, route metrics are optionally
	 * updated from tp's estimators, the reassembly queue and template
	 * are freed, the socket is marked disconnected and the inpcb is
	 * detached.  The return value is always a null tcpcb pointer.
	 */
571
572 /*
573 * Make sure that all of our timers are stopped before we
574 * delete the PCB.
575 */
576 callout_stop(tp->tt_rexmt);
577 callout_stop(tp->tt_persist);
578 callout_stop(tp->tt_keep);
579 callout_stop(tp->tt_2msl);
580 callout_stop(tp->tt_delack);
581
582 /*
583 * If we got enough samples through the srtt filter,
584 * save the rtt and rttvar in the routing entry.
585 * 'Enough' is arbitrarily defined as the 16 samples.
586 * 16 samples is enough for the srtt filter to converge
587 * to within 5% of the correct value; fewer samples and
588 * we could save a very bogus rtt.
589 *
590 * Don't update the default route's characteristics and don't
591 * update anything that the user "locked".
592 */
593 if (tp->t_rttupdated >= 16) {
594 register u_long i = 0;
595#ifdef INET6
596 if (isipv6) {
597 struct sockaddr_in6 *sin6;
598
599 if ((rt = inp->in6p_route.ro_rt) == NULL)
600 goto no_valid_rt;
601 sin6 = (struct sockaddr_in6 *)rt_key(rt);
602 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
603 goto no_valid_rt;
604 }
605 else
606#endif /* INET6 */
607 if ((rt = inp->inp_route.ro_rt) == NULL ||
608 ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
609 == INADDR_ANY)
610 goto no_valid_rt;
611
612 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
613 i = tp->t_srtt *
614 (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
615 if (rt->rt_rmx.rmx_rtt && i)
616 /*
617 * filter this update to half the old & half
618 * the new values, converting scale.
619 * See route.h and tcp_var.h for a
620 * description of the scaling constants.
621 */
622 rt->rt_rmx.rmx_rtt =
623 (rt->rt_rmx.rmx_rtt + i) / 2;
624 else
625 rt->rt_rmx.rmx_rtt = i;
626 tcpstat.tcps_cachedrtt++;
627 }
628 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
629 i = tp->t_rttvar *
630 (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
631 if (rt->rt_rmx.rmx_rttvar && i)
632 rt->rt_rmx.rmx_rttvar =
633 (rt->rt_rmx.rmx_rttvar + i) / 2;
634 else
635 rt->rt_rmx.rmx_rttvar = i;
636 tcpstat.tcps_cachedrttvar++;
637 }
638 /*
639 * The old comment here said:
640 * update the pipelimit (ssthresh) if it has been updated
641 * already or if a pipesize was specified & the threshhold
642 * got below half the pipesize. I.e., wait for bad news
643 * before we start updating, then update on both good
644 * and bad news.
645 *
646 * But we want to save the ssthresh even if no pipesize is
647 * specified explicitly in the route, because such
648 * connections still have an implicit pipesize specified
649 * by the global tcp_sendspace. In the absence of a reliable
650 * way to calculate the pipesize, it will have to do.
651 */
652 i = tp->snd_ssthresh;
653 if (rt->rt_rmx.rmx_sendpipe != 0)
654 dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
655 else
656 dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		/*
		 * NOTE(review): as parenthesised below, dosavessthresh alone
		 * is sufficient to enter the update, independent of the
		 * RTV_SSTHRESH lock test -- confirm this is intended.
		 */
657 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
658 i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
659 || dosavessthresh) {
660 /*
661 * convert the limit from user data bytes to
662 * packets then to packet data bytes.
663 */
664 i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
665 if (i < 2)
666 i = 2;
667 i *= (u_long)(tp->t_maxseg +
668#ifdef INET6
669 (isipv6 ? sizeof (struct ip6_hdr) +
670 sizeof (struct tcphdr) :
671#endif
672 sizeof (struct tcpiphdr)
673#ifdef INET6
674 )
675#endif
676 );
677 if (rt->rt_rmx.rmx_ssthresh)
678 rt->rt_rmx.rmx_ssthresh =
679 (rt->rt_rmx.rmx_ssthresh + i) / 2;
680 else
681 rt->rt_rmx.rmx_ssthresh = i;
682 tcpstat.tcps_cachedssthresh++;
683 }
684 }
	/*
	 * rt is assigned above only when at least 16 RTT samples were
	 * collected, so it is (re)loaded here from inp_route before use.
	 */
685 rt = inp->inp_route.ro_rt;
686 if (rt) {
687 /*
688 * mark route for deletion if no information is
689 * cached.
690 */
691 if ((tp->t_flags & TF_LQ_OVERFLOW) &&
692 ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0)){
693 if (rt->rt_rmx.rmx_rtt == 0)
694 rt->rt_flags |= RTF_DELCLONE;
695 }
696 }
	/*
	 * The gotos above land here, bypassing the RTF_DELCLONE marking
	 * as well as the metric updates.
	 */
697 no_valid_rt:
698 /* free the reassembly queue, if any */
699 while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
700 LIST_REMOVE(q, tqe_q);
701 m_freem(q->tqe_m);
702 FREE(q, M_TSEGQ);
703 }
704 if (tp->t_template)
705 (void) m_free(dtom(tp->t_template));
	/* Clear the inpcb's back pointer to tp before the inpcb is detached. */
706 inp->inp_ppcb = NULL;
707 soisdisconnected(so);
708#ifdef INET6
709 if (INP_CHECK_SOCKAF(so, AF_INET6))
710 in6_pcbdetach(inp);
711 else
712#endif /* INET6 */
713 in_pcbdetach(inp);
714 tcpstat.tcps_closed++;
715 return ((struct tcpcb *)0);
716}
717
718void
719tcp_drain()
720{
721 if (do_tcpdrain)
722 {
723 struct inpcb *inpb;
724 struct tcpcb *tcpb;
725 struct tseg_qent *te;
726
727 /*
728 * Walk the tcpbs, if existing, and flush the reassembly queue,
729 * if there is one...
730 * XXX: The "Net/3" implementation doesn't imply that the TCP
731 * reassembly queue should be flushed, but in a situation
732 * where we're really low on mbufs, this is potentially
733 * usefull.
734 */
735 LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
736 if ((tcpb = intotcpcb(inpb))) {
737 while ((te = LIST_FIRST(&tcpb->t_segq))
738 != NULL) {
739 LIST_REMOVE(te, tqe_q);
740 m_freem(te->tqe_m);
741 FREE(te, M_TSEGQ);
742 }
743 }
744 }
745 }
746}
747
748/*
749 * Notify a tcp user of an asynchronous error;
750 * store error as soft error, but wake up user
751 * (for now, won't do anything until can select for soft error).
752 *
753 * Do not wake up user since there currently is no mechanism for
754 * reporting soft errors (yet - a kqueue filter may be added).
755 */
756static void
757tcp_notify(inp, error)
758 struct inpcb *inp;
759 int error;
760{
761 struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
762
763 /*
764 * Ignore some errors if we are hooked up.
765 * If connection hasn't completed, has retransmitted several times,
766 * and receives a second error, give up now. This is better
767 * than waiting a long time to establish a connection that
768 * can never complete.
769 */
770 if (tp->t_state == TCPS_ESTABLISHED &&
771 (error == EHOSTUNREACH || error == ENETUNREACH ||
772 error == EHOSTDOWN)) {
773 return;
774 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
775 tp->t_softerror)
776 tcp_drop(tp, error);
777 else
778 tp->t_softerror = error;
779#if 0
780 wakeup((caddr_t) &so->so_timeo);
781 sorwakeup(so);
782 sowwakeup(so);
783#endif
784}
785
786static int
787tcp_pcblist(SYSCTL_HANDLER_ARGS)
788{
789 int error, i, n, s;
790 struct inpcb *inp, **inp_list;
791 inp_gen_t gencnt;
792 struct xinpgen xig;
793
794 /*
795 * The process of preparing the TCB list is too time-consuming and
796 * resource-intensive to repeat twice on every request.
797 */
798 if (req->oldptr == 0) {
799 n = tcbinfo.ipi_count;
800 req->oldidx = 2 * (sizeof xig)
801 + (n + n/8) * sizeof(struct xtcpcb);
802 return 0;
803 }
804
805 if (req->newptr != 0)
806 return EPERM;
807
808 /*
809 * OK, now we're committed to doing something.
810 */
811 s = splnet();
812 gencnt = tcbinfo.ipi_gencnt;
813 n = tcbinfo.ipi_count;
814 splx(s);
815
816 xig.xig_len = sizeof xig;
817 xig.xig_count = n;
818 xig.xig_gen = gencnt;
819 xig.xig_sogen = so_gencnt;
820 error = SYSCTL_OUT(req, &xig, sizeof xig);
821 if (error)
822 return error;
823
824 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
825 if (inp_list == 0)
826 return ENOMEM;
827
828 s = splnet();
829 for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
830 inp = LIST_NEXT(inp, inp_list)) {
831 if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
832 inp_list[i++] = inp;
833 }
834 splx(s);
835 n = i;
836
837 error = 0;
838 for (i = 0; i < n; i++) {
839 inp = inp_list[i];
840 if (inp->inp_gencnt <= gencnt) {
841 struct xtcpcb xt;
842 caddr_t inp_ppcb;
843 xt.xt_len = sizeof xt;
844 /* XXX should avoid extra copy */
845 bcopy(inp, &xt.xt_inp, sizeof *inp);
846 inp_ppcb = inp->inp_ppcb;
847 if (inp_ppcb != NULL)
848 bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
849 else
850 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
851 if (inp->inp_socket)
852 sotoxsocket(inp->inp_socket, &xt.xt_socket);
853 error = SYSCTL_OUT(req, &xt, sizeof xt);
854 }
855 }
856 if (!error) {
857 /*
858 * Give the user an updated idea of our state.
859 * If the generation differs from what we told
860 * her before, she knows that something happened
861 * while we were processing this request, and it
862 * might be necessary to retry.
863 */
864 s = splnet();
865 xig.xig_gen = tcbinfo.ipi_gencnt;
866 xig.xig_sogen = so_gencnt;
867 xig.xig_count = tcbinfo.ipi_count;
868 splx(s);
869 error = SYSCTL_OUT(req, &xig, sizeof xig);
870 }
871 free(inp_list, M_TEMP);
872 return error;
873}
874
875SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
876 tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
877
878static int
879tcp_getcred(SYSCTL_HANDLER_ARGS)
880{
881 struct xucred xuc;
882 struct sockaddr_in addrs[2];
883 struct inpcb *inp;
884 int error, s;
885
886 error = suser(req->p);
887 if (error)
888 return (error);
889 error = SYSCTL_IN(req, addrs, sizeof(addrs));
890 if (error)
891 return (error);
892 s = splnet();
893 inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
894 addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
895 if (inp == NULL || inp->inp_socket == NULL) {
896 error = ENOENT;
897 goto out;
898 }
899 bzero(&xuc, sizeof(xuc));
900 xuc.cr_uid = inp->inp_socket->so_cred->cr_uid;
901 xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups;
902 bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups,
903 sizeof(xuc.cr_groups));
904 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
905out:
906 splx(s);
907 return (error);
908}
909
910SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
911 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
912
913#ifdef INET6
914static int
915tcp6_getcred(SYSCTL_HANDLER_ARGS)
916{
917 struct xucred xuc;
918 struct sockaddr_in6 addrs[2];
919 struct inpcb *inp;
920 int error, s, mapped = 0;
921
922 error = suser(req->p);
923 if (error)
924 return (error);
925 error = SYSCTL_IN(req, addrs, sizeof(addrs));
926 if (error)
927 return (error);
928 if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
929 if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
930 mapped = 1;
931 else
932 return (EINVAL);
933 }
934 s = splnet();
935 if (mapped == 1)
936 inp = in_pcblookup_hash(&tcbinfo,
937 *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
938 addrs[1].sin6_port,
939 *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
940 addrs[0].sin6_port,
941 0, NULL);
942 else
943 inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
944 addrs[1].sin6_port,
945 &addrs[0].sin6_addr, addrs[0].sin6_port,
946 0, NULL);
947 if (inp == NULL || inp->inp_socket == NULL) {
948 error = ENOENT;
949 goto out;
950 }
951 bzero(&xuc, sizeof(xuc));
952 xuc.cr_uid = inp->inp_socket->so_cred->cr_uid;
953 xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups;
954 bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups,
955 sizeof(xuc.cr_groups));
956 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
957out:
958 splx(s);
959 return (error);
960}
961
962SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
963 0, 0,
964 tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
965#endif
966
967
968void
969tcp_ctlinput(cmd, sa, vip)
970 int cmd;
971 struct sockaddr *sa;
972 void *vip;
973{
974 struct ip *ip = vip;
975 struct tcphdr *th;
976 struct in_addr faddr;
977 struct inpcb *inp;
978 struct tcpcb *tp;
979 void (*notify) __P((struct inpcb *, int)) = tcp_notify;
980 tcp_seq icmp_seq;
981 int s;
982
983 faddr = ((struct sockaddr_in *)sa)->sin_addr;
984 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
985 return;
986
987 if (cmd == PRC_QUENCH)
988 notify = tcp_quench;
989 else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
990 cmd == PRC_UNREACH_PORT) && ip)
991 notify = tcp_drop_syn_sent;
992 else if (cmd == PRC_MSGSIZE)
993 notify = tcp_mtudisc;
994 else if (PRC_IS_REDIRECT(cmd)) {
995 ip = 0;
996 notify = in_rtchange;
997 } else if (cmd == PRC_HOSTDEAD)
998 ip = 0;
999 else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
1000 return;
1001 if (ip) {
1002 s = splnet();
1003 th = (struct tcphdr *)((caddr_t)ip
1004 + (IP_VHL_HL(ip->ip_vhl) << 2));
1005 inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
1006 ip->ip_src, th->th_sport, 0, NULL);
1007 if (inp != NULL && inp->inp_socket != NULL) {
1008 icmp_seq = htonl(th->th_seq);
1009 tp = intotcpcb(inp);
1010 if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
1011 SEQ_LT(icmp_seq, tp->snd_max))
1012 (*notify)(inp, inetctlerrmap[cmd]);
1013 }
1014 splx(s);
1015 } else
1016 in_pcbnotifyall(&tcb, faddr, inetctlerrmap[cmd], notify);
1017}
1018
1019#ifdef INET6
1020void
1021tcp6_ctlinput(cmd, sa, d)
1022 int cmd;
1023 struct sockaddr *sa;
1024 void *d;
1025{
1023 register struct tcphdr *thp;
1024 struct tcphdr th;
1025 void (*notify) __P((struct inpcb *, int)) = tcp_notify;
1026 struct tcphdr th;
1027 void (*notify) __P((struct inpcb *, int)) = tcp_notify;
1026 struct sockaddr_in6 sa6;
1027 struct ip6_hdr *ip6;
1028 struct mbuf *m;
1028 struct ip6_hdr *ip6;
1029 struct mbuf *m;
1030 struct ip6ctlparam *ip6cp = NULL;
1031 const struct sockaddr_in6 *sa6_src = NULL;
1029 int off;
1032 int off;
1033 struct tcp_portonly {
1034 u_int16_t th_sport;
1035 u_int16_t th_dport;
1036 } *thp;
1030
1031 if (sa->sa_family != AF_INET6 ||
1032 sa->sa_len != sizeof(struct sockaddr_in6))
1033 return;
1034
1035 if (cmd == PRC_QUENCH)
1036 notify = tcp_quench;
1037 else if (cmd == PRC_MSGSIZE)
1038 notify = tcp_mtudisc;
1039 else if (!PRC_IS_REDIRECT(cmd) &&
1040 ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
1041 return;
1042
1043 /* if the parameter is from icmp6, decode it. */
1044 if (d != NULL) {
1037
1038 if (sa->sa_family != AF_INET6 ||
1039 sa->sa_len != sizeof(struct sockaddr_in6))
1040 return;
1041
1042 if (cmd == PRC_QUENCH)
1043 notify = tcp_quench;
1044 else if (cmd == PRC_MSGSIZE)
1045 notify = tcp_mtudisc;
1046 else if (!PRC_IS_REDIRECT(cmd) &&
1047 ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
1048 return;
1049
1050 /* if the parameter is from icmp6, decode it. */
1051 if (d != NULL) {
1045 struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
1052 ip6cp = (struct ip6ctlparam *)d;
1046 m = ip6cp->ip6c_m;
1047 ip6 = ip6cp->ip6c_ip6;
1048 off = ip6cp->ip6c_off;
1053 m = ip6cp->ip6c_m;
1054 ip6 = ip6cp->ip6c_ip6;
1055 off = ip6cp->ip6c_off;
1056 sa6_src = ip6cp->ip6c_src;
1049 } else {
1050 m = NULL;
1051 ip6 = NULL;
1052 off = 0; /* fool gcc */
1057 } else {
1058 m = NULL;
1059 ip6 = NULL;
1060 off = 0; /* fool gcc */
1061 sa6_src = &sa6_any;
1053 }
1054
1062 }
1063
1055 /*
1056 * Translate addresses into internal form.
1057 * Sa check if it is AF_INET6 is done at the top of this funciton.
1058 */
1059 sa6 = *(struct sockaddr_in6 *)sa;
1060 if (IN6_IS_ADDR_LINKLOCAL(&sa6.sin6_addr) != 0 && m != NULL &&
1061 m->m_pkthdr.rcvif != NULL)
1062 sa6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index);
1063
1064 if (ip6) {
1065 /*
1066 * XXX: We assume that when IPV6 is non NULL,
1067 * M and OFF are valid.
1068 */
1064 if (ip6) {
1065 /*
1066 * XXX: We assume that when IPV6 is non NULL,
1067 * M and OFF are valid.
1068 */
1069 struct in6_addr s;
1070
1069
1071 /* translate addresses into internal form */
1072 memcpy(&s, &ip6->ip6_src, sizeof(s));
1073 if (IN6_IS_ADDR_LINKLOCAL(&s) != 0 && m != NULL &&
1074 m->m_pkthdr.rcvif != NULL)
1075 s.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index);
1076
1077 /* check if we can safely examine src and dst ports */
1070 /* check if we can safely examine src and dst ports */
1078 if (m->m_pkthdr.len < off + sizeof(th))
1071 if (m->m_pkthdr.len < off + sizeof(*thp))
1079 return;
1080
1072 return;
1073
1081 if (m->m_len < off + sizeof(th)) {
1082 /*
1083 * this should be rare case
1084 * because now MINCLSIZE is "(MHLEN + 1)",
1085 * so we compromise on this copy...
1086 */
1087 m_copydata(m, off, sizeof(th), (caddr_t)&th);
1088 thp = &th;
1089 } else
1090 thp = (struct tcphdr *)(mtod(m, caddr_t) + off);
1091 in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, thp->th_dport,
1092 &s, thp->th_sport, cmd, notify);
1074 bzero(&th, sizeof(th));
1075 m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
1076
1077 in6_pcbnotify(&tcb, sa, th.th_dport,
1078 (struct sockaddr *)ip6cp->ip6c_src,
1079 th.th_sport, cmd, notify);
1093 } else
1080 } else
1094 in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, 0, &zeroin6_addr,
1081 in6_pcbnotify(&tcb, sa, 0, (struct sockaddr *)sa6_src,
1095 0, cmd, notify);
1096}
1097#endif /* INET6 */
1098
#define TCP_RNDISS_ROUNDS	16
#define TCP_RNDISS_OUT	7200
#define TCP_RNDISS_MAX	30000

u_int8_t tcp_rndiss_sbox[128];
u_int16_t tcp_rndiss_msb;
u_int16_t tcp_rndiss_cnt;
long tcp_rndiss_reseed;

/*
 * Scramble a 16-bit value through TCP_RNDISS_ROUNDS rounds.  Each round
 * advances a running key by 0x79b9, XORs in an s-box byte (selected by
 * the low seven bits of value ^ key) shifted left by seven, and then
 * recombines the low byte (shifted left by seven) with the high byte.
 */
u_int16_t
tcp_rndiss_encrypt(val)
	u_int16_t val;
{
	u_int16_t key = 0;
	u_int16_t round = 0;

	while (round < TCP_RNDISS_ROUNDS) {
		key += 0x79b9;
		val ^= ((u_int16_t)tcp_rndiss_sbox[(val ^ key) & 0x7f]) << 7;
		val = ((val & 0xff) << 7) | (val >> 8);
		round++;
	}

	return val;
}
1122
1123void
1124tcp_rndiss_init()
1125{
1126 struct timeval time;
1127
1128 getmicrotime(&time);
1129 read_random(tcp_rndiss_sbox, sizeof(tcp_rndiss_sbox));
1130
1131 tcp_rndiss_reseed = time.tv_sec + TCP_RNDISS_OUT;
1132 tcp_rndiss_msb = tcp_rndiss_msb == 0x8000 ? 0 : 0x8000;
1133 tcp_rndiss_cnt = 0;
1134}
1135
1136tcp_seq
1137tcp_rndiss_next()
1138{
1139 u_int16_t tmp;
1140 struct timeval time;
1141
1142 getmicrotime(&time);
1143
1144 if (tcp_rndiss_cnt >= TCP_RNDISS_MAX ||
1145 time.tv_sec > tcp_rndiss_reseed)
1146 tcp_rndiss_init();
1147
1148 read_random(&tmp, sizeof(tmp));
1149
1150 /* (tmp & 0x7fff) ensures a 32768 byte gap between ISS */
1151 return ((tcp_rndiss_encrypt(tcp_rndiss_cnt++) | tcp_rndiss_msb) <<16) |
1152 (tmp & 0x7fff);
1153}
1154
1155
1156/*
1157 * When a source quench is received, close congestion window
1158 * to one segment. We will gradually open it again as we proceed.
1159 */
1160void
1161tcp_quench(inp, errno)
1162 struct inpcb *inp;
1163 int errno;
1164{
1165 struct tcpcb *tp = intotcpcb(inp);
1166
1167 if (tp)
1168 tp->snd_cwnd = tp->t_maxseg;
1169}
1170
1171/*
1172 * When a specific ICMP unreachable message is received and the
1173 * connection state is SYN-SENT, drop the connection. This behavior
1174 * is controlled by the icmp_may_rst sysctl.
1175 */
1176void
1177tcp_drop_syn_sent(inp, errno)
1178 struct inpcb *inp;
1179 int errno;
1180{
1181 struct tcpcb *tp = intotcpcb(inp);
1182
1183 if (tp && tp->t_state == TCPS_SYN_SENT)
1184 tcp_drop(tp, errno);
1185}
1186
1187/*
1188 * When `need fragmentation' ICMP is received, update our idea of the MSS
1189 * based on the new value in the route. Also nudge TCP to send something,
1190 * since we know the packet we just sent was dropped.
1191 * This duplicates some code in the tcp_mss() function in tcp_input.c.
1192 */
1193void
1194tcp_mtudisc(inp, errno)
1195 struct inpcb *inp;
1196 int errno;
1197{
1198 struct tcpcb *tp = intotcpcb(inp);
1199 struct rtentry *rt;
1200 struct rmxp_tao *taop;
1201 struct socket *so = inp->inp_socket;
1202 int offered;
1203 int mss;
1204#ifdef INET6
1205 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
1206#endif /* INET6 */
1207
1208 if (tp) {
1209#ifdef INET6
1210 if (isipv6)
1211 rt = tcp_rtlookup6(inp);
1212 else
1213#endif /* INET6 */
1214 rt = tcp_rtlookup(inp);
1215 if (!rt || !rt->rt_rmx.rmx_mtu) {
1216 tp->t_maxopd = tp->t_maxseg =
1217#ifdef INET6
1218 isipv6 ? tcp_v6mssdflt :
1219#endif /* INET6 */
1220 tcp_mssdflt;
1221 return;
1222 }
1223 taop = rmx_taop(rt->rt_rmx);
1224 offered = taop->tao_mssopt;
1225 mss = rt->rt_rmx.rmx_mtu -
1226#ifdef INET6
1227 (isipv6 ?
1228 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
1229#endif /* INET6 */
1230 sizeof(struct tcpiphdr)
1231#ifdef INET6
1232 )
1233#endif /* INET6 */
1234 ;
1235
1236 if (offered)
1237 mss = min(mss, offered);
1238 /*
1239 * XXX - The above conditional probably violates the TCP
1240 * spec. The problem is that, since we don't know the
1241 * other end's MSS, we are supposed to use a conservative
1242 * default. But, if we do that, then MTU discovery will
1243 * never actually take place, because the conservative
1244 * default is much less than the MTUs typically seen
1245 * on the Internet today. For the moment, we'll sweep
1246 * this under the carpet.
1247 *
1248 * The conservative default might not actually be a problem
1249 * if the only case this occurs is when sending an initial
1250 * SYN with options and data to a host we've never talked
1251 * to before. Then, they will reply with an MSS value which
1252 * will get recorded and the new parameters should get
1253 * recomputed. For Further Study.
1254 */
1255 if (tp->t_maxopd <= mss)
1256 return;
1257 tp->t_maxopd = mss;
1258
1259 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1260 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
1261 mss -= TCPOLEN_TSTAMP_APPA;
1262 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
1263 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
1264 mss -= TCPOLEN_CC_APPA;
1265#if (MCLBYTES & (MCLBYTES - 1)) == 0
1266 if (mss > MCLBYTES)
1267 mss &= ~(MCLBYTES-1);
1268#else
1269 if (mss > MCLBYTES)
1270 mss = mss / MCLBYTES * MCLBYTES;
1271#endif
1272 if (so->so_snd.sb_hiwat < mss)
1273 mss = so->so_snd.sb_hiwat;
1274
1275 tp->t_maxseg = mss;
1276
1277 tcpstat.tcps_mturesent++;
1278 tp->t_rtttime = 0;
1279 tp->snd_nxt = tp->snd_una;
1280 tcp_output(tp);
1281 }
1282}
1283
1284/*
1285 * Look-up the routing entry to the peer of this inpcb. If no route
1286 * is found and it cannot be allocated the return NULL. This routine
1287 * is called by TCP routines that access the rmx structure and by tcp_mss
1288 * to get the interface MTU.
1289 */
1290struct rtentry *
1291tcp_rtlookup(inp)
1292 struct inpcb *inp;
1293{
1294 struct route *ro;
1295 struct rtentry *rt;
1296
1297 ro = &inp->inp_route;
1298 rt = ro->ro_rt;
1299 if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1300 /* No route yet, so try to acquire one */
1301 if (inp->inp_faddr.s_addr != INADDR_ANY) {
1302 ro->ro_dst.sa_family = AF_INET;
1303 ro->ro_dst.sa_len = sizeof(ro->ro_dst);
1304 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
1305 inp->inp_faddr;
1306 rtalloc(ro);
1307 rt = ro->ro_rt;
1308 }
1309 }
1310 return rt;
1311}
1312
1313#ifdef INET6
1314struct rtentry *
1315tcp_rtlookup6(inp)
1316 struct inpcb *inp;
1317{
1318 struct route_in6 *ro6;
1319 struct rtentry *rt;
1320
1321 ro6 = &inp->in6p_route;
1322 rt = ro6->ro_rt;
1323 if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1324 /* No route yet, so try to acquire one */
1325 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
1082 0, cmd, notify);
1083}
1084#endif /* INET6 */
1085
/*
 * Tunables for the randomized initial send sequence (ISS) generator.
 */
#define	TCP_RNDISS_ROUNDS	16	/* mixing rounds in tcp_rndiss_encrypt() */
#define	TCP_RNDISS_OUT		7200	/* seconds from (re)seed until the next reseed */
#define	TCP_RNDISS_MAX		30000	/* values issued before a reseed is forced */

/*
 * Generator state; all of it is refreshed by tcp_rndiss_init().
 */
u_int8_t	tcp_rndiss_sbox[128];			/* random substitution table */
u_int16_t	tcp_rndiss_msb, tcp_rndiss_cnt;	/* toggled top bit, issue counter */
long		tcp_rndiss_reseed;			/* tv_sec after which to reseed */
1094
1095u_int16_t
1096tcp_rndiss_encrypt(val)
1097 u_int16_t val;
1098{
1099 u_int16_t sum = 0, i;
1100
1101 for (i = 0; i < TCP_RNDISS_ROUNDS; i++) {
1102 sum += 0x79b9;
1103 val ^= ((u_int16_t)tcp_rndiss_sbox[(val^sum) & 0x7f]) << 7;
1104 val = ((val & 0xff) << 7) | (val >> 8);
1105 }
1106
1107 return val;
1108}
1109
1110void
1111tcp_rndiss_init()
1112{
1113 struct timeval time;
1114
1115 getmicrotime(&time);
1116 read_random(tcp_rndiss_sbox, sizeof(tcp_rndiss_sbox));
1117
1118 tcp_rndiss_reseed = time.tv_sec + TCP_RNDISS_OUT;
1119 tcp_rndiss_msb = tcp_rndiss_msb == 0x8000 ? 0 : 0x8000;
1120 tcp_rndiss_cnt = 0;
1121}
1122
1123tcp_seq
1124tcp_rndiss_next()
1125{
1126 u_int16_t tmp;
1127 struct timeval time;
1128
1129 getmicrotime(&time);
1130
1131 if (tcp_rndiss_cnt >= TCP_RNDISS_MAX ||
1132 time.tv_sec > tcp_rndiss_reseed)
1133 tcp_rndiss_init();
1134
1135 read_random(&tmp, sizeof(tmp));
1136
1137 /* (tmp & 0x7fff) ensures a 32768 byte gap between ISS */
1138 return ((tcp_rndiss_encrypt(tcp_rndiss_cnt++) | tcp_rndiss_msb) <<16) |
1139 (tmp & 0x7fff);
1140}
1141
1142
1143/*
1144 * When a source quench is received, close congestion window
1145 * to one segment. We will gradually open it again as we proceed.
1146 */
1147void
1148tcp_quench(inp, errno)
1149 struct inpcb *inp;
1150 int errno;
1151{
1152 struct tcpcb *tp = intotcpcb(inp);
1153
1154 if (tp)
1155 tp->snd_cwnd = tp->t_maxseg;
1156}
1157
1158/*
1159 * When a specific ICMP unreachable message is received and the
1160 * connection state is SYN-SENT, drop the connection. This behavior
1161 * is controlled by the icmp_may_rst sysctl.
1162 */
1163void
1164tcp_drop_syn_sent(inp, errno)
1165 struct inpcb *inp;
1166 int errno;
1167{
1168 struct tcpcb *tp = intotcpcb(inp);
1169
1170 if (tp && tp->t_state == TCPS_SYN_SENT)
1171 tcp_drop(tp, errno);
1172}
1173
1174/*
1175 * When `need fragmentation' ICMP is received, update our idea of the MSS
1176 * based on the new value in the route. Also nudge TCP to send something,
1177 * since we know the packet we just sent was dropped.
1178 * This duplicates some code in the tcp_mss() function in tcp_input.c.
1179 */
1180void
1181tcp_mtudisc(inp, errno)
1182 struct inpcb *inp;
1183 int errno;
1184{
1185 struct tcpcb *tp = intotcpcb(inp);
1186 struct rtentry *rt;
1187 struct rmxp_tao *taop;
1188 struct socket *so = inp->inp_socket;
1189 int offered;
1190 int mss;
1191#ifdef INET6
1192 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
1193#endif /* INET6 */
1194
1195 if (tp) {
1196#ifdef INET6
1197 if (isipv6)
1198 rt = tcp_rtlookup6(inp);
1199 else
1200#endif /* INET6 */
1201 rt = tcp_rtlookup(inp);
1202 if (!rt || !rt->rt_rmx.rmx_mtu) {
1203 tp->t_maxopd = tp->t_maxseg =
1204#ifdef INET6
1205 isipv6 ? tcp_v6mssdflt :
1206#endif /* INET6 */
1207 tcp_mssdflt;
1208 return;
1209 }
1210 taop = rmx_taop(rt->rt_rmx);
1211 offered = taop->tao_mssopt;
1212 mss = rt->rt_rmx.rmx_mtu -
1213#ifdef INET6
1214 (isipv6 ?
1215 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
1216#endif /* INET6 */
1217 sizeof(struct tcpiphdr)
1218#ifdef INET6
1219 )
1220#endif /* INET6 */
1221 ;
1222
1223 if (offered)
1224 mss = min(mss, offered);
1225 /*
1226 * XXX - The above conditional probably violates the TCP
1227 * spec. The problem is that, since we don't know the
1228 * other end's MSS, we are supposed to use a conservative
1229 * default. But, if we do that, then MTU discovery will
1230 * never actually take place, because the conservative
1231 * default is much less than the MTUs typically seen
1232 * on the Internet today. For the moment, we'll sweep
1233 * this under the carpet.
1234 *
1235 * The conservative default might not actually be a problem
1236 * if the only case this occurs is when sending an initial
1237 * SYN with options and data to a host we've never talked
1238 * to before. Then, they will reply with an MSS value which
1239 * will get recorded and the new parameters should get
1240 * recomputed. For Further Study.
1241 */
1242 if (tp->t_maxopd <= mss)
1243 return;
1244 tp->t_maxopd = mss;
1245
1246 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1247 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
1248 mss -= TCPOLEN_TSTAMP_APPA;
1249 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
1250 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
1251 mss -= TCPOLEN_CC_APPA;
1252#if (MCLBYTES & (MCLBYTES - 1)) == 0
1253 if (mss > MCLBYTES)
1254 mss &= ~(MCLBYTES-1);
1255#else
1256 if (mss > MCLBYTES)
1257 mss = mss / MCLBYTES * MCLBYTES;
1258#endif
1259 if (so->so_snd.sb_hiwat < mss)
1260 mss = so->so_snd.sb_hiwat;
1261
1262 tp->t_maxseg = mss;
1263
1264 tcpstat.tcps_mturesent++;
1265 tp->t_rtttime = 0;
1266 tp->snd_nxt = tp->snd_una;
1267 tcp_output(tp);
1268 }
1269}
1270
1271/*
1272 * Look-up the routing entry to the peer of this inpcb. If no route
1273 * is found and it cannot be allocated the return NULL. This routine
1274 * is called by TCP routines that access the rmx structure and by tcp_mss
1275 * to get the interface MTU.
1276 */
1277struct rtentry *
1278tcp_rtlookup(inp)
1279 struct inpcb *inp;
1280{
1281 struct route *ro;
1282 struct rtentry *rt;
1283
1284 ro = &inp->inp_route;
1285 rt = ro->ro_rt;
1286 if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1287 /* No route yet, so try to acquire one */
1288 if (inp->inp_faddr.s_addr != INADDR_ANY) {
1289 ro->ro_dst.sa_family = AF_INET;
1290 ro->ro_dst.sa_len = sizeof(ro->ro_dst);
1291 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
1292 inp->inp_faddr;
1293 rtalloc(ro);
1294 rt = ro->ro_rt;
1295 }
1296 }
1297 return rt;
1298}
1299
1300#ifdef INET6
1301struct rtentry *
1302tcp_rtlookup6(inp)
1303 struct inpcb *inp;
1304{
1305 struct route_in6 *ro6;
1306 struct rtentry *rt;
1307
1308 ro6 = &inp->in6p_route;
1309 rt = ro6->ro_rt;
1310 if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1311 /* No route yet, so try to acquire one */
1312 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
1326 ro6->ro_dst.sin6_family = AF_INET6;
1327 ro6->ro_dst.sin6_len = sizeof(ro6->ro_dst);
1328 ro6->ro_dst.sin6_addr = inp->in6p_faddr;
1313 struct sockaddr_in6 *dst6;
1314
1315 dst6 = (struct sockaddr_in6 *)&ro6->ro_dst;
1316 dst6->sin6_family = AF_INET6;
1317 dst6->sin6_len = sizeof(ro6->ro_dst);
1318 dst6->sin6_addr = inp->in6p_faddr;
1329 rtalloc((struct route *)ro6);
1330 rt = ro6->ro_rt;
1331 }
1332 }
1333 return rt;
1334}
1335#endif /* INET6 */
1336
1337#ifdef IPSEC
1338/* compute ESP/AH header size for TCP, including outer IP header. */
1339size_t
1340ipsec_hdrsiz_tcp(tp)
1341 struct tcpcb *tp;
1342{
1343 struct inpcb *inp;
1344 struct mbuf *m;
1345 size_t hdrsiz;
1346 struct ip *ip;
1347#ifdef INET6
1348 struct ip6_hdr *ip6;
1349#endif /* INET6 */
1350 struct tcphdr *th;
1351
1352 if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
1353 return 0;
1354 MGETHDR(m, M_DONTWAIT, MT_DATA);
1355 if (!m)
1356 return 0;
1357
1358#ifdef INET6
1359 if ((inp->inp_vflag & INP_IPV6) != 0) {
1360 ip6 = mtod(m, struct ip6_hdr *);
1361 th = (struct tcphdr *)(ip6 + 1);
1362 m->m_pkthdr.len = m->m_len =
1363 sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1364 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6,
1365 sizeof(struct ip6_hdr));
1366 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
1367 sizeof(struct tcphdr));
1368 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1369 } else
1370#endif /* INET6 */
1371 {
1372 ip = mtod(m, struct ip *);
1373 th = (struct tcphdr *)(ip + 1);
1374 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
1375 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip,
1376 sizeof(struct ip));
1377 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
1378 sizeof(struct tcphdr));
1319 rtalloc((struct route *)ro6);
1320 rt = ro6->ro_rt;
1321 }
1322 }
1323 return rt;
1324}
1325#endif /* INET6 */
1326
1327#ifdef IPSEC
1328/* compute ESP/AH header size for TCP, including outer IP header. */
1329size_t
1330ipsec_hdrsiz_tcp(tp)
1331 struct tcpcb *tp;
1332{
1333 struct inpcb *inp;
1334 struct mbuf *m;
1335 size_t hdrsiz;
1336 struct ip *ip;
1337#ifdef INET6
1338 struct ip6_hdr *ip6;
1339#endif /* INET6 */
1340 struct tcphdr *th;
1341
1342 if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
1343 return 0;
1344 MGETHDR(m, M_DONTWAIT, MT_DATA);
1345 if (!m)
1346 return 0;
1347
1348#ifdef INET6
1349 if ((inp->inp_vflag & INP_IPV6) != 0) {
1350 ip6 = mtod(m, struct ip6_hdr *);
1351 th = (struct tcphdr *)(ip6 + 1);
1352 m->m_pkthdr.len = m->m_len =
1353 sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1354 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6,
1355 sizeof(struct ip6_hdr));
1356 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
1357 sizeof(struct tcphdr));
1358 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1359 } else
1360#endif /* INET6 */
1361 {
1362 ip = mtod(m, struct ip *);
1363 th = (struct tcphdr *)(ip + 1);
1364 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
1365 bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip,
1366 sizeof(struct ip));
1367 bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th,
1368 sizeof(struct tcphdr));
1369 ip->ip_vhl = IP_VHL_BORING;
1379 hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1380 }
1381
1382 m_free(m);
1383 return hdrsiz;
1384}
1385#endif /*IPSEC*/
1386
1387/*
1388 * Return a pointer to the cached information about the remote host.
1389 * The cached information is stored in the protocol specific part of
1390 * the route metrics.
1391 */
1392struct rmxp_tao *
1393tcp_gettaocache(inp)
1394 struct inpcb *inp;
1395{
1396 struct rtentry *rt;
1397
1398#ifdef INET6
1399 if ((inp->inp_vflag & INP_IPV6) != 0)
1400 rt = tcp_rtlookup6(inp);
1401 else
1402#endif /* INET6 */
1403 rt = tcp_rtlookup(inp);
1404
1405 /* Make sure this is a host route and is up. */
1406 if (rt == NULL ||
1407 (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
1408 return NULL;
1409
1410 return rmx_taop(rt->rt_rmx);
1411}
1412
/*
 * Clear all the TAO cache entries.  Per the original note this is called
 * from tcp_init.
 *
 * XXX
 * Deliberately a no-op: the routing tables are assumed to be initialized
 * at the same time as TCP, so no cache entries can be left over.
 */
static void
tcp_cleartaocache()
{
	/* Nothing to clear. */
	return;
}
1370 hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1371 }
1372
1373 m_free(m);
1374 return hdrsiz;
1375}
1376#endif /*IPSEC*/
1377
1378/*
1379 * Return a pointer to the cached information about the remote host.
1380 * The cached information is stored in the protocol specific part of
1381 * the route metrics.
1382 */
1383struct rmxp_tao *
1384tcp_gettaocache(inp)
1385 struct inpcb *inp;
1386{
1387 struct rtentry *rt;
1388
1389#ifdef INET6
1390 if ((inp->inp_vflag & INP_IPV6) != 0)
1391 rt = tcp_rtlookup6(inp);
1392 else
1393#endif /* INET6 */
1394 rt = tcp_rtlookup(inp);
1395
1396 /* Make sure this is a host route and is up. */
1397 if (rt == NULL ||
1398 (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
1399 return NULL;
1400
1401 return rmx_taop(rt->rt_rmx);
1402}
1403
/*
 * Clear all the TAO cache entries.  Per the original note this is called
 * from tcp_init.
 *
 * XXX
 * Deliberately a no-op: the routing tables are assumed to be initialized
 * at the same time as TCP, so no cache entries can be left over.
 */
static void
tcp_cleartaocache()
{
	/* Nothing to clear. */
	return;
}