Deleted Added
sdiff udiff text old ( 163606 ) new ( 164033 )
full compact
1/*-
2 * Copyright (c) 1982, 1986, 1991, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
30 * $FreeBSD: head/sys/netinet/in_pcb.c 163606 2006-10-22 11:52:19Z rwatson $
31 */
32
33#include "opt_ipsec.h"
34#include "opt_inet6.h"
35#include "opt_mac.h"
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/malloc.h>
40#include <sys/mbuf.h>
41#include <sys/domain.h>
42#include <sys/protosw.h>
43#include <sys/socket.h>
44#include <sys/socketvar.h>
45#include <sys/proc.h>
46#include <sys/jail.h>
47#include <sys/kernel.h>
48#include <sys/sysctl.h>
49
50#include <vm/uma.h>
51
52#include <net/if.h>
53#include <net/if_types.h>
54#include <net/route.h>
55
56#include <netinet/in.h>
57#include <netinet/in_pcb.h>
58#include <netinet/in_var.h>
59#include <netinet/ip_var.h>
60#include <netinet/tcp_var.h>
61#include <netinet/udp.h>
62#include <netinet/udp_var.h>
63#ifdef INET6
64#include <netinet/ip6.h>
65#include <netinet6/ip6_var.h>
66#endif /* INET6 */
67
68#ifdef IPSEC
69#include <netinet6/ipsec.h>
70#include <netkey/key.h>
71#endif /* IPSEC */
72
73#ifdef FAST_IPSEC
74#if defined(IPSEC) || defined(IPSEC_ESP)
75#error "Bad idea: don't compile with both IPSEC and FAST_IPSEC!"
76#endif
77
78#include <netipsec/ipsec.h>
79#include <netipsec/key.h>
80#endif /* FAST_IPSEC */
81
82#include <security/mac/mac_framework.h>
83
84/*
85 * These configure the range of local port addresses assigned to
86 * "unspecified" outgoing connections/packets/whatever.
87 */
88int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */
89int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */
90int ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
91int ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */
92int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
93int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */
94
95/*
96 * Reserved ports accessible only to root. There are significant
97 * security considerations that must be accounted for when changing these,
98 * but the security benefits can be great. Please be careful.
99 */
100int ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */
101int ipport_reservedlow = 0;
102
103/* Variables dealing with random ephemeral port allocation. */
104int ipport_randomized = 1; /* user controlled via sysctl */
105int ipport_randomcps = 10; /* user controlled via sysctl */
106int ipport_randomtime = 45; /* user controlled via sysctl */
107int ipport_stoprandom = 0; /* toggled by ipport_tick */
108int ipport_tcpallocs;
109int ipport_tcplastcount;
110
111#define RANGECHK(var, min, max) \
112 if ((var) < (min)) { (var) = (min); } \
113 else if ((var) > (max)) { (var) = (max); }
114
115static int
116sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
117{
118 int error;
119
120 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
121 if (error == 0) {
122 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
123 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
124 RANGECHK(ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
125 RANGECHK(ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
126 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
127 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
128 }
129 return (error);
130}
131
132#undef RANGECHK
133
134SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
135
136SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
137 &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
138SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
139 &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
140SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
141 &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
142SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
143 &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
144SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
145 &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
146SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
147 &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
148SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
149 CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedhigh, 0, "");
150SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
151 CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedlow, 0, "");
152SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
153 &ipport_randomized, 0, "Enable random port allocation");
154SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
155 &ipport_randomcps, 0, "Maximum number of random port "
156 "allocations before switching to a sequental one");
157SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
158 &ipport_randomtime, 0, "Minimum time to keep sequental port "
159 "allocation before switching to a random one");
160
161/*
162 * in_pcb.c: manage the Protocol Control Blocks.
163 *
164 * NOTE: It is assumed that most of these functions will be called with
165 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
166 * functions often modify hash chains or addresses in pcbs.
167 */
168
169/*
170 * Allocate a PCB and associate it with the socket.
171 * On success return with the PCB locked.
172 */
173int
174in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
175{
176 struct inpcb *inp;
177 int error;
178
179 INP_INFO_WLOCK_ASSERT(pcbinfo);
180 error = 0;
181 inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
182 if (inp == NULL)
183 return (ENOBUFS);
184 bzero(inp,inp_zero_size);
185 inp->inp_pcbinfo = pcbinfo;
186 inp->inp_socket = so;
187#ifdef MAC
188 error = mac_init_inpcb(inp, M_NOWAIT);
189 if (error != 0)
190 goto out;
191 SOCK_LOCK(so);
192 mac_create_inpcb_from_socket(so, inp);
193 SOCK_UNLOCK(so);
194#endif
195#if defined(IPSEC) || defined(FAST_IPSEC)
196#ifdef FAST_IPSEC
197 error = ipsec_init_policy(so, &inp->inp_sp);
198#else
199 error = ipsec_init_pcbpolicy(so, &inp->inp_sp);
200#endif
201 if (error != 0)
202 goto out;
203#endif /*IPSEC*/
204#if defined(INET6)
205 if (INP_SOCKAF(so) == AF_INET6) {
206 inp->inp_vflag |= INP_IPV6PROTO;
207 if (ip6_v6only)
208 inp->inp_flags |= IN6P_IPV6_V6ONLY;
209 }
210#endif
211 LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list);
212 pcbinfo->ipi_count++;
213 so->so_pcb = (caddr_t)inp;
214#ifdef INET6
215 if (ip6_auto_flowlabel)
216 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
217#endif
218 INP_LOCK(inp);
219 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
220
221#if defined(IPSEC) || defined(FAST_IPSEC) || defined(MAC)
222out:
223 if (error != 0)
224 uma_zfree(pcbinfo->ipi_zone, inp);
225#endif
226 return (error);
227}
228
229int
230in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
231{
232 int anonport, error;
233
234 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
235 INP_LOCK_ASSERT(inp);
236
237 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
238 return (EINVAL);
239 anonport = inp->inp_lport == 0 && (nam == NULL ||
240 ((struct sockaddr_in *)nam)->sin_port == 0);
241 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
242 &inp->inp_lport, cred);
243 if (error)
244 return (error);
245 if (in_pcbinshash(inp) != 0) {
246 inp->inp_laddr.s_addr = INADDR_ANY;
247 inp->inp_lport = 0;
248 return (EAGAIN);
249 }
250 if (anonport)
251 inp->inp_flags |= INP_ANONPORT;
252 return (0);
253}
254
255/*
256 * Set up a bind operation on a PCB, performing port allocation
257 * as required, but do not actually modify the PCB. Callers can
258 * either complete the bind by setting inp_laddr/inp_lport and
259 * calling in_pcbinshash(), or they can just use the resulting
260 * port and address to authorise the sending of a once-off packet.
261 *
262 * On error, the values of *laddrp and *lportp are not changed.
263 */
264int
265in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
266 u_short *lportp, struct ucred *cred)
267{
268 struct socket *so = inp->inp_socket;
269 unsigned short *lastport;
270 struct sockaddr_in *sin;
271 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
272 struct in_addr laddr;
273 u_short lport = 0;
274 int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
275 int error, prison = 0;
276 int dorandom;
277
278 INP_INFO_WLOCK_ASSERT(pcbinfo);
279 INP_LOCK_ASSERT(inp);
280
281 if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */
282 return (EADDRNOTAVAIL);
283 laddr.s_addr = *laddrp;
284 if (nam != NULL && laddr.s_addr != INADDR_ANY)
285 return (EINVAL);
286 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
287 wild = INPLOOKUP_WILDCARD;
288 if (nam) {
289 sin = (struct sockaddr_in *)nam;
290 if (nam->sa_len != sizeof (*sin))
291 return (EINVAL);
292#ifdef notdef
293 /*
294 * We should check the family, but old programs
295 * incorrectly fail to initialize it.
296 */
297 if (sin->sin_family != AF_INET)
298 return (EAFNOSUPPORT);
299#endif
300 if (sin->sin_addr.s_addr != INADDR_ANY)
301 if (prison_ip(cred, 0, &sin->sin_addr.s_addr))
302 return(EINVAL);
303 if (sin->sin_port != *lportp) {
304 /* Don't allow the port to change. */
305 if (*lportp != 0)
306 return (EINVAL);
307 lport = sin->sin_port;
308 }
309 /* NB: lport is left as 0 if the port isn't being changed. */
310 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
311 /*
312 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
313 * allow complete duplication of binding if
314 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
315 * and a multicast address is bound on both
316 * new and duplicated sockets.
317 */
318 if (so->so_options & SO_REUSEADDR)
319 reuseport = SO_REUSEADDR|SO_REUSEPORT;
320 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
321 sin->sin_port = 0; /* yech... */
322 bzero(&sin->sin_zero, sizeof(sin->sin_zero));
323 if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
324 return (EADDRNOTAVAIL);
325 }
326 laddr = sin->sin_addr;
327 if (lport) {
328 struct inpcb *t;
329 struct tcptw *tw;
330
331 /* GROSS */
332 if (ntohs(lport) <= ipport_reservedhigh &&
333 ntohs(lport) >= ipport_reservedlow &&
334 suser_cred(cred, SUSER_ALLOWJAIL))
335 return (EACCES);
336 if (jailed(cred))
337 prison = 1;
338 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
339 suser_cred(so->so_cred, SUSER_ALLOWJAIL) != 0) {
340 t = in_pcblookup_local(inp->inp_pcbinfo,
341 sin->sin_addr, lport,
342 prison ? 0 : INPLOOKUP_WILDCARD);
343 /*
344 * XXX
345 * This entire block sorely needs a rewrite.
346 */
347 if (t &&
348 ((t->inp_vflag & INP_TIMEWAIT) == 0) &&
349 (so->so_type != SOCK_STREAM ||
350 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
351 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
352 ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
353 (t->inp_socket->so_options &
354 SO_REUSEPORT) == 0) &&
355 (so->so_cred->cr_uid !=
356 t->inp_socket->so_cred->cr_uid))
357 return (EADDRINUSE);
358 }
359 if (prison && prison_ip(cred, 0, &sin->sin_addr.s_addr))
360 return (EADDRNOTAVAIL);
361 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
362 lport, prison ? 0 : wild);
363 if (t && (t->inp_vflag & INP_TIMEWAIT)) {
364 /*
365 * XXXRW: If an incpb has had its timewait
366 * state recycled, we treat the address as
367 * being in use (for now). This is better
368 * than a panic, but not desirable.
369 */
370 tw = intotw(inp);
371 if (tw == NULL ||
372 (reuseport & tw->tw_so_options) == 0)
373 return (EADDRINUSE);
374 } else if (t &&
375 (reuseport & t->inp_socket->so_options) == 0) {
376#if defined(INET6)
377 if (ntohl(sin->sin_addr.s_addr) !=
378 INADDR_ANY ||
379 ntohl(t->inp_laddr.s_addr) !=
380 INADDR_ANY ||
381 INP_SOCKAF(so) ==
382 INP_SOCKAF(t->inp_socket))
383#endif /* defined(INET6) */
384 return (EADDRINUSE);
385 }
386 }
387 }
388 if (*lportp != 0)
389 lport = *lportp;
390 if (lport == 0) {
391 u_short first, last;
392 int count;
393
394 if (laddr.s_addr != INADDR_ANY)
395 if (prison_ip(cred, 0, &laddr.s_addr))
396 return (EINVAL);
397
398 if (inp->inp_flags & INP_HIGHPORT) {
399 first = ipport_hifirstauto; /* sysctl */
400 last = ipport_hilastauto;
401 lastport = &pcbinfo->lasthi;
402 } else if (inp->inp_flags & INP_LOWPORT) {
403 if ((error = suser_cred(cred, SUSER_ALLOWJAIL)) != 0)
404 return error;
405 first = ipport_lowfirstauto; /* 1023 */
406 last = ipport_lowlastauto; /* 600 */
407 lastport = &pcbinfo->lastlow;
408 } else {
409 first = ipport_firstauto; /* sysctl */
410 last = ipport_lastauto;
411 lastport = &pcbinfo->lastport;
412 }
413 /*
414 * For UDP, use random port allocation as long as the user
415 * allows it. For TCP (and as of yet unknown) connections,
416 * use random port allocation only if the user allows it AND
417 * ipport_tick() allows it.
418 */
419 if (ipport_randomized &&
420 (!ipport_stoprandom || pcbinfo == &udbinfo))
421 dorandom = 1;
422 else
423 dorandom = 0;
424 /*
425 * It makes no sense to do random port allocation if
426 * we have the only port available.
427 */
428 if (first == last)
429 dorandom = 0;
430 /* Make sure to not include UDP packets in the count. */
431 if (pcbinfo != &udbinfo)
432 ipport_tcpallocs++;
433 /*
434 * Simple check to ensure all ports are not used up causing
435 * a deadlock here.
436 *
437 * We split the two cases (up and down) so that the direction
438 * is not being tested on each round of the loop.
439 */
440 if (first > last) {
441 /*
442 * counting down
443 */
444 if (dorandom)
445 *lastport = first -
446 (arc4random() % (first - last));
447 count = first - last;
448
449 do {
450 if (count-- < 0) /* completely used? */
451 return (EADDRNOTAVAIL);
452 --*lastport;
453 if (*lastport > first || *lastport < last)
454 *lastport = first;
455 lport = htons(*lastport);
456 } while (in_pcblookup_local(pcbinfo, laddr, lport,
457 wild));
458 } else {
459 /*
460 * counting up
461 */
462 if (dorandom)
463 *lastport = first +
464 (arc4random() % (last - first));
465 count = last - first;
466
467 do {
468 if (count-- < 0) /* completely used? */
469 return (EADDRNOTAVAIL);
470 ++*lastport;
471 if (*lastport < first || *lastport > last)
472 *lastport = first;
473 lport = htons(*lastport);
474 } while (in_pcblookup_local(pcbinfo, laddr, lport,
475 wild));
476 }
477 }
478 if (prison_ip(cred, 0, &laddr.s_addr))
479 return (EINVAL);
480 *laddrp = laddr.s_addr;
481 *lportp = lport;
482 return (0);
483}
484
485/*
486 * Connect from a socket to a specified address.
487 * Both address and port must be specified in argument sin.
488 * If don't have a local address for this socket yet,
489 * then pick one.
490 */
491int
492in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
493{
494 u_short lport, fport;
495 in_addr_t laddr, faddr;
496 int anonport, error;
497
498 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
499 INP_LOCK_ASSERT(inp);
500
501 lport = inp->inp_lport;
502 laddr = inp->inp_laddr.s_addr;
503 anonport = (lport == 0);
504 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
505 NULL, cred);
506 if (error)
507 return (error);
508
509 /* Do the initial binding of the local address if required. */
510 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
511 inp->inp_lport = lport;
512 inp->inp_laddr.s_addr = laddr;
513 if (in_pcbinshash(inp) != 0) {
514 inp->inp_laddr.s_addr = INADDR_ANY;
515 inp->inp_lport = 0;
516 return (EAGAIN);
517 }
518 }
519
520 /* Commit the remaining changes. */
521 inp->inp_lport = lport;
522 inp->inp_laddr.s_addr = laddr;
523 inp->inp_faddr.s_addr = faddr;
524 inp->inp_fport = fport;
525 in_pcbrehash(inp);
526#ifdef IPSEC
527 if (inp->inp_socket->so_type == SOCK_STREAM)
528 ipsec_pcbconn(inp->inp_sp);
529#endif
530 if (anonport)
531 inp->inp_flags |= INP_ANONPORT;
532 return (0);
533}
534
535/*
536 * Set up for a connect from a socket to the specified address.
537 * On entry, *laddrp and *lportp should contain the current local
538 * address and port for the PCB; these are updated to the values
539 * that should be placed in inp_laddr and inp_lport to complete
540 * the connect.
541 *
542 * On success, *faddrp and *fportp will be set to the remote address
543 * and port. These are not updated in the error case.
544 *
545 * If the operation fails because the connection already exists,
546 * *oinpp will be set to the PCB of that connection so that the
547 * caller can decide to override it. In all other cases, *oinpp
548 * is set to NULL.
549 */
550int
551in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
552 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
553 struct inpcb **oinpp, struct ucred *cred)
554{
555 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
556 struct in_ifaddr *ia;
557 struct sockaddr_in sa;
558 struct ucred *socred;
559 struct inpcb *oinp;
560 struct in_addr laddr, faddr;
561 u_short lport, fport;
562 int error;
563
564 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
565 INP_LOCK_ASSERT(inp);
566
567 if (oinpp != NULL)
568 *oinpp = NULL;
569 if (nam->sa_len != sizeof (*sin))
570 return (EINVAL);
571 if (sin->sin_family != AF_INET)
572 return (EAFNOSUPPORT);
573 if (sin->sin_port == 0)
574 return (EADDRNOTAVAIL);
575 laddr.s_addr = *laddrp;
576 lport = *lportp;
577 faddr = sin->sin_addr;
578 fport = sin->sin_port;
579 socred = inp->inp_socket->so_cred;
580 if (laddr.s_addr == INADDR_ANY && jailed(socred)) {
581 bzero(&sa, sizeof(sa));
582 sa.sin_addr.s_addr = htonl(prison_getip(socred));
583 sa.sin_len = sizeof(sa);
584 sa.sin_family = AF_INET;
585 error = in_pcbbind_setup(inp, (struct sockaddr *)&sa,
586 &laddr.s_addr, &lport, cred);
587 if (error)
588 return (error);
589 }
590 if (!TAILQ_EMPTY(&in_ifaddrhead)) {
591 /*
592 * If the destination address is INADDR_ANY,
593 * use the primary local address.
594 * If the supplied address is INADDR_BROADCAST,
595 * and the primary interface supports broadcast,
596 * choose the broadcast address for that interface.
597 */
598 if (faddr.s_addr == INADDR_ANY)
599 faddr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr;
600 else if (faddr.s_addr == (u_long)INADDR_BROADCAST &&
601 (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags &
602 IFF_BROADCAST))
603 faddr = satosin(&TAILQ_FIRST(
604 &in_ifaddrhead)->ia_broadaddr)->sin_addr;
605 }
606 if (laddr.s_addr == INADDR_ANY) {
607 ia = (struct in_ifaddr *)0;
608 /*
609 * If route is known our src addr is taken from the i/f,
610 * else punt.
611 *
612 * Find out route to destination
613 */
614 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
615 ia = ip_rtaddr(faddr);
616 /*
617 * If we found a route, use the address corresponding to
618 * the outgoing interface.
619 *
620 * Otherwise assume faddr is reachable on a directly connected
621 * network and try to find a corresponding interface to take
622 * the source address from.
623 */
624 if (ia == 0) {
625 bzero(&sa, sizeof(sa));
626 sa.sin_addr = faddr;
627 sa.sin_len = sizeof(sa);
628 sa.sin_family = AF_INET;
629
630 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sa)));
631 if (ia == 0)
632 ia = ifatoia(ifa_ifwithnet(sintosa(&sa)));
633 if (ia == 0)
634 return (ENETUNREACH);
635 }
636 /*
637 * If the destination address is multicast and an outgoing
638 * interface has been set as a multicast option, use the
639 * address of that interface as our source address.
640 */
641 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
642 inp->inp_moptions != NULL) {
643 struct ip_moptions *imo;
644 struct ifnet *ifp;
645
646 imo = inp->inp_moptions;
647 if (imo->imo_multicast_ifp != NULL) {
648 ifp = imo->imo_multicast_ifp;
649 TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link)
650 if (ia->ia_ifp == ifp)
651 break;
652 if (ia == 0)
653 return (EADDRNOTAVAIL);
654 }
655 }
656 laddr = ia->ia_addr.sin_addr;
657 }
658
659 oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
660 0, NULL);
661 if (oinp != NULL) {
662 if (oinpp != NULL)
663 *oinpp = oinp;
664 return (EADDRINUSE);
665 }
666 if (lport == 0) {
667 error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
668 cred);
669 if (error)
670 return (error);
671 }
672 *laddrp = laddr.s_addr;
673 *lportp = lport;
674 *faddrp = faddr.s_addr;
675 *fportp = fport;
676 return (0);
677}
678
679void
680in_pcbdisconnect(struct inpcb *inp)
681{
682
683 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
684 INP_LOCK_ASSERT(inp);
685
686 inp->inp_faddr.s_addr = INADDR_ANY;
687 inp->inp_fport = 0;
688 in_pcbrehash(inp);
689#ifdef IPSEC
690 ipsec_pcbdisconn(inp->inp_sp);
691#endif
692}
693
694/*
695 * In the old world order, in_pcbdetach() served two functions: to detach the
696 * pcb from the socket/potentially free the socket, and to free the pcb
697 * itself. In the new world order, the protocol code is responsible for
698 * managing the relationship with the socket, and this code simply frees the
699 * pcb.
700 */
701void
702in_pcbdetach(struct inpcb *inp)
703{
704
705 KASSERT(inp->inp_socket != NULL, ("in_pcbdetach: inp_socket == NULL"));
706 inp->inp_socket->so_pcb = NULL;
707 inp->inp_socket = NULL;
708}
709
710void
711in_pcbfree(struct inpcb *inp)
712{
713 struct inpcbinfo *ipi = inp->inp_pcbinfo;
714
715 KASSERT(inp->inp_socket == NULL, ("in_pcbfree: inp_socket != NULL"));
716 INP_INFO_WLOCK_ASSERT(ipi);
717 INP_LOCK_ASSERT(inp);
718
719#if defined(IPSEC) || defined(FAST_IPSEC)
720 ipsec4_delete_pcbpolicy(inp);
721#endif /*IPSEC*/
722 inp->inp_gencnt = ++ipi->ipi_gencnt;
723 in_pcbremlists(inp);
724 if (inp->inp_options)
725 (void)m_free(inp->inp_options);
726 ip_freemoptions(inp->inp_moptions);
727 inp->inp_vflag = 0;
728
729#ifdef MAC
730 mac_destroy_inpcb(inp);
731#endif
732 INP_UNLOCK(inp);
733 uma_zfree(ipi->ipi_zone, inp);
734}
735
736/*
737 * TCP needs to maintain its inpcb structure after the TCP connection has
738 * been torn down. However, it must be disconnected from the inpcb hashes as
739 * it must not prevent binding of future connections to the same port/ip
740 * combination by other inpcbs.
741 */
742void
743in_pcbdrop(struct inpcb *inp)
744{
745
746 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
747 INP_LOCK_ASSERT(inp);
748
749 inp->inp_vflag |= INP_DROPPED;
750 if (inp->inp_lport) {
751 struct inpcbport *phd = inp->inp_phd;
752
753 LIST_REMOVE(inp, inp_hash);
754 LIST_REMOVE(inp, inp_portlist);
755 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
756 LIST_REMOVE(phd, phd_hash);
757 free(phd, M_PCB);
758 }
759 inp->inp_lport = 0;
760 }
761}
762
763struct sockaddr *
764in_sockaddr(in_port_t port, struct in_addr *addr_p)
765{
766 struct sockaddr_in *sin;
767
768 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
769 M_WAITOK | M_ZERO);
770 sin->sin_family = AF_INET;
771 sin->sin_len = sizeof(*sin);
772 sin->sin_addr = *addr_p;
773 sin->sin_port = port;
774
775 return (struct sockaddr *)sin;
776}
777
778/*
779 * The wrapper function will pass down the pcbinfo for this function to lock.
780 * The socket must have a valid
781 * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
782 * except through a kernel programming error, so it is acceptable to panic
783 * (or in this case trap) if the PCB is invalid. (Actually, we don't trap
784 * because there actually /is/ a programming error somewhere... XXX)
785 */
786int
787in_setsockaddr(struct socket *so, struct sockaddr **nam,
788 struct inpcbinfo *pcbinfo)
789{
790 struct inpcb *inp;
791 struct in_addr addr;
792 in_port_t port;
793
794 inp = sotoinpcb(so);
795 KASSERT(inp != NULL, ("in_setsockaddr: inp == NULL"));
796
797 INP_LOCK(inp);
798 port = inp->inp_lport;
799 addr = inp->inp_laddr;
800 INP_UNLOCK(inp);
801
802 *nam = in_sockaddr(port, &addr);
803 return 0;
804}
805
806/*
807 * The wrapper function will pass down the pcbinfo for this function to lock.
808 */
809int
810in_setpeeraddr(struct socket *so, struct sockaddr **nam,
811 struct inpcbinfo *pcbinfo)
812{
813 struct inpcb *inp;
814 struct in_addr addr;
815 in_port_t port;
816
817 inp = sotoinpcb(so);
818 KASSERT(inp != NULL, ("in_setpeeraddr: inp == NULL"));
819
820 INP_LOCK(inp);
821 port = inp->inp_fport;
822 addr = inp->inp_faddr;
823 INP_UNLOCK(inp);
824
825 *nam = in_sockaddr(port, &addr);
826 return 0;
827}
828
829void
830in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
831 struct inpcb *(*notify)(struct inpcb *, int))
832{
833 struct inpcb *inp, *ninp;
834 struct inpcbhead *head;
835
836 INP_INFO_WLOCK(pcbinfo);
837 head = pcbinfo->listhead;
838 for (inp = LIST_FIRST(head); inp != NULL; inp = ninp) {
839 INP_LOCK(inp);
840 ninp = LIST_NEXT(inp, inp_list);
841#ifdef INET6
842 if ((inp->inp_vflag & INP_IPV4) == 0) {
843 INP_UNLOCK(inp);
844 continue;
845 }
846#endif
847 if (inp->inp_faddr.s_addr != faddr.s_addr ||
848 inp->inp_socket == NULL) {
849 INP_UNLOCK(inp);
850 continue;
851 }
852 if ((*notify)(inp, errno))
853 INP_UNLOCK(inp);
854 }
855 INP_INFO_WUNLOCK(pcbinfo);
856}
857
858void
859in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
860{
861 struct inpcb *inp;
862 struct ip_moptions *imo;
863 int i, gap;
864
865 INP_INFO_RLOCK(pcbinfo);
866 LIST_FOREACH(inp, pcbinfo->listhead, inp_list) {
867 INP_LOCK(inp);
868 imo = inp->inp_moptions;
869 if ((inp->inp_vflag & INP_IPV4) &&
870 imo != NULL) {
871 /*
872 * Unselect the outgoing interface if it is being
873 * detached.
874 */
875 if (imo->imo_multicast_ifp == ifp)
876 imo->imo_multicast_ifp = NULL;
877
878 /*
879 * Drop multicast group membership if we joined
880 * through the interface being detached.
881 */
882 for (i = 0, gap = 0; i < imo->imo_num_memberships;
883 i++) {
884 if (imo->imo_membership[i]->inm_ifp == ifp) {
885 in_delmulti(imo->imo_membership[i]);
886 gap++;
887 } else if (gap != 0)
888 imo->imo_membership[i - gap] =
889 imo->imo_membership[i];
890 }
891 imo->imo_num_memberships -= gap;
892 }
893 INP_UNLOCK(inp);
894 }
895 INP_INFO_RUNLOCK(pcbinfo);
896}
897
898/*
899 * Lookup a PCB based on the local address and port.
900 */
901#define INP_LOOKUP_MAPPED_PCB_COST 3
902struct inpcb *
903in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
904 u_int lport_arg, int wild_okay)
905{
906 struct inpcb *inp;
907#ifdef INET6
908 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
909#else
910 int matchwild = 3;
911#endif
912 int wildcard;
913 u_short lport = lport_arg;
914
915 INP_INFO_WLOCK_ASSERT(pcbinfo);
916
917 if (!wild_okay) {
918 struct inpcbhead *head;
919 /*
920 * Look for an unconnected (wildcard foreign addr) PCB that
921 * matches the local address and port we're looking for.
922 */
923 head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
924 LIST_FOREACH(inp, head, inp_hash) {
925#ifdef INET6
926 if ((inp->inp_vflag & INP_IPV4) == 0)
927 continue;
928#endif
929 if (inp->inp_faddr.s_addr == INADDR_ANY &&
930 inp->inp_laddr.s_addr == laddr.s_addr &&
931 inp->inp_lport == lport) {
932 /*
933 * Found.
934 */
935 return (inp);
936 }
937 }
938 /*
939 * Not found.
940 */
941 return (NULL);
942 } else {
943 struct inpcbporthead *porthash;
944 struct inpcbport *phd;
945 struct inpcb *match = NULL;
946 /*
947 * Best fit PCB lookup.
948 *
949 * First see if this local port is in use by looking on the
950 * port hash list.
951 */
952 porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
953 pcbinfo->porthashmask)];
954 LIST_FOREACH(phd, porthash, phd_hash) {
955 if (phd->phd_port == lport)
956 break;
957 }
958 if (phd != NULL) {
959 /*
960 * Port is in use by one or more PCBs. Look for best
961 * fit.
962 */
963 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
964 wildcard = 0;
965#ifdef INET6
966 if ((inp->inp_vflag & INP_IPV4) == 0)
967 continue;
968 /*
969 * We never select the PCB that has
970 * INP_IPV6 flag and is bound to :: if
971 * we have another PCB which is bound
972 * to 0.0.0.0. If a PCB has the
973 * INP_IPV6 flag, then we set its cost
974 * higher than IPv4 only PCBs.
975 *
976 * Note that the case only happens
977 * when a socket is bound to ::, under
978 * the condition that the use of the
979 * mapped address is allowed.
980 */
981 if ((inp->inp_vflag & INP_IPV6) != 0)
982 wildcard += INP_LOOKUP_MAPPED_PCB_COST;
983#endif
984 if (inp->inp_faddr.s_addr != INADDR_ANY)
985 wildcard++;
986 if (inp->inp_laddr.s_addr != INADDR_ANY) {
987 if (laddr.s_addr == INADDR_ANY)
988 wildcard++;
989 else if (inp->inp_laddr.s_addr != laddr.s_addr)
990 continue;
991 } else {
992 if (laddr.s_addr != INADDR_ANY)
993 wildcard++;
994 }
995 if (wildcard < matchwild) {
996 match = inp;
997 matchwild = wildcard;
998 if (matchwild == 0) {
999 break;
1000 }
1001 }
1002 }
1003 }
1004 return (match);
1005 }
1006}
1007#undef INP_LOOKUP_MAPPED_PCB_COST
1008
1009/*
1010 * Lookup PCB in hash list.
1011 */
1012struct inpcb *
1013in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1014 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
1015 struct ifnet *ifp)
1016{
1017 struct inpcbhead *head;
1018 struct inpcb *inp;
1019 u_short fport = fport_arg, lport = lport_arg;
1020
1021 INP_INFO_RLOCK_ASSERT(pcbinfo);
1022
1023 /*
1024 * First look for an exact match.
1025 */
1026 head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)];
1027 LIST_FOREACH(inp, head, inp_hash) {
1028#ifdef INET6
1029 if ((inp->inp_vflag & INP_IPV4) == 0)
1030 continue;
1031#endif
1032 if (inp->inp_faddr.s_addr == faddr.s_addr &&
1033 inp->inp_laddr.s_addr == laddr.s_addr &&
1034 inp->inp_fport == fport &&
1035 inp->inp_lport == lport) {
1036 /*
1037 * Found.
1038 */
1039 return (inp);
1040 }
1041 }
1042 if (wildcard) {
1043 struct inpcb *local_wild = NULL;
1044#if defined(INET6)
1045 struct inpcb *local_wild_mapped = NULL;
1046#endif /* defined(INET6) */
1047
1048 head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
1049 LIST_FOREACH(inp, head, inp_hash) {
1050#ifdef INET6
1051 if ((inp->inp_vflag & INP_IPV4) == 0)
1052 continue;
1053#endif
1054 if (inp->inp_faddr.s_addr == INADDR_ANY &&
1055 inp->inp_lport == lport) {
1056 if (ifp && ifp->if_type == IFT_FAITH &&
1057 (inp->inp_flags & INP_FAITH) == 0)
1058 continue;
1059 if (inp->inp_laddr.s_addr == laddr.s_addr)
1060 return (inp);
1061 else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1062#if defined(INET6)
1063 if (INP_CHECK_SOCKAF(inp->inp_socket,
1064 AF_INET6))
1065 local_wild_mapped = inp;
1066 else
1067#endif /* defined(INET6) */
1068 local_wild = inp;
1069 }
1070 }
1071 }
1072#if defined(INET6)
1073 if (local_wild == NULL)
1074 return (local_wild_mapped);
1075#endif /* defined(INET6) */
1076 return (local_wild);
1077 }
1078
1079 /*
1080 * Not found.
1081 */
1082 return (NULL);
1083}
1084
1085/*
1086 * Insert PCB onto various hash lists.
1087 */
1088int
1089in_pcbinshash(struct inpcb *inp)
1090{
1091 struct inpcbhead *pcbhash;
1092 struct inpcbporthead *pcbporthash;
1093 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1094 struct inpcbport *phd;
1095 u_int32_t hashkey_faddr;
1096
1097 INP_INFO_WLOCK_ASSERT(pcbinfo);
1098 INP_LOCK_ASSERT(inp);
1099
1100#ifdef INET6
1101 if (inp->inp_vflag & INP_IPV6)
1102 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1103 else
1104#endif /* INET6 */
1105 hashkey_faddr = inp->inp_faddr.s_addr;
1106
1107 pcbhash = &pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr,
1108 inp->inp_lport, inp->inp_fport, pcbinfo->hashmask)];
1109
1110 pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport,
1111 pcbinfo->porthashmask)];
1112
1113 /*
1114 * Go through port list and look for a head for this lport.
1115 */
1116 LIST_FOREACH(phd, pcbporthash, phd_hash) {
1117 if (phd->phd_port == inp->inp_lport)
1118 break;
1119 }
1120 /*
1121 * If none exists, malloc one and tack it on.
1122 */
1123 if (phd == NULL) {
1124 MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT);
1125 if (phd == NULL) {
1126 return (ENOBUFS); /* XXX */
1127 }
1128 phd->phd_port = inp->inp_lport;
1129 LIST_INIT(&phd->phd_pcblist);
1130 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1131 }
1132 inp->inp_phd = phd;
1133 LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1134 LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
1135 return (0);
1136}
1137
1138/*
1139 * Move PCB to the proper hash bucket when { faddr, fport } have been
1140 * changed. NOTE: This does not handle the case of the lport changing (the
1141 * hashed port list would have to be updated as well), so the lport must
1142 * not change after in_pcbinshash() has been called.
1143 */
1144void
1145in_pcbrehash(struct inpcb *inp)
1146{
1147 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1148 struct inpcbhead *head;
1149 u_int32_t hashkey_faddr;
1150
1151 INP_INFO_WLOCK_ASSERT(pcbinfo);
1152 INP_LOCK_ASSERT(inp);
1153
1154#ifdef INET6
1155 if (inp->inp_vflag & INP_IPV6)
1156 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1157 else
1158#endif /* INET6 */
1159 hashkey_faddr = inp->inp_faddr.s_addr;
1160
1161 head = &pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr,
1162 inp->inp_lport, inp->inp_fport, pcbinfo->hashmask)];
1163
1164 LIST_REMOVE(inp, inp_hash);
1165 LIST_INSERT_HEAD(head, inp, inp_hash);
1166}
1167
1168/*
1169 * Remove PCB from various lists.
1170 */
1171void
1172in_pcbremlists(struct inpcb *inp)
1173{
1174 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1175
1176 INP_INFO_WLOCK_ASSERT(pcbinfo);
1177 INP_LOCK_ASSERT(inp);
1178
1179 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1180 if (inp->inp_lport) {
1181 struct inpcbport *phd = inp->inp_phd;
1182
1183 LIST_REMOVE(inp, inp_hash);
1184 LIST_REMOVE(inp, inp_portlist);
1185 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1186 LIST_REMOVE(phd, phd_hash);
1187 free(phd, M_PCB);
1188 }
1189 }
1190 LIST_REMOVE(inp, inp_list);
1191 pcbinfo->ipi_count--;
1192}
1193
/*
 * Called after a label has been set on a socket: push the new label down
 * into the socket's in_pcb via mac_inpcb_sosetlabel().  The inpcb lock is
 * taken before the socket lock and released after it.
 *
 * Compiles to an empty function when MAC is not configured.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *pcb;

	pcb = sotoinpcb(so);
	KASSERT(pcb != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	INP_LOCK(pcb);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, pcb);
	SOCK_UNLOCK(so);
	INP_UNLOCK(pcb);
#endif
}
1214
1215/*
1216 * ipport_tick runs once per second, determining if random port allocation
1217 * should be continued. If more than ipport_randomcps ports have been
1218 * allocated in the last second, then we return to sequential port
1219 * allocation. We return to random allocation only once we drop below
1220 * ipport_randomcps for at least ipport_randomtime seconds.
1221 */
1222void
1223ipport_tick(void *xtp)
1224{
1225
1226 if (ipport_tcpallocs <= ipport_tcplastcount + ipport_randomcps) {
1227 if (ipport_stoprandom > 0)
1228 ipport_stoprandom--;
1229 } else
1230 ipport_stoprandom = ipport_randomtime;
1231 ipport_tcplastcount = ipport_tcpallocs;
1232 callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
1233}