1/*-
2 * Copyright (c) 1982, 1986, 1991, 1993, 1995
3 *	The Regents of the University of California.
4 * Copyright (c) 2007-2009 Robert N. M. Watson
5 * Copyright (c) 2010-2011 Juniper Networks, Inc.
6 * All rights reserved.
7 *
8 * Portions of this software were developed by Robert N. M. Watson under
9 * contract to Juniper Networks, Inc.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: stable/11/sys/netinet/in_pcb.c 332889 2018-04-23 14:22:16Z jtl $");
40
41#include "opt_ddb.h"
42#include "opt_ipsec.h"
43#include "opt_inet.h"
44#include "opt_inet6.h"
45#include "opt_pcbgroup.h"
46#include "opt_rss.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/lock.h>
51#include <sys/malloc.h>
52#include <sys/mbuf.h>
53#include <sys/callout.h>
54#include <sys/eventhandler.h>
55#include <sys/domain.h>
56#include <sys/protosw.h>
57#include <sys/rmlock.h>
58#include <sys/socket.h>
59#include <sys/socketvar.h>
60#include <sys/priv.h>
61#include <sys/proc.h>
62#include <sys/refcount.h>
63#include <sys/jail.h>
64#include <sys/kernel.h>
65#include <sys/sysctl.h>
66
67#ifdef DDB
68#include <ddb/ddb.h>
69#endif
70
71#include <vm/uma.h>
72
73#include <net/if.h>
74#include <net/if_var.h>
75#include <net/if_types.h>
76#include <net/if_llatbl.h>
77#include <net/route.h>
78#include <net/rss_config.h>
79#include <net/vnet.h>
80
81#if defined(INET) || defined(INET6)
82#include <netinet/in.h>
83#include <netinet/in_pcb.h>
84#include <netinet/ip_var.h>
85#include <netinet/tcp_var.h>
86#include <netinet/udp.h>
87#include <netinet/udp_var.h>
88#endif
89#ifdef INET
90#include <netinet/in_var.h>
91#endif
92#ifdef INET6
93#include <netinet/ip6.h>
94#include <netinet6/in6_pcb.h>
95#include <netinet6/in6_var.h>
96#include <netinet6/ip6_var.h>
97#endif /* INET6 */
98
99#include <netipsec/ipsec_support.h>
100
101#include <security/mac/mac_framework.h>
102
/*
 * Callout handle.  It is not referenced in this part of the file;
 * presumably it schedules ipport_tick() -- TODO confirm against the
 * rest of the file.
 */
static struct callout	ipport_tick_callout;

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 *
 * Three (first, last) pairs exist; in_pcb_lport() picks one pair based on
 * the INP_LOWPORT / INP_HIGHPORT flags of the inpcb.  The low pair is
 * stored "backwards" (first > last); in_pcb_lport() swaps the bounds when
 * first > last, so either orientation is accepted.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 *
 * in_pcbbind_setup() requires PRIV_NETINET_RESERVEDPORT for an explicit
 * bind to any port in [ipport_reservedlow, ipport_reservedhigh].
 * ipport_reservedlow has no initializer here.
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_reservedlow);

/* Variables dealing with random ephemeral port allocation. */
VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
/* Allocation counter bumped by in_pcb_lport(). */
VNET_DEFINE(int, ipport_tcpallocs);
/*
 * Not used in this part of the file; presumably the previous sample of
 * ipport_tcpallocs -- TODO confirm.
 */
static VNET_DEFINE(int, ipport_tcplastcount);

/* Accessor for the file-local per-vnet variable above. */
#define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)
133
/*
 * Prototypes for file-local helpers.  Their definitions are not in this
 * part of the file.
 */
static void	in_pcbremlists(struct inpcb *inp);
#ifdef INET
/* Only declared when IPv4 support (INET) is compiled in. */
static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
			    struct in_addr faddr, u_int fport_arg,
			    struct in_addr laddr, u_int lport_arg,
			    int lookupflags, struct ifnet *ifp);
140
/*
 * Clamp the lvalue `var' into the closed interval [min, max].
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement: it must be followed by a semicolon and can be used as
 * the unbraced body of an if/else without capturing the caller's `else'.
 * `var' is evaluated more than once, so it must be free of side effects.
 */
#define RANGECHK(var, min, max) do {					\
	if ((var) < (min))						\
		(var) = (min);						\
	else if ((var) > (max))						\
		(var) = (max);						\
} while (0)
144
145static int
146sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
147{
148	int error;
149
150	error = sysctl_handle_int(oidp, arg1, arg2, req);
151	if (error == 0) {
152		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
153		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
154		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
155		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
156		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
157		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
158	}
159	return (error);
160}
161
162#undef RANGECHK
163
164static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
165    "IP Ports");
166
167SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
168	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
169	&VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
170SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
171	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
172	&VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
173SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
174	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
175	&VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
176SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
177	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
178	&VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
179SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
180	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
181	&VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
182SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
183	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
184	&VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
185SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
186	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
187	&VNET_NAME(ipport_reservedhigh), 0, "");
188SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
189	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
190SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
191	CTLFLAG_VNET | CTLFLAG_RW,
192	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
193SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
194	CTLFLAG_VNET | CTLFLAG_RW,
195	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
196	"allocations before switching to a sequental one");
197SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
198	CTLFLAG_VNET | CTLFLAG_RW,
199	&VNET_NAME(ipport_randomtime), 0,
200	"Minimum time to keep sequental port "
201	"allocation before switching to a random one");
202#endif /* INET */
203
204/*
205 * in_pcb.c: manage the Protocol Control Blocks.
206 *
207 * NOTE: It is assumed that most of these functions will be called with
208 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
209 * functions often modify hash chains or addresses in pcbs.
210 */
211
212/*
213 * Initialize an inpcbinfo -- we should be able to reduce the number of
214 * arguments in time.
215 */
216void
217in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
218    struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
219    char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini,
220    uint32_t inpcbzone_flags, u_int hashfields)
221{
222
223	INP_INFO_LOCK_INIT(pcbinfo, name);
224	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
225	INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
226#ifdef VIMAGE
227	pcbinfo->ipi_vnet = curvnet;
228#endif
229	pcbinfo->ipi_listhead = listhead;
230	LIST_INIT(pcbinfo->ipi_listhead);
231	pcbinfo->ipi_count = 0;
232	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
233	    &pcbinfo->ipi_hashmask);
234	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
235	    &pcbinfo->ipi_porthashmask);
236#ifdef PCBGROUP
237	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
238#endif
239	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
240	    NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR,
241	    inpcbzone_flags);
242	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
243	uma_zone_set_warning(pcbinfo->ipi_zone,
244	    "kern.ipc.maxsockets limit reached");
245}
246
/*
 * Destroy an inpcbinfo.
 *
 * Releases what in_pcbinfo_init() set up: both hash tables, the pcbgroup
 * state (PCBGROUP kernels), the UMA zone, and finally the three locks.
 * The pcbinfo must no longer contain any inpcbs; this is asserted.
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	/* All PCBs must already have been released. */
	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
#ifdef PCBGROUP
	in_pcbgroup_destroy(pcbinfo);
#endif
	uma_zdestroy(pcbinfo->ipi_zone);
	/* Locks go last. */
	INP_LIST_LOCK_DESTROY(pcbinfo);
	INP_HASH_LOCK_DESTROY(pcbinfo);
	INP_INFO_LOCK_DESTROY(pcbinfo);
}
268
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * so:		socket the new inpcb is attached to (so->so_pcb is set).
 * pcbinfo:	pcbinfo whose zone the inpcb comes from and whose list it
 *		is inserted into.
 *
 * Returns 0 on success, ENOBUFS if the zone allocation fails, or the
 * error reported by mac_inpcb_init() / ipsec_init_pcbpolicy().  On any
 * failure after the allocation the credential reference is dropped and
 * the inpcb is returned to the zone; it was never locked or listed.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
	int error;

#ifdef INVARIANTS
	/*
	 * The TCP pcbinfo is asserted read-locked; every other pcbinfo is
	 * asserted write-locked.
	 */
	if (pcbinfo == &V_tcbinfo) {
		INP_INFO_RLOCK_ASSERT(pcbinfo);
	} else {
		INP_INFO_WLOCK_ASSERT(pcbinfo);
	}
#endif

	error = 0;
	/* Non-sleeping allocation: failure is reported, not waited out. */
	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Only the first inp_zero_size bytes of the inpcb are cleared. */
	bzero(inp, inp_zero_size);
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	/* Take our own reference on the socket's credential. */
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		/* Undo the MAC label set up just above. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
	}
#endif
	/*
	 * Nothing below can fail.  Lock the inpcb (it stays locked on
	 * return), then publish it on the global list under the list lock.
	 */
	INP_WLOCK(inp);
	INP_LIST_WLOCK(pcbinfo);
	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
	pcbinfo->ipi_count++;
	so->so_pcb = (caddr_t)inp;
#ifdef INET6
	if (V_ip6_auto_flowlabel)
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	INP_LIST_WUNLOCK(pcbinfo);
	/*
	 * The label only exists when something can jump to it; on the
	 * success path error is 0 and the cleanup is skipped.
	 */
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	if (error != 0) {
		crfree(inp->inp_cred);
		uma_zfree(pcbinfo->ipi_zone, inp);
	}
#endif
	return (error);
}
345
346#ifdef INET
347int
348in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
349{
350	int anonport, error;
351
352	INP_WLOCK_ASSERT(inp);
353	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
354
355	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
356		return (EINVAL);
357	anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
358	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
359	    &inp->inp_lport, cred);
360	if (error)
361		return (error);
362	if (in_pcbinshash(inp) != 0) {
363		inp->inp_laddr.s_addr = INADDR_ANY;
364		inp->inp_lport = 0;
365		return (EAGAIN);
366	}
367	if (anonport)
368		inp->inp_flags |= INP_ANONPORT;
369	return (0);
370}
371#endif
372
373/*
374 * Select a local port (number) to use.
375 */
376#if defined(INET) || defined(INET6)
377int
378in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
379    struct ucred *cred, int lookupflags)
380{
381	struct inpcbinfo *pcbinfo;
382	struct inpcb *tmpinp;
383	unsigned short *lastport;
384	int count, dorandom, error;
385	u_short aux, first, last, lport;
386#ifdef INET
387	struct in_addr laddr;
388#endif
389
390	pcbinfo = inp->inp_pcbinfo;
391
392	/*
393	 * Because no actual state changes occur here, a global write lock on
394	 * the pcbinfo isn't required.
395	 */
396	INP_LOCK_ASSERT(inp);
397	INP_HASH_LOCK_ASSERT(pcbinfo);
398
399	if (inp->inp_flags & INP_HIGHPORT) {
400		first = V_ipport_hifirstauto;	/* sysctl */
401		last  = V_ipport_hilastauto;
402		lastport = &pcbinfo->ipi_lasthi;
403	} else if (inp->inp_flags & INP_LOWPORT) {
404		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
405		if (error)
406			return (error);
407		first = V_ipport_lowfirstauto;	/* 1023 */
408		last  = V_ipport_lowlastauto;	/* 600 */
409		lastport = &pcbinfo->ipi_lastlow;
410	} else {
411		first = V_ipport_firstauto;	/* sysctl */
412		last  = V_ipport_lastauto;
413		lastport = &pcbinfo->ipi_lastport;
414	}
415	/*
416	 * For UDP(-Lite), use random port allocation as long as the user
417	 * allows it.  For TCP (and as of yet unknown) connections,
418	 * use random port allocation only if the user allows it AND
419	 * ipport_tick() allows it.
420	 */
421	if (V_ipport_randomized &&
422		(!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
423		pcbinfo == &V_ulitecbinfo))
424		dorandom = 1;
425	else
426		dorandom = 0;
427	/*
428	 * It makes no sense to do random port allocation if
429	 * we have the only port available.
430	 */
431	if (first == last)
432		dorandom = 0;
433	/* Make sure to not include UDP(-Lite) packets in the count. */
434	if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
435		V_ipport_tcpallocs++;
436	/*
437	 * Instead of having two loops further down counting up or down
438	 * make sure that first is always <= last and go with only one
439	 * code path implementing all logic.
440	 */
441	if (first > last) {
442		aux = first;
443		first = last;
444		last = aux;
445	}
446
447#ifdef INET
448	/* Make the compiler happy. */
449	laddr.s_addr = 0;
450	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
451		KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
452		    __func__, inp));
453		laddr = *laddrp;
454	}
455#endif
456	tmpinp = NULL;	/* Make compiler happy. */
457	lport = *lportp;
458
459	if (dorandom)
460		*lastport = first + (arc4random() % (last - first));
461
462	count = last - first;
463
464	do {
465		if (count-- < 0)	/* completely used? */
466			return (EADDRNOTAVAIL);
467		++*lastport;
468		if (*lastport < first || *lastport > last)
469			*lastport = first;
470		lport = htons(*lastport);
471
472#ifdef INET6
473		if ((inp->inp_vflag & INP_IPV6) != 0)
474			tmpinp = in6_pcblookup_local(pcbinfo,
475			    &inp->in6p_laddr, lport, lookupflags, cred);
476#endif
477#if defined(INET) && defined(INET6)
478		else
479#endif
480#ifdef INET
481			tmpinp = in_pcblookup_local(pcbinfo, laddr,
482			    lport, lookupflags, cred);
483#endif
484	} while (tmpinp != NULL);
485
486#ifdef INET
487	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
488		laddrp->s_addr = laddr.s_addr;
489#endif
490	*lportp = lport;
491
492	return (0);
493}
494
495/*
496 * Return cached socket options.
497 */
498short
499inp_so_options(const struct inpcb *inp)
500{
501   short so_options;
502
503   so_options = 0;
504
505   if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
506	   so_options |= SO_REUSEPORT;
507   if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
508	   so_options |= SO_REUSEADDR;
509   return (so_options);
510}
511#endif /* INET || INET6 */
512
513/*
514 * Check if a new BINDMULTI socket is allowed to be created.
515 *
516 * ni points to the new inp.
517 * oi points to the exisitng inp.
518 *
519 * This checks whether the existing inp also has BINDMULTI and
520 * whether the credentials match.
521 */
522int
523in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
524{
525	/* Check permissions match */
526	if ((ni->inp_flags2 & INP_BINDMULTI) &&
527	    (ni->inp_cred->cr_uid !=
528	    oi->inp_cred->cr_uid))
529		return (0);
530
531	/* Check the existing inp has BINDMULTI set */
532	if ((ni->inp_flags2 & INP_BINDMULTI) &&
533	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
534		return (0);
535
536	/*
537	 * We're okay - either INP_BINDMULTI isn't set on ni, or
538	 * it is and it matches the checks.
539	 */
540	return (1);
541}
542
543#ifdef INET
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB. Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * inp:		inpcb being bound (inp lock and pcbinfo hash lock held).
 * nam:		requested sockaddr_in, or NULL for "any address, any port".
 * laddrp:	in: current local address; out: address to bind to.
 * lportp:	in: current local port; out: port to bind to.  Ports are
 *		handled in network byte order (ntohs() is applied before
 *		comparing against the reserved range).
 * cred:	credential used for the jail and privilege checks.
 *
 * Returns 0 on success or an errno value: EADDRNOTAVAIL, EINVAL, EACCES,
 * EADDRINUSE, or whatever prison_local_ip4() / in_pcb_lport() report.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 * Note that the sockaddr passed in nam may be modified (sin_port and
 * sin_zero are cleared for a specific unicast address).
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
    u_short *lportp, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct sockaddr_in *sin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct in_addr laddr;
	u_short lport = 0;
	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error;

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	/* An explicit address may only be given while still unbound. */
	if (nam != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	/* Without any reuse option, wildcard bindings count as conflicts. */
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (nam == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		sin = (struct sockaddr_in *)nam;
		if (nam->sa_len != sizeof (*sin))
			return (EINVAL);
#ifdef notdef
		/*
		 * We should check the family, but old programs
		 * incorrectly fail to initialize it.
		 */
		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);
#endif
		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		/* NB: lport is left as 0 if the port isn't being changed. */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			/*
			 * Normalise the caller's sockaddr so that only the
			 * address takes part in the interface lookup below.
			 */
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
			/*
			 * Is the address a local IP address?
			 * If INP_BINDANY is set, then the socket may be bound
			 * to any endpoint address, local or not.
			 */
			if ((inp->inp_flags & INP_BINDANY) == 0 &&
			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
				return (EADDRNOTAVAIL);
		}
		laddr = sin->sin_addr;
		/* An explicit port was requested: check that it is usable. */
		if (lport) {
			struct inpcb *t;
			struct tcptw *tw;

			/* GROSS */
			/*
			 * Ports inside [reservedlow, reservedhigh] need
			 * PRIV_NETINET_RESERVEDPORT.
			 */
			if (ntohs(lport) <= V_ipport_reservedhigh &&
			    ntohs(lport) >= V_ipport_reservedlow &&
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
			    0))
				return (EACCES);
			/*
			 * Cross-user check: only for non-multicast addresses
			 * and only when the socket's owner lacks
			 * PRIV_NETINET_REUSEPORT.
			 */
			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
			    priv_check_cred(inp->inp_cred,
			    PRIV_NETINET_REUSEPORT, 0) != 0) {
				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
				    lport, INPLOOKUP_WILDCARD, cred);
	/*
	 * XXX
	 * This entire block sorely needs a rewrite.
	 */
				if (t &&
				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
				    (so->so_type != SOCK_STREAM ||
				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_flags2 & INP_REUSEPORT) == 0) &&
				    (inp->inp_cred->cr_uid !=
				     t->inp_cred->cr_uid))
					return (EADDRINUSE);

				/*
				 * If the socket is a BINDMULTI socket, then
				 * the credentials need to match and the
				 * original socket also has to have been bound
				 * with BINDMULTI.
				 */
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
			/*
			 * Second lookup, this time with the flags derived
			 * from the socket's reuse options.
			 */
			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
			    lport, lookupflags, cred);
			if (t && (t->inp_flags & INP_TIMEWAIT)) {
				/*
				 * XXXRW: If an incpb has had its timewait
				 * state recycled, we treat the address as
				 * being in use (for now).  This is better
				 * than a panic, but not desirable.
				 */
				tw = intotw(t);
				if (tw == NULL ||
				    (reuseport & tw->tw_so_options) == 0)
					return (EADDRINUSE);
			} else if (t &&
			    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
			    (reuseport & inp_so_options(t)) == 0) {
				/*
				 * NB: under INET6 the `if' below guards the
				 * return that follows the #endif; without
				 * INET6 that return is unconditional and the
				 * BINDMULTI check after it is unreachable.
				 * NOTE(review): this branch is only entered
				 * with INP_BINDMULTI clear on inp, in which
				 * case in_pcbbind_check_bindmulti() returns 1,
				 * so the trailing check looks like it can
				 * never fire -- confirm before relying on it.
				 */
#ifdef INET6
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
				    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
				return (EADDRINUSE);
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
		}
	}
	/* Keep a port that is already assigned. */
	if (*lportp != 0)
		lport = *lportp;
	/* Still no port: have one picked automatically. */
	if (lport == 0) {
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);

	}
	/* Success: only now are the caller's values updated. */
	*laddrp = laddr.s_addr;
	*lportp = lport;
	return (0);
}
708
709/*
710 * Connect from a socket to a specified address.
711 * Both address and port must be specified in argument sin.
712 * If don't have a local address for this socket yet,
713 * then pick one.
714 */
715int
716in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
717    struct ucred *cred, struct mbuf *m)
718{
719	u_short lport, fport;
720	in_addr_t laddr, faddr;
721	int anonport, error;
722
723	INP_WLOCK_ASSERT(inp);
724	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
725
726	lport = inp->inp_lport;
727	laddr = inp->inp_laddr.s_addr;
728	anonport = (lport == 0);
729	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
730	    NULL, cred);
731	if (error)
732		return (error);
733
734	/* Do the initial binding of the local address if required. */
735	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
736		inp->inp_lport = lport;
737		inp->inp_laddr.s_addr = laddr;
738		if (in_pcbinshash(inp) != 0) {
739			inp->inp_laddr.s_addr = INADDR_ANY;
740			inp->inp_lport = 0;
741			return (EAGAIN);
742		}
743	}
744
745	/* Commit the remaining changes. */
746	inp->inp_lport = lport;
747	inp->inp_laddr.s_addr = laddr;
748	inp->inp_faddr.s_addr = faddr;
749	inp->inp_fport = fport;
750	in_pcbrehash_mbuf(inp, m);
751
752	if (anonport)
753		inp->inp_flags |= INP_ANONPORT;
754	return (0);
755}
756
757int
758in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
759{
760
761	return (in_pcbconnect_mbuf(inp, nam, cred, NULL));
762}
763
/*
 * Do proper source address selection on an unbound socket in case
 * of connect. Take jails into account as well.
 *
 * inp:		inpcb being connected; its socket options (SO_DONTROUTE)
 *		and FIB numbers steer the route / interface lookups.
 * faddr:	destination address.
 * laddr:	out: selected source address (must not be NULL).
 * cred:	credential for the jail checks; NULL is treated like an
 *		unjailed caller.
 *
 * Returns 0 on success, ENETUNREACH when no usable interface address is
 * found, or the error from prison_get_ip4() when falling back to the
 * jail's default address.  A route looked up here is released before
 * returning.
 */
int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
    struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin;
	struct route sro;
	int error;

	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
		return (0);

	error = 0;
	bzero(&sro, sizeof(sro));

	/* Build the route lookup key from the destination address. */
	sin = (struct sockaddr_in *)&sro.ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* Try a point-to-point peer match first, then a network match. */
		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
					inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
						inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* Not jailed: the address found is the answer. */
		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			ifa_free(&ia->ia_ifa);
			goto done;
		}

		/*
		 * Jailed: keep only the interface, drop the reference on the
		 * address, and look for an address on that interface that
		 * belongs to the jail.
		 */
		ifp = ia->ia_ifp;
		ifa_free(&ia->ia_ifa);
		ia = NULL;
		IF_ADDR_RLOCK(ifp);
		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {

			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			/* Copy the address out while the list lock is held. */
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			IF_ADDR_RUNLOCK(ifp);
			goto done;
		}
		IF_ADDR_RUNLOCK(ifp);

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			/* The route's own ifa is used; no ifa_free() here. */
			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = sro.ro_rt->rt_ifp;
		IF_ADDR_RLOCK(ifp);
		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			IF_ADDR_RUNLOCK(ifp);
			goto done;
		}
		IF_ADDR_RUNLOCK(ifp);

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 *
	 * NB: this test is the exact complement of the previous one, so
	 * every path through the function leaves via a goto to `done'.
	 */
	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct sockaddr_in sain;
		struct in_ifaddr *ia;

		/* Fresh lookup key for the destination address. */
		bzero(&sain, sizeof(struct sockaddr_in));
		sain.sin_family = AF_INET;
		sain.sin_len = sizeof(struct sockaddr_in);
		sain.sin_addr.s_addr = faddr->s_addr;

		/* Peer match, then network match, then exact address match. */
		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
					inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
						inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));

		/* Not jailed: take what was found, or fail. */
		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			ifa_free(&ia->ia_ifa);
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			/*
			 * Scan the interface of the address found for an
			 * address that belongs to the jail.
			 */
			ifp = ia->ia_ifp;
			ifa_free(&ia->ia_ifa);
			ia = NULL;
			IF_ADDR_RLOCK(ifp);
			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {

				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				IF_ADDR_RUNLOCK(ifp);
				goto done;
			}
			IF_ADDR_RUNLOCK(ifp);
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	/* Release the route obtained from in_rtalloc_ign(), if any. */
	if (sro.ro_rt != NULL)
		RTFREE(sro.ro_rt);
	return (error);
}
991
992/*
993 * Set up for a connect from a socket to the specified address.
994 * On entry, *laddrp and *lportp should contain the current local
995 * address and port for the PCB; these are updated to the values
996 * that should be placed in inp_laddr and inp_lport to complete
997 * the connect.
998 *
999 * On success, *faddrp and *fportp will be set to the remote address
1000 * and port. These are not updated in the error case.
1001 *
1002 * If the operation fails because the connection already exists,
1003 * *oinpp will be set to the PCB of that connection so that the
1004 * caller can decide to override it. In all other cases, *oinpp
1005 * is set to NULL.
1006 */
1007int
1008in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
1009    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
1010    struct inpcb **oinpp, struct ucred *cred)
1011{
1012	struct rm_priotracker in_ifa_tracker;
1013	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1014	struct in_ifaddr *ia;
1015	struct inpcb *oinp;
1016	struct in_addr laddr, faddr;
1017	u_short lport, fport;
1018	int error;
1019
1020	/*
1021	 * Because a global state change doesn't actually occur here, a read
1022	 * lock is sufficient.
1023	 */
1024	INP_LOCK_ASSERT(inp);
1025	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
1026
1027	if (oinpp != NULL)
1028		*oinpp = NULL;
1029	if (nam->sa_len != sizeof (*sin))
1030		return (EINVAL);
1031	if (sin->sin_family != AF_INET)
1032		return (EAFNOSUPPORT);
1033	if (sin->sin_port == 0)
1034		return (EADDRNOTAVAIL);
1035	laddr.s_addr = *laddrp;
1036	lport = *lportp;
1037	faddr = sin->sin_addr;
1038	fport = sin->sin_port;
1039
1040	if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
1041		/*
1042		 * If the destination address is INADDR_ANY,
1043		 * use the primary local address.
1044		 * If the supplied address is INADDR_BROADCAST,
1045		 * and the primary interface supports broadcast,
1046		 * choose the broadcast address for that interface.
1047		 */
1048		if (faddr.s_addr == INADDR_ANY) {
1049			IN_IFADDR_RLOCK(&in_ifa_tracker);
1050			faddr =
1051			    IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1052			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1053			if (cred != NULL &&
1054			    (error = prison_get_ip4(cred, &faddr)) != 0)
1055				return (error);
1056		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
1057			IN_IFADDR_RLOCK(&in_ifa_tracker);
1058			if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
1059			    IFF_BROADCAST)
1060				faddr = satosin(&TAILQ_FIRST(
1061				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1062			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1063		}
1064	}
1065	if (laddr.s_addr == INADDR_ANY) {
1066		error = in_pcbladdr(inp, &faddr, &laddr, cred);
1067		/*
1068		 * If the destination address is multicast and an outgoing
1069		 * interface has been set as a multicast option, prefer the
1070		 * address of that interface as our source address.
1071		 */
1072		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
1073		    inp->inp_moptions != NULL) {
1074			struct ip_moptions *imo;
1075			struct ifnet *ifp;
1076
1077			imo = inp->inp_moptions;
1078			if (imo->imo_multicast_ifp != NULL) {
1079				ifp = imo->imo_multicast_ifp;
1080				IN_IFADDR_RLOCK(&in_ifa_tracker);
1081				TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1082					if ((ia->ia_ifp == ifp) &&
1083					    (cred == NULL ||
1084					    prison_check_ip4(cred,
1085					    &ia->ia_addr.sin_addr) == 0))
1086						break;
1087				}
1088				if (ia == NULL)
1089					error = EADDRNOTAVAIL;
1090				else {
1091					laddr = ia->ia_addr.sin_addr;
1092					error = 0;
1093				}
1094				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1095			}
1096		}
1097		if (error)
1098			return (error);
1099	}
1100	oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
1101	    laddr, lport, 0, NULL);
1102	if (oinp != NULL) {
1103		if (oinpp != NULL)
1104			*oinpp = oinp;
1105		return (EADDRINUSE);
1106	}
1107	if (lport == 0) {
1108		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
1109		    cred);
1110		if (error)
1111			return (error);
1112	}
1113	*laddrp = laddr.s_addr;
1114	*lportp = lport;
1115	*faddrp = faddr.s_addr;
1116	*fportp = fport;
1117	return (0);
1118}
1119
1120void
1121in_pcbdisconnect(struct inpcb *inp)
1122{
1123
1124	INP_WLOCK_ASSERT(inp);
1125	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1126
1127	inp->inp_faddr.s_addr = INADDR_ANY;
1128	inp->inp_fport = 0;
1129	in_pcbrehash(inp);
1130}
1131#endif /* INET */
1132
1133/*
1134 * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
1135 * For most protocols, this will be invoked immediately prior to calling
1136 * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
1137 * socket, in which case in_pcbfree() is deferred.
1138 */
1139void
1140in_pcbdetach(struct inpcb *inp)
1141{
1142
1143	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1144
1145	inp->inp_socket->so_pcb = NULL;
1146	inp->inp_socket = NULL;
1147}
1148
1149/*
1150 * in_pcbref() bumps the reference count on an inpcb in order to maintain
1151 * stability of an inpcb pointer despite the inpcb lock being released.  This
1152 * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
1153 * but where the inpcb lock may already held, or when acquiring a reference
1154 * via a pcbgroup.
1155 *
1156 * in_pcbref() should be used only to provide brief memory stability, and
1157 * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
1158 * garbage collect the inpcb if it has been in_pcbfree()'d from another
1159 * context.  Until in_pcbrele() has returned that the inpcb is still valid,
1160 * lock and rele are the *only* safe operations that may be performed on the
1161 * inpcb.
1162 *
1163 * While the inpcb will not be freed, releasing the inpcb lock means that the
1164 * connection's state may change, so the caller should be careful to
1165 * revalidate any cached state on reacquiring the lock.  Drop the reference
1166 * using in_pcbrele().
1167 */
1168void
1169in_pcbref(struct inpcb *inp)
1170{
1171
1172	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1173
1174	refcount_acquire(&inp->inp_refcount);
1175}
1176
1177/*
1178 * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
1179 * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
1180 * return a flag indicating whether or not the inpcb remains valid.  If it is
1181 * valid, we return with the inpcb lock held.
1182 *
1183 * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
1184 * reference on an inpcb.  Historically more work was done here (actually, in
1185 * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
1186 * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
1187 * about memory stability (and continued use of the write lock).
1188 */
1189int
1190in_pcbrele_rlocked(struct inpcb *inp)
1191{
1192	struct inpcbinfo *pcbinfo;
1193
1194	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1195
1196	INP_RLOCK_ASSERT(inp);
1197
1198	if (refcount_release(&inp->inp_refcount) == 0) {
1199		/*
1200		 * If the inpcb has been freed, let the caller know, even if
1201		 * this isn't the last reference.
1202		 */
1203		if (inp->inp_flags2 & INP_FREED) {
1204			INP_RUNLOCK(inp);
1205			return (1);
1206		}
1207		return (0);
1208	}
1209
1210	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1211
1212	INP_RUNLOCK(inp);
1213	pcbinfo = inp->inp_pcbinfo;
1214	uma_zfree(pcbinfo->ipi_zone, inp);
1215	return (1);
1216}
1217
1218int
1219in_pcbrele_wlocked(struct inpcb *inp)
1220{
1221	struct inpcbinfo *pcbinfo;
1222
1223	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1224
1225	INP_WLOCK_ASSERT(inp);
1226
1227	if (refcount_release(&inp->inp_refcount) == 0) {
1228		/*
1229		 * If the inpcb has been freed, let the caller know, even if
1230		 * this isn't the last reference.
1231		 */
1232		if (inp->inp_flags2 & INP_FREED) {
1233			INP_WUNLOCK(inp);
1234			return (1);
1235		}
1236		return (0);
1237	}
1238
1239	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1240
1241	INP_WUNLOCK(inp);
1242	pcbinfo = inp->inp_pcbinfo;
1243	uma_zfree(pcbinfo->ipi_zone, inp);
1244	return (1);
1245}
1246
/*
 * Temporary wrapper: forwards to in_pcbrele_wlocked() and returns its
 * result unchanged.
 */
int
in_pcbrele(struct inpcb *inp)
{
	int released;

	released = in_pcbrele_wlocked(inp);
	return (released);
}
1256
1257/*
1258 * Unconditionally schedule an inpcb to be freed by decrementing its
1259 * reference count, which should occur only after the inpcb has been detached
1260 * from its socket.  If another thread holds a temporary reference (acquired
1261 * using in_pcbref()) then the free is deferred until that reference is
1262 * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
1263 * work, including removal from global lists, is done in this context, where
1264 * the pcbinfo lock is held.
1265 */
1266void
1267in_pcbfree(struct inpcb *inp)
1268{
1269	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1270
1271	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1272
1273#ifdef INVARIANTS
1274	if (pcbinfo == &V_tcbinfo) {
1275		INP_INFO_LOCK_ASSERT(pcbinfo);
1276	} else {
1277		INP_INFO_WLOCK_ASSERT(pcbinfo);
1278	}
1279#endif
1280	INP_WLOCK_ASSERT(inp);
1281
1282	/* XXXRW: Do as much as possible here. */
1283#if defined(IPSEC) || defined(IPSEC_SUPPORT)
1284	if (inp->inp_sp != NULL)
1285		ipsec_delete_pcbpolicy(inp);
1286#endif
1287	INP_LIST_WLOCK(pcbinfo);
1288	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1289	in_pcbremlists(inp);
1290	INP_LIST_WUNLOCK(pcbinfo);
1291#ifdef INET6
1292	if (inp->inp_vflag & INP_IPV6PROTO) {
1293		ip6_freepcbopts(inp->in6p_outputopts);
1294		if (inp->in6p_moptions != NULL)
1295			ip6_freemoptions(inp->in6p_moptions);
1296	}
1297#endif
1298	if (inp->inp_options)
1299		(void)m_free(inp->inp_options);
1300#ifdef INET
1301	if (inp->inp_moptions != NULL)
1302		inp_freemoptions(inp->inp_moptions);
1303#endif
1304	if (inp->inp_route.ro_rt) {
1305		RTFREE(inp->inp_route.ro_rt);
1306		inp->inp_route.ro_rt = (struct rtentry *)NULL;
1307	}
1308	if (inp->inp_route.ro_lle)
1309		LLE_FREE(inp->inp_route.ro_lle);	/* zeros ro_lle */
1310
1311	inp->inp_vflag = 0;
1312	inp->inp_flags2 |= INP_FREED;
1313	crfree(inp->inp_cred);
1314#ifdef MAC
1315	mac_inpcb_destroy(inp);
1316#endif
1317	if (!in_pcbrele_wlocked(inp))
1318		INP_WUNLOCK(inp);
1319}
1320
1321/*
1322 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1323 * port reservation, and preventing it from being returned by inpcb lookups.
1324 *
1325 * It is used by TCP to mark an inpcb as unused and avoid future packet
1326 * delivery or event notification when a socket remains open but TCP has
1327 * closed.  This might occur as a result of a shutdown()-initiated TCP close
1328 * or a RST on the wire, and allows the port binding to be reused while still
1329 * maintaining the invariant that so_pcb always points to a valid inpcb until
1330 * in_pcbdetach().
1331 *
1332 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1333 * in_pcbnotifyall() and in_pcbpurgeif0()?
1334 */
1335void
1336in_pcbdrop(struct inpcb *inp)
1337{
1338
1339	INP_WLOCK_ASSERT(inp);
1340
1341	/*
1342	 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
1343	 * the hash lock...?
1344	 */
1345	inp->inp_flags |= INP_DROPPED;
1346	if (inp->inp_flags & INP_INHASHLIST) {
1347		struct inpcbport *phd = inp->inp_phd;
1348
1349		INP_HASH_WLOCK(inp->inp_pcbinfo);
1350		LIST_REMOVE(inp, inp_hash);
1351		LIST_REMOVE(inp, inp_portlist);
1352		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1353			LIST_REMOVE(phd, phd_hash);
1354			free(phd, M_PCB);
1355		}
1356		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1357		inp->inp_flags &= ~INP_INHASHLIST;
1358#ifdef PCBGROUP
1359		in_pcbgroup_remove(inp);
1360#endif
1361	}
1362}
1363
1364#ifdef INET
1365/*
1366 * Common routines to return the socket addresses associated with inpcbs.
1367 */
1368struct sockaddr *
1369in_sockaddr(in_port_t port, struct in_addr *addr_p)
1370{
1371	struct sockaddr_in *sin;
1372
1373	sin = malloc(sizeof *sin, M_SONAME,
1374		M_WAITOK | M_ZERO);
1375	sin->sin_family = AF_INET;
1376	sin->sin_len = sizeof(*sin);
1377	sin->sin_addr = *addr_p;
1378	sin->sin_port = port;
1379
1380	return (struct sockaddr *)sin;
1381}
1382
1383int
1384in_getsockaddr(struct socket *so, struct sockaddr **nam)
1385{
1386	struct inpcb *inp;
1387	struct in_addr addr;
1388	in_port_t port;
1389
1390	inp = sotoinpcb(so);
1391	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1392
1393	INP_RLOCK(inp);
1394	port = inp->inp_lport;
1395	addr = inp->inp_laddr;
1396	INP_RUNLOCK(inp);
1397
1398	*nam = in_sockaddr(port, &addr);
1399	return 0;
1400}
1401
1402int
1403in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1404{
1405	struct inpcb *inp;
1406	struct in_addr addr;
1407	in_port_t port;
1408
1409	inp = sotoinpcb(so);
1410	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1411
1412	INP_RLOCK(inp);
1413	port = inp->inp_fport;
1414	addr = inp->inp_faddr;
1415	INP_RUNLOCK(inp);
1416
1417	*nam = in_sockaddr(port, &addr);
1418	return 0;
1419}
1420
1421void
1422in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
1423    struct inpcb *(*notify)(struct inpcb *, int))
1424{
1425	struct inpcb *inp, *inp_temp;
1426
1427	INP_INFO_WLOCK(pcbinfo);
1428	LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
1429		INP_WLOCK(inp);
1430#ifdef INET6
1431		if ((inp->inp_vflag & INP_IPV4) == 0) {
1432			INP_WUNLOCK(inp);
1433			continue;
1434		}
1435#endif
1436		if (inp->inp_faddr.s_addr != faddr.s_addr ||
1437		    inp->inp_socket == NULL) {
1438			INP_WUNLOCK(inp);
1439			continue;
1440		}
1441		if ((*notify)(inp, errno))
1442			INP_WUNLOCK(inp);
1443	}
1444	INP_INFO_WUNLOCK(pcbinfo);
1445}
1446
1447void
1448in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1449{
1450	struct inpcb *inp;
1451	struct ip_moptions *imo;
1452	int i, gap;
1453
1454	INP_INFO_WLOCK(pcbinfo);
1455	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1456		INP_WLOCK(inp);
1457		imo = inp->inp_moptions;
1458		if ((inp->inp_vflag & INP_IPV4) &&
1459		    imo != NULL) {
1460			/*
1461			 * Unselect the outgoing interface if it is being
1462			 * detached.
1463			 */
1464			if (imo->imo_multicast_ifp == ifp)
1465				imo->imo_multicast_ifp = NULL;
1466
1467			/*
1468			 * Drop multicast group membership if we joined
1469			 * through the interface being detached.
1470			 */
1471			for (i = 0, gap = 0; i < imo->imo_num_memberships;
1472			    i++) {
1473				if (imo->imo_membership[i]->inm_ifp == ifp) {
1474					in_delmulti(imo->imo_membership[i]);
1475					gap++;
1476				} else if (gap != 0)
1477					imo->imo_membership[i - gap] =
1478					    imo->imo_membership[i];
1479			}
1480			imo->imo_num_memberships -= gap;
1481		}
1482		INP_WUNLOCK(inp);
1483	}
1484	INP_INFO_WUNLOCK(pcbinfo);
1485}
1486
1487/*
1488 * Lookup a PCB based on the local address and port.  Caller must hold the
1489 * hash lock.  No inpcb locks or references are acquired.
1490 */
1491#define INP_LOOKUP_MAPPED_PCB_COST	3
1492struct inpcb *
1493in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1494    u_short lport, int lookupflags, struct ucred *cred)
1495{
1496	struct inpcb *inp;
1497#ifdef INET6
1498	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
1499#else
1500	int matchwild = 3;
1501#endif
1502	int wildcard;
1503
1504	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
1505	    ("%s: invalid lookup flags %d", __func__, lookupflags));
1506
1507	INP_HASH_LOCK_ASSERT(pcbinfo);
1508
1509	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
1510		struct inpcbhead *head;
1511		/*
1512		 * Look for an unconnected (wildcard foreign addr) PCB that
1513		 * matches the local address and port we're looking for.
1514		 */
1515		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1516		    0, pcbinfo->ipi_hashmask)];
1517		LIST_FOREACH(inp, head, inp_hash) {
1518#ifdef INET6
1519			/* XXX inp locking */
1520			if ((inp->inp_vflag & INP_IPV4) == 0)
1521				continue;
1522#endif
1523			if (inp->inp_faddr.s_addr == INADDR_ANY &&
1524			    inp->inp_laddr.s_addr == laddr.s_addr &&
1525			    inp->inp_lport == lport) {
1526				/*
1527				 * Found?
1528				 */
1529				if (cred == NULL ||
1530				    prison_equal_ip4(cred->cr_prison,
1531					inp->inp_cred->cr_prison))
1532					return (inp);
1533			}
1534		}
1535		/*
1536		 * Not found.
1537		 */
1538		return (NULL);
1539	} else {
1540		struct inpcbporthead *porthash;
1541		struct inpcbport *phd;
1542		struct inpcb *match = NULL;
1543		/*
1544		 * Best fit PCB lookup.
1545		 *
1546		 * First see if this local port is in use by looking on the
1547		 * port hash list.
1548		 */
1549		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
1550		    pcbinfo->ipi_porthashmask)];
1551		LIST_FOREACH(phd, porthash, phd_hash) {
1552			if (phd->phd_port == lport)
1553				break;
1554		}
1555		if (phd != NULL) {
1556			/*
1557			 * Port is in use by one or more PCBs. Look for best
1558			 * fit.
1559			 */
1560			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1561				wildcard = 0;
1562				if (cred != NULL &&
1563				    !prison_equal_ip4(inp->inp_cred->cr_prison,
1564					cred->cr_prison))
1565					continue;
1566#ifdef INET6
1567				/* XXX inp locking */
1568				if ((inp->inp_vflag & INP_IPV4) == 0)
1569					continue;
1570				/*
1571				 * We never select the PCB that has
1572				 * INP_IPV6 flag and is bound to :: if
1573				 * we have another PCB which is bound
1574				 * to 0.0.0.0.  If a PCB has the
1575				 * INP_IPV6 flag, then we set its cost
1576				 * higher than IPv4 only PCBs.
1577				 *
1578				 * Note that the case only happens
1579				 * when a socket is bound to ::, under
1580				 * the condition that the use of the
1581				 * mapped address is allowed.
1582				 */
1583				if ((inp->inp_vflag & INP_IPV6) != 0)
1584					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
1585#endif
1586				if (inp->inp_faddr.s_addr != INADDR_ANY)
1587					wildcard++;
1588				if (inp->inp_laddr.s_addr != INADDR_ANY) {
1589					if (laddr.s_addr == INADDR_ANY)
1590						wildcard++;
1591					else if (inp->inp_laddr.s_addr != laddr.s_addr)
1592						continue;
1593				} else {
1594					if (laddr.s_addr != INADDR_ANY)
1595						wildcard++;
1596				}
1597				if (wildcard < matchwild) {
1598					match = inp;
1599					matchwild = wildcard;
1600					if (matchwild == 0)
1601						break;
1602				}
1603			}
1604		}
1605		return (match);
1606	}
1607}
1608#undef INP_LOOKUP_MAPPED_PCB_COST
1609
1610#ifdef PCBGROUP
1611/*
1612 * Lookup PCB in hash list, using pcbgroup tables.
1613 */
1614static struct inpcb *
1615in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
1616    struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
1617    u_int lport_arg, int lookupflags, struct ifnet *ifp)
1618{
1619	struct inpcbhead *head;
1620	struct inpcb *inp, *tmpinp;
1621	u_short fport = fport_arg, lport = lport_arg;
1622	bool locked;
1623
1624	/*
1625	 * First look for an exact match.
1626	 */
1627	tmpinp = NULL;
1628	INP_GROUP_LOCK(pcbgroup);
1629	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
1630	    pcbgroup->ipg_hashmask)];
1631	LIST_FOREACH(inp, head, inp_pcbgrouphash) {
1632#ifdef INET6
1633		/* XXX inp locking */
1634		if ((inp->inp_vflag & INP_IPV4) == 0)
1635			continue;
1636#endif
1637		if (inp->inp_faddr.s_addr == faddr.s_addr &&
1638		    inp->inp_laddr.s_addr == laddr.s_addr &&
1639		    inp->inp_fport == fport &&
1640		    inp->inp_lport == lport) {
1641			/*
1642			 * XXX We should be able to directly return
1643			 * the inp here, without any checks.
1644			 * Well unless both bound with SO_REUSEPORT?
1645			 */
1646			if (prison_flag(inp->inp_cred, PR_IP4))
1647				goto found;
1648			if (tmpinp == NULL)
1649				tmpinp = inp;
1650		}
1651	}
1652	if (tmpinp != NULL) {
1653		inp = tmpinp;
1654		goto found;
1655	}
1656
1657#ifdef	RSS
1658	/*
1659	 * For incoming connections, we may wish to do a wildcard
1660	 * match for an RSS-local socket.
1661	 */
1662	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
1663		struct inpcb *local_wild = NULL, *local_exact = NULL;
1664#ifdef INET6
1665		struct inpcb *local_wild_mapped = NULL;
1666#endif
1667		struct inpcb *jail_wild = NULL;
1668		struct inpcbhead *head;
1669		int injail;
1670
1671		/*
1672		 * Order of socket selection - we always prefer jails.
1673		 *      1. jailed, non-wild.
1674		 *      2. jailed, wild.
1675		 *      3. non-jailed, non-wild.
1676		 *      4. non-jailed, wild.
1677		 */
1678
1679		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
1680		    lport, 0, pcbgroup->ipg_hashmask)];
1681		LIST_FOREACH(inp, head, inp_pcbgrouphash) {
1682#ifdef INET6
1683			/* XXX inp locking */
1684			if ((inp->inp_vflag & INP_IPV4) == 0)
1685				continue;
1686#endif
1687			if (inp->inp_faddr.s_addr != INADDR_ANY ||
1688			    inp->inp_lport != lport)
1689				continue;
1690
1691			injail = prison_flag(inp->inp_cred, PR_IP4);
1692			if (injail) {
1693				if (prison_check_ip4(inp->inp_cred,
1694				    &laddr) != 0)
1695					continue;
1696			} else {
1697				if (local_exact != NULL)
1698					continue;
1699			}
1700
1701			if (inp->inp_laddr.s_addr == laddr.s_addr) {
1702				if (injail)
1703					goto found;
1704				else
1705					local_exact = inp;
1706			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1707#ifdef INET6
1708				/* XXX inp locking, NULL check */
1709				if (inp->inp_vflag & INP_IPV6PROTO)
1710					local_wild_mapped = inp;
1711				else
1712#endif
1713					if (injail)
1714						jail_wild = inp;
1715					else
1716						local_wild = inp;
1717			}
1718		} /* LIST_FOREACH */
1719
1720		inp = jail_wild;
1721		if (inp == NULL)
1722			inp = local_exact;
1723		if (inp == NULL)
1724			inp = local_wild;
1725#ifdef INET6
1726		if (inp == NULL)
1727			inp = local_wild_mapped;
1728#endif
1729		if (inp != NULL)
1730			goto found;
1731	}
1732#endif
1733
1734	/*
1735	 * Then look for a wildcard match, if requested.
1736	 */
1737	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
1738		struct inpcb *local_wild = NULL, *local_exact = NULL;
1739#ifdef INET6
1740		struct inpcb *local_wild_mapped = NULL;
1741#endif
1742		struct inpcb *jail_wild = NULL;
1743		struct inpcbhead *head;
1744		int injail;
1745
1746		/*
1747		 * Order of socket selection - we always prefer jails.
1748		 *      1. jailed, non-wild.
1749		 *      2. jailed, wild.
1750		 *      3. non-jailed, non-wild.
1751		 *      4. non-jailed, wild.
1752		 */
1753		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
1754		    0, pcbinfo->ipi_wildmask)];
1755		LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
1756#ifdef INET6
1757			/* XXX inp locking */
1758			if ((inp->inp_vflag & INP_IPV4) == 0)
1759				continue;
1760#endif
1761			if (inp->inp_faddr.s_addr != INADDR_ANY ||
1762			    inp->inp_lport != lport)
1763				continue;
1764
1765			injail = prison_flag(inp->inp_cred, PR_IP4);
1766			if (injail) {
1767				if (prison_check_ip4(inp->inp_cred,
1768				    &laddr) != 0)
1769					continue;
1770			} else {
1771				if (local_exact != NULL)
1772					continue;
1773			}
1774
1775			if (inp->inp_laddr.s_addr == laddr.s_addr) {
1776				if (injail)
1777					goto found;
1778				else
1779					local_exact = inp;
1780			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1781#ifdef INET6
1782				/* XXX inp locking, NULL check */
1783				if (inp->inp_vflag & INP_IPV6PROTO)
1784					local_wild_mapped = inp;
1785				else
1786#endif
1787					if (injail)
1788						jail_wild = inp;
1789					else
1790						local_wild = inp;
1791			}
1792		} /* LIST_FOREACH */
1793		inp = jail_wild;
1794		if (inp == NULL)
1795			inp = local_exact;
1796		if (inp == NULL)
1797			inp = local_wild;
1798#ifdef INET6
1799		if (inp == NULL)
1800			inp = local_wild_mapped;
1801#endif
1802		if (inp != NULL)
1803			goto found;
1804	} /* if (lookupflags & INPLOOKUP_WILDCARD) */
1805	INP_GROUP_UNLOCK(pcbgroup);
1806	return (NULL);
1807
1808found:
1809	if (lookupflags & INPLOOKUP_WLOCKPCB)
1810		locked = INP_TRY_WLOCK(inp);
1811	else if (lookupflags & INPLOOKUP_RLOCKPCB)
1812		locked = INP_TRY_RLOCK(inp);
1813	else
1814		panic("%s: locking bug", __func__);
1815	if (!locked)
1816		in_pcbref(inp);
1817	INP_GROUP_UNLOCK(pcbgroup);
1818	if (!locked) {
1819		if (lookupflags & INPLOOKUP_WLOCKPCB) {
1820			INP_WLOCK(inp);
1821			if (in_pcbrele_wlocked(inp))
1822				return (NULL);
1823		} else {
1824			INP_RLOCK(inp);
1825			if (in_pcbrele_rlocked(inp))
1826				return (NULL);
1827		}
1828	}
1829#ifdef INVARIANTS
1830	if (lookupflags & INPLOOKUP_WLOCKPCB)
1831		INP_WLOCK_ASSERT(inp);
1832	else
1833		INP_RLOCK_ASSERT(inp);
1834#endif
1835	return (inp);
1836}
1837#endif /* PCBGROUP */
1838
1839/*
1840 * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
1841 * that the caller has locked the hash list, and will not perform any further
1842 * locking or reference operations on either the hash list or the connection.
1843 */
1844static struct inpcb *
1845in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1846    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
1847    struct ifnet *ifp)
1848{
1849	struct inpcbhead *head;
1850	struct inpcb *inp, *tmpinp;
1851	u_short fport = fport_arg, lport = lport_arg;
1852
1853	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
1854	    ("%s: invalid lookup flags %d", __func__, lookupflags));
1855
1856	INP_HASH_LOCK_ASSERT(pcbinfo);
1857
1858	/*
1859	 * First look for an exact match.
1860	 */
1861	tmpinp = NULL;
1862	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
1863	    pcbinfo->ipi_hashmask)];
1864	LIST_FOREACH(inp, head, inp_hash) {
1865#ifdef INET6
1866		/* XXX inp locking */
1867		if ((inp->inp_vflag & INP_IPV4) == 0)
1868			continue;
1869#endif
1870		if (inp->inp_faddr.s_addr == faddr.s_addr &&
1871		    inp->inp_laddr.s_addr == laddr.s_addr &&
1872		    inp->inp_fport == fport &&
1873		    inp->inp_lport == lport) {
1874			/*
1875			 * XXX We should be able to directly return
1876			 * the inp here, without any checks.
1877			 * Well unless both bound with SO_REUSEPORT?
1878			 */
1879			if (prison_flag(inp->inp_cred, PR_IP4))
1880				return (inp);
1881			if (tmpinp == NULL)
1882				tmpinp = inp;
1883		}
1884	}
1885	if (tmpinp != NULL)
1886		return (tmpinp);
1887
1888	/*
1889	 * Then look for a wildcard match, if requested.
1890	 */
1891	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
1892		struct inpcb *local_wild = NULL, *local_exact = NULL;
1893#ifdef INET6
1894		struct inpcb *local_wild_mapped = NULL;
1895#endif
1896		struct inpcb *jail_wild = NULL;
1897		int injail;
1898
1899		/*
1900		 * Order of socket selection - we always prefer jails.
1901		 *      1. jailed, non-wild.
1902		 *      2. jailed, wild.
1903		 *      3. non-jailed, non-wild.
1904		 *      4. non-jailed, wild.
1905		 */
1906
1907		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1908		    0, pcbinfo->ipi_hashmask)];
1909		LIST_FOREACH(inp, head, inp_hash) {
1910#ifdef INET6
1911			/* XXX inp locking */
1912			if ((inp->inp_vflag & INP_IPV4) == 0)
1913				continue;
1914#endif
1915			if (inp->inp_faddr.s_addr != INADDR_ANY ||
1916			    inp->inp_lport != lport)
1917				continue;
1918
1919			injail = prison_flag(inp->inp_cred, PR_IP4);
1920			if (injail) {
1921				if (prison_check_ip4(inp->inp_cred,
1922				    &laddr) != 0)
1923					continue;
1924			} else {
1925				if (local_exact != NULL)
1926					continue;
1927			}
1928
1929			if (inp->inp_laddr.s_addr == laddr.s_addr) {
1930				if (injail)
1931					return (inp);
1932				else
1933					local_exact = inp;
1934			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1935#ifdef INET6
1936				/* XXX inp locking, NULL check */
1937				if (inp->inp_vflag & INP_IPV6PROTO)
1938					local_wild_mapped = inp;
1939				else
1940#endif
1941					if (injail)
1942						jail_wild = inp;
1943					else
1944						local_wild = inp;
1945			}
1946		} /* LIST_FOREACH */
1947		if (jail_wild != NULL)
1948			return (jail_wild);
1949		if (local_exact != NULL)
1950			return (local_exact);
1951		if (local_wild != NULL)
1952			return (local_wild);
1953#ifdef INET6
1954		if (local_wild_mapped != NULL)
1955			return (local_wild_mapped);
1956#endif
1957	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
1958
1959	return (NULL);
1960}
1961
1962/*
1963 * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
1964 * hash list lock, and will return the inpcb locked (i.e., requires
1965 * INPLOOKUP_LOCKPCB).
1966 */
1967static struct inpcb *
1968in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1969    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
1970    struct ifnet *ifp)
1971{
1972	struct inpcb *inp;
1973	bool locked;
1974
1975	INP_HASH_RLOCK(pcbinfo);
1976	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
1977	    (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
1978	if (inp != NULL) {
1979		if (lookupflags & INPLOOKUP_WLOCKPCB)
1980			locked = INP_TRY_WLOCK(inp);
1981		else if (lookupflags & INPLOOKUP_RLOCKPCB)
1982			locked = INP_TRY_RLOCK(inp);
1983		else
1984			panic("%s: locking bug", __func__);
1985		if (!locked)
1986			in_pcbref(inp);
1987		INP_HASH_RUNLOCK(pcbinfo);
1988		if (!locked) {
1989			if (lookupflags & INPLOOKUP_WLOCKPCB) {
1990				INP_WLOCK(inp);
1991				if (in_pcbrele_wlocked(inp))
1992					return (NULL);
1993			} else {
1994				INP_RLOCK(inp);
1995				if (in_pcbrele_rlocked(inp))
1996					return (NULL);
1997			}
1998		}
1999#ifdef INVARIANTS
2000		if (lookupflags & INPLOOKUP_WLOCKPCB)
2001			INP_WLOCK_ASSERT(inp);
2002		else
2003			INP_RLOCK_ASSERT(inp);
2004#endif
2005	} else
2006		INP_HASH_RUNLOCK(pcbinfo);
2007	return (inp);
2008}
2009
2010/*
2011 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2012 * from which a pre-calculated hash value may be extracted.
2013 *
2014 * Possibly more of this logic should be in in_pcbgroup.c.
2015 */
2016struct inpcb *
2017in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
2018    struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
2019{
2020#if defined(PCBGROUP) && !defined(RSS)
2021	struct inpcbgroup *pcbgroup;
2022#endif
2023
2024	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2025	    ("%s: invalid lookup flags %d", __func__, lookupflags));
2026	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2027	    ("%s: LOCKPCB not set", __func__));
2028
2029	/*
2030	 * When not using RSS, use connection groups in preference to the
2031	 * reservation table when looking up 4-tuples.  When using RSS, just
2032	 * use the reservation table, due to the cost of the Toeplitz hash
2033	 * in software.
2034	 *
2035	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
2036	 * we could be doing RSS with a non-Toeplitz hash that is affordable
2037	 * in software.
2038	 */
2039#if defined(PCBGROUP) && !defined(RSS)
2040	if (in_pcbgroup_enabled(pcbinfo)) {
2041		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2042		    fport);
2043		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2044		    laddr, lport, lookupflags, ifp));
2045	}
2046#endif
2047	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2048	    lookupflags, ifp));
2049}
2050
2051struct inpcb *
2052in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
2053    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
2054    struct ifnet *ifp, struct mbuf *m)
2055{
2056#ifdef PCBGROUP
2057	struct inpcbgroup *pcbgroup;
2058#endif
2059
2060	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
2061	    ("%s: invalid lookup flags %d", __func__, lookupflags));
2062	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
2063	    ("%s: LOCKPCB not set", __func__));
2064
2065#ifdef PCBGROUP
2066	/*
2067	 * If we can use a hardware-generated hash to look up the connection
2068	 * group, use that connection group to find the inpcb.  Otherwise
2069	 * fall back on a software hash -- or the reservation table if we're
2070	 * using RSS.
2071	 *
2072	 * XXXRW: As above, that policy belongs in the pcbgroup code.
2073	 */
2074	if (in_pcbgroup_enabled(pcbinfo) &&
2075	    !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
2076		pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
2077		    m->m_pkthdr.flowid);
2078		if (pcbgroup != NULL)
2079			return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
2080			    fport, laddr, lport, lookupflags, ifp));
2081#ifndef RSS
2082		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
2083		    fport);
2084		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
2085		    laddr, lport, lookupflags, ifp));
2086#endif
2087	}
2088#endif
2089	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
2090	    lookupflags, ifp));
2091}
2092#endif /* INET */
2093
2094/*
2095 * Insert PCB onto various hash lists.
2096 */
2097static int
2098in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
2099{
2100	struct inpcbhead *pcbhash;
2101	struct inpcbporthead *pcbporthash;
2102	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2103	struct inpcbport *phd;
2104	u_int32_t hashkey_faddr;
2105
2106	INP_WLOCK_ASSERT(inp);
2107	INP_HASH_WLOCK_ASSERT(pcbinfo);
2108
2109	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
2110	    ("in_pcbinshash: INP_INHASHLIST"));
2111
2112#ifdef INET6
2113	if (inp->inp_vflag & INP_IPV6)
2114		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2115	else
2116#endif
2117	hashkey_faddr = inp->inp_faddr.s_addr;
2118
2119	pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2120		 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2121
2122	pcbporthash = &pcbinfo->ipi_porthashbase[
2123	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2124
2125	/*
2126	 * Go through port list and look for a head for this lport.
2127	 */
2128	LIST_FOREACH(phd, pcbporthash, phd_hash) {
2129		if (phd->phd_port == inp->inp_lport)
2130			break;
2131	}
2132	/*
2133	 * If none exists, malloc one and tack it on.
2134	 */
2135	if (phd == NULL) {
2136		phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
2137		if (phd == NULL) {
2138			return (ENOBUFS); /* XXX */
2139		}
2140		phd->phd_port = inp->inp_lport;
2141		LIST_INIT(&phd->phd_pcblist);
2142		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
2143	}
2144	inp->inp_phd = phd;
2145	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
2146	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
2147	inp->inp_flags |= INP_INHASHLIST;
2148#ifdef PCBGROUP
2149	if (do_pcbgroup_update)
2150		in_pcbgroup_update(inp);
2151#endif
2152	return (0);
2153}
2154
/*
 * Two public entry points currently exist for inserting an inpcb into the
 * hash lists: in_pcbinshash(), which also updates pcbgroups, and
 * in_pcbinshash_nopcbgroup(), which does not.  The latter is used only by
 * the TCP syncache, which inserts the inpcb before its full 4-tuple has
 * been set and so does not want it installed in a pcbgroup until later.
 *
 * XXXRW: This seems like a misfeature.  in_pcbinshash should always update
 * connection groups, and partially initialised inpcbs should not be exposed
 * to either reservation hash tables or pcbgroups.
 */
int
in_pcbinshash(struct inpcb *inp)
{
	int error;

	/* Insert and update the connection group. */
	error = in_pcbinshash_internal(inp, 1);
	return (error);
}
2172
/*
 * Insert an inpcb into the hash lists without updating its connection
 * group.  Returns the result of in_pcbinshash_internal().
 */
int
in_pcbinshash_nopcbgroup(struct inpcb *inp)
{
	int error;

	error = in_pcbinshash_internal(inp, 0);
	return (error);
}
2179
2180/*
2181 * Move PCB to the proper hash bucket when { faddr, fport } have  been
2182 * changed. NOTE: This does not handle the case of the lport changing (the
2183 * hashed port list would have to be updated as well), so the lport must
2184 * not change after in_pcbinshash() has been called.
2185 */
2186void
2187in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
2188{
2189	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2190	struct inpcbhead *head;
2191	u_int32_t hashkey_faddr;
2192
2193	INP_WLOCK_ASSERT(inp);
2194	INP_HASH_WLOCK_ASSERT(pcbinfo);
2195
2196	KASSERT(inp->inp_flags & INP_INHASHLIST,
2197	    ("in_pcbrehash: !INP_INHASHLIST"));
2198
2199#ifdef INET6
2200	if (inp->inp_vflag & INP_IPV6)
2201		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2202	else
2203#endif
2204	hashkey_faddr = inp->inp_faddr.s_addr;
2205
2206	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2207		inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2208
2209	LIST_REMOVE(inp, inp_hash);
2210	LIST_INSERT_HEAD(head, inp, inp_hash);
2211
2212#ifdef PCBGROUP
2213	if (m != NULL)
2214		in_pcbgroup_update_mbuf(inp, m);
2215	else
2216		in_pcbgroup_update(inp);
2217#endif
2218}
2219
2220void
2221in_pcbrehash(struct inpcb *inp)
2222{
2223
2224	in_pcbrehash_mbuf(inp, NULL);
2225}
2226
2227/*
2228 * Remove PCB from various lists.
2229 */
2230static void
2231in_pcbremlists(struct inpcb *inp)
2232{
2233	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2234
2235#ifdef INVARIANTS
2236	if (pcbinfo == &V_tcbinfo) {
2237		INP_INFO_RLOCK_ASSERT(pcbinfo);
2238	} else {
2239		INP_INFO_WLOCK_ASSERT(pcbinfo);
2240	}
2241#endif
2242
2243	INP_WLOCK_ASSERT(inp);
2244	INP_LIST_WLOCK_ASSERT(pcbinfo);
2245
2246	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
2247	if (inp->inp_flags & INP_INHASHLIST) {
2248		struct inpcbport *phd = inp->inp_phd;
2249
2250		INP_HASH_WLOCK(pcbinfo);
2251		LIST_REMOVE(inp, inp_hash);
2252		LIST_REMOVE(inp, inp_portlist);
2253		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
2254			LIST_REMOVE(phd, phd_hash);
2255			free(phd, M_PCB);
2256		}
2257		INP_HASH_WUNLOCK(pcbinfo);
2258		inp->inp_flags &= ~INP_INHASHLIST;
2259	}
2260	LIST_REMOVE(inp, inp_list);
2261	pcbinfo->ipi_count--;
2262#ifdef PCBGROUP
2263	in_pcbgroup_remove(inp);
2264#endif
2265}
2266
2267/*
2268 * Check for alternatives when higher level complains
2269 * about service problems.  For now, invalidate cached
2270 * routing information.  If the route was created dynamically
2271 * (by a redirect), time to try a default gateway again.
2272 */
2273void
2274in_losing(struct inpcb *inp)
2275{
2276
2277	if (inp->inp_route.ro_rt) {
2278		RTFREE(inp->inp_route.ro_rt);
2279		inp->inp_route.ro_rt = (struct rtentry *)NULL;
2280	}
2281	if (inp->inp_route.ro_lle)
2282		LLE_FREE(inp->inp_route.ro_lle);	/* zeros ro_lle */
2283	return;
2284}
2285
/*
 * Propagate a socket-layer set-label operation into the inpcb attached to
 * the socket.  Does nothing unless the kernel is built with MAC.
 *
 * The inpcb write lock is taken first, then the socket lock, and the two
 * are released in the reverse order.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2306
2307/*
2308 * ipport_tick runs once per second, determining if random port allocation
2309 * should be continued.  If more than ipport_randomcps ports have been
2310 * allocated in the last second, then we return to sequential port
2311 * allocation. We return to random allocation only once we drop below
2312 * ipport_randomcps for at least ipport_randomtime seconds.
2313 */
2314static void
2315ipport_tick(void *xtp)
2316{
2317	VNET_ITERATOR_DECL(vnet_iter);
2318
2319	VNET_LIST_RLOCK_NOSLEEP();
2320	VNET_FOREACH(vnet_iter) {
2321		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
2322		if (V_ipport_tcpallocs <=
2323		    V_ipport_tcplastcount + V_ipport_randomcps) {
2324			if (V_ipport_stoprandom > 0)
2325				V_ipport_stoprandom--;
2326		} else
2327			V_ipport_stoprandom = V_ipport_randomtime;
2328		V_ipport_tcplastcount = V_ipport_tcpallocs;
2329		CURVNET_RESTORE();
2330	}
2331	VNET_LIST_RUNLOCK_NOSLEEP();
2332	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
2333}
2334
2335static void
2336ip_fini(void *xtp)
2337{
2338
2339	callout_stop(&ipport_tick_callout);
2340}
2341
2342/*
2343 * The ipport_callout should start running at about the time we attach the
2344 * inet or inet6 domains.
2345 */
2346static void
2347ipport_tick_init(const void *unused __unused)
2348{
2349
2350	/* Start ipport_tick. */
2351	callout_init(&ipport_tick_callout, 1);
2352	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
2353	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
2354		SHUTDOWN_PRI_DEFAULT);
2355}
2356SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
2357    ipport_tick_init, NULL);
2358
/*
 * Function form of the INP_WLOCK() macro: write-lock the inpcb.
 */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
2365
/*
 * Function form of the INP_WUNLOCK() macro: release the inpcb write lock.
 */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
2372
/*
 * Function form of the INP_RLOCK() macro: read-lock the inpcb.
 */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
2379
/*
 * Function form of the INP_RUNLOCK() macro: release the inpcb read lock.
 */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2386
2387#ifdef INVARIANTS
/*
 * Assert that the inpcb is write-locked (INP_WLOCK_ASSERT()).  Only built
 * under INVARIANTS.
 */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
2394
/*
 * Assert that the inpcb is not locked (INP_UNLOCK_ASSERT()).  Only built
 * under INVARIANTS.
 */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
2401#endif
2402
2403void
2404inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
2405{
2406	struct inpcb *inp;
2407
2408	INP_INFO_WLOCK(&V_tcbinfo);
2409	LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
2410		INP_WLOCK(inp);
2411		func(inp, arg);
2412		INP_WUNLOCK(inp);
2413	}
2414	INP_INFO_WUNLOCK(&V_tcbinfo);
2415}
2416
2417struct socket *
2418inp_inpcbtosocket(struct inpcb *inp)
2419{
2420
2421	INP_WLOCK_ASSERT(inp);
2422	return (inp->inp_socket);
2423}
2424
2425struct tcpcb *
2426inp_inpcbtotcpcb(struct inpcb *inp)
2427{
2428
2429	INP_WLOCK_ASSERT(inp);
2430	return ((struct tcpcb *)inp->inp_ppcb);
2431}
2432
2433int
2434inp_ip_tos_get(const struct inpcb *inp)
2435{
2436
2437	return (inp->inp_ip_tos);
2438}
2439
2440void
2441inp_ip_tos_set(struct inpcb *inp, int val)
2442{
2443
2444	inp->inp_ip_tos = val;
2445}
2446
2447void
2448inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
2449    uint32_t *faddr, uint16_t *fp)
2450{
2451
2452	INP_LOCK_ASSERT(inp);
2453	*laddr = inp->inp_laddr.s_addr;
2454	*faddr = inp->inp_faddr.s_addr;
2455	*lp = inp->inp_lport;
2456	*fp = inp->inp_fport;
2457}
2458
/*
 * Function form of sotoinpcb(): return the inpcb for a socket.
 */
struct inpcb *
so_sotoinpcb(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	return (inp);
}
2465
/*
 * Function form of sototcpcb(): return the tcpcb for a socket.
 */
struct tcpcb *
so_sototcpcb(struct socket *so)
{
	struct tcpcb *tp;

	tp = sototcpcb(so);
	return (tp);
}
2472
2473#ifdef DDB
/*
 * Emit `indent' space characters on the debugger console; a zero or
 * negative count prints nothing.
 */
static void
db_print_indent(int indent)
{

	while (indent-- > 0)
		db_printf(" ");
}
2482
2483static void
2484db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
2485{
2486	char faddr_str[48], laddr_str[48];
2487
2488	db_print_indent(indent);
2489	db_printf("%s at %p\n", name, inc);
2490
2491	indent += 2;
2492
2493#ifdef INET6
2494	if (inc->inc_flags & INC_ISIPV6) {
2495		/* IPv6. */
2496		ip6_sprintf(laddr_str, &inc->inc6_laddr);
2497		ip6_sprintf(faddr_str, &inc->inc6_faddr);
2498	} else
2499#endif
2500	{
2501		/* IPv4. */
2502		inet_ntoa_r(inc->inc_laddr, laddr_str);
2503		inet_ntoa_r(inc->inc_faddr, faddr_str);
2504	}
2505	db_print_indent(indent);
2506	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
2507	    ntohs(inc->inc_lport));
2508	db_print_indent(indent);
2509	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
2510	    ntohs(inc->inc_fport));
2511}
2512
2513static void
2514db_print_inpflags(int inp_flags)
2515{
2516	int comma;
2517
2518	comma = 0;
2519	if (inp_flags & INP_RECVOPTS) {
2520		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
2521		comma = 1;
2522	}
2523	if (inp_flags & INP_RECVRETOPTS) {
2524		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
2525		comma = 1;
2526	}
2527	if (inp_flags & INP_RECVDSTADDR) {
2528		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
2529		comma = 1;
2530	}
2531	if (inp_flags & INP_HDRINCL) {
2532		db_printf("%sINP_HDRINCL", comma ? ", " : "");
2533		comma = 1;
2534	}
2535	if (inp_flags & INP_HIGHPORT) {
2536		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
2537		comma = 1;
2538	}
2539	if (inp_flags & INP_LOWPORT) {
2540		db_printf("%sINP_LOWPORT", comma ? ", " : "");
2541		comma = 1;
2542	}
2543	if (inp_flags & INP_ANONPORT) {
2544		db_printf("%sINP_ANONPORT", comma ? ", " : "");
2545		comma = 1;
2546	}
2547	if (inp_flags & INP_RECVIF) {
2548		db_printf("%sINP_RECVIF", comma ? ", " : "");
2549		comma = 1;
2550	}
2551	if (inp_flags & INP_MTUDISC) {
2552		db_printf("%sINP_MTUDISC", comma ? ", " : "");
2553		comma = 1;
2554	}
2555	if (inp_flags & INP_RECVTTL) {
2556		db_printf("%sINP_RECVTTL", comma ? ", " : "");
2557		comma = 1;
2558	}
2559	if (inp_flags & INP_DONTFRAG) {
2560		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
2561		comma = 1;
2562	}
2563	if (inp_flags & INP_RECVTOS) {
2564		db_printf("%sINP_RECVTOS", comma ? ", " : "");
2565		comma = 1;
2566	}
2567	if (inp_flags & IN6P_IPV6_V6ONLY) {
2568		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
2569		comma = 1;
2570	}
2571	if (inp_flags & IN6P_PKTINFO) {
2572		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
2573		comma = 1;
2574	}
2575	if (inp_flags & IN6P_HOPLIMIT) {
2576		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
2577		comma = 1;
2578	}
2579	if (inp_flags & IN6P_HOPOPTS) {
2580		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
2581		comma = 1;
2582	}
2583	if (inp_flags & IN6P_DSTOPTS) {
2584		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
2585		comma = 1;
2586	}
2587	if (inp_flags & IN6P_RTHDR) {
2588		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
2589		comma = 1;
2590	}
2591	if (inp_flags & IN6P_RTHDRDSTOPTS) {
2592		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
2593		comma = 1;
2594	}
2595	if (inp_flags & IN6P_TCLASS) {
2596		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
2597		comma = 1;
2598	}
2599	if (inp_flags & IN6P_AUTOFLOWLABEL) {
2600		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
2601		comma = 1;
2602	}
2603	if (inp_flags & INP_TIMEWAIT) {
2604		db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
2605		comma  = 1;
2606	}
2607	if (inp_flags & INP_ONESBCAST) {
2608		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
2609		comma  = 1;
2610	}
2611	if (inp_flags & INP_DROPPED) {
2612		db_printf("%sINP_DROPPED", comma ? ", " : "");
2613		comma  = 1;
2614	}
2615	if (inp_flags & INP_SOCKREF) {
2616		db_printf("%sINP_SOCKREF", comma ? ", " : "");
2617		comma  = 1;
2618	}
2619	if (inp_flags & IN6P_RFC2292) {
2620		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
2621		comma = 1;
2622	}
2623	if (inp_flags & IN6P_MTU) {
2624		db_printf("IN6P_MTU%s", comma ? ", " : "");
2625		comma = 1;
2626	}
2627}
2628
2629static void
2630db_print_inpvflag(u_char inp_vflag)
2631{
2632	int comma;
2633
2634	comma = 0;
2635	if (inp_vflag & INP_IPV4) {
2636		db_printf("%sINP_IPV4", comma ? ", " : "");
2637		comma  = 1;
2638	}
2639	if (inp_vflag & INP_IPV6) {
2640		db_printf("%sINP_IPV6", comma ? ", " : "");
2641		comma  = 1;
2642	}
2643	if (inp_vflag & INP_IPV6PROTO) {
2644		db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
2645		comma  = 1;
2646	}
2647}
2648
2649static void
2650db_print_inpcb(struct inpcb *inp, const char *name, int indent)
2651{
2652
2653	db_print_indent(indent);
2654	db_printf("%s at %p\n", name, inp);
2655
2656	indent += 2;
2657
2658	db_print_indent(indent);
2659	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
2660
2661	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
2662
2663	db_print_indent(indent);
2664	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
2665	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
2666
2667	db_print_indent(indent);
2668	db_printf("inp_label: %p   inp_flags: 0x%x (",
2669	   inp->inp_label, inp->inp_flags);
2670	db_print_inpflags(inp->inp_flags);
2671	db_printf(")\n");
2672
2673	db_print_indent(indent);
2674	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
2675	    inp->inp_vflag);
2676	db_print_inpvflag(inp->inp_vflag);
2677	db_printf(")\n");
2678
2679	db_print_indent(indent);
2680	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
2681	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
2682
2683	db_print_indent(indent);
2684#ifdef INET6
2685	if (inp->inp_vflag & INP_IPV6) {
2686		db_printf("in6p_options: %p   in6p_outputopts: %p   "
2687		    "in6p_moptions: %p\n", inp->in6p_options,
2688		    inp->in6p_outputopts, inp->in6p_moptions);
2689		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
2690		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
2691		    inp->in6p_hops);
2692	} else
2693#endif
2694	{
2695		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
2696		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
2697		    inp->inp_options, inp->inp_moptions);
2698	}
2699
2700	db_print_indent(indent);
2701	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
2702	    (uintmax_t)inp->inp_gencnt);
2703}
2704
2705DB_SHOW_COMMAND(inpcb, db_show_inpcb)
2706{
2707	struct inpcb *inp;
2708
2709	if (!have_addr) {
2710		db_printf("usage: show inpcb <addr>\n");
2711		return;
2712	}
2713	inp = (struct inpcb *)addr;
2714
2715	db_print_inpcb(inp, "inpcb", 0);
2716}
2717#endif /* DDB */
2718