1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1991, 1993, 1995
30 *	The Regents of the University of California.  All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *	This product includes software developed by the University of
43 *	California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.17 2001/08/13 16:26:17 ume Exp $
62 */
63
64#include <sys/param.h>
65#include <sys/systm.h>
66#include <sys/malloc.h>
67#include <sys/mbuf.h>
68#include <sys/domain.h>
69#include <sys/protosw.h>
70#include <sys/socket.h>
71#include <sys/socketvar.h>
72#include <sys/proc.h>
73#ifndef __APPLE__
74#include <sys/jail.h>
75#endif
76#include <sys/kernel.h>
77#include <sys/sysctl.h>
78#include <sys/mcache.h>
79#include <sys/kauth.h>
80#include <sys/priv.h>
81#include <libkern/OSAtomic.h>
82#include <kern/locks.h>
83
84#include <machine/limits.h>
85
86#ifdef __APPLE__
87#include <kern/zalloc.h>
88#endif
89
90#include <net/if.h>
91#include <net/if_types.h>
92#include <net/route.h>
93#include <net/flowhash.h>
94#include <net/flowadv.h>
95
96#include <netinet/in.h>
97#include <netinet/in_pcb.h>
98#include <netinet/in_var.h>
99#include <netinet/ip_var.h>
100#if INET6
101#include <netinet/ip6.h>
102#include <netinet6/ip6_var.h>
103#endif /* INET6 */
104
105#if IPSEC
106#include <netinet6/ipsec.h>
107#include <netkey/key.h>
108#endif /* IPSEC */
109
110#include <sys/kdebug.h>
111#include <sys/random.h>
112#include <dev/random/randomdev.h>
113
114#if IPSEC
115extern int ipsec_bypass;
116#endif
117
118#define DBG_FNC_PCB_LOOKUP	NETDBG_CODE(DBG_NETTCP, (6 << 8))
119#define DBG_FNC_PCB_HLOOKUP	NETDBG_CODE(DBG_NETTCP, ((6 << 8) | 1))
120
struct	in_addr zeroin_addr;	/* convenient all-zeroes IPv4 address (INADDR_ANY) */

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 * Note: the "low" range is scanned downward (first > last); the
 * default and "high" ranges are scanned upward.
 */
int	ipport_lowfirstauto  = IPPORT_RESERVED - 1;	/* 1023 */
int	ipport_lowlastauto = IPPORT_RESERVEDSTART;	/* 600 */
#ifndef __APPLE__
int	ipport_firstauto = IPPORT_RESERVED;		/* 1024 */
int	ipport_lastauto  = IPPORT_USERRESERVED;		/* 5000 */
#else
/* Apple defaults to the IANA dynamic/ephemeral range */
int 	ipport_firstauto = IPPORT_HIFIRSTAUTO;      	/* 49152 */
int 	ipport_lastauto  = IPPORT_HILASTAUTO;       	/* 65535 */
#endif
int	ipport_hifirstauto = IPPORT_HIFIRSTAUTO;	/* 49152 */
int	ipport_hilastauto  = IPPORT_HILASTAUTO;		/* 65535 */
138
/*
 * Clamp (var) into the inclusive range [min, max].
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement and composes safely with unbraced if/else at call sites
 * (the bare if/else form risks dangling-else surprises).
 */
#define RANGECHK(var, min, max) do { \
	if ((var) < (min)) { (var) = (min); } \
	else if ((var) > (max)) { (var) = (max); } \
} while (0)
142
143static int
144sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
145{
146#pragma unused(arg1, arg2)
147	int error = sysctl_handle_int(oidp,
148		oidp->oid_arg1, oidp->oid_arg2, req);
149	if (!error) {
150		RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
151		RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
152		RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
153		RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
154		RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
155		RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
156	}
157	return error;
158}
159
160#undef RANGECHK
161
SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "IP Ports");

/*
 * Each port-range endpoint is exported under net.inet.ip.portrange;
 * writes go through sysctl_net_ipport_check so values are clamped.
 */
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
176
177extern int	udp_use_randomport;
178extern int	tcp_use_randomport;
179
/* Structs used for flowhash computation */
struct inp_flowhash_key_addr {
	/* One address, viewable as v4, v6, or raw 8/16/32-bit words */
	union {
		struct in_addr	v4;		/* IPv4 address */
		struct in6_addr v6;		/* IPv6 address */
		u_int8_t	addr8[16];	/* raw byte view */
		u_int16_t	addr16[8];	/* 16-bit word view */
		u_int32_t	addr32[4];	/* 32-bit word view */
	} infha;
};
190
/*
 * Input to the flowhash function: the full connection 5-tuple
 * (addresses, ports, protocol) plus address family and two random
 * salts so hash values are not predictable across boots.
 */
struct inp_flowhash_key {
	struct inp_flowhash_key_addr 	infh_laddr;	/* local address */
	struct inp_flowhash_key_addr	infh_faddr;	/* foreign address */
	u_int32_t			infh_lport;	/* local port */
	u_int32_t			infh_fport;	/* foreign port */
	u_int32_t			infh_af;	/* address family */
	u_int32_t			infh_proto;	/* IP protocol */
	u_int32_t			infh_rand1;	/* random salt #1 */
	u_int32_t			infh_rand2;	/* random salt #2 */
};
201
/* Flowhash seed; 0 means not yet initialized */
u_int32_t inp_hash_seed = 0;

/* Forward declaration of the RB-tree comparator (defined elsewhere in this file) */
static __inline int infc_cmp(const struct inp_fc_entry *,
    const struct inp_fc_entry *);

/* Lock group/attributes and mutex protecting the flow-control tree */
lck_grp_t *inp_lck_grp;
lck_grp_attr_t *inp_lck_grp_attr;
lck_attr_t *inp_lck_attr;
decl_lck_mtx_data(, inp_fc_lck);

/* Red-black tree of entries used to deliver flow advisories to sockets */
RB_HEAD(inp_fc_tree, inp_fc_entry) inp_fc_tree;
RB_PROTOTYPE(inp_fc_tree, inp_fc_entry, infc_link, infc_cmp);

RB_GENERATE(inp_fc_tree, inp_fc_entry, infc_link, infc_cmp);

/* Zone backing inp_fc_entry allocations */
static unsigned int inp_fcezone_size;
static struct zone *inp_fcezone;
#define INP_FCEZONE_NAME "inp_fcezone"
#define INP_FCEZONE_MAX 32
220
221/*
222 * in_pcb.c: manage the Protocol Control Blocks.
223 */
224
225/*
226 * Initialize data structures required to deliver
227 * flow advisories.
228 */
void
socket_flowadv_init(void)
{
	/* Set up the lock group, attributes, and the tree mutex */
	inp_lck_grp_attr = lck_grp_attr_alloc_init();
	inp_lck_grp = lck_grp_alloc_init("inp_lck_grp", inp_lck_grp_attr);

	inp_lck_attr = lck_attr_alloc_init();
	lck_mtx_init(&inp_fc_lck, inp_lck_grp, inp_lck_attr);

	RB_INIT(&inp_fc_tree);

	/* Round element size up to an 8-byte multiple for alignment */
	inp_fcezone_size = P2ROUNDUP(sizeof (struct inp_fc_entry),
	    sizeof (u_int64_t));
	inp_fcezone = zinit(inp_fcezone_size,
	    INP_FCEZONE_MAX * inp_fcezone_size, 0, INP_FCEZONE_NAME);
	if (inp_fcezone == NULL) {
		panic("%s: failed allocating %s", __func__,
		    INP_FCEZONE_NAME);
		/* NOTREACHED */
	}
	/* Zone may grow on demand; allocations not charged to callers */
	zone_change(inp_fcezone, Z_EXPAND, TRUE);
	zone_change(inp_fcezone, Z_CALLERACCT, FALSE);
}
252
253/*
254 * Allocate a PCB and associate it with the socket.
255 *
256 * Returns:	0			Success
257 *		ENOBUFS
258 *		ENOMEM
259 *	ipsec_init_policy:???		[IPSEC]
260 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, __unused struct proc *p)
{
	struct inpcb *inp;
	caddr_t		      temp;
#if IPSEC
#ifndef __APPLE__
	int error;
#endif
#endif
#if CONFIG_MACF_NET
	int mac_error;
#endif

	if (so->cached_in_sock_layer == 0) {
#if TEMPDEBUG
	    printf("PCBALLOC calling zalloc for socket %x\n", so);
#endif
	    /* Fresh pcb from the protocol's zone */
	    inp = (struct inpcb *) zalloc(pcbinfo->ipi_zone);
	    if (inp == NULL)
		 return (ENOBUFS);
	    bzero((caddr_t)inp, sizeof(*inp));
	}
	else {
#if TEMPDEBUG
	    printf("PCBALLOC reusing PCB for socket %x\n", so);
#endif
	    /* Reuse the pcb cached in the socket; preserve its saved ppcb
	     * pointer across the bzero. */
	    inp = (struct inpcb *)(void *)so->so_saved_pcb;
	    temp = inp->inp_saved_ppcb;
	    bzero((caddr_t) inp, sizeof(*inp));
	    inp->inp_saved_ppcb = temp;
	}

	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
#if CONFIG_MACF_NET
	/* Only a zone-allocated pcb is freed on MAC label failure;
	 * a cached pcb still belongs to the socket. */
	mac_error = mac_inpcb_label_init(inp, M_WAITOK);
	if (mac_error != 0) {
		if (so->cached_in_sock_layer == 0)
			zfree(pcbinfo->ipi_zone, inp);
		return (mac_error);
	}
	mac_inpcb_label_associate(so, inp);
#endif
	// make sure inp_stat is always 64bit aligned
	inp->inp_stat = (struct inp_stat*)P2ROUNDUP(inp->inp_stat_store, sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_stat - (uintptr_t)inp->inp_stat_store)
		+ sizeof(*inp->inp_stat) > sizeof(inp->inp_stat_store)) {
		panic("insufficient space to align inp_stat");
	}

	so->so_pcb = (caddr_t)inp;

	if (so->so_proto->pr_flags & PR_PCBLOCK) {
		/* Per-pcb mutex for protocols that lock at pcb granularity */
		lck_mtx_init(&inp->inpcb_mtx, pcbinfo->mtx_grp, pcbinfo->mtx_attr);
	}

#if IPSEC
#ifndef __APPLE__
	if (ipsec_bypass == 0) {
		error = ipsec_init_policy(so, &inp->inp_sp);
		if (error != 0) {
			zfree(pcbinfo->ipi_zone, inp);
			return error;
		}
	}
#endif
#endif /*IPSEC*/
#if INET6
	if (INP_SOCKAF(so) == AF_INET6 && !ip6_mapped_addr_on)
		inp->inp_flags |= IN6P_IPV6_V6ONLY;
#endif

#if INET6
	if (ip6_auto_flowlabel)
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
	/* Insert the new pcb on the global list under the info write lock */
	lck_rw_lock_exclusive(pcbinfo->mtx);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list);
	pcbinfo->ipi_count++;
	lck_rw_done(pcbinfo->mtx);
	return (0);
}
346
347
/*
 * in_pcblookup_local_and_cleanup does everything
 * in_pcblookup_local does but it checks for a socket
 * that's going away. Since we know that the lock is
 * held read+write when this function is called, we
 * can safely dispose of this socket like the slow
 * timer would usually do and return NULL. This is
 * great for bind.
 */
357struct inpcb*
358in_pcblookup_local_and_cleanup(
359	struct inpcbinfo *pcbinfo,
360	struct in_addr laddr,
361	u_int lport_arg,
362	int wild_okay)
363{
364	struct inpcb *inp;
365
366	/* Perform normal lookup */
367	inp = in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay);
368
369	/* Check if we found a match but it's waiting to be disposed */
370	if (inp && inp->inp_wantcnt == WNT_STOPUSING) {
371		struct socket *so = inp->inp_socket;
372
373		lck_mtx_lock(&inp->inpcb_mtx);
374
375		if (so->so_usecount == 0) {
376			if (inp->inp_state != INPCB_STATE_DEAD)
377				in_pcbdetach(inp);
378			in_pcbdispose(inp);
379			inp = NULL;
380		}
381		else {
382			lck_mtx_unlock(&inp->inpcb_mtx);
383		}
384	}
385
386	return inp;
387}
388
389#ifdef __APPLE_API_PRIVATE
390static void
391in_pcb_conflict_post_msg(u_int16_t port)
392{
393	/*
394	 * Radar 5523020 send a kernel event notification if a non-participating socket tries to bind
395	 * 		 the port a socket who has set SOF_NOTIFYCONFLICT owns.
396	 */
397	struct kev_msg        ev_msg;
398	struct kev_in_portinuse	in_portinuse;
399
400	bzero(&in_portinuse, sizeof(struct kev_in_portinuse));
401	bzero(&ev_msg, sizeof(struct kev_msg));
402	in_portinuse.port = ntohs(port);	/* port in host order */
403	in_portinuse.req_pid = proc_selfpid();
404	ev_msg.vendor_code = KEV_VENDOR_APPLE;
405	ev_msg.kev_class = KEV_NETWORK_CLASS;
406	ev_msg.kev_subclass = KEV_INET_SUBCLASS;
407	ev_msg.event_code = KEV_INET_PORTINUSE;
408	ev_msg.dv[0].data_ptr = &in_portinuse;
409	ev_msg.dv[0].data_length      = sizeof(struct kev_in_portinuse);
410	ev_msg.dv[1].data_length = 0;
411	kev_post_msg(&ev_msg);
412}
413#endif
414/*
415 * Returns:	0			Success
416 *		EADDRNOTAVAIL		Address not available.
417 *		EINVAL			Invalid argument
418 *		EAFNOSUPPORT		Address family not supported [notdef]
419 *		EACCES			Permission denied
420 *		EADDRINUSE		Address in use
421 *		EAGAIN			Resource unavailable, try again
422 *		priv_check_cred:EPERM	Operation not permitted
423 */
/*
 * Bind the pcb to the local address/port in "nam"; when nam is NULL or
 * its port is 0, an anonymous (ephemeral) port is chosen from the
 * configured range.  Caller holds the socket lock; this routine drops
 * it while taking the pcbinfo write lock to keep lock ordering.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
{
	struct socket *so = inp->inp_socket;
	unsigned short *lastport;
	struct sockaddr_in *sin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	u_short lport = 0, rand_port = 0;
	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error, randomport, conflict = 0;
	kauth_cred_t cred;

	/* No local addresses configured at all: nothing to bind to */
	if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */
		return (EADDRNOTAVAIL);
	/* Already bound (port or address); rebinding is not allowed */
	if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
		wild = 1;
	socket_unlock(so, 0); /* keep reference on socket */
	lck_rw_lock_exclusive(pcbinfo->mtx);
	if (nam) {
		struct ifnet *outif = NULL;

		sin = (struct sockaddr_in *)(void *)nam;
		if (nam->sa_len != sizeof (*sin)) {
			lck_rw_done(pcbinfo->mtx);
			socket_lock(so, 0);
			return (EINVAL);
		}
#ifdef notdef
		/*
		 * We should check the family, but old programs
		 * incorrectly fail to initialize it.
		 */
		if (sin->sin_family != AF_INET) {
			lck_rw_done(pcbinfo->mtx);
			socket_lock(so, 0);
			return (EAFNOSUPPORT);
		}
#endif
		lport = sin->sin_port;
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if (so->so_options & SO_REUSEADDR)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			/* Unicast: the requested address must be local */
			struct ifaddr *ifa;
			sin->sin_port = 0;		/* yech... */
			if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin)) == 0) {
				lck_rw_done(pcbinfo->mtx);
				socket_lock(so, 0);
				return (EADDRNOTAVAIL);
			}
			else {
				/* Remember the interface owning that address */
				IFA_LOCK(ifa);
				outif = ifa->ifa_ifp;
				IFA_UNLOCK(ifa);
				IFA_REMREF(ifa);
			}
		}
		if (lport) {
			struct inpcb *t;

			/* GROSS */
#if !CONFIG_EMBEDDED
			/* Binding below IPPORT_RESERVED requires privilege */
			if (ntohs(lport) < IPPORT_RESERVED) {
				cred = kauth_cred_proc_ref(p);
				error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
				kauth_cred_unref(&cred);
				if (error != 0) {
					lck_rw_done(pcbinfo->mtx);
					socket_lock(so, 0);
					return (EACCES);
				}
			}
#endif
			/*
			 * Non-root, non-multicast bind: refuse to take a port
			 * owned by a different uid unless that socket opted
			 * into sharing via SOF_REUSESHAREUID.
			 */
			if (kauth_cred_getuid(so->so_cred) &&
			    !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
				t = in_pcblookup_local_and_cleanup(inp->inp_pcbinfo,
				    sin->sin_addr, lport, INPLOOKUP_WILDCARD);
				if (t &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_socket->so_options &
					 SO_REUSEPORT) == 0) &&
				     (kauth_cred_getuid(so->so_cred) !=
					 kauth_cred_getuid(t->inp_socket->so_cred)) &&
				      ((t->inp_socket->so_flags & SOF_REUSESHAREUID) == 0) &&
				        (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
					 ntohl(t->inp_laddr.s_addr) != INADDR_ANY))
				{
#ifdef __APPLE_API_PRIVATE

						/* Notify the owner if it asked for conflict events */
						if ((t->inp_socket->so_flags & SOF_NOTIFYCONFLICT) && ((so->so_flags & SOF_NOTIFYCONFLICT) == 0))
							conflict = 1;

						lck_rw_done(pcbinfo->mtx);

						if (conflict)
							in_pcb_conflict_post_msg(lport);
#else
						lck_rw_done(pcbinfo->mtx);
#endif /* __APPLE_API_PRIVATE */

						socket_lock(so, 0);
						return (EADDRINUSE);
				}
			}
			/* General in-use check honoring SO_REUSEADDR/SO_REUSEPORT */
			t = in_pcblookup_local_and_cleanup(pcbinfo, sin->sin_addr,
			    lport, wild);
			if (t &&
			    (reuseport & t->inp_socket->so_options) == 0) {
#if INET6
				/* Two v6-only sockets on wildcard don't conflict here */
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    INP_SOCKAF(so) != AF_INET6 ||
				    INP_SOCKAF(t->inp_socket) != AF_INET6)
#endif /* INET6 */
				{
#ifdef __APPLE_API_PRIVATE

					if ((t->inp_socket->so_flags & SOF_NOTIFYCONFLICT) && ((so->so_flags & SOF_NOTIFYCONFLICT) == 0))
						conflict = 1;

					lck_rw_done(pcbinfo->mtx);

					if (conflict)
						in_pcb_conflict_post_msg(lport);
#else
					lck_rw_done(pcbinfo->mtx);
#endif /* __APPLE_API_PRIVATE */
					socket_lock(so, 0);
					return (EADDRINUSE);
				}
			}
		}
		inp->inp_laddr = sin->sin_addr;
		inp->inp_last_outifp = outif;
	}
	if (lport == 0) {
		/* Anonymous port: scan the configured range for a free one */
		u_short first, last;
		int count;

		/* Randomize the starting point if requested by the socket
		 * option or the per-protocol sysctls */
		randomport = (so->so_flags & SOF_BINDRANDOMPORT) ||
			(so->so_type == SOCK_STREAM ? tcp_use_randomport : udp_use_randomport);

		inp->inp_flags |= INP_ANONPORT;

		if (inp->inp_flags & INP_HIGHPORT) {
			first = ipport_hifirstauto;	/* sysctl */
			last  = ipport_hilastauto;
			lastport = &pcbinfo->lasthi;
		} else if (inp->inp_flags & INP_LOWPORT) {
			/* Low (reserved) ports require privilege */
			cred = kauth_cred_proc_ref(p);
			error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
			kauth_cred_unref(&cred);
			if (error != 0) {
				lck_rw_done(pcbinfo->mtx);
				socket_lock(so, 0);
				return error;
			}
			first = ipport_lowfirstauto;	/* 1023 */
			last  = ipport_lowlastauto;	/* 600 */
			lastport = &pcbinfo->lastlow;
		} else {
			first = ipport_firstauto;	/* sysctl */
			last  = ipport_lastauto;
			lastport = &pcbinfo->lastport;
		}
		/* No point in randomizing if only one port is available */

		if (first == last)
			randomport = 0;
		/*
		 * Simple check to ensure all ports are not used up causing
		 * a deadlock here.
		 *
		 * We split the two cases (up and down) so that the direction
		 * is not being tested on each round of the loop.
		 */
		if (first > last) {
			/*
			 * counting down
			 */
			if (randomport) {
				read_random(&rand_port, sizeof(rand_port));
				*lastport = first - (rand_port % (first - last));
			}
			count = first - last;

			do {
				if (count-- < 0) {	/* completely used? */
					lck_rw_done(pcbinfo->mtx);
					socket_lock(so, 0);
					/* Roll back any address chosen above */
					inp->inp_laddr.s_addr = INADDR_ANY;
					inp->inp_last_outifp = NULL;
					return (EADDRNOTAVAIL);
				}
				--*lastport;
				if (*lastport > first || *lastport < last)
					*lastport = first;
				lport = htons(*lastport);
			} while (in_pcblookup_local_and_cleanup(pcbinfo,
				 inp->inp_laddr, lport, wild));
		} else {
			/*
			 * counting up
			 */
			if (randomport) {
				read_random(&rand_port, sizeof(rand_port));
				/*
				 * NOTE(review): (first - last) is negative here;
				 * since C's % keeps the dividend's sign this is
				 * numerically equivalent to rand_port % (last -
				 * first), but the intent reads better the other
				 * way -- confirm before changing.
				 */
				*lastport = first + (rand_port % (first - last));
			}
			count = last - first;

			do {
				if (count-- < 0) {	/* completely used? */
					lck_rw_done(pcbinfo->mtx);
					socket_lock(so, 0);
					/* Roll back any address chosen above */
					inp->inp_laddr.s_addr = INADDR_ANY;
					inp->inp_last_outifp = NULL;
					return (EADDRNOTAVAIL);
				}
				++*lastport;
				if (*lastport < first || *lastport > last)
					*lastport = first;
				lport = htons(*lastport);
			} while (in_pcblookup_local_and_cleanup(pcbinfo,
				 inp->inp_laddr, lport, wild));
		}
	}
	socket_lock(so, 0);
	inp->inp_lport = lport;
	/* Insert into the pcb hash; on failure revert to unbound state */
	if (in_pcbinshash(inp, 1) != 0) {
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		inp->inp_last_outifp = NULL;
		lck_rw_done(pcbinfo->mtx);
		return (EAGAIN);
	}
	lck_rw_done(pcbinfo->mtx);
	sflt_notify(so, sock_evt_bound, NULL);
	return (0);
}
675
676/*
677 *   Transform old in_pcbconnect() into an inner subroutine for new
678 *   in_pcbconnect(): Do some validity-checking on the remote
679 *   address (in mbuf 'nam') and then determine local host address
680 *   (i.e., which interface) to use to access that remote host.
681 *
682 *   This preserves definition of in_pcbconnect(), while supporting a
683 *   slightly different version for T/TCP.  (This is more than
684 *   a bit of a kludge, but cleaning up the internal interfaces would
685 *   have forced minor changes in every protocol).
686 *
687 * Returns:	0			Success
688 *		EINVAL			Invalid argument
689 *		EAFNOSUPPORT		Address family not supported
690 *		EADDRNOTAVAIL		Address not available
691 */
int
in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
    struct sockaddr_in *plocal_sin, struct ifnet **outif)
{
	struct in_ifaddr *ia;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;

	/* Validate the remote address: well-formed, AF_INET, non-zero port */
	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);

	lck_rw_lock_shared(in_ifaddr_rwlock);
	if (!TAILQ_EMPTY(&in_ifaddrhead)) {
		ia = TAILQ_FIRST(&in_ifaddrhead);
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		IFA_LOCK_SPIN(&ia->ia_ifa);
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr = IA_SIN(ia)->sin_addr;
		else if (sin->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST &&
		    (ia->ia_ifp->if_flags & IFF_BROADCAST))
			sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr;
		IFA_UNLOCK(&ia->ia_ifa);
		ia = NULL;
	}
	lck_rw_done(in_ifaddr_rwlock);

	if (inp->inp_laddr.s_addr == INADDR_ANY) {
		struct route *ro;
		unsigned int ifscope = IFSCOPE_NONE;
		unsigned int nocell;
		/*
		 * If the socket is bound to a specific interface, an
		 * interface passed in by the caller (via *outif) takes
		 * precedence over the bound one.
		 */
		ia = (struct in_ifaddr *)0;

		if (outif != NULL && *outif != NULL)
			ifscope = (*outif)->if_index;
		else if (inp->inp_flags & INP_BOUND_IF)
			ifscope = inp->inp_boundifp->if_index;

		/* Honor a request to avoid cellular interfaces */
		nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
		/*
		 * If route is known or can be allocated now,
		 * our src addr is taken from the i/f, else punt.
		 * Note that we should check the address family of the cached
		 * destination, in case of sharing the cache with IPv6.
		 */
		ro = &inp->inp_route;
		if (ro->ro_rt != NULL)
			RT_LOCK_SPIN(ro->ro_rt);
		/* Discard a cached route that is stale, for the wrong
		 * destination/family, or unusable with SO_DONTROUTE */
		if (ro->ro_rt && (ro->ro_dst.sa_family != AF_INET ||
		    satosin(&ro->ro_dst)->sin_addr.s_addr !=
		    sin->sin_addr.s_addr ||
		    inp->inp_socket->so_options & SO_DONTROUTE ||
		    ro->ro_rt->generation_id != route_generation)) {
			RT_UNLOCK(ro->ro_rt);
			rtfree(ro->ro_rt);
			ro->ro_rt = NULL;
		}
		if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
		    (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL)) {
			if (ro->ro_rt != NULL)
				RT_UNLOCK(ro->ro_rt);
			/* No route yet, so try to acquire one */
			bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
			ro->ro_dst.sa_family = AF_INET;
			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
			((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr =
				sin->sin_addr;
			rtalloc_scoped(ro, ifscope);
			if (ro->ro_rt != NULL)
				RT_LOCK_SPIN(ro->ro_rt);
		}
		/*
		 * If the route points to a cellular interface and the
		 * caller forbids our using interfaces of such type,
		 * pretend that there is no route.
		 */
		if (nocell && ro->ro_rt != NULL) {
			RT_LOCK_ASSERT_HELD(ro->ro_rt);
			if (ro->ro_rt->rt_ifp->if_type == IFT_CELLULAR) {
				RT_UNLOCK(ro->ro_rt);
				rtfree(ro->ro_rt);
				ro->ro_rt = NULL;
				/* Let the socket know the interface was denied */
				soevent(inp->inp_socket,
				    (SO_FILT_HINT_LOCKED |
				    SO_FILT_HINT_IFDENIED));
			}
		}
		/*
		 * If we found a route, use the address
		 * corresponding to the outgoing interface
		 * unless it is the loopback (in case a route
		 * to our address on another net goes to loopback).
		 */
		if (ro->ro_rt != NULL) {
			/* Become a regular mutex */
			RT_CONVERT_LOCK(ro->ro_rt);
			if (!(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
				ia = ifatoia(ro->ro_rt->rt_ifa);
				if (ia) {
					IFA_ADDREF(&ia->ia_ifa);
				}
			}
			RT_UNLOCK(ro->ro_rt);
		}
		if (ia == 0) {
			/*
			 * No usable route: look for an interface directly --
			 * first by destination, then by network (scoped),
			 * finally falling back to the primary address.
			 */
			u_short fport = sin->sin_port;

			sin->sin_port = 0;
			ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
			if (ia == 0) {
				ia = ifatoia(ifa_ifwithnet_scoped(sintosa(sin),
				    ifscope));
			}
			sin->sin_port = fport;
			if (ia == 0) {
				lck_rw_lock_shared(in_ifaddr_rwlock);
				ia = TAILQ_FIRST(&in_ifaddrhead);
				if (ia)
					IFA_ADDREF(&ia->ia_ifa);
				lck_rw_done(in_ifaddr_rwlock);
			}
			/*
			 * If the source address belongs to a cellular interface
			 * and the socket forbids our using interfaces of such
			 * type, pretend that there is no source address.
			 */
			if (nocell && ia != NULL &&
			    ia->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR) {
				IFA_REMREF(&ia->ia_ifa);
				ia = NULL;
			    soevent(inp->inp_socket,
				(SO_FILT_HINT_LOCKED |
				SO_FILT_HINT_IFDENIED));
			}
			if (ia == 0)
				return (EADDRNOTAVAIL);
		}
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, use the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			IMO_LOCK(imo);
			if (imo->imo_multicast_ifp != NULL && (ia == NULL ||
				ia->ia_ifp != imo->imo_multicast_ifp)) {
				ifp = imo->imo_multicast_ifp;
				/* Trade our reference for one on the multicast ifp */
				if (ia)
					IFA_REMREF(&ia->ia_ifa);
				lck_rw_lock_shared(in_ifaddr_rwlock);
				TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp)
						break;
				}
				if (ia)
					IFA_ADDREF(&ia->ia_ifa);
				lck_rw_done(in_ifaddr_rwlock);
				if (ia == 0) {
					IMO_UNLOCK(imo);
					return (EADDRNOTAVAIL);
				}
			}
			IMO_UNLOCK(imo);
		}
		/*
		 * Don't do pcblookup call here; return interface in plocal_sin
		 * and exit to caller, that will do the lookup.
		 */
		IFA_LOCK_SPIN(&ia->ia_ifa);
		*plocal_sin = ia->ia_addr;
		if (outif != NULL)
			*outif = ia->ia_ifp;
		IFA_UNLOCK(&ia->ia_ifa);
		IFA_REMREF(&ia->ia_ifa);
	}
	return(0);
}
887
888/*
889 * Outer subroutine:
890 * Connect from a socket to a specified address.
891 * Both address and port must be specified in argument sin.
892 * If don't have a local address for this socket yet,
893 * then pick one.
894 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p,
    struct ifnet **outif)
{
	struct sockaddr_in ifaddr;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;
	struct inpcb *pcb;
	int error;

	/*
	 *   Call inner routine, to assign local interface address.
	 */
	if ((error = in_pcbladdr(inp, nam, &ifaddr, outif)) != 0)
		return(error);

	/*
	 * Make sure the prospective 4-tuple is not already in use.  The
	 * hash lookup takes the pcbinfo lock, so drop the socket lock
	 * around it to respect lock ordering.
	 */
	socket_unlock(inp->inp_socket, 0);
	pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
	    inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr.sin_addr,
	    inp->inp_lport, 0, NULL);
	socket_lock(inp->inp_socket, 0);

	/* Check if the socket is still in a valid state. When we unlock this
	 * embryonic socket, it can get aborted if another thread is closing
	 * the listener (radar 7947600).
	 */
	if ((inp->inp_socket->so_flags & SOF_ABORTED) != 0) {
		return ECONNREFUSED;
	}

	if (pcb != NULL) {
		/* 4-tuple taken; drop the lookup's reference and fail */
		in_pcb_checkstate(pcb, WNT_RELEASE, pcb == inp ? 1 : 0);
		return (EADDRINUSE);
	}
	if (inp->inp_laddr.s_addr == INADDR_ANY) {
		if (inp->inp_lport == 0) {
			/* No local port yet: grab an ephemeral one */
			error = in_pcbbind(inp, (struct sockaddr *)0, p);
			if (error)
			    return (error);
		}
		if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
			/*lock inversion issue, mostly with udp multicast packets */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
			socket_lock(inp->inp_socket, 0);
		}
		/* Commit the chosen local address/interface */
		inp->inp_laddr = ifaddr.sin_addr;
		inp->inp_last_outifp = (outif != NULL) ? *outif : NULL;
		inp->inp_flags |= INP_INADDR_ANY;
	}
	 else {
		if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
			/*lock inversion issue, mostly with udp multicast packets */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
			socket_lock(inp->inp_socket, 0);
		}
	}
	/* Record the foreign endpoint and rehash under the full 4-tuple */
	inp->inp_faddr = sin->sin_addr;
	inp->inp_fport = sin->sin_port;
	in_pcbrehash(inp);
	lck_rw_done(inp->inp_pcbinfo->mtx);
	return (0);
}
958
/*
 * Undo a connect: clear the foreign endpoint, rehash the pcb under the
 * wildcard foreign address, and detach the pcb if the socket no longer
 * has any file-descriptor references.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;

	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
		/*lock inversion issue, mostly with udp multicast packets */
		socket_unlock(inp->inp_socket, 0);
		lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
		socket_lock(inp->inp_socket, 0);
	}

	in_pcbrehash(inp);
	lck_rw_done(inp->inp_pcbinfo->mtx);

	/* Last fd reference already gone: tear the pcb down now */
	if (inp->inp_socket->so_state & SS_NOFDREF)
		in_pcbdetach(inp);
}
979
/*
 * Detach the pcb from its socket: mark it dead, release IP options,
 * the cached route, and multicast state, and drop the socket's last
 * reference.  SOF_PCBCLEARING guards against running the teardown
 * twice.
 */
void
in_pcbdetach(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	if (so->so_pcb == 0) { /* we've been called twice */
		panic("in_pcbdetach: inp=%p so=%p proto=%d so_pcb is null!\n",
			inp, so, so->so_proto->pr_protocol);
	}

#if IPSEC
	if (ipsec_bypass == 0) {
		ipsec4_delete_pcbpolicy(inp);
	}
#endif /*IPSEC*/

	/* mark socket state as dead */
	if (in_pcb_checkstate(inp, WNT_STOPUSING, 1) != WNT_STOPUSING)
		panic("in_pcbdetach so=%p prot=%x couldn't set to STOPUSING\n", so, so->so_proto->pr_protocol);

#if TEMPDEBUG
	if (so->cached_in_sock_layer)
	    printf("in_pcbdetach for cached socket %x flags=%x\n", so, so->so_flags);
	else
	    printf("in_pcbdetach for allocated socket %x flags=%x\n", so, so->so_flags);
#endif
	if ((so->so_flags & SOF_PCBCLEARING) == 0) {
		struct rtentry *rt;
		struct ip_moptions *imo;

		inp->inp_vflag = 0;
		/* Release IP options, the cached route, and multicast options */
		if (inp->inp_options)
			(void)m_free(inp->inp_options);
		if ((rt = inp->inp_route.ro_rt) != NULL) {
			inp->inp_route.ro_rt = NULL;
			rtfree(rt);
		}
		imo = inp->inp_moptions;
		inp->inp_moptions = NULL;
		if (imo != NULL)
			IMO_REMREF(imo);
		sofreelastref(so, 0);
		inp->inp_state = INPCB_STATE_DEAD;
		so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */
	}
}
1026
1027
/*
 * Final disposal of a dead PCB: unlink it from the global lists, sever
 * the inpcb<->socket association, and free both structures.  Caller must
 * hold the pcbinfo lock exclusively (asserted below), and the socket's
 * use count must have dropped to zero.
 */
void
in_pcbdispose(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;
	struct inpcbinfo *ipi = inp->inp_pcbinfo;

#if TEMPDEBUG
	if (inp->inp_state != INPCB_STATE_DEAD) {
		printf("in_pcbdispose: not dead yet? so=%p\n", so);
	}
#endif
	/* a non-zero use count means someone still references the socket */
	if (so && so->so_usecount != 0)
		panic("%s: so %p so_usecount %d so_lockhistory %s\n",
			__func__, so, so->so_usecount,
			(so != NULL) ? solockhistory_nr(so) : "--");

	lck_rw_assert(ipi->mtx, LCK_RW_ASSERT_EXCLUSIVE);

	inp->inp_gencnt = ++ipi->ipi_gencnt;
	/* access ipi in in_pcbremlists */
	in_pcbremlists(inp);

	if (so) {
		if (so->so_proto->pr_flags & PR_PCBLOCK) {
			sofreelastref(so, 0);
			/* release any data still queued on the buffers */
			if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
#if TEMPDEBUG
				printf("in_pcbdispose sb not cleaned up so=%p rc_cci=%x snd_cc=%x\n",
				       	so, so->so_rcv.sb_cc, so->so_snd.sb_cc);
#endif
				sbrelease(&so->so_rcv);
				sbrelease(&so->so_snd);
			}
			if (so->so_head != NULL)
				panic("in_pcbdispose, so=%p head still exist\n", so);
			/* per-PCB mutex dies with the PCB */
  			lck_mtx_unlock(&inp->inpcb_mtx);
  			lck_mtx_destroy(&inp->inpcb_mtx, ipi->mtx_grp);
		}
		so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */
		so->so_saved_pcb = (caddr_t) inp;
		/* sever the association in both directions */
		so->so_pcb = 0;
		inp->inp_socket = 0;
#if CONFIG_MACF_NET
		mac_inpcb_label_destroy(inp);
#endif
		/*
		 * In case there a route cached after a detach (possible
		 * in the tcp case), make sure that it is freed before
		 * we deallocate the structure.
		 */
		if (inp->inp_route.ro_rt != NULL) {
			rtfree(inp->inp_route.ro_rt);
			inp->inp_route.ro_rt = NULL;
		}
		/*
		 * Cached-in-socket-layer PCBs are embedded in the socket
		 * allocation and are freed along with it by sodealloc().
		 */
		if (so->cached_in_sock_layer == 0) {
			zfree(ipi->ipi_zone, inp);
		}
		sodealloc(so);
	}
#if TEMPDEBUG
	else
		printf("in_pcbdispose: no socket for inp=%p\n", inp);
#endif
}
1092
1093/*
1094 * The calling convention of in_setsockaddr() and in_setpeeraddr() was
1095 * modified to match the pru_sockaddr() and pru_peeraddr() entry points
1096 * in struct pr_usrreqs, so that protocols can just reference then directly
1097 * without the need for a wrapper function.  The socket must have a valid
1098 * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
1099 * except through a kernel programming error, so it is acceptable to panic
1100 * (or in this case trap) if the PCB is invalid.  (Actually, we don't trap
1101 * because there actually /is/ a programming error somewhere... XXX)
1102 *
1103 * Returns:	0			Success
1104 *		ENOBUFS			No buffer space available
1105 *		ECONNRESET		Connection reset
1106 */
1107int
1108in_setsockaddr(struct socket *so, struct sockaddr **nam)
1109{
1110	struct inpcb *inp;
1111	struct sockaddr_in *sin;
1112
1113	/*
1114	 * Do the malloc first in case it blocks.
1115	 */
1116	MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK);
1117	if (sin == NULL)
1118		return ENOBUFS;
1119	bzero(sin, sizeof *sin);
1120	sin->sin_family = AF_INET;
1121	sin->sin_len = sizeof(*sin);
1122
1123	inp = sotoinpcb(so);
1124	if (!inp) {
1125		FREE(sin, M_SONAME);
1126		return ECONNRESET;
1127	}
1128	sin->sin_port = inp->inp_lport;
1129	sin->sin_addr = inp->inp_laddr;
1130
1131	*nam = (struct sockaddr *)sin;
1132	return 0;
1133}
1134
1135int
1136in_setpeeraddr(struct socket *so, struct sockaddr **nam)
1137{
1138	struct inpcb *inp;
1139	struct sockaddr_in *sin;
1140
1141	/*
1142	 * Do the malloc first in case it blocks.
1143	 */
1144	MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK);
1145	if (sin == NULL)
1146		return ENOBUFS;
1147	bzero((caddr_t)sin, sizeof (*sin));
1148	sin->sin_family = AF_INET;
1149	sin->sin_len = sizeof(*sin);
1150
1151	inp = sotoinpcb(so);
1152	if (!inp) {
1153		FREE(sin, M_SONAME);
1154		return ECONNRESET;
1155	}
1156	sin->sin_port = inp->inp_fport;
1157	sin->sin_addr = inp->inp_faddr;
1158
1159	*nam = (struct sockaddr *)sin;
1160	return 0;
1161}
1162
/*
 * Invoke "notify" with "errno" on every IPv4 PCB whose foreign address
 * matches "faddr".  Used to propagate ICMP-style errors (e.g. host
 * unreachable) to all sockets connected to the affected destination.
 */
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr,
		int errno, void (*notify)(struct inpcb *, int))
{
	struct inpcb *inp;

	/* shared lock suffices: we only traverse the global PCB list */
	lck_rw_lock_shared(pcbinfo->mtx);

	LIST_FOREACH(inp, pcbinfo->listhead, inp_list) {
#if INET6
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr != faddr.s_addr ||
		    inp->inp_socket == NULL)
				continue;
		/* skip PCBs already being torn down */
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
			continue;
		/* hold the socket lock across the callback */
		socket_lock(inp->inp_socket, 1);
		(*notify)(inp, errno);
		(void)in_pcb_checkstate(inp, WNT_RELEASE, 1);
		socket_unlock(inp->inp_socket, 1);
	}
	lck_rw_done(pcbinfo->mtx);
}
1188
1189/*
1190 * Check for alternatives when higher level complains
1191 * about service problems.  For now, invalidate cached
1192 * routing information.  If the route was created dynamically
1193 * (by a redirect), time to try a default gateway again.
1194 */
void
in_losing(struct inpcb *inp)
{
	struct rtentry *rt;
	struct rt_addrinfo info;

	if ((rt = inp->inp_route.ro_rt) != NULL) {
		struct in_ifaddr *ia;

		/* announce the losing route to routing-socket listeners */
		bzero((caddr_t)&info, sizeof(info));
		RT_LOCK(rt);
		info.rti_info[RTAX_DST] =
			(struct sockaddr *)&inp->inp_route.ro_dst;
		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
		rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
		if (rt->rt_flags & RTF_DYNAMIC) {
			/*
			 * Prevent another thread from modifying rt_key,
			 * rt_gateway via rt_setgate() after rt_lock is
			 * dropped by marking the route as defunct.
			 */
			rt->rt_flags |= RTF_CONDEMNED;
			RT_UNLOCK(rt);
			/* redirect-created route: delete so the default
			 * gateway gets another chance */
			(void) rtrequest(RTM_DELETE, rt_key(rt),
				rt->rt_gateway, rt_mask(rt), rt->rt_flags,
				(struct rtentry **)0);
		} else {
			RT_UNLOCK(rt);
		}
		/* if the address is gone keep the old route in the pcb */
		if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
			/* local address still present: drop cached route */
			inp->inp_route.ro_rt = NULL;
			rtfree(rt);
			IFA_REMREF(&ia->ia_ifa);
		}
		/*
		 * A new route can be allocated
		 * the next time output is attempted.
		 */
	}
}
1237
1238/*
1239 * After a routing change, flush old routing
1240 * and allocate a (hopefully) better one.
1241 */
void
in_rtchange(struct inpcb *inp, __unused int errno)
{
	struct rtentry *rt;

	if ((rt = inp->inp_route.ro_rt) != NULL) {
		struct in_ifaddr *ia;

		/* only flush the cached route if the local address is
		 * still configured; otherwise keep using the old route */
		if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) == NULL) {
			return; /* we can't remove the route now. not sure if still ok to use src */
		}
		IFA_REMREF(&ia->ia_ifa);
		rtfree(rt);
		inp->inp_route.ro_rt = NULL;
		/*
		 * A new route can be allocated the next time
		 * output is attempted.
		 */
	}
}
1262
1263/*
1264 * Lookup a PCB based on the local address and port.
1265 */
1266struct inpcb *
1267in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1268		   unsigned int lport_arg, int wild_okay)
1269{
1270	struct inpcb *inp;
1271	int matchwild = 3, wildcard;
1272	u_short lport = lport_arg;
1273
1274	KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_START, 0,0,0,0,0);
1275
1276	if (!wild_okay) {
1277		struct inpcbhead *head;
1278		/*
1279		 * Look for an unconnected (wildcard foreign addr) PCB that
1280		 * matches the local address and port we're looking for.
1281		 */
1282		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
1283		LIST_FOREACH(inp, head, inp_hash) {
1284#if INET6
1285			if ((inp->inp_vflag & INP_IPV4) == 0)
1286				continue;
1287#endif
1288			if (inp->inp_faddr.s_addr == INADDR_ANY &&
1289			    inp->inp_laddr.s_addr == laddr.s_addr &&
1290			    inp->inp_lport == lport) {
1291				/*
1292				 * Found.
1293				 */
1294				return (inp);
1295			}
1296		}
1297		/*
1298		 * Not found.
1299		 */
1300		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, 0,0,0,0,0);
1301		return (NULL);
1302	} else {
1303		struct inpcbporthead *porthash;
1304		struct inpcbport *phd;
1305		struct inpcb *match = NULL;
1306		/*
1307		 * Best fit PCB lookup.
1308		 *
1309		 * First see if this local port is in use by looking on the
1310		 * port hash list.
1311		 */
1312		porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
1313		    pcbinfo->porthashmask)];
1314		LIST_FOREACH(phd, porthash, phd_hash) {
1315			if (phd->phd_port == lport)
1316				break;
1317		}
1318		if (phd != NULL) {
1319			/*
1320			 * Port is in use by one or more PCBs. Look for best
1321			 * fit.
1322			 */
1323			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1324				wildcard = 0;
1325#if INET6
1326				if ((inp->inp_vflag & INP_IPV4) == 0)
1327					continue;
1328#endif
1329				if (inp->inp_faddr.s_addr != INADDR_ANY)
1330					wildcard++;
1331				if (inp->inp_laddr.s_addr != INADDR_ANY) {
1332					if (laddr.s_addr == INADDR_ANY)
1333						wildcard++;
1334					else if (inp->inp_laddr.s_addr != laddr.s_addr)
1335						continue;
1336				} else {
1337					if (laddr.s_addr != INADDR_ANY)
1338						wildcard++;
1339				}
1340				if (wildcard < matchwild) {
1341					match = inp;
1342					matchwild = wildcard;
1343					if (matchwild == 0) {
1344						break;
1345					}
1346				}
1347			}
1348		}
1349		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, match,0,0,0,0);
1350		return (match);
1351	}
1352}
1353
1354/*
1355 * Check if PCB exists in hash list.
1356 */
/*
 * Check whether a PCB matching the given 4-tuple exists in the hash
 * list, without taking a reference on it.  On a match, returns nonzero
 * and fills in *uid/*gid from the owning socket's credentials (left at
 * UID_MAX/GID_MAX otherwise).  "wildcard" enables the second pass over
 * unconnected (INADDR_ANY foreign address) listeners.
 */
int
in_pcblookup_hash_exists(
	struct inpcbinfo *pcbinfo,
	struct in_addr faddr,
	u_int fport_arg,
	struct in_addr laddr,
	u_int lport_arg,
	int wildcard,
	uid_t *uid,
	gid_t *gid,
	struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp;
	u_short fport = fport_arg, lport = lport_arg;
	int found;

	*uid = UID_MAX;
	*gid = GID_MAX;

	/*
	 * We may have found the pcb in the last lookup - check this first.
	 */

	lck_rw_lock_shared(pcbinfo->mtx);

	/*
	 * First look for an exact match.
	 */
	head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
#if INET6
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		/* honor restricted-receive interfaces unless the PCB
		 * opted in to receiving on any interface */
		if (ip_restrictrecvif && ifp != NULL &&
		    (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
		    !(inp->inp_flags & INP_RECV_ANYIF))
			continue;

		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			if ((found = (inp->inp_socket != NULL))) {
				/*
				 * Found.
				 */
				*uid = kauth_cred_getuid(
				    inp->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
				    inp->inp_socket->so_cred);
			}
			lck_rw_done(pcbinfo->mtx);
			return (found);
		}
	}
	if (wildcard) {
		struct inpcb *local_wild = NULL;
#if INET6
		struct inpcb *local_wild_mapped = NULL;
#endif

		/* second pass: unconnected listeners hash on INADDR_ANY */
		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
		    pcbinfo->hashmask)];
		LIST_FOREACH(inp, head, inp_hash) {
#if INET6
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (ip_restrictrecvif && ifp != NULL &&
			    (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
			    !(inp->inp_flags & INP_RECV_ANYIF))
				continue;

			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_lport == lport) {
				if (inp->inp_laddr.s_addr == laddr.s_addr) {
					/* bound to our local address: best
					 * wildcard match, return now */
					if ((found = (inp->inp_socket != NULL))) {
						*uid = kauth_cred_getuid(
						    inp->inp_socket->so_cred);
						*gid = kauth_cred_getgid(
						    inp->inp_socket->so_cred);
					}
					lck_rw_done(pcbinfo->mtx);
					return (found);
				}
				else if (inp->inp_laddr.s_addr == INADDR_ANY) {
					/* fully wildcarded listener; remember
					 * it and keep scanning for better */
#if INET6
					if (inp->inp_socket &&
					    INP_CHECK_SOCKAF(inp->inp_socket,
					    AF_INET6))
						local_wild_mapped = inp;
					else
#endif /* INET6 */
					local_wild = inp;
				}
			}
		}
		/* prefer a native IPv4 wildcard over a v4-mapped v6 one */
		if (local_wild == NULL) {
#if INET6
			if (local_wild_mapped != NULL) {
				if ((found = (local_wild_mapped->inp_socket != NULL))) {
					*uid = kauth_cred_getuid(
					    local_wild_mapped->inp_socket->so_cred);
					*gid = kauth_cred_getgid(
					    local_wild_mapped->inp_socket->so_cred);
				}
				lck_rw_done(pcbinfo->mtx);
				return (found);
			}
#endif /* INET6 */
			lck_rw_done(pcbinfo->mtx);
			return (0);
		}
		if (local_wild != NULL) {
			if ((found = (local_wild->inp_socket != NULL))) {
				*uid = kauth_cred_getuid(
				    local_wild->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
				    local_wild->inp_socket->so_cred);
			}
			lck_rw_done(pcbinfo->mtx);
			return (found);
		}
	}

	/*
	 * Not found.
	 */
	lck_rw_done(pcbinfo->mtx);
	return (0);
}
1491
1492/*
1493 * Lookup PCB in hash list.
1494 */
1495struct inpcb *
1496in_pcblookup_hash(
1497	struct inpcbinfo *pcbinfo,
1498	struct in_addr faddr,
1499	u_int fport_arg,
1500	struct in_addr laddr,
1501	u_int lport_arg,
1502	int wildcard,
1503	struct ifnet *ifp)
1504{
1505	struct inpcbhead *head;
1506	struct inpcb *inp;
1507	u_short fport = fport_arg, lport = lport_arg;
1508
1509	/*
1510	 * We may have found the pcb in the last lookup - check this first.
1511	 */
1512
1513	lck_rw_lock_shared(pcbinfo->mtx);
1514
1515	/*
1516	 * First look for an exact match.
1517	 */
1518	head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)];
1519	LIST_FOREACH(inp, head, inp_hash) {
1520#if INET6
1521		if ((inp->inp_vflag & INP_IPV4) == 0)
1522			continue;
1523#endif
1524		if (ip_restrictrecvif && ifp != NULL &&
1525		    (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
1526		    !(inp->inp_flags & INP_RECV_ANYIF))
1527			continue;
1528
1529		if (inp->inp_faddr.s_addr == faddr.s_addr &&
1530		    inp->inp_laddr.s_addr == laddr.s_addr &&
1531		    inp->inp_fport == fport &&
1532		    inp->inp_lport == lport) {
1533			/*
1534			 * Found.
1535			 */
1536			if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1537				lck_rw_done(pcbinfo->mtx);
1538				return (inp);
1539			}
1540			else {	/* it's there but dead, say it isn't found */
1541				lck_rw_done(pcbinfo->mtx);
1542				return (NULL);
1543			}
1544		}
1545	}
1546	if (wildcard) {
1547		struct inpcb *local_wild = NULL;
1548#if INET6
1549		struct inpcb *local_wild_mapped = NULL;
1550#endif
1551
1552		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
1553		LIST_FOREACH(inp, head, inp_hash) {
1554#if INET6
1555			if ((inp->inp_vflag & INP_IPV4) == 0)
1556				continue;
1557#endif
1558			if (ip_restrictrecvif && ifp != NULL &&
1559			    (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
1560			    !(inp->inp_flags & INP_RECV_ANYIF))
1561				continue;
1562
1563			if (inp->inp_faddr.s_addr == INADDR_ANY &&
1564			    inp->inp_lport == lport) {
1565				if (inp->inp_laddr.s_addr == laddr.s_addr) {
1566					if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1567						lck_rw_done(pcbinfo->mtx);
1568						return (inp);
1569					}
1570					else {	/* it's there but dead, say it isn't found */
1571						lck_rw_done(pcbinfo->mtx);
1572						return (NULL);
1573					}
1574				}
1575				else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1576#if INET6
1577					if (INP_CHECK_SOCKAF(inp->inp_socket,
1578							     AF_INET6))
1579						local_wild_mapped = inp;
1580					else
1581#endif /* INET6 */
1582					local_wild = inp;
1583				}
1584			}
1585		}
1586		if (local_wild == NULL) {
1587#if INET6
1588			if (local_wild_mapped != NULL) {
1589				if (in_pcb_checkstate(local_wild_mapped, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1590					lck_rw_done(pcbinfo->mtx);
1591					return (local_wild_mapped);
1592				}
1593				else {	/* it's there but dead, say it isn't found */
1594					lck_rw_done(pcbinfo->mtx);
1595					return (NULL);
1596				}
1597			}
1598#endif /* INET6 */
1599			lck_rw_done(pcbinfo->mtx);
1600			return (NULL);
1601		}
1602		if (in_pcb_checkstate(local_wild, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1603			lck_rw_done(pcbinfo->mtx);
1604			return (local_wild);
1605		}
1606		else {	/* it's there but dead, say it isn't found */
1607			lck_rw_done(pcbinfo->mtx);
1608			return (NULL);
1609		}
1610	}
1611
1612	/*
1613	 * Not found.
1614	 */
1615	lck_rw_done(pcbinfo->mtx);
1616	return (NULL);
1617}
1618
1619/*
1620 * Insert PCB onto various hash lists.
1621 */
1622int
1623in_pcbinshash(struct inpcb *inp, int locked)
1624{
1625	struct inpcbhead *pcbhash;
1626	struct inpcbporthead *pcbporthash;
1627	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1628	struct inpcbport *phd;
1629	u_int32_t hashkey_faddr;
1630
1631        if (!locked) {
1632                if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
1633                	/*lock inversion issue, mostly with udp multicast packets */
1634                        socket_unlock(inp->inp_socket, 0);
1635                        lck_rw_lock_exclusive(pcbinfo->mtx);
1636                        socket_lock(inp->inp_socket, 0);
1637			if (inp->inp_state == INPCB_STATE_DEAD) {
1638				/* The socket got dropped when it was unlocked */
1639				lck_rw_done(pcbinfo->mtx);
1640				return(ECONNABORTED);
1641			}
1642                }
1643        }
1644
1645#if INET6
1646	if (inp->inp_vflag & INP_IPV6)
1647		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1648	else
1649#endif /* INET6 */
1650	hashkey_faddr = inp->inp_faddr.s_addr;
1651
1652	inp->hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->hashmask);
1653
1654	pcbhash = &pcbinfo->hashbase[inp->hash_element];
1655
1656	pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport,
1657	    pcbinfo->porthashmask)];
1658
1659	/*
1660	 * Go through port list and look for a head for this lport.
1661	 */
1662	LIST_FOREACH(phd, pcbporthash, phd_hash) {
1663		if (phd->phd_port == inp->inp_lport)
1664			break;
1665	}
1666
1667	VERIFY(inp->inp_state != INPCB_STATE_DEAD);
1668
1669	/*
1670	 * If none exists, malloc one and tack it on.
1671	 */
1672	if (phd == NULL) {
1673		MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_WAITOK);
1674		if (phd == NULL) {
1675			if (!locked)
1676				lck_rw_done(pcbinfo->mtx);
1677			return (ENOBUFS); /* XXX */
1678		}
1679		phd->phd_port = inp->inp_lport;
1680		LIST_INIT(&phd->phd_pcblist);
1681		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1682	}
1683	inp->inp_phd = phd;
1684	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1685	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
1686	if (!locked)
1687		lck_rw_done(pcbinfo->mtx);
1688	return (0);
1689}
1690
1691/*
1692 * Move PCB to the proper hash bucket when { faddr, fport } have  been
1693 * changed. NOTE: This does not handle the case of the lport changing (the
1694 * hashed port list would have to be updated as well), so the lport must
1695 * not change after in_pcbinshash() has been called.
1696 */
1697void
1698in_pcbrehash(struct inpcb *inp)
1699{
1700	struct inpcbhead *head;
1701	u_int32_t hashkey_faddr;
1702
1703#if INET6
1704	if (inp->inp_vflag & INP_IPV6)
1705		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1706	else
1707#endif /* INET6 */
1708	hashkey_faddr = inp->inp_faddr.s_addr;
1709	inp->hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
1710				inp->inp_fport, inp->inp_pcbinfo->hashmask);
1711	head = &inp->inp_pcbinfo->hashbase[inp->hash_element];
1712
1713	LIST_REMOVE(inp, inp_hash);
1714	LIST_INSERT_HEAD(head, inp, inp_hash);
1715}
1716
1717/*
1718 * Remove PCB from various lists.
1719 * Must be called pcbinfo lock is held in exclusive mode.
1720 */
1721void
1722in_pcbremlists(struct inpcb *inp)
1723{
1724	struct inp_fc_entry *infce;
1725	inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;
1726
1727	if (inp->inp_lport) {
1728		struct inpcbport *phd = inp->inp_phd;
1729
1730		LIST_REMOVE(inp, inp_hash);
1731		LIST_REMOVE(inp, inp_portlist);
1732		if (phd != NULL && (LIST_FIRST(&phd->phd_pcblist) == NULL)) {
1733			LIST_REMOVE(phd, phd_hash);
1734			FREE(phd, M_PCB);
1735		}
1736	}
1737	LIST_REMOVE(inp, inp_list);
1738
1739	infce = inp_fc_getinp(inp->inp_flowhash);
1740	if (infce != NULL)
1741		inp_fc_entry_free(infce);
1742
1743	inp->inp_pcbinfo->ipi_count--;
1744}
1745
/*
 * Mechanism used to defer the memory release of PCBs.
 * The pcb list will contain the pcb until the ripper can clean it up if
 * the following conditions are met: 1) state "DEAD", 2) wantcnt is
 * STOPUSING, 3) usecount is zero.
 * This function is called to either mark the pcb as unusable
 * (WNT_STOPUSING), take a want-count reference (WNT_ACQUIRE), or drop
 * one (WNT_RELEASE).  The want count lives in the low 16 bits of
 * inp_wantcnt and is manipulated lock-free with compare-and-swap; the
 * value 0xffff is the STOPUSING sentinel.  The return value reports
 * the resulting state.
 */
int
in_pcb_checkstate(struct inpcb *pcb, int mode, int locked)
{

	volatile UInt32 *wantcnt	= (volatile UInt32 *)&pcb->inp_wantcnt;
	UInt32 origwant;
	UInt32 newwant;

	switch (mode) {

		case WNT_STOPUSING:	/* try to mark the pcb as ready for recycling */

			/* compareswap with STOPUSING, if success we're good, if it's in use, will be marked later */

			if (locked == 0)
				socket_lock(pcb->inp_socket, 1);
			pcb->inp_state = INPCB_STATE_DEAD;

stopusing:
			if (pcb->inp_socket->so_usecount < 0)
				panic("in_pcb_checkstate STOP pcb=%p so=%p usecount is negative\n", pcb, pcb->inp_socket);
			if (locked == 0)
				socket_unlock(pcb->inp_socket, 1);

			origwant = *wantcnt;
        		if ((UInt16) origwant == 0xffff ) /* should stop using */
				return (WNT_STOPUSING);
			newwant = 0xffff;
			/* only when no one holds a want-count reference can
			 * we swap in the STOPUSING sentinel; otherwise the
			 * last WNT_RELEASE will do it via the goto above */
			if ((UInt16) origwant == 0) {/* try to mark it as unusable now */
    				OSCompareAndSwap(origwant, newwant, wantcnt) ;
			}
			return (WNT_STOPUSING);
			break;

		case WNT_ACQUIRE:	/* try to increase reference to pcb */
					/* if WNT_STOPUSING should bail out */
			/*
			 * if socket state DEAD, try to set count to STOPUSING, return failed
			 * otherwise increase cnt
			 */
			do {
				origwant = *wantcnt;
        			if ((UInt16) origwant == 0xffff ) {/* should stop using */
//					printf("in_pcb_checkstate: ACQ PCB was STOPUSING while release. odd pcb=%p\n", pcb);
					return (WNT_STOPUSING);
				}
				newwant = origwant + 1;
			} while (!OSCompareAndSwap(origwant, newwant, wantcnt));
			return (WNT_ACQUIRE);
			break;

		case WNT_RELEASE:	/* release reference. if result is null and pcb state is DEAD,
					   set wanted bit to STOPUSING
					 */

			if (locked == 0)
				socket_lock(pcb->inp_socket, 1);

			do {
				origwant = *wantcnt;
        			if ((UInt16) origwant == 0x0 )
					panic("in_pcb_checkstate pcb=%p release with zero count", pcb);
        			if ((UInt16) origwant == 0xffff ) {/* should stop using */
#if TEMPDEBUG
					printf("in_pcb_checkstate: REL PCB was STOPUSING while release. odd pcb=%p\n", pcb);
#endif
					if (locked == 0)
						socket_unlock(pcb->inp_socket, 1);
					return (WNT_STOPUSING);
				}
				newwant = origwant - 1;
			} while (!OSCompareAndSwap(origwant, newwant, wantcnt));

			/* if the PCB died while we held a reference, try to
			 * transition the count to STOPUSING now */
			if (pcb->inp_state == INPCB_STATE_DEAD)
				goto stopusing;
			if (pcb->inp_socket->so_usecount < 0)
				panic("in_pcb_checkstate RELEASE pcb=%p so=%p usecount is negative\n", pcb, pcb->inp_socket);

			if (locked == 0)
				socket_unlock(pcb->inp_socket, 1);
			return (WNT_RELEASE);
			break;

		default:

			panic("in_pcb_checkstate: so=%p not a valid state =%x\n", pcb->inp_socket, mode);
	}

	/* NOTREACHED */
	return (mode);
}
1843
1844/*
1845 * inpcb_to_compat copies specific bits of an inpcb to a inpcb_compat.
1846 * The inpcb_compat data structure is passed to user space and must
1847 * not change. We intentionally avoid copying pointers.
1848 */
1849void
1850inpcb_to_compat(
1851	struct inpcb *inp,
1852	struct inpcb_compat *inp_compat)
1853{
1854	bzero(inp_compat, sizeof(*inp_compat));
1855	inp_compat->inp_fport = inp->inp_fport;
1856	inp_compat->inp_lport = inp->inp_lport;
1857	inp_compat->nat_owner = 0;
1858	inp_compat->nat_cookie = inp->nat_cookie;
1859	inp_compat->inp_gencnt = inp->inp_gencnt;
1860	inp_compat->inp_flags = inp->inp_flags;
1861	inp_compat->inp_flow = inp->inp_flow;
1862	inp_compat->inp_vflag = inp->inp_vflag;
1863	inp_compat->inp_ip_ttl = inp->inp_ip_ttl;
1864	inp_compat->inp_ip_p = inp->inp_ip_p;
1865	inp_compat->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
1866	inp_compat->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
1867	inp_compat->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
1868	inp_compat->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim;
1869	inp_compat->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
1870	inp_compat->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex;
1871	inp_compat->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
1872}
1873
1874#if !CONFIG_EMBEDDED
1875
1876void
1877inpcb_to_xinpcb64(
1878        struct inpcb *inp,
1879        struct xinpcb64 *xinp)
1880{
1881	xinp->inp_fport = inp->inp_fport;
1882	xinp->inp_lport = inp->inp_lport;
1883	xinp->inp_gencnt = inp->inp_gencnt;
1884	xinp->inp_flags = inp->inp_flags;
1885	xinp->inp_flow = inp->inp_flow;
1886	xinp->inp_vflag = inp->inp_vflag;
1887	xinp->inp_ip_ttl = inp->inp_ip_ttl;
1888	xinp->inp_ip_p = inp->inp_ip_p;
1889	xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
1890	xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
1891	xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
1892	xinp->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim;
1893	xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
1894	xinp->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex;
1895	xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
1896}
1897
1898#endif /* !CONFIG_EMBEDDED */
1899
1900
1901/*
1902 * The following routines implement this scheme:
1903 *
1904 * Callers of ip_output() that intend to cache the route in the inpcb pass
1905 * a local copy of the struct route to ip_output().  Using a local copy of
1906 * the cached route significantly simplifies things as IP no longer has to
1907 * worry about having exclusive access to the passed in struct route, since
1908 * it's defined in the caller's stack; in essence, this allows for a lock-
1909 * less operation when updating the struct route at the IP level and below,
1910 * whenever necessary. The scheme works as follows:
1911 *
1912 * Prior to dropping the socket's lock and calling ip_output(), the caller
1913 * copies the struct route from the inpcb into its stack, and adds a reference
1914 * to the cached route entry, if there was any.  The socket's lock is then
1915 * dropped and ip_output() is called with a pointer to the copy of struct
1916 * route defined on the stack (not to the one in the inpcb.)
1917 *
1918 * Upon returning from ip_output(), the caller then acquires the socket's
1919 * lock and synchronizes the cache; if there is no route cached in the inpcb,
1920 * it copies the local copy of struct route (which may or may not contain any
1921 * route) back into the cache; otherwise, if the inpcb has a route cached in
1922 * it, the one in the local copy will be freed, if there's any.  Trashing the
1923 * cached route in the inpcb can be avoided because ip_output() is single-
1924 * threaded per-PCB (i.e. multiple transmits on a PCB are always serialized
1925 * by the socket/transport layer.)
1926 */
/*
 * Copy the PCB's cached route into "dst", a route defined on the
 * caller's stack, so ip_output() can work on it lock-free (see the
 * scheme described in the comment block above).  Caller must hold the
 * per-PCB mutex.
 */
void
inp_route_copyout(struct inpcb *inp, struct route *dst)
{
	struct route *src = &inp->inp_route;

	lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

	/*
	 * If the route in the PCB is not for IPv4, blow it away;
	 * this is possible in the case of IPv4-mapped address case.
	 */
	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
		rtfree(src->ro_rt);
		src->ro_rt = NULL;
	}

	/* NOTE(review): route_copyout() presumably takes a reference on
	 * src->ro_rt per the scheme above -- confirm in route.c */
	route_copyout(dst, src, sizeof(*dst));
}
1945
/*
 * Synchronize the PCB's cached route with "src", the caller-stack copy
 * previously handed to ip_output() (see the scheme described above).
 * Caller must hold the per-PCB mutex.
 */
void
inp_route_copyin(struct inpcb *inp, struct route *src)
{
	struct route *dst = &inp->inp_route;

	lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

	/* Minor sanity check: only IPv4 routes may be cached here */
	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET)
		panic("%s: wrong or corrupted route: %p", __func__, src);

	route_copyin(src, dst, sizeof(*src));
}
1959
1960/*
1961 * Handler for setting IP_FORCE_OUT_IFP/IP_BOUND_IF/IPV6_BOUND_IF socket option.
1962 */
1963int
1964inp_bindif(struct inpcb *inp, unsigned int ifscope)
1965{
1966	struct ifnet *ifp = NULL;
1967
1968	ifnet_head_lock_shared();
1969	if ((ifscope > (unsigned)if_index) || (ifscope != IFSCOPE_NONE &&
1970	    (ifp = ifindex2ifnet[ifscope]) == NULL)) {
1971		ifnet_head_done();
1972		return (ENXIO);
1973	}
1974	ifnet_head_done();
1975
1976	VERIFY(ifp != NULL || ifscope == IFSCOPE_NONE);
1977
1978	/*
1979	 * A zero interface scope value indicates an "unbind".
1980	 * Otherwise, take in whatever value the app desires;
1981	 * the app may already know the scope (or force itself
1982	 * to such a scope) ahead of time before the interface
1983	 * gets attached.  It doesn't matter either way; any
1984	 * route lookup from this point on will require an
1985	 * exact match for the embedded interface scope.
1986	 */
1987	inp->inp_boundifp = ifp;
1988	if (inp->inp_boundifp == NULL)
1989		inp->inp_flags &= ~INP_BOUND_IF;
1990	else
1991		inp->inp_flags |= INP_BOUND_IF;
1992
1993	/* Blow away any cached route in the PCB */
1994	if (inp->inp_route.ro_rt != NULL) {
1995		rtfree(inp->inp_route.ro_rt);
1996		inp->inp_route.ro_rt = NULL;
1997	}
1998
1999	return (0);
2000}
2001
2002/*
2003 * Handler for setting IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option.
2004 */
2005int
2006inp_nocellular(struct inpcb *inp, unsigned int val)
2007{
2008	if (val) {
2009		inp->inp_flags |= INP_NO_IFT_CELLULAR;
2010	} else if (inp->inp_flags & INP_NO_IFT_CELLULAR) {
2011		/* once set, it cannot be unset */
2012		return (EINVAL);
2013	}
2014
2015	/* Blow away any cached route in the PCB */
2016	if (inp->inp_route.ro_rt != NULL) {
2017		rtfree(inp->inp_route.ro_rt);
2018		inp->inp_route.ro_rt = NULL;
2019	}
2020
2021	return (0);
2022}
2023
2024/*
2025 * Calculate flow hash for an inp, used by an interface to identify a
2026 * flow. When an interface provides flow control advisory, this flow
2027 * hash is used as an identifier.
2028 */
2029u_int32_t
2030inp_calc_flowhash(struct inpcb *inp)
2031{
2032	struct inp_flowhash_key fh __attribute__((aligned(8)));
2033	u_int32_t flowhash = 0;
2034
2035	if (inp_hash_seed == 0)
2036		inp_hash_seed = RandomULong();
2037
2038	bzero(&fh, sizeof (fh));
2039
2040	bcopy(&inp->inp_dependladdr, &fh.infh_laddr, sizeof (fh.infh_laddr));
2041	bcopy(&inp->inp_dependfaddr, &fh.infh_faddr, sizeof (fh.infh_faddr));
2042
2043	fh.infh_lport = inp->inp_lport;
2044	fh.infh_fport = inp->inp_fport;
2045	fh.infh_af = (inp->inp_vflag & INP_IPV6) ? AF_INET6 : AF_INET;
2046	fh.infh_proto = inp->inp_ip_p;
2047	fh.infh_rand1 = RandomULong();
2048	fh.infh_rand2 = RandomULong();
2049
2050try_again:
2051	flowhash = net_flowhash(&fh, sizeof (fh), inp_hash_seed);
2052	if (flowhash == 0) {
2053		/* try to get a non-zero flowhash */
2054		inp_hash_seed = RandomULong();
2055		goto try_again;
2056	}
2057
2058	return flowhash;
2059}
2060
2061/*
2062 * Function to compare inp_fc_entries in inp flow control tree
2063 */
2064static inline int
2065infc_cmp(const struct inp_fc_entry *fc1, const struct inp_fc_entry *fc2)
2066{
2067	return (fc1->infc_flowhash - fc2->infc_flowhash);
2068}
2069
2070int
2071inp_fc_addinp(struct inpcb *inp)
2072{
2073	struct inp_fc_entry keyfc, *infc;
2074	u_int32_t flowhash = inp->inp_flowhash;
2075
2076	keyfc.infc_flowhash = flowhash;
2077
2078	lck_mtx_lock_spin(&inp_fc_lck);
2079	infc = RB_FIND(inp_fc_tree, &inp_fc_tree, &keyfc);
2080	if (infc != NULL && infc->infc_inp == inp) {
2081		/* Entry is already in inp_fc_tree, return */
2082		lck_mtx_unlock(&inp_fc_lck);
2083		return (1);
2084	}
2085
2086	if (infc != NULL) {
2087		/*
2088		 * There is a different fc entry with the same
2089		 * flow hash but different inp pointer. There
2090		 * can be a collision on flow hash but the
2091		 * probability is low. Let's just avoid
2092		 * adding a second one when there is a collision
2093		 */
2094		lck_mtx_unlock(&inp_fc_lck);
2095		return (0);
2096	}
2097
2098	/* become regular mutex */
2099	lck_mtx_convert_spin(&inp_fc_lck);
2100
2101	infc = zalloc_noblock(inp_fcezone);
2102	if (infc == NULL) {
2103		/* memory allocation failed */
2104		lck_mtx_unlock(&inp_fc_lck);
2105		return (0);
2106	}
2107	bzero(infc, sizeof (*infc));
2108
2109	infc->infc_flowhash = flowhash;
2110	infc->infc_inp = inp;
2111
2112	RB_INSERT(inp_fc_tree, &inp_fc_tree, infc);
2113	lck_mtx_unlock(&inp_fc_lck);
2114	return (1);
2115}
2116
2117struct inp_fc_entry*
2118inp_fc_getinp(u_int32_t flowhash)
2119{
2120	struct inp_fc_entry keyfc, *infc;
2121
2122	keyfc.infc_flowhash = flowhash;
2123
2124	lck_mtx_lock_spin(&inp_fc_lck);
2125	infc = RB_FIND(inp_fc_tree, &inp_fc_tree, &keyfc);
2126	if (infc == NULL) {
2127		/* inp is not present, return */
2128		lck_mtx_unlock(&inp_fc_lck);
2129		return (NULL);
2130	}
2131
2132	RB_REMOVE(inp_fc_tree, &inp_fc_tree, infc);
2133
2134	if (in_pcb_checkstate(infc->infc_inp, WNT_ACQUIRE, 0) ==
2135	    WNT_STOPUSING) {
2136		/* become regular mutex */
2137		lck_mtx_convert_spin(&inp_fc_lck);
2138
2139		/*
2140		 * This inp is going away, just don't process it.
2141		 */
2142		inp_fc_entry_free(infc);
2143		infc = NULL;
2144	}
2145	lck_mtx_unlock(&inp_fc_lck);
2146
2147	return (infc);
2148}
2149
2150void
2151inp_fc_entry_free(struct inp_fc_entry *infc)
2152{
2153	zfree(inp_fcezone, infc);
2154}
2155
2156void
2157inp_fc_feedback(struct inpcb *inp)
2158{
2159	struct socket *so = inp->inp_socket;
2160
2161	/* we already hold a want_cnt on this inp, socket can't be null */
2162	VERIFY (so != NULL);
2163	socket_lock(so, 1);
2164
2165	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2166		socket_unlock(so, 1);
2167		return;
2168	}
2169
2170	/*
2171	 * Return if the connection is not in flow-controlled state.
2172	 * This can happen if the connection experienced
2173	 * loss while it was in flow controlled state
2174	 */
2175	if (!INP_WAIT_FOR_IF_FEEDBACK(inp)) {
2176		socket_unlock(so, 1);
2177		return;
2178	}
2179	inp_reset_fc_state(inp);
2180
2181	if (so->so_proto->pr_type == SOCK_STREAM)
2182		inp_fc_unthrottle_tcp(inp);
2183
2184	socket_unlock(so, 1);
2185}
2186
2187void
2188inp_reset_fc_state(struct inpcb *inp)
2189{
2190	struct socket *so = inp->inp_socket;
2191	int suspended = (INP_IS_FLOW_SUSPENDED(inp)) ? 1 : 0;
2192	int needwakeup = (INP_WAIT_FOR_IF_FEEDBACK(inp)) ? 1 : 0;
2193
2194	inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
2195
2196	if (suspended) {
2197		so->so_flags &= ~(SOF_SUSPENDED);
2198		soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_RESUME));
2199	}
2200
2201	if (inp->inp_sndinprog_cnt > 0)
2202		inp->inp_flags |= INP_FC_FEEDBACK;
2203
2204	/* Give a write wakeup to unblock the socket */
2205	if (needwakeup)
2206		sowwakeup(so);
2207}
2208
2209int
2210inp_set_fc_state(struct inpcb *inp, int advcode)
2211{
2212	/*
2213	 * If there was a feedback from the interface when
2214	 * send operation was in progress, we should ignore
2215	 * this flow advisory to avoid a race between setting
2216	 * flow controlled state and receiving feedback from
2217	 * the interface
2218	 */
2219	if (inp->inp_flags & INP_FC_FEEDBACK)
2220		return(0);
2221
2222	inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
2223	if (inp_fc_addinp(inp)) {
2224		switch (advcode) {
2225		case FADV_FLOW_CONTROLLED:
2226			inp->inp_flags |= INP_FLOW_CONTROLLED;
2227			break;
2228		case FADV_SUSPENDED:
2229			inp->inp_flags |= INP_FLOW_SUSPENDED;
2230			soevent(inp->inp_socket,
2231			    (SO_FILT_HINT_LOCKED | SO_FILT_HINT_SUSPEND));
2232
2233			/* Record the fact that suspend event was sent */
2234			inp->inp_socket->so_flags |= SOF_SUSPENDED;
2235			break;
2236		}
2237	}
2238	return(1);
2239}
2240
2241/*
2242 * Handler for SO_FLUSH socket option.
2243 */
2244int
2245inp_flush(struct inpcb *inp, int optval)
2246{
2247	u_int32_t flowhash = inp->inp_flowhash;
2248	struct rtentry *rt;
2249
2250	/* Either all classes or one of the valid ones */
2251	if (optval != SO_TC_ALL && !SO_VALID_TC(optval))
2252		return (EINVAL);
2253
2254	/* We need a flow hash for identification */
2255	if (flowhash == 0)
2256		return (0);
2257
2258	/* We need a cached route for the interface */
2259	if ((rt = inp->inp_route.ro_rt) != NULL) {
2260		struct ifnet *ifp = rt->rt_ifp;
2261		if_qflush_sc(ifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
2262	}
2263
2264	return (0);
2265}
2266
2267/*
2268 * Clear the INP_INADDR_ANY flag (special case for PPP only)
2269 */
2270void inp_clear_INP_INADDR_ANY(struct socket *so)
2271{
2272	struct inpcb *inp = NULL;
2273
2274	socket_lock(so, 1);
2275	inp = sotoinpcb(so);
2276	if (inp) {
2277		inp->inp_flags &= ~INP_INADDR_ANY;
2278	}
2279	socket_unlock(so, 1);
2280}
2281
2282