1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*	$FreeBSD: src/sys/netinet6/in6_src.c,v 1.1.2.2 2001/07/03 11:01:52 ume Exp $	*/
30/*	$KAME: in6_src.c,v 1.37 2001/03/29 05:34:31 itojun Exp $	*/
31
32/*
33 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 3. Neither the name of the project nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 */
60
61/*
62 * Copyright (c) 1982, 1986, 1991, 1993
63 *	The Regents of the University of California.  All rights reserved.
64 *
65 * Redistribution and use in source and binary forms, with or without
66 * modification, are permitted provided that the following conditions
67 * are met:
68 * 1. Redistributions of source code must retain the above copyright
69 *    notice, this list of conditions and the following disclaimer.
70 * 2. Redistributions in binary form must reproduce the above copyright
71 *    notice, this list of conditions and the following disclaimer in the
72 *    documentation and/or other materials provided with the distribution.
73 * 3. All advertising materials mentioning features or use of this software
74 *    must display the following acknowledgement:
75 *	This product includes software developed by the University of
76 *	California, Berkeley and its contributors.
77 * 4. Neither the name of the University nor the names of its contributors
78 *    may be used to endorse or promote products derived from this software
79 *    without specific prior written permission.
80 *
81 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
82 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
83 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
84 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
85 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
86 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
87 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
88 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
89 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
90 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
91 * SUCH DAMAGE.
92 *
93 *	@(#)in_pcb.c	8.2 (Berkeley) 1/4/94
94 */
95
96
97#include <sys/param.h>
98#include <sys/systm.h>
99#include <sys/malloc.h>
100#include <sys/mbuf.h>
101#include <sys/protosw.h>
102#include <sys/socket.h>
103#include <sys/socketvar.h>
104#include <sys/errno.h>
105#include <sys/time.h>
106#include <sys/proc.h>
107#include <sys/sysctl.h>
108#include <sys/kauth.h>
109#include <sys/priv.h>
110#include <kern/lock.h>
111
112#include <net/if.h>
113#include <net/if_types.h>
114#include <net/route.h>
115
116#include <netinet/in.h>
117#include <netinet/in_var.h>
118#include <netinet/in_systm.h>
119#include <netinet/ip.h>
120#include <netinet/in_pcb.h>
121#include <netinet6/in6_var.h>
122#include <netinet/ip6.h>
123#include <netinet6/in6_pcb.h>
124#include <netinet6/ip6_var.h>
125#include <netinet6/scope6_var.h>
126#include <netinet6/nd6.h>
127
128#include <net/net_osdep.h>
129
130#include "loop.h"
131
132SYSCTL_DECL(_net_inet6_ip6);
133
134static int ip6_select_srcif_debug = 0;
135SYSCTL_INT(_net_inet6_ip6, OID_AUTO, select_srcif_debug,
136    CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_select_srcif_debug, 0,
137    "log source interface selection debug info");
138
139#define ADDR_LABEL_NOTAPP (-1)
140struct in6_addrpolicy defaultaddrpolicy;
141
142int ip6_prefer_tempaddr = 1;
143#ifdef ENABLE_ADDRSEL
144extern lck_mtx_t *addrsel_mutex;
145#define	ADDRSEL_LOCK()		lck_mtx_lock(addrsel_mutex)
146#define	ADDRSEL_UNLOCK()	lck_mtx_unlock(addrsel_mutex)
147#else
148#define	ADDRSEL_LOCK()
149#define	ADDRSEL_UNLOCK()
150#endif
151
152static int selectroute(struct sockaddr_in6 *, struct sockaddr_in6 *,
153	struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *,
154	struct ifnet **, struct rtentry **, int, int,
155	const struct ip6_out_args *ip6oa);
156static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
157	struct ip6_moptions *, struct route_in6 *ro,
158	const struct ip6_out_args *, struct ifnet **);
159static void init_policy_queue(void);
160static int add_addrsel_policyent(const struct in6_addrpolicy *);
161#ifdef ENABLE_ADDRSEL
162static int delete_addrsel_policyent(const struct in6_addrpolicy *);
163#endif
164static int walk_addrsel_policy(int (*)(const struct in6_addrpolicy *, void *),
165	void *);
166static int dump_addrsel_policyent(const struct in6_addrpolicy *, void *);
167static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
168void addrsel_policy_init(void);
169
170/*
171 * Return an IPv6 address, which is the most appropriate for a given
172 * destination and user specified options.
 * If necessary, this function looks up the routing table and returns
174 * an entry to the caller for later use.
175 */
/*
 * Helpers for the candidate loop in in6_selectsrc().  Each one records which
 * selection rule fired in ip6stat.ip6s_sources_rule[] and then jumps to the
 * matching label inside that loop:
 *
 *	REPLACE(r)	-> replace:	the candidate becomes the new best
 *	NEXTSRC(r)	-> next:	the candidate is discarded
 *	BREAK(r)	-> out:		the candidate wins and the scan stops
 *
 * A goto is used because 'continue' and 'break' cannot be issued from
 * inside the do { } while (0) wrapper.
 */
#define	IP6S_SRCRULE_NELEM \
	(sizeof (ip6stat.ip6s_sources_rule) / \
	    sizeof (ip6stat.ip6s_sources_rule[0]))

/* Bump the per-rule counter; rule numbers beyond the array are ignored. */
#define	IP6S_SRCRULE_COUNT(r) do {					\
	if ((r) < IP6S_SRCRULE_NELEM)					\
		ip6stat.ip6s_sources_rule[(r)]++;			\
} while (0)

#define	REPLACE(r) do {							\
	IP6S_SRCRULE_COUNT(r);						\
	goto replace;							\
} while (0)

#define	NEXTSRC(r) do {							\
	IP6S_SRCRULE_COUNT(r);						\
	goto next;							\
} while (0)

#define	BREAK(r) do {							\
	IP6S_SRCRULE_COUNT(r);						\
	goto out;							\
} while (0)
194
195/*
196 * Regardless of error, it will return an ifp with a reference held if the
197 * caller provides a non-NULL ifpp.  The caller is responsible for checking
198 * if the returned ifp is valid and release its reference at all times.
199 */
200struct in6_addr *
201in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
202    struct inpcb *inp, struct route_in6 *ro,
203    struct ifnet **ifpp, struct in6_addr *src_storage, unsigned int ifscope,
204    int *errorp)
205{
206	struct in6_addr dst;
207	struct ifnet *ifp = NULL;
208	struct in6_ifaddr *ia = NULL, *ia_best = NULL;
209	struct in6_pktinfo *pi = NULL;
210	int dst_scope = -1, best_scope = -1, best_matchlen = -1;
211	struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL;
212	u_int32_t odstzone;
213	int prefer_tempaddr;
214	struct ip6_moptions *mopts;
215	struct timeval timenow;
216	struct ip6_out_args ip6oa = { ifscope, { 0 }, IP6OAF_SELECT_SRCIF };
217	boolean_t islocal = FALSE;
218
219	getmicrotime(&timenow);
220
221	dst = dstsock->sin6_addr; /* make a copy for local operation */
222	*errorp = 0;
223	if (ifpp != NULL)
224		*ifpp = NULL;
225
226	if (inp != NULL) {
227		mopts = inp->in6p_moptions;
228		if (inp->inp_flags & INP_NO_IFT_CELLULAR)
229			ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
230	} else {
231		mopts = NULL;
232	}
233
234	if (ip6oa.ip6oa_boundif != IFSCOPE_NONE)
235		ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
236
237	/*
238	 * If the source address is explicitly specified by the caller,
239	 * check if the requested source address is indeed a unicast address
240	 * assigned to the node, and can be used as the packet's source
241	 * address.  If everything is okay, use the address as source.
242	 */
243	if (opts && (pi = opts->ip6po_pktinfo) &&
244	    !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
245		struct sockaddr_in6 srcsock;
246		struct in6_ifaddr *ia6;
247
248		/* get the outgoing interface */
249		if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ip6oa,
250		    &ifp)) != 0) {
251			src_storage = NULL;
252			goto done;
253		}
254
255		/*
256		 * determine the appropriate zone id of the source based on
257		 * the zone of the destination and the outgoing interface.
258		 * If the specified address is ambiguous wrt the scope zone,
259		 * the interface must be specified; otherwise, ifa_ifwithaddr()
260		 * will fail matching the address.
261		 */
262		bzero(&srcsock, sizeof(srcsock));
263		srcsock.sin6_family = AF_INET6;
264		srcsock.sin6_len = sizeof(srcsock);
265		srcsock.sin6_addr = pi->ipi6_addr;
266		if (ifp != NULL) {
267			*errorp = in6_setscope(&srcsock.sin6_addr, ifp, NULL);
268			if (*errorp != 0) {
269				src_storage = NULL;
270				goto done;
271			}
272		}
273		ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)
274		    (&srcsock));
275		if (ia6 == NULL) {
276			*errorp = EADDRNOTAVAIL;
277			src_storage = NULL;
278			goto done;
279		}
280		IFA_LOCK_SPIN(&ia6->ia_ifa);
281		if ((ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) ||
282		    ((ip6oa.ip6oa_flags & IP6OAF_NO_CELLULAR) &&
283		     (ia6->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR))) {
284			IFA_UNLOCK(&ia6->ia_ifa);
285			IFA_REMREF(&ia6->ia_ifa);
286			*errorp = EADDRNOTAVAIL;
287			src_storage = NULL;
288			goto done;
289		}
290
291		*src_storage = satosin6(&ia6->ia_addr)->sin6_addr;
292		IFA_UNLOCK(&ia6->ia_ifa);
293		IFA_REMREF(&ia6->ia_ifa);
294		goto done;
295	}
296
297	/*
298	 * Otherwise, if the socket has already bound the source, just use it.
299	 */
300	if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
301		src_storage = &inp->in6p_laddr;
302		goto done;
303	}
304
305	/*
306	 * If the address is not specified, choose the best one based on
307	 * the outgoing interface and the destination address.
308	 */
309
310	/* get the outgoing interface */
311	if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ip6oa,
312	    &ifp)) != 0) {
313		src_storage = NULL;
314		goto done;
315	}
316
317	*errorp = in6_setscope(&dst, ifp, &odstzone);
318	if (*errorp != 0) {
319		src_storage = NULL;
320		goto done;
321	}
322	lck_rw_lock_shared(&in6_ifaddr_rwlock);
323
324	for (ia = in6_ifaddrs; ia; ia = ia->ia_next) {
325		int new_scope = -1, new_matchlen = -1;
326		struct in6_addrpolicy *new_policy = NULL;
327		u_int32_t srczone, osrczone, dstzone;
328		struct in6_addr src;
329		struct ifnet *ifp1 = ia->ia_ifp;
330
331		IFA_LOCK(&ia->ia_ifa);
332		/*
333		 * We'll never take an address that breaks the scope zone
334		 * of the destination.  We also skip an address if its zone
335		 * does not contain the outgoing interface.
336		 * XXX: we should probably use sin6_scope_id here.
337		 */
338		if (in6_setscope(&dst, ifp1, &dstzone) ||
339		    odstzone != dstzone)
340			goto next;
341
342		src = ia->ia_addr.sin6_addr;
343		if (in6_setscope(&src, ifp, &osrczone) ||
344		    in6_setscope(&src, ifp1, &srczone) ||
345		    osrczone != srczone)
346			goto next;
347
348		/* avoid unusable addresses */
349		if ((ia->ia6_flags &
350		     (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED)))
351			goto next;
352
353		if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia))
354			goto next;
355
356		if (!nd6_optimistic_dad &&
357		     (ia->ia6_flags & IN6_IFF_OPTIMISTIC) != 0)
358			goto next;
359
360		/* Rule 1: Prefer same address */
361		if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr))
362			BREAK(1); /* there should be no better candidate */
363
364		if (ia_best == NULL)
365			REPLACE(0);
366
367		/* Rule 2: Prefer appropriate scope */
368		if (dst_scope < 0)
369			dst_scope = in6_addrscope(&dst);
370		new_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
371		if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) {
372			if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0)
373				REPLACE(2);
374			NEXTSRC(2);
375		} else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) {
376			if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0)
377				NEXTSRC(2);
378			REPLACE(2);
379		}
380
381		/*
382		 * Rule 3: Avoid deprecated addresses.  Note that the case of
383		 * !ip6_use_deprecated is already rejected above.
384		 */
385		if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia))
386			NEXTSRC(3);
387		if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia))
388			REPLACE(3);
389
390		/*
391		 * RFC 4429 says that optimistic addresses are equivalent to
392		 * deprecated addresses, so avoid them here.
393		 */
394		if ((ia_best->ia6_flags & IN6_IFF_OPTIMISTIC) == 0 &&
395		    (ia->ia6_flags & IN6_IFF_OPTIMISTIC) != 0)
396			NEXTSRC(3);
397		if ((ia_best->ia6_flags & IN6_IFF_OPTIMISTIC) != 0 &&
398		    (ia->ia6_flags & IN6_IFF_OPTIMISTIC) == 0)
399			REPLACE(3);
400
401		/* Rule 4: Prefer home addresses */
402		/*
403		 * XXX: This is a TODO.  We should probably merge the MIP6
404		 * case above.
405		 */
406
407		/* Rule 5: Prefer outgoing interface */
408		if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp)
409			NEXTSRC(5);
410		if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp)
411			REPLACE(5);
412
413		/*
414		 * Rule 6: Prefer matching label
415		 * Note that best_policy should be non-NULL here.
416		 */
417		if (dst_policy == NULL)
418			dst_policy = in6_addrsel_lookup_policy(dstsock);
419		if (dst_policy->label != ADDR_LABEL_NOTAPP) {
420			new_policy = in6_addrsel_lookup_policy(&ia->ia_addr);
421			if (dst_policy->label == best_policy->label &&
422			    dst_policy->label != new_policy->label)
423				NEXTSRC(6);
424			if (dst_policy->label != best_policy->label &&
425			    dst_policy->label == new_policy->label)
426				REPLACE(6);
427		}
428
429		/*
430		 * Rule 7: Prefer public addresses.
431		 * We allow users to reverse the logic by configuring
432		 * a sysctl variable, so that privacy conscious users can
433		 * always prefer temporary addresses.
434		 * Don't use temporary addresses for local destinations or
435		 * for multicast addresses unless we were passed in an option.
436		 */
437		if (IN6_IS_ADDR_MULTICAST(&dst) ||
438		    in6_matchlen(&ia_best->ia_addr.sin6_addr, &dst) >=
439		    in6_mask2len(&ia_best->ia_prefixmask.sin6_addr, NULL))
440			islocal = TRUE;
441		if (opts == NULL ||
442		    opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) {
443			prefer_tempaddr = islocal ? 0 : ip6_prefer_tempaddr;
444		} else if (opts->ip6po_prefer_tempaddr ==
445		    IP6PO_TEMPADDR_NOTPREFER) {
446			prefer_tempaddr = 0;
447		} else
448			prefer_tempaddr = 1;
449		if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
450		    (ia->ia6_flags & IN6_IFF_TEMPORARY)) {
451			if (prefer_tempaddr)
452				REPLACE(7);
453			else
454				NEXTSRC(7);
455		}
456		if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
457		    !(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
458			if (prefer_tempaddr)
459				NEXTSRC(7);
460			else
461				REPLACE(7);
462		}
463
464		/*
465		 * Rule 8: prefer addresses on alive interfaces.
466		 * This is a KAME specific rule.
467		 */
468		if ((ia_best->ia_ifp->if_flags & IFF_UP) &&
469		    !(ia->ia_ifp->if_flags & IFF_UP))
470			NEXTSRC(8);
471		if (!(ia_best->ia_ifp->if_flags & IFF_UP) &&
472		    (ia->ia_ifp->if_flags & IFF_UP))
473			REPLACE(8);
474
475		/*
476		 * Rule 14: Use longest matching prefix.
477		 * Note: in the address selection draft, this rule is
478		 * documented as "Rule 8".  However, since it is also
479		 * documented that this rule can be overridden, we assign
480		 * a large number so that it is easy to assign smaller numbers
481		 * to more preferred rules.
482		 */
483		new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst);
484		if (best_matchlen < new_matchlen)
485			REPLACE(14);
486		if (new_matchlen < best_matchlen)
487			NEXTSRC(14);
488
489		/* Rule 15 is reserved. */
490
491		/*
492		 * Last resort: just keep the current candidate.
493		 * Or, do we need more rules?
494		 */
495		IFA_UNLOCK(&ia->ia_ifa);
496		continue;
497
498replace:
499		best_scope = (new_scope >= 0 ? new_scope :
500			      in6_addrscope(&ia->ia_addr.sin6_addr));
501		best_policy = (new_policy ? new_policy :
502			       in6_addrsel_lookup_policy(&ia->ia_addr));
503		best_matchlen = (new_matchlen >= 0 ? new_matchlen :
504				 in6_matchlen(&ia->ia_addr.sin6_addr, &dst));
505		IFA_ADDREF_LOCKED(&ia->ia_ifa);	/* for ia_best */
506		IFA_UNLOCK(&ia->ia_ifa);
507		if (ia_best != NULL)
508			IFA_REMREF(&ia_best->ia_ifa);
509		ia_best = ia;
510		continue;
511
512next:
513		IFA_UNLOCK(&ia->ia_ifa);
514		continue;
515
516out:
517		IFA_ADDREF_LOCKED(&ia->ia_ifa);	/* for ia_best */
518		IFA_UNLOCK(&ia->ia_ifa);
519		if (ia_best != NULL)
520			IFA_REMREF(&ia_best->ia_ifa);
521		ia_best = ia;
522		break;
523	}
524
525	lck_rw_done(&in6_ifaddr_rwlock);
526
527	if (ia_best != NULL &&
528	    (ip6oa.ip6oa_flags & IP6OAF_NO_CELLULAR) &&
529	    ia_best->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR) {
530		IFA_REMREF(&ia_best->ia_ifa);
531		ia_best = NULL;
532	}
533
534	if ((ia = ia_best) == NULL) {
535		*errorp = EADDRNOTAVAIL;
536		src_storage = NULL;
537		goto done;
538	}
539
540	IFA_LOCK_SPIN(&ia->ia_ifa);
541	*src_storage = satosin6(&ia->ia_addr)->sin6_addr;
542	IFA_UNLOCK(&ia->ia_ifa);
543	IFA_REMREF(&ia->ia_ifa);
544done:
545	if (ifpp != NULL) {
546		/* if ifp is non-NULL, refcnt held in in6_selectif() */
547		*ifpp = ifp;
548	} else if (ifp != NULL) {
549		ifnet_release(ifp);
550	}
551	return (src_storage);
552}
553
554/*
555 * Given a source IPv6 address (and route, if available), determine the best
556 * interface to send the packet from.  Checking for (and updating) the
557 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
558 * without any locks, based on the assumption that in the event this is
559 * called from ip6_output(), the output operation is single-threaded per-pcb,
560 * i.e. for any given pcb there can only be one thread performing output at
561 * the IPv6 layer.
562 *
563 * This routine is analogous to in_selectsrcif() for IPv4.  Regardless of
564 * error, it will return an ifp with a reference held if the caller provides
565 * a non-NULL retifp.  The caller is responsible for checking if the
566 * returned ifp is valid and release its reference at all times.
567 *
568 * clone - meaningful only for bsdi and freebsd
569 */
570static int
571selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock,
572    struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro,
573    struct ifnet **retifp, struct rtentry **retrt, int clone,
574    int norouteok, const struct ip6_out_args *ip6oa)
575{
576	int error = 0;
577	struct ifnet *ifp = NULL, *ifp0 = NULL;
578	struct route_in6 *route = NULL;
579	struct sockaddr_in6 *sin6_next;
580	struct in6_pktinfo *pi = NULL;
581	struct in6_addr *dst = &dstsock->sin6_addr;
582	struct ifaddr *ifa = NULL;
583	char s_src[MAX_IPv6_STR_LEN], s_dst[MAX_IPv6_STR_LEN];
584	boolean_t select_srcif, proxied_ifa = FALSE;
585	unsigned int ifscope = ip6oa->ip6oa_boundif;
586
587#if 0
588	char ip6buf[INET6_ADDRSTRLEN];
589
590	if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
591	    dstsock->sin6_addr.s6_addr32[1] == 0 &&
592	    !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
593		printf("in6_selectroute: strange destination %s\n",
594		       ip6_sprintf(ip6buf, &dstsock->sin6_addr));
595	} else {
596		printf("in6_selectroute: destination = %s%%%d\n",
597		       ip6_sprintf(ip6buf, &dstsock->sin6_addr),
598		       dstsock->sin6_scope_id); /* for debug */
599	}
600#endif
601
602	if (retifp != NULL)
603		*retifp = NULL;
604
605	if (retrt != NULL)
606		*retrt = NULL;
607
608	if (ip6_select_srcif_debug) {
609		struct in6_addr src;
610		src = (srcsock != NULL) ? srcsock->sin6_addr : in6addr_any;
611		(void) inet_ntop(AF_INET6, &src, s_src, sizeof (s_src));
612		(void) inet_ntop(AF_INET6, dst, s_dst, sizeof (s_dst));
613	}
614
615	/*
616	 * If the destination address is UNSPECIFIED addr, bail out.
617	 */
618	if (IN6_IS_ADDR_UNSPECIFIED(dst)) {
619		error = EHOSTUNREACH;
620		goto done;
621	}
622
623	/*
624	 * Perform source interface selection only if Scoped Routing
625	 * is enabled and a source address that isn't unspecified.
626	 */
627	select_srcif = (ip6_doscopedroute && srcsock != NULL &&
628	    !IN6_IS_ADDR_UNSPECIFIED(&srcsock->sin6_addr));
629
630	/*
631	 * If Scoped Routing is disabled, ignore the given ifscope.
632	 * Otherwise even if source selection won't be performed,
633	 * we still obey IPV6_BOUND_IF.
634	 */
635	if (!ip6_doscopedroute && ifscope != IFSCOPE_NONE)
636		ifscope = IFSCOPE_NONE;
637
638	/* If the caller specified the outgoing interface explicitly, use it */
639	if (opts != NULL && (pi = opts->ip6po_pktinfo) != NULL &&
640	    pi->ipi6_ifindex != 0) {
641		/*
642		 * If IPV6_PKTINFO takes precedence over IPV6_BOUND_IF.
643		 */
644		ifscope = pi->ipi6_ifindex;
645		ifnet_head_lock_shared();
646		/* ifp may be NULL if detached or out of range */
647		ifp = ifp0 =
648		    ((ifscope <= if_index) ? ifindex2ifnet[ifscope] : NULL);
649		ifnet_head_done();
650		if (norouteok || retrt == NULL || IN6_IS_ADDR_MULTICAST(dst)) {
651			/*
652			 * We do not have to check or get the route for
653			 * multicast.  If the caller didn't ask/care for
654			 * the route and we have no interface to use,
655			 * it's an error.
656			 */
657			if (ifp == NULL)
658				error = EHOSTUNREACH;
659			goto done;
660		} else {
661			goto getsrcif;
662		}
663	}
664
665	/*
666	 * If the destination address is a multicast address and the outgoing
667	 * interface for the address is specified by the caller, use it.
668	 */
669	if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL) {
670		IM6O_LOCK(mopts);
671		if ((ifp = ifp0 = mopts->im6o_multicast_ifp) != NULL) {
672			IM6O_UNLOCK(mopts);
673			goto done; /* we do not need a route for multicast. */
674		}
675		IM6O_UNLOCK(mopts);
676	}
677
678getsrcif:
679	/*
680	 * If the outgoing interface was not set via IPV6_BOUND_IF or
681	 * IPV6_PKTINFO, use the scope ID in the destination address.
682	 */
683	if (ip6_doscopedroute && ifscope == IFSCOPE_NONE)
684		ifscope = dstsock->sin6_scope_id;
685
686	/*
687	 * Perform source interface selection; the source IPv6 address
688	 * must belong to one of the addresses of the interface used
689	 * by the route.  For performance reasons, do this only if
690	 * there is no route, or if the routing table has changed,
691	 * or if we haven't done source interface selection on this
692	 * route (for this PCB instance) before.
693	 */
694	if (!select_srcif || (ro != NULL && ro->ro_rt != NULL &&
695	    (ro->ro_rt->rt_flags & RTF_UP) &&
696	    ro->ro_rt->generation_id == route_generation &&
697	    (ro->ro_flags & ROF_SRCIF_SELECTED))) {
698		if (ro != NULL && ro->ro_rt != NULL) {
699			ifa = ro->ro_rt->rt_ifa;
700			IFA_ADDREF(ifa);
701		}
702		goto getroute;
703	}
704
705	/*
706	 * Given the source IPv6 address, find a suitable source interface
707	 * to use for transmission; if a scope ID has been specified,
708	 * optimize the search by looking at the addresses only for that
709	 * interface.  This is still suboptimal, however, as we need to
710	 * traverse the per-interface list.
711	 */
712	if (ifscope != IFSCOPE_NONE || (ro != NULL && ro->ro_rt != NULL)) {
713		unsigned int scope = ifscope;
714		struct ifnet *rt_ifp;
715
716		rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
717
718		/*
719		 * If no scope is specified and the route is stale (pointing
720		 * to a defunct interface) use the current primary interface;
721		 * this happens when switching between interfaces configured
722		 * with the same IPv6 address.  Otherwise pick up the scope
723		 * information from the route; the ULP may have looked up a
724		 * correct route and we just need to verify it here and mark
725		 * it with the ROF_SRCIF_SELECTED flag below.
726		 */
727		if (scope == IFSCOPE_NONE) {
728			scope = rt_ifp->if_index;
729			if (scope != get_primary_ifscope(AF_INET6) &&
730			    ro->ro_rt->generation_id != route_generation)
731				scope = get_primary_ifscope(AF_INET6);
732		}
733
734		ifa = (struct ifaddr *)
735		    ifa_foraddr6_scoped(&srcsock->sin6_addr, scope);
736
737		/*
738		 * If we are forwarding and proxying prefix(es), see if the
739		 * source address is one of ours and is a proxied address;
740		 * if so, use it.
741		 */
742		if (ifa == NULL && ip6_forwarding && nd6_prproxy) {
743			ifa = (struct ifaddr *)
744			    ifa_foraddr6(&srcsock->sin6_addr);
745			if (ifa != NULL && !(proxied_ifa =
746			    nd6_prproxy_ifaddr((struct in6_ifaddr *)ifa))) {
747				IFA_REMREF(ifa);
748				ifa = NULL;
749			}
750		}
751
752		if (ip6_select_srcif_debug && ifa != NULL) {
753			if (ro->ro_rt != NULL) {
754				printf("%s->%s ifscope %d->%d ifa_if %s "
755				    "ro_if %s\n", s_src, s_dst, ifscope,
756				    scope, if_name(ifa->ifa_ifp),
757				    if_name(rt_ifp));
758			} else {
759				printf("%s->%s ifscope %d->%d ifa_if %s\n",
760				    s_src, s_dst, ifscope, scope,
761				    if_name(ifa->ifa_ifp));
762			}
763		}
764	}
765
766	/*
767	 * Slow path; search for an interface having the corresponding source
768	 * IPv6 address if the scope was not specified by the caller, and:
769	 *
770	 *   1) There currently isn't any route, or,
771	 *   2) The interface used by the route does not own that source
772	 *	IPv6 address; in this case, the route will get blown away
773	 *	and we'll do a more specific scoped search using the newly
774	 *	found interface.
775	 */
776	if (ifa == NULL && ifscope == IFSCOPE_NONE) {
777		ifa = (struct ifaddr *)ifa_foraddr6(&srcsock->sin6_addr);
778
779		if (ip6_select_srcif_debug && ifa != NULL) {
780			printf("%s->%s ifscope %d ifa_if %s\n",
781			    s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
782		}
783
784	}
785
786getroute:
787	if (ifa != NULL && !proxied_ifa)
788		ifscope = ifa->ifa_ifp->if_index;
789
790	/*
791	 * If the next hop address for the packet is specified by the caller,
792	 * use it as the gateway.
793	 */
794	if (opts != NULL && opts->ip6po_nexthop != NULL) {
795		struct route_in6 *ron;
796
797		sin6_next = satosin6(opts->ip6po_nexthop);
798
799		/* at this moment, we only support AF_INET6 next hops */
800		if (sin6_next->sin6_family != AF_INET6) {
801			error = EAFNOSUPPORT; /* or should we proceed? */
802			goto done;
803		}
804
805		/*
806		 * If the next hop is an IPv6 address, then the node identified
807		 * by that address must be a neighbor of the sending host.
808		 */
809		ron = &opts->ip6po_nextroute;
810		if (ron->ro_rt != NULL)
811			RT_LOCK(ron->ro_rt);
812		if ((ron->ro_rt != NULL &&
813		    ((ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
814		    (RTF_UP | RTF_LLINFO) ||
815		    ron->ro_rt->generation_id != route_generation ||
816		    (select_srcif && (ifa == NULL ||
817		    (ifa->ifa_ifp != ron->ro_rt->rt_ifp && !proxied_ifa))))) ||
818		    !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr,
819		    &sin6_next->sin6_addr)) {
820			if (ron->ro_rt != NULL) {
821				RT_UNLOCK(ron->ro_rt);
822				rtfree(ron->ro_rt);
823				ron->ro_rt = NULL;
824			}
825			*satosin6(&ron->ro_dst) = *sin6_next;
826		}
827		if (ron->ro_rt == NULL) {
828			rtalloc_scoped((struct route *)ron, ifscope);
829			if (ron->ro_rt != NULL)
830				RT_LOCK(ron->ro_rt);
831			if (ron->ro_rt == NULL ||
832			    !(ron->ro_rt->rt_flags & RTF_LLINFO) ||
833			    !IN6_ARE_ADDR_EQUAL(&satosin6(rt_key(ron->ro_rt))->
834			    sin6_addr, &sin6_next->sin6_addr)) {
835				if (ron->ro_rt != NULL) {
836					RT_UNLOCK(ron->ro_rt);
837					rtfree(ron->ro_rt);
838					ron->ro_rt = NULL;
839				}
840				error = EHOSTUNREACH;
841				goto done;
842			}
843		}
844		route = ron;
845		ifp = ifp0 = ron->ro_rt->rt_ifp;
846
847		/*
848		 * When cloning is required, try to allocate a route to the
849		 * destination so that the caller can store path MTU
850		 * information.
851		 */
852		if (!clone) {
853			if (select_srcif) {
854				/* Keep the route locked */
855				goto validateroute;
856			}
857			RT_UNLOCK(ron->ro_rt);
858			goto done;
859		}
860		RT_UNLOCK(ron->ro_rt);
861	}
862
863	/*
864	 * Use a cached route if it exists and is valid, else try to allocate
865	 * a new one.  Note that we should check the address family of the
866	 * cached destination, in case of sharing the cache with IPv4.
867	 */
868	if (ro == NULL)
869		goto done;
870	if (ro->ro_rt != NULL)
871		RT_LOCK(ro->ro_rt);
872	if (ro->ro_rt != NULL && (!(ro->ro_rt->rt_flags & RTF_UP) ||
873	    satosin6(&ro->ro_dst)->sin6_family != AF_INET6 ||
874	    ro->ro_rt->generation_id != route_generation ||
875	    !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst) ||
876	    (select_srcif && (ifa == NULL ||
877	    (ifa->ifa_ifp != ro->ro_rt->rt_ifp && !proxied_ifa))))) {
878		RT_UNLOCK(ro->ro_rt);
879		rtfree(ro->ro_rt);
880		ro->ro_rt = NULL;
881	}
882	if (ro->ro_rt == NULL) {
883		struct sockaddr_in6 *sa6;
884
885		if (ro->ro_rt != NULL)
886			RT_UNLOCK(ro->ro_rt);
887		/* No route yet, so try to acquire one */
888		bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
889		sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
890		sa6->sin6_family = AF_INET6;
891		sa6->sin6_len = sizeof(struct sockaddr_in6);
892		sa6->sin6_addr = *dst;
893		if (IN6_IS_ADDR_MULTICAST(dst)) {
894			ro->ro_rt = rtalloc1_scoped(
895			    &((struct route *)ro)->ro_dst, 0, 0, ifscope);
896		} else {
897			rtalloc_scoped((struct route *)ro, ifscope);
898		}
899		if (ro->ro_rt != NULL)
900			RT_LOCK(ro->ro_rt);
901	}
902
903	/*
904	 * Do not care about the result if we have the nexthop
905	 * explicitly specified (in case we're asked to clone.)
906	 */
907	if (opts != NULL && opts->ip6po_nexthop != NULL) {
908		if (ro->ro_rt != NULL)
909			RT_UNLOCK(ro->ro_rt);
910		goto done;
911	}
912
913	if (ro->ro_rt != NULL) {
914		RT_LOCK_ASSERT_HELD(ro->ro_rt);
915		ifp = ifp0 = ro->ro_rt->rt_ifp;
916	} else {
917		error = EHOSTUNREACH;
918	}
919	route = ro;
920
921validateroute:
922	if (select_srcif) {
923		boolean_t has_route = (route != NULL && route->ro_rt != NULL);
924		boolean_t srcif_selected = FALSE;
925
926		if (has_route)
927			RT_LOCK_ASSERT_HELD(route->ro_rt);
928		/*
929		 * If there is a non-loopback route with the wrong interface,
930		 * or if there is no interface configured with such an address,
931		 * blow it away.  Except for local/loopback, we look for one
932		 * with a matching interface scope/index.
933		 */
934		if (has_route && (ifa == NULL ||
935		    (ifa->ifa_ifp != ifp && ifp != lo_ifp) ||
936		    !(route->ro_rt->rt_flags & RTF_UP))) {
937			/*
938			 * If the destination address belongs to a proxied
939			 * prefix, relax the requirement and allow the packet
940			 * to come out of the proxy interface with the source
941			 * address of the real interface.
942			 */
943			if (ifa != NULL && proxied_ifa &&
944			    (route->ro_rt->rt_flags & (RTF_UP|RTF_PROXY)) ==
945			    (RTF_UP|RTF_PROXY)) {
946				srcif_selected = TRUE;
947			} else {
948				if (ip6_select_srcif_debug) {
949					if (ifa != NULL) {
950						printf("%s->%s ifscope %d "
951						    "ro_if %s != ifa_if %s "
952						    "(cached route cleared)\n",
953						    s_src, s_dst,
954						    ifscope, if_name(ifp),
955						    if_name(ifa->ifa_ifp));
956					} else {
957						printf("%s->%s ifscope %d "
958						    "ro_if %s (no ifa_if "
959						    "found)\n", s_src, s_dst,
960						    ifscope, if_name(ifp));
961					}
962				}
963				RT_UNLOCK(route->ro_rt);
964				rtfree(route->ro_rt);
965				route->ro_rt = NULL;
966				route->ro_flags &= ~ROF_SRCIF_SELECTED;
967				error = EHOSTUNREACH;
968				/* Undo the settings done above */
969				route = NULL;
970				ifp = NULL;	/* ditch ifp; keep ifp0 */
971				has_route = FALSE;
972			}
973		} else if (has_route) {
974			srcif_selected = TRUE;
975		}
976
977		if (srcif_selected) {
978			VERIFY(has_route);
979			route->ro_flags |= ROF_SRCIF_SELECTED;
980			route->ro_rt->generation_id = route_generation;
981			RT_UNLOCK(route->ro_rt);
982		}
983	} else {
984		if (ro->ro_rt != NULL)
985			RT_UNLOCK(ro->ro_rt);
986		if (ifp != NULL && opts != NULL &&
987		    opts->ip6po_pktinfo != NULL &&
988		    opts->ip6po_pktinfo->ipi6_ifindex != 0) {
989			/*
990			 * Check if the outgoing interface conflicts with the
991			 * interface specified by ipi6_ifindex (if specified).
992			 * Note that loopback interface is always okay.
993			 * (this may happen when we are sending a packet to
994			 * one of our own addresses.)
995			 */
996			if (!(ifp->if_flags & IFF_LOOPBACK) && ifp->if_index !=
997			    opts->ip6po_pktinfo->ipi6_ifindex) {
998				error = EHOSTUNREACH;
999				goto done;
1000			}
1001		}
1002	}
1003
1004done:
1005	if (error == 0) {
1006		if ((ip6oa->ip6oa_flags & IP6OAF_NO_CELLULAR) &&
1007		    ((ifp != NULL && ifp->if_type == IFT_CELLULAR) ||
1008		    (route != NULL && route->ro_rt != NULL &&
1009		    route->ro_rt->rt_ifp->if_type == IFT_CELLULAR))) {
1010			if (route != NULL && route->ro_rt != NULL) {
1011				rtfree(route->ro_rt);
1012				route->ro_rt = NULL;
1013				route->ro_flags &= ~ROF_SRCIF_SELECTED;
1014				route = NULL;
1015			}
1016			ifp = NULL;	/* ditch ifp; keep ifp0 */
1017			error = EHOSTUNREACH;
1018		}
1019	}
1020
1021	if (ifp == NULL && (route == NULL || route->ro_rt == NULL)) {
1022		/*
1023		 * This can happen if the caller did not pass a cached route
1024		 * nor any other hints.  We treat this case an error.
1025		 */
1026		error = EHOSTUNREACH;
1027	}
1028	if (error == EHOSTUNREACH)
1029		ip6stat.ip6s_noroute++;
1030
1031	/*
1032	 * We'll return ifp regardless of error, so pick it up from ifp0
1033	 * in case it was nullified above.  Caller is responsible for
1034	 * releasing the ifp if it is non-NULL.
1035	 */
1036	ifp = ifp0;
1037	if (retifp != NULL) {
1038		if (ifp != NULL)
1039			ifnet_reference(ifp);	/* for caller */
1040		*retifp = ifp;
1041	}
1042
1043	if (error == 0) {
1044		if (retrt != NULL && route != NULL)
1045			*retrt = route->ro_rt;	/* ro_rt may be NULL */
1046	} else if (select_srcif && ip6_select_srcif_debug) {
1047		printf("%s->%s ifscope %d ifa_if %s ro_if %s (error=%d)\n",
1048		    s_src, s_dst, ifscope,
1049		    (ifa != NULL) ? if_name(ifa->ifa_ifp) : "NONE",
1050		    (ifp != NULL) ? if_name(ifp) : "NONE", error);
1051	}
1052
1053	if (ifa != NULL)
1054		IFA_REMREF(ifa);
1055
1056	return (error);
1057}
1058
1059/*
1060 * Regardless of error, it will return an ifp with a reference held if the
1061 * caller provides a non-NULL retifp.  The caller is responsible for checking
1062 * if the returned ifp is valid and release its reference at all times.
1063 */
1064static int
1065in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
1066    struct ip6_moptions *mopts, struct route_in6 *ro,
1067    const struct ip6_out_args *ip6oa, struct ifnet **retifp)
1068{
1069	int err = 0;
1070	struct route_in6 sro;
1071	struct rtentry *rt = NULL;
1072
1073	if (ro == NULL) {
1074		bzero(&sro, sizeof(sro));
1075		ro = &sro;
1076	}
1077
1078	if ((err = selectroute(NULL, dstsock, opts, mopts, ro, retifp,
1079	    &rt, 0, 1, ip6oa)) != 0)
1080		goto done;
1081
1082	/*
1083	 * do not use a rejected or black hole route.
1084	 * XXX: this check should be done in the L2 output routine.
1085	 * However, if we skipped this check here, we'd see the following
1086	 * scenario:
1087	 * - install a rejected route for a scoped address prefix
1088	 *   (like fe80::/10)
1089	 * - send a packet to a destination that matches the scoped prefix,
1090	 *   with ambiguity about the scope zone.
1091	 * - pick the outgoing interface from the route, and disambiguate the
1092	 *   scope zone with the interface.
1093	 * - ip6_output() would try to get another route with the "new"
1094	 *   destination, which may be valid.
1095	 * - we'd see no error on output.
1096	 * Although this may not be very harmful, it should still be confusing.
1097	 * We thus reject the case here.
1098	 */
1099	if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
1100		err = ((rt->rt_flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH);
1101		goto done;
1102	}
1103
1104	/*
1105	 * Adjust the "outgoing" interface.  If we're going to loop the packet
1106	 * back to ourselves, the ifp would be the loopback interface.
1107	 * However, we'd rather know the interface associated to the
1108	 * destination address (which should probably be one of our own
1109	 * addresses.)
1110	 */
1111	if (rt != NULL && rt->rt_ifa != NULL && rt->rt_ifa->ifa_ifp != NULL &&
1112	    retifp != NULL) {
1113		ifnet_reference(rt->rt_ifa->ifa_ifp);
1114		if (*retifp != NULL)
1115			ifnet_release(*retifp);
1116		*retifp = rt->rt_ifa->ifa_ifp;
1117	}
1118
1119done:
1120	if (ro == &sro && rt && rt == sro.ro_rt)
1121		rtfree(rt);
1122
1123	/*
1124	 * retifp might point to a valid ifp with a reference held;
1125	 * caller is responsible for releasing it if non-NULL.
1126	 */
1127	return (err);
1128}
1129
1130/*
1131 * Regardless of error, it will return an ifp with a reference held if the
1132 * caller provides a non-NULL retifp.  The caller is responsible for checking
1133 * if the returned ifp is valid and release its reference at all times.
1134 *
1135 * clone - meaningful only for bsdi and freebsd
1136 */
int
in6_selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock,
    struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro,
    struct ifnet **retifp, struct rtentry **retrt, int clone,
    const struct ip6_out_args *ip6oa)
{
	int error;

	/*
	 * Public entry point: hand every argument straight to selectroute().
	 * The flag that follows "clone" is always 0 here, whereas
	 * in6_selectif() passes 1 in that position.
	 */
	error = selectroute(srcsock, dstsock, opts, mopts, ro, retifp,
	    retrt, clone, 0, ip6oa);

	return (error);
}
1147
1148/*
1149 * Default hop limit selection. The precedence is as follows:
1150 * 1. Hoplimit value specified via ioctl.
1151 * 2. (If the outgoing interface is detected) the current
1152 *     hop limit of the interface specified by router advertisement.
1153 * 3. The system default hoplimit.
1154*/
1155int
1156in6_selecthlim(
1157	struct in6pcb *in6p,
1158	struct ifnet *ifp)
1159{
1160	if (in6p && in6p->in6p_hops >= 0) {
1161		return(in6p->in6p_hops);
1162	} else {
1163		lck_rw_lock_shared(nd_if_rwlock);
1164		if (ifp && ifp->if_index < nd_ifinfo_indexlim) {
1165			u_int8_t chlim;
1166			struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index];
1167
1168			if (ndi->initialized) {
1169				lck_mtx_lock(&ndi->lock);
1170				chlim = ndi->chlim;
1171				lck_mtx_unlock(&ndi->lock);
1172			} else {
1173				chlim = ip6_defhlim;
1174			}
1175			lck_rw_done(nd_if_rwlock);
1176			return (chlim);
1177		} else {
1178			lck_rw_done(nd_if_rwlock);
1179			return(ip6_defhlim);
1180		}
1181	}
1182}
1183
1184/*
1185 * XXX: this is borrowed from in6_pcbbind(). If possible, we should
1186 * share this function by all *bsd*...
1187 */
1188int
1189in6_pcbsetport(
1190	__unused struct in6_addr *laddr,
1191	struct inpcb *inp,
1192	struct proc *p,
1193	int locked)
1194{
1195	struct socket *so = inp->inp_socket;
1196	u_int16_t lport = 0, first, last, *lastport;
1197	int count, error = 0, wild = 0;
1198	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1199	kauth_cred_t cred;
1200	if (!locked) { /* Make sure we don't run into a deadlock: 4052373 */
1201		if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
1202			socket_unlock(inp->inp_socket, 0);
1203			lck_rw_lock_exclusive(pcbinfo->mtx);
1204			socket_lock(inp->inp_socket, 0);
1205		}
1206	}
1207
1208	/* XXX: this is redundant when called from in6_pcbbind */
1209	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
1210		wild = INPLOOKUP_WILDCARD;
1211
1212	inp->inp_flags |= INP_ANONPORT;
1213
1214	if (inp->inp_flags & INP_HIGHPORT) {
1215		first = ipport_hifirstauto;	/* sysctl */
1216		last  = ipport_hilastauto;
1217		lastport = &pcbinfo->lasthi;
1218	} else if (inp->inp_flags & INP_LOWPORT) {
1219		cred = kauth_cred_proc_ref(p);
1220		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
1221		kauth_cred_unref(&cred);
1222		if (error != 0) {
1223			if (!locked)
1224				lck_rw_done(pcbinfo->mtx);
1225			return error;
1226		}
1227		first = ipport_lowfirstauto;	/* 1023 */
1228		last  = ipport_lowlastauto;	/* 600 */
1229		lastport = &pcbinfo->lastlow;
1230	} else {
1231		first = ipport_firstauto;	/* sysctl */
1232		last  = ipport_lastauto;
1233		lastport = &pcbinfo->lastport;
1234	}
1235	/*
1236	 * Simple check to ensure all ports are not used up causing
1237	 * a deadlock here.
1238	 *
1239	 * We split the two cases (up and down) so that the direction
1240	 * is not being tested on each round of the loop.
1241	 */
1242	if (first > last) {
1243		/*
1244		 * counting down
1245		 */
1246		count = first - last;
1247
1248		do {
1249			if (count-- < 0) {	/* completely used? */
1250				/*
1251				 * Undo any address bind that may have
1252				 * occurred above.
1253				 */
1254				inp->in6p_laddr = in6addr_any;
1255				inp->in6p_last_outifp = NULL;
1256				if (!locked)
1257					lck_rw_done(pcbinfo->mtx);
1258				return (EAGAIN);
1259			}
1260			--*lastport;
1261			if (*lastport > first || *lastport < last)
1262				*lastport = first;
1263			lport = htons(*lastport);
1264		} while (in6_pcblookup_local(pcbinfo,
1265					     &inp->in6p_laddr, lport, wild));
1266	} else {
1267		/*
1268			 * counting up
1269			 */
1270		count = last - first;
1271
1272		do {
1273			if (count-- < 0) {	/* completely used? */
1274				/*
1275				 * Undo any address bind that may have
1276				 * occurred above.
1277				 */
1278				inp->in6p_laddr = in6addr_any;
1279				inp->in6p_last_outifp = NULL;
1280				if (!locked)
1281					lck_rw_done(pcbinfo->mtx);
1282				return (EAGAIN);
1283			}
1284			++*lastport;
1285			if (*lastport < first || *lastport > last)
1286				*lastport = first;
1287			lport = htons(*lastport);
1288		} while (in6_pcblookup_local(pcbinfo,
1289					     &inp->in6p_laddr, lport, wild));
1290	}
1291
1292	inp->inp_lport = lport;
1293	if (in_pcbinshash(inp, 1) != 0) {
1294		inp->in6p_laddr = in6addr_any;
1295		inp->inp_lport = 0;
1296		inp->in6p_last_outifp = NULL;
1297		if (!locked)
1298			lck_rw_done(pcbinfo->mtx);
1299		return (EAGAIN);
1300	}
1301
1302	if (!locked)
1303		lck_rw_done(pcbinfo->mtx);
1304	return(0);
1305}
1306
/*
 * The following is an implementation of the policy table using a
 * simple tail queue.
 * XXX such details should be hidden.
 * XXX an implementation using a binary tree should be more efficient.
 */
/* One entry of the address selection policy table. */
struct addrsel_policyent {
        TAILQ_ENTRY(addrsel_policyent) ape_entry;	/* tail queue linkage */
        struct in6_addrpolicy ape_policy;	/* the policy carried by this entry */
};
1317
/* Head type for the tail queue of addrsel_policyent entries. */
TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);

/*
 * The policy table itself.  The lookup, add, delete and walk routines in
 * this file bracket their accesses with ADDRSEL_LOCK()/ADDRSEL_UNLOCK().
 */
struct addrsel_policyhead addrsel_policytab;
1321
1322static void
1323init_policy_queue(void)
1324{
1325
1326        TAILQ_INIT(&addrsel_policytab);
1327}
1328
1329void
1330addrsel_policy_init(void)
1331{
1332	/*
1333	 * Default address selection policy based on RFC 3484 and
1334	 * draft-arifumi-6man-rfc3484-revise-03.
1335	 */
1336	static const struct in6_addrpolicy defaddrsel[] = {
1337		/* localhost */
1338		{ .addr     = { .sin6_family = AF_INET6,
1339				.sin6_addr   = IN6ADDR_LOOPBACK_INIT,
1340				.sin6_len    = sizeof(struct sockaddr_in6) },
1341		  .addrmask = { .sin6_family = AF_INET6,
1342			        .sin6_addr   = IN6MASK128,
1343				.sin6_len    = sizeof(struct sockaddr_in6) },
1344		  .preced   = 60,
1345		  .label    = 0 },
1346		/* ULA */
1347		{ .addr	    = { .sin6_family = AF_INET6,
1348				.sin6_addr   = {{{ 0xfc }}},
1349				.sin6_len    = sizeof(struct sockaddr_in6) },
1350		  .addrmask = { .sin6_family = AF_INET6,
1351			        .sin6_addr   = IN6MASK7,
1352				.sin6_len    = sizeof(struct sockaddr_in6) },
1353		  .preced   = 50,
1354		  .label    = 1 },
1355		/* any IPv6 src */
1356		{ .addr	    = { .sin6_family = AF_INET6,
1357				.sin6_addr   = IN6ADDR_ANY_INIT,
1358				.sin6_len    = sizeof(struct sockaddr_in6) },
1359		  .addrmask = { .sin6_family = AF_INET6,
1360			        .sin6_addr   = IN6MASK0,
1361				.sin6_len    = sizeof(struct sockaddr_in6) },
1362		  .preced   = 40,
1363		  .label    = 2 },
1364		/* any IPv4 src */
1365		{ .addr	    = { .sin6_family = AF_INET6,
1366				.sin6_addr   = IN6ADDR_V4MAPPED_INIT,
1367				.sin6_len    = sizeof(struct sockaddr_in6) },
1368		  .addrmask = { .sin6_family = AF_INET6,
1369			        .sin6_addr   = IN6MASK96,
1370				.sin6_len    = sizeof(struct sockaddr_in6) },
1371		  .preced   = 30,
1372		  .label    = 3 },
1373		/* 6to4 */
1374		{ .addr	    = { .sin6_family = AF_INET6,
1375				.sin6_addr   = {{{ 0x20, 0x02 }}},
1376				.sin6_len    = sizeof(struct sockaddr_in6) },
1377		  .addrmask = { .sin6_family = AF_INET6,
1378			        .sin6_addr   = IN6MASK16,
1379				.sin6_len    = sizeof(struct sockaddr_in6) },
1380		  .preced   = 20,
1381		  .label    = 4 },
1382		/* Teredo */
1383		{ .addr	    = { .sin6_family = AF_INET6,
1384				.sin6_addr   = {{{ 0x20, 0x01 }}},
1385				.sin6_len    = sizeof(struct sockaddr_in6) },
1386		  .addrmask = { .sin6_family = AF_INET6,
1387			        .sin6_addr   = IN6MASK32,
1388				.sin6_len    = sizeof(struct sockaddr_in6) },
1389		  .preced   = 10,
1390		  .label    = 5 },
1391		/* v4 compat addresses */
1392		{ .addr	    = { .sin6_family = AF_INET6,
1393				.sin6_addr = IN6ADDR_ANY_INIT,
1394				.sin6_len    = sizeof(struct sockaddr_in6) },
1395		  .addrmask = { .sin6_family = AF_INET6,
1396			        .sin6_addr = IN6MASK96,
1397				.sin6_len    = sizeof(struct sockaddr_in6) },
1398		  .preced   = 1,
1399		  .label    = 10 },
1400		/* site-local (deprecated) */
1401		{ .addr	    = { .sin6_family = AF_INET6,
1402				.sin6_addr = {{{ 0xfe, 0xc0 }}},
1403				.sin6_len    = sizeof(struct sockaddr_in6) },
1404		  .addrmask = { .sin6_family = AF_INET6,
1405			        .sin6_addr = IN6MASK16,
1406				.sin6_len    = sizeof(struct sockaddr_in6) },
1407		  .preced   = 1,
1408		  .label    = 11 },
1409		/* 6bone (deprecated) */
1410		{ .addr	    = { .sin6_family = AF_INET6,
1411				.sin6_addr = {{{ 0x3f, 0xfe }}},
1412				.sin6_len    = sizeof(struct sockaddr_in6) },
1413		  .addrmask = { .sin6_family = AF_INET6,
1414			        .sin6_addr = IN6MASK16,
1415				.sin6_len    = sizeof(struct sockaddr_in6) },
1416		  .preced   = 1,
1417		  .label    = 12 },
1418	};
1419	int i;
1420
1421	init_policy_queue();
1422
1423	/* initialize the "last resort" policy */
1424	bzero(&defaultaddrpolicy, sizeof(defaultaddrpolicy));
1425	defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
1426
1427	for (i = 0; i < sizeof(defaddrsel) / sizeof(defaddrsel[0]); i++)
1428		add_addrsel_policyent(&defaddrsel[i]);
1429
1430}
1431
1432struct in6_addrpolicy *
1433in6_addrsel_lookup_policy(struct sockaddr_in6 *key)
1434{
1435	struct in6_addrpolicy *match = NULL;
1436
1437	ADDRSEL_LOCK();
1438	match = match_addrsel_policy(key);
1439
1440	if (match == NULL)
1441		match = &defaultaddrpolicy;
1442	else
1443		match->use++;
1444	ADDRSEL_UNLOCK();
1445
1446	return (match);
1447}
1448
1449static struct in6_addrpolicy *
1450match_addrsel_policy(struct sockaddr_in6 *key)
1451{
1452	struct addrsel_policyent *pent;
1453	struct in6_addrpolicy *bestpol = NULL, *pol;
1454	int matchlen, bestmatchlen = -1;
1455	u_char *mp, *ep, *k, *p, m;
1456
1457	TAILQ_FOREACH(pent, &addrsel_policytab, ape_entry) {
1458		matchlen = 0;
1459
1460		pol = &pent->ape_policy;
1461		mp = (u_char *)&pol->addrmask.sin6_addr;
1462		ep = mp + 16;	/* XXX: scope field? */
1463		k = (u_char *)&key->sin6_addr;
1464		p = (u_char *)&pol->addr.sin6_addr;
1465		for (; mp < ep && *mp; mp++, k++, p++) {
1466			m = *mp;
1467			if ((*k & m) != *p)
1468				goto next; /* not match */
1469			if (m == 0xff) /* short cut for a typical case */
1470				matchlen += 8;
1471			else {
1472				while (m >= 0x80) {
1473					matchlen++;
1474					m <<= 1;
1475				}
1476			}
1477		}
1478
1479		/* matched.  check if this is better than the current best. */
1480		if (bestpol == NULL ||
1481		    matchlen > bestmatchlen) {
1482			bestpol = pol;
1483			bestmatchlen = matchlen;
1484		}
1485
1486	  next:
1487		continue;
1488	}
1489
1490	return (bestpol);
1491}
1492
1493static int
1494add_addrsel_policyent(const struct in6_addrpolicy *newpolicy)
1495{
1496	struct addrsel_policyent *new, *pol;
1497
1498	MALLOC(new, struct addrsel_policyent *, sizeof(*new), M_IFADDR,
1499	       M_WAITOK);
1500
1501	ADDRSEL_LOCK();
1502
1503	/* duplication check */
1504	TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) {
1505		if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr,
1506				       &pol->ape_policy.addr.sin6_addr) &&
1507		    IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr,
1508				       &pol->ape_policy.addrmask.sin6_addr)) {
1509			ADDRSEL_UNLOCK();
1510			FREE(new, M_IFADDR);
1511			return (EEXIST);	/* or override it? */
1512		}
1513	}
1514
1515	bzero(new, sizeof(*new));
1516
1517	/* XXX: should validate entry */
1518	new->ape_policy = *newpolicy;
1519
1520	TAILQ_INSERT_TAIL(&addrsel_policytab, new, ape_entry);
1521	ADDRSEL_UNLOCK();
1522
1523	return (0);
1524}
1525#ifdef ENABLE_ADDRSEL
1526static int
1527delete_addrsel_policyent(const struct in6_addrpolicy *key)
1528{
1529	struct addrsel_policyent *pol;
1530
1531
1532	ADDRSEL_LOCK();
1533
1534	/* search for the entry in the table */
1535	TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) {
1536		if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr,
1537		    &pol->ape_policy.addr.sin6_addr) &&
1538		    IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr,
1539		    &pol->ape_policy.addrmask.sin6_addr)) {
1540			break;
1541		}
1542	}
1543	if (pol == NULL) {
1544		ADDRSEL_UNLOCK();
1545		return (ESRCH);
1546	}
1547
1548	TAILQ_REMOVE(&addrsel_policytab, pol, ape_entry);
1549	FREE(pol, M_IFADDR);
1550	pol = NULL;
1551	ADDRSEL_UNLOCK();
1552
1553	return (0);
1554}
1555#endif /* ENABLE_ADDRSEL */
1556
1557int
1558walk_addrsel_policy(int (*callback)(const struct in6_addrpolicy *, void *),
1559    void *w)
1560{
1561	struct addrsel_policyent *pol;
1562	int error = 0;
1563
1564	ADDRSEL_LOCK();
1565	TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) {
1566		if ((error = (*callback)(&pol->ape_policy, w)) != 0) {
1567			ADDRSEL_UNLOCK();
1568			return (error);
1569		}
1570	}
1571	ADDRSEL_UNLOCK();
1572	return (error);
1573}
1574/*
1575 * Subroutines to manage the address selection policy table via sysctl.
1576 */
/* Context passed to the table-walk callback by the sysctl handler. */
struct walkarg {
	struct sysctl_req *w_req;	/* request the policies are copied out to */
};
1580
1581
1582static int
1583dump_addrsel_policyent(const struct in6_addrpolicy *pol, void *arg)
1584{
1585	int error = 0;
1586	struct walkarg *w = arg;
1587
1588	error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol));
1589
1590	return (error);
1591}
1592
1593static int
1594in6_src_sysctl SYSCTL_HANDLER_ARGS
1595{
1596#pragma unused(oidp, arg1, arg2)
1597struct walkarg w;
1598
1599	if (req->newptr)
1600		return EPERM;
1601	bzero(&w, sizeof(w));
1602	w.w_req = req;
1603
1604	return (walk_addrsel_policy(dump_addrsel_policyent, &w));
1605}
1606
1607
/*
 * Register in6_src_sysctl as the handler of the read-only (CTLFLAG_RD)
 * "addrctlpolicy" node under _net_inet6_ip6.
 */
SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy,
	CTLFLAG_RD | CTLFLAG_LOCKED, in6_src_sysctl, "");
1610int
1611in6_src_ioctl(u_long cmd, caddr_t data)
1612{
1613	int i;
1614	struct in6_addrpolicy ent0;
1615
1616	if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY)
1617		return (EOPNOTSUPP); /* check for safety */
1618
1619	bcopy(data, &ent0, sizeof (ent0));
1620
1621	if (ent0.label == ADDR_LABEL_NOTAPP)
1622		return (EINVAL);
1623	/* check if the prefix mask is consecutive. */
1624	if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0)
1625		return (EINVAL);
1626	/* clear trailing garbages (if any) of the prefix address. */
1627	for (i = 0; i < 4; i++) {
1628		ent0.addr.sin6_addr.s6_addr32[i] &=
1629			ent0.addrmask.sin6_addr.s6_addr32[i];
1630	}
1631	ent0.use = 0;
1632
1633	switch (cmd) {
1634	case SIOCAADDRCTL_POLICY:
1635#ifdef ENABLE_ADDRSEL
1636		return (add_addrsel_policyent(&ent0));
1637#else
1638		return (ENOTSUP);
1639#endif
1640	case SIOCDADDRCTL_POLICY:
1641#ifdef ENABLE_ADDRSEL
1642		return (delete_addrsel_policyent(&ent0));
1643#else
1644		return (ENOTSUP);
1645#endif
1646	}
1647
1648	return (0);		/* XXX: compromise compilers */
1649}
1650
1651/*
1652 * generate kernel-internal form (scopeid embedded into s6_addr16[1]).
 * If the address scope is link-local, embed the interface index in the
1654 * address.  The routine determines our precedence
1655 * between advanced API scope/interface specification and basic API
1656 * specification.
1657 *
1658 * this function should be nuked in the future, when we get rid of
1659 * embedded scopeid thing.
1660 *
1661 * XXX actually, it is over-specification to return ifp against sin6_scope_id.
1662 * there can be multiple interfaces that belong to a particular scope zone
1663 * (in specification, we have 1:N mapping between a scope zone and interfaces).
1664 * we may want to change the function to return something other than ifp.
1665 */
1666int
1667in6_embedscope(
1668	struct in6_addr *in6,
1669	const struct sockaddr_in6 *sin6,
1670	struct in6pcb *in6p,
1671	struct ifnet **ifpp,
1672	struct ip6_pktopts *opt)
1673{
1674	struct ifnet *ifp = NULL;
1675	u_int32_t scopeid;
1676	struct ip6_pktopts *optp = NULL;
1677
1678	*in6 = sin6->sin6_addr;
1679	scopeid = sin6->sin6_scope_id;
1680	if (ifpp != NULL)
1681		*ifpp = NULL;
1682
1683	/*
1684	 * don't try to read sin6->sin6_addr beyond here, since the caller may
1685	 * ask us to overwrite existing sockaddr_in6
1686	 */
1687
1688#ifdef ENABLE_DEFAULT_SCOPE
1689	if (scopeid == 0)
1690		scopeid = scope6_addr2default(in6);
1691#endif
1692
1693	if (IN6_IS_SCOPE_LINKLOCAL(in6)) {
1694		struct in6_pktinfo *pi;
1695		struct ifnet *im6o_multicast_ifp = NULL;
1696
1697		if (in6p != NULL && IN6_IS_ADDR_MULTICAST(in6) &&
1698		    in6p->in6p_moptions != NULL) {
1699			IM6O_LOCK(in6p->in6p_moptions);
1700			im6o_multicast_ifp =
1701			    in6p->in6p_moptions->im6o_multicast_ifp;
1702			IM6O_UNLOCK(in6p->in6p_moptions);
1703		}
1704
1705		if (opt)
1706			optp = opt;
1707		else if (in6p)
1708			optp = in6p->in6p_outputopts;
1709		/*
1710		 * KAME assumption: link id == interface id
1711		 */
1712		ifnet_head_lock_shared();
1713		if (in6p && optp && (pi = optp->ip6po_pktinfo) &&
1714		    pi->ipi6_ifindex) {
1715			ifp = ifindex2ifnet[pi->ipi6_ifindex];
1716			in6->s6_addr16[1] = htons(pi->ipi6_ifindex);
1717		} else if (in6p && IN6_IS_ADDR_MULTICAST(in6) &&
1718		    in6p->in6p_moptions != NULL && im6o_multicast_ifp != NULL) {
1719			ifp = im6o_multicast_ifp;
1720			in6->s6_addr16[1] = htons(ifp->if_index);
1721		} else if (scopeid) {
1722			/*
1723			 * Since scopeid is unsigned, we only have to check it
1724			 * against if_index
1725			 */
1726			if (if_index < scopeid) {
1727				ifnet_head_done();
1728				return ENXIO;  /* XXX EINVAL? */
1729
1730			}
1731			ifp = ifindex2ifnet[scopeid];
1732			/*XXX assignment to 16bit from 32bit variable */
1733			in6->s6_addr16[1] = htons(scopeid & 0xffff);
1734		}
1735		ifnet_head_done();
1736
1737		if (ifpp != NULL) {
1738			if (ifp != NULL)
1739				ifnet_reference(ifp);	/* for caller */
1740			*ifpp = ifp;
1741		}
1742	}
1743
1744	return 0;
1745}
1746
1747/*
1748 * generate standard sockaddr_in6 from embedded form.
1749 * touches sin6_addr and sin6_scope_id only.
1750 *
1751 * this function should be nuked in the future, when we get rid of
1752 * embedded scopeid thing.
1753 */
1754int
1755in6_recoverscope(
1756	struct sockaddr_in6 *sin6,
1757	const struct in6_addr *in6,
1758	struct ifnet *ifp)
1759{
1760	u_int32_t scopeid;
1761
1762	sin6->sin6_addr = *in6;
1763
1764	/*
1765	 * don't try to read *in6 beyond here, since the caller may
1766	 * ask us to overwrite existing sockaddr_in6
1767	 */
1768
1769	sin6->sin6_scope_id = 0;
1770	if (IN6_IS_SCOPE_LINKLOCAL(in6)) {
1771		/*
1772		 * KAME assumption: link id == interface id
1773		 */
1774		scopeid = ntohs(sin6->sin6_addr.s6_addr16[1]);
1775		if (scopeid) {
1776			/*
1777			 * sanity check
1778			 *
1779			 * Since scopeid is unsigned, we only have to check it
1780			 * against if_index
1781			 */
1782			if (if_index < scopeid)
1783				return ENXIO;
1784			if (ifp && ifp->if_index != scopeid)
1785				return ENXIO;
1786			sin6->sin6_addr.s6_addr16[1] = 0;
1787			sin6->sin6_scope_id = scopeid;
1788		}
1789	}
1790
1791	return 0;
1792}
1793