ip6_if.c revision 3448:aaf16568054b
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25/*
26 * Copyright (c) 1990 Mentat Inc.
27 */
28
29#pragma ident	"%Z%%M%	%I%	%E% SMI"
30
31/*
32 * This file contains the interface control functions for IPv6.
33 */
34
35#include <sys/types.h>
36#include <sys/sysmacros.h>
37#include <sys/stream.h>
38#include <sys/dlpi.h>
39#include <sys/stropts.h>
40#include <sys/ddi.h>
41#include <sys/cmn_err.h>
42#include <sys/kstat.h>
43#include <sys/debug.h>
44#include <sys/zone.h>
45#include <sys/policy.h>
46
47#include <sys/systm.h>
48#include <sys/param.h>
49#include <sys/socket.h>
50#include <sys/isa_defs.h>
51#include <net/if.h>
52#include <net/if_dl.h>
53#include <net/route.h>
54#include <netinet/in.h>
55#include <netinet/igmp_var.h>
56#include <netinet/ip6.h>
57#include <netinet/icmp6.h>
58#include <netinet/in.h>
59
60#include <inet/common.h>
61#include <inet/nd.h>
62#include <inet/mib2.h>
63#include <inet/ip.h>
64#include <inet/ip6.h>
65#include <inet/ip_multi.h>
66#include <inet/ip_ire.h>
67#include <inet/ip_rts.h>
68#include <inet/ip_ndp.h>
69#include <inet/ip_if.h>
70#include <inet/ip6_asp.h>
71#include <inet/tun.h>
72#include <inet/ipclassifier.h>
73#include <inet/sctp_ip.h>
74
75#include <sys/tsol/tndb.h>
76#include <sys/tsol/tnet.h>
77
78static in6_addr_t	ipv6_ll_template =
79			{(uint32_t)V6_LINKLOCAL, 0x0, 0x0, 0x0};
80
81static ipif_t *
82ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
83    queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst);
84
85/*
86 * ipif_lookup_group_v6
87 */
88ipif_t *
89ipif_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
90{
91	ire_t	*ire;
92	ipif_t	*ipif;
93
94	ire = ire_lookup_multi_v6(group, zoneid, ipst);
95	if (ire == NULL)
96		return (NULL);
97	ipif = ire->ire_ipif;
98	ipif_refhold(ipif);
99	ire_refrele(ire);
100	return (ipif);
101}
102
103/*
104 * ill_lookup_group_v6
105 */
106ill_t *
107ill_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
108{
109	ire_t	*ire;
110	ill_t	*ill;
111
112	ire = ire_lookup_multi_v6(group, zoneid, ipst);
113	if (ire == NULL)
114		return (NULL);
115	ill = ire->ire_ipif->ipif_ill;
116	ill_refhold(ill);
117	ire_refrele(ire);
118	return (ill);
119}
120
121/*
122 * Look for an ipif with the specified interface address and destination.
123 * The destination address is used only for matching point-to-point interfaces.
124 */
125static ipif_t *
126ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
127    queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
128{
129	ipif_t	*ipif;
130	ill_t	*ill;
131	ipsq_t	*ipsq;
132	ill_walk_context_t ctx;
133
134	if (error != NULL)
135		*error = 0;
136
137	/*
138	 * First match all the point-to-point interfaces
139	 * before looking at non-point-to-point interfaces.
140	 * This is done to avoid returning non-point-to-point
141	 * ipif instead of unnumbered point-to-point ipif.
142	 */
143	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
144	ill = ILL_START_WALK_V6(&ctx, ipst);
145	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
146		GRAB_CONN_LOCK(q);
147		mutex_enter(&ill->ill_lock);
148		for (ipif = ill->ill_ipif; ipif != NULL;
149		    ipif = ipif->ipif_next) {
150			/* Allow the ipif to be down */
151			if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
152			    (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
153			    if_addr)) &&
154			    (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr,
155			    dst))) {
156				if (IPIF_CAN_LOOKUP(ipif)) {
157					ipif_refhold_locked(ipif);
158					mutex_exit(&ill->ill_lock);
159					RELEASE_CONN_LOCK(q);
160					rw_exit(&ipst->ips_ill_g_lock);
161					return (ipif);
162				} else if (IPIF_CAN_WAIT(ipif, q)) {
163					ipsq = ill->ill_phyint->phyint_ipsq;
164					mutex_enter(&ipsq->ipsq_lock);
165					mutex_exit(&ill->ill_lock);
166					rw_exit(&ipst->ips_ill_g_lock);
167					ipsq_enq(ipsq, q, mp, func, NEW_OP,
168						ill);
169					mutex_exit(&ipsq->ipsq_lock);
170					RELEASE_CONN_LOCK(q);
171					*error = EINPROGRESS;
172					return (NULL);
173				}
174			}
175		}
176		mutex_exit(&ill->ill_lock);
177		RELEASE_CONN_LOCK(q);
178	}
179	rw_exit(&ipst->ips_ill_g_lock);
180	/* lookup the ipif based on interface address */
181	ipif = ipif_lookup_addr_v6(if_addr, NULL, ALL_ZONES, q, mp, func,
182	    error, ipst);
183	ASSERT(ipif == NULL || ipif->ipif_isv6);
184	return (ipif);
185}
186
187/*
188 * Look for an ipif with the specified address. For point-point links
189 * we look for matches on either the destination address and the local
190 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
191 * is set.
192 * Matches on a specific ill if match_ill is set.
193 */
194/* ARGSUSED */
195ipif_t *
196ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid,
197    queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
198{
199	ipif_t	*ipif;
200	ill_t	*ill;
201	boolean_t  ptp = B_FALSE;
202	ipsq_t	*ipsq;
203	ill_walk_context_t ctx;
204
205	if (error != NULL)
206		*error = 0;
207
208	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
209	/*
210	 * Repeat twice, first based on local addresses and
211	 * next time for pointopoint.
212	 */
213repeat:
214	ill = ILL_START_WALK_V6(&ctx, ipst);
215	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
216		if (match_ill != NULL && ill != match_ill) {
217			continue;
218		}
219		GRAB_CONN_LOCK(q);
220		mutex_enter(&ill->ill_lock);
221		for (ipif = ill->ill_ipif; ipif != NULL;
222		    ipif = ipif->ipif_next) {
223			if (zoneid != ALL_ZONES &&
224			    ipif->ipif_zoneid != zoneid &&
225			    ipif->ipif_zoneid != ALL_ZONES)
226				continue;
227			/* Allow the ipif to be down */
228			if ((!ptp && (IN6_ARE_ADDR_EQUAL(
229			    &ipif->ipif_v6lcl_addr, addr) &&
230			    (ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
231			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
232			    IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr,
233			    addr))) {
234				if (IPIF_CAN_LOOKUP(ipif)) {
235					ipif_refhold_locked(ipif);
236					mutex_exit(&ill->ill_lock);
237					RELEASE_CONN_LOCK(q);
238					rw_exit(&ipst->ips_ill_g_lock);
239					return (ipif);
240				} else if (IPIF_CAN_WAIT(ipif, q)) {
241					ipsq = ill->ill_phyint->phyint_ipsq;
242					mutex_enter(&ipsq->ipsq_lock);
243					mutex_exit(&ill->ill_lock);
244					rw_exit(&ipst->ips_ill_g_lock);
245					ipsq_enq(ipsq, q, mp, func, NEW_OP,
246						ill);
247					mutex_exit(&ipsq->ipsq_lock);
248					RELEASE_CONN_LOCK(q);
249					*error = EINPROGRESS;
250					return (NULL);
251				}
252			}
253		}
254		mutex_exit(&ill->ill_lock);
255		RELEASE_CONN_LOCK(q);
256	}
257
258	/* If we already did the ptp case, then we are done */
259	if (ptp) {
260		rw_exit(&ipst->ips_ill_g_lock);
261		if (error != NULL)
262			*error = ENXIO;
263		return (NULL);
264	}
265	ptp = B_TRUE;
266	goto repeat;
267}
268
269/*
270 * Look for an ipif with the specified address. For point-point links
271 * we look for matches on either the destination address and the local
272 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
273 * is set.
274 * Matches on a specific ill if match_ill is set.
275 * Return the zoneid for the ipif. ALL_ZONES if none found.
276 */
277zoneid_t
278ipif_lookup_addr_zoneid_v6(const in6_addr_t *addr, ill_t *match_ill,
279    ip_stack_t *ipst)
280{
281	ipif_t	*ipif;
282	ill_t	*ill;
283	boolean_t  ptp = B_FALSE;
284	ill_walk_context_t ctx;
285	zoneid_t	zoneid;
286
287	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
288	/*
289	 * Repeat twice, first based on local addresses and
290	 * next time for pointopoint.
291	 */
292repeat:
293	ill = ILL_START_WALK_V6(&ctx, ipst);
294	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
295		if (match_ill != NULL && ill != match_ill) {
296			continue;
297		}
298		mutex_enter(&ill->ill_lock);
299		for (ipif = ill->ill_ipif; ipif != NULL;
300		    ipif = ipif->ipif_next) {
301			/* Allow the ipif to be down */
302			if ((!ptp && (IN6_ARE_ADDR_EQUAL(
303			    &ipif->ipif_v6lcl_addr, addr) &&
304			    (ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
305			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
306			    IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr,
307			    addr)) &&
308			    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
309				zoneid = ipif->ipif_zoneid;
310				mutex_exit(&ill->ill_lock);
311				rw_exit(&ipst->ips_ill_g_lock);
312				/*
313				 * If ipif_zoneid was ALL_ZONES then we have
314				 * a trusted extensions shared IP address.
315				 * In that case GLOBAL_ZONEID works to send.
316				 */
317				if (zoneid == ALL_ZONES)
318					zoneid = GLOBAL_ZONEID;
319				return (zoneid);
320			}
321		}
322		mutex_exit(&ill->ill_lock);
323	}
324
325	/* If we already did the ptp case, then we are done */
326	if (ptp) {
327		rw_exit(&ipst->ips_ill_g_lock);
328		return (ALL_ZONES);
329	}
330	ptp = B_TRUE;
331	goto repeat;
332}
333
334/*
335 * Perform various checks to verify that an address would make sense as a local
336 * interface address.  This is currently only called when an attempt is made
337 * to set a local address.
338 *
339 * Does not allow a v4-mapped address, an address that equals the subnet
340 * anycast address, ... a multicast address, ...
341 */
342boolean_t
343ip_local_addr_ok_v6(const in6_addr_t *addr, const in6_addr_t *subnet_mask)
344{
345	in6_addr_t subnet;
346
347	if (IN6_IS_ADDR_UNSPECIFIED(addr))
348		return (B_TRUE);	/* Allow all zeros */
349
350	/*
351	 * Don't allow all zeroes or host part, but allow
352	 * all ones netmask.
353	 */
354	V6_MASK_COPY(*addr, *subnet_mask, subnet);
355	if (IN6_IS_ADDR_V4MAPPED(addr) ||
356	    (IN6_ARE_ADDR_EQUAL(addr, &subnet) &&
357	    !IN6_ARE_ADDR_EQUAL(subnet_mask, &ipv6_all_ones)) ||
358	    (IN6_IS_ADDR_V4COMPAT(addr) && CLASSD(V4_PART_OF_V6((*addr)))) ||
359	    IN6_IS_ADDR_MULTICAST(addr))
360		return (B_FALSE);
361
362	return (B_TRUE);
363}
364
365/*
366 * Perform various checks to verify that an address would make sense as a
367 * remote/subnet interface address.
368 */
369boolean_t
370ip_remote_addr_ok_v6(const in6_addr_t *addr, const in6_addr_t *subnet_mask)
371{
372	in6_addr_t subnet;
373
374	if (IN6_IS_ADDR_UNSPECIFIED(addr))
375		return (B_TRUE);	/* Allow all zeros */
376
377	V6_MASK_COPY(*addr, *subnet_mask, subnet);
378	if (IN6_IS_ADDR_V4MAPPED(addr) ||
379	    (IN6_ARE_ADDR_EQUAL(addr, &subnet) &&
380	    !IN6_ARE_ADDR_EQUAL(subnet_mask, &ipv6_all_ones)) ||
381	    IN6_IS_ADDR_MULTICAST(addr) ||
382	    (IN6_IS_ADDR_V4COMPAT(addr) && CLASSD(V4_PART_OF_V6((*addr)))))
383		return (B_FALSE);
384
385	return (B_TRUE);
386}
387
388/*
389 * ip_rt_add_v6 is called to add an IPv6 route to the forwarding table.
390 * ipif_arg is passed in to associate it with the correct interface
391 * (for link-local destinations and gateways).
392 */
393/* ARGSUSED1 */
394int
395ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
396    const in6_addr_t *gw_addr, const in6_addr_t *src_addr, int flags,
397    ipif_t *ipif_arg, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func,
398    struct rtsa_s *sp, ip_stack_t *ipst)
399{
400	ire_t	*ire;
401	ire_t	*gw_ire = NULL;
402	ipif_t	*ipif;
403	boolean_t ipif_refheld = B_FALSE;
404	uint_t	type;
405	int	match_flags = MATCH_IRE_TYPE;
406	int	error;
407	tsol_gc_t *gc = NULL;
408	tsol_gcgrp_t *gcgrp = NULL;
409	boolean_t gcgrp_xtraref = B_FALSE;
410
411	if (ire_arg != NULL)
412		*ire_arg = NULL;
413
414	/*
415	 * Prevent routes with a zero gateway from being created (since
416	 * interfaces can currently be plumbed and brought up with no assigned
417	 * address).
418	 */
419	if (IN6_IS_ADDR_UNSPECIFIED(gw_addr))
420		return (ENETUNREACH);
421
422	/*
423	 * If this is the case of RTF_HOST being set, then we set the netmask
424	 * to all ones (regardless if one was supplied).
425	 */
426	if (flags & RTF_HOST)
427		mask = &ipv6_all_ones;
428
429	/*
430	 * Get the ipif, if any, corresponding to the gw_addr
431	 */
432	ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, q, mp, func,
433	    &error, ipst);
434	if (ipif != NULL)
435		ipif_refheld = B_TRUE;
436	else if (error == EINPROGRESS) {
437		ip1dbg(("ip_rt_add_v6: null and EINPROGRESS"));
438		return (error);
439	}
440
441	/*
442	 * GateD will attempt to create routes with a loopback interface
443	 * address as the gateway and with RTF_GATEWAY set.  We allow
444	 * these routes to be added, but create them as interface routes
445	 * since the gateway is an interface address.
446	 */
447	if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
448		flags &= ~RTF_GATEWAY;
449		if (IN6_ARE_ADDR_EQUAL(gw_addr, &ipv6_loopback) &&
450		    IN6_ARE_ADDR_EQUAL(dst_addr, &ipv6_loopback) &&
451		    IN6_ARE_ADDR_EQUAL(mask, &ipv6_all_ones)) {
452			ire = ire_ctable_lookup_v6(dst_addr, 0, IRE_LOOPBACK,
453			    ipif, ALL_ZONES, NULL, match_flags, ipst);
454			if (ire != NULL) {
455				ire_refrele(ire);
456				if (ipif_refheld)
457					ipif_refrele(ipif);
458				return (EEXIST);
459			}
460			ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x"
461			    "for 0x%x\n", (void *)ipif,
462			    ipif->ipif_ire_type,
463			    ntohl(ipif->ipif_lcl_addr)));
464			ire = ire_create_v6(
465			    dst_addr,
466			    mask,
467			    &ipif->ipif_v6src_addr,
468			    NULL,
469			    &ipif->ipif_mtu,
470			    NULL,
471			    NULL,
472			    NULL,
473			    ipif->ipif_net_type,
474			    ipif->ipif_resolver_mp,
475			    ipif,
476			    NULL,
477			    0,
478			    0,
479			    flags,
480			    &ire_uinfo_null,
481			    NULL,
482			    NULL,
483			    ipst);
484			if (ire == NULL) {
485				if (ipif_refheld)
486					ipif_refrele(ipif);
487				return (ENOMEM);
488			}
489			error = ire_add(&ire, q, mp, func, B_FALSE);
490			if (error == 0)
491				goto save_ire;
492			/*
493			 * In the result of failure, ire_add() will have already
494			 * deleted the ire in question, so there is no need to
495			 * do that here.
496			 */
497			if (ipif_refheld)
498				ipif_refrele(ipif);
499			return (error);
500		}
501	}
502
503	/*
504	 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
505	 * and the gateway address provided is one of the system's interface
506	 * addresses.  By using the routing socket interface and supplying an
507	 * RTA_IFP sockaddr with an interface index, an alternate method of
508	 * specifying an interface route to be created is available which uses
509	 * the interface index that specifies the outgoing interface rather than
510	 * the address of an outgoing interface (which may not be able to
511	 * uniquely identify an interface).  When coupled with the RTF_GATEWAY
512	 * flag, routes can be specified which not only specify the next-hop to
513	 * be used when routing to a certain prefix, but also which outgoing
514	 * interface should be used.
515	 *
516	 * Previously, interfaces would have unique addresses assigned to them
517	 * and so the address assigned to a particular interface could be used
518	 * to identify a particular interface.  One exception to this was the
519	 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
520	 *
521	 * With the advent of IPv6 and its link-local addresses, this
522	 * restriction was relaxed and interfaces could share addresses between
523	 * themselves.  In fact, typically all of the link-local interfaces on
524	 * an IPv6 node or router will have the same link-local address.  In
525	 * order to differentiate between these interfaces, the use of an
526	 * interface index is necessary and this index can be carried inside a
527	 * RTA_IFP sockaddr (which is actually a sockaddr_dl).  One restriction
528	 * of using the interface index, however, is that all of the ipif's that
529	 * are part of an ill have the same index and so the RTA_IFP sockaddr
530	 * cannot be used to differentiate between ipif's (or logical
531	 * interfaces) that belong to the same ill (physical interface).
532	 *
533	 * For example, in the following case involving IPv4 interfaces and
534	 * logical interfaces
535	 *
536	 *	192.0.2.32	255.255.255.224	192.0.2.33	U	if0
537	 *	192.0.2.32	255.255.255.224	192.0.2.34	U	if0:1
538	 *	192.0.2.32	255.255.255.224	192.0.2.35	U	if0:2
539	 *
540	 * the ipif's corresponding to each of these interface routes can be
541	 * uniquely identified by the "gateway" (actually interface address).
542	 *
543	 * In this case involving multiple IPv6 default routes to a particular
544	 * link-local gateway, the use of RTA_IFP is necessary to specify which
545	 * default route is of interest:
546	 *
547	 *	default		fe80::123:4567:89ab:cdef	U	if0
548	 *	default		fe80::123:4567:89ab:cdef	U	if1
549	 */
550
551	/* RTF_GATEWAY not set */
552	if (!(flags & RTF_GATEWAY)) {
553		queue_t	*stq;
554
555		if (sp != NULL) {
556			ip2dbg(("ip_rt_add_v6: gateway security attributes "
557			    "cannot be set with interface route\n"));
558			if (ipif_refheld)
559				ipif_refrele(ipif);
560			return (EINVAL);
561		}
562
563		/*
564		 * As the interface index specified with the RTA_IFP sockaddr is
565		 * the same for all ipif's off of an ill, the matching logic
566		 * below uses MATCH_IRE_ILL if such an index was specified.
567		 * This means that routes sharing the same prefix when added
568		 * using a RTA_IFP sockaddr must have distinct interface
569		 * indices (namely, they must be on distinct ill's).
570		 *
571		 * On the other hand, since the gateway address will usually be
572		 * different for each ipif on the system, the matching logic
573		 * uses MATCH_IRE_IPIF in the case of a traditional interface
574		 * route.  This means that interface routes for the same prefix
575		 * can be created if they belong to distinct ipif's and if a
576		 * RTA_IFP sockaddr is not present.
577		 */
578		if (ipif_arg != NULL) {
579			if (ipif_refheld) {
580				ipif_refrele(ipif);
581				ipif_refheld = B_FALSE;
582			}
583			ipif = ipif_arg;
584			match_flags |= MATCH_IRE_ILL;
585		} else {
586			/*
587			 * Check the ipif corresponding to the gw_addr
588			 */
589			if (ipif == NULL)
590				return (ENETUNREACH);
591			match_flags |= MATCH_IRE_IPIF;
592		}
593
594		ASSERT(ipif != NULL);
595		/*
596		 * We check for an existing entry at this point.
597		 */
598		match_flags |= MATCH_IRE_MASK;
599		ire = ire_ftable_lookup_v6(dst_addr, mask, 0, IRE_INTERFACE,
600		    ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
601		if (ire != NULL) {
602			ire_refrele(ire);
603			if (ipif_refheld)
604				ipif_refrele(ipif);
605			return (EEXIST);
606		}
607
608		stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
609		    ? ipif->ipif_rq : ipif->ipif_wq;
610
611		/*
612		 * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or
613		 * IRE_IF_RESOLVER with the modified address and netmask.
614		 */
615		ire = ire_create_v6(
616		    dst_addr,
617		    mask,
618		    &ipif->ipif_v6src_addr,
619		    NULL,
620		    &ipif->ipif_mtu,
621		    NULL,
622		    NULL,
623		    stq,
624		    ipif->ipif_net_type,
625		    ipif->ipif_resolver_mp,
626		    ipif,
627		    NULL,
628		    0,
629		    0,
630		    flags,
631		    &ire_uinfo_null,
632		    NULL,
633		    NULL,
634		    ipst);
635		if (ire == NULL) {
636			if (ipif_refheld)
637				ipif_refrele(ipif);
638			return (ENOMEM);
639		}
640
641		/*
642		 * Some software (for example, GateD and Sun Cluster) attempts
643		 * to create (what amount to) IRE_PREFIX routes with the
644		 * loopback address as the gateway.  This is primarily done to
645		 * set up prefixes with the RTF_REJECT flag set (for example,
646		 * when generating aggregate routes.)
647		 *
648		 * If the IRE type (as defined by ipif->ipif_net_type) is
649		 * IRE_LOOPBACK, then we map the request into a
650		 * IRE_IF_NORESOLVER.
651		 *
652		 * Needless to say, the real IRE_LOOPBACK is NOT created by this
653		 * routine, but rather using ire_create_v6() directly.
654		 */
655		if (ipif->ipif_net_type == IRE_LOOPBACK)
656			ire->ire_type = IRE_IF_NORESOLVER;
657		error = ire_add(&ire, q, mp, func, B_FALSE);
658		if (error == 0)
659			goto save_ire;
660		/*
661		 * In the result of failure, ire_add() will have already
662		 * deleted the ire in question, so there is no need to
663		 * do that here.
664		 */
665		if (ipif_refheld)
666			ipif_refrele(ipif);
667		return (error);
668	}
669	if (ipif_refheld) {
670		ipif_refrele(ipif);
671		ipif_refheld = B_FALSE;
672	}
673
674	/*
675	 * Get an interface IRE for the specified gateway.
676	 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
677	 * gateway, it is currently unreachable and we fail the request
678	 * accordingly.
679	 */
680	ipif = ipif_arg;
681	if (ipif_arg != NULL)
682		match_flags |= MATCH_IRE_ILL;
683	gw_ire = ire_ftable_lookup_v6(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg,
684	    NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
685	if (gw_ire == NULL)
686		return (ENETUNREACH);
687
688	/*
689	 * We create one of three types of IREs as a result of this request
690	 * based on the netmask.  A netmask of all ones (which is automatically
691	 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
692	 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
693	 * created.  Otherwise, an IRE_PREFIX route is created for the
694	 * destination prefix.
695	 */
696	if (IN6_ARE_ADDR_EQUAL(mask, &ipv6_all_ones))
697		type = IRE_HOST;
698	else if (IN6_IS_ADDR_UNSPECIFIED(mask))
699		type = IRE_DEFAULT;
700	else
701		type = IRE_PREFIX;
702
703	/* check for a duplicate entry */
704	ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type, ipif_arg,
705	    NULL, ALL_ZONES, 0, NULL,
706	    match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst);
707	if (ire != NULL) {
708		ire_refrele(gw_ire);
709		ire_refrele(ire);
710		return (EEXIST);
711	}
712
713	/* Security attribute exists */
714	if (sp != NULL) {
715		tsol_gcgrp_addr_t ga;
716
717		/* find or create the gateway credentials group */
718		ga.ga_af = AF_INET6;
719		ga.ga_addr = *gw_addr;
720
721		/* we hold reference to it upon success */
722		gcgrp = gcgrp_lookup(&ga, B_TRUE);
723		if (gcgrp == NULL) {
724			ire_refrele(gw_ire);
725			return (ENOMEM);
726		}
727
728		/*
729		 * Create and add the security attribute to the group; a
730		 * reference to the group is made upon allocating a new
731		 * entry successfully.  If it finds an already-existing
732		 * entry for the security attribute in the group, it simply
733		 * returns it and no new reference is made to the group.
734		 */
735		gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
736		if (gc == NULL) {
737			/* release reference held by gcgrp_lookup */
738			GCGRP_REFRELE(gcgrp);
739			ire_refrele(gw_ire);
740			return (ENOMEM);
741		}
742	}
743
744	/* Create the IRE. */
745	ire = ire_create_v6(
746	    dst_addr,				/* dest address */
747	    mask,				/* mask */
748	    /* src address assigned by the caller? */
749	    (((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr)) ?
750		src_addr : NULL),
751	    gw_addr,				/* gateway address */
752	    &gw_ire->ire_max_frag,
753	    NULL,				/* no Fast Path header */
754	    NULL,				/* no recv-from queue */
755	    NULL,				/* no send-to queue */
756	    (ushort_t)type,			/* IRE type */
757	    NULL,
758	    ipif_arg,
759	    NULL,
760	    0,
761	    0,
762	    flags,
763	    &gw_ire->ire_uinfo,			/* Inherit ULP info from gw */
764	    gc,					/* security attribute */
765	    NULL,
766	    ipst);
767
768	/*
769	 * The ire holds a reference to the 'gc' and the 'gc' holds a
770	 * reference to the 'gcgrp'. We can now release the extra reference
771	 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
772	 */
773	if (gcgrp_xtraref)
774		GCGRP_REFRELE(gcgrp);
775	if (ire == NULL) {
776		if (gc != NULL)
777			GC_REFRELE(gc);
778		ire_refrele(gw_ire);
779		return (ENOMEM);
780	}
781
782	/*
783	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
784	 * SUN/OS socket stuff does but do we really want to allow ::0 ?
785	 */
786
787	/* Add the new IRE. */
788	error = ire_add(&ire, q, mp, func, B_FALSE);
789	/*
790	 * In the result of failure, ire_add() will have already
791	 * deleted the ire in question, so there is no need to
792	 * do that here.
793	 */
794	if (error != 0) {
795		ire_refrele(gw_ire);
796		return (error);
797	}
798
799	if (flags & RTF_MULTIRT) {
800		/*
801		 * Invoke the CGTP (multirouting) filtering module
802		 * to add the dst address in the filtering database.
803		 * Replicated inbound packets coming from that address
804		 * will be filtered to discard the duplicates.
805		 * It is not necessary to call the CGTP filter hook
806		 * when the dst address is a multicast, because an
807		 * IP source address cannot be a multicast.
808		 */
809		if ((ip_cgtp_filter_ops != NULL) &&
810		    ipst->ips_netstack->netstack_stackid == GLOBAL_NETSTACKID &&
811		    !IN6_IS_ADDR_MULTICAST(&(ire->ire_addr_v6))) {
812			int res = ip_cgtp_filter_ops->cfo_add_dest_v6(
813			    &ire->ire_addr_v6,
814			    &ire->ire_gateway_addr_v6,
815			    &ire->ire_src_addr_v6,
816			    &gw_ire->ire_src_addr_v6);
817			if (res != 0) {
818				ire_refrele(gw_ire);
819				ire_delete(ire);
820				return (res);
821			}
822		}
823	}
824
825	/*
826	 * Now that the prefix IRE entry has been created, delete any
827	 * existing gateway IRE cache entries as well as any IRE caches
828	 * using the gateway, and force them to be created through
829	 * ip_newroute_v6.
830	 */
831	if (gc != NULL) {
832		ASSERT(gcgrp != NULL);
833		ire_clookup_delete_cache_gw_v6(gw_addr, ALL_ZONES, ipst);
834	}
835
836save_ire:
837	if (gw_ire != NULL) {
838		ire_refrele(gw_ire);
839	}
840	if (ipif != NULL) {
841		mblk_t	*save_mp;
842
843		/*
844		 * Save enough information so that we can recreate the IRE if
845		 * the interface goes down and then up.  The metrics associated
846		 * with the route will be saved as well when rts_setmetrics() is
847		 * called after the IRE has been created.  In the case where
848		 * memory cannot be allocated, none of this information will be
849		 * saved.
850		 */
851		save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
852		if (save_mp != NULL) {
853			ifrt_t	*ifrt;
854
855			save_mp->b_wptr += sizeof (ifrt_t);
856			ifrt = (ifrt_t *)save_mp->b_rptr;
857			bzero(ifrt, sizeof (ifrt_t));
858			ifrt->ifrt_type = ire->ire_type;
859			ifrt->ifrt_v6addr = ire->ire_addr_v6;
860			mutex_enter(&ire->ire_lock);
861			ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6;
862			ifrt->ifrt_v6src_addr = ire->ire_src_addr_v6;
863			mutex_exit(&ire->ire_lock);
864			ifrt->ifrt_v6mask = ire->ire_mask_v6;
865			ifrt->ifrt_flags = ire->ire_flags;
866			ifrt->ifrt_max_frag = ire->ire_max_frag;
867			mutex_enter(&ipif->ipif_saved_ire_lock);
868			save_mp->b_cont = ipif->ipif_saved_ire_mp;
869			ipif->ipif_saved_ire_mp = save_mp;
870			ipif->ipif_saved_ire_cnt++;
871			mutex_exit(&ipif->ipif_saved_ire_lock);
872		}
873	}
874	if (ire_arg != NULL) {
875		/*
876		 * Store the ire that was successfully added into where ire_arg
877		 * points to so that callers don't have to look it up
878		 * themselves (but they are responsible for ire_refrele()ing
879		 * the ire when they are finished with it).
880		 */
881		*ire_arg = ire;
882	} else {
883		ire_refrele(ire);		/* Held in ire_add */
884	}
885	if (ipif_refheld)
886		ipif_refrele(ipif);
887	return (0);
888}
889
890/*
891 * ip_rt_delete_v6 is called to delete an IPv6 route.
892 * ipif_arg is passed in to associate it with the correct interface
893 * (for link-local destinations and gateways).
894 */
895/* ARGSUSED4 */
896int
897ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
898    const in6_addr_t *gw_addr, uint_t rtm_addrs, int flags, ipif_t *ipif_arg,
899    queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst)
900{
901	ire_t	*ire = NULL;
902	ipif_t	*ipif;
903	uint_t	type;
904	uint_t	match_flags = MATCH_IRE_TYPE;
905	int	err = 0;
906	boolean_t	ipif_refheld = B_FALSE;
907
908	/*
909	 * If this is the case of RTF_HOST being set, then we set the netmask
910	 * to all ones.  Otherwise, we use the netmask if one was supplied.
911	 */
912	if (flags & RTF_HOST) {
913		mask = &ipv6_all_ones;
914		match_flags |= MATCH_IRE_MASK;
915	} else if (rtm_addrs & RTA_NETMASK) {
916		match_flags |= MATCH_IRE_MASK;
917	}
918
919	/*
920	 * Note that RTF_GATEWAY is never set on a delete, therefore
921	 * we check if the gateway address is one of our interfaces first,
922	 * and fall back on RTF_GATEWAY routes.
923	 *
924	 * This makes it possible to delete an original
925	 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
926	 *
927	 * As the interface index specified with the RTA_IFP sockaddr is the
928	 * same for all ipif's off of an ill, the matching logic below uses
929	 * MATCH_IRE_ILL if such an index was specified.  This means a route
	 * sharing the same prefix and interface index as the route
931	 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr
932	 * is specified in the request.
933	 *
934	 * On the other hand, since the gateway address will usually be
935	 * different for each ipif on the system, the matching logic
936	 * uses MATCH_IRE_IPIF in the case of a traditional interface
937	 * route.  This means that interface routes for the same prefix can be
938	 * uniquely identified if they belong to distinct ipif's and if a
939	 * RTA_IFP sockaddr is not present.
940	 *
941	 * For more detail on specifying routes by gateway address and by
942	 * interface index, see the comments in ip_rt_add_v6().
943	 */
944	ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, q, mp, func, &err,
945	    ipst);
946	if (ipif != NULL) {
947		ipif_refheld = B_TRUE;
948		if (ipif_arg != NULL) {
949			ipif_refrele(ipif);
950			ipif_refheld = B_FALSE;
951			ipif = ipif_arg;
952			match_flags |= MATCH_IRE_ILL;
953		} else {
954			match_flags |= MATCH_IRE_IPIF;
955		}
956
957		if (ipif->ipif_ire_type == IRE_LOOPBACK)
958			ire = ire_ctable_lookup_v6(dst_addr, 0, IRE_LOOPBACK,
959			    ipif, ALL_ZONES, NULL, match_flags, ipst);
960		if (ire == NULL)
961			ire = ire_ftable_lookup_v6(dst_addr, mask, 0,
962			    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL,
963			    match_flags, ipst);
964	} else if (err == EINPROGRESS) {
965		return (err);
966	} else {
967		err = 0;
968	}
969	if (ire == NULL) {
970		/*
971		 * At this point, the gateway address is not one of our own
972		 * addresses or a matching interface route was not found.  We
973		 * set the IRE type to lookup based on whether
974		 * this is a host route, a default route or just a prefix.
975		 *
976		 * If an ipif_arg was passed in, then the lookup is based on an
977		 * interface index so MATCH_IRE_ILL is added to match_flags.
978		 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is
979		 * set as the route being looked up is not a traditional
980		 * interface route.
981		 */
982		match_flags &= ~MATCH_IRE_IPIF;
983		match_flags |= MATCH_IRE_GW;
984		if (ipif_arg != NULL)
985			match_flags |= MATCH_IRE_ILL;
986		if (IN6_ARE_ADDR_EQUAL(mask, &ipv6_all_ones))
987			type = IRE_HOST;
988		else if (IN6_IS_ADDR_UNSPECIFIED(mask))
989			type = IRE_DEFAULT;
990		else
991			type = IRE_PREFIX;
992		ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type,
993		    ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
994	}
995
996	if (ipif_refheld) {
997		ipif_refrele(ipif);
998		ipif_refheld = B_FALSE;
999	}
1000	if (ire == NULL)
1001		return (ESRCH);
1002
1003	if (ire->ire_flags & RTF_MULTIRT) {
1004		/*
1005		 * Invoke the CGTP (multirouting) filtering module
1006		 * to remove the dst address from the filtering database.
1007		 * Packets coming from that address will no longer be
1008		 * filtered to remove duplicates.
1009		 */
1010		if (ip_cgtp_filter_ops != NULL &&
1011		    ipst->ips_netstack->netstack_stackid == GLOBAL_NETSTACKID) {
1012			err = ip_cgtp_filter_ops->cfo_del_dest_v6(
1013			    &ire->ire_addr_v6, &ire->ire_gateway_addr_v6);
1014		}
1015	}
1016
1017	ipif = ire->ire_ipif;
1018	if (ipif != NULL) {
1019		mblk_t		**mpp;
1020		mblk_t		*mp;
1021		ifrt_t		*ifrt;
1022		in6_addr_t	gw_addr_v6;
1023
1024		/* Remove from ipif_saved_ire_mp list if it is there */
1025		mutex_enter(&ire->ire_lock);
1026		gw_addr_v6 = ire->ire_gateway_addr_v6;
1027		mutex_exit(&ire->ire_lock);
1028		mutex_enter(&ipif->ipif_saved_ire_lock);
1029		for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL;
1030		    mpp = &(*mpp)->b_cont) {
1031			/*
1032			 * On a given ipif, the triple of address, gateway and
1033			 * mask is unique for each saved IRE (in the case of
1034			 * ordinary interface routes, the gateway address is
1035			 * all-zeroes).
1036			 */
1037			mp = *mpp;
1038			ifrt = (ifrt_t *)mp->b_rptr;
1039			if (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
1040			    &ire->ire_addr_v6) &&
1041			    IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
1042			    &gw_addr_v6) &&
1043			    IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
1044			    &ire->ire_mask_v6)) {
1045				*mpp = mp->b_cont;
1046				ipif->ipif_saved_ire_cnt--;
1047				freeb(mp);
1048				break;
1049			}
1050		}
1051		mutex_exit(&ipif->ipif_saved_ire_lock);
1052	}
1053	ire_delete(ire);
1054	ire_refrele(ire);
1055	return (err);
1056}
1057
1058/*
1059 * Derive a token from the link layer address.
1060 */
1061boolean_t
1062ill_setdefaulttoken(ill_t *ill)
1063{
1064	int 		i;
1065	in6_addr_t	v6addr, v6mask;
1066
1067	if (!MEDIA_V6INTFID(ill->ill_media, ill->ill_phys_addr_length,
1068	    ill->ill_phys_addr, &v6addr))
1069		return (B_FALSE);
1070
1071	(void) ip_plen_to_mask_v6(IPV6_TOKEN_LEN, &v6mask);
1072
1073	for (i = 0; i < 4; i++)
1074		v6mask.s6_addr32[i] = v6mask.s6_addr32[i] ^
1075		    (uint32_t)0xffffffff;
1076
1077	V6_MASK_COPY(v6addr, v6mask, ill->ill_token);
1078	ill->ill_token_length = IPV6_TOKEN_LEN;
1079	return (B_TRUE);
1080}
1081
1082/*
1083 * Create a link-local address from a token.
1084 */
1085static void
1086ipif_get_linklocal(in6_addr_t *dest, const in6_addr_t *token)
1087{
1088	int i;
1089
1090	for (i = 0; i < 4; i++) {
1091		dest->s6_addr32[i] =
1092		    token->s6_addr32[i] | ipv6_ll_template.s6_addr32[i];
1093	}
1094}
1095
1096/*
1097 * Set a nice default address for either automatic tunnels tsrc/96 or
1098 * 6to4 tunnels 2002:<tsrc>::1/64
1099 */
1100static void
1101ipif_set_tun_auto_addr(ipif_t *ipif, struct iftun_req *ta)
1102{
1103	sin6_t	sin6;
1104	sin_t	*sin;
1105	ill_t 	*ill = ipif->ipif_ill;
1106	tun_t *tp = (tun_t *)ill->ill_wq->q_next->q_ptr;
1107
1108	if (ta->ifta_saddr.ss_family != AF_INET ||
1109	    (ipif->ipif_flags & IPIF_UP) || !ipif->ipif_isv6 ||
1110	    (ta->ifta_flags & IFTUN_SRC) == 0)
1111		return;
1112
1113	/*
1114	 * Check the tunnel type by examining q_next->q_ptr
1115	 */
1116	if (tp->tun_flags & TUN_AUTOMATIC) {
1117		/* this is an automatic tunnel */
1118		(void) ip_plen_to_mask_v6(IPV6_ABITS - IP_ABITS,
1119		    &ipif->ipif_v6net_mask);
1120		bzero(&sin6, sizeof (sin6_t));
1121		sin = (sin_t *)&ta->ifta_saddr;
1122		V4_PART_OF_V6(sin6.sin6_addr) = sin->sin_addr.s_addr;
1123		sin6.sin6_family = AF_INET6;
1124		(void) ip_sioctl_addr(ipif, (sin_t *)&sin6,
1125		    NULL, NULL, NULL, NULL);
1126	} else if (tp->tun_flags & TUN_6TO4) {
1127		/* this is a 6to4 tunnel */
1128		(void) ip_plen_to_mask_v6(IPV6_PREFIX_LEN,
1129		    &ipif->ipif_v6net_mask);
1130		sin = (sin_t *)&ta->ifta_saddr;
1131		/* create a 6to4 address from the IPv4 tsrc */
1132		IN6_V4ADDR_TO_6TO4(&sin->sin_addr, &sin6.sin6_addr);
1133		sin6.sin6_family = AF_INET6;
1134		(void) ip_sioctl_addr(ipif, (sin_t *)&sin6,
1135		    NULL, NULL, NULL, NULL);
1136	} else {
1137		ip1dbg(("ipif_set_tun_auto_addr: Unknown tunnel type"));
1138		return;
1139	}
1140}
1141
1142/*
1143 * Set link local for ipif_id 0 of a configured tunnel based on the
1144 * tsrc or tdst parameter
1145 * For tunnels over IPv4 use the IPv4 address prepended with 32 zeros as
1146 * the token.
1147 * For tunnels over IPv6 use the low-order 64 bits of the "inner" IPv6 address
1148 * as the token for the "outer" link.
1149 */
1150void
1151ipif_set_tun_llink(ill_t *ill, struct iftun_req *ta)
1152{
1153	ipif_t		*ipif;
1154	sin_t		*sin;
1155	in6_addr_t	*s6addr;
1156
1157	ASSERT(IAM_WRITER_ILL(ill));
1158
1159	/* The first ipif must be id zero. */
1160	ipif = ill->ill_ipif;
1161	ASSERT(ipif->ipif_id == 0);
1162
1163	/* no link local for automatic tunnels */
1164	if (!(ipif->ipif_flags & IPIF_POINTOPOINT)) {
1165		ipif_set_tun_auto_addr(ipif, ta);
1166		return;
1167	}
1168
1169	if ((ta->ifta_flags & IFTUN_DST) &&
1170	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)) {
1171		sin6_t	sin6;
1172
1173		ASSERT(!(ipif->ipif_flags & IPIF_UP));
1174		bzero(&sin6, sizeof (sin6_t));
1175		if ((ta->ifta_saddr.ss_family == AF_INET)) {
1176			sin = (sin_t *)&ta->ifta_daddr;
1177			V4_PART_OF_V6(sin6.sin6_addr) =
1178			    sin->sin_addr.s_addr;
1179		} else {
1180			s6addr =
1181			    &((sin6_t *)&ta->ifta_daddr)->sin6_addr;
1182			sin6.sin6_addr.s6_addr32[3] = s6addr->s6_addr32[3];
1183			sin6.sin6_addr.s6_addr32[2] = s6addr->s6_addr32[2];
1184		}
1185		ipif_get_linklocal(&ipif->ipif_v6pp_dst_addr,
1186		    &sin6.sin6_addr);
1187		ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
1188	}
1189	if ((ta->ifta_flags & IFTUN_SRC)) {
1190		ASSERT(!(ipif->ipif_flags & IPIF_UP));
1191
1192		/* Set the token if it isn't already set */
1193		if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token)) {
1194			if ((ta->ifta_saddr.ss_family == AF_INET)) {
1195				sin = (sin_t *)&ta->ifta_saddr;
1196				V4_PART_OF_V6(ill->ill_token) =
1197				    sin->sin_addr.s_addr;
1198			} else {
1199				s6addr =
1200				    &((sin6_t *)&ta->ifta_saddr)->sin6_addr;
1201				ill->ill_token.s6_addr32[3] =
1202				    s6addr->s6_addr32[3];
1203				ill->ill_token.s6_addr32[2] =
1204				    s6addr->s6_addr32[2];
1205			}
1206			ill->ill_token_length = IPV6_TOKEN_LEN;
1207		}
1208		/*
1209		 * Attempt to set the link local address if it isn't set.
1210		 */
1211		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
1212			(void) ipif_setlinklocal(ipif);
1213	}
1214}
1215
1216/*
1217 * Is it not possible to set the link local address?
1218 * The address can be set if the token is set, and the token
1219 * isn't too long.
1220 * Return B_TRUE if the address can't be set, or B_FALSE if it can.
1221 */
1222boolean_t
1223ipif_cant_setlinklocal(ipif_t *ipif)
1224{
1225	ill_t *ill = ipif->ipif_ill;
1226
1227	if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token) ||
1228	    ill->ill_token_length > IPV6_ABITS - IPV6_LL_PREFIXLEN)
1229		return (B_TRUE);
1230
1231	return (B_FALSE);
1232}
1233
1234/*
1235 * Generate a link-local address from the token.
1236 * Return zero if the address was set, or non-zero if it couldn't be set.
1237 */
1238int
1239ipif_setlinklocal(ipif_t *ipif)
1240{
1241	ill_t *ill = ipif->ipif_ill;
1242
1243	ASSERT(IAM_WRITER_ILL(ill));
1244
1245	if (ipif_cant_setlinklocal(ipif))
1246		return (-1);
1247
1248	ipif_get_linklocal(&ipif->ipif_v6lcl_addr, &ill->ill_token);
1249	(void) ip_plen_to_mask_v6(IPV6_LL_PREFIXLEN, &ipif->ipif_v6net_mask);
1250	V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
1251	    ipif->ipif_v6subnet);
1252
1253	if (ipif->ipif_flags & IPIF_NOLOCAL) {
1254		ipif->ipif_v6src_addr = ipv6_all_zeros;
1255	} else {
1256		ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
1257	}
1258	return (0);
1259}
1260
1261/*
1262 * This function sets up the multicast mappings in NDP.
1263 * Unlike ARP, there are no mapping_mps here. We delete the
1264 * mapping nces and add a new one.
1265 *
1266 * Returns non-zero on error and 0 on success.
1267 */
1268int
1269ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce)
1270{
1271	ill_t		*ill = ipif->ipif_ill;
1272	in6_addr_t	v6_mcast_addr = {(uint32_t)V6_MCAST, 0, 0, 0};
1273	in6_addr_t	v6_mcast_mask = {(uint32_t)V6_MCAST, 0, 0, 0};
1274	in6_addr_t	v6_extract_mask;
1275	uchar_t		*phys_addr, *bphys_addr, *alloc_phys;
1276	nce_t		*mnce = NULL;
1277	int		err = 0;
1278	phyint_t	*phyi = ill->ill_phyint;
1279	uint32_t	hw_extract_start;
1280	dl_unitdata_req_t *dlur;
1281	ip_stack_t	*ipst = ill->ill_ipst;
1282
1283	if (ret_nce != NULL)
1284		*ret_nce = NULL;
1285	/*
1286	 * Delete the mapping nce. Normally these should not exist
1287	 * as a previous ipif_down -> ipif_ndp_down should have deleted
1288	 * all the nces. But they can exist if ip_rput_dlpi_writer
1289	 * calls this when PHYI_MULTI_BCAST is set.
1290	 */
1291	mnce = ndp_lookup_v6(ill, &v6_mcast_addr, B_FALSE);
1292	if (mnce != NULL) {
1293		ndp_delete(mnce);
1294		NCE_REFRELE(mnce);
1295		mnce = NULL;
1296	}
1297
1298	/*
1299	 * Get media specific v6 mapping information. Note that
1300	 * nd_lla_len can be 0 for tunnels.
1301	 */
1302	alloc_phys = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1303	if ((alloc_phys == NULL) && (ill->ill_nd_lla_len != 0))
1304		return (ENOMEM);
1305	/*
1306	 * Determine the broadcast address.
1307	 */
1308	dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
1309	if (ill->ill_sap_length < 0)
1310		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
1311	else
1312		bphys_addr = (uchar_t *)dlur +
1313		    dlur->dl_dest_addr_offset + ill->ill_sap_length;
1314
1315	/*
1316	 * Check PHYI_MULTI_BCAST and possible length of physical
1317	 * address to determine if we use the mapping or the
1318	 * broadcast address.
1319	 */
1320	if ((phyi->phyint_flags & PHYI_MULTI_BCAST) ||
1321	    (!MEDIA_V6MINFO(ill->ill_media, ill->ill_nd_lla_len,
1322	    bphys_addr, alloc_phys, &hw_extract_start,
1323	    &v6_extract_mask))) {
1324		if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) {
1325			kmem_free(alloc_phys, ill->ill_nd_lla_len);
1326			return (E2BIG);
1327		}
1328		/* Use the link-layer broadcast address for MULTI_BCAST */
1329		phys_addr = bphys_addr;
1330		bzero(&v6_extract_mask, sizeof (v6_extract_mask));
1331		hw_extract_start = ill->ill_nd_lla_len;
1332	} else {
1333		phys_addr = alloc_phys;
1334	}
1335	if ((ipif->ipif_flags & IPIF_BROADCAST) ||
1336	    (ill->ill_flags & ILLF_MULTICAST) ||
1337	    (phyi->phyint_flags & PHYI_MULTI_BCAST)) {
1338		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1339		err = ndp_add(ill,
1340		    phys_addr,
1341		    &v6_mcast_addr,	/* v6 address */
1342		    &v6_mcast_mask,	/* v6 mask */
1343		    &v6_extract_mask,
1344		    hw_extract_start,
1345		    NCE_F_MAPPING | NCE_F_PERMANENT | NCE_F_NONUD,
1346		    ND_REACHABLE,
1347		    &mnce,
1348		    NULL,
1349		    NULL);
1350		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1351		if (err == 0) {
1352			if (ret_nce != NULL) {
1353				*ret_nce = mnce;
1354			} else {
1355				NCE_REFRELE(mnce);
1356			}
1357		}
1358	}
1359	kmem_free(alloc_phys, ill->ill_nd_lla_len);
1360	return (err);
1361}
1362
1363/*
1364 * Get the resolver set up for a new interface address.  (Always called
1365 * as writer.)
1366 */
1367int
1368ipif_ndp_up(ipif_t *ipif, const in6_addr_t *addr)
1369{
1370	ill_t		*ill = ipif->ipif_ill;
1371	int		err = 0;
1372	nce_t		*nce = NULL;
1373	nce_t		*mnce = NULL;
1374
1375	ip1dbg(("ipif_ndp_up(%s:%u)\n",
1376		ipif->ipif_ill->ill_name, ipif->ipif_id));
1377
1378	/*
1379	 * ND not supported on XRESOLV interfaces. If ND support (multicast)
1380	 * added later, take out this check.
1381	 */
1382	if ((ill->ill_flags & ILLF_XRESOLV) ||
1383	    IN6_IS_ADDR_UNSPECIFIED(addr) ||
1384	    (!(ill->ill_net_type & IRE_INTERFACE))) {
1385		ipif->ipif_addr_ready = 1;
1386		return (0);
1387	}
1388
1389	/*
1390	 * Need to setup multicast mapping only when the first
1391	 * interface is coming UP.
1392	 */
1393	if (ill->ill_ipif_up_count == 0 &&
1394	    (ill->ill_flags & ILLF_MULTICAST)) {
1395		/*
1396		 * We set the multicast before setting up the mapping for
1397		 * local address because ipif_ndp_setup_multicast does
1398		 * ndp_walk to delete nces which will delete the mapping
1399		 * for local address also if we added the mapping for
1400		 * local address first.
1401		 */
1402		err = ipif_ndp_setup_multicast(ipif, &mnce);
1403		if (err != 0)
1404			return (err);
1405	}
1406
1407	if ((ipif->ipif_flags & (IPIF_UNNUMBERED|IPIF_NOLOCAL)) == 0) {
1408		uint16_t	flags;
1409		uchar_t	*hw_addr = NULL;
1410
1411		/* Permanent entries don't need NUD */
1412		flags = NCE_F_PERMANENT | NCE_F_NONUD;
1413		if (ill->ill_flags & ILLF_ROUTER)
1414			flags |= NCE_F_ISROUTER;
1415
1416		if (ipif->ipif_flags & IPIF_ANYCAST)
1417			flags |= NCE_F_ANYCAST;
1418
1419		if (ill->ill_net_type == IRE_IF_RESOLVER) {
1420			hw_addr = ill->ill_nd_lla;
1421
1422			if (ill->ill_move_in_progress) {
1423				/*
1424				 * Addresses are failing over to this ill.
1425				 * Don't wait for NUD to see this change.
1426				 * Publish our new link-layer address.
1427				 */
1428				flags |= NCE_F_UNSOL_ADV;
1429			}
1430		}
1431		err = ndp_lookup_then_add(ill,
1432		    hw_addr,
1433		    addr,
1434		    &ipv6_all_ones,
1435		    &ipv6_all_zeros,
1436		    0,
1437		    flags,
1438		    ND_PROBE,	/* Causes Duplicate Address Detection to run */
1439		    &nce,
1440		    NULL,
1441		    NULL);
1442		switch (err) {
1443		case 0:
1444			ip1dbg(("ipif_ndp_up: NCE created for %s\n",
1445			    ill->ill_name));
1446			ipif->ipif_addr_ready = 1;
1447			break;
1448		case EINPROGRESS:
1449			ip1dbg(("ipif_ndp_up: running DAD now for %s\n",
1450			    ill->ill_name));
1451			break;
1452		case EEXIST:
1453			NCE_REFRELE(nce);
1454			ip1dbg(("ipif_ndp_up: NCE already exists for %s\n",
1455			    ill->ill_name));
1456			if (mnce != NULL) {
1457				ndp_delete(mnce);
1458				NCE_REFRELE(mnce);
1459			}
1460			return (err);
1461		default:
1462			ip1dbg(("ipif_ndp_up: NCE creation failed %s\n",
1463			    ill->ill_name));
1464			if (mnce != NULL) {
1465				ndp_delete(mnce);
1466				NCE_REFRELE(mnce);
1467			}
1468			return (err);
1469		}
1470	} else {
1471		/* No local NCE for this entry */
1472		ipif->ipif_addr_ready = 1;
1473	}
1474	if (nce != NULL)
1475		NCE_REFRELE(nce);
1476	if (mnce != NULL)
1477		NCE_REFRELE(mnce);
1478	return (0);
1479}
1480
1481/* Remove all cache entries for this logical interface */
1482void
1483ipif_ndp_down(ipif_t *ipif)
1484{
1485	nce_t	*nce;
1486
1487	if (ipif->ipif_isv6) {
1488		nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr,
1489		    B_FALSE);
1490		if (nce != NULL) {
1491			ndp_delete(nce);
1492			NCE_REFRELE(nce);
1493		}
1494	}
1495	/*
1496	 * Remove mapping and all other nces dependent on this ill
1497	 * when the last ipif is going away.
1498	 */
1499	if (ipif->ipif_ill->ill_ipif_up_count == 0) {
1500		ndp_walk(ipif->ipif_ill, (pfi_t)ndp_delete_per_ill,
1501		    (uchar_t *)ipif->ipif_ill, ipif->ipif_ill->ill_ipst);
1502	}
1503}
1504
1505/*
1506 * Used when an interface comes up to recreate any extra routes on this
1507 * interface.
1508 */
1509static ire_t **
1510ipif_recover_ire_v6(ipif_t *ipif)
1511{
1512	mblk_t	*mp;
1513	ire_t   **ipif_saved_irep;
1514	ire_t   **irep;
1515	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
1516
1517	ip1dbg(("ipif_recover_ire_v6(%s:%u)", ipif->ipif_ill->ill_name,
1518	    ipif->ipif_id));
1519
1520	ASSERT(ipif->ipif_isv6);
1521
1522	mutex_enter(&ipif->ipif_saved_ire_lock);
1523	ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) *
1524	    ipif->ipif_saved_ire_cnt, KM_NOSLEEP);
1525	if (ipif_saved_irep == NULL) {
1526		mutex_exit(&ipif->ipif_saved_ire_lock);
1527		return (NULL);
1528	}
1529
1530	irep = ipif_saved_irep;
1531
1532	for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
1533		ire_t		*ire;
1534		queue_t		*rfq;
1535		queue_t		*stq;
1536		ifrt_t		*ifrt;
1537		in6_addr_t	*src_addr;
1538		in6_addr_t	*gateway_addr;
1539		mblk_t		*resolver_mp;
1540		char		buf[INET6_ADDRSTRLEN];
1541		ushort_t	type;
1542
1543		/*
1544		 * When the ire was initially created and then added in
1545		 * ip_rt_add_v6(), it was created either using
1546		 * ipif->ipif_net_type in the case of a traditional interface
1547		 * route, or as one of the IRE_OFFSUBNET types (with the
1548		 * exception of IRE_HOST type redirect ire which is created by
1549		 * icmp_redirect_v6() and which we don't need to save or
1550		 * recover).  In the case where ipif->ipif_net_type was
1551		 * IRE_LOOPBACK, ip_rt_add_v6() will update the ire_type to
1552		 * IRE_IF_NORESOLVER before calling ire_add_v6() to satisfy
1553		 * software like GateD and Sun Cluster which creates routes
1554		 * using the the loopback interface's address as a gateway.
1555		 *
1556		 * As ifrt->ifrt_type reflects the already updated ire_type and
1557		 * since ire_create_v6() expects that IRE_IF_NORESOLVER will
1558		 * have a valid ire_dlureq_mp field (which doesn't make sense
1559		 * for a IRE_LOOPBACK), ire_create_v6() will be called in the
1560		 * same way here as in ip_rt_add_v6(), namely using
1561		 * ipif->ipif_net_type when the route looks like a traditional
1562		 * interface route (where ifrt->ifrt_type & IRE_INTERFACE is
1563		 * true) and otherwise using the saved ifrt->ifrt_type.  This
1564		 * means that in the case where ipif->ipif_net_type is
1565		 * IRE_LOOPBACK, the ire created by ire_create_v6() will be an
1566		 * IRE_LOOPBACK, it will then be turned into an
1567		 * IRE_IF_NORESOLVER and then added by ire_add_v6().
1568		 */
1569		ifrt = (ifrt_t *)mp->b_rptr;
1570		if (ifrt->ifrt_type & IRE_INTERFACE) {
1571			rfq = NULL;
1572			stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
1573			    ? ipif->ipif_rq : ipif->ipif_wq;
1574			src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
1575			    ? &ifrt->ifrt_v6src_addr
1576			    : &ipif->ipif_v6src_addr;
1577			gateway_addr = NULL;
1578			resolver_mp = ipif->ipif_resolver_mp;
1579			type = ipif->ipif_net_type;
1580		} else {
1581			rfq = NULL;
1582			stq = NULL;
1583			src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
1584			    ? &ifrt->ifrt_v6src_addr : NULL;
1585			gateway_addr = &ifrt->ifrt_v6gateway_addr;
1586			resolver_mp = NULL;
1587			type = ifrt->ifrt_type;
1588		}
1589
1590		/*
1591		 * Create a copy of the IRE with the saved address and netmask.
1592		 */
1593		ip1dbg(("ipif_recover_ire_v6: creating IRE %s (%d) for %s/%d\n",
1594		    ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type,
1595		    inet_ntop(AF_INET6, &ifrt->ifrt_v6addr, buf, sizeof (buf)),
1596		    ip_mask_to_plen_v6(&ifrt->ifrt_v6mask)));
1597		ire = ire_create_v6(
1598		    &ifrt->ifrt_v6addr,
1599		    &ifrt->ifrt_v6mask,
1600		    src_addr,
1601		    gateway_addr,
1602		    &ifrt->ifrt_max_frag,
1603		    NULL,
1604		    rfq,
1605		    stq,
1606		    type,
1607		    resolver_mp,
1608		    ipif,
1609		    NULL,
1610		    0,
1611		    0,
1612		    ifrt->ifrt_flags,
1613		    &ifrt->ifrt_iulp_info,
1614		    NULL,
1615		    NULL,
1616		    ipst);
1617		if (ire == NULL) {
1618			mutex_exit(&ipif->ipif_saved_ire_lock);
1619			kmem_free(ipif_saved_irep,
1620			    ipif->ipif_saved_ire_cnt * sizeof (ire_t *));
1621			return (NULL);
1622		}
1623
1624		/*
1625		 * Some software (for example, GateD and Sun Cluster) attempts
1626		 * to create (what amount to) IRE_PREFIX routes with the
1627		 * loopback address as the gateway.  This is primarily done to
1628		 * set up prefixes with the RTF_REJECT flag set (for example,
1629		 * when generating aggregate routes.)
1630		 *
1631		 * If the IRE type (as defined by ipif->ipif_net_type) is
1632		 * IRE_LOOPBACK, then we map the request into a
1633		 * IRE_IF_NORESOLVER.
1634		 */
1635		if (ipif->ipif_net_type == IRE_LOOPBACK)
1636			ire->ire_type = IRE_IF_NORESOLVER;
1637		/*
1638		 * ire held by ire_add, will be refreled' in ipif_up_done
1639		 * towards the end
1640		 */
1641		(void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
1642		*irep = ire;
1643		irep++;
1644		ip1dbg(("ipif_recover_ire_v6: added ire %p\n", (void *)ire));
1645	}
1646	mutex_exit(&ipif->ipif_saved_ire_lock);
1647	return (ipif_saved_irep);
1648}
1649
1650/*
1651 * Return the scope of the given IPv6 address.  If the address is an
1652 * IPv4 mapped IPv6 address, return the scope of the corresponding
1653 * IPv4 address.
1654 */
1655in6addr_scope_t
1656ip_addr_scope_v6(const in6_addr_t *addr)
1657{
1658	static in6_addr_t ipv6loopback = IN6ADDR_LOOPBACK_INIT;
1659
1660	if (IN6_IS_ADDR_V4MAPPED(addr)) {
1661		in_addr_t v4addr_h = ntohl(V4_PART_OF_V6((*addr)));
1662		if ((v4addr_h >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1663		    (v4addr_h & IN_AUTOCONF_MASK) == IN_AUTOCONF_NET)
1664			return (IP6_SCOPE_LINKLOCAL);
1665		if ((v4addr_h & IN_PRIVATE8_MASK) == IN_PRIVATE8_NET ||
1666		    (v4addr_h & IN_PRIVATE12_MASK) == IN_PRIVATE12_NET ||
1667		    (v4addr_h & IN_PRIVATE16_MASK) == IN_PRIVATE16_NET)
1668			return (IP6_SCOPE_SITELOCAL);
1669		return (IP6_SCOPE_GLOBAL);
1670	}
1671
1672	if (IN6_IS_ADDR_MULTICAST(addr))
1673		return (IN6_ADDR_MC_SCOPE(addr));
1674
1675	/* link-local and loopback addresses are of link-local scope */
1676	if (IN6_IS_ADDR_LINKLOCAL(addr) ||
1677	    IN6_ARE_ADDR_EQUAL(addr, &ipv6loopback))
1678		return (IP6_SCOPE_LINKLOCAL);
1679	if (IN6_IS_ADDR_SITELOCAL(addr))
1680		return (IP6_SCOPE_SITELOCAL);
1681	return (IP6_SCOPE_GLOBAL);
1682}
1683
1684
1685/*
1686 * Returns the length of the common prefix of a1 and a2, as per
1687 * CommonPrefixLen() defined in RFC 3484.
1688 */
1689static int
1690ip_common_prefix_v6(const in6_addr_t *a1, const in6_addr_t *a2)
1691{
1692	int i;
1693	uint32_t a1val, a2val, mask;
1694
1695	for (i = 0; i < 4; i++) {
1696		if ((a1val = a1->s6_addr32[i]) != (a2val = a2->s6_addr32[i])) {
1697			a1val ^= a2val;
1698			i *= 32;
1699			mask = 0x80000000u;
1700			while (!(a1val & mask)) {
1701				mask >>= 1;
1702				i++;
1703			}
1704			return (i);
1705		}
1706	}
1707	return (IPV6_ABITS);
1708}
1709
/*
 * Evaluates to non-zero when `ipif' is up (IPIF_UP), is neither
 * IPIF_NOLOCAL nor IPIF_ANYCAST, and has ipif_addr_ready set.  Judging by
 * the name this is the eligibility test for using the ipif's address as an
 * IPv6 source address; the users of the macro are outside this part of the
 * file.  Note that `ipif' is evaluated more than once.
 */
#define	IPIF_VALID_IPV6_SOURCE(ipif) \
	(((ipif)->ipif_flags & IPIF_UP) && \
	!((ipif)->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) && \
	(ipif)->ipif_addr_ready)
1714
/*
 * Source address candidate.
 *
 * Every property is paired with a "_set" flag.  The rule functions below
 * compute a property of the best candidate only if its flag is not yet
 * set, and always (re)compute it for the current candidate, marking it as
 * set afterwards.  This lets values cached on the best candidate be reused
 * across comparisons.
 */
typedef struct candidate {
	ipif_t		*cand_ipif;	/* ipif providing the address */
	/* The properties of this candidate */
	boolean_t	cand_isdst;	/* address equals the destination */
	boolean_t	cand_isdst_set;
	in6addr_scope_t	cand_scope;	/* ip_addr_scope_v6() of the address */
	boolean_t	cand_scope_set;
	boolean_t	cand_isdeprecated;	/* IPIF_DEPRECATED is set */
	boolean_t	cand_isdeprecated_set;
	boolean_t	cand_ispreferred;	/* IPIF_PREFERRED is set */
	boolean_t	cand_ispreferred_set;
	/* on the destination ill, or in the same non-NULL ill_group */
	boolean_t	cand_matchedinterface;
	boolean_t	cand_matchedinterface_set;
	/* result of ip6_asp_labelcmp() against the destination's label */
	boolean_t	cand_matchedlabel;
	boolean_t	cand_matchedlabel_set;
	boolean_t	cand_istmp;	/* IPIF_TEMPORARY is set */
	boolean_t	cand_istmp_set;
	/*
	 * Presumably the common prefix length with the destination; the
	 * rule that fills it in is outside this part of the file.
	 */
	int		cand_common_pref;
	boolean_t	cand_common_pref_set;
	/* address matches the destination under the ipif's net mask */
	boolean_t	cand_pref_eq;
	boolean_t	cand_pref_eq_set;
	int		cand_pref_len;	/* prefix length of the net mask */
	boolean_t	cand_pref_len_set;
} cand_t;
/* Shorthands for fields reached through the candidate's ipif. */
#define	cand_srcaddr	cand_ipif->ipif_v6lcl_addr
#define	cand_mask	cand_ipif->ipif_v6net_mask
#define	cand_flags	cand_ipif->ipif_flags
#define	cand_ill	cand_ipif->ipif_ill
#define	cand_zoneid	cand_ipif->ipif_zoneid
1745
/*
 * Information about the destination for source address selection.  It is
 * passed read-only to every rule function.
 */
typedef struct dstinfo {
	const in6_addr_t	*dst_addr;	/* the destination address */
	ill_t			*dst_ill;	/* ill compared by rule_interface */
	/* non-zero: candidates share one link; rule_interface always ties */
	uint_t			dst_restrict_ill;
	/* B_TRUE: rule_temporary prefers temporary over public addresses */
	boolean_t		dst_prefer_src_tmp;
	in6addr_scope_t		dst_scope;	/* scope used by rule_scope */
	char			*dst_label;	/* label used by rule_label */
} dstinfo_t;
1755
1756/*
1757 * The following functions are rules used to select a source address in
1758 * ipif_select_source_v6().  Each rule compares a current candidate (cc)
1759 * against the best candidate (bc).  Each rule has three possible outcomes;
1760 * the candidate is preferred over the best candidate (CAND_PREFER), the
1761 * candidate is not preferred over the best candidate (CAND_AVOID), or the
1762 * candidate is of equal value as the best candidate (CAND_TIE).
1763 *
1764 * These rules are part of a greater "Default Address Selection for IPv6"
 * scheme, which is standards based work coming out of the IETF ipv6 working
1766 * group.  The IETF document defines both IPv6 source address selection and
1767 * destination address ordering.  The rules defined here implement the IPv6
1768 * source address selection.  Destination address ordering is done by
1769 * libnsl, and uses a similar set of rules to implement the sorting.
1770 *
1771 * Most of the rules are defined by the RFC and are not typically altered.  The
1772 * last rule, number 8, has language that allows for local preferences.  In the
1773 * scheme below, this means that new Solaris rules should normally go between
1774 * rule_ifprefix and rule_prefix.
1775 */
/*
 * rule_res_t is the verdict of one rule: the current candidate is worse
 * than (CAND_AVOID), equal to (CAND_TIE) or better than (CAND_PREFER) the
 * best candidate.
 *
 * rulef_t is the signature shared by all rule functions.  The arguments
 * are, in order: the best candidate so far, the current candidate, the
 * destination information and the IP stack instance.
 */
typedef enum {CAND_AVOID, CAND_TIE, CAND_PREFER} rule_res_t;
typedef	rule_res_t (*rulef_t)(cand_t *, cand_t *, const dstinfo_t *,
    ip_stack_t *);
1779
1780/* Prefer an address if it is equal to the destination address. */
1781/* ARGSUSED3 */
1782static rule_res_t
1783rule_isdst(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst)
1784{
1785	if (!bc->cand_isdst_set) {
1786		bc->cand_isdst =
1787		    IN6_ARE_ADDR_EQUAL(&bc->cand_srcaddr, dstinfo->dst_addr);
1788		bc->cand_isdst_set = B_TRUE;
1789	}
1790
1791	cc->cand_isdst =
1792	    IN6_ARE_ADDR_EQUAL(&cc->cand_srcaddr, dstinfo->dst_addr);
1793	cc->cand_isdst_set = B_TRUE;
1794
1795	if (cc->cand_isdst == bc->cand_isdst)
1796		return (CAND_TIE);
1797	else if (cc->cand_isdst)
1798		return (CAND_PREFER);
1799	else
1800		return (CAND_AVOID);
1801}
1802
1803/*
1804 * Prefer addresses that are of closest scope to the destination.  Always
1805 * prefer addresses that are of greater scope than the destination over
1806 * those that are of lesser scope than the destination.
1807 */
1808/* ARGSUSED3 */
1809static rule_res_t
1810rule_scope(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst)
1811{
1812	if (!bc->cand_scope_set) {
1813		bc->cand_scope = ip_addr_scope_v6(&bc->cand_srcaddr);
1814		bc->cand_scope_set = B_TRUE;
1815	}
1816
1817	cc->cand_scope = ip_addr_scope_v6(&cc->cand_srcaddr);
1818	cc->cand_scope_set = B_TRUE;
1819
1820	if (cc->cand_scope < bc->cand_scope) {
1821		if (cc->cand_scope < dstinfo->dst_scope)
1822			return (CAND_AVOID);
1823		else
1824			return (CAND_PREFER);
1825	} else if (bc->cand_scope < cc->cand_scope) {
1826		if (bc->cand_scope < dstinfo->dst_scope)
1827			return (CAND_PREFER);
1828		else
1829			return (CAND_AVOID);
1830	} else {
1831		return (CAND_TIE);
1832	}
1833}
1834
1835/*
1836 * Prefer non-deprecated source addresses.
1837 */
1838/* ARGSUSED2 */
1839static rule_res_t
1840rule_deprecated(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
1841    ip_stack_t *ipst)
1842{
1843	if (!bc->cand_isdeprecated_set) {
1844		bc->cand_isdeprecated =
1845		    ((bc->cand_flags & IPIF_DEPRECATED) != 0);
1846		bc->cand_isdeprecated_set = B_TRUE;
1847	}
1848
1849	cc->cand_isdeprecated = ((cc->cand_flags & IPIF_DEPRECATED) != 0);
1850	cc->cand_isdeprecated_set = B_TRUE;
1851
1852	if (bc->cand_isdeprecated == cc->cand_isdeprecated)
1853		return (CAND_TIE);
1854	else if (cc->cand_isdeprecated)
1855		return (CAND_AVOID);
1856	else
1857		return (CAND_PREFER);
1858}
1859
1860/*
1861 * Prefer source addresses that have the IPIF_PREFERRED flag set.  This
1862 * rule must be before rule_interface because the flag could be set on any
1863 * interface, not just the interface being used for outgoing packets (for
1864 * example, the IFF_PREFERRED could be set on an address assigned to the
1865 * loopback interface).
1866 */
1867/* ARGSUSED2 */
1868static rule_res_t
1869rule_preferred(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
1870    ip_stack_t *ipst)
1871{
1872	if (!bc->cand_ispreferred_set) {
1873		bc->cand_ispreferred = ((bc->cand_flags & IPIF_PREFERRED) != 0);
1874		bc->cand_ispreferred_set = B_TRUE;
1875	}
1876
1877	cc->cand_ispreferred = ((cc->cand_flags & IPIF_PREFERRED) != 0);
1878	cc->cand_ispreferred_set = B_TRUE;
1879
1880	if (bc->cand_ispreferred == cc->cand_ispreferred)
1881		return (CAND_TIE);
1882	else if (cc->cand_ispreferred)
1883		return (CAND_PREFER);
1884	else
1885		return (CAND_AVOID);
1886}
1887
1888/*
1889 * Prefer source addresses that are assigned to the outgoing interface, or
1890 * to an interface that is in the same IPMP group as the outgoing
1891 * interface.
1892 */
1893/* ARGSUSED3 */
1894static rule_res_t
1895rule_interface(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
1896    ip_stack_t *ipst)
1897{
1898	ill_t *dstill = dstinfo->dst_ill;
1899
1900	/*
1901	 * If dstinfo->dst_restrict_ill is set, this rule is unnecessary
1902	 * since we know all candidates will be on the same link.
1903	 */
1904	if (dstinfo->dst_restrict_ill)
1905		return (CAND_TIE);
1906
1907	if (!bc->cand_matchedinterface_set) {
1908		bc->cand_matchedinterface = (bc->cand_ill == dstill ||
1909		    (dstill->ill_group != NULL &&
1910		    dstill->ill_group == bc->cand_ill->ill_group));
1911		bc->cand_matchedinterface_set = B_TRUE;
1912	}
1913
1914	cc->cand_matchedinterface = (cc->cand_ill == dstill ||
1915	    (dstill->ill_group != NULL &&
1916		dstill->ill_group == cc->cand_ill->ill_group));
1917	cc->cand_matchedinterface_set = B_TRUE;
1918
1919	if (bc->cand_matchedinterface == cc->cand_matchedinterface)
1920		return (CAND_TIE);
1921	else if (cc->cand_matchedinterface)
1922		return (CAND_PREFER);
1923	else
1924		return (CAND_AVOID);
1925}
1926
1927/*
1928 * Prefer source addresses whose label matches the destination's label.
1929 */
1930static rule_res_t
1931rule_label(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst)
1932{
1933	char *label;
1934
1935	if (!bc->cand_matchedlabel_set) {
1936		label = ip6_asp_lookup(&bc->cand_srcaddr, NULL, ipst);
1937		bc->cand_matchedlabel =
1938		    ip6_asp_labelcmp(label, dstinfo->dst_label);
1939		bc->cand_matchedlabel_set = B_TRUE;
1940	}
1941
1942	label = ip6_asp_lookup(&cc->cand_srcaddr, NULL, ipst);
1943	cc->cand_matchedlabel = ip6_asp_labelcmp(label, dstinfo->dst_label);
1944	cc->cand_matchedlabel_set = B_TRUE;
1945
1946	if (bc->cand_matchedlabel == cc->cand_matchedlabel)
1947		return (CAND_TIE);
1948	else if (cc->cand_matchedlabel)
1949		return (CAND_PREFER);
1950	else
1951		return (CAND_AVOID);
1952}
1953
1954/*
1955 * Prefer public addresses over temporary ones.  An application can reverse
1956 * the logic of this rule and prefer temporary addresses by using the
1957 * IPV6_SRC_PREFERENCES socket option.
1958 */
1959/* ARGSUSED3 */
1960static rule_res_t
1961rule_temporary(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
1962    ip_stack_t *ipst)
1963{
1964	if (!bc->cand_istmp_set) {
1965		bc->cand_istmp = ((bc->cand_flags & IPIF_TEMPORARY) != 0);
1966		bc->cand_istmp_set = B_TRUE;
1967	}
1968
1969	cc->cand_istmp = ((cc->cand_flags & IPIF_TEMPORARY) != 0);
1970	cc->cand_istmp_set = B_TRUE;
1971
1972	if (bc->cand_istmp == cc->cand_istmp)
1973		return (CAND_TIE);
1974
1975	if (dstinfo->dst_prefer_src_tmp && cc->cand_istmp)
1976		return (CAND_PREFER);
1977	else if (!dstinfo->dst_prefer_src_tmp && !cc->cand_istmp)
1978		return (CAND_PREFER);
1979	else
1980		return (CAND_AVOID);
1981}
1982
1983/*
1984 * Prefer source addresses with longer matching prefix with the destination
1985 * under the interface mask.  This gets us on the same subnet before applying
1986 * any Solaris-specific rules.
1987 */
1988/* ARGSUSED3 */
1989static rule_res_t
1990rule_ifprefix(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
1991    ip_stack_t *ipst)
1992{
1993	if (!bc->cand_pref_eq_set) {
1994		bc->cand_pref_eq = V6_MASK_EQ_2(bc->cand_srcaddr,
1995		    bc->cand_mask, *dstinfo->dst_addr);
1996		bc->cand_pref_eq_set = B_TRUE;
1997	}
1998
1999	cc->cand_pref_eq = V6_MASK_EQ_2(cc->cand_srcaddr, cc->cand_mask,
2000	    *dstinfo->dst_addr);
2001	cc->cand_pref_eq_set = B_TRUE;
2002
2003	if (bc->cand_pref_eq) {
2004		if (cc->cand_pref_eq) {
2005			if (!bc->cand_pref_len_set) {
2006				bc->cand_pref_len =
2007				    ip_mask_to_plen_v6(&bc->cand_mask);
2008				bc->cand_pref_len_set = B_TRUE;
2009			}
2010			cc->cand_pref_len = ip_mask_to_plen_v6(&cc->cand_mask);
2011			cc->cand_pref_len_set = B_TRUE;
2012			if (bc->cand_pref_len == cc->cand_pref_len)
2013				return (CAND_TIE);
2014			else if (bc->cand_pref_len > cc->cand_pref_len)
2015				return (CAND_AVOID);
2016			else
2017				return (CAND_PREFER);
2018		} else {
2019			return (CAND_AVOID);
2020		}
2021	} else {
2022		if (cc->cand_pref_eq)
2023			return (CAND_PREFER);
2024		else
2025			return (CAND_TIE);
2026	}
2027}
2028
2029/*
2030 * Prefer to use zone-specific addresses when possible instead of all-zones
2031 * addresses.
2032 */
2033/* ARGSUSED2 */
2034static rule_res_t
2035rule_zone_specific(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
2036    ip_stack_t *ipst)
2037{
2038	if ((bc->cand_zoneid == ALL_ZONES) ==
2039	    (cc->cand_zoneid == ALL_ZONES))
2040		return (CAND_TIE);
2041	else if (cc->cand_zoneid == ALL_ZONES)
2042		return (CAND_AVOID);
2043	else
2044		return (CAND_PREFER);
2045}
2046
2047/*
2048 * Prefer to use DHCPv6 (first) and static addresses (second) when possible
2049 * instead of statelessly autoconfigured addresses.
2050 *
2051 * This is done after trying all other preferences (and before the final tie
2052 * breaker) so that, if all else is equal, we select addresses configured by
2053 * DHCPv6 over other addresses.  We presume that DHCPv6 addresses, unlike
2054 * stateless autoconfigured addresses, are deliberately configured by an
2055 * administrator, and thus are correctly set up in DNS and network packet
2056 * filters.
2057 */
2058/* ARGSUSED2 */
2059static rule_res_t
2060rule_addr_type(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
2061    ip_stack_t *ipst)
2062{
2063#define	ATYPE(x)	\
2064	((x) & IPIF_DHCPRUNNING) ? 1 : ((x) & IPIF_ADDRCONF) ? 3 : 2
2065	int bcval = ATYPE(bc->cand_flags);
2066	int ccval = ATYPE(cc->cand_flags);
2067#undef ATYPE
2068
2069	if (bcval == ccval)
2070		return (CAND_TIE);
2071	else if (ccval < bcval)
2072		return (CAND_PREFER);
2073	else
2074		return (CAND_AVOID);
2075}
2076
2077/*
2078 * Prefer source addresses with longer matching prefix with the destination.
2079 * We do the longest matching prefix calculation by doing an xor of both
2080 * addresses with the destination, and pick the address with the longest string
2081 * of leading zeros, as per CommonPrefixLen() defined in RFC 3484.
2082 */
2083/* ARGSUSED3 */
2084static rule_res_t
2085rule_prefix(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst)
2086{
2087	if (!bc->cand_common_pref_set) {
2088		bc->cand_common_pref = ip_common_prefix_v6(&bc->cand_srcaddr,
2089		    dstinfo->dst_addr);
2090		bc->cand_common_pref_set = B_TRUE;
2091	}
2092
2093	cc->cand_common_pref = ip_common_prefix_v6(&cc->cand_srcaddr,
2094	    dstinfo->dst_addr);
2095	cc->cand_common_pref_set = B_TRUE;
2096
2097	if (bc->cand_common_pref == cc->cand_common_pref)
2098		return (CAND_TIE);
2099	else if (bc->cand_common_pref > cc->cand_common_pref)
2100		return (CAND_AVOID);
2101	else
2102		return (CAND_PREFER);
2103}
2104
2105/*
2106 * Last rule: we must pick something, so just prefer the current best
2107 * candidate.
2108 */
2109/* ARGSUSED */
2110static rule_res_t
2111rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
2112    ip_stack_t *ipst)
2113{
2114	return (CAND_AVOID);
2115}
2116
2117/*
2118 * Determine the best source address given a destination address and a
2119 * destination ill.  If no suitable source address is found, it returns
2120 * NULL. If there is a usable address pointed to by the usesrc
2121 * (i.e ill_usesrc_ifindex != 0) then return that first since it is more
2122 * fine grained (i.e per interface)
2123 *
2124 * This implementation is based on the "Default Address Selection for IPv6"
2125 * specification produced by the IETF IPv6 working group.  It has been
2126 * implemented so that the list of addresses is only traversed once (the
2127 * specification's algorithm could traverse the list of addresses once for
2128 * every rule).
2129 *
2130 * The restrict_ill argument restricts the algorithm to chose a source
2131 * address that is assigned to the destination ill or an ill in the same
2132 * IPMP group as the destination ill.  This is used when the destination
2133 * address is a link-local or multicast address, and when
2134 * ipv6_strict_dst_multihoming is turned on.
2135 *
2136 * src_prefs is the caller's set of source address preferences.  If source
2137 * address selection is being called to determine the source address of a
2138 * connected socket (from ip_bind_connected_v6()), then the preferences are
2139 * taken from conn_src_preferences.  These preferences can be set on a
2140 * per-socket basis using the IPV6_SRC_PREFERENCES socket option.  The only
2141 * preference currently implemented is for rfc3041 temporary addresses.
2142 */
2143ipif_t *
2144ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
2145    uint_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid)
2146{
2147	dstinfo_t	dstinfo;
2148	char		dstr[INET6_ADDRSTRLEN];
2149	char		sstr[INET6_ADDRSTRLEN];
2150	ipif_t		*ipif;
2151	ill_t		*ill, *usesrc_ill = NULL;
2152	ill_walk_context_t	ctx;
2153	cand_t		best_c;	/* The best candidate */
2154	cand_t		curr_c;	/* The current candidate */
2155	uint_t		index;
2156	boolean_t	first_candidate = B_TRUE;
2157	rule_res_t	rule_result;
2158	tsol_tpc_t	*src_rhtp, *dst_rhtp;
2159	ip_stack_t	*ipst = dstill->ill_ipst;
2160
2161	/*
2162	 * The list of ordering rules.  They are applied in the order they
2163	 * appear in the list.
2164	 *
2165	 * Solaris doesn't currently support Mobile IPv6, so there's no
2166	 * rule_mipv6 corresponding to rule 4 in the specification.
2167	 */
2168	rulef_t	rules[] = {
2169		rule_isdst,
2170		rule_scope,
2171		rule_deprecated,
2172		rule_preferred,
2173		rule_interface,
2174		rule_label,
2175		rule_temporary,
2176		rule_ifprefix,			/* local rules after this */
2177		rule_zone_specific,
2178		rule_addr_type,
2179		rule_prefix,			/* local rules before this */
2180		rule_must_be_last,		/* must always be last */
2181		NULL
2182	};
2183
2184	ASSERT(dstill->ill_isv6);
2185	ASSERT(!IN6_IS_ADDR_V4MAPPED(dst));
2186
2187	/*
2188	 * Check if there is a usable src address pointed to by the
2189	 * usesrc ifindex. This has higher precedence since it is
2190	 * finer grained (i.e per interface) v/s being system wide.
2191	 */
2192	if (dstill->ill_usesrc_ifindex != 0) {
2193		if ((usesrc_ill =
2194		    ill_lookup_on_ifindex(dstill->ill_usesrc_ifindex, B_TRUE,
2195		    NULL, NULL, NULL, NULL, ipst)) != NULL) {
2196			dstinfo.dst_ill = usesrc_ill;
2197		} else {
2198			return (NULL);
2199		}
2200	} else {
2201		dstinfo.dst_ill = dstill;
2202	}
2203
2204	/*
2205	 * If we're dealing with an unlabeled destination on a labeled system,
2206	 * make sure that we ignore source addresses that are incompatible with
2207	 * the destination's default label.  That destination's default label
2208	 * must dominate the minimum label on the source address.
2209	 *
2210	 * (Note that this has to do with Trusted Solaris.  It's not related to
2211	 * the labels described by ip6_asp_lookup.)
2212	 */
2213	dst_rhtp = NULL;
2214	if (is_system_labeled()) {
2215		dst_rhtp = find_tpc(dst, IPV6_VERSION, B_FALSE);
2216		if (dst_rhtp == NULL)
2217			return (NULL);
2218		if (dst_rhtp->tpc_tp.host_type != UNLABELED) {
2219			TPC_RELE(dst_rhtp);
2220			dst_rhtp = NULL;
2221		}
2222	}
2223
2224	dstinfo.dst_addr = dst;
2225	dstinfo.dst_scope = ip_addr_scope_v6(dst);
2226	dstinfo.dst_label = ip6_asp_lookup(dst, NULL, ipst);
2227	dstinfo.dst_prefer_src_tmp = ((src_prefs & IPV6_PREFER_SRC_TMP) != 0);
2228
2229	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2230	/*
2231	 * Section three of the I-D states that for multicast and
2232	 * link-local destinations, the candidate set must be restricted to
2233	 * an interface that is on the same link as the outgoing interface.
2234	 * Also, when ipv6_strict_dst_multihoming is turned on, always
2235	 * restrict the source address to the destination link as doing
2236	 * otherwise will almost certainly cause problems.
2237	 */
2238	if (IN6_IS_ADDR_LINKLOCAL(dst) || IN6_IS_ADDR_MULTICAST(dst) ||
2239	    ipst->ips_ipv6_strict_dst_multihoming || usesrc_ill != NULL) {
2240		if (restrict_ill == RESTRICT_TO_NONE)
2241			dstinfo.dst_restrict_ill = RESTRICT_TO_GROUP;
2242		else
2243			dstinfo.dst_restrict_ill = restrict_ill;
2244	} else {
2245		dstinfo.dst_restrict_ill = restrict_ill;
2246	}
2247
2248	bzero(&best_c, sizeof (cand_t));
2249
2250	/*
2251	 * Take a pass through the list of IPv6 interfaces to chose the
2252	 * best possible source address.  If restrict_ill is true, we only
2253	 * iterate through the ill's that are in the same IPMP group as the
2254	 * destination's outgoing ill.  If restrict_ill is false, we walk
2255	 * the entire list of IPv6 ill's.
2256	 */
2257	if (dstinfo.dst_restrict_ill != RESTRICT_TO_NONE) {
2258		if (dstinfo.dst_ill->ill_group != NULL &&
2259		    dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP) {
2260			ill = dstinfo.dst_ill->ill_group->illgrp_ill;
2261		} else {
2262			ill = dstinfo.dst_ill;
2263		}
2264	} else {
2265		ill = ILL_START_WALK_V6(&ctx, ipst);
2266	}
2267
2268	while (ill != NULL) {
2269		ASSERT(ill->ill_isv6);
2270
2271		/*
2272		 * Avoid FAILED/OFFLINE ills.
2273		 * Global and site local addresses will failover and
2274		 * will be available on the new ill.
2275		 * But link local addresses don't move.
2276		 */
2277		if (dstinfo.dst_restrict_ill != RESTRICT_TO_ILL &&
2278		    ill->ill_phyint->phyint_flags &
2279		    (PHYI_OFFLINE | PHYI_FAILED))
2280			goto next_ill;
2281
2282		for (ipif = ill->ill_ipif; ipif != NULL;
2283		    ipif = ipif->ipif_next) {
2284
2285			if (!IPIF_VALID_IPV6_SOURCE(ipif))
2286				continue;
2287
2288			if (zoneid != ALL_ZONES &&
2289			    ipif->ipif_zoneid != zoneid &&
2290			    ipif->ipif_zoneid != ALL_ZONES)
2291				continue;
2292
2293			/*
2294			 * Check compatibility of local address for
2295			 * destination's default label if we're on a labeled
2296			 * system.  Incompatible addresses can't be used at
2297			 * all and must be skipped over.
2298			 */
2299			if (dst_rhtp != NULL) {
2300				boolean_t incompat;
2301
2302				src_rhtp = find_tpc(&ipif->ipif_v6lcl_addr,
2303				    IPV6_VERSION, B_FALSE);
2304				if (src_rhtp == NULL)
2305					continue;
2306				incompat =
2307				    src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
2308				    src_rhtp->tpc_tp.tp_doi !=
2309				    dst_rhtp->tpc_tp.tp_doi ||
2310				    (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
2311				    &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
2312				    !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
2313				    src_rhtp->tpc_tp.tp_sl_set_cipso));
2314				TPC_RELE(src_rhtp);
2315				if (incompat)
2316					continue;
2317			}
2318
2319			if (first_candidate) {
2320				/*
2321				 * This is first valid address in the list.
2322				 * It is automatically the best candidate
2323				 * so far.
2324				 */
2325				best_c.cand_ipif = ipif;
2326				first_candidate = B_FALSE;
2327				continue;
2328			}
2329
2330			bzero(&curr_c, sizeof (cand_t));
2331			curr_c.cand_ipif = ipif;
2332
2333			/*
2334			 * Compare this current candidate (curr_c) with the
2335			 * best candidate (best_c) by applying the
2336			 * comparison rules in order until one breaks the
2337			 * tie.
2338			 */
2339			for (index = 0; rules[index] != NULL; index++) {
2340				/* Apply a comparison rule. */
2341				rule_result =
2342				    (rules[index])(&best_c, &curr_c, &dstinfo,
2343				    ipst);
2344				if (rule_result == CAND_AVOID) {
2345					/*
2346					 * The best candidate is still the
2347					 * best candidate.  Forget about
2348					 * this current candidate and go on
2349					 * to the next one.
2350					 */
2351					break;
2352				} else if (rule_result == CAND_PREFER) {
2353					/*
2354					 * This candidate is prefered.  It
2355					 * becomes the best candidate so
2356					 * far.  Go on to the next address.
2357					 */
2358					best_c = curr_c;
2359					break;
2360				}
2361				/* We have a tie, apply the next rule. */
2362			}
2363
2364			/*
2365			 * The last rule must be a tie breaker rule and
2366			 * must never produce a tie.  At this point, the
2367			 * candidate should have either been rejected, or
2368			 * have been prefered as the best candidate so far.
2369			 */
2370			ASSERT(rule_result != CAND_TIE);
2371		}
2372
2373		/*
2374		 * We may be walking the linked-list of ill's in an
2375		 * IPMP group or traversing the IPv6 ill avl tree. If it is a
2376		 * usesrc ILL then it can't be part of IPMP group and we
2377		 * will exit the while loop.
2378		 */
2379next_ill:
2380		if (dstinfo.dst_restrict_ill == RESTRICT_TO_ILL)
2381			ill = NULL;
2382		else if (dstinfo.dst_restrict_ill == RESTRICT_TO_GROUP)
2383			ill = ill->ill_group_next;
2384		else
2385			ill = ill_next(&ctx, ill);
2386	}
2387
2388	ipif = best_c.cand_ipif;
2389	ip1dbg(("ipif_select_source_v6(%s, %s) -> %s\n",
2390	    dstinfo.dst_ill->ill_name,
2391	    inet_ntop(AF_INET6, dstinfo.dst_addr, dstr, sizeof (dstr)),
2392	    (ipif == NULL ? "NULL" :
2393	    inet_ntop(AF_INET6, &ipif->ipif_v6lcl_addr, sstr, sizeof (sstr)))));
2394
2395	if (usesrc_ill != NULL)
2396		ill_refrele(usesrc_ill);
2397
2398	if (dst_rhtp != NULL)
2399		TPC_RELE(dst_rhtp);
2400
2401	if (ipif == NULL) {
2402		rw_exit(&ipst->ips_ill_g_lock);
2403		return (NULL);
2404	}
2405
2406	mutex_enter(&ipif->ipif_ill->ill_lock);
2407	if (IPIF_CAN_LOOKUP(ipif)) {
2408		ipif_refhold_locked(ipif);
2409		mutex_exit(&ipif->ipif_ill->ill_lock);
2410		rw_exit(&ipst->ips_ill_g_lock);
2411		return (ipif);
2412	}
2413	mutex_exit(&ipif->ipif_ill->ill_lock);
2414	rw_exit(&ipst->ips_ill_g_lock);
2415	ip1dbg(("ipif_select_source_v6 cannot lookup ipif %p"
2416	    " returning null \n", (void *)ipif));
2417
2418	return (NULL);
2419}
2420
2421/*
2422 * If old_ipif is not NULL, see if ipif was derived from old
2423 * ipif and if so, recreate the interface route by re-doing
2424 * source address selection. This happens when ipif_down ->
2425 * ipif_update_other_ipifs calls us.
2426 *
2427 * If old_ipif is NULL, just redo the source address selection
2428 * if needed. This happens when illgrp_insert or ipif_up_done_v6
2429 * calls us.
2430 */
2431void
2432ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif)
2433{
2434	ire_t *ire;
2435	ire_t *ipif_ire;
2436	queue_t *stq;
2437	ill_t *ill;
2438	ipif_t *nipif = NULL;
2439	boolean_t nipif_refheld = B_FALSE;
2440	boolean_t ip6_asp_table_held = B_FALSE;
2441	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2442
2443	ill = ipif->ipif_ill;
2444
2445	if (!(ipif->ipif_flags &
2446	    (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
2447		/*
2448		 * Can't possibly have borrowed the source
2449		 * from old_ipif.
2450		 */
2451		return;
2452	}
2453
2454	/*
2455	 * Is there any work to be done? No work if the address
2456	 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST (
2457	 * ipif_select_source_v6() does not borrow addresses from
2458	 * NOLOCAL and ANYCAST interfaces).
2459	 */
2460	if ((old_ipif != NULL) &&
2461	    ((IN6_IS_ADDR_UNSPECIFIED(&old_ipif->ipif_v6lcl_addr)) ||
2462	    (old_ipif->ipif_ill->ill_wq == NULL) ||
2463	    (old_ipif->ipif_flags &
2464	    (IPIF_NOLOCAL|IPIF_ANYCAST)))) {
2465		return;
2466	}
2467
2468	/*
2469	 * Perform the same checks as when creating the
2470	 * IRE_INTERFACE in ipif_up_done_v6.
2471	 */
2472	if (!(ipif->ipif_flags & IPIF_UP))
2473		return;
2474
2475	if ((ipif->ipif_flags & IPIF_NOXMIT))
2476		return;
2477
2478	if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) &&
2479	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
2480		return;
2481
2482	/*
2483	 * We know that ipif uses some other source for its
2484	 * IRE_INTERFACE. Is it using the source of this
2485	 * old_ipif?
2486	 */
2487	ipif_ire = ipif_to_ire_v6(ipif);
2488	if (ipif_ire == NULL)
2489		return;
2490
2491	if (old_ipif != NULL &&
2492	    !IN6_ARE_ADDR_EQUAL(&old_ipif->ipif_v6lcl_addr,
2493	    &ipif_ire->ire_src_addr_v6)) {
2494		ire_refrele(ipif_ire);
2495		return;
2496	}
2497
2498	if (ip_debug > 2) {
2499		/* ip1dbg */
2500		pr_addr_dbg("ipif_recreate_interface_routes_v6: deleting IRE"
2501		    " for src %s\n", AF_INET6, &ipif_ire->ire_src_addr_v6);
2502	}
2503
2504	stq = ipif_ire->ire_stq;
2505
2506	/*
2507	 * Can't use our source address. Select a different source address
2508	 * for the IRE_INTERFACE.  We restrict interface route source
2509	 * address selection to ipif's assigned to the same link as the
2510	 * interface.
2511	 */
2512	if (ip6_asp_can_lookup(ipst)) {
2513		ip6_asp_table_held = B_TRUE;
2514		nipif = ipif_select_source_v6(ill, &ipif->ipif_v6subnet,
2515		    RESTRICT_TO_GROUP, IPV6_PREFER_SRC_DEFAULT,
2516		    ipif->ipif_zoneid);
2517	}
2518	if (nipif == NULL) {
2519		/* Last resort - all ipif's have IPIF_NOLOCAL */
2520		nipif = ipif;
2521	} else {
2522		nipif_refheld = B_TRUE;
2523	}
2524
2525	ire = ire_create_v6(
2526	    &ipif->ipif_v6subnet,	/* dest pref */
2527	    &ipif->ipif_v6net_mask,	/* mask */
2528	    &nipif->ipif_v6src_addr,	/* src addr */
2529	    NULL,			/* no gateway */
2530	    &ipif->ipif_mtu,		/* max frag */
2531	    NULL,			/* no Fast path header */
2532	    NULL,			/* no recv from queue */
2533	    stq,			/* send-to queue */
2534	    ill->ill_net_type,		/* IF_[NO]RESOLVER */
2535	    ill->ill_resolver_mp,	/* xmit header */
2536	    ipif,
2537	    NULL,
2538	    0,
2539	    0,
2540	    0,
2541	    &ire_uinfo_null,
2542	    NULL,
2543	    NULL,
2544	    ipst);
2545
2546	if (ire != NULL) {
2547		ire_t *ret_ire;
2548		int   error;
2549
2550		/*
2551		 * We don't need ipif_ire anymore. We need to delete
2552		 * before we add so that ire_add does not detect
2553		 * duplicates.
2554		 */
2555		ire_delete(ipif_ire);
2556		ret_ire = ire;
2557		error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE);
2558		ASSERT(error == 0);
2559		ASSERT(ret_ire == ire);
2560		if (ret_ire != NULL) {
2561			/* Held in ire_add */
2562			ire_refrele(ret_ire);
2563		}
2564	}
2565	/*
2566	 * Either we are falling through from above or could not
2567	 * allocate a replacement.
2568	 */
2569	ire_refrele(ipif_ire);
2570	if (ip6_asp_table_held)
2571		ip6_asp_table_refrele(ipst);
2572	if (nipif_refheld)
2573		ipif_refrele(nipif);
2574}
2575
2576/*
2577 * This old_ipif is going away.
2578 *
2579 * Determine if any other ipif's are using our address as
2580 * ipif_v6lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or
2581 * IPIF_DEPRECATED).
2582 * Find the IRE_INTERFACE for such ipif's and recreate them
2583 * to use an different source address following the rules in
2584 * ipif_up_done_v6.
2585 *
2586 * This function takes an illgrp as an argument so that illgrp_delete
2587 * can call this to update source address even after deleting the
2588 * old_ipif->ipif_ill from the ill group.
2589 */
2590void
2591ipif_update_other_ipifs_v6(ipif_t *old_ipif, ill_group_t *illgrp)
2592{
2593	ipif_t	*ipif;
2594	ill_t	*ill;
2595	char	buf[INET6_ADDRSTRLEN];
2596
2597	ASSERT(IAM_WRITER_IPIF(old_ipif));
2598
2599	ill = old_ipif->ipif_ill;
2600
2601	ip1dbg(("ipif_update_other_ipifs_v6(%s, %s)\n",
2602	    ill->ill_name,
2603	    inet_ntop(AF_INET6, &old_ipif->ipif_v6lcl_addr,
2604	    buf, sizeof (buf))));
2605
2606	/*
2607	 * If this part of a group, look at all ills as ipif_select_source
2608	 * borrows a source address across all the ills in the group.
2609	 */
2610	if (illgrp != NULL)
2611		ill = illgrp->illgrp_ill;
2612
2613	/* Don't need a lock since this is a writer */
2614	for (; ill != NULL; ill = ill->ill_group_next) {
2615		for (ipif = ill->ill_ipif; ipif != NULL;
2616		    ipif = ipif->ipif_next) {
2617
2618			if (ipif == old_ipif)
2619				continue;
2620
2621			ipif_recreate_interface_routes_v6(old_ipif, ipif);
2622		}
2623	}
2624}
2625
2626/*
2627 * Perform an attach and bind to get phys addr plus info_req for
2628 * the physical device.
2629 * q and mp represents an ioctl which will be queued waiting for
2630 * completion of the DLPI message exchange.
2631 * MUST be called on an ill queue. Can not set conn_pending_ill for that
2632 * reason thus the DL_PHYS_ADDR_ACK code does not assume ill_pending_q.
2633 *
2634 * Returns EINPROGRESS when mp has been consumed by queueing it on
2635 * ill_pending_mp and the ioctl will complete in ip_rput.
2636 */
2637int
2638ill_dl_phys(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
2639{
2640	mblk_t	*v6token_mp = NULL;
2641	mblk_t	*v6lla_mp = NULL;
2642	mblk_t	*phys_mp = NULL;
2643	mblk_t	*info_mp = NULL;
2644	mblk_t	*attach_mp = NULL;
2645	mblk_t	*detach_mp = NULL;
2646	mblk_t	*bind_mp = NULL;
2647	mblk_t	*unbind_mp = NULL;
2648	mblk_t	*notify_mp = NULL;
2649
2650	ip1dbg(("ill_dl_phys(%s:%u)\n", ill->ill_name, ipif->ipif_id));
2651	ASSERT(ill->ill_dlpi_style_set);
2652	ASSERT(WR(q)->q_next != NULL);
2653
2654	if (ill->ill_isv6) {
2655		v6token_mp = ip_dlpi_alloc(sizeof (dl_phys_addr_req_t) +
2656		    sizeof (t_scalar_t), DL_PHYS_ADDR_REQ);
2657		if (v6token_mp == NULL)
2658			goto bad;
2659		((dl_phys_addr_req_t *)v6token_mp->b_rptr)->dl_addr_type =
2660		    DL_IPV6_TOKEN;
2661
2662		v6lla_mp = ip_dlpi_alloc(sizeof (dl_phys_addr_req_t) +
2663		    sizeof (t_scalar_t), DL_PHYS_ADDR_REQ);
2664		if (v6lla_mp == NULL)
2665			goto bad;
2666		((dl_phys_addr_req_t *)v6lla_mp->b_rptr)->dl_addr_type =
2667		    DL_IPV6_LINK_LAYER_ADDR;
2668	}
2669
2670	/*
2671	 * Allocate a DL_NOTIFY_REQ and set the notifications we want.
2672	 */
2673	notify_mp = ip_dlpi_alloc(sizeof (dl_notify_req_t) + sizeof (long),
2674	    DL_NOTIFY_REQ);
2675	if (notify_mp == NULL)
2676		goto bad;
2677	((dl_notify_req_t *)notify_mp->b_rptr)->dl_notifications =
2678	    (DL_NOTE_PHYS_ADDR | DL_NOTE_SDU_SIZE | DL_NOTE_FASTPATH_FLUSH |
2679	    DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN | DL_NOTE_CAPAB_RENEG);
2680
2681	phys_mp = ip_dlpi_alloc(sizeof (dl_phys_addr_req_t) +
2682	    sizeof (t_scalar_t), DL_PHYS_ADDR_REQ);
2683	if (phys_mp == NULL)
2684		goto bad;
2685	((dl_phys_addr_req_t *)phys_mp->b_rptr)->dl_addr_type =
2686	    DL_CURR_PHYS_ADDR;
2687
2688	info_mp = ip_dlpi_alloc(
2689	    sizeof (dl_info_req_t) + sizeof (dl_info_ack_t),
2690	    DL_INFO_REQ);
2691	if (info_mp == NULL)
2692		goto bad;
2693
2694	bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
2695	    DL_BIND_REQ);
2696	if (bind_mp == NULL)
2697		goto bad;
2698	((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
2699	((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;
2700
2701	unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
2702	if (unbind_mp == NULL)
2703		goto bad;
2704
2705	/* If we need to attach/detach, pre-alloc and initialize the mblks */
2706	if (ill->ill_needs_attach) {
2707		attach_mp = ip_dlpi_alloc(sizeof (dl_attach_req_t),
2708		    DL_ATTACH_REQ);
2709		if (attach_mp == NULL)
2710			goto bad;
2711		((dl_attach_req_t *)attach_mp->b_rptr)->dl_ppa = ill->ill_ppa;
2712
2713		detach_mp = ip_dlpi_alloc(sizeof (dl_detach_req_t),
2714		    DL_DETACH_REQ);
2715		if (detach_mp == NULL)
2716			goto bad;
2717	}
2718
2719	/*
2720	 * Here we are going to delay the ioctl ack until after
2721	 * ACKs from DL_PHYS_ADDR_REQ. So need to save the
2722	 * original ioctl message before sending the requests
2723	 */
2724	mutex_enter(&ill->ill_lock);
2725	/* ipsq_pending_mp_add won't fail since we pass in a NULL connp */
2726	(void) ipsq_pending_mp_add(NULL, ipif, ill->ill_wq, mp, 0);
2727	/*
2728	 * Set ill_phys_addr_pend to zero. It will be set to the addr_type of
2729	 * the DL_PHYS_ADDR_REQ in ill_dlpi_send() and ill_dlpi_done(). It will
2730	 * be used to track which DL_PHYS_ADDR_REQ is being ACK'd/NAK'd.
2731	 */
2732	ill->ill_phys_addr_pend = 0;
2733	mutex_exit(&ill->ill_lock);
2734
2735	if (attach_mp != NULL) {
2736		ip1dbg(("ill_dl_phys: attach\n"));
2737		ill_dlpi_send(ill, attach_mp);
2738	}
2739	ill_dlpi_send(ill, bind_mp);
2740	ill_dlpi_send(ill, info_mp);
2741	if (ill->ill_isv6) {
2742		ill_dlpi_send(ill, v6token_mp);
2743		ill_dlpi_send(ill, v6lla_mp);
2744	}
2745	ill_dlpi_send(ill, phys_mp);
2746	ill_dlpi_send(ill, notify_mp);
2747	ill_dlpi_send(ill, unbind_mp);
2748
2749	/*
2750	 * Save the DL_DETACH_REQ (if there is one) for use in ill_delete().
2751	 */
2752	ASSERT(ill->ill_detach_mp == NULL);
2753	ill->ill_detach_mp = detach_mp;
2754
2755	/*
2756	 * This operation will complete in ip_rput_dlpi_writer with either
2757	 * a DL_PHYS_ADDR_ACK or DL_ERROR_ACK.
2758	 */
2759	return (EINPROGRESS);
2760bad:
2761	if (v6token_mp != NULL)
2762		freemsg(v6token_mp);
2763	if (v6lla_mp != NULL)
2764		freemsg(v6lla_mp);
2765	if (phys_mp != NULL)
2766		freemsg(phys_mp);
2767	if (info_mp != NULL)
2768		freemsg(info_mp);
2769	if (attach_mp != NULL)
2770		freemsg(attach_mp);
2771	if (detach_mp != NULL)
2772		freemsg(detach_mp);
2773	if (bind_mp != NULL)
2774		freemsg(bind_mp);
2775	if (unbind_mp != NULL)
2776		freemsg(unbind_mp);
2777	if (notify_mp != NULL)
2778		freemsg(notify_mp);
2779	return (ENOMEM);
2780}
2781
2782uint_t ip_loopback_mtu_v6plus = IP_LOOPBACK_MTU + IPV6_HDR_LEN + 20;
2783
/*
 * DLPI is up.
 * Create all the IREs associated with an interface and bring up multicast.
 * Set the interface flag and finish other initialization
 * that potentially had to be deferred until after DL_BIND_ACK.
 */
2790int
2791ipif_up_done_v6(ipif_t *ipif)
2792{
2793	ire_t	*ire_array[20];
2794	ire_t	**irep = ire_array;
2795	ire_t	**irep1;
2796	ill_t	*ill = ipif->ipif_ill;
2797	queue_t	*stq;
2798	in6_addr_t	v6addr;
2799	in6_addr_t	route_mask;
2800	ipif_t	 *src_ipif = NULL;
2801	ipif_t   *tmp_ipif;
2802	boolean_t	flush_ire_cache = B_TRUE;
2803	int	err;
2804	char	buf[INET6_ADDRSTRLEN];
2805	phyint_t *phyi;
2806	ire_t	**ipif_saved_irep = NULL;
2807	int ipif_saved_ire_cnt;
2808	int cnt;
2809	boolean_t src_ipif_held = B_FALSE;
2810	boolean_t ire_added = B_FALSE;
2811	boolean_t loopback = B_FALSE;
2812	boolean_t ip6_asp_table_held = B_FALSE;
2813	ip_stack_t	*ipst = ill->ill_ipst;
2814
2815	ip1dbg(("ipif_up_done_v6(%s:%u)\n",
2816		ipif->ipif_ill->ill_name, ipif->ipif_id));
2817
2818	/* Check if this is a loopback interface */
2819	if (ipif->ipif_ill->ill_wq == NULL)
2820		loopback = B_TRUE;
2821
2822	ASSERT(ipif->ipif_isv6);
2823	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
2824
2825	/*
2826	 * If all other interfaces for this ill are down or DEPRECATED,
2827	 * or otherwise unsuitable for source address selection, remove
2828	 * any IRE_CACHE entries for this ill to make sure source
2829	 * address selection gets to take this new ipif into account.
2830	 * No need to hold ill_lock while traversing the ipif list since
2831	 * we are writer
2832	 */
2833	for (tmp_ipif = ill->ill_ipif; tmp_ipif;
2834		tmp_ipif = tmp_ipif->ipif_next) {
2835		if (((tmp_ipif->ipif_flags &
2836		    (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
2837		    !(tmp_ipif->ipif_flags & IPIF_UP)) ||
2838		    (tmp_ipif == ipif))
2839			continue;
2840		/* first useable pre-existing interface */
2841		flush_ire_cache = B_FALSE;
2842		break;
2843	}
2844	if (flush_ire_cache)
2845		ire_walk_ill_v6(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
2846		    IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);
2847
2848	/*
2849	 * Figure out which way the send-to queue should go.  Only
2850	 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER should show up here.
2851	 */
2852	switch (ill->ill_net_type) {
2853	case IRE_IF_RESOLVER:
2854		stq = ill->ill_rq;
2855		break;
2856	case IRE_IF_NORESOLVER:
2857	case IRE_LOOPBACK:
2858		stq = ill->ill_wq;
2859		break;
2860	default:
2861		return (EINVAL);
2862	}
2863
2864	if (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) {
2865		/*
2866		 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
2867		 * ipif_lookup_on_name(), but in the case of zones we can have
2868		 * several loopback addresses on lo0. So all the interfaces with
2869		 * loopback addresses need to be marked IRE_LOOPBACK.
2870		 */
2871		if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, &ipv6_loopback))
2872			ipif->ipif_ire_type = IRE_LOOPBACK;
2873		else
2874			ipif->ipif_ire_type = IRE_LOCAL;
2875	}
2876
2877	if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
2878		/*
2879		 * Can't use our source address. Select a different
2880		 * source address for the IRE_INTERFACE and IRE_LOCAL
2881		 */
2882		if (ip6_asp_can_lookup(ipst)) {
2883			ip6_asp_table_held = B_TRUE;
2884			src_ipif = ipif_select_source_v6(ipif->ipif_ill,
2885			    &ipif->ipif_v6subnet, RESTRICT_TO_NONE,
2886			    IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid);
2887		}
2888		if (src_ipif == NULL)
2889			src_ipif = ipif;	/* Last resort */
2890		else
2891			src_ipif_held = B_TRUE;
2892	} else {
2893		src_ipif = ipif;
2894	}
2895
2896	if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
2897	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
2898
2899		/*
2900		 * If we're on a labeled system then make sure that zone-
2901		 * private addresses have proper remote host database entries.
2902		 */
2903		if (is_system_labeled() &&
2904		    ipif->ipif_ire_type != IRE_LOOPBACK) {
2905			if (ip6opt_ls == 0) {
2906				cmn_err(CE_WARN, "IPv6 not enabled "
2907				    "via /etc/system");
2908				return (EINVAL);
2909			}
2910			if (!tsol_check_interface_address(ipif))
2911				return (EINVAL);
2912		}
2913
2914		/* Register the source address for __sin6_src_id */
2915		err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
2916		    ipif->ipif_zoneid, ipst);
2917		if (err != 0) {
2918			ip0dbg(("ipif_up_done_v6: srcid_insert %d\n", err));
2919			if (src_ipif_held)
2920				ipif_refrele(src_ipif);
2921			if (ip6_asp_table_held)
2922				ip6_asp_table_refrele(ipst);
2923			return (err);
2924		}
2925		/*
2926		 * If the interface address is set, create the LOCAL
2927		 * or LOOPBACK IRE.
2928		 */
2929		ip1dbg(("ipif_up_done_v6: creating IRE %d for %s\n",
2930		    ipif->ipif_ire_type,
2931		    inet_ntop(AF_INET6, &ipif->ipif_v6lcl_addr,
2932		    buf, sizeof (buf))));
2933
2934		*irep++ = ire_create_v6(
2935		    &ipif->ipif_v6lcl_addr,		/* dest address */
2936		    &ipv6_all_ones,			/* mask */
2937		    &src_ipif->ipif_v6src_addr,		/* source address */
2938		    NULL,				/* no gateway */
2939		    &ip_loopback_mtu_v6plus,		/* max frag size */
2940		    NULL,
2941		    ipif->ipif_rq,			/* recv-from queue */
2942		    NULL,				/* no send-to queue */
2943		    ipif->ipif_ire_type,		/* LOCAL or LOOPBACK */
2944		    NULL,
2945		    ipif,				/* interface */
2946		    NULL,
2947		    0,
2948		    0,
2949		    (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
2950		    &ire_uinfo_null,
2951		    NULL,
2952		    NULL,
2953		    ipst);
2954	}
2955
2956	/*
2957	 * Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate.
2958	 * Note that atun interfaces have an all-zero ipif_v6subnet.
2959	 * Thus we allow a zero subnet as long as the mask is non-zero.
2960	 */
2961	if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) &&
2962	    !(IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) &&
2963	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))) {
2964		/* ipif_v6subnet is ipif_v6pp_dst_addr for pt-pt */
2965		v6addr = ipif->ipif_v6subnet;
2966
2967		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
2968			route_mask = ipv6_all_ones;
2969		} else {
2970			route_mask = ipif->ipif_v6net_mask;
2971		}
2972
2973		ip1dbg(("ipif_up_done_v6: creating if IRE %d for %s\n",
2974		    ill->ill_net_type,
2975		    inet_ntop(AF_INET6, &v6addr, buf, sizeof (buf))));
2976
2977		*irep++ = ire_create_v6(
2978		    &v6addr,			/* dest pref */
2979		    &route_mask,		/* mask */
2980		    &src_ipif->ipif_v6src_addr,	/* src addr */
2981		    NULL,			/* no gateway */
2982		    &ipif->ipif_mtu,		/* max frag */
2983		    NULL,			/* no Fast path header */
2984		    NULL,			/* no recv from queue */
2985		    stq,			/* send-to queue */
2986		    ill->ill_net_type,		/* IF_[NO]RESOLVER */
2987		    ill->ill_resolver_mp,	/* xmit header */
2988		    ipif,
2989		    NULL,
2990		    0,
2991		    0,
2992		    (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
2993		    &ire_uinfo_null,
2994		    NULL,
2995		    NULL,
2996		    ipst);
2997	}
2998
2999	/*
3000	 * Setup 2002::/16 route, if this interface is a 6to4 tunnel
3001	 */
3002	if (IN6_IS_ADDR_6TO4(&ipif->ipif_v6lcl_addr) &&
3003	    (ill->ill_is_6to4tun)) {
3004		/*
3005		 * Destination address is 2002::/16
3006		 */
3007#ifdef	_BIG_ENDIAN
3008		const in6_addr_t prefix_addr = { 0x20020000U, 0, 0, 0 };
3009		const in6_addr_t prefix_mask = { 0xffff0000U, 0, 0, 0 };
3010#else
3011		const in6_addr_t prefix_addr = { 0x00000220U, 0, 0, 0 };
3012		const in6_addr_t prefix_mask = { 0x0000ffffU, 0, 0, 0 };
3013#endif /* _BIG_ENDIAN */
3014		char	buf2[INET6_ADDRSTRLEN];
3015		ire_t *isdup;
3016		in6_addr_t *first_addr = &ill->ill_ipif->ipif_v6lcl_addr;
3017
3018		/*
3019		 * check to see if this route has already been added for
3020		 * this tunnel interface.
3021		 */
3022		isdup = ire_ftable_lookup_v6(first_addr, &prefix_mask, 0,
3023		    IRE_IF_NORESOLVER, ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
3024		    (MATCH_IRE_SRC | MATCH_IRE_MASK), ipst);
3025
3026		if (isdup == NULL) {
3027			ip1dbg(("ipif_up_done_v6: creating if IRE %d for %s",
3028			    IRE_IF_NORESOLVER, inet_ntop(AF_INET6, &v6addr,
3029				buf2, sizeof (buf2))));
3030
3031			*irep++ = ire_create_v6(
3032			    &prefix_addr,		/* 2002:: */
3033			    &prefix_mask,		/* ffff:: */
3034			    &ipif->ipif_v6lcl_addr, 	/* src addr */
3035			    NULL, 			/* gateway */
3036			    &ipif->ipif_mtu, 		/* max_frag */
3037			    NULL, 			/* no Fast Path hdr */
3038			    NULL, 			/* no rfq */
3039			    ill->ill_wq, 		/* stq */
3040			    IRE_IF_NORESOLVER,		/* type */
3041			    ill->ill_resolver_mp,	/* dlureq_mp */
3042			    ipif,			/* interface */
3043			    NULL,			/* v6cmask */
3044			    0,
3045			    0,
3046			    RTF_UP,
3047			    &ire_uinfo_null,
3048			    NULL,
3049			    NULL,
3050			    ipst);
3051		} else {
3052			ire_refrele(isdup);
3053		}
3054	}
3055
3056	/* If an earlier ire_create failed, get out now */
3057	for (irep1 = irep; irep1 > ire_array; ) {
3058		irep1--;
3059		if (*irep1 == NULL) {
3060			ip1dbg(("ipif_up_done_v6: NULL ire found in"
3061			    " ire_array\n"));
3062			err = ENOMEM;
3063			goto bad;
3064		}
3065	}
3066
3067	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
3068
3069	/*
3070	 * Need to atomically check for ip_addr_availability_check
3071	 * now under ill_g_lock, and if it fails goto bad, and remove
3072	 * from group also
3073	 */
3074	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3075	mutex_enter(&ipst->ips_ip_addr_avail_lock);
3076	ill->ill_ipif_up_count++;
3077	ipif->ipif_flags |= IPIF_UP;
3078	err = ip_addr_availability_check(ipif);
3079	mutex_exit(&ipst->ips_ip_addr_avail_lock);
3080	rw_exit(&ipst->ips_ill_g_lock);
3081
3082	if (err != 0) {
3083		/*
3084		 * Our address may already be up on the same ill. In this case,
3085		 * the external resolver entry for our ipif replaced the one for
3086		 * the other ipif. So we don't want to delete it (otherwise the
3087		 * other ipif would be unable to send packets).
3088		 * ip_addr_availability_check() identifies this case for us and
3089		 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL
3090		 * which is the expected error code.
3091		 */
3092		if (err == EADDRINUSE) {
3093			if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV) {
3094				freemsg(ipif->ipif_arp_del_mp);
3095				ipif->ipif_arp_del_mp = NULL;
3096			}
3097			err = EADDRNOTAVAIL;
3098		}
3099		ill->ill_ipif_up_count--;
3100		ipif->ipif_flags &= ~IPIF_UP;
3101		goto bad;
3102	}
3103
3104	/*
3105	 * Add in all newly created IREs. We want to add before
3106	 * we call ifgrp_insert which wants to know whether
3107	 * IRE_IF_RESOLVER exists or not.
3108	 *
3109	 * NOTE : We refrele the ire though we may branch to "bad"
3110	 *	  later on where we do ire_delete. This is okay
3111	 *	  because nobody can delete it as we are running
3112	 *	  exclusively.
3113	 */
3114	for (irep1 = irep; irep1 > ire_array; ) {
3115		irep1--;
3116		/* Shouldn't be adding any bcast ire's */
3117		ASSERT((*irep1)->ire_type != IRE_BROADCAST);
3118		ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
3119		/*
3120		 * refheld by ire_add. refele towards the end of the func
3121		 */
3122		(void) ire_add(irep1, NULL, NULL, NULL, B_FALSE);
3123	}
3124	if (ip6_asp_table_held) {
3125		ip6_asp_table_refrele(ipst);
3126		ip6_asp_table_held = B_FALSE;
3127	}
3128	ire_added = B_TRUE;
3129
3130	/*
3131	 * Form groups if possible.
3132	 *
3133	 * If we are supposed to be in a ill_group with a name, insert it
3134	 * now as we know that at least one ipif is UP. Otherwise form
3135	 * nameless groups.
3136	 *
3137	 * If ip_enable_group_ifs is set and ipif address is not ::0, insert
3138	 * this ipif into the appropriate interface group, or create a
3139	 * new one. If this is already in a nameless group, we try to form
3140	 * a bigger group looking at other ills potentially sharing this
3141	 * ipif's prefix.
3142	 */
3143	phyi = ill->ill_phyint;
3144	if (phyi->phyint_groupname_len != 0) {
3145		ASSERT(phyi->phyint_groupname != NULL);
3146		if (ill->ill_ipif_up_count == 1) {
3147			ASSERT(ill->ill_group == NULL);
3148			err = illgrp_insert(&ipst->ips_illgrp_head_v6, ill,
3149			    phyi->phyint_groupname, NULL, B_TRUE);
3150			if (err != 0) {
3151				ip1dbg(("ipif_up_done_v6: illgrp allocation "
3152				    "failed, error %d\n", err));
3153				goto bad;
3154			}
3155		}
3156		ASSERT(ill->ill_group != NULL);
3157	}
3158
3159	/* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
3160	ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
3161	ipif_saved_irep = ipif_recover_ire_v6(ipif);
3162
3163	if (ipif->ipif_ipif_up_count == 1 && !loopback) {
3164		/*
3165		 * Need to recover all multicast memberships in the driver.
3166		 * This had to be deferred until we had attached.
3167		 */
3168		ill_recover_multicast(ill);
3169	}
3170	/* Join the allhosts multicast address and the solicited node MC */
3171	ipif_multicast_up(ipif);
3172
3173	if (!loopback) {
3174		/*
3175		 * See whether anybody else would benefit from the
3176		 * new ipif that we added. We call this always rather
3177		 * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST
3178		 * ipif for the benefit of illgrp_insert (done above)
3179		 * which does not do source address selection as it does
3180		 * not want to re-create interface routes that we are
3181		 * having reference to it here.
3182		 */
3183		ill_update_source_selection(ill);
3184	}
3185
3186	for (irep1 = irep; irep1 > ire_array; ) {
3187		irep1--;
3188		if (*irep1 != NULL) {
3189			/* was held in ire_add */
3190			ire_refrele(*irep1);
3191		}
3192	}
3193
3194	cnt = ipif_saved_ire_cnt;
3195	for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) {
3196		if (*irep1 != NULL) {
3197			/* was held in ire_add */
3198			ire_refrele(*irep1);
3199		}
3200	}
3201
3202	if (ipif->ipif_addr_ready) {
3203		ip_rts_ifmsg(ipif);
3204		ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
3205		sctp_update_ipif(ipif, SCTP_IPIF_UP);
3206	}
3207
3208	if (ipif_saved_irep != NULL) {
3209		kmem_free(ipif_saved_irep,
3210		    ipif_saved_ire_cnt * sizeof (ire_t *));
3211	}
3212
3213	if (src_ipif_held)
3214		ipif_refrele(src_ipif);
3215	return (0);
3216
3217bad:
3218	if (ip6_asp_table_held)
3219		ip6_asp_table_refrele(ipst);
3220	/*
3221	 * We don't have to bother removing from ill groups because
3222	 *
3223	 * 1) For groups with names, we insert only when the first ipif
3224	 *    comes up. In that case if it fails, it will not be in any
3225	 *    group. So, we need not try to remove for that case.
3226	 *
3227	 * 2) For groups without names, either we tried to insert ipif_ill
3228	 *    in a group as singleton or found some other group to become
3229	 *    a bigger group. For the former, if it fails we don't have
3230	 *    anything to do as ipif_ill is not in the group and for the
3231	 *    latter, there are no failures in illgrp_insert/illgrp_delete
3232	 *    (ENOMEM can't occur for this. Check ifgrp_insert).
3233	 */
3234
3235	while (irep > ire_array) {
3236		irep--;
3237		if (*irep != NULL) {
3238			ire_delete(*irep);
3239			if (ire_added)
3240				ire_refrele(*irep);
3241		}
3242
3243	}
3244	(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
3245
3246	if (ipif_saved_irep != NULL) {
3247		kmem_free(ipif_saved_irep,
3248		    ipif_saved_ire_cnt * sizeof (ire_t *));
3249	}
3250	if (src_ipif_held)
3251		ipif_refrele(src_ipif);
3252
3253	ipif_ndp_down(ipif);
3254	if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV)
3255		ipif_arp_down(ipif);
3256
3257	return (err);
3258}
3259
3260/*
3261 * Delete an ND entry and the corresponding IRE_CACHE entry if it exists.
3262 */
3263/* ARGSUSED */
3264int
3265ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
3266    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
3267{
3268	in6_addr_t	addr;
3269	sin6_t		*sin6;
3270	nce_t		*nce;
3271	struct lifreq	*lifr;
3272	lif_nd_req_t	*lnr;
3273	mblk_t	*mp1;
3274
3275	mp1 = mp->b_cont->b_cont;
3276	lifr = (struct lifreq *)mp1->b_rptr;
3277	lnr = &lifr->lifr_nd;
3278	/* Only allow for logical unit zero i.e. not on "le0:17" */
3279	if (ipif->ipif_id != 0)
3280		return (EINVAL);
3281
3282	if (!ipif->ipif_isv6)
3283		return (EINVAL);
3284
3285	if (lnr->lnr_addr.ss_family != AF_INET6)
3286		return (EAFNOSUPPORT);
3287
3288	sin6 = (sin6_t *)&lnr->lnr_addr;
3289	addr = sin6->sin6_addr;
3290	nce = ndp_lookup_v6(ipif->ipif_ill, &addr, B_FALSE);
3291	if (nce == NULL)
3292		return (ESRCH);
3293	ndp_delete(nce);
3294	NCE_REFRELE(nce);
3295	return (0);
3296}
3297
3298/*
3299 * Return nbr cache info.
3300 */
3301/* ARGSUSED */
3302int
3303ip_siocqueryndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
3304    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
3305{
3306	ill_t		*ill = ipif->ipif_ill;
3307	struct lifreq	*lifr;
3308	lif_nd_req_t	*lnr;
3309
3310	lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr;
3311	lnr = &lifr->lifr_nd;
3312	/* Only allow for logical unit zero i.e. not on "le0:17" */
3313	if (ipif->ipif_id != 0)
3314		return (EINVAL);
3315
3316	if (!ipif->ipif_isv6)
3317		return (EINVAL);
3318
3319	if (lnr->lnr_addr.ss_family != AF_INET6)
3320		return (EAFNOSUPPORT);
3321
3322	if (ill->ill_phys_addr_length > sizeof (lnr->lnr_hdw_addr))
3323		return (EINVAL);
3324
3325	return (ndp_query(ill, lnr));
3326}
3327
3328/*
3329 * Perform an update of the nd entry for the specified address.
3330 */
3331/* ARGSUSED */
3332int
3333ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
3334    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
3335{
3336	ill_t		*ill = ipif->ipif_ill;
3337	struct	lifreq	*lifr;
3338	lif_nd_req_t	*lnr;
3339
3340	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
3341
3342	lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr;
3343	lnr = &lifr->lifr_nd;
3344	/* Only allow for logical unit zero i.e. not on "le0:17" */
3345	if (ipif->ipif_id != 0)
3346		return (EINVAL);
3347
3348	if (!ipif->ipif_isv6)
3349		return (EINVAL);
3350
3351	if (lnr->lnr_addr.ss_family != AF_INET6)
3352		return (EAFNOSUPPORT);
3353
3354	return (ndp_sioc_update(ill, lnr));
3355}
3356