ip6_ire.c revision 12016:0248e987199b
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25/*
26 * Copyright (c) 1990 Mentat Inc.
27 */
28
29/*
30 * This file contains routines that manipulate Internet Routing Entries (IREs).
31 */
32#include <sys/types.h>
33#include <sys/stream.h>
34#include <sys/stropts.h>
35#include <sys/ddi.h>
36#include <sys/cmn_err.h>
37
38#include <sys/systm.h>
39#include <sys/param.h>
40#include <sys/socket.h>
41#include <net/if.h>
42#include <net/route.h>
43#include <netinet/in.h>
44#include <net/if_dl.h>
45#include <netinet/ip6.h>
46#include <netinet/icmp6.h>
47
48#include <inet/common.h>
49#include <inet/mi.h>
50#include <inet/ip.h>
51#include <inet/ip6.h>
52#include <inet/ip_ndp.h>
53#include <inet/ip_if.h>
54#include <inet/ip_ire.h>
55#include <inet/ipclassifier.h>
56#include <inet/nd.h>
57#include <inet/tunables.h>
58#include <sys/kmem.h>
59#include <sys/zone.h>
60
61#include <sys/tsol/label.h>
62#include <sys/tsol/tnet.h>
63
64#define	IS_DEFAULT_ROUTE_V6(ire)	\
65	(((ire)->ire_type & IRE_DEFAULT) || \
66	    (((ire)->ire_type & IRE_INTERFACE) && \
67	    (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6))))
68
69static	ire_t	ire_null;
70
71static ire_t *
72ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
73    const in6_addr_t *gateway, int type, const ill_t *ill,
74    zoneid_t zoneid, const ts_label_t *tsl, int flags,
75    ip_stack_t *ipst);
76
77/*
78 * Initialize the IPv6-specific part of the ire and call
79 * ire_init_common to finish it.
80 * Returns zero or errno.
81 */
82int
83ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
84    const in6_addr_t *v6gateway, ushort_t type, ill_t *ill,
85    zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
86{
87	int error;
88
89	/*
90	 * Reject IRE security attribute creation/initialization
91	 * if the system is not running in Trusted mode.
92	 */
93	if (gc != NULL && !is_system_labeled())
94		return (EINVAL);
95
96	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
97	if (v6addr != NULL)
98		ire->ire_addr_v6 = *v6addr;
99	if (v6gateway != NULL)
100		ire->ire_gateway_addr_v6 = *v6gateway;
101
102	/* Make sure we don't have stray values in some fields */
103	switch (type) {
104	case IRE_LOOPBACK:
105	case IRE_HOST:
106	case IRE_LOCAL:
107	case IRE_IF_CLONE:
108		ire->ire_mask_v6 = ipv6_all_ones;
109		ire->ire_masklen = IPV6_ABITS;
110		break;
111	case IRE_PREFIX:
112	case IRE_DEFAULT:
113	case IRE_IF_RESOLVER:
114	case IRE_IF_NORESOLVER:
115		if (v6mask != NULL) {
116			ire->ire_mask_v6 = *v6mask;
117			ire->ire_masklen =
118			    ip_mask_to_plen_v6(&ire->ire_mask_v6);
119		}
120		break;
121	case IRE_MULTICAST:
122	case IRE_NOROUTE:
123		ASSERT(v6mask == NULL);
124		break;
125	default:
126		ASSERT(0);
127		return (EINVAL);
128	}
129
130	error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION,
131	    gc, ipst);
132	if (error != 0)
133		return (error);
134
135	/* Determine which function pointers to use */
136	ire->ire_postfragfn = ip_xmit;		/* Common case */
137
138	switch (ire->ire_type) {
139	case IRE_LOCAL:
140		ire->ire_sendfn = ire_send_local_v6;
141		ire->ire_recvfn = ire_recv_local_v6;
142		ASSERT(ire->ire_ill != NULL);
143		if (ire->ire_ill->ill_flags & ILLF_NOACCEPT)
144			ire->ire_recvfn = ire_recv_noaccept_v6;
145		break;
146	case IRE_LOOPBACK:
147		ire->ire_sendfn = ire_send_local_v6;
148		ire->ire_recvfn = ire_recv_loopback_v6;
149		break;
150	case IRE_MULTICAST:
151		ire->ire_postfragfn = ip_postfrag_loopcheck;
152		ire->ire_sendfn = ire_send_multicast_v6;
153		ire->ire_recvfn = ire_recv_multicast_v6;
154		break;
155	default:
156		/*
157		 * For IRE_IF_ALL and IRE_OFFLINK we forward received
158		 * packets by default.
159		 */
160		ire->ire_sendfn = ire_send_wire_v6;
161		ire->ire_recvfn = ire_recv_forward_v6;
162		break;
163	}
164	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
165		ire->ire_sendfn = ire_send_noroute_v6;
166		ire->ire_recvfn = ire_recv_noroute_v6;
167	} else if (ire->ire_flags & RTF_MULTIRT) {
168		ire->ire_postfragfn = ip_postfrag_multirt_v6;
169		ire->ire_sendfn = ire_send_multirt_v6;
170		ire->ire_recvfn = ire_recv_multirt_v6;
171	}
172	ire->ire_nce_capable = ire_determine_nce_capable(ire);
173	return (0);
174}
175
176/*
177 * ire_create_v6 is called to allocate and initialize a new IRE.
178 *
179 * NOTE : This is called as writer sometimes though not required
180 * by this function.
181 */
182/* ARGSUSED */
183ire_t *
184ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
185    const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid,
186    uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
187{
188	ire_t	*ire;
189	int	error;
190
191	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
192
193	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
194	if (ire == NULL) {
195		DTRACE_PROBE(kmem__cache__alloc);
196		return (NULL);
197	}
198	*ire = ire_null;
199
200	error = ire_init_v6(ire, v6addr, v6mask, v6gateway,
201	    type, ill, zoneid, flags, gc, ipst);
202
203	if (error != 0) {
204		DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error);
205		kmem_cache_free(ire_cache, ire);
206		return (NULL);
207	}
208	return (ire);
209}
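
/*
 * Illustrative sketch (not part of the original file): a typical
 * create-and-add sequence for a /64 on-link route.  The names "prefix",
 * "ill" and "ipst" below are hypothetical placeholders.
 *
 *	in6_addr_t mask;
 *	ire_t *ire, *nire;
 *
 *	(void) ip_plen_to_mask_v6(64, &mask);
 *	ire = ire_create_v6(&prefix, &mask, NULL, IRE_IF_NORESOLVER, ill,
 *	    ALL_ZONES, RTF_UP, NULL, ipst);
 *	if (ire != NULL) {
 *		nire = ire_add_v6(ire);
 *		if (nire != NULL)
 *			ire_refrele(nire);
 *	}
 */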
210
211/*
212 * Find the ill matching a multicast group.
213 * Allows different routes for multicast addresses
214 * in the unicast routing table (akin to ff00::/8 but could be more specific)
215 * which point at different interfaces. This is used when IPV6_MULTICAST_IF
216 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
217 * specify the interface to join on.
218 *
219 * Supports link-local addresses by using ire_route_recursive which follows
220 * the ill when recursing.
221 *
222 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
223 * and the MULTIRT property can be different for different groups, we
224 * extract RTF_MULTIRT from the special unicast route added for a group
225 * with CGTP and pass that back in the multirtp argument.
226 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
227 * We have a setsrcp argument for the same reason.
228 */
229ill_t *
230ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid,
231    ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp)
232{
233	ire_t	*ire;
234	ill_t	*ill;
235
236	ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL,
237	    MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
238	ASSERT(ire != NULL);
239
240	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
241		ire_refrele(ire);
242		return (NULL);
243	}
244
245	if (multirtp != NULL)
246		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
247
248	ill = ire_nexthop_ill(ire);
249	ire_refrele(ire);
250	return (ill);
251}
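
/*
 * Worked example (illustrative): if the unicast routing table contains a
 * route for ff00::/8 (or a more specific multicast prefix) pointing at a
 * particular interface, then a join of, say, ff05::1:3 with no interface
 * specified ends up here, the recursive lookup above resolves that route,
 * and the ill it points at is returned.  The group address is an example
 * only.
 */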
252
253/*
254 * This function takes a mask and returns the number of bits set in the
255 * mask (the represented prefix length).  Assumes a contiguous mask.
256 */
257int
258ip_mask_to_plen_v6(const in6_addr_t *v6mask)
259{
260	int		bits;
261	int		plen = IPV6_ABITS;
262	int		i;
263
264	for (i = 3; i >= 0; i--) {
265		if (v6mask->s6_addr32[i] == 0) {
266			plen -= 32;
267			continue;
268		}
269		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
270		if (bits == 0)
271			break;
272		plen -= bits;
273	}
274
275	return (plen);
276}
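
/*
 * Worked example (illustrative) of the loop above: for the mask
 * ffff:ffff:ffff:ff00:: the two low-order 32-bit words are zero
 * (plen 128 -> 96 -> 64), the next word 0xffffff00 has ffs() == 9 so
 * 8 trailing zero bits are subtracted (plen 56), and the top word
 * 0xffffffff has ffs() == 1, stopping the loop; the result is 56.
 */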
277
278/*
279 * Convert a prefix length to the mask for that prefix.
280 * Returns the argument bitmask.
281 */
282in6_addr_t *
283ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
284{
285	uint32_t *ptr;
286
287	if (plen > IPV6_ABITS)
288		return (NULL);
289	*bitmask = ipv6_all_zeros;
290	if (plen == 0)
291		return (bitmask);
292
293	ptr = (uint32_t *)bitmask;
294	while (plen > 32) {
295		*ptr++ = 0xffffffffU;
296		plen -= 32;
297	}
298	*ptr = htonl(0xffffffffU << (32 - plen));
299	return (bitmask);
300}
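
/*
 * Illustrative round trip (assumption, not original code): converting a
 * prefix length to a mask and back yields the original length.
 *
 *	in6_addr_t mask;
 *
 *	(void) ip_plen_to_mask_v6(56, &mask);
 *	ASSERT(ip_mask_to_plen_v6(&mask) == 56);
 *
 * where the intermediate mask is ffff:ffff:ffff:ff00::.
 */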
301
302/*
303 * Add a fully initialized IPv6 IRE to the forwarding table.
304 * This returns NULL on failure, or a held IRE on success.
305 * Normally the returned IRE is the same as the argument. But a different
306 * IRE will be returned if the added IRE is deemed identical to an existing
307 * one. In that case ire_identical_ref will be increased.
308 * The caller always needs to do an ire_refrele() on the returned IRE.
309 */
310ire_t *
311ire_add_v6(ire_t *ire)
312{
313	ire_t	*ire1;
314	int	mask_table_index;
315	irb_t	*irb_ptr;
316	ire_t	**irep;
317	int	match_flags;
318	int	error;
319	ip_stack_t	*ipst = ire->ire_ipst;
320
321	ASSERT(ire->ire_ipversion == IPV6_VERSION);
322
323	/* Make sure the address is properly masked. */
324	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
325
326	mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
327	if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) {
328		irb_t *ptr;
329		int i;
330
331		ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size *
332		    sizeof (irb_t)));
333		if (ptr == NULL) {
334			ire_delete(ire);
335			return (NULL);
336		}
337		for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
338			rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL);
339			ptr[i].irb_ipst = ipst;
340		}
341		mutex_enter(&ipst->ips_ire_ft_init_lock);
342		if (ipst->ips_ip_forwarding_table_v6[mask_table_index] ==
343		    NULL) {
344			ipst->ips_ip_forwarding_table_v6[mask_table_index] =
345			    ptr;
346			mutex_exit(&ipst->ips_ire_ft_init_lock);
347		} else {
348			/*
349			 * Some other thread won the race in
350			 * initializing the forwarding table at the
351			 * same index.
352			 */
353			mutex_exit(&ipst->ips_ire_ft_init_lock);
354			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
355				rw_destroy(&ptr[i].irb_lock);
356			}
357			mi_free(ptr);
358		}
359	}
360	irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
361	    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
362	    ipst->ips_ip6_ftable_hash_size)]);
363
364	match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
365	if (ire->ire_ill != NULL)
366		match_flags |= MATCH_IRE_ILL;
367	/*
368	 * Start the atomic add of the ire. Grab the bucket lock and the
369	 * ill lock. Check for condemned.
370	 */
371	error = ire_atomic_start(irb_ptr, ire);
372	if (error != 0) {
373		ire_delete(ire);
374		return (NULL);
375	}
376
377	/*
378	 * If we are creating a hidden IRE, make sure we search for
379	 * hidden IREs when searching for duplicates below.
380	 * Otherwise, we might find an IRE on some other interface
381	 * that's not marked hidden.
382	 */
383	if (ire->ire_testhidden)
384		match_flags |= MATCH_IRE_TESTHIDDEN;
385
386	/*
387	 * Atomically check for duplicate and insert in the table.
388	 */
389	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
390		if (IRE_IS_CONDEMNED(ire1))
391			continue;
392		/*
393		 * Here we need an exact match on zoneid, i.e.,
394		 * ire_match_args doesn't fit.
395		 */
396		if (ire1->ire_zoneid != ire->ire_zoneid)
397			continue;
398
399		if (ire1->ire_type != ire->ire_type)
400			continue;
401
402		/*
403		 * Note: We do not allow multiple routes that differ only
404		 * in the gateway security attributes; such routes are
405		 * considered duplicates.
406		 * To change that we explicitly have to treat them as
407		 * different here.
408		 */
409		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
410		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
411		    ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL,
412		    match_flags)) {
413			/*
414			 * Return the old ire after doing a REFHOLD.
415			 * As most of the callers continue to use the IRE
416			 * after adding, we return a held ire. This will
417			 * avoid a lookup in the caller again. If the callers
418			 * don't want to use it, they need to do a REFRELE.
419			 */
420			ip1dbg(("found dup ire existing %p new %p",
421			    (void *)ire1, (void *)ire));
422			ire_refhold(ire1);
423			atomic_add_32(&ire1->ire_identical_ref, 1);
424			ire_atomic_end(irb_ptr, ire);
425			ire_delete(ire);
426			return (ire1);
427		}
428	}
429
430	/*
431	 * Normally we do head insertion since most things do not care about
432	 * the order of the IREs in the bucket.
433	 * However, due to shared-IP zones (and restrict_interzone_loopback)
434	 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
435	 * address. For that reason we do tail insertion for IRE_IF_CLONE.
436	 */
437	irep = (ire_t **)irb_ptr;
438	if (ire->ire_type & IRE_IF_CLONE) {
439		while ((ire1 = *irep) != NULL)
440			irep = &ire1->ire_next;
441	}
442	/* Insert at *irep */
443	ire1 = *irep;
444	if (ire1 != NULL)
445		ire1->ire_ptpn = &ire->ire_next;
446	ire->ire_next = ire1;
447	/* Link the new one in. */
448	ire->ire_ptpn = irep;
449	/*
450	 * ire_walk routines de-reference ire_next without holding
451	 * a lock. Before we point to the new ire, we want to make
452	 * sure the store that sets the ire_next of the new ire
453	 * reaches global visibility, so that ire_walk routines
454	 * don't see a truncated list of ires, i.e. if the ire_next
455	 * of the new ire gets set after we do "*irep = ire" due
456	 * to re-ordering, the ire_walk thread will see a NULL
457	 * once it accesses the ire_next of the new ire.
458	 * membar_producer() makes sure that the following store
459	 * happens *after* all of the above stores.
460	 */
461	membar_producer();
462	*irep = ire;
463	ire->ire_bucket = irb_ptr;
464	/*
465	 * We return a bumped up IRE above. Keep it symmetrical
466	 * so that the callers will always have to release. This
467	 * helps the callers of this function because they continue
468	 * to use the IRE after adding and hence they don't have to
469	 * lookup again after we return the IRE.
470	 *
471	 * NOTE : We don't have to use atomics as this is appearing
472	 * in the list for the first time and no one else can bump
473	 * up the reference count on this yet.
474	 */
475	ire_refhold_locked(ire);
476	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
477	irb_ptr->irb_ire_cnt++;
478
479	if (ire->ire_ill != NULL) {
480		DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill,
481		    (char *), "ire", (void *), ire);
482		ire->ire_ill->ill_ire_cnt++;
483		ASSERT(ire->ire_ill->ill_ire_cnt != 0);	/* Wraparound */
484	}
485	ire_atomic_end(irb_ptr, ire);
486
487	/* Notify or update any caching of the IREs */
488	ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
489
490	return (ire);
491}
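
/*
 * Usage note (illustrative, summarizing the contract above): the caller
 * must not touch the IRE it passed in after calling ire_add_v6(), and must
 * always release whatever is returned:
 *
 *	nire = ire_add_v6(ire);
 *	if (nire == NULL)
 *		failure; "ire" has already been deleted
 *	else if (nire != ire)
 *		a duplicate existed; "ire" was deleted, continue with "nire"
 *	...
 *	ire_refrele(nire);
 */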
492
493/*
494 * Search for all HOST REDIRECT routes that are
495 * pointing at the specified gateway and
496 * delete them. This routine is called only
497 * when a default gateway is going away.
498 */
499static void
500ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
501{
502	irb_t *irb_ptr;
503	irb_t *irb;
504	ire_t *ire;
505	in6_addr_t gw_addr_v6;
506	int i;
507
508	/* get the hash table for HOST routes */
509	irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
510	if (irb_ptr == NULL)
511		return;
512	for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
513		irb = &irb_ptr[i];
514		irb_refhold(irb);
515		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
516			if (!(ire->ire_flags & RTF_DYNAMIC))
517				continue;
518			mutex_enter(&ire->ire_lock);
519			gw_addr_v6 = ire->ire_gateway_addr_v6;
520			mutex_exit(&ire->ire_lock);
521			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
522				ire_delete(ire);
523		}
524		irb_refrele(irb);
525	}
526}
527
528/*
529 * Delete the specified IRE.
530 * All calls should use ire_delete().
531 * Sometimes called as writer though not required by this function.
532 *
533 * NOTE : This function is called only if the ire was added
534 * in the list.
535 */
536void
537ire_delete_v6(ire_t *ire)
538{
539	in6_addr_t gw_addr_v6;
540	ip_stack_t	*ipst = ire->ire_ipst;
541
542	/*
543	 * Make sure any ire_generation increase from ire_flush_cache
544	 * happens after any lookup/reader has read ire_generation.
545	 * Since the rw_enter makes us wait until any lookup/reader has
546	 * completed we can exit the lock immediately.
547	 */
548	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
549	rw_exit(&ipst->ips_ip6_ire_head_lock);
550
551	ASSERT(ire->ire_refcnt >= 1);
552	ASSERT(ire->ire_ipversion == IPV6_VERSION);
553
554	ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
555
556	if (ire->ire_type == IRE_DEFAULT) {
557		/*
558		 * when a default gateway is going away
559		 * delete all the host redirects pointing at that
560		 * gateway.
561		 */
562		mutex_enter(&ire->ire_lock);
563		gw_addr_v6 = ire->ire_gateway_addr_v6;
564		mutex_exit(&ire->ire_lock);
565		ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
566	}
567
568	/*
569	 * If we are deleting an IRE_INTERFACE then we make sure we also
570	 * delete any IRE_IF_CLONE that has been created from it.
571	 * Those are always in ire_dep_children.
572	 */
573	if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
574		ire_dep_delete_if_clone(ire);
575
576	/* Remove from parent dependencies and child */
577	rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
578	if (ire->ire_dep_parent != NULL) {
579		ire_dep_remove(ire);
580	}
581	while (ire->ire_dep_children != NULL)
582		ire_dep_remove(ire->ire_dep_children);
583	rw_exit(&ipst->ips_ire_dep_lock);
584}
585
586/*
587 * When an IRE is added or deleted this routine is called to make sure
588 * any caching of IRE information is notified or updated.
589 *
590 * The flag argument indicates if the flush request is due to addition
591 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
592 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
593 */
594void
595ire_flush_cache_v6(ire_t *ire, int flag)
596{
597	ip_stack_t *ipst = ire->ire_ipst;
598
599	/*
600	 * IRE_IF_CLONE ire's don't provide any more information
601	 * than the parent from which they are cloned, so don't
602	 * perturb the generation numbers.
603	 */
604	if (ire->ire_type & IRE_IF_CLONE)
605		return;
606
607	/*
608	 * Ensure that an ire_add during a lookup serializes the updates of
609	 * the generation numbers under ire_head_lock so that the lookup gets
610	 * either the old ire and old generation number, or a new ire and new
611	 * generation number.
612	 */
613	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
614
615	/*
616	 * If a route was just added, we need to notify everybody that
617	 * has cached an IRE_NOROUTE since there might now be a better
618	 * route for them.
619	 */
620	if (flag == IRE_FLUSH_ADD) {
621		ire_increment_generation(ipst->ips_ire_reject_v6);
622		ire_increment_generation(ipst->ips_ire_blackhole_v6);
623	}
624
625	/* Adding a default can't otherwise provide a better route */
626	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
627		rw_exit(&ipst->ips_ip6_ire_head_lock);
628		return;
629	}
630
631	switch (flag) {
632	case IRE_FLUSH_DELETE:
633	case IRE_FLUSH_GWCHANGE:
634		/*
635		 * Update ire_generation for all ire_dep_children chains
636		 * starting with this IRE
637		 */
638		ire_dep_incr_generation(ire);
639		break;
640	case IRE_FLUSH_ADD: {
641		in6_addr_t	addr;
642		in6_addr_t	mask;
643		ip_stack_t	*ipst = ire->ire_ipst;
644		uint_t		masklen;
645
646		/*
647		 * Find an IRE which is a shorter match than the ire to be added.
648		 * For each such IRE (we repeat the lookup) we update the
649		 * ire_generation the same way as in the delete case.
650		 */
651		addr = ire->ire_addr_v6;
652		mask = ire->ire_mask_v6;
653		masklen = ip_mask_to_plen_v6(&mask);
654
655		ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL,
656		    ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
657		while (ire != NULL) {
658			/* We need to handle all in the same bucket */
659			irb_increment_generation(ire->ire_bucket);
660
661			mask = ire->ire_mask_v6;
662			ASSERT(masklen > ip_mask_to_plen_v6(&mask));
663			masklen = ip_mask_to_plen_v6(&mask);
664			ire_refrele(ire);
665			ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0,
666			    NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
667		}
668		}
669		break;
670	}
671	rw_exit(&ipst->ips_ip6_ire_head_lock);
672}
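
/*
 * Worked example (illustrative) of the IRE_FLUSH_ADD case above: when a
 * route for 2001:db8:1::/48 is added, the MATCH_IRE_SHORTERMASK loop finds,
 * for example, an existing 2001:db8::/32 route and then a ::/0 default,
 * bumping the generation of each bucket so that anything caching the less
 * specific routes re-resolves and can switch to the better match.  The
 * prefixes are examples only.
 */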
673
674/*
675 * Matches the arguments passed with the values in the ire.
676 *
677 * Note: for match types that match using "ill" passed in, ill
678 * must be checked for non-NULL before calling this routine.
679 */
680boolean_t
681ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
682    const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
683    const ts_label_t *tsl, int match_flags)
684{
685	in6_addr_t masked_addr;
686	in6_addr_t gw_addr_v6;
687	ill_t *ire_ill = NULL, *dst_ill;
688	ip_stack_t *ipst = ire->ire_ipst;
689
690	ASSERT(ire->ire_ipversion == IPV6_VERSION);
691	ASSERT(addr != NULL);
692	ASSERT(mask != NULL);
693	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
694	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
695	    (ill != NULL && ill->ill_isv6));
696
697	/*
698	 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
699	 * is in fact hidden, to ensure the caller gets the right one.
700	 */
701	if (ire->ire_testhidden) {
702		if (!(match_flags & MATCH_IRE_TESTHIDDEN))
703			return (B_FALSE);
704	}
705
706	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
707	    ire->ire_zoneid != ALL_ZONES) {
708		/*
709		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
710		 * does not match that of ire_zoneid, a failure to
711		 * match is reported at this point. Otherwise, since some IREs
712		 * that are available in the global zone can be used in local
713		 * zones, additional checks need to be performed:
714		 *
715		 * IRE_LOOPBACK
716		 *	entries should never be matched in this situation.
717		 *	Each zone has its own IRE_LOOPBACK.
718		 *
719		 * IRE_LOCAL
720		 *	We allow them for any zoneid. ire_route_recursive
721		 *	does additional checks when
722		 *	ip_restrict_interzone_loopback is set.
723		 *
724		 * If ill_usesrc_ifindex is set
725		 *	Then we check if the zone has a valid source address
726		 *	on the usesrc ill.
727		 *
728		 * If ire_ill is set, then check that the zone has an ipif
729		 *	on that ill.
730		 *
731		 * Outside of this function (in ire_round_robin) we check
732		 * that any IRE_OFFLINK has a gateway that is reachable from the
733		 * zone when we have multiple choices (ECMP).
734		 */
735		if (match_flags & MATCH_IRE_ZONEONLY)
736			return (B_FALSE);
737		if (ire->ire_type & IRE_LOOPBACK)
738			return (B_FALSE);
739
740		if (ire->ire_type & IRE_LOCAL)
741			goto matchit;
742
743		/*
744		 * The normal case of IRE_ONLINK has a matching zoneid.
745		 * Here we handle the case when shared-IP zones have been
746		 * configured with IP addresses on vniN. In that case it
747		 * is ok for traffic from a zone to use IRE_ONLINK routes
748		 * if the ill has a usesrc pointing at vniN.
749		 * Applies to IRE_INTERFACE.
750		 */
751		dst_ill = ire->ire_ill;
752		if (ire->ire_type & IRE_ONLINK) {
753			uint_t	ifindex;
754
755			/*
756			 * Note there is no IRE_INTERFACE on vniN thus
757			 * can't do an IRE lookup for a matching route.
758			 */
759			ifindex = dst_ill->ill_usesrc_ifindex;
760			if (ifindex == 0)
761				return (B_FALSE);
762
763			/*
764			 * If there is a usable source address in the
765			 * zone, then it's ok to return this IRE_INTERFACE
766			 */
767			if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
768			    zoneid, ipst)) {
769				ip3dbg(("ire_match_args: no usesrc for zone"
770				    " dst_ill %p\n", (void *)dst_ill));
771				return (B_FALSE);
772			}
773		}
774		/*
775		 * For example, with
776		 * route add 11.0.0.0 gw1 -ifp bge0
777		 * route add 11.0.0.0 gw2 -ifp bge1
778		 * this code would differentiate based on
779		 * where the sending zone has addresses.
780		 * Only if the zone has an address on bge0 can it use the first
781		 * route. It isn't clear if this behavior is documented
782		 * anywhere.
783		 */
784		if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
785			ipif_t	*tipif;
786
787			mutex_enter(&dst_ill->ill_lock);
788			for (tipif = dst_ill->ill_ipif;
789			    tipif != NULL; tipif = tipif->ipif_next) {
790				if (!IPIF_IS_CONDEMNED(tipif) &&
791				    (tipif->ipif_flags & IPIF_UP) &&
792				    (tipif->ipif_zoneid == zoneid ||
793				    tipif->ipif_zoneid == ALL_ZONES))
794					break;
795			}
796			mutex_exit(&dst_ill->ill_lock);
797			if (tipif == NULL)
798				return (B_FALSE);
799		}
800	}
801
802matchit:
803	ire_ill = ire->ire_ill;
804	if (match_flags & MATCH_IRE_GW) {
805		mutex_enter(&ire->ire_lock);
806		gw_addr_v6 = ire->ire_gateway_addr_v6;
807		mutex_exit(&ire->ire_lock);
808	}
809	if (match_flags & MATCH_IRE_ILL) {
810
811		/*
812		 * If asked to match an ill, we *must* match
813		 * on the ire_ill for ipmp test addresses, or
814		 * any of the ills in the group for data addresses.
815		 * If we don't, we may as well fail.
816		 * However, we need an exception for IRE_LOCALs to ensure
817		 * we loopback packets even when sent to test addresses on different
818		 * interfaces in the group.
819		 */
820		if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
821		    !(ire->ire_type & IRE_LOCAL)) {
822			if (ire->ire_ill != ill)
823				return (B_FALSE);
824		} else  {
825			match_flags &= ~MATCH_IRE_TESTHIDDEN;
826			/*
827			 * We know that ill is not NULL, but ire_ill could be
828			 * NULL
829			 */
830			if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
831				return (B_FALSE);
832		}
833	}
834	if (match_flags & MATCH_IRE_SRC_ILL) {
835		if (ire_ill == NULL)
836			return (B_FALSE);
837		if (!IS_ON_SAME_LAN(ill, ire_ill)) {
838			if (ire_ill->ill_usesrc_ifindex == 0 ||
839			    (ire_ill->ill_usesrc_ifindex !=
840			    ill->ill_phyint->phyint_ifindex))
841				return (B_FALSE);
842		}
843	}
844
845	/* No ire_addr_v6 bits set past the mask */
846	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
847	    ire->ire_addr_v6));
848	V6_MASK_COPY(*addr, *mask, masked_addr);
849	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
850	    ((!(match_flags & MATCH_IRE_GW)) ||
851	    IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
852	    ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
853	    ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
854	    ((!(match_flags & MATCH_IRE_MASK)) ||
855	    (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
856	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
857	    (!is_system_labeled()) ||
858	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
859		/* We found the matched IRE */
860		return (B_TRUE);
861	}
862	return (B_FALSE);
863}
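
/*
 * Illustrative example (assumption): an exact-route match, such as the
 * duplicate check in ire_add_v6(), passes
 * MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW (plus MATCH_IRE_ILL when
 * an ill is bound) so that only an IRE with the same prefix, prefix length,
 * type and gateway matches, whereas a data-path lookup passes
 * MATCH_IRE_DSTONLY and relies on the longest-prefix search in
 * ire_ftable_lookup_impl_v6().
 */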
864
865/*
866 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
867 * gateway address. If ill is non-NULL we also match on it.
868 * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
869 */
870boolean_t
871ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
872    const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
873{
874	ire_t	*ire;
875	uint_t	match_flags;
876
877	if (lock_held)
878		ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
879	else
880		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
881
882	match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
883	if (ill != NULL)
884		match_flags |= MATCH_IRE_ILL;
885
886	ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
887	    &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
888	    ipst);
889
890	if (!lock_held)
891		rw_exit(&ipst->ips_ip6_ire_head_lock);
892	if (ire != NULL) {
893		ire_refrele(ire);
894		return (B_TRUE);
895	} else {
896		return (B_FALSE);
897	}
898}
899
900/*
901 * Look up a route in the forwarding table.
902 * A specific lookup is indicated by passing the
903 * required parameters and indicating the
904 * match required in the flags field.
905 *
906 * Supports link-local addresses by following the ipif/ill when recursing.
907 */
908ire_t *
909ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
910    const in6_addr_t *gateway, int type, const ill_t *ill,
911    zoneid_t zoneid, const ts_label_t *tsl, int flags,
912    uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
913{
914	ire_t *ire = NULL;
915
916	ASSERT(addr != NULL);
917	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
918	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
919	ASSERT(ill == NULL || ill->ill_isv6);
920
921	ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
922
923	/*
924	 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
925	 * or MATCH_IRE_SRC_ILL is set.
926	 */
927	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
928		return (NULL);
929
930	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
931	ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
932	    tsl, flags, ipst);
933	if (ire == NULL) {
934		rw_exit(&ipst->ips_ip6_ire_head_lock);
935		return (NULL);
936	}
937
938	/*
939	 * round-robin only if we have more than one route in the bucket.
940	 * ips_ip_ecmp_behavior controls when we do ECMP
941	 *	2:	always
942	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
943	 *	0:	never
944	 *
945	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
946	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
947	 * and the IRE_INTERFACEs are likely to be shorter matches.
948	 */
949	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
950		if (ipst->ips_ip_ecmp_behavior == 2 ||
951		    (ipst->ips_ip_ecmp_behavior == 1 &&
952		    IS_DEFAULT_ROUTE_V6(ire))) {
953			ire_t	*next_ire;
954			ire_ftable_args_t margs;
955
956			bzero(&margs, sizeof (margs));
957			margs.ift_addr_v6 = *addr;
958			if (mask != NULL)
959				margs.ift_mask_v6 = *mask;
960			if (gateway != NULL)
961				margs.ift_gateway_v6 = *gateway;
962			margs.ift_type = type;
963			margs.ift_ill = ill;
964			margs.ift_zoneid = zoneid;
965			margs.ift_tsl = tsl;
966			margs.ift_flags = flags;
967
968			next_ire = ire_round_robin(ire->ire_bucket, &margs,
969			    xmit_hint, ire, ipst);
970			if (next_ire == NULL) {
971				/* keep ire if next_ire is null */
972				goto done;
973			}
974			ire_refrele(ire);
975			ire = next_ire;
976		}
977	}
978
979done:
980	/* Return generation before dropping lock */
981	if (generationp != NULL)
982		*generationp = ire->ire_generation;
983
984	rw_exit(&ipst->ips_ip6_ire_head_lock);
985
986	/*
987	 * For shared-IP zones we need additional checks beyond what was
988	 * done in ire_match_args to make sure IRE_LOCALs are handled.
989	 *
990	 * When ip_restrict_interzone_loopback is set, then
991	 * we ensure that IRE_LOCAL are only used for loopback
992	 * between zones when the logical "Ethernet" would
993	 * have looped them back. That is, if in the absence of
994	 * the IRE_LOCAL we would have sent the packet out the
995	 * same ill.
996	 */
997	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
998	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
999	    ipst->ips_ip_restrict_interzone_loopback) {
1000		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
1001		ASSERT(ire != NULL);
1002	}
1003
1004	return (ire);
1005}
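
/*
 * Illustrative example (assumption): with ips_ip_ecmp_behavior set to 1,
 * two equal-cost ::/0 default routes via different gateways are spread
 * across flows by ire_round_robin() above, while two equal 2001:db8::/32
 * routes are not; a value of 2 spreads both, and 0 disables the
 * round-robin entirely.  The prefixes are examples only.
 */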
1006
1007/*
1008 * Look up a single ire. The caller holds either the read or write lock.
1009 */
1010ire_t *
1011ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1012    const in6_addr_t *gateway, int type, const ill_t *ill,
1013    zoneid_t zoneid, const ts_label_t *tsl, int flags,
1014    ip_stack_t *ipst)
1015{
1016	irb_t *irb_ptr;
1017	ire_t *ire = NULL;
1018	int i;
1019
1020	ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));
1021
1022	/*
1023	 * If the mask is known, the lookup
1024	 * is simple; if the mask is not known,
1025	 * we need to search.
1026	 */
1027	if (flags & MATCH_IRE_MASK) {
1028		uint_t masklen;
1029
1030		masklen = ip_mask_to_plen_v6(mask);
1031		if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
1032			return (NULL);
1033		}
1034		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
1035		    IRE_ADDR_MASK_HASH_V6(*addr, *mask,
1036		    ipst->ips_ip6_ftable_hash_size)]);
1037		rw_enter(&irb_ptr->irb_lock, RW_READER);
1038		for (ire = irb_ptr->irb_ire; ire != NULL;
1039		    ire = ire->ire_next) {
1040			if (IRE_IS_CONDEMNED(ire))
1041				continue;
1042			if (ire_match_args_v6(ire, addr, mask, gateway, type,
1043			    ill, zoneid, tsl, flags))
1044				goto found_ire;
1045		}
1046		rw_exit(&irb_ptr->irb_lock);
1047	} else {
1048		uint_t masklen;
1049
1050		/*
1051		 * In this case we don't know the mask, we need to
1052		 * search the table assuming different mask sizes.
1053		 */
1054		if (flags & MATCH_IRE_SHORTERMASK) {
1055			masklen = ip_mask_to_plen_v6(mask);
1056			if (masklen == 0) {
1057				/* Nothing shorter than zero */
1058				return (NULL);
1059			}
1060			masklen--;
1061		} else {
1062			masklen = IP6_MASK_TABLE_SIZE - 1;
1063		}
1064
1065		for (i = masklen; i >= 0; i--) {
1066			in6_addr_t tmpmask;
1067
1068			if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
1069				continue;
1070			(void) ip_plen_to_mask_v6(i, &tmpmask);
1071			irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
1072			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
1073			    ipst->ips_ip6_ftable_hash_size)];
1074			rw_enter(&irb_ptr->irb_lock, RW_READER);
1075			for (ire = irb_ptr->irb_ire; ire != NULL;
1076			    ire = ire->ire_next) {
1077				if (IRE_IS_CONDEMNED(ire))
1078					continue;
1079				if (ire_match_args_v6(ire, addr,
1080				    &ire->ire_mask_v6, gateway, type, ill,
1081				    zoneid, tsl, flags))
1082					goto found_ire;
1083			}
1084			rw_exit(&irb_ptr->irb_lock);
1085		}
1086	}
1087	ASSERT(ire == NULL);
1088	ip1dbg(("ire_ftable_lookup_impl_v6: returning NULL ire"));
1089	return (NULL);
1090
1091found_ire:
1092	ire_refhold(ire);
1093	rw_exit(&irb_ptr->irb_lock);
1094	return (ire);
1095}
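
/*
 * Worked example (illustrative) of the unknown-mask search above: looking
 * up 2001:db8:1::1 when both a 2001:db8::/32 route and a ::/0 default
 * exist starts at the longest mask table index and walks downward, so the
 * bucket for the /32 is inspected (and matched) before the bucket for the
 * /0, giving longest-prefix-match ordering without any explicit "best
 * match" bookkeeping.
 */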
1096
1097
1098/*
1099 * This function is called by
1100 * ip_input/ire_route_recursive when doing a route lookup on only the
1101 * destination address.
1102 *
1103 * The optimizations of this function over ire_ftable_lookup are:
1104 *	o removing unnecessary flag matching
1105 *	o doing longest prefix match instead of overloading it further
1106 *	  with the unnecessary "best_prefix_match"
1107 *
1108 * If no route is found we return IRE_NOROUTE.
1109 */
1110ire_t *
1111ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint,
1112    ip_stack_t *ipst, uint_t *generationp)
1113{
1114	ire_t	*ire;
1115
1116	ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL,
1117	    MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp);
1118	if (ire == NULL) {
1119		ire = ire_reject(ipst, B_TRUE);
1120		if (generationp != NULL)
1121			*generationp = IRE_GENERATION_VERIFY;
1122	}
1123	/* ftable_lookup did round robin */
1124	return (ire);
1125}
1126
1127ire_t *
1128ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src,
1129    ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
1130    int *errorp, boolean_t *multirtp)
1131{
1132	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
1133
1134	return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp,
1135	    multirtp));
1136}
1137
1138/*
1139 * Recursively look for a route to the destination. Can also match on
1140 * the zoneid, ill, and label. Used for the data paths. See also
1141 * ire_route_recursive_dstonly.
1142 *
1143 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
1144 * create an IRE_IF_CLONE. This is used on the receive side when we are not
1145 * forwarding.
1146 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
1147 * resolve the gateway.
1148 *
1149 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1150 * instead.
1151 *
1152 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1153 * is an error.
1154 * Allow at most one RTF_INDIRECT.
1155 */
1156ire_t *
1157ire_route_recursive_impl_v6(ire_t *ire,
1158    const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
1159    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1160    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
1161    in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1162{
1163	int		i, j;
1164	in6_addr_t	v6nexthop = *nexthop;
1165	ire_t		*ires[MAX_IRE_RECURSION];
1166	uint_t		generation;
1167	uint_t		generations[MAX_IRE_RECURSION];
1168	boolean_t	need_refrele = B_FALSE;
1169	boolean_t	invalidate = B_FALSE;
1170	int		prefs[MAX_IRE_RECURSION];
1171	ill_t		*ill = NULL;
1172
1173	if (setsrcp != NULL)
1174		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
1175	if (gwattrp != NULL)
1176		ASSERT(*gwattrp == NULL);
1177
1178	/*
1179	 * We iterate up to three times to resolve a route, even though
1180	 * we have four slots in the array. The extra slot is for an
1181	 * IRE_IF_CLONE we might need to create.
1182	 */
1183	i = 0;
1184	while (i < MAX_IRE_RECURSION - 1) {
1185		/* ire_ftable_lookup handles round-robin/ECMP */
1186		if (ire == NULL) {
1187			ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
1188			    (ill != NULL ? ill : ill_arg), zoneid, tsl,
1189			    match_args, xmit_hint, ipst, &generation);
1190		} else {
1191			/* Caller passed it; extra hold since we will rele */
1192			ire_refhold(ire);
1193			if (generationp != NULL)
1194				generation = *generationp;
1195			else
1196				generation = IRE_GENERATION_VERIFY;
1197		}
1198
1199		if (ire == NULL)
1200			ire = ire_reject(ipst, B_TRUE);
1201
1202		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
1203		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
1204			goto error;
1205
1206		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1207
1208		if (i != 0) {
1209			prefs[i] = ire_pref(ire);
1210			/*
1211			 * Don't allow anything unusual past the first
1212			 * iteration.
1213			 */
1214			if ((ire->ire_type &
1215			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
1216			    prefs[i] <= prefs[i-1]) {
1217				ire_refrele(ire);
1218				if (irr_flags & IRR_INCOMPLETE) {
1219					ire = ires[0];
1220					ire_refhold(ire);
1221				} else {
1222					ire = ire_reject(ipst, B_TRUE);
1223				}
1224				goto error;
1225			}
1226		}
1227		/* We have a usable IRE */
1228		ires[i] = ire;
1229		generations[i] = generation;
1230		i++;
1231
1232		/* The first RTF_SETSRC address is passed back if setsrcp */
1233		if ((ire->ire_flags & RTF_SETSRC) &&
1234		    setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
1235			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
1236			    &ire->ire_setsrc_addr_v6));
1237			*setsrcp = ire->ire_setsrc_addr_v6;
1238		}
1239
1240		/* The first ire_gw_secattr is passed back if gwattrp */
1241		if (ire->ire_gw_secattr != NULL &&
1242		    gwattrp != NULL && *gwattrp == NULL)
1243			*gwattrp = ire->ire_gw_secattr;
1244
1245		/*
1246		 * Check if we have a short-cut pointer to an IRE for this
1247		 * destination, and that the cached dependency isn't stale.
1248		 * In that case we've rejoined an existing tree towards a
1249		 * parent, thus we don't need to continue the loop to
1250		 * discover the rest of the tree.
1251		 */
1252		mutex_enter(&ire->ire_lock);
1253		if (ire->ire_dep_parent != NULL &&
1254		    ire->ire_dep_parent->ire_generation ==
1255		    ire->ire_dep_parent_generation) {
1256			mutex_exit(&ire->ire_lock);
1257			ire = NULL;
1258			goto done;
1259		}
1260		mutex_exit(&ire->ire_lock);
1261
1262		/*
1263		 * If this type should have an ire_nce_cache (even if it
1264		 * doesn't yet have one) then we are done. Includes
1265		 * IRE_INTERFACE with a full 128 bit mask.
1266		 */
1267		if (ire->ire_nce_capable) {
1268			ire = NULL;
1269			goto done;
1270		}
1271		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1272		/*
1273		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1274		 * particular destination
1275		 */
1276		if (ire->ire_type & IRE_INTERFACE) {
1277			ire_t		*clone;
1278
1279			ASSERT(ire->ire_masklen != IPV6_ABITS);
1280
1281			/*
1282			 * In the case of ip_input and ILLF_FORWARDING not
1283			 * being set, and in the case of RTM_GET, there is
1284			 * no point in allocating an IRE_IF_CLONE. We return
1285			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
1286			 * result in a ire_dep_parent which is IRE_IF_*
1287			 * result in an ire_dep_parent which is IRE_IF_*
1288			 * We recover from that when we need to send packets
1289			 * by ensuring that the generations become
1290			 * IRE_GENERATION_VERIFY in this case.
1291			 */
1292			if (!(irr_flags & IRR_ALLOCATE)) {
1293				invalidate = B_TRUE;
1294				ire = NULL;
1295				goto done;
1296			}
1297
1298			clone = ire_create_if_clone(ire, &v6nexthop,
1299			    &generation);
1300			if (clone == NULL) {
1301				/*
1302				 * Temporary failure - no memory.
1303				 * Don't want caller to cache IRE_NOROUTE.
1304				 */
1305				invalidate = B_TRUE;
1306				ire = ire_blackhole(ipst, B_TRUE);
1307				goto error;
1308			}
1309			/*
1310			 * Make clone next to last entry and the
1311			 * IRE_INTERFACE the last in the dependency
1312			 * chain since the clone depends on the
1313			 * IRE_INTERFACE.
1314			 */
1315			ASSERT(i >= 1);
1316			ASSERT(i < MAX_IRE_RECURSION);
1317
1318			ires[i] = ires[i-1];
1319			generations[i] = generations[i-1];
1320			ires[i-1] = clone;
1321			generations[i-1] = generation;
1322			i++;
1323
1324			ire = NULL;
1325			goto done;
1326		}
1327
1328		/*
1329		 * We only match on the type and optionally ILL when
1330		 * recursing. The type match is used by some callers
1331		 * to exclude certain types (such as IRE_IF_CLONE or
1332		 * IRE_LOCAL|IRE_LOOPBACK).
1333		 *
1334		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
1335		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
1336		 * ire_ill, so we set ill to the ire_ill.
1337		 */
1338		match_args &= MATCH_IRE_TYPE;
1339		v6nexthop = ire->ire_gateway_addr_v6;
1340		if (ill == NULL && ire->ire_ill != NULL) {
1341			ill = ire->ire_ill;
1342			need_refrele = B_TRUE;
1343			ill_refhold(ill);
1344			match_args |= MATCH_IRE_ILL;
1345		}
1346		/*
1347		 * We set the prefs[i] value above if i > 0. We've already
1348		 * done i++ so i is one in the case of the first time around.
1349		 */
1350		if (i == 1)
1351			prefs[0] = ire_pref(ire);
1352		ire = NULL;
1353	}
1354	ASSERT(ire == NULL);
1355	ire = ire_reject(ipst, B_TRUE);
1356
1357error:
1358	ASSERT(ire != NULL);
1359	if (need_refrele)
1360		ill_refrele(ill);
1361
1362	/*
1363	 * In the case of MULTIRT we want to try a different IRE the next
1364	 * time. We let the next packet retry in that case.
1365	 */
1366	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
1367		(void) ire_no_good(ires[0]);
1368
1369cleanup:
1370	/* cleanup ires[i] */
1371	ire_dep_unbuild(ires, i);
1372	for (j = 0; j < i; j++)
1373		ire_refrele(ires[j]);
1374
1375	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1376	    (irr_flags & IRR_INCOMPLETE));
1377	/*
1378	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1379	 * ip_select_route since the reject or lack of memory might be gone.
1380	 */
1381	if (generationp != NULL)
1382		*generationp = IRE_GENERATION_VERIFY;
1383	return (ire);
1384
1385done:
1386	ASSERT(ire == NULL);
1387	if (need_refrele)
1388		ill_refrele(ill);
1389
1390	/* Build dependencies */
1391	if (i > 1 && !ire_dep_build(ires, generations, i)) {
1392		/* Something in chain was condemned; tear it apart */
1393		ire = ire_blackhole(ipst, B_TRUE);
1394		goto cleanup;
1395	}
1396
1397	/*
1398	 * Release all refholds except the one for ires[0] that we
1399	 * will return to the caller.
1400	 */
1401	for (j = 1; j < i; j++)
1402		ire_refrele(ires[j]);
1403
1404	if (invalidate) {
1405		/*
1406		 * Since we needed to allocate but couldn't, we need to make
1407		 * sure that the dependency chain is rebuilt the next time.
1408		 */
1409		ire_dep_invalidate_generations(ires[0]);
1410		generation = IRE_GENERATION_VERIFY;
1411	} else {
1412		/*
1413		 * IREs can have been added or deleted while we did the
1414		 * recursive lookup and we can't catch those until we've built
1415		 * the dependencies. We verify the stored
1416		 * ire_dep_parent_generation to catch any such changes and
1417		 * return IRE_GENERATION_VERIFY (which will cause
1418		 * ip_select_route to be called again so we can redo the
1419		 * recursive lookup next time we send a packet).
1420		 */
1421		if (ires[0]->ire_dep_parent == NULL)
1422			generation = ires[0]->ire_generation;
1423		else
1424			generation = ire_dep_validate_generations(ires[0]);
1425		if (generations[0] != ires[0]->ire_generation) {
1426			/* Something changed at the top */
1427			generation = IRE_GENERATION_VERIFY;
1428		}
1429	}
1430	if (generationp != NULL)
1431		*generationp = generation;
1432
1433	return (ires[0]);
1434}
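
/*
 * Worked example (illustrative, with hypothetical names): recursively
 * resolving an off-link destination 2001:db8:2::1 through a default route
 * via fe80::1 on net0 typically builds a three-entry chain:
 *
 *	ires[0]	IRE_DEFAULT	::/0 via fe80::1
 *	ires[1]	IRE_IF_CLONE	fe80::1/128 on net0 (created above)
 *	ires[2]	IRE_IF_RESOLVER	fe80::/10 on net0
 *
 * ire_dep_build() then links ires[0] -> ires[1] -> ires[2] so that later
 * sends can follow ire_dep_parent instead of repeating the full lookup.
 */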
1435
1436ire_t *
1437ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
1438    const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1439    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
1440    in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1441{
1442	return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
1443	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1444	    gwattrp, generationp));
1445}
1446
1447/*
1448 * Recursively look for a route to the destination.
1449 * We only handle a destination match here, yet we have the same arguments
1450 * as the full match to allow function pointers to select between the two.
1451 *
1452 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1453 * instead.
1454 *
1455 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1456 * is an error.
1457 * Allow at most one RTF_INDIRECT.
1458 */
1459ire_t *
1460ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags,
1461    uint32_t xmit_hint, ip_stack_t *ipst)
1462{
1463	ire_t	*ire;
1464	ire_t	*ire1;
1465	uint_t	generation;
1466
1467	/* ire_ftable_lookup handles round-robin/ECMP */
1468	ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
1469	    &generation);
1470	ASSERT(ire != NULL);
1471
1472	/*
1473	 * If this type should have an ire_nce_cache (even if it
1474	 * doesn't yet have one) then we are done. Includes
1475	 * IRE_INTERFACE with a full 128 bit mask.
1476	 */
1477	if (ire->ire_nce_capable)
1478		return (ire);
1479
1480	/*
1481	 * If the IRE has a current cached parent we know that the whole
1482	 * parent chain is current, hence we don't need to discover and
1483	 * build any dependencies by doing a recursive lookup.
1484	 */
1485	mutex_enter(&ire->ire_lock);
1486	if (ire->ire_dep_parent != NULL &&
1487	    ire->ire_dep_parent->ire_generation ==
1488	    ire->ire_dep_parent_generation) {
1489		mutex_exit(&ire->ire_lock);
1490		return (ire);
1491	}
1492	mutex_exit(&ire->ire_lock);
1493
1494	/*
1495	 * Fallback to loop in the normal code starting with the ire
1496	 * we found. Normally this would return the same ire.
1497	 */
1498	ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
1499	    NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
1500	    &generation);
1501	ire_refrele(ire);
1502	return (ire1);
1503}
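
/*
 * Illustrative usage sketch (assumption): a receive-path caller that only
 * has the destination address resolves and releases as follows; "ip6h" and
 * "ipst" are hypothetical.
 *
 *	ire = ire_route_recursive_dstonly_v6(&ip6h->ip6_dst, IRR_NONE,
 *	    0, ipst);
 *	... dispatch via ire->ire_recvfn ...
 *	ire_refrele(ire);
 */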
1504