tnet.c revision 9041:a02891391cbe
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/types.h>
27#include <sys/stream.h>
28#include <sys/strsubr.h>
29#include <sys/stropts.h>
30#include <sys/sunddi.h>
31#include <sys/cred.h>
32#include <sys/debug.h>
33#include <sys/kmem.h>
34#include <sys/errno.h>
35#include <sys/disp.h>
36#include <netinet/in.h>
37#include <netinet/in_systm.h>
38#include <netinet/ip.h>
39#include <netinet/ip_icmp.h>
40#include <netinet/tcp.h>
41#include <inet/common.h>
42#include <inet/ipclassifier.h>
43#include <inet/ip.h>
44#include <inet/mib2.h>
45#include <inet/nd.h>
46#include <inet/tcp.h>
47#include <inet/ip_rts.h>
48#include <inet/ip_ire.h>
49#include <inet/ip_if.h>
50#include <sys/modhash.h>
51
52#include <sys/tsol/label.h>
53#include <sys/tsol/label_macro.h>
54#include <sys/tsol/tnet.h>
55#include <sys/tsol/tndb.h>
56#include <sys/strsun.h>
57
/*
 * Tunable for strict error-reply behavior (TCP RST and ICMP Unreachable).
 * It has no explicit initializer, so it starts out as 0.
 */
int tsol_strict_error;
60
61/*
62 * Some notes on the Trusted Solaris IRE gateway security attributes:
63 *
64 * When running in Trusted mode, the routing subsystem determines whether or
65 * not a packet can be delivered to an off-link host (not directly reachable
66 * through an interface) based on the accreditation checks of the packet's
67 * security attributes against those associated with the next-hop gateway.
68 *
69 * The next-hop gateway's security attributes can be derived from two sources
70 * (in order of preference): route-related and the host database.  A Trusted
71 * system must be configured with at least the host database containing an
72 * entry for the next-hop gateway, or otherwise no accreditation checks can
73 * be performed, which may result in the inability to send packets to any
74 * off-link destination host.
75 *
76 * The major differences between the two sources are the number and type of
77 * security attributes used for accreditation checks.  A host database entry
78 * can contain at most one set of security attributes, specific only to the
 * next-hop gateway.  In contrast, route-related security attributes are made
80 * up of a collection of security attributes for the distant networks, and
81 * are grouped together per next-hop gateway used to reach those networks.
82 * This is the preferred method, and the routing subsystem will fallback to
83 * the host database entry only if there are no route-related attributes
84 * associated with the next-hop gateway.
85 *
86 * In Trusted mode, all of the IRE entries (except LOCAL/LOOPBACK/BROADCAST/
87 * INTERFACE type) are initialized to contain a placeholder to store this
88 * information.  The ire_gw_secattr structure gets allocated, initialized
89 * and associated with the IRE during the time of the IRE creation.  The
90 * initialization process also includes resolving the host database entry
91 * of the next-hop gateway for fallback purposes.  It does not include any
92 * route-related attribute setup, as that process comes separately as part
93 * of the route requests (add/change) made to the routing subsystem.
94 *
 * The underlying logic, which involves associating IREs with the gateway
 * security attributes, is represented by the following data structures:
97 *
98 * tsol_gcdb_t, or "gcdb"
99 *
100 *	- This is a system-wide collection of records containing the
101 *	  currently used route-related security attributes, which are fed
102 *	  through the routing socket interface, e.g. "route add/change".
103 *
104 * tsol_gc_t, or "gc"
105 *
106 *	- This is the gateway credential structure, and it provides for the
107 *	  only mechanism to access the contents of gcdb.  More than one gc
108 *	  entries may refer to the same gcdb record.  gc's in the system are
109 *	  grouped according to the next-hop gateway address.
110 *
111 * tsol_gcgrp_t, or "gcgrp"
112 *
113 *	- Group of gateway credentials, and is unique per next-hop gateway
114 *	  address.  When the group is not empty, i.e. when gcgrp_count is
115 *	  greater than zero, it contains one or more gc's, each pointing to
116 *	  a gcdb record which indicates the gateway security attributes
117 *	  associated with the next-hop gateway.
118 *
119 * The fields of the tsol_ire_gw_secattr_t used from within the IRE are:
120 *
121 * igsa_lock
122 *
123 *	- Lock that protects all fields within tsol_ire_gw_secattr_t.
124 *
125 * igsa_rhc
126 *
127 *	- Remote host cache database entry of next-hop gateway.  This is
128 *	  used in the case when there are no route-related attributes
129 *	  configured for the IRE.
130 *
131 * igsa_gc
132 *
133 *	- A set of route-related attributes that only get set for prefix
134 *	  IREs.  If this is non-NULL, the prefix IRE has been associated
135 *	  with a set of gateway security attributes by way of route add/
136 *	  change functionality.  This field stays NULL for IRE_CACHEs.
137 *
138 * igsa_gcgrp
139 *
140 *	- Group of gc's which only gets set for IRE_CACHEs.  Each of the gc
141 *	  points to a gcdb record that contains the security attributes
142 *	  used to perform the credential checks of the packet which uses
143 *	  the IRE.  If the group is not empty, the list of gc's can be
144 *	  traversed starting at gcgrp_head.  This field stays NULL for
145 *	  prefix IREs.
146 */
147
/* kmem cache backing ire_gw_secattr_alloc() and ire_gw_secattr_free() */
static kmem_cache_t *ire_gw_secattr_cache;

/* Default sizes used for gcdb_hash_size and gcgrp_hash_size below */
#define	GCDB_HASH_SIZE	101
#define	GCGRP_HASH_SIZE	101
152
/*
 * Drop one reference on the tsol_gcdb_t `p'.  gcdb_lock is taken for the
 * duration of the update; when the count reaches zero the record is handed
 * to gcdb_inactive(), which asserts that gcdb_lock is held.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement and can safely be used as the unbraced arm of an if/else.
 * `p' is evaluated more than once, so it must not have side effects.
 */
#define	GCDB_REFRELE(p) do {			\
	mutex_enter(&gcdb_lock);		\
	ASSERT((p)->gcdb_refcnt > 0);		\
	if (--((p)->gcdb_refcnt) == 0)		\
		gcdb_inactive(p);		\
	ASSERT(MUTEX_HELD(&gcdb_lock));		\
	mutex_exit(&gcdb_lock);			\
} while (0)
161
/* Sizes handed to mod_hash_create_extended() in tnet_init() */
static int gcdb_hash_size = GCDB_HASH_SIZE;
static int gcgrp_hash_size = GCGRP_HASH_SIZE;
/* gcdb records, hashed with gcdb_hash_by_secattr() */
static mod_hash_t *gcdb_hash;
/* gcgrp entries for AF_INET and AF_INET6 gateway addresses, respectively */
static mod_hash_t *gcgrp4_hash;
static mod_hash_t *gcgrp6_hash;

/* Held across gcdb_hash lookups/updates and gcdb_refcnt changes */
static kmutex_t gcdb_lock;
/* Held across gcgrp hash lookups/inserts; has external linkage */
kmutex_t gcgrp_lock;

/* gcdb_hash callbacks and gcdb record management */
static uint_t gcdb_hash_by_secattr(void *, mod_hash_key_t);
static int gcdb_hash_cmp(mod_hash_key_t, mod_hash_key_t);
static tsol_gcdb_t *gcdb_lookup(struct rtsa_s *, boolean_t);
static void gcdb_inactive(tsol_gcdb_t *);

/* gcgrp4_hash/gcgrp6_hash callbacks */
static uint_t gcgrp_hash_by_addr(void *, mod_hash_key_t);
static int gcgrp_hash_cmp(mod_hash_key_t, mod_hash_key_t);

/* kmem cache constructor/destructor for ire_gw_secattr_cache */
static int ire_gw_secattr_constructor(void *, void *, int);
static void ire_gw_secattr_destructor(void *, void *);
181
182void
183tnet_init(void)
184{
185	ire_gw_secattr_cache = kmem_cache_create("ire_gw_secattr_cache",
186	    sizeof (tsol_ire_gw_secattr_t), 64, ire_gw_secattr_constructor,
187	    ire_gw_secattr_destructor, NULL, NULL, NULL, 0);
188
189	gcdb_hash = mod_hash_create_extended("gcdb_hash",
190	    gcdb_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
191	    gcdb_hash_by_secattr, NULL, gcdb_hash_cmp, KM_SLEEP);
192
193	gcgrp4_hash = mod_hash_create_extended("gcgrp4_hash",
194	    gcgrp_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
195	    gcgrp_hash_by_addr, NULL, gcgrp_hash_cmp, KM_SLEEP);
196
197	gcgrp6_hash = mod_hash_create_extended("gcgrp6_hash",
198	    gcgrp_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
199	    gcgrp_hash_by_addr, NULL, gcgrp_hash_cmp, KM_SLEEP);
200
201	mutex_init(&gcdb_lock, NULL, MUTEX_DEFAULT, NULL);
202	mutex_init(&gcgrp_lock, NULL, MUTEX_DEFAULT, NULL);
203}
204
205void
206tnet_fini(void)
207{
208	kmem_cache_destroy(ire_gw_secattr_cache);
209	mod_hash_destroy_hash(gcdb_hash);
210	mod_hash_destroy_hash(gcgrp4_hash);
211	mod_hash_destroy_hash(gcgrp6_hash);
212	mutex_destroy(&gcdb_lock);
213	mutex_destroy(&gcgrp_lock);
214}
215
216/* ARGSUSED */
217static int
218ire_gw_secattr_constructor(void *buf, void *cdrarg, int kmflags)
219{
220	tsol_ire_gw_secattr_t *attrp = buf;
221
222	mutex_init(&attrp->igsa_lock, NULL, MUTEX_DEFAULT, NULL);
223
224	attrp->igsa_rhc = NULL;
225	attrp->igsa_gc = NULL;
226	attrp->igsa_gcgrp = NULL;
227
228	return (0);
229}
230
231/* ARGSUSED */
232static void
233ire_gw_secattr_destructor(void *buf, void *cdrarg)
234{
235	tsol_ire_gw_secattr_t *attrp = (tsol_ire_gw_secattr_t *)buf;
236
237	mutex_destroy(&attrp->igsa_lock);
238}
239
240tsol_ire_gw_secattr_t *
241ire_gw_secattr_alloc(int kmflags)
242{
243	return (kmem_cache_alloc(ire_gw_secattr_cache, kmflags));
244}
245
246void
247ire_gw_secattr_free(tsol_ire_gw_secattr_t *attrp)
248{
249	ASSERT(MUTEX_NOT_HELD(&attrp->igsa_lock));
250
251	if (attrp->igsa_rhc != NULL) {
252		TNRHC_RELE(attrp->igsa_rhc);
253		attrp->igsa_rhc = NULL;
254	}
255
256	if (attrp->igsa_gc != NULL) {
257		GC_REFRELE(attrp->igsa_gc);
258		attrp->igsa_gc = NULL;
259	}
260	if (attrp->igsa_gcgrp != NULL) {
261		GCGRP_REFRELE(attrp->igsa_gcgrp);
262		attrp->igsa_gcgrp = NULL;
263	}
264
265	ASSERT(attrp->igsa_rhc == NULL);
266	ASSERT(attrp->igsa_gc == NULL);
267	ASSERT(attrp->igsa_gcgrp == NULL);
268
269	kmem_cache_free(ire_gw_secattr_cache, attrp);
270}
271
272/* ARGSUSED */
273static uint_t
274gcdb_hash_by_secattr(void *hash_data, mod_hash_key_t key)
275{
276	const struct rtsa_s *rp = (struct rtsa_s *)key;
277	const uint32_t *up, *ue;
278	uint_t hash;
279	int i;
280
281	ASSERT(rp != NULL);
282
283	/* See comments in hash_bylabel in zone.c for details */
284	hash = rp->rtsa_doi + (rp->rtsa_doi << 1);
285	up = (const uint32_t *)&rp->rtsa_slrange;
286	ue = up + sizeof (rp->rtsa_slrange) / sizeof (*up);
287	i = 1;
288	while (up < ue) {
289		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
290		hash += *up + (*up << ((i % 16) + 1));
291		up++;
292		i++;
293	}
294	return (hash);
295}
296
297static int
298gcdb_hash_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
299{
300	struct rtsa_s *rp1 = (struct rtsa_s *)key1;
301	struct rtsa_s *rp2 = (struct rtsa_s *)key2;
302
303	ASSERT(rp1 != NULL && rp2 != NULL);
304
305	if (blequal(&rp1->rtsa_slrange.lower_bound,
306	    &rp2->rtsa_slrange.lower_bound) &&
307	    blequal(&rp1->rtsa_slrange.upper_bound,
308	    &rp2->rtsa_slrange.upper_bound) &&
309	    rp1->rtsa_doi == rp2->rtsa_doi)
310		return (0);
311
312	/* No match; not found */
313	return (-1);
314}
315
316/* ARGSUSED */
317static uint_t
318gcgrp_hash_by_addr(void *hash_data, mod_hash_key_t key)
319{
320	tsol_gcgrp_addr_t *ga = (tsol_gcgrp_addr_t *)key;
321	uint_t		idx = 0;
322	uint32_t	*ap;
323
324	ASSERT(ga != NULL);
325	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
326
327	ap = (uint32_t *)&ga->ga_addr.s6_addr32[0];
328	idx ^= *ap++;
329	idx ^= *ap++;
330	idx ^= *ap++;
331	idx ^= *ap;
332
333	return (idx);
334}
335
336static int
337gcgrp_hash_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
338{
339	tsol_gcgrp_addr_t *ga1 = (tsol_gcgrp_addr_t *)key1;
340	tsol_gcgrp_addr_t *ga2 = (tsol_gcgrp_addr_t *)key2;
341
342	ASSERT(ga1 != NULL && ga2 != NULL);
343
344	/* Address family must match */
345	if (ga1->ga_af != ga2->ga_af)
346		return (-1);
347
348	if (ga1->ga_addr.s6_addr32[0] == ga2->ga_addr.s6_addr32[0] &&
349	    ga1->ga_addr.s6_addr32[1] == ga2->ga_addr.s6_addr32[1] &&
350	    ga1->ga_addr.s6_addr32[2] == ga2->ga_addr.s6_addr32[2] &&
351	    ga1->ga_addr.s6_addr32[3] == ga2->ga_addr.s6_addr32[3])
352		return (0);
353
354	/* No match; not found */
355	return (-1);
356}
357
/*
 * Bit-name string naming the cipso, doi, max_sl and min_sl flags.
 * NOTE(review): this looks like a "%b"-style format argument (leading \20
 * selecting base 16, followed by <bit number><name> pairs) -- confirm
 * against its user; no use is visible in this part of the file.
 */
#define	RTSAFLAGS	"\20\11cipso\3doi\2max_sl\1min_sl"
359
360int
361rtsa_validate(const struct rtsa_s *rp)
362{
363	uint32_t mask = rp->rtsa_mask;
364
365	/* RTSA_CIPSO must be set, and DOI must not be zero */
366	if ((mask & RTSA_CIPSO) == 0 || rp->rtsa_doi == 0) {
367		DTRACE_PROBE2(tx__gcdb__log__error__rtsa__validate, char *,
368		    "rtsa(1) lacks flag or has 0 doi.",
369		    rtsa_s *, rp);
370		return (EINVAL);
371	}
372	/*
373	 * SL range must be specified, and it must have its
374	 * upper bound dominating its lower bound.
375	 */
376	if ((mask & RTSA_SLRANGE) != RTSA_SLRANGE ||
377	    !bldominates(&rp->rtsa_slrange.upper_bound,
378	    &rp->rtsa_slrange.lower_bound)) {
379		DTRACE_PROBE2(tx__gcdb__log__error__rtsa__validate, char *,
380		    "rtsa(1) min_sl and max_sl not set or max_sl is "
381		    "not dominating.", rtsa_s *, rp);
382		return (EINVAL);
383	}
384	return (0);
385}
386
387/*
388 * A brief explanation of the reference counting scheme:
389 *
390 * Prefix IREs have a non-NULL igsa_gc and a NULL igsa_gcgrp;
391 * IRE_CACHEs have it vice-versa.
392 *
 * Apart from dynamic references due to reference holds done
394 * actively by threads, we have the following references:
395 *
396 * gcdb_refcnt:
397 *	- Every tsol_gc_t pointing to a tsol_gcdb_t contributes a reference
398 *	  to the gcdb_refcnt.
399 *
400 * gc_refcnt:
401 *	- A prefix IRE that points to an igsa_gc contributes a reference
402 *	  to the gc_refcnt.
403 *
404 * gcgrp_refcnt:
405 *	- An IRE_CACHE that points to an igsa_gcgrp contributes a reference
406 *	  to the gcgrp_refcnt of the associated tsol_gcgrp_t.
407 *	- Every tsol_gc_t in the chain headed by tsol_gcgrp_t contributes
408 *	  a reference to the gcgrp_refcnt.
409 */
410static tsol_gcdb_t *
411gcdb_lookup(struct rtsa_s *rp, boolean_t alloc)
412{
413	tsol_gcdb_t *gcdb = NULL;
414
415	if (rtsa_validate(rp) != 0)
416		return (NULL);
417
418	mutex_enter(&gcdb_lock);
419	/* Find a copy in the cache; otherwise, create one and cache it */
420	if (mod_hash_find(gcdb_hash, (mod_hash_key_t)rp,
421	    (mod_hash_val_t *)&gcdb) == 0) {
422		gcdb->gcdb_refcnt++;
423		ASSERT(gcdb->gcdb_refcnt != 0);
424
425		DTRACE_PROBE2(tx__gcdb__log__info__gcdb__lookup, char *,
426		    "gcdb(1) is in gcdb_hash(global)", tsol_gcdb_t *, gcdb);
427	} else if (alloc) {
428		gcdb = kmem_zalloc(sizeof (*gcdb), KM_NOSLEEP);
429		if (gcdb != NULL) {
430			gcdb->gcdb_refcnt = 1;
431			gcdb->gcdb_mask = rp->rtsa_mask;
432			gcdb->gcdb_doi = rp->rtsa_doi;
433			gcdb->gcdb_slrange = rp->rtsa_slrange;
434
435			if (mod_hash_insert(gcdb_hash,
436			    (mod_hash_key_t)&gcdb->gcdb_attr,
437			    (mod_hash_val_t)gcdb) != 0) {
438				mutex_exit(&gcdb_lock);
439				kmem_free(gcdb, sizeof (*gcdb));
440				return (NULL);
441			}
442
443			DTRACE_PROBE2(tx__gcdb__log__info__gcdb__insert, char *,
444			    "gcdb(1) inserted in gcdb_hash(global)",
445			    tsol_gcdb_t *, gcdb);
446		}
447	}
448	mutex_exit(&gcdb_lock);
449	return (gcdb);
450}
451
452static void
453gcdb_inactive(tsol_gcdb_t *gcdb)
454{
455	ASSERT(MUTEX_HELD(&gcdb_lock));
456	ASSERT(gcdb != NULL && gcdb->gcdb_refcnt == 0);
457
458	(void) mod_hash_remove(gcdb_hash, (mod_hash_key_t)&gcdb->gcdb_attr,
459	    (mod_hash_val_t *)&gcdb);
460
461	DTRACE_PROBE2(tx__gcdb__log__info__gcdb__remove, char *,
462	    "gcdb(1) removed from gcdb_hash(global)",
463	    tsol_gcdb_t *, gcdb);
464	kmem_free(gcdb, sizeof (*gcdb));
465}
466
467tsol_gc_t *
468gc_create(struct rtsa_s *rp, tsol_gcgrp_t *gcgrp, boolean_t *gcgrp_xtrarefp)
469{
470	tsol_gc_t *gc;
471	tsol_gcdb_t *gcdb;
472
473	*gcgrp_xtrarefp = B_TRUE;
474
475	rw_enter(&gcgrp->gcgrp_rwlock, RW_WRITER);
476	if ((gcdb = gcdb_lookup(rp, B_TRUE)) == NULL) {
477		rw_exit(&gcgrp->gcgrp_rwlock);
478		return (NULL);
479	}
480
481	for (gc = gcgrp->gcgrp_head; gc != NULL; gc = gc->gc_next) {
482		if (gc->gc_db == gcdb) {
483			ASSERT(gc->gc_grp == gcgrp);
484
485			gc->gc_refcnt++;
486			ASSERT(gc->gc_refcnt != 0);
487
488			GCDB_REFRELE(gcdb);
489
490			DTRACE_PROBE3(tx__gcdb__log__info__gc__create,
491			    char *, "found gc(1) in gcgrp(2)",
492			    tsol_gc_t *, gc, tsol_gcgrp_t *, gcgrp);
493			rw_exit(&gcgrp->gcgrp_rwlock);
494			return (gc);
495		}
496	}
497
498	gc = kmem_zalloc(sizeof (*gc), KM_NOSLEEP);
499	if (gc != NULL) {
500		if (gcgrp->gcgrp_head == NULL) {
501			gcgrp->gcgrp_head = gcgrp->gcgrp_tail = gc;
502		} else {
503			gcgrp->gcgrp_tail->gc_next = gc;
504			gc->gc_prev = gcgrp->gcgrp_tail;
505			gcgrp->gcgrp_tail = gc;
506		}
507		gcgrp->gcgrp_count++;
508		ASSERT(gcgrp->gcgrp_count != 0);
509
510		/* caller has incremented gcgrp reference for us */
511		gc->gc_grp = gcgrp;
512
513		gc->gc_db = gcdb;
514		gc->gc_refcnt = 1;
515
516		DTRACE_PROBE3(tx__gcdb__log__info__gc__create, char *,
517		    "added gc(1) to gcgrp(2)", tsol_gc_t *, gc,
518		    tsol_gcgrp_t *, gcgrp);
519
520		*gcgrp_xtrarefp = B_FALSE;
521	}
522	rw_exit(&gcgrp->gcgrp_rwlock);
523
524	return (gc);
525}
526
527void
528gc_inactive(tsol_gc_t *gc)
529{
530	tsol_gcgrp_t *gcgrp = gc->gc_grp;
531
532	ASSERT(gcgrp != NULL);
533	ASSERT(RW_WRITE_HELD(&gcgrp->gcgrp_rwlock));
534	ASSERT(gc->gc_refcnt == 0);
535
536	if (gc->gc_prev != NULL)
537		gc->gc_prev->gc_next = gc->gc_next;
538	else
539		gcgrp->gcgrp_head = gc->gc_next;
540	if (gc->gc_next != NULL)
541		gc->gc_next->gc_prev = gc->gc_prev;
542	else
543		gcgrp->gcgrp_tail = gc->gc_prev;
544	ASSERT(gcgrp->gcgrp_count > 0);
545	gcgrp->gcgrp_count--;
546
547	/* drop lock before it's destroyed */
548	rw_exit(&gcgrp->gcgrp_rwlock);
549
550	DTRACE_PROBE3(tx__gcdb__log__info__gc__remove, char *,
551	    "removed inactive gc(1) from gcgrp(2)",
552	    tsol_gc_t *, gc, tsol_gcgrp_t *, gcgrp);
553
554	GCGRP_REFRELE(gcgrp);
555
556	gc->gc_grp = NULL;
557	gc->gc_prev = gc->gc_next = NULL;
558
559	if (gc->gc_db != NULL)
560		GCDB_REFRELE(gc->gc_db);
561
562	kmem_free(gc, sizeof (*gc));
563}
564
565tsol_gcgrp_t *
566gcgrp_lookup(tsol_gcgrp_addr_t *ga, boolean_t alloc)
567{
568	tsol_gcgrp_t *gcgrp = NULL;
569	mod_hash_t *hashp;
570
571	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
572
573	hashp = (ga->ga_af == AF_INET) ? gcgrp4_hash : gcgrp6_hash;
574
575	mutex_enter(&gcgrp_lock);
576	if (mod_hash_find(hashp, (mod_hash_key_t)ga,
577	    (mod_hash_val_t *)&gcgrp) == 0) {
578		gcgrp->gcgrp_refcnt++;
579		ASSERT(gcgrp->gcgrp_refcnt != 0);
580
581		DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__lookup, char *,
582		    "found gcgrp(1) in hash(2)", tsol_gcgrp_t *, gcgrp,
583		    mod_hash_t *, hashp);
584
585	} else if (alloc) {
586		gcgrp = kmem_zalloc(sizeof (*gcgrp), KM_NOSLEEP);
587		if (gcgrp != NULL) {
588			gcgrp->gcgrp_refcnt = 1;
589			rw_init(&gcgrp->gcgrp_rwlock, NULL, RW_DEFAULT, NULL);
590			bcopy(ga, &gcgrp->gcgrp_addr, sizeof (*ga));
591
592			if (mod_hash_insert(hashp,
593			    (mod_hash_key_t)&gcgrp->gcgrp_addr,
594			    (mod_hash_val_t)gcgrp) != 0) {
595				mutex_exit(&gcgrp_lock);
596				kmem_free(gcgrp, sizeof (*gcgrp));
597				return (NULL);
598			}
599
600			DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__insert,
601			    char *, "inserted gcgrp(1) in hash(2)",
602			    tsol_gcgrp_t *, gcgrp, mod_hash_t *, hashp);
603		}
604	}
605	mutex_exit(&gcgrp_lock);
606	return (gcgrp);
607}
608
609void
610gcgrp_inactive(tsol_gcgrp_t *gcgrp)
611{
612	tsol_gcgrp_addr_t *ga;
613	mod_hash_t *hashp;
614
615	ASSERT(MUTEX_HELD(&gcgrp_lock));
616	ASSERT(!RW_LOCK_HELD(&gcgrp->gcgrp_rwlock));
617	ASSERT(gcgrp != NULL && gcgrp->gcgrp_refcnt == 0);
618	ASSERT(gcgrp->gcgrp_head == NULL && gcgrp->gcgrp_count == 0);
619
620	ga = &gcgrp->gcgrp_addr;
621	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
622
623	hashp = (ga->ga_af == AF_INET) ? gcgrp4_hash : gcgrp6_hash;
624	(void) mod_hash_remove(hashp, (mod_hash_key_t)ga,
625	    (mod_hash_val_t *)&gcgrp);
626	rw_destroy(&gcgrp->gcgrp_rwlock);
627
628	DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__remove, char *,
629	    "removed inactive gcgrp(1) from hash(2)",
630	    tsol_gcgrp_t *, gcgrp, mod_hash_t *, hashp);
631
632	kmem_free(gcgrp, sizeof (*gcgrp));
633}
634
635/*
636 * Converts CIPSO option to sensitivity label.
637 * Validity checks based on restrictions defined in
638 * COMMERCIAL IP SECURITY OPTION (CIPSO 2.2) (draft-ietf-cipso-ipsecurity)
639 */
640static boolean_t
641cipso_to_sl(const uchar_t *option, bslabel_t *sl)
642{
643	const struct cipso_option *co = (const struct cipso_option *)option;
644	const struct cipso_tag_type_1 *tt1;
645
646	tt1 = (struct cipso_tag_type_1 *)&co->cipso_tag_type[0];
647	if (tt1->tag_type != 1 ||
648	    tt1->tag_length < TSOL_TT1_MIN_LENGTH ||
649	    tt1->tag_length > TSOL_TT1_MAX_LENGTH ||
650	    tt1->tag_length + TSOL_CIPSO_TAG_OFFSET > co->cipso_length)
651		return (B_FALSE);
652
653	bsllow(sl);	/* assumed: sets compartments to all zeroes */
654	LCLASS_SET((_bslabel_impl_t *)sl, tt1->tag_sl);
655	bcopy(tt1->tag_cat, &((_bslabel_impl_t *)sl)->compartments,
656	    tt1->tag_length - TSOL_TT1_MIN_LENGTH);
657	return (B_TRUE);
658}
659
660/*
661 * Parse the CIPSO label in the incoming packet and construct a ts_label_t
662 * that reflects the CIPSO label and attach it to the dblk cred. Later as
663 * the mblk flows up through the stack any code that needs to examine the
664 * packet label can inspect the label from the dblk cred. This function is
665 * called right in ip_rput for all packets, i.e. locally destined and
666 * to be forwarded packets. The forwarding path needs to examine the label
667 * to determine how to forward the packet.
668 *
669 * For IPv4, IP header options have been pulled up, but other headers might not
670 * have been.  For IPv6, any hop-by-hop options have been pulled up, but any
671 * other headers might not be present.
672 */
673boolean_t
674tsol_get_pkt_label(mblk_t *mp, int version)
675{
676	tsol_tpc_t	*src_rhtp;
677	uchar_t		*opt_ptr = NULL;
678	const ipha_t	*ipha;
679	bslabel_t	sl;
680	uint32_t	doi;
681	tsol_ip_label_t	label_type;
682	const cipso_option_t *co;
683	const void	*src;
684	const ip6_t	*ip6h;
685	cred_t		*credp;
686	pid_t		cpid;
687
688	ASSERT(DB_TYPE(mp) == M_DATA);
689
690	if (version == IPV4_VERSION) {
691		ipha = (const ipha_t *)mp->b_rptr;
692		src = &ipha->ipha_src;
693		label_type = tsol_get_option(mp, &opt_ptr);
694	} else {
695		uchar_t		*after_secopt;
696		boolean_t	hbh_needed;
697		const uchar_t	*ip6hbh;
698		size_t		optlen;
699
700		label_type = OPT_NONE;
701		ip6h = (const ip6_t *)mp->b_rptr;
702		src = &ip6h->ip6_src;
703		if (ip6h->ip6_nxt == IPPROTO_HOPOPTS) {
704			ip6hbh = (const uchar_t *)&ip6h[1];
705			optlen = (ip6hbh[1] + 1) << 3;
706			ASSERT(ip6hbh + optlen <= mp->b_wptr);
707			opt_ptr = tsol_find_secopt_v6(ip6hbh, optlen,
708			    &after_secopt, &hbh_needed);
709			/* tsol_find_secopt_v6 guarantees some sanity */
710			if (opt_ptr != NULL &&
711			    (optlen = opt_ptr[1]) >= 8) {
712				opt_ptr += 2;
713				bcopy(opt_ptr, &doi, sizeof (doi));
714				doi = ntohl(doi);
715				if (doi == IP6LS_DOI_V4 &&
716				    opt_ptr[4] == IP6LS_TT_V4 &&
717				    opt_ptr[5] <= optlen - 4 &&
718				    opt_ptr[7] <= optlen - 6) {
719					opt_ptr += sizeof (doi) + 2;
720					label_type = OPT_CIPSO;
721				}
722			}
723		}
724	}
725
726	switch (label_type) {
727	case OPT_CIPSO:
728		/*
729		 * Convert the CIPSO label to the internal format
730		 * and attach it to the dblk cred.
731		 * Validity checks based on restrictions defined in
732		 * COMMERCIAL IP SECURITY OPTION (CIPSO 2.2)
733		 * (draft-ietf-cipso-ipsecurity)
734		 */
735		if (version == IPV6_VERSION && ip6opt_ls == 0)
736			return (B_FALSE);
737		co = (const struct cipso_option *)opt_ptr;
738		if ((co->cipso_length <
739		    TSOL_CIPSO_TAG_OFFSET + TSOL_TT1_MIN_LENGTH) ||
740		    (co->cipso_length > IP_MAX_OPT_LENGTH))
741			return (B_FALSE);
742		bcopy(co->cipso_doi, &doi, sizeof (doi));
743		doi = ntohl(doi);
744		if (!cipso_to_sl(opt_ptr, &sl))
745			return (B_FALSE);
746		setbltype(&sl, SUN_SL_ID);
747		break;
748
749	case OPT_NONE:
750		/*
751		 * Handle special cases that are not currently labeled, even
752		 * though the sending system may otherwise be configured as
753		 * labeled.
754		 *	- IGMP
755		 *	- IPv4 ICMP Router Discovery
756		 *	- IPv6 Neighbor Discovery
757		 */
758		if (version == IPV4_VERSION) {
759			if (ipha->ipha_protocol == IPPROTO_IGMP)
760				return (B_TRUE);
761			if (ipha->ipha_protocol == IPPROTO_ICMP) {
762				const struct icmp *icmp = (const struct icmp *)
763				    (mp->b_rptr + IPH_HDR_LENGTH(ipha));
764
765				if ((uchar_t *)icmp > mp->b_wptr) {
766					if (!pullupmsg(mp,
767					    (uchar_t *)icmp - mp->b_rptr + 1))
768						return (B_FALSE);
769					icmp = (const struct icmp *)
770					    (mp->b_rptr +
771					    IPH_HDR_LENGTH(ipha));
772				}
773				if (icmp->icmp_type == ICMP_ROUTERADVERT ||
774				    icmp->icmp_type == ICMP_ROUTERSOLICIT)
775					return (B_TRUE);
776			}
777			src = &ipha->ipha_src;
778		} else {
779			if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
780				const icmp6_t *icmp6 = (const icmp6_t *)
781				    (mp->b_rptr + IPV6_HDR_LEN);
782
783				if ((uchar_t *)icmp6 + ICMP6_MINLEN >
784				    mp->b_wptr) {
785					if (!pullupmsg(mp,
786					    (uchar_t *)icmp6 - mp->b_rptr +
787					    ICMP6_MINLEN))
788						return (B_FALSE);
789					icmp6 = (const icmp6_t *)
790					    (mp->b_rptr + IPV6_HDR_LEN);
791				}
792				if (icmp6->icmp6_type >= MLD_LISTENER_QUERY &&
793				    icmp6->icmp6_type <= ICMP6_MAX_INFO_TYPE)
794					return (B_TRUE);
795			}
796			src = &ip6h->ip6_src;
797		}
798
799		/*
800		 * Look up the tnrhtp database and get the implicit label
801		 * that is associated with this unlabeled host and attach
802		 * it to the packet.
803		 */
804		if ((src_rhtp = find_tpc(src, version, B_FALSE)) == NULL)
805			return (B_FALSE);
806
807		/* If the sender is labeled, drop the unlabeled packet. */
808		if (src_rhtp->tpc_tp.host_type != UNLABELED) {
809			TPC_RELE(src_rhtp);
810			pr_addr_dbg("unlabeled packet forged from %s\n",
811			    version == IPV4_VERSION ? AF_INET : AF_INET6, src);
812			return (B_FALSE);
813		}
814
815		sl = src_rhtp->tpc_tp.tp_def_label;
816		setbltype(&sl, SUN_SL_ID);
817		doi = src_rhtp->tpc_tp.tp_doi;
818		TPC_RELE(src_rhtp);
819		break;
820
821	default:
822		return (B_FALSE);
823	}
824
825	/* Make sure no other thread is messing with this mblk */
826	ASSERT(DB_REF(mp) == 1);
827	/* Preserve db_cpid */
828	credp = msg_extractcred(mp, &cpid);
829	if (credp == NULL) {
830		credp = newcred_from_bslabel(&sl, doi, KM_NOSLEEP);
831	} else {
832		cred_t	*newcr;
833
834		newcr = copycred_from_bslabel(credp, &sl, doi,
835		    KM_NOSLEEP);
836		crfree(credp);
837		credp = newcr;
838	}
839	if (credp == NULL)
840		return (B_FALSE);
841	mblk_setcred(mp, credp, cpid);
842	crfree(credp);			/* mblk has ref on cred */
843
844	/*
845	 * If the source was unlabeled, then flag as such,
846	 * while remembering that CIPSO routers add headers.
847	 */
848	if (label_type == OPT_NONE) {
849		crgetlabel(credp)->tsl_flags |= TSLF_UNLABELED;
850	} else if (label_type == OPT_CIPSO) {
851		if ((src_rhtp = find_tpc(src, version, B_FALSE)) == NULL)
852			return (B_FALSE);
853		if (src_rhtp->tpc_tp.host_type == UNLABELED)
854			crgetlabel(credp)->tsl_flags |= TSLF_UNLABELED;
855		TPC_RELE(src_rhtp);
856	}
857
858	return (B_TRUE);
859}
860
861/*
862 * This routine determines whether the given packet should be accepted locally.
863 * It does a range/set check on the packet's label by looking up the given
864 * address in the remote host database.
865 */
866boolean_t
867tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version,
868    boolean_t shared_addr, const conn_t *connp)
869{
870	const cred_t *credp;
871	ts_label_t *plabel, *conn_plabel;
872	tsol_tpc_t *tp;
873	boolean_t retv;
874	const bslabel_t *label, *conn_label;
875
876	/*
877	 * The cases in which this can happen are:
878	 *	- IPv6 Router Alert, where ip_rput_data_v6 deliberately skips
879	 *	  over the label attachment process.
880	 *	- MLD output looped-back to ourselves.
881	 *	- IPv4 Router Discovery, where tsol_get_pkt_label intentionally
882	 *	  avoids the labeling process.
883	 * We trust that all valid paths in the code set the cred pointer when
884	 * needed.
885	 */
886	if ((credp = msg_getcred(mp, NULL)) == NULL)
887		return (B_TRUE);
888
889	/*
890	 * If this packet is from the inside (not a remote host) and has the
891	 * same zoneid as the selected destination, then no checks are
892	 * necessary.  Membership in the zone is enough proof.  This is
893	 * intended to be a hot path through this function.
894	 */
895	if (!crisremote(credp) &&
896	    crgetzone(credp) == crgetzone(connp->conn_cred))
897		return (B_TRUE);
898
899	plabel = crgetlabel(credp);
900	conn_plabel = crgetlabel(connp->conn_cred);
901	ASSERT(plabel != NULL && conn_plabel != NULL);
902
903	label = label2bslabel(plabel);
904	conn_label = label2bslabel(crgetlabel(connp->conn_cred));
905
906	/*
907	 * MLPs are always validated using the range and set of the local
908	 * address, even when the remote host is unlabeled.
909	 */
910	if (connp->conn_mlp_type == mlptBoth ||
911	/* LINTED: no consequent */
912	    connp->conn_mlp_type == (shared_addr ? mlptShared : mlptPrivate)) {
913		;
914
915	/*
916	 * If this is a packet from an unlabeled sender, then we must apply
917	 * different rules.  If the label is equal to the zone's label, then
918	 * it's allowed.  If it's not equal, but the zone is either the global
919	 * zone or the label is dominated by the zone's label, then allow it
920	 * as long as it's in the range configured for the destination.
921	 */
922	} else if (plabel->tsl_flags & TSLF_UNLABELED) {
923		if (plabel->tsl_doi == conn_plabel->tsl_doi &&
924		    blequal(label, conn_label))
925			return (B_TRUE);
926
927		/*
928		 * conn_zoneid is global for an exclusive stack, thus we use
929		 * conn_cred to get the zoneid
930		 */
931		if (!connp->conn_mac_exempt ||
932		    (crgetzoneid(connp->conn_cred) != GLOBAL_ZONEID &&
933		    (plabel->tsl_doi != conn_plabel->tsl_doi ||
934		    !bldominates(conn_label, label)))) {
935			DTRACE_PROBE3(
936			    tx__ip__log__drop__receivelocal__mac_unl,
937			    char *,
938			    "unlabeled packet mp(1) fails mac for conn(2)",
939			    mblk_t *, mp, conn_t *, connp);
940			return (B_FALSE);
941		}
942
943	/*
944	 * If this is a packet from a labeled sender, verify the
945	 * label on the packet matches the connection label.
946	 */
947	} else {
948		if (plabel->tsl_doi != conn_plabel->tsl_doi ||
949		    !blequal(label, conn_label)) {
950			DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac__slp,
951			    char *,
952			    "packet mp(1) failed label match to SLP conn(2)",
953			    mblk_t *, mp, conn_t *, connp);
954			return (B_FALSE);
955		}
956		/*
957		 * No further checks will be needed if this is a zone-
958		 * specific address because (1) The process for bringing up
959		 * the interface ensures the zone's label is within the zone-
960		 * specific address's valid label range; (2) For cases where
961		 * the conn is bound to the unspecified addresses, ip fanout
962		 * logic ensures conn's zoneid equals the dest addr's zoneid;
963		 * (3) Mac-exempt and mlp logic above already handle all
964		 * cases where the zone label may not be the same as the
965		 * conn label.
966		 */
967		if (!shared_addr)
968			return (B_TRUE);
969	}
970
971	tp = find_tpc(addr, version, B_FALSE);
972	if (tp == NULL) {
973		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__no__tnr,
974		    char *, "dropping mp(1), host(2) lacks entry",
975		    mblk_t *, mp, void *, addr);
976		return (B_FALSE);
977	}
978
979	/*
980	 * The local host address should not be unlabeled at this point.  The
981	 * only way this can happen is that the destination isn't unicast.  We
982	 * assume that the packet should not have had a label, and thus should
983	 * have been handled by the TSLF_UNLABELED logic above.
984	 */
985	if (tp->tpc_tp.host_type == UNLABELED) {
986		retv = B_FALSE;
987		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__flag, char *,
988		    "mp(1) unlabeled source, but tp is not unlabeled.",
989		    mblk_t *, mp, tsol_tpc_t *, tp);
990
991	} else if (tp->tpc_tp.host_type != SUN_CIPSO) {
992		retv = B_FALSE;
993		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__tptype, char *,
994		    "delivering mp(1), found unrecognized tpc(2) type.",
995		    mblk_t *, mp, tsol_tpc_t *, tp);
996
997	} else if (plabel->tsl_doi != tp->tpc_tp.tp_doi) {
998		retv = B_FALSE;
999		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac, char *,
1000		    "mp(1) could not be delievered to tp(2), doi mismatch",
1001		    mblk_t *, mp, tsol_tpc_t *, tp);
1002
1003	} else if (!_blinrange(label, &tp->tpc_tp.tp_sl_range_cipso) &&
1004	    !blinlset(label, tp->tpc_tp.tp_sl_set_cipso)) {
1005		retv = B_FALSE;
1006		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac, char *,
1007		    "mp(1) could not be delievered to tp(2), bad mac",
1008		    mblk_t *, mp, tsol_tpc_t *, tp);
1009	} else {
1010		retv = B_TRUE;
1011	}
1012
1013	TPC_RELE(tp);
1014
1015	return (retv);
1016}
1017
1018boolean_t
1019tsol_can_accept_raw(mblk_t *mp, boolean_t check_host)
1020{
1021	ts_label_t	*plabel = NULL;
1022	tsol_tpc_t	*src_rhtp, *dst_rhtp;
1023	boolean_t	retv;
1024	cred_t		*credp;
1025
1026	credp = msg_getcred(mp, NULL);
1027	if (credp != NULL)
1028		plabel = crgetlabel(credp);
1029
1030	/* We are bootstrapping or the internal template was never deleted */
1031	if (plabel == NULL)
1032		return (B_TRUE);
1033
1034	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1035		ipha_t *ipha = (ipha_t *)mp->b_rptr;
1036
1037		src_rhtp = find_tpc(&ipha->ipha_src, IPV4_VERSION,
1038		    B_FALSE);
1039		if (src_rhtp == NULL)
1040			return (B_FALSE);
1041		dst_rhtp = find_tpc(&ipha->ipha_dst, IPV4_VERSION,
1042		    B_FALSE);
1043	} else {
1044		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1045
1046		src_rhtp = find_tpc(&ip6h->ip6_src, IPV6_VERSION,
1047		    B_FALSE);
1048		if (src_rhtp == NULL)
1049			return (B_FALSE);
1050		dst_rhtp = find_tpc(&ip6h->ip6_dst, IPV6_VERSION,
1051		    B_FALSE);
1052	}
1053	if (dst_rhtp == NULL) {
1054		TPC_RELE(src_rhtp);
1055		return (B_FALSE);
1056	}
1057
1058	if (label2doi(plabel) != src_rhtp->tpc_tp.tp_doi) {
1059		retv = B_FALSE;
1060
1061	/*
1062	 * Check that the packet's label is in the correct range for labeled
1063	 * sender, or is equal to the default label for unlabeled sender.
1064	 */
1065	} else if ((src_rhtp->tpc_tp.host_type != UNLABELED &&
1066	    !_blinrange(label2bslabel(plabel),
1067	    &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
1068	    !blinlset(label2bslabel(plabel),
1069	    src_rhtp->tpc_tp.tp_sl_set_cipso)) ||
1070	    (src_rhtp->tpc_tp.host_type == UNLABELED &&
1071	    !blequal(&plabel->tsl_label, &src_rhtp->tpc_tp.tp_def_label))) {
1072		retv = B_FALSE;
1073
1074	} else if (check_host) {
1075		retv = B_TRUE;
1076
1077	/*
1078	 * Until we have SL range in the Zone structure, pass it
1079	 * when our own address lookup returned an internal entry.
1080	 */
1081	} else switch (dst_rhtp->tpc_tp.host_type) {
1082	case UNLABELED:
1083		retv = B_TRUE;
1084		break;
1085
1086	case SUN_CIPSO:
1087		retv = _blinrange(label2bslabel(plabel),
1088		    &dst_rhtp->tpc_tp.tp_sl_range_cipso) ||
1089		    blinlset(label2bslabel(plabel),
1090		    dst_rhtp->tpc_tp.tp_sl_set_cipso);
1091		break;
1092
1093	default:
1094		retv = B_FALSE;
1095	}
1096	TPC_RELE(src_rhtp);
1097	TPC_RELE(dst_rhtp);
1098	return (retv);
1099}
1100
1101/*
1102 * This routine determines whether a response to a failed packet delivery or
1103 * connection should be sent back.  By default, the policy is to allow such
1104 * messages to be sent at all times, as these messages reveal little useful
1105 * information and are healthy parts of TCP/IP networking.
1106 *
1107 * If tsol_strict_error is set, then we do strict tests: if the packet label is
1108 * within the label range/set of this host/zone, return B_TRUE; otherwise
1109 * return B_FALSE, which causes the packet to be dropped silently.
1110 *
1111 * Note that tsol_get_pkt_label will cause the packet to drop if the sender is
1112 * marked as labeled in the remote host database, but the packet lacks a label.
1113 * This means that we don't need to do a lookup on the source; the
1114 * TSLF_UNLABELED flag is sufficient.
1115 */
1116boolean_t
1117tsol_can_reply_error(const mblk_t *mp)
1118{
1119	ts_label_t	*plabel = NULL;
1120	tsol_tpc_t	*rhtp;
1121	const ipha_t	*ipha;
1122	const ip6_t	*ip6h;
1123	boolean_t	retv;
1124	bslabel_t	*pktbs;
1125	cred_t		*credp;
1126
1127	/* Caller must pull up at least the IP header */
1128	ASSERT(MBLKL(mp) >= (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ?
1129	    sizeof (*ipha) : sizeof (*ip6h)));
1130
1131	if (!tsol_strict_error)
1132		return (B_TRUE);
1133
1134	credp = msg_getcred(mp, NULL);
1135	if (credp != NULL)
1136		plabel = crgetlabel(credp);
1137
1138	/* We are bootstrapping or the internal template was never deleted */
1139	if (plabel == NULL)
1140		return (B_TRUE);
1141
1142	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1143		ipha = (const ipha_t *)mp->b_rptr;
1144		rhtp = find_tpc(&ipha->ipha_dst, IPV4_VERSION, B_FALSE);
1145	} else {
1146		ip6h = (const ip6_t *)mp->b_rptr;
1147		rhtp = find_tpc(&ip6h->ip6_dst, IPV6_VERSION, B_FALSE);
1148	}
1149
1150	if (rhtp == NULL || label2doi(plabel) != rhtp->tpc_tp.tp_doi) {
1151		retv = B_FALSE;
1152	} else {
1153		/*
1154		 * If we're in the midst of forwarding, then the destination
1155		 * address might not be labeled.  In that case, allow unlabeled
1156		 * packets through only if the default label is the same, and
1157		 * labeled ones if they dominate.
1158		 */
1159		pktbs = label2bslabel(plabel);
1160		switch (rhtp->tpc_tp.host_type) {
1161		case UNLABELED:
1162			if (plabel->tsl_flags & TSLF_UNLABELED) {
1163				retv = blequal(pktbs,
1164				    &rhtp->tpc_tp.tp_def_label);
1165			} else {
1166				retv = bldominates(pktbs,
1167				    &rhtp->tpc_tp.tp_def_label);
1168			}
1169			break;
1170
1171		case SUN_CIPSO:
1172			retv = _blinrange(pktbs,
1173			    &rhtp->tpc_tp.tp_sl_range_cipso) ||
1174			    blinlset(pktbs, rhtp->tpc_tp.tp_sl_set_cipso);
1175			break;
1176
1177		default:
1178			retv = B_FALSE;
1179			break;
1180		}
1181	}
1182
1183	if (rhtp != NULL)
1184		TPC_RELE(rhtp);
1185
1186	return (retv);
1187}
1188
1189/*
1190 * Finds the zone associated with the given packet.  Returns GLOBAL_ZONEID if
1191 * the zone cannot be located.
1192 *
1193 * This is used by the classifier when the packet matches an ALL_ZONES IRE, and
1194 * there's no MLP defined.
1195 *
1196 * Note that we assume that this is only invoked in the ALL_ZONES case.
1197 * Handling other cases would require handle exclusive stack zones where either
1198 * this routine or the callers would have to map from
1199 * the zoneid (zone->zone_id) to what IP uses in conn_zoneid etc.
1200 */
1201zoneid_t
1202tsol_packet_to_zoneid(const mblk_t *mp)
1203{
1204	cred_t *cr = msg_getcred(mp, NULL);
1205	zone_t *zone;
1206	ts_label_t *label;
1207
1208	if (cr != NULL) {
1209		if ((label = crgetlabel(cr)) != NULL) {
1210			zone = zone_find_by_label(label);
1211			if (zone != NULL) {
1212				zoneid_t zoneid = zone->zone_id;
1213
1214				zone_rele(zone);
1215				return (zoneid);
1216			}
1217		}
1218	}
1219	return (GLOBAL_ZONEID);
1220}
1221
1222int
1223tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl)
1224{
1225	int		error = 0;
1226	tsol_ire_gw_secattr_t *attrp = NULL;
1227	tsol_tnrhc_t	*gw_rhc = NULL;
1228	tsol_gcgrp_t	*gcgrp = NULL;
1229	tsol_gc_t	*gc = NULL;
1230	in_addr_t	ga_addr4;
1231	void		*paddr = NULL;
1232
1233	/* Not in Trusted mode or IRE is local/loopback/broadcast/interface */
1234	if (!is_system_labeled() ||
1235	    (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
1236	    IRE_INTERFACE)))
1237		goto done;
1238
1239	/*
1240	 * If we don't have a label to compare with, or the IRE does not
1241	 * contain any gateway security attributes, there's not much that
1242	 * we can do.  We let the former case pass, and the latter fail,
1243	 * since the IRE doesn't qualify for a match due to the lack of
1244	 * security attributes.
1245	 */
1246	if (tsl == NULL || ire->ire_gw_secattr == NULL) {
1247		if (tsl != NULL) {
1248			DTRACE_PROBE3(tx__ip__log__drop__irematch__nogwsec,
1249			    char *,
1250			    "ire(1) lacks ire_gw_secattr matching label(2)",
1251			    ire_t *, ire, ts_label_t *, tsl);
1252			error = EACCES;
1253		}
1254		goto done;
1255	}
1256
1257	attrp = ire->ire_gw_secattr;
1258
1259	/*
1260	 * The possible lock order scenarios related to the tsol gateway
1261	 * attribute locks are documented at the beginning of ip.c in the
1262	 * lock order scenario section.
1263	 */
1264	mutex_enter(&attrp->igsa_lock);
1265
1266	/*
1267	 * Depending on the IRE type (prefix vs. cache), we seek the group
1268	 * structure which contains all security credentials of the gateway.
1269	 * A prefix IRE is associated with at most one gateway credential,
1270	 * while a cache IRE is associated with every credentials that the
1271	 * gateway has.
1272	 */
1273	if ((gc = attrp->igsa_gc) != NULL) {			/* prefix */
1274		gcgrp = gc->gc_grp;
1275		ASSERT(gcgrp != NULL);
1276		rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
1277	} else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {	/* cache */
1278		rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
1279		gc = gcgrp->gcgrp_head;
1280		if (gc == NULL) {
1281			/* gc group is empty, so the drop lock now */
1282			ASSERT(gcgrp->gcgrp_count == 0);
1283			rw_exit(&gcgrp->gcgrp_rwlock);
1284			gcgrp = NULL;
1285		}
1286	}
1287
1288	if (gcgrp != NULL)
1289		GCGRP_REFHOLD(gcgrp);
1290
1291	if ((gw_rhc = attrp->igsa_rhc) != NULL) {
1292		/*
1293		 * If our cached entry has grown stale, then discard it so we
1294		 * can get a new one.
1295		 */
1296		if (gw_rhc->rhc_invalid || gw_rhc->rhc_tpc->tpc_invalid) {
1297			TNRHC_RELE(gw_rhc);
1298			attrp->igsa_rhc = gw_rhc = NULL;
1299		} else {
1300			TNRHC_HOLD(gw_rhc)
1301		}
1302	}
1303
1304	/* Last attempt at loading the template had failed; try again */
1305	if (gw_rhc == NULL) {
1306		if (gcgrp != NULL) {
1307			tsol_gcgrp_addr_t *ga = &gcgrp->gcgrp_addr;
1308
1309			if (ire->ire_ipversion == IPV4_VERSION) {
1310				ASSERT(ga->ga_af == AF_INET);
1311				IN6_V4MAPPED_TO_IPADDR(&ga->ga_addr, ga_addr4);
1312				paddr = &ga_addr4;
1313			} else {
1314				ASSERT(ga->ga_af == AF_INET6);
1315				paddr = &ga->ga_addr;
1316			}
1317		} else if (ire->ire_ipversion == IPV6_VERSION &&
1318		    !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
1319			paddr = &ire->ire_gateway_addr_v6;
1320		} else if (ire->ire_ipversion == IPV4_VERSION &&
1321		    ire->ire_gateway_addr != INADDR_ANY) {
1322			paddr = &ire->ire_gateway_addr;
1323		}
1324
1325		/* We've found a gateway address to do the template lookup */
1326		if (paddr != NULL) {
1327			ASSERT(gw_rhc == NULL);
1328			gw_rhc = find_rhc(paddr, ire->ire_ipversion, B_FALSE);
1329			if (gw_rhc != NULL) {
1330				/*
1331				 * Note that if the lookup above returned an
1332				 * internal template, we'll use it for the
1333				 * time being, and do another lookup next
1334				 * time around.
1335				 */
1336				/* Another thread has loaded the template? */
1337				if (attrp->igsa_rhc != NULL) {
1338					TNRHC_RELE(gw_rhc)
1339					/* reload, it could be different */
1340					gw_rhc = attrp->igsa_rhc;
1341				} else {
1342					attrp->igsa_rhc = gw_rhc;
1343				}
1344				/*
1345				 * Hold an extra reference just like we did
1346				 * above prior to dropping the igsa_lock.
1347				 */
1348				TNRHC_HOLD(gw_rhc)
1349			}
1350		}
1351	}
1352
1353	mutex_exit(&attrp->igsa_lock);
1354	/* Gateway template not found */
1355	if (gw_rhc == NULL) {
1356		/*
1357		 * If destination address is directly reachable through an
1358		 * interface rather than through a learned route, pass it.
1359		 */
1360		if (paddr != NULL) {
1361			DTRACE_PROBE3(
1362			    tx__ip__log__drop__irematch__nogwtmpl, char *,
1363			    "ire(1), label(2) off-link with no gw_rhc",
1364			    ire_t *, ire, ts_label_t *, tsl);
1365			error = EINVAL;
1366		}
1367		goto done;
1368	}
1369
1370	if (gc != NULL) {
1371		tsol_gcdb_t *gcdb;
1372		/*
1373		 * In the case of IRE_CACHE we've got one or more gateway
1374		 * security credentials to compare against the passed in label.
1375		 * Perform label range comparison against each security
1376		 * credential of the gateway. In the case of a prefix ire
1377		 * we need to match against the security attributes of
1378		 * just the route itself, so the loop is executed only once.
1379		 */
1380		ASSERT(gcgrp != NULL);
1381		do {
1382			gcdb = gc->gc_db;
1383			if (tsl->tsl_doi == gcdb->gcdb_doi &&
1384			    _blinrange(&tsl->tsl_label, &gcdb->gcdb_slrange))
1385				break;
1386			if (ire->ire_type == IRE_CACHE)
1387				gc = gc->gc_next;
1388			else
1389				gc = NULL;
1390		} while (gc != NULL);
1391
1392		if (gc == NULL) {
1393			DTRACE_PROBE3(
1394			    tx__ip__log__drop__irematch__nogcmatched,
1395			    char *, "ire(1), tsl(2): all gc failed match",
1396			    ire_t *, ire, ts_label_t *, tsl);
1397			error = EACCES;
1398		}
1399	} else {
1400		/*
1401		 * We didn't find any gateway credentials in the IRE
1402		 * attributes; fall back to the gateway's template for
1403		 * label range checks, if we are required to do so.
1404		 */
1405		ASSERT(gw_rhc != NULL);
1406		switch (gw_rhc->rhc_tpc->tpc_tp.host_type) {
1407		case SUN_CIPSO:
1408			if (tsl->tsl_doi != gw_rhc->rhc_tpc->tpc_tp.tp_doi ||
1409			    (!_blinrange(&tsl->tsl_label,
1410			    &gw_rhc->rhc_tpc->tpc_tp.tp_sl_range_cipso) &&
1411			    !blinlset(&tsl->tsl_label,
1412			    gw_rhc->rhc_tpc->tpc_tp.tp_sl_set_cipso))) {
1413				error = EACCES;
1414				DTRACE_PROBE4(
1415				    tx__ip__log__drop__irematch__deftmpl,
1416				    char *, "ire(1), tsl(2), gw_rhc(3) "
1417				    "failed match (cipso gw)",
1418				    ire_t *, ire, ts_label_t *, tsl,
1419				    tsol_tnrhc_t *, gw_rhc);
1420			}
1421			break;
1422
1423		case UNLABELED:
1424			if (tsl->tsl_doi != gw_rhc->rhc_tpc->tpc_tp.tp_doi ||
1425			    (!_blinrange(&tsl->tsl_label,
1426			    &gw_rhc->rhc_tpc->tpc_tp.tp_gw_sl_range) &&
1427			    !blinlset(&tsl->tsl_label,
1428			    gw_rhc->rhc_tpc->tpc_tp.tp_gw_sl_set))) {
1429				error = EACCES;
1430				DTRACE_PROBE4(
1431				    tx__ip__log__drop__irematch__deftmpl,
1432				    char *, "ire(1), tsl(2), gw_rhc(3) "
1433				    "failed match (unlabeled gw)",
1434				    ire_t *, ire, ts_label_t *, tsl,
1435				    tsol_tnrhc_t *, gw_rhc);
1436			}
1437			break;
1438		}
1439	}
1440
1441done:
1442
1443	if (gcgrp != NULL) {
1444		rw_exit(&gcgrp->gcgrp_rwlock);
1445		GCGRP_REFRELE(gcgrp);
1446	}
1447
1448	if (gw_rhc != NULL)
1449		TNRHC_RELE(gw_rhc)
1450
1451	return (error);
1452}
1453
1454/*
1455 * Performs label accreditation checks for packet forwarding.
1456 *
1457 * Returns a pointer to the modified mblk if allowed for forwarding,
1458 * or NULL if the packet must be dropped.
1459 */
1460mblk_t *
1461tsol_ip_forward(ire_t *ire, mblk_t *mp)
1462{
1463	tsol_ire_gw_secattr_t *attrp = NULL;
1464	ipha_t		*ipha;
1465	ip6_t		*ip6h;
1466	const void	*pdst;
1467	const void	*psrc;
1468	boolean_t	off_link;
1469	tsol_tpc_t	*dst_rhtp, *gw_rhtp;
1470	tsol_ip_label_t label_type;
1471	uchar_t		*opt_ptr = NULL;
1472	ts_label_t	*tsl;
1473	uint8_t		proto;
1474	int		af, adjust;
1475	uint16_t	iplen;
1476	boolean_t	need_tpc_rele = B_FALSE;
1477	ipaddr_t	*gw;
1478	ip_stack_t	*ipst = ire->ire_ipst;
1479	cred_t		*credp;
1480
1481	ASSERT(ire != NULL && mp != NULL);
1482	ASSERT(ire->ire_stq != NULL);
1483
1484	af = (ire->ire_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6;
1485
1486	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1487		ASSERT(ire->ire_ipversion == IPV4_VERSION);
1488		ipha = (ipha_t *)mp->b_rptr;
1489		psrc = &ipha->ipha_src;
1490		pdst = &ipha->ipha_dst;
1491		proto = ipha->ipha_protocol;
1492
1493		/*
1494		 * off_link is TRUE if destination not directly reachable.
1495		 * Surya note: we avoid creation of per-dst IRE_CACHE entries
1496		 * for forwarded packets, so we set off_link to be TRUE
1497		 * if the packet dst is different from the ire_addr of
1498		 * the ire for the nexthop.
1499		 */
1500		off_link = ((ipha->ipha_dst != ire->ire_addr) ||
1501		    (ire->ire_gateway_addr != INADDR_ANY));
1502	} else {
1503		ASSERT(ire->ire_ipversion == IPV6_VERSION);
1504		ip6h = (ip6_t *)mp->b_rptr;
1505		psrc = &ip6h->ip6_src;
1506		pdst = &ip6h->ip6_dst;
1507		proto = ip6h->ip6_nxt;
1508
1509		if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
1510		    proto != IPPROTO_ICMPV6) {
1511			uint8_t *nexthdrp;
1512			uint16_t hdr_len;
1513
1514			if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len,
1515			    &nexthdrp)) {
1516				/* malformed packet; drop it */
1517				return (NULL);
1518			}
1519			proto = *nexthdrp;
1520		}
1521
1522		/* destination not directly reachable? */
1523		off_link = !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6);
1524	}
1525
1526	if ((tsl = msg_getlabel(mp)) == NULL)
1527		return (mp);
1528
1529	label_type = tsol_get_option(mp, &opt_ptr);
1530
1531	ASSERT(psrc != NULL && pdst != NULL);
1532	dst_rhtp = find_tpc(pdst, ire->ire_ipversion, B_FALSE);
1533
1534	if (dst_rhtp == NULL) {
1535		/*
1536		 * Without a template we do not know if forwarding
1537		 * violates MAC
1538		 */
1539		DTRACE_PROBE3(tx__ip__log__drop__forward__nodst, char *,
1540		    "mp(1) dropped, no template for destination ip4|6(2)",
1541		    mblk_t *, mp, void *, pdst);
1542		return (NULL);
1543	}
1544
1545	/*
1546	 * Gateway template must have existed for off-link destinations,
1547	 * since tsol_ire_match_gwattr has ensured such condition.
1548	 */
1549	if (ire->ire_ipversion == IPV4_VERSION && off_link) {
1550		/*
1551		 * Surya note: first check if we can get the gw_rhtp from
1552		 * the ire_gw_secattr->igsa_rhc; if this is null, then
1553		 * do a lookup based on the ire_addr (address of gw)
1554		 */
1555		if (ire->ire_gw_secattr != NULL &&
1556		    ire->ire_gw_secattr->igsa_rhc != NULL) {
1557			attrp = ire->ire_gw_secattr;
1558			gw_rhtp = attrp->igsa_rhc->rhc_tpc;
1559		} else  {
1560			/*
1561			 * use the ire_addr if this is the IRE_CACHE of nexthop
1562			 */
1563			gw = (ire->ire_gateway_addr == NULL? &ire->ire_addr :
1564			    &ire->ire_gateway_addr);
1565			gw_rhtp = find_tpc(gw, ire->ire_ipversion, B_FALSE);
1566			need_tpc_rele = B_TRUE;
1567		}
1568		if (gw_rhtp == NULL) {
1569			DTRACE_PROBE3(tx__ip__log__drop__forward__nogw, char *,
1570			    "mp(1) dropped, no gateway in ire attributes(2)",
1571			    mblk_t *, mp, tsol_ire_gw_secattr_t *, attrp);
1572			mp = NULL;
1573			goto keep_label;
1574		}
1575	}
1576	if (ire->ire_ipversion == IPV6_VERSION &&
1577	    ((attrp = ire->ire_gw_secattr) == NULL || attrp->igsa_rhc == NULL ||
1578	    (gw_rhtp = attrp->igsa_rhc->rhc_tpc) == NULL) && off_link) {
1579		DTRACE_PROBE3(tx__ip__log__drop__forward__nogw, char *,
1580		    "mp(1) dropped, no gateway in ire attributes(2)",
1581		    mblk_t *, mp, tsol_ire_gw_secattr_t *, attrp);
1582		mp = NULL;
1583		goto keep_label;
1584	}
1585
1586	/*
1587	 * Check that the label for the packet is acceptable
1588	 * by destination host; otherwise, drop it.
1589	 */
1590	switch (dst_rhtp->tpc_tp.host_type) {
1591	case SUN_CIPSO:
1592		if (tsl->tsl_doi != dst_rhtp->tpc_tp.tp_doi ||
1593		    (!_blinrange(&tsl->tsl_label,
1594		    &dst_rhtp->tpc_tp.tp_sl_range_cipso) &&
1595		    !blinlset(&tsl->tsl_label,
1596		    dst_rhtp->tpc_tp.tp_sl_set_cipso))) {
1597			DTRACE_PROBE4(tx__ip__log__drop__forward__mac, char *,
1598			    "labeled packet mp(1) dropped, label(2) fails "
1599			    "destination(3) accredation check",
1600			    mblk_t *, mp, ts_label_t *, tsl,
1601			    tsol_tpc_t *, dst_rhtp);
1602			mp = NULL;
1603			goto keep_label;
1604		}
1605		break;
1606
1607
1608	case UNLABELED:
1609		if (tsl->tsl_doi != dst_rhtp->tpc_tp.tp_doi ||
1610		    !blequal(&dst_rhtp->tpc_tp.tp_def_label,
1611		    &tsl->tsl_label)) {
1612			DTRACE_PROBE4(tx__ip__log__drop__forward__mac, char *,
1613			    "unlabeled packet mp(1) dropped, label(2) fails "
1614			    "destination(3) accredation check",
1615			    mblk_t *, mp, ts_label_t *, tsl,
1616			    tsol_tpc_t *, dst_rhtp);
1617			mp = NULL;
1618			goto keep_label;
1619		}
1620		break;
1621	}
1622	if (label_type == OPT_CIPSO) {
1623		/*
1624		 * We keep the label on any of the following cases:
1625		 *
1626		 *   1. The destination is labeled (on/off-link).
1627		 *   2. The unlabeled destination is off-link,
1628		 *	and the next hop gateway is labeled.
1629		 */
1630		if (dst_rhtp->tpc_tp.host_type != UNLABELED ||
1631		    (off_link &&
1632		    gw_rhtp->tpc_tp.host_type != UNLABELED))
1633			goto keep_label;
1634
1635		/*
1636		 * Strip off the CIPSO option from the packet because: the
1637		 * unlabeled destination host is directly reachable through
1638		 * an interface (on-link); or, the unlabeled destination host
1639		 * is not directly reachable (off-link), and the next hop
1640		 * gateway is unlabeled.
1641		 */
1642		adjust = (af == AF_INET) ? tsol_remove_secopt(ipha, MBLKL(mp)) :
1643		    tsol_remove_secopt_v6(ip6h, MBLKL(mp));
1644
1645		ASSERT(adjust <= 0);
1646		if (adjust != 0) {
1647
1648			/* adjust is negative */
1649			ASSERT((mp->b_wptr + adjust) >= mp->b_rptr);
1650			mp->b_wptr += adjust;
1651
1652			if (af == AF_INET) {
1653				ipha = (ipha_t *)mp->b_rptr;
1654				iplen = ntohs(ipha->ipha_length) + adjust;
1655				ipha->ipha_length = htons(iplen);
1656				ipha->ipha_hdr_checksum = 0;
1657				ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1658			}
1659			DTRACE_PROBE3(tx__ip__log__info__forward__adjust,
1660			    char *,
1661			    "mp(1) adjusted(2) for CIPSO option removal",
1662			    mblk_t *, mp, int, adjust);
1663		}
1664		goto keep_label;
1665	}
1666
1667	ASSERT(label_type == OPT_NONE);
1668	ASSERT(dst_rhtp != NULL);
1669
1670	/*
1671	 * We need to add CIPSO option if the destination or the next hop
1672	 * gateway is labeled.  Otherwise, pass the packet as is.
1673	 */
1674	if (dst_rhtp->tpc_tp.host_type == UNLABELED &&
1675	    (!off_link || gw_rhtp->tpc_tp.host_type == UNLABELED))
1676		goto keep_label;
1677
1678
1679	credp = msg_getcred(mp, NULL);
1680	if ((af == AF_INET &&
1681	    tsol_check_label(credp, &mp, B_FALSE, ipst) != 0) ||
1682	    (af == AF_INET6 &&
1683	    tsol_check_label_v6(credp, &mp, B_FALSE, ipst) != 0)) {
1684		mp = NULL;
1685		goto keep_label;
1686	}
1687
1688	if (af == AF_INET) {
1689		ipha = (ipha_t *)mp->b_rptr;
1690		ipha->ipha_hdr_checksum = 0;
1691		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1692	}
1693
1694keep_label:
1695	TPC_RELE(dst_rhtp);
1696	if (need_tpc_rele && gw_rhtp != NULL)
1697		TPC_RELE(gw_rhtp);
1698	return (mp);
1699}
1700
1701/*
1702 * Name:	tsol_pmtu_adjust()
1703 *
1704 * Returns the adjusted mtu after removing security option.
1705 * Removes/subtracts the option if the packet's cred indicates an unlabeled
1706 * sender or if pkt_diff indicates this system enlarged the packet.
1707 */
1708uint32_t
1709tsol_pmtu_adjust(mblk_t *mp, uint32_t mtu, int pkt_diff, int af)
1710{
1711	int		label_adj = 0;
1712	uint32_t	min_mtu = IP_MIN_MTU;
1713	tsol_tpc_t	*src_rhtp;
1714	void		*src;
1715
1716	/*
1717	 * Note: label_adj is non-positive, indicating the number of
1718	 * bytes removed by removing the security option from the
1719	 * header.
1720	 */
1721	if (af == AF_INET6) {
1722		ip6_t	*ip6h;
1723
1724		min_mtu = IPV6_MIN_MTU;
1725		ip6h = (ip6_t *)mp->b_rptr;
1726		src = &ip6h->ip6_src;
1727		if ((src_rhtp = find_tpc(src, IPV6_VERSION, B_FALSE)) == NULL)
1728			return (mtu);
1729		if (pkt_diff > 0 || src_rhtp->tpc_tp.host_type == UNLABELED) {
1730			label_adj = tsol_remove_secopt_v6(
1731			    (ip6_t *)mp->b_rptr, MBLKL(mp));
1732		}
1733	} else {
1734		ipha_t    *ipha;
1735
1736		ASSERT(af == AF_INET);
1737		ipha = (ipha_t *)mp->b_rptr;
1738		src = &ipha->ipha_src;
1739		if ((src_rhtp = find_tpc(src, IPV4_VERSION, B_FALSE)) == NULL)
1740			return (mtu);
1741		if (pkt_diff > 0 || src_rhtp->tpc_tp.host_type == UNLABELED)
1742			label_adj = tsol_remove_secopt(
1743			    (ipha_t *)mp->b_rptr, MBLKL(mp));
1744	}
1745	/*
1746	 * Make pkt_diff non-negative and the larger of the bytes
1747	 * previously added (if any) or just removed, since label
1748	 * addition + subtraction may not be completely idempotent.
1749	 */
1750	if (pkt_diff < -label_adj)
1751		pkt_diff = -label_adj;
1752	if (pkt_diff > 0 && pkt_diff < mtu)
1753		mtu -= pkt_diff;
1754
1755	TPC_RELE(src_rhtp);
1756	return (MAX(mtu, min_mtu));
1757}
1758
1759/*
1760 * Name:	tsol_rtsa_init()
1761 *
1762 * Normal:	Sanity checks on the route security attributes provided by
1763 *		user.  Convert it into a route security parameter list to
1764 *		be returned to caller.
1765 *
1766 * Output:	EINVAL if bad security attributes in the routing message
1767 *		ENOMEM if unable to allocate data structures
1768 *		0 otherwise.
1769 *
1770 * Note:	On input, cp must point to the end of any addresses in
1771 *		the rt_msghdr_t structure.
1772 */
1773int
1774tsol_rtsa_init(rt_msghdr_t *rtm, tsol_rtsecattr_t *sp, caddr_t cp)
1775{
1776	uint_t	sacnt;
1777	int	err;
1778	caddr_t	lim;
1779	tsol_rtsecattr_t *tp;
1780
1781	ASSERT((cp >= (caddr_t)&rtm[1]) && sp != NULL);
1782
1783	/*
1784	 * In theory, we could accept as many security attributes configured
1785	 * per route destination.  However, the current design is limited
1786	 * such that at most only one set security attributes is allowed to
1787	 * be associated with a prefix IRE.  We therefore assert for now.
1788	 */
1789	/* LINTED */
1790	ASSERT(TSOL_RTSA_REQUEST_MAX == 1);
1791
1792	sp->rtsa_cnt = 0;
1793	lim = (caddr_t)rtm + rtm->rtm_msglen;
1794	ASSERT(cp <= lim);
1795
1796	if ((lim - cp) < sizeof (rtm_ext_t) ||
1797	    ((rtm_ext_t *)cp)->rtmex_type != RTMEX_GATEWAY_SECATTR)
1798		return (0);
1799
1800	if (((rtm_ext_t *)cp)->rtmex_len < sizeof (tsol_rtsecattr_t))
1801		return (EINVAL);
1802
1803	cp += sizeof (rtm_ext_t);
1804
1805	if ((lim - cp) < sizeof (*tp) ||
1806	    (tp = (tsol_rtsecattr_t *)cp, (sacnt = tp->rtsa_cnt) == 0) ||
1807	    (lim - cp) < TSOL_RTSECATTR_SIZE(sacnt))
1808		return (EINVAL);
1809
1810	/*
1811	 * Trying to add route security attributes when system
1812	 * labeling service is not available, or when user supllies
1813	 * more than the maximum number of security attributes
1814	 * allowed per request.
1815	 */
1816	if ((sacnt > 0 && !is_system_labeled()) ||
1817	    sacnt > TSOL_RTSA_REQUEST_MAX)
1818		return (EINVAL);
1819
1820	/* Ensure valid credentials */
1821	if ((err = rtsa_validate(&((tsol_rtsecattr_t *)cp)->
1822	    rtsa_attr[0])) != 0) {
1823		cp += sizeof (*sp);
1824		return (err);
1825	}
1826
1827	bcopy(cp, sp, sizeof (*sp));
1828	cp += sizeof (*sp);
1829	return (0);
1830}
1831
1832int
1833tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc,
1834    tsol_gcgrp_t *gcgrp)
1835{
1836	tsol_ire_gw_secattr_t *attrp;
1837	boolean_t exists = B_FALSE;
1838	in_addr_t ga_addr4;
1839	void *paddr = NULL;
1840
1841	ASSERT(ire != NULL);
1842
1843	/*
1844	 * The only time that attrp can be NULL is when this routine is
1845	 * called for the first time during the creation/initialization
1846	 * of the corresponding IRE.  It will only get cleared when the
1847	 * IRE is deleted.
1848	 */
1849	if ((attrp = ire->ire_gw_secattr) == NULL) {
1850		attrp = ire_gw_secattr_alloc(KM_NOSLEEP);
1851		if (attrp == NULL)
1852			return (ENOMEM);
1853		ire->ire_gw_secattr = attrp;
1854	} else {
1855		exists = B_TRUE;
1856		mutex_enter(&attrp->igsa_lock);
1857
1858		if (attrp->igsa_rhc != NULL) {
1859			TNRHC_RELE(attrp->igsa_rhc);
1860			attrp->igsa_rhc = NULL;
1861		}
1862
1863		if (attrp->igsa_gc != NULL)
1864			GC_REFRELE(attrp->igsa_gc);
1865		if (attrp->igsa_gcgrp != NULL)
1866			GCGRP_REFRELE(attrp->igsa_gcgrp);
1867	}
1868	ASSERT(!exists || MUTEX_HELD(&attrp->igsa_lock));
1869
1870	/*
1871	 * References already held by caller and we keep them;
1872	 * note that both gc and gcgrp may be set to NULL to
1873	 * clear out igsa_gc and igsa_gcgrp, respectively.
1874	 */
1875	attrp->igsa_gc = gc;
1876	attrp->igsa_gcgrp = gcgrp;
1877
1878	if (gcgrp == NULL && gc != NULL) {
1879		gcgrp = gc->gc_grp;
1880		ASSERT(gcgrp != NULL);
1881	}
1882
1883	/*
1884	 * Intialize the template for gateway; we use the gateway's
1885	 * address found in either the passed in gateway credential
1886	 * or group pointer, or the ire_gateway_addr{_v6} field.
1887	 */
1888	if (gcgrp != NULL) {
1889		tsol_gcgrp_addr_t *ga = &gcgrp->gcgrp_addr;
1890
1891		/*
1892		 * Caller is holding a reference, and that we don't
1893		 * need to hold any lock to access the address.
1894		 */
1895		if (ipversion == IPV4_VERSION) {
1896			ASSERT(ga->ga_af == AF_INET);
1897			IN6_V4MAPPED_TO_IPADDR(&ga->ga_addr, ga_addr4);
1898			paddr = &ga_addr4;
1899		} else {
1900			ASSERT(ga->ga_af == AF_INET6);
1901			paddr = &ga->ga_addr;
1902		}
1903	} else if (ipversion == IPV6_VERSION &&
1904	    !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
1905		paddr = &ire->ire_gateway_addr_v6;
1906	} else if (ipversion == IPV4_VERSION &&
1907	    ire->ire_gateway_addr != INADDR_ANY) {
1908		paddr = &ire->ire_gateway_addr;
1909	}
1910
1911	/*
1912	 * Lookup the gateway template; note that we could get an internal
1913	 * template here, which we cache anyway.  During IRE matching, we'll
1914	 * try to update this gateway template cache and hopefully get a
1915	 * real one.
1916	 */
1917	if (paddr != NULL) {
1918		attrp->igsa_rhc = find_rhc(paddr, ipversion, B_FALSE);
1919	}
1920
1921	if (exists)
1922		mutex_exit(&attrp->igsa_lock);
1923
1924	return (0);
1925}
1926
1927/*
1928 * This function figures the type of MLP that we'll be using based on the
1929 * address that the user is binding and the zone.  If the address is
1930 * unspecified, then we're looking at both private and shared.  If it's one
1931 * of the zone's private addresses, then it's private only.  If it's one
1932 * of the global addresses, then it's shared only.
1933 *
1934 * If we can't figure out what it is, then return mlptSingle.  That's actually
1935 * an error case.
1936 *
1937 * The callers are assume to pass in zone->zone_id and not the zoneid that
1938 * is stored in a conn_t (since the latter will be GLOBAL_ZONEID in an
1939 * exclusive stack zone).
1940 */
1941mlp_type_t
1942tsol_mlp_addr_type(zoneid_t zoneid, uchar_t version, const void *addr,
1943    ip_stack_t *ipst)
1944{
1945	in_addr_t in4;
1946	ire_t *ire;
1947	ipif_t *ipif;
1948	zoneid_t addrzone;
1949	zoneid_t ip_zoneid;
1950
1951	ASSERT(addr != NULL);
1952
1953	/*
1954	 * For exclusive stacks we set the zoneid to zero
1955	 * to operate as if in the global zone for IRE and conn_t comparisons.
1956	 */
1957	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1958		ip_zoneid = GLOBAL_ZONEID;
1959	else
1960		ip_zoneid = zoneid;
1961
1962	if (version == IPV6_VERSION &&
1963	    IN6_IS_ADDR_V4MAPPED((const in6_addr_t *)addr)) {
1964		IN6_V4MAPPED_TO_IPADDR((const in6_addr_t *)addr, in4);
1965		addr = &in4;
1966		version = IPV4_VERSION;
1967	}
1968
1969	if (version == IPV4_VERSION) {
1970		in4 = *(const in_addr_t *)addr;
1971		if (in4 == INADDR_ANY) {
1972			return (mlptBoth);
1973		}
1974		ire = ire_cache_lookup(in4, ip_zoneid, NULL, ipst);
1975	} else {
1976		if (IN6_IS_ADDR_UNSPECIFIED((const in6_addr_t *)addr)) {
1977			return (mlptBoth);
1978		}
1979		ire = ire_cache_lookup_v6(addr, ip_zoneid, NULL, ipst);
1980	}
1981	/*
1982	 * If we can't find the IRE, then we have to behave exactly like
1983	 * ip_bind_laddr{,_v6}.  That means looking up the IPIF so that users
1984	 * can bind to addresses on "down" interfaces.
1985	 *
1986	 * If we can't find that either, then the bind is going to fail, so
1987	 * just give up.  Note that there's a miniscule chance that the address
1988	 * is in transition, but we don't bother handling that.
1989	 */
1990	if (ire == NULL) {
1991		if (version == IPV4_VERSION)
1992			ipif = ipif_lookup_addr(*(const in_addr_t *)addr, NULL,
1993			    ip_zoneid, NULL, NULL, NULL, NULL, ipst);
1994		else
1995			ipif = ipif_lookup_addr_v6((const in6_addr_t *)addr,
1996			    NULL, ip_zoneid, NULL, NULL, NULL, NULL, ipst);
1997		if (ipif == NULL) {
1998			return (mlptSingle);
1999		}
2000		addrzone = ipif->ipif_zoneid;
2001		ipif_refrele(ipif);
2002	} else {
2003		addrzone = ire->ire_zoneid;
2004		ire_refrele(ire);
2005	}
2006	return (addrzone == ALL_ZONES ? mlptShared : mlptPrivate);
2007}
2008
2009/*
2010 * Since we are configuring local interfaces, and we know trusted
2011 * extension CDE requires local interfaces to be cipso host type in
2012 * order to function correctly, we'll associate a cipso template
2013 * to each local interface and let the interface come up.  Configuring
2014 * a local interface to be "unlabeled" host type is a configuration error.
2015 * We'll override that error and make the interface host type to be cipso
2016 * here.
2017 *
2018 * The code is optimized for the usual "success" case and unwinds things on
2019 * error.  We don't want to go to the trouble and expense of formatting the
2020 * interface name for the usual case where everything is configured correctly.
2021 */
2022boolean_t
2023tsol_check_interface_address(const ipif_t *ipif)
2024{
2025	tsol_tpc_t *tp;
2026	char addrbuf[INET6_ADDRSTRLEN];
2027	int af;
2028	const void *addr;
2029	zone_t *zone;
2030	ts_label_t *plabel;
2031	const bslabel_t *label;
2032	char ifbuf[LIFNAMSIZ + 10];
2033	const char *ifname;
2034	boolean_t retval;
2035	tsol_rhent_t rhent;
2036	netstack_t *ns = ipif->ipif_ill->ill_ipst->ips_netstack;
2037
2038	if (IN6_IS_ADDR_V4MAPPED(&ipif->ipif_v6lcl_addr)) {
2039		af = AF_INET;
2040		addr = &V4_PART_OF_V6(ipif->ipif_v6lcl_addr);
2041	} else {
2042		af = AF_INET6;
2043		addr = &ipif->ipif_v6lcl_addr;
2044	}
2045
2046	tp = find_tpc(&ipif->ipif_v6lcl_addr, IPV6_VERSION, B_FALSE);
2047
2048	/* assumes that ALL_ZONES implies that there is no exclusive stack */
2049	if (ipif->ipif_zoneid == ALL_ZONES) {
2050		zone = NULL;
2051	} else if (ns->netstack_stackid == GLOBAL_NETSTACKID) {
2052		/* Shared stack case */
2053		zone = zone_find_by_id(ipif->ipif_zoneid);
2054	} else {
2055		/* Exclusive stack case */
2056		zone = zone_find_by_id(crgetzoneid(ipif->ipif_ill->ill_credp));
2057	}
2058	if (zone != NULL) {
2059		plabel = zone->zone_slabel;
2060		ASSERT(plabel != NULL);
2061		label = label2bslabel(plabel);
2062	}
2063
2064	/*
2065	 * If it's CIPSO and an all-zones address, then we're done.
2066	 * If it's a CIPSO zone specific address, the zone's label
2067	 * must be in the range or set specified in the template.
2068	 * When the remote host entry is missing or the template
2069	 * type is incorrect for this interface, we create a
2070	 * CIPSO host entry in kernel and allow the interface to be
2071	 * brought up as CIPSO type.
2072	 */
2073	if (tp != NULL && (
2074	    /* The all-zones case */
2075	    (tp->tpc_tp.host_type == SUN_CIPSO &&
2076	    tp->tpc_tp.tp_doi == default_doi &&
2077	    ipif->ipif_zoneid == ALL_ZONES) ||
2078	    /* The local-zone case */
2079	    (zone != NULL && plabel->tsl_doi == tp->tpc_tp.tp_doi &&
2080	    ((tp->tpc_tp.host_type == SUN_CIPSO &&
2081	    (_blinrange(label, &tp->tpc_tp.tp_sl_range_cipso) ||
2082	    blinlset(label, tp->tpc_tp.tp_sl_set_cipso))))))) {
2083		if (zone != NULL)
2084			zone_rele(zone);
2085		TPC_RELE(tp);
2086		return (B_TRUE);
2087	}
2088
2089	ifname = ipif->ipif_ill->ill_name;
2090	if (ipif->ipif_id != 0) {
2091		(void) snprintf(ifbuf, sizeof (ifbuf), "%s:%u", ifname,
2092		    ipif->ipif_id);
2093		ifname = ifbuf;
2094	}
2095	(void) inet_ntop(af, addr, addrbuf, sizeof (addrbuf));
2096
2097	if (tp == NULL) {
2098		cmn_err(CE_NOTE, "template entry for %s missing. Default to "
2099		    "CIPSO type for %s", ifname, addrbuf);
2100		retval = B_TRUE;
2101	} else if (tp->tpc_tp.host_type == UNLABELED) {
2102		cmn_err(CE_NOTE, "template type for %s incorrectly configured. "
2103		    "Change to CIPSO type for %s", ifname, addrbuf);
2104		retval = B_TRUE;
2105	} else if (ipif->ipif_zoneid == ALL_ZONES) {
2106		if (tp->tpc_tp.host_type != SUN_CIPSO) {
2107			cmn_err(CE_NOTE, "%s failed: %s isn't set to CIPSO for "
2108			    "all-zones. Converted to CIPSO.", ifname, addrbuf);
2109			retval = B_TRUE;
2110		} else {
2111			cmn_err(CE_NOTE, "%s failed: %s has wrong DOI %d "
2112			    "instead of %d", ifname, addrbuf,
2113			    tp->tpc_tp.tp_doi, default_doi);
2114			retval = B_FALSE;
2115		}
2116	} else if (zone == NULL) {
2117		cmn_err(CE_NOTE, "%s failed: zoneid %d unknown",
2118		    ifname, ipif->ipif_zoneid);
2119		retval = B_FALSE;
2120	} else if (plabel->tsl_doi != tp->tpc_tp.tp_doi) {
2121		cmn_err(CE_NOTE, "%s failed: zone %s has DOI %d but %s has "
2122		    "DOI %d", ifname, zone->zone_name, plabel->tsl_doi,
2123		    addrbuf, tp->tpc_tp.tp_doi);
2124		retval = B_FALSE;
2125	} else {
2126		cmn_err(CE_NOTE, "%s failed: zone %s label incompatible with "
2127		    "%s", ifname, zone->zone_name, addrbuf);
2128		tsol_print_label(label, "zone label");
2129		retval = B_FALSE;
2130	}
2131
2132	if (zone != NULL)
2133		zone_rele(zone);
2134	if (tp != NULL)
2135		TPC_RELE(tp);
2136	if (retval) {
2137		/*
2138		 * we've corrected a config error and let the interface
2139		 * come up as cipso. Need to insert an rhent.
2140		 */
2141		if ((rhent.rh_address.ta_family = af) == AF_INET) {
2142			rhent.rh_prefix = 32;
2143			rhent.rh_address.ta_addr_v4 = *(struct in_addr *)addr;
2144		} else {
2145			rhent.rh_prefix = 128;
2146			rhent.rh_address.ta_addr_v6 = *(in6_addr_t *)addr;
2147		}
2148		(void) strcpy(rhent.rh_template, "cipso");
2149		if (tnrh_load(&rhent) != 0) {
2150			cmn_err(CE_NOTE, "%s failed: Cannot insert CIPSO "
2151			    "template for local addr %s", ifname, addrbuf);
2152			retval = B_FALSE;
2153		}
2154	}
2155	return (retval);
2156}
2157