spd.c revision 11042:2d6e217af1b4
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * IPsec Security Policy Database.
28 *
29 * This module maintains the SPD and provides routines used by ip and ip6
30 * to apply IPsec policy to inbound and outbound datagrams.
31 */
32
33#include <sys/types.h>
34#include <sys/stream.h>
35#include <sys/stropts.h>
36#include <sys/sysmacros.h>
37#include <sys/strsubr.h>
38#include <sys/strsun.h>
39#include <sys/strlog.h>
40#include <sys/strsun.h>
41#include <sys/cmn_err.h>
42#include <sys/zone.h>
43
44#include <sys/systm.h>
45#include <sys/param.h>
46#include <sys/kmem.h>
47#include <sys/ddi.h>
48
49#include <sys/crypto/api.h>
50
51#include <inet/common.h>
52#include <inet/mi.h>
53
54#include <netinet/ip6.h>
55#include <netinet/icmp6.h>
56#include <netinet/udp.h>
57
58#include <inet/ip.h>
59#include <inet/ip6.h>
60
61#include <net/pfkeyv2.h>
62#include <net/pfpolicy.h>
63#include <inet/sadb.h>
64#include <inet/ipsec_impl.h>
65
66#include <inet/ip_impl.h>	/* For IP_MOD_ID */
67
68#include <inet/ipsecah.h>
69#include <inet/ipsecesp.h>
70#include <inet/ipdrop.h>
71#include <inet/ipclassifier.h>
72#include <inet/iptun.h>
73#include <inet/iptun/iptun_impl.h>
74
75static void ipsec_update_present_flags(ipsec_stack_t *);
76static ipsec_act_t *ipsec_act_wildcard_expand(ipsec_act_t *, uint_t *,
77    netstack_t *);
78static mblk_t *ipsec_check_ipsecin_policy(mblk_t *, ipsec_policy_t *,
79    ipha_t *, ip6_t *, uint64_t, ip_recv_attr_t *, netstack_t *);
80static void ipsec_action_free_table(ipsec_action_t *);
81static void ipsec_action_reclaim(void *);
82static void ipsec_action_reclaim_stack(netstack_t *);
83static void ipsid_init(netstack_t *);
84static void ipsid_fini(netstack_t *);
85
86/* sel_flags values for ipsec_init_inbound_sel(). */
87#define	SEL_NONE	0x0000
88#define	SEL_PORT_POLICY	0x0001
89#define	SEL_IS_ICMP	0x0002
90#define	SEL_TUNNEL_MODE	0x0004
91#define	SEL_POST_FRAG	0x0008
92
93/* Return values for ipsec_init_inbound_sel(). */
94typedef enum { SELRET_NOMEM, SELRET_BADPKT, SELRET_SUCCESS, SELRET_TUNFRAG}
95    selret_t;
96
97static selret_t ipsec_init_inbound_sel(ipsec_selector_t *, mblk_t *,
98    ipha_t *, ip6_t *, uint8_t);
99
100static boolean_t ipsec_check_ipsecin_action(ip_recv_attr_t *, mblk_t *,
101    struct ipsec_action_s *, ipha_t *ipha, ip6_t *ip6h, const char **,
102    kstat_named_t **, netstack_t *);
103static void ipsec_unregister_prov_update(void);
104static void ipsec_prov_update_callback_stack(uint32_t, void *, netstack_t *);
105static boolean_t ipsec_compare_action(ipsec_policy_t *, ipsec_policy_t *);
106static uint32_t selector_hash(ipsec_selector_t *, ipsec_policy_root_t *);
107static boolean_t ipsec_kstat_init(ipsec_stack_t *);
108static void ipsec_kstat_destroy(ipsec_stack_t *);
109static int ipsec_free_tables(ipsec_stack_t *);
110static int tunnel_compare(const void *, const void *);
111static void ipsec_freemsg_chain(mblk_t *);
112static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *,
113    struct kstat_named *, ipdropper_t *);
114static boolean_t ipsec_kstat_init(ipsec_stack_t *);
115static void ipsec_kstat_destroy(ipsec_stack_t *);
116static int ipsec_free_tables(ipsec_stack_t *);
117static int tunnel_compare(const void *, const void *);
118static void ipsec_freemsg_chain(mblk_t *);
119
/*
 * The selector hash table is statically sized at module load time.
 * We default to 251 buckets, which is the largest prime number under 255.
 */

#define	IPSEC_SPDHASH_DEFAULT 251

/* SPD hash-size tunable per tunnel. */
#define	TUN_SPDHASH_DEFAULT 5

/*
 * Tunable overrides for the two defaults above.  ipsec_stack_init() uses
 * the matching *_DEFAULT value whenever one of these is left at zero.
 */
uint32_t ipsec_spd_hashsize;
uint32_t tun_spd_hashsize;

/* All-ones marker value; presumably "selector has no hash bucket". */
#define	IPSEC_SEL_NOHASH ((uint32_t)(~0))

/*
 * Handle global across all stack instances
 */
static crypto_notify_handle_t prov_update_handle = NULL;

/*
 * kmem caches shared by every stack instance; created in
 * ipsec_policy_g_init() and destroyed in ipsec_policy_g_destroy().
 */
static kmem_cache_t *ipsec_action_cache;
static kmem_cache_t *ipsec_sel_cache;
static kmem_cache_t *ipsec_pol_cache;

/* Frag cache prototypes */
static void ipsec_fragcache_clean(ipsec_fragcache_t *, ipsec_stack_t *);
static ipsec_fragcache_entry_t *fragcache_delentry(int,
    ipsec_fragcache_entry_t *, ipsec_fragcache_t *, ipsec_stack_t *);
boolean_t ipsec_fragcache_init(ipsec_fragcache_t *);
void ipsec_fragcache_uninit(ipsec_fragcache_t *, ipsec_stack_t *ipss);
mblk_t *ipsec_fragcache_add(ipsec_fragcache_t *, mblk_t *, mblk_t *,
    int, ipsec_stack_t *);

/* NOTE(review): purpose of these two globals is not visible in this chunk. */
int ipsec_hdr_pullup_needed = 0;
int ipsec_weird_null_inbound_policy = 0;
155
/*
 * Round a bit count to a multiple of "align" using integer division:
 * ALGBITS_ROUND_DOWN truncates toward the lower multiple, ALGBITS_ROUND_UP
 * moves to the next multiple unless "x" already is one.
 */
#define	ALGBITS_ROUND_DOWN(x, align)	((((x) / (align))) * (align))
#define	ALGBITS_ROUND_UP(x, align)	\
	ALGBITS_ROUND_DOWN((x) + (align) - 1, align)
158
/*
 * Inbound traffic should have matching identities for both SA's.
 *
 * True when either SA pointer is NULL, or when both the source and the
 * destination certificate ids of the two SAs are equal.
 */

#define	SA_IDS_MATCH(sa1, sa2)						\
	((sa1) == NULL || (sa2) == NULL ||				\
	((sa1)->ipsa_src_cid == (sa2)->ipsa_src_cid &&			\
	(sa1)->ipsa_dst_cid == (sa2)->ipsa_dst_cid))
167
/*
 * IPv6 Fragments
 *
 * Evaluates to nonzero when the IPPF_FRAGHDR bit is set in ipp_fields.
 * "ipp" is a structure, not a pointer; the argument is parenthesized so
 * that any expression yielding such a structure (e.g. *ptr) is accepted.
 */
#define	IS_V6_FRAGMENT(ipp)	((ipp).ipp_fields & IPPF_FRAGHDR)
172
/*
 * Policy failure messages.
 *
 * Format strings handed to ipsec_rl_strlog() by ipsec_log_policy_failure(),
 * indexed by that function's "type" argument.  Every entry consumes four
 * string arguments, in order: the calling function's name, "secure" or
 * "not secure", the source address and the destination address.
 */
static char *ipsec_policy_failure_msgs[] = {

	/* IPSEC_POLICY_NOT_NEEDED */
	"%s: Dropping the datagram because the incoming packet "
	"is %s, but the recipient expects clear; Source %s, "
	"Destination %s.\n",

	/* IPSEC_POLICY_MISMATCH */
	"%s: Policy Failure for the incoming packet (%s); Source %s, "
	"Destination %s.\n",

	/* IPSEC_POLICY_AUTH_NOT_NEEDED	*/
	"%s: Authentication present while not expected in the "
	"incoming %s packet; Source %s, Destination %s.\n",

	/* IPSEC_POLICY_ENCR_NOT_NEEDED */
	"%s: Encryption present while not expected in the "
	"incoming %s packet; Source %s, Destination %s.\n",

	/* IPSEC_POLICY_SE_NOT_NEEDED */
	"%s: Self-Encapsulation present while not expected in the "
	"incoming %s packet; Source %s, Destination %s.\n",
};
199
/*
 * General overview:
 *
 * Locking:
 *
 *	All of the system policy structures are protected by a single
 *	rwlock.  These structures are threaded in a
 *	fairly complex fashion and are not expected to change on a
 *	regular basis, so this should not cause scaling/contention
 *	problems.  As a result, policy checks should (hopefully) be MT-hot.
 *
 * Allocation policy:
 *
 *	We use custom kmem cache types for the various
 *	bits & pieces of the policy data structures.  All policy
 *	allocations use KM_NOSLEEP instead of KM_SLEEP.  The
 *	policy table is of potentially unbounded size, so we don't
 *	want to provide a way to hog all system memory with policy
 *	entries.
 */
220
/* Convenience functions for freeing or dropping a b_next-linked mblk chain */
222
223/* Free all messages in an mblk chain */
224static void
225ipsec_freemsg_chain(mblk_t *mp)
226{
227	mblk_t *mpnext;
228	while (mp != NULL) {
229		ASSERT(mp->b_prev == NULL);
230		mpnext = mp->b_next;
231		mp->b_next = NULL;
232		freemsg(mp);
233		mp = mpnext;
234	}
235}
236
237/*
238 * ip_drop all messages in an mblk chain
239 * Can handle a b_next chain of ip_recv_attr_t mblks, or just a b_next chain
240 * of data.
241 */
242static void
243ip_drop_packet_chain(mblk_t *mp, boolean_t inbound, ill_t *ill,
244    struct kstat_named *counter, ipdropper_t *who_called)
245{
246	mblk_t *mpnext;
247	while (mp != NULL) {
248		ASSERT(mp->b_prev == NULL);
249		mpnext = mp->b_next;
250		mp->b_next = NULL;
251		if (ip_recv_attr_is_mblk(mp))
252			mp = ip_recv_attr_free_mblk(mp);
253		ip_drop_packet(mp, inbound, ill, counter, who_called);
254		mp = mpnext;
255	}
256}
257
258/*
259 * AVL tree comparison function.
260 * the in-kernel avl assumes unique keys for all objects.
261 * Since sometimes policy will duplicate rules, we may insert
262 * multiple rules with the same rule id, so we need a tie-breaker.
263 */
264static int
265ipsec_policy_cmpbyid(const void *a, const void *b)
266{
267	const ipsec_policy_t *ipa, *ipb;
268	uint64_t idxa, idxb;
269
270	ipa = (const ipsec_policy_t *)a;
271	ipb = (const ipsec_policy_t *)b;
272	idxa = ipa->ipsp_index;
273	idxb = ipb->ipsp_index;
274
275	if (idxa < idxb)
276		return (-1);
277	if (idxa > idxb)
278		return (1);
279	/*
280	 * Tie-breaker #1: All installed policy rules have a non-NULL
281	 * ipsl_sel (selector set), so an entry with a NULL ipsp_sel is not
282	 * actually in-tree but rather a template node being used in
283	 * an avl_find query; see ipsec_policy_delete().  This gives us
284	 * a placeholder in the ordering just before the first entry with
285	 * a key >= the one we're looking for, so we can walk forward from
286	 * that point to get the remaining entries with the same id.
287	 */
288	if ((ipa->ipsp_sel == NULL) && (ipb->ipsp_sel != NULL))
289		return (-1);
290	if ((ipb->ipsp_sel == NULL) && (ipa->ipsp_sel != NULL))
291		return (1);
292	/*
293	 * At most one of the arguments to the comparison should have a
294	 * NULL selector pointer; if not, the tree is broken.
295	 */
296	ASSERT(ipa->ipsp_sel != NULL);
297	ASSERT(ipb->ipsp_sel != NULL);
298	/*
299	 * Tie-breaker #2: use the virtual address of the policy node
300	 * to arbitrarily break ties.  Since we use the new tree node in
301	 * the avl_find() in ipsec_insert_always, the new node will be
302	 * inserted into the tree in the right place in the sequence.
303	 */
304	if (ipa < ipb)
305		return (-1);
306	if (ipa > ipb)
307		return (1);
308	return (0);
309}
310
311/*
312 * Free what ipsec_alloc_table allocated.
313 */
314void
315ipsec_polhead_free_table(ipsec_policy_head_t *iph)
316{
317	int dir;
318	int i;
319
320	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
321		ipsec_policy_root_t *ipr = &iph->iph_root[dir];
322
323		if (ipr->ipr_hash == NULL)
324			continue;
325
326		for (i = 0; i < ipr->ipr_nchains; i++) {
327			ASSERT(ipr->ipr_hash[i].hash_head == NULL);
328		}
329		kmem_free(ipr->ipr_hash, ipr->ipr_nchains *
330		    sizeof (ipsec_policy_hash_t));
331		ipr->ipr_hash = NULL;
332	}
333}
334
335void
336ipsec_polhead_destroy(ipsec_policy_head_t *iph)
337{
338	int dir;
339
340	avl_destroy(&iph->iph_rulebyid);
341	rw_destroy(&iph->iph_lock);
342
343	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
344		ipsec_policy_root_t *ipr = &iph->iph_root[dir];
345		int chain;
346
347		for (chain = 0; chain < ipr->ipr_nchains; chain++)
348			mutex_destroy(&(ipr->ipr_hash[chain].hash_lock));
349
350	}
351	ipsec_polhead_free_table(iph);
352}
353
354/*
355 * Free the IPsec stack instance.
356 */
357/* ARGSUSED */
358static void
359ipsec_stack_fini(netstackid_t stackid, void *arg)
360{
361	ipsec_stack_t	*ipss = (ipsec_stack_t *)arg;
362	void *cookie;
363	ipsec_tun_pol_t *node;
364	netstack_t	*ns = ipss->ipsec_netstack;
365	int		i;
366	ipsec_algtype_t	algtype;
367
368	ipsec_loader_destroy(ipss);
369
370	rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_WRITER);
371	/*
372	 * It's possible we can just ASSERT() the tree is empty.  After all,
373	 * we aren't called until IP is ready to unload (and presumably all
374	 * tunnels have been unplumbed).  But we'll play it safe for now, the
375	 * loop will just exit immediately if it's empty.
376	 */
377	cookie = NULL;
378	while ((node = (ipsec_tun_pol_t *)
379	    avl_destroy_nodes(&ipss->ipsec_tunnel_policies,
380	    &cookie)) != NULL) {
381		ITP_REFRELE(node, ns);
382	}
383	avl_destroy(&ipss->ipsec_tunnel_policies);
384	rw_exit(&ipss->ipsec_tunnel_policy_lock);
385	rw_destroy(&ipss->ipsec_tunnel_policy_lock);
386
387	ipsec_config_flush(ns);
388
389	ipsec_kstat_destroy(ipss);
390
391	ip_drop_unregister(&ipss->ipsec_dropper);
392
393	ip_drop_unregister(&ipss->ipsec_spd_dropper);
394	ip_drop_destroy(ipss);
395	/*
396	 * Globals start with ref == 1 to prevent IPPH_REFRELE() from
397	 * attempting to free them, hence they should have 1 now.
398	 */
399	ipsec_polhead_destroy(&ipss->ipsec_system_policy);
400	ASSERT(ipss->ipsec_system_policy.iph_refs == 1);
401	ipsec_polhead_destroy(&ipss->ipsec_inactive_policy);
402	ASSERT(ipss->ipsec_inactive_policy.iph_refs == 1);
403
404	for (i = 0; i < IPSEC_ACTION_HASH_SIZE; i++) {
405		ipsec_action_free_table(ipss->ipsec_action_hash[i].hash_head);
406		ipss->ipsec_action_hash[i].hash_head = NULL;
407		mutex_destroy(&(ipss->ipsec_action_hash[i].hash_lock));
408	}
409
410	for (i = 0; i < ipss->ipsec_spd_hashsize; i++) {
411		ASSERT(ipss->ipsec_sel_hash[i].hash_head == NULL);
412		mutex_destroy(&(ipss->ipsec_sel_hash[i].hash_lock));
413	}
414
415	mutex_enter(&ipss->ipsec_alg_lock);
416	for (algtype = 0; algtype < IPSEC_NALGTYPES; algtype ++) {
417		int nalgs = ipss->ipsec_nalgs[algtype];
418
419		for (i = 0; i < nalgs; i++) {
420			if (ipss->ipsec_alglists[algtype][i] != NULL)
421				ipsec_alg_unreg(algtype, i, ns);
422		}
423	}
424	mutex_exit(&ipss->ipsec_alg_lock);
425	mutex_destroy(&ipss->ipsec_alg_lock);
426
427	ipsid_gc(ns);
428	ipsid_fini(ns);
429
430	(void) ipsec_free_tables(ipss);
431	kmem_free(ipss, sizeof (*ipss));
432}
433
/*
 * Global teardown, the counterpart of ipsec_policy_g_init(): destroys the
 * three kmem caches, calls ipsec_unregister_prov_update() and finally
 * unregisters NS_IPSEC from the netstack framework.
 *
 * NOTE(review): the caches are destroyed before netstack_unregister() is
 * called.  If that call runs ipsec_stack_fini() for stacks that still hold
 * objects from these caches, those objects would be freed into destroyed
 * caches -- confirm the intended ordering against the netstack framework.
 */
void
ipsec_policy_g_destroy(void)
{
	kmem_cache_destroy(ipsec_action_cache);
	kmem_cache_destroy(ipsec_sel_cache);
	kmem_cache_destroy(ipsec_pol_cache);

	ipsec_unregister_prov_update();

	netstack_unregister(NS_IPSEC);
}
445
446
447/*
448 * Free what ipsec_alloc_tables allocated.
449 * Called when table allocation fails to free the table.
450 */
451static int
452ipsec_free_tables(ipsec_stack_t *ipss)
453{
454	int i;
455
456	if (ipss->ipsec_sel_hash != NULL) {
457		for (i = 0; i < ipss->ipsec_spd_hashsize; i++) {
458			ASSERT(ipss->ipsec_sel_hash[i].hash_head == NULL);
459		}
460		kmem_free(ipss->ipsec_sel_hash, ipss->ipsec_spd_hashsize *
461		    sizeof (*ipss->ipsec_sel_hash));
462		ipss->ipsec_sel_hash = NULL;
463		ipss->ipsec_spd_hashsize = 0;
464	}
465	ipsec_polhead_free_table(&ipss->ipsec_system_policy);
466	ipsec_polhead_free_table(&ipss->ipsec_inactive_policy);
467
468	return (ENOMEM);
469}
470
471/*
472 * Attempt to allocate the tables in a single policy head.
473 * Return nonzero on failure after cleaning up any work in progress.
474 */
475int
476ipsec_alloc_table(ipsec_policy_head_t *iph, int nchains, int kmflag,
477    boolean_t global_cleanup, netstack_t *ns)
478{
479	int dir;
480
481	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
482		ipsec_policy_root_t *ipr = &iph->iph_root[dir];
483
484		ipr->ipr_nchains = nchains;
485		ipr->ipr_hash = kmem_zalloc(nchains *
486		    sizeof (ipsec_policy_hash_t), kmflag);
487		if (ipr->ipr_hash == NULL)
488			return (global_cleanup ?
489			    ipsec_free_tables(ns->netstack_ipsec) :
490			    ENOMEM);
491	}
492	return (0);
493}
494
495/*
496 * Attempt to allocate the various tables.  Return nonzero on failure
497 * after cleaning up any work in progress.
498 */
499static int
500ipsec_alloc_tables(int kmflag, netstack_t *ns)
501{
502	int error;
503	ipsec_stack_t	*ipss = ns->netstack_ipsec;
504
505	error = ipsec_alloc_table(&ipss->ipsec_system_policy,
506	    ipss->ipsec_spd_hashsize, kmflag, B_TRUE, ns);
507	if (error != 0)
508		return (error);
509
510	error = ipsec_alloc_table(&ipss->ipsec_inactive_policy,
511	    ipss->ipsec_spd_hashsize, kmflag, B_TRUE, ns);
512	if (error != 0)
513		return (error);
514
515	ipss->ipsec_sel_hash = kmem_zalloc(ipss->ipsec_spd_hashsize *
516	    sizeof (*ipss->ipsec_sel_hash), kmflag);
517
518	if (ipss->ipsec_sel_hash == NULL)
519		return (ipsec_free_tables(ipss));
520
521	return (0);
522}
523
524/*
525 * After table allocation, initialize a policy head.
526 */
527void
528ipsec_polhead_init(ipsec_policy_head_t *iph, int nchains)
529{
530	int dir, chain;
531
532	rw_init(&iph->iph_lock, NULL, RW_DEFAULT, NULL);
533	avl_create(&iph->iph_rulebyid, ipsec_policy_cmpbyid,
534	    sizeof (ipsec_policy_t), offsetof(ipsec_policy_t, ipsp_byid));
535
536	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
537		ipsec_policy_root_t *ipr = &iph->iph_root[dir];
538		ipr->ipr_nchains = nchains;
539
540		for (chain = 0; chain < nchains; chain++) {
541			mutex_init(&(ipr->ipr_hash[chain].hash_lock),
542			    NULL, MUTEX_DEFAULT, NULL);
543		}
544	}
545}
546
547static boolean_t
548ipsec_kstat_init(ipsec_stack_t *ipss)
549{
550	ipss->ipsec_ksp = kstat_create_netstack("ip", 0, "ipsec_stat", "net",
551	    KSTAT_TYPE_NAMED, sizeof (ipsec_kstats_t) / sizeof (kstat_named_t),
552	    KSTAT_FLAG_PERSISTENT, ipss->ipsec_netstack->netstack_stackid);
553
554	if (ipss->ipsec_ksp == NULL || ipss->ipsec_ksp->ks_data == NULL)
555		return (B_FALSE);
556
557	ipss->ipsec_kstats = ipss->ipsec_ksp->ks_data;
558
559#define	KI(x) kstat_named_init(&ipss->ipsec_kstats->x, #x, KSTAT_DATA_UINT64)
560	KI(esp_stat_in_requests);
561	KI(esp_stat_in_discards);
562	KI(esp_stat_lookup_failure);
563	KI(ah_stat_in_requests);
564	KI(ah_stat_in_discards);
565	KI(ah_stat_lookup_failure);
566	KI(sadb_acquire_maxpackets);
567	KI(sadb_acquire_qhiwater);
568#undef KI
569
570	kstat_install(ipss->ipsec_ksp);
571	return (B_TRUE);
572}
573
574static void
575ipsec_kstat_destroy(ipsec_stack_t *ipss)
576{
577	kstat_delete_netstack(ipss->ipsec_ksp,
578	    ipss->ipsec_netstack->netstack_stackid);
579	ipss->ipsec_kstats = NULL;
580
581}
582
583/*
584 * Initialize the IPsec stack instance.
585 */
586/* ARGSUSED */
587static void *
588ipsec_stack_init(netstackid_t stackid, netstack_t *ns)
589{
590	ipsec_stack_t	*ipss;
591	int i;
592
593	ipss = (ipsec_stack_t *)kmem_zalloc(sizeof (*ipss), KM_SLEEP);
594	ipss->ipsec_netstack = ns;
595
596	/*
597	 * FIXME: netstack_ipsec is used by some of the routines we call
598	 * below, but it isn't set until this routine returns.
599	 * Either we introduce optional xxx_stack_alloc() functions
600	 * that will be called by the netstack framework before xxx_stack_init,
601	 * or we switch spd.c and sadb.c to operate on ipsec_stack_t
602	 * (latter has some include file order issues for sadb.h, but makes
603	 * sense if we merge some of the ipsec related stack_t's together.
604	 */
605	ns->netstack_ipsec = ipss;
606
607	/*
608	 * Make two attempts to allocate policy hash tables; try it at
609	 * the "preferred" size (may be set in /etc/system) first,
610	 * then fall back to the default size.
611	 */
612	ipss->ipsec_spd_hashsize = (ipsec_spd_hashsize == 0) ?
613	    IPSEC_SPDHASH_DEFAULT : ipsec_spd_hashsize;
614
615	if (ipsec_alloc_tables(KM_NOSLEEP, ns) != 0) {
616		cmn_err(CE_WARN,
617		    "Unable to allocate %d entry IPsec policy hash table",
618		    ipss->ipsec_spd_hashsize);
619		ipss->ipsec_spd_hashsize = IPSEC_SPDHASH_DEFAULT;
620		cmn_err(CE_WARN, "Falling back to %d entries",
621		    ipss->ipsec_spd_hashsize);
622		(void) ipsec_alloc_tables(KM_SLEEP, ns);
623	}
624
625	/* Just set a default for tunnels. */
626	ipss->ipsec_tun_spd_hashsize = (tun_spd_hashsize == 0) ?
627	    TUN_SPDHASH_DEFAULT : tun_spd_hashsize;
628
629	ipsid_init(ns);
630	/*
631	 * Globals need ref == 1 to prevent IPPH_REFRELE() from attempting
632	 * to free them.
633	 */
634	ipss->ipsec_system_policy.iph_refs = 1;
635	ipss->ipsec_inactive_policy.iph_refs = 1;
636	ipsec_polhead_init(&ipss->ipsec_system_policy,
637	    ipss->ipsec_spd_hashsize);
638	ipsec_polhead_init(&ipss->ipsec_inactive_policy,
639	    ipss->ipsec_spd_hashsize);
640	rw_init(&ipss->ipsec_tunnel_policy_lock, NULL, RW_DEFAULT, NULL);
641	avl_create(&ipss->ipsec_tunnel_policies, tunnel_compare,
642	    sizeof (ipsec_tun_pol_t), 0);
643
644	ipss->ipsec_next_policy_index = 1;
645
646	rw_init(&ipss->ipsec_system_policy.iph_lock, NULL, RW_DEFAULT, NULL);
647	rw_init(&ipss->ipsec_inactive_policy.iph_lock, NULL, RW_DEFAULT, NULL);
648
649	for (i = 0; i < IPSEC_ACTION_HASH_SIZE; i++)
650		mutex_init(&(ipss->ipsec_action_hash[i].hash_lock),
651		    NULL, MUTEX_DEFAULT, NULL);
652
653	for (i = 0; i < ipss->ipsec_spd_hashsize; i++)
654		mutex_init(&(ipss->ipsec_sel_hash[i].hash_lock),
655		    NULL, MUTEX_DEFAULT, NULL);
656
657	mutex_init(&ipss->ipsec_alg_lock, NULL, MUTEX_DEFAULT, NULL);
658	for (i = 0; i < IPSEC_NALGTYPES; i++) {
659		ipss->ipsec_nalgs[i] = 0;
660	}
661
662	ip_drop_init(ipss);
663	ip_drop_register(&ipss->ipsec_spd_dropper, "IPsec SPD");
664
665	/* IP's IPsec code calls the packet dropper */
666	ip_drop_register(&ipss->ipsec_dropper, "IP IPsec processing");
667
668	(void) ipsec_kstat_init(ipss);
669
670	ipsec_loader_init(ipss);
671	ipsec_loader_start(ipss);
672
673	return (ipss);
674}
675
676/* Global across all stack instances */
677void
678ipsec_policy_g_init(void)
679{
680	ipsec_action_cache = kmem_cache_create("ipsec_actions",
681	    sizeof (ipsec_action_t), _POINTER_ALIGNMENT, NULL, NULL,
682	    ipsec_action_reclaim, NULL, NULL, 0);
683	ipsec_sel_cache = kmem_cache_create("ipsec_selectors",
684	    sizeof (ipsec_sel_t), _POINTER_ALIGNMENT, NULL, NULL,
685	    NULL, NULL, NULL, 0);
686	ipsec_pol_cache = kmem_cache_create("ipsec_policy",
687	    sizeof (ipsec_policy_t), _POINTER_ALIGNMENT, NULL, NULL,
688	    NULL, NULL, NULL, 0);
689
690	/*
691	 * We want to be informed each time a stack is created or
692	 * destroyed in the kernel, so we can maintain the
693	 * set of ipsec_stack_t's.
694	 */
695	netstack_register(NS_IPSEC, ipsec_stack_init, NULL, ipsec_stack_fini);
696}
697
698/*
699 * Sort algorithm lists.
700 *
701 * I may need to split this based on
702 * authentication/encryption, and I may wish to have an administrator
703 * configure this list.  Hold on to some NDD variables...
704 *
705 * XXX For now, sort on minimum key size (GAG!).  While minimum key size is
706 * not the ideal metric, it's the only quantifiable measure available.
707 * We need a better metric for sorting algorithms by preference.
708 */
709static void
710alg_insert_sortlist(enum ipsec_algtype at, uint8_t algid, netstack_t *ns)
711{
712	ipsec_stack_t	*ipss = ns->netstack_ipsec;
713	ipsec_alginfo_t *ai = ipss->ipsec_alglists[at][algid];
714	uint8_t holder, swap;
715	uint_t i;
716	uint_t count = ipss->ipsec_nalgs[at];
717	ASSERT(ai != NULL);
718	ASSERT(algid == ai->alg_id);
719
720	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
721
722	holder = algid;
723
724	for (i = 0; i < count - 1; i++) {
725		ipsec_alginfo_t *alt;
726
727		alt = ipss->ipsec_alglists[at][ipss->ipsec_sortlist[at][i]];
728		/*
729		 * If you want to give precedence to newly added algs,
730		 * add the = in the > comparison.
731		 */
732		if ((holder != algid) || (ai->alg_minbits > alt->alg_minbits)) {
733			/* Swap sortlist[i] and holder. */
734			swap = ipss->ipsec_sortlist[at][i];
735			ipss->ipsec_sortlist[at][i] = holder;
736			holder = swap;
737			ai = alt;
738		} /* Else just continue. */
739	}
740
741	/* Store holder in last slot. */
742	ipss->ipsec_sortlist[at][i] = holder;
743}
744
745/*
746 * Remove an algorithm from a sorted algorithm list.
747 * This should be considerably easier, even with complex sorting.
748 */
749static void
750alg_remove_sortlist(enum ipsec_algtype at, uint8_t algid, netstack_t *ns)
751{
752	boolean_t copyback = B_FALSE;
753	int i;
754	ipsec_stack_t	*ipss = ns->netstack_ipsec;
755	int newcount = ipss->ipsec_nalgs[at];
756
757	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
758
759	for (i = 0; i <= newcount; i++) {
760		if (copyback) {
761			ipss->ipsec_sortlist[at][i-1] =
762			    ipss->ipsec_sortlist[at][i];
763		} else if (ipss->ipsec_sortlist[at][i] == algid) {
764			copyback = B_TRUE;
765		}
766	}
767}
768
769/*
770 * Add the specified algorithm to the algorithm tables.
771 * Must be called while holding the algorithm table writer lock.
772 */
773void
774ipsec_alg_reg(ipsec_algtype_t algtype, ipsec_alginfo_t *alg, netstack_t *ns)
775{
776	ipsec_stack_t	*ipss = ns->netstack_ipsec;
777
778	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
779
780	ASSERT(ipss->ipsec_alglists[algtype][alg->alg_id] == NULL);
781	ipsec_alg_fix_min_max(alg, algtype, ns);
782	ipss->ipsec_alglists[algtype][alg->alg_id] = alg;
783
784	ipss->ipsec_nalgs[algtype]++;
785	alg_insert_sortlist(algtype, alg->alg_id, ns);
786}
787
788/*
789 * Remove the specified algorithm from the algorithm tables.
790 * Must be called while holding the algorithm table writer lock.
791 */
792void
793ipsec_alg_unreg(ipsec_algtype_t algtype, uint8_t algid, netstack_t *ns)
794{
795	ipsec_stack_t	*ipss = ns->netstack_ipsec;
796
797	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
798
799	ASSERT(ipss->ipsec_alglists[algtype][algid] != NULL);
800	ipsec_alg_free(ipss->ipsec_alglists[algtype][algid]);
801	ipss->ipsec_alglists[algtype][algid] = NULL;
802
803	ipss->ipsec_nalgs[algtype]--;
804	alg_remove_sortlist(algtype, algid, ns);
805}
806
807/*
808 * Hooks for spdsock to get a grip on system policy.
809 */
810
811ipsec_policy_head_t *
812ipsec_system_policy(netstack_t *ns)
813{
814	ipsec_stack_t	*ipss = ns->netstack_ipsec;
815	ipsec_policy_head_t *h = &ipss->ipsec_system_policy;
816
817	IPPH_REFHOLD(h);
818	return (h);
819}
820
821ipsec_policy_head_t *
822ipsec_inactive_policy(netstack_t *ns)
823{
824	ipsec_stack_t	*ipss = ns->netstack_ipsec;
825	ipsec_policy_head_t *h = &ipss->ipsec_inactive_policy;
826
827	IPPH_REFHOLD(h);
828	return (h);
829}
830
831/*
832 * Lock inactive policy, then active policy, then exchange policy root
833 * pointers.
834 */
835void
836ipsec_swap_policy(ipsec_policy_head_t *active, ipsec_policy_head_t *inactive,
837    netstack_t *ns)
838{
839	int af, dir;
840	avl_tree_t r1, r2;
841
842	rw_enter(&inactive->iph_lock, RW_WRITER);
843	rw_enter(&active->iph_lock, RW_WRITER);
844
845	r1 = active->iph_rulebyid;
846	r2 = inactive->iph_rulebyid;
847	active->iph_rulebyid = r2;
848	inactive->iph_rulebyid = r1;
849
850	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
851		ipsec_policy_hash_t *h1, *h2;
852
853		h1 = active->iph_root[dir].ipr_hash;
854		h2 = inactive->iph_root[dir].ipr_hash;
855		active->iph_root[dir].ipr_hash = h2;
856		inactive->iph_root[dir].ipr_hash = h1;
857
858		for (af = 0; af < IPSEC_NAF; af++) {
859			ipsec_policy_t *t1, *t2;
860
861			t1 = active->iph_root[dir].ipr_nonhash[af];
862			t2 = inactive->iph_root[dir].ipr_nonhash[af];
863			active->iph_root[dir].ipr_nonhash[af] = t2;
864			inactive->iph_root[dir].ipr_nonhash[af] = t1;
865			if (t1 != NULL) {
866				t1->ipsp_hash.hash_pp =
867				    &(inactive->iph_root[dir].ipr_nonhash[af]);
868			}
869			if (t2 != NULL) {
870				t2->ipsp_hash.hash_pp =
871				    &(active->iph_root[dir].ipr_nonhash[af]);
872			}
873
874		}
875	}
876	active->iph_gen++;
877	inactive->iph_gen++;
878	ipsec_update_present_flags(ns->netstack_ipsec);
879	rw_exit(&active->iph_lock);
880	rw_exit(&inactive->iph_lock);
881}
882
883/*
884 * Swap global policy primary/secondary.
885 */
886void
887ipsec_swap_global_policy(netstack_t *ns)
888{
889	ipsec_stack_t	*ipss = ns->netstack_ipsec;
890
891	ipsec_swap_policy(&ipss->ipsec_system_policy,
892	    &ipss->ipsec_inactive_policy, ns);
893}
894
895/*
896 * Clone one policy rule..
897 */
898static ipsec_policy_t *
899ipsec_copy_policy(const ipsec_policy_t *src)
900{
901	ipsec_policy_t *dst = kmem_cache_alloc(ipsec_pol_cache, KM_NOSLEEP);
902
903	if (dst == NULL)
904		return (NULL);
905
906	/*
907	 * Adjust refcounts of cloned state.
908	 */
909	IPACT_REFHOLD(src->ipsp_act);
910	src->ipsp_sel->ipsl_refs++;
911
912	HASH_NULL(dst, ipsp_hash);
913	dst->ipsp_netstack = src->ipsp_netstack;
914	dst->ipsp_refs = 1;
915	dst->ipsp_sel = src->ipsp_sel;
916	dst->ipsp_act = src->ipsp_act;
917	dst->ipsp_prio = src->ipsp_prio;
918	dst->ipsp_index = src->ipsp_index;
919
920	return (dst);
921}
922
923void
924ipsec_insert_always(avl_tree_t *tree, void *new_node)
925{
926	void *node;
927	avl_index_t where;
928
929	node = avl_find(tree, new_node, &where);
930	ASSERT(node == NULL);
931	avl_insert(tree, new_node, where);
932}
933
934
935static int
936ipsec_copy_chain(ipsec_policy_head_t *dph, ipsec_policy_t *src,
937    ipsec_policy_t **dstp)
938{
939	for (; src != NULL; src = src->ipsp_hash.hash_next) {
940		ipsec_policy_t *dst = ipsec_copy_policy(src);
941		if (dst == NULL)
942			return (ENOMEM);
943
944		HASHLIST_INSERT(dst, ipsp_hash, *dstp);
945		ipsec_insert_always(&dph->iph_rulebyid, dst);
946	}
947	return (0);
948}
949
950
951
952/*
953 * Make one policy head look exactly like another.
954 *
955 * As with ipsec_swap_policy, we lock the destination policy head first, then
956 * the source policy head. Note that we only need to read-lock the source
957 * policy head as we are not changing it.
958 */
959int
960ipsec_copy_polhead(ipsec_policy_head_t *sph, ipsec_policy_head_t *dph,
961    netstack_t *ns)
962{
963	int af, dir, chain, nchains;
964
965	rw_enter(&dph->iph_lock, RW_WRITER);
966
967	ipsec_polhead_flush(dph, ns);
968
969	rw_enter(&sph->iph_lock, RW_READER);
970
971	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
972		ipsec_policy_root_t *dpr = &dph->iph_root[dir];
973		ipsec_policy_root_t *spr = &sph->iph_root[dir];
974		nchains = dpr->ipr_nchains;
975
976		ASSERT(dpr->ipr_nchains == spr->ipr_nchains);
977
978		for (af = 0; af < IPSEC_NAF; af++) {
979			if (ipsec_copy_chain(dph, spr->ipr_nonhash[af],
980			    &dpr->ipr_nonhash[af]))
981				goto abort_copy;
982		}
983
984		for (chain = 0; chain < nchains; chain++) {
985			if (ipsec_copy_chain(dph,
986			    spr->ipr_hash[chain].hash_head,
987			    &dpr->ipr_hash[chain].hash_head))
988				goto abort_copy;
989		}
990	}
991
992	dph->iph_gen++;
993
994	rw_exit(&sph->iph_lock);
995	rw_exit(&dph->iph_lock);
996	return (0);
997
998abort_copy:
999	ipsec_polhead_flush(dph, ns);
1000	rw_exit(&sph->iph_lock);
1001	rw_exit(&dph->iph_lock);
1002	return (ENOMEM);
1003}
1004
1005/*
1006 * Clone currently active policy to the inactive policy list.
1007 */
1008int
1009ipsec_clone_system_policy(netstack_t *ns)
1010{
1011	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1012
1013	return (ipsec_copy_polhead(&ipss->ipsec_system_policy,
1014	    &ipss->ipsec_inactive_policy, ns));
1015}
1016
1017/*
1018 * Extract the string from ipsec_policy_failure_msgs[type] and
1019 * log it.
1020 *
1021 */
1022void
1023ipsec_log_policy_failure(int type, char *func_name, ipha_t *ipha, ip6_t *ip6h,
1024    boolean_t secure, netstack_t *ns)
1025{
1026	char	sbuf[INET6_ADDRSTRLEN];
1027	char	dbuf[INET6_ADDRSTRLEN];
1028	char	*s;
1029	char	*d;
1030	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1031
1032	ASSERT((ipha == NULL && ip6h != NULL) ||
1033	    (ip6h == NULL && ipha != NULL));
1034
1035	if (ipha != NULL) {
1036		s = inet_ntop(AF_INET, &ipha->ipha_src, sbuf, sizeof (sbuf));
1037		d = inet_ntop(AF_INET, &ipha->ipha_dst, dbuf, sizeof (dbuf));
1038	} else {
1039		s = inet_ntop(AF_INET6, &ip6h->ip6_src, sbuf, sizeof (sbuf));
1040		d = inet_ntop(AF_INET6, &ip6h->ip6_dst, dbuf, sizeof (dbuf));
1041
1042	}
1043
1044	/* Always bump the policy failure counter. */
1045	ipss->ipsec_policy_failure_count[type]++;
1046
1047	ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
1048	    ipsec_policy_failure_msgs[type], func_name,
1049	    (secure ? "secure" : "not secure"), s, d);
1050}
1051
1052/*
1053 * Rate-limiting front-end to strlog() for AH and ESP.	Uses the ndd variables
1054 * in /dev/ip and the same rate-limiting clock so that there's a single
1055 * knob to turn to throttle the rate of messages.
1056 */
1057void
1058ipsec_rl_strlog(netstack_t *ns, short mid, short sid, char level, ushort_t sl,
1059    char *fmt, ...)
1060{
1061	va_list adx;
1062	hrtime_t current = gethrtime();
1063	ip_stack_t	*ipst = ns->netstack_ip;
1064	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1065
1066	sl |= SL_CONSOLE;
1067	/*
1068	 * Throttle logging to stop syslog from being swamped. If variable
1069	 * 'ipsec_policy_log_interval' is zero, don't log any messages at
1070	 * all, otherwise log only one message every 'ipsec_policy_log_interval'
1071	 * msec. Convert interval (in msec) to hrtime (in nsec).
1072	 */
1073
1074	if (ipst->ips_ipsec_policy_log_interval) {
1075		if (ipss->ipsec_policy_failure_last +
1076		    ((hrtime_t)ipst->ips_ipsec_policy_log_interval *
1077		    (hrtime_t)1000000) <= current) {
1078			va_start(adx, fmt);
1079			(void) vstrlog(mid, sid, level, sl, fmt, adx);
1080			va_end(adx);
1081			ipss->ipsec_policy_failure_last = current;
1082		}
1083	}
1084}
1085
1086void
1087ipsec_config_flush(netstack_t *ns)
1088{
1089	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1090
1091	rw_enter(&ipss->ipsec_system_policy.iph_lock, RW_WRITER);
1092	ipsec_polhead_flush(&ipss->ipsec_system_policy, ns);
1093	ipss->ipsec_next_policy_index = 1;
1094	rw_exit(&ipss->ipsec_system_policy.iph_lock);
1095	ipsec_action_reclaim_stack(ns);
1096}
1097
1098/*
1099 * Clip a policy's min/max keybits vs. the capabilities of the
1100 * algorithm.
1101 */
1102static void
1103act_alg_adjust(uint_t algtype, uint_t algid,
1104    uint16_t *minbits, uint16_t *maxbits, netstack_t *ns)
1105{
1106	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1107	ipsec_alginfo_t *algp = ipss->ipsec_alglists[algtype][algid];
1108
1109	if (algp != NULL) {
1110		/*
1111		 * If passed-in minbits is zero, we assume the caller trusts
1112		 * us with setting the minimum key size.  We pick the
1113		 * algorithms DEFAULT key size for the minimum in this case.
1114		 */
1115		if (*minbits == 0) {
1116			*minbits = algp->alg_default_bits;
1117			ASSERT(*minbits >= algp->alg_minbits);
1118		} else {
1119			*minbits = MAX(MIN(*minbits, algp->alg_maxbits),
1120			    algp->alg_minbits);
1121		}
1122		if (*maxbits == 0)
1123			*maxbits = algp->alg_maxbits;
1124		else
1125			*maxbits = MIN(MAX(*maxbits, algp->alg_minbits),
1126			    algp->alg_maxbits);
1127		ASSERT(*minbits <= *maxbits);
1128	} else {
1129		*minbits = 0;
1130		*maxbits = 0;
1131	}
1132}
1133
1134/*
1135 * Check an action's requested algorithms against the algorithms currently
1136 * loaded in the system.
1137 */
1138boolean_t
1139ipsec_check_action(ipsec_act_t *act, int *diag, netstack_t *ns)
1140{
1141	ipsec_prot_t *ipp;
1142	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1143
1144	ipp = &act->ipa_apply;
1145
1146	if (ipp->ipp_use_ah &&
1147	    ipss->ipsec_alglists[IPSEC_ALG_AUTH][ipp->ipp_auth_alg] == NULL) {
1148		*diag = SPD_DIAGNOSTIC_UNSUPP_AH_ALG;
1149		return (B_FALSE);
1150	}
1151	if (ipp->ipp_use_espa &&
1152	    ipss->ipsec_alglists[IPSEC_ALG_AUTH][ipp->ipp_esp_auth_alg] ==
1153	    NULL) {
1154		*diag = SPD_DIAGNOSTIC_UNSUPP_ESP_AUTH_ALG;
1155		return (B_FALSE);
1156	}
1157	if (ipp->ipp_use_esp &&
1158	    ipss->ipsec_alglists[IPSEC_ALG_ENCR][ipp->ipp_encr_alg] == NULL) {
1159		*diag = SPD_DIAGNOSTIC_UNSUPP_ESP_ENCR_ALG;
1160		return (B_FALSE);
1161	}
1162
1163	act_alg_adjust(IPSEC_ALG_AUTH, ipp->ipp_auth_alg,
1164	    &ipp->ipp_ah_minbits, &ipp->ipp_ah_maxbits, ns);
1165	act_alg_adjust(IPSEC_ALG_AUTH, ipp->ipp_esp_auth_alg,
1166	    &ipp->ipp_espa_minbits, &ipp->ipp_espa_maxbits, ns);
1167	act_alg_adjust(IPSEC_ALG_ENCR, ipp->ipp_encr_alg,
1168	    &ipp->ipp_espe_minbits, &ipp->ipp_espe_maxbits, ns);
1169
1170	if (ipp->ipp_ah_minbits > ipp->ipp_ah_maxbits) {
1171		*diag = SPD_DIAGNOSTIC_UNSUPP_AH_KEYSIZE;
1172		return (B_FALSE);
1173	}
1174	if (ipp->ipp_espa_minbits > ipp->ipp_espa_maxbits) {
1175		*diag = SPD_DIAGNOSTIC_UNSUPP_ESP_AUTH_KEYSIZE;
1176		return (B_FALSE);
1177	}
1178	if (ipp->ipp_espe_minbits > ipp->ipp_espe_maxbits) {
1179		*diag = SPD_DIAGNOSTIC_UNSUPP_ESP_ENCR_KEYSIZE;
1180		return (B_FALSE);
1181	}
1182	/* TODO: sanity check lifetimes */
1183	return (B_TRUE);
1184}
1185
1186/*
1187 * Set up a single action during wildcard expansion..
1188 */
1189static void
1190ipsec_setup_act(ipsec_act_t *outact, ipsec_act_t *act,
1191    uint_t auth_alg, uint_t encr_alg, uint_t eauth_alg, netstack_t *ns)
1192{
1193	ipsec_prot_t *ipp;
1194
1195	*outact = *act;
1196	ipp = &outact->ipa_apply;
1197	ipp->ipp_auth_alg = (uint8_t)auth_alg;
1198	ipp->ipp_encr_alg = (uint8_t)encr_alg;
1199	ipp->ipp_esp_auth_alg = (uint8_t)eauth_alg;
1200
1201	act_alg_adjust(IPSEC_ALG_AUTH, auth_alg,
1202	    &ipp->ipp_ah_minbits, &ipp->ipp_ah_maxbits, ns);
1203	act_alg_adjust(IPSEC_ALG_AUTH, eauth_alg,
1204	    &ipp->ipp_espa_minbits, &ipp->ipp_espa_maxbits, ns);
1205	act_alg_adjust(IPSEC_ALG_ENCR, encr_alg,
1206	    &ipp->ipp_espe_minbits, &ipp->ipp_espe_maxbits, ns);
1207}
1208
1209/*
1210 * combinatoric expansion time: expand a wildcarded action into an
1211 * array of wildcarded actions; we return the exploded action list,
1212 * and return a count in *nact (output only).
1213 */
1214static ipsec_act_t *
1215ipsec_act_wildcard_expand(ipsec_act_t *act, uint_t *nact, netstack_t *ns)
1216{
1217	boolean_t use_ah, use_esp, use_espa;
1218	boolean_t wild_auth, wild_encr, wild_eauth;
1219	uint_t	auth_alg, auth_idx, auth_min, auth_max;
1220	uint_t	eauth_alg, eauth_idx, eauth_min, eauth_max;
1221	uint_t  encr_alg, encr_idx, encr_min, encr_max;
1222	uint_t	action_count, ai;
1223	ipsec_act_t *outact;
1224	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1225
1226	if (act->ipa_type != IPSEC_ACT_APPLY) {
1227		outact = kmem_alloc(sizeof (*act), KM_NOSLEEP);
1228		*nact = 1;
1229		if (outact != NULL)
1230			bcopy(act, outact, sizeof (*act));
1231		return (outact);
1232	}
1233	/*
1234	 * compute the combinatoric explosion..
1235	 *
1236	 * we assume a request for encr if esp_req is PREF_REQUIRED
1237	 * we assume a request for ah auth if ah_req is PREF_REQUIRED.
1238	 * we assume a request for esp auth if !ah and esp_req is PREF_REQUIRED
1239	 */
1240
1241	use_ah = act->ipa_apply.ipp_use_ah;
1242	use_esp = act->ipa_apply.ipp_use_esp;
1243	use_espa = act->ipa_apply.ipp_use_espa;
1244	auth_alg = act->ipa_apply.ipp_auth_alg;
1245	eauth_alg = act->ipa_apply.ipp_esp_auth_alg;
1246	encr_alg = act->ipa_apply.ipp_encr_alg;
1247
1248	wild_auth = use_ah && (auth_alg == 0);
1249	wild_eauth = use_espa && (eauth_alg == 0);
1250	wild_encr = use_esp && (encr_alg == 0);
1251
1252	action_count = 1;
1253	auth_min = auth_max = auth_alg;
1254	eauth_min = eauth_max = eauth_alg;
1255	encr_min = encr_max = encr_alg;
1256
1257	/*
1258	 * set up for explosion.. for each dimension, expand output
1259	 * size by the explosion factor.
1260	 *
1261	 * Don't include the "any" algorithms, if defined, as no
1262	 * kernel policies should be set for these algorithms.
1263	 */
1264
1265#define	SET_EXP_MINMAX(type, wild, alg, min, max, ipss)		\
1266	if (wild) {						\
1267		int nalgs = ipss->ipsec_nalgs[type];		\
1268		if (ipss->ipsec_alglists[type][alg] != NULL)	\
1269			nalgs--;				\
1270		action_count *= nalgs;				\
1271		min = 0;					\
1272		max = ipss->ipsec_nalgs[type] - 1;		\
1273	}
1274
1275	SET_EXP_MINMAX(IPSEC_ALG_AUTH, wild_auth, SADB_AALG_NONE,
1276	    auth_min, auth_max, ipss);
1277	SET_EXP_MINMAX(IPSEC_ALG_AUTH, wild_eauth, SADB_AALG_NONE,
1278	    eauth_min, eauth_max, ipss);
1279	SET_EXP_MINMAX(IPSEC_ALG_ENCR, wild_encr, SADB_EALG_NONE,
1280	    encr_min, encr_max, ipss);
1281
1282#undef	SET_EXP_MINMAX
1283
1284	/*
1285	 * ok, allocate the whole mess..
1286	 */
1287
1288	outact = kmem_alloc(sizeof (*outact) * action_count, KM_NOSLEEP);
1289	if (outact == NULL)
1290		return (NULL);
1291
1292	/*
1293	 * Now compute all combinations.  Note that non-wildcarded
1294	 * dimensions just get a single value from auth_min, while
1295	 * wildcarded dimensions indirect through the sortlist.
1296	 *
1297	 * We do encryption outermost since, at this time, there's
1298	 * greater difference in security and performance between
1299	 * encryption algorithms vs. authentication algorithms.
1300	 */
1301
1302	ai = 0;
1303
1304#define	WHICH_ALG(type, wild, idx, ipss) \
1305	((wild)?(ipss->ipsec_sortlist[type][idx]):(idx))
1306
1307	for (encr_idx = encr_min; encr_idx <= encr_max; encr_idx++) {
1308		encr_alg = WHICH_ALG(IPSEC_ALG_ENCR, wild_encr, encr_idx, ipss);
1309		if (wild_encr && encr_alg == SADB_EALG_NONE)
1310			continue;
1311		for (auth_idx = auth_min; auth_idx <= auth_max; auth_idx++) {
1312			auth_alg = WHICH_ALG(IPSEC_ALG_AUTH, wild_auth,
1313			    auth_idx, ipss);
1314			if (wild_auth && auth_alg == SADB_AALG_NONE)
1315				continue;
1316			for (eauth_idx = eauth_min; eauth_idx <= eauth_max;
1317			    eauth_idx++) {
1318				eauth_alg = WHICH_ALG(IPSEC_ALG_AUTH,
1319				    wild_eauth, eauth_idx, ipss);
1320				if (wild_eauth && eauth_alg == SADB_AALG_NONE)
1321					continue;
1322
1323				ipsec_setup_act(&outact[ai], act,
1324				    auth_alg, encr_alg, eauth_alg, ns);
1325				ai++;
1326			}
1327		}
1328	}
1329
1330#undef WHICH_ALG
1331
1332	ASSERT(ai == action_count);
1333	*nact = action_count;
1334	return (outact);
1335}
1336
1337/*
1338 * Extract the parts of an ipsec_prot_t from an old-style ipsec_req_t.
1339 */
1340static void
1341ipsec_prot_from_req(const ipsec_req_t *req, ipsec_prot_t *ipp)
1342{
1343	bzero(ipp, sizeof (*ipp));
1344	/*
1345	 * ipp_use_* are bitfields.  Look at "!!" in the following as a
1346	 * "boolean canonicalization" operator.
1347	 */
1348	ipp->ipp_use_ah = !!(req->ipsr_ah_req & IPSEC_PREF_REQUIRED);
1349	ipp->ipp_use_esp = !!(req->ipsr_esp_req & IPSEC_PREF_REQUIRED);
1350	ipp->ipp_use_espa = !!(req->ipsr_esp_auth_alg);
1351	ipp->ipp_use_se = !!(req->ipsr_self_encap_req & IPSEC_PREF_REQUIRED);
1352	ipp->ipp_use_unique = !!((req->ipsr_ah_req|req->ipsr_esp_req) &
1353	    IPSEC_PREF_UNIQUE);
1354	ipp->ipp_encr_alg = req->ipsr_esp_alg;
1355	/*
1356	 * SADB_AALG_ANY is a placeholder to distinguish "any" from
1357	 * "none" above.  If auth is required, as determined above,
1358	 * SADB_AALG_ANY becomes 0, which is the representation
1359	 * of "any" and "none" in PF_KEY v2.
1360	 */
1361	ipp->ipp_auth_alg = (req->ipsr_auth_alg != SADB_AALG_ANY) ?
1362	    req->ipsr_auth_alg : 0;
1363	ipp->ipp_esp_auth_alg = (req->ipsr_esp_auth_alg != SADB_AALG_ANY) ?
1364	    req->ipsr_esp_auth_alg : 0;
1365}
1366
1367/*
1368 * Extract a new-style action from a request.
1369 */
1370void
1371ipsec_actvec_from_req(const ipsec_req_t *req, ipsec_act_t **actp, uint_t *nactp,
1372    netstack_t *ns)
1373{
1374	struct ipsec_act act;
1375
1376	bzero(&act, sizeof (act));
1377	if ((req->ipsr_ah_req & IPSEC_PREF_NEVER) &&
1378	    (req->ipsr_esp_req & IPSEC_PREF_NEVER)) {
1379		act.ipa_type = IPSEC_ACT_BYPASS;
1380	} else {
1381		act.ipa_type = IPSEC_ACT_APPLY;
1382		ipsec_prot_from_req(req, &act.ipa_apply);
1383	}
1384	*actp = ipsec_act_wildcard_expand(&act, nactp, ns);
1385}
1386
1387/*
1388 * Convert a new-style "prot" back to an ipsec_req_t (more backwards compat).
1389 * We assume caller has already zero'ed *req for us.
1390 */
1391static int
1392ipsec_req_from_prot(ipsec_prot_t *ipp, ipsec_req_t *req)
1393{
1394	req->ipsr_esp_alg = ipp->ipp_encr_alg;
1395	req->ipsr_auth_alg = ipp->ipp_auth_alg;
1396	req->ipsr_esp_auth_alg = ipp->ipp_esp_auth_alg;
1397
1398	if (ipp->ipp_use_unique) {
1399		req->ipsr_ah_req |= IPSEC_PREF_UNIQUE;
1400		req->ipsr_esp_req |= IPSEC_PREF_UNIQUE;
1401	}
1402	if (ipp->ipp_use_se)
1403		req->ipsr_self_encap_req |= IPSEC_PREF_REQUIRED;
1404	if (ipp->ipp_use_ah)
1405		req->ipsr_ah_req |= IPSEC_PREF_REQUIRED;
1406	if (ipp->ipp_use_esp)
1407		req->ipsr_esp_req |= IPSEC_PREF_REQUIRED;
1408	return (sizeof (*req));
1409}
1410
1411/*
1412 * Convert a new-style action back to an ipsec_req_t (more backwards compat).
1413 * We assume caller has already zero'ed *req for us.
1414 */
1415static int
1416ipsec_req_from_act(ipsec_action_t *ap, ipsec_req_t *req)
1417{
1418	switch (ap->ipa_act.ipa_type) {
1419	case IPSEC_ACT_BYPASS:
1420		req->ipsr_ah_req = IPSEC_PREF_NEVER;
1421		req->ipsr_esp_req = IPSEC_PREF_NEVER;
1422		return (sizeof (*req));
1423	case IPSEC_ACT_APPLY:
1424		return (ipsec_req_from_prot(&ap->ipa_act.ipa_apply, req));
1425	}
1426	return (sizeof (*req));
1427}
1428
1429/*
1430 * Convert a new-style action back to an ipsec_req_t (more backwards compat).
1431 * We assume caller has already zero'ed *req for us.
1432 */
1433int
1434ipsec_req_from_head(ipsec_policy_head_t *ph, ipsec_req_t *req, int af)
1435{
1436	ipsec_policy_t *p;
1437
1438	/*
1439	 * FULL-PERSOCK: consult hash table, too?
1440	 */
1441	for (p = ph->iph_root[IPSEC_INBOUND].ipr_nonhash[af];
1442	    p != NULL;
1443	    p = p->ipsp_hash.hash_next) {
1444		if ((p->ipsp_sel->ipsl_key.ipsl_valid & IPSL_WILDCARD) == 0)
1445			return (ipsec_req_from_act(p->ipsp_act, req));
1446	}
1447	return (sizeof (*req));
1448}
1449
1450/*
1451 * Based on per-socket or latched policy, convert to an appropriate
1452 * IP_SEC_OPT ipsec_req_t for the socket option; return size so we can
1453 * be tail-called from ip.
1454 */
1455int
1456ipsec_req_from_conn(conn_t *connp, ipsec_req_t *req, int af)
1457{
1458	ipsec_latch_t *ipl;
1459	int rv = sizeof (ipsec_req_t);
1460
1461	bzero(req, sizeof (*req));
1462
1463	ASSERT(MUTEX_HELD(&connp->conn_lock));
1464	ipl = connp->conn_latch;
1465
1466	/*
1467	 * Find appropriate policy.  First choice is latched action;
1468	 * failing that, see latched policy; failing that,
1469	 * look at configured policy.
1470	 */
1471	if (ipl != NULL) {
1472		if (connp->conn_latch_in_action != NULL) {
1473			rv = ipsec_req_from_act(connp->conn_latch_in_action,
1474			    req);
1475			goto done;
1476		}
1477		if (connp->conn_latch_in_policy != NULL) {
1478			rv = ipsec_req_from_act(
1479			    connp->conn_latch_in_policy->ipsp_act, req);
1480			goto done;
1481		}
1482	}
1483	if (connp->conn_policy != NULL)
1484		rv = ipsec_req_from_head(connp->conn_policy, req, af);
1485done:
1486	return (rv);
1487}
1488
1489void
1490ipsec_actvec_free(ipsec_act_t *act, uint_t nact)
1491{
1492	kmem_free(act, nact * sizeof (*act));
1493}
1494
1495/*
1496 * Consumes a reference to ipsp.
1497 */
1498static mblk_t *
1499ipsec_check_loopback_policy(mblk_t *data_mp, ip_recv_attr_t *ira,
1500    ipsec_policy_t *ipsp)
1501{
1502	if (!(ira->ira_flags & IRAF_IPSEC_SECURE))
1503		return (data_mp);
1504
1505	ASSERT(ira->ira_flags & IRAF_LOOPBACK);
1506
1507	IPPOL_REFRELE(ipsp);
1508
1509	/*
1510	 * We should do an actual policy check here.  Revisit this
1511	 * when we revisit the IPsec API.  (And pass a conn_t in when we
1512	 * get there.)
1513	 */
1514
1515	return (data_mp);
1516}
1517
1518/*
1519 * Check that packet's inbound ports & proto match the selectors
1520 * expected by the SAs it traversed on the way in.
1521 */
1522static boolean_t
1523ipsec_check_ipsecin_unique(ip_recv_attr_t *ira, const char **reason,
1524    kstat_named_t **counter, uint64_t pkt_unique, netstack_t *ns)
1525{
1526	uint64_t ah_mask, esp_mask;
1527	ipsa_t *ah_assoc;
1528	ipsa_t *esp_assoc;
1529	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1530
1531	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
1532	ASSERT(!(ira->ira_flags & IRAF_LOOPBACK));
1533
1534	ah_assoc = ira->ira_ipsec_ah_sa;
1535	esp_assoc = ira->ira_ipsec_esp_sa;
1536	ASSERT((ah_assoc != NULL) || (esp_assoc != NULL));
1537
1538	ah_mask = (ah_assoc != NULL) ? ah_assoc->ipsa_unique_mask : 0;
1539	esp_mask = (esp_assoc != NULL) ? esp_assoc->ipsa_unique_mask : 0;
1540
1541	if ((ah_mask == 0) && (esp_mask == 0))
1542		return (B_TRUE);
1543
1544	/*
1545	 * The pkt_unique check will also check for tunnel mode on the SA
1546	 * vs. the tunneled_packet boolean.  "Be liberal in what you receive"
1547	 * should not apply in this case.  ;)
1548	 */
1549
1550	if (ah_mask != 0 &&
1551	    ah_assoc->ipsa_unique_id != (pkt_unique & ah_mask)) {
1552		*reason = "AH inner header mismatch";
1553		*counter = DROPPER(ipss, ipds_spd_ah_innermismatch);
1554		return (B_FALSE);
1555	}
1556	if (esp_mask != 0 &&
1557	    esp_assoc->ipsa_unique_id != (pkt_unique & esp_mask)) {
1558		*reason = "ESP inner header mismatch";
1559		*counter = DROPPER(ipss, ipds_spd_esp_innermismatch);
1560		return (B_FALSE);
1561	}
1562	return (B_TRUE);
1563}
1564
1565static boolean_t
1566ipsec_check_ipsecin_action(ip_recv_attr_t *ira, mblk_t *mp, ipsec_action_t *ap,
1567    ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter,
1568    netstack_t *ns)
1569{
1570	boolean_t ret = B_TRUE;
1571	ipsec_prot_t *ipp;
1572	ipsa_t *ah_assoc;
1573	ipsa_t *esp_assoc;
1574	boolean_t decaps;
1575	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1576
1577	ASSERT((ipha == NULL && ip6h != NULL) ||
1578	    (ip6h == NULL && ipha != NULL));
1579
1580	if (ira->ira_flags & IRAF_LOOPBACK) {
1581		/*
1582		 * Besides accepting pointer-equivalent actions, we also
1583		 * accept any ICMP errors we generated for ourselves,
1584		 * regardless of policy.  If we do not wish to make this
1585		 * assumption in the future, check here, and where
1586		 * IXAF_TRUSTED_ICMP is initialized in ip.c and ip6.c.
1587		 */
1588		if (ap == ira->ira_ipsec_action ||
1589		    (ira->ira_flags & IRAF_TRUSTED_ICMP))
1590			return (B_TRUE);
1591
1592		/* Deep compare necessary here?? */
1593		*counter = DROPPER(ipss, ipds_spd_loopback_mismatch);
1594		*reason = "loopback policy mismatch";
1595		return (B_FALSE);
1596	}
1597	ASSERT(!(ira->ira_flags & IRAF_TRUSTED_ICMP));
1598	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
1599
1600	ah_assoc = ira->ira_ipsec_ah_sa;
1601	esp_assoc = ira->ira_ipsec_esp_sa;
1602
1603	decaps = (ira->ira_flags & IRAF_IPSEC_DECAPS);
1604
1605	switch (ap->ipa_act.ipa_type) {
1606	case IPSEC_ACT_DISCARD:
1607	case IPSEC_ACT_REJECT:
1608		/* Should "fail hard" */
1609		*counter = DROPPER(ipss, ipds_spd_explicit);
1610		*reason = "blocked by policy";
1611		return (B_FALSE);
1612
1613	case IPSEC_ACT_BYPASS:
1614	case IPSEC_ACT_CLEAR:
1615		*counter = DROPPER(ipss, ipds_spd_got_secure);
1616		*reason = "expected clear, got protected";
1617		return (B_FALSE);
1618
1619	case IPSEC_ACT_APPLY:
1620		ipp = &ap->ipa_act.ipa_apply;
1621		/*
1622		 * As of now we do the simple checks of whether
1623		 * the datagram has gone through the required IPSEC
1624		 * protocol constraints or not. We might have more
1625		 * in the future like sensitive levels, key bits, etc.
1626		 * If it fails the constraints, check whether we would
1627		 * have accepted this if it had come in clear.
1628		 */
1629		if (ipp->ipp_use_ah) {
1630			if (ah_assoc == NULL) {
1631				ret = ipsec_inbound_accept_clear(mp, ipha,
1632				    ip6h);
1633				*counter = DROPPER(ipss, ipds_spd_got_clear);
1634				*reason = "unprotected not accepted";
1635				break;
1636			}
1637			ASSERT(ah_assoc != NULL);
1638			ASSERT(ipp->ipp_auth_alg != 0);
1639
1640			if (ah_assoc->ipsa_auth_alg !=
1641			    ipp->ipp_auth_alg) {
1642				*counter = DROPPER(ipss, ipds_spd_bad_ahalg);
1643				*reason = "unacceptable ah alg";
1644				ret = B_FALSE;
1645				break;
1646			}
1647		} else if (ah_assoc != NULL) {
1648			/*
1649			 * Don't allow this. Check IPSEC NOTE above
1650			 * ip_fanout_proto().
1651			 */
1652			*counter = DROPPER(ipss, ipds_spd_got_ah);
1653			*reason = "unexpected AH";
1654			ret = B_FALSE;
1655			break;
1656		}
1657		if (ipp->ipp_use_esp) {
1658			if (esp_assoc == NULL) {
1659				ret = ipsec_inbound_accept_clear(mp, ipha,
1660				    ip6h);
1661				*counter = DROPPER(ipss, ipds_spd_got_clear);
1662				*reason = "unprotected not accepted";
1663				break;
1664			}
1665			ASSERT(esp_assoc != NULL);
1666			ASSERT(ipp->ipp_encr_alg != 0);
1667
1668			if (esp_assoc->ipsa_encr_alg !=
1669			    ipp->ipp_encr_alg) {
1670				*counter = DROPPER(ipss, ipds_spd_bad_espealg);
1671				*reason = "unacceptable esp alg";
1672				ret = B_FALSE;
1673				break;
1674			}
1675			/*
1676			 * If the client does not need authentication,
1677			 * we don't verify the alogrithm.
1678			 */
1679			if (ipp->ipp_use_espa) {
1680				if (esp_assoc->ipsa_auth_alg !=
1681				    ipp->ipp_esp_auth_alg) {
1682					*counter = DROPPER(ipss,
1683					    ipds_spd_bad_espaalg);
1684					*reason = "unacceptable esp auth alg";
1685					ret = B_FALSE;
1686					break;
1687				}
1688			}
1689		} else if (esp_assoc != NULL) {
1690			/*
1691			 * Don't allow this. Check IPSEC NOTE above
1692			 * ip_fanout_proto().
1693			 */
1694			*counter = DROPPER(ipss, ipds_spd_got_esp);
1695			*reason = "unexpected ESP";
1696			ret = B_FALSE;
1697			break;
1698		}
1699		if (ipp->ipp_use_se) {
1700			if (!decaps) {
1701				ret = ipsec_inbound_accept_clear(mp, ipha,
1702				    ip6h);
1703				if (!ret) {
1704					/* XXX mutant? */
1705					*counter = DROPPER(ipss,
1706					    ipds_spd_bad_selfencap);
1707					*reason = "self encap not found";
1708					break;
1709				}
1710			}
1711		} else if (decaps) {
1712			/*
1713			 * XXX If the packet comes in tunneled and the
1714			 * recipient does not expect it to be tunneled, it
1715			 * is okay. But we drop to be consistent with the
1716			 * other cases.
1717			 */
1718			*counter = DROPPER(ipss, ipds_spd_got_selfencap);
1719			*reason = "unexpected self encap";
1720			ret = B_FALSE;
1721			break;
1722		}
1723		if (ira->ira_ipsec_action != NULL) {
1724			/*
1725			 * This can happen if we do a double policy-check on
1726			 * a packet
1727			 * XXX XXX should fix this case!
1728			 */
1729			IPACT_REFRELE(ira->ira_ipsec_action);
1730		}
1731		ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
1732		ASSERT(ira->ira_ipsec_action == NULL);
1733		IPACT_REFHOLD(ap);
1734		ira->ira_ipsec_action = ap;
1735		break;	/* from switch */
1736	}
1737	return (ret);
1738}
1739
1740static boolean_t
1741spd_match_inbound_ids(ipsec_latch_t *ipl, ipsa_t *sa)
1742{
1743	ASSERT(ipl->ipl_ids_latched == B_TRUE);
1744	return ipsid_equal(ipl->ipl_remote_cid, sa->ipsa_src_cid) &&
1745	    ipsid_equal(ipl->ipl_local_cid, sa->ipsa_dst_cid);
1746}
1747
1748/*
1749 * Takes a latched conn and an inbound packet and returns a unique_id suitable
1750 * for SA comparisons.  Most of the time we will copy from the conn_t, but
1751 * there are cases when the conn_t is latched but it has wildcard selectors,
1752 * and then we need to fallback to scooping them out of the packet.
1753 *
1754 * Assume we'll never have 0 with a conn_t present, so use 0 as a failure.  We
1755 * can get away with this because we only have non-zero ports/proto for
1756 * latched conn_ts.
1757 *
1758 * Ideal candidate for an "inline" keyword, as we're JUST convoluted enough
1759 * to not be a nice macro.
1760 */
1761static uint64_t
1762conn_to_unique(conn_t *connp, mblk_t *data_mp, ipha_t *ipha, ip6_t *ip6h)
1763{
1764	ipsec_selector_t sel;
1765	uint8_t ulp = connp->conn_proto;
1766
1767	ASSERT(connp->conn_latch_in_policy != NULL);
1768
1769	if ((ulp == IPPROTO_TCP || ulp == IPPROTO_UDP || ulp == IPPROTO_SCTP) &&
1770	    (connp->conn_fport == 0 || connp->conn_lport == 0)) {
1771		/* Slow path - we gotta grab from the packet. */
1772		if (ipsec_init_inbound_sel(&sel, data_mp, ipha, ip6h,
1773		    SEL_NONE) != SELRET_SUCCESS) {
1774			/* Failure -> have caller free packet with ENOMEM. */
1775			return (0);
1776		}
1777		return (SA_UNIQUE_ID(sel.ips_remote_port, sel.ips_local_port,
1778		    sel.ips_protocol, 0));
1779	}
1780
1781#ifdef DEBUG_NOT_UNTIL_6478464
1782	if (ipsec_init_inbound_sel(&sel, data_mp, ipha, ip6h, SEL_NONE) ==
1783	    SELRET_SUCCESS) {
1784		ASSERT(sel.ips_local_port == connp->conn_lport);
1785		ASSERT(sel.ips_remote_port == connp->conn_fport);
1786		ASSERT(sel.ips_protocol == connp->conn_proto);
1787	}
1788	ASSERT(connp->conn_proto != 0);
1789#endif
1790
1791	return (SA_UNIQUE_ID(connp->conn_fport, connp->conn_lport, ulp, 0));
1792}
1793
1794/*
1795 * Called to check policy on a latched connection.
1796 * Note that we don't dereference conn_latch or conn_ihere since the conn might
1797 * be closing. The caller passes a held ipsec_latch_t instead.
1798 */
1799static boolean_t
1800ipsec_check_ipsecin_latch(ip_recv_attr_t *ira, mblk_t *mp, ipsec_latch_t *ipl,
1801    ipsec_action_t *ap, ipha_t *ipha, ip6_t *ip6h, const char **reason,
1802    kstat_named_t **counter, conn_t *connp, netstack_t *ns)
1803{
1804	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1805
1806	ASSERT(ipl->ipl_ids_latched == B_TRUE);
1807	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
1808
1809	if (!(ira->ira_flags & IRAF_LOOPBACK)) {
1810		/*
1811		 * Over loopback, there aren't real security associations,
1812		 * so there are neither identities nor "unique" values
1813		 * for us to check the packet against.
1814		 */
1815		if (ira->ira_ipsec_ah_sa != NULL) {
1816			if (!spd_match_inbound_ids(ipl,
1817			    ira->ira_ipsec_ah_sa)) {
1818				*counter = DROPPER(ipss, ipds_spd_ah_badid);
1819				*reason = "AH identity mismatch";
1820				return (B_FALSE);
1821			}
1822		}
1823
1824		if (ira->ira_ipsec_esp_sa != NULL) {
1825			if (!spd_match_inbound_ids(ipl,
1826			    ira->ira_ipsec_esp_sa)) {
1827				*counter = DROPPER(ipss, ipds_spd_esp_badid);
1828				*reason = "ESP identity mismatch";
1829				return (B_FALSE);
1830			}
1831		}
1832
1833		/*
1834		 * Can fudge pkt_unique from connp because we're latched.
1835		 * In DEBUG kernels (see conn_to_unique()'s implementation),
1836		 * verify this even if it REALLY slows things down.
1837		 */
1838		if (!ipsec_check_ipsecin_unique(ira, reason, counter,
1839		    conn_to_unique(connp, mp, ipha, ip6h), ns)) {
1840			return (B_FALSE);
1841		}
1842	}
1843	return (ipsec_check_ipsecin_action(ira, mp, ap, ipha, ip6h, reason,
1844	    counter, ns));
1845}
1846
1847/*
1848 * Check to see whether this secured datagram meets the policy
1849 * constraints specified in ipsp.
1850 *
1851 * Called from ipsec_check_global_policy, and ipsec_check_inbound_policy.
1852 *
1853 * Consumes a reference to ipsp.
1854 * Returns the mblk if ok.
1855 */
1856static mblk_t *
1857ipsec_check_ipsecin_policy(mblk_t *data_mp, ipsec_policy_t *ipsp,
1858    ipha_t *ipha, ip6_t *ip6h, uint64_t pkt_unique, ip_recv_attr_t *ira,
1859    netstack_t *ns)
1860{
1861	ipsec_action_t *ap;
1862	const char *reason = "no policy actions found";
1863	ip_stack_t	*ipst = ns->netstack_ip;
1864	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1865	kstat_named_t *counter;
1866
1867	counter = DROPPER(ipss, ipds_spd_got_secure);
1868
1869	ASSERT(ipsp != NULL);
1870
1871	ASSERT((ipha == NULL && ip6h != NULL) ||
1872	    (ip6h == NULL && ipha != NULL));
1873
1874	if (ira->ira_flags & IRAF_LOOPBACK)
1875		return (ipsec_check_loopback_policy(data_mp, ira, ipsp));
1876
1877	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
1878
1879	if (ira->ira_ipsec_action != NULL) {
1880		/*
1881		 * this can happen if we do a double policy-check on a packet
1882		 * Would be nice to be able to delete this test..
1883		 */
1884		IPACT_REFRELE(ira->ira_ipsec_action);
1885	}
1886	ASSERT(ira->ira_ipsec_action == NULL);
1887
1888	if (!SA_IDS_MATCH(ira->ira_ipsec_ah_sa, ira->ira_ipsec_esp_sa)) {
1889		reason = "inbound AH and ESP identities differ";
1890		counter = DROPPER(ipss, ipds_spd_ahesp_diffid);
1891		goto drop;
1892	}
1893
1894	if (!ipsec_check_ipsecin_unique(ira, &reason, &counter, pkt_unique,
1895	    ns))
1896		goto drop;
1897
1898	/*
1899	 * Ok, now loop through the possible actions and see if any
1900	 * of them work for us.
1901	 */
1902
1903	for (ap = ipsp->ipsp_act; ap != NULL; ap = ap->ipa_next) {
1904		if (ipsec_check_ipsecin_action(ira, data_mp, ap,
1905		    ipha, ip6h, &reason, &counter, ns)) {
1906			BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
1907			IPPOL_REFRELE(ipsp);
1908			return (data_mp);
1909		}
1910	}
1911drop:
1912	ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
1913	    "ipsec inbound policy mismatch: %s, packet dropped\n",
1914	    reason);
1915	IPPOL_REFRELE(ipsp);
1916	ASSERT(ira->ira_ipsec_action == NULL);
1917	BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
1918	ip_drop_packet(data_mp, B_TRUE, NULL, counter,
1919	    &ipss->ipsec_spd_dropper);
1920	return (NULL);
1921}
1922
1923/*
1924 * sleazy prefix-length-based compare.
1925 * another inlining candidate..
1926 */
1927boolean_t
1928ip_addr_match(uint8_t *addr1, int pfxlen, in6_addr_t *addr2p)
1929{
1930	int offset = pfxlen>>3;
1931	int bitsleft = pfxlen & 7;
1932	uint8_t *addr2 = (uint8_t *)addr2p;
1933
1934	/*
1935	 * and there was much evil..
1936	 * XXX should inline-expand the bcmp here and do this 32 bits
1937	 * or 64 bits at a time..
1938	 */
1939	return ((bcmp(addr1, addr2, offset) == 0) &&
1940	    ((bitsleft == 0) ||
1941	    (((addr1[offset] ^ addr2[offset]) & (0xff<<(8-bitsleft))) == 0)));
1942}
1943
1944static ipsec_policy_t *
1945ipsec_find_policy_chain(ipsec_policy_t *best, ipsec_policy_t *chain,
1946    ipsec_selector_t *sel, boolean_t is_icmp_inv_acq)
1947{
1948	ipsec_selkey_t *isel;
1949	ipsec_policy_t *p;
1950	int bpri = best ? best->ipsp_prio : 0;
1951
1952	for (p = chain; p != NULL; p = p->ipsp_hash.hash_next) {
1953		uint32_t valid;
1954
1955		if (p->ipsp_prio <= bpri)
1956			continue;
1957		isel = &p->ipsp_sel->ipsl_key;
1958		valid = isel->ipsl_valid;
1959
1960		if ((valid & IPSL_PROTOCOL) &&
1961		    (isel->ipsl_proto != sel->ips_protocol))
1962			continue;
1963
1964		if ((valid & IPSL_REMOTE_ADDR) &&
1965		    !ip_addr_match((uint8_t *)&isel->ipsl_remote,
1966		    isel->ipsl_remote_pfxlen, &sel->ips_remote_addr_v6))
1967			continue;
1968
1969		if ((valid & IPSL_LOCAL_ADDR) &&
1970		    !ip_addr_match((uint8_t *)&isel->ipsl_local,
1971		    isel->ipsl_local_pfxlen, &sel->ips_local_addr_v6))
1972			continue;
1973
1974		if ((valid & IPSL_REMOTE_PORT) &&
1975		    isel->ipsl_rport != sel->ips_remote_port)
1976			continue;
1977
1978		if ((valid & IPSL_LOCAL_PORT) &&
1979		    isel->ipsl_lport != sel->ips_local_port)
1980			continue;
1981
1982		if (!is_icmp_inv_acq) {
1983			if ((valid & IPSL_ICMP_TYPE) &&
1984			    (isel->ipsl_icmp_type > sel->ips_icmp_type ||
1985			    isel->ipsl_icmp_type_end < sel->ips_icmp_type)) {
1986				continue;
1987			}
1988
1989			if ((valid & IPSL_ICMP_CODE) &&
1990			    (isel->ipsl_icmp_code > sel->ips_icmp_code ||
1991			    isel->ipsl_icmp_code_end <
1992			    sel->ips_icmp_code)) {
1993				continue;
1994			}
1995		} else {
1996			/*
1997			 * special case for icmp inverse acquire
1998			 * we only want policies that aren't drop/pass
1999			 */
2000			if (p->ipsp_act->ipa_act.ipa_type != IPSEC_ACT_APPLY)
2001				continue;
2002		}
2003
2004		/* we matched all the packet-port-field selectors! */
2005		best = p;
2006		bpri = p->ipsp_prio;
2007	}
2008
2009	return (best);
2010}
2011
2012/*
2013 * Try to find and return the best policy entry under a given policy
2014 * root for a given set of selectors; the first parameter "best" is
2015 * the current best policy so far.  If "best" is non-null, we have a
2016 * reference to it.  We return a reference to a policy; if that policy
2017 * is not the original "best", we need to release that reference
2018 * before returning.
2019 */
2020ipsec_policy_t *
2021ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head,
2022    int direction, ipsec_selector_t *sel)
2023{
2024	ipsec_policy_t *curbest;
2025	ipsec_policy_root_t *root;
2026	uint8_t is_icmp_inv_acq = sel->ips_is_icmp_inv_acq;
2027	int af = sel->ips_isv4 ? IPSEC_AF_V4 : IPSEC_AF_V6;
2028
2029	curbest = best;
2030	root = &head->iph_root[direction];
2031
2032#ifdef DEBUG
2033	if (is_icmp_inv_acq) {
2034		if (sel->ips_isv4) {
2035			if (sel->ips_protocol != IPPROTO_ICMP) {
2036				cmn_err(CE_WARN, "ipsec_find_policy_head:"
2037				    " expecting icmp, got %d",
2038				    sel->ips_protocol);
2039			}
2040		} else {
2041			if (sel->ips_protocol != IPPROTO_ICMPV6) {
2042				cmn_err(CE_WARN, "ipsec_find_policy_head:"
2043				    " expecting icmpv6, got %d",
2044				    sel->ips_protocol);
2045			}
2046		}
2047	}
2048#endif
2049
2050	rw_enter(&head->iph_lock, RW_READER);
2051
2052	if (root->ipr_nchains > 0) {
2053		curbest = ipsec_find_policy_chain(curbest,
2054		    root->ipr_hash[selector_hash(sel, root)].hash_head, sel,
2055		    is_icmp_inv_acq);
2056	}
2057	curbest = ipsec_find_policy_chain(curbest, root->ipr_nonhash[af], sel,
2058	    is_icmp_inv_acq);
2059
2060	/*
2061	 * Adjust reference counts if we found anything new.
2062	 */
2063	if (curbest != best) {
2064		ASSERT(curbest != NULL);
2065		IPPOL_REFHOLD(curbest);
2066
2067		if (best != NULL) {
2068			IPPOL_REFRELE(best);
2069		}
2070	}
2071
2072	rw_exit(&head->iph_lock);
2073
2074	return (curbest);
2075}
2076
2077/*
2078 * Find the best system policy (either global or per-interface) which
2079 * applies to the given selector; look in all the relevant policy roots
2080 * to figure out which policy wins.
2081 *
2082 * Returns a reference to a policy; caller must release this
2083 * reference when done.
2084 */
2085ipsec_policy_t *
2086ipsec_find_policy(int direction, const conn_t *connp, ipsec_selector_t *sel,
2087    netstack_t *ns)
2088{
2089	ipsec_policy_t *p;
2090	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2091
2092	p = ipsec_find_policy_head(NULL, &ipss->ipsec_system_policy,
2093	    direction, sel);
2094	if ((connp != NULL) && (connp->conn_policy != NULL)) {
2095		p = ipsec_find_policy_head(p, connp->conn_policy,
2096		    direction, sel);
2097	}
2098
2099	return (p);
2100}
2101
2102/*
2103 * Check with global policy and see whether this inbound
2104 * packet meets the policy constraints.
2105 *
2106 * Locate appropriate policy from global policy, supplemented by the
2107 * conn's configured and/or cached policy if the conn is supplied.
2108 *
2109 * Dispatch to ipsec_check_ipsecin_policy if we have policy and an
2110 * encrypted packet to see if they match.
2111 *
2112 * Otherwise, see if the policy allows cleartext; if not, drop it on the
2113 * floor.
2114 */
2115mblk_t *
2116ipsec_check_global_policy(mblk_t *data_mp, conn_t *connp,
2117    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, netstack_t *ns)
2118{
2119	ipsec_policy_t *p;
2120	ipsec_selector_t sel;
2121	boolean_t policy_present;
2122	kstat_named_t *counter;
2123	uint64_t pkt_unique;
2124	ip_stack_t	*ipst = ns->netstack_ip;
2125	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2126
2127	sel.ips_is_icmp_inv_acq = 0;
2128
2129	ASSERT((ipha == NULL && ip6h != NULL) ||
2130	    (ip6h == NULL && ipha != NULL));
2131
2132	if (ipha != NULL)
2133		policy_present = ipss->ipsec_inbound_v4_policy_present;
2134	else
2135		policy_present = ipss->ipsec_inbound_v6_policy_present;
2136
2137	if (!policy_present && connp == NULL) {
2138		/*
2139		 * No global policy and no per-socket policy;
2140		 * just pass it back (but we shouldn't get here in that case)
2141		 */
2142		return (data_mp);
2143	}
2144
2145	/*
2146	 * If we have cached policy, use it.
2147	 * Otherwise consult system policy.
2148	 */
2149	if ((connp != NULL) && (connp->conn_latch != NULL)) {
2150		p = connp->conn_latch_in_policy;
2151		if (p != NULL) {
2152			IPPOL_REFHOLD(p);
2153		}
2154		/*
2155		 * Fudge sel for UNIQUE_ID setting below.
2156		 */
2157		pkt_unique = conn_to_unique(connp, data_mp, ipha, ip6h);
2158	} else {
2159		/* Initialize the ports in the selector */
2160		if (ipsec_init_inbound_sel(&sel, data_mp, ipha, ip6h,
2161		    SEL_NONE) == SELRET_NOMEM) {
2162			/*
2163			 * Technically not a policy mismatch, but it is
2164			 * an internal failure.
2165			 */
2166			ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH,
2167			    "ipsec_init_inbound_sel", ipha, ip6h, B_TRUE, ns);
2168			counter = DROPPER(ipss, ipds_spd_nomem);
2169			goto fail;
2170		}
2171
2172		/*
2173		 * Find the policy which best applies.
2174		 *
2175		 * If we find global policy, we should look at both
2176		 * local policy and global policy and see which is
2177		 * stronger and match accordingly.
2178		 *
2179		 * If we don't find a global policy, check with
2180		 * local policy alone.
2181		 */
2182
2183		p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, &sel, ns);
2184		pkt_unique = SA_UNIQUE_ID(sel.ips_remote_port,
2185		    sel.ips_local_port, sel.ips_protocol, 0);
2186	}
2187
2188	if (p == NULL) {
2189		if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
2190			/*
2191			 * We have no policy; default to succeeding.
2192			 * XXX paranoid system design doesn't do this.
2193			 */
2194			BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
2195			return (data_mp);
2196		} else {
2197			counter = DROPPER(ipss, ipds_spd_got_secure);
2198			ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED,
2199			    "ipsec_check_global_policy", ipha, ip6h, B_TRUE,
2200			    ns);
2201			goto fail;
2202		}
2203	}
2204	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2205		return (ipsec_check_ipsecin_policy(data_mp, p, ipha, ip6h,
2206		    pkt_unique, ira, ns));
2207	}
2208	if (p->ipsp_act->ipa_allow_clear) {
2209		BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
2210		IPPOL_REFRELE(p);
2211		return (data_mp);
2212	}
2213	IPPOL_REFRELE(p);
2214	/*
2215	 * If we reach here, we will drop the packet because it failed the
2216	 * global policy check because the packet was cleartext, and it
2217	 * should not have been.
2218	 */
2219	ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH,
2220	    "ipsec_check_global_policy", ipha, ip6h, B_FALSE, ns);
2221	counter = DROPPER(ipss, ipds_spd_got_clear);
2222
2223fail:
2224	ip_drop_packet(data_mp, B_TRUE, NULL, counter,
2225	    &ipss->ipsec_spd_dropper);
2226	BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
2227	return (NULL);
2228}
2229
2230/*
2231 * We check whether an inbound datagram is a valid one
2232 * to accept in clear. If it is secure, it is the job
2233 * of IPSEC to log information appropriately if it
2234 * suspects that it may not be the real one.
2235 *
2236 * It is called only while fanning out to the ULP
2237 * where ULP accepts only secure data and the incoming
2238 * is clear. Usually we never accept clear datagrams in
2239 * such cases. ICMP is the only exception.
2240 *
2241 * NOTE : We don't call this function if the client (ULP)
2242 * is willing to accept things in clear.
2243 */
2244boolean_t
2245ipsec_inbound_accept_clear(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h)
2246{
2247	ushort_t iph_hdr_length;
2248	icmph_t *icmph;
2249	icmp6_t *icmp6;
2250	uint8_t *nexthdrp;
2251
2252	ASSERT((ipha != NULL && ip6h == NULL) ||
2253	    (ipha == NULL && ip6h != NULL));
2254
2255	if (ip6h != NULL) {
2256		iph_hdr_length = ip_hdr_length_v6(mp, ip6h);
2257		if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length,
2258		    &nexthdrp)) {
2259			return (B_FALSE);
2260		}
2261		if (*nexthdrp != IPPROTO_ICMPV6)
2262			return (B_FALSE);
2263		icmp6 = (icmp6_t *)(&mp->b_rptr[iph_hdr_length]);
2264		/* Match IPv6 ICMP policy as closely as IPv4 as possible. */
2265		switch (icmp6->icmp6_type) {
2266		case ICMP6_PARAM_PROB:
2267			/* Corresponds to port/proto unreach in IPv4. */
2268		case ICMP6_ECHO_REQUEST:
2269			/* Just like IPv4. */
2270			return (B_FALSE);
2271
2272		case MLD_LISTENER_QUERY:
2273		case MLD_LISTENER_REPORT:
2274		case MLD_LISTENER_REDUCTION:
2275			/*
2276			 * XXX Seperate NDD in IPv4 what about here?
2277			 * Plus, mcast is important to ND.
2278			 */
2279		case ICMP6_DST_UNREACH:
2280			/* Corresponds to HOST/NET unreachable in IPv4. */
2281		case ICMP6_PACKET_TOO_BIG:
2282		case ICMP6_ECHO_REPLY:
2283			/* These are trusted in IPv4. */
2284		case ND_ROUTER_SOLICIT:
2285		case ND_ROUTER_ADVERT:
2286		case ND_NEIGHBOR_SOLICIT:
2287		case ND_NEIGHBOR_ADVERT:
2288		case ND_REDIRECT:
2289			/* Trust ND messages for now. */
2290		case ICMP6_TIME_EXCEEDED:
2291		default:
2292			return (B_TRUE);
2293		}
2294	} else {
2295		/*
2296		 * If it is not ICMP, fail this request.
2297		 */
2298		if (ipha->ipha_protocol != IPPROTO_ICMP) {
2299#ifdef FRAGCACHE_DEBUG
2300			cmn_err(CE_WARN, "Dropping - ipha_proto = %d\n",
2301			    ipha->ipha_protocol);
2302#endif
2303			return (B_FALSE);
2304		}
2305		iph_hdr_length = IPH_HDR_LENGTH(ipha);
2306		icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
2307		/*
2308		 * It is an insecure icmp message. Check to see whether we are
2309		 * willing to accept this one.
2310		 */
2311
2312		switch (icmph->icmph_type) {
2313		case ICMP_ECHO_REPLY:
2314		case ICMP_TIME_STAMP_REPLY:
2315		case ICMP_INFO_REPLY:
2316		case ICMP_ROUTER_ADVERTISEMENT:
2317			/*
2318			 * We should not encourage clear replies if this
2319			 * client expects secure. If somebody is replying
2320			 * in clear some mailicious user watching both the
2321			 * request and reply, can do chosen-plain-text attacks.
2322			 * With global policy we might be just expecting secure
2323			 * but sending out clear. We don't know what the right
2324			 * thing is. We can't do much here as we can't control
2325			 * the sender here. Till we are sure of what to do,
2326			 * accept them.
2327			 */
2328			return (B_TRUE);
2329		case ICMP_ECHO_REQUEST:
2330		case ICMP_TIME_STAMP_REQUEST:
2331		case ICMP_INFO_REQUEST:
2332		case ICMP_ADDRESS_MASK_REQUEST:
2333		case ICMP_ROUTER_SOLICITATION:
2334		case ICMP_ADDRESS_MASK_REPLY:
2335			/*
2336			 * Don't accept this as somebody could be sending
2337			 * us plain text to get encrypted data. If we reply,
2338			 * it will lead to chosen plain text attack.
2339			 */
2340			return (B_FALSE);
2341		case ICMP_DEST_UNREACHABLE:
2342			switch (icmph->icmph_code) {
2343			case ICMP_FRAGMENTATION_NEEDED:
2344				/*
2345				 * Be in sync with icmp_inbound, where we have
2346				 * already set dce_pmtu
2347				 */
2348#ifdef FRAGCACHE_DEBUG
2349			cmn_err(CE_WARN, "ICMP frag needed\n");
2350#endif
2351				return (B_TRUE);
2352			case ICMP_HOST_UNREACHABLE:
2353			case ICMP_NET_UNREACHABLE:
2354				/*
2355				 * By accepting, we could reset a connection.
2356				 * How do we solve the problem of some
2357				 * intermediate router sending in-secure ICMP
2358				 * messages ?
2359				 */
2360				return (B_TRUE);
2361			case ICMP_PORT_UNREACHABLE:
2362			case ICMP_PROTOCOL_UNREACHABLE:
2363			default :
2364				return (B_FALSE);
2365			}
2366		case ICMP_SOURCE_QUENCH:
2367			/*
2368			 * If this is an attack, TCP will slow start
2369			 * because of this. Is it very harmful ?
2370			 */
2371			return (B_TRUE);
2372		case ICMP_PARAM_PROBLEM:
2373			return (B_FALSE);
2374		case ICMP_TIME_EXCEEDED:
2375			return (B_TRUE);
2376		case ICMP_REDIRECT:
2377			return (B_FALSE);
2378		default :
2379			return (B_FALSE);
2380		}
2381	}
2382}
2383
2384void
2385ipsec_latch_ids(ipsec_latch_t *ipl, ipsid_t *local, ipsid_t *remote)
2386{
2387	mutex_enter(&ipl->ipl_lock);
2388
2389	if (ipl->ipl_ids_latched) {
2390		/* I lost, someone else got here before me */
2391		mutex_exit(&ipl->ipl_lock);
2392		return;
2393	}
2394
2395	if (local != NULL)
2396		IPSID_REFHOLD(local);
2397	if (remote != NULL)
2398		IPSID_REFHOLD(remote);
2399
2400	ipl->ipl_local_cid = local;
2401	ipl->ipl_remote_cid = remote;
2402	ipl->ipl_ids_latched = B_TRUE;
2403	mutex_exit(&ipl->ipl_lock);
2404}
2405
2406void
2407ipsec_latch_inbound(conn_t *connp, ip_recv_attr_t *ira)
2408{
2409	ipsa_t *sa;
2410	ipsec_latch_t *ipl = connp->conn_latch;
2411
2412	if (!ipl->ipl_ids_latched) {
2413		ipsid_t *local = NULL;
2414		ipsid_t *remote = NULL;
2415
2416		if (!(ira->ira_flags & IRAF_LOOPBACK)) {
2417			ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2418			if (ira->ira_ipsec_esp_sa != NULL)
2419				sa = ira->ira_ipsec_esp_sa;
2420			else
2421				sa = ira->ira_ipsec_ah_sa;
2422			ASSERT(sa != NULL);
2423			local = sa->ipsa_dst_cid;
2424			remote = sa->ipsa_src_cid;
2425		}
2426		ipsec_latch_ids(ipl, local, remote);
2427	}
2428	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2429		if (connp->conn_latch_in_action != NULL) {
2430			/*
2431			 * Previously cached action.  This is probably
2432			 * harmless, but in DEBUG kernels, check for
2433			 * action equality.
2434			 *
2435			 * Preserve the existing action to preserve latch
2436			 * invariance.
2437			 */
2438			ASSERT(connp->conn_latch_in_action ==
2439			    ira->ira_ipsec_action);
2440			return;
2441		}
2442		connp->conn_latch_in_action = ira->ira_ipsec_action;
2443		IPACT_REFHOLD(connp->conn_latch_in_action);
2444	}
2445}
2446
2447/*
2448 * Check whether the policy constraints are met either for an
2449 * inbound datagram; called from IP in numerous places.
2450 *
2451 * Note that this is not a chokepoint for inbound policy checks;
2452 * see also ipsec_check_ipsecin_latch() and ipsec_check_global_policy()
2453 */
2454mblk_t *
2455ipsec_check_inbound_policy(mblk_t *mp, conn_t *connp,
2456    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira)
2457{
2458	boolean_t	ret;
2459	ipsec_latch_t	*ipl;
2460	ipsec_action_t	*ap;
2461	uint64_t	unique_id;
2462	ipsec_stack_t	*ipss;
2463	ip_stack_t	*ipst;
2464	netstack_t	*ns;
2465	ipsec_policy_head_t *policy_head;
2466	ipsec_policy_t	*p = NULL;
2467
2468	ASSERT(connp != NULL);
2469	ns = connp->conn_netstack;
2470	ipss = ns->netstack_ipsec;
2471	ipst = ns->netstack_ip;
2472
2473	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
2474		/*
2475		 * This is the case where the incoming datagram is
2476		 * cleartext and we need to see whether this client
2477		 * would like to receive such untrustworthy things from
2478		 * the wire.
2479		 */
2480		ASSERT(mp != NULL);
2481
2482		mutex_enter(&connp->conn_lock);
2483		if (connp->conn_state_flags & CONN_CONDEMNED) {
2484			mutex_exit(&connp->conn_lock);
2485			ip_drop_packet(mp, B_TRUE, NULL,
2486			    DROPPER(ipss, ipds_spd_got_clear),
2487			    &ipss->ipsec_spd_dropper);
2488			BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
2489			return (NULL);
2490		}
2491		if (connp->conn_latch != NULL) {
2492			/* Hold a reference in case the conn is closing */
2493			p = connp->conn_latch_in_policy;
2494			if (p != NULL)
2495				IPPOL_REFHOLD(p);
2496			mutex_exit(&connp->conn_lock);
2497			/*
2498			 * Policy is cached in the conn.
2499			 */
2500			if (p != NULL && !p->ipsp_act->ipa_allow_clear) {
2501				ret = ipsec_inbound_accept_clear(mp,
2502				    ipha, ip6h);
2503				if (ret) {
2504					BUMP_MIB(&ipst->ips_ip_mib,
2505					    ipsecInSucceeded);
2506					IPPOL_REFRELE(p);
2507					return (mp);
2508				} else {
2509					ipsec_log_policy_failure(
2510					    IPSEC_POLICY_MISMATCH,
2511					    "ipsec_check_inbound_policy", ipha,
2512					    ip6h, B_FALSE, ns);
2513					ip_drop_packet(mp, B_TRUE, NULL,
2514					    DROPPER(ipss, ipds_spd_got_clear),
2515					    &ipss->ipsec_spd_dropper);
2516					BUMP_MIB(&ipst->ips_ip_mib,
2517					    ipsecInFailed);
2518					IPPOL_REFRELE(p);
2519					return (NULL);
2520				}
2521			} else {
2522				BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
2523				if (p != NULL)
2524					IPPOL_REFRELE(p);
2525				return (mp);
2526			}
2527		} else {
2528			policy_head = connp->conn_policy;
2529
2530			/* Hold a reference in case the conn is closing */
2531			if (policy_head != NULL)
2532				IPPH_REFHOLD(policy_head);
2533			mutex_exit(&connp->conn_lock);
2534			/*
2535			 * As this is a non-hardbound connection we need
2536			 * to look at both per-socket policy and global
2537			 * policy.
2538			 */
2539			mp = ipsec_check_global_policy(mp, connp,
2540			    ipha, ip6h, ira, ns);
2541			if (policy_head != NULL)
2542				IPPH_REFRELE(policy_head, ns);
2543			return (mp);
2544		}
2545	}
2546
2547	mutex_enter(&connp->conn_lock);
2548	/* Connection is closing */
2549	if (connp->conn_state_flags & CONN_CONDEMNED) {
2550		mutex_exit(&connp->conn_lock);
2551		ip_drop_packet(mp, B_TRUE, NULL,
2552		    DROPPER(ipss, ipds_spd_got_clear),
2553		    &ipss->ipsec_spd_dropper);
2554		BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
2555		return (NULL);
2556	}
2557
2558	/*
2559	 * Once a connection is latched it remains so for life, the conn_latch
2560	 * pointer on the conn has not changed, simply initializing ipl here
2561	 * as the earlier initialization was done only in the cleartext case.
2562	 */
2563	if ((ipl = connp->conn_latch) == NULL) {
2564		mblk_t *retmp;
2565		policy_head = connp->conn_policy;
2566
2567		/* Hold a reference in case the conn is closing */
2568		if (policy_head != NULL)
2569			IPPH_REFHOLD(policy_head);
2570		mutex_exit(&connp->conn_lock);
2571		/*
2572		 * We don't have policies cached in the conn
2573		 * for this stream. So, look at the global
2574		 * policy. It will check against conn or global
2575		 * depending on whichever is stronger.
2576		 */
2577		retmp = ipsec_check_global_policy(mp, connp,
2578		    ipha, ip6h, ira, ns);
2579		if (policy_head != NULL)
2580			IPPH_REFRELE(policy_head, ns);
2581		return (retmp);
2582	}
2583
2584	IPLATCH_REFHOLD(ipl);
2585	/* Hold reference on conn_latch_in_action in case conn is closing */
2586	ap = connp->conn_latch_in_action;
2587	if (ap != NULL)
2588		IPACT_REFHOLD(ap);
2589	mutex_exit(&connp->conn_lock);
2590
2591	if (ap != NULL) {
2592		/* Policy is cached & latched; fast(er) path */
2593		const char *reason;
2594		kstat_named_t *counter;
2595
2596		if (ipsec_check_ipsecin_latch(ira, mp, ipl, ap,
2597		    ipha, ip6h, &reason, &counter, connp, ns)) {
2598			BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
2599			IPLATCH_REFRELE(ipl);
2600			IPACT_REFRELE(ap);
2601			return (mp);
2602		}
2603		ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0,
2604		    SL_ERROR|SL_WARN|SL_CONSOLE,
2605		    "ipsec inbound policy mismatch: %s, packet dropped\n",
2606		    reason);
2607		ip_drop_packet(mp, B_TRUE, NULL, counter,
2608		    &ipss->ipsec_spd_dropper);
2609		BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
2610		IPLATCH_REFRELE(ipl);
2611		IPACT_REFRELE(ap);
2612		return (NULL);
2613	}
2614	if ((p = connp->conn_latch_in_policy) == NULL) {
2615		ipsec_weird_null_inbound_policy++;
2616		IPLATCH_REFRELE(ipl);
2617		return (mp);
2618	}
2619
2620	unique_id = conn_to_unique(connp, mp, ipha, ip6h);
2621	IPPOL_REFHOLD(p);
2622	mp = ipsec_check_ipsecin_policy(mp, p, ipha, ip6h, unique_id, ira, ns);
2623	/*
2624	 * NOTE: ipsecIn{Failed,Succeeeded} bumped by
2625	 * ipsec_check_ipsecin_policy().
2626	 */
2627	if (mp != NULL)
2628		ipsec_latch_inbound(connp, ira);
2629	IPLATCH_REFRELE(ipl);
2630	return (mp);
2631}
2632
2633/*
2634 * Handle all sorts of cases like tunnel-mode and ICMP.
2635 */
2636static int
2637prepended_length(mblk_t *mp, uintptr_t hptr)
2638{
2639	int rc = 0;
2640
2641	while (mp != NULL) {
2642		if (hptr >= (uintptr_t)mp->b_rptr && hptr <
2643		    (uintptr_t)mp->b_wptr) {
2644			rc += (int)(hptr - (uintptr_t)mp->b_rptr);
2645			break;	/* out of while loop */
2646		}
2647		rc += (int)MBLKL(mp);
2648		mp = mp->b_cont;
2649	}
2650
2651	if (mp == NULL) {
2652		/*
2653		 * IF (big IF) we make it here by naturally exiting the loop,
2654		 * then ip6h isn't in the mblk chain "mp" at all.
2655		 *
2656		 * The only case where this happens is with a reversed IP
2657		 * header that gets passed up by inbound ICMP processing.
2658		 * This unfortunately triggers longstanding bug 6478464.  For
2659		 * now, just pass up 0 for the answer.
2660		 */
2661#ifdef DEBUG_NOT_UNTIL_6478464
2662		ASSERT(mp != NULL);
2663#endif
2664		rc = 0;
2665	}
2666
2667	return (rc);
2668}
2669
2670/*
2671 * Returns:
2672 *
2673 * SELRET_NOMEM --> msgpullup() needed to gather things failed.
2674 * SELRET_BADPKT --> If we're being called after tunnel-mode fragment
2675 *		     gathering, the initial fragment is too short for
2676 *		     useful data.  Only returned if SEL_TUNNEL_FIRSTFRAG is
2677 *		     set.
2678 * SELRET_SUCCESS --> "sel" now has initialized IPsec selector data.
2679 * SELRET_TUNFRAG --> This is a fragment in a tunnel-mode packet.  Caller
2680 *		      should put this packet in a fragment-gathering queue.
2681 *		      Only returned if SEL_TUNNEL_MODE and SEL_PORT_POLICY
2682 *		      is set.
2683 *
2684 * Note that ipha/ip6h can be in a different mblk (mp->b_cont) in the case
2685 * of tunneled packets.
2686 * Also, mp->b_rptr can be an ICMP error where ipha/ip6h is the packet in
2687 * error past the ICMP error.
2688 */
2689static selret_t
2690ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
2691    ip6_t *ip6h, uint8_t sel_flags)
2692{
2693	uint16_t *ports;
2694	int outer_hdr_len = 0;	/* For ICMP or tunnel-mode cases... */
2695	ushort_t hdr_len;
2696	mblk_t *spare_mp = NULL;
2697	uint8_t *nexthdrp, *transportp;
2698	uint8_t nexthdr;
2699	uint8_t icmp_proto;
2700	ip_pkt_t ipp;
2701	boolean_t port_policy_present = (sel_flags & SEL_PORT_POLICY);
2702	boolean_t is_icmp = (sel_flags & SEL_IS_ICMP);
2703	boolean_t tunnel_mode = (sel_flags & SEL_TUNNEL_MODE);
2704	boolean_t post_frag = (sel_flags & SEL_POST_FRAG);
2705
2706	ASSERT((ipha == NULL && ip6h != NULL) ||
2707	    (ipha != NULL && ip6h == NULL));
2708
2709	if (ip6h != NULL) {
2710		outer_hdr_len = prepended_length(mp, (uintptr_t)ip6h);
2711		nexthdr = ip6h->ip6_nxt;
2712		icmp_proto = IPPROTO_ICMPV6;
2713		sel->ips_isv4 = B_FALSE;
2714		sel->ips_local_addr_v6 = ip6h->ip6_dst;
2715		sel->ips_remote_addr_v6 = ip6h->ip6_src;
2716
2717		bzero(&ipp, sizeof (ipp));
2718		(void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &ipp, NULL);
2719
2720		switch (nexthdr) {
2721		case IPPROTO_HOPOPTS:
2722		case IPPROTO_ROUTING:
2723		case IPPROTO_DSTOPTS:
2724		case IPPROTO_FRAGMENT:
2725			/*
2726			 * Use ip_hdr_length_nexthdr_v6().  And have a spare
2727			 * mblk that's contiguous to feed it
2728			 */
2729			if ((spare_mp = msgpullup(mp, -1)) == NULL)
2730				return (SELRET_NOMEM);
2731			if (!ip_hdr_length_nexthdr_v6(spare_mp,
2732			    (ip6_t *)(spare_mp->b_rptr + outer_hdr_len),
2733			    &hdr_len, &nexthdrp)) {
2734				/* Malformed packet - caller frees. */
2735				ipsec_freemsg_chain(spare_mp);
2736				return (SELRET_BADPKT);
2737			}
2738			nexthdr = *nexthdrp;
2739			/* We can just extract based on hdr_len now. */
2740			break;
2741		default:
2742			hdr_len = IPV6_HDR_LEN;
2743			break;
2744		}
2745
2746		if (port_policy_present && IS_V6_FRAGMENT(ipp) && !is_icmp) {
2747			/* IPv6 Fragment */
2748			ipsec_freemsg_chain(spare_mp);
2749			return (SELRET_TUNFRAG);
2750		}
2751		transportp = (uint8_t *)ip6h + hdr_len;
2752	} else {
2753		outer_hdr_len = prepended_length(mp, (uintptr_t)ipha);
2754		icmp_proto = IPPROTO_ICMP;
2755		sel->ips_isv4 = B_TRUE;
2756		sel->ips_local_addr_v4 = ipha->ipha_dst;
2757		sel->ips_remote_addr_v4 = ipha->ipha_src;
2758		nexthdr = ipha->ipha_protocol;
2759		hdr_len = IPH_HDR_LENGTH(ipha);
2760
2761		if (port_policy_present &&
2762		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags) &&
2763		    !is_icmp) {
2764			/* IPv4 Fragment */
2765			ipsec_freemsg_chain(spare_mp);
2766			return (SELRET_TUNFRAG);
2767		}
2768		transportp = (uint8_t *)ipha + hdr_len;
2769	}
2770	sel->ips_protocol = nexthdr;
2771
2772	if ((nexthdr != IPPROTO_TCP && nexthdr != IPPROTO_UDP &&
2773	    nexthdr != IPPROTO_SCTP && nexthdr != icmp_proto) ||
2774	    (!port_policy_present && !post_frag && tunnel_mode)) {
2775		sel->ips_remote_port = sel->ips_local_port = 0;
2776		ipsec_freemsg_chain(spare_mp);
2777		return (SELRET_SUCCESS);
2778	}
2779
2780	if (transportp + 4 > mp->b_wptr) {
2781		/* If we didn't pullup a copy already, do so now. */
2782		/*
2783		 * XXX performance, will upper-layers frequently split TCP/UDP
2784		 * apart from IP or options?  If so, perhaps we should revisit
2785		 * the spare_mp strategy.
2786		 */
2787		ipsec_hdr_pullup_needed++;
2788		if (spare_mp == NULL &&
2789		    (spare_mp = msgpullup(mp, -1)) == NULL) {
2790			return (SELRET_NOMEM);
2791		}
2792		transportp = &spare_mp->b_rptr[hdr_len + outer_hdr_len];
2793	}
2794
2795	if (nexthdr == icmp_proto) {
2796		sel->ips_icmp_type = *transportp++;
2797		sel->ips_icmp_code = *transportp;
2798		sel->ips_remote_port = sel->ips_local_port = 0;
2799	} else {
2800		ports = (uint16_t *)transportp;
2801		sel->ips_remote_port = *ports++;
2802		sel->ips_local_port = *ports;
2803	}
2804	ipsec_freemsg_chain(spare_mp);
2805	return (SELRET_SUCCESS);
2806}
2807
2808/*
2809 * This is called with a b_next chain of messages from the fragcache code,
2810 * hence it needs to discard a chain on error.
2811 */
2812static boolean_t
2813ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
2814    ip6_t *ip6h, int outer_hdr_len, ipsec_stack_t *ipss)
2815{
2816	/*
2817	 * XXX cut&paste shared with ipsec_init_inbound_sel
2818	 */
2819	uint16_t *ports;
2820	ushort_t hdr_len;
2821	mblk_t *spare_mp = NULL;
2822	uint8_t *nexthdrp;
2823	uint8_t nexthdr;
2824	uint8_t *typecode;
2825	uint8_t check_proto;
2826
2827	ASSERT((ipha == NULL && ip6h != NULL) ||
2828	    (ipha != NULL && ip6h == NULL));
2829
2830	if (ip6h != NULL) {
2831		check_proto = IPPROTO_ICMPV6;
2832		nexthdr = ip6h->ip6_nxt;
2833		switch (nexthdr) {
2834		case IPPROTO_HOPOPTS:
2835		case IPPROTO_ROUTING:
2836		case IPPROTO_DSTOPTS:
2837		case IPPROTO_FRAGMENT:
2838			/*
2839			 * Use ip_hdr_length_nexthdr_v6().  And have a spare
2840			 * mblk that's contiguous to feed it
2841			 */
2842			spare_mp = msgpullup(mp, -1);
2843			if (spare_mp == NULL ||
2844			    !ip_hdr_length_nexthdr_v6(spare_mp,
2845			    (ip6_t *)(spare_mp->b_rptr + outer_hdr_len),
2846			    &hdr_len, &nexthdrp)) {
2847				/* Always works, even if NULL. */
2848				ipsec_freemsg_chain(spare_mp);
2849				ip_drop_packet_chain(mp, B_FALSE, NULL,
2850				    DROPPER(ipss, ipds_spd_nomem),
2851				    &ipss->ipsec_spd_dropper);
2852				return (B_FALSE);
2853			} else {
2854				nexthdr = *nexthdrp;
2855				/* We can just extract based on hdr_len now. */
2856			}
2857			break;
2858		default:
2859			hdr_len = IPV6_HDR_LEN;
2860			break;
2861		}
2862	} else {
2863		check_proto = IPPROTO_ICMP;
2864		hdr_len = IPH_HDR_LENGTH(ipha);
2865		nexthdr = ipha->ipha_protocol;
2866	}
2867
2868	sel->ips_protocol = nexthdr;
2869	if (nexthdr != IPPROTO_TCP && nexthdr != IPPROTO_UDP &&
2870	    nexthdr != IPPROTO_SCTP && nexthdr != check_proto) {
2871		sel->ips_local_port = sel->ips_remote_port = 0;
2872		ipsec_freemsg_chain(spare_mp); /* Always works, even if NULL */
2873		return (B_TRUE);
2874	}
2875
2876	if (&mp->b_rptr[hdr_len] + 4 + outer_hdr_len > mp->b_wptr) {
2877		/* If we didn't pullup a copy already, do so now. */
2878		/*
2879		 * XXX performance, will upper-layers frequently split TCP/UDP
2880		 * apart from IP or options?  If so, perhaps we should revisit
2881		 * the spare_mp strategy.
2882		 *
2883		 * XXX should this be msgpullup(mp, hdr_len+4) ???
2884		 */
2885		if (spare_mp == NULL &&
2886		    (spare_mp = msgpullup(mp, -1)) == NULL) {
2887			ip_drop_packet_chain(mp, B_FALSE, NULL,
2888			    DROPPER(ipss, ipds_spd_nomem),
2889			    &ipss->ipsec_spd_dropper);
2890			return (B_FALSE);
2891		}
2892		ports = (uint16_t *)&spare_mp->b_rptr[hdr_len + outer_hdr_len];
2893	} else {
2894		ports = (uint16_t *)&mp->b_rptr[hdr_len + outer_hdr_len];
2895	}
2896
2897	if (nexthdr == check_proto) {
2898		typecode = (uint8_t *)ports;
2899		sel->ips_icmp_type = *typecode++;
2900		sel->ips_icmp_code = *typecode;
2901		sel->ips_remote_port = sel->ips_local_port = 0;
2902	} else {
2903		sel->ips_local_port = *ports++;
2904		sel->ips_remote_port = *ports;
2905	}
2906	ipsec_freemsg_chain(spare_mp);	/* Always works, even if NULL */
2907	return (B_TRUE);
2908}
2909
2910/*
2911 * Prepend an mblk with a ipsec_crypto_t to the message chain.
2912 * Frees the argument and returns NULL should the allocation fail.
2913 * Returns the pointer to the crypto data part.
2914 */
2915mblk_t *
2916ipsec_add_crypto_data(mblk_t *data_mp, ipsec_crypto_t **icp)
2917{
2918	mblk_t	*mp;
2919
2920	mp = allocb(sizeof (ipsec_crypto_t), BPRI_MED);
2921	if (mp == NULL) {
2922		freemsg(data_mp);
2923		return (NULL);
2924	}
2925	bzero(mp->b_rptr, sizeof (ipsec_crypto_t));
2926	mp->b_wptr += sizeof (ipsec_crypto_t);
2927	mp->b_cont = data_mp;
2928	mp->b_datap->db_type = M_EVENT;	/* For ASSERT */
2929	*icp = (ipsec_crypto_t *)mp->b_rptr;
2930	return (mp);
2931}
2932
2933/*
2934 * Remove what was prepended above. Return b_cont and a pointer to the
2935 * crypto data.
2936 * The caller must call ipsec_free_crypto_data for mblk once it is done
2937 * with the crypto data.
2938 */
2939mblk_t *
2940ipsec_remove_crypto_data(mblk_t *crypto_mp, ipsec_crypto_t **icp)
2941{
2942	ASSERT(crypto_mp->b_datap->db_type == M_EVENT);
2943	ASSERT(MBLKL(crypto_mp) == sizeof (ipsec_crypto_t));
2944
2945	*icp = (ipsec_crypto_t *)crypto_mp->b_rptr;
2946	return (crypto_mp->b_cont);
2947}
2948
2949/*
2950 * Free what was prepended above. Return b_cont.
2951 */
2952mblk_t *
2953ipsec_free_crypto_data(mblk_t *crypto_mp)
2954{
2955	mblk_t	*mp;
2956
2957	ASSERT(crypto_mp->b_datap->db_type == M_EVENT);
2958	ASSERT(MBLKL(crypto_mp) == sizeof (ipsec_crypto_t));
2959
2960	mp = crypto_mp->b_cont;
2961	freeb(crypto_mp);
2962	return (mp);
2963}
2964
2965/*
2966 * Create an ipsec_action_t based on the way an inbound packet was protected.
2967 * Used to reflect traffic back to a sender.
2968 *
2969 * We don't bother interning the action into the hash table.
2970 */
2971ipsec_action_t *
2972ipsec_in_to_out_action(ip_recv_attr_t *ira)
2973{
2974	ipsa_t *ah_assoc, *esp_assoc;
2975	uint_t auth_alg = 0, encr_alg = 0, espa_alg = 0;
2976	ipsec_action_t *ap;
2977	boolean_t unique;
2978
2979	ap = kmem_cache_alloc(ipsec_action_cache, KM_NOSLEEP);
2980
2981	if (ap == NULL)
2982		return (NULL);
2983
2984	bzero(ap, sizeof (*ap));
2985	HASH_NULL(ap, ipa_hash);
2986	ap->ipa_next = NULL;
2987	ap->ipa_refs = 1;
2988
2989	/*
2990	 * Get the algorithms that were used for this packet.
2991	 */
2992	ap->ipa_act.ipa_type = IPSEC_ACT_APPLY;
2993	ap->ipa_act.ipa_log = 0;
2994	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2995
2996	ah_assoc = ira->ira_ipsec_ah_sa;
2997	ap->ipa_act.ipa_apply.ipp_use_ah = (ah_assoc != NULL);
2998
2999	esp_assoc = ira->ira_ipsec_esp_sa;
3000	ap->ipa_act.ipa_apply.ipp_use_esp = (esp_assoc != NULL);
3001
3002	if (esp_assoc != NULL) {
3003		encr_alg = esp_assoc->ipsa_encr_alg;
3004		espa_alg = esp_assoc->ipsa_auth_alg;
3005		ap->ipa_act.ipa_apply.ipp_use_espa = (espa_alg != 0);
3006	}
3007	if (ah_assoc != NULL)
3008		auth_alg = ah_assoc->ipsa_auth_alg;
3009
3010	ap->ipa_act.ipa_apply.ipp_encr_alg = (uint8_t)encr_alg;
3011	ap->ipa_act.ipa_apply.ipp_auth_alg = (uint8_t)auth_alg;
3012	ap->ipa_act.ipa_apply.ipp_esp_auth_alg = (uint8_t)espa_alg;
3013	ap->ipa_act.ipa_apply.ipp_use_se =
3014	    !!(ira->ira_flags & IRAF_IPSEC_DECAPS);
3015	unique = B_FALSE;
3016
3017	if (esp_assoc != NULL) {
3018		ap->ipa_act.ipa_apply.ipp_espa_minbits =
3019		    esp_assoc->ipsa_authkeybits;
3020		ap->ipa_act.ipa_apply.ipp_espa_maxbits =
3021		    esp_assoc->ipsa_authkeybits;
3022		ap->ipa_act.ipa_apply.ipp_espe_minbits =
3023		    esp_assoc->ipsa_encrkeybits;
3024		ap->ipa_act.ipa_apply.ipp_espe_maxbits =
3025		    esp_assoc->ipsa_encrkeybits;
3026		ap->ipa_act.ipa_apply.ipp_km_proto = esp_assoc->ipsa_kmp;
3027		ap->ipa_act.ipa_apply.ipp_km_cookie = esp_assoc->ipsa_kmc;
3028		if (esp_assoc->ipsa_flags & IPSA_F_UNIQUE)
3029			unique = B_TRUE;
3030	}
3031	if (ah_assoc != NULL) {
3032		ap->ipa_act.ipa_apply.ipp_ah_minbits =
3033		    ah_assoc->ipsa_authkeybits;
3034		ap->ipa_act.ipa_apply.ipp_ah_maxbits =
3035		    ah_assoc->ipsa_authkeybits;
3036		ap->ipa_act.ipa_apply.ipp_km_proto = ah_assoc->ipsa_kmp;
3037		ap->ipa_act.ipa_apply.ipp_km_cookie = ah_assoc->ipsa_kmc;
3038		if (ah_assoc->ipsa_flags & IPSA_F_UNIQUE)
3039			unique = B_TRUE;
3040	}
3041	ap->ipa_act.ipa_apply.ipp_use_unique = unique;
3042	ap->ipa_want_unique = unique;
3043	ap->ipa_allow_clear = B_FALSE;
3044	ap->ipa_want_se = !!(ira->ira_flags & IRAF_IPSEC_DECAPS);
3045	ap->ipa_want_ah = (ah_assoc != NULL);
3046	ap->ipa_want_esp = (esp_assoc != NULL);
3047
3048	ap->ipa_ovhd = ipsec_act_ovhd(&ap->ipa_act);
3049
3050	ap->ipa_act.ipa_apply.ipp_replay_depth = 0; /* don't care */
3051
3052	return (ap);
3053}
3054
3055
3056/*
3057 * Compute the worst-case amount of extra space required by an action.
3058 * Note that, because of the ESP considerations listed below, this is
3059 * actually not the same as the best-case reduction in the MTU; in the
3060 * future, we should pass additional information to this function to
3061 * allow the actual MTU impact to be computed.
3062 *
3063 * AH: Revisit this if we implement algorithms with
3064 * a verifier size of more than 12 bytes.
3065 *
3066 * ESP: A more exact but more messy computation would take into
3067 * account the interaction between the cipher block size and the
3068 * effective MTU, yielding the inner payload size which reflects a
3069 * packet with *minimum* ESP padding..
3070 */
3071int32_t
3072ipsec_act_ovhd(const ipsec_act_t *act)
3073{
3074	int32_t overhead = 0;
3075
3076	if (act->ipa_type == IPSEC_ACT_APPLY) {
3077		const ipsec_prot_t *ipp = &act->ipa_apply;
3078
3079		if (ipp->ipp_use_ah)
3080			overhead += IPSEC_MAX_AH_HDR_SIZE;
3081		if (ipp->ipp_use_esp) {
3082			overhead += IPSEC_MAX_ESP_HDR_SIZE;
3083			overhead += sizeof (struct udphdr);
3084		}
3085		if (ipp->ipp_use_se)
3086			overhead += IP_SIMPLE_HDR_LENGTH;
3087	}
3088	return (overhead);
3089}
3090
/*
 * Bucket selector for interned policy structures.  It is only consulted
 * while policies are being created, so it is not on the per-packet path.
 *
 * The arguments are currently ignored and every object maps to bucket 0.
 *
 * Future work: canonicalize the hashed structures (i.e. zeroize padding)
 * so that a real hash over [start, end) can be used.
 */
/* ARGSUSED */
static uint32_t
policy_hash(int size, const void *start, const void *end)
{
	const uint32_t only_bucket = 0;

	return (only_bucket);
}
3104
3105
/*
 * Per-address-family bucket selectors: reduce an address to an index in
 * the range [0, nchains).
 *
 * IPv4 uses the whole address value.  IPv6 uses only the low-order 32 bits
 * (s6_addr32[3]), on the assumption that these bits (typically holding the
 * low-order 24 bits of the MAC address) are reasonably well distributed.
 * Revisit this if lots of collisions on ::1-style addresses turn out to be
 * a problem (seems unlikely).
 */
#define	IPSEC_IPV4_HASH(addr, nchains) ((addr) % (nchains))
#define	IPSEC_IPV6_HASH(addr, nchains) (((addr).s6_addr32[3]) % (nchains))
3117
3118/*
3119 * These two hash functions should produce coordinated values
3120 * but have slightly different roles.
3121 */
3122static uint32_t
3123selkey_hash(const ipsec_selkey_t *selkey, netstack_t *ns)
3124{
3125	uint32_t valid = selkey->ipsl_valid;
3126	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3127
3128	if (!(valid & IPSL_REMOTE_ADDR))
3129		return (IPSEC_SEL_NOHASH);
3130
3131	if (valid & IPSL_IPV4) {
3132		if (selkey->ipsl_remote_pfxlen == 32) {
3133			return (IPSEC_IPV4_HASH(selkey->ipsl_remote.ipsad_v4,
3134			    ipss->ipsec_spd_hashsize));
3135		}
3136	}
3137	if (valid & IPSL_IPV6) {
3138		if (selkey->ipsl_remote_pfxlen == 128) {
3139			return (IPSEC_IPV6_HASH(selkey->ipsl_remote.ipsad_v6,
3140			    ipss->ipsec_spd_hashsize));
3141		}
3142	}
3143	return (IPSEC_SEL_NOHASH);
3144}
3145
3146static uint32_t
3147selector_hash(ipsec_selector_t *sel, ipsec_policy_root_t *root)
3148{
3149	if (sel->ips_isv4) {
3150		return (IPSEC_IPV4_HASH(sel->ips_remote_addr_v4,
3151		    root->ipr_nchains));
3152	}
3153	return (IPSEC_IPV6_HASH(sel->ips_remote_addr_v6, root->ipr_nchains));
3154}
3155
3156/*
3157 * Intern actions into the action hash table.
3158 */
3159ipsec_action_t *
3160ipsec_act_find(const ipsec_act_t *a, int n, netstack_t *ns)
3161{
3162	int i;
3163	uint32_t hval;
3164	ipsec_action_t *ap;
3165	ipsec_action_t *prev = NULL;
3166	int32_t overhead, maxovhd = 0;
3167	boolean_t allow_clear = B_FALSE;
3168	boolean_t want_ah = B_FALSE;
3169	boolean_t want_esp = B_FALSE;
3170	boolean_t want_se = B_FALSE;
3171	boolean_t want_unique = B_FALSE;
3172	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3173
3174	/*
3175	 * TODO: should canonicalize a[] (i.e., zeroize any padding)
3176	 * so we can use a non-trivial policy_hash function.
3177	 */
3178	for (i = n-1; i >= 0; i--) {
3179		hval = policy_hash(IPSEC_ACTION_HASH_SIZE, &a[i], &a[n]);
3180
3181		HASH_LOCK(ipss->ipsec_action_hash, hval);
3182
3183		for (HASH_ITERATE(ap, ipa_hash,
3184		    ipss->ipsec_action_hash, hval)) {
3185			if (bcmp(&ap->ipa_act, &a[i], sizeof (*a)) != 0)
3186				continue;
3187			if (ap->ipa_next != prev)
3188				continue;
3189			break;
3190		}
3191		if (ap != NULL) {
3192			HASH_UNLOCK(ipss->ipsec_action_hash, hval);
3193			prev = ap;
3194			continue;
3195		}
3196		/*
3197		 * need to allocate a new one..
3198		 */
3199		ap = kmem_cache_alloc(ipsec_action_cache, KM_NOSLEEP);
3200		if (ap == NULL) {
3201			HASH_UNLOCK(ipss->ipsec_action_hash, hval);
3202			if (prev != NULL)
3203				ipsec_action_free(prev);
3204			return (NULL);
3205		}
3206		HASH_INSERT(ap, ipa_hash, ipss->ipsec_action_hash, hval);
3207
3208		ap->ipa_next = prev;
3209		ap->ipa_act = a[i];
3210
3211		overhead = ipsec_act_ovhd(&a[i]);
3212		if (maxovhd < overhead)
3213			maxovhd = overhead;
3214
3215		if ((a[i].ipa_type == IPSEC_ACT_BYPASS) ||
3216		    (a[i].ipa_type == IPSEC_ACT_CLEAR))
3217			allow_clear = B_TRUE;
3218		if (a[i].ipa_type == IPSEC_ACT_APPLY) {
3219			const ipsec_prot_t *ipp = &a[i].ipa_apply;
3220
3221			ASSERT(ipp->ipp_use_ah || ipp->ipp_use_esp);
3222			want_ah |= ipp->ipp_use_ah;
3223			want_esp |= ipp->ipp_use_esp;
3224			want_se |= ipp->ipp_use_se;
3225			want_unique |= ipp->ipp_use_unique;
3226		}
3227		ap->ipa_allow_clear = allow_clear;
3228		ap->ipa_want_ah = want_ah;
3229		ap->ipa_want_esp = want_esp;
3230		ap->ipa_want_se = want_se;
3231		ap->ipa_want_unique = want_unique;
3232		ap->ipa_refs = 1; /* from the hash table */
3233		ap->ipa_ovhd = maxovhd;
3234		if (prev)
3235			prev->ipa_refs++;
3236		prev = ap;
3237		HASH_UNLOCK(ipss->ipsec_action_hash, hval);
3238	}
3239
3240	ap->ipa_refs++;		/* caller's reference */
3241
3242	return (ap);
3243}
3244
3245/*
3246 * Called when refcount goes to 0, indicating that all references to this
3247 * node are gone.
3248 *
3249 * This does not unchain the action from the hash table.
3250 */
3251void
3252ipsec_action_free(ipsec_action_t *ap)
3253{
3254	for (;;) {
3255		ipsec_action_t *np = ap->ipa_next;
3256		ASSERT(ap->ipa_refs == 0);
3257		ASSERT(ap->ipa_hash.hash_pp == NULL);
3258		kmem_cache_free(ipsec_action_cache, ap);
3259		ap = np;
3260		/* Inlined IPACT_REFRELE -- avoid recursion */
3261		if (ap == NULL)
3262			break;
3263		membar_exit();
3264		if (atomic_add_32_nv(&(ap)->ipa_refs, -1) != 0)
3265			break;
3266		/* End inlined IPACT_REFRELE */
3267	}
3268}
3269
3270/*
3271 * Called when the action hash table goes away.
3272 *
3273 * The actions can be queued on an mblk with ipsec_in or
3274 * ipsec_out, hence the actions might still be around.
3275 * But we decrement ipa_refs here since we no longer have
3276 * a reference to the action from the hash table.
3277 */
3278static void
3279ipsec_action_free_table(ipsec_action_t *ap)
3280{
3281	while (ap != NULL) {
3282		ipsec_action_t *np = ap->ipa_next;
3283
3284		/* FIXME: remove? */
3285		(void) printf("ipsec_action_free_table(%p) ref %d\n",
3286		    (void *)ap, ap->ipa_refs);
3287		ASSERT(ap->ipa_refs > 0);
3288		IPACT_REFRELE(ap);
3289		ap = np;
3290	}
3291}
3292
3293/*
3294 * Need to walk all stack instances since the reclaim function
3295 * is global for all instances
3296 */
3297/* ARGSUSED */
3298static void
3299ipsec_action_reclaim(void *arg)
3300{
3301	netstack_handle_t nh;
3302	netstack_t *ns;
3303
3304	netstack_next_init(&nh);
3305	while ((ns = netstack_next(&nh)) != NULL) {
3306		ipsec_action_reclaim_stack(ns);
3307		netstack_rele(ns);
3308	}
3309	netstack_next_fini(&nh);
3310}
3311
3312/*
3313 * Periodically sweep action hash table for actions with refcount==1, and
3314 * nuke them.  We cannot do this "on demand" (i.e., from IPACT_REFRELE)
3315 * because we can't close the race between another thread finding the action
3316 * in the hash table without holding the bucket lock during IPACT_REFRELE.
3317 * Instead, we run this function sporadically to clean up after ourselves;
3318 * we also set it as the "reclaim" function for the action kmem_cache.
3319 *
3320 * Note that it may take several passes of ipsec_action_gc() to free all
3321 * "stale" actions.
3322 */
3323static void
3324ipsec_action_reclaim_stack(netstack_t *ns)
3325{
3326	int i;
3327	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3328
3329	for (i = 0; i < IPSEC_ACTION_HASH_SIZE; i++) {
3330		ipsec_action_t *ap, *np;
3331
3332		/* skip the lock if nobody home */
3333		if (ipss->ipsec_action_hash[i].hash_head == NULL)
3334			continue;
3335
3336		HASH_LOCK(ipss->ipsec_action_hash, i);
3337		for (ap = ipss->ipsec_action_hash[i].hash_head;
3338		    ap != NULL; ap = np) {
3339			ASSERT(ap->ipa_refs > 0);
3340			np = ap->ipa_hash.hash_next;
3341			if (ap->ipa_refs > 1)
3342				continue;
3343			HASH_UNCHAIN(ap, ipa_hash,
3344			    ipss->ipsec_action_hash, i);
3345			IPACT_REFRELE(ap);
3346		}
3347		HASH_UNLOCK(ipss->ipsec_action_hash, i);
3348	}
3349}
3350
3351/*
3352 * Intern a selector set into the selector set hash table.
3353 * This is simpler than the actions case..
3354 */
3355static ipsec_sel_t *
3356ipsec_find_sel(ipsec_selkey_t *selkey, netstack_t *ns)
3357{
3358	ipsec_sel_t *sp;
3359	uint32_t hval, bucket;
3360	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3361
3362	/*
3363	 * Exactly one AF bit should be set in selkey.
3364	 */
3365	ASSERT(!(selkey->ipsl_valid & IPSL_IPV4) ^
3366	    !(selkey->ipsl_valid & IPSL_IPV6));
3367
3368	hval = selkey_hash(selkey, ns);
3369	/* Set pol_hval to uninitialized until we put it in a polhead. */
3370	selkey->ipsl_sel_hval = hval;
3371
3372	bucket = (hval == IPSEC_SEL_NOHASH) ? 0 : hval;
3373
3374	ASSERT(!HASH_LOCKED(ipss->ipsec_sel_hash, bucket));
3375	HASH_LOCK(ipss->ipsec_sel_hash, bucket);
3376
3377	for (HASH_ITERATE(sp, ipsl_hash, ipss->ipsec_sel_hash, bucket)) {
3378		if (bcmp(&sp->ipsl_key, selkey,
3379		    offsetof(ipsec_selkey_t, ipsl_pol_hval)) == 0)
3380			break;
3381	}
3382	if (sp != NULL) {
3383		sp->ipsl_refs++;
3384
3385		HASH_UNLOCK(ipss->ipsec_sel_hash, bucket);
3386		return (sp);
3387	}
3388
3389	sp = kmem_cache_alloc(ipsec_sel_cache, KM_NOSLEEP);
3390	if (sp == NULL) {
3391		HASH_UNLOCK(ipss->ipsec_sel_hash, bucket);
3392		return (NULL);
3393	}
3394
3395	HASH_INSERT(sp, ipsl_hash, ipss->ipsec_sel_hash, bucket);
3396	sp->ipsl_refs = 2;	/* one for hash table, one for caller */
3397	sp->ipsl_key = *selkey;
3398	/* Set to uninitalized and have insertion into polhead fix things. */
3399	if (selkey->ipsl_sel_hval != IPSEC_SEL_NOHASH)
3400		sp->ipsl_key.ipsl_pol_hval = 0;
3401	else
3402		sp->ipsl_key.ipsl_pol_hval = IPSEC_SEL_NOHASH;
3403
3404	HASH_UNLOCK(ipss->ipsec_sel_hash, bucket);
3405
3406	return (sp);
3407}
3408
3409static void
3410ipsec_sel_rel(ipsec_sel_t **spp, netstack_t *ns)
3411{
3412	ipsec_sel_t *sp = *spp;
3413	int hval = sp->ipsl_key.ipsl_sel_hval;
3414	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3415
3416	*spp = NULL;
3417
3418	if (hval == IPSEC_SEL_NOHASH)
3419		hval = 0;
3420
3421	ASSERT(!HASH_LOCKED(ipss->ipsec_sel_hash, hval));
3422	HASH_LOCK(ipss->ipsec_sel_hash, hval);
3423	if (--sp->ipsl_refs == 1) {
3424		HASH_UNCHAIN(sp, ipsl_hash, ipss->ipsec_sel_hash, hval);
3425		sp->ipsl_refs--;
3426		HASH_UNLOCK(ipss->ipsec_sel_hash, hval);
3427		ASSERT(sp->ipsl_refs == 0);
3428		kmem_cache_free(ipsec_sel_cache, sp);
3429		/* Caller unlocks */
3430		return;
3431	}
3432
3433	HASH_UNLOCK(ipss->ipsec_sel_hash, hval);
3434}
3435
3436/*
3437 * Free a policy rule which we know is no longer being referenced.
3438 */
3439void
3440ipsec_policy_free(ipsec_policy_t *ipp)
3441{
3442	ASSERT(ipp->ipsp_refs == 0);
3443	ASSERT(ipp->ipsp_sel != NULL);
3444	ASSERT(ipp->ipsp_act != NULL);
3445	ASSERT(ipp->ipsp_netstack != NULL);
3446
3447	ipsec_sel_rel(&ipp->ipsp_sel, ipp->ipsp_netstack);
3448	IPACT_REFRELE(ipp->ipsp_act);
3449	kmem_cache_free(ipsec_pol_cache, ipp);
3450}
3451
3452/*
3453 * Construction of new policy rules; construct a policy, and add it to
3454 * the appropriate tables.
3455 */
3456ipsec_policy_t *
3457ipsec_policy_create(ipsec_selkey_t *keys, const ipsec_act_t *a,
3458    int nacts, int prio, uint64_t *index_ptr, netstack_t *ns)
3459{
3460	ipsec_action_t *ap;
3461	ipsec_sel_t *sp;
3462	ipsec_policy_t *ipp;
3463	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3464
3465	if (index_ptr == NULL)
3466		index_ptr = &ipss->ipsec_next_policy_index;
3467
3468	ipp = kmem_cache_alloc(ipsec_pol_cache, KM_NOSLEEP);
3469	ap = ipsec_act_find(a, nacts, ns);
3470	sp = ipsec_find_sel(keys, ns);
3471
3472	if ((ap == NULL) || (sp == NULL) || (ipp == NULL)) {
3473		if (ap != NULL) {
3474			IPACT_REFRELE(ap);
3475		}
3476		if (sp != NULL)
3477			ipsec_sel_rel(&sp, ns);
3478		if (ipp != NULL)
3479			kmem_cache_free(ipsec_pol_cache, ipp);
3480		return (NULL);
3481	}
3482
3483	HASH_NULL(ipp, ipsp_hash);
3484
3485	ipp->ipsp_netstack = ns;	/* Needed for ipsec_policy_free */
3486	ipp->ipsp_refs = 1;	/* caller's reference */
3487	ipp->ipsp_sel = sp;
3488	ipp->ipsp_act = ap;
3489	ipp->ipsp_prio = prio;	/* rule priority */
3490	ipp->ipsp_index = *index_ptr;
3491	(*index_ptr)++;
3492
3493	return (ipp);
3494}
3495
3496static void
3497ipsec_update_present_flags(ipsec_stack_t *ipss)
3498{
3499	boolean_t hashpol;
3500
3501	hashpol = (avl_numnodes(&ipss->ipsec_system_policy.iph_rulebyid) > 0);
3502
3503	if (hashpol) {
3504		ipss->ipsec_outbound_v4_policy_present = B_TRUE;
3505		ipss->ipsec_outbound_v6_policy_present = B_TRUE;
3506		ipss->ipsec_inbound_v4_policy_present = B_TRUE;
3507		ipss->ipsec_inbound_v6_policy_present = B_TRUE;
3508		return;
3509	}
3510
3511	ipss->ipsec_outbound_v4_policy_present = (NULL !=
3512	    ipss->ipsec_system_policy.iph_root[IPSEC_TYPE_OUTBOUND].
3513	    ipr_nonhash[IPSEC_AF_V4]);
3514	ipss->ipsec_outbound_v6_policy_present = (NULL !=
3515	    ipss->ipsec_system_policy.iph_root[IPSEC_TYPE_OUTBOUND].
3516	    ipr_nonhash[IPSEC_AF_V6]);
3517	ipss->ipsec_inbound_v4_policy_present = (NULL !=
3518	    ipss->ipsec_system_policy.iph_root[IPSEC_TYPE_INBOUND].
3519	    ipr_nonhash[IPSEC_AF_V4]);
3520	ipss->ipsec_inbound_v6_policy_present = (NULL !=
3521	    ipss->ipsec_system_policy.iph_root[IPSEC_TYPE_INBOUND].
3522	    ipr_nonhash[IPSEC_AF_V6]);
3523}
3524
3525boolean_t
3526ipsec_policy_delete(ipsec_policy_head_t *php, ipsec_selkey_t *keys, int dir,
3527	netstack_t *ns)
3528{
3529	ipsec_sel_t *sp;
3530	ipsec_policy_t *ip, *nip, *head;
3531	int af;
3532	ipsec_policy_root_t *pr = &php->iph_root[dir];
3533
3534	sp = ipsec_find_sel(keys, ns);
3535
3536	if (sp == NULL)
3537		return (B_FALSE);
3538
3539	af = (sp->ipsl_key.ipsl_valid & IPSL_IPV4) ? IPSEC_AF_V4 : IPSEC_AF_V6;
3540
3541	rw_enter(&php->iph_lock, RW_WRITER);
3542
3543	if (sp->ipsl_key.ipsl_pol_hval == IPSEC_SEL_NOHASH) {
3544		head = pr->ipr_nonhash[af];
3545	} else {
3546		head = pr->ipr_hash[sp->ipsl_key.ipsl_pol_hval].hash_head;
3547	}
3548
3549	for (ip = head; ip != NULL; ip = nip) {
3550		nip = ip->ipsp_hash.hash_next;
3551		if (ip->ipsp_sel != sp) {
3552			continue;
3553		}
3554
3555		IPPOL_UNCHAIN(php, ip);
3556
3557		php->iph_gen++;
3558		ipsec_update_present_flags(ns->netstack_ipsec);
3559
3560		rw_exit(&php->iph_lock);
3561
3562		ipsec_sel_rel(&sp, ns);
3563
3564		return (B_TRUE);
3565	}
3566
3567	rw_exit(&php->iph_lock);
3568	ipsec_sel_rel(&sp, ns);
3569	return (B_FALSE);
3570}
3571
3572int
3573ipsec_policy_delete_index(ipsec_policy_head_t *php, uint64_t policy_index,
3574    netstack_t *ns)
3575{
3576	boolean_t found = B_FALSE;
3577	ipsec_policy_t ipkey;
3578	ipsec_policy_t *ip;
3579	avl_index_t where;
3580
3581	(void) memset(&ipkey, 0, sizeof (ipkey));
3582	ipkey.ipsp_index = policy_index;
3583
3584	rw_enter(&php->iph_lock, RW_WRITER);
3585
3586	/*
3587	 * We could be cleverer here about the walk.
3588	 * but well, (k+1)*log(N) will do for now (k==number of matches,
3589	 * N==number of table entries
3590	 */
3591	for (;;) {
3592		ip = (ipsec_policy_t *)avl_find(&php->iph_rulebyid,
3593		    (void *)&ipkey, &where);
3594		ASSERT(ip == NULL);
3595
3596		ip = avl_nearest(&php->iph_rulebyid, where, AVL_AFTER);
3597
3598		if (ip == NULL)
3599			break;
3600
3601		if (ip->ipsp_index != policy_index) {
3602			ASSERT(ip->ipsp_index > policy_index);
3603			break;
3604		}
3605
3606		IPPOL_UNCHAIN(php, ip);
3607		found = B_TRUE;
3608	}
3609
3610	if (found) {
3611		php->iph_gen++;
3612		ipsec_update_present_flags(ns->netstack_ipsec);
3613	}
3614
3615	rw_exit(&php->iph_lock);
3616
3617	return (found ? 0 : ENOENT);
3618}
3619
3620/*
3621 * Given a constructed ipsec_policy_t policy rule, see if it can be entered
3622 * into the correct policy ruleset.  As a side-effect, it sets the hash
3623 * entries on "ipp"'s ipsp_pol_hval.
3624 *
3625 * Returns B_TRUE if it can be entered, B_FALSE if it can't be (because a
3626 * duplicate policy exists with exactly the same selectors), or an icmp
3627 * rule exists with a different encryption/authentication action.
3628 */
3629boolean_t
3630ipsec_check_policy(ipsec_policy_head_t *php, ipsec_policy_t *ipp, int direction)
3631{
3632	ipsec_policy_root_t *pr = &php->iph_root[direction];
3633	int af = -1;
3634	ipsec_policy_t *p2, *head;
3635	uint8_t check_proto;
3636	ipsec_selkey_t *selkey = &ipp->ipsp_sel->ipsl_key;
3637	uint32_t	valid = selkey->ipsl_valid;
3638
3639	if (valid & IPSL_IPV6) {
3640		ASSERT(!(valid & IPSL_IPV4));
3641		af = IPSEC_AF_V6;
3642		check_proto = IPPROTO_ICMPV6;
3643	} else {
3644		ASSERT(valid & IPSL_IPV4);
3645		af = IPSEC_AF_V4;
3646		check_proto = IPPROTO_ICMP;
3647	}
3648
3649	ASSERT(RW_WRITE_HELD(&php->iph_lock));
3650
3651	/*
3652	 * Double-check that we don't have any duplicate selectors here.
3653	 * Because selectors are interned below, we need only compare pointers
3654	 * for equality.
3655	 */
3656	if (selkey->ipsl_sel_hval == IPSEC_SEL_NOHASH) {
3657		head = pr->ipr_nonhash[af];
3658	} else {
3659		selkey->ipsl_pol_hval =
3660		    (selkey->ipsl_valid & IPSL_IPV4) ?
3661		    IPSEC_IPV4_HASH(selkey->ipsl_remote.ipsad_v4,
3662		    pr->ipr_nchains) :
3663		    IPSEC_IPV6_HASH(selkey->ipsl_remote.ipsad_v6,
3664		    pr->ipr_nchains);
3665
3666		head = pr->ipr_hash[selkey->ipsl_pol_hval].hash_head;
3667	}
3668
3669	for (p2 = head; p2 != NULL; p2 = p2->ipsp_hash.hash_next) {
3670		if (p2->ipsp_sel == ipp->ipsp_sel)
3671			return (B_FALSE);
3672	}
3673
3674	/*
3675	 * If it's ICMP and not a drop or pass rule, run through the ICMP
3676	 * rules and make sure the action is either new or the same as any
3677	 * other actions.  We don't have to check the full chain because
3678	 * discard and bypass will override all other actions
3679	 */
3680
3681	if (valid & IPSL_PROTOCOL &&
3682	    selkey->ipsl_proto == check_proto &&
3683	    (ipp->ipsp_act->ipa_act.ipa_type == IPSEC_ACT_APPLY)) {
3684
3685		for (p2 = head; p2 != NULL; p2 = p2->ipsp_hash.hash_next) {
3686
3687			if (p2->ipsp_sel->ipsl_key.ipsl_valid & IPSL_PROTOCOL &&
3688			    p2->ipsp_sel->ipsl_key.ipsl_proto == check_proto &&
3689			    (p2->ipsp_act->ipa_act.ipa_type ==
3690			    IPSEC_ACT_APPLY)) {
3691				return (ipsec_compare_action(p2, ipp));
3692			}
3693		}
3694	}
3695
3696	return (B_TRUE);
3697}
3698
3699/*
3700 * compare the action chains of two policies for equality
3701 * B_TRUE -> effective equality
3702 */
3703
3704static boolean_t
3705ipsec_compare_action(ipsec_policy_t *p1, ipsec_policy_t *p2)
3706{
3707
3708	ipsec_action_t *act1, *act2;
3709
3710	/* We have a valid rule. Let's compare the actions */
3711	if (p1->ipsp_act == p2->ipsp_act) {
3712		/* same action. We are good */
3713		return (B_TRUE);
3714	}
3715
3716	/* we have to walk the chain */
3717
3718	act1 = p1->ipsp_act;
3719	act2 = p2->ipsp_act;
3720
3721	while (act1 != NULL && act2 != NULL) {
3722
3723		/* otherwise, Are we close enough? */
3724		if (act1->ipa_allow_clear != act2->ipa_allow_clear ||
3725		    act1->ipa_want_ah != act2->ipa_want_ah ||
3726		    act1->ipa_want_esp != act2->ipa_want_esp ||
3727		    act1->ipa_want_se != act2->ipa_want_se) {
3728			/* Nope, we aren't */
3729			return (B_FALSE);
3730		}
3731
3732		if (act1->ipa_want_ah) {
3733			if (act1->ipa_act.ipa_apply.ipp_auth_alg !=
3734			    act2->ipa_act.ipa_apply.ipp_auth_alg) {
3735				return (B_FALSE);
3736			}
3737
3738			if (act1->ipa_act.ipa_apply.ipp_ah_minbits !=
3739			    act2->ipa_act.ipa_apply.ipp_ah_minbits ||
3740			    act1->ipa_act.ipa_apply.ipp_ah_maxbits !=
3741			    act2->ipa_act.ipa_apply.ipp_ah_maxbits) {
3742				return (B_FALSE);
3743			}
3744		}
3745
3746		if (act1->ipa_want_esp) {
3747			if (act1->ipa_act.ipa_apply.ipp_use_esp !=
3748			    act2->ipa_act.ipa_apply.ipp_use_esp ||
3749			    act1->ipa_act.ipa_apply.ipp_use_espa !=
3750			    act2->ipa_act.ipa_apply.ipp_use_espa) {
3751				return (B_FALSE);
3752			}
3753
3754			if (act1->ipa_act.ipa_apply.ipp_use_esp) {
3755				if (act1->ipa_act.ipa_apply.ipp_encr_alg !=
3756				    act2->ipa_act.ipa_apply.ipp_encr_alg) {
3757					return (B_FALSE);
3758				}
3759
3760				if (act1->ipa_act.ipa_apply.ipp_espe_minbits !=
3761				    act2->ipa_act.ipa_apply.ipp_espe_minbits ||
3762				    act1->ipa_act.ipa_apply.ipp_espe_maxbits !=
3763				    act2->ipa_act.ipa_apply.ipp_espe_maxbits) {
3764					return (B_FALSE);
3765				}
3766			}
3767
3768			if (act1->ipa_act.ipa_apply.ipp_use_espa) {
3769				if (act1->ipa_act.ipa_apply.ipp_esp_auth_alg !=
3770				    act2->ipa_act.ipa_apply.ipp_esp_auth_alg) {
3771					return (B_FALSE);
3772				}
3773
3774				if (act1->ipa_act.ipa_apply.ipp_espa_minbits !=
3775				    act2->ipa_act.ipa_apply.ipp_espa_minbits ||
3776				    act1->ipa_act.ipa_apply.ipp_espa_maxbits !=
3777				    act2->ipa_act.ipa_apply.ipp_espa_maxbits) {
3778					return (B_FALSE);
3779				}
3780			}
3781
3782		}
3783
3784		act1 = act1->ipa_next;
3785		act2 = act2->ipa_next;
3786	}
3787
3788	if (act1 != NULL || act2 != NULL) {
3789		return (B_FALSE);
3790	}
3791
3792	return (B_TRUE);
3793}
3794
3795
3796/*
3797 * Given a constructed ipsec_policy_t policy rule, enter it into
3798 * the correct policy ruleset.
3799 *
3800 * ipsec_check_policy() is assumed to have succeeded first (to check for
3801 * duplicates).
3802 */
3803void
3804ipsec_enter_policy(ipsec_policy_head_t *php, ipsec_policy_t *ipp, int direction,
3805    netstack_t *ns)
3806{
3807	ipsec_policy_root_t *pr = &php->iph_root[direction];
3808	ipsec_selkey_t *selkey = &ipp->ipsp_sel->ipsl_key;
3809	uint32_t valid = selkey->ipsl_valid;
3810	uint32_t hval = selkey->ipsl_pol_hval;
3811	int af = -1;
3812
3813	ASSERT(RW_WRITE_HELD(&php->iph_lock));
3814
3815	if (valid & IPSL_IPV6) {
3816		ASSERT(!(valid & IPSL_IPV4));
3817		af = IPSEC_AF_V6;
3818	} else {
3819		ASSERT(valid & IPSL_IPV4);
3820		af = IPSEC_AF_V4;
3821	}
3822
3823	php->iph_gen++;
3824
3825	if (hval == IPSEC_SEL_NOHASH) {
3826		HASHLIST_INSERT(ipp, ipsp_hash, pr->ipr_nonhash[af]);
3827	} else {
3828		HASH_LOCK(pr->ipr_hash, hval);
3829		HASH_INSERT(ipp, ipsp_hash, pr->ipr_hash, hval);
3830		HASH_UNLOCK(pr->ipr_hash, hval);
3831	}
3832
3833	ipsec_insert_always(&php->iph_rulebyid, ipp);
3834
3835	ipsec_update_present_flags(ns->netstack_ipsec);
3836}
3837
3838static void
3839ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr)
3840{
3841	ipsec_policy_t *ip, *nip;
3842	int af, chain, nchain;
3843
3844	for (af = 0; af < IPSEC_NAF; af++) {
3845		for (ip = ipr->ipr_nonhash[af]; ip != NULL; ip = nip) {
3846			nip = ip->ipsp_hash.hash_next;
3847			IPPOL_UNCHAIN(php, ip);
3848		}
3849		ipr->ipr_nonhash[af] = NULL;
3850	}
3851	nchain = ipr->ipr_nchains;
3852
3853	for (chain = 0; chain < nchain; chain++) {
3854		for (ip = ipr->ipr_hash[chain].hash_head; ip != NULL;
3855		    ip = nip) {
3856			nip = ip->ipsp_hash.hash_next;
3857			IPPOL_UNCHAIN(php, ip);
3858		}
3859		ipr->ipr_hash[chain].hash_head = NULL;
3860	}
3861}
3862
3863/*
3864 * Create and insert inbound or outbound policy associated with actp for the
3865 * address family fam into the policy head ph.  Returns B_TRUE if policy was
3866 * inserted, and B_FALSE otherwise.
3867 */
3868boolean_t
3869ipsec_polhead_insert(ipsec_policy_head_t *ph, ipsec_act_t *actp, uint_t nact,
3870    int fam, int ptype, netstack_t *ns)
3871{
3872	ipsec_selkey_t		sel;
3873	ipsec_policy_t		*pol;
3874	ipsec_policy_root_t	*pr;
3875
3876	bzero(&sel, sizeof (sel));
3877	sel.ipsl_valid = (fam == IPSEC_AF_V4 ? IPSL_IPV4 : IPSL_IPV6);
3878	if ((pol = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET,
3879	    NULL, ns)) != NULL) {
3880		pr = &ph->iph_root[ptype];
3881		HASHLIST_INSERT(pol, ipsp_hash, pr->ipr_nonhash[fam]);
3882		ipsec_insert_always(&ph->iph_rulebyid, pol);
3883	}
3884	return (pol != NULL);
3885}
3886
3887void
3888ipsec_polhead_flush(ipsec_policy_head_t *php, netstack_t *ns)
3889{
3890	int dir;
3891
3892	ASSERT(RW_WRITE_HELD(&php->iph_lock));
3893
3894	for (dir = 0; dir < IPSEC_NTYPES; dir++)
3895		ipsec_ipr_flush(php, &php->iph_root[dir]);
3896
3897	php->iph_gen++;
3898	ipsec_update_present_flags(ns->netstack_ipsec);
3899}
3900
3901void
3902ipsec_polhead_free(ipsec_policy_head_t *php, netstack_t *ns)
3903{
3904	int dir;
3905
3906	ASSERT(php->iph_refs == 0);
3907
3908	rw_enter(&php->iph_lock, RW_WRITER);
3909	ipsec_polhead_flush(php, ns);
3910	rw_exit(&php->iph_lock);
3911	rw_destroy(&php->iph_lock);
3912	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
3913		ipsec_policy_root_t *ipr = &php->iph_root[dir];
3914		int chain;
3915
3916		for (chain = 0; chain < ipr->ipr_nchains; chain++)
3917			mutex_destroy(&(ipr->ipr_hash[chain].hash_lock));
3918
3919	}
3920	ipsec_polhead_free_table(php);
3921	kmem_free(php, sizeof (*php));
3922}
3923
3924static void
3925ipsec_ipr_init(ipsec_policy_root_t *ipr)
3926{
3927	int af;
3928
3929	ipr->ipr_nchains = 0;
3930	ipr->ipr_hash = NULL;
3931
3932	for (af = 0; af < IPSEC_NAF; af++) {
3933		ipr->ipr_nonhash[af] = NULL;
3934	}
3935}
3936
3937ipsec_policy_head_t *
3938ipsec_polhead_create(void)
3939{
3940	ipsec_policy_head_t *php;
3941
3942	php = kmem_alloc(sizeof (*php), KM_NOSLEEP);
3943	if (php == NULL)
3944		return (php);
3945
3946	rw_init(&php->iph_lock, NULL, RW_DEFAULT, NULL);
3947	php->iph_refs = 1;
3948	php->iph_gen = 0;
3949
3950	ipsec_ipr_init(&php->iph_root[IPSEC_TYPE_INBOUND]);
3951	ipsec_ipr_init(&php->iph_root[IPSEC_TYPE_OUTBOUND]);
3952
3953	avl_create(&php->iph_rulebyid, ipsec_policy_cmpbyid,
3954	    sizeof (ipsec_policy_t), offsetof(ipsec_policy_t, ipsp_byid));
3955
3956	return (php);
3957}
3958
3959/*
3960 * Clone the policy head into a new polhead; release one reference to the
3961 * old one and return the only reference to the new one.
3962 * If the old one had a refcount of 1, just return it.
3963 */
3964ipsec_policy_head_t *
3965ipsec_polhead_split(ipsec_policy_head_t *php, netstack_t *ns)
3966{
3967	ipsec_policy_head_t *nphp;
3968
3969	if (php == NULL)
3970		return (ipsec_polhead_create());
3971	else if (php->iph_refs == 1)
3972		return (php);
3973
3974	nphp = ipsec_polhead_create();
3975	if (nphp == NULL)
3976		return (NULL);
3977
3978	if (ipsec_copy_polhead(php, nphp, ns) != 0) {
3979		ipsec_polhead_free(nphp, ns);
3980		return (NULL);
3981	}
3982	IPPH_REFRELE(php, ns);
3983	return (nphp);
3984}
3985
3986/*
3987 * When sending a response to a ICMP request or generating a RST
3988 * in the TCP case, the outbound packets need to go at the same level
3989 * of protection as the incoming ones i.e we associate our outbound
3990 * policy with how the packet came in. We call this after we have
3991 * accepted the incoming packet which may or may not have been in
3992 * clear and hence we are sending the reply back with the policy
3993 * matching the incoming datagram's policy.
3994 *
3995 * NOTE : This technology serves two purposes :
3996 *
3997 * 1) If we have multiple outbound policies, we send out a reply
3998 *    matching with how it came in rather than matching the outbound
3999 *    policy.
4000 *
 * 2) For asymmetric policies, we want to make sure that incoming
 *    and outgoing has the same level of protection. Asymmetric
 *    policies exist only with global policy where we may not have
 *    both outbound and inbound at the same time.
4005 *
4006 * NOTE2:	This function is called by cleartext cases, so it needs to be
4007 *		in IP proper.
4008 *
4009 * Note: the caller has moved other parts of ira into ixa already.
4010 */
4011boolean_t
4012ipsec_in_to_out(ip_recv_attr_t *ira, ip_xmit_attr_t *ixa, mblk_t *data_mp,
4013    ipha_t *ipha, ip6_t *ip6h)
4014{
4015	ipsec_selector_t sel;
4016	ipsec_action_t	*reflect_action = NULL;
4017	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
4018
4019	bzero((void*)&sel, sizeof (sel));
4020
4021	if (ira->ira_ipsec_action != NULL) {
4022		/* transfer reference.. */
4023		reflect_action = ira->ira_ipsec_action;
4024		ira->ira_ipsec_action = NULL;
4025	} else if (!(ira->ira_flags & IRAF_LOOPBACK))
4026		reflect_action = ipsec_in_to_out_action(ira);
4027
4028	/*
4029	 * The caller is going to send the datagram out which might
4030	 * go on the wire or delivered locally through ire_send_local.
4031	 *
4032	 * 1) If it goes out on the wire, new associations will be
4033	 *    obtained.
4034	 * 2) If it is delivered locally, ire_send_local will convert
4035	 *    this ip_xmit_attr_t back to a ip_recv_attr_t looking at the
4036	 *    requests.
4037	 */
4038	ixa->ixa_ipsec_action = reflect_action;
4039
4040	if (!ipsec_init_outbound_ports(&sel, data_mp, ipha, ip6h, 0,
4041	    ns->netstack_ipsec)) {
4042		/* Note: data_mp already consumed and ip_drop_packet done */
4043		return (B_FALSE);
4044	}
4045	ixa->ixa_ipsec_src_port = sel.ips_local_port;
4046	ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
4047	ixa->ixa_ipsec_proto = sel.ips_protocol;
4048	ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
4049	ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
4050
4051	/*
4052	 * Don't use global policy for this, as we want
4053	 * to use the same protection that was applied to the inbound packet.
4054	 * Thus we set IXAF_NO_IPSEC is it arrived in the clear to make
4055	 * it be sent in the clear.
4056	 */
4057	if (ira->ira_flags & IRAF_IPSEC_SECURE)
4058		ixa->ixa_flags |= IXAF_IPSEC_SECURE;
4059	else
4060		ixa->ixa_flags |= IXAF_NO_IPSEC;
4061
4062	return (B_TRUE);
4063}
4064
4065void
4066ipsec_out_release_refs(ip_xmit_attr_t *ixa)
4067{
4068	if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
4069		return;
4070
4071	if (ixa->ixa_ipsec_ah_sa != NULL) {
4072		IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
4073		ixa->ixa_ipsec_ah_sa = NULL;
4074	}
4075	if (ixa->ixa_ipsec_esp_sa != NULL) {
4076		IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
4077		ixa->ixa_ipsec_esp_sa = NULL;
4078	}
4079	if (ixa->ixa_ipsec_policy != NULL) {
4080		IPPOL_REFRELE(ixa->ixa_ipsec_policy);
4081		ixa->ixa_ipsec_policy = NULL;
4082	}
4083	if (ixa->ixa_ipsec_action != NULL) {
4084		IPACT_REFRELE(ixa->ixa_ipsec_action);
4085		ixa->ixa_ipsec_action = NULL;
4086	}
4087	if (ixa->ixa_ipsec_latch) {
4088		IPLATCH_REFRELE(ixa->ixa_ipsec_latch);
4089		ixa->ixa_ipsec_latch = NULL;
4090	}
4091	/* Clear the soft references to the SAs */
4092	ixa->ixa_ipsec_ref[0].ipsr_sa = NULL;
4093	ixa->ixa_ipsec_ref[0].ipsr_bucket = NULL;
4094	ixa->ixa_ipsec_ref[0].ipsr_gen = 0;
4095	ixa->ixa_ipsec_ref[1].ipsr_sa = NULL;
4096	ixa->ixa_ipsec_ref[1].ipsr_bucket = NULL;
4097	ixa->ixa_ipsec_ref[1].ipsr_gen = 0;
4098	ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4099}
4100
4101void
4102ipsec_in_release_refs(ip_recv_attr_t *ira)
4103{
4104	if (!(ira->ira_flags & IRAF_IPSEC_SECURE))
4105		return;
4106
4107	if (ira->ira_ipsec_ah_sa != NULL) {
4108		IPSA_REFRELE(ira->ira_ipsec_ah_sa);
4109		ira->ira_ipsec_ah_sa = NULL;
4110	}
4111	if (ira->ira_ipsec_esp_sa != NULL) {
4112		IPSA_REFRELE(ira->ira_ipsec_esp_sa);
4113		ira->ira_ipsec_esp_sa = NULL;
4114	}
4115	ira->ira_flags &= ~IRAF_IPSEC_SECURE;
4116}
4117
4118/*
4119 * This is called from ire_send_local when a packet
4120 * is looped back. We setup the ip_recv_attr_t "borrowing" the references
4121 * held by the callers.
4122 * Note that we don't do any IPsec but we carry the actions and IPSEC flags
4123 * across so that the fanout policy checks see that IPsec was applied.
4124 *
4125 * The caller should do ipsec_in_release_refs() on the ira by calling
4126 * ira_cleanup().
4127 */
4128void
4129ipsec_out_to_in(ip_xmit_attr_t *ixa, ill_t *ill, ip_recv_attr_t *ira)
4130{
4131	ipsec_policy_t *pol;
4132	ipsec_action_t *act;
4133
4134	/* Non-IPsec operations */
4135	ira->ira_free_flags = 0;
4136	ira->ira_zoneid = ixa->ixa_zoneid;
4137	ira->ira_cred = ixa->ixa_cred;
4138	ira->ira_cpid = ixa->ixa_cpid;
4139	ira->ira_tsl = ixa->ixa_tsl;
4140	ira->ira_ill = ira->ira_rill = ill;
4141	ira->ira_flags = ixa->ixa_flags & IAF_MASK;
4142	ira->ira_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
4143	ira->ira_pktlen = ixa->ixa_pktlen;
4144	ira->ira_ip_hdr_length = ixa->ixa_ip_hdr_length;
4145	ira->ira_protocol = ixa->ixa_protocol;
4146	ira->ira_mhip = NULL;
4147
4148	ira->ira_flags |= IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
4149
4150	ira->ira_sqp = ixa->ixa_sqp;
4151	ira->ira_ring = NULL;
4152
4153	ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
4154	ira->ira_rifindex = ira->ira_ruifindex;
4155
4156	if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
4157		return;
4158
4159	ira->ira_flags |= IRAF_IPSEC_SECURE;
4160
4161	ira->ira_ipsec_ah_sa = NULL;
4162	ira->ira_ipsec_esp_sa = NULL;
4163
4164	act = ixa->ixa_ipsec_action;
4165	if (act == NULL) {
4166		pol = ixa->ixa_ipsec_policy;
4167		if (pol != NULL) {
4168			act = pol->ipsp_act;
4169			IPACT_REFHOLD(act);
4170		}
4171	}
4172	ixa->ixa_ipsec_action = NULL;
4173	ira->ira_ipsec_action = act;
4174}
4175
4176/*
4177 * Consults global policy and per-socket policy to see whether this datagram
4178 * should go out secure. If so it updates the ip_xmit_attr_t
4179 * Should not be used when connecting, since then we want to latch the policy.
4180 *
4181 * If connp is NULL we just look at the global policy.
4182 *
4183 * Returns NULL if the packet was dropped, in which case the MIB has
4184 * been incremented and ip_drop_packet done.
4185 */
4186mblk_t *
4187ip_output_attach_policy(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
4188    const conn_t *connp, ip_xmit_attr_t *ixa)
4189{
4190	ipsec_selector_t sel;
4191	boolean_t	policy_present;
4192	ip_stack_t	*ipst = ixa->ixa_ipst;
4193	netstack_t	*ns = ipst->ips_netstack;
4194	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4195	ipsec_policy_t	*p;
4196
4197	ixa->ixa_ipsec_policy_gen = ipss->ipsec_system_policy.iph_gen;
4198	ASSERT((ipha != NULL && ip6h == NULL) ||
4199	    (ip6h != NULL && ipha == NULL));
4200
4201	if (ipha != NULL)
4202		policy_present = ipss->ipsec_outbound_v4_policy_present;
4203	else
4204		policy_present = ipss->ipsec_outbound_v6_policy_present;
4205
4206	if (!policy_present && (connp == NULL || connp->conn_policy == NULL))
4207		return (mp);
4208
4209	bzero((void*)&sel, sizeof (sel));
4210
4211	if (ipha != NULL) {
4212		sel.ips_local_addr_v4 = ipha->ipha_src;
4213		sel.ips_remote_addr_v4 = ip_get_dst(ipha);
4214		sel.ips_isv4 = B_TRUE;
4215	} else {
4216		sel.ips_isv4 = B_FALSE;
4217		sel.ips_local_addr_v6 = ip6h->ip6_src;
4218		sel.ips_remote_addr_v6 = ip_get_dst_v6(ip6h, mp, NULL);
4219	}
4220	sel.ips_protocol = ixa->ixa_protocol;
4221
4222	if (!ipsec_init_outbound_ports(&sel, mp, ipha, ip6h, 0, ipss)) {
4223		if (ipha != NULL) {
4224			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
4225		} else {
4226			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
4227		}
4228		/* Note: mp already consumed and ip_drop_packet done */
4229		return (NULL);
4230	}
4231
4232	ASSERT(ixa->ixa_ipsec_policy == NULL);
4233	p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns);
4234	ixa->ixa_ipsec_policy = p;
4235	if (p != NULL) {
4236		ixa->ixa_flags |= IXAF_IPSEC_SECURE;
4237		if (connp == NULL || connp->conn_policy == NULL)
4238			ixa->ixa_flags |= IXAF_IPSEC_GLOBAL_POLICY;
4239	} else {
4240		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4241	}
4242
4243	/*
4244	 * Copy the right port information.
4245	 */
4246	ixa->ixa_ipsec_src_port = sel.ips_local_port;
4247	ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
4248	ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
4249	ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
4250	ixa->ixa_ipsec_proto = sel.ips_protocol;
4251	return (mp);
4252}
4253
4254/*
4255 * When appropriate, this function caches inbound and outbound policy
4256 * for this connection. The outbound policy is stored in conn_ixa.
4257 * Note that it can not be used for SCTP since conn_faddr isn't set for SCTP.
4258 *
4259 * XXX need to work out more details about per-interface policy and
4260 * caching here!
4261 *
4262 * XXX may want to split inbound and outbound caching for ill..
4263 */
4264int
4265ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4)
4266{
4267	boolean_t global_policy_present;
4268	netstack_t	*ns = connp->conn_netstack;
4269	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4270
4271	connp->conn_ixa->ixa_ipsec_policy_gen =
4272	    ipss->ipsec_system_policy.iph_gen;
4273	/*
4274	 * There is no policy latching for ICMP sockets because we can't
4275	 * decide on which policy to use until we see the packet and get
4276	 * type/code selectors.
4277	 */
4278	if (connp->conn_proto == IPPROTO_ICMP ||
4279	    connp->conn_proto == IPPROTO_ICMPV6) {
4280		connp->conn_in_enforce_policy =
4281		    connp->conn_out_enforce_policy = B_TRUE;
4282		if (connp->conn_latch != NULL) {
4283			IPLATCH_REFRELE(connp->conn_latch);
4284			connp->conn_latch = NULL;
4285		}
4286		if (connp->conn_latch_in_policy != NULL) {
4287			IPPOL_REFRELE(connp->conn_latch_in_policy);
4288			connp->conn_latch_in_policy = NULL;
4289		}
4290		if (connp->conn_latch_in_action != NULL) {
4291			IPACT_REFRELE(connp->conn_latch_in_action);
4292			connp->conn_latch_in_action = NULL;
4293		}
4294		if (connp->conn_ixa->ixa_ipsec_policy != NULL) {
4295			IPPOL_REFRELE(connp->conn_ixa->ixa_ipsec_policy);
4296			connp->conn_ixa->ixa_ipsec_policy = NULL;
4297		}
4298		if (connp->conn_ixa->ixa_ipsec_action != NULL) {
4299			IPACT_REFRELE(connp->conn_ixa->ixa_ipsec_action);
4300			connp->conn_ixa->ixa_ipsec_action = NULL;
4301		}
4302		connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4303		return (0);
4304	}
4305
4306	global_policy_present = isv4 ?
4307	    (ipss->ipsec_outbound_v4_policy_present ||
4308	    ipss->ipsec_inbound_v4_policy_present) :
4309	    (ipss->ipsec_outbound_v6_policy_present ||
4310	    ipss->ipsec_inbound_v6_policy_present);
4311
4312	if ((connp->conn_policy != NULL) || global_policy_present) {
4313		ipsec_selector_t sel;
4314		ipsec_policy_t	*p;
4315
4316		if (connp->conn_latch == NULL &&
4317		    (connp->conn_latch = iplatch_create()) == NULL) {
4318			return (ENOMEM);
4319		}
4320
4321		bzero((void*)&sel, sizeof (sel));
4322
4323		sel.ips_protocol = connp->conn_proto;
4324		sel.ips_local_port = connp->conn_lport;
4325		sel.ips_remote_port = connp->conn_fport;
4326		sel.ips_is_icmp_inv_acq = 0;
4327		sel.ips_isv4 = isv4;
4328		if (isv4) {
4329			sel.ips_local_addr_v4 = connp->conn_laddr_v4;
4330			sel.ips_remote_addr_v4 = connp->conn_faddr_v4;
4331		} else {
4332			sel.ips_local_addr_v6 = connp->conn_laddr_v6;
4333			sel.ips_remote_addr_v6 = connp->conn_faddr_v6;
4334		}
4335
4336		p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, &sel, ns);
4337		if (connp->conn_latch_in_policy != NULL)
4338			IPPOL_REFRELE(connp->conn_latch_in_policy);
4339		connp->conn_latch_in_policy = p;
4340		connp->conn_in_enforce_policy = (p != NULL);
4341
4342		p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns);
4343		if (connp->conn_ixa->ixa_ipsec_policy != NULL)
4344			IPPOL_REFRELE(connp->conn_ixa->ixa_ipsec_policy);
4345		connp->conn_ixa->ixa_ipsec_policy = p;
4346		connp->conn_out_enforce_policy = (p != NULL);
4347		if (p != NULL) {
4348			connp->conn_ixa->ixa_flags |= IXAF_IPSEC_SECURE;
4349			if (connp->conn_policy == NULL) {
4350				connp->conn_ixa->ixa_flags |=
4351				    IXAF_IPSEC_GLOBAL_POLICY;
4352			}
4353		} else {
4354			connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4355		}
4356		/* Clear the latched actions too, in case we're recaching. */
4357		if (connp->conn_ixa->ixa_ipsec_action != NULL) {
4358			IPACT_REFRELE(connp->conn_ixa->ixa_ipsec_action);
4359			connp->conn_ixa->ixa_ipsec_action = NULL;
4360		}
4361		if (connp->conn_latch_in_action != NULL) {
4362			IPACT_REFRELE(connp->conn_latch_in_action);
4363			connp->conn_latch_in_action = NULL;
4364		}
4365		connp->conn_ixa->ixa_ipsec_src_port = sel.ips_local_port;
4366		connp->conn_ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
4367		connp->conn_ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
4368		connp->conn_ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
4369		connp->conn_ixa->ixa_ipsec_proto = sel.ips_protocol;
4370	} else {
4371		connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4372	}
4373
4374	/*
4375	 * We may or may not have policy for this endpoint.  We still set
4376	 * conn_policy_cached so that inbound datagrams don't have to look
4377	 * at global policy as policy is considered latched for these
4378	 * endpoints.  We should not set conn_policy_cached until the conn
4379	 * reflects the actual policy. If we *set* this before inheriting
4380	 * the policy there is a window where the check
4381	 * CONN_INBOUND_POLICY_PRESENT, will neither check with the policy
4382	 * on the conn (because we have not yet copied the policy on to
4383	 * conn and hence not set conn_in_enforce_policy) nor with the
4384	 * global policy (because conn_policy_cached is already set).
4385	 */
4386	connp->conn_policy_cached = B_TRUE;
4387	return (0);
4388}
4389
4390/*
4391 * When appropriate, this function caches outbound policy for faddr/fport.
4392 * It is used when we are not connected i.e., when we can not latch the
4393 * policy.
4394 */
4395void
4396ipsec_cache_outbound_policy(const conn_t *connp, const in6_addr_t *v6src,
4397    const in6_addr_t *v6dst, in_port_t dstport, ip_xmit_attr_t *ixa)
4398{
4399	boolean_t	isv4 = (ixa->ixa_flags & IXAF_IS_IPV4) != 0;
4400	boolean_t	global_policy_present;
4401	netstack_t	*ns = connp->conn_netstack;
4402	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4403
4404	ixa->ixa_ipsec_policy_gen = ipss->ipsec_system_policy.iph_gen;
4405
4406	/*
4407	 * There is no policy caching for ICMP sockets because we can't
4408	 * decide on which policy to use until we see the packet and get
4409	 * type/code selectors.
4410	 */
4411	if (connp->conn_proto == IPPROTO_ICMP ||
4412	    connp->conn_proto == IPPROTO_ICMPV6) {
4413		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4414		if (ixa->ixa_ipsec_policy != NULL) {
4415			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
4416			ixa->ixa_ipsec_policy = NULL;
4417		}
4418		if (ixa->ixa_ipsec_action != NULL) {
4419			IPACT_REFRELE(ixa->ixa_ipsec_action);
4420			ixa->ixa_ipsec_action = NULL;
4421		}
4422		return;
4423	}
4424
4425	global_policy_present = isv4 ?
4426	    (ipss->ipsec_outbound_v4_policy_present ||
4427	    ipss->ipsec_inbound_v4_policy_present) :
4428	    (ipss->ipsec_outbound_v6_policy_present ||
4429	    ipss->ipsec_inbound_v6_policy_present);
4430
4431	if ((connp->conn_policy != NULL) || global_policy_present) {
4432		ipsec_selector_t sel;
4433		ipsec_policy_t	*p;
4434
4435		bzero((void*)&sel, sizeof (sel));
4436
4437		sel.ips_protocol = connp->conn_proto;
4438		sel.ips_local_port = connp->conn_lport;
4439		sel.ips_remote_port = dstport;
4440		sel.ips_is_icmp_inv_acq = 0;
4441		sel.ips_isv4 = isv4;
4442		if (isv4) {
4443			IN6_V4MAPPED_TO_IPADDR(v6src, sel.ips_local_addr_v4);
4444			IN6_V4MAPPED_TO_IPADDR(v6dst, sel.ips_remote_addr_v4);
4445		} else {
4446			sel.ips_local_addr_v6 = *v6src;
4447			sel.ips_remote_addr_v6 = *v6dst;
4448		}
4449
4450		p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns);
4451		if (ixa->ixa_ipsec_policy != NULL)
4452			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
4453		ixa->ixa_ipsec_policy = p;
4454		if (p != NULL) {
4455			ixa->ixa_flags |= IXAF_IPSEC_SECURE;
4456			if (connp->conn_policy == NULL)
4457				ixa->ixa_flags |= IXAF_IPSEC_GLOBAL_POLICY;
4458		} else {
4459			ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4460		}
4461		/* Clear the latched actions too, in case we're recaching. */
4462		if (ixa->ixa_ipsec_action != NULL) {
4463			IPACT_REFRELE(ixa->ixa_ipsec_action);
4464			ixa->ixa_ipsec_action = NULL;
4465		}
4466
4467		ixa->ixa_ipsec_src_port = sel.ips_local_port;
4468		ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
4469		ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
4470		ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
4471		ixa->ixa_ipsec_proto = sel.ips_protocol;
4472	} else {
4473		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4474		if (ixa->ixa_ipsec_policy != NULL) {
4475			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
4476			ixa->ixa_ipsec_policy = NULL;
4477		}
4478		if (ixa->ixa_ipsec_action != NULL) {
4479			IPACT_REFRELE(ixa->ixa_ipsec_action);
4480			ixa->ixa_ipsec_action = NULL;
4481		}
4482	}
4483}
4484
4485/*
4486 * Returns B_FALSE if the policy has gone stale.
4487 */
4488boolean_t
4489ipsec_outbound_policy_current(ip_xmit_attr_t *ixa)
4490{
4491	ipsec_stack_t	*ipss = ixa->ixa_ipst->ips_netstack->netstack_ipsec;
4492
4493	if (!(ixa->ixa_flags & IXAF_IPSEC_GLOBAL_POLICY))
4494		return (B_TRUE);
4495
4496	return (ixa->ixa_ipsec_policy_gen == ipss->ipsec_system_policy.iph_gen);
4497}
4498
4499void
4500iplatch_free(ipsec_latch_t *ipl)
4501{
4502	if (ipl->ipl_local_cid != NULL)
4503		IPSID_REFRELE(ipl->ipl_local_cid);
4504	if (ipl->ipl_remote_cid != NULL)
4505		IPSID_REFRELE(ipl->ipl_remote_cid);
4506	mutex_destroy(&ipl->ipl_lock);
4507	kmem_free(ipl, sizeof (*ipl));
4508}
4509
4510ipsec_latch_t *
4511iplatch_create()
4512{
4513	ipsec_latch_t *ipl = kmem_alloc(sizeof (*ipl), KM_NOSLEEP);
4514	if (ipl == NULL)
4515		return (ipl);
4516	bzero(ipl, sizeof (*ipl));
4517	mutex_init(&ipl->ipl_lock, NULL, MUTEX_DEFAULT, NULL);
4518	ipl->ipl_refcnt = 1;
4519	return (ipl);
4520}
4521
4522/*
4523 * Hash function for ID hash table.
4524 */
4525static uint32_t
4526ipsid_hash(int idtype, char *idstring)
4527{
4528	uint32_t hval = idtype;
4529	unsigned char c;
4530
4531	while ((c = *idstring++) != 0) {
4532		hval = (hval << 4) | (hval >> 28);
4533		hval ^= c;
4534	}
4535	hval = hval ^ (hval >> 16);
4536	return (hval & (IPSID_HASHSIZE-1));
4537}
4538
4539/*
4540 * Look up identity string in hash table.  Return identity object
4541 * corresponding to the name -- either preexisting, or newly allocated.
4542 *
4543 * Return NULL if we need to allocate a new one and can't get memory.
4544 */
4545ipsid_t *
4546ipsid_lookup(int idtype, char *idstring, netstack_t *ns)
4547{
4548	ipsid_t *retval;
4549	char *nstr;
4550	int idlen = strlen(idstring) + 1;
4551	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4552	ipsif_t *bucket;
4553
4554	bucket = &ipss->ipsec_ipsid_buckets[ipsid_hash(idtype, idstring)];
4555
4556	mutex_enter(&bucket->ipsif_lock);
4557
4558	for (retval = bucket->ipsif_head; retval != NULL;
4559	    retval = retval->ipsid_next) {
4560		if (idtype != retval->ipsid_type)
4561			continue;
4562		if (bcmp(idstring, retval->ipsid_cid, idlen) != 0)
4563			continue;
4564
4565		IPSID_REFHOLD(retval);
4566		mutex_exit(&bucket->ipsif_lock);
4567		return (retval);
4568	}
4569
4570	retval = kmem_alloc(sizeof (*retval), KM_NOSLEEP);
4571	if (!retval) {
4572		mutex_exit(&bucket->ipsif_lock);
4573		return (NULL);
4574	}
4575
4576	nstr = kmem_alloc(idlen, KM_NOSLEEP);
4577	if (!nstr) {
4578		mutex_exit(&bucket->ipsif_lock);
4579		kmem_free(retval, sizeof (*retval));
4580		return (NULL);
4581	}
4582
4583	retval->ipsid_refcnt = 1;
4584	retval->ipsid_next = bucket->ipsif_head;
4585	if (retval->ipsid_next != NULL)
4586		retval->ipsid_next->ipsid_ptpn = &retval->ipsid_next;
4587	retval->ipsid_ptpn = &bucket->ipsif_head;
4588	retval->ipsid_type = idtype;
4589	retval->ipsid_cid = nstr;
4590	bucket->ipsif_head = retval;
4591	bcopy(idstring, nstr, idlen);
4592	mutex_exit(&bucket->ipsif_lock);
4593
4594	return (retval);
4595}
4596
4597/*
4598 * Garbage collect the identity hash table.
4599 */
4600void
4601ipsid_gc(netstack_t *ns)
4602{
4603	int i, len;
4604	ipsid_t *id, *nid;
4605	ipsif_t *bucket;
4606	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4607
4608	for (i = 0; i < IPSID_HASHSIZE; i++) {
4609		bucket = &ipss->ipsec_ipsid_buckets[i];
4610		mutex_enter(&bucket->ipsif_lock);
4611		for (id = bucket->ipsif_head; id != NULL; id = nid) {
4612			nid = id->ipsid_next;
4613			if (id->ipsid_refcnt == 0) {
4614				*id->ipsid_ptpn = nid;
4615				if (nid != NULL)
4616					nid->ipsid_ptpn = id->ipsid_ptpn;
4617				len = strlen(id->ipsid_cid) + 1;
4618				kmem_free(id->ipsid_cid, len);
4619				kmem_free(id, sizeof (*id));
4620			}
4621		}
4622		mutex_exit(&bucket->ipsif_lock);
4623	}
4624}
4625
4626/*
4627 * Return true if two identities are the same.
4628 */
4629boolean_t
4630ipsid_equal(ipsid_t *id1, ipsid_t *id2)
4631{
4632	if (id1 == id2)
4633		return (B_TRUE);
4634#ifdef DEBUG
4635	if ((id1 == NULL) || (id2 == NULL))
4636		return (B_FALSE);
4637	/*
4638	 * test that we're interning id's correctly..
4639	 */
4640	ASSERT((strcmp(id1->ipsid_cid, id2->ipsid_cid) != 0) ||
4641	    (id1->ipsid_type != id2->ipsid_type));
4642#endif
4643	return (B_FALSE);
4644}
4645
4646/*
4647 * Initialize identity table; called during module initialization.
4648 */
4649static void
4650ipsid_init(netstack_t *ns)
4651{
4652	ipsif_t *bucket;
4653	int i;
4654	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4655
4656	for (i = 0; i < IPSID_HASHSIZE; i++) {
4657		bucket = &ipss->ipsec_ipsid_buckets[i];
4658		mutex_init(&bucket->ipsif_lock, NULL, MUTEX_DEFAULT, NULL);
4659	}
4660}
4661
4662/*
4663 * Free identity table (preparatory to module unload)
4664 */
4665static void
4666ipsid_fini(netstack_t *ns)
4667{
4668	ipsif_t *bucket;
4669	int i;
4670	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4671
4672	for (i = 0; i < IPSID_HASHSIZE; i++) {
4673		bucket = &ipss->ipsec_ipsid_buckets[i];
4674		ASSERT(bucket->ipsif_head == NULL);
4675		mutex_destroy(&bucket->ipsif_lock);
4676	}
4677}
4678
4679/*
4680 * Update the minimum and maximum supported key sizes for the
4681 * specified algorithm. Must be called while holding the algorithms lock.
4682 */
4683void
4684ipsec_alg_fix_min_max(ipsec_alginfo_t *alg, ipsec_algtype_t alg_type,
4685    netstack_t *ns)
4686{
4687	size_t crypto_min = (size_t)-1, crypto_max = 0;
4688	size_t cur_crypto_min, cur_crypto_max;
4689	boolean_t is_valid;
4690	crypto_mechanism_info_t *mech_infos;
4691	uint_t nmech_infos;
4692	int crypto_rc, i;
4693	crypto_mech_usage_t mask;
4694	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4695
4696	ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
4697
4698	/*
4699	 * Compute the min, max, and default key sizes (in number of
4700	 * increments to the default key size in bits) as defined
4701	 * by the algorithm mappings. This range of key sizes is used
4702	 * for policy related operations. The effective key sizes
4703	 * supported by the framework could be more limited than
4704	 * those defined for an algorithm.
4705	 */
4706	alg->alg_default_bits = alg->alg_key_sizes[0];
4707	alg->alg_default = 0;
4708	if (alg->alg_increment != 0) {
4709		/* key sizes are defined by range & increment */
4710		alg->alg_minbits = alg->alg_key_sizes[1];
4711		alg->alg_maxbits = alg->alg_key_sizes[2];
4712	} else if (alg->alg_nkey_sizes == 0) {
4713		/* no specified key size for algorithm */
4714		alg->alg_minbits = alg->alg_maxbits = 0;
4715	} else {
4716		/* key sizes are defined by enumeration */
4717		alg->alg_minbits = (uint16_t)-1;
4718		alg->alg_maxbits = 0;
4719
4720		for (i = 0; i < alg->alg_nkey_sizes; i++) {
4721			if (alg->alg_key_sizes[i] < alg->alg_minbits)
4722				alg->alg_minbits = alg->alg_key_sizes[i];
4723			if (alg->alg_key_sizes[i] > alg->alg_maxbits)
4724				alg->alg_maxbits = alg->alg_key_sizes[i];
4725		}
4726	}
4727
4728	if (!(alg->alg_flags & ALG_FLAG_VALID))
4729		return;
4730
4731	/*
4732	 * Mechanisms do not apply to the NULL encryption
4733	 * algorithm, so simply return for this case.
4734	 */
4735	if (alg->alg_id == SADB_EALG_NULL)
4736		return;
4737
4738	/*
4739	 * Find the min and max key sizes supported by the cryptographic
4740	 * framework providers.
4741	 */
4742
4743	/* get the key sizes supported by the framework */
4744	crypto_rc = crypto_get_all_mech_info(alg->alg_mech_type,
4745	    &mech_infos, &nmech_infos, KM_SLEEP);
4746	if (crypto_rc != CRYPTO_SUCCESS || nmech_infos == 0) {
4747		alg->alg_flags &= ~ALG_FLAG_VALID;
4748		return;
4749	}
4750
4751	/* min and max key sizes supported by framework */
4752	for (i = 0, is_valid = B_FALSE; i < nmech_infos; i++) {
4753		int unit_bits;
4754
4755		/*
4756		 * Ignore entries that do not support the operations
4757		 * needed for the algorithm type.
4758		 */
4759		if (alg_type == IPSEC_ALG_AUTH) {
4760			mask = CRYPTO_MECH_USAGE_MAC;
4761		} else {
4762			mask = CRYPTO_MECH_USAGE_ENCRYPT |
4763			    CRYPTO_MECH_USAGE_DECRYPT;
4764		}
4765		if ((mech_infos[i].mi_usage & mask) != mask)
4766			continue;
4767
4768		unit_bits = (mech_infos[i].mi_keysize_unit ==
4769		    CRYPTO_KEYSIZE_UNIT_IN_BYTES)  ? 8 : 1;
4770		/* adjust min/max supported by framework */
4771		cur_crypto_min = mech_infos[i].mi_min_key_size * unit_bits;
4772		cur_crypto_max = mech_infos[i].mi_max_key_size * unit_bits;
4773
4774		if (cur_crypto_min < crypto_min)
4775			crypto_min = cur_crypto_min;
4776
4777		/*
4778		 * CRYPTO_EFFECTIVELY_INFINITE is a special value of
4779		 * the crypto framework which means "no upper limit".
4780		 */
4781		if (mech_infos[i].mi_max_key_size ==
4782		    CRYPTO_EFFECTIVELY_INFINITE) {
4783			crypto_max = (size_t)-1;
4784		} else if (cur_crypto_max > crypto_max) {
4785			crypto_max = cur_crypto_max;
4786		}
4787
4788		is_valid = B_TRUE;
4789	}
4790
4791	kmem_free(mech_infos, sizeof (crypto_mechanism_info_t) *
4792	    nmech_infos);
4793
4794	if (!is_valid) {
4795		/* no key sizes supported by framework */
4796		alg->alg_flags &= ~ALG_FLAG_VALID;
4797		return;
4798	}
4799
4800	/*
4801	 * Determine min and max key sizes from alg_key_sizes[].
4802	 * defined for the algorithm entry. Adjust key sizes based on
4803	 * those supported by the framework.
4804	 */
4805	alg->alg_ef_default_bits = alg->alg_key_sizes[0];
4806
4807	/*
4808	 * For backwards compatability, assume that the IV length
4809	 * is the same as the data length.
4810	 */
4811	alg->alg_ivlen = alg->alg_datalen;
4812
4813	/*
4814	 * Copy any algorithm parameters (if provided) into dedicated
4815	 * elements in the ipsec_alginfo_t structure.
4816	 * There may be a better place to put this code.
4817	 */
4818	for (i = 0; i < alg->alg_nparams; i++) {
4819		switch (i) {
4820		case 0:
4821			/* Initialisation Vector length (bytes) */
4822			alg->alg_ivlen =  alg->alg_params[0];
4823			break;
4824		case 1:
4825			/* Integrity Check Vector length (bytes) */
4826			alg->alg_icvlen = alg->alg_params[1];
4827			break;
4828		case 2:
4829			/* Salt length (bytes) */
4830			alg->alg_saltlen = (uint8_t)alg->alg_params[2];
4831			break;
4832		default:
4833			break;
4834		}
4835	}
4836
4837	/* Default if the IV length is not specified. */
4838	if (alg_type == IPSEC_ALG_ENCR && alg->alg_ivlen == 0)
4839		alg->alg_ivlen = alg->alg_datalen;
4840
4841	alg_flag_check(alg);
4842
4843	if (alg->alg_increment != 0) {
4844		/* supported key sizes are defined by range  & increment */
4845		crypto_min = ALGBITS_ROUND_UP(crypto_min, alg->alg_increment);
4846		crypto_max = ALGBITS_ROUND_DOWN(crypto_max, alg->alg_increment);
4847
4848		alg->alg_ef_minbits = MAX(alg->alg_minbits,
4849		    (uint16_t)crypto_min);
4850		alg->alg_ef_maxbits = MIN(alg->alg_maxbits,
4851		    (uint16_t)crypto_max);
4852
4853		/*
4854		 * If the sizes supported by the framework are outside
4855		 * the range of sizes defined by the algorithm mappings,
4856		 * the algorithm cannot be used. Check for this
4857		 * condition here.
4858		 */
4859		if (alg->alg_ef_minbits > alg->alg_ef_maxbits) {
4860			alg->alg_flags &= ~ALG_FLAG_VALID;
4861			return;
4862		}
4863		if (alg->alg_ef_default_bits < alg->alg_ef_minbits)
4864			alg->alg_ef_default_bits = alg->alg_ef_minbits;
4865		if (alg->alg_ef_default_bits > alg->alg_ef_maxbits)
4866			alg->alg_ef_default_bits = alg->alg_ef_maxbits;
4867	} else if (alg->alg_nkey_sizes == 0) {
4868		/* no specified key size for algorithm */
4869		alg->alg_ef_minbits = alg->alg_ef_maxbits = 0;
4870	} else {
4871		/* supported key sizes are defined by enumeration */
4872		alg->alg_ef_minbits = (uint16_t)-1;
4873		alg->alg_ef_maxbits = 0;
4874
4875		for (i = 0, is_valid = B_FALSE; i < alg->alg_nkey_sizes; i++) {
4876			/*
4877			 * Ignore the current key size if it is not in the
4878			 * range of sizes supported by the framework.
4879			 */
4880			if (alg->alg_key_sizes[i] < crypto_min ||
4881			    alg->alg_key_sizes[i] > crypto_max)
4882				continue;
4883			if (alg->alg_key_sizes[i] < alg->alg_ef_minbits)
4884				alg->alg_ef_minbits = alg->alg_key_sizes[i];
4885			if (alg->alg_key_sizes[i] > alg->alg_ef_maxbits)
4886				alg->alg_ef_maxbits = alg->alg_key_sizes[i];
4887			is_valid = B_TRUE;
4888		}
4889
4890		if (!is_valid) {
4891			alg->alg_flags &= ~ALG_FLAG_VALID;
4892			return;
4893		}
4894		alg->alg_ef_default = 0;
4895	}
4896}
4897
4898/*
4899 * Sanity check parameters provided by ipsecalgs(1m). Assume that
4900 * the algoritm is marked as valid, there is a check at the top
4901 * of this function. If any of the checks below fail, the algorithm
4902 * entry is invalid.
4903 */
4904void
4905alg_flag_check(ipsec_alginfo_t *alg)
4906{
4907	alg->alg_flags &= ~ALG_FLAG_VALID;
4908
4909	/*
4910	 * Can't have the algorithm marked as CCM and GCM.
4911	 * Check the ALG_FLAG_COMBINED and ALG_FLAG_COUNTERMODE
4912	 * flags are set for CCM & GCM.
4913	 */
4914	if ((alg->alg_flags & (ALG_FLAG_CCM|ALG_FLAG_GCM)) ==
4915	    (ALG_FLAG_CCM|ALG_FLAG_GCM))
4916		return;
4917	if (alg->alg_flags & (ALG_FLAG_CCM|ALG_FLAG_GCM)) {
4918		if (!(alg->alg_flags & ALG_FLAG_COUNTERMODE))
4919			return;
4920		if (!(alg->alg_flags & ALG_FLAG_COMBINED))
4921			return;
4922	}
4923
4924	/*
4925	 * For ALG_FLAG_COUNTERMODE, check the parameters
4926	 * fit in the ipsec_nonce_t structure.
4927	 */
4928	if (alg->alg_flags & ALG_FLAG_COUNTERMODE) {
4929		if (alg->alg_ivlen != sizeof (((ipsec_nonce_t *)NULL)->iv))
4930			return;
4931		if (alg->alg_saltlen > sizeof (((ipsec_nonce_t *)NULL)->salt))
4932			return;
4933	}
4934	if ((alg->alg_flags & ALG_FLAG_COMBINED) &&
4935	    (alg->alg_icvlen == 0))
4936		return;
4937
4938	/* all is well. */
4939	alg->alg_flags |= ALG_FLAG_VALID;
4940}
4941
4942/*
4943 * Free the memory used by the specified algorithm.
4944 */
4945void
4946ipsec_alg_free(ipsec_alginfo_t *alg)
4947{
4948	if (alg == NULL)
4949		return;
4950
4951	if (alg->alg_key_sizes != NULL) {
4952		kmem_free(alg->alg_key_sizes,
4953		    (alg->alg_nkey_sizes + 1) * sizeof (uint16_t));
4954		alg->alg_key_sizes = NULL;
4955	}
4956	if (alg->alg_block_sizes != NULL) {
4957		kmem_free(alg->alg_block_sizes,
4958		    (alg->alg_nblock_sizes + 1) * sizeof (uint16_t));
4959		alg->alg_block_sizes = NULL;
4960	}
4961	kmem_free(alg, sizeof (*alg));
4962}
4963
4964/*
4965 * Check the validity of the specified key size for an algorithm.
4966 * Returns B_TRUE if key size is valid, B_FALSE otherwise.
4967 */
4968boolean_t
4969ipsec_valid_key_size(uint16_t key_size, ipsec_alginfo_t *alg)
4970{
4971	if (key_size < alg->alg_ef_minbits || key_size > alg->alg_ef_maxbits)
4972		return (B_FALSE);
4973
4974	if (alg->alg_increment == 0 && alg->alg_nkey_sizes != 0) {
4975		/*
4976		 * If the key sizes are defined by enumeration, the new
4977		 * key size must be equal to one of the supported values.
4978		 */
4979		int i;
4980
4981		for (i = 0; i < alg->alg_nkey_sizes; i++)
4982			if (key_size == alg->alg_key_sizes[i])
4983				break;
4984		if (i == alg->alg_nkey_sizes)
4985			return (B_FALSE);
4986	}
4987
4988	return (B_TRUE);
4989}
4990
4991/*
4992 * Callback function invoked by the crypto framework when a provider
4993 * registers or unregisters. This callback updates the algorithms
4994 * tables when a crypto algorithm is no longer available or becomes
4995 * available, and triggers the freeing/creation of context templates
4996 * associated with existing SAs, if needed.
4997 *
4998 * Need to walk all stack instances since the callback is global
4999 * for all instances
5000 */
5001void
5002ipsec_prov_update_callback(uint32_t event, void *event_arg)
5003{
5004	netstack_handle_t nh;
5005	netstack_t *ns;
5006
5007	netstack_next_init(&nh);
5008	while ((ns = netstack_next(&nh)) != NULL) {
5009		ipsec_prov_update_callback_stack(event, event_arg, ns);
5010		netstack_rele(ns);
5011	}
5012	netstack_next_fini(&nh);
5013}
5014
5015static void
5016ipsec_prov_update_callback_stack(uint32_t event, void *event_arg,
5017    netstack_t *ns)
5018{
5019	crypto_notify_event_change_t *prov_change =
5020	    (crypto_notify_event_change_t *)event_arg;
5021	uint_t algidx, algid, algtype, mech_count, mech_idx;
5022	ipsec_alginfo_t *alg;
5023	ipsec_alginfo_t oalg;
5024	crypto_mech_name_t *mechs;
5025	boolean_t alg_changed = B_FALSE;
5026	ipsec_stack_t	*ipss = ns->netstack_ipsec;
5027
5028	/* ignore events for which we didn't register */
5029	if (event != CRYPTO_EVENT_MECHS_CHANGED) {
5030		ip1dbg(("ipsec_prov_update_callback: unexpected event 0x%x "
5031		    " received from crypto framework\n", event));
5032		return;
5033	}
5034
5035	mechs = crypto_get_mech_list(&mech_count, KM_SLEEP);
5036	if (mechs == NULL)
5037		return;
5038
5039	/*
5040	 * Walk the list of currently defined IPsec algorithm. Update
5041	 * the algorithm valid flag and trigger an update of the
5042	 * SAs that depend on that algorithm.
5043	 */
5044	mutex_enter(&ipss->ipsec_alg_lock);
5045	for (algtype = 0; algtype < IPSEC_NALGTYPES; algtype++) {
5046		for (algidx = 0; algidx < ipss->ipsec_nalgs[algtype];
5047		    algidx++) {
5048
5049			algid = ipss->ipsec_sortlist[algtype][algidx];
5050			alg = ipss->ipsec_alglists[algtype][algid];
5051			ASSERT(alg != NULL);
5052
5053			/*
5054			 * Skip the algorithms which do not map to the
5055			 * crypto framework provider being added or removed.
5056			 */
5057			if (strncmp(alg->alg_mech_name,
5058			    prov_change->ec_mech_name,
5059			    CRYPTO_MAX_MECH_NAME) != 0)
5060				continue;
5061
5062			/*
5063			 * Determine if the mechanism is valid. If it
5064			 * is not, mark the algorithm as being invalid. If
5065			 * it is, mark the algorithm as being valid.
5066			 */
5067			for (mech_idx = 0; mech_idx < mech_count; mech_idx++)
5068				if (strncmp(alg->alg_mech_name,
5069				    mechs[mech_idx], CRYPTO_MAX_MECH_NAME) == 0)
5070					break;
5071			if (mech_idx == mech_count &&
5072			    alg->alg_flags & ALG_FLAG_VALID) {
5073				alg->alg_flags &= ~ALG_FLAG_VALID;
5074				alg_changed = B_TRUE;
5075			} else if (mech_idx < mech_count &&
5076			    !(alg->alg_flags & ALG_FLAG_VALID)) {
5077				alg->alg_flags |= ALG_FLAG_VALID;
5078				alg_changed = B_TRUE;
5079			}
5080
5081			/*
5082			 * Update the supported key sizes, regardless
5083			 * of whether a crypto provider was added or
5084			 * removed.
5085			 */
5086			oalg = *alg;
5087			ipsec_alg_fix_min_max(alg, algtype, ns);
5088			if (!alg_changed &&
5089			    alg->alg_ef_minbits != oalg.alg_ef_minbits ||
5090			    alg->alg_ef_maxbits != oalg.alg_ef_maxbits ||
5091			    alg->alg_ef_default != oalg.alg_ef_default ||
5092			    alg->alg_ef_default_bits !=
5093			    oalg.alg_ef_default_bits)
5094				alg_changed = B_TRUE;
5095
5096			/*
5097			 * Update the affected SAs if a software provider is
5098			 * being added or removed.
5099			 */
5100			if (prov_change->ec_provider_type ==
5101			    CRYPTO_SW_PROVIDER)
5102				sadb_alg_update(algtype, alg->alg_id,
5103				    prov_change->ec_change ==
5104				    CRYPTO_MECH_ADDED, ns);
5105		}
5106	}
5107	mutex_exit(&ipss->ipsec_alg_lock);
5108	crypto_free_mech_list(mechs, mech_count);
5109
5110	if (alg_changed) {
5111		/*
5112		 * An algorithm has changed, i.e. it became valid or
5113		 * invalid, or its support key sizes have changed.
5114		 * Notify ipsecah and ipsecesp of this change so
5115		 * that they can send a SADB_REGISTER to their consumers.
5116		 */
5117		ipsecah_algs_changed(ns);
5118		ipsecesp_algs_changed(ns);
5119	}
5120}
5121
5122/*
5123 * Registers with the crypto framework to be notified of crypto
5124 * providers changes. Used to update the algorithm tables and
5125 * to free or create context templates if needed. Invoked after IPsec
5126 * is loaded successfully.
5127 *
5128 * This is called separately for each IP instance, so we ensure we only
5129 * register once.
5130 */
5131void
5132ipsec_register_prov_update(void)
5133{
5134	if (prov_update_handle != NULL)
5135		return;
5136
5137	prov_update_handle = crypto_notify_events(
5138	    ipsec_prov_update_callback, CRYPTO_EVENT_MECHS_CHANGED);
5139}
5140
5141/*
5142 * Unregisters from the framework to be notified of crypto providers
5143 * changes. Called from ipsec_policy_g_destroy().
5144 */
5145static void
5146ipsec_unregister_prov_update(void)
5147{
5148	if (prov_update_handle != NULL)
5149		crypto_unnotify_events(prov_update_handle);
5150}
5151
5152/*
5153 * Tunnel-mode support routines.
5154 */
5155
/*
 * Apply a tunnel's IPsec policy to an outbound packet.
 *
 * A selector is built from the inner header (plus ports, when the tunnel
 * has per-port policy) and looked up in the tunnel's active policy head.
 * On a match the policy is recorded in the ip_xmit_attr_t, which is also
 * filled in with the transport-mode or tunnel-mode parameters, and the
 * mblk (chain) to transmit is returned.
 *
 * Returns NULL when there is nothing for the caller to send: the packet
 * was dropped (no memory, malformed, or no matching policy entry), or
 * ipsec_fragcache_add() did not hand a packet back.  Note that a policy
 * miss on the tunnel's policy head DROPS the packet here; it is not passed
 * on for a global-policy check.
 *
 * The caller must pass exactly one of outer_ipv4/outer_ipv6 and exactly one
 * of inner_ipv4/inner_ipv6, and iptun->iptun_itp must be non-NULL with
 * ITPF_P_ACTIVE set (all of these are ASSERT()ed).
 *
 * Remember -> we can be forwarding packets.  Keep that in mind w.r.t.
 * inner-packet contents.
 */
mblk_t *
ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
    ip6_t *inner_ipv6, ipha_t *outer_ipv4, ip6_t *outer_ipv6, int outer_hdr_len,
    ip_xmit_attr_t *ixa)
{
	ipsec_policy_head_t *polhead;
	ipsec_selector_t sel;
	mblk_t *nmp;
	boolean_t is_fragment;
	ipsec_policy_t *pol;
	ipsec_tun_pol_t *itp = iptun->iptun_itp;
	netstack_t *ns = iptun->iptun_ns;
	ipsec_stack_t *ipss = ns->netstack_ipsec;

	ASSERT(outer_ipv6 != NULL && outer_ipv4 == NULL ||
	    outer_ipv4 != NULL && outer_ipv6 == NULL);
	/* We take care of inners in a bit. */

	/* Are the IPsec fields initialized at all? */
	if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) {
		ASSERT(ixa->ixa_ipsec_policy == NULL);
		ASSERT(ixa->ixa_ipsec_latch == NULL);
		ASSERT(ixa->ixa_ipsec_action == NULL);
		ASSERT(ixa->ixa_ipsec_ah_sa == NULL);
		ASSERT(ixa->ixa_ipsec_esp_sa == NULL);
	}

	ASSERT(itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE));
	polhead = itp->itp_policy;

	/*
	 * Start with an address-only selector taken from the inner header.
	 * For an IPv6 inner header the protocol is left zeroed here; it is
	 * only filled in on the per-port fragment path below.
	 */
	bzero(&sel, sizeof (sel));
	if (inner_ipv4 != NULL) {
		ASSERT(inner_ipv6 == NULL);
		sel.ips_isv4 = B_TRUE;
		sel.ips_local_addr_v4 = inner_ipv4->ipha_src;
		sel.ips_remote_addr_v4 = inner_ipv4->ipha_dst;
		sel.ips_protocol = (uint8_t)inner_ipv4->ipha_protocol;
	} else {
		ASSERT(inner_ipv6 != NULL);
		sel.ips_isv4 = B_FALSE;
		sel.ips_local_addr_v6 = inner_ipv6->ip6_src;
		/*
		 * We don't care about routing-header dests in the
		 * forwarding/tunnel path, so just grab ip6_dst.
		 */
		sel.ips_remote_addr_v6 = inner_ipv6->ip6_dst;
	}

	if (itp->itp_flags & ITPF_P_PER_PORT_SECURITY) {
		/*
		 * Caller can prepend the outer header, which means
		 * inner_ipv[46] may be stuck in the middle.  Pullup the whole
		 * mess now if need-be, for easier processing later.  Don't
		 * forget to rewire the outer header too.
		 */
		if (mp->b_cont != NULL) {
			nmp = msgpullup(mp, -1);
			if (nmp == NULL) {
				ip_drop_packet(mp, B_FALSE, NULL,
				    DROPPER(ipss, ipds_spd_nomem),
				    &ipss->ipsec_spd_dropper);
				return (NULL);
			}
			freemsg(mp);
			mp = nmp;
			if (outer_ipv4 != NULL)
				outer_ipv4 = (ipha_t *)mp->b_rptr;
			else
				outer_ipv6 = (ip6_t *)mp->b_rptr;
			if (inner_ipv4 != NULL) {
				inner_ipv4 =
				    (ipha_t *)(mp->b_rptr + outer_hdr_len);
			} else {
				inner_ipv6 =
				    (ip6_t *)(mp->b_rptr + outer_hdr_len);
			}
		}
		/* Both branches set is_fragment before it is read below. */
		if (inner_ipv4 != NULL) {
			is_fragment = IS_V4_FRAGMENT(
			    inner_ipv4->ipha_fragment_offset_and_flags);
		} else {
			sel.ips_remote_addr_v6 = ip_get_dst_v6(inner_ipv6, mp,
			    &is_fragment);
		}

		if (is_fragment) {
			ipha_t *oiph;
			ipha_t *iph = NULL;
			ip6_t *ip6h = NULL;
			int hdr_len;
			uint16_t ip6_hdr_length;
			uint8_t v6_proto;
			uint8_t *v6_proto_p;

			/*
			 * We have a fragment we need to track!
			 */
			mp = ipsec_fragcache_add(&itp->itp_fragcache, NULL, mp,
			    outer_hdr_len, ipss);
			if (mp == NULL)
				return (NULL);
			ASSERT(mp->b_cont == NULL);

			/*
			 * If we get here, we have a full fragment chain
			 */

			/*
			 * mp is no longer the mblk we were called with, so
			 * recompute the outer header length and the inner
			 * header pointer from the mblk we got back.
			 */
			oiph = (ipha_t *)mp->b_rptr;
			if (IPH_HDR_VERSION(oiph) == IPV4_VERSION) {
				hdr_len = ((outer_hdr_len != 0) ?
				    IPH_HDR_LENGTH(oiph) : 0);
				iph = (ipha_t *)(mp->b_rptr + hdr_len);
			} else {
				ASSERT(IPH_HDR_VERSION(oiph) == IPV6_VERSION);
				ip6h = (ip6_t *)mp->b_rptr;
				if (!ip_hdr_length_nexthdr_v6(mp, ip6h,
				    &ip6_hdr_length, &v6_proto_p)) {
					ip_drop_packet_chain(mp, B_FALSE, NULL,
					    DROPPER(ipss,
					    ipds_spd_malformed_packet),
					    &ipss->ipsec_spd_dropper);
					return (NULL);
				}
				hdr_len = ip6_hdr_length;
			}
			outer_hdr_len = hdr_len;

			/* Rebuild the selector from the new inner header. */
			if (sel.ips_isv4) {
				if (iph == NULL) {
					/* Was v6 outer */
					iph = (ipha_t *)(mp->b_rptr + hdr_len);
				}
				inner_ipv4 = iph;
				sel.ips_local_addr_v4 = inner_ipv4->ipha_src;
				sel.ips_remote_addr_v4 = inner_ipv4->ipha_dst;
				sel.ips_protocol =
				    (uint8_t)inner_ipv4->ipha_protocol;
			} else {
				inner_ipv6 = (ip6_t *)(mp->b_rptr +
				    hdr_len);
				sel.ips_local_addr_v6 = inner_ipv6->ip6_src;
				sel.ips_remote_addr_v6 = inner_ipv6->ip6_dst;
				if (!ip_hdr_length_nexthdr_v6(mp,
				    inner_ipv6, &ip6_hdr_length, &v6_proto_p)) {
					ip_drop_packet_chain(mp, B_FALSE, NULL,
					    DROPPER(ipss,
					    ipds_spd_malformed_frag),
					    &ipss->ipsec_spd_dropper);
					return (NULL);
				}
				v6_proto = *v6_proto_p;
				sel.ips_protocol = v6_proto;
#ifdef FRAGCACHE_DEBUG
				cmn_err(CE_WARN, "v6_sel.ips_protocol = %d\n",
				    sel.ips_protocol);
#endif
			}
			/* Ports are extracted below */
		}

		/* Get ports... */
		if (!ipsec_init_outbound_ports(&sel, mp,
		    inner_ipv4, inner_ipv6, outer_hdr_len, ipss)) {
			/* callee did ip_drop_packet_chain() on mp. */
			return (NULL);
		}
#ifdef FRAGCACHE_DEBUG
		if (inner_ipv4 != NULL)
			cmn_err(CE_WARN,
			    "(v4) sel.ips_protocol = %d, "
			    "sel.ips_local_port = %d, "
			    "sel.ips_remote_port = %d\n",
			    sel.ips_protocol, ntohs(sel.ips_local_port),
			    ntohs(sel.ips_remote_port));
		if (inner_ipv6 != NULL)
			cmn_err(CE_WARN,
			    "(v6) sel.ips_protocol = %d, "
			    "sel.ips_local_port = %d, "
			    "sel.ips_remote_port = %d\n",
			    sel.ips_protocol, ntohs(sel.ips_local_port),
			    ntohs(sel.ips_remote_port));
#endif
		/* Success so far! */
	}
	/* The policy-head lock is only needed for the lookup itself. */
	rw_enter(&polhead->iph_lock, RW_READER);
	pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_OUTBOUND, &sel);
	rw_exit(&polhead->iph_lock);
	if (pol == NULL) {
		/*
		 * No matching policy on this tunnel, drop the packet.
		 *
		 * NOTE:  Tunnel-mode tunnels are different from the
		 * IP global transport mode policy head.  For a tunnel-mode
		 * tunnel, we drop the packet in lieu of passing it
		 * along accepted the way a global-policy miss would.
		 *
		 * NOTE2:  "negotiate transport" tunnels should match ALL
		 * inbound packets, but we do not uncomment the ASSERT()
		 * below because if/when we open PF_POLICY, a user can
		 * shoot him/her-self in the foot with a 0 priority.
		 */

		/* ASSERT(itp->itp_flags & ITPF_P_TUNNEL); */
#ifdef FRAGCACHE_DEBUG
		cmn_err(CE_WARN, "ipsec_tun_outbound(): No matching tunnel "
		    "per-port policy\n");
#endif
		ip_drop_packet_chain(mp, B_FALSE, NULL,
		    DROPPER(ipss, ipds_spd_explicit),
		    &ipss->ipsec_spd_dropper);
		return (NULL);
	}

#ifdef FRAGCACHE_DEBUG
	cmn_err(CE_WARN, "Having matching tunnel per-port policy\n");
#endif

	/*
	 * NOTE: ixa_cleanup() function will release pol references.
	 */
	ixa->ixa_ipsec_policy = pol;
	/*
	 * NOTE: There is a subtle difference between iptun_zoneid and
	 * iptun_connp->conn_zoneid explained in iptun_conn_create().  When
	 * interacting with the ip module, we must use conn_zoneid.
	 */
	ixa->ixa_zoneid = iptun->iptun_connp->conn_zoneid;

	ASSERT((outer_ipv4 != NULL) ? (ixa->ixa_flags & IXAF_IS_IPV4) :
	    !(ixa->ixa_flags & IXAF_IS_IPV4));
	ASSERT(ixa->ixa_ipsec_policy != NULL);
	ixa->ixa_flags |= IXAF_IPSEC_SECURE;

	if (!(itp->itp_flags & ITPF_P_TUNNEL)) {
		/* Set up transport mode for tunnelled packets. */
		ixa->ixa_ipsec_proto = (inner_ipv4 != NULL) ? IPPROTO_ENCAP :
		    IPPROTO_IPV6;
		return (mp);
	}

	/* Fill in tunnel-mode goodies here. */
	ixa->ixa_flags |= IXAF_IPSEC_TUNNEL;
	/* XXX Do I need to fill in all of the goodies here? */
	/*
	 * The inner selectors recorded in ixa come from the matched policy's
	 * selector key, not from the packet's own inner header.
	 */
	if (inner_ipv4) {
		ixa->ixa_ipsec_inaf = AF_INET;
		ixa->ixa_ipsec_insrc[0] =
		    pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v4;
		ixa->ixa_ipsec_indst[0] =
		    pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v4;
	} else {
		ixa->ixa_ipsec_inaf = AF_INET6;
		ixa->ixa_ipsec_insrc[0] =
		    pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[0];
		ixa->ixa_ipsec_insrc[1] =
		    pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[1];
		ixa->ixa_ipsec_insrc[2] =
		    pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[2];
		ixa->ixa_ipsec_insrc[3] =
		    pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[3];
		ixa->ixa_ipsec_indst[0] =
		    pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[0];
		ixa->ixa_ipsec_indst[1] =
		    pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[1];
		ixa->ixa_ipsec_indst[2] =
		    pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[2];
		ixa->ixa_ipsec_indst[3] =
		    pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[3];
	}
	ixa->ixa_ipsec_insrcpfx = pol->ipsp_sel->ipsl_key.ipsl_local_pfxlen;
	ixa->ixa_ipsec_indstpfx = pol->ipsp_sel->ipsl_key.ipsl_remote_pfxlen;
	/* NOTE:  These are used for transport mode too. */
	ixa->ixa_ipsec_src_port = pol->ipsp_sel->ipsl_key.ipsl_lport;
	ixa->ixa_ipsec_dst_port = pol->ipsp_sel->ipsl_key.ipsl_rport;
	ixa->ixa_ipsec_proto = pol->ipsp_sel->ipsl_key.ipsl_proto;

	return (mp);
}
5443
5444/*
5445 * NOTE: The following releases pol's reference and
5446 * calls ip_drop_packet() for me on NULL returns.
5447 */
5448mblk_t *
5449ipsec_check_ipsecin_policy_reasm(mblk_t *attr_mp, ipsec_policy_t *pol,
5450    ipha_t *inner_ipv4, ip6_t *inner_ipv6, uint64_t pkt_unique, netstack_t *ns)
5451{
5452	/* Assume attr_mp is a chain of b_next-linked ip_recv_attr mblk. */
5453	mblk_t *data_chain = NULL, *data_tail = NULL;
5454	mblk_t *next;
5455	mblk_t *data_mp;
5456	ip_recv_attr_t	iras;
5457
5458	while (attr_mp != NULL) {
5459		ASSERT(ip_recv_attr_is_mblk(attr_mp));
5460		next = attr_mp->b_next;
5461		attr_mp->b_next = NULL;  /* No tripping asserts. */
5462
5463		data_mp = attr_mp->b_cont;
5464		attr_mp->b_cont = NULL;
5465		if (!ip_recv_attr_from_mblk(attr_mp, &iras)) {
5466			/* The ill or ip_stack_t disappeared on us */
5467			freemsg(data_mp);	/* ip_drop_packet?? */
5468			ira_cleanup(&iras, B_TRUE);
5469			goto fail;
5470		}
5471
5472		/*
5473		 * Need IPPOL_REFHOLD(pol) for extras because
5474		 * ipsecin_policy does the refrele.
5475		 */
5476		IPPOL_REFHOLD(pol);
5477
5478		data_mp = ipsec_check_ipsecin_policy(data_mp, pol, inner_ipv4,
5479		    inner_ipv6, pkt_unique, &iras, ns);
5480		ira_cleanup(&iras, B_TRUE);
5481
5482		if (data_mp == NULL)
5483			goto fail;
5484
5485		if (data_tail == NULL) {
5486			/* First one */
5487			data_chain = data_tail = data_mp;
5488		} else {
5489			data_tail->b_next = data_mp;
5490			data_tail = data_mp;
5491		}
5492		attr_mp = next;
5493	}
5494	/*
5495	 * One last release because either the loop bumped it up, or we never
5496	 * called ipsec_check_ipsecin_policy().
5497	 */
5498	IPPOL_REFRELE(pol);
5499
5500	/* data_chain is ready for return to tun module. */
5501	return (data_chain);
5502
5503fail:
5504	/*
5505	 * Need to get rid of any extra pol
5506	 * references, and any remaining bits as well.
5507	 */
5508	IPPOL_REFRELE(pol);
5509	ipsec_freemsg_chain(data_chain);
5510	ipsec_freemsg_chain(next);	/* ipdrop stats? */
5511	return (NULL);
5512}
5513
/*
 * Return a message if the inbound packet passed an IPsec policy check.  Returns
 * NULL if it failed or if it is a fragment needing its friends before a
 * policy check can be performed.
 *
 * Expects a non-NULL data_mp.  itp may be NULL, or may lack ITPF_P_ACTIVE;
 * in either case no per-tunnel policy is consulted and only the global
 * policy check on the outer header is performed.
 * The returned mblk may be a b_next chain of packets if fragments
 * needed to be collected for a proper policy check.
 *
 * This function calls ip_drop_packet() on data_mp if need be.
 *
 * NOTE:  outer_hdr_len is signed.  If it's a negative value, the caller
 * is inspecting an ICMP packet.
 */
mblk_t *
ipsec_tun_inbound(ip_recv_attr_t *ira, mblk_t *data_mp, ipsec_tun_pol_t *itp,
    ipha_t *inner_ipv4, ip6_t *inner_ipv6, ipha_t *outer_ipv4,
    ip6_t *outer_ipv6, int outer_hdr_len, netstack_t *ns)
{
	ipsec_policy_head_t *polhead;
	ipsec_selector_t sel;
	ipsec_policy_t *pol;
	uint16_t tmpport;
	selret_t rc;
	boolean_t port_policy_present, is_icmp, global_present;
	in6_addr_t tmpaddr;
	ipaddr_t tmp4;
	uint8_t flags, *inner_hdr;
	ipsec_stack_t *ipss = ns->netstack_ipsec;

	sel.ips_is_icmp_inv_acq = 0;

	/* Which global inbound policy applies depends on the outer family. */
	if (outer_ipv4 != NULL) {
		ASSERT(outer_ipv6 == NULL);
		global_present = ipss->ipsec_inbound_v4_policy_present;
	} else {
		ASSERT(outer_ipv6 != NULL);
		global_present = ipss->ipsec_inbound_v6_policy_present;
	}

	ASSERT(inner_ipv4 != NULL && inner_ipv6 == NULL ||
	    inner_ipv4 == NULL && inner_ipv6 != NULL);

	/* A negative length is the caller's way of flagging an ICMP packet. */
	if (outer_hdr_len < 0) {
		outer_hdr_len = (-outer_hdr_len);
		is_icmp = B_TRUE;
	} else {
		is_icmp = B_FALSE;
	}

	if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) {
		/*
		 * mp == data_mp means "single packet".  After a successful
		 * reassembly below, mp is instead the b_next chain of
		 * attribute mblks and data_mp is the first fragment's data.
		 */
		mblk_t *mp = data_mp;

		polhead = itp->itp_policy;
		/*
		 * We need to perform full Tunnel-Mode enforcement,
		 * and we need to have inner-header data for such enforcement.
		 *
		 * See ipsec_init_inbound_sel() for the 0x80000000 on inbound
		 * and on return.
		 */

		port_policy_present = ((itp->itp_flags &
		    ITPF_P_PER_PORT_SECURITY) ? B_TRUE : B_FALSE);
		/*
		 * NOTE:  Even if our policy is transport mode, set the
		 * SEL_TUNNEL_MODE flag so ipsec_init_inbound_sel() can
		 * do the right thing w.r.t. outer headers.
		 */
		flags = ((port_policy_present ? SEL_PORT_POLICY : SEL_NONE) |
		    (is_icmp ? SEL_IS_ICMP : SEL_NONE) | SEL_TUNNEL_MODE);

		rc = ipsec_init_inbound_sel(&sel, data_mp, inner_ipv4,
		    inner_ipv6, flags);

		switch (rc) {
		case SELRET_NOMEM:
			ip_drop_packet(data_mp, B_TRUE, NULL,
			    DROPPER(ipss, ipds_spd_nomem),
			    &ipss->ipsec_spd_dropper);
			return (NULL);
		case SELRET_TUNFRAG:
			/*
			 * At this point, if we're cleartext, we don't want
			 * to go there.
			 */
			if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
				ip_drop_packet(data_mp, B_TRUE, NULL,
				    DROPPER(ipss, ipds_spd_got_clear),
				    &ipss->ipsec_spd_dropper);
				return (NULL);
			}
			/*
			 * If we need to queue the packet. First we
			 * get an mblk with the attributes. ipsec_fragcache_add
			 * will prepend that to the queued data and return
			 * a list of b_next messages each of which starts with
			 * the attribute mblk.
			 */
			mp = ip_recv_attr_to_mblk(ira);
			if (mp == NULL) {
				ip_drop_packet(data_mp, B_TRUE, NULL,
				    DROPPER(ipss, ipds_spd_nomem),
				    &ipss->ipsec_spd_dropper);
				return (NULL);
			}
			mp = ipsec_fragcache_add(&itp->itp_fragcache,
			    mp, data_mp, outer_hdr_len, ipss);

			if (mp == NULL) {
				/*
				 * Data is cached, fragment chain is not
				 * complete.
				 */
				return (NULL);
			}

			/*
			 * If we get here, we have a full fragment chain.
			 * Reacquire headers and selectors from first fragment.
			 */
			ASSERT(ip_recv_attr_is_mblk(mp));
			data_mp = mp->b_cont;
			inner_hdr = data_mp->b_rptr;
			if (outer_ipv4 != NULL) {
				inner_hdr += IPH_HDR_LENGTH(
				    (ipha_t *)data_mp->b_rptr);
			} else {
				inner_hdr += ip_hdr_length_v6(data_mp,
				    (ip6_t *)data_mp->b_rptr);
			}
			ASSERT(inner_hdr <= data_mp->b_wptr);

			if (inner_ipv4 != NULL) {
				inner_ipv4 = (ipha_t *)inner_hdr;
				inner_ipv6 = NULL;
			} else {
				inner_ipv6 = (ip6_t *)inner_hdr;
				inner_ipv4 = NULL;
			}

			/*
			 * Use SEL_TUNNEL_MODE to take into account the outer
			 * header.  Use SEL_POST_FRAG so we always get ports.
			 */
			rc = ipsec_init_inbound_sel(&sel, data_mp,
			    inner_ipv4, inner_ipv6,
			    SEL_TUNNEL_MODE | SEL_POST_FRAG);
			switch (rc) {
			case SELRET_SUCCESS:
				/*
				 * Get to same place as first caller's
				 * SELRET_SUCCESS case.
				 */
				break;
			case SELRET_NOMEM:
				ip_drop_packet_chain(mp, B_TRUE, NULL,
				    DROPPER(ipss, ipds_spd_nomem),
				    &ipss->ipsec_spd_dropper);
				return (NULL);
			case SELRET_BADPKT:
				ip_drop_packet_chain(mp, B_TRUE, NULL,
				    DROPPER(ipss, ipds_spd_malformed_frag),
				    &ipss->ipsec_spd_dropper);
				return (NULL);
			case SELRET_TUNFRAG:
				cmn_err(CE_WARN, "(TUNFRAG on 2nd call...)");
				/* FALLTHRU */
			default:
				cmn_err(CE_WARN, "ipsec_init_inbound_sel(mark2)"
				    " returns bizarro 0x%x", rc);
				/* Guaranteed panic! */
				ASSERT(rc == SELRET_NOMEM);
				return (NULL);
			}
			/* FALLTHRU */
		case SELRET_SUCCESS:
			/*
			 * Common case:
			 * No per-port policy or a non-fragment.  Keep going.
			 */
			break;
		case SELRET_BADPKT:
			/*
			 * We may receive ICMP (with IPv6 inner) packets that
			 * trigger this return value.  Send 'em in for
			 * enforcement checking.
			 */
			cmn_err(CE_NOTE, "ipsec_tun_inbound(): "
			    "sending 'bad packet' in for enforcement");
			break;
		default:
			cmn_err(CE_WARN,
			    "ipsec_init_inbound_sel() returns bizarro 0x%x",
			    rc);
			ASSERT(rc == SELRET_NOMEM);	/* Guaranteed panic! */
			return (NULL);
		}

		if (is_icmp) {
			/*
			 * Swap local/remote because this is an ICMP packet.
			 */
			tmpaddr = sel.ips_local_addr_v6;
			sel.ips_local_addr_v6 = sel.ips_remote_addr_v6;
			sel.ips_remote_addr_v6 = tmpaddr;
			tmpport = sel.ips_local_port;
			sel.ips_local_port = sel.ips_remote_port;
			sel.ips_remote_port = tmpport;
		}

		/* find_policy_head() */
		rw_enter(&polhead->iph_lock, RW_READER);
		pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_INBOUND,
		    &sel);
		rw_exit(&polhead->iph_lock);
		if (pol != NULL) {
			uint64_t pkt_unique;

			/*
			 * A cleartext packet only needs to know whether the
			 * matching policy allows cleartext; pol's reference
			 * is released either way.
			 */
			if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
				if (!pol->ipsp_act->ipa_allow_clear) {
					/*
					 * XXX should never get here with
					 * tunnel reassembled fragments?
					 */
					ASSERT(mp == data_mp);
					ip_drop_packet(data_mp, B_TRUE, NULL,
					    DROPPER(ipss, ipds_spd_got_clear),
					    &ipss->ipsec_spd_dropper);
					IPPOL_REFRELE(pol);
					return (NULL);
				} else {
					IPPOL_REFRELE(pol);
					return (mp);
				}
			}
			pkt_unique = SA_UNIQUE_ID(sel.ips_remote_port,
			    sel.ips_local_port,
			    (inner_ipv4 == NULL) ? IPPROTO_IPV6 :
			    IPPROTO_ENCAP, sel.ips_protocol);

			/*
			 * NOTE: The following releases pol's reference and
			 * calls ip_drop_packet() for me on NULL returns.
			 *
			 * "sel" is still good here, so let's use it!
			 */
			if (data_mp == mp) {
				/* A single packet without attributes */
				data_mp = ipsec_check_ipsecin_policy(data_mp,
				    pol, inner_ipv4, inner_ipv6, pkt_unique,
				    ira, ns);
			} else {
				/*
				 * We pass in the b_next chain of attr_mp's
				 * and get back a b_next chain of data_mp's.
				 */
				data_mp = ipsec_check_ipsecin_policy_reasm(mp,
				    pol, inner_ipv4, inner_ipv6, pkt_unique,
				    ns);
			}
			return (data_mp);
		}

		/*
		 * Else fallthru and check the global policy on the outer
		 * header(s) if this tunnel is an old-style transport-mode
		 * one.  Drop the packet explicitly (no policy entry) for
		 * a new-style tunnel-mode tunnel.
		 *
		 * NOTE(review): if we arrived here through the fragment-cache
		 * path, data_mp is mp->b_cont while mp is the chain of
		 * attribute mblks; confirm that dropping data_mp alone also
		 * disposes of mp and of the remaining fragments.
		 */
		if ((itp->itp_flags & ITPF_P_TUNNEL) && !is_icmp) {
			ip_drop_packet_chain(data_mp, B_TRUE, NULL,
			    DROPPER(ipss, ipds_spd_explicit),
			    &ipss->ipsec_spd_dropper);
			return (NULL);
		}
	}

	/*
	 * NOTE:  If we reach here, we will not have packet chains from
	 * fragcache_add(), because the only way I get chains is on a
	 * tunnel-mode tunnel, which either returns with a pass, or gets
	 * hit by the ip_drop_packet_chain() call right above here.
	 */
	ASSERT(data_mp->b_next == NULL);

	/* If no per-tunnel security, check global policy now. */
	if ((ira->ira_flags & IRAF_IPSEC_SECURE) && !global_present) {
		if (ira->ira_flags & IRAF_TRUSTED_ICMP) {
			/*
			 * This is an ICMP message that was generated locally.
			 * We should accept it.
			 */
			return (data_mp);
		}

		ip_drop_packet(data_mp, B_TRUE, NULL,
		    DROPPER(ipss, ipds_spd_got_secure),
		    &ipss->ipsec_spd_dropper);
		return (NULL);
	}

	if (is_icmp) {
		/*
		 * For ICMP packets, "outer_ipvN" is set to the outer header
		 * that is *INSIDE* the ICMP payload.  For global policy
		 * checking, we need to reverse src/dst on the payload in
		 * order to construct selectors appropriately.  See "ripha"
		 * constructions in ip.c.  To avoid a bug like 6478464 (see
		 * earlier in this file), we will actually exchange src/dst
		 * in the packet, and reverse it after the call to
		 * ipsec_check_global_policy().
		 */
		if (outer_ipv4 != NULL) {
			tmp4 = outer_ipv4->ipha_src;
			outer_ipv4->ipha_src = outer_ipv4->ipha_dst;
			outer_ipv4->ipha_dst = tmp4;
		} else {
			ASSERT(outer_ipv6 != NULL);
			tmpaddr = outer_ipv6->ip6_src;
			outer_ipv6->ip6_src = outer_ipv6->ip6_dst;
			outer_ipv6->ip6_dst = tmpaddr;
		}
	}

	data_mp = ipsec_check_global_policy(data_mp, NULL, outer_ipv4,
	    outer_ipv6, ira, ns);
	/* On a NULL return the addresses are not swapped back. */
	if (data_mp == NULL)
		return (NULL);

	if (is_icmp) {
		/* Set things back to normal. */
		if (outer_ipv4 != NULL) {
			tmp4 = outer_ipv4->ipha_src;
			outer_ipv4->ipha_src = outer_ipv4->ipha_dst;
			outer_ipv4->ipha_dst = tmp4;
		} else {
			/* No need for ASSERT()s now. */
			tmpaddr = outer_ipv6->ip6_src;
			outer_ipv6->ip6_src = outer_ipv6->ip6_dst;
			outer_ipv6->ip6_dst = tmpaddr;
		}
	}

	/*
	 * At this point, we pretend it's a cleartext accepted
	 * packet.
	 */
	return (data_mp);
}
5864
5865/*
5866 * AVL comparison routine for our list of tunnel polheads.
5867 */
5868static int
5869tunnel_compare(const void *arg1, const void *arg2)
5870{
5871	ipsec_tun_pol_t *left, *right;
5872	int rc;
5873
5874	left = (ipsec_tun_pol_t *)arg1;
5875	right = (ipsec_tun_pol_t *)arg2;
5876
5877	rc = strncmp(left->itp_name, right->itp_name, LIFNAMSIZ);
5878	return (rc == 0 ? rc : (rc > 0 ? 1 : -1));
5879}
5880
5881/*
5882 * Free a tunnel policy node.
5883 */
5884void
5885itp_free(ipsec_tun_pol_t *node, netstack_t *ns)
5886{
5887	if (node->itp_policy != NULL) {
5888		IPPH_REFRELE(node->itp_policy, ns);
5889		node->itp_policy = NULL;
5890	}
5891	if (node->itp_inactive != NULL) {
5892		IPPH_REFRELE(node->itp_inactive, ns);
5893		node->itp_inactive = NULL;
5894	}
5895	mutex_destroy(&node->itp_lock);
5896	kmem_free(node, sizeof (*node));
5897}
5898
5899void
5900itp_unlink(ipsec_tun_pol_t *node, netstack_t *ns)
5901{
5902	ipsec_stack_t *ipss = ns->netstack_ipsec;
5903
5904	rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_WRITER);
5905	ipss->ipsec_tunnel_policy_gen++;
5906	ipsec_fragcache_uninit(&node->itp_fragcache, ipss);
5907	avl_remove(&ipss->ipsec_tunnel_policies, node);
5908	rw_exit(&ipss->ipsec_tunnel_policy_lock);
5909	ITP_REFRELE(node, ns);
5910}
5911
5912/*
5913 * Public interface to look up a tunnel security policy by name.  Used by
5914 * spdsock mostly.  Returns "node" with a bumped refcnt.
5915 */
5916ipsec_tun_pol_t *
5917get_tunnel_policy(char *name, netstack_t *ns)
5918{
5919	ipsec_tun_pol_t *node, lookup;
5920	ipsec_stack_t *ipss = ns->netstack_ipsec;
5921
5922	(void) strncpy(lookup.itp_name, name, LIFNAMSIZ);
5923
5924	rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_READER);
5925	node = (ipsec_tun_pol_t *)avl_find(&ipss->ipsec_tunnel_policies,
5926	    &lookup, NULL);
5927	if (node != NULL) {
5928		ITP_REFHOLD(node);
5929	}
5930	rw_exit(&ipss->ipsec_tunnel_policy_lock);
5931
5932	return (node);
5933}
5934
5935/*
5936 * Public interface to walk all tunnel security polcies.  Useful for spdsock
5937 * DUMP operations.  iterator() will not consume a reference.
5938 */
5939void
5940itp_walk(void (*iterator)(ipsec_tun_pol_t *, void *, netstack_t *),
5941    void *arg, netstack_t *ns)
5942{
5943	ipsec_tun_pol_t *node;
5944	ipsec_stack_t *ipss = ns->netstack_ipsec;
5945
5946	rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_READER);
5947	for (node = avl_first(&ipss->ipsec_tunnel_policies); node != NULL;
5948	    node = AVL_NEXT(&ipss->ipsec_tunnel_policies, node)) {
5949		iterator(node, arg, ns);
5950	}
5951	rw_exit(&ipss->ipsec_tunnel_policy_lock);
5952}
5953
5954/*
5955 * Initialize policy head.  This can only fail if there's a memory problem.
5956 */
5957static boolean_t
5958tunnel_polhead_init(ipsec_policy_head_t *iph, netstack_t *ns)
5959{
5960	ipsec_stack_t *ipss = ns->netstack_ipsec;
5961
5962	rw_init(&iph->iph_lock, NULL, RW_DEFAULT, NULL);
5963	iph->iph_refs = 1;
5964	iph->iph_gen = 0;
5965	if (ipsec_alloc_table(iph, ipss->ipsec_tun_spd_hashsize,
5966	    KM_SLEEP, B_FALSE, ns) != 0) {
5967		ipsec_polhead_free_table(iph);
5968		return (B_FALSE);
5969	}
5970	ipsec_polhead_init(iph, ipss->ipsec_tun_spd_hashsize);
5971	return (B_TRUE);
5972}
5973
5974/*
5975 * Create a tunnel policy node with "name".  Set errno with
5976 * ENOMEM if there's a memory problem, and EEXIST if there's an existing
5977 * node.
5978 */
5979ipsec_tun_pol_t *
5980create_tunnel_policy(char *name, int *errno, uint64_t *gen, netstack_t *ns)
5981{
5982	ipsec_tun_pol_t *newbie, *existing;
5983	avl_index_t where;
5984	ipsec_stack_t *ipss = ns->netstack_ipsec;
5985
5986	newbie = kmem_zalloc(sizeof (*newbie), KM_NOSLEEP);
5987	if (newbie == NULL) {
5988		*errno = ENOMEM;
5989		return (NULL);
5990	}
5991	if (!ipsec_fragcache_init(&newbie->itp_fragcache)) {
5992		kmem_free(newbie, sizeof (*newbie));
5993		*errno = ENOMEM;
5994		return (NULL);
5995	}
5996
5997	(void) strncpy(newbie->itp_name, name, LIFNAMSIZ);
5998
5999	rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_WRITER);
6000	existing = (ipsec_tun_pol_t *)avl_find(&ipss->ipsec_tunnel_policies,
6001	    newbie, &where);
6002	if (existing != NULL) {
6003		itp_free(newbie, ns);
6004		*errno = EEXIST;
6005		rw_exit(&ipss->ipsec_tunnel_policy_lock);
6006		return (NULL);
6007	}
6008	ipss->ipsec_tunnel_policy_gen++;
6009	*gen = ipss->ipsec_tunnel_policy_gen;
6010	newbie->itp_refcnt = 2;	/* One for the caller, one for the tree. */
6011	newbie->itp_next_policy_index = 1;
6012	avl_insert(&ipss->ipsec_tunnel_policies, newbie, where);
6013	mutex_init(&newbie->itp_lock, NULL, MUTEX_DEFAULT, NULL);
6014	newbie->itp_policy = kmem_zalloc(sizeof (ipsec_policy_head_t),
6015	    KM_NOSLEEP);
6016	if (newbie->itp_policy == NULL)
6017		goto nomem;
6018	newbie->itp_inactive = kmem_zalloc(sizeof (ipsec_policy_head_t),
6019	    KM_NOSLEEP);
6020	if (newbie->itp_inactive == NULL) {
6021		kmem_free(newbie->itp_policy, sizeof (ipsec_policy_head_t));
6022		goto nomem;
6023	}
6024
6025	if (!tunnel_polhead_init(newbie->itp_policy, ns)) {
6026		kmem_free(newbie->itp_policy, sizeof (ipsec_policy_head_t));
6027		kmem_free(newbie->itp_inactive, sizeof (ipsec_policy_head_t));
6028		goto nomem;
6029	} else if (!tunnel_polhead_init(newbie->itp_inactive, ns)) {
6030		IPPH_REFRELE(newbie->itp_policy, ns);
6031		kmem_free(newbie->itp_inactive, sizeof (ipsec_policy_head_t));
6032		goto nomem;
6033	}
6034	rw_exit(&ipss->ipsec_tunnel_policy_lock);
6035
6036	return (newbie);
6037nomem:
6038	*errno = ENOMEM;
6039	kmem_free(newbie, sizeof (*newbie));
6040	return (NULL);
6041}
6042
6043/*
6044 * Given two addresses, find a tunnel instance's IPsec policy heads.
6045 * Returns NULL on failure.
6046 */
6047ipsec_tun_pol_t *
6048itp_get_byaddr(uint32_t *laddr, uint32_t *faddr, int af, ip_stack_t *ipst)
6049{
6050	conn_t *connp;
6051	iptun_t *iptun;
6052	ipsec_tun_pol_t *itp = NULL;
6053
6054	/* Classifiers are used to "src" being foreign. */
6055	if (af == AF_INET) {
6056		connp = ipcl_iptun_classify_v4((ipaddr_t *)faddr,
6057		    (ipaddr_t *)laddr, ipst);
6058	} else {
6059		ASSERT(af == AF_INET6);
6060		ASSERT(!IN6_IS_ADDR_V4MAPPED((in6_addr_t *)laddr));
6061		ASSERT(!IN6_IS_ADDR_V4MAPPED((in6_addr_t *)faddr));
6062		connp = ipcl_iptun_classify_v6((in6_addr_t *)faddr,
6063		    (in6_addr_t *)laddr, ipst);
6064	}
6065
6066	if (connp == NULL)
6067		return (NULL);
6068
6069	if (IPCL_IS_IPTUN(connp)) {
6070		iptun = connp->conn_iptun;
6071		if (iptun != NULL) {
6072			itp = iptun->iptun_itp;
6073			if (itp != NULL) {
6074				/* Braces due to the macro's nature... */
6075				ITP_REFHOLD(itp);
6076			}
6077		}  /* Else itp is already NULL. */
6078	}
6079
6080	CONN_DEC_REF(connp);
6081	return (itp);
6082}
6083
6084/*
6085 * Frag cache code, based on SunScreen 3.2 source
6086 *	screen/kernel/common/screen_fragcache.c
6087 */
6088
#define	IPSEC_FRAG_TTL_MAX	5
/*
 * Note that the following parameters create 256 hash buckets
 * with 1024 free entries to be distributed.  Things are cleaned
 * periodically and are attempted to be cleaned when there is no
 * free space, but this system errs on the side of dropping packets
 * over creating memory exhaustion.  We may decide to make hash
 * factor a tunable if this proves to be a bad decision.
 */
#define	IPSEC_FRAG_HASH_SLOTS	(1<<8)
#define	IPSEC_FRAG_HASH_FACTOR	4
#define	IPSEC_FRAG_HASH_SIZE	(IPSEC_FRAG_HASH_SLOTS * IPSEC_FRAG_HASH_FACTOR)

#define	IPSEC_FRAG_HASH_MASK		(IPSEC_FRAG_HASH_SLOTS - 1)
/*
 * Fold an id into a bucket index: XOR its low 8 bits with the next 8 bits.
 */
#define	IPSEC_FRAG_HASH_FUNC(id)	(((id) & IPSEC_FRAG_HASH_MASK) ^ \
					    (((id) / \
					    (ushort_t)IPSEC_FRAG_HASH_SLOTS) & \
					    IPSEC_FRAG_HASH_MASK))

/* Maximum fragments per packet.  48 bytes payload x 1366 packets > 64KB */
#define	IPSEC_MAX_FRAGS		1366

/*
 * Byte offset of an IPv4 fragment, and its more-fragments flag (non-zero
 * if set).  The argument is parenthesized so that any pointer expression,
 * not just a plain identifier, may be passed.
 */
#define	V4_FRAG_OFFSET(ipha) \
	((ntohs((ipha)->ipha_fragment_offset_and_flags) & IPH_OFFSET) << 3)
#define	V4_MORE_FRAGS(ipha) \
	(ntohs((ipha)->ipha_fragment_offset_and_flags) & IPH_MF)
6115
6116/*
6117 * Initialize an ipsec fragcache instance.
6118 * Returns B_FALSE if memory allocation fails.
6119 */
6120boolean_t
6121ipsec_fragcache_init(ipsec_fragcache_t *frag)
6122{
6123	ipsec_fragcache_entry_t *ftemp;
6124	int i;
6125
6126	mutex_init(&frag->itpf_lock, NULL, MUTEX_DEFAULT, NULL);
6127	frag->itpf_ptr = (ipsec_fragcache_entry_t **)
6128	    kmem_zalloc(sizeof (ipsec_fragcache_entry_t *) *
6129	    IPSEC_FRAG_HASH_SLOTS, KM_NOSLEEP);
6130	if (frag->itpf_ptr == NULL)
6131		return (B_FALSE);
6132
6133	ftemp = (ipsec_fragcache_entry_t *)
6134	    kmem_zalloc(sizeof (ipsec_fragcache_entry_t) *
6135	    IPSEC_FRAG_HASH_SIZE, KM_NOSLEEP);
6136	if (ftemp == NULL) {
6137		kmem_free(frag->itpf_ptr, sizeof (ipsec_fragcache_entry_t *) *
6138		    IPSEC_FRAG_HASH_SLOTS);
6139		return (B_FALSE);
6140	}
6141
6142	frag->itpf_freelist = NULL;
6143
6144	for (i = 0; i < IPSEC_FRAG_HASH_SIZE; i++) {
6145		ftemp->itpfe_next = frag->itpf_freelist;
6146		frag->itpf_freelist = ftemp;
6147		ftemp++;
6148	}
6149
6150	frag->itpf_expire_hint = 0;
6151
6152	return (B_TRUE);
6153}
6154
6155void
6156ipsec_fragcache_uninit(ipsec_fragcache_t *frag, ipsec_stack_t *ipss)
6157{
6158	ipsec_fragcache_entry_t *fep;
6159	int i;
6160
6161	mutex_enter(&frag->itpf_lock);
6162	if (frag->itpf_ptr) {
6163		/* Delete any existing fragcache entry chains */
6164		for (i = 0; i < IPSEC_FRAG_HASH_SLOTS; i++) {
6165			fep = (frag->itpf_ptr)[i];
6166			while (fep != NULL) {
6167				/* Returned fep is next in chain or NULL */
6168				fep = fragcache_delentry(i, fep, frag, ipss);
6169			}
6170		}
6171		/*
6172		 * Chase the pointers back to the beginning
6173		 * of the memory allocation and then
6174		 * get rid of the allocated freelist
6175		 */
6176		while (frag->itpf_freelist->itpfe_next != NULL)
6177			frag->itpf_freelist = frag->itpf_freelist->itpfe_next;
6178		/*
6179		 * XXX - If we ever dynamically grow the freelist
6180		 * then we'll have to free entries individually
6181		 * or determine how many entries or chunks we have
6182		 * grown since the initial allocation.
6183		 */
6184		kmem_free(frag->itpf_freelist,
6185		    sizeof (ipsec_fragcache_entry_t) *
6186		    IPSEC_FRAG_HASH_SIZE);
6187		/* Free the fragcache structure */
6188		kmem_free(frag->itpf_ptr,
6189		    sizeof (ipsec_fragcache_entry_t *) *
6190		    IPSEC_FRAG_HASH_SLOTS);
6191	}
6192	mutex_exit(&frag->itpf_lock);
6193	mutex_destroy(&frag->itpf_lock);
6194}
6195
6196/*
6197 * Add a fragment to the fragment cache.   Consumes mp if NULL is returned.
6198 * Returns mp if a whole fragment has been assembled, NULL otherwise
6199 * The returned mp could be a b_next chain of fragments.
6200 *
6201 * The iramp argument is set on inbound; NULL if outbound.
6202 */
6203mblk_t *
6204ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *iramp, mblk_t *mp,
6205    int outer_hdr_len, ipsec_stack_t *ipss)
6206{
6207	boolean_t is_v4;
6208	time_t itpf_time;
6209	ipha_t *iph;
6210	ipha_t *oiph;
6211	ip6_t *ip6h = NULL;
6212	uint8_t v6_proto;
6213	uint8_t *v6_proto_p;
6214	uint16_t ip6_hdr_length;
6215	ip_pkt_t ipp;
6216	ip6_frag_t *fraghdr;
6217	ipsec_fragcache_entry_t *fep;
6218	int i;
6219	mblk_t *nmp, *prevmp;
6220	int firstbyte, lastbyte;
6221	int offset;
6222	int last;
6223	boolean_t inbound = (iramp != NULL);
6224
6225	/*
6226	 * You're on the slow path, so insure that every packet in the
6227	 * cache is a single-mblk one.
6228	 */
6229	if (mp->b_cont != NULL) {
6230		nmp = msgpullup(mp, -1);
6231		if (nmp == NULL) {
6232			ip_drop_packet(mp, inbound, NULL,
6233			    DROPPER(ipss, ipds_spd_nomem),
6234			    &ipss->ipsec_spd_dropper);
6235			if (inbound)
6236				(void) ip_recv_attr_free_mblk(iramp);
6237			return (NULL);
6238		}
6239		freemsg(mp);
6240		mp = nmp;
6241	}
6242
6243	mutex_enter(&frag->itpf_lock);
6244
6245	oiph  = (ipha_t *)mp->b_rptr;
6246	iph  = (ipha_t *)(mp->b_rptr + outer_hdr_len);
6247
6248	if (IPH_HDR_VERSION(iph) == IPV4_VERSION) {
6249		is_v4 = B_TRUE;
6250	} else {
6251		ASSERT(IPH_HDR_VERSION(iph) == IPV6_VERSION);
6252		ip6h = (ip6_t *)(mp->b_rptr + outer_hdr_len);
6253
6254		if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip6_hdr_length,
6255		    &v6_proto_p)) {
6256			/*
6257			 * Find upper layer protocol.
6258			 * If it fails we have a malformed packet
6259			 */
6260			mutex_exit(&frag->itpf_lock);
6261			ip_drop_packet(mp, inbound, NULL,
6262			    DROPPER(ipss, ipds_spd_malformed_packet),
6263			    &ipss->ipsec_spd_dropper);
6264			if (inbound)
6265				(void) ip_recv_attr_free_mblk(iramp);
6266			return (NULL);
6267		} else {
6268			v6_proto = *v6_proto_p;
6269		}
6270
6271
6272		bzero(&ipp, sizeof (ipp));
6273		(void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &ipp, NULL);
6274		if (!(ipp.ipp_fields & IPPF_FRAGHDR)) {
6275			/*
6276			 * We think this is a fragment, but didn't find
6277			 * a fragment header.  Something is wrong.
6278			 */
6279			mutex_exit(&frag->itpf_lock);
6280			ip_drop_packet(mp, inbound, NULL,
6281			    DROPPER(ipss, ipds_spd_malformed_frag),
6282			    &ipss->ipsec_spd_dropper);
6283			if (inbound)
6284				(void) ip_recv_attr_free_mblk(iramp);
6285			return (NULL);
6286		}
6287		fraghdr = ipp.ipp_fraghdr;
6288		is_v4 = B_FALSE;
6289	}
6290
6291	/* Anything to cleanup? */
6292
6293	/*
6294	 * This cleanup call could be put in a timer loop
6295	 * but it may actually be just as reasonable a decision to
6296	 * leave it here.  The disadvantage is this only gets called when
6297	 * frags are added.  The advantage is that it is not
6298	 * susceptible to race conditions like a time-based cleanup
6299	 * may be.
6300	 */
6301	itpf_time = gethrestime_sec();
6302	if (itpf_time >= frag->itpf_expire_hint)
6303		ipsec_fragcache_clean(frag, ipss);
6304
6305	/* Lookup to see if there is an existing entry */
6306
6307	if (is_v4)
6308		i = IPSEC_FRAG_HASH_FUNC(iph->ipha_ident);
6309	else
6310		i = IPSEC_FRAG_HASH_FUNC(fraghdr->ip6f_ident);
6311
6312	for (fep = (frag->itpf_ptr)[i]; fep; fep = fep->itpfe_next) {
6313		if (is_v4) {
6314			ASSERT(iph != NULL);
6315			if ((fep->itpfe_id == iph->ipha_ident) &&
6316			    (fep->itpfe_src == iph->ipha_src) &&
6317			    (fep->itpfe_dst == iph->ipha_dst) &&
6318			    (fep->itpfe_proto == iph->ipha_protocol))
6319				break;
6320		} else {
6321			ASSERT(fraghdr != NULL);
6322			ASSERT(fep != NULL);
6323			if ((fep->itpfe_id == fraghdr->ip6f_ident) &&
6324			    IN6_ARE_ADDR_EQUAL(&fep->itpfe_src6,
6325			    &ip6h->ip6_src) &&
6326			    IN6_ARE_ADDR_EQUAL(&fep->itpfe_dst6,
6327			    &ip6h->ip6_dst) && (fep->itpfe_proto == v6_proto))
6328				break;
6329		}
6330	}
6331
6332	if (is_v4) {
6333		firstbyte = V4_FRAG_OFFSET(iph);
6334		lastbyte  = firstbyte + ntohs(iph->ipha_length) -
6335		    IPH_HDR_LENGTH(iph);
6336		last = (V4_MORE_FRAGS(iph) == 0);
6337#ifdef FRAGCACHE_DEBUG
6338		cmn_err(CE_WARN, "V4 fragcache: firstbyte = %d, lastbyte = %d, "
6339		    "last = %d, id = %d\n", firstbyte, lastbyte, last,
6340		    iph->ipha_ident);
6341#endif
6342	} else {
6343		firstbyte = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK);
6344		lastbyte  = firstbyte + ntohs(ip6h->ip6_plen) +
6345		    sizeof (ip6_t) - ip6_hdr_length;
6346		last = (fraghdr->ip6f_offlg & IP6F_MORE_FRAG) == 0;
6347#ifdef FRAGCACHE_DEBUG
6348		cmn_err(CE_WARN, "V6 fragcache: firstbyte = %d, lastbyte = %d, "
6349		    "last = %d, id = %d, fraghdr = %p, mp = %p\n",
6350		    firstbyte, lastbyte, last, fraghdr->ip6f_ident,
6351		    fraghdr, mp);
6352#endif
6353	}
6354
6355	/* check for bogus fragments and delete the entry */
6356	if (firstbyte > 0 && firstbyte <= 8) {
6357		if (fep != NULL)
6358			(void) fragcache_delentry(i, fep, frag, ipss);
6359		mutex_exit(&frag->itpf_lock);
6360		ip_drop_packet(mp, inbound, NULL,
6361		    DROPPER(ipss, ipds_spd_malformed_frag),
6362		    &ipss->ipsec_spd_dropper);
6363		if (inbound)
6364			(void) ip_recv_attr_free_mblk(iramp);
6365		return (NULL);
6366	}
6367
6368	/* Not found, allocate a new entry */
6369	if (fep == NULL) {
6370		if (frag->itpf_freelist == NULL) {
6371			/* see if there is some space */
6372			ipsec_fragcache_clean(frag, ipss);
6373			if (frag->itpf_freelist == NULL) {
6374				mutex_exit(&frag->itpf_lock);
6375				ip_drop_packet(mp, inbound, NULL,
6376				    DROPPER(ipss, ipds_spd_nomem),
6377				    &ipss->ipsec_spd_dropper);
6378				if (inbound)
6379					(void) ip_recv_attr_free_mblk(iramp);
6380				return (NULL);
6381			}
6382		}
6383
6384		fep = frag->itpf_freelist;
6385		frag->itpf_freelist = fep->itpfe_next;
6386
6387		if (is_v4) {
6388			bcopy((caddr_t)&iph->ipha_src, (caddr_t)&fep->itpfe_src,
6389			    sizeof (struct in_addr));
6390			bcopy((caddr_t)&iph->ipha_dst, (caddr_t)&fep->itpfe_dst,
6391			    sizeof (struct in_addr));
6392			fep->itpfe_id = iph->ipha_ident;
6393			fep->itpfe_proto = iph->ipha_protocol;
6394			i = IPSEC_FRAG_HASH_FUNC(fep->itpfe_id);
6395		} else {
6396			bcopy((in6_addr_t *)&ip6h->ip6_src,
6397			    (in6_addr_t *)&fep->itpfe_src6,
6398			    sizeof (struct in6_addr));
6399			bcopy((in6_addr_t *)&ip6h->ip6_dst,
6400			    (in6_addr_t *)&fep->itpfe_dst6,
6401			    sizeof (struct in6_addr));
6402			fep->itpfe_id = fraghdr->ip6f_ident;
6403			fep->itpfe_proto = v6_proto;
6404			i = IPSEC_FRAG_HASH_FUNC(fep->itpfe_id);
6405		}
6406		itpf_time = gethrestime_sec();
6407		fep->itpfe_exp = itpf_time + IPSEC_FRAG_TTL_MAX + 1;
6408		fep->itpfe_last = 0;
6409		fep->itpfe_fraglist = NULL;
6410		fep->itpfe_depth = 0;
6411		fep->itpfe_next = (frag->itpf_ptr)[i];
6412		(frag->itpf_ptr)[i] = fep;
6413
6414		if (frag->itpf_expire_hint > fep->itpfe_exp)
6415			frag->itpf_expire_hint = fep->itpfe_exp;
6416
6417	}
6418
6419	/* Insert it in the frag list */
6420	/* List is in order by starting offset of fragments */
6421
6422	prevmp = NULL;
6423	for (nmp = fep->itpfe_fraglist; nmp; nmp = nmp->b_next) {
6424		ipha_t *niph;
6425		ipha_t *oniph;
6426		ip6_t *nip6h;
6427		ip_pkt_t nipp;
6428		ip6_frag_t *nfraghdr;
6429		uint16_t nip6_hdr_length;
6430		uint8_t *nv6_proto_p;
6431		int nfirstbyte, nlastbyte;
6432		char *data, *ndata;
6433		mblk_t *ndata_mp = (inbound ? nmp->b_cont : nmp);
6434		int hdr_len;
6435
6436		oniph  = (ipha_t *)mp->b_rptr;
6437		nip6h = NULL;
6438		niph = NULL;
6439
6440		/*
6441		 * Determine outer header type and length and set
6442		 * pointers appropriately
6443		 */
6444
6445		if (IPH_HDR_VERSION(oniph) == IPV4_VERSION) {
6446			hdr_len = ((outer_hdr_len != 0) ?
6447			    IPH_HDR_LENGTH(oiph) : 0);
6448			niph = (ipha_t *)(ndata_mp->b_rptr + hdr_len);
6449		} else {
6450			ASSERT(IPH_HDR_VERSION(oniph) == IPV6_VERSION);
6451			ASSERT(ndata_mp->b_cont == NULL);
6452			nip6h = (ip6_t *)ndata_mp->b_rptr;
6453			(void) ip_hdr_length_nexthdr_v6(ndata_mp, nip6h,
6454			    &nip6_hdr_length, &v6_proto_p);
6455			hdr_len = ((outer_hdr_len != 0) ? nip6_hdr_length : 0);
6456		}
6457
6458		/*
6459		 * Determine inner header type and length and set
6460		 * pointers appropriately
6461		 */
6462
6463		if (is_v4) {
6464			if (niph == NULL) {
6465				/* Was v6 outer */
6466				niph = (ipha_t *)(ndata_mp->b_rptr + hdr_len);
6467			}
6468			nfirstbyte = V4_FRAG_OFFSET(niph);
6469			nlastbyte = nfirstbyte + ntohs(niph->ipha_length) -
6470			    IPH_HDR_LENGTH(niph);
6471		} else {
6472			ASSERT(ndata_mp->b_cont == NULL);
6473			nip6h = (ip6_t *)(ndata_mp->b_rptr + hdr_len);
6474			if (!ip_hdr_length_nexthdr_v6(ndata_mp, nip6h,
6475			    &nip6_hdr_length, &nv6_proto_p)) {
6476				mutex_exit(&frag->itpf_lock);
6477				ip_drop_packet_chain(nmp, inbound, NULL,
6478				    DROPPER(ipss, ipds_spd_malformed_frag),
6479				    &ipss->ipsec_spd_dropper);
6480				ipsec_freemsg_chain(ndata_mp);
6481				if (inbound)
6482					(void) ip_recv_attr_free_mblk(iramp);
6483				return (NULL);
6484			}
6485			bzero(&nipp, sizeof (nipp));
6486			(void) ip_find_hdr_v6(ndata_mp, nip6h, B_FALSE, &nipp,
6487			    NULL);
6488			nfraghdr = nipp.ipp_fraghdr;
6489			nfirstbyte = ntohs(nfraghdr->ip6f_offlg &
6490			    IP6F_OFF_MASK);
6491			nlastbyte  = nfirstbyte + ntohs(nip6h->ip6_plen) +
6492			    sizeof (ip6_t) - nip6_hdr_length;
6493		}
6494
6495		/* Check for overlapping fragments */
6496		if (firstbyte >= nfirstbyte && firstbyte < nlastbyte) {
6497			/*
6498			 * Overlap Check:
6499			 *  ~~~~---------		# Check if the newly
6500			 * ~	ndata_mp|		# received fragment
6501			 *  ~~~~---------		# overlaps with the
6502			 *	 ---------~~~~~~	# current fragment.
6503			 *	|    mp		~
6504			 *	 ---------~~~~~~
6505			 */
6506			if (is_v4) {
6507				data  = (char *)iph  + IPH_HDR_LENGTH(iph) +
6508				    firstbyte - nfirstbyte;
6509				ndata = (char *)niph + IPH_HDR_LENGTH(niph);
6510			} else {
6511				data  = (char *)ip6h  +
6512				    nip6_hdr_length + firstbyte -
6513				    nfirstbyte;
6514				ndata = (char *)nip6h + nip6_hdr_length;
6515			}
6516			if (bcmp(data, ndata, MIN(lastbyte, nlastbyte) -
6517			    firstbyte)) {
6518				/* Overlapping data does not match */
6519				(void) fragcache_delentry(i, fep, frag, ipss);
6520				mutex_exit(&frag->itpf_lock);
6521				ip_drop_packet(mp, inbound, NULL,
6522				    DROPPER(ipss, ipds_spd_overlap_frag),
6523				    &ipss->ipsec_spd_dropper);
6524				if (inbound)
6525					(void) ip_recv_attr_free_mblk(iramp);
6526				return (NULL);
6527			}
6528			/* Part of defense for jolt2.c fragmentation attack */
6529			if (firstbyte >= nfirstbyte && lastbyte <= nlastbyte) {
6530				/*
6531				 * Check for identical or subset fragments:
6532				 *  ----------	    ~~~~--------~~~~~
6533				 * |    nmp   | or  ~	   nmp	    ~
6534				 *  ----------	    ~~~~--------~~~~~
6535				 *  ----------		  ------
6536				 * |	mp    |		 |  mp  |
6537				 *  ----------		  ------
6538				 */
6539				mutex_exit(&frag->itpf_lock);
6540				ip_drop_packet(mp, inbound, NULL,
6541				    DROPPER(ipss, ipds_spd_evil_frag),
6542				    &ipss->ipsec_spd_dropper);
6543				if (inbound)
6544					(void) ip_recv_attr_free_mblk(iramp);
6545				return (NULL);
6546			}
6547
6548		}
6549
6550		/* Correct location for this fragment? */
6551		if (firstbyte <= nfirstbyte) {
6552			/*
6553			 * Check if the tail end of the new fragment overlaps
6554			 * with the head of the current fragment.
6555			 *	  --------~~~~~~~
6556			 *	 |    nmp	~
6557			 *	  --------~~~~~~~
6558			 *  ~~~~~--------
6559			 *  ~	mp	 |
6560			 *  ~~~~~--------
6561			 */
6562			if (lastbyte > nfirstbyte) {
6563				/* Fragments overlap */
6564				data  = (char *)iph  + IPH_HDR_LENGTH(iph) +
6565				    firstbyte - nfirstbyte;
6566				ndata = (char *)niph + IPH_HDR_LENGTH(niph);
6567				if (is_v4) {
6568					data  = (char *)iph +
6569					    IPH_HDR_LENGTH(iph) + firstbyte -
6570					    nfirstbyte;
6571					ndata = (char *)niph +
6572					    IPH_HDR_LENGTH(niph);
6573				} else {
6574					data  = (char *)ip6h  +
6575					    nip6_hdr_length + firstbyte -
6576					    nfirstbyte;
6577					ndata = (char *)nip6h + nip6_hdr_length;
6578				}
6579				if (bcmp(data, ndata, MIN(lastbyte, nlastbyte)
6580				    - nfirstbyte)) {
6581					/* Overlap mismatch */
6582					(void) fragcache_delentry(i, fep, frag,
6583					    ipss);
6584					mutex_exit(&frag->itpf_lock);
6585					ip_drop_packet(mp, inbound, NULL,
6586					    DROPPER(ipss,
6587					    ipds_spd_overlap_frag),
6588					    &ipss->ipsec_spd_dropper);
6589					if (inbound) {
6590						(void) ip_recv_attr_free_mblk(
6591						    iramp);
6592					}
6593					return (NULL);
6594				}
6595			}
6596
6597			/*
6598			 * Fragment does not illegally overlap and can now
6599			 * be inserted into the chain
6600			 */
6601			break;
6602		}
6603
6604		prevmp = nmp;
6605	}
6606	/* Prepend the attributes before we link it in */
6607	if (iramp != NULL) {
6608		ASSERT(iramp->b_cont == NULL);
6609		iramp->b_cont = mp;
6610		mp = iramp;
6611		iramp = NULL;
6612	}
6613	mp->b_next = nmp;
6614
6615	if (prevmp == NULL) {
6616		fep->itpfe_fraglist = mp;
6617	} else {
6618		prevmp->b_next = mp;
6619	}
6620	if (last)
6621		fep->itpfe_last = 1;
6622
6623	/* Part of defense for jolt2.c fragmentation attack */
6624	if (++(fep->itpfe_depth) > IPSEC_MAX_FRAGS) {
6625		(void) fragcache_delentry(i, fep, frag, ipss);
6626		mutex_exit(&frag->itpf_lock);
6627		if (inbound)
6628			mp = ip_recv_attr_free_mblk(mp);
6629
6630		ip_drop_packet(mp, inbound, NULL,
6631		    DROPPER(ipss, ipds_spd_max_frags),
6632		    &ipss->ipsec_spd_dropper);
6633		return (NULL);
6634	}
6635
6636	/* Check for complete packet */
6637
6638	if (!fep->itpfe_last) {
6639		mutex_exit(&frag->itpf_lock);
6640#ifdef FRAGCACHE_DEBUG
6641		cmn_err(CE_WARN, "Fragment cached, not last.\n");
6642#endif
6643		return (NULL);
6644	}
6645
6646#ifdef FRAGCACHE_DEBUG
6647	cmn_err(CE_WARN, "Last fragment cached.\n");
6648	cmn_err(CE_WARN, "mp = %p\n", mp);
6649#endif
6650
6651	offset = 0;
6652	for (mp = fep->itpfe_fraglist; mp; mp = mp->b_next) {
6653		mblk_t *data_mp = (inbound ? mp->b_cont : mp);
6654		int hdr_len;
6655
6656		oiph  = (ipha_t *)data_mp->b_rptr;
6657		ip6h = NULL;
6658		iph = NULL;
6659
6660		if (IPH_HDR_VERSION(oiph) == IPV4_VERSION) {
6661			hdr_len = ((outer_hdr_len != 0) ?
6662			    IPH_HDR_LENGTH(oiph) : 0);
6663			iph = (ipha_t *)(data_mp->b_rptr + hdr_len);
6664		} else {
6665			ASSERT(IPH_HDR_VERSION(oiph) == IPV6_VERSION);
6666			ASSERT(data_mp->b_cont == NULL);
6667			ip6h = (ip6_t *)data_mp->b_rptr;
6668			(void) ip_hdr_length_nexthdr_v6(data_mp, ip6h,
6669			    &ip6_hdr_length, &v6_proto_p);
6670			hdr_len = ((outer_hdr_len != 0) ? ip6_hdr_length : 0);
6671		}
6672
6673		/* Calculate current fragment start/end */
6674		if (is_v4) {
6675			if (iph == NULL) {
6676				/* Was v6 outer */
6677				iph = (ipha_t *)(data_mp->b_rptr + hdr_len);
6678			}
6679			firstbyte = V4_FRAG_OFFSET(iph);
6680			lastbyte = firstbyte + ntohs(iph->ipha_length) -
6681			    IPH_HDR_LENGTH(iph);
6682		} else {
6683			ASSERT(data_mp->b_cont == NULL);
6684			ip6h = (ip6_t *)(data_mp->b_rptr + hdr_len);
6685			if (!ip_hdr_length_nexthdr_v6(data_mp, ip6h,
6686			    &ip6_hdr_length, &v6_proto_p)) {
6687				mutex_exit(&frag->itpf_lock);
6688				ip_drop_packet_chain(mp, inbound, NULL,
6689				    DROPPER(ipss, ipds_spd_malformed_frag),
6690				    &ipss->ipsec_spd_dropper);
6691				return (NULL);
6692			}
6693			v6_proto = *v6_proto_p;
6694			bzero(&ipp, sizeof (ipp));
6695			(void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp,
6696			    NULL);
6697			fraghdr = ipp.ipp_fraghdr;
6698			firstbyte = ntohs(fraghdr->ip6f_offlg &
6699			    IP6F_OFF_MASK);
6700			lastbyte  = firstbyte + ntohs(ip6h->ip6_plen) +
6701			    sizeof (ip6_t) - ip6_hdr_length;
6702		}
6703
6704		/*
6705		 * If this fragment is greater than current offset,
6706		 * we have a missing fragment so return NULL
6707		 */
6708		if (firstbyte > offset) {
6709			mutex_exit(&frag->itpf_lock);
6710#ifdef FRAGCACHE_DEBUG
6711			/*
6712			 * Note, this can happen when the last frag
6713			 * gets sent through because it is smaller
6714			 * than the MTU.  It is not necessarily an
6715			 * error condition.
6716			 */
6717			cmn_err(CE_WARN, "Frag greater than offset! : "
6718			    "missing fragment: firstbyte = %d, offset = %d, "
6719			    "mp = %p\n", firstbyte, offset, mp);
6720#endif
6721			return (NULL);
6722		}
6723
6724		/*
6725		 * If we are at the last fragment, we have the complete
6726		 * packet, so rechain things and return it to caller
6727		 * for processing
6728		 */
6729
6730		if ((is_v4 && !V4_MORE_FRAGS(iph)) ||
6731		    (!is_v4 && !(fraghdr->ip6f_offlg & IP6F_MORE_FRAG))) {
6732			mp = fep->itpfe_fraglist;
6733			fep->itpfe_fraglist = NULL;
6734			(void) fragcache_delentry(i, fep, frag, ipss);
6735			mutex_exit(&frag->itpf_lock);
6736
6737			if ((is_v4 && (firstbyte + ntohs(iph->ipha_length) >
6738			    65535)) || (!is_v4 && (firstbyte +
6739			    ntohs(ip6h->ip6_plen) > 65535))) {
6740				/* It is an invalid "ping-o-death" packet */
6741				/* Discard it */
6742				ip_drop_packet_chain(mp, inbound, NULL,
6743				    DROPPER(ipss, ipds_spd_evil_frag),
6744				    &ipss->ipsec_spd_dropper);
6745				return (NULL);
6746			}
6747#ifdef FRAGCACHE_DEBUG
6748			cmn_err(CE_WARN, "Fragcache returning mp = %p, "
6749			    "mp->b_next = %p", mp, mp->b_next);
6750#endif
6751			/*
6752			 * For inbound case, mp has attrmp b_next'd chain
6753			 * For outbound case, it is just data mp chain
6754			 */
6755			return (mp);
6756		}
6757
6758		/*
6759		 * Update new ending offset if this
6760		 * fragment extends the packet
6761		 */
6762		if (offset < lastbyte)
6763			offset = lastbyte;
6764	}
6765
6766	mutex_exit(&frag->itpf_lock);
6767
6768	/* Didn't find last fragment, so return NULL */
6769	return (NULL);
6770}
6771
6772static void
6773ipsec_fragcache_clean(ipsec_fragcache_t *frag, ipsec_stack_t *ipss)
6774{
6775	ipsec_fragcache_entry_t *fep;
6776	int i;
6777	ipsec_fragcache_entry_t *earlyfep = NULL;
6778	time_t itpf_time;
6779	int earlyexp;
6780	int earlyi = 0;
6781
6782	ASSERT(MUTEX_HELD(&frag->itpf_lock));
6783
6784	itpf_time = gethrestime_sec();
6785	earlyexp = itpf_time + 10000;
6786
6787	for (i = 0; i < IPSEC_FRAG_HASH_SLOTS; i++) {
6788		fep = (frag->itpf_ptr)[i];
6789		while (fep) {
6790			if (fep->itpfe_exp < itpf_time) {
6791				/* found */
6792				fep = fragcache_delentry(i, fep, frag, ipss);
6793			} else {
6794				if (fep->itpfe_exp < earlyexp) {
6795					earlyfep = fep;
6796					earlyexp = fep->itpfe_exp;
6797					earlyi = i;
6798				}
6799				fep = fep->itpfe_next;
6800			}
6801		}
6802	}
6803
6804	frag->itpf_expire_hint = earlyexp;
6805
6806	/* if (!found) */
6807	if (frag->itpf_freelist == NULL)
6808		(void) fragcache_delentry(earlyi, earlyfep, frag, ipss);
6809}
6810
6811static ipsec_fragcache_entry_t *
6812fragcache_delentry(int slot, ipsec_fragcache_entry_t *fep,
6813    ipsec_fragcache_t *frag, ipsec_stack_t *ipss)
6814{
6815	ipsec_fragcache_entry_t *targp;
6816	ipsec_fragcache_entry_t *nextp = fep->itpfe_next;
6817
6818	ASSERT(MUTEX_HELD(&frag->itpf_lock));
6819
6820	/* Free up any fragment list still in cache entry */
6821	if (fep->itpfe_fraglist != NULL) {
6822		ip_drop_packet_chain(fep->itpfe_fraglist,
6823		    ip_recv_attr_is_mblk(fep->itpfe_fraglist), NULL,
6824		    DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper);
6825	}
6826	fep->itpfe_fraglist = NULL;
6827
6828	targp = (frag->itpf_ptr)[slot];
6829	ASSERT(targp != 0);
6830
6831	if (targp == fep) {
6832		/* unlink from head of hash chain */
6833		(frag->itpf_ptr)[slot] = nextp;
6834		/* link into free list */
6835		fep->itpfe_next = frag->itpf_freelist;
6836		frag->itpf_freelist = fep;
6837		return (nextp);
6838	}
6839
6840	/* maybe should use double linked list to make update faster */
6841	/* must be past front of chain */
6842	while (targp) {
6843		if (targp->itpfe_next == fep) {
6844			/* unlink from hash chain */
6845			targp->itpfe_next = nextp;
6846			/* link into free list */
6847			fep->itpfe_next = frag->itpf_freelist;
6848			frag->itpf_freelist = fep;
6849			return (nextp);
6850		}
6851		targp = targp->itpfe_next;
6852		ASSERT(targp != 0);
6853	}
6854	/* NOTREACHED */
6855	return (NULL);
6856}
6857