ipclassifier.c revision 11042:2d6e217af1b4
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * IP PACKET CLASSIFIER
28 *
29 * The IP packet classifier provides mapping between IP packets and persistent
30 * connection state for connection-oriented protocols. It also provides
31 * interface for managing connection states.
32 *
33 * The connection state is kept in conn_t data structure and contains, among
34 * other things:
35 *
36 *	o local/remote address and ports
37 *	o Transport protocol
38 *	o squeue for the connection (for TCP only)
39 *	o reference counter
40 *	o Connection state
41 *	o hash table linkage
42 *	o interface/ire information
43 *	o credentials
44 *	o ipsec policy
45 *	o send and receive functions.
46 *	o mutex lock.
47 *
48 * Connections use a reference counting scheme. They are freed when the
49 * reference counter drops to zero. A reference is incremented when connection
50 * is placed in a list or table, when incoming packet for the connection arrives
51 * and when connection is processed via squeue (squeue processing may be
52 * asynchronous and the reference protects the connection from being destroyed
53 * before its processing is finished).
54 *
55 * conn_recv is used to pass up packets to the ULP.
56 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
57 * a listener, and changes to tcp_input_listener as the listener has picked a
58 * good squeue. For other cases it is set to tcp_input_data.
59 *
60 * conn_recvicmp is used to pass up ICMP errors to the ULP.
61 *
62 * Classifier uses several hash tables:
63 *
64 * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
65 *	ipcl_bind_fanout:	contains all connections in BOUND state
66 *	ipcl_proto_fanout:	IPv4 protocol fanout
67 *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
68 *	ipcl_udp_fanout:	contains all UDP connections
69 *	ipcl_iptun_fanout:	contains all IP tunnel connections
70 *	ipcl_globalhash_fanout:	contains all connections
71 *
72 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
73 * which need to view all existing connections.
74 *
75 * All tables are protected by per-bucket locks. When both per-bucket lock and
76 * connection lock need to be held, the per-bucket lock should be acquired
77 * first, followed by the connection lock.
78 *
79 * All functions doing search in one of these tables increment a reference
80 * counter on the connection found (if any). This reference should be dropped
81 * when the caller has finished processing the connection.
82 *
83 *
84 * INTERFACES:
85 * ===========
86 *
87 * Connection Lookup:
88 * ------------------
89 *
90 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
91 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
92 *
93 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
94 * it can't find any associated connection. If the connection is found, its
95 * reference counter is incremented.
96 *
97 *	mp:	mblock, containing packet header. The full header should fit
98 *		into a single mblock. It should also contain at least full IP
99 *		and TCP or UDP header.
100 *
101 *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
102 *
103 *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
104 *		 the packet.
105 *
106 * 	ira->ira_zoneid: The zone in which the returned connection must be; the
107 *		zoneid corresponding to the ire_zoneid on the IRE located for
108 *		the packet's destination address.
109 *
110 *	ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
111 *		IRAF_TX_SHARED_ADDR flags
112 *
113 *	For TCP connections, the lookup order is as follows:
114 *		5-tuple {src, dst, protocol, local port, remote port}
115 *			lookup in ipcl_conn_fanout table.
116 *		3-tuple {dst, remote port, protocol} lookup in
117 *			ipcl_bind_fanout table.
118 *
119 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
120 *	remote port} lookup is done on ipcl_udp_fanout. Note that,
121 *	these interfaces do not handle cases where a packet belongs
122 *	to multiple UDP clients, which is handled in IP itself.
123 *
124 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
125 * determine which actual zone gets the segment.  This is used only in a
126 * labeled environment.  The matching rules are:
127 *
128 *	- If it's not a multilevel port, then the label on the packet selects
129 *	  the zone.  Unlabeled packets are delivered to the global zone.
130 *
131 *	- If it's a multilevel port, then only the zone registered to receive
132 *	  packets on that port matches.
133 *
134 * Also, in a labeled environment, packet labels need to be checked.  For fully
135 * bound TCP connections, we can assume that the packet label was checked
136 * during connection establishment, and doesn't need to be checked on each
137 * packet.  For others, though, we need to check for strict equality or, for
138 * multilevel ports, membership in the range or set.  This part currently does
139 * a tnrh lookup on each packet, but could be optimized to use cached results
140 * if that were necessary.  (SCTP doesn't come through here, but if it did,
141 * we would apply the same rules as TCP.)
142 *
143 * An implication of the above is that fully-bound TCP sockets must always use
144 * distinct 4-tuples; they can't be discriminated by label alone.
145 *
146 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
147 * as there's no connection set-up handshake and no shared state.
148 *
149 * Labels on looped-back packets within a single zone do not need to be
150 * checked, as all processes in the same zone have the same label.
151 *
152 * Finally, for unlabeled packets received by a labeled system, special rules
153 * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
154 * socket in the zone whose label matches the default label of the sender, if
155 * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
156 * receiver's label must dominate the sender's default label.
157 *
158 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
159 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
160 *					 ip_stack);
161 *
162 *	Lookup routine to find an exact match for {src, dst, local port,
163 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
164 *	ports are read from the IP and TCP header respectively.
165 *
166 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
167 *					 zoneid, ip_stack);
168 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
169 *					 zoneid, ip_stack);
170 *
171 * 	Lookup routine to find a listener with the tuple {lport, laddr,
172 * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
173 * 	parameter interface index is also compared.
174 *
175 * void ipcl_walk(func, arg, ip_stack)
176 *
177 * 	Apply 'func' to every connection available. The 'func' is called as
178 *	(*func)(connp, arg). The walk is non-atomic so connections may be
179 *	created and destroyed during the walk. The CONN_CONDEMNED and
180 *	CONN_INCIPIENT flags ensure that connections which are newly created
181 *	or being destroyed are not selected by the walker.
182 *
183 * Table Updates
184 * -------------
185 *
186 * int ipcl_conn_insert(connp);
187 * int ipcl_conn_insert_v4(connp);
188 * int ipcl_conn_insert_v6(connp);
189 *
190 *	Insert 'connp' in the ipcl_conn_fanout.
191 *	Arguments :
192 *		connp		conn_t to be inserted
193 *
194 *	Return value :
195 *		0		if connp was inserted
196 *		EADDRINUSE	if the connection with the same tuple
197 *				already exists.
198 *
199 * int ipcl_bind_insert(connp);
200 * int ipcl_bind_insert_v4(connp);
201 * int ipcl_bind_insert_v6(connp);
202 *
203 * 	Insert 'connp' in ipcl_bind_fanout.
204 * 	Arguments :
205 * 		connp		conn_t to be inserted
206 *
207 *
208 * void ipcl_hash_remove(connp);
209 *
210 * 	Removes the 'connp' from the connection fanout table.
211 *
212 * Connection Creation/Destruction
213 * -------------------------------
214 *
215 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
216 *
217 * 	Creates a new conn based on the type flag, inserts it into
218 * 	globalhash table.
219 *
220 *	type:	This flag determines the type of conn_t which needs to be
221 *		created i.e., which kmem_cache it comes from.
222 *		IPCL_TCPCONN	indicates a TCP connection
223 *		IPCL_SCTPCONN	indicates a SCTP connection
224 *		IPCL_UDPCONN	indicates a UDP conn_t.
225 *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
226 *		IPCL_RTSCONN	indicates a RTS conn_t.
227 *		IPCL_IPCCONN	indicates all other connections.
228 *
229 * void ipcl_conn_destroy(connp)
230 *
231 * 	Destroys the connection state, removes it from the global
232 * 	connection hash table and frees its memory.
233 */
234
235#include <sys/types.h>
236#include <sys/stream.h>
237#include <sys/stropts.h>
238#include <sys/sysmacros.h>
239#include <sys/strsubr.h>
240#include <sys/strsun.h>
241#define	_SUN_TPI_VERSION 2
242#include <sys/ddi.h>
243#include <sys/cmn_err.h>
244#include <sys/debug.h>
245
246#include <sys/systm.h>
247#include <sys/param.h>
248#include <sys/kmem.h>
249#include <sys/isa_defs.h>
250#include <inet/common.h>
251#include <netinet/ip6.h>
252#include <netinet/icmp6.h>
253
254#include <inet/ip.h>
255#include <inet/ip_if.h>
256#include <inet/ip_ire.h>
257#include <inet/ip6.h>
258#include <inet/ip_ndp.h>
259#include <inet/ip_impl.h>
260#include <inet/udp_impl.h>
261#include <inet/sctp_ip.h>
262#include <inet/sctp/sctp_impl.h>
263#include <inet/rawip_impl.h>
264#include <inet/rts_impl.h>
265#include <inet/iptun/iptun_impl.h>
266
267#include <sys/cpuvar.h>
268
269#include <inet/ipclassifier.h>
270#include <inet/tcp.h>
271#include <inet/ipsec_impl.h>
272
273#include <sys/tsol/tnet.h>
274#include <sys/sockio.h>
275
276/* Old value for compatibility. Setable in /etc/system */
277uint_t tcp_conn_hash_size = 0;
278
279/* New value. Zero means choose automatically.  Setable in /etc/system */
280uint_t ipcl_conn_hash_size = 0;
281uint_t ipcl_conn_hash_memfactor = 8192;
282uint_t ipcl_conn_hash_maxsize = 82500;
283
284/* bind/udp fanout table size */
285uint_t ipcl_bind_fanout_size = 512;
286uint_t ipcl_udp_fanout_size = 16384;
287
288/* Raw socket fanout size.  Must be a power of 2. */
289uint_t ipcl_raw_fanout_size = 256;
290
291/*
292 * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
293 * expect that most large deployments would have hundreds of tunnels, and
294 * thousands in the extreme case.
295 */
296uint_t ipcl_iptun_fanout_size = 6143;
297
/*
 * Initializer list of primes useful for hash table sizes.  Entry N (for N of
 * 0-28) is the nearest prime <= 2^N - 2^(N-2); entries 0-2 are zero and a
 * trailing zero terminates the list.
 */
#define	P2Ps() {							\
	0, 0, 0, 5, 11, 23, 47, 89,					\
	191, 383, 761, 1531, 3067, 6143, 12281, 24571,			\
	49139, 98299, 196597, 393209, 786431, 1572853, 3145721,	\
	6291449, 12582893, 25165813, 50331599, 100663291,		\
	201326557,							\
	0								\
}
307
308/*
309 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
310 * are aligned on cache lines.
311 */
312typedef union itc_s {
313	conn_t	itc_conn;
314	char	itcu_filler[CACHE_ALIGN(conn_s)];
315} itc_t;
316
317struct kmem_cache  *tcp_conn_cache;
318struct kmem_cache  *ip_conn_cache;
319extern struct kmem_cache  *sctp_conn_cache;
320extern struct kmem_cache  *tcp_sack_info_cache;
321struct kmem_cache  *udp_conn_cache;
322struct kmem_cache  *rawip_conn_cache;
323struct kmem_cache  *rts_conn_cache;
324
325extern void	tcp_timermp_free(tcp_t *);
326extern mblk_t	*tcp_timermp_alloc(int);
327
328static int	ip_conn_constructor(void *, void *, int);
329static void	ip_conn_destructor(void *, void *);
330
331static int	tcp_conn_constructor(void *, void *, int);
332static void	tcp_conn_destructor(void *, void *);
333
334static int	udp_conn_constructor(void *, void *, int);
335static void	udp_conn_destructor(void *, void *);
336
337static int	rawip_conn_constructor(void *, void *, int);
338static void	rawip_conn_destructor(void *, void *);
339
340static int	rts_conn_constructor(void *, void *, int);
341static void	rts_conn_destructor(void *, void *);
342
343/*
344 * Global (for all stack instances) init routine
345 */
346void
347ipcl_g_init(void)
348{
349	ip_conn_cache = kmem_cache_create("ip_conn_cache",
350	    sizeof (conn_t), CACHE_ALIGN_SIZE,
351	    ip_conn_constructor, ip_conn_destructor,
352	    NULL, NULL, NULL, 0);
353
354	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
355	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
356	    tcp_conn_constructor, tcp_conn_destructor,
357	    NULL, NULL, NULL, 0);
358
359	udp_conn_cache = kmem_cache_create("udp_conn_cache",
360	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
361	    udp_conn_constructor, udp_conn_destructor,
362	    NULL, NULL, NULL, 0);
363
364	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
365	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
366	    rawip_conn_constructor, rawip_conn_destructor,
367	    NULL, NULL, NULL, 0);
368
369	rts_conn_cache = kmem_cache_create("rts_conn_cache",
370	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
371	    rts_conn_constructor, rts_conn_destructor,
372	    NULL, NULL, NULL, 0);
373}
374
375/*
376 * ipclassifier intialization routine, sets up hash tables.
377 */
378void
379ipcl_init(ip_stack_t *ipst)
380{
381	int i;
382	int sizes[] = P2Ps();
383
384	/*
385	 * Calculate size of conn fanout table from /etc/system settings
386	 */
387	if (ipcl_conn_hash_size != 0) {
388		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
389	} else if (tcp_conn_hash_size != 0) {
390		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
391	} else {
392		extern pgcnt_t freemem;
393
394		ipst->ips_ipcl_conn_fanout_size =
395		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
396
397		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
398			ipst->ips_ipcl_conn_fanout_size =
399			    ipcl_conn_hash_maxsize;
400		}
401	}
402
403	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
404		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
405			break;
406		}
407	}
408	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
409		/* Out of range, use the 2^16 value */
410		ipst->ips_ipcl_conn_fanout_size = sizes[16];
411	}
412
413	/* Take values from /etc/system */
414	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
415	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
416	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
417	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
418
419	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
420
421	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
422	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
423
424	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
425		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
426		    MUTEX_DEFAULT, NULL);
427	}
428
429	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
430	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
431
432	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
433		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
434		    MUTEX_DEFAULT, NULL);
435	}
436
437	ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
438	    sizeof (connf_t), KM_SLEEP);
439	for (i = 0; i < IPPROTO_MAX; i++) {
440		mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
441		    MUTEX_DEFAULT, NULL);
442	}
443
444	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
445	    sizeof (connf_t), KM_SLEEP);
446	for (i = 0; i < IPPROTO_MAX; i++) {
447		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
448		    MUTEX_DEFAULT, NULL);
449	}
450
451	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
452	mutex_init(&ipst->ips_rts_clients->connf_lock,
453	    NULL, MUTEX_DEFAULT, NULL);
454
455	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
456	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
457	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
458		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
459		    MUTEX_DEFAULT, NULL);
460	}
461
462	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
463	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
464	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
465		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
466		    MUTEX_DEFAULT, NULL);
467	}
468
469	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
470	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
471	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
472		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
473		    MUTEX_DEFAULT, NULL);
474	}
475
476	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
477	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
478	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
479		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
480		    NULL, MUTEX_DEFAULT, NULL);
481	}
482}
483
484void
485ipcl_g_destroy(void)
486{
487	kmem_cache_destroy(ip_conn_cache);
488	kmem_cache_destroy(tcp_conn_cache);
489	kmem_cache_destroy(udp_conn_cache);
490	kmem_cache_destroy(rawip_conn_cache);
491	kmem_cache_destroy(rts_conn_cache);
492}
493
494/*
495 * All user-level and kernel use of the stack must be gone
496 * by now.
497 */
498void
499ipcl_destroy(ip_stack_t *ipst)
500{
501	int i;
502
503	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
504		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
505		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
506	}
507	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
508	    sizeof (connf_t));
509	ipst->ips_ipcl_conn_fanout = NULL;
510
511	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
512		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
513		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
514	}
515	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
516	    sizeof (connf_t));
517	ipst->ips_ipcl_bind_fanout = NULL;
518
519	for (i = 0; i < IPPROTO_MAX; i++) {
520		ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
521		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
522	}
523	kmem_free(ipst->ips_ipcl_proto_fanout_v4,
524	    IPPROTO_MAX * sizeof (connf_t));
525	ipst->ips_ipcl_proto_fanout_v4 = NULL;
526
527	for (i = 0; i < IPPROTO_MAX; i++) {
528		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
529		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
530	}
531	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
532	    IPPROTO_MAX * sizeof (connf_t));
533	ipst->ips_ipcl_proto_fanout_v6 = NULL;
534
535	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
536		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
537		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
538	}
539	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
540	    sizeof (connf_t));
541	ipst->ips_ipcl_udp_fanout = NULL;
542
543	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
544		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
545		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
546	}
547	kmem_free(ipst->ips_ipcl_iptun_fanout,
548	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
549	ipst->ips_ipcl_iptun_fanout = NULL;
550
551	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
552		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
553		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
554	}
555	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
556	    sizeof (connf_t));
557	ipst->ips_ipcl_raw_fanout = NULL;
558
559	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
560		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
561		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
562	}
563	kmem_free(ipst->ips_ipcl_globalhash_fanout,
564	    sizeof (connf_t) * CONN_G_HASH_SIZE);
565	ipst->ips_ipcl_globalhash_fanout = NULL;
566
567	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
568	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
569	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
570	ipst->ips_rts_clients = NULL;
571}
572
573/*
574 * conn creation routine. initialize the conn, sets the reference
575 * and inserts it in the global hash table.
576 */
577conn_t *
578ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
579{
580	conn_t	*connp;
581	struct kmem_cache *conn_cache;
582
583	switch (type) {
584	case IPCL_SCTPCONN:
585		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
586			return (NULL);
587		sctp_conn_init(connp);
588		netstack_hold(ns);
589		connp->conn_netstack = ns;
590		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
591		ipcl_globalhash_insert(connp);
592		return (connp);
593
594	case IPCL_TCPCONN:
595		conn_cache = tcp_conn_cache;
596		break;
597
598	case IPCL_UDPCONN:
599		conn_cache = udp_conn_cache;
600		break;
601
602	case IPCL_RAWIPCONN:
603		conn_cache = rawip_conn_cache;
604		break;
605
606	case IPCL_RTSCONN:
607		conn_cache = rts_conn_cache;
608		break;
609
610	case IPCL_IPCCONN:
611		conn_cache = ip_conn_cache;
612		break;
613
614	default:
615		connp = NULL;
616		ASSERT(0);
617	}
618
619	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
620		return (NULL);
621
622	connp->conn_ref = 1;
623	netstack_hold(ns);
624	connp->conn_netstack = ns;
625	connp->conn_ixa->ixa_ipst = ns->netstack_ip;
626	ipcl_globalhash_insert(connp);
627	return (connp);
628}
629
630void
631ipcl_conn_destroy(conn_t *connp)
632{
633	mblk_t	*mp;
634	netstack_t	*ns = connp->conn_netstack;
635
636	ASSERT(!MUTEX_HELD(&connp->conn_lock));
637	ASSERT(connp->conn_ref == 0);
638
639	DTRACE_PROBE1(conn__destroy, conn_t *, connp);
640
641	if (connp->conn_cred != NULL) {
642		crfree(connp->conn_cred);
643		connp->conn_cred = NULL;
644	}
645
646	if (connp->conn_ht_iphc != NULL) {
647		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
648		connp->conn_ht_iphc = NULL;
649		connp->conn_ht_iphc_allocated = 0;
650		connp->conn_ht_iphc_len = 0;
651		connp->conn_ht_ulp = NULL;
652		connp->conn_ht_ulp_len = 0;
653	}
654	ip_pkt_free(&connp->conn_xmit_ipp);
655
656	ipcl_globalhash_remove(connp);
657
658	if (connp->conn_latch != NULL) {
659		IPLATCH_REFRELE(connp->conn_latch);
660		connp->conn_latch = NULL;
661	}
662	if (connp->conn_latch_in_policy != NULL) {
663		IPPOL_REFRELE(connp->conn_latch_in_policy);
664		connp->conn_latch_in_policy = NULL;
665	}
666	if (connp->conn_latch_in_action != NULL) {
667		IPACT_REFRELE(connp->conn_latch_in_action);
668		connp->conn_latch_in_action = NULL;
669	}
670	if (connp->conn_policy != NULL) {
671		IPPH_REFRELE(connp->conn_policy, ns);
672		connp->conn_policy = NULL;
673	}
674
675	if (connp->conn_ipsec_opt_mp != NULL) {
676		freemsg(connp->conn_ipsec_opt_mp);
677		connp->conn_ipsec_opt_mp = NULL;
678	}
679
680	if (connp->conn_flags & IPCL_TCPCONN) {
681		tcp_t *tcp = connp->conn_tcp;
682
683		tcp_free(tcp);
684		mp = tcp->tcp_timercache;
685
686		tcp->tcp_tcps = NULL;
687
688		if (tcp->tcp_sack_info != NULL) {
689			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
690			kmem_cache_free(tcp_sack_info_cache,
691			    tcp->tcp_sack_info);
692		}
693
694		/*
695		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
696		 * the mblk.
697		 */
698		if (tcp->tcp_rsrv_mp != NULL) {
699			freeb(tcp->tcp_rsrv_mp);
700			tcp->tcp_rsrv_mp = NULL;
701			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
702		}
703
704		ipcl_conn_cleanup(connp);
705		connp->conn_flags = IPCL_TCPCONN;
706		if (ns != NULL) {
707			ASSERT(tcp->tcp_tcps == NULL);
708			connp->conn_netstack = NULL;
709			connp->conn_ixa->ixa_ipst = NULL;
710			netstack_rele(ns);
711		}
712
713		bzero(tcp, sizeof (tcp_t));
714
715		tcp->tcp_timercache = mp;
716		tcp->tcp_connp = connp;
717		kmem_cache_free(tcp_conn_cache, connp);
718		return;
719	}
720
721	if (connp->conn_flags & IPCL_SCTPCONN) {
722		ASSERT(ns != NULL);
723		sctp_free(connp);
724		return;
725	}
726
727	ipcl_conn_cleanup(connp);
728	if (ns != NULL) {
729		connp->conn_netstack = NULL;
730		connp->conn_ixa->ixa_ipst = NULL;
731		netstack_rele(ns);
732	}
733
734	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
735	if (connp->conn_flags & IPCL_UDPCONN) {
736		connp->conn_flags = IPCL_UDPCONN;
737		kmem_cache_free(udp_conn_cache, connp);
738	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
739		connp->conn_flags = IPCL_RAWIPCONN;
740		connp->conn_proto = IPPROTO_ICMP;
741		connp->conn_ixa->ixa_protocol = connp->conn_proto;
742		kmem_cache_free(rawip_conn_cache, connp);
743	} else if (connp->conn_flags & IPCL_RTSCONN) {
744		connp->conn_flags = IPCL_RTSCONN;
745		kmem_cache_free(rts_conn_cache, connp);
746	} else {
747		connp->conn_flags = IPCL_IPCCONN;
748		ASSERT(connp->conn_flags & IPCL_IPCCONN);
749		ASSERT(connp->conn_priv == NULL);
750		kmem_cache_free(ip_conn_cache, connp);
751	}
752}
753
754/*
755 * Running in cluster mode - deregister listener information
756 */
757static void
758ipcl_conn_unlisten(conn_t *connp)
759{
760	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
761	ASSERT(connp->conn_lport != 0);
762
763	if (cl_inet_unlisten != NULL) {
764		sa_family_t	addr_family;
765		uint8_t		*laddrp;
766
767		if (connp->conn_ipversion == IPV6_VERSION) {
768			addr_family = AF_INET6;
769			laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
770		} else {
771			addr_family = AF_INET;
772			laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
773		}
774		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
775		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
776	}
777	connp->conn_flags &= ~IPCL_CL_LISTENER;
778}
779
/*
 * Unlink 'connp' from the fanout bucket recorded in conn_fanout (a no-op if
 * it is not in one) and drop the reference the bucket held.  conn_lock must
 * not be held by the caller; the bucket lock is taken here.
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*hr_bucket = (connp)->conn_fanout;			\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (hr_bucket != NULL) {					\
		mutex_enter(&hr_bucket->connf_lock);			\
		if ((connp)->conn_prev == NULL)				\
			hr_bucket->connf_head = (connp)->conn_next;	\
		else							\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&hr_bucket->connf_lock);			\
	}								\
}
808
809void
810ipcl_hash_remove(conn_t *connp)
811{
812	uint8_t		protocol = connp->conn_proto;
813
814	IPCL_HASH_REMOVE(connp);
815	if (protocol == IPPROTO_RSVP)
816		ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
817}
818
819/*
820 * The whole purpose of this function is allow removal of
821 * a conn_t from the connected hash for timewait reclaim.
822 * This is essentially a TW reclaim fastpath where timewait
823 * collector checks under fanout lock (so no one else can
824 * get access to the conn_t) that refcnt is 2 i.e. one for
825 * TCP and one for the classifier hash list. If ref count
826 * is indeed 2, we can just remove the conn under lock and
827 * avoid cleaning up the conn under squeue. This gives us
828 * improved performance.
829 */
830void
831ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
832{
833	ASSERT(MUTEX_HELD(&connfp->connf_lock));
834	ASSERT(MUTEX_HELD(&connp->conn_lock));
835	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
836
837	if ((connp)->conn_next != NULL) {
838		(connp)->conn_next->conn_prev = (connp)->conn_prev;
839	}
840	if ((connp)->conn_prev != NULL) {
841		(connp)->conn_prev->conn_next = (connp)->conn_next;
842	} else {
843		connfp->connf_head = (connp)->conn_next;
844	}
845	(connp)->conn_fanout = NULL;
846	(connp)->conn_next = NULL;
847	(connp)->conn_prev = NULL;
848	(connp)->conn_flags |= IPCL_REMOVED;
849	ASSERT((connp)->conn_ref == 2);
850	(connp)->conn_ref--;
851}
852
/*
 * Insert 'connp' at the head of bucket 'connfp', mark it IPCL_CONNECTED
 * (clearing IPCL_REMOVED) and take a reference on it for the bucket.  The
 * macro takes no lock itself (see IPCL_HASH_INSERT_CONNECTED), and connp
 * must not be linked into any bucket.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connp)->conn_next = (connfp)->connf_head;		\
		(connp)->conn_next->conn_prev = (connp);		\
	}								\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = IPCL_CONNECTED |				\
	    ((connp)->conn_flags & ~IPCL_REMOVED);			\
	CONN_INC_REF(connp);						\
}
867
/*
 * Remove 'connp' from whatever bucket it is currently in, then insert it
 * into 'connfp' as a connected conn while holding connfp's bucket lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE(connp);					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED((connfp), (connp));		\
	mutex_exit(&(connfp)->connf_lock);				\
}
874
/*
 * Remove 'connp' from its current bucket and insert it into 'connfp' as a
 * bound conn: it is placed in front of the first entry whose local address
 * satisfies _IPCL_V4_MATCH_ANY(), or at the tail when there is no such
 * entry.  Takes connfp's bucket lock, sets IPCL_BOUND (clearing
 * IPCL_REMOVED) and takes a reference on connp for the bucket.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *ib_before = NULL, *ib_after;				\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	for (ib_after = (connfp)->connf_head; ib_after != NULL;		\
	    ib_after = ib_after->conn_next) {				\
		if (_IPCL_V4_MATCH_ANY(ib_after->conn_laddr_v6))	\
			break;						\
		ib_before = ib_after;					\
	}								\
	if (ib_before == NULL) {					\
		(connfp)->connf_head = (connp);				\
	} else {							\
		ib_before->conn_next = (connp);				\
		(connp)->conn_prev = ib_before;				\
	}								\
	if (ib_after != NULL) {						\
		(connp)->conn_next = ib_after;				\
		ib_after->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = IPCL_BOUND |				\
	    ((connp)->conn_flags & ~IPCL_REMOVED);			\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
901
/*
 * Remove 'connp' from its current bucket and insert it into 'connfp' as a
 * bound conn.  If connp's local address is IPv4-mapped, it is placed in
 * front of the first entry in the same zone whose local address is
 * unspecified; otherwise, or when no such entry exists, it goes at the
 * tail.  Takes connfp's bucket lock, sets IPCL_BOUND (clearing
 * IPCL_REMOVED) and takes a reference on connp for the bucket.
 *
 * 'prev' tracks the entry visited before 'next', so it is already the
 * correct conn_prev for connp at the point of the break.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    (connp)->conn_zoneid == next->conn_zoneid) {	\
			(connp)->conn_next = next;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
931
932void
933ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
934{
935	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
936}
937
938/*
939 * Because the classifier is used to classify inbound packets, the destination
940 * address is meant to be our local tunnel address (tunnel source), and the
941 * source the remote tunnel address (tunnel destination).
942 *
943 * Note that conn_proto can't be used for fanout since the upper protocol
944 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
945 */
946conn_t *
947ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
948{
949	connf_t	*connfp;
950	conn_t	*connp;
951
952	/* first look for IPv4 tunnel links */
953	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
954	mutex_enter(&connfp->connf_lock);
955	for (connp = connfp->connf_head; connp != NULL;
956	    connp = connp->conn_next) {
957		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
958			break;
959	}
960	if (connp != NULL)
961		goto done;
962
963	mutex_exit(&connfp->connf_lock);
964
965	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
966	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
967	    INADDR_ANY)];
968	mutex_enter(&connfp->connf_lock);
969	for (connp = connfp->connf_head; connp != NULL;
970	    connp = connp->conn_next) {
971		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
972			break;
973	}
974done:
975	if (connp != NULL)
976		CONN_INC_REF(connp);
977	mutex_exit(&connfp->connf_lock);
978	return (connp);
979}
980
981conn_t *
982ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
983{
984	connf_t	*connfp;
985	conn_t	*connp;
986
987	/* Look for an IPv6 tunnel link */
988	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
989	mutex_enter(&connfp->connf_lock);
990	for (connp = connfp->connf_head; connp != NULL;
991	    connp = connp->conn_next) {
992		if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
993			CONN_INC_REF(connp);
994			break;
995		}
996	}
997	mutex_exit(&connfp->connf_lock);
998	return (connp);
999}
1000
1001/*
1002 * This function is used only for inserting SCTP raw socket now.
1003 * This may change later.
1004 *
1005 * Note that only one raw socket can be bound to a port.  The param
1006 * lport is in network byte order.
1007 */
1008static int
1009ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1010{
1011	connf_t	*connfp;
1012	conn_t	*oconnp;
1013	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1014
1015	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1016
1017	/* Check for existing raw socket already bound to the port. */
1018	mutex_enter(&connfp->connf_lock);
1019	for (oconnp = connfp->connf_head; oconnp != NULL;
1020	    oconnp = oconnp->conn_next) {
1021		if (oconnp->conn_lport == lport &&
1022		    oconnp->conn_zoneid == connp->conn_zoneid &&
1023		    oconnp->conn_family == connp->conn_family &&
1024		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1025		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1026		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1027		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1028		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1029		    &connp->conn_laddr_v6))) {
1030			break;
1031		}
1032	}
1033	mutex_exit(&connfp->connf_lock);
1034	if (oconnp != NULL)
1035		return (EADDRNOTAVAIL);
1036
1037	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1038	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1039		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1040		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1041			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1042		} else {
1043			IPCL_HASH_INSERT_BOUND(connfp, connp);
1044		}
1045	} else {
1046		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1047	}
1048	return (0);
1049}
1050
1051static int
1052ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1053{
1054	connf_t	*connfp;
1055	conn_t	*tconnp;
1056	ipaddr_t laddr = connp->conn_laddr_v4;
1057	ipaddr_t faddr = connp->conn_faddr_v4;
1058
1059	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1060	mutex_enter(&connfp->connf_lock);
1061	for (tconnp = connfp->connf_head; tconnp != NULL;
1062	    tconnp = tconnp->conn_next) {
1063		if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1064			/* A tunnel is already bound to these addresses. */
1065			mutex_exit(&connfp->connf_lock);
1066			return (EADDRINUSE);
1067		}
1068	}
1069	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1070	mutex_exit(&connfp->connf_lock);
1071	return (0);
1072}
1073
1074static int
1075ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1076{
1077	connf_t	*connfp;
1078	conn_t	*tconnp;
1079	in6_addr_t *laddr = &connp->conn_laddr_v6;
1080	in6_addr_t *faddr = &connp->conn_faddr_v6;
1081
1082	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1083	mutex_enter(&connfp->connf_lock);
1084	for (tconnp = connfp->connf_head; tconnp != NULL;
1085	    tconnp = tconnp->conn_next) {
1086		if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1087			/* A tunnel is already bound to these addresses. */
1088			mutex_exit(&connfp->connf_lock);
1089			return (EADDRINUSE);
1090		}
1091	}
1092	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1093	mutex_exit(&connfp->connf_lock);
1094	return (0);
1095}
1096
1097/*
1098 * Check for a MAC exemption conflict on a labeled system.  Note that for
1099 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1100 * transport layer.  This check is for binding all other protocols.
1101 *
1102 * Returns true if there's a conflict.
1103 */
1104static boolean_t
1105check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1106{
1107	connf_t	*connfp;
1108	conn_t *tconn;
1109
1110	connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1111	mutex_enter(&connfp->connf_lock);
1112	for (tconn = connfp->connf_head; tconn != NULL;
1113	    tconn = tconn->conn_next) {
1114		/* We don't allow v4 fallback for v6 raw socket */
1115		if (connp->conn_family != tconn->conn_family)
1116			continue;
1117		/* If neither is exempt, then there's no conflict */
1118		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1119		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1120			continue;
1121		/* We are only concerned about sockets for a different zone */
1122		if (connp->conn_zoneid == tconn->conn_zoneid)
1123			continue;
1124		/* If both are bound to different specific addrs, ok */
1125		if (connp->conn_laddr_v4 != INADDR_ANY &&
1126		    tconn->conn_laddr_v4 != INADDR_ANY &&
1127		    connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1128			continue;
1129		/* These two conflict; fail */
1130		break;
1131	}
1132	mutex_exit(&connfp->connf_lock);
1133	return (tconn != NULL);
1134}
1135
1136static boolean_t
1137check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1138{
1139	connf_t	*connfp;
1140	conn_t *tconn;
1141
1142	connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1143	mutex_enter(&connfp->connf_lock);
1144	for (tconn = connfp->connf_head; tconn != NULL;
1145	    tconn = tconn->conn_next) {
1146		/* We don't allow v4 fallback for v6 raw socket */
1147		if (connp->conn_family != tconn->conn_family)
1148			continue;
1149		/* If neither is exempt, then there's no conflict */
1150		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1151		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1152			continue;
1153		/* We are only concerned about sockets for a different zone */
1154		if (connp->conn_zoneid == tconn->conn_zoneid)
1155			continue;
1156		/* If both are bound to different addrs, ok */
1157		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1158		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1159		    !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1160		    &tconn->conn_laddr_v6))
1161			continue;
1162		/* These two conflict; fail */
1163		break;
1164	}
1165	mutex_exit(&connfp->connf_lock);
1166	return (tconn != NULL);
1167}
1168
1169/*
1170 * (v4, v6) bind hash insertion routines
1171 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1172 */
1173
1174int
1175ipcl_bind_insert(conn_t *connp)
1176{
1177	if (connp->conn_ipversion == IPV6_VERSION)
1178		return (ipcl_bind_insert_v6(connp));
1179	else
1180		return (ipcl_bind_insert_v4(connp));
1181}
1182
1183int
1184ipcl_bind_insert_v4(conn_t *connp)
1185{
1186	connf_t	*connfp;
1187	int	ret = 0;
1188	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1189	uint16_t	lport = connp->conn_lport;
1190	uint8_t		protocol = connp->conn_proto;
1191
1192	if (IPCL_IS_IPTUN(connp))
1193		return (ipcl_iptun_hash_insert(connp, ipst));
1194
1195	switch (protocol) {
1196	default:
1197		if (is_system_labeled() &&
1198		    check_exempt_conflict_v4(connp, ipst))
1199			return (EADDRINUSE);
1200		/* FALLTHROUGH */
1201	case IPPROTO_UDP:
1202		if (protocol == IPPROTO_UDP) {
1203			connfp = &ipst->ips_ipcl_udp_fanout[
1204			    IPCL_UDP_HASH(lport, ipst)];
1205		} else {
1206			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1207		}
1208
1209		if (connp->conn_faddr_v4 != INADDR_ANY) {
1210			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1211		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
1212			IPCL_HASH_INSERT_BOUND(connfp, connp);
1213		} else {
1214			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1215		}
1216		if (protocol == IPPROTO_RSVP)
1217			ill_set_inputfn_all(ipst);
1218		break;
1219
1220	case IPPROTO_TCP:
1221		/* Insert it in the Bind Hash */
1222		ASSERT(connp->conn_zoneid != ALL_ZONES);
1223		connfp = &ipst->ips_ipcl_bind_fanout[
1224		    IPCL_BIND_HASH(lport, ipst)];
1225		if (connp->conn_laddr_v4 != INADDR_ANY) {
1226			IPCL_HASH_INSERT_BOUND(connfp, connp);
1227		} else {
1228			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1229		}
1230		if (cl_inet_listen != NULL) {
1231			ASSERT(connp->conn_ipversion == IPV4_VERSION);
1232			connp->conn_flags |= IPCL_CL_LISTENER;
1233			(*cl_inet_listen)(
1234			    connp->conn_netstack->netstack_stackid,
1235			    IPPROTO_TCP, AF_INET,
1236			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1237		}
1238		break;
1239
1240	case IPPROTO_SCTP:
1241		ret = ipcl_sctp_hash_insert(connp, lport);
1242		break;
1243	}
1244
1245	return (ret);
1246}
1247
1248int
1249ipcl_bind_insert_v6(conn_t *connp)
1250{
1251	connf_t		*connfp;
1252	int		ret = 0;
1253	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1254	uint16_t	lport = connp->conn_lport;
1255	uint8_t		protocol = connp->conn_proto;
1256
1257	if (IPCL_IS_IPTUN(connp)) {
1258		return (ipcl_iptun_hash_insert_v6(connp, ipst));
1259	}
1260
1261	switch (protocol) {
1262	default:
1263		if (is_system_labeled() &&
1264		    check_exempt_conflict_v6(connp, ipst))
1265			return (EADDRINUSE);
1266		/* FALLTHROUGH */
1267	case IPPROTO_UDP:
1268		if (protocol == IPPROTO_UDP) {
1269			connfp = &ipst->ips_ipcl_udp_fanout[
1270			    IPCL_UDP_HASH(lport, ipst)];
1271		} else {
1272			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1273		}
1274
1275		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1276			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1277		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1278			IPCL_HASH_INSERT_BOUND(connfp, connp);
1279		} else {
1280			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1281		}
1282		break;
1283
1284	case IPPROTO_TCP:
1285		/* Insert it in the Bind Hash */
1286		ASSERT(connp->conn_zoneid != ALL_ZONES);
1287		connfp = &ipst->ips_ipcl_bind_fanout[
1288		    IPCL_BIND_HASH(lport, ipst)];
1289		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1290			IPCL_HASH_INSERT_BOUND(connfp, connp);
1291		} else {
1292			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1293		}
1294		if (cl_inet_listen != NULL) {
1295			sa_family_t	addr_family;
1296			uint8_t		*laddrp;
1297
1298			if (connp->conn_ipversion == IPV6_VERSION) {
1299				addr_family = AF_INET6;
1300				laddrp =
1301				    (uint8_t *)&connp->conn_bound_addr_v6;
1302			} else {
1303				addr_family = AF_INET;
1304				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1305			}
1306			connp->conn_flags |= IPCL_CL_LISTENER;
1307			(*cl_inet_listen)(
1308			    connp->conn_netstack->netstack_stackid,
1309			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1310		}
1311		break;
1312
1313	case IPPROTO_SCTP:
1314		ret = ipcl_sctp_hash_insert(connp, lport);
1315		break;
1316	}
1317
1318	return (ret);
1319}
1320
1321/*
1322 * ipcl_conn_hash insertion routines.
1323 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1324 */
1325
1326int
1327ipcl_conn_insert(conn_t *connp)
1328{
1329	if (connp->conn_ipversion == IPV6_VERSION)
1330		return (ipcl_conn_insert_v6(connp));
1331	else
1332		return (ipcl_conn_insert_v4(connp));
1333}
1334
1335int
1336ipcl_conn_insert_v4(conn_t *connp)
1337{
1338	connf_t		*connfp;
1339	conn_t		*tconnp;
1340	int		ret = 0;
1341	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1342	uint16_t	lport = connp->conn_lport;
1343	uint8_t		protocol = connp->conn_proto;
1344
1345	if (IPCL_IS_IPTUN(connp))
1346		return (ipcl_iptun_hash_insert(connp, ipst));
1347
1348	switch (protocol) {
1349	case IPPROTO_TCP:
1350		/*
1351		 * For TCP, we check whether the connection tuple already
1352		 * exists before allowing the connection to proceed.  We
1353		 * also allow indexing on the zoneid. This is to allow
1354		 * multiple shared stack zones to have the same tcp
1355		 * connection tuple. In practice this only happens for
1356		 * INADDR_LOOPBACK as it's the only local address which
1357		 * doesn't have to be unique.
1358		 */
1359		connfp = &ipst->ips_ipcl_conn_fanout[
1360		    IPCL_CONN_HASH(connp->conn_faddr_v4,
1361		    connp->conn_ports, ipst)];
1362		mutex_enter(&connfp->connf_lock);
1363		for (tconnp = connfp->connf_head; tconnp != NULL;
1364		    tconnp = tconnp->conn_next) {
1365			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1366			    connp->conn_faddr_v4, connp->conn_laddr_v4,
1367			    connp->conn_ports) &&
1368			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1369				/* Already have a conn. bail out */
1370				mutex_exit(&connfp->connf_lock);
1371				return (EADDRINUSE);
1372			}
1373		}
1374		if (connp->conn_fanout != NULL) {
1375			/*
1376			 * Probably a XTI/TLI application trying to do a
1377			 * rebind. Let it happen.
1378			 */
1379			mutex_exit(&connfp->connf_lock);
1380			IPCL_HASH_REMOVE(connp);
1381			mutex_enter(&connfp->connf_lock);
1382		}
1383
1384		ASSERT(connp->conn_recv != NULL);
1385		ASSERT(connp->conn_recvicmp != NULL);
1386
1387		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1388		mutex_exit(&connfp->connf_lock);
1389		break;
1390
1391	case IPPROTO_SCTP:
1392		/*
1393		 * The raw socket may have already been bound, remove it
1394		 * from the hash first.
1395		 */
1396		IPCL_HASH_REMOVE(connp);
1397		ret = ipcl_sctp_hash_insert(connp, lport);
1398		break;
1399
1400	default:
1401		/*
1402		 * Check for conflicts among MAC exempt bindings.  For
1403		 * transports with port numbers, this is done by the upper
1404		 * level per-transport binding logic.  For all others, it's
1405		 * done here.
1406		 */
1407		if (is_system_labeled() &&
1408		    check_exempt_conflict_v4(connp, ipst))
1409			return (EADDRINUSE);
1410		/* FALLTHROUGH */
1411
1412	case IPPROTO_UDP:
1413		if (protocol == IPPROTO_UDP) {
1414			connfp = &ipst->ips_ipcl_udp_fanout[
1415			    IPCL_UDP_HASH(lport, ipst)];
1416		} else {
1417			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1418		}
1419
1420		if (connp->conn_faddr_v4 != INADDR_ANY) {
1421			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1422		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
1423			IPCL_HASH_INSERT_BOUND(connfp, connp);
1424		} else {
1425			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1426		}
1427		break;
1428	}
1429
1430	return (ret);
1431}
1432
1433int
1434ipcl_conn_insert_v6(conn_t *connp)
1435{
1436	connf_t		*connfp;
1437	conn_t		*tconnp;
1438	int		ret = 0;
1439	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
1440	uint16_t	lport = connp->conn_lport;
1441	uint8_t		protocol = connp->conn_proto;
1442	uint_t		ifindex = connp->conn_bound_if;
1443
1444	if (IPCL_IS_IPTUN(connp))
1445		return (ipcl_iptun_hash_insert_v6(connp, ipst));
1446
1447	switch (protocol) {
1448	case IPPROTO_TCP:
1449
1450		/*
1451		 * For tcp, we check whether the connection tuple already
1452		 * exists before allowing the connection to proceed.  We
1453		 * also allow indexing on the zoneid. This is to allow
1454		 * multiple shared stack zones to have the same tcp
1455		 * connection tuple. In practice this only happens for
1456		 * ipv6_loopback as it's the only local address which
1457		 * doesn't have to be unique.
1458		 */
1459		connfp = &ipst->ips_ipcl_conn_fanout[
1460		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1461		    ipst)];
1462		mutex_enter(&connfp->connf_lock);
1463		for (tconnp = connfp->connf_head; tconnp != NULL;
1464		    tconnp = tconnp->conn_next) {
1465			/* NOTE: need to match zoneid. Bug in onnv-gate */
1466			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1467			    connp->conn_faddr_v6, connp->conn_laddr_v6,
1468			    connp->conn_ports) &&
1469			    (tconnp->conn_bound_if == 0 ||
1470			    tconnp->conn_bound_if == ifindex) &&
1471			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1472				/* Already have a conn. bail out */
1473				mutex_exit(&connfp->connf_lock);
1474				return (EADDRINUSE);
1475			}
1476		}
1477		if (connp->conn_fanout != NULL) {
1478			/*
1479			 * Probably a XTI/TLI application trying to do a
1480			 * rebind. Let it happen.
1481			 */
1482			mutex_exit(&connfp->connf_lock);
1483			IPCL_HASH_REMOVE(connp);
1484			mutex_enter(&connfp->connf_lock);
1485		}
1486		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1487		mutex_exit(&connfp->connf_lock);
1488		break;
1489
1490	case IPPROTO_SCTP:
1491		IPCL_HASH_REMOVE(connp);
1492		ret = ipcl_sctp_hash_insert(connp, lport);
1493		break;
1494
1495	default:
1496		if (is_system_labeled() &&
1497		    check_exempt_conflict_v6(connp, ipst))
1498			return (EADDRINUSE);
1499		/* FALLTHROUGH */
1500	case IPPROTO_UDP:
1501		if (protocol == IPPROTO_UDP) {
1502			connfp = &ipst->ips_ipcl_udp_fanout[
1503			    IPCL_UDP_HASH(lport, ipst)];
1504		} else {
1505			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1506		}
1507
1508		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1509			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1510		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1511			IPCL_HASH_INSERT_BOUND(connfp, connp);
1512		} else {
1513			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1514		}
1515		break;
1516	}
1517
1518	return (ret);
1519}
1520
1521/*
1522 * v4 packet classifying function. looks up the fanout table to
1523 * find the conn, the packet belongs to. returns the conn with
1524 * the reference held, null otherwise.
1525 *
1526 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1527 * Lookup" comment block are applied.  Labels are also checked as described
1528 * above.  If the packet is from the inside (looped back), and is from the same
1529 * zone, then label checks are omitted.
1530 */
1531conn_t *
1532ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1533    ip_recv_attr_t *ira, ip_stack_t *ipst)
1534{
1535	ipha_t	*ipha;
1536	connf_t	*connfp, *bind_connfp;
1537	uint16_t lport;
1538	uint16_t fport;
1539	uint32_t ports;
1540	conn_t	*connp;
1541	uint16_t  *up;
1542	zoneid_t	zoneid = ira->ira_zoneid;
1543
1544	ipha = (ipha_t *)mp->b_rptr;
1545	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1546
1547	switch (protocol) {
1548	case IPPROTO_TCP:
1549		ports = *(uint32_t *)up;
1550		connfp =
1551		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1552		    ports, ipst)];
1553		mutex_enter(&connfp->connf_lock);
1554		for (connp = connfp->connf_head; connp != NULL;
1555		    connp = connp->conn_next) {
1556			if (IPCL_CONN_MATCH(connp, protocol,
1557			    ipha->ipha_src, ipha->ipha_dst, ports) &&
1558			    (connp->conn_zoneid == zoneid ||
1559			    connp->conn_allzones ||
1560			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1561			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1562			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1563				break;
1564		}
1565
1566		if (connp != NULL) {
1567			/*
1568			 * We have a fully-bound TCP connection.
1569			 *
1570			 * For labeled systems, there's no need to check the
1571			 * label here.  It's known to be good as we checked
1572			 * before allowing the connection to become bound.
1573			 */
1574			CONN_INC_REF(connp);
1575			mutex_exit(&connfp->connf_lock);
1576			return (connp);
1577		}
1578
1579		mutex_exit(&connfp->connf_lock);
1580		lport = up[1];
1581		bind_connfp =
1582		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1583		mutex_enter(&bind_connfp->connf_lock);
1584		for (connp = bind_connfp->connf_head; connp != NULL;
1585		    connp = connp->conn_next) {
1586			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1587			    lport) &&
1588			    (connp->conn_zoneid == zoneid ||
1589			    connp->conn_allzones ||
1590			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1591			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1592			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1593				break;
1594		}
1595
1596		/*
1597		 * If the matching connection is SLP on a private address, then
1598		 * the label on the packet must match the local zone's label.
1599		 * Otherwise, it must be in the label range defined by tnrh.
1600		 * This is ensured by tsol_receive_local.
1601		 *
1602		 * Note that we don't check tsol_receive_local for
1603		 * the connected case.
1604		 */
1605		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1606		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1607		    ira, connp)) {
1608			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1609			    char *, "connp(1) could not receive mp(2)",
1610			    conn_t *, connp, mblk_t *, mp);
1611			connp = NULL;
1612		}
1613
1614		if (connp != NULL) {
1615			/* Have a listener at least */
1616			CONN_INC_REF(connp);
1617			mutex_exit(&bind_connfp->connf_lock);
1618			return (connp);
1619		}
1620
1621		mutex_exit(&bind_connfp->connf_lock);
1622		break;
1623
1624	case IPPROTO_UDP:
1625		lport = up[1];
1626		fport = up[0];
1627		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1628		mutex_enter(&connfp->connf_lock);
1629		for (connp = connfp->connf_head; connp != NULL;
1630		    connp = connp->conn_next) {
1631			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1632			    fport, ipha->ipha_src) &&
1633			    (connp->conn_zoneid == zoneid ||
1634			    connp->conn_allzones ||
1635			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1636			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1637				break;
1638		}
1639
1640		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1641		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1642		    ira, connp)) {
1643			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1644			    char *, "connp(1) could not receive mp(2)",
1645			    conn_t *, connp, mblk_t *, mp);
1646			connp = NULL;
1647		}
1648
1649		if (connp != NULL) {
1650			CONN_INC_REF(connp);
1651			mutex_exit(&connfp->connf_lock);
1652			return (connp);
1653		}
1654
1655		/*
1656		 * We shouldn't come here for multicast/broadcast packets
1657		 */
1658		mutex_exit(&connfp->connf_lock);
1659
1660		break;
1661
1662	case IPPROTO_ENCAP:
1663	case IPPROTO_IPV6:
1664		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1665		    &ipha->ipha_dst, ipst));
1666	}
1667
1668	return (NULL);
1669}
1670
1671conn_t *
1672ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1673    ip_recv_attr_t *ira, ip_stack_t *ipst)
1674{
1675	ip6_t		*ip6h;
1676	connf_t		*connfp, *bind_connfp;
1677	uint16_t	lport;
1678	uint16_t	fport;
1679	tcpha_t		*tcpha;
1680	uint32_t	ports;
1681	conn_t		*connp;
1682	uint16_t	*up;
1683	zoneid_t	zoneid = ira->ira_zoneid;
1684
1685	ip6h = (ip6_t *)mp->b_rptr;
1686
1687	switch (protocol) {
1688	case IPPROTO_TCP:
1689		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1690		up = &tcpha->tha_lport;
1691		ports = *(uint32_t *)up;
1692
1693		connfp =
1694		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1695		    ports, ipst)];
1696		mutex_enter(&connfp->connf_lock);
1697		for (connp = connfp->connf_head; connp != NULL;
1698		    connp = connp->conn_next) {
1699			if (IPCL_CONN_MATCH_V6(connp, protocol,
1700			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1701			    (connp->conn_zoneid == zoneid ||
1702			    connp->conn_allzones ||
1703			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1704			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1705			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1706				break;
1707		}
1708
1709		if (connp != NULL) {
1710			/*
1711			 * We have a fully-bound TCP connection.
1712			 *
1713			 * For labeled systems, there's no need to check the
1714			 * label here.  It's known to be good as we checked
1715			 * before allowing the connection to become bound.
1716			 */
1717			CONN_INC_REF(connp);
1718			mutex_exit(&connfp->connf_lock);
1719			return (connp);
1720		}
1721
1722		mutex_exit(&connfp->connf_lock);
1723
1724		lport = up[1];
1725		bind_connfp =
1726		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1727		mutex_enter(&bind_connfp->connf_lock);
1728		for (connp = bind_connfp->connf_head; connp != NULL;
1729		    connp = connp->conn_next) {
1730			if (IPCL_BIND_MATCH_V6(connp, protocol,
1731			    ip6h->ip6_dst, lport) &&
1732			    (connp->conn_zoneid == zoneid ||
1733			    connp->conn_allzones ||
1734			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1735			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1736			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1737				break;
1738		}
1739
1740		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1741		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1742		    ira, connp)) {
1743			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1744			    char *, "connp(1) could not receive mp(2)",
1745			    conn_t *, connp, mblk_t *, mp);
1746			connp = NULL;
1747		}
1748
1749		if (connp != NULL) {
1750			/* Have a listner at least */
1751			CONN_INC_REF(connp);
1752			mutex_exit(&bind_connfp->connf_lock);
1753			return (connp);
1754		}
1755
1756		mutex_exit(&bind_connfp->connf_lock);
1757		break;
1758
1759	case IPPROTO_UDP:
1760		up = (uint16_t *)&mp->b_rptr[hdr_len];
1761		lport = up[1];
1762		fport = up[0];
1763		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1764		mutex_enter(&connfp->connf_lock);
1765		for (connp = connfp->connf_head; connp != NULL;
1766		    connp = connp->conn_next) {
1767			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1768			    fport, ip6h->ip6_src) &&
1769			    (connp->conn_zoneid == zoneid ||
1770			    connp->conn_allzones ||
1771			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1772			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1773			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1774				break;
1775		}
1776
1777		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1778		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1779		    ira, connp)) {
1780			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1781			    char *, "connp(1) could not receive mp(2)",
1782			    conn_t *, connp, mblk_t *, mp);
1783			connp = NULL;
1784		}
1785
1786		if (connp != NULL) {
1787			CONN_INC_REF(connp);
1788			mutex_exit(&connfp->connf_lock);
1789			return (connp);
1790		}
1791
1792		/*
1793		 * We shouldn't come here for multicast/broadcast packets
1794		 */
1795		mutex_exit(&connfp->connf_lock);
1796		break;
1797	case IPPROTO_ENCAP:
1798	case IPPROTO_IPV6:
1799		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1800		    &ip6h->ip6_dst, ipst));
1801	}
1802
1803	return (NULL);
1804}
1805
1806/*
1807 * wrapper around ipcl_classify_(v4,v6) routines.
1808 */
1809conn_t *
1810ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1811{
1812	if (ira->ira_flags & IRAF_IS_IPV4) {
1813		return (ipcl_classify_v4(mp, ira->ira_protocol,
1814		    ira->ira_ip_hdr_length, ira, ipst));
1815	} else {
1816		return (ipcl_classify_v6(mp, ira->ira_protocol,
1817		    ira->ira_ip_hdr_length, ira, ipst));
1818	}
1819}
1820
1821/*
1822 * Only used to classify SCTP RAW sockets
1823 */
1824conn_t *
1825ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1826    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1827{
1828	connf_t		*connfp;
1829	conn_t		*connp;
1830	in_port_t	lport;
1831	int		ipversion;
1832	const void	*dst;
1833	zoneid_t	zoneid = ira->ira_zoneid;
1834
1835	lport = ((uint16_t *)&ports)[1];
1836	if (ira->ira_flags & IRAF_IS_IPV4) {
1837		dst = (const void *)&ipha->ipha_dst;
1838		ipversion = IPV4_VERSION;
1839	} else {
1840		dst = (const void *)&ip6h->ip6_dst;
1841		ipversion = IPV6_VERSION;
1842	}
1843
1844	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1845	mutex_enter(&connfp->connf_lock);
1846	for (connp = connfp->connf_head; connp != NULL;
1847	    connp = connp->conn_next) {
1848		/* We don't allow v4 fallback for v6 raw socket. */
1849		if (ipversion != connp->conn_ipversion)
1850			continue;
1851		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1852		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1853			if (ipversion == IPV4_VERSION) {
1854				if (!IPCL_CONN_MATCH(connp, protocol,
1855				    ipha->ipha_src, ipha->ipha_dst, ports))
1856					continue;
1857			} else {
1858				if (!IPCL_CONN_MATCH_V6(connp, protocol,
1859				    ip6h->ip6_src, ip6h->ip6_dst, ports))
1860					continue;
1861			}
1862		} else {
1863			if (ipversion == IPV4_VERSION) {
1864				if (!IPCL_BIND_MATCH(connp, protocol,
1865				    ipha->ipha_dst, lport))
1866					continue;
1867			} else {
1868				if (!IPCL_BIND_MATCH_V6(connp, protocol,
1869				    ip6h->ip6_dst, lport))
1870					continue;
1871			}
1872		}
1873
1874		if (connp->conn_zoneid == zoneid ||
1875		    connp->conn_allzones ||
1876		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1877		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1878		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1879			break;
1880	}
1881
1882	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1883	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1884		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1885		    char *, "connp(1) could not receive mp(2)",
1886		    conn_t *, connp, mblk_t *, mp);
1887		connp = NULL;
1888	}
1889
1890	if (connp != NULL)
1891		goto found;
1892	mutex_exit(&connfp->connf_lock);
1893
1894	/* Try to look for a wildcard SCTP RAW socket match. */
1895	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1896	mutex_enter(&connfp->connf_lock);
1897	for (connp = connfp->connf_head; connp != NULL;
1898	    connp = connp->conn_next) {
1899		/* We don't allow v4 fallback for v6 raw socket. */
1900		if (ipversion != connp->conn_ipversion)
1901			continue;
1902		if (!IPCL_ZONE_MATCH(connp, zoneid))
1903			continue;
1904
1905		if (ipversion == IPV4_VERSION) {
1906			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1907				break;
1908		} else {
1909			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1910				break;
1911			}
1912		}
1913	}
1914
1915	if (connp != NULL)
1916		goto found;
1917
1918	mutex_exit(&connfp->connf_lock);
1919	return (NULL);
1920
1921found:
1922	ASSERT(connp != NULL);
1923	CONN_INC_REF(connp);
1924	mutex_exit(&connfp->connf_lock);
1925	return (connp);
1926}
1927
1928/* ARGSUSED */
1929static int
1930tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1931{
1932	itc_t	*itc = (itc_t *)buf;
1933	conn_t 	*connp = &itc->itc_conn;
1934	tcp_t	*tcp = (tcp_t *)&itc[1];
1935
1936	bzero(connp, sizeof (conn_t));
1937	bzero(tcp, sizeof (tcp_t));
1938
1939	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1940	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1941	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1942	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1943	if (tcp->tcp_timercache == NULL)
1944		return (ENOMEM);
1945	connp->conn_tcp = tcp;
1946	connp->conn_flags = IPCL_TCPCONN;
1947	connp->conn_proto = IPPROTO_TCP;
1948	tcp->tcp_connp = connp;
1949	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1950
1951	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1952	if (connp->conn_ixa == NULL) {
1953		tcp_timermp_free(tcp);
1954		return (ENOMEM);
1955	}
1956	connp->conn_ixa->ixa_refcnt = 1;
1957	connp->conn_ixa->ixa_protocol = connp->conn_proto;
1958	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1959	return (0);
1960}
1961
1962/* ARGSUSED */
1963static void
1964tcp_conn_destructor(void *buf, void *cdrarg)
1965{
1966	itc_t	*itc = (itc_t *)buf;
1967	conn_t 	*connp = &itc->itc_conn;
1968	tcp_t	*tcp = (tcp_t *)&itc[1];
1969
1970	ASSERT(connp->conn_flags & IPCL_TCPCONN);
1971	ASSERT(tcp->tcp_connp == connp);
1972	ASSERT(connp->conn_tcp == tcp);
1973	tcp_timermp_free(tcp);
1974	mutex_destroy(&connp->conn_lock);
1975	cv_destroy(&connp->conn_cv);
1976	cv_destroy(&connp->conn_sq_cv);
1977	rw_destroy(&connp->conn_ilg_lock);
1978
1979	/* Can be NULL if constructor failed */
1980	if (connp->conn_ixa != NULL) {
1981		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1982		ASSERT(connp->conn_ixa->ixa_ire == NULL);
1983		ASSERT(connp->conn_ixa->ixa_nce == NULL);
1984		ixa_refrele(connp->conn_ixa);
1985	}
1986}
1987
1988/* ARGSUSED */
1989static int
1990ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1991{
1992	itc_t	*itc = (itc_t *)buf;
1993	conn_t 	*connp = &itc->itc_conn;
1994
1995	bzero(connp, sizeof (conn_t));
1996	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1997	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1998	connp->conn_flags = IPCL_IPCCONN;
1999	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2000
2001	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2002	if (connp->conn_ixa == NULL)
2003		return (ENOMEM);
2004	connp->conn_ixa->ixa_refcnt = 1;
2005	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2006	return (0);
2007}
2008
2009/* ARGSUSED */
2010static void
2011ip_conn_destructor(void *buf, void *cdrarg)
2012{
2013	itc_t	*itc = (itc_t *)buf;
2014	conn_t 	*connp = &itc->itc_conn;
2015
2016	ASSERT(connp->conn_flags & IPCL_IPCCONN);
2017	ASSERT(connp->conn_priv == NULL);
2018	mutex_destroy(&connp->conn_lock);
2019	cv_destroy(&connp->conn_cv);
2020	rw_destroy(&connp->conn_ilg_lock);
2021
2022	/* Can be NULL if constructor failed */
2023	if (connp->conn_ixa != NULL) {
2024		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2025		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2026		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2027		ixa_refrele(connp->conn_ixa);
2028	}
2029}
2030
2031/* ARGSUSED */
2032static int
2033udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2034{
2035	itc_t	*itc = (itc_t *)buf;
2036	conn_t 	*connp = &itc->itc_conn;
2037	udp_t	*udp = (udp_t *)&itc[1];
2038
2039	bzero(connp, sizeof (conn_t));
2040	bzero(udp, sizeof (udp_t));
2041
2042	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2043	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2044	connp->conn_udp = udp;
2045	connp->conn_flags = IPCL_UDPCONN;
2046	connp->conn_proto = IPPROTO_UDP;
2047	udp->udp_connp = connp;
2048	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2049	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2050	if (connp->conn_ixa == NULL)
2051		return (ENOMEM);
2052	connp->conn_ixa->ixa_refcnt = 1;
2053	connp->conn_ixa->ixa_protocol = connp->conn_proto;
2054	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2055	return (0);
2056}
2057
2058/* ARGSUSED */
2059static void
2060udp_conn_destructor(void *buf, void *cdrarg)
2061{
2062	itc_t	*itc = (itc_t *)buf;
2063	conn_t 	*connp = &itc->itc_conn;
2064	udp_t	*udp = (udp_t *)&itc[1];
2065
2066	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2067	ASSERT(udp->udp_connp == connp);
2068	ASSERT(connp->conn_udp == udp);
2069	mutex_destroy(&connp->conn_lock);
2070	cv_destroy(&connp->conn_cv);
2071	rw_destroy(&connp->conn_ilg_lock);
2072
2073	/* Can be NULL if constructor failed */
2074	if (connp->conn_ixa != NULL) {
2075		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2076		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2077		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2078		ixa_refrele(connp->conn_ixa);
2079	}
2080}
2081
2082/* ARGSUSED */
2083static int
2084rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2085{
2086	itc_t	*itc = (itc_t *)buf;
2087	conn_t 	*connp = &itc->itc_conn;
2088	icmp_t	*icmp = (icmp_t *)&itc[1];
2089
2090	bzero(connp, sizeof (conn_t));
2091	bzero(icmp, sizeof (icmp_t));
2092
2093	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2094	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2095	connp->conn_icmp = icmp;
2096	connp->conn_flags = IPCL_RAWIPCONN;
2097	connp->conn_proto = IPPROTO_ICMP;
2098	icmp->icmp_connp = connp;
2099	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2100	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2101	if (connp->conn_ixa == NULL)
2102		return (ENOMEM);
2103	connp->conn_ixa->ixa_refcnt = 1;
2104	connp->conn_ixa->ixa_protocol = connp->conn_proto;
2105	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2106	return (0);
2107}
2108
2109/* ARGSUSED */
2110static void
2111rawip_conn_destructor(void *buf, void *cdrarg)
2112{
2113	itc_t	*itc = (itc_t *)buf;
2114	conn_t 	*connp = &itc->itc_conn;
2115	icmp_t	*icmp = (icmp_t *)&itc[1];
2116
2117	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2118	ASSERT(icmp->icmp_connp == connp);
2119	ASSERT(connp->conn_icmp == icmp);
2120	mutex_destroy(&connp->conn_lock);
2121	cv_destroy(&connp->conn_cv);
2122	rw_destroy(&connp->conn_ilg_lock);
2123
2124	/* Can be NULL if constructor failed */
2125	if (connp->conn_ixa != NULL) {
2126		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2127		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2128		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2129		ixa_refrele(connp->conn_ixa);
2130	}
2131}
2132
2133/* ARGSUSED */
2134static int
2135rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2136{
2137	itc_t	*itc = (itc_t *)buf;
2138	conn_t 	*connp = &itc->itc_conn;
2139	rts_t	*rts = (rts_t *)&itc[1];
2140
2141	bzero(connp, sizeof (conn_t));
2142	bzero(rts, sizeof (rts_t));
2143
2144	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2145	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2146	connp->conn_rts = rts;
2147	connp->conn_flags = IPCL_RTSCONN;
2148	rts->rts_connp = connp;
2149	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2150	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2151	if (connp->conn_ixa == NULL)
2152		return (ENOMEM);
2153	connp->conn_ixa->ixa_refcnt = 1;
2154	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2155	return (0);
2156}
2157
2158/* ARGSUSED */
2159static void
2160rts_conn_destructor(void *buf, void *cdrarg)
2161{
2162	itc_t	*itc = (itc_t *)buf;
2163	conn_t 	*connp = &itc->itc_conn;
2164	rts_t	*rts = (rts_t *)&itc[1];
2165
2166	ASSERT(connp->conn_flags & IPCL_RTSCONN);
2167	ASSERT(rts->rts_connp == connp);
2168	ASSERT(connp->conn_rts == rts);
2169	mutex_destroy(&connp->conn_lock);
2170	cv_destroy(&connp->conn_cv);
2171	rw_destroy(&connp->conn_ilg_lock);
2172
2173	/* Can be NULL if constructor failed */
2174	if (connp->conn_ixa != NULL) {
2175		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2176		ASSERT(connp->conn_ixa->ixa_ire == NULL);
2177		ASSERT(connp->conn_ixa->ixa_nce == NULL);
2178		ixa_refrele(connp->conn_ixa);
2179	}
2180}
2181
2182/*
2183 * Called as part of ipcl_conn_destroy to assert and clear any pointers
2184 * in the conn_t.
2185 *
2186 * Below we list all the pointers in the conn_t as a documentation aid.
2187 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2188 * If you add any pointers to the conn_t please add an ASSERT here
2189 * and #ifdef it out if it can't be actually asserted to be NULL.
2190 * In any case, we bzero most of the conn_t at the end of the function.
2191 */
2192void
2193ipcl_conn_cleanup(conn_t *connp)
2194{
2195	ip_xmit_attr_t	*ixa;
2196
2197	ASSERT(connp->conn_latch == NULL);
2198	ASSERT(connp->conn_latch_in_policy == NULL);
2199	ASSERT(connp->conn_latch_in_action == NULL);
2200#ifdef notdef
2201	ASSERT(connp->conn_rq == NULL);
2202	ASSERT(connp->conn_wq == NULL);
2203#endif
2204	ASSERT(connp->conn_cred == NULL);
2205	ASSERT(connp->conn_g_fanout == NULL);
2206	ASSERT(connp->conn_g_next == NULL);
2207	ASSERT(connp->conn_g_prev == NULL);
2208	ASSERT(connp->conn_policy == NULL);
2209	ASSERT(connp->conn_fanout == NULL);
2210	ASSERT(connp->conn_next == NULL);
2211	ASSERT(connp->conn_prev == NULL);
2212	ASSERT(connp->conn_oper_pending_ill == NULL);
2213	ASSERT(connp->conn_ilg == NULL);
2214	ASSERT(connp->conn_drain_next == NULL);
2215	ASSERT(connp->conn_drain_prev == NULL);
2216#ifdef notdef
2217	/* conn_idl is not cleared when removed from idl list */
2218	ASSERT(connp->conn_idl == NULL);
2219#endif
2220	ASSERT(connp->conn_ipsec_opt_mp == NULL);
2221#ifdef notdef
2222	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2223	ASSERT(connp->conn_netstack == NULL);
2224#endif
2225
2226	ASSERT(connp->conn_helper_info == NULL);
2227	ASSERT(connp->conn_ixa != NULL);
2228	ixa = connp->conn_ixa;
2229	ASSERT(ixa->ixa_refcnt == 1);
2230	/* Need to preserve ixa_protocol */
2231	ixa_cleanup(ixa);
2232	ixa->ixa_flags = 0;
2233
2234	/* Clear out the conn_t fields that are not preserved */
2235	bzero(&connp->conn_start_clr,
2236	    sizeof (conn_t) -
2237	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2238}
2239
2240/*
2241 * All conns are inserted in a global multi-list for the benefit of
2242 * walkers. The walk is guaranteed to walk all open conns at the time
2243 * of the start of the walk exactly once. This property is needed to
2244 * achieve some cleanups during unplumb of interfaces. This is achieved
2245 * as follows.
2246 *
2247 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2248 * call the insert and delete functions below at creation and deletion
2249 * time respectively. The conn never moves or changes its position in this
2250 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2251 * won't increase due to walkers, once the conn deletion has started. Note
2252 * that we can't remove the conn from the global list and then wait for
2253 * the refcnt to drop to zero, since walkers would then see a truncated
2254 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2255 * conns until ip_open is ready to make them globally visible.
2256 * The global round robin multi-list locks are held only to get the
2257 * next member/insertion/deletion and contention should be negligible
2258 * if the multi-list is much greater than the number of cpus.
2259 */
2260void
2261ipcl_globalhash_insert(conn_t *connp)
2262{
2263	int	index;
2264	struct connf_s	*connfp;
2265	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
2266
2267	/*
2268	 * No need for atomic here. Approximate even distribution
2269	 * in the global lists is sufficient.
2270	 */
2271	ipst->ips_conn_g_index++;
2272	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2273
2274	connp->conn_g_prev = NULL;
2275	/*
2276	 * Mark as INCIPIENT, so that walkers will ignore this
2277	 * for now, till ip_open is ready to make it visible globally.
2278	 */
2279	connp->conn_state_flags |= CONN_INCIPIENT;
2280
2281	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2282	/* Insert at the head of the list */
2283	mutex_enter(&connfp->connf_lock);
2284	connp->conn_g_next = connfp->connf_head;
2285	if (connp->conn_g_next != NULL)
2286		connp->conn_g_next->conn_g_prev = connp;
2287	connfp->connf_head = connp;
2288
2289	/* The fanout bucket this conn points to */
2290	connp->conn_g_fanout = connfp;
2291
2292	mutex_exit(&connfp->connf_lock);
2293}
2294
2295void
2296ipcl_globalhash_remove(conn_t *connp)
2297{
2298	struct connf_s	*connfp;
2299
2300	/*
2301	 * We were never inserted in the global multi list.
2302	 * IPCL_NONE variety is never inserted in the global multilist
2303	 * since it is presumed to not need any cleanup and is transient.
2304	 */
2305	if (connp->conn_g_fanout == NULL)
2306		return;
2307
2308	connfp = connp->conn_g_fanout;
2309	mutex_enter(&connfp->connf_lock);
2310	if (connp->conn_g_prev != NULL)
2311		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2312	else
2313		connfp->connf_head = connp->conn_g_next;
2314	if (connp->conn_g_next != NULL)
2315		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2316	mutex_exit(&connfp->connf_lock);
2317
2318	/* Better to stumble on a null pointer than to corrupt memory */
2319	connp->conn_g_next = NULL;
2320	connp->conn_g_prev = NULL;
2321	connp->conn_g_fanout = NULL;
2322}
2323
2324/*
2325 * Walk the list of all conn_t's in the system, calling the function provided
2326 * With the specified argument for each.
2327 * Applies to both IPv4 and IPv6.
2328 *
2329 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2330 * conn_oper_pending_ill). To guard against stale pointers
2331 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2332 * unplumbed or removed. New conn_t's that are created while we are walking
2333 * may be missed by this walk, because they are not necessarily inserted
2334 * at the tail of the list. They are new conn_t's and thus don't have any
2335 * stale pointers. The CONN_CLOSING flag ensures that no new reference
2336 * is created to the struct that is going away.
2337 */
2338void
2339ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2340{
2341	int	i;
2342	conn_t	*connp;
2343	conn_t	*prev_connp;
2344
2345	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2346		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2347		prev_connp = NULL;
2348		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2349		while (connp != NULL) {
2350			mutex_enter(&connp->conn_lock);
2351			if (connp->conn_state_flags &
2352			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
2353				mutex_exit(&connp->conn_lock);
2354				connp = connp->conn_g_next;
2355				continue;
2356			}
2357			CONN_INC_REF_LOCKED(connp);
2358			mutex_exit(&connp->conn_lock);
2359			mutex_exit(
2360			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2361			(*func)(connp, arg);
2362			if (prev_connp != NULL)
2363				CONN_DEC_REF(prev_connp);
2364			mutex_enter(
2365			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2366			prev_connp = connp;
2367			connp = connp->conn_g_next;
2368		}
2369		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2370		if (prev_connp != NULL)
2371			CONN_DEC_REF(prev_connp);
2372	}
2373}
2374
2375/*
2376 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2377 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2378 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2379 * (peer tcp in ESTABLISHED state).
2380 */
2381conn_t *
2382ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2383    ip_stack_t *ipst)
2384{
2385	uint32_t ports;
2386	uint16_t *pports = (uint16_t *)&ports;
2387	connf_t	*connfp;
2388	conn_t	*tconnp;
2389	boolean_t zone_chk;
2390
2391	/*
2392	 * If either the source of destination address is loopback, then
2393	 * both endpoints must be in the same Zone.  Otherwise, both of
2394	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2395	 * state) and the endpoints may reside in different Zones.
2396	 */
2397	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2398	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2399
2400	pports[0] = tcpha->tha_fport;
2401	pports[1] = tcpha->tha_lport;
2402
2403	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2404	    ports, ipst)];
2405
2406	mutex_enter(&connfp->connf_lock);
2407	for (tconnp = connfp->connf_head; tconnp != NULL;
2408	    tconnp = tconnp->conn_next) {
2409
2410		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2411		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2412		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2413		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2414
2415			ASSERT(tconnp != connp);
2416			CONN_INC_REF(tconnp);
2417			mutex_exit(&connfp->connf_lock);
2418			return (tconnp);
2419		}
2420	}
2421	mutex_exit(&connfp->connf_lock);
2422	return (NULL);
2423}
2424
2425/*
2426 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2427 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2428 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2429 * (peer tcp in ESTABLISHED state).
2430 */
2431conn_t *
2432ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2433    ip_stack_t *ipst)
2434{
2435	uint32_t ports;
2436	uint16_t *pports = (uint16_t *)&ports;
2437	connf_t	*connfp;
2438	conn_t	*tconnp;
2439	boolean_t zone_chk;
2440
2441	/*
2442	 * If either the source of destination address is loopback, then
2443	 * both endpoints must be in the same Zone.  Otherwise, both of
2444	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2445	 * state) and the endpoints may reside in different Zones.  We
2446	 * don't do Zone check for link local address(es) because the
2447	 * current Zone implementation treats each link local address as
2448	 * being unique per system node, i.e. they belong to global Zone.
2449	 */
2450	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2451	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2452
2453	pports[0] = tcpha->tha_fport;
2454	pports[1] = tcpha->tha_lport;
2455
2456	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2457	    ports, ipst)];
2458
2459	mutex_enter(&connfp->connf_lock);
2460	for (tconnp = connfp->connf_head; tconnp != NULL;
2461	    tconnp = tconnp->conn_next) {
2462
2463		/* We skip conn_bound_if check here as this is loopback tcp */
2464		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2465		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2466		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2467		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2468
2469			ASSERT(tconnp != connp);
2470			CONN_INC_REF(tconnp);
2471			mutex_exit(&connfp->connf_lock);
2472			return (tconnp);
2473		}
2474	}
2475	mutex_exit(&connfp->connf_lock);
2476	return (NULL);
2477}
2478
2479/*
2480 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2481 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2482 * Only checks for connected entries i.e. no INADDR_ANY checks.
2483 */
2484conn_t *
2485ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2486    ip_stack_t *ipst)
2487{
2488	uint32_t ports;
2489	uint16_t *pports;
2490	connf_t	*connfp;
2491	conn_t	*tconnp;
2492
2493	pports = (uint16_t *)&ports;
2494	pports[0] = tcpha->tha_fport;
2495	pports[1] = tcpha->tha_lport;
2496
2497	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2498	    ports, ipst)];
2499
2500	mutex_enter(&connfp->connf_lock);
2501	for (tconnp = connfp->connf_head; tconnp != NULL;
2502	    tconnp = tconnp->conn_next) {
2503
2504		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2505		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2506		    tconnp->conn_tcp->tcp_state >= min_state) {
2507
2508			CONN_INC_REF(tconnp);
2509			mutex_exit(&connfp->connf_lock);
2510			return (tconnp);
2511		}
2512	}
2513	mutex_exit(&connfp->connf_lock);
2514	return (NULL);
2515}
2516
2517/*
2518 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2519 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2520 * Only checks for connected entries i.e. no INADDR_ANY checks.
2521 * Match on ifindex in addition to addresses.
2522 */
2523conn_t *
2524ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2525    uint_t ifindex, ip_stack_t *ipst)
2526{
2527	tcp_t	*tcp;
2528	uint32_t ports;
2529	uint16_t *pports;
2530	connf_t	*connfp;
2531	conn_t	*tconnp;
2532
2533	pports = (uint16_t *)&ports;
2534	pports[0] = tcpha->tha_fport;
2535	pports[1] = tcpha->tha_lport;
2536
2537	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2538	    ports, ipst)];
2539
2540	mutex_enter(&connfp->connf_lock);
2541	for (tconnp = connfp->connf_head; tconnp != NULL;
2542	    tconnp = tconnp->conn_next) {
2543
2544		tcp = tconnp->conn_tcp;
2545		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2546		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2547		    tcp->tcp_state >= min_state &&
2548		    (tconnp->conn_bound_if == 0 ||
2549		    tconnp->conn_bound_if == ifindex)) {
2550
2551			CONN_INC_REF(tconnp);
2552			mutex_exit(&connfp->connf_lock);
2553			return (tconnp);
2554		}
2555	}
2556	mutex_exit(&connfp->connf_lock);
2557	return (NULL);
2558}
2559
2560/*
2561 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2562 * a listener when changing state.
2563 */
2564conn_t *
2565ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2566    ip_stack_t *ipst)
2567{
2568	connf_t		*bind_connfp;
2569	conn_t		*connp;
2570	tcp_t		*tcp;
2571
2572	/*
2573	 * Avoid false matches for packets sent to an IP destination of
2574	 * all zeros.
2575	 */
2576	if (laddr == 0)
2577		return (NULL);
2578
2579	ASSERT(zoneid != ALL_ZONES);
2580
2581	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2582	mutex_enter(&bind_connfp->connf_lock);
2583	for (connp = bind_connfp->connf_head; connp != NULL;
2584	    connp = connp->conn_next) {
2585		tcp = connp->conn_tcp;
2586		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2587		    IPCL_ZONE_MATCH(connp, zoneid) &&
2588		    (tcp->tcp_listener == NULL)) {
2589			CONN_INC_REF(connp);
2590			mutex_exit(&bind_connfp->connf_lock);
2591			return (connp);
2592		}
2593	}
2594	mutex_exit(&bind_connfp->connf_lock);
2595	return (NULL);
2596}
2597
2598/*
2599 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2600 * a listener when changing state.
2601 */
2602conn_t *
2603ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2604    zoneid_t zoneid, ip_stack_t *ipst)
2605{
2606	connf_t		*bind_connfp;
2607	conn_t		*connp = NULL;
2608	tcp_t		*tcp;
2609
2610	/*
2611	 * Avoid false matches for packets sent to an IP destination of
2612	 * all zeros.
2613	 */
2614	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2615		return (NULL);
2616
2617	ASSERT(zoneid != ALL_ZONES);
2618
2619	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2620	mutex_enter(&bind_connfp->connf_lock);
2621	for (connp = bind_connfp->connf_head; connp != NULL;
2622	    connp = connp->conn_next) {
2623		tcp = connp->conn_tcp;
2624		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2625		    IPCL_ZONE_MATCH(connp, zoneid) &&
2626		    (connp->conn_bound_if == 0 ||
2627		    connp->conn_bound_if == ifindex) &&
2628		    tcp->tcp_listener == NULL) {
2629			CONN_INC_REF(connp);
2630			mutex_exit(&bind_connfp->connf_lock);
2631			return (connp);
2632		}
2633	}
2634	mutex_exit(&bind_connfp->connf_lock);
2635	return (NULL);
2636}
2637
2638/*
2639 * ipcl_get_next_conn
2640 *	get the next entry in the conn global list
2641 *	and put a reference on the next_conn.
2642 *	decrement the reference on the current conn.
2643 *
2644 * This is an iterator based walker function that also provides for
2645 * some selection by the caller. It walks through the conn_hash bucket
2646 * searching for the next valid connp in the list, and selects connections
2647 * that are neither closed nor condemned. It also REFHOLDS the conn
2648 * thus ensuring that the conn exists when the caller uses the conn.
2649 */
2650conn_t *
2651ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2652{
2653	conn_t	*next_connp;
2654
2655	if (connfp == NULL)
2656		return (NULL);
2657
2658	mutex_enter(&connfp->connf_lock);
2659
2660	next_connp = (connp == NULL) ?
2661	    connfp->connf_head : connp->conn_g_next;
2662
2663	while (next_connp != NULL) {
2664		mutex_enter(&next_connp->conn_lock);
2665		if (!(next_connp->conn_flags & conn_flags) ||
2666		    (next_connp->conn_state_flags &
2667		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2668			/*
2669			 * This conn has been condemned or
2670			 * is closing, or the flags don't match
2671			 */
2672			mutex_exit(&next_connp->conn_lock);
2673			next_connp = next_connp->conn_g_next;
2674			continue;
2675		}
2676		CONN_INC_REF_LOCKED(next_connp);
2677		mutex_exit(&next_connp->conn_lock);
2678		break;
2679	}
2680
2681	mutex_exit(&connfp->connf_lock);
2682
2683	if (connp != NULL)
2684		CONN_DEC_REF(connp);
2685
2686	return (next_connp);
2687}
2688
2689#ifdef CONN_DEBUG
2690/*
2691 * Trace of the last NBUF refhold/refrele
2692 */
2693int
2694conn_trace_ref(conn_t *connp)
2695{
2696	int	last;
2697	conn_trace_t	*ctb;
2698
2699	ASSERT(MUTEX_HELD(&connp->conn_lock));
2700	last = connp->conn_trace_last;
2701	last++;
2702	if (last == CONN_TRACE_MAX)
2703		last = 0;
2704
2705	ctb = &connp->conn_trace_buf[last];
2706	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2707	connp->conn_trace_last = last;
2708	return (1);
2709}
2710
2711int
2712conn_untrace_ref(conn_t *connp)
2713{
2714	int	last;
2715	conn_trace_t	*ctb;
2716
2717	ASSERT(MUTEX_HELD(&connp->conn_lock));
2718	last = connp->conn_trace_last;
2719	last++;
2720	if (last == CONN_TRACE_MAX)
2721		last = 0;
2722
2723	ctb = &connp->conn_trace_buf[last];
2724	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2725	connp->conn_trace_last = last;
2726	return (1);
2727}
2728#endif
2729