ipclassifier.c revision 5240:e7599510dd03
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28const char ipclassifier_version[] = "@(#)ipclassifier.c	%I%	%E% SMI";
29
30/*
31 * IP PACKET CLASSIFIER
32 *
33 * The IP packet classifier provides mapping between IP packets and persistent
34 * connection state for connection-oriented protocols. It also provides
35 * interface for managing connection states.
36 *
37 * The connection state is kept in conn_t data structure and contains, among
38 * other things:
39 *
40 *	o local/remote address and ports
41 *	o Transport protocol
42 *	o squeue for the connection (for TCP only)
43 *	o reference counter
44 *	o Connection state
45 *	o hash table linkage
46 *	o interface/ire information
47 *	o credentials
48 *	o ipsec policy
49 *	o send and receive functions.
50 *	o mutex lock.
51 *
52 * Connections use a reference counting scheme. They are freed when the
53 * reference counter drops to zero. A reference is incremented when connection
54 * is placed in a list or table, when incoming packet for the connection arrives
55 * and when connection is processed via squeue (squeue processing may be
56 * asynchronous and the reference protects the connection from being destroyed
57 * before its processing is finished).
58 *
59 * send and receive functions are currently used for TCP only. The send function
60 * determines the IP entry point for the packet once it leaves TCP to be sent to
61 * the destination address. The receive function is used by IP when the packet
62 * should be passed for TCP processing. When a new connection is created these
63 * are set to ip_output() and tcp_input() respectively. During the lifetime of
64 * the connection the send and receive functions may change depending on the
 * the connection the send and receive functions may change depending on the
 * changes in the connection state. For example, once the connection is bound to
 * an address, the receive function for this connection is set to
67 * tcp_conn_request().  This allows incoming SYNs to go directly into the
68 * listener SYN processing function without going to tcp_input() first.
69 *
70 * Classifier uses several hash tables:
71 *
72 * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
73 *	ipcl_bind_fanout:	contains all connections in BOUND state
74 *	ipcl_proto_fanout:	IPv4 protocol fanout
75 *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
76 *	ipcl_udp_fanout:	contains all UDP connections
77 *	ipcl_globalhash_fanout:	contains all connections
78 *
79 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
80 * which need to view all existing connections.
81 *
82 * All tables are protected by per-bucket locks. When both per-bucket lock and
83 * connection lock need to be held, the per-bucket lock should be acquired
84 * first, followed by the connection lock.
85 *
86 * All functions doing search in one of these tables increment a reference
87 * counter on the connection found (if any). This reference should be dropped
88 * when the caller has finished processing the connection.
89 *
90 *
91 * INTERFACES:
92 * ===========
93 *
94 * Connection Lookup:
95 * ------------------
96 *
97 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
98 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
99 *
100 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
101 * it can't find any associated connection. If the connection is found, its
102 * reference counter is incremented.
103 *
104 *	mp:	mblock, containing packet header. The full header should fit
105 *		into a single mblock. It should also contain at least full IP
106 *		and TCP or UDP header.
107 *
108 *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
109 *
110 *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
111 *		 the packet.
112 *
113 * 	zoneid: The zone in which the returned connection must be; the zoneid
114 *		corresponding to the ire_zoneid on the IRE located for the
115 *		packet's destination address.
116 *
117 *	For TCP connections, the lookup order is as follows:
118 *		5-tuple {src, dst, protocol, local port, remote port}
119 *			lookup in ipcl_conn_fanout table.
120 *		3-tuple {dst, remote port, protocol} lookup in
121 *			ipcl_bind_fanout table.
122 *
123 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
124 *	remote port} lookup is done on ipcl_udp_fanout. Note that,
 *	these interfaces do not handle cases where a packet belongs
126 *	to multiple UDP clients, which is handled in IP itself.
127 *
128 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
129 * determine which actual zone gets the segment.  This is used only in a
130 * labeled environment.  The matching rules are:
131 *
132 *	- If it's not a multilevel port, then the label on the packet selects
133 *	  the zone.  Unlabeled packets are delivered to the global zone.
134 *
135 *	- If it's a multilevel port, then only the zone registered to receive
136 *	  packets on that port matches.
137 *
138 * Also, in a labeled environment, packet labels need to be checked.  For fully
139 * bound TCP connections, we can assume that the packet label was checked
140 * during connection establishment, and doesn't need to be checked on each
141 * packet.  For others, though, we need to check for strict equality or, for
142 * multilevel ports, membership in the range or set.  This part currently does
143 * a tnrh lookup on each packet, but could be optimized to use cached results
144 * if that were necessary.  (SCTP doesn't come through here, but if it did,
145 * we would apply the same rules as TCP.)
146 *
147 * An implication of the above is that fully-bound TCP sockets must always use
148 * distinct 4-tuples; they can't be discriminated by label alone.
149 *
150 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
151 * as there's no connection set-up handshake and no shared state.
152 *
153 * Labels on looped-back packets within a single zone do not need to be
154 * checked, as all processes in the same zone have the same label.
155 *
156 * Finally, for unlabeled packets received by a labeled system, special rules
157 * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
158 * socket in the zone whose label matches the default label of the sender, if
159 * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
160 * receiver's label must dominate the sender's default label.
161 *
162 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
163 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
164 *					 ip_stack);
165 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
168 *	ports are read from the IP and TCP header respectively.
169 *
170 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
171 *					 zoneid, ip_stack);
172 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
173 *					 zoneid, ip_stack);
174 *
175 * 	Lookup routine to find a listener with the tuple {lport, laddr,
176 * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
177 * 	parameter interface index is also compared.
178 *
179 * void ipcl_walk(func, arg, ip_stack)
180 *
181 * 	Apply 'func' to every connection available. The 'func' is called as
182 *	(*func)(connp, arg). The walk is non-atomic so connections may be
183 *	created and destroyed during the walk. The CONN_CONDEMNED and
184 *	CONN_INCIPIENT flags ensure that connections which are newly created
185 *	or being destroyed are not selected by the walker.
186 *
187 * Table Updates
188 * -------------
189 *
190 * int ipcl_conn_insert(connp, protocol, src, dst, ports)
191 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
192 *
193 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
195 *		connp		conn_t to be inserted
196 *		protocol	connection protocol
197 *		src		source address
198 *		dst		destination address
199 *		ports		local and remote port
200 *		ifindex		interface index for IPv6 connections
201 *
202 *	Return value :
203 *		0		if connp was inserted
204 *		EADDRINUSE	if the connection with the same tuple
205 *				already exists.
206 *
207 * int ipcl_bind_insert(connp, protocol, src, lport);
208 * int ipcl_bind_insert_v6(connp, protocol, src, lport);
209 *
210 * 	Insert 'connp' in ipcl_bind_fanout.
 * 	Arguments :
212 * 		connp		conn_t to be inserted
213 * 		protocol	connection protocol
214 * 		src		source address connection wants
215 * 				to bind to
216 * 		lport		local port connection wants to
217 * 				bind to
218 *
219 *
220 * void ipcl_hash_remove(connp);
221 *
222 * 	Removes the 'connp' from the connection fanout table.
223 *
224 * Connection Creation/Destruction
225 * -------------------------------
226 *
227 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
228 *
229 * 	Creates a new conn based on the type flag, inserts it into
230 * 	globalhash table.
231 *
232 *	type:	This flag determines the type of conn_t which needs to be
233 *		created i.e., which kmem_cache it comes from.
234 *		IPCL_TCPCONN	indicates a TCP connection
235 *		IPCL_SCTPCONN	indicates a SCTP connection
236 *		IPCL_UDPCONN	indicates a UDP conn_t.
237 *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
238 *		IPCL_RTSCONN	indicates a RTS conn_t.
239 *		IPCL_IPCCONN	indicates all other connections.
240 *
241 * void ipcl_conn_destroy(connp)
242 *
243 * 	Destroys the connection state, removes it from the global
244 * 	connection hash table and frees its memory.
245 */
246
247#include <sys/types.h>
248#include <sys/stream.h>
249#include <sys/stropts.h>
250#include <sys/sysmacros.h>
251#include <sys/strsubr.h>
252#include <sys/strsun.h>
253#define	_SUN_TPI_VERSION 2
254#include <sys/ddi.h>
255#include <sys/cmn_err.h>
256#include <sys/debug.h>
257
258#include <sys/systm.h>
259#include <sys/param.h>
260#include <sys/kmem.h>
261#include <sys/isa_defs.h>
262#include <inet/common.h>
263#include <netinet/ip6.h>
264#include <netinet/icmp6.h>
265
266#include <inet/ip.h>
267#include <inet/ip6.h>
268#include <inet/tcp.h>
269#include <inet/ip_ndp.h>
270#include <inet/udp_impl.h>
271#include <inet/sctp_ip.h>
272#include <inet/sctp/sctp_impl.h>
273#include <inet/rawip_impl.h>
274#include <inet/rts_impl.h>
275
276#include <sys/cpuvar.h>
277
278#include <inet/ipclassifier.h>
279#include <inet/ipsec_impl.h>
280
281#include <sys/tsol/tnet.h>
282
/* Classifier debug tracing is compiled in only on DEBUG kernels. */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

#ifdef	IPCL_DEBUG
/* Bitmask selecting which IPCL_DEBUG_LVL() printfs fire. */
int	ipcl_debug_level = 0;
/*
 * NOTE(review): this macro is a bare if-statement, not wrapped in
 * do { } while (0) -- beware dangling-else hazards at call sites.
 */
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level  & level) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
/* Old value for compatibility. Settable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically.  Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
/* Bytes of physical memory per conn-fanout bucket when auto-sizing. */
uint_t ipcl_conn_hash_memfactor = 8192;
/* Cap on the auto-sized conn fanout (before rounding to a prime). */
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 * The first three entries and the trailing entry are 0 (no usable
 * prime); callers treat a 0 as "out of range".
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
320
321/*
322 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
323 * are aligned on cache lines.
324 */
325typedef union itc_s {
326	conn_t	itc_conn;
327	char	itcu_filler[CACHE_ALIGN(conn_s)];
328} itc_t;
329
330struct kmem_cache  *tcp_conn_cache;
331struct kmem_cache  *ip_conn_cache;
332extern struct kmem_cache  *sctp_conn_cache;
333extern struct kmem_cache  *tcp_sack_info_cache;
334extern struct kmem_cache  *tcp_iphc_cache;
335struct kmem_cache  *udp_conn_cache;
336struct kmem_cache  *rawip_conn_cache;
337struct kmem_cache  *rts_conn_cache;
338
339extern void	tcp_timermp_free(tcp_t *);
340extern mblk_t	*tcp_timermp_alloc(int);
341
342static int	ip_conn_constructor(void *, void *, int);
343static void	ip_conn_destructor(void *, void *);
344
345static int	tcp_conn_constructor(void *, void *, int);
346static void	tcp_conn_destructor(void *, void *);
347
348static int	udp_conn_constructor(void *, void *, int);
349static void	udp_conn_destructor(void *, void *);
350
351static int	rawip_conn_constructor(void *, void *, int);
352static void	rawip_conn_destructor(void *, void *);
353
354static int	rts_conn_constructor(void *, void *, int);
355static void	rts_conn_destructor(void *, void *);
356
357#ifdef	IPCL_DEBUG
358#define	INET_NTOA_BUFSIZE	18
359
360static char *
361inet_ntoa_r(uint32_t in, char *b)
362{
363	unsigned char	*p;
364
365	p = (unsigned char *)&in;
366	(void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]);
367	return (b);
368}
369#endif
370
371/*
372 * Global (for all stack instances) init routine
373 */
374void
375ipcl_g_init(void)
376{
377	ip_conn_cache = kmem_cache_create("ip_conn_cache",
378	    sizeof (conn_t), CACHE_ALIGN_SIZE,
379	    ip_conn_constructor, ip_conn_destructor,
380	    NULL, NULL, NULL, 0);
381
382	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
383	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
384	    tcp_conn_constructor, tcp_conn_destructor,
385	    NULL, NULL, NULL, 0);
386
387	udp_conn_cache = kmem_cache_create("udp_conn_cache",
388	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
389	    udp_conn_constructor, udp_conn_destructor,
390	    NULL, NULL, NULL, 0);
391
392	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
393	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
394	    rawip_conn_constructor, rawip_conn_destructor,
395	    NULL, NULL, NULL, 0);
396
397	rts_conn_cache = kmem_cache_create("rts_conn_cache",
398	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
399	    rts_conn_constructor, rts_conn_destructor,
400	    NULL, NULL, NULL, 0);
401}
402
403/*
404 * ipclassifier intialization routine, sets up hash tables.
405 */
406void
407ipcl_init(ip_stack_t *ipst)
408{
409	int i;
410	int sizes[] = P2Ps();
411
412	/*
413	 * Calculate size of conn fanout table from /etc/system settings
414	 */
415	if (ipcl_conn_hash_size != 0) {
416		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
417	} else if (tcp_conn_hash_size != 0) {
418		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
419	} else {
420		extern pgcnt_t freemem;
421
422		ipst->ips_ipcl_conn_fanout_size =
423		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
424
425		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
426			ipst->ips_ipcl_conn_fanout_size =
427			    ipcl_conn_hash_maxsize;
428		}
429	}
430
431	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
432		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
433			break;
434		}
435	}
436	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
437		/* Out of range, use the 2^16 value */
438		ipst->ips_ipcl_conn_fanout_size = sizes[16];
439	}
440
441	/* Take values from /etc/system */
442	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
443	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
444	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
445
446	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
447
448	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
449	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
450
451	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
452		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
453		    MUTEX_DEFAULT, NULL);
454	}
455
456	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
457	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
458
459	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
460		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
461		    MUTEX_DEFAULT, NULL);
462	}
463
464	ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
465	    sizeof (connf_t), KM_SLEEP);
466	for (i = 0; i < IPPROTO_MAX; i++) {
467		mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
468		    MUTEX_DEFAULT, NULL);
469	}
470
471	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
472	    sizeof (connf_t), KM_SLEEP);
473	for (i = 0; i < IPPROTO_MAX; i++) {
474		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
475		    MUTEX_DEFAULT, NULL);
476	}
477
478	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
479	mutex_init(&ipst->ips_rts_clients->connf_lock,
480	    NULL, MUTEX_DEFAULT, NULL);
481
482	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
483	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
484	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
485		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
486		    MUTEX_DEFAULT, NULL);
487	}
488
489	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
490	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
491	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
492		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
493		    MUTEX_DEFAULT, NULL);
494	}
495
496	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
497	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
498	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
499		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
500		    NULL, MUTEX_DEFAULT, NULL);
501	}
502}
503
504void
505ipcl_g_destroy(void)
506{
507	kmem_cache_destroy(ip_conn_cache);
508	kmem_cache_destroy(tcp_conn_cache);
509	kmem_cache_destroy(udp_conn_cache);
510	kmem_cache_destroy(rawip_conn_cache);
511	kmem_cache_destroy(rts_conn_cache);
512}
513
514/*
515 * All user-level and kernel use of the stack must be gone
516 * by now.
517 */
518void
519ipcl_destroy(ip_stack_t *ipst)
520{
521	int i;
522
523	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
524		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
525		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
526	}
527	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
528	    sizeof (connf_t));
529	ipst->ips_ipcl_conn_fanout = NULL;
530
531	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
532		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
533		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
534	}
535	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
536	    sizeof (connf_t));
537	ipst->ips_ipcl_bind_fanout = NULL;
538
539	for (i = 0; i < IPPROTO_MAX; i++) {
540		ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL);
541		mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock);
542	}
543	kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t));
544	ipst->ips_ipcl_proto_fanout = NULL;
545
546	for (i = 0; i < IPPROTO_MAX; i++) {
547		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
548		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
549	}
550	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
551	    IPPROTO_MAX * sizeof (connf_t));
552	ipst->ips_ipcl_proto_fanout_v6 = NULL;
553
554	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
555		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
556		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
557	}
558	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
559	    sizeof (connf_t));
560	ipst->ips_ipcl_udp_fanout = NULL;
561
562	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
563		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
564		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
565	}
566	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
567	    sizeof (connf_t));
568	ipst->ips_ipcl_raw_fanout = NULL;
569
570	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
571		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
572		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
573	}
574	kmem_free(ipst->ips_ipcl_globalhash_fanout,
575	    sizeof (connf_t) * CONN_G_HASH_SIZE);
576	ipst->ips_ipcl_globalhash_fanout = NULL;
577
578	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
579	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
580	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
581	ipst->ips_rts_clients = NULL;
582}
583
584/*
585 * conn creation routine. initialize the conn, sets the reference
586 * and inserts it in the global hash table.
587 */
588conn_t *
589ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
590{
591	conn_t	*connp;
592	sctp_stack_t *sctps;
593	struct kmem_cache *conn_cache;
594
595	switch (type) {
596	case IPCL_SCTPCONN:
597		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
598			return (NULL);
599		sctp_conn_init(connp);
600		sctps = ns->netstack_sctp;
601		SCTP_G_Q_REFHOLD(sctps);
602		netstack_hold(ns);
603		connp->conn_netstack = ns;
604		return (connp);
605
606	case IPCL_TCPCONN:
607		conn_cache = tcp_conn_cache;
608		break;
609
610	case IPCL_UDPCONN:
611		conn_cache = udp_conn_cache;
612		break;
613
614	case IPCL_RAWIPCONN:
615		conn_cache = rawip_conn_cache;
616		break;
617
618	case IPCL_RTSCONN:
619		conn_cache = rts_conn_cache;
620		break;
621
622	case IPCL_IPCCONN:
623		conn_cache = ip_conn_cache;
624		break;
625
626	default:
627		connp = NULL;
628		ASSERT(0);
629	}
630
631	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
632		return (NULL);
633
634	connp->conn_ref = 1;
635	netstack_hold(ns);
636	connp->conn_netstack = ns;
637	ipcl_globalhash_insert(connp);
638	return (connp);
639}
640
/*
 * Destroy a conn whose reference count has dropped to zero: release
 * credentials, IPsec state and the netstack hold, remove the conn from
 * the global hash, and return it to the kmem cache matching its type.
 * Caller must not hold conn_lock and conn_ref must already be 0.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ire_cache == NULL);

	/* conn_peercred may alias conn_cred; only free it once. */
	if (connp->conn_peercred != NULL &&
	    connp->conn_peercred != connp->conn_cred)
		crfree(connp->conn_peercred);
	connp->conn_peercred = NULL;

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
	}

	ipcl_globalhash_remove(connp);

	/* FIXME: add separate tcp_conn_free()? */
	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t	*tcp = connp->conn_tcp;
		tcp_stack_t *tcps;

		ASSERT(tcp != NULL);
		tcps = tcp->tcp_tcps;
		/*
		 * While the tcp still references its stack, drop the
		 * IPsec latch/policy (which need the netstack) and the
		 * tcp's hold on the tcp stack.
		 */
		if (tcps != NULL) {
			if (connp->conn_latch != NULL) {
				IPLATCH_REFRELE(connp->conn_latch, ns);
				connp->conn_latch = NULL;
			}
			if (connp->conn_policy != NULL) {
				IPPH_REFRELE(connp->conn_policy, ns);
				connp->conn_policy = NULL;
			}
			tcp->tcp_tcps = NULL;
			TCPS_REFRELE(tcps);
		}

		/*
		 * NOTE(review): tcp_timercache is read after tcp_free();
		 * this assumes tcp_free() releases tcp-internal state but
		 * neither frees the tcp_t itself nor its timer mblk --
		 * confirm against tcp_free() before reordering.
		 */
		tcp_free(tcp);
		mp = tcp->tcp_timercache;
		tcp->tcp_cred = NULL;

		if (tcp->tcp_sack_info != NULL) {
			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
			kmem_cache_free(tcp_sack_info_cache,
			    tcp->tcp_sack_info);
		}
		/* Grown headers came from kmem_alloc, not the iphc cache. */
		if (tcp->tcp_iphc != NULL) {
			if (tcp->tcp_hdr_grown) {
				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
			} else {
				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
			}
			tcp->tcp_iphc_len = 0;
		}
		ASSERT(tcp->tcp_iphc_len == 0);

		ASSERT(connp->conn_latch == NULL);
		ASSERT(connp->conn_policy == NULL);

		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			netstack_rele(ns);
		}

		/*
		 * Reset the conn/tcp to a pristine state before caching,
		 * preserving only the timer mblk and back-pointer.
		 */
		ipcl_conn_cleanup(connp);
		connp->conn_flags = IPCL_TCPCONN;
		bzero(tcp, sizeof (tcp_t));

		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	/* Non-TCP conns: release IPsec state and any cached option mblk. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
		connp->conn_latch = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
		connp->conn_policy = NULL;
	}
	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	/* SCTP conns are freed (and netstack-released) by sctp_free(). */
	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		sctp_free(connp);
		return;
	}

	if (ns != NULL) {
		connp->conn_netstack = NULL;
		netstack_rele(ns);
	}
	ipcl_conn_cleanup(connp);

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		connp->conn_ulp = IPPROTO_ICMP;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
764
765/*
766 * Running in cluster mode - deregister listener information
767 */
768
769static void
770ipcl_conn_unlisten(conn_t *connp)
771{
772	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
773	ASSERT(connp->conn_lport != 0);
774
775	if (cl_inet_unlisten != NULL) {
776		sa_family_t	addr_family;
777		uint8_t		*laddrp;
778
779		if (connp->conn_pkt_isv6) {
780			addr_family = AF_INET6;
781			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
782		} else {
783			addr_family = AF_INET;
784			laddrp = (uint8_t *)&connp->conn_bound_source;
785		}
786		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
787		    connp->conn_lport);
788	}
789	connp->conn_flags &= ~IPCL_CL_LISTENER;
790}
791
792/*
793 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
794 * which table the conn belonged to). So for debugging we can see which hash
795 * table this connection was in.
796 */
797#define	IPCL_HASH_REMOVE(connp)	{					\
798	connf_t	*connfp = (connp)->conn_fanout;				\
799	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
800	if (connfp != NULL) {						\
801		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
802		    (void *)(connp)));					\
803		mutex_enter(&connfp->connf_lock);			\
804		if ((connp)->conn_next != NULL)				\
805			(connp)->conn_next->conn_prev =			\
806			    (connp)->conn_prev;				\
807		if ((connp)->conn_prev != NULL)				\
808			(connp)->conn_prev->conn_next =			\
809			    (connp)->conn_next;				\
810		else							\
811			connfp->connf_head = (connp)->conn_next;	\
812		(connp)->conn_fanout = NULL;				\
813		(connp)->conn_next = NULL;				\
814		(connp)->conn_prev = NULL;				\
815		(connp)->conn_flags |= IPCL_REMOVED;			\
816		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
817			ipcl_conn_unlisten((connp));			\
818		CONN_DEC_REF((connp));					\
819		mutex_exit(&connfp->connf_lock);			\
820	}								\
821}
822
/*
 * Function wrapper around IPCL_HASH_REMOVE for callers outside this
 * file: unlink 'connp' from its fanout (if any) and drop the hash
 * table's reference on it.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}
828
829/*
830 * The whole purpose of this function is allow removal of
831 * a conn_t from the connected hash for timewait reclaim.
832 * This is essentially a TW reclaim fastpath where timewait
833 * collector checks under fanout lock (so no one else can
834 * get access to the conn_t) that refcnt is 2 i.e. one for
835 * TCP and one for the classifier hash list. If ref count
836 * is indeed 2, we can just remove the conn under lock and
837 * avoid cleaning up the conn under squeue. This gives us
838 * improved performance.
839 */
840void
841ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
842{
843	ASSERT(MUTEX_HELD(&connfp->connf_lock));
844	ASSERT(MUTEX_HELD(&connp->conn_lock));
845	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
846
847	if ((connp)->conn_next != NULL) {
848		(connp)->conn_next->conn_prev = (connp)->conn_prev;
849	}
850	if ((connp)->conn_prev != NULL) {
851		(connp)->conn_prev->conn_next = (connp)->conn_next;
852	} else {
853		connfp->connf_head = (connp)->conn_next;
854	}
855	(connp)->conn_fanout = NULL;
856	(connp)->conn_next = NULL;
857	(connp)->conn_prev = NULL;
858	(connp)->conn_flags |= IPCL_REMOVED;
859	ASSERT((connp)->conn_ref == 2);
860	(connp)->conn_ref--;
861}
862
/*
 * Insert 'connp' at the head of the connected fanout bucket 'connfp',
 * whose lock the caller already holds.  Takes a reference on behalf of
 * the hash table, clears IPCL_REMOVED and marks the conn
 * IPCL_CONNECTED.  The conn must not currently be on any fanout.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
877
/*
 * Remove 'connp' from whatever fanout it is on, then insert it into
 * the connected fanout bucket 'connfp' under that bucket's lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
886
/*
 * Insert 'connp' into the bound fanout bucket 'connfp'.  Entries bound
 * to a specific IPv4 address are kept ahead of wildcard entries: we
 * walk past the specific-address conns and link 'connp' just before
 * the first wildcard (_IPCL_V4_MATCH_ANY) entry, so lookups hit exact
 * bindings first.  Takes a hash reference, clears IPCL_REMOVED and
 * marks the conn IPCL_BOUND.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)connfp, (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
915
/*
 * Insert 'connp' into the wildcard section of fanout bucket 'connfp'.
 * A V4-mapped wildcard is placed before the first IPv6
 * unspecified-address entry of the same zone (so the v4 wildcard is
 * found first); otherwise the conn is appended at the end of the
 * list.  Takes a hash reference, clears IPCL_REMOVED and marks the
 * conn IPCL_BOUND.
 *
 * NOTE(review): the "prev = next->conn_prev" assignment inside the
 * loop appears redundant -- 'prev' already equals next->conn_prev at
 * that point -- confirm before relying on or changing it.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
947
/*
 * Function wrapper around the IPCL_HASH_INSERT_WILDCARD macro so that
 * code outside this file can use the wildcard insertion logic.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
953
/*
 * Insert connp into the IPv4 protocol fanout for `protocol' as a
 * wildcard entry and record the protocol in conn_ulp.  MAC-exempt
 * conns are only permitted for AH and ESP.
 */
void
ipcl_proto_insert(conn_t *connp, uint8_t protocol)
{
	connf_t	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp != NULL);
	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
	    protocol == IPPROTO_ESP);

	connp->conn_ulp = protocol;

	/* Insert it in the protocol hash */
	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
970
/*
 * Insert connp into the IPv6 protocol fanout for `protocol' as a
 * wildcard entry and record the protocol in conn_ulp.  MAC-exempt
 * conns are only permitted for AH and ESP.
 */
void
ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
{
	connf_t	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp != NULL);
	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
	    protocol == IPPROTO_ESP);

	connp->conn_ulp = protocol;

	/* Insert it in the v6 protocol hash */
	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
987
988/*
989 * This function is used only for inserting SCTP raw socket now.
990 * This may change later.
991 *
992 * Note that only one raw socket can be bound to a port.  The param
993 * lport is in network byte order.
994 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Conflict if same port, same zone, same address family,
		 * and overlapping source addresses (either side wildcard
		 * or v4-mapped-any, or both sides equal).
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
		    &connp->conn_srcv6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Insertion position depends on how much of the address pair is
	 * specified: remote known => connected; only local known =>
	 * bound; neither => wildcard.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1037
1038/*
1039 * Check for a MAC exemption conflict on a labeled system.  Note that for
1040 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1041 * transport layer.  This check is for binding all other protocols.
1042 *
1043 * Returns true if there's a conflict.
1044 */
1045static boolean_t
1046check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1047{
1048	connf_t	*connfp;
1049	conn_t *tconn;
1050
1051	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1052	mutex_enter(&connfp->connf_lock);
1053	for (tconn = connfp->connf_head; tconn != NULL;
1054	    tconn = tconn->conn_next) {
1055		/* We don't allow v4 fallback for v6 raw socket */
1056		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1057			continue;
1058		/* If neither is exempt, then there's no conflict */
1059		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1060			continue;
1061		/* If both are bound to different specific addrs, ok */
1062		if (connp->conn_src != INADDR_ANY &&
1063		    tconn->conn_src != INADDR_ANY &&
1064		    connp->conn_src != tconn->conn_src)
1065			continue;
1066		/* These two conflict; fail */
1067		break;
1068	}
1069	mutex_exit(&connfp->connf_lock);
1070	return (tconn != NULL);
1071}
1072
1073static boolean_t
1074check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1075{
1076	connf_t	*connfp;
1077	conn_t *tconn;
1078
1079	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
1080	mutex_enter(&connfp->connf_lock);
1081	for (tconn = connfp->connf_head; tconn != NULL;
1082	    tconn = tconn->conn_next) {
1083		/* We don't allow v4 fallback for v6 raw socket */
1084		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
1085			continue;
1086		/* If neither is exempt, then there's no conflict */
1087		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
1088			continue;
1089		/* If both are bound to different addrs, ok */
1090		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
1091		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
1092		    !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
1093			continue;
1094		/* These two conflict; fail */
1095		break;
1096	}
1097	mutex_exit(&connfp->connf_lock);
1098	return (tconn != NULL);
1099}
1100
1101/*
1102 * (v4, v6) bind hash insertion routines
1103 */
/*
 * Bind-hash insertion for IPv4.  Records protocol, local address and
 * port (network byte order) in connp, then places it on the fanout
 * that classification will search: the UDP fanout, the TCP bind
 * fanout, the SCTP raw fanout (via ipcl_sctp_hash_insert()), or the
 * protocol fanout for any other protocol.  Returns 0, or EADDRINUSE
 * on a MAC-exemption conflict (labeled systems), or whatever
 * ipcl_sctp_hash_insert() returns for SCTP.
 */
int
ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
{
	connf_t	*connfp;
#ifdef	IPCL_DEBUG
	char	buf[INET_NTOA_BUFSIZE];
#endif
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp);

	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));

	connp->conn_ulp = protocol;
	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
	connp->conn_lport = lport;

	switch (protocol) {
	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			IPCL_DEBUG_LVL(64,
			    ("ipcl_bind_insert: connp %p - udp\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			IPCL_DEBUG_LVL(64,
			    ("ipcl_bind_insert: connp %p - protocol\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
		}

		/*
		 * Position depends on how much of the address pair is
		 * known: connected > bound > wildcard.
		 */
		if (connp->conn_rem != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:

		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Tell the cl_inet_listen hook (if registered) about it */
		if (cl_inet_listen != NULL) {
			ASSERT(!connp->conn_pkt_isv6);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_source, lport);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1178
/*
 * Bind-hash insertion for IPv6; the v6 counterpart of
 * ipcl_bind_insert().  Records protocol, local address and port
 * (network byte order) in connp, then inserts it on the UDP fanout,
 * the TCP bind fanout, the SCTP raw fanout, or the v6 protocol
 * fanout.  Returns 0 or an errno.
 */
int
ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
    uint16_t lport)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	ASSERT(connp);

	connp->conn_ulp = protocol;
	connp->conn_srcv6 = *src;
	connp->conn_lport = lport;

	switch (protocol) {
	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			IPCL_DEBUG_LVL(128,
			    ("ipcl_bind_insert_v6: connp %p - udp\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			IPCL_DEBUG_LVL(128,
			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
			    (void *)connp));
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/*
		 * Position depends on how much of the address pair is
		 * known: connected > bound > wildcard.
		 */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */

		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Tell the cl_inet_listen hook (if registered) about it */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_pkt_isv6) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_source_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_source;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp,
			    lport);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1259
1260/*
1261 * ipcl_conn_hash insertion routines.
1262 */
/*
 * Conn-hash insertion for a fully-specified IPv4 connection.  For TCP
 * the conn fanout is scanned first and EADDRINUSE returned if another
 * conn already matches the full 4-tuple; an existing fanout linkage
 * (XTI/TLI rebind) is undone before inserting.  SCTP conns are
 * unhashed and re-inserted through ipcl_sctp_hash_insert().  UDP and
 * all other protocols are placed as connected/bound/wildcard entries
 * on their fanout.  `ports' carries both port numbers.  Returns 0 or
 * an errno.
 */
int
ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
    ipaddr_t rem, uint32_t ports)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
#ifdef	IPCL_DEBUG
	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
#endif
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
	    ports, protocol));

	switch (protocol) {
	case IPPROTO_TCP:
		if (!(connp->conn_flags & IPCL_EAGER)) {
			/*
			 * for a eager connection, i.e connections which
			 * have just been created, the initialization is
			 * already done in ip at conn_creation time, so
			 * we can skip the checks here.
			 */
			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		}
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_rem,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
			    connp->conn_rem, connp->conn_src,
			    connp->conn_ports)) {

				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		/* The local port is the low 16 bits of `ports'. */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings.  For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic.  For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
		}

		/* Connected > bound > wildcard, by address specificity */
		if (connp->conn_rem != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_src != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1368
/*
 * Conn-hash insertion for a fully-specified IPv6 connection; the v6
 * counterpart of ipcl_conn_insert().  For TCP the duplicate scan also
 * honors tcp_bound_if: an existing conn only conflicts when it is not
 * bound to an interface or is bound to `ifindex'.  Returns 0 or an
 * errno.
 */
int
ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
    const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
{
	connf_t		*connfp;
	uint16_t	*up;
	conn_t		*tconnp;
	in_port_t	lport;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	switch (protocol) {
	case IPPROTO_TCP:
		/* Just need to insert a conn struct */
		if (!(connp->conn_flags & IPCL_EAGER)) {
			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		}
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
			    connp->conn_remv6, connp->conn_srcv6,
			    connp->conn_ports) &&
			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
			    tconnp->conn_tcp->tcp_bound_if == ifindex)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* May already be bound as a raw socket; unhash first. */
		IPCL_HASH_REMOVE(connp);
		/* The local port is the low 16 bits of `ports'. */
		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		up = (uint16_t *)&ports;
		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(up[1], ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Connected > bound > wildcard, by address specificity */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1448
1449/*
1450 * v4 packet classifying function. looks up the fanout table to
1451 * find the conn, the packet belongs to. returns the conn with
1452 * the reference held, null otherwise.
1453 *
1454 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1455 * Lookup" comment block are applied.  Labels are also checked as described
1456 * above.  If the packet is from the inside (looped back), and is from the same
1457 * zone, then label checks are omitted.
1458 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	boolean_t shared_addr;
	boolean_t unlabeled;

	ipha = (ipha_t *)mp->b_rptr;
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* First try a fully-bound (4-tuple) match on the conn hash */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No full match; fall back to a listener (bind hash) lookup */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_label.
		 */
		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
				DTRACE_PROBE3(
				    tx__ip__log__info__classify__tcp,
				    char *,
				    "connp(1) could not receive mp(2)",
				    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred cannot be null on IPv4 */
		if (is_system_labeled())
			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
			    TSLF_UNLABELED) != 0;
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}
		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}
1645
/*
 * v6 packet classifying function; the IPv6 counterpart of
 * ipcl_classify_v4().  Returns the matching conn with a reference
 * held, NULL otherwise.  The same ALL_ZONES / label-checking rules
 * apply, except that on IPv6 the mblk may carry no credential.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcph_t		*tcph;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	boolean_t	shared_addr;
	boolean_t	unlabeled;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* First try a fully-bound (4-tuple) match on the conn hash */
		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
		up = (uint16_t *)tcph->th_lport;
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);

		/* No full match; fall back to a listener (bind hash) lookup */
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			IPCL_DEBUG_LVL(512,
			    ("ipcl_classify_v6: found listner "
			    "connp = %p\n", (void *)connp));

			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);

		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
		    (void *)mp));
		break;

	case IPPROTO_UDP:
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		unlabeled = B_FALSE;
		/* Cred can be null on IPv6 */
		if (is_system_labeled()) {
			cred_t *cr = DB_CRED(mp);

			unlabeled = (cr != NULL &&
			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
		}
		shared_addr = (zoneid == ALL_ZONES);
		if (shared_addr) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid = tsol_mlp_findzone(protocol, lport);
			/*
			 * If no shared MLP is found, tsol_mlp_findzone returns
			 * ALL_ZONES.  In that case, we assume it's SLP, and
			 * search for the zone based on the packet label.
			 *
			 * If there is such a zone, we prefer to find a
			 * connection in it.  Otherwise, we look for a
			 * MAC-exempt connection in any zone whose label
			 * dominates the default label on the packet.
			 */
			if (zoneid == ALL_ZONES)
				zoneid = tsol_packet_to_zoneid(mp);
			else
				unlabeled = B_FALSE;
		}

		fport = up[0];
		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
		    fport));
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (IPCL_ZONE_MATCH(connp, zoneid) ||
			    (unlabeled && connp->conn_mac_exempt)))
				break;
		}

		if (connp != NULL && is_system_labeled() &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    shared_addr, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		IPCL_DEBUG_LVL(512,
		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
		    lport, fport));
		break;
	}

	return (NULL);
}
1841
1842/*
1843 * wrapper around ipcl_classify_(v4,v6) routines.
1844 */
1845conn_t *
1846ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
1847{
1848	uint16_t	hdr_len;
1849	ipha_t		*ipha;
1850	uint8_t		*nexthdrp;
1851
1852	if (MBLKL(mp) < sizeof (ipha_t))
1853		return (NULL);
1854
1855	switch (IPH_HDR_VERSION(mp->b_rptr)) {
1856	case IPV4_VERSION:
1857		ipha = (ipha_t *)mp->b_rptr;
1858		hdr_len = IPH_HDR_LENGTH(ipha);
1859		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
1860		    zoneid, ipst));
1861	case IPV6_VERSION:
1862		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
1863		    &hdr_len, &nexthdrp))
1864			return (NULL);
1865
1866		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
1867	}
1868
1869	return (NULL);
1870}
1871
/*
 * Raw-socket classifier: find the conn on the raw fanout matching
 * `protocol' and the addresses in `hdr' (v4 or v6, chosen by the
 * header version; `ports' carries the ports with lport in the low
 * half).  The port-specific bucket is searched first — a fully-bound
 * conn must match the whole address pair, an unconnected one only the
 * local side — then the lport-0 bucket is tried for a pure wildcard
 * match.  ALL_ZONES/label handling mirrors ipcl_classify_v4().
 * Returns the conn with a reference held, or NULL.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
    uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		af;
	boolean_t	shared_addr;
	boolean_t	unlabeled;
	const void	*dst;

	lport = ((uint16_t *)&ports)[1];

	unlabeled = B_FALSE;
	/* Cred can be null on IPv6 */
	if (is_system_labeled()) {
		cred_t *cr = DB_CRED(mp);

		unlabeled = (cr != NULL &&
		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
	}
	shared_addr = (zoneid == ALL_ZONES);
	if (shared_addr) {
		/*
		 * No need to handle exclusive-stack zones since ALL_ZONES
		 * only applies to the shared stack.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
		 * the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a connection in
		 * it.  Otherwise, we look for a MAC-exempt connection in any
		 * zone whose label dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		else
			unlabeled = B_FALSE;
	}

	af = IPH_HDR_VERSION(hdr);
	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
	    (const void *)&((ip6_t *)hdr)->ip6_dst;
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION))
			continue;
		if (connp->conn_fully_bound) {
			/* Fully bound: the whole address pair must match */
			if (af == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    hdr->ipha_src, hdr->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_src,
				    ((ip6_t *)hdr)->ip6_dst, ports))
					continue;
			}
		} else {
			/* Not connected: only the local side must match */
			if (af == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    hdr->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ((ip6_t *)hdr)->ip6_dst, lport))
					continue;
			}
		}

		if (IPCL_ZONE_MATCH(connp, zoneid) ||
		    (unlabeled && connp->conn_mac_exempt))
			break;
	}
	/*
	 * If the connection is fully-bound and connection-oriented (TCP or
	 * SCTP), then we've already validated the remote system's label.
	 * There's no need to do it again for every packet.
	 */
	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
			continue;
		}
		if (af == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol,
			    ((ip6_t *)hdr)->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
2006
2007/* ARGSUSED */
2008static int
2009tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2010{
2011	itc_t	*itc = (itc_t *)buf;
2012	conn_t 	*connp = &itc->itc_conn;
2013	tcp_t	*tcp = (tcp_t *)&itc[1];
2014
2015	bzero(connp, sizeof (conn_t));
2016	bzero(tcp, sizeof (tcp_t));
2017
2018	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2019	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2020	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
2021	connp->conn_tcp = tcp;
2022	connp->conn_flags = IPCL_TCPCONN;
2023	connp->conn_ulp = IPPROTO_TCP;
2024	tcp->tcp_connp = connp;
2025	return (0);
2026}
2027
2028/* ARGSUSED */
2029static void
2030tcp_conn_destructor(void *buf, void *cdrarg)
2031{
2032	itc_t	*itc = (itc_t *)buf;
2033	conn_t 	*connp = &itc->itc_conn;
2034	tcp_t	*tcp = (tcp_t *)&itc[1];
2035
2036	ASSERT(connp->conn_flags & IPCL_TCPCONN);
2037	ASSERT(tcp->tcp_connp == connp);
2038	ASSERT(connp->conn_tcp == tcp);
2039	tcp_timermp_free(tcp);
2040	mutex_destroy(&connp->conn_lock);
2041	cv_destroy(&connp->conn_cv);
2042}
2043
2044/* ARGSUSED */
2045static int
2046ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2047{
2048	itc_t	*itc = (itc_t *)buf;
2049	conn_t 	*connp = &itc->itc_conn;
2050
2051	bzero(connp, sizeof (conn_t));
2052	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2053	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2054	connp->conn_flags = IPCL_IPCCONN;
2055
2056	return (0);
2057}
2058
2059/* ARGSUSED */
2060static void
2061ip_conn_destructor(void *buf, void *cdrarg)
2062{
2063	itc_t	*itc = (itc_t *)buf;
2064	conn_t 	*connp = &itc->itc_conn;
2065
2066	ASSERT(connp->conn_flags & IPCL_IPCCONN);
2067	ASSERT(connp->conn_priv == NULL);
2068	mutex_destroy(&connp->conn_lock);
2069	cv_destroy(&connp->conn_cv);
2070}
2071
2072/* ARGSUSED */
2073static int
2074udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2075{
2076	itc_t	*itc = (itc_t *)buf;
2077	conn_t 	*connp = &itc->itc_conn;
2078	udp_t	*udp = (udp_t *)&itc[1];
2079
2080	bzero(connp, sizeof (conn_t));
2081	bzero(udp, sizeof (udp_t));
2082
2083	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2084	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2085	connp->conn_udp = udp;
2086	connp->conn_flags = IPCL_UDPCONN;
2087	connp->conn_ulp = IPPROTO_UDP;
2088	udp->udp_connp = connp;
2089	return (0);
2090}
2091
2092/* ARGSUSED */
2093static void
2094udp_conn_destructor(void *buf, void *cdrarg)
2095{
2096	itc_t	*itc = (itc_t *)buf;
2097	conn_t 	*connp = &itc->itc_conn;
2098	udp_t	*udp = (udp_t *)&itc[1];
2099
2100	ASSERT(connp->conn_flags & IPCL_UDPCONN);
2101	ASSERT(udp->udp_connp == connp);
2102	ASSERT(connp->conn_udp == udp);
2103	mutex_destroy(&connp->conn_lock);
2104	cv_destroy(&connp->conn_cv);
2105}
2106
2107/* ARGSUSED */
2108static int
2109rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2110{
2111	itc_t	*itc = (itc_t *)buf;
2112	conn_t 	*connp = &itc->itc_conn;
2113	icmp_t	*icmp = (icmp_t *)&itc[1];
2114
2115	bzero(connp, sizeof (conn_t));
2116	bzero(icmp, sizeof (icmp_t));
2117
2118	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2119	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2120	connp->conn_icmp = icmp;
2121	connp->conn_flags = IPCL_RAWIPCONN;
2122	connp->conn_ulp = IPPROTO_ICMP;
2123	icmp->icmp_connp = connp;
2124	return (0);
2125}
2126
2127/* ARGSUSED */
2128static void
2129rawip_conn_destructor(void *buf, void *cdrarg)
2130{
2131	itc_t	*itc = (itc_t *)buf;
2132	conn_t 	*connp = &itc->itc_conn;
2133	icmp_t	*icmp = (icmp_t *)&itc[1];
2134
2135	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2136	ASSERT(icmp->icmp_connp == connp);
2137	ASSERT(connp->conn_icmp == icmp);
2138	mutex_destroy(&connp->conn_lock);
2139	cv_destroy(&connp->conn_cv);
2140}
2141
2142/* ARGSUSED */
2143static int
2144rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2145{
2146	itc_t	*itc = (itc_t *)buf;
2147	conn_t 	*connp = &itc->itc_conn;
2148	rts_t	*rts = (rts_t *)&itc[1];
2149
2150	bzero(connp, sizeof (conn_t));
2151	bzero(rts, sizeof (rts_t));
2152
2153	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2154	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2155	connp->conn_rts = rts;
2156	connp->conn_flags = IPCL_RTSCONN;
2157	rts->rts_connp = connp;
2158	return (0);
2159}
2160
2161/* ARGSUSED */
2162static void
2163rts_conn_destructor(void *buf, void *cdrarg)
2164{
2165	itc_t	*itc = (itc_t *)buf;
2166	conn_t 	*connp = &itc->itc_conn;
2167	rts_t	*rts = (rts_t *)&itc[1];
2168
2169	ASSERT(connp->conn_flags & IPCL_RTSCONN);
2170	ASSERT(rts->rts_connp == connp);
2171	ASSERT(connp->conn_rts == rts);
2172	mutex_destroy(&connp->conn_lock);
2173	cv_destroy(&connp->conn_cv);
2174}
2175
2176/*
2177 * Called as part of ipcl_conn_destroy to assert and clear any pointers
2178 * in the conn_t.
2179 */
2180void
2181ipcl_conn_cleanup(conn_t *connp)
2182{
2183	ASSERT(connp->conn_ire_cache == NULL);
2184	ASSERT(connp->conn_latch == NULL);
2185#ifdef notdef
2186	ASSERT(connp->conn_rq == NULL);
2187	ASSERT(connp->conn_wq == NULL);
2188#endif
2189	ASSERT(connp->conn_cred == NULL);
2190	ASSERT(connp->conn_g_fanout == NULL);
2191	ASSERT(connp->conn_g_next == NULL);
2192	ASSERT(connp->conn_g_prev == NULL);
2193	ASSERT(connp->conn_policy == NULL);
2194	ASSERT(connp->conn_fanout == NULL);
2195	ASSERT(connp->conn_next == NULL);
2196	ASSERT(connp->conn_prev == NULL);
2197#ifdef notdef
2198	/*
2199	 * The ill and ipif pointers are not cleared before the conn_t
2200	 * goes away since they do not hold a reference on the ill/ipif.
2201	 * We should replace these pointers with ifindex/ipaddr_t to
2202	 * make the code less complex.
2203	 */
2204	ASSERT(connp->conn_xmit_if_ill == NULL);
2205	ASSERT(connp->conn_nofailover_ill == NULL);
2206	ASSERT(connp->conn_outgoing_ill == NULL);
2207	ASSERT(connp->conn_incoming_ill == NULL);
2208	ASSERT(connp->conn_outgoing_pill == NULL);
2209	ASSERT(connp->conn_multicast_ipif == NULL);
2210	ASSERT(connp->conn_multicast_ill == NULL);
2211#endif
2212	ASSERT(connp->conn_oper_pending_ill == NULL);
2213	ASSERT(connp->conn_ilg == NULL);
2214	ASSERT(connp->conn_drain_next == NULL);
2215	ASSERT(connp->conn_drain_prev == NULL);
2216	ASSERT(connp->conn_idl == NULL);
2217	ASSERT(connp->conn_ipsec_opt_mp == NULL);
2218	ASSERT(connp->conn_peercred == NULL);
2219	ASSERT(connp->conn_netstack == NULL);
2220
2221	/* Clear out the conn_t fields that are not preserved */
2222	bzero(&connp->conn_start_clr,
2223	    sizeof (conn_t) -
2224	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2225
2226}
2227
2228/*
2229 * All conns are inserted in a global multi-list for the benefit of
2230 * walkers. The walk is guaranteed to walk all open conns at the time
2231 * of the start of the walk exactly once. This property is needed to
2232 * achieve some cleanups during unplumb of interfaces. This is achieved
2233 * as follows.
2234 *
2235 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2236 * call the insert and delete functions below at creation and deletion
2237 * time respectively. The conn never moves or changes its position in this
2238 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2239 * won't increase due to walkers, once the conn deletion has started. Note
2240 * that we can't remove the conn from the global list and then wait for
2241 * the refcnt to drop to zero, since walkers would then see a truncated
2242 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2243 * conns until ip_open is ready to make them globally visible.
2244 * The global round robin multi-list locks are held only to get the
2245 * next member/insertion/deletion and contention should be negligible
2246 * if the multi-list is much greater than the number of cpus.
2247 */
2248void
2249ipcl_globalhash_insert(conn_t *connp)
2250{
2251	int	index;
2252	struct connf_s	*connfp;
2253	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
2254
2255	/*
2256	 * No need for atomic here. Approximate even distribution
2257	 * in the global lists is sufficient.
2258	 */
2259	ipst->ips_conn_g_index++;
2260	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2261
2262	connp->conn_g_prev = NULL;
2263	/*
2264	 * Mark as INCIPIENT, so that walkers will ignore this
2265	 * for now, till ip_open is ready to make it visible globally.
2266	 */
2267	connp->conn_state_flags |= CONN_INCIPIENT;
2268
2269	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2270	/* Insert at the head of the list */
2271	mutex_enter(&connfp->connf_lock);
2272	connp->conn_g_next = connfp->connf_head;
2273	if (connp->conn_g_next != NULL)
2274		connp->conn_g_next->conn_g_prev = connp;
2275	connfp->connf_head = connp;
2276
2277	/* The fanout bucket this conn points to */
2278	connp->conn_g_fanout = connfp;
2279
2280	mutex_exit(&connfp->connf_lock);
2281}
2282
/*
 * Remove connp from the global multi-list.  conn_g_fanout (set by
 * ipcl_globalhash_insert) identifies the bucket; a NULL value means
 * the conn was never inserted and there is nothing to do.
 */
void
ipcl_globalhash_remove(conn_t *connp)
{
	struct connf_s	*connfp;

	/*
	 * We were never inserted in the global multi list.
	 * IPCL_NONE variety is never inserted in the global multilist
	 * since it is presumed to not need any cleanup and is transient.
	 */
	if (connp->conn_g_fanout == NULL)
		return;

	connfp = connp->conn_g_fanout;
	mutex_enter(&connfp->connf_lock);
	/* Standard doubly-linked-list unlink under the bucket lock */
	if (connp->conn_g_prev != NULL)
		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
	else
		connfp->connf_head = connp->conn_g_next;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
	mutex_exit(&connfp->connf_lock);

	/* Better to stumble on a null pointer than to corrupt memory */
	connp->conn_g_next = NULL;
	connp->conn_g_prev = NULL;
	connp->conn_g_fanout = NULL;
}
2311
2312/*
2313 * Walk the list of all conn_t's in the system, calling the function provided
2314 * with the specified argument for each.
2315 * Applies to both IPv4 and IPv6.
2316 *
2317 * IPCs may hold pointers to ipif/ill. To guard against stale pointers
2318 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2319 * unplumbed or removed. New conn_t's that are created while we are walking
2320 * may be missed by this walk, because they are not necessarily inserted
2321 * at the tail of the list. They are new conn_t's and thus don't have any
2322 * stale pointers. The CONN_CLOSING flag ensures that no new reference
2323 * is created to the struct that is going away.
2324 */
2325void
2326ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2327{
2328	int	i;
2329	conn_t	*connp;
2330	conn_t	*prev_connp;
2331
2332	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2333		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2334		prev_connp = NULL;
2335		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2336		while (connp != NULL) {
2337			mutex_enter(&connp->conn_lock);
2338			if (connp->conn_state_flags &
2339			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
2340				mutex_exit(&connp->conn_lock);
2341				connp = connp->conn_g_next;
2342				continue;
2343			}
2344			CONN_INC_REF_LOCKED(connp);
2345			mutex_exit(&connp->conn_lock);
2346			mutex_exit(
2347			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2348			(*func)(connp, arg);
2349			if (prev_connp != NULL)
2350				CONN_DEC_REF(prev_connp);
2351			mutex_enter(
2352			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2353			prev_connp = connp;
2354			connp = connp->conn_g_next;
2355		}
2356		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2357		if (prev_connp != NULL)
2358			CONN_DEC_REF(prev_connp);
2359	}
2360}
2361
2362/*
2363 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2364 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2365 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2366 * (peer tcp in ESTABLISHED state).
2367 */
2368conn_t *
2369ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
2370    ip_stack_t *ipst)
2371{
2372	uint32_t ports;
2373	uint16_t *pports = (uint16_t *)&ports;
2374	connf_t	*connfp;
2375	conn_t	*tconnp;
2376	boolean_t zone_chk;
2377
2378	/*
2379	 * If either the source of destination address is loopback, then
2380	 * both endpoints must be in the same Zone.  Otherwise, both of
2381	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2382	 * state) and the endpoints may reside in different Zones.
2383	 */
2384	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2385	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2386
2387	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2388	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2389
2390	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2391	    ports, ipst)];
2392
2393	mutex_enter(&connfp->connf_lock);
2394	for (tconnp = connfp->connf_head; tconnp != NULL;
2395	    tconnp = tconnp->conn_next) {
2396
2397		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2398		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2399		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2400		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2401
2402			ASSERT(tconnp != connp);
2403			CONN_INC_REF(tconnp);
2404			mutex_exit(&connfp->connf_lock);
2405			return (tconnp);
2406		}
2407	}
2408	mutex_exit(&connfp->connf_lock);
2409	return (NULL);
2410}
2411
2412/*
2413 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2414 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2415 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2416 * (peer tcp in ESTABLISHED state).
2417 */
2418conn_t *
2419ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
2420    ip_stack_t *ipst)
2421{
2422	uint32_t ports;
2423	uint16_t *pports = (uint16_t *)&ports;
2424	connf_t	*connfp;
2425	conn_t	*tconnp;
2426	boolean_t zone_chk;
2427
2428	/*
2429	 * If either the source of destination address is loopback, then
2430	 * both endpoints must be in the same Zone.  Otherwise, both of
2431	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2432	 * state) and the endpoints may reside in different Zones.  We
2433	 * don't do Zone check for link local address(es) because the
2434	 * current Zone implementation treats each link local address as
2435	 * being unique per system node, i.e. they belong to global Zone.
2436	 */
2437	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2438	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2439
2440	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2441	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2442
2443	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2444	    ports, ipst)];
2445
2446	mutex_enter(&connfp->connf_lock);
2447	for (tconnp = connfp->connf_head; tconnp != NULL;
2448	    tconnp = tconnp->conn_next) {
2449
2450		/* We skip tcp_bound_if check here as this is loopback tcp */
2451		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2452		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2453		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2454		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2455
2456			ASSERT(tconnp != connp);
2457			CONN_INC_REF(tconnp);
2458			mutex_exit(&connfp->connf_lock);
2459			return (tconnp);
2460		}
2461	}
2462	mutex_exit(&connfp->connf_lock);
2463	return (NULL);
2464}
2465
2466/*
2467 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2468 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2469 * Only checks for connected entries i.e. no INADDR_ANY checks.
2470 */
2471conn_t *
2472ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
2473    ip_stack_t *ipst)
2474{
2475	uint32_t ports;
2476	uint16_t *pports;
2477	connf_t	*connfp;
2478	conn_t	*tconnp;
2479
2480	pports = (uint16_t *)&ports;
2481	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2482	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2483
2484	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2485	    ports, ipst)];
2486
2487	mutex_enter(&connfp->connf_lock);
2488	for (tconnp = connfp->connf_head; tconnp != NULL;
2489	    tconnp = tconnp->conn_next) {
2490
2491		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2492		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2493		    tconnp->conn_tcp->tcp_state >= min_state) {
2494
2495			CONN_INC_REF(tconnp);
2496			mutex_exit(&connfp->connf_lock);
2497			return (tconnp);
2498		}
2499	}
2500	mutex_exit(&connfp->connf_lock);
2501	return (NULL);
2502}
2503
2504/*
2505 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2506 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2507 * Only checks for connected entries i.e. no INADDR_ANY checks.
2508 * Match on ifindex in addition to addresses.
2509 */
2510conn_t *
2511ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2512    uint_t ifindex, ip_stack_t *ipst)
2513{
2514	tcp_t	*tcp;
2515	uint32_t ports;
2516	uint16_t *pports;
2517	connf_t	*connfp;
2518	conn_t	*tconnp;
2519
2520	pports = (uint16_t *)&ports;
2521	pports[0] = tcpha->tha_fport;
2522	pports[1] = tcpha->tha_lport;
2523
2524	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2525	    ports, ipst)];
2526
2527	mutex_enter(&connfp->connf_lock);
2528	for (tconnp = connfp->connf_head; tconnp != NULL;
2529	    tconnp = tconnp->conn_next) {
2530
2531		tcp = tconnp->conn_tcp;
2532		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2533		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2534		    tcp->tcp_state >= min_state &&
2535		    (tcp->tcp_bound_if == 0 ||
2536		    tcp->tcp_bound_if == ifindex)) {
2537
2538			CONN_INC_REF(tconnp);
2539			mutex_exit(&connfp->connf_lock);
2540			return (tconnp);
2541		}
2542	}
2543	mutex_exit(&connfp->connf_lock);
2544	return (NULL);
2545}
2546
2547/*
2548 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2549 * a listener when changing state.
2550 */
2551conn_t *
2552ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2553    ip_stack_t *ipst)
2554{
2555	connf_t		*bind_connfp;
2556	conn_t		*connp;
2557	tcp_t		*tcp;
2558
2559	/*
2560	 * Avoid false matches for packets sent to an IP destination of
2561	 * all zeros.
2562	 */
2563	if (laddr == 0)
2564		return (NULL);
2565
2566	ASSERT(zoneid != ALL_ZONES);
2567
2568	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2569	mutex_enter(&bind_connfp->connf_lock);
2570	for (connp = bind_connfp->connf_head; connp != NULL;
2571	    connp = connp->conn_next) {
2572		tcp = connp->conn_tcp;
2573		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2574		    IPCL_ZONE_MATCH(connp, zoneid) &&
2575		    (tcp->tcp_listener == NULL)) {
2576			CONN_INC_REF(connp);
2577			mutex_exit(&bind_connfp->connf_lock);
2578			return (connp);
2579		}
2580	}
2581	mutex_exit(&bind_connfp->connf_lock);
2582	return (NULL);
2583}
2584
2585/*
2586 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2587 * a listener when changing state.
2588 */
2589conn_t *
2590ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2591    zoneid_t zoneid, ip_stack_t *ipst)
2592{
2593	connf_t		*bind_connfp;
2594	conn_t		*connp = NULL;
2595	tcp_t		*tcp;
2596
2597	/*
2598	 * Avoid false matches for packets sent to an IP destination of
2599	 * all zeros.
2600	 */
2601	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2602		return (NULL);
2603
2604	ASSERT(zoneid != ALL_ZONES);
2605
2606	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2607	mutex_enter(&bind_connfp->connf_lock);
2608	for (connp = bind_connfp->connf_head; connp != NULL;
2609	    connp = connp->conn_next) {
2610		tcp = connp->conn_tcp;
2611		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2612		    IPCL_ZONE_MATCH(connp, zoneid) &&
2613		    (tcp->tcp_bound_if == 0 ||
2614		    tcp->tcp_bound_if == ifindex) &&
2615		    tcp->tcp_listener == NULL) {
2616			CONN_INC_REF(connp);
2617			mutex_exit(&bind_connfp->connf_lock);
2618			return (connp);
2619		}
2620	}
2621	mutex_exit(&bind_connfp->connf_lock);
2622	return (NULL);
2623}
2624
2625/*
2626 * ipcl_get_next_conn
2627 *	get the next entry in the conn global list
2628 *	and put a reference on the next_conn.
2629 *	decrement the reference on the current conn.
2630 *
2631 * This is an iterator based walker function that also provides for
2632 * some selection by the caller. It walks through the conn_hash bucket
2633 * searching for the next valid connp in the list, and selects connections
2634 * that are neither closed nor condemned. It also REFHOLDS the conn
2635 * thus ensuring that the conn exists when the caller uses the conn.
2636 */
2637conn_t *
2638ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2639{
2640	conn_t	*next_connp;
2641
2642	if (connfp == NULL)
2643		return (NULL);
2644
2645	mutex_enter(&connfp->connf_lock);
2646
2647	next_connp = (connp == NULL) ?
2648	    connfp->connf_head : connp->conn_g_next;
2649
2650	while (next_connp != NULL) {
2651		mutex_enter(&next_connp->conn_lock);
2652		if (!(next_connp->conn_flags & conn_flags) ||
2653		    (next_connp->conn_state_flags &
2654		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2655			/*
2656			 * This conn has been condemned or
2657			 * is closing, or the flags don't match
2658			 */
2659			mutex_exit(&next_connp->conn_lock);
2660			next_connp = next_connp->conn_g_next;
2661			continue;
2662		}
2663		CONN_INC_REF_LOCKED(next_connp);
2664		mutex_exit(&next_connp->conn_lock);
2665		break;
2666	}
2667
2668	mutex_exit(&connfp->connf_lock);
2669
2670	if (connp != NULL)
2671		CONN_DEC_REF(connp);
2672
2673	return (next_connp);
2674}
2675
2676#ifdef CONN_DEBUG
2677/*
2678 * Trace of the last NBUF refhold/refrele
2679 */
2680int
2681conn_trace_ref(conn_t *connp)
2682{
2683	int	last;
2684	conn_trace_t	*ctb;
2685
2686	ASSERT(MUTEX_HELD(&connp->conn_lock));
2687	last = connp->conn_trace_last;
2688	last++;
2689	if (last == CONN_TRACE_MAX)
2690		last = 0;
2691
2692	ctb = &connp->conn_trace_buf[last];
2693	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2694	connp->conn_trace_last = last;
2695	return (1);
2696}
2697
2698int
2699conn_untrace_ref(conn_t *connp)
2700{
2701	int	last;
2702	conn_trace_t	*ctb;
2703
2704	ASSERT(MUTEX_HELD(&connp->conn_lock));
2705	last = connp->conn_trace_last;
2706	last++;
2707	if (last == CONN_TRACE_MAX)
2708		last = 0;
2709
2710	ctb = &connp->conn_trace_buf[last];
2711	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2712	connp->conn_trace_last = last;
2713	return (1);
2714}
2715#endif
2716