ipclassifier.c revision 2263:fd48046384d0
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28const char ipclassifier_version[] = "@(#)ipclassifier.c	%I%	%E% SMI";
29
30/*
31 * IP PACKET CLASSIFIER
32 *
33 * The IP packet classifier provides mapping between IP packets and persistent
34 * connection state for connection-oriented protocols. It also provides
35 * interface for managing connection states.
36 *
37 * The connection state is kept in conn_t data structure and contains, among
38 * other things:
39 *
40 *	o local/remote address and ports
41 *	o Transport protocol
42 *	o squeue for the connection (for TCP only)
43 *	o reference counter
44 *	o Connection state
45 *	o hash table linkage
46 *	o interface/ire information
47 *	o credentials
48 *	o ipsec policy
49 *	o send and receive functions.
50 *	o mutex lock.
51 *
52 * Connections use a reference counting scheme. They are freed when the
53 * reference counter drops to zero. A reference is incremented when connection
54 * is placed in a list or table, when incoming packet for the connection arrives
55 * and when connection is processed via squeue (squeue processing may be
56 * asynchronous and the reference protects the connection from being destroyed
57 * before its processing is finished).
58 *
59 * send and receive functions are currently used for TCP only. The send function
60 * determines the IP entry point for the packet once it leaves TCP to be sent to
61 * the destination address. The receive function is used by IP when the packet
62 * should be passed for TCP processing. When a new connection is created these
63 * are set to ip_output() and tcp_input() respectively. During the lifetime of
64 * the connection the send and receive functions may change depending on the
65 * changes in the connection state. For example, once the connection is bound to
66 * an address, the receive function for this connection is set to
67 * tcp_conn_request().  This allows incoming SYNs to go directly into the
68 * listener SYN processing function without going to tcp_input() first.
69 *
70 * Classifier uses several hash tables:
71 *
72 * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
73 *	ipcl_bind_fanout:	contains all connections in BOUND state
74 *	ipcl_proto_fanout:	IPv4 protocol fanout
75 *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
76 *	ipcl_udp_fanout:	contains all UDP connections
77 *	ipcl_globalhash_fanout:	contains all connections
78 *
79 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
80 * which need to view all existing connections.
81 *
82 * All tables are protected by per-bucket locks. When both per-bucket lock and
83 * connection lock need to be held, the per-bucket lock should be acquired
84 * first, followed by the connection lock.
85 *
86 * All functions doing search in one of these tables increment a reference
87 * counter on the connection found (if any). This reference should be dropped
88 * when the caller has finished processing the connection.
89 *
90 *
91 * INTERFACES:
92 * ===========
93 *
94 * Connection Lookup:
95 * ------------------
96 *
97 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid)
98 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid)
99 *
100 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
101 * it can't find any associated connection. If the connection is found, its
102 * reference counter is incremented.
103 *
104 *	mp:	mblock, containing packet header. The full header should fit
105 *		into a single mblock. It should also contain at least full IP
106 *		and TCP or UDP header.
107 *
108 *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
109 *
110 *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
111 *		 the packet.
112 *
113 * 	zoneid: The zone in which the returned connection must be; the zoneid
114 *		corresponding to the ire_zoneid on the IRE located for the
115 *		packet's destination address.
116 *
117 *	For TCP connections, the lookup order is as follows:
118 *		5-tuple {src, dst, protocol, local port, remote port}
119 *			lookup in ipcl_conn_fanout table.
120 *		3-tuple {dst, remote port, protocol} lookup in
121 *			ipcl_bind_fanout table.
122 *
123 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
124 *	remote port} lookup is done on ipcl_udp_fanout. Note that,
125 *	these interfaces do not handle cases where a packet belongs
126 *	to multiple UDP clients, which is handled in IP itself.
127 *
128 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
129 * determine which actual zone gets the segment.  This is used only in a
130 * labeled environment.  The matching rules are:
131 *
132 *	- If it's not a multilevel port, then the label on the packet selects
133 *	  the zone.  Unlabeled packets are delivered to the global zone.
134 *
135 *	- If it's a multilevel port, then only the zone registered to receive
136 *	  packets on that port matches.
137 *
138 * Also, in a labeled environment, packet labels need to be checked.  For fully
139 * bound TCP connections, we can assume that the packet label was checked
140 * during connection establishment, and doesn't need to be checked on each
141 * packet.  For others, though, we need to check for strict equality or, for
142 * multilevel ports, membership in the range or set.  This part currently does
143 * a tnrh lookup on each packet, but could be optimized to use cached results
144 * if that were necessary.  (SCTP doesn't come through here, but if it did,
145 * we would apply the same rules as TCP.)
146 *
147 * An implication of the above is that fully-bound TCP sockets must always use
148 * distinct 4-tuples; they can't be discriminated by label alone.
149 *
150 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
151 * as there's no connection set-up handshake and no shared state.
152 *
153 * Labels on looped-back packets within a single zone do not need to be
154 * checked, as all processes in the same zone have the same label.
155 *
156 * Finally, for unlabeled packets received by a labeled system, special rules
157 * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
158 * socket in the zone whose label matches the default label of the sender, if
159 * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
160 * receiver's label must dominate the sender's default label.
161 *
162 * conn_t	*ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int);
163 * conn_t	*ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t);
164 *
165 *	Lookup routine to find an exact match for {src, dst, local port,
166 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
167 *	ports are read from the IP and TCP header respectively.
168 *
169 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol);
170 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex);
171 *
172 * 	Lookup routine to find a listener with the tuple {lport, laddr,
173 * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
174 * 	parameter interface index is also compared.
175 *
176 * void ipcl_walk(func, arg)
177 *
178 * 	Apply 'func' to every connection available. The 'func' is called as
179 *	(*func)(connp, arg). The walk is non-atomic so connections may be
180 *	created and destroyed during the walk. The CONN_CONDEMNED and
181 *	CONN_INCIPIENT flags ensure that connections which are newly created
182 *	or being destroyed are not selected by the walker.
183 *
184 * Table Updates
185 * -------------
186 *
187 * int ipcl_conn_insert(connp, protocol, src, dst, ports)
188 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
189 *
190 *	Insert 'connp' in the ipcl_conn_fanout.
191 *	Arguments :
192 *		connp		conn_t to be inserted
193 *		protocol	connection protocol
194 *		src		source address
195 *		dst		destination address
196 *		ports		local and remote port
197 *		ifindex		interface index for IPv6 connections
198 *
199 *	Return value :
200 *		0		if connp was inserted
201 *		EADDRINUSE	if the connection with the same tuple
202 *				already exists.
203 *
204 * int ipcl_bind_insert(connp, protocol, src, lport);
205 * int ipcl_bind_insert_v6(connp, protocol, src, lport);
206 *
207 * 	Insert 'connp' in ipcl_bind_fanout.
208 * 	Arguments :
209 * 		connp		conn_t to be inserted
210 * 		protocol	connection protocol
211 * 		src		source address connection wants
212 * 				to bind to
213 * 		lport		local port connection wants to
214 * 				bind to
215 *
216 *
217 * void ipcl_hash_remove(connp);
218 *
219 * 	Removes the 'connp' from the connection fanout table.
220 *
221 * Connection Creation/Destruction
222 * -------------------------------
223 *
224 * conn_t *ipcl_conn_create(type, sleep)
225 *
226 * 	Creates a new conn based on the type flag, inserts it into
227 * 	globalhash table.
228 *
229 *	type:	This flag determines the type of conn_t which needs to be
230 *		created.
231 *		IPCL_TCPCONN	indicates a TCP connection
232 *		IPCL_IPCONN	indicates all non-TCP connections.
233 *
234 * void ipcl_conn_destroy(connp)
235 *
236 * 	Destroys the connection state, removes it from the global
237 * 	connection hash table and frees its memory.
238 */
239
240#include <sys/types.h>
241#include <sys/stream.h>
242#include <sys/stropts.h>
243#include <sys/sysmacros.h>
244#include <sys/strsubr.h>
245#include <sys/strsun.h>
246#define	_SUN_TPI_VERSION 2
247#include <sys/ddi.h>
248#include <sys/cmn_err.h>
249#include <sys/debug.h>
250
251#include <sys/systm.h>
252#include <sys/param.h>
253#include <sys/kmem.h>
254#include <sys/isa_defs.h>
255#include <inet/common.h>
256#include <netinet/ip6.h>
257#include <netinet/icmp6.h>
258
259#include <inet/ip.h>
260#include <inet/ip6.h>
261#include <inet/tcp.h>
262#include <inet/ip_ndp.h>
263#include <inet/udp_impl.h>
264#include <inet/sctp_ip.h>
265
266#include <sys/cpuvar.h>
267
268#include <inet/ipclassifier.h>
269#include <inet/ipsec_impl.h>
270
271#include <sys/tsol/tnet.h>
272
/*
 * Debug tracing support.  IPCL_DEBUG is defined only for DEBUG builds.  In
 * that case IPCL_DEBUG_LVL(level, args) calls printf with the parenthesized
 * argument list 'args' whenever a bit of 'level' is set in ipcl_debug_level.
 * In non-DEBUG builds the macro expands to an empty block and 'args' is
 * never evaluated.
 */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

#ifdef	IPCL_DEBUG
int	ipcl_debug_level = 0;
/* 'level' is parenthesized so that compound masks such as (1 | 4) work. */
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level & (level)) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
286connf_t	*ipcl_conn_fanout;
287connf_t	*ipcl_bind_fanout;
288connf_t	ipcl_proto_fanout[IPPROTO_MAX + 1];
289connf_t	ipcl_proto_fanout_v6[IPPROTO_MAX + 1];
290connf_t	*ipcl_udp_fanout;
291
292/* A separate hash list for raw socket. */
293connf_t *ipcl_raw_fanout;
294
295connf_t rts_clients;
296
297/* Old value for compatibility */
298uint_t tcp_conn_hash_size = 0;
299
300/* New value. Zero means choose automatically. */
301uint_t ipcl_conn_hash_size = 0;
302uint_t ipcl_conn_hash_memfactor = 8192;
303uint_t ipcl_conn_hash_maxsize = 82500;
304
305uint_t ipcl_conn_fanout_size = 0;
306
307
308/* bind/udp fanout table size */
309uint_t ipcl_bind_fanout_size = 512;
310uint_t ipcl_udp_fanout_size = 16384;
311
312/* Raw socket fanout size.  Must be a power of 2. */
313uint_t ipcl_raw_fanout_size = 256;
314
/*
 * Table of primes useful for sizing hash tables, indexed by N for N of 0-28:
 * each entry is the nearest prime <= 2^N - 2^(N-2).  Entries 0-2 are zero,
 * and the table ends with a zero entry that ipcl_init() treats as "out of
 * range".
 */
#define	P2Ps() {						\
	0, 0, 0, 5, 11, 23, 47, 89,				\
	191, 383, 761, 1531, 3067, 6143, 12281, 24571,		\
	49139, 98299, 196597, 393209, 786431, 1572853,		\
	3145721, 6291449, 12582893, 25165813, 50331599,		\
	100663291, 201326557, 0 }
324
325/*
326 * wrapper structure to ensure that conn+tcpb are aligned
327 * on cache lines.
328 */
329typedef struct itc_s {
330	union {
331		conn_t	itcu_conn;
332		char	itcu_filler[CACHE_ALIGN(conn_s)];
333	}	itc_u;
334	tcp_t	itc_tcp;
335} itc_t;
336
337#define	itc_conn	itc_u.itcu_conn
338
339struct kmem_cache  *ipcl_tcpconn_cache;
340struct kmem_cache  *ipcl_tcp_cache;
341struct kmem_cache  *ipcl_conn_cache;
342extern struct kmem_cache  *sctp_conn_cache;
343extern struct kmem_cache  *tcp_sack_info_cache;
344extern struct kmem_cache  *tcp_iphc_cache;
345
346extern void	tcp_timermp_free(tcp_t *);
347extern mblk_t	*tcp_timermp_alloc(int);
348
349static int	ipcl_tcpconn_constructor(void *, void *, int);
350static void	ipcl_tcpconn_destructor(void *, void *);
351
352static int conn_g_index;
353connf_t	*ipcl_globalhash_fanout;
354
355#ifdef	IPCL_DEBUG
#define	INET_NTOA_BUFSIZE	18

/*
 * Format the IPv4 address 'in' as a dotted-decimal string in 'b' and return
 * 'b'.  The four octets are printed in memory order (so 'in' is presumably
 * expected in network byte order).  'b' must provide at least
 * INET_NTOA_BUFSIZE bytes; the write is bounded to that size instead of
 * relying on an unchecked sprintf().
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	const unsigned char	*octets = (const unsigned char *)&in;

	(void) snprintf(b, INET_NTOA_BUFSIZE, "%d.%d.%d.%d",
	    octets[0], octets[1], octets[2], octets[3]);
	return (b);
}
367#endif
368
369/*
370 * ipclassifier intialization routine, sets up hash tables and
371 * conn caches.
372 */
373void
374ipcl_init(void)
375{
376	int i;
377	int sizes[] = P2Ps();
378
379	ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache",
380	    sizeof (conn_t), CACHE_ALIGN_SIZE,
381	    NULL, NULL, NULL, NULL, NULL, 0);
382
383	ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache",
384	    sizeof (itc_t), CACHE_ALIGN_SIZE,
385	    ipcl_tcpconn_constructor, ipcl_tcpconn_destructor,
386	    NULL, NULL, NULL, 0);
387
388	/*
389	 * Calculate size of conn fanout table.
390	 */
391	if (ipcl_conn_hash_size != 0) {
392		ipcl_conn_fanout_size = ipcl_conn_hash_size;
393	} else if (tcp_conn_hash_size != 0) {
394		ipcl_conn_fanout_size = tcp_conn_hash_size;
395	} else {
396		extern pgcnt_t freemem;
397
398		ipcl_conn_fanout_size =
399		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
400
401		if (ipcl_conn_fanout_size > ipcl_conn_hash_maxsize)
402			ipcl_conn_fanout_size = ipcl_conn_hash_maxsize;
403	}
404
405	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
406		if (sizes[i] >= ipcl_conn_fanout_size) {
407			break;
408		}
409	}
410	if ((ipcl_conn_fanout_size = sizes[i]) == 0) {
411		/* Out of range, use the 2^16 value */
412		ipcl_conn_fanout_size = sizes[16];
413	}
414	ipcl_conn_fanout = (connf_t *)kmem_zalloc(ipcl_conn_fanout_size *
415	    sizeof (*ipcl_conn_fanout), KM_SLEEP);
416
417	for (i = 0; i < ipcl_conn_fanout_size; i++) {
418		mutex_init(&ipcl_conn_fanout[i].connf_lock, NULL,
419		    MUTEX_DEFAULT, NULL);
420	}
421
422	ipcl_bind_fanout = (connf_t *)kmem_zalloc(ipcl_bind_fanout_size *
423	    sizeof (*ipcl_bind_fanout), KM_SLEEP);
424
425	for (i = 0; i < ipcl_bind_fanout_size; i++) {
426		mutex_init(&ipcl_bind_fanout[i].connf_lock, NULL,
427		    MUTEX_DEFAULT, NULL);
428	}
429
430	for (i = 0; i < A_CNT(ipcl_proto_fanout); i++) {
431		mutex_init(&ipcl_proto_fanout[i].connf_lock, NULL,
432		    MUTEX_DEFAULT, NULL);
433	}
434	for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++) {
435		mutex_init(&ipcl_proto_fanout_v6[i].connf_lock, NULL,
436		    MUTEX_DEFAULT, NULL);
437	}
438
439	mutex_init(&rts_clients.connf_lock, NULL, MUTEX_DEFAULT, NULL);
440
441	ipcl_udp_fanout = (connf_t *)kmem_zalloc(ipcl_udp_fanout_size *
442	    sizeof (*ipcl_udp_fanout), KM_SLEEP);
443
444	for (i = 0; i < ipcl_udp_fanout_size; i++) {
445		mutex_init(&ipcl_udp_fanout[i].connf_lock, NULL,
446		    MUTEX_DEFAULT, NULL);
447	}
448
449	ipcl_raw_fanout = (connf_t *)kmem_zalloc(ipcl_raw_fanout_size *
450	    sizeof (*ipcl_raw_fanout), KM_SLEEP);
451
452	for (i = 0; i < ipcl_raw_fanout_size; i++) {
453		mutex_init(&ipcl_raw_fanout[i].connf_lock, NULL,
454		    MUTEX_DEFAULT, NULL);
455	}
456
457	ipcl_globalhash_fanout = (connf_t *)kmem_zalloc(sizeof (connf_t) *
458	    CONN_G_HASH_SIZE, KM_SLEEP);
459
460	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
461		mutex_init(&ipcl_globalhash_fanout[i].connf_lock, NULL,
462		    MUTEX_DEFAULT, NULL);
463	}
464}
465
466void
467ipcl_destroy(void)
468{
469	int i;
470	kmem_cache_destroy(ipcl_conn_cache);
471	kmem_cache_destroy(ipcl_tcpconn_cache);
472	for (i = 0; i < ipcl_conn_fanout_size; i++)
473		mutex_destroy(&ipcl_conn_fanout[i].connf_lock);
474	kmem_free(ipcl_conn_fanout, ipcl_conn_fanout_size *
475	    sizeof (*ipcl_conn_fanout));
476	for (i = 0; i < ipcl_bind_fanout_size; i++)
477		mutex_destroy(&ipcl_bind_fanout[i].connf_lock);
478	kmem_free(ipcl_bind_fanout, ipcl_bind_fanout_size *
479	    sizeof (*ipcl_bind_fanout));
480
481	for (i = 0; i < A_CNT(ipcl_proto_fanout); i++)
482		mutex_destroy(&ipcl_proto_fanout[i].connf_lock);
483	for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++)
484		mutex_destroy(&ipcl_proto_fanout_v6[i].connf_lock);
485
486	for (i = 0; i < ipcl_udp_fanout_size; i++)
487		mutex_destroy(&ipcl_udp_fanout[i].connf_lock);
488	kmem_free(ipcl_udp_fanout, ipcl_udp_fanout_size *
489	    sizeof (*ipcl_udp_fanout));
490
491	for (i = 0; i < ipcl_raw_fanout_size; i++)
492		mutex_destroy(&ipcl_raw_fanout[i].connf_lock);
493	kmem_free(ipcl_raw_fanout, ipcl_raw_fanout_size *
494	    sizeof (*ipcl_raw_fanout));
495
496	kmem_free(ipcl_globalhash_fanout, sizeof (connf_t) * CONN_G_HASH_SIZE);
497	mutex_destroy(&rts_clients.connf_lock);
498}
499
500/*
501 * conn creation routine. initialize the conn, sets the reference
502 * and inserts it in the global hash table.
503 */
504conn_t *
505ipcl_conn_create(uint32_t type, int sleep)
506{
507	itc_t	*itc;
508	conn_t	*connp;
509
510	switch (type) {
511	case IPCL_TCPCONN:
512		if ((itc = kmem_cache_alloc(ipcl_tcpconn_cache,
513		    sleep)) == NULL)
514			return (NULL);
515		connp = &itc->itc_conn;
516		connp->conn_ref = 1;
517		IPCL_DEBUG_LVL(1,
518		    ("ipcl_conn_create: connp = %p tcp (%p)",
519		    (void *)connp, (void *)connp->conn_tcp));
520		ipcl_globalhash_insert(connp);
521		break;
522	case IPCL_SCTPCONN:
523		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
524			return (NULL);
525		connp->conn_flags = IPCL_SCTPCONN;
526		break;
527	case IPCL_IPCCONN:
528		connp = kmem_cache_alloc(ipcl_conn_cache, sleep);
529		if (connp == NULL)
530			return (NULL);
531		bzero(connp, sizeof (conn_t));
532		mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
533		cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
534		connp->conn_flags = IPCL_IPCCONN;
535		connp->conn_ref = 1;
536		IPCL_DEBUG_LVL(1,
537		    ("ipcl_conn_create: connp = %p\n", (void *)connp));
538		ipcl_globalhash_insert(connp);
539		break;
540	default:
541		connp = NULL;
542		ASSERT(0);
543	}
544
545	return (connp);
546}
547
548void
549ipcl_conn_destroy(conn_t *connp)
550{
551	mblk_t	*mp;
552
553	ASSERT(!MUTEX_HELD(&connp->conn_lock));
554	ASSERT(connp->conn_ref == 0);
555	ASSERT(connp->conn_ire_cache == NULL);
556
557	if (connp->conn_peercred != NULL &&
558	    connp->conn_peercred != connp->conn_cred)
559		crfree(connp->conn_peercred);
560	connp->conn_peercred = NULL;
561
562	if (connp->conn_cred != NULL) {
563		crfree(connp->conn_cred);
564		connp->conn_cred = NULL;
565	}
566
567	ipcl_globalhash_remove(connp);
568
569	cv_destroy(&connp->conn_cv);
570	if (connp->conn_flags & IPCL_TCPCONN) {
571		tcp_t	*tcp = connp->conn_tcp;
572
573		mutex_destroy(&connp->conn_lock);
574		ASSERT(connp->conn_tcp != NULL);
575		tcp_free(tcp);
576		mp = tcp->tcp_timercache;
577		tcp->tcp_cred = NULL;
578
579		if (tcp->tcp_sack_info != NULL) {
580			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
581			kmem_cache_free(tcp_sack_info_cache,
582			    tcp->tcp_sack_info);
583		}
584		if (tcp->tcp_iphc != NULL) {
585			if (tcp->tcp_hdr_grown) {
586				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
587			} else {
588				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
589				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
590			}
591			tcp->tcp_iphc_len = 0;
592		}
593		ASSERT(tcp->tcp_iphc_len == 0);
594
595		if (connp->conn_latch != NULL)
596			IPLATCH_REFRELE(connp->conn_latch);
597		if (connp->conn_policy != NULL)
598			IPPH_REFRELE(connp->conn_policy);
599		bzero(connp, sizeof (itc_t));
600
601		tcp->tcp_timercache = mp;
602		connp->conn_tcp = tcp;
603		connp->conn_flags = IPCL_TCPCONN;
604		connp->conn_ulp = IPPROTO_TCP;
605		tcp->tcp_connp = connp;
606		kmem_cache_free(ipcl_tcpconn_cache, connp);
607	} else if (connp->conn_flags & IPCL_SCTPCONN) {
608		sctp_free(connp);
609	} else {
610		ASSERT(connp->conn_udp == NULL);
611		mutex_destroy(&connp->conn_lock);
612		kmem_cache_free(ipcl_conn_cache, connp);
613	}
614}
615
616/*
617 * Running in cluster mode - deregister listener information
618 */
619
620static void
621ipcl_conn_unlisten(conn_t *connp)
622{
623	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
624	ASSERT(connp->conn_lport != 0);
625
626	if (cl_inet_unlisten != NULL) {
627		sa_family_t	addr_family;
628		uint8_t		*laddrp;
629
630		if (connp->conn_pkt_isv6) {
631			addr_family = AF_INET6;
632			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
633		} else {
634			addr_family = AF_INET;
635			laddrp = (uint8_t *)&connp->conn_bound_source;
636		}
637		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
638		    connp->conn_lport);
639	}
640	connp->conn_flags &= ~IPCL_CL_LISTENER;
641}
642
/*
 * Unlink 'connp' from the fanout bucket it is currently on, if any, and drop
 * the reference held for that bucket.  The conn lock must not be held; the
 * bucket lock is taken and released here.
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to), so for debugging we can still see
 * which hash table this connection was in.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*hr_connfp = (connp)->conn_fanout;			\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (hr_connfp != NULL) {					\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&hr_connfp->connf_lock);			\
		if ((connp)->conn_prev == NULL)				\
			hr_connfp->connf_head = (connp)->conn_next;	\
		else							\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&hr_connfp->connf_lock);			\
	}								\
}
673
674void
675ipcl_hash_remove(conn_t *connp)
676{
677	IPCL_HASH_REMOVE(connp);
678}
679
680/*
681 * The whole purpose of this function is allow removal of
682 * a conn_t from the connected hash for timewait reclaim.
683 * This is essentially a TW reclaim fastpath where timewait
684 * collector checks under fanout lock (so no one else can
685 * get access to the conn_t) that refcnt is 2 i.e. one for
686 * TCP and one for the classifier hash list. If ref count
687 * is indeed 2, we can just remove the conn under lock and
688 * avoid cleaning up the conn under squeue. This gives us
689 * improved performance.
690 */
691void
692ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
693{
694	ASSERT(MUTEX_HELD(&connfp->connf_lock));
695	ASSERT(MUTEX_HELD(&connp->conn_lock));
696	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
697
698	if ((connp)->conn_next != NULL) {
699		(connp)->conn_next->conn_prev =
700			(connp)->conn_prev;
701	}
702	if ((connp)->conn_prev != NULL) {
703		(connp)->conn_prev->conn_next =
704			(connp)->conn_next;
705	} else {
706		connfp->connf_head = (connp)->conn_next;
707	}
708	(connp)->conn_fanout = NULL;
709	(connp)->conn_next = NULL;
710	(connp)->conn_prev = NULL;
711	(connp)->conn_flags |= IPCL_REMOVED;
712	ASSERT((connp)->conn_ref == 2);
713	(connp)->conn_ref--;
714}
715
/*
 * Link 'connp' at the head of bucket 'connfp', mark it IPCL_CONNECTED
 * (clearing IPCL_REMOVED) and take a reference for the bucket.  The conn
 * must not be on any bucket; as the name says, the caller is expected to
 * hold the bucket lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF((connp));						\
}

/*
 * Move 'connp' to the head of bucket 'connfp': it is first removed from
 * whatever bucket it is on, then inserted under the lock of 'connfp'.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED((connfp), (connp));		\
	mutex_exit(&(connfp)->connf_lock);				\
}
739
/*
 * Move 'connp' onto bucket 'connfp' as a bound (specific local address)
 * entry.  The conn is removed from its current bucket, then linked in front
 * of the first entry whose source satisfies _IPCL_V4_MATCH_ANY(), or at the
 * tail if there is none.  The conn is marked IPCL_BOUND (clearing
 * IPCL_REMOVED) and a reference is taken for the bucket.
 *
 * Every use of the macro arguments is parenthesized so that arbitrary
 * expressions can be passed.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
768
/*
 * Move 'connp' onto bucket 'connfp' as a wildcard entry.  The conn is
 * removed from its current bucket and normally appended at the tail.  The
 * exception: if the conn's source is v4-mapped, it is inserted in front of
 * the first entry in the same zone whose source is the unspecified (v6)
 * address.  The conn is marked IPCL_BOUND (clearing IPCL_REMOVED) and a
 * reference is taken for the bucket.
 *
 * Every use of the macro arguments is parenthesized so that arbitrary
 * expressions can be passed.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    (connp)->conn_zoneid == next->conn_zoneid) {	\
			(connp)->conn_next = next;			\
			/* prev should already be next->conn_prev */	\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
800
801void
802ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
803{
804	ASSERT(!connp->conn_mac_exempt);
805	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
806}
807
808void
809ipcl_proto_insert(conn_t *connp, uint8_t protocol)
810{
811	connf_t	*connfp;
812
813	ASSERT(connp != NULL);
814	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
815	    protocol == IPPROTO_ESP);
816
817	connp->conn_ulp = protocol;
818
819	/* Insert it in the protocol hash */
820	connfp = &ipcl_proto_fanout[protocol];
821	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
822}
823
824void
825ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
826{
827	connf_t	*connfp;
828
829	ASSERT(connp != NULL);
830	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
831	    protocol == IPPROTO_ESP);
832
833	connp->conn_ulp = protocol;
834
835	/* Insert it in the Bind Hash */
836	connfp = &ipcl_proto_fanout_v6[protocol];
837	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
838}
839
840/*
841 * This function is used only for inserting SCTP raw socket now.
842 * This may change later.
843 *
844 * Note that only one raw socket can be bound to a port.  The param
845 * lport is in network byte order.
846 */
847static int
848ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
849{
850	connf_t	*connfp;
851	conn_t	*oconnp;
852
853	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))];
854
855	/* Check for existing raw socket already bound to the port. */
856	mutex_enter(&connfp->connf_lock);
857	for (oconnp = connfp->connf_head; oconnp != NULL;
858	    oconnp = oconnp->conn_next) {
859		if (oconnp->conn_lport == lport &&
860		    oconnp->conn_zoneid == connp->conn_zoneid &&
861		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
862		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
863		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
864		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
865		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
866		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
867		    &connp->conn_srcv6))) {
868			break;
869		}
870	}
871	mutex_exit(&connfp->connf_lock);
872	if (oconnp != NULL)
873		return (EADDRNOTAVAIL);
874
875	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
876	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
877		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
878		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
879			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
880		} else {
881			IPCL_HASH_INSERT_BOUND(connfp, connp);
882		}
883	} else {
884		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
885	}
886	return (0);
887}
888
889/*
890 * Check for a MAC exemption conflict on a labeled system.  Note that for
891 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
892 * transport layer.  This check is for binding all other protocols.
893 *
894 * Returns true if there's a conflict.
895 */
896static boolean_t
897check_exempt_conflict_v4(conn_t *connp)
898{
899	connf_t	*connfp;
900	conn_t *tconn;
901
902	connfp = &ipcl_proto_fanout[connp->conn_ulp];
903	mutex_enter(&connfp->connf_lock);
904	for (tconn = connfp->connf_head; tconn != NULL;
905	    tconn = tconn->conn_next) {
906		/* We don't allow v4 fallback for v6 raw socket */
907		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
908			continue;
909		/* If neither is exempt, then there's no conflict */
910		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
911			continue;
912		/* If both are bound to different specific addrs, ok */
913		if (connp->conn_src != INADDR_ANY &&
914		    tconn->conn_src != INADDR_ANY &&
915		    connp->conn_src != tconn->conn_src)
916			continue;
917		/* These two conflict; fail */
918		break;
919	}
920	mutex_exit(&connfp->connf_lock);
921	return (tconn != NULL);
922}
923
924static boolean_t
925check_exempt_conflict_v6(conn_t *connp)
926{
927	connf_t	*connfp;
928	conn_t *tconn;
929
930	connfp = &ipcl_proto_fanout[connp->conn_ulp];
931	mutex_enter(&connfp->connf_lock);
932	for (tconn = connfp->connf_head; tconn != NULL;
933	    tconn = tconn->conn_next) {
934		/* We don't allow v4 fallback for v6 raw socket */
935		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
936			continue;
937		/* If neither is exempt, then there's no conflict */
938		if (!connp->conn_mac_exempt && !tconn->conn_mac_exempt)
939			continue;
940		/* If both are bound to different addrs, ok */
941		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
942		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
943		    !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
944			continue;
945		/* These two conflict; fail */
946		break;
947	}
948	mutex_exit(&connfp->connf_lock);
949	return (tconn != NULL);
950}
951
952/*
953 * (v4, v6) bind hash insertion routines
954 */
955int
956ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
957{
958	connf_t	*connfp;
959#ifdef	IPCL_DEBUG
960	char	buf[INET_NTOA_BUFSIZE];
961#endif
962	int	ret = 0;
963
964	ASSERT(connp);
965
966	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
967	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
968
969	connp->conn_ulp = protocol;
970	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
971	connp->conn_lport = lport;
972
973	switch (protocol) {
974	default:
975		if (is_system_labeled() && check_exempt_conflict_v4(connp))
976			return (EADDRINUSE);
977		/* FALLTHROUGH */
978	case IPPROTO_UDP:
979		if (protocol == IPPROTO_UDP) {
980			IPCL_DEBUG_LVL(64,
981			    ("ipcl_bind_insert: connp %p - udp\n",
982			    (void *)connp));
983			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
984		} else {
985			IPCL_DEBUG_LVL(64,
986			    ("ipcl_bind_insert: connp %p - protocol\n",
987			    (void *)connp));
988			connfp = &ipcl_proto_fanout[protocol];
989		}
990
991		if (connp->conn_rem != INADDR_ANY) {
992			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
993		} else if (connp->conn_src != INADDR_ANY) {
994			IPCL_HASH_INSERT_BOUND(connfp, connp);
995		} else {
996			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
997		}
998		break;
999
1000	case IPPROTO_TCP:
1001
1002		/* Insert it in the Bind Hash */
1003		ASSERT(connp->conn_zoneid != ALL_ZONES);
1004		connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1005		if (connp->conn_src != INADDR_ANY) {
1006			IPCL_HASH_INSERT_BOUND(connfp, connp);
1007		} else {
1008			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1009		}
1010		if (cl_inet_listen != NULL) {
1011			ASSERT(!connp->conn_pkt_isv6);
1012			connp->conn_flags |= IPCL_CL_LISTENER;
1013			(*cl_inet_listen)(IPPROTO_TCP, AF_INET,
1014			    (uint8_t *)&connp->conn_bound_source, lport);
1015		}
1016		break;
1017
1018	case IPPROTO_SCTP:
1019		ret = ipcl_sctp_hash_insert(connp, lport);
1020		break;
1021	}
1022
1023	return (ret);
1024}
1025
1026int
1027ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1028    uint16_t lport)
1029{
1030	connf_t	*connfp;
1031	int	ret = 0;
1032
1033	ASSERT(connp);
1034
1035	connp->conn_ulp = protocol;
1036	connp->conn_srcv6 = *src;
1037	connp->conn_lport = lport;
1038
1039	switch (protocol) {
1040	default:
1041		if (is_system_labeled() && check_exempt_conflict_v6(connp))
1042			return (EADDRINUSE);
1043		/* FALLTHROUGH */
1044	case IPPROTO_UDP:
1045		if (protocol == IPPROTO_UDP) {
1046			IPCL_DEBUG_LVL(128,
1047			    ("ipcl_bind_insert_v6: connp %p - udp\n",
1048			    (void *)connp));
1049			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1050		} else {
1051			IPCL_DEBUG_LVL(128,
1052			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
1053			    (void *)connp));
1054			connfp = &ipcl_proto_fanout_v6[protocol];
1055		}
1056
1057		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1058			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1059		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1060			IPCL_HASH_INSERT_BOUND(connfp, connp);
1061		} else {
1062			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1063		}
1064		break;
1065
1066	case IPPROTO_TCP:
1067		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
1068
1069		/* Insert it in the Bind Hash */
1070		ASSERT(connp->conn_zoneid != ALL_ZONES);
1071		connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1072		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1073			IPCL_HASH_INSERT_BOUND(connfp, connp);
1074		} else {
1075			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1076		}
1077		if (cl_inet_listen != NULL) {
1078			sa_family_t	addr_family;
1079			uint8_t		*laddrp;
1080
1081			if (connp->conn_pkt_isv6) {
1082				addr_family = AF_INET6;
1083				laddrp =
1084				    (uint8_t *)&connp->conn_bound_source_v6;
1085			} else {
1086				addr_family = AF_INET;
1087				laddrp = (uint8_t *)&connp->conn_bound_source;
1088			}
1089			connp->conn_flags |= IPCL_CL_LISTENER;
1090			(*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp,
1091			    lport);
1092		}
1093		break;
1094
1095	case IPPROTO_SCTP:
1096		ret = ipcl_sctp_hash_insert(connp, lport);
1097		break;
1098	}
1099
1100	return (ret);
1101}
1102
1103/*
1104 * ipcl_conn_hash insertion routines.
1105 */
1106int
1107ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
1108    ipaddr_t rem, uint32_t ports)
1109{
1110	connf_t		*connfp;
1111	uint16_t	*up;
1112	conn_t		*tconnp;
1113#ifdef	IPCL_DEBUG
1114	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
1115#endif
1116	in_port_t	lport;
1117	int		ret = 0;
1118
1119	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
1120	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
1121	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
1122	    ports, protocol));
1123
1124	switch (protocol) {
1125	case IPPROTO_TCP:
1126		if (!(connp->conn_flags & IPCL_EAGER)) {
1127			/*
1128			 * for a eager connection, i.e connections which
1129			 * have just been created, the initialization is
1130			 * already done in ip at conn_creation time, so
1131			 * we can skip the checks here.
1132			 */
1133			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1134		}
1135		connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(connp->conn_rem,
1136		    connp->conn_ports)];
1137		mutex_enter(&connfp->connf_lock);
1138		for (tconnp = connfp->connf_head; tconnp != NULL;
1139		    tconnp = tconnp->conn_next) {
1140			if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
1141			    connp->conn_rem, connp->conn_src,
1142			    connp->conn_ports)) {
1143
1144				/* Already have a conn. bail out */
1145				mutex_exit(&connfp->connf_lock);
1146				return (EADDRINUSE);
1147			}
1148		}
1149		if (connp->conn_fanout != NULL) {
1150			/*
1151			 * Probably a XTI/TLI application trying to do a
1152			 * rebind. Let it happen.
1153			 */
1154			mutex_exit(&connfp->connf_lock);
1155			IPCL_HASH_REMOVE(connp);
1156			mutex_enter(&connfp->connf_lock);
1157		}
1158		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1159		mutex_exit(&connfp->connf_lock);
1160		break;
1161
1162	case IPPROTO_SCTP:
1163		/*
1164		 * The raw socket may have already been bound, remove it
1165		 * from the hash first.
1166		 */
1167		IPCL_HASH_REMOVE(connp);
1168		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1169		ret = ipcl_sctp_hash_insert(connp, lport);
1170		break;
1171
1172	default:
1173		/*
1174		 * Check for conflicts among MAC exempt bindings.  For
1175		 * transports with port numbers, this is done by the upper
1176		 * level per-transport binding logic.  For all others, it's
1177		 * done here.
1178		 */
1179		if (is_system_labeled() && check_exempt_conflict_v4(connp))
1180			return (EADDRINUSE);
1181		/* FALLTHROUGH */
1182
1183	case IPPROTO_UDP:
1184		up = (uint16_t *)&ports;
1185		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1186		if (protocol == IPPROTO_UDP) {
1187			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])];
1188		} else {
1189			connfp = &ipcl_proto_fanout[protocol];
1190		}
1191
1192		if (connp->conn_rem != INADDR_ANY) {
1193			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1194		} else if (connp->conn_src != INADDR_ANY) {
1195			IPCL_HASH_INSERT_BOUND(connfp, connp);
1196		} else {
1197			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1198		}
1199		break;
1200	}
1201
1202	return (ret);
1203}
1204
1205int
1206ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1207    const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
1208{
1209	connf_t		*connfp;
1210	uint16_t	*up;
1211	conn_t		*tconnp;
1212	in_port_t	lport;
1213	int		ret = 0;
1214
1215	switch (protocol) {
1216	case IPPROTO_TCP:
1217		/* Just need to insert a conn struct */
1218		if (!(connp->conn_flags & IPCL_EAGER)) {
1219			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1220		}
1221		connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(connp->conn_remv6,
1222		    connp->conn_ports)];
1223		mutex_enter(&connfp->connf_lock);
1224		for (tconnp = connfp->connf_head; tconnp != NULL;
1225		    tconnp = tconnp->conn_next) {
1226			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
1227			    connp->conn_remv6, connp->conn_srcv6,
1228			    connp->conn_ports) &&
1229			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
1230			    tconnp->conn_tcp->tcp_bound_if == ifindex)) {
1231				/* Already have a conn. bail out */
1232				mutex_exit(&connfp->connf_lock);
1233				return (EADDRINUSE);
1234			}
1235		}
1236		if (connp->conn_fanout != NULL) {
1237			/*
1238			 * Probably a XTI/TLI application trying to do a
1239			 * rebind. Let it happen.
1240			 */
1241			mutex_exit(&connfp->connf_lock);
1242			IPCL_HASH_REMOVE(connp);
1243			mutex_enter(&connfp->connf_lock);
1244		}
1245		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1246		mutex_exit(&connfp->connf_lock);
1247		break;
1248
1249	case IPPROTO_SCTP:
1250		IPCL_HASH_REMOVE(connp);
1251		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1252		ret = ipcl_sctp_hash_insert(connp, lport);
1253		break;
1254
1255	default:
1256		if (is_system_labeled() && check_exempt_conflict_v6(connp))
1257			return (EADDRINUSE);
1258		/* FALLTHROUGH */
1259	case IPPROTO_UDP:
1260		up = (uint16_t *)&ports;
1261		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1262		if (protocol == IPPROTO_UDP) {
1263			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])];
1264		} else {
1265			connfp = &ipcl_proto_fanout_v6[protocol];
1266		}
1267
1268		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1269			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1270		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1271			IPCL_HASH_INSERT_BOUND(connfp, connp);
1272		} else {
1273			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1274		}
1275		break;
1276	}
1277
1278	return (ret);
1279}
1280
1281/*
1282 * v4 packet classifying function. looks up the fanout table to
1283 * find the conn, the packet belongs to. returns the conn with
1284 * the reference held, null otherwise.
1285 *
1286 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1287 * Lookup" comment block are applied.  Labels are also checked as described
1288 * above.  If the packet is from the inside (looped back), and is from the same
1289 * zone, then label checks are omitted.
 *
 * Arguments:
 *	mp	 message whose b_rptr points at the IPv4 header
 *	protocol only IPPROTO_TCP and IPPROTO_UDP are handled here; for any
 *		 other value the switch is skipped and NULL is returned
 *	hdr_len	 offset added to the IP header pointer (together with
 *		 TCP_PORTS_OFFSET) to locate the port pair
 *	zoneid	 zone to match, or ALL_ZONES
 *
 * For TCP the connected table (ipcl_conn_fanout) is searched first and, if
 * nothing matches, the bind table (ipcl_bind_fanout).  UDP searches
 * ipcl_udp_fanout only.  In every case the reference on the returned conn
 * is taken while the bucket lock is still held.
 *
 * In the bind and UDP searches only the first matching conn is considered:
 * if tsol_receive_local() rejects it on a labeled system, the search ends
 * and the rest of the bucket is not examined.
1290 */
1291conn_t *
1292ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid)
1293{
1294	ipha_t	*ipha;
1295	connf_t	*connfp, *bind_connfp;
1296	uint16_t lport;
1297	uint16_t fport;
1298	uint32_t ports;
1299	conn_t	*connp;
1300	uint16_t  *up;
1301	boolean_t shared_addr;
1302	boolean_t unlabeled;
1303
1304	ipha = (ipha_t *)mp->b_rptr;
	/* up[0] is used as the foreign port and up[1] as the local port. */
1305	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1306
1307	switch (protocol) {
1308	case IPPROTO_TCP:
		/* Both ports are read as one 32-bit word for hash and match. */
1309		ports = *(uint32_t *)up;
1310		connfp =
1311		    &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, ports)];
1312		mutex_enter(&connfp->connf_lock);
1313		for (connp = connfp->connf_head; connp != NULL;
1314		    connp = connp->conn_next) {
1315			if (IPCL_CONN_MATCH(connp, protocol,
1316			    ipha->ipha_src, ipha->ipha_dst, ports))
1317				break;
1318		}
1319
1320		if (connp != NULL) {
1321			/*
1322			 * We have a fully-bound TCP connection.
1323			 *
1324			 * For labeled systems, there's no need to check the
1325			 * label here.  It's known to be good as we checked
1326			 * before allowing the connection to become bound.
1327			 */
1328			CONN_INC_REF(connp);
1329			mutex_exit(&connfp->connf_lock);
1330			return (connp);
1331		}
1332
1333		mutex_exit(&connfp->connf_lock);
1334
		/* No connected match; fall back to the bind hash on lport. */
1335		lport = up[1];
1336		unlabeled = B_FALSE;
1337		/* Cred cannot be null on IPv4 */
1338		if (is_system_labeled())
1339			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
1340			    TSLF_UNLABELED) != 0;
		/* Remember the caller's ALL_ZONES; zoneid is rewritten below. */
1341		shared_addr = (zoneid == ALL_ZONES);
1342		if (shared_addr) {
1343			zoneid = tsol_mlp_findzone(protocol, lport);
1344			/*
1345			 * If no shared MLP is found, tsol_mlp_findzone returns
1346			 * ALL_ZONES.  In that case, we assume it's SLP, and
1347			 * search for the zone based on the packet label.
1348			 *
1349			 * If there is such a zone, we prefer to find a
1350			 * connection in it.  Otherwise, we look for a
1351			 * MAC-exempt connection in any zone whose label
1352			 * dominates the default label on the packet.
1353			 */
1354			if (zoneid == ALL_ZONES)
1355				zoneid = tsol_packet_to_zoneid(mp);
1356			else
1357				unlabeled = B_FALSE;
1358		}
1359
1360		bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1361		mutex_enter(&bind_connfp->connf_lock);
		/*
		 * Accept a bind match that is either in the right zone or,
		 * for an unlabeled packet, MAC exempt.
		 */
1362		for (connp = bind_connfp->connf_head; connp != NULL;
1363		    connp = connp->conn_next) {
1364			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1365			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
1366			    (unlabeled && connp->conn_mac_exempt)))
1367				break;
1368		}
1369
1370		/*
1371		 * If the matching connection is SLP on a private address, then
1372		 * the label on the packet must match the local zone's label.
1373		 * Otherwise, it must be in the label range defined by tnrh.
1374		 * This is ensured by tsol_receive_label.
1375		 */
1376		if (connp != NULL && is_system_labeled() &&
1377		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1378		    shared_addr, connp)) {
1379				DTRACE_PROBE3(
1380				    tx__ip__log__info__classify__tcp,
1381				    char *,
1382				    "connp(1) could not receive mp(2)",
1383				    conn_t *, connp, mblk_t *, mp);
1384			connp = NULL;
1385		}
1386
1387		if (connp != NULL) {
1388			/* Have a listener at least */
1389			CONN_INC_REF(connp);
1390			mutex_exit(&bind_connfp->connf_lock);
1391			return (connp);
1392		}
1393
1394		mutex_exit(&bind_connfp->connf_lock);
1395
1396		IPCL_DEBUG_LVL(512,
1397		    ("ipcl_classify: couldn't classify mp = %p\n",
1398		    (void *)mp));
1399		break;
1400
1401	case IPPROTO_UDP:
1402		lport = up[1];
1403		unlabeled = B_FALSE;
1404		/* Cred cannot be null on IPv4 */
1405		if (is_system_labeled())
1406			unlabeled = (crgetlabel(DB_CRED(mp))->tsl_flags &
1407			    TSLF_UNLABELED) != 0;
		/* Remember the caller's ALL_ZONES; zoneid is rewritten below. */
1408		shared_addr = (zoneid == ALL_ZONES);
1409		if (shared_addr) {
1410			zoneid = tsol_mlp_findzone(protocol, lport);
1411			/*
1412			 * If no shared MLP is found, tsol_mlp_findzone returns
1413			 * ALL_ZONES.  In that case, we assume it's SLP, and
1414			 * search for the zone based on the packet label.
1415			 *
1416			 * If there is such a zone, we prefer to find a
1417			 * connection in it.  Otherwise, we look for a
1418			 * MAC-exempt connection in any zone whose label
1419			 * dominates the default label on the packet.
1420			 */
1421			if (zoneid == ALL_ZONES)
1422				zoneid = tsol_packet_to_zoneid(mp);
1423			else
1424				unlabeled = B_FALSE;
1425		}
1426		fport = up[0];
1427		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
1428		connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1429		mutex_enter(&connfp->connf_lock);
		/*
		 * Accept a UDP match that is either in the right zone or,
		 * for an unlabeled packet, MAC exempt.
		 */
1430		for (connp = connfp->connf_head; connp != NULL;
1431		    connp = connp->conn_next) {
1432			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1433			    fport, ipha->ipha_src) &&
1434			    (IPCL_ZONE_MATCH(connp, zoneid) ||
1435			    (unlabeled && connp->conn_mac_exempt)))
1436				break;
1437		}
1438
		/* On a labeled system the candidate must pass the label check. */
1439		if (connp != NULL && is_system_labeled() &&
1440		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1441		    shared_addr, connp)) {
1442			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1443			    char *, "connp(1) could not receive mp(2)",
1444			    conn_t *, connp, mblk_t *, mp);
1445			connp = NULL;
1446		}
1447
1448		if (connp != NULL) {
1449			CONN_INC_REF(connp);
1450			mutex_exit(&connfp->connf_lock);
1451			return (connp);
1452		}
1453
1454		/*
1455		 * We shouldn't come here for multicast/broadcast packets
1456		 */
1457		mutex_exit(&connfp->connf_lock);
1458		IPCL_DEBUG_LVL(512,
1459		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
1460		    lport, fport));
1461		break;
1462	}
1463
	/* Unhandled protocol, or no acceptable conn was found. */
1464	return (NULL);
1465}
1466
/*
 * v6 packet classifying function; the IPv6 counterpart of
 * ipcl_classify_v4().  Returns the matching conn with a reference held,
 * or NULL.
 *
 * Arguments:
 *	mp	 message whose b_rptr points at the IPv6 header
 *	protocol only IPPROTO_TCP and IPPROTO_UDP are handled here; for any
 *		 other value the switch is skipped and NULL is returned
 *	hdr_len	 offset from b_rptr to the transport header
 *	zoneid	 zone to match, or ALL_ZONES
 *
 * For TCP the connected table (ipcl_conn_fanout) is searched first and, if
 * nothing matches, the bind table (ipcl_bind_fanout).  UDP searches
 * ipcl_udp_fanout only.  In every case the reference on the returned conn
 * is taken while the bucket lock is still held.
 *
 * In the bind and UDP searches only the first matching conn is considered:
 * if tsol_receive_local() rejects it on a labeled system, the search ends
 * and the rest of the bucket is not examined.
 */
1467conn_t *
1468ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid)
1469{
1470	ip6_t		*ip6h;
1471	connf_t		*connfp, *bind_connfp;
1472	uint16_t	lport;
1473	uint16_t	fport;
1474	tcph_t		*tcph;
1475	uint32_t	ports;
1476	conn_t		*connp;
1477	uint16_t	*up;
1478	boolean_t	shared_addr;
1479	boolean_t	unlabeled;
1480
1481	ip6h = (ip6_t *)mp->b_rptr;
1482
1483	switch (protocol) {
1484	case IPPROTO_TCP:
1485		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
		/*
		 * up is pointed at tcph->th_lport; the 32-bit load below
		 * picks up both ports in one word for hashing and matching.
		 */
1486		up = (uint16_t *)tcph->th_lport;
1487		ports = *(uint32_t *)up;
1488
1489		connfp =
1490		    &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, ports)];
1491		mutex_enter(&connfp->connf_lock);
1492		for (connp = connfp->connf_head; connp != NULL;
1493		    connp = connp->conn_next) {
1494			if (IPCL_CONN_MATCH_V6(connp, protocol,
1495			    ip6h->ip6_src, ip6h->ip6_dst, ports))
1496				break;
1497		}
1498
1499		if (connp != NULL) {
1500			/*
1501			 * We have a fully-bound TCP connection.
1502			 *
1503			 * For labeled systems, there's no need to check the
1504			 * label here.  It's known to be good as we checked
1505			 * before allowing the connection to become bound.
1506			 */
1507			CONN_INC_REF(connp);
1508			mutex_exit(&connfp->connf_lock);
1509			return (connp);
1510		}
1511
1512		mutex_exit(&connfp->connf_lock);
1513
		/* No connected match; fall back to the bind hash on lport. */
1514		lport = up[1];
1515		unlabeled = B_FALSE;
1516		/* Cred can be null on IPv6 */
1517		if (is_system_labeled()) {
1518			cred_t *cr = DB_CRED(mp);
1519
			/* A missing cred leaves unlabeled at B_FALSE. */
1520			unlabeled = (cr != NULL &&
1521			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
1522		}
		/* Remember the caller's ALL_ZONES; zoneid is rewritten below. */
1523		shared_addr = (zoneid == ALL_ZONES);
1524		if (shared_addr) {
1525			zoneid = tsol_mlp_findzone(protocol, lport);
1526			/*
1527			 * If no shared MLP is found, tsol_mlp_findzone returns
1528			 * ALL_ZONES.  In that case, we assume it's SLP, and
1529			 * search for the zone based on the packet label.
1530			 *
1531			 * If there is such a zone, we prefer to find a
1532			 * connection in it.  Otherwise, we look for a
1533			 * MAC-exempt connection in any zone whose label
1534			 * dominates the default label on the packet.
1535			 */
1536			if (zoneid == ALL_ZONES)
1537				zoneid = tsol_packet_to_zoneid(mp);
1538			else
1539				unlabeled = B_FALSE;
1540		}
1541
1542		bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1543		mutex_enter(&bind_connfp->connf_lock);
		/*
		 * Accept a bind match that is either in the right zone or,
		 * for an unlabeled packet, MAC exempt.
		 */
1544		for (connp = bind_connfp->connf_head; connp != NULL;
1545		    connp = connp->conn_next) {
1546			if (IPCL_BIND_MATCH_V6(connp, protocol,
1547			    ip6h->ip6_dst, lport) &&
1548			    (IPCL_ZONE_MATCH(connp, zoneid) ||
1549			    (unlabeled && connp->conn_mac_exempt)))
1550				break;
1551		}
1552
		/* On a labeled system the candidate must pass the label check. */
1553		if (connp != NULL && is_system_labeled() &&
1554		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1555		    shared_addr, connp)) {
1556			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1557			    char *, "connp(1) could not receive mp(2)",
1558			    conn_t *, connp, mblk_t *, mp);
1559			connp = NULL;
1560		}
1561
1562		if (connp != NULL) {
1563			/* Have a listener at least */
1564			CONN_INC_REF(connp);
1565			mutex_exit(&bind_connfp->connf_lock);
1566			IPCL_DEBUG_LVL(512,
1567			    ("ipcl_classify_v6: found listner "
1568			    "connp = %p\n", (void *)connp));
1569
1570			return (connp);
1571		}
1572
1573		mutex_exit(&bind_connfp->connf_lock);
1574
1575		IPCL_DEBUG_LVL(512,
1576		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
1577		    (void *)mp));
1578		break;
1579
1580	case IPPROTO_UDP:
		/* up[0] is used as the foreign port, up[1] as the local port. */
1581		up = (uint16_t *)&mp->b_rptr[hdr_len];
1582		lport = up[1];
1583		unlabeled = B_FALSE;
1584		/* Cred can be null on IPv6 */
1585		if (is_system_labeled()) {
1586			cred_t *cr = DB_CRED(mp);
1587
			/* A missing cred leaves unlabeled at B_FALSE. */
1588			unlabeled = (cr != NULL &&
1589			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
1590		}
		/* Remember the caller's ALL_ZONES; zoneid is rewritten below. */
1591		shared_addr = (zoneid == ALL_ZONES);
1592		if (shared_addr) {
1593			zoneid = tsol_mlp_findzone(protocol, lport);
1594			/*
1595			 * If no shared MLP is found, tsol_mlp_findzone returns
1596			 * ALL_ZONES.  In that case, we assume it's SLP, and
1597			 * search for the zone based on the packet label.
1598			 *
1599			 * If there is such a zone, we prefer to find a
1600			 * connection in it.  Otherwise, we look for a
1601			 * MAC-exempt connection in any zone whose label
1602			 * dominates the default label on the packet.
1603			 */
1604			if (zoneid == ALL_ZONES)
1605				zoneid = tsol_packet_to_zoneid(mp);
1606			else
1607				unlabeled = B_FALSE;
1608		}
1609
1610		fport = up[0];
1611		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
1612		    fport));
1613		connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1614		mutex_enter(&connfp->connf_lock);
		/*
		 * Accept a UDP match that is either in the right zone or,
		 * for an unlabeled packet, MAC exempt.
		 */
1615		for (connp = connfp->connf_head; connp != NULL;
1616		    connp = connp->conn_next) {
1617			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1618			    fport, ip6h->ip6_src) &&
1619			    (IPCL_ZONE_MATCH(connp, zoneid) ||
1620			    (unlabeled && connp->conn_mac_exempt)))
1621				break;
1622		}
1623
		/* On a labeled system the candidate must pass the label check. */
1624		if (connp != NULL && is_system_labeled() &&
1625		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1626		    shared_addr, connp)) {
1627			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1628			    char *, "connp(1) could not receive mp(2)",
1629			    conn_t *, connp, mblk_t *, mp);
1630			connp = NULL;
1631		}
1632
1633		if (connp != NULL) {
1634			CONN_INC_REF(connp);
1635			mutex_exit(&connfp->connf_lock);
1636			return (connp);
1637		}
1638
1639		/*
1640		 * We shouldn't come here for multicast/broadcast packets
1641		 */
1642		mutex_exit(&connfp->connf_lock);
1643		IPCL_DEBUG_LVL(512,
1644		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
1645		    lport, fport));
1646		break;
1647	}
1648
	/* Unhandled protocol, or no acceptable conn was found. */
1649	return (NULL);
1650}
1651
1652/*
1653 * wrapper around ipcl_classify_(v4,v6) routines.
1654 */
1655conn_t *
1656ipcl_classify(mblk_t *mp, zoneid_t zoneid)
1657{
1658	uint16_t	hdr_len;
1659	ipha_t		*ipha;
1660	uint8_t		*nexthdrp;
1661
1662	if (MBLKL(mp) < sizeof (ipha_t))
1663		return (NULL);
1664
1665	switch (IPH_HDR_VERSION(mp->b_rptr)) {
1666	case IPV4_VERSION:
1667		ipha = (ipha_t *)mp->b_rptr;
1668		hdr_len = IPH_HDR_LENGTH(ipha);
1669		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
1670		    zoneid));
1671	case IPV6_VERSION:
1672		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
1673		    &hdr_len, &nexthdrp))
1674			return (NULL);
1675
1676		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid));
1677	}
1678
1679	return (NULL);
1680}
1681
/*
 * Classify a packet against the raw socket table (ipcl_raw_fanout).
 *
 * hdr points at the IP header; its version field selects whether the IPv4
 * or the IPv6 match macros are used.  ports carries the port pair as one
 * 32-bit word; its second 16-bit half is used as the local port.
 *
 * Two passes are made:
 *
 *   1. The bucket hashed on the local port.  Fully-bound conns are compared
 *	with IPCL_CONN_MATCH*, all others with IPCL_BIND_MATCH*.  A
 *	candidate must also match the zone or, for an unlabeled packet, be
 *	MAC exempt.  On a labeled system the first candidate found is then
 *	passed to tsol_receive_local() (except in the case described in the
 *	comment ahead of that call); if it is rejected, the pass ends
 *	without examining the rest of the bucket.
 *
 *   2. The bucket for hash value 0, compared with IPCL_RAW_MATCH* and a
 *	plain zone match.  No label check is made in this pass.
 *
 * Returns the matching conn with a reference held (taken while the bucket
 * lock is still held), or NULL if neither pass finds one.
 */
1682conn_t *
1683ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
1684    uint32_t ports, ipha_t *hdr)
1685{
1686	connf_t		*connfp;
1687	conn_t		*connp;
1688	in_port_t	lport;
1689	int		af;
1690	boolean_t	shared_addr;
1691	boolean_t	unlabeled;
1692	const void	*dst;
1693
	/* The second 16-bit half of ports is the local port. */
1694	lport = ((uint16_t *)&ports)[1];
1695
1696	unlabeled = B_FALSE;
1697	/* Cred can be null on IPv6 */
1698	if (is_system_labeled()) {
1699		cred_t *cr = DB_CRED(mp);
1700
		/* A missing cred leaves unlabeled at B_FALSE. */
1701		unlabeled = (cr != NULL &&
1702		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
1703	}
	/* Remember the caller's ALL_ZONES; zoneid is rewritten below. */
1704	shared_addr = (zoneid == ALL_ZONES);
1705	if (shared_addr) {
1706		zoneid = tsol_mlp_findzone(protocol, lport);
1707		/*
1708		 * If no shared MLP is found, tsol_mlp_findzone returns
1709		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
1710		 * the zone based on the packet label.
1711		 *
1712		 * If there is such a zone, we prefer to find a connection in
1713		 * it.  Otherwise, we look for a MAC-exempt connection in any
1714		 * zone whose label dominates the default label on the packet.
1715		 */
1716		if (zoneid == ALL_ZONES)
1717			zoneid = tsol_packet_to_zoneid(mp);
1718		else
1719			unlabeled = B_FALSE;
1720	}
1721
	/* hdr is typed ipha_t but is reinterpreted as ip6_t for IPv6. */
1722	af = IPH_HDR_VERSION(hdr);
1723	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
1724	    (const void *)&((ip6_t *)hdr)->ip6_dst;
1725	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))];
1726
	/* Pass 1: the bucket for this local port. */
1727	mutex_enter(&connfp->connf_lock);
1728	for (connp = connfp->connf_head; connp != NULL;
1729	    connp = connp->conn_next) {
1730		/* We don't allow v4 fallback for v6 raw socket. */
1731		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
1732		    IPV6_VERSION))
1733			continue;
1734		if (connp->conn_fully_bound) {
1735			if (af == IPV4_VERSION) {
1736				if (!IPCL_CONN_MATCH(connp, protocol,
1737				    hdr->ipha_src, hdr->ipha_dst, ports))
1738					continue;
1739			} else {
1740				if (!IPCL_CONN_MATCH_V6(connp, protocol,
1741				    ((ip6_t *)hdr)->ip6_src,
1742				    ((ip6_t *)hdr)->ip6_dst, ports))
1743					continue;
1744			}
1745		} else {
1746			if (af == IPV4_VERSION) {
1747				if (!IPCL_BIND_MATCH(connp, protocol,
1748				    hdr->ipha_dst, lport))
1749					continue;
1750			} else {
1751				if (!IPCL_BIND_MATCH_V6(connp, protocol,
1752				    ((ip6_t *)hdr)->ip6_dst, lport))
1753					continue;
1754			}
1755		}
1756
		/* Address match; now require the zone or MAC exemption. */
1757		if (IPCL_ZONE_MATCH(connp, zoneid) ||
1758		    (unlabeled && connp->conn_mac_exempt))
1759			break;
1760	}
1761	/*
1762	 * If the connection is fully-bound and connection-oriented (TCP or
1763	 * SCTP), then we've already validated the remote system's label.
1764	 * There's no need to do it again for every packet.
1765	 */
1766	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
1767	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
1768	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
1769		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1770		    char *, "connp(1) could not receive mp(2)",
1771		    conn_t *, connp, mblk_t *, mp);
1772		connp = NULL;
1773	}
1774
	/* "found" expects the current bucket lock to be held. */
1775	if (connp != NULL)
1776		goto found;
1777	mutex_exit(&connfp->connf_lock);
1778
1779	/* Try to look for a wildcard match. */
1780	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(0)];
1781	mutex_enter(&connfp->connf_lock);
1782	for (connp = connfp->connf_head; connp != NULL;
1783	    connp = connp->conn_next) {
1784		/* We don't allow v4 fallback for v6 raw socket. */
1785		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
1786		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
1787			continue;
1788		}
1789		if (af == IPV4_VERSION) {
1790			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
1791				break;
1792		} else {
1793			if (IPCL_RAW_MATCH_V6(connp, protocol,
1794			    ((ip6_t *)hdr)->ip6_dst)) {
1795				break;
1796			}
1797		}
1798	}
1799
1800	if (connp != NULL)
1801		goto found;
1802
1803	mutex_exit(&connfp->connf_lock);
1804	return (NULL);
1805
1806found:
	/* connfp is whichever bucket produced the match; its lock is held. */
1807	ASSERT(connp != NULL);
1808	CONN_INC_REF(connp);
1809	mutex_exit(&connfp->connf_lock);
1810	return (connp);
1811}
1812
1813/* ARGSUSED */
1814static int
1815ipcl_tcpconn_constructor(void *buf, void *cdrarg, int kmflags)
1816{
1817	itc_t	*itc = (itc_t *)buf;
1818	conn_t 	*connp = &itc->itc_conn;
1819	tcp_t	*tcp = &itc->itc_tcp;
1820	bzero(itc, sizeof (itc_t));
1821	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
1822	connp->conn_tcp = tcp;
1823	connp->conn_flags = IPCL_TCPCONN;
1824	connp->conn_ulp = IPPROTO_TCP;
1825	tcp->tcp_connp = connp;
1826	return (0);
1827}
1828
1829/* ARGSUSED */
1830static void
1831ipcl_tcpconn_destructor(void *buf, void *cdrarg)
1832{
1833	tcp_timermp_free(((conn_t *)buf)->conn_tcp);
1834}
1835
1836/*
1837 * All conns are inserted in a global multi-list for the benefit of
1838 * walkers. The walk is guaranteed to walk all open conns at the time
1839 * of the start of the walk exactly once. This property is needed to
1840 * achieve some cleanups during unplumb of interfaces. This is achieved
1841 * as follows.
1842 *
1843 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
1844 * call the insert and delete functions below at creation and deletion
1845 * time respectively. The conn never moves or changes its position in this
1846 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
1847 * won't increase due to walkers, once the conn deletion has started. Note
1848 * that we can't remove the conn from the global list and then wait for
1849 * the refcnt to drop to zero, since walkers would then see a truncated
1850 * list. CONN_INCIPIENT ensures that walkers don't start looking at
1851 * conns until ip_open is ready to make them globally visible.
1852 * The global round robin multi-list locks are held only to get the
1853 * next member/insertion/deletion and contention should be negligible
1854 * if the multi-list is much greater than the number of cpus.
1855 */
1856void
1857ipcl_globalhash_insert(conn_t *connp)
1858{
1859	int	index;
1860
1861	/*
1862	 * No need for atomic here. Approximate even distribution
1863	 * in the global lists is sufficient.
1864	 */
1865	conn_g_index++;
1866	index = conn_g_index & (CONN_G_HASH_SIZE - 1);
1867
1868	connp->conn_g_prev = NULL;
1869	/*
1870	 * Mark as INCIPIENT, so that walkers will ignore this
1871	 * for now, till ip_open is ready to make it visible globally.
1872	 */
1873	connp->conn_state_flags |= CONN_INCIPIENT;
1874
1875	/* Insert at the head of the list */
1876	mutex_enter(&ipcl_globalhash_fanout[index].connf_lock);
1877	connp->conn_g_next = ipcl_globalhash_fanout[index].connf_head;
1878	if (connp->conn_g_next != NULL)
1879		connp->conn_g_next->conn_g_prev = connp;
1880	ipcl_globalhash_fanout[index].connf_head = connp;
1881
1882	/* The fanout bucket this conn points to */
1883	connp->conn_g_fanout = &ipcl_globalhash_fanout[index];
1884
1885	mutex_exit(&ipcl_globalhash_fanout[index].connf_lock);
1886}
1887
1888void
1889ipcl_globalhash_remove(conn_t *connp)
1890{
1891	/*
1892	 * We were never inserted in the global multi list.
1893	 * IPCL_NONE variety is never inserted in the global multilist
1894	 * since it is presumed to not need any cleanup and is transient.
1895	 */
1896	if (connp->conn_g_fanout == NULL)
1897		return;
1898
1899	mutex_enter(&connp->conn_g_fanout->connf_lock);
1900	if (connp->conn_g_prev != NULL)
1901		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
1902	else
1903		connp->conn_g_fanout->connf_head = connp->conn_g_next;
1904	if (connp->conn_g_next != NULL)
1905		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
1906	mutex_exit(&connp->conn_g_fanout->connf_lock);
1907
1908	/* Better to stumble on a null pointer than to corrupt memory */
1909	connp->conn_g_next = NULL;
1910	connp->conn_g_prev = NULL;
1911}
1912
1913/*
1914 * Walk the list of all conn_t's in the system, calling the function provided
1915 * with the specified argument for each.
1916 * Applies to both IPv4 and IPv6.
1917 *
1918 * IPCs may hold pointers to ipif/ill. To guard against stale pointers
1919 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
1920 * unplumbed or removed. New conn_t's that are created while we are walking
1921 * may be missed by this walk, because they are not necessarily inserted
1922 * at the tail of the list. They are new conn_t's and thus don't have any
1923 * stale pointers. The CONN_CLOSING flag ensures that no new reference
1924 * is created to the struct that is going away.
1925 */
1926void
1927ipcl_walk(pfv_t func, void *arg)
1928{
1929	int	i;
1930	conn_t	*connp;
1931	conn_t	*prev_connp;
1932
1933	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
1934		mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
1935		prev_connp = NULL;
1936		connp = ipcl_globalhash_fanout[i].connf_head;
1937		while (connp != NULL) {
1938			mutex_enter(&connp->conn_lock);
1939			if (connp->conn_state_flags &
1940			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
1941				mutex_exit(&connp->conn_lock);
1942				connp = connp->conn_g_next;
1943				continue;
1944			}
1945			CONN_INC_REF_LOCKED(connp);
1946			mutex_exit(&connp->conn_lock);
1947			mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
1948			(*func)(connp, arg);
1949			if (prev_connp != NULL)
1950				CONN_DEC_REF(prev_connp);
1951			mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
1952			prev_connp = connp;
1953			connp = connp->conn_g_next;
1954		}
1955		mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
1956		if (prev_connp != NULL)
1957			CONN_DEC_REF(prev_connp);
1958	}
1959}
1960
1961/*
1962 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
1963 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
1964 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
1965 * (peer tcp in at least ESTABLISHED state).
1966 */
1967conn_t *
1968ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph)
1969{
1970	uint32_t ports;
1971	uint16_t *pports = (uint16_t *)&ports;
1972	connf_t	*connfp;
1973	conn_t	*tconnp;
1974	boolean_t zone_chk;
1975
1976	/*
1977	 * If either the source of destination address is loopback, then
1978	 * both endpoints must be in the same Zone.  Otherwise, both of
1979	 * the addresses are system-wide unique (tcp is in ESTABLISHED
1980	 * state) and the endpoints may reside in different Zones.
1981	 */
1982	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
1983	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
1984
1985	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1986	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1987
1988	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];
1989
1990	mutex_enter(&connfp->connf_lock);
1991	for (tconnp = connfp->connf_head; tconnp != NULL;
1992	    tconnp = tconnp->conn_next) {
1993
1994		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
1995		    ipha->ipha_dst, ipha->ipha_src, ports) &&
1996		    tconnp->conn_tcp->tcp_state >= TCPS_ESTABLISHED &&
1997		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
1998
1999			ASSERT(tconnp != connp);
2000			CONN_INC_REF(tconnp);
2001			mutex_exit(&connfp->connf_lock);
2002			return (tconnp);
2003		}
2004	}
2005	mutex_exit(&connfp->connf_lock);
2006	return (NULL);
2007}
2008
2009/*
2010 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2011 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2012 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2013 * (peer tcp in at least ESTABLISHED state).
2014 */
2015conn_t *
2016ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph)
2017{
2018	uint32_t ports;
2019	uint16_t *pports = (uint16_t *)&ports;
2020	connf_t	*connfp;
2021	conn_t	*tconnp;
2022	boolean_t zone_chk;
2023
2024	/*
2025	 * If either the source of destination address is loopback, then
2026	 * both endpoints must be in the same Zone.  Otherwise, both of
2027	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2028	 * state) and the endpoints may reside in different Zones.  We
2029	 * don't do Zone check for link local address(es) because the
2030	 * current Zone implementation treats each link local address as
2031	 * being unique per system node, i.e. they belong to global Zone.
2032	 */
2033	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2034	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2035
2036	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2037	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2038
2039	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];
2040
2041	mutex_enter(&connfp->connf_lock);
2042	for (tconnp = connfp->connf_head; tconnp != NULL;
2043	    tconnp = tconnp->conn_next) {
2044
2045		/* We skip tcp_bound_if check here as this is loopback tcp */
2046		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2047		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2048		    tconnp->conn_tcp->tcp_state >= TCPS_ESTABLISHED &&
2049		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2050
2051			ASSERT(tconnp != connp);
2052			CONN_INC_REF(tconnp);
2053			mutex_exit(&connfp->connf_lock);
2054			return (tconnp);
2055		}
2056	}
2057	mutex_exit(&connfp->connf_lock);
2058	return (NULL);
2059}
2060
2061/*
2062 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2063 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2064 * Only checks for connected entries i.e. no INADDR_ANY checks.
2065 */
2066conn_t *
2067ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state)
2068{
2069	uint32_t ports;
2070	uint16_t *pports;
2071	connf_t	*connfp;
2072	conn_t	*tconnp;
2073
2074	pports = (uint16_t *)&ports;
2075	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
2076	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
2077
2078	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];
2079
2080	mutex_enter(&connfp->connf_lock);
2081	for (tconnp = connfp->connf_head; tconnp != NULL;
2082	    tconnp = tconnp->conn_next) {
2083
2084		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2085		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2086		    tconnp->conn_tcp->tcp_state >= min_state) {
2087
2088			CONN_INC_REF(tconnp);
2089			mutex_exit(&connfp->connf_lock);
2090			return (tconnp);
2091		}
2092	}
2093	mutex_exit(&connfp->connf_lock);
2094	return (NULL);
2095}
2096
2097/*
2098 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2099 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2100 * Only checks for connected entries i.e. no INADDR_ANY checks.
2101 * Match on ifindex in addition to addresses.
2102 */
2103conn_t *
2104ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2105    uint_t ifindex)
2106{
2107	tcp_t	*tcp;
2108	uint32_t ports;
2109	uint16_t *pports;
2110	connf_t	*connfp;
2111	conn_t	*tconnp;
2112
2113	pports = (uint16_t *)&ports;
2114	pports[0] = tcpha->tha_fport;
2115	pports[1] = tcpha->tha_lport;
2116
2117	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];
2118
2119	mutex_enter(&connfp->connf_lock);
2120	for (tconnp = connfp->connf_head; tconnp != NULL;
2121	    tconnp = tconnp->conn_next) {
2122
2123		tcp = tconnp->conn_tcp;
2124		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2125		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2126		    tcp->tcp_state >= min_state &&
2127		    (tcp->tcp_bound_if == 0 ||
2128		    tcp->tcp_bound_if == ifindex)) {
2129
2130			CONN_INC_REF(tconnp);
2131			mutex_exit(&connfp->connf_lock);
2132			return (tconnp);
2133		}
2134	}
2135	mutex_exit(&connfp->connf_lock);
2136	return (NULL);
2137}
2138
2139/*
2140 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2141 * a listener when changing state.
2142 */
2143conn_t *
2144ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid)
2145{
2146	connf_t		*bind_connfp;
2147	conn_t		*connp;
2148	tcp_t		*tcp;
2149
2150	/*
2151	 * Avoid false matches for packets sent to an IP destination of
2152	 * all zeros.
2153	 */
2154	if (laddr == 0)
2155		return (NULL);
2156
2157	ASSERT(zoneid != ALL_ZONES);
2158
2159	bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
2160	mutex_enter(&bind_connfp->connf_lock);
2161	for (connp = bind_connfp->connf_head; connp != NULL;
2162	    connp = connp->conn_next) {
2163		tcp = connp->conn_tcp;
2164		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2165		    IPCL_ZONE_MATCH(connp, zoneid) &&
2166		    (tcp->tcp_listener == NULL)) {
2167			CONN_INC_REF(connp);
2168			mutex_exit(&bind_connfp->connf_lock);
2169			return (connp);
2170		}
2171	}
2172	mutex_exit(&bind_connfp->connf_lock);
2173	return (NULL);
2174}
2175
2176/*
2177 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2178 * a listener when changing state.
2179 */
2180conn_t *
2181ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2182    zoneid_t zoneid)
2183{
2184	connf_t		*bind_connfp;
2185	conn_t		*connp = NULL;
2186	tcp_t		*tcp;
2187
2188	/*
2189	 * Avoid false matches for packets sent to an IP destination of
2190	 * all zeros.
2191	 */
2192	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2193		return (NULL);
2194
2195	ASSERT(zoneid != ALL_ZONES);
2196
2197	bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
2198	mutex_enter(&bind_connfp->connf_lock);
2199	for (connp = bind_connfp->connf_head; connp != NULL;
2200	    connp = connp->conn_next) {
2201		tcp = connp->conn_tcp;
2202		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2203		    IPCL_ZONE_MATCH(connp, zoneid) &&
2204		    (tcp->tcp_bound_if == 0 ||
2205		    tcp->tcp_bound_if == ifindex) &&
2206		    tcp->tcp_listener == NULL) {
2207			CONN_INC_REF(connp);
2208			mutex_exit(&bind_connfp->connf_lock);
2209			return (connp);
2210		}
2211	}
2212	mutex_exit(&bind_connfp->connf_lock);
2213	return (NULL);
2214}
2215
2216/*
2217 * ipcl_get_next_conn
2218 *	get the next entry in the conn global list
2219 *	and put a reference on the next_conn.
2220 *	decrement the reference on the current conn.
2221 *
2222 * This is an iterator based walker function that also provides for
2223 * some selection by the caller. It walks through the conn_hash bucket
2224 * searching for the next valid connp in the list, and selects connections
2225 * that are neither closed nor condemned. It also REFHOLDS the conn
2226 * thus ensuring that the conn exists when the caller uses the conn.
2227 */
2228conn_t *
2229ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2230{
2231	conn_t	*next_connp;
2232
2233	if (connfp == NULL)
2234		return (NULL);
2235
2236	mutex_enter(&connfp->connf_lock);
2237
2238	next_connp = (connp == NULL) ?
2239	    connfp->connf_head : connp->conn_g_next;
2240
2241	while (next_connp != NULL) {
2242		mutex_enter(&next_connp->conn_lock);
2243		if (!(next_connp->conn_flags & conn_flags) ||
2244		    (next_connp->conn_state_flags &
2245		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
2246			/*
2247			 * This conn has been condemned or
2248			 * is closing, or the flags don't match
2249			 */
2250			mutex_exit(&next_connp->conn_lock);
2251			next_connp = next_connp->conn_g_next;
2252			continue;
2253		}
2254		CONN_INC_REF_LOCKED(next_connp);
2255		mutex_exit(&next_connp->conn_lock);
2256		break;
2257	}
2258
2259	mutex_exit(&connfp->connf_lock);
2260
2261	if (connp != NULL)
2262		CONN_DEC_REF(connp);
2263
2264	return (next_connp);
2265}
2266
2267#ifdef CONN_DEBUG
2268/*
2269 * Trace of the last NBUF refhold/refrele
2270 */
2271int
2272conn_trace_ref(conn_t *connp)
2273{
2274	int	last;
2275	conn_trace_t	*ctb;
2276
2277	ASSERT(MUTEX_HELD(&connp->conn_lock));
2278	last = connp->conn_trace_last;
2279	last++;
2280	if (last == CONN_TRACE_MAX)
2281		last = 0;
2282
2283	ctb = &connp->conn_trace_buf[last];
2284	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
2285	connp->conn_trace_last = last;
2286	return (1);
2287}
2288
2289int
2290conn_untrace_ref(conn_t *connp)
2291{
2292	int	last;
2293	conn_trace_t	*ctb;
2294
2295	ASSERT(MUTEX_HELD(&connp->conn_lock));
2296	last = connp->conn_trace_last;
2297	last++;
2298	if (last == CONN_TRACE_MAX)
2299		last = 0;
2300
2301	ctb = &connp->conn_trace_buf[last];
2302	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
2303	connp->conn_trace_last = last;
2304	return (1);
2305}
2306#endif
2307