ipclassifier.c revision 1503:9c3595b79c0d
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28const char ipclassifier_version[] = "@(#)ipclassifier.c	1.6	04/03/31 SMI";
29
30/*
31 * IP PACKET CLASSIFIER
32 *
33 * The IP packet classifier provides mapping between IP packets and persistent
34 * connection state for connection-oriented protocols. It also provides
35 * interface for managing connection states.
36 *
37 * The connection state is kept in conn_t data structure and contains, among
38 * other things:
39 *
40 *	o local/remote address and ports
41 *	o Transport protocol
42 *	o squeue for the connection (for TCP only)
43 *	o reference counter
44 *	o Connection state
45 *	o hash table linkage
46 *	o interface/ire information
47 *	o credentials
48 *	o ipsec policy
49 *	o send and receive functions.
50 *	o mutex lock.
51 *
52 * Connections use a reference counting scheme. They are freed when the
53 * reference counter drops to zero. A reference is incremented when connection
54 * is placed in a list or table, when incoming packet for the connection arrives
55 * and when connection is processed via squeue (squeue processing may be
56 * asynchronous and the reference protects the connection from being destroyed
57 * before its processing is finished).
58 *
59 * send and receive functions are currently used for TCP only. The send function
60 * determines the IP entry point for the packet once it leaves TCP to be sent to
61 * the destination address. The receive function is used by IP when the packet
62 * should be passed for TCP processing. When a new connection is created these
63 * are set to ip_output() and tcp_input() respectively. During the lifetime of
64 * the connection the send and receive functions may change depending on the
65 * changes in the connection state. For example, once the connection is bound to
66 * an address, the receive function for this connection is set to
67 * tcp_conn_request().  This allows incoming SYNs to go directly into the
68 * listener SYN processing function without going to tcp_input() first.
69 *
70 * Classifier uses several hash tables:
71 *
72 * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
73 *	ipcl_bind_fanout:	contains all connections in BOUND state
74 *	ipcl_proto_fanout:	IPv4 protocol fanout
75 *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
76 *	ipcl_udp_fanout:	contains all UDP connections
77 *	ipcl_globalhash_fanout:	contains all connections
78 *
79 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
80 * which need to view all existing connections.
81 *
82 * All tables are protected by per-bucket locks. When both per-bucket lock and
83 * connection lock need to be held, the per-bucket lock should be acquired
84 * first, followed by the connection lock.
85 *
86 * All functions doing search in one of these tables increment a reference
87 * counter on the connection found (if any). This reference should be dropped
88 * when the caller has finished processing the connection.
89 *
90 *
91 * INTERFACES:
92 * ===========
93 *
94 * Connection Lookup:
95 * ------------------
96 *
97 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid)
98 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid)
99 *
100 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
101 * it can't find any associated connection. If the connection is found, its
102 * reference counter is incremented.
103 *
104 *	mp:	mblock, containing packet header. The full header should fit
105 *		into a single mblock. It should also contain at least full IP
106 *		and TCP or UDP header.
107 *
108 *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
109 *
110 *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
111 *		 the packet.
112 *
113 * 	zoneid: The zone in which the returned connection must be.
114 *
115 *	For TCP connections, the lookup order is as follows:
116 *		5-tuple {src, dst, protocol, local port, remote port}
117 *			lookup in ipcl_conn_fanout table.
118 *		3-tuple {dst, remote port, protocol} lookup in
119 *			ipcl_bind_fanout table.
120 *
121 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
122 *	remote port} lookup is done on ipcl_udp_fanout. Note that,
123 *	these interfaces do not handle cases where a packet belongs
124 *	to multiple UDP clients, which is handled in IP itself.
125 *
126 * conn_t	*ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int);
127 * conn_t	*ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t);
128 *
129 *	Lookup routine to find an exact match for {src, dst, local port,
130 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
131 *	ports are read from the IP and TCP header respectively.
132 *
133 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol);
134 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex);
135 *
136 * 	Lookup routine to find a listener with the tuple {lport, laddr,
137 * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
138 * 	parameter interface index is also compared.
139 *
140 * void ipcl_walk(func, arg)
141 *
142 * 	Apply 'func' to every connection available. The 'func' is called as
143 *	(*func)(connp, arg). The walk is non-atomic so connections may be
144 *	created and destroyed during the walk. The CONN_CONDEMNED and
145 *	CONN_INCIPIENT flags ensure that connections which are newly created
146 *	or being destroyed are not selected by the walker.
147 *
148 * Table Updates
149 * -------------
150 *
151 * int ipcl_conn_insert(connp, protocol, src, dst, ports)
152 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
153 *
154 *	Insert 'connp' in the ipcl_conn_fanout.
155 *	Arguments :
156 *		connp		conn_t to be inserted
157 *		protocol	connection protocol
158 *		src		source address
159 *		dst		destination address
160 *		ports		local and remote port
161 *		ifindex		interface index for IPv6 connections
162 *
163 *	Return value :
164 *		0		if connp was inserted
165 *		EADDRINUSE	if the connection with the same tuple
166 *				already exists.
167 *
168 * int ipcl_bind_insert(connp, protocol, src, lport);
169 * int ipcl_bind_insert_v6(connp, protocol, src, lport);
170 *
171 * 	Insert 'connp' in ipcl_bind_fanout.
172 * 	Arguments :
173 * 		connp		conn_t to be inserted
174 * 		protocol	connection protocol
175 * 		src		source address connection wants
176 * 				to bind to
177 * 		lport		local port connection wants to
178 * 				bind to
179 *
180 *
181 * void ipcl_hash_remove(connp);
182 *
183 * 	Removes the 'connp' from the connection fanout table.
184 *
185 * Connection Creation/Destruction
186 * -------------------------------
187 *
188 * conn_t *ipcl_conn_create(type, sleep)
189 *
190 * 	Creates a new conn based on the type flag, inserts it into
191 * 	globalhash table.
192 *
193 *	type:	This flag determines the type of conn_t which needs to be
194 *		created.
195 *		IPCL_TCPCONN	indicates a TCP connection
196 *		IPCL_IPCCONN	indicates all non-TCP connections.
197 *
198 * void ipcl_conn_destroy(connp)
199 *
200 * 	Destroys the connection state, removes it from the global
201 * 	connection hash table and frees its memory.
202 */
203
204#include <sys/types.h>
205#include <sys/stream.h>
206#include <sys/dlpi.h>
207#include <sys/stropts.h>
208#include <sys/sysmacros.h>
209#include <sys/strsubr.h>
210#include <sys/strlog.h>
211#include <sys/strsun.h>
212#define	_SUN_TPI_VERSION 2
213#include <sys/ddi.h>
214#include <sys/cmn_err.h>
215#include <sys/debug.h>
216
217#include <sys/systm.h>
218#include <sys/param.h>
219#include <sys/kmem.h>
220#include <sys/isa_defs.h>
221#include <inet/common.h>
222#include <netinet/ip6.h>
223#include <netinet/icmp6.h>
224
225#include <inet/ip.h>
226#include <inet/ip6.h>
227#include <inet/tcp.h>
228#include <inet/tcp_trace.h>
229#include <inet/ip_multi.h>
230#include <inet/ip_if.h>
231#include <inet/ip_ire.h>
232#include <inet/ip_rts.h>
233#include <inet/optcom.h>
234#include <inet/ip_ndp.h>
235#include <inet/udp_impl.h>
236#include <inet/sctp_ip.h>
237
238#include <sys/ethernet.h>
239#include <net/if_types.h>
240#include <sys/cpuvar.h>
241
242#include <inet/mi.h>
243#include <inet/ipclassifier.h>
244#include <inet/ipsec_impl.h>
245
/*
 * Classifier debug tracing is compiled in only for DEBUG kernels.
 */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

/*
 * IPCL_DEBUG_LVL(level, args)
 *
 *	level:	bit mask tested against ipcl_debug_level.
 *	args:	a fully parenthesized printf argument list, e.g.
 *		IPCL_DEBUG_LVL(4, ("connp %p", (void *)connp));
 *
 * 'level' is parenthesized in the expansion so that a mask expression such
 * as (1 | 2) is tested as a whole instead of being re-associated with the
 * '&' operator.  The expansion keeps its original statement shape (a bare
 * 'if' in debug builds, an empty block otherwise), so it must not be used
 * as the unbraced body of an if/else.
 */
#ifdef	IPCL_DEBUG
int	ipcl_debug_level = 0;
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level & (level)) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
259connf_t	*ipcl_conn_fanout;
260connf_t	*ipcl_bind_fanout;
261connf_t	ipcl_proto_fanout[IPPROTO_MAX + 1];
262connf_t	ipcl_proto_fanout_v6[IPPROTO_MAX + 1];
263connf_t	*ipcl_udp_fanout;
264
265/* A separate hash list for raw socket. */
266connf_t *ipcl_raw_fanout;
267
268connf_t rts_clients;
269
270/* Old value for compatibility */
271uint_t tcp_conn_hash_size = 0;
272
273/* New value. Zero means choose automatically. */
274uint_t ipcl_conn_hash_size = 0;
275uint_t ipcl_conn_hash_memfactor = 8192;
276uint_t ipcl_conn_hash_maxsize = 82500;
277
278uint_t ipcl_conn_fanout_size = 0;
279
280
281/* bind/udp fanout table size */
282uint_t ipcl_bind_fanout_size = 512;
283uint_t ipcl_udp_fanout_size = 16384;
284
285/* Raw socket fanout size.  Must be a power of 2. */
286uint_t ipcl_raw_fanout_size = 256;
287
/*
 * Table of primes used to size hash tables, indexed by N for N of 0-28.
 * Entry N is the nearest prime <= 2^N - 2^(N-2); the entries for N < 3
 * are 0, and a trailing 0 terminates the table.
 */

#define	P2Ps() {							\
		0, 0, 0, 5, 11, 23, 47, 89,				\
		191, 383, 761, 1531, 3067, 6143, 12281, 24571,		\
		49139, 98299, 196597, 393209, 786431, 1572853,		\
		3145721, 6291449, 12582893, 25165813, 50331599,		\
		100663291, 201326557,					\
		0							\
	}
297
298/*
299 * wrapper structure to ensure that conn+tcpb are aligned
300 * on cache lines.
301 */
302typedef struct itc_s {
303	union {
304		conn_t	itcu_conn;
305		char	itcu_filler[CACHE_ALIGN(conn_s)];
306	}	itc_u;
307	tcp_t	itc_tcp;
308} itc_t;
309
310#define	itc_conn	itc_u.itcu_conn
311
312struct kmem_cache  *ipcl_tcpconn_cache;
313struct kmem_cache  *ipcl_tcp_cache;
314struct kmem_cache  *ipcl_conn_cache;
315extern struct kmem_cache  *sctp_conn_cache;
316extern struct kmem_cache  *tcp_sack_info_cache;
317extern struct kmem_cache  *tcp_iphc_cache;
318
319extern void	tcp_timermp_free(tcp_t *);
320extern mblk_t	*tcp_timermp_alloc(int);
321
322static int	ipcl_tcpconn_constructor(void *, void *, int);
323static void	ipcl_tcpconn_destructor(void *, void *);
324
325static int conn_g_index;
326connf_t	*ipcl_globalhash_fanout;
327
328#ifdef	IPCL_DEBUG
329#define	INET_NTOA_BUFSIZE	18
330
/*
 * Format the four bytes of 'in', in memory order, as a dotted-decimal
 * string in the caller-supplied buffer 'b' and return 'b'.  The buffer
 * must be able to hold the longest result, "255.255.255.255" plus the
 * terminating NUL.
 */
static char *
inet_ntoa_r(uint32_t in, char *b)
{
	const unsigned char	*octet = (const unsigned char *)&in;

	(void) sprintf(b, "%d.%d.%d.%d",
	    octet[0], octet[1], octet[2], octet[3]);
	return (b);
}
340#endif
341
342/*
343 * ipclassifier intialization routine, sets up hash tables and
344 * conn caches.
345 */
346void
347ipcl_init(void)
348{
349	int i;
350	int sizes[] = P2Ps();
351
352	ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache",
353	    sizeof (conn_t), CACHE_ALIGN_SIZE,
354	    NULL, NULL, NULL, NULL, NULL, 0);
355
356	ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache",
357	    sizeof (itc_t), CACHE_ALIGN_SIZE,
358	    ipcl_tcpconn_constructor, ipcl_tcpconn_destructor,
359	    NULL, NULL, NULL, 0);
360
361	/*
362	 * Calculate size of conn fanout table.
363	 */
364	if (ipcl_conn_hash_size != 0) {
365		ipcl_conn_fanout_size = ipcl_conn_hash_size;
366	} else if (tcp_conn_hash_size != 0) {
367		ipcl_conn_fanout_size = tcp_conn_hash_size;
368	} else {
369		extern pgcnt_t freemem;
370
371		ipcl_conn_fanout_size =
372		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
373
374		if (ipcl_conn_fanout_size > ipcl_conn_hash_maxsize)
375			ipcl_conn_fanout_size = ipcl_conn_hash_maxsize;
376	}
377
378	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
379		if (sizes[i] >= ipcl_conn_fanout_size) {
380			break;
381		}
382	}
383	if ((ipcl_conn_fanout_size = sizes[i]) == 0) {
384		/* Out of range, use the 2^16 value */
385		ipcl_conn_fanout_size = sizes[16];
386	}
387	ipcl_conn_fanout = (connf_t *)kmem_zalloc(ipcl_conn_fanout_size *
388	    sizeof (*ipcl_conn_fanout), KM_SLEEP);
389
390	for (i = 0; i < ipcl_conn_fanout_size; i++) {
391		mutex_init(&ipcl_conn_fanout[i].connf_lock, NULL,
392		    MUTEX_DEFAULT, NULL);
393	}
394
395	ipcl_bind_fanout = (connf_t *)kmem_zalloc(ipcl_bind_fanout_size *
396	    sizeof (*ipcl_bind_fanout), KM_SLEEP);
397
398	for (i = 0; i < ipcl_bind_fanout_size; i++) {
399		mutex_init(&ipcl_bind_fanout[i].connf_lock, NULL,
400		    MUTEX_DEFAULT, NULL);
401	}
402
403	for (i = 0; i < A_CNT(ipcl_proto_fanout); i++) {
404		mutex_init(&ipcl_proto_fanout[i].connf_lock, NULL,
405		    MUTEX_DEFAULT, NULL);
406	}
407	for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++) {
408		mutex_init(&ipcl_proto_fanout_v6[i].connf_lock, NULL,
409		    MUTEX_DEFAULT, NULL);
410	}
411
412	mutex_init(&rts_clients.connf_lock, NULL, MUTEX_DEFAULT, NULL);
413
414	ipcl_udp_fanout = (connf_t *)kmem_zalloc(ipcl_udp_fanout_size *
415	    sizeof (*ipcl_udp_fanout), KM_SLEEP);
416
417	for (i = 0; i < ipcl_udp_fanout_size; i++) {
418		mutex_init(&ipcl_udp_fanout[i].connf_lock, NULL,
419		    MUTEX_DEFAULT, NULL);
420	}
421
422	ipcl_raw_fanout = (connf_t *)kmem_zalloc(ipcl_raw_fanout_size *
423	    sizeof (*ipcl_raw_fanout), KM_SLEEP);
424
425	for (i = 0; i < ipcl_raw_fanout_size; i++) {
426		mutex_init(&ipcl_raw_fanout[i].connf_lock, NULL,
427		    MUTEX_DEFAULT, NULL);
428	}
429
430	ipcl_globalhash_fanout = (connf_t *)kmem_zalloc(sizeof (connf_t) *
431	    CONN_G_HASH_SIZE, KM_SLEEP);
432
433	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
434		mutex_init(&ipcl_globalhash_fanout[i].connf_lock, NULL,
435		    MUTEX_DEFAULT, NULL);
436	}
437}
438
439void
440ipcl_destroy(void)
441{
442	int i;
443	kmem_cache_destroy(ipcl_conn_cache);
444	kmem_cache_destroy(ipcl_tcpconn_cache);
445	for (i = 0; i < ipcl_conn_fanout_size; i++)
446		mutex_destroy(&ipcl_conn_fanout[i].connf_lock);
447	kmem_free(ipcl_conn_fanout, ipcl_conn_fanout_size *
448	    sizeof (*ipcl_conn_fanout));
449	for (i = 0; i < ipcl_bind_fanout_size; i++)
450		mutex_destroy(&ipcl_bind_fanout[i].connf_lock);
451	kmem_free(ipcl_bind_fanout, ipcl_bind_fanout_size *
452	    sizeof (*ipcl_bind_fanout));
453
454	for (i = 0; i < A_CNT(ipcl_proto_fanout); i++)
455		mutex_destroy(&ipcl_proto_fanout[i].connf_lock);
456	for (i = 0; i < A_CNT(ipcl_proto_fanout_v6); i++)
457		mutex_destroy(&ipcl_proto_fanout_v6[i].connf_lock);
458
459	for (i = 0; i < ipcl_udp_fanout_size; i++)
460		mutex_destroy(&ipcl_udp_fanout[i].connf_lock);
461	kmem_free(ipcl_udp_fanout, ipcl_udp_fanout_size *
462	    sizeof (*ipcl_udp_fanout));
463
464	for (i = 0; i < ipcl_raw_fanout_size; i++)
465		mutex_destroy(&ipcl_raw_fanout[i].connf_lock);
466	kmem_free(ipcl_raw_fanout, ipcl_raw_fanout_size *
467	    sizeof (*ipcl_raw_fanout));
468
469	kmem_free(ipcl_globalhash_fanout, sizeof (connf_t) * CONN_G_HASH_SIZE);
470	mutex_destroy(&rts_clients.connf_lock);
471}
472
473/*
474 * conn creation routine. initialize the conn, sets the reference
475 * and inserts it in the global hash table.
476 */
477conn_t *
478ipcl_conn_create(uint32_t type, int sleep)
479{
480	itc_t	*itc;
481	conn_t	*connp;
482
483	switch (type) {
484	case IPCL_TCPCONN:
485		if ((itc = kmem_cache_alloc(ipcl_tcpconn_cache,
486		    sleep)) == NULL)
487			return (NULL);
488		connp = &itc->itc_conn;
489		connp->conn_ref = 1;
490		IPCL_DEBUG_LVL(1,
491		    ("ipcl_conn_create: connp = %p tcp (%p)",
492		    (void *)connp, (void *)connp->conn_tcp));
493		ipcl_globalhash_insert(connp);
494		break;
495	case IPCL_SCTPCONN:
496		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
497			return (NULL);
498		connp->conn_flags = IPCL_SCTPCONN;
499		break;
500	case IPCL_IPCCONN:
501		connp = kmem_cache_alloc(ipcl_conn_cache, sleep);
502		if (connp == NULL)
503			return (NULL);
504		bzero(connp, sizeof (conn_t));
505		mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
506		cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
507		connp->conn_flags = IPCL_IPCCONN;
508		connp->conn_ref = 1;
509		IPCL_DEBUG_LVL(1,
510		    ("ipcl_conn_create: connp = %p\n", (void *)connp));
511		ipcl_globalhash_insert(connp);
512		break;
513	default:
514		connp = NULL;
515		ASSERT(0);
516	}
517
518	return (connp);
519}
520
521void
522ipcl_conn_destroy(conn_t *connp)
523{
524	mblk_t	*mp;
525
526	ASSERT(!MUTEX_HELD(&connp->conn_lock));
527	ASSERT(connp->conn_ref == 0);
528	ASSERT(connp->conn_ire_cache == NULL);
529
530	ipcl_globalhash_remove(connp);
531
532	cv_destroy(&connp->conn_cv);
533	if (connp->conn_flags & IPCL_TCPCONN) {
534		tcp_t	*tcp = connp->conn_tcp;
535
536		mutex_destroy(&connp->conn_lock);
537		ASSERT(connp->conn_tcp != NULL);
538		tcp_free(tcp);
539		mp = tcp->tcp_timercache;
540
541		if (tcp->tcp_sack_info != NULL) {
542			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
543			kmem_cache_free(tcp_sack_info_cache,
544			    tcp->tcp_sack_info);
545		}
546		if (tcp->tcp_iphc != NULL) {
547			if (tcp->tcp_hdr_grown) {
548				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
549			} else {
550				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
551				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
552			}
553			tcp->tcp_iphc_len = 0;
554		}
555		ASSERT(tcp->tcp_iphc_len == 0);
556
557		if (connp->conn_latch != NULL)
558			IPLATCH_REFRELE(connp->conn_latch);
559		if (connp->conn_policy != NULL)
560			IPPH_REFRELE(connp->conn_policy);
561		bzero(connp, sizeof (itc_t));
562
563		tcp->tcp_timercache = mp;
564		connp->conn_tcp = tcp;
565		connp->conn_flags = IPCL_TCPCONN;
566		connp->conn_ulp = IPPROTO_TCP;
567		tcp->tcp_connp = connp;
568		kmem_cache_free(ipcl_tcpconn_cache, connp);
569	} else if (connp->conn_flags & IPCL_SCTPCONN) {
570		sctp_free(connp);
571	} else {
572		ASSERT(connp->conn_udp == NULL);
573		mutex_destroy(&connp->conn_lock);
574		kmem_cache_free(ipcl_conn_cache, connp);
575	}
576}
577
578/*
579 * Running in cluster mode - deregister listener information
580 */
581
582static void
583ipcl_conn_unlisten(conn_t *connp)
584{
585	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
586	ASSERT(connp->conn_lport != 0);
587
588	if (cl_inet_unlisten != NULL) {
589		sa_family_t	addr_family;
590		uint8_t		*laddrp;
591
592		if (connp->conn_pkt_isv6) {
593			addr_family = AF_INET6;
594			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
595		} else {
596			addr_family = AF_INET;
597			laddrp = (uint8_t *)&connp->conn_bound_source;
598		}
599		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
600		    connp->conn_lport);
601	}
602	connp->conn_flags &= ~IPCL_CL_LISTENER;
603}
604
/*
 * Unlink 'connp' from whatever fanout bucket it is on (conn_fanout); a
 * conn that is on no bucket is left untouched.  Must be called without
 * conn_lock held; the bucket lock is taken here.
 *
 * The flag naming the table the conn was in is deliberately left set and
 * IPCL_REMOVED is added instead, so that when debugging one can still see
 * which hash table the connection used to be in.  A cluster listener is
 * deregistered, and the reference held on behalf of the table is dropped.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*hr_fanout = (connp)->conn_fanout;			\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (hr_fanout != NULL) {					\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&hr_fanout->connf_lock);			\
		if ((connp)->conn_prev == NULL)				\
			hr_fanout->connf_head = (connp)->conn_next;	\
		else							\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&hr_fanout->connf_lock);			\
	}								\
}
635
636void
637ipcl_hash_remove(conn_t *connp)
638{
639	IPCL_HASH_REMOVE(connp);
640}
641
642/*
643 * The whole purpose of this function is allow removal of
644 * a conn_t from the connected hash for timewait reclaim.
645 * This is essentially a TW reclaim fastpath where timewait
646 * collector checks under fanout lock (so no one else can
647 * get access to the conn_t) that refcnt is 2 i.e. one for
648 * TCP and one for the classifier hash list. If ref count
649 * is indeed 2, we can just remove the conn under lock and
650 * avoid cleaning up the conn under squeue. This gives us
651 * improved performance.
652 */
653void
654ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
655{
656	ASSERT(MUTEX_HELD(&connfp->connf_lock));
657	ASSERT(MUTEX_HELD(&connp->conn_lock));
658	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
659
660	if ((connp)->conn_next != NULL) {
661		(connp)->conn_next->conn_prev =
662			(connp)->conn_prev;
663	}
664	if ((connp)->conn_prev != NULL) {
665		(connp)->conn_prev->conn_next =
666			(connp)->conn_next;
667	} else {
668		connfp->connf_head = (connp)->conn_next;
669	}
670	(connp)->conn_fanout = NULL;
671	(connp)->conn_next = NULL;
672	(connp)->conn_prev = NULL;
673	(connp)->conn_flags |= IPCL_REMOVED;
674	ASSERT((connp)->conn_ref == 2);
675	(connp)->conn_ref--;
676}
677
/*
 * Push 'connp' onto the head of bucket 'connfp'.  The caller already
 * holds the bucket lock and 'connp' must not be on any bucket.  The conn
 * is marked IPCL_CONNECTED (clearing IPCL_REMOVED) and gains a reference
 * for the table.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	(connp)->conn_fanout = (connfp);				\
	if ((connfp)->connf_head != NULL) {				\
		(connp)->conn_next = (connfp)->connf_head;		\
		(connfp)->connf_head->conn_prev = (connp);		\
	}								\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = IPCL_CONNECTED |				\
	    ((connp)->conn_flags & ~IPCL_REMOVED);			\
	CONN_INC_REF(connp);						\
}
692
/*
 * Move 'connp' to the head of bucket 'connfp' as a connected entry: it
 * is first taken off whatever bucket it is on, then inserted under the
 * lock of 'connfp'.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED((connfp), (connp));		\
	mutex_exit(&(connfp)->connf_lock);				\
}
701
/*
 * Insert 'connp' into bucket 'connfp' as a bound (specific local address)
 * entry.  The conn is first removed from any bucket it is on.  It is then
 * linked in immediately before the first entry whose conn_srcv6 satisfies
 * _IPCL_V4_MATCH_ANY(), or at the tail when there is no such entry, so
 * entries bound to a specific address stay ahead of those entries.  The
 * conn is marked IPCL_BOUND (clearing IPCL_REMOVED) and gains a reference
 * for the table.
 *
 * Every use of a macro argument is parenthesized, including 'connfp' in
 * the debug message.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
730
/*
 * Insert 'connp' into bucket 'connfp' as a wildcard entry.  The conn is
 * first removed from any bucket it is on.  It is normally appended at the
 * tail of the list.  The one exception: when its conn_srcv6 is a
 * v4-mapped address, it is placed in front of the first entry in the same
 * zone whose conn_srcv6 is the IPv6 unspecified address.  The conn is
 * marked IPCL_BOUND (clearing IPCL_REMOVED) and gains a reference for the
 * table.
 *
 * 'list' always points at the link that will be made to point to 'connp'
 * (the bucket head or the conn_next of the previous entry), and 'prev' is
 * that previous entry, or NULL at the head.
 *
 * Every use of a macro argument is parenthesized, including 'connp' in
 * the zone comparison.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
		    (connp)->conn_zoneid == next->conn_zoneid) {	\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
762
763void
764ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
765{
766	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
767}
768
769void
770ipcl_proto_insert(conn_t *connp, uint8_t protocol)
771{
772	connf_t	*connfp;
773
774	ASSERT(connp != NULL);
775
776	connp->conn_ulp = protocol;
777
778	/* Insert it in the protocol hash */
779	connfp = &ipcl_proto_fanout[protocol];
780	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
781}
782
783void
784ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
785{
786	connf_t	*connfp;
787
788	ASSERT(connp != NULL);
789
790	connp->conn_ulp = protocol;
791
792	/* Insert it in the Bind Hash */
793	connfp = &ipcl_proto_fanout_v6[protocol];
794	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
795}
796
797/*
798 * This function is used only for inserting SCTP raw socket now.
799 * This may change later.
800 *
801 * Note that only one raw socket can be bound to a port.  The param
802 * lport is in network byte order.
803 */
804static int
805ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
806{
807	connf_t	*connfp;
808	conn_t	*oconnp;
809
810	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))];
811
812	/* Check for existing raw socket already bound to the port. */
813	mutex_enter(&connfp->connf_lock);
814	for (oconnp = connfp->connf_head; oconnp != NULL;
815	    oconnp = oconnp->conn_next) {
816		if (oconnp->conn_lport == lport &&
817		    oconnp->conn_zoneid == connp->conn_zoneid &&
818		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
819		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
820		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
821		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
822		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
823		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
824		    &connp->conn_srcv6))) {
825			break;
826		}
827	}
828	mutex_exit(&connfp->connf_lock);
829	if (oconnp != NULL)
830		return (EADDRNOTAVAIL);
831
832	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
833	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
834		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
835		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
836			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
837		} else {
838			IPCL_HASH_INSERT_BOUND(connfp, connp);
839		}
840	} else {
841		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
842	}
843	return (0);
844}
845
846/*
847 * (v4, v6) bind hash insertion routines
848 */
849int
850ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
851{
852	connf_t	*connfp;
853#ifdef	IPCL_DEBUG
854	char	buf[INET_NTOA_BUFSIZE];
855#endif
856	int	ret = 0;
857
858	ASSERT(connp);
859
860	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
861	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
862
863	connp->conn_ulp = protocol;
864	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
865	connp->conn_lport = lport;
866
867	switch (protocol) {
868	case IPPROTO_UDP:
869	default:
870		if (protocol == IPPROTO_UDP) {
871			IPCL_DEBUG_LVL(64,
872			    ("ipcl_bind_insert: connp %p - udp\n",
873			    (void *)connp));
874			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
875		} else {
876			IPCL_DEBUG_LVL(64,
877			    ("ipcl_bind_insert: connp %p - protocol\n",
878			    (void *)connp));
879			connfp = &ipcl_proto_fanout[protocol];
880		}
881
882		if (connp->conn_rem != INADDR_ANY) {
883			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
884		} else if (connp->conn_src != INADDR_ANY) {
885			IPCL_HASH_INSERT_BOUND(connfp, connp);
886		} else {
887			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
888		}
889		break;
890
891	case IPPROTO_TCP:
892
893		/* Insert it in the Bind Hash */
894		connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
895		if (connp->conn_src != INADDR_ANY) {
896			IPCL_HASH_INSERT_BOUND(connfp, connp);
897		} else {
898			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
899		}
900		if (cl_inet_listen != NULL) {
901			ASSERT(!connp->conn_pkt_isv6);
902			connp->conn_flags |= IPCL_CL_LISTENER;
903			(*cl_inet_listen)(IPPROTO_TCP, AF_INET,
904			    (uint8_t *)&connp->conn_bound_source, lport);
905		}
906		break;
907
908	case IPPROTO_SCTP:
909		ret = ipcl_sctp_hash_insert(connp, lport);
910		break;
911	}
912
913	return (ret);
914}
915
916int
917ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
918    uint16_t lport)
919{
920	connf_t	*connfp;
921	int	ret = 0;
922
923	ASSERT(connp);
924
925	connp->conn_ulp = protocol;
926	connp->conn_srcv6 = *src;
927	connp->conn_lport = lport;
928
929	switch (protocol) {
930	case IPPROTO_UDP:
931	default:
932		if (protocol == IPPROTO_UDP) {
933			IPCL_DEBUG_LVL(128,
934			    ("ipcl_bind_insert_v6: connp %p - udp\n",
935			    (void *)connp));
936			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
937		} else {
938			IPCL_DEBUG_LVL(128,
939			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
940			    (void *)connp));
941			connfp = &ipcl_proto_fanout_v6[protocol];
942		}
943
944		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
945			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
946		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
947			IPCL_HASH_INSERT_BOUND(connfp, connp);
948		} else {
949			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
950		}
951		break;
952
953	case IPPROTO_TCP:
954		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
955
956		/* Insert it in the Bind Hash */
957		connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
958		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
959			IPCL_HASH_INSERT_BOUND(connfp, connp);
960		} else {
961			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
962		}
963		if (cl_inet_listen != NULL) {
964			sa_family_t	addr_family;
965			uint8_t		*laddrp;
966
967			if (connp->conn_pkt_isv6) {
968				addr_family = AF_INET6;
969				laddrp =
970				    (uint8_t *)&connp->conn_bound_source_v6;
971			} else {
972				addr_family = AF_INET;
973				laddrp = (uint8_t *)&connp->conn_bound_source;
974			}
975			connp->conn_flags |= IPCL_CL_LISTENER;
976			(*cl_inet_listen)(IPPROTO_TCP, addr_family, laddrp,
977			    lport);
978		}
979		break;
980
981	case IPPROTO_SCTP:
982		ret = ipcl_sctp_hash_insert(connp, lport);
983		break;
984	}
985
986	return (ret);
987}
988
989/*
990 * ipcl_conn_hash insertion routines.
991 */
992int
993ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
994    ipaddr_t rem, uint32_t ports)
995{
996	connf_t		*connfp;
997	uint16_t	*up;
998	conn_t		*tconnp;
999#ifdef	IPCL_DEBUG
1000	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
1001#endif
1002	in_port_t	lport;
1003	int		ret = 0;
1004
1005	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
1006	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
1007	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
1008	    ports, protocol));
1009
1010	switch (protocol) {
1011	case IPPROTO_TCP:
1012		if (!(connp->conn_flags & IPCL_EAGER)) {
1013			/*
1014			 * for a eager connection, i.e connections which
1015			 * have just been created, the initialization is
1016			 * already done in ip at conn_creation time, so
1017			 * we can skip the checks here.
1018			 */
1019			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1020		}
1021		connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(connp->conn_rem,
1022		    connp->conn_ports)];
1023		mutex_enter(&connfp->connf_lock);
1024		for (tconnp = connfp->connf_head; tconnp != NULL;
1025		    tconnp = tconnp->conn_next) {
1026			if (IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
1027			    connp->conn_rem, connp->conn_src,
1028			    connp->conn_ports)) {
1029
1030				/* Already have a conn. bail out */
1031				mutex_exit(&connfp->connf_lock);
1032				return (EADDRINUSE);
1033			}
1034		}
1035		if (connp->conn_fanout != NULL) {
1036			/*
1037			 * Probably a XTI/TLI application trying to do a
1038			 * rebind. Let it happen.
1039			 */
1040			mutex_exit(&connfp->connf_lock);
1041			IPCL_HASH_REMOVE(connp);
1042			mutex_enter(&connfp->connf_lock);
1043		}
1044		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1045		mutex_exit(&connfp->connf_lock);
1046		break;
1047
1048	case IPPROTO_SCTP:
1049		/*
1050		 * The raw socket may have already been bound, remove it
1051		 * from the hash first.
1052		 */
1053		IPCL_HASH_REMOVE(connp);
1054		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1055		ret = ipcl_sctp_hash_insert(connp, lport);
1056		break;
1057
1058	case IPPROTO_UDP:
1059	default:
1060		up = (uint16_t *)&ports;
1061		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
1062		if (protocol == IPPROTO_UDP) {
1063			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])];
1064		} else {
1065			connfp = &ipcl_proto_fanout[protocol];
1066		}
1067
1068		if (connp->conn_rem != INADDR_ANY) {
1069			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1070		} else if (connp->conn_src != INADDR_ANY) {
1071			IPCL_HASH_INSERT_BOUND(connfp, connp);
1072		} else {
1073			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1074		}
1075		break;
1076	}
1077
1078	return (ret);
1079}
1080
1081int
1082ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
1083    const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
1084{
1085	connf_t		*connfp;
1086	uint16_t	*up;
1087	conn_t		*tconnp;
1088	in_port_t	lport;
1089	int		ret = 0;
1090
1091	switch (protocol) {
1092	case IPPROTO_TCP:
1093		/* Just need to insert a conn struct */
1094		if (!(connp->conn_flags & IPCL_EAGER)) {
1095			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1096		}
1097		connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(connp->conn_remv6,
1098		    connp->conn_ports)];
1099		mutex_enter(&connfp->connf_lock);
1100		for (tconnp = connfp->connf_head; tconnp != NULL;
1101		    tconnp = tconnp->conn_next) {
1102			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
1103			    connp->conn_remv6, connp->conn_srcv6,
1104			    connp->conn_ports) &&
1105			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
1106			    tconnp->conn_tcp->tcp_bound_if == ifindex)) {
1107				/* Already have a conn. bail out */
1108				mutex_exit(&connfp->connf_lock);
1109				return (EADDRINUSE);
1110			}
1111		}
1112		if (connp->conn_fanout != NULL) {
1113			/*
1114			 * Probably a XTI/TLI application trying to do a
1115			 * rebind. Let it happen.
1116			 */
1117			mutex_exit(&connfp->connf_lock);
1118			IPCL_HASH_REMOVE(connp);
1119			mutex_enter(&connfp->connf_lock);
1120		}
1121		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1122		mutex_exit(&connfp->connf_lock);
1123		break;
1124
1125	case IPPROTO_SCTP:
1126		IPCL_HASH_REMOVE(connp);
1127		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
1128		ret = ipcl_sctp_hash_insert(connp, lport);
1129		break;
1130
1131	case IPPROTO_UDP:
1132	default:
1133		up = (uint16_t *)&ports;
1134		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
1135		if (protocol == IPPROTO_UDP) {
1136			connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(up[1])];
1137		} else {
1138			connfp = &ipcl_proto_fanout_v6[protocol];
1139		}
1140
1141		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
1142			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1143		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
1144			IPCL_HASH_INSERT_BOUND(connfp, connp);
1145		} else {
1146			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1147		}
1148		break;
1149	}
1150
1151	return (ret);
1152}
1153
1154/*
1155 * v4 packet classifying function. looks up the fanout table to
1156 * find the conn, the packet belongs to. returns the conn with
1157 * the reference held, null otherwise.
1158 */
1159conn_t *
1160ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid)
1161{
1162	ipha_t	*ipha;
1163	connf_t	*connfp, *bind_connfp;
1164	uint16_t lport;
1165	uint16_t fport;
1166	uint32_t ports;
1167	conn_t	*connp;
1168	uint16_t  *up;
1169
1170	ipha = (ipha_t *)mp->b_rptr;
1171	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1172
1173	switch (protocol) {
1174	case IPPROTO_TCP:
1175		ports = *(uint32_t *)up;
1176		connfp =
1177		    &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, ports)];
1178		mutex_enter(&connfp->connf_lock);
1179		for (connp = connfp->connf_head; connp != NULL;
1180		    connp = connp->conn_next) {
1181			if (IPCL_CONN_MATCH(connp, protocol,
1182			    ipha->ipha_src, ipha->ipha_dst, ports))
1183				break;
1184		}
1185
1186		if (connp != NULL) {
1187			CONN_INC_REF(connp);
1188			mutex_exit(&connfp->connf_lock);
1189			return (connp);
1190		}
1191
1192		mutex_exit(&connfp->connf_lock);
1193
1194		lport = up[1];
1195		bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1196		mutex_enter(&bind_connfp->connf_lock);
1197		for (connp = bind_connfp->connf_head; connp != NULL;
1198		    connp = connp->conn_next) {
1199			if (IPCL_BIND_MATCH(connp, protocol,
1200			    ipha->ipha_dst, lport) &&
1201			    connp->conn_zoneid == zoneid)
1202				break;
1203		}
1204
1205		if (connp != NULL) {
1206			/* Have a listner at least */
1207			CONN_INC_REF(connp);
1208			mutex_exit(&bind_connfp->connf_lock);
1209			return (connp);
1210		}
1211
1212		mutex_exit(&bind_connfp->connf_lock);
1213
1214		IPCL_DEBUG_LVL(512,
1215		    ("ipcl_classify: couldn't classify mp = %p\n",
1216		    (void *)mp));
1217		break;
1218
1219	case IPPROTO_UDP:
1220		lport = up[1];
1221		fport = up[0];
1222		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
1223		connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1224		mutex_enter(&connfp->connf_lock);
1225		for (connp = connfp->connf_head; connp != NULL;
1226		    connp = connp->conn_next) {
1227			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1228			    fport, ipha->ipha_src) &&
1229			    connp->conn_zoneid == zoneid)
1230				break;
1231		}
1232
1233		if (connp != NULL) {
1234			CONN_INC_REF(connp);
1235			mutex_exit(&connfp->connf_lock);
1236			return (connp);
1237		}
1238
1239		/*
1240		 * We shouldn't come here for multicast/broadcast packets
1241		 */
1242		mutex_exit(&connfp->connf_lock);
1243		IPCL_DEBUG_LVL(512,
1244		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
1245		    lport, fport));
1246		break;
1247	}
1248
1249	return (NULL);
1250}
1251
1252conn_t *
1253ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid)
1254{
1255	ip6_t		*ip6h;
1256	connf_t		*connfp, *bind_connfp;
1257	uint16_t	lport;
1258	uint16_t	fport;
1259	tcph_t		*tcph;
1260	uint32_t	ports;
1261	conn_t		*connp;
1262	uint16_t	*up;
1263
1264
1265	ip6h = (ip6_t *)mp->b_rptr;
1266
1267	switch (protocol) {
1268	case IPPROTO_TCP:
1269		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
1270		up = (uint16_t *)tcph->th_lport;
1271		ports = *(uint32_t *)up;
1272
1273		connfp =
1274		    &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, ports)];
1275		mutex_enter(&connfp->connf_lock);
1276		for (connp = connfp->connf_head; connp != NULL;
1277		    connp = connp->conn_next) {
1278			if (IPCL_CONN_MATCH_V6(connp, protocol,
1279			    ip6h->ip6_src, ip6h->ip6_dst, ports))
1280				break;
1281		}
1282
1283		if (connp != NULL) {
1284			CONN_INC_REF(connp);
1285			mutex_exit(&connfp->connf_lock);
1286			return (connp);
1287		}
1288
1289		mutex_exit(&connfp->connf_lock);
1290
1291		lport = up[1];
1292		bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1293		mutex_enter(&bind_connfp->connf_lock);
1294		for (connp = bind_connfp->connf_head; connp != NULL;
1295		    connp = connp->conn_next) {
1296			if (IPCL_BIND_MATCH_V6(connp, protocol,
1297			    ip6h->ip6_dst, lport) &&
1298			    connp->conn_zoneid == zoneid)
1299				break;
1300		}
1301
1302		if (connp != NULL) {
1303			/* Have a listner at least */
1304			CONN_INC_REF(connp);
1305			mutex_exit(&bind_connfp->connf_lock);
1306			IPCL_DEBUG_LVL(512,
1307			    ("ipcl_classify_v6: found listner "
1308			    "connp = %p\n", (void *)connp));
1309
1310			return (connp);
1311		}
1312
1313		mutex_exit(&bind_connfp->connf_lock);
1314
1315		IPCL_DEBUG_LVL(512,
1316		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
1317		    (void *)mp));
1318		break;
1319
1320	case IPPROTO_UDP:
1321		up = (uint16_t *)&mp->b_rptr[hdr_len];
1322		lport = up[1];
1323		fport = up[0];
1324		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
1325		    fport));
1326		connfp = &ipcl_udp_fanout[IPCL_UDP_HASH(lport)];
1327		mutex_enter(&connfp->connf_lock);
1328		for (connp = connfp->connf_head; connp != NULL;
1329		    connp = connp->conn_next) {
1330			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1331			    fport, ip6h->ip6_src) &&
1332			    connp->conn_zoneid == zoneid)
1333				break;
1334		}
1335
1336		if (connp != NULL) {
1337			CONN_INC_REF(connp);
1338			mutex_exit(&connfp->connf_lock);
1339			return (connp);
1340		}
1341
1342		/*
1343		 * We shouldn't come here for multicast/broadcast packets
1344		 */
1345		mutex_exit(&connfp->connf_lock);
1346		IPCL_DEBUG_LVL(512,
1347		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
1348		    lport, fport));
1349		break;
1350	}
1351
1352
1353	return (NULL);
1354}
1355
1356/*
1357 * wrapper around ipcl_classify_(v4,v6) routines.
1358 */
1359conn_t *
1360ipcl_classify(mblk_t *mp, zoneid_t zoneid)
1361{
1362	uint16_t	hdr_len;
1363	ipha_t		*ipha;
1364	uint8_t		*nexthdrp;
1365
1366	if (MBLKL(mp) < sizeof (ipha_t))
1367		return (NULL);
1368
1369	switch (IPH_HDR_VERSION(mp->b_rptr)) {
1370	case IPV4_VERSION:
1371		ipha = (ipha_t *)mp->b_rptr;
1372		hdr_len = IPH_HDR_LENGTH(ipha);
1373		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
1374		    zoneid));
1375	case IPV6_VERSION:
1376		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
1377		    &hdr_len, &nexthdrp))
1378			return (NULL);
1379
1380		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid));
1381	}
1382
1383	return (NULL);
1384}
1385
1386conn_t *
1387ipcl_classify_raw(uint8_t protocol, zoneid_t zoneid, uint32_t ports,
1388    ipha_t *hdr)
1389{
1390	struct connf_s	*connfp;
1391	conn_t		*connp;
1392	in_port_t	lport;
1393	int		af;
1394
1395	lport = ((uint16_t *)&ports)[1];
1396	af = IPH_HDR_VERSION(hdr);
1397	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport))];
1398
1399	mutex_enter(&connfp->connf_lock);
1400	for (connp = connfp->connf_head; connp != NULL;
1401	    connp = connp->conn_next) {
1402		/* We don't allow v4 fallback for v6 raw socket. */
1403		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
1404		    IPV6_VERSION)) || (connp->conn_zoneid != zoneid)) {
1405			continue;
1406		}
1407		if (connp->conn_fully_bound) {
1408			if (af == IPV4_VERSION) {
1409				if (IPCL_CONN_MATCH(connp, protocol,
1410				    hdr->ipha_src, hdr->ipha_dst, ports)) {
1411					break;
1412				}
1413			} else {
1414				if (IPCL_CONN_MATCH_V6(connp, protocol,
1415				    ((ip6_t *)hdr)->ip6_src,
1416				    ((ip6_t *)hdr)->ip6_dst, ports)) {
1417					break;
1418				}
1419			}
1420		} else {
1421			if (af == IPV4_VERSION) {
1422				if (IPCL_BIND_MATCH(connp, protocol,
1423				    hdr->ipha_dst, lport)) {
1424					break;
1425				}
1426			} else {
1427				if (IPCL_BIND_MATCH_V6(connp, protocol,
1428				    ((ip6_t *)hdr)->ip6_dst, lport)) {
1429					break;
1430				}
1431			}
1432		}
1433	}
1434
1435	if (connp != NULL)
1436		goto found;
1437	mutex_exit(&connfp->connf_lock);
1438
1439	/* Try to look for a wildcard match. */
1440	connfp = &ipcl_raw_fanout[IPCL_RAW_HASH(0)];
1441	mutex_enter(&connfp->connf_lock);
1442	for (connp = connfp->connf_head; connp != NULL;
1443	    connp = connp->conn_next) {
1444		/* We don't allow v4 fallback for v6 raw socket. */
1445		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
1446		    IPV6_VERSION)) || (connp->conn_zoneid != zoneid)) {
1447			continue;
1448		}
1449		if (af == IPV4_VERSION) {
1450			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
1451				break;
1452		} else {
1453			if (IPCL_RAW_MATCH_V6(connp, protocol,
1454			    ((ip6_t *)hdr)->ip6_dst)) {
1455				break;
1456			}
1457		}
1458	}
1459
1460	if (connp != NULL)
1461		goto found;
1462
1463	mutex_exit(&connfp->connf_lock);
1464	return (NULL);
1465
1466found:
1467	ASSERT(connp != NULL);
1468	CONN_INC_REF(connp);
1469	mutex_exit(&connfp->connf_lock);
1470	return (connp);
1471}
1472
1473/* ARGSUSED */
1474static int
1475ipcl_tcpconn_constructor(void *buf, void *cdrarg, int kmflags)
1476{
1477	itc_t	*itc = (itc_t *)buf;
1478	conn_t 	*connp = &itc->itc_conn;
1479	tcp_t	*tcp = &itc->itc_tcp;
1480	bzero(itc, sizeof (itc_t));
1481	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
1482	connp->conn_tcp = tcp;
1483	connp->conn_flags = IPCL_TCPCONN;
1484	connp->conn_ulp = IPPROTO_TCP;
1485	tcp->tcp_connp = connp;
1486	return (0);
1487}
1488
1489/* ARGSUSED */
1490static void
1491ipcl_tcpconn_destructor(void *buf, void *cdrarg)
1492{
1493	tcp_timermp_free(((conn_t *)buf)->conn_tcp);
1494}
1495
1496/*
1497 * All conns are inserted in a global multi-list for the benefit of
1498 * walkers. The walk is guaranteed to walk all open conns at the time
1499 * of the start of the walk exactly once. This property is needed to
1500 * achieve some cleanups during unplumb of interfaces. This is achieved
1501 * as follows.
1502 *
1503 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
1504 * call the insert and delete functions below at creation and deletion
1505 * time respectively. The conn never moves or changes its position in this
1506 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
1507 * won't increase due to walkers, once the conn deletion has started. Note
1508 * that we can't remove the conn from the global list and then wait for
1509 * the refcnt to drop to zero, since walkers would then see a truncated
1510 * list. CONN_INCIPIENT ensures that walkers don't start looking at
1511 * conns until ip_open is ready to make them globally visible.
1512 * The global round robin multi-list locks are held only to get the
1513 * next member/insertion/deletion and contention should be negligible
1514 * if the multi-list is much greater than the number of cpus.
1515 */
1516void
1517ipcl_globalhash_insert(conn_t *connp)
1518{
1519	int	index;
1520
1521	/*
1522	 * No need for atomic here. Approximate even distribution
1523	 * in the global lists is sufficient.
1524	 */
1525	conn_g_index++;
1526	index = conn_g_index & (CONN_G_HASH_SIZE - 1);
1527
1528	connp->conn_g_prev = NULL;
1529	/*
1530	 * Mark as INCIPIENT, so that walkers will ignore this
1531	 * for now, till ip_open is ready to make it visible globally.
1532	 */
1533	connp->conn_state_flags |= CONN_INCIPIENT;
1534
1535	/* Insert at the head of the list */
1536	mutex_enter(&ipcl_globalhash_fanout[index].connf_lock);
1537	connp->conn_g_next = ipcl_globalhash_fanout[index].connf_head;
1538	if (connp->conn_g_next != NULL)
1539		connp->conn_g_next->conn_g_prev = connp;
1540	ipcl_globalhash_fanout[index].connf_head = connp;
1541
1542	/* The fanout bucket this conn points to */
1543	connp->conn_g_fanout = &ipcl_globalhash_fanout[index];
1544
1545	mutex_exit(&ipcl_globalhash_fanout[index].connf_lock);
1546}
1547
1548void
1549ipcl_globalhash_remove(conn_t *connp)
1550{
1551	/*
1552	 * We were never inserted in the global multi list.
1553	 * IPCL_NONE variety is never inserted in the global multilist
1554	 * since it is presumed to not need any cleanup and is transient.
1555	 */
1556	if (connp->conn_g_fanout == NULL)
1557		return;
1558
1559	mutex_enter(&connp->conn_g_fanout->connf_lock);
1560	if (connp->conn_g_prev != NULL)
1561		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
1562	else
1563		connp->conn_g_fanout->connf_head = connp->conn_g_next;
1564	if (connp->conn_g_next != NULL)
1565		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
1566	mutex_exit(&connp->conn_g_fanout->connf_lock);
1567
1568	/* Better to stumble on a null pointer than to corrupt memory */
1569	connp->conn_g_next = NULL;
1570	connp->conn_g_prev = NULL;
1571}
1572
1573/*
1574 * Walk the list of all conn_t's in the system, calling the function provided
1575 * with the specified argument for each.
1576 * Applies to both IPv4 and IPv6.
1577 *
1578 * IPCs may hold pointers to ipif/ill. To guard against stale pointers
1579 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
1580 * unplumbed or removed. New conn_t's that are created while we are walking
1581 * may be missed by this walk, because they are not necessarily inserted
1582 * at the tail of the list. They are new conn_t's and thus don't have any
1583 * stale pointers. The CONN_CLOSING flag ensures that no new reference
1584 * is created to the struct that is going away.
1585 */
1586void
1587ipcl_walk(pfv_t func, void *arg)
1588{
1589	int	i;
1590	conn_t	*connp;
1591	conn_t	*prev_connp;
1592
1593	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
1594		mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
1595		prev_connp = NULL;
1596		connp = ipcl_globalhash_fanout[i].connf_head;
1597		while (connp != NULL) {
1598			mutex_enter(&connp->conn_lock);
1599			if (connp->conn_state_flags &
1600			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
1601				mutex_exit(&connp->conn_lock);
1602				connp = connp->conn_g_next;
1603				continue;
1604			}
1605			CONN_INC_REF_LOCKED(connp);
1606			mutex_exit(&connp->conn_lock);
1607			mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
1608			(*func)(connp, arg);
1609			if (prev_connp != NULL)
1610				CONN_DEC_REF(prev_connp);
1611			mutex_enter(&ipcl_globalhash_fanout[i].connf_lock);
1612			prev_connp = connp;
1613			connp = connp->conn_g_next;
1614		}
1615		mutex_exit(&ipcl_globalhash_fanout[i].connf_lock);
1616		if (prev_connp != NULL)
1617			CONN_DEC_REF(prev_connp);
1618	}
1619}
1620
1621/*
1622 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
1623 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
1624 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
1625 * (peer tcp in at least ESTABLISHED state).
1626 */
1627conn_t *
1628ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph)
1629{
1630	uint32_t ports;
1631	uint16_t *pports = (uint16_t *)&ports;
1632	connf_t	*connfp;
1633	conn_t	*tconnp;
1634	boolean_t zone_chk;
1635
1636	/*
1637	 * If either the source of destination address is loopback, then
1638	 * both endpoints must be in the same Zone.  Otherwise, both of
1639	 * the addresses are system-wide unique (tcp is in ESTABLISHED
1640	 * state) and the endpoints may reside in different Zones.
1641	 */
1642	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
1643	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
1644
1645	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1646	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1647
1648	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];
1649
1650	mutex_enter(&connfp->connf_lock);
1651	for (tconnp = connfp->connf_head; tconnp != NULL;
1652	    tconnp = tconnp->conn_next) {
1653
1654		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
1655		    ipha->ipha_dst, ipha->ipha_src, ports) &&
1656		    tconnp->conn_tcp->tcp_state >= TCPS_ESTABLISHED &&
1657		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
1658
1659			ASSERT(tconnp != connp);
1660			CONN_INC_REF(tconnp);
1661			mutex_exit(&connfp->connf_lock);
1662			return (tconnp);
1663		}
1664	}
1665	mutex_exit(&connfp->connf_lock);
1666	return (NULL);
1667}
1668
1669/*
1670 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
1671 * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
1672 * held; caller must call CONN_DEC_REF.  Only checks for connected entries
1673 * (peer tcp in at least ESTABLISHED state).
1674 */
1675conn_t *
1676ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph)
1677{
1678	uint32_t ports;
1679	uint16_t *pports = (uint16_t *)&ports;
1680	connf_t	*connfp;
1681	conn_t	*tconnp;
1682	boolean_t zone_chk;
1683
1684	/*
1685	 * If either the source of destination address is loopback, then
1686	 * both endpoints must be in the same Zone.  Otherwise, both of
1687	 * the addresses are system-wide unique (tcp is in ESTABLISHED
1688	 * state) and the endpoints may reside in different Zones.  We
1689	 * don't do Zone check for link local address(es) because the
1690	 * current Zone implementation treats each link local address as
1691	 * being unique per system node, i.e. they belong to global Zone.
1692	 */
1693	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
1694	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
1695
1696	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1697	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1698
1699	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];
1700
1701	mutex_enter(&connfp->connf_lock);
1702	for (tconnp = connfp->connf_head; tconnp != NULL;
1703	    tconnp = tconnp->conn_next) {
1704
1705		/* We skip tcp_bound_if check here as this is loopback tcp */
1706		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
1707		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
1708		    tconnp->conn_tcp->tcp_state >= TCPS_ESTABLISHED &&
1709		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
1710
1711			ASSERT(tconnp != connp);
1712			CONN_INC_REF(tconnp);
1713			mutex_exit(&connfp->connf_lock);
1714			return (tconnp);
1715		}
1716	}
1717	mutex_exit(&connfp->connf_lock);
1718	return (NULL);
1719}
1720
1721/*
1722 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
1723 * Returns with conn reference held. Caller must call CONN_DEC_REF.
1724 * Only checks for connected entries i.e. no INADDR_ANY checks.
1725 */
1726conn_t *
1727ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state)
1728{
1729	uint32_t ports;
1730	uint16_t *pports;
1731	connf_t	*connfp;
1732	conn_t	*tconnp;
1733
1734	pports = (uint16_t *)&ports;
1735	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
1736	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
1737
1738	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports)];
1739
1740	mutex_enter(&connfp->connf_lock);
1741	for (tconnp = connfp->connf_head; tconnp != NULL;
1742	    tconnp = tconnp->conn_next) {
1743
1744		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
1745		    ipha->ipha_dst, ipha->ipha_src, ports) &&
1746		    tconnp->conn_tcp->tcp_state >= min_state) {
1747
1748			CONN_INC_REF(tconnp);
1749			mutex_exit(&connfp->connf_lock);
1750			return (tconnp);
1751		}
1752	}
1753	mutex_exit(&connfp->connf_lock);
1754	return (NULL);
1755}
1756
1757/*
1758 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
1759 * Returns with conn reference held. Caller must call CONN_DEC_REF.
1760 * Only checks for connected entries i.e. no INADDR_ANY checks.
1761 * Match on ifindex in addition to addresses.
1762 */
1763conn_t *
1764ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
1765    uint_t ifindex)
1766{
1767	tcp_t	*tcp;
1768	uint32_t ports;
1769	uint16_t *pports;
1770	connf_t	*connfp;
1771	conn_t	*tconnp;
1772
1773	pports = (uint16_t *)&ports;
1774	pports[0] = tcpha->tha_fport;
1775	pports[1] = tcpha->tha_lport;
1776
1777	connfp = &ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports)];
1778
1779	mutex_enter(&connfp->connf_lock);
1780	for (tconnp = connfp->connf_head; tconnp != NULL;
1781	    tconnp = tconnp->conn_next) {
1782
1783		tcp = tconnp->conn_tcp;
1784		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
1785		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
1786		    tcp->tcp_state >= min_state &&
1787		    (tcp->tcp_bound_if == 0 ||
1788		    tcp->tcp_bound_if == ifindex)) {
1789
1790			CONN_INC_REF(tconnp);
1791			mutex_exit(&connfp->connf_lock);
1792			return (tconnp);
1793		}
1794	}
1795	mutex_exit(&connfp->connf_lock);
1796	return (NULL);
1797}
1798
1799/*
1800 * To find a TCP listening connection matching the incoming segment.
1801 */
1802conn_t *
1803ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid)
1804{
1805	connf_t		*bind_connfp;
1806	conn_t		*connp;
1807	tcp_t		*tcp;
1808
1809	/*
1810	 * Avoid false matches for packets sent to an IP destination of
1811	 * all zeros.
1812	 */
1813	if (laddr == 0)
1814		return (NULL);
1815
1816	bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1817	mutex_enter(&bind_connfp->connf_lock);
1818	for (connp = bind_connfp->connf_head; connp != NULL;
1819	    connp = connp->conn_next) {
1820		tcp = connp->conn_tcp;
1821		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
1822		    connp->conn_zoneid == zoneid &&
1823		    (tcp->tcp_listener == NULL)) {
1824			CONN_INC_REF(connp);
1825			mutex_exit(&bind_connfp->connf_lock);
1826			return (connp);
1827		}
1828	}
1829	mutex_exit(&bind_connfp->connf_lock);
1830	return (NULL);
1831}
1832
1833
1834conn_t *
1835ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
1836    zoneid_t zoneid)
1837{
1838	connf_t		*bind_connfp;
1839	conn_t		*connp = NULL;
1840	tcp_t		*tcp;
1841
1842	/*
1843	 * Avoid false matches for packets sent to an IP destination of
1844	 * all zeros.
1845	 */
1846	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
1847		return (NULL);
1848
1849
1850	bind_connfp = &ipcl_bind_fanout[IPCL_BIND_HASH(lport)];
1851	mutex_enter(&bind_connfp->connf_lock);
1852	for (connp = bind_connfp->connf_head; connp != NULL;
1853	    connp = connp->conn_next) {
1854		tcp = connp->conn_tcp;
1855		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
1856		    connp->conn_zoneid == zoneid &&
1857		    (tcp->tcp_bound_if == 0 ||
1858		    tcp->tcp_bound_if == ifindex) &&
1859		    tcp->tcp_listener == NULL) {
1860			CONN_INC_REF(connp);
1861			mutex_exit(&bind_connfp->connf_lock);
1862			return (connp);
1863		}
1864	}
1865	mutex_exit(&bind_connfp->connf_lock);
1866	return (NULL);
1867}
1868
1869/*
1870 * ipcl_get_next_conn
1871 *	get the next entry in the conn global list
1872 *	and put a reference on the next_conn.
1873 *	decrement the reference on the current conn.
1874 *
1875 * This is an iterator based walker function that also provides for
1876 * some selection by the caller. It walks through the conn_hash bucket
1877 * searching for the next valid connp in the list, and selects connections
1878 * that are neither closed nor condemned. It also REFHOLDS the conn
1879 * thus ensuring that the conn exists when the caller uses the conn.
1880 */
1881conn_t *
1882ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
1883{
1884	conn_t	*next_connp;
1885
1886	if (connfp == NULL)
1887		return (NULL);
1888
1889	mutex_enter(&connfp->connf_lock);
1890
1891	next_connp = (connp == NULL) ?
1892	    connfp->connf_head : connp->conn_g_next;
1893
1894	while (next_connp != NULL) {
1895		mutex_enter(&next_connp->conn_lock);
1896		if (!(next_connp->conn_flags & conn_flags) ||
1897		    (next_connp->conn_state_flags &
1898		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
1899			/*
1900			 * This conn has been condemned or
1901			 * is closing, or the flags don't match
1902			 */
1903			mutex_exit(&next_connp->conn_lock);
1904			next_connp = next_connp->conn_g_next;
1905			continue;
1906		}
1907		CONN_INC_REF_LOCKED(next_connp);
1908		mutex_exit(&next_connp->conn_lock);
1909		break;
1910	}
1911
1912	mutex_exit(&connfp->connf_lock);
1913
1914	if (connp != NULL)
1915		CONN_DEC_REF(connp);
1916
1917	return (next_connp);
1918}
1919
1920#ifdef CONN_DEBUG
1921/*
1922 * Trace of the last NBUF refhold/refrele
1923 */
1924int
1925conn_trace_ref(conn_t *connp)
1926{
1927	int	last;
1928	conn_trace_t	*ctb;
1929
1930	ASSERT(MUTEX_HELD(&connp->conn_lock));
1931	last = connp->conn_trace_last;
1932	last++;
1933	if (last == CONN_TRACE_MAX)
1934		last = 0;
1935
1936	ctb = &connp->conn_trace_buf[last];
1937	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
1938	connp->conn_trace_last = last;
1939	return (1);
1940}
1941
1942int
1943conn_untrace_ref(conn_t *connp)
1944{
1945	int	last;
1946	conn_trace_t	*ctb;
1947
1948	ASSERT(MUTEX_HELD(&connp->conn_lock));
1949	last = connp->conn_trace_last;
1950	last++;
1951	if (last == CONN_TRACE_MAX)
1952		last = 0;
1953
1954	ctb = &connp->conn_trace_buf[last];
1955	ctb->ctb_depth = getpcstack(ctb->ctb_stack, IP_STACK_DEPTH);
1956	connp->conn_trace_last = last;
1957	return (1);
1958}
1959#endif
1960