ip_dce.c revision 11042:2d6e217af1b4
1169689Skan/*
2169689Skan * CDDL HEADER START
3169689Skan *
4169689Skan * The contents of this file are subject to the terms of the
5169689Skan * Common Development and Distribution License (the "License").
6169689Skan * You may not use this file except in compliance with the License.
7169689Skan *
8169689Skan * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9169689Skan * or http://www.opensolaris.org/os/licensing.
10169689Skan * See the License for the specific language governing permissions
11169689Skan * and limitations under the License.
12169689Skan *
13169689Skan * When distributing Covered Code, include this CDDL HEADER in each
14169689Skan * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15169689Skan * If applicable, add the following below this CDDL HEADER, with the
16169689Skan * fields enclosed by brackets "[]" replaced with your own identifying
17169689Skan * information: Portions Copyright [yyyy] [name of copyright owner]
18169689Skan *
19169689Skan * CDDL HEADER END
20169689Skan */
21169689Skan
22169689Skan/*
23169689Skan * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24169689Skan * Use is subject to license terms.
25169689Skan */
26169689Skan
27169689Skan#include <sys/types.h>
28169689Skan#include <sys/stream.h>
29169689Skan#include <sys/strsun.h>
30169689Skan#include <sys/zone.h>
31169689Skan#include <sys/ddi.h>
32169689Skan#include <sys/sunddi.h>
33169689Skan#include <sys/cmn_err.h>
34169689Skan#include <sys/debug.h>
35169689Skan#include <sys/atomic.h>
36169689Skan#define	_SUN_TPI_VERSION 2
37169689Skan#include <sys/tihdr.h>
38169689Skan
39169689Skan#include <inet/common.h>
40169689Skan#include <inet/mi.h>
41169689Skan#include <inet/mib2.h>
42169689Skan#include <inet/snmpcom.h>
43169689Skan
44169689Skan#include <netinet/ip6.h>
45169689Skan#include <netinet/icmp6.h>
46169689Skan
47169689Skan#include <inet/ip.h>
48169689Skan#include <inet/ip_impl.h>
49169689Skan#include <inet/ip6.h>
50169689Skan#include <inet/ip6_asp.h>
51169689Skan#include <inet/ip_multi.h>
52169689Skan#include <inet/ip_if.h>
53169689Skan#include <inet/ip_ire.h>
54169689Skan#include <inet/ip_ftable.h>
55169689Skan#include <inet/ip_rts.h>
56169689Skan#include <inet/ip_ndp.h>
57169689Skan#include <inet/ipclassifier.h>
58169689Skan#include <inet/ip_listutils.h>
59169689Skan
60169689Skan#include <sys/sunddi.h>
61169689Skan
62169689Skan/*
63169689Skan * Routines for handling destination cache entries.
64169689Skan * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
65169689Skan * That entry holds both the IP ident value and the dce generation number.
66169689Skan *
67169689Skan * Any time a DCE is changed significantly (different path MTU, but NOT
68169689Skan * different ULP info!), the dce_generation number is increased.
69169689Skan * Also, when a new DCE is created, the dce_generation number in the default
70169689Skan * DCE is bumped. That allows the dce_t information to be cached efficiently
71169689Skan * as long as the entity caching the dce_t also caches the dce_generation,
72169689Skan * and compares the cached generation to detect any changes.
73169689Skan * Furthermore, when a DCE is deleted, if there are any outstanding references
74169689Skan * to the DCE it will be marked as condemned. The condemned mark is
75169689Skan * a designated generation number which is never otherwise used, hence
76169689Skan * the single comparison with the generation number captures that as well.
77169689Skan *
78169689Skan * An example of code which caches is as follows:
79169689Skan *
80169689Skan *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
81169689Skan *		The DCE has changed
82169689Skan *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
83169689Skan *		    &mystruct->my_dce_generation);
84169689Skan *		Not needed in practice, since we have the default DCE:
85169689Skan *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
86169689Skan *			return failure;
87169689Skan *	}
88169689Skan *
89169689Skan * Note that for IPv6 link-local addresses we record the ifindex since the
90169689Skan * link-locals are not globally unique.
91169689Skan */
92169689Skan
/*
 * Hash bucket structure for DCEs.
 * One dcb_t per hash index in the ips_dce_hash_v4/ips_dce_hash_v6 arrays.
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;	/* Protects the bucket's dce list */
	uint32_t	dcb_cnt;	/* Entry count, updated with atomics */
	dce_t		*dcb_dce;	/* Head of doubly-linked list of dce_t */
} dcb_t;
101169689Skan
102169689Skanstatic void	dce_delete_locked(dcb_t *, dce_t *);
103169689Skanstatic void	dce_make_condemned(dce_t *);
104169689Skan
105169689Skanstatic kmem_cache_t *dce_cache;
106169689Skan
107169689Skan
/*
 * Operates on a uint64_t: fold all four 16-bit chunks of the value
 * into its low bits by XOR (order is irrelevant since XOR commutes).
 */
#define	RANDOM_HASH(p) \
	(((p) ^ ((p) >> 32)) ^ (((p) >> 16) ^ ((p) >> 48)))
110169689Skan
/*
 * Reclaim a fraction of dce's in the dcb.
 * For now we have a higher probability to delete DCEs without DCE_PMTU:
 * a non-PMTU entry is deleted with probability 1/fraction, a PMTU entry
 * with probability 1/(4*fraction).
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction*4;
	uint_t	hash;
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		/* Capture the successor first; we may unlink dce below */
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}
		/*
		 * Use the entry's own address as a cheap random selector.
		 * DCEF_PMTU is re-read here without dce_lock; worst case we
		 * judge the entry by a slightly stale flag.
		 */
		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
		if (dce->dce_flags & DCEF_PMTU) {
			if (hash % fraction_pmtu != 0)
				continue;
		} else {
			if (hash % fraction != 0)
				continue;
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		/* Drop the hash list's reference taken at insert time */
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}
151169689Skan
152169689Skan/*
153169689Skan * kmem_cache callback to free up memory.
154169689Skan *
155169689Skan */
156169689Skanstatic void
157169689Skanip_dce_reclaim_stack(ip_stack_t *ipst)
158169689Skan{
159169689Skan	int	i;
160169689Skan
161169689Skan	IP_STAT(ipst, ip_dce_reclaim_calls);
162169689Skan	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
163169689Skan		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
164169689Skan		    ipst->ips_ip_dce_reclaim_fraction);
165169689Skan
166169689Skan		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
167169689Skan		    ipst->ips_ip_dce_reclaim_fraction);
168169689Skan	}
169169689Skan
170169689Skan	/*
171169689Skan	 * Walk all CONNs that can have a reference on an ire, nce or dce.
172169689Skan	 * Get them to update any stale references to drop any refholds they
173169689Skan	 * have.
174169689Skan	 */
175169689Skan	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
176169689Skan}
177169689Skan
178169689Skan/*
179169689Skan * Called by the memory allocator subsystem directly, when the system
180169689Skan * is running low on memory.
181169689Skan */
182169689Skan/* ARGSUSED */
183169689Skanvoid
184169689Skanip_dce_reclaim(void *args)
185169689Skan{
186169689Skan	netstack_handle_t nh;
187169689Skan	netstack_t *ns;
188169689Skan
189169689Skan	netstack_next_init(&nh);
190169689Skan	while ((ns = netstack_next(&nh)) != NULL) {
191169689Skan		ip_dce_reclaim_stack(ns->netstack_ip);
192169689Skan		netstack_rele(ns);
193169689Skan	}
194169689Skan	netstack_next_fini(&nh);
195169689Skan}
196169689Skan
/* Global (cross-netstack) initialization: create the dce kmem cache */
void
dce_g_init(void)
{
	/*
	 * ip_dce_reclaim is installed as the cache's reclaim callback,
	 * invoked when the system runs low on memory; no ctor/dtor.
	 */
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
}
203169689Skan
204169689Skanvoid
205169689Skandce_g_destroy(void)
206169689Skan{
207169689Skan	kmem_cache_destroy(dce_cache);
208}
209
210
/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	/*
	 * The default DCE holds the stack-wide IP ident and generation
	 * (see the block comment at the top of this file); it lives for
	 * the whole lifetime of the stack.
	 */
	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time = TICK_TO_SEC(lbolt64);
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = 256;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	/* Buckets are zeroed above; only the locks need explicit init */
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}
240
241void
242dce_stack_destroy(ip_stack_t *ipst)
243{
244	int i;
245	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
246		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
247		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
248	}
249	kmem_free(ipst->ips_dce_hash_v4,
250	    ipst->ips_dce_hashsize * sizeof (dcb_t));
251	ipst->ips_dce_hash_v4 = NULL;
252	kmem_free(ipst->ips_dce_hash_v6,
253	    ipst->ips_dce_hashsize * sizeof (dcb_t));
254	ipst->ips_dce_hash_v6 = NULL;
255	ipst->ips_dce_hashsize = 0;
256
257	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
258	kmem_cache_free(dce_cache, ipst->ips_dce_default);
259	ipst->ips_dce_default = NULL;
260}
261
262/* When any DCE is good enough */
263dce_t *
264dce_get_default(ip_stack_t *ipst)
265{
266	dce_t		*dce;
267
268	dce = ipst->ips_dce_default;
269	dce_refhold(dce);
270	return (dce);
271}
272
/*
 * Generic for IPv4 and IPv6.
 *
 * Look up the DCE for the packet's final destination.
 * Used by callers that need to cache e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		/*
		 * Link-locals are not globally unique, so qualify the
		 * lookup with the outgoing interface's ifindex (taken
		 * from the cached nce when available).
		 */
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}
311
/*
 * Look up the DCE for an IPv4 destination.
 * Used by callers that need to cache e.g., the datapath.
 * Returns the generation number in the last argument.
 * Always returns a refheld dce; falls back to the default DCE when no
 * per-destination entry exists.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			/* dce_lock serializes the condemned check + refhold */
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
350
/*
 * Look up the DCE for an IPv6 destination.
 * Used by callers that need to cache e.g., the datapath.
 * Returns the generation number in the last argument.
 * ifindex should only be set for link-locals, which are not globally
 * unique. Always returns a refheld dce; falls back to the default DCE
 * when no per-destination entry exists.
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			/* dce_lock serializes the condemned check + refhold */
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
392
393/*
394 * Atomically looks for a non-default DCE, and if not found tries to create one.
395 * If there is no memory it returns NULL.
396 * When an entry is created we increase the generation number on
397 * the default DCE so that conn_ip_output will detect there is a new DCE.
398 */
399dce_t *
400dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
401{
402	uint_t		hash;
403	dcb_t		*dcb;
404	dce_t		*dce;
405
406	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
407	dcb = &ipst->ips_dce_hash_v4[hash];
408	rw_enter(&dcb->dcb_lock, RW_WRITER);
409	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
410		if (dce->dce_v4addr == dst) {
411			mutex_enter(&dce->dce_lock);
412			if (!DCE_IS_CONDEMNED(dce)) {
413				dce_refhold(dce);
414				mutex_exit(&dce->dce_lock);
415				rw_exit(&dcb->dcb_lock);
416				return (dce);
417			}
418			mutex_exit(&dce->dce_lock);
419		}
420	}
421	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
422	if (dce == NULL) {
423		rw_exit(&dcb->dcb_lock);
424		return (NULL);
425	}
426	bzero(dce, sizeof (dce_t));
427	dce->dce_ipst = ipst;	/* No netstack_hold */
428	dce->dce_v4addr = dst;
429	dce->dce_generation = DCE_GENERATION_INITIAL;
430	dce->dce_ipversion = IPV4_VERSION;
431	dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
432	dce_refhold(dce);	/* For the hash list */
433
434	/* Link into list */
435	if (dcb->dcb_dce != NULL)
436		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
437	dce->dce_next = dcb->dcb_dce;
438	dce->dce_ptpn = &dcb->dcb_dce;
439	dcb->dcb_dce = dce;
440	dce->dce_bucket = dcb;
441	dce_refhold(dce);	/* For the caller */
442	rw_exit(&dcb->dcb_lock);
443
444	/* Initialize dce_ident to be different than for the last packet */
445	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
446
447	dce_increment_generation(ipst->ips_dce_default);
448	return (dce);
449}
450
/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * ifindex should only be used with link-local addresses.
 * Returns a refheld dce on success.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	/* Held as writer so the lookup+insert is atomic w.r.t. other adders */
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
515
/*
 * Set/update uinfo on an existing dce. (Callers such as
 * dce_update_uinfo_v4/v6 create the per-destination dce if none exists.)
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If there is a cached value, average the new measurement
		 * in; otherwise initialize conservatively to
		 * (1.5 * new value).
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		/* Same scheme for the rtt standard deviation */
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		/* Only ever shrink an existing path MTU; cap at IP_MAXPACKET */
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}
577
578
579int
580dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
581{
582	dce_t *dce;
583
584	dce = dce_lookup_and_add_v4(dst, ipst);
585	if (dce == NULL)
586		return (ENOMEM);
587
588	dce_setuinfo(dce, uinfo);
589	dce_refrele(dce);
590	return (0);
591}
592
593int
594dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
595    ip_stack_t *ipst)
596{
597	dce_t *dce;
598
599	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
600	if (dce == NULL)
601		return (ENOMEM);
602
603	dce_setuinfo(dce, uinfo);
604	dce_refrele(dce);
605	return (0);
606}
607
608/* Common routine for IPv4 and IPv6 */
609int
610dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
611    ip_stack_t *ipst)
612{
613	ipaddr_t dst4;
614
615	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
616		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
617		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
618	} else {
619		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
620	}
621}
622
/*
 * Mark a dce condemned by setting its generation to the reserved
 * DCE_GENERATION_CONDEMNED value; entities caching the old generation
 * will see the entry as changed on their next comparison.
 */
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}
635
/*
 * Increment the generation avoiding the special condemned value.
 * A condemned dce's generation is left untouched.
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		/* Skip over the reserved condemned sentinel on wrap */
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}
654
655/*
656 * Increment the generation number on all dces that have a path MTU and
657 * the default DCE. Used when ill_mtu changes.
658 */
659void
660dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
661{
662	int		i;
663	dcb_t		*dcb;
664	dce_t		*dce;
665
666	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
667		if (isv6)
668			dcb = &ipst->ips_dce_hash_v6[i];
669		else
670			dcb = &ipst->ips_dce_hash_v4[i];
671		rw_enter(&dcb->dcb_lock, RW_WRITER);
672		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
673			if (DCE_IS_CONDEMNED(dce))
674				continue;
675			dce_increment_generation(dce);
676		}
677		rw_exit(&dcb->dcb_lock);
678	}
679	dce_increment_generation(ipst->ips_dce_default);
680}
681
/*
 * Unlink a dce from its hash bucket and mark it condemned.
 * The caller must hold dcb_lock as writer.
 * Caller needs to do a dce_refrele (dropping the hash list's reference)
 * since we can't do the dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	/* Splice out of the doubly-linked bucket list */
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}
698
/*
 * Free a dce whose last reference has been dropped. Called only from
 * dce_refrele/dce_refrele_notr once dce_refcnt reaches zero; the entry
 * must already be unlinked from its bucket.
 */
static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_add_32(&ipst->ips_num_dce_condemned, -1);

	kmem_cache_free(dce_cache, dce);
}
714
/* Drop a reference; frees the dce when the last reference goes away */
void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}
722
/* Acquire a reference on the dce */
void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}
729
/*
 * No tracing support yet hence the same as the above functions.
 * The _notr variants exist so callers that bypass reference tracing
 * keep a distinct entry point if tracing is added later.
 */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}
738
/* Untraced counterpart of dce_refhold() */
void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}
745
/*
 * Report both the IPv4 and IPv6 DCEs.
 * Replies with the IPv4 table on mpctl via qreply(), then reuses a copy
 * of the control message for the IPv6 table. Returns the next copy for
 * the caller's reply chain (NULL if copymsg failed).
 */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(lbolt64);

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			/* Only report a path MTU when one has been learned */
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			/* The ifindex is only meaningful for link-locals */
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}
847
848/*
849 * Remove IPv6 DCEs which refer to an ifindex that is going away.
850 * This is not required for correctness, but it avoids netstat -d
851 * showing stale stuff that will never be used.
852 */
853void
854dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
855{
856	uint_t	i;
857	dcb_t	*dcb;
858	dce_t	*dce, *nextdce;
859
860	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
861		dcb = &ipst->ips_dce_hash_v6[i];
862		rw_enter(&dcb->dcb_lock, RW_WRITER);
863
864		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
865			nextdce = dce->dce_next;
866			if (dce->dce_ifindex == ifindex) {
867				dce_delete_locked(dcb, dce);
868				dce_refrele(dce);
869			}
870		}
871		rw_exit(&dcb->dcb_lock);
872	}
873}
874