ipmp.c revision 11042:2d6e217af1b4
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 *
21 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
22 * Use is subject to license terms.
23 */
24
25#include <inet/ip.h>
26#include <inet/ip6.h>
27#include <inet/ip_if.h>
28#include <inet/ip_ire.h>
29#include <inet/ip_multi.h>
30#include <inet/ip_ndp.h>
31#include <inet/ip_rts.h>
32#include <inet/mi.h>
33#include <net/if_types.h>
34#include <sys/dlpi.h>
35#include <sys/kmem.h>
36#include <sys/modhash.h>
37#include <sys/sdt.h>
38#include <sys/strsun.h>
39#include <sys/sunddi.h>
40#include <sys/types.h>
41
42/*
43 * Convenience macros for getting the ip_stack_t associated with an
44 * ipmp_illgrp_t or ipmp_grp_t.
45 */
46#define	IPMP_GRP_TO_IPST(grp)		PHYINT_TO_IPST((grp)->gr_phyint)
47#define	IPMP_ILLGRP_TO_IPST(illg)	((illg)->ig_ipmp_ill->ill_ipst)
48
49/*
50 * Assorted constants that aren't important enough to be tunable.
51 */
52#define	IPMP_GRP_HASH_SIZE		64
53#define	IPMP_ILL_REFRESH_TIMEOUT	120	/* seconds */
54
55
56/*
57 * IPMP meta-interface kstats (based on those in PSARC/1997/198).
58 */
59static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
60	{ "obytes",	KSTAT_DATA_UINT32 },
61	{ "obytes64",	KSTAT_DATA_UINT64 },
62	{ "rbytes",	KSTAT_DATA_UINT32 },
63	{ "rbytes64",	KSTAT_DATA_UINT64 },
64	{ "opackets",	KSTAT_DATA_UINT32 },
65	{ "opackets64",	KSTAT_DATA_UINT64 },
66	{ "oerrors",	KSTAT_DATA_UINT32 },
67	{ "ipackets",	KSTAT_DATA_UINT32 },
68	{ "ipackets64",	KSTAT_DATA_UINT64 },
69	{ "ierrors",	KSTAT_DATA_UINT32 },
70	{ "multircv",	KSTAT_DATA_UINT32 },
71	{ "multixmt",	KSTAT_DATA_UINT32 },
72	{ "brdcstrcv",	KSTAT_DATA_UINT32 },
73	{ "brdcstxmt",	KSTAT_DATA_UINT32 },
74	{ "link_up",	KSTAT_DATA_UINT32 }
75};
76
77static void	ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
78static int	ipmp_grp_create_kstats(ipmp_grp_t *);
79static int	ipmp_grp_update_kstats(kstat_t *, int);
80static void	ipmp_grp_destroy_kstats(ipmp_grp_t *);
81static ill_t	*ipmp_illgrp_min_ill(ipmp_illgrp_t *);
82static ill_t	*ipmp_illgrp_max_ill(ipmp_illgrp_t *);
83static void	ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
84static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
85static boolean_t ipmp_ill_activate(ill_t *);
86static void	ipmp_ill_deactivate(ill_t *);
87static void	ipmp_ill_ire_mark_testhidden(ire_t *, char *);
88static void	ipmp_ill_ire_clear_testhidden(ire_t *, char *);
89static void	ipmp_ill_refresh_active_timer_start(ill_t *);
90static void	ipmp_ill_rtsaddrmsg(ill_t *, int);
91static void	ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
92static ipif_t	*ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
93static void	ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
94static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
95
96/*
97 * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
98 */
99void
100ipmp_init(ip_stack_t *ipst)
101{
102	ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
103	    IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
104	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
105	rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
106}
107
108/*
109 * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
110 */
111void
112ipmp_destroy(ip_stack_t *ipst)
113{
114	mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
115	rw_destroy(&ipst->ips_ipmp_lock);
116}
117
/*
 * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
 * and add it to the hash.  On success, return a pointer to the created group.
 * Caller must ensure `grname' is not yet in the hash.  Assumes that the IPMP
 * meta-interface associated with the group also has the same name (but they
 * may differ later via ipmp_grp_rename()).
 */
ipmp_grp_t *
ipmp_grp_create(const char *grname, phyint_t *phyi)
{
	ipmp_grp_t *grp;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
	mod_hash_hndl_t mh;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	/*
	 * KM_NOSLEEP throughout: we cannot block while ips_ipmp_lock is
	 * write-held, so each allocation failure is reported as NULL.
	 */
	if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
		return (NULL);

	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
	(void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));

	/*
	 * Cache the group's phyint.  This is safe since a phyint_t will
	 * outlive its ipmp_grp_t.
	 */
	grp->gr_phyint = phyi;

	/*
	 * Create IPMP group kstats.
	 */
	if (ipmp_grp_create_kstats(grp) != 0) {
		kmem_free(grp, sizeof (ipmp_grp_t));
		return (NULL);
	}

	/*
	 * Insert the group into the hash.  The hash entry is reserved up
	 * front so that the subsequent ipmp_grp_insert() cannot fail; on
	 * reservation failure, unwind the kstats and the allocation.
	 */
	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
		ipmp_grp_destroy_kstats(grp);
		kmem_free(grp, sizeof (ipmp_grp_t));
		return (NULL);
	}
	ipmp_grp_insert(grp, mh);

	return (grp);
}
166
/*
 * Create IPMP kstat structures for `grp'.  Return an errno upon failure.
 * The kstat is created in `grp's netstack so it is visible in the right
 * (possibly non-global) zone.
 */
static int
ipmp_grp_create_kstats(ipmp_grp_t *grp)
{
	kstat_t *ksp;
	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;

	ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
	    KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
	if (ksp == NULL)
		return (ENOMEM);

	ksp->ks_update = ipmp_grp_update_kstats;
	ksp->ks_private = grp;
	/* Seed the named kstats (names and types) from the static template. */
	bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));

	kstat_install(ksp);
	grp->gr_ksp = ksp;
	return (0);
}
189
/*
 * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
 * Reports group totals: baseline counters (gr_kstats0) plus the per-phyint
 * deltas accumulated since each phyint joined the group.  Read-only; writes
 * are rejected with EACCES.
 */
static int
ipmp_grp_update_kstats(kstat_t *ksp, int rw)
{
	uint_t		i;
	kstat_named_t	*kn = KSTAT_NAMED_PTR(ksp);
	ipmp_grp_t	*grp = ksp->ks_private;
	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
	ipsq_t		*ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
	phyint_t	*phyi;
	uint64_t	phyi_kstats[IPMP_KSTAT_MAX];

	if (rw == KSTAT_WRITE)
		return (EACCES);

	/*
	 * Start with the group's baseline values.
	 */
	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
		if (kn[i].data_type == KSTAT_DATA_UINT32) {
			kn[i].value.ui32 = grp->gr_kstats0[i];
		} else {
			ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
			kn[i].value.ui64 = grp->gr_kstats0[i];
		}
	}

	/*
	 * Add in the stats of each phyint currently in the group.  Since we
	 * don't directly track the phyints in a group, we cheat by walking
	 * the IPSQ set under ill_g_lock.  (The IPSQ list cannot change while
	 * ill_g_lock is held.)
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ipsq = grp_ipsq->ipsq_next;
	for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
		phyi = ipsq->ipsq_phyint;

		/*
		 * If a phyint in a group is being unplumbed, it's possible
		 * that ill_glist_delete() -> phyint_free() already freed the
		 * phyint (and set ipsq_phyint to NULL), but the unplumb
		 * operation has yet to complete (and thus ipsq_dq() has yet
		 * to remove the phyint's IPSQ from the group IPSQ's phyint
		 * list).  We skip those phyints here (note that their kstats
		 * have already been added to gr_kstats0[]).
		 */
		if (phyi == NULL)
			continue;

		ipmp_phyint_get_kstats(phyi, phyi_kstats);

		/*
		 * Subtract each phyint's snapshot taken at group-join time
		 * (phyint_kstats0) so only traffic since joining counts.
		 */
		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
			phyi_kstats[i] -= phyi->phyint_kstats0[i];
			if (kn[i].data_type == KSTAT_DATA_UINT32)
				kn[i].value.ui32 += phyi_kstats[i];
			else
				kn[i].value.ui64 += phyi_kstats[i];
		}
	}

	/* link_up mirrors whether the IPMP meta-interface is PHYI_RUNNING */
	kn[IPMP_KSTAT_LINK_UP].value.ui32 =
	    (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;

	rw_exit(&ipst->ips_ill_g_lock);
	return (0);
}
259
260/*
261 * Destroy IPMP kstat structures for `grp'.
262 */
263static void
264ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
265{
266	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
267
268	kstat_delete_netstack(grp->gr_ksp, id);
269	bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
270	grp->gr_ksp = NULL;
271}
272
273/*
274 * Look up an IPMP group named `grname' on IP stack `ipst'.  Return NULL if it
275 * does not exist.
276 */
277ipmp_grp_t *
278ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
279{
280	ipmp_grp_t *grp;
281
282	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
283
284	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
285	    (mod_hash_val_t *)&grp) == 0)
286		return (grp);
287
288	return (NULL);
289}
290
/*
 * Place information about group `grp' into `lifgr' (for SIOCGLIFGROUPINFO).
 * Counts include interfaces still in the process of joining (gr_pend*).
 */
void
ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
{
	ill_t *ill;
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

	lifgr->gi_v4 = (grp->gr_v4 != NULL);
	lifgr->gi_v6 = (grp->gr_v6 != NULL);
	lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
	lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
	/* an empty group has no underlying mactype; report SUNW_DL_IPMP */
	lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
	(void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
	lifgr->gi_m4ifname[0] = '\0';
	lifgr->gi_m6ifname[0] = '\0';
	lifgr->gi_bcifname[0] = '\0';

	/*
	 * Report the nominated multicast/broadcast interfaces, if any.  For
	 * IPv4 the same ill handles both multicast and broadcast.
	 */
	if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
		(void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
		(void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
	}

	if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
		(void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
}
320
321/*
322 * Insert `grp' into the hash using the reserved hash entry `mh'.
323 * Caller must ensure `grp' is not yet in the hash.
324 */
325static void
326ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
327{
328	int err;
329	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
330
331	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
332
333	/*
334	 * Since grp->gr_name will exist at least as long as `grp' is in the
335	 * hash, we use it directly as the key.
336	 */
337	err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
338	    (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
339	if (err != 0) {
340		/*
341		 * This should never happen since `mh' was preallocated.
342		 */
343		panic("cannot insert IPMP group \"%s\" (err %d)",
344		    grp->gr_name, err);
345	}
346}
347
348/*
349 * Remove `grp' from the hash.  Caller must ensure `grp' is in it.
350 */
351static void
352ipmp_grp_remove(ipmp_grp_t *grp)
353{
354	int err;
355	mod_hash_val_t val;
356	mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
357	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
358
359	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
360
361	err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
362	if (err != 0 || val != grp) {
363		panic("cannot remove IPMP group \"%s\" (err %d)",
364		    grp->gr_name, err);
365	}
366}
367
/*
 * Attempt to rename `grp' to new name `grname'.  Return an errno if the new
 * group name already exists or is invalid, or if there isn't enough memory.
 */
int
ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
{
	mod_hash_hndl_t mh;
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	if (grname[0] == '\0')
		return (EINVAL);

	/*
	 * NOTE: on a hit this clobbers our local `grp' pointer (it's reused
	 * as the lookup out-parameter), but that's harmless since we
	 * immediately return EEXIST; on a miss, mod_hash_find() leaves it
	 * untouched.
	 */
	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
	    (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
		return (EEXIST);

	/*
	 * Before we remove the group from the hash, ensure we'll be able to
	 * re-insert it by reserving space.
	 */
	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
		return (ENOMEM);

	ipmp_grp_remove(grp);
	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
	ipmp_grp_insert(grp, mh);

	return (0);
}
400
/*
 * Destroy `grp' and remove it from the hash.  Caller must ensure `grp' is in
 * the hash, and that there are no interfaces on it.
 */
void
ipmp_grp_destroy(ipmp_grp_t *grp)
{
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	/*
	 * If there are still interfaces using this group, panic before things
	 * go really off the rails.
	 */
	if (grp->gr_nif != 0)
		panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);

	ipmp_grp_remove(grp);
	ipmp_grp_destroy_kstats(grp);

	/*
	 * With gr_nif == 0, all per-family state must already be gone;
	 * these asserts catch teardown-ordering bugs in debug kernels.
	 */
	ASSERT(grp->gr_v4 == NULL);
	ASSERT(grp->gr_v6 == NULL);
	ASSERT(grp->gr_nv4 == 0);
	ASSERT(grp->gr_nv6 == 0);
	ASSERT(grp->gr_nactif == 0);
	ASSERT(grp->gr_linkdownmp == NULL);
	grp->gr_phyint = NULL;

	kmem_free(grp, sizeof (ipmp_grp_t));
}
432
/*
 * Check whether `ill' is suitable for inclusion into `grp', and return an
 * errno describing the problem (if any).  NOTE: many of these errno values
 * are interpreted by ifconfig, which will take corrective action and retry
 * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
 */
static int
ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
{
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

	/*
	 * To sidestep complicated address migration logic in the kernel and
	 * to force the kernel's all-hosts multicast memberships to be blown
	 * away, all addresses that had been brought up must be brought back
	 * down prior to adding an interface to a group.  (This includes
	 * addresses currently down due to DAD.)  Once the interface has been
	 * added to the group, its addresses can then be brought back up, at
	 * which point they will be moved to the IPMP meta-interface.
	 * NOTE: we do this before ill_appaddr_cnt() since bringing down the
	 * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
	 */
	if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
		return (EADDRINUSE);

	/*
	 * To avoid confusing applications by changing addresses that are
	 * under their control, all such control must be removed prior to
	 * adding an interface into a group.
	 */
	if (ill_appaddr_cnt(ill) != 0)
		return (EADDRNOTAVAIL);

	/*
	 * Since PTP addresses do not share the same broadcast domain, they
	 * are not allowed to be in an IPMP group.
	 */
	if (ill_ptpaddr_cnt(ill) != 0)
		return (EINVAL);

	/*
	 * An ill must support multicast to be allowed into a group.
	 */
	if (!(ill->ill_flags & ILLF_MULTICAST))
		return (ENOTSUP);

	/*
	 * An ill must strictly be using ARP and/or ND for address
	 * resolution for it to be allowed into a group.
	 */
	if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP))
		return (ENOTSUP);

	/*
	 * An ill cannot also be using usesrc groups.  (Although usesrc uses
	 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
	 * all its modifications as writer.)
	 */
	if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
		return (ENOTSUP);

	/*
	 * All ills in a group must be the same mactype.
	 */
	if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
		return (EINVAL);

	return (0);
}
505
/*
 * Check whether `phyi' is suitable for inclusion into `grp', and return an
 * errno describing the problem (if any).  See comment above ipmp_grp_vet_ill()
 * regarding errno values.
 */
int
ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
{
	int err = 0;
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

	/*
	 * An interface cannot have address families plumbed that are not
	 * configured in the group.  (Relies on && binding tighter than ||:
	 * each (plumbed && not-in-group) pair is tested separately.)
	 */
	if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
	    phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
		return (EAFNOSUPPORT);

	/* Vet each plumbed ill; stop at the first failure. */
	if (phyi->phyint_illv4 != NULL)
		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
	if (err == 0 && phyi->phyint_illv6 != NULL)
		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);

	return (err);
}
535
/*
 * Create a new illgrp on IPMP meta-interface `ill'.  Returns the new illgrp,
 * or NULL on allocation failure (KM_NOSLEEP since we're inside the IPSQ).
 */
ipmp_illgrp_t *
ipmp_illgrp_create(ill_t *ill)
{
	/* start at the family's minimum MTU; refreshed as ills join */
	uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
	ipmp_illgrp_t *illg;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_IPMP(ill));
	ASSERT(ill->ill_grp == NULL);

	if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
		return (NULL);

	/* all members / active members / tracked ARP entries, respectively */
	list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
	list_create(&illg->ig_actif, sizeof (ill_t),
	    offsetof(ill_t, ill_actnode));
	list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
	    offsetof(ipmp_arpent_t, ia_node));

	illg->ig_ipmp_ill = ill;
	ill->ill_grp = illg;
	ipmp_illgrp_set_mtu(illg, mtu);

	return (illg);
}
564
/*
 * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
 * Caller must have already emptied the group of members, ARP entries, and
 * nominations.
 */
void
ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
{
	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
	ASSERT(IS_IPMP(illg->ig_ipmp_ill));

	/*
	 * Verify `illg' is empty.
	 */
	ASSERT(illg->ig_next_ill == NULL);
	ASSERT(illg->ig_cast_ill == NULL);
	ASSERT(list_is_empty(&illg->ig_arpent));
	ASSERT(list_is_empty(&illg->ig_if));
	ASSERT(list_is_empty(&illg->ig_actif));
	ASSERT(illg->ig_nactif == 0);

	/*
	 * Destroy `illg'.
	 */
	illg->ig_ipmp_ill->ill_grp = NULL;
	illg->ig_ipmp_ill = NULL;
	list_destroy(&illg->ig_if);
	list_destroy(&illg->ig_actif);
	list_destroy(&illg->ig_arpent);
	kmem_free(illg, sizeof (ipmp_illgrp_t));
}
594
/*
 * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
 * bind it to an underlying ill, while keeping an even address distribution.
 * If the bind is successful, return a pointer to the bound ill.
 */
ill_t *
ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
{
	ill_t *minill;
	ipmp_arpent_t *entp;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(ipmp_ipif_is_dataaddr(ipif));

	/*
	 * IPMP data address mappings are internally managed by IP itself, so
	 * delete any existing ARP entries associated with the address.
	 * (IPv4-only: the tracked ARP entries are keyed by ipaddr_t.)
	 */
	if (!ipif->ipif_isv6) {
		entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
		if (entp != NULL)
			ipmp_illgrp_destroy_arpent(illg, entp);
	}

	/* bind to the least-loaded active ill to keep addresses balanced */
	if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
		ipmp_ill_bind_ipif(minill, ipif, Res_act_none);

	return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
}
624
/*
 * Delete `ipif' from the pool of usable data addresses on `illg'.  If it's
 * bound, unbind it from the underlying ill while keeping an even address
 * distribution.
 */
void
ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
{
	ill_t *maxill, *boundill = ipif->ipif_bound_ill;

	ASSERT(IAM_WRITER_IPIF(ipif));

	if (boundill != NULL) {
		(void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);

		/*
		 * Rebalance: if some other active ill now has at least two
		 * more bound addresses than `boundill', migrate one of its
		 * addresses over.  (Since `ipif' was bound, there is at
		 * least one active ill, so maxill is non-NULL.)
		 */
		maxill = ipmp_illgrp_max_ill(illg);
		if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
			ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
		}
	}
}
647
648/*
649 * Return the active ill with the greatest number of data addresses in `illg'.
650 */
651static ill_t *
652ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
653{
654	ill_t *ill, *bestill = NULL;
655
656	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
657
658	ill = list_head(&illg->ig_actif);
659	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
660		if (bestill == NULL ||
661		    ill->ill_bound_cnt > bestill->ill_bound_cnt) {
662			bestill = ill;
663		}
664	}
665	return (bestill);
666}
667
668/*
669 * Return the active ill with the fewest number of data addresses in `illg'.
670 */
671static ill_t *
672ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
673{
674	ill_t *ill, *bestill = NULL;
675
676	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
677
678	ill = list_head(&illg->ig_actif);
679	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
680		if (bestill == NULL ||
681		    ill->ill_bound_cnt < bestill->ill_bound_cnt) {
682			if (ill->ill_bound_cnt == 0)
683				return (ill);	 /* can't get better */
684			bestill = ill;
685		}
686	}
687	return (bestill);
688}
689
690/*
691 * Return a pointer to IPMP meta-interface for `illg' (which must exist).
692 * Since ig_ipmp_ill never changes for a given illg, no locks are needed.
693 */
694ill_t *
695ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
696{
697	return (illg->ig_ipmp_ill);
698}
699
/*
 * Return a pointer to the next available underlying ill in `illg', or NULL if
 * one doesn't exist.  Caller must be inside the IPSQ.  Implements round-robin
 * selection over the active list via ig_next_ill.
 */
ill_t *
ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
{
	ill_t *ill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

	/* writer lock: we advance the shared round-robin cursor */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if ((ill = illg->ig_next_ill) != NULL) {
		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
		if (illg->ig_next_ill == NULL)
			illg->ig_next_ill = list_head(&illg->ig_actif);
	}
	rw_exit(&ipst->ips_ipmp_lock);

	return (ill);
}
722
/*
 * Return a held pointer to the next available underlying ill in `illg', or
 * NULL if one doesn't exist.  Caller need not be inside the IPSQ.  Tries at
 * most one full round-robin pass (ig_nactif candidates), skipping ills that
 * cannot be refheld (e.g., condemned ones).
 */
ill_t *
ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
{
	ill_t *ill;
	uint_t i;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	for (i = 0; i < illg->ig_nactif; i++) {
		ill = illg->ig_next_ill;
		/* advance the cursor, wrapping at the end of the list */
		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
		if (illg->ig_next_ill == NULL)
			illg->ig_next_ill = list_head(&illg->ig_actif);

		if (ill_check_and_refhold(ill)) {
			rw_exit(&ipst->ips_ipmp_lock);
			return (ill);
		}
	}
	rw_exit(&ipst->ips_ipmp_lock);

	return (NULL);
}
750
/*
 * Return a held pointer to the nominated multicast ill in `illg', or NULL if
 * one doesn't exist (or can no longer be refheld).  Caller need not be inside
 * the IPSQ; ips_ipmp_lock (reader) stabilizes ig_cast_ill for the lookup.
 */
ill_t *
ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
{
	ill_t *castill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	castill = illg->ig_cast_ill;
	if (castill != NULL && ill_check_and_refhold(castill)) {
		rw_exit(&ipst->ips_ipmp_lock);
		return (castill);
	}
	rw_exit(&ipst->ips_ipmp_lock);
	return (NULL);
}
770
771/*
772 * Callback routine for ncec_walk() that deletes `nce' if it is associated with
773 * the `(ill_t *)arg' and it is not one of the local addresses.  Caller must be
774 * inside the IPSQ.
775 */
776static void
777ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *arg)
778{
779	if ((ncec != NULL) && !NCE_MYADDR(ncec) &&
780	    ncec->ncec_ill == (ill_t *)arg) {
781		ncec_delete(ncec);
782	}
783}
784
/*
 * Set the nominated cast ill on `illg' to `castill'.  If `castill' is NULL,
 * any existing nomination is removed.  Caller must be inside the IPSQ.
 */
static void
ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
{
	ill_t *ocastill = illg->ig_cast_ill;
	ill_t *ipmp_ill = illg->ig_ipmp_ill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ipmp_ill));

	/*
	 * Disable old nominated ill (if any).
	 */
	if (ocastill != NULL) {
		DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
		    illg, ill_t *, ocastill);
		ASSERT(ocastill->ill_nom_cast);
		ocastill->ill_nom_cast = B_FALSE;
		/*
		 * If the IPMP meta-interface is down, we never did the join,
		 * so we must not try to leave.
		 */
		if (ipmp_ill->ill_dl_up)
			ill_leave_multicast(ipmp_ill);

		/*
		 * Delete any NCEs tied to the old nomination.  We must do this
		 * last since ill_leave_multicast() may trigger IREs to be
		 * built using ig_cast_ill.
		 */
		ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill,
		    ocastill->ill_ipst);
	}

	/*
	 * Set new nomination.  ips_ipmp_lock (writer) protects against
	 * concurrent readers not inside the IPSQ, such as
	 * ipmp_illgrp_hold_cast_ill().
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	illg->ig_cast_ill = castill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Enable new nominated ill (if any).
	 */
	if (castill != NULL) {
		DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
		    illg, ill_t *, castill);
		ASSERT(!castill->ill_nom_cast);
		castill->ill_nom_cast = B_TRUE;
		/*
		 * If the IPMP meta-interface is down, the attempt to recover
		 * will silently fail but ill_need_recover_multicast will be
		 * erroneously cleared -- so check first.
		 */
		if (ipmp_ill->ill_dl_up)
			ill_recover_multicast(ipmp_ill);
	}
}
846
847/*
848 * Create an IPMP ARP entry and add it to the set tracked on `illg'.  If an
849 * entry for the same IP address already exists, destroy it first.  Return the
850 * created IPMP ARP entry, or NULL on failure.
851 */
852ipmp_arpent_t *
853ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp,
854    ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags)
855{
856	ipmp_arpent_t *entp, *oentp;
857
858	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
859
860	if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len,
861	    KM_NOSLEEP)) == NULL)
862		return (NULL);
863
864	/*
865	 * Delete any existing ARP entry for this address.
866	 */
867	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
868		ipmp_illgrp_destroy_arpent(illg, oentp);
869
870	/*
871	 * Prepend the new entry.
872	 */
873	entp->ia_ipaddr = ipaddr;
874	entp->ia_flags = flags;
875	entp->ia_lladdr_len = lladdr_len;
876	entp->ia_lladdr = (uchar_t *)&entp[1];
877	bcopy(lladdr, entp->ia_lladdr, lladdr_len);
878	entp->ia_proxyarp = proxyarp;
879	entp->ia_notified = B_TRUE;
880	list_insert_head(&illg->ig_arpent, entp);
881	return (entp);
882}
883
884/*
885 * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
886 */
887void
888ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
889{
890	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
891
892	list_remove(&illg->ig_arpent, entp);
893	kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len);
894}
895
896/*
897 * Mark that ARP has been notified about the IP address on `entp'; `illg' is
898 * taken as a debugging aid for DTrace FBT probes.
899 */
900/* ARGSUSED */
901void
902ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
903{
904	entp->ia_notified = B_TRUE;
905}
906
907/*
908 * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
909 * NULL, any IPMP ARP entry is requested.  Return NULL if it does not exist.
910 */
911ipmp_arpent_t *
912ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
913{
914	ipmp_arpent_t *entp = list_head(&illg->ig_arpent);
915
916	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
917
918	if (addrp == NULL)
919		return (entp);
920
921	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
922		if (entp->ia_ipaddr == *addrp)
923			break;
924	return (entp);
925}
926
/*
 * Refresh ARP entries on `illg' to be distributed across its active
 * interfaces.  Entries that cannot be refreshed (e.g., because there are no
 * active interfaces) are marked so that subsequent calls can try again.
 * IPv4-only (IPv6 uses ND, not these tracked ARP entries).
 */
void
ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
{
	ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
	uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
	ipmp_arpent_t *entp;
	ncec_t *ncec;
	nce_t  *nce;

	ASSERT(IAM_WRITER_ILL(ipmp_ill));
	ASSERT(!ipmp_ill->ill_isv6);

	/*
	 * Walk the ARP entries in tandem with the active-ill list, advancing
	 * (and wrapping) the ill for each entry so the addresses end up
	 * spread round-robin across the active interfaces.
	 */
	ill = list_head(&illg->ig_actif);
	entp = list_head(&illg->ig_arpent);
	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
		/* no usable interface or no addresses up: defer this entry */
		if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
			entp->ia_notified = B_FALSE;
			continue;
		}

		ASSERT(paddrlen == ill->ill_phys_addr_length);

		/*
		 * If this is a proxy ARP entry, we can skip notifying ARP if
		 * the entry is already up-to-date.  If it has changed, we
		 * update the entry's hardware address before notifying ARP.
		 */
		if (entp->ia_proxyarp) {
			if (bcmp(ill->ill_phys_addr, entp->ia_lladdr,
			    paddrlen) == 0 && entp->ia_notified)
				continue;
			bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen);
		}

		(void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr,
		    paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED,
		    &nce);
		if (nce == NULL || !entp->ia_proxyarp) {
			if (nce != NULL)
				nce_refrele(nce);
			continue;
		}
		/* proxy ARP: push the new hardware address into the NCE */
		ncec = nce->nce_common;
		mutex_enter(&ncec->ncec_lock);
		nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr);
		mutex_exit(&ncec->ncec_lock);
		nce_refrele(nce);
		ipmp_illgrp_mark_arpent(illg, entp);

		if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
			ill = list_head(&illg->ig_actif);
	}
}
985
986/*
987 * Return an interface in `illg' with the specified `physaddr', or NULL if one
988 * doesn't exist.  Caller must hold ill_g_lock if it's not inside the IPSQ.
989 */
990ill_t *
991ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
992{
993	ill_t *ill;
994	ill_t *ipmp_ill = illg->ig_ipmp_ill;
995	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
996
997	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
998
999	ill = list_head(&illg->ig_if);
1000	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
1001		if (ill->ill_phys_addr_length == paddrlen &&
1002		    bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
1003			return (ill);
1004	}
1005	return (NULL);
1006}
1007
/*
 * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
 * Caller must be inside the IPSQ unless this is initialization (detected by
 * ig_mtu still being 0).
 */
static void
ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
{
	ill_t *ill = illg->ig_ipmp_ill;
	mblk_t *mp;

	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));

	/*
	 * If allocation fails, we have bigger problems than MTU.  Note that
	 * ig_mtu is only updated when the notification is actually queued,
	 * so a failed attempt can be retried by a later refresh.
	 */
	if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) {
		illg->ig_mtu = mtu;
		put(ill->ill_rq, mp);
	}
}
1028
/*
 * Recalculate the IPMP group MTU for `illg' (the minimum MTU across its
 * members, clamped to the family's minimum), and update its associated IPMP
 * ill MTU if necessary.
 */
void
ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
{
	ill_t *ill;
	ill_t *ipmp_ill = illg->ig_ipmp_ill;
	uint_t mtu = 0;

	ASSERT(IAM_WRITER_ILL(ipmp_ill));

	/*
	 * Since ill_mtu can only change under ill_lock, we hold ill_lock
	 * for each ill as we iterate through the list.  Any changes to the
	 * ill_mtu will also trigger an update, so even if we missed it
	 * this time around, the update will catch it.
	 */
	ill = list_head(&illg->ig_if);
	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
		mutex_enter(&ill->ill_lock);
		if (mtu == 0 || ill->ill_mtu < mtu)
			mtu = ill->ill_mtu;
		mutex_exit(&ill->ill_lock);
	}

	/*
	 * MTU must be at least the minimum MTU.  (Also handles an empty
	 * group, where the loop leaves mtu == 0.)
	 */
	mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);

	if (illg->ig_mtu != mtu)
		ipmp_illgrp_set_mtu(illg, mtu);
}
1064
1065/*
1066 * Link illgrp `illg' to IPMP group `grp'.  To simplify the caller, silently
1067 * allow the same link to be established more than once.
1068 */
1069void
1070ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
1071{
1072	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1073
1074	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1075
1076	if (illg->ig_ipmp_ill->ill_isv6) {
1077		ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
1078		grp->gr_v6 = illg;
1079	} else {
1080		ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
1081		grp->gr_v4 = illg;
1082	}
1083}
1084
1085/*
1086 * Unlink illgrp `illg' from its IPMP group.  Return an errno if the illgrp
1087 * cannot be unlinked (e.g., because there are still interfaces using it).
1088 */
1089int
1090ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
1091{
1092	ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
1093	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1094
1095	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1096
1097	if (illg->ig_ipmp_ill->ill_isv6) {
1098		if (grp->gr_nv6 + grp->gr_pendv6 != 0)
1099			return (EBUSY);
1100		grp->gr_v6 = NULL;
1101	} else {
1102		if (grp->gr_nv4 + grp->gr_pendv4 != 0)
1103			return (EBUSY);
1104		grp->gr_v4 = NULL;
1105	}
1106	return (0);
1107}
1108
1109/*
1110 * Place `ill' into `illg', and rebalance the data addresses on `illg'
1111 * to be spread evenly across the ills now in it.  Also, adjust the IPMP
1112 * ill as necessary to account for `ill' (e.g., MTU).
1113 */
void
ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
{
	ill_t *ipmp_ill;
	ipif_t *ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	/* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
	ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_grp == NULL);

	ipmp_ill = illg->ig_ipmp_ill;

	/*
	 * Account for `ill' joining the illgrp: bump the group's per-family
	 * interface count under ipmp_lock.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if (ill->ill_isv6)
		ill->ill_phyint->phyint_grp->gr_nv6++;
	else
		ill->ill_phyint->phyint_grp->gr_nv4++;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Ensure the ILLF_ROUTER flag remains consistent across the group:
	 * `ill' inherits whatever the IPMP meta-interface currently has.
	 */
	mutex_enter(&ill->ill_lock);
	if (ipmp_ill->ill_flags & ILLF_ROUTER)
		ill->ill_flags |= ILLF_ROUTER;
	else
		ill->ill_flags &= ~ILLF_ROUTER;
	mutex_exit(&ill->ill_lock);

	/*
	 * Blow away all multicast memberships that currently exist on `ill'.
	 * This may seem odd, but it's consistent with the application view
	 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
	 */
	update_conn_ill(ill, ill->ill_ipst);
	if (ill->ill_isv6) {
		reset_mrt_ill(ill);
	} else {
		/* v4 multicast routing state is per-ipif; clear each one */
		ipif = ill->ill_ipif;
		for (; ipif != NULL; ipif = ipif->ipif_next) {
			reset_mrt_vif_ipif(ipif);
		}
	}
	ip_purge_allmulti(ill);

	/*
	 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
	 * physical address length.  All other ills must have the same value,
	 * since they are required to all be the same mactype.  Also update
	 * the IPMP ill's MTU and CoS marking, if necessary.
	 */
	if (list_is_empty(&illg->ig_if)) {
		ASSERT(ipmp_ill->ill_phys_addr_length == 0);
		/*
		 * NOTE: we leave ill_phys_addr NULL since the IPMP group
		 * doesn't have a physical address.  This means that code must
		 * not assume that ill_phys_addr is non-NULL just because
		 * ill_phys_addr_length is non-zero.  Likewise for ill_nd_lla.
		 */
		ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
		ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
		ipmp_ill->ill_type = ill->ill_type;

		if (ill->ill_flags & ILLF_COS_ENABLED) {
			mutex_enter(&ipmp_ill->ill_lock);
			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
			mutex_exit(&ipmp_ill->ill_lock);
		}
		ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
	} else {
		ASSERT(ipmp_ill->ill_phys_addr_length ==
		    ill->ill_phys_addr_length);
		ASSERT(ipmp_ill->ill_type == ill->ill_type);

		/*
		 * CoS stays enabled only if *every* member supports it;
		 * a joining ill without CoS disables it group-wide.
		 */
		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
			mutex_enter(&ipmp_ill->ill_lock);
			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
			mutex_exit(&ipmp_ill->ill_lock);
		}
		/* the group MTU is the minimum across members; shrink if needed */
		if (illg->ig_mtu > ill->ill_mtu)
			ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
	}

	/*
	 * Officially add `ill' to the illgrp's interface list and publish
	 * ill_grp, both under ill_g_lock.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	list_insert_tail(&illg->ig_if, ill);
	ill->ill_grp = illg;
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Hide the IREs on `ill' so that we don't accidentally find them when
	 * sending data traffic.
	 */
	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);

	/* Now that `ill' is a member, decide whether it should be active. */
	ipmp_ill_refresh_active(ill);
}
1215
1216/*
1217 * Remove `ill' from its illgrp, and rebalance the data addresses in that
1218 * illgrp to be spread evenly across the remaining ills.  Also, adjust the
1219 * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
1220 */
void
ipmp_ill_leave_illgrp(ill_t *ill)
{
	ill_t *ipmp_ill;
	ipif_t *ipif;
	ipmp_arpent_t *entp;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IS_UNDER_IPMP(ill));
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(illg != NULL);

	ipmp_ill = illg->ig_ipmp_ill;

	/*
	 * Cancel IPMP-specific ill timeouts.
	 */
	(void) untimeout(ill->ill_refresh_tid);

	/*
	 * Expose any previously-hidden IREs on `ill'.
	 */
	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);

	/*
	 * Ensure the multicast state for each ipif on `ill' is down so that
	 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
	 * all eligible groups.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ipif_multicast_down(ipif);

	/*
	 * Account for `ill' leaving the illgrp: drop the group's per-family
	 * interface count under ipmp_lock.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if (ill->ill_isv6)
		ill->ill_phyint->phyint_grp->gr_nv6--;
	else
		ill->ill_phyint->phyint_grp->gr_nv4--;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Pull `ill' out of the interface lists.
	 */
	if (list_link_active(&ill->ill_actnode))
		ipmp_ill_deactivate(ill);
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	list_remove(&illg->ig_if, ill);
	ill->ill_grp = NULL;
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Re-establish multicast memberships that were previously being
	 * handled by the IPMP meta-interface.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ipif_multicast_up(ipif);

	/*
	 * Refresh the group MTU based on the new interface list.
	 */
	ipmp_illgrp_refresh_mtu(illg);

	if (list_is_empty(&illg->ig_if)) {
		/*
		 * No ills left in the illgrp; we no longer have a physical
		 * address length, nor can we support ARP, CoS, or anything
		 * else that depends on knowing the link layer type.
		 */
		while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
			ipmp_illgrp_destroy_arpent(illg, entp);

		ipmp_ill->ill_phys_addr_length = 0;
		ipmp_ill->ill_nd_lla_len = 0;
		ipmp_ill->ill_type = IFT_OTHER;
		mutex_enter(&ipmp_ill->ill_lock);
		ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
		mutex_exit(&ipmp_ill->ill_lock);
	} else {
		/*
		 * If `ill' didn't support CoS, see if it can now be enabled.
		 */
		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
			ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));

			/*
			 * NOTE: from here on, `ill' is reused as a cursor to
			 * scan the *remaining* members; it no longer refers
			 * to the departing interface.
			 */
			ill = list_head(&illg->ig_if);
			do {
				if (!(ill->ill_flags & ILLF_COS_ENABLED))
					break;
			} while ((ill = list_next(&illg->ig_if, ill)) != NULL);

			/* NULL cursor means every remaining member has CoS */
			if (ill == NULL) {
				mutex_enter(&ipmp_ill->ill_lock);
				ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
				mutex_exit(&ipmp_ill->ill_lock);
			}
		}
	}
}
1324
1325/*
1326 * Check if `ill' should be active, and activate or deactivate if need be.
1327 * Return B_FALSE if a refresh was necessary but could not be performed.
1328 */
1329static boolean_t
1330ipmp_ill_try_refresh_active(ill_t *ill)
1331{
1332	boolean_t refreshed = B_TRUE;
1333
1334	ASSERT(IAM_WRITER_ILL(ill));
1335	ASSERT(IS_UNDER_IPMP(ill));
1336
1337	if (ipmp_ill_is_active(ill)) {
1338		if (!list_link_active(&ill->ill_actnode))
1339			refreshed = ipmp_ill_activate(ill);
1340	} else {
1341		if (list_link_active(&ill->ill_actnode))
1342			ipmp_ill_deactivate(ill);
1343	}
1344
1345	return (refreshed);
1346}
1347
1348/*
1349 * Check if `ill' should be active, and activate or deactivate if need be.
1350 * If the refresh fails, schedule a timer to try again later.
1351 */
1352void
1353ipmp_ill_refresh_active(ill_t *ill)
1354{
1355	if (!ipmp_ill_try_refresh_active(ill))
1356		ipmp_ill_refresh_active_timer_start(ill);
1357}
1358
1359/*
1360 * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
1361 */
static void
ipmp_ill_refresh_active_timer(void *ill_arg)
{
	ill_t *ill = ill_arg;
	boolean_t refreshed = B_FALSE;

	/*
	 * Clear ill_refresh_tid to indicate that no timeout is pending
	 * (another thread could schedule a new timeout while we're still
	 * running, but that's harmless).  If the ill is going away, bail.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_refresh_tid = 0;
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * Only attempt the refresh if we can enter the IPSQ without
	 * blocking; otherwise `refreshed' stays B_FALSE and we simply
	 * reschedule below.
	 */
	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
		refreshed = ipmp_ill_try_refresh_active(ill);
		ipsq_exit(ill->ill_phyint->phyint_ipsq);
	}

	/*
	 * If the refresh failed, schedule another attempt.
	 */
	if (!refreshed)
		ipmp_ill_refresh_active_timer_start(ill);
}
1392
1393/*
 * Start a timer to retry ipmp_ill_try_refresh_active() on `ill' later.
1395 */
static void
ipmp_ill_refresh_active_timer_start(ill_t *ill)
{
	mutex_enter(&ill->ill_lock);

	/*
	 * If the ill is going away or a refresh is already scheduled, bail.
	 */
	if (ill->ill_refresh_tid != 0 ||
	    (ill->ill_state_flags & ILL_CONDEMNED)) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	/*
	 * Arm a one-shot retry; ill_refresh_tid doubles as the
	 * "timeout pending" flag (cleared by the timer callback).
	 */
	ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
	    SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));

	mutex_exit(&ill->ill_lock);
}
1415
1416/*
1417 * Activate `ill' so it will be used to send and receive data traffic.  Return
1418 * B_FALSE if `ill' cannot be activated.  Note that we allocate any messages
1419 * needed to deactivate `ill' here as well so that deactivation cannot fail.
1420 */
static boolean_t
ipmp_ill_activate(ill_t *ill)
{
	ipif_t		*ipif;
	mblk_t		*linkupmp = NULL, *linkdownmp = NULL;
	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
	ipmp_illgrp_t	*illg = ill->ill_grp;
	ill_t		*maxill;
	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * If this will be the first active interface in the group, allocate
	 * the link-up and link-down messages.  Preallocating the link-down
	 * message here is what lets ipmp_ill_deactivate() never fail.
	 */
	if (grp->gr_nactif == 0) {
		linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
		linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
		if (linkupmp == NULL || linkdownmp == NULL)
			goto fail;
	}

	if (list_is_empty(&illg->ig_actif)) {
		/*
		 * Now that we have an active ill, nominate it for multicast
		 * and broadcast duties.  Do this before ipmp_ill_bind_ipif()
		 * since that may need to send multicast packets (e.g., IPv6
		 * neighbor discovery probes).
		 */
		ipmp_illgrp_set_cast(illg, ill);

		/*
		 * This is the first active ill in the illgrp -- add 'em all.
		 * We can access/walk ig_ipmp_ill's ipif list since we're
		 * writer on its IPSQ as well.
		 */
		ipif = illg->ig_ipmp_ill->ill_ipif;
		for (; ipif != NULL; ipif = ipif->ipif_next)
			if (ipmp_ipif_is_up_dataaddr(ipif))
				ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
	} else {
		/*
		 * Redistribute the addresses by moving them from the ill with
		 * the most addresses until the ill being activated is at the
		 * same level as the rest of the ills.
		 */
		for (;;) {
			maxill = ipmp_illgrp_max_ill(illg);
			ASSERT(maxill != NULL);
			if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
				break;
			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
			ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
		}
	}

	/*
	 * Put the interface in the active list.
	 * NOTE(review): ig_next_ill appears to seed the next-interface
	 * selection for outbound traffic -- confirm against its consumers.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	list_insert_tail(&illg->ig_actif, ill);
	illg->ig_nactif++;
	illg->ig_next_ill = ill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Refresh static/proxy ARP entries to use `ill', if need be.
	 */
	if (!ill->ill_isv6)
		ipmp_illgrp_refresh_arpent(illg);

	/*
	 * Finally, mark the group link up, if necessary: stash the
	 * preallocated link-down message for ipmp_ill_deactivate() and send
	 * the link-up notification upstream on the IPMP ill.
	 */
	if (grp->gr_nactif++ == 0) {
		ASSERT(grp->gr_linkdownmp == NULL);
		grp->gr_linkdownmp = linkdownmp;
		put(illg->ig_ipmp_ill->ill_rq, linkupmp);
	}
	return (B_TRUE);
fail:
	freemsg(linkupmp);
	freemsg(linkdownmp);
	return (B_FALSE);
}
1508
1509/*
1510 * Deactivate `ill' so it will not be used to send or receive data traffic.
1511 */
static void
ipmp_ill_deactivate(ill_t *ill)
{
	ill_t		*minill;
	ipif_t		*ipif, *ubnextipif, *ubheadipif = NULL;
	mblk_t		*mp;
	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
	ipmp_illgrp_t	*illg = ill->ill_grp;
	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * Pull the interface out of the active list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	list_remove(&illg->ig_actif, ill);
	illg->ig_nactif--;
	illg->ig_next_ill = list_head(&illg->ig_actif);
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * If the ill that's being deactivated had been nominated for
	 * multicast/broadcast, nominate a new one.
	 */
	if (ill == illg->ig_cast_ill)
		ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));

	/*
	 * Delete all nce_t entries using this ill, so that the next attempt
	 * to send data traffic will revalidate cached nce's.
	 */
	nce_flush(ill, B_TRUE);

	/*
	 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
	 * we'll rebind them after we tell the resolver the ill is no longer
	 * active.  We must do things in this order or the resolver could
	 * accidentally rebind to the ill we're trying to remove if multiple
	 * ills in the group have the same hardware address (which is
	 * unsupported, but shouldn't lead to a wedged machine).
	 */
	while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
		/* build the saved list by pushing onto its head */
		ipif->ipif_bound_next = ubheadipif;
		ubheadipif = ipif;
	}
	if (!ill->ill_isv6) {

		/*
		 * Refresh static/proxy ARP entries that had been using `ill'.
		 */
		ipmp_illgrp_refresh_arpent(illg);
	}

	/*
	 * Rebind each ipif from the deactivated ill to the active ill with
	 * the fewest ipifs.  If there are no active ills, the ipifs will
	 * remain unbound.
	 */
	for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
		ubnextipif = ipif->ipif_bound_next;
		ipif->ipif_bound_next = NULL;

		if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
			ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
	}

	if (list_is_empty(&illg->ig_actif)) {
		/*
		 * No active interfaces remain, so flush the IPMP
		 * meta-interface's ncec entries as well.
		 */
		ill_t *ipmp_ill = illg->ig_ipmp_ill;

		ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill,
		    (uchar_t *)ipmp_ill, ipmp_ill->ill_ipst);
	}

	/*
	 * Remove any IRE_IF_CLONE for this ill since they might have
	 * an ire_nce_cache/nce_common which refers to another ill in the group.
	 */
	ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone,
	    ill, ill);

	/*
	 * Finally, mark the group link down, if necessary, by sending the
	 * link-down message that ipmp_ill_activate() preallocated.
	 */
	if (--grp->gr_nactif == 0) {
		mp = grp->gr_linkdownmp;
		grp->gr_linkdownmp = NULL;
		ASSERT(mp != NULL);
		put(illg->ig_ipmp_ill->ill_rq, mp);
	}
}
1604
1605/*
1606 * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
1607 * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
1608 */
1609static void
1610ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
1611{
1612	ipif_t *ipif;
1613
1614	ASSERT(IAM_WRITER_ILL(ill));
1615	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
1616
1617	/*
1618	 * If `ill' is truly down, there are no messages to generate since:
1619	 *
1620	 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
1621	 *    and its addresses by bringing them down.  But that's already
1622	 *    true, so there's nothing to hide.
1623	 *
1624	 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
1625	 *    indicating that any previously-hidden up addresses are again
1626	 *    back up (along with the interface).  But they aren't, so
1627	 *    there's nothing to expose.
1628	 */
1629	if (ill->ill_ipif_up_count == 0)
1630		return;
1631
1632	if (cmd == RTM_ADD)
1633		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
1634
1635	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1636		if (ipif->ipif_flags & IPIF_UP)
1637			ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);
1638
1639	if (cmd == RTM_DELETE)
1640		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
1641}
1642
1643/*
1644 * Bind the address named by `ipif' to the underlying ill named by `ill'.
1645 * If `act' is Res_act_none, don't notify the resolver.  Otherwise, `act'
1646 * will indicate to the resolver whether this is an initial bringup of
1647 * `ipif', or just a rebind to another ill.
1648 */
static void
ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
{
	int err = 0;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
	ASSERT(ipif->ipif_bound_ill == NULL);
	ASSERT(ipif->ipif_bound_next == NULL);

	/*
	 * Push `ipif' onto `ill''s bound list, then publish ipif_bound_ill
	 * under ipmp_lock so that readers outside the IPSQ (e.g.,
	 * ipmp_ipif_hold_bound_ill()) see a consistent value.
	 */
	ipif->ipif_bound_next = ill->ill_bound_ipif;
	ill->ill_bound_ipif = ipif;
	ill->ill_bound_cnt++;
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	ipif->ipif_bound_ill = ill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * If necessary, tell ARP/NDP about the new mapping.  Note that
	 * ipif_resolver_up() cannot fail for IPv6 ills.
	 */
	if (act != Res_act_none) {
		if (ill->ill_isv6) {
			VERIFY(ipif_resolver_up(ipif, act) == 0);
			err = ipif_ndp_up(ipif, act == Res_act_initial);
		} else {
			err = ipif_resolver_up(ipif, act);
		}

		/*
		 * Since ipif_ndp_up() never returns EINPROGRESS and
		 * ipif_resolver_up() only returns EINPROGRESS when the
		 * associated ill is not up, we should never be here with
		 * EINPROGRESS.  We rely on this to simplify the design.
		 */
		ASSERT(err != EINPROGRESS);
	}
	/* TODO: retry binding on failure? when? */
	ipif->ipif_bound = (err == 0);
}
1691
1692/*
1693 * Unbind the address named by `ipif' from the underlying ill named by `ill'.
1694 * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
1695 * If no ipifs are bound to `ill', NULL is returned.  If `notifyres' is
1696 * B_TRUE, notify the resolver about the change.
1697 */
static ipif_t *
ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
{
	ipif_t *previpif;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * If necessary, find an ipif to unbind: the head of `ill''s bound
	 * list is as good as any.
	 */
	if (ipif == NULL) {
		if ((ipif = ill->ill_bound_ipif) == NULL) {
			ASSERT(ill->ill_bound_cnt == 0);
			return (NULL);
		}
	}

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(IS_IPMP(ipif->ipif_ill));
	ASSERT(ipif->ipif_bound_ill == ill);
	ASSERT(ill->ill_bound_cnt > 0);

	/*
	 * Unbind it: clear ipif_bound_ill under ipmp_lock (for readers
	 * outside the IPSQ), then unlink `ipif' from `ill''s singly-linked
	 * bound list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	ipif->ipif_bound_ill = NULL;
	rw_exit(&ipst->ips_ipmp_lock);
	ill->ill_bound_cnt--;

	if (ill->ill_bound_ipif == ipif) {
		ill->ill_bound_ipif = ipif->ipif_bound_next;
	} else {
		/* walk to the predecessor of `ipif' and splice it out */
		previpif = ill->ill_bound_ipif;
		while (previpif->ipif_bound_next != ipif)
			previpif = previpif->ipif_bound_next;

		previpif->ipif_bound_next = ipif->ipif_bound_next;
	}
	ipif->ipif_bound_next = NULL;

	/*
	 * If requested, notify the resolvers (provided we're bound).
	 */
	if (notifyres && ipif->ipif_bound) {
		if (ill->ill_isv6)
			ipif_ndp_down(ipif);
		else
			(void) ipif_arp_down(ipif);
	}
	ipif->ipif_bound = B_FALSE;

	return (ipif);
}
1754
1755/*
1756 * Check if `ill' is active.  Caller must hold ill_lock and phyint_lock if
1757 * it's not inside the IPSQ.  Since ipmp_ill_try_refresh_active() calls this
1758 * to determine whether an ill should be considered active, other consumers
1759 * may race and learn about an ill that should be deactivated/activated before
1760 * IPMP has performed the activation/deactivation.  This should be safe though
1761 * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
1762 * would've been cleaned up by ipmp_ill_deactivate().
1763 */
1764boolean_t
1765ipmp_ill_is_active(ill_t *ill)
1766{
1767	phyint_t *phyi = ill->ill_phyint;
1768
1769	ASSERT(IS_UNDER_IPMP(ill));
1770	ASSERT(IAM_WRITER_ILL(ill) ||
1771	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
1772
1773	/*
1774	 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
1775	 * set PHYI_FAILED whenever PHYI_RUNNING is cleared.  This allows the
1776	 * link flapping logic to be just in in.mpathd and allows us to ignore
1777	 * changes to PHYI_RUNNING.
1778	 */
1779	return (!(ill->ill_ipif_up_count == 0 ||
1780	    (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
1781}
1782
1783/*
1784 * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated
1785 * with `ill_arg'.
1786 */
1787static void
1788ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
1789{
1790	ill_t *ill = (ill_t *)ill_arg;
1791
1792	ASSERT(IAM_WRITER_ILL(ill));
1793	ASSERT(!IS_IPMP(ill));
1794
1795	if (ire->ire_ill != ill)
1796		return;
1797
1798	if (IRE_HIDDEN_TYPE(ire->ire_type)) {
1799		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
1800		ire->ire_testhidden = B_TRUE;
1801	}
1802}
1803
1804/*
1805 * IRE walker callback: clear ire_testhidden if the IRE has a source address
1806 * on `ill_arg'.
1807 */
1808static void
1809ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
1810{
1811	ill_t *ill = (ill_t *)ill_arg;
1812
1813	ASSERT(IAM_WRITER_ILL(ill));
1814	ASSERT(!IS_IPMP(ill));
1815
1816	if (ire->ire_ill == ill) {
1817		DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
1818		ire->ire_testhidden = B_FALSE;
1819	}
1820}
1821
1822/*
1823 * Return a held pointer to the IPMP ill for underlying interface `ill', or
1824 * NULL if one doesn't exist.  (Unfortunately, this function needs to take an
1825 * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
1826 * ill_grp pointer may become stale when not inside an IPSQ and not holding
1827 * ipmp_lock.)  Caller need not be inside the IPSQ.
1828 */
1829ill_t *
1830ipmp_ill_hold_ipmp_ill(ill_t *ill)
1831{
1832	ip_stack_t *ipst = ill->ill_ipst;
1833	ipmp_illgrp_t *illg;
1834
1835	ASSERT(!IS_IPMP(ill));
1836
1837	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1838	illg = ill->ill_grp;
1839	if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) {
1840		rw_exit(&ipst->ips_ipmp_lock);
1841		return (illg->ig_ipmp_ill);
1842	}
1843	/*
1844	 * Assume `ill' was removed from the illgrp in the meantime.
1845	 */
1846	rw_exit(&ill->ill_ipst->ips_ipmp_lock);
1847	return (NULL);
1848}
1849
1850/*
1851 * Return the interface index for the IPMP ill tied to underlying interface
1852 * `ill', or zero if one doesn't exist.  Caller need not be inside the IPSQ.
1853 */
1854uint_t
1855ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
1856{
1857	uint_t ifindex = 0;
1858	ip_stack_t *ipst = ill->ill_ipst;
1859	ipmp_grp_t *grp;
1860
1861	ASSERT(!IS_IPMP(ill));
1862
1863	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1864	if ((grp = ill->ill_phyint->phyint_grp) != NULL)
1865		ifindex = grp->gr_phyint->phyint_ifindex;
1866	rw_exit(&ipst->ips_ipmp_lock);
1867	return (ifindex);
1868}
1869
1870/*
1871 * Place phyint `phyi' into IPMP group `grp'.
1872 */
void
ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
{
	ill_t *ill;
	ipsq_t *ipsq = phyi->phyint_ipsq;
	ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);

	/*
	 * Send routing socket messages indicating that the phyint's ills
	 * and ipifs vanished.  NOTE: `ill' is left pointing at the last ill
	 * examined here (the v6 ill if both exist); the ASSERT above
	 * guarantees at least one, so `ill' is initialized before the
	 * mactype logic below uses it.
	 */
	if (phyi->phyint_illv4 != NULL) {
		ill = phyi->phyint_illv4;
		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
	}

	if (phyi->phyint_illv6 != NULL) {
		ill = phyi->phyint_illv6;
		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
	}

	/*
	 * Snapshot the phyint's initial kstats as a baseline.
	 */
	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	/*
	 * The first phyint to join fixes the group's mactype; all later
	 * joiners must match it.
	 */
	phyi->phyint_grp = grp;
	if (++grp->gr_nif == 1)
		grp->gr_mactype = ill->ill_mactype;
	else
		ASSERT(grp->gr_mactype == ill->ill_mactype);

	/*
	 * Now that we're in the group, request a switch to the group's xop
	 * when we ipsq_exit().  All future operations will be exclusive on
	 * the group xop until ipmp_phyint_leave_grp() is called.
	 */
	ASSERT(ipsq->ipsq_swxop == NULL);
	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;

	rw_exit(&ipst->ips_ipmp_lock);
}
1922
1923/*
1924 * Remove phyint `phyi' from its current IPMP group.
1925 */
void
ipmp_phyint_leave_grp(phyint_t *phyi)
{
	uint_t i;
	ipsq_t *ipsq = phyi->phyint_ipsq;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
	uint64_t phyi_kstats[IPMP_KSTAT_MAX];

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/*
	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
	 */
	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
		ipmp_ill_leave_illgrp(phyi->phyint_illv6);

	/*
	 * Send routing socket messages indicating that the phyint's ills
	 * and ipifs have reappeared.
	 */
	if (phyi->phyint_illv4 != NULL)
		ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
	if (phyi->phyint_illv6 != NULL)
		ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);

	/*
	 * Calculate the phyint's cumulative kstats while it was in the group
	 * (current counters minus the baseline snapshot taken at join time),
	 * and add that to the group's baseline.
	 */
	ipmp_phyint_get_kstats(phyi, phyi_kstats);
	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
		phyi_kstats[i] -= phyi->phyint_kstats0[i];
		atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
	}

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	phyi->phyint_grp->gr_nif--;
	phyi->phyint_grp = NULL;

	/*
	 * As our final act in leaving the group, request a switch back to our
	 * IPSQ's own xop when we ipsq_exit().
	 */
	ASSERT(ipsq->ipsq_swxop == NULL);
	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;

	rw_exit(&ipst->ips_ipmp_lock);
}
1977
1978/*
1979 * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
1980 * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
1981 */
static void
ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
{
	uint_t		i, j;
	const char	*name;
	kstat_t		*ksp;
	kstat_named_t	*kn;
	ip_stack_t	*ipst = PHYINT_TO_IPST(phyi);
	zoneid_t	zoneid;

	/*
	 * Start from all-zero so that a missing kstat (or missing named
	 * entries) yields zeroes rather than garbage.
	 */
	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
	zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, zoneid);
	if (ksp == NULL)
		return;

	KSTAT_ENTER(ksp);

	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
		/*
		 * Bring kstats up-to-date before recording.
		 */
		(void) KSTAT_UPDATE(ksp, KSTAT_READ);

		/*
		 * For each stat we track, find the matching named kstat and
		 * widen its value to 64 bits regardless of its native type.
		 */
		kn = KSTAT_NAMED_PTR(ksp);
		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
			name = ipmp_kstats[i].name;
			kstats[i] = 0;	/* redundant with the bzero above */
			for (j = 0; j < ksp->ks_ndata; j++) {
				if (strcmp(kn[j].name, name) != 0)
					continue;

				switch (kn[j].data_type) {
				case KSTAT_DATA_INT32:
				case KSTAT_DATA_UINT32:
					kstats[i] = kn[j].value.ui32;
					break;
#ifdef	_LP64
				case KSTAT_DATA_LONG:
				case KSTAT_DATA_ULONG:
					kstats[i] = kn[j].value.ul;
					break;
#endif
				case KSTAT_DATA_INT64:
				case KSTAT_DATA_UINT64:
					kstats[i] = kn[j].value.ui64;
					break;
				}
				break;
			}
		}
	}

	KSTAT_EXIT(ksp);
	kstat_rele(ksp);
}
2038
2039/*
2040 * Refresh the active state of all ills on `phyi'.
2041 */
2042void
2043ipmp_phyint_refresh_active(phyint_t *phyi)
2044{
2045	if (phyi->phyint_illv4 != NULL)
2046		ipmp_ill_refresh_active(phyi->phyint_illv4);
2047	if (phyi->phyint_illv6 != NULL)
2048		ipmp_ill_refresh_active(phyi->phyint_illv6);
2049}
2050
2051/*
2052 * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
2053 * doesn't exist.  Caller need not be inside the IPSQ.
2054 */
2055ill_t *
2056ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
2057{
2058	ill_t *boundill;
2059	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
2060
2061	ASSERT(IS_IPMP(ipif->ipif_ill));
2062
2063	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2064	boundill = ipif->ipif_bound_ill;
2065	if (boundill != NULL && ill_check_and_refhold(boundill)) {
2066		rw_exit(&ipst->ips_ipmp_lock);
2067		return (boundill);
2068	}
2069	rw_exit(&ipst->ips_ipmp_lock);
2070	return (NULL);
2071}
2072
2073/*
2074 * Return a pointer to the underlying ill bound to `ipif', or NULL if one
2075 * doesn't exist.  Caller must be inside the IPSQ.
2076 */
2077ill_t *
2078ipmp_ipif_bound_ill(const ipif_t *ipif)
2079{
2080	ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
2081	ASSERT(IS_IPMP(ipif->ipif_ill));
2082
2083	return (ipif->ipif_bound_ill);
2084}
2085
2086/*
2087 * Check if `ipif' is a "stub" (placeholder address not being used).
2088 */
2089boolean_t
2090ipmp_ipif_is_stubaddr(const ipif_t *ipif)
2091{
2092	if (ipif->ipif_flags & IPIF_UP)
2093		return (B_FALSE);
2094	if (ipif->ipif_ill->ill_isv6)
2095		return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2096	else
2097		return (ipif->ipif_lcl_addr == INADDR_ANY);
2098}
2099
2100/*
2101 * Check if `ipif' is an IPMP data address.
2102 */
2103boolean_t
2104ipmp_ipif_is_dataaddr(const ipif_t *ipif)
2105{
2106	if (ipif->ipif_flags & IPIF_NOFAILOVER)
2107		return (B_FALSE);
2108	if (ipif->ipif_ill->ill_isv6)
2109		return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2110	else
2111		return (ipif->ipif_lcl_addr != INADDR_ANY);
2112}
2113
2114/*
2115 * Check if `ipif' is an IPIF_UP IPMP data address.
2116 */
2117static boolean_t
2118ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
2119{
2120	return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
2121}
2122
2123/*
2124 * Check if `mp' contains a probe packet by verifying if the IP source address
2125 * is a test address on an underlying interface `ill'. Caller need not be inside
2126 * the IPSQ.
2127 */
2128boolean_t
2129ipmp_packet_is_probe(mblk_t *mp, ill_t *ill)
2130{
2131	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2132	ipha_t *ipha = (ipha_t *)mp->b_rptr;
2133
2134	ASSERT(DB_TYPE(mp) != M_CTL);
2135
2136	if (!IS_UNDER_IPMP(ill))
2137		return (B_FALSE);
2138
2139	if (ill->ill_isv6) {
2140		if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
2141		    ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL))
2142			return (B_TRUE);
2143	} else {
2144		if ((ipha->ipha_src != INADDR_ANY) &&
2145		    ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL))
2146			return (B_TRUE);
2147	}
2148	return (B_FALSE);
2149}
2150
2151/*
2152 * Pick out an appropriate underlying interface for packet transmit.  This
2153 * function may be called from the data path, so we need to verify that the
2154 * IPMP group associated with `ill' is non-null after holding the ill_g_lock.
2155 * Caller need not be inside the IPSQ.
2156 */
2157ill_t *
2158ipmp_ill_get_xmit_ill(ill_t *ill, boolean_t is_unicast)
2159{
2160	ill_t *xmit_ill;
2161	ip_stack_t *ipst = ill->ill_ipst;
2162
2163	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2164	if (ill->ill_grp == NULL) {
2165		/*
2166		 * The interface was taken out of the group. Return ill itself,
2167		 * but take a ref so that callers will always be able to do
2168		 * ill_refrele(ill);
2169		 */
2170		rw_exit(&ipst->ips_ill_g_lock);
2171		ill_refhold(ill);
2172		return (ill);
2173	}
2174	if (!is_unicast)
2175		xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
2176	else
2177		xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp);
2178	rw_exit(&ipst->ips_ill_g_lock);
2179	return (xmit_ill);
2180}
2181
2182/*
2183 * Flush out any nce that points at `ncec' from an underlying interface
2184 */
2185void
2186ipmp_ncec_flush_nce(ncec_t *ncec)
2187{
2188	ill_t		*ncec_ill = ncec->ncec_ill;
2189	ill_t		*ill;
2190	ipmp_illgrp_t	*illg;
2191	ip_stack_t	*ipst = ncec_ill->ill_ipst;
2192	list_t		dead;
2193	nce_t		*nce;
2194
2195	if (!IS_IPMP(ncec_ill))
2196		return;
2197
2198	illg = ncec_ill->ill_grp;
2199	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
2200
2201	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2202	ill = list_head(&illg->ig_if);
2203	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
2204		nce_fastpath_list_delete(ill, ncec, &dead);
2205	}
2206	rw_exit(&ipst->ips_ill_g_lock);
2207
2208	/*
2209	 * we may now nce_refrele() all dead entries since all locks have been
2210	 * dropped.
2211	 */
2212	while ((nce = list_head(&dead)) != NULL) {
2213		list_remove(&dead, nce);
2214		nce_refrele(nce);
2215	}
2216	ASSERT(list_is_empty(&dead));
2217	list_destroy(&dead);
2218}
2219
2220/*
2221 * For each interface in the IPMP group, if there are nce_t entries for the IP
2222 * address corresponding to `ncec', then their dl_unitdata_req_t and fastpath
2223 * information must be updated to match the link-layer address information in
2224 * `ncec'.
2225 */
2226void
2227ipmp_ncec_fastpath(ncec_t *ncec, ill_t *ipmp_ill)
2228{
2229	ill_t		*ill;
2230	ipmp_illgrp_t	*illg = ipmp_ill->ill_grp;
2231	ip_stack_t	*ipst = ipmp_ill->ill_ipst;
2232	nce_t		*nce, *nce_next;
2233	list_t		replace;
2234
2235	ASSERT(IS_IPMP(ipmp_ill));
2236
2237	/*
2238	 * if ncec itself is not reachable, there is no use in creating nce_t
2239	 * entries on the underlying interfaces in the group.
2240	 */
2241	if (!NCE_ISREACHABLE(ncec))
2242		return;
2243
2244	list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node));
2245	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2246	ill = list_head(&illg->ig_actif);
2247	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
2248		/*
2249		 * For each underlying interface, we first check if there is an
2250		 * nce_t for the address in ncec->ncec_addr. If one exists,
2251		 * we should trigger nce_fastpath for that nce_t. However, the
2252		 * catch is that we are holding the ips_ipmp_lock to prevent
2253		 * changes to the IPMP group membership, so that we cannot
2254		 * putnext() to the driver.  So we nce_delete the
2255		 * list nce_t entries that need to be updated into the
2256		 * `replace' list, and then process the `replace' list
2257		 * after dropping the ips_ipmp_lock.
2258		 */
2259		mutex_enter(&ill->ill_lock);
2260		for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
2261			nce_next = list_next(&ill->ill_nce, nce);
2262			if (!IN6_ARE_ADDR_EQUAL(&nce->nce_addr,
2263			    &ncec->ncec_addr)) {
2264				nce = nce_next;
2265				continue;
2266			}
2267			nce_refhold(nce);
2268			nce_delete(nce);
2269			list_insert_tail(&replace, nce);
2270			nce = nce_next;
2271		}
2272		mutex_exit(&ill->ill_lock);
2273	}
2274	rw_exit(&ipst->ips_ipmp_lock);
2275	/*
2276	 * `replace' now has the list of nce's on which we should be triggering
2277	 * nce_fastpath(). We now retrigger fastpath by setting up the nce
2278	 * again. The code in nce_lookup_then_add_v* ensures that nce->nce_ill
2279	 * is still in the group for ncec->ncec_ill
2280	 */
2281	while ((nce = list_head(&replace)) != NULL) {
2282		list_remove(&replace, nce);
2283		if (ncec->ncec_ill->ill_isv6) {
2284			(void) nce_lookup_then_add_v6(nce->nce_ill,
2285			    ncec->ncec_lladdr,  ncec->ncec_lladdr_length,
2286			    &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED,
2287			    NULL);
2288		} else {
2289			ipaddr_t ipaddr;
2290
2291			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr);
2292			(void) nce_lookup_then_add_v4(nce->nce_ill,
2293			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
2294			    &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL);
2295		}
2296		nce_refrele(nce);
2297	}
2298	ASSERT(list_is_empty(&replace));
2299	list_destroy(&replace);
2300}
2301